remove useless examples

2022-09-15 11:31:58 -04:00 · 2022-09-15 11:31:58 -04:00 · 9152feb24f
parent 49adfd026c
commit 9152feb24f
140 changed files with 0 additions and 67741 deletions
--- a/examples/backprop/backprop.c
+++ b/examples/backprop/backprop.c
@ -1,454 +0,0 @@
 #include "backprop.h"
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 /* Uncomment to compile the OpenMP paths guarded by `#ifdef OPEN` in
    bpnn_layerforward() and bpnn_adjust_weights(). */
 //#define OPEN
 /* Absolute value. NOTE: `x` is evaluated twice, so do not pass an
    expression with side effects. The comparison is against the double
    literal 0.0. */
 #define ABS(x) (((x) > 0.0) ? (x) : (-(x)))
 /* Byte-by-byte copy of `len` bytes from `from` to `to`. Expands to a
    brace-enclosed compound statement that declares its own temporaries
    (_to, _from, _i, _l); `len` is stored into an int. */
 #define fastcopy(to, from, len)                                                \
  {                                                                            \
    register char *_to, *_from;                                                \
    register int _i, _l;                                                       \
    _to = (char *)(to);                                                        \
    _from = (char *)(from);                                                    \
    _l = (len);                                                                \
    for (_i = 0; _i < _l; _i++)                                                \
      *_to++ = *_from++;                                                       \
  }
 /*** Pseudo-random float: rand() divided by BIGRND (both as float) ***/
 float drnd() {
  const float numerator = (float)rand();
  const float denominator = (float)BIGRND;
  return (numerator / denominator);
 }
 /*** Pseudo-random float obtained by mapping drnd() through 2*x - 1 ***/
 float dpn1() {
  /* The arithmetic is done in double, as the literals are double. */
  double doubled = drnd() * 2.0;
  return (doubled - 1.0);
 }
 /*** The squashing function.  Currently, it's a sigmoid. ***/
 float squash(x)
 float x;
 {
  float m;
  // x = -x;
  // m = 1 + x + x*x/2 + x*x*x/6 + x*x*x*x/24 + x*x*x*x*x/120;
  // return(1.0 / (1.0 + m));
  return (1.0 / (1.0 + exp(-x)));
 }
 /*** Allocate 1d array of floats.
      n: number of floats requested.
      Returns an uninitialized heap block the caller releases with free(),
      or NULL (after printing a message) when n is negative or malloc
      fails. ***/
 float *alloc_1d_dbl(n)
 int n;
 {
  float *block;
  /* A negative count would wrap to a huge unsigned size; refuse it. */
  if (n < 0) {
    printf("ALLOC_1D_DBL: Couldn't allocate array of floats\n");
    return (NULL);
  }
  /* Compute the byte count in size_t; casting it to unsigned would
     truncate requests above 4 GiB on LP64 targets. */
  block = (float *)malloc((size_t)n * sizeof(float));
  if (block == NULL) {
    printf("ALLOC_1D_DBL: Couldn't allocate array of floats\n");
    return (NULL);
  }
  return (block);
 }
 /*** Allocate 2d array of floats as m separately allocated rows of n floats.
      m: number of rows; n: floats per row.
      Returns an array of m row pointers, or NULL (after printing a message)
      when m is negative or any allocation fails. On failure nothing is
      leaked: rows allocated so far and the pointer array are released.
      The caller frees each row and then the pointer array. ***/
 float **alloc_2d_dbl(m, n)
 int m, n;
 {
  int i;
  float **rows;
  if (m < 0) {
    printf("ALLOC_2D_DBL: Couldn't allocate array of dbl ptrs\n");
    return (NULL);
  }
  /* Size computed in size_t so large row counts are not truncated. */
  rows = (float **)malloc((size_t)m * sizeof(float *));
  if (rows == NULL) {
    printf("ALLOC_2D_DBL: Couldn't allocate array of dbl ptrs\n");
    return (NULL);
  }
  for (i = 0; i < m; i++) {
    rows[i] = alloc_1d_dbl(n);
    if (rows[i] == NULL) {
      /* Unwind: a half-built matrix would crash later users. */
      while (i-- > 0) {
        free(rows[i]);
      }
      free(rows);
      return (NULL);
    }
  }
  return (rows);
 }
 /*** Fill w[0..m][0..n] (both bounds inclusive) with rand() / RAND_MAX.
      w: matrix with at least m + 1 rows of n + 1 floats each.
      Declared void: the function never returned a value, and the former
      implicit-int return type is not valid C99 and later. ***/
 void bpnn_randomize_weights(w, m, n)
 float **w;
 int m, n;
 {
  int i, j;
  for (i = 0; i <= m; i++) {
    for (j = 0; j <= n; j++) {
      w[i][j] = (float)rand() / RAND_MAX;
    }
  }
 }
 /*** Set w[0..m] (inclusive) to the constant 0.1.
      Despite the name, no random values are drawn: the rand()-based
      initialization was disabled in favor of this fixed value.
      Declared void: the function never returned a value, and the former
      implicit-int return type is not valid C99 and later. ***/
 void bpnn_randomize_row(w, m)
 float *w;
 int m;
 {
  int i;
  for (i = 0; i <= m; i++) {
    w[i] = 0.1;
  }
 }
 /*** Set w[0..m][0..n] (both bounds inclusive) to 0.0.
      w: matrix with at least m + 1 rows of n + 1 floats each.
      Declared void: the function never returned a value, and the former
      implicit-int return type is not valid C99 and later. ***/
 void bpnn_zero_weights(w, m, n)
 float **w;
 int m, n;
 {
  int i, j;
  for (i = 0; i <= m; i++) {
    for (j = 0; j <= n; j++) {
      w[i][j] = 0.0;
    }
  }
 }
 /*** Seed the C library random number generator and report the seed.
      seed: value handed to srand(); also printed to stdout.
      The parameter is now declared explicitly; it previously relied on
      the implicit-int rule, which C99 removed. ***/
 void bpnn_initialize(seed)
 int seed;
 {
  printf("Random number generator seed: %d\n", seed);
  srand(seed);
 }
 /*** Allocate a BPNN and all of its unit, delta, target and weight storage.
      Every vector gets one slot more than its layer size and every matrix
      one extra row and column. Nothing is initialized beyond the three
      layer sizes. Returns NULL (after printing a message) only when the
      BPNN struct itself cannot be allocated; the results of the array
      allocations are not checked here. ***/
 BPNN *bpnn_internal_create(n_in, n_hidden, n_out)
 int n_in, n_hidden, n_out;
 {
  BPNN *net = (BPNN *)malloc(sizeof(BPNN));
  int in_slots, hid_slots, out_slots;
  if (net == NULL) {
    printf("BPNN_CREATE: Couldn't allocate neural network\n");
    return (NULL);
  }
  in_slots = n_in + 1;
  hid_slots = n_hidden + 1;
  out_slots = n_out + 1;
  net->input_n = n_in;
  net->hidden_n = n_hidden;
  net->output_n = n_out;
  /* Per-layer vectors. */
  net->input_units = alloc_1d_dbl(in_slots);
  net->hidden_units = alloc_1d_dbl(hid_slots);
  net->output_units = alloc_1d_dbl(out_slots);
  net->hidden_delta = alloc_1d_dbl(hid_slots);
  net->output_delta = alloc_1d_dbl(out_slots);
  net->target = alloc_1d_dbl(out_slots);
  /* Current weights, then the previous-change matrices used for momentum. */
  net->input_weights = alloc_2d_dbl(in_slots, hid_slots);
  net->hidden_weights = alloc_2d_dbl(hid_slots, out_slots);
  net->input_prev_weights = alloc_2d_dbl(in_slots, hid_slots);
  net->hidden_prev_weights = alloc_2d_dbl(hid_slots, out_slots);
  return (net);
 }
 /* Release a matrix of `rows` separately allocated rows plus its pointer
    array. A NULL matrix is ignored. */
 static void bpnn_free_matrix(float **w, int rows) {
  int i;
  if (w == NULL) {
    return;
  }
  for (i = 0; i < rows; i++) {
    free(w[i]);
  }
  free(w);
 }
 /*** Release a network and every array it owns.
      net: network to destroy; NULL is accepted and ignored, mirroring
           free(). The pointer must not be used afterwards.
      Weight matrices that are NULL are skipped, so a network whose
      matrix allocation failed can still be destroyed. ***/
 void bpnn_free(net)
 BPNN *net;
 {
  int n1, n2;
  if (net == NULL) {
    return;
  }
  n1 = net->input_n;
  n2 = net->hidden_n;
  free(net->input_units);
  free(net->hidden_units);
  free(net->output_units);
  free(net->hidden_delta);
  free(net->output_delta);
  free(net->target);
  /* Input-side matrices have input_n + 1 rows, hidden-side matrices
     have hidden_n + 1 rows. */
  bpnn_free_matrix(net->input_weights, n1 + 1);
  bpnn_free_matrix(net->input_prev_weights, n1 + 1);
  bpnn_free_matrix(net->hidden_weights, n2 + 1);
  bpnn_free_matrix(net->hidden_prev_weights, n2 + 1);
  free(net);
 }
 /*** Creates a new fully-connected network from scratch,
      with the given numbers of input, hidden, and output units.
      Threshold units are automatically included.
      Input-to-hidden weights are randomized (or zeroed when INITZERO is
      defined), hidden-to-output weights are randomized, both momentum
      (previous-change) matrices are zeroed, and the target vector is
      filled by bpnn_randomize_row().
      Returns the new network, or NULL when the network struct could not
      be allocated.
 ***/
 BPNN *bpnn_create(n_in, n_hidden, n_out)
 int n_in, n_hidden, n_out;
 {
  BPNN *newnet;
  newnet = bpnn_internal_create(n_in, n_hidden, n_out);
  /* Allocation can fail; do not touch the weights through NULL. */
  if (newnet == NULL) {
    return (NULL);
  }
 #ifdef INITZERO
  bpnn_zero_weights(newnet->input_weights, n_in, n_hidden);
 #else
  bpnn_randomize_weights(newnet->input_weights, n_in, n_hidden);
 #endif
  bpnn_randomize_weights(newnet->hidden_weights, n_hidden, n_out);
  bpnn_zero_weights(newnet->input_prev_weights, n_in, n_hidden);
  bpnn_zero_weights(newnet->hidden_prev_weights, n_hidden, n_out);
  bpnn_randomize_row(newnet->target, n_out);
  return (newnet);
 }
 /*** Propagate activations from layer l1 into layer l2.
      l1:   source activations, indices 0..n1; l1[0] is overwritten with 1.0
            (the threshold unit).
      l2:   destination activations; entries 1..n2 are written.
      conn: weights indexed conn[source][destination].
      Each l2[j] becomes squash(sum over k of conn[k][j] * l1[k]). ***/
 void bpnn_layerforward(l1, l2, conn, n1, n2)
 float *l1, *l2, **conn;
 int n1, n2;
 {
  int dst, src;
  float acc;
  /* The threshold unit always fires. */
  l1[0] = 1.0;
 #ifdef OPEN
  omp_set_num_threads(NUM_THREAD);
 #pragma omp parallel for shared(conn, n1, n2, l1) private(src, dst) reduction(+: acc) schedule(static)
 #endif
  for (dst = 1; dst <= n2; dst++) {
    /* Weighted sum over every source unit, threshold included. */
    acc = 0.0;
    for (src = 0; src <= n1; src++) {
      acc += conn[src][dst] * l1[src];
    }
    l2[dst] = squash(acc);
  }
 }
 // extern "C"
 /*** Compute the output-layer deltas and their summed magnitude.
      delta:  receives o * (1 - o) * (t - o) for units 1..nj.
      target: desired outputs; output: actual outputs.
      err:    receives the sum of |delta[j]| over units 1..nj. ***/
 void bpnn_output_error(delta, target, output, nj, err)
 float *delta, *target, *output, *err;
 int nj;
 {
  int unit;
  float actual, wanted, total;
  total = 0.0;
  unit = 1;
  while (unit <= nj) {
    actual = output[unit];
    wanted = target[unit];
    delta[unit] = actual * (1.0 - actual) * (wanted - actual);
    total += ABS(delta[unit]);
    unit++;
  }
  *err = total;
 }
 /*** Compute the hidden-layer deltas and their summed magnitude.
      delta_h: receives h * (1 - h) * sum_k(delta_o[k] * who[j][k]) for
               hidden units j = 1..nh, with k running over 1..no.
      delta_o: output-layer deltas; who: weights indexed [hidden][output].
      hidden:  hidden activations.
      err:     receives the sum of |delta_h[j]| over units 1..nh. ***/
 void bpnn_hidden_error(delta_h, nh, delta_o, no, who, hidden, err)
 float *delta_h, *delta_o, *hidden, **who, *err;
 int nh, no;
 {
  int hid_idx, out_idx;
  float act, back, total;
  total = 0.0;
  for (hid_idx = 1; hid_idx <= nh; hid_idx++) {
    /* Error flowing back into this hidden unit from every output unit. */
    back = 0.0;
    out_idx = 1;
    while (out_idx <= no) {
      back += delta_o[out_idx] * who[hid_idx][out_idx];
      out_idx++;
    }
    act = hidden[hid_idx];
    delta_h[hid_idx] = act * (1.0 - act) * back;
    total += ABS(delta_h[hid_idx]);
  }
  *err = total;
 }
 /*** Apply one gradient step with momentum to a weight matrix.
      delta:  deltas of the destination layer, indices 1..ndelta are read.
      ly:     activations of the source layer, indices 0..nly; ly[0] is
              overwritten with 1.0 (the threshold unit).
      w:      weights indexed w[source][destination], updated in place.
      oldw:   previous weight changes, same indexing; replaced by the
              changes applied here.
      For every pair: dw = ETA * delta[j] * ly[k] + MOMENTUM * oldw[k][j].
      ndelta and nly are now declared explicitly (they relied on the
      implicit-int rule), and the OpenMP clause no longer names the
      nonexistent variable `momentum`. ***/
 void bpnn_adjust_weights(delta, ndelta, ly, nly, w, oldw)
 float *delta, *ly, **w, **oldw;
 int ndelta, nly;
 {
  float new_dw;
  int k, j;
  ly[0] = 1.0;
 #ifdef OPEN
  omp_set_num_threads(NUM_THREAD);
 #pragma omp parallel for shared(oldw, w, delta) private(j, k, new_dw)          \
    firstprivate(ndelta, nly)
 #endif
  for (j = 1; j <= ndelta; j++) {
    for (k = 0; k <= nly; k++) {
      /* Evaluated in double (ETA and MOMENTUM are double literals), then
         narrowed to float on assignment. */
      new_dw = ((ETA * delta[j] * ly[k]) + (MOMENTUM * oldw[k][j]));
      w[k][j] += new_dw;
      oldw[k][j] = new_dw;
    }
  }
 }
 /*** Run one forward pass: input units -> hidden units -> output units.
      net: network whose input_units are already filled in; hidden_units
           and output_units are overwritten. ***/
 void bpnn_feedforward(net)
 BPNN *net;
 {
  const int n_in = net->input_n;
  const int n_hid = net->hidden_n;
  const int n_out = net->output_n;
  /* Input layer into hidden layer. */
  bpnn_layerforward(net->input_units, net->hidden_units, net->input_weights,
                    n_in, n_hid);
  /* Hidden layer into output layer. */
  bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights,
                    n_hid, n_out);
 }
 /*** Run one training step: forward pass, error computation, weight update.
      net: network with input_units and target filled in.
      eo:  receives the summed output-layer error magnitude.
      eh:  receives the summed hidden-layer error magnitude. ***/
 void bpnn_train(net, eo, eh)
 BPNN *net;
 float *eo, *eh;
 {
  float output_error, hidden_error;
  const int n_in = net->input_n;
  const int n_hid = net->hidden_n;
  const int n_out = net->output_n;
  /* Forward pass. */
  bpnn_layerforward(net->input_units, net->hidden_units, net->input_weights,
                    n_in, n_hid);
  bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights,
                    n_hid, n_out);
  /* Deltas for the output layer first, then back to the hidden layer. */
  bpnn_output_error(net->output_delta, net->target, net->output_units, n_out,
                    &output_error);
  bpnn_hidden_error(net->hidden_delta, n_hid, net->output_delta, n_out,
                    net->hidden_weights, net->hidden_units, &hidden_error);
  *eo = output_error;
  *eh = hidden_error;
  /* Update hidden-to-output weights, then input-to-hidden weights. */
  bpnn_adjust_weights(net->output_delta, n_out, net->hidden_units, n_hid,
                      net->hidden_weights, net->hidden_prev_weights);
  bpnn_adjust_weights(net->hidden_delta, n_hid, net->input_units, n_in,
                      net->input_weights, net->input_prev_weights);
 }
 /* Write a rows x cols float matrix to fp, one row per fwrite call.
    Returns 1 on success, 0 if any row was written short. */
 static int bpnn_write_matrix(FILE *fp, float **w, int rows, int cols) {
  int i;
  for (i = 0; i < rows; i++) {
    if (fwrite(w[i], sizeof(float), (size_t)cols, fp) != (size_t)cols) {
      return 0;
    }
  }
  return 1;
 }
 /*** Save a network's layer sizes and weights to a binary file.
      net:      network to save.
      filename: path to create or truncate.
      Layout, in host byte order: input_n, hidden_n, output_n as int, then
      the (input_n+1) x (hidden_n+1) input weights row by row, then the
      (hidden_n+1) x (output_n+1) hidden weights row by row. This is the
      layout bpnn_read() consumes.
      Fixes over the previous version: each layer size is written as a
      full int (only one byte was written before), the weight writes no
      longer pass a byte count as the element count (which read four times
      past the end of the staging buffer), and open/write failures are
      reported instead of being ignored. ***/
 void bpnn_save(net, filename)
 BPNN *net;
 char *filename;
 {
  int n1, n2, n3, ok;
  FILE *pFile;
  pFile = fopen(filename, "wb");
  if (pFile == NULL) {
    printf("BPNN_SAVE: Cannot create '%s'\n", filename);
    return;
  }
  n1 = net->input_n;
  n2 = net->hidden_n;
  n3 = net->output_n;
  printf("Saving %dx%dx%d network to '%s'\n", n1, n2, n3, filename);
  /* Header: the three layer sizes, each as a whole int. */
  ok = fwrite(&n1, sizeof(int), 1, pFile) == 1 &&
       fwrite(&n2, sizeof(int), 1, pFile) == 1 &&
       fwrite(&n3, sizeof(int), 1, pFile) == 1;
  /* Rows are contiguous float arrays, so they can be written directly
     without a staging buffer. */
  ok = ok && bpnn_write_matrix(pFile, net->input_weights, n1 + 1, n2 + 1);
  ok = ok && bpnn_write_matrix(pFile, net->hidden_weights, n2 + 1, n3 + 1);
  /* fclose flushes buffered data, so its result matters as well. */
  if (fclose(pFile) != 0) {
    ok = 0;
  }
  if (!ok) {
    printf("BPNN_SAVE: Error writing '%s'\n", filename);
  }
  return;
 }
 /* Fill a rows x cols float matrix from fp, one row per fread call.
    Assumes every row of w is allocated. Returns 1 on success, 0 on a
    short read. */
 static int bpnn_read_matrix(FILE *fp, float **w, int rows, int cols) {
  int i;
  for (i = 0; i < rows; i++) {
    if (fread(w[i], sizeof(float), (size_t)cols, fp) != (size_t)cols) {
      return 0;
    }
  }
  return 1;
 }
 /*** Load a network from a binary file.
      filename: path to read.
      Expected layout, in host byte order: input_n, hidden_n, output_n as
      int, then the (input_n+1) x (hidden_n+1) input weights row by row,
      then the (hidden_n+1) x (output_n+1) hidden weights row by row.
      Returns a newly allocated network with both momentum matrices
      zeroed, or NULL when the file cannot be opened, its header is
      missing or holds a negative size, the network cannot be allocated,
      or the weight data is truncated.
      Uses stdio (fopen/fread) rather than open/read/close, whose headers
      this file never included; every read is now checked. ***/
 BPNN *bpnn_read(filename)
 char *filename;
 {
  BPNN *net;
  FILE *fp;
  int n1, n2, n3;
  fp = fopen(filename, "rb");
  if (fp == NULL) {
    return (NULL);
  }
  printf("Reading '%s'\n", filename); // fflush(stdout);
  /* The sizes come from the file, so validate them before allocating. */
  if (fread(&n1, sizeof(int), 1, fp) != 1 ||
      fread(&n2, sizeof(int), 1, fp) != 1 ||
      fread(&n3, sizeof(int), 1, fp) != 1 || n1 < 0 || n2 < 0 || n3 < 0) {
    printf("BPNN_READ: '%s' has a missing or invalid header\n", filename);
    fclose(fp);
    return (NULL);
  }
  net = bpnn_internal_create(n1, n2, n3);
  if (net == NULL) {
    fclose(fp);
    return (NULL);
  }
  printf("'%s' contains a %dx%dx%d network\n", filename, n1, n2, n3);
  printf("Reading input weights..."); // fflush(stdout);
  if (!bpnn_read_matrix(fp, net->input_weights, n1 + 1, n2 + 1)) {
    printf("\nBPNN_READ: '%s' is truncated\n", filename);
    fclose(fp);
    bpnn_free(net);
    return (NULL);
  }
  printf("Done\nReading hidden weights..."); // fflush(stdout);
  if (!bpnn_read_matrix(fp, net->hidden_weights, n2 + 1, n3 + 1)) {
    printf("\nBPNN_READ: '%s' is truncated\n", filename);
    fclose(fp);
    bpnn_free(net);
    return (NULL);
  }
  fclose(fp);
  printf("Done\n"); // fflush(stdout);
  /* A freshly loaded network starts with no momentum. */
  bpnn_zero_weights(net->input_prev_weights, n1, n2);
  bpnn_zero_weights(net->hidden_prev_weights, n2, n3);
  return (net);
 }
--- a/examples/backprop/backprop.h
+++ b/examples/backprop/backprop.h
@ -1,50 +0,0 @@
 #ifndef _BACKPROP_H_
 #define _BACKPROP_H_
 /* Divisor drnd() applies to rand() to scale it into a float. */
 #define BIGRND 0x7fffffff
 /* Defined with no value; presumably a build switch for GPU code paths in
    other files -- TODO confirm, no use is visible here. */
 #define GPU
 /* NOTE(review): not referenced in backprop.c; presumably a thread count
    for the CUDA host code -- confirm against the callers. */
 #define THREADS 256
 #define WIDTH 16  // shared memory width
 #define HEIGHT 16 // shared memory height
 /* Learning rate and momentum factor used by bpnn_adjust_weights().
    Both are double literals, so the update is evaluated in double. */
 #define ETA 0.3      // eta value
 #define MOMENTUM 0.3 // momentum value
 /* Passed to omp_set_num_threads() when OPEN is defined. */
 #define NUM_THREAD 4 // OpenMP threads
 /* A three-layer network: input, hidden and output units plus the two
    weight matrices between them.
    bpnn_internal_create() allocates every vector with one slot more than
    its layer size and every matrix with one extra row and column. Index 0
    of a source layer is the threshold unit, which the forward and
    weight-update routines set to 1.0; real units occupy indices 1..n.
    All arrays are owned by the struct and released by bpnn_free(). */
 typedef struct {
  int input_n;  /* number of input units */
  int hidden_n; /* number of hidden units */
  int output_n; /* number of output units */
  float *input_units;  /* the input units, input_n + 1 slots */
  float *hidden_units; /* the hidden units, hidden_n + 1 slots */
  float *output_units; /* the output units, output_n + 1 slots */
  float *hidden_delta; /* storage for hidden unit error */
  float *output_delta; /* storage for output unit error */
  float *target; /* storage for target vector, output_n + 1 slots */
  /* Weight matrices are indexed [source unit][destination unit]. */
  float **input_weights;  /* weights from input to hidden layer */
  float **hidden_weights; /* weights from hidden to output layer */
  /*** The next two are for momentum ***/
  float **input_prev_weights;  /* previous change on input to hidden wgt */
  float **hidden_prev_weights; /* previous change on hidden to output wgt */
 } BPNN;
 /*** User-level functions ***/
 /* These declarations have empty parameter lists (no prototypes), so the
    compiler does not check argument types or counts at call sites; see the
    definitions in backprop.c for the expected arguments. */
 void bpnn_initialize();  /* seed the random number generator */
 BPNN *bpnn_create();     /* allocate and initialize a network */
 void bpnn_free();        /* release a network and its arrays */
 void bpnn_train();       /* forward pass, error computation, weight update */
 void bpnn_feedforward(); /* forward pass only */
 void bpnn_save();        /* write layer sizes and weights to a file */
 BPNN *bpnn_read();       /* load a network from a file */
 #endif
--- a/examples/backprop/backprop_cuda-cuda-nvptx64-nvidia-cuda-sm_61.ll
+++ b/examples/backprop/backprop_cuda-cuda-nvptx64-nvidia-cuda-sm_61.ll
@ -1,615 +0,0 @@
 ; ModuleID = 'backprop_cuda-cuda-nvptx64-nvidia-cuda-sm_61.bc'
 ; Device-side LLVM IR generated from backprop_cuda.cu for the
 ; nvptx64-nvidia-cuda target; not written by hand.
 source_filename = "backprop_cuda.cu"
 target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"
 ; Placeholder types for the CUDA builtin index objects (a single i8 each).
 %struct.__cuda_builtin_blockIdx_t = type { i8 }
 %struct.__cuda_builtin_threadIdx_t = type { i8 }
 %struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
 $_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv = comdat any
 $_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
 $_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv = comdat any
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
 ; Function-local statics of bpnn_layerforward_CUDA placed in addrspace(3)
 ; (presumably NVPTX shared memory -- confirm against the NVPTX docs):
 ; a 16-float input vector and a 16x16 float tile.
@_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE10input_node = internal addrspace(3) global [16 x float] undef, align 4
@_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix = internal addrspace(3) global [16 x [16 x float]] undef, align 4
@"$str" = private addrspace(1) constant [11 x i8] c"__CUDA_FTZ\00"
 ; Weak device-side stub for cudaMalloc: spills its arguments to stack
 ; slots and returns the constant 999 without allocating anything.
 ; NOTE(review): 999 is presumably cudaErrorUnknown -- confirm.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
 entry:
  %p.addr = alloca i8**, align 8
  %s.addr = alloca i64, align 8
  store i8** %p, i8*** %p.addr, align 8
  store i64 %s, i64* %s.addr, align 8
  ret i32 999
 }
 ; Weak device-side stub for cudaFuncGetAttributes: spills its arguments
 ; and returns the constant 999; *%p is never written.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
 entry:
  %p.addr = alloca %struct.cudaFuncAttributes*, align 8
  %c.addr = alloca i8*, align 8
  store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
  store i8* %c, i8** %c.addr, align 8
  ret i32 999
 }
 ; Weak device-side stub for cudaDeviceGetAttribute: spills its arguments
 ; and returns the constant 999; *%value is never written.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
 entry:
  %value.addr = alloca i32*, align 8
  %attr.addr = alloca i32, align 4
  %device.addr = alloca i32, align 4
  store i32* %value, i32** %value.addr, align 8
  store i32 %attr, i32* %attr.addr, align 4
  store i32 %device, i32* %device.addr, align 4
  ret i32 999
 }
 ; Weak device-side stub for cudaGetDevice: spills its argument and
 ; returns the constant 999; *%device is never written.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
 entry:
  %device.addr = alloca i32*, align 8
  store i32* %device, i32** %device.addr, align 8
  ret i32 999
 }
 ; Weak device-side stub for cudaOccupancyMaxActiveBlocksPerMultiprocessor:
 ; spills its arguments and returns the constant 999; *%numBlocks is never
 ; written.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
 entry:
  %numBlocks.addr = alloca i32*, align 8
  %func.addr = alloca i8*, align 8
  %blockSize.addr = alloca i32, align 4
  %dynamicSmemSize.addr = alloca i64, align 8
  store i32* %numBlocks, i32** %numBlocks.addr, align 8
  store i8* %func, i8** %func.addr, align 8
  store i32 %blockSize, i32* %blockSize.addr, align 4
  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
  ret i32 999
 }
 ; Weak device-side stub for
 ; cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags: spills its
 ; arguments and returns the constant 999; *%numBlocks is never written.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
 entry:
  %numBlocks.addr = alloca i32*, align 8
  %func.addr = alloca i8*, align 8
  %blockSize.addr = alloca i32, align 4
  %dynamicSmemSize.addr = alloca i64, align 8
  %flags.addr = alloca i32, align 4
  store i32* %numBlocks, i32** %numBlocks.addr, align 8
  store i8* %func, i8** %func.addr, align 8
  store i32 %blockSize, i32* %blockSize.addr, align 4
  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
  store i32 %flags, i32* %flags.addr, align 4
  ret i32 999
 }
 ; Unoptimized (optnone) IR for the kernel
 ; bpnn_layerforward_CUDA(input_cuda, output_hidden_cuda, input_hidden_cuda,
 ;                        hidden_partial_sum, in, hid).
 ; Each thread multiplies one weight by its input activation in the 16x16
 ; tile @weight_matrix, the tile is then summed along its first (ty) index,
 ; and one value per (by, ty) is stored to hidden_partial_sum.
 ; %output_hidden_cuda and %in are only spilled, never read afterwards.
 ; The tile indices assume tx and ty stay below 16 -- nothing here checks it.
 ; Function Attrs: convergent noinline nounwind optnone
 define dso_local void @_Z22bpnn_layerforward_CUDAPfS_S_S_ii(float* %input_cuda, float* %output_hidden_cuda, float* %input_hidden_cuda, float* %hidden_partial_sum, i32 %in, i32 %hid) #0 {
 entry:
  %input_cuda.addr = alloca float*, align 8
  %output_hidden_cuda.addr = alloca float*, align 8
  %input_hidden_cuda.addr = alloca float*, align 8
  %hidden_partial_sum.addr = alloca float*, align 8
  %in.addr = alloca i32, align 4
  %hid.addr = alloca i32, align 4
  %by = alloca i32, align 4
  %tx = alloca i32, align 4
  %ty = alloca i32, align 4
  %index = alloca i32, align 4
  %index_in = alloca i32, align 4
  %i = alloca i32, align 4
  %power_two = alloca i32, align 4
  store float* %input_cuda, float** %input_cuda.addr, align 8
  store float* %output_hidden_cuda, float** %output_hidden_cuda.addr, align 8
  store float* %input_hidden_cuda, float** %input_hidden_cuda.addr, align 8
  store float* %hidden_partial_sum, float** %hidden_partial_sum.addr, align 8
  store i32 %in, i32* %in.addr, align 4
  store i32 %hid, i32* %hid.addr, align 4
  ; by = blockIdx.y, tx = threadIdx.x, ty = threadIdx.y
  %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #2
  store i32 %call, i32* %by, align 4
  %call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
  store i32 %call1, i32* %tx, align 4
  %call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #2
  store i32 %call2, i32* %ty, align 4
  ; index = (hid + 1) * 16 * by + (hid + 1) * ty + tx + 1 + (hid + 1)
  %0 = load i32, i32* %hid.addr, align 4
  %add = add nsw i32 %0, 1
  %mul = mul nsw i32 %add, 16
  %1 = load i32, i32* %by, align 4
  %mul3 = mul nsw i32 %mul, %1
  %2 = load i32, i32* %hid.addr, align 4
  %add4 = add nsw i32 %2, 1
  %3 = load i32, i32* %ty, align 4
  %mul5 = mul nsw i32 %add4, %3
  %add6 = add nsw i32 %mul3, %mul5
  %4 = load i32, i32* %tx, align 4
  %add7 = add nsw i32 %add6, %4
  %add8 = add nsw i32 %add7, 1
  %5 = load i32, i32* %hid.addr, align 4
  %add9 = add nsw i32 %5, 1
  %add10 = add nsw i32 %add8, %add9
  store i32 %add10, i32* %index, align 4
  ; index_in = 16 * by + ty + 1
  %6 = load i32, i32* %by, align 4
  %mul11 = mul nsw i32 16, %6
  %7 = load i32, i32* %ty, align 4
  %add12 = add nsw i32 %mul11, %7
  %add13 = add nsw i32 %add12, 1
  store i32 %add13, i32* %index_in, align 4
  ; Only threads with tx == 0 load the input activation.
  %8 = load i32, i32* %tx, align 4
  %cmp = icmp eq i32 %8, 0
  br i1 %cmp, label %if.then, label %if.end
 ; input_node[ty] = input_cuda[index_in]
 if.then:                                          ; preds = %entry
  %9 = load float*, float** %input_cuda.addr, align 8
  %10 = load i32, i32* %index_in, align 4
  %idxprom = sext i32 %10 to i64
  %arrayidx = getelementptr inbounds float, float* %9, i64 %idxprom
  %11 = load float, float* %arrayidx, align 4
  %12 = load i32, i32* %ty, align 4
  %idxprom14 = sext i32 %12 to i64
  %arrayidx15 = getelementptr inbounds [16 x float], [16 x float]* addrspacecast ([16 x float] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE10input_node to [16 x float]*), i64 0, i64 %idxprom14
  store float %11, float* %arrayidx15, align 4
  br label %if.end
 ; Barrier, then weight_matrix[ty][tx] = input_hidden_cuda[index];
 ; barrier, then weight_matrix[ty][tx] *= input_node[ty]; barrier.
 if.end:                                           ; preds = %if.then, %entry
  call void @llvm.nvvm.barrier0()
  %13 = load float*, float** %input_hidden_cuda.addr, align 8
  %14 = load i32, i32* %index, align 4
  %idxprom16 = sext i32 %14 to i64
  %arrayidx17 = getelementptr inbounds float, float* %13, i64 %idxprom16
  %15 = load float, float* %arrayidx17, align 4
  %16 = load i32, i32* %ty, align 4
  %idxprom18 = sext i32 %16 to i64
  %arrayidx19 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom18
  %17 = load i32, i32* %tx, align 4
  %idxprom20 = sext i32 %17 to i64
  %arrayidx21 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx19, i64 0, i64 %idxprom20
  store float %15, float* %arrayidx21, align 4
  call void @llvm.nvvm.barrier0()
  %18 = load i32, i32* %ty, align 4
  %idxprom22 = sext i32 %18 to i64
  %arrayidx23 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom22
  %19 = load i32, i32* %tx, align 4
  %idxprom24 = sext i32 %19 to i64
  %arrayidx25 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx23, i64 0, i64 %idxprom24
  %20 = load float, float* %arrayidx25, align 4
  %21 = load i32, i32* %ty, align 4
  %idxprom26 = sext i32 %21 to i64
  %arrayidx27 = getelementptr inbounds [16 x float], [16 x float]* addrspacecast ([16 x float] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE10input_node to [16 x float]*), i64 0, i64 %idxprom26
  %22 = load float, float* %arrayidx27, align 4
  %mul28 = fmul contract float %20, %22
  %23 = load i32, i32* %ty, align 4
  %idxprom29 = sext i32 %23 to i64
  %arrayidx30 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom29
  %24 = load i32, i32* %tx, align 4
  %idxprom31 = sext i32 %24 to i64
  %arrayidx32 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx30, i64 0, i64 %idxprom31
  store float %mul28, float* %arrayidx32, align 4
  call void @llvm.nvvm.barrier0()
  store i32 1, i32* %i, align 4
  br label %for.cond
 ; Loop while (float)i <= __log2f(16.0); the bound is recomputed in floating
 ; point on every iteration.
 for.cond:                                         ; preds = %for.inc, %if.end
  %25 = load i32, i32* %i, align 4
  %conv = sitofp i32 %25 to float
  %call33 = call float @_ZL7__log2ff(float 1.600000e+01) #2
  %cmp34 = fcmp ole float %conv, %call33
  br i1 %cmp34, label %for.body, label %for.end
 ; power_two = (int)__powf(2.0, i); rows with ty % power_two == 0 accumulate.
 for.body:                                         ; preds = %for.cond
  %26 = load i32, i32* %i, align 4
  %conv35 = sitofp i32 %26 to float
  %call36 = call float @_ZL6__powfff(float 2.000000e+00, float %conv35) #2
  %conv37 = fptosi float %call36 to i32
  store i32 %conv37, i32* %power_two, align 4
  %27 = load i32, i32* %ty, align 4
  %28 = load i32, i32* %power_two, align 4
  %rem = srem i32 %27, %28
  %cmp38 = icmp eq i32 %rem, 0
  br i1 %cmp38, label %if.then39, label %if.end54
 ; weight_matrix[ty][tx] += weight_matrix[ty + power_two / 2][tx]
 if.then39:                                        ; preds = %for.body
  %29 = load i32, i32* %ty, align 4
  %idxprom40 = sext i32 %29 to i64
  %arrayidx41 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom40
  %30 = load i32, i32* %tx, align 4
  %idxprom42 = sext i32 %30 to i64
  %arrayidx43 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx41, i64 0, i64 %idxprom42
  %31 = load float, float* %arrayidx43, align 4
  %32 = load i32, i32* %ty, align 4
  %33 = load i32, i32* %power_two, align 4
  %div = sdiv i32 %33, 2
  %add44 = add nsw i32 %32, %div
  %idxprom45 = sext i32 %add44 to i64
  %arrayidx46 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom45
  %34 = load i32, i32* %tx, align 4
  %idxprom47 = sext i32 %34 to i64
  %arrayidx48 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx46, i64 0, i64 %idxprom47
  %35 = load float, float* %arrayidx48, align 4
  %add49 = fadd contract float %31, %35
  %36 = load i32, i32* %ty, align 4
  %idxprom50 = sext i32 %36 to i64
  %arrayidx51 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom50
  %37 = load i32, i32* %tx, align 4
  %idxprom52 = sext i32 %37 to i64
  %arrayidx53 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx51, i64 0, i64 %idxprom52
  store float %add49, float* %arrayidx53, align 4
  br label %if.end54
 ; The barrier sits after the conditional, so every thread reaches it.
 if.end54:                                         ; preds = %if.then39, %for.body
  call void @llvm.nvvm.barrier0()
  br label %for.inc
 for.inc:                                          ; preds = %if.end54
  %38 = load i32, i32* %i, align 4
  %inc = add nsw i32 %38, 1
  store i32 %inc, i32* %i, align 4
  br label %for.cond
 ; input_hidden_cuda[index] = weight_matrix[ty][tx], then barrier.
 for.end:                                          ; preds = %for.cond
  %39 = load i32, i32* %ty, align 4
  %idxprom55 = sext i32 %39 to i64
  %arrayidx56 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom55
  %40 = load i32, i32* %tx, align 4
  %idxprom57 = sext i32 %40 to i64
  %arrayidx58 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx56, i64 0, i64 %idxprom57
  %41 = load float, float* %arrayidx58, align 4
  %42 = load float*, float** %input_hidden_cuda.addr, align 8
  %43 = load i32, i32* %index, align 4
  %idxprom59 = sext i32 %43 to i64
  %arrayidx60 = getelementptr inbounds float, float* %42, i64 %idxprom59
  store float %41, float* %arrayidx60, align 4
  call void @llvm.nvvm.barrier0()
  %44 = load i32, i32* %tx, align 4
  %cmp61 = icmp eq i32 %44, 0
  br i1 %cmp61, label %if.then62, label %if.end71
 ; Threads with tx == 0 store weight_matrix[tx][ty] (note the swapped
 ; indices) to hidden_partial_sum[by * hid + ty].
 if.then62:                                        ; preds = %for.end
  %45 = load i32, i32* %tx, align 4
  %idxprom63 = sext i32 %45 to i64
  %arrayidx64 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom63
  %46 = load i32, i32* %ty, align 4
  %idxprom65 = sext i32 %46 to i64
  %arrayidx66 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx64, i64 0, i64 %idxprom65
  %47 = load float, float* %arrayidx66, align 4
  %48 = load float*, float** %hidden_partial_sum.addr, align 8
  %49 = load i32, i32* %by, align 4
  %50 = load i32, i32* %hid.addr, align 4
  %mul67 = mul nsw i32 %49, %50
  %51 = load i32, i32* %ty, align 4
  %add68 = add nsw i32 %mul67, %51
  %idxprom69 = sext i32 %add68 to i64
  %arrayidx70 = getelementptr inbounds float, float* %48, i64 %idxprom69
  store float %47, float* %arrayidx70, align 4
  br label %if.end71
 if.end71:                                         ; preds = %if.then62, %for.end
  ret void
 }
 ; Accessor behind blockIdx.y: returns the value of the
 ; llvm.nvvm.read.ptx.sreg.ctaid.y intrinsic.
 ; Function Attrs: alwaysinline convergent nounwind
 define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
 entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
  ret i32 %0
 }
 ; Accessor behind threadIdx.x: returns the value of the
 ; llvm.nvvm.read.ptx.sreg.tid.x intrinsic.
 ; Function Attrs: alwaysinline convergent nounwind
 define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
 entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  ret i32 %0
 }
 ; Accessor behind threadIdx.y: returns the value of the
 ; llvm.nvvm.read.ptx.sreg.tid.y intrinsic.
 ; Function Attrs: alwaysinline convergent nounwind
 define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
 entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
  ret i32 %0
 }
 ; Barrier intrinsic called between the tile phases of the layerforward
 ; kernel (presumably what __syncthreads() lowers to -- confirm).
 ; Function Attrs: convergent nounwind
 declare void @llvm.nvvm.barrier0() #2
 ; Internal wrapper for __log2f(float): spills the argument, reloads it and
 ; forwards it to __nv_fast_log2f.
 ; Function Attrs: alwaysinline convergent nounwind
 define internal float @_ZL7__log2ff(float %__a) #1 {
 entry:
  %__a.addr = alloca float, align 4
  store float %__a, float* %__a.addr, align 4
  %0 = load float, float* %__a.addr, align 4
  %call = call float @__nv_fast_log2f(float %0) #2
  ret float %call
 }
 ; Internal wrapper for __powf(float, float): spills both arguments,
 ; reloads them and forwards them to __nv_fast_powf.
 ; Function Attrs: alwaysinline convergent nounwind
 define internal float @_ZL6__powfff(float %__a, float %__b) #1 {
 entry:
  %__a.addr = alloca float, align 4
  %__b.addr = alloca float, align 4
  store float %__a, float* %__a.addr, align 4
  store float %__b, float* %__b.addr, align 4
  %0 = load float, float* %__a.addr, align 4
  %1 = load float, float* %__b.addr, align 4
  %call = call float @__nv_fast_powf(float %0, float %1) #2
  ret float %call
 }
 ; Function Attrs: convergent noinline nounwind optnone
 define dso_local void @_Z24bpnn_adjust_weights_cudaPfiS_iS_S_(float* %delta, i32 %hid, float* %ly, i32 %in, float* %w, float* %oldw) #0 {
 entry:
  %delta.addr = alloca float*, align 8
  %hid.addr = alloca i32, align 4
  %ly.addr = alloca float*, align 8
  %in.addr = alloca i32, align 4
  %w.addr = alloca float*, align 8
  %oldw.addr = alloca float*, align 8
  %by = alloca i32, align 4
  %tx = alloca i32, align 4
  %ty = alloca i32, align 4
  %index = alloca i32, align 4
  %index_y = alloca i32, align 4
  %index_x = alloca i32, align 4
  store float* %delta, float** %delta.addr, align 8
  store i32 %hid, i32* %hid.addr, align 4
  store float* %ly, float** %ly.addr, align 8
  store i32 %in, i32* %in.addr, align 4
  store float* %w, float** %w.addr, align 8
  store float* %oldw, float** %oldw.addr, align 8
  %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #2
  store i32 %call, i32* %by, align 4
  %call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
  store i32 %call1, i32* %tx, align 4
  %call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #2
  store i32 %call2, i32* %ty, align 4
  %0 = load i32, i32* %hid.addr, align 4
  %add = add nsw i32 %0, 1
  %mul = mul nsw i32 %add, 16
  %1 = load i32, i32* %by, align 4
  %mul3 = mul nsw i32 %mul, %1
  %2 = load i32, i32* %hid.addr, align 4
  %add4 = add nsw i32 %2, 1
  %3 = load i32, i32* %ty, align 4
  %mul5 = mul nsw i32 %add4, %3
  %add6 = add nsw i32 %mul3, %mul5
  %4 = load i32, i32* %tx, align 4
  %add7 = add nsw i32 %add6, %4
  %add8 = add nsw i32 %add7, 1
  %5 = load i32, i32* %hid.addr, align 4
  %add9 = add nsw i32 %5, 1
  %add10 = add nsw i32 %add8, %add9
  store i32 %add10, i32* %index, align 4
  %6 = load i32, i32* %by, align 4
  %mul11 = mul nsw i32 16, %6
  %7 = load i32, i32* %ty, align 4
  %add12 = add nsw i32 %mul11, %7
  %add13 = add nsw i32 %add12, 1
  store i32 %add13, i32* %index_y, align 4
  %8 = load i32, i32* %tx, align 4
  %add14 = add nsw i32 %8, 1
  store i32 %add14, i32* %index_x, align 4
  %9 = load float*, float** %delta.addr, align 8
  %10 = load i32, i32* %index_x, align 4
  %idxprom = sext i32 %10 to i64
  %arrayidx = getelementptr inbounds float, float* %9, i64 %idxprom
  %11 = load float, float* %arrayidx, align 4
  %conv = fpext float %11 to double
  %mul15 = fmul contract double 3.000000e-01, %conv
  %12 = load float*, float** %ly.addr, align 8
  %13 = load i32, i32* %index_y, align 4
  %idxprom16 = sext i32 %13 to i64
  %arrayidx17 = getelementptr inbounds float, float* %12, i64 %idxprom16
  %14 = load float, float* %arrayidx17, align 4
  %conv18 = fpext float %14 to double
  %mul19 = fmul contract double %mul15, %conv18
  %15 = load float*, float** %oldw.addr, align 8
  %16 = load i32, i32* %index, align 4
  %idxprom20 = sext i32 %16 to i64
  %arrayidx21 = getelementptr inbounds float, float* %15, i64 %idxprom20
  %17 = load float, float* %arrayidx21, align 4
  %conv22 = fpext float %17 to double
  %mul23 = fmul contract double 3.000000e-01, %conv22
  %add24 = fadd contract double %mul19, %mul23
  %18 = load float*, float** %w.addr, align 8
  %19 = load i32, i32* %index, align 4
  %idxprom25 = sext i32 %19 to i64
  %arrayidx26 = getelementptr inbounds float, float* %18, i64 %idxprom25
  %20 = load float, float* %arrayidx26, align 4
  %conv27 = fpext float %20 to double
  %add28 = fadd contract double %conv27, %add24
  %conv29 = fptrunc double %add28 to float
  store float %conv29, float* %arrayidx26, align 4
  %21 = load float*, float** %delta.addr, align 8
  %22 = load i32, i32* %index_x, align 4
  %idxprom30 = sext i32 %22 to i64
  %arrayidx31 = getelementptr inbounds float, float* %21, i64 %idxprom30
  %23 = load float, float* %arrayidx31, align 4
  %conv32 = fpext float %23 to double
  %mul33 = fmul contract double 3.000000e-01, %conv32
  %24 = load float*, float** %ly.addr, align 8
  %25 = load i32, i32* %index_y, align 4
  %idxprom34 = sext i32 %25 to i64
  %arrayidx35 = getelementptr inbounds float, float* %24, i64 %idxprom34
  %26 = load float, float* %arrayidx35, align 4
  %conv36 = fpext float %26 to double
  %mul37 = fmul contract double %mul33, %conv36
  %27 = load float*, float** %oldw.addr, align 8
  %28 = load i32, i32* %index, align 4
  %idxprom38 = sext i32 %28 to i64
  %arrayidx39 = getelementptr inbounds float, float* %27, i64 %idxprom38
  %29 = load float, float* %arrayidx39, align 4
  %conv40 = fpext float %29 to double
  %mul41 = fmul contract double 3.000000e-01, %conv40
  %add42 = fadd contract double %mul37, %mul41
  %conv43 = fptrunc double %add42 to float
  %30 = load float*, float** %oldw.addr, align 8
  %31 = load i32, i32* %index, align 4
  %idxprom44 = sext i32 %31 to i64
  %arrayidx45 = getelementptr inbounds float, float* %30, i64 %idxprom44
  store float %conv43, float* %arrayidx45, align 4
  call void @llvm.nvvm.barrier0()
  %32 = load i32, i32* %ty, align 4
  %cmp = icmp eq i32 %32, 0
  br i1 %cmp, label %land.lhs.true, label %if.end
 land.lhs.true:                                    ; preds = %entry
  %33 = load i32, i32* %by, align 4
  %cmp46 = icmp eq i32 %33, 0
  br i1 %cmp46, label %if.then, label %if.end
 if.then:                                          ; preds = %land.lhs.true
  %34 = load float*, float** %delta.addr, align 8
  %35 = load i32, i32* %index_x, align 4
  %idxprom47 = sext i32 %35 to i64
  %arrayidx48 = getelementptr inbounds float, float* %34, i64 %idxprom47
  %36 = load float, float* %arrayidx48, align 4
  %conv49 = fpext float %36 to double
  %mul50 = fmul contract double 3.000000e-01, %conv49
  %37 = load float*, float** %oldw.addr, align 8
  %38 = load i32, i32* %index_x, align 4
  %idxprom51 = sext i32 %38 to i64
  %arrayidx52 = getelementptr inbounds float, float* %37, i64 %idxprom51
  %39 = load float, float* %arrayidx52, align 4
  %conv53 = fpext float %39 to double
  %mul54 = fmul contract double 3.000000e-01, %conv53
  %add55 = fadd contract double %mul50, %mul54
  %40 = load float*, float** %w.addr, align 8
  %41 = load i32, i32* %index_x, align 4
  %idxprom56 = sext i32 %41 to i64
  %arrayidx57 = getelementptr inbounds float, float* %40, i64 %idxprom56
  %42 = load float, float* %arrayidx57, align 4
  %conv58 = fpext float %42 to double
  %add59 = fadd contract double %conv58, %add55
  %conv60 = fptrunc double %add59 to float
  store float %conv60, float* %arrayidx57, align 4
  %43 = load float*, float** %delta.addr, align 8
  %44 = load i32, i32* %index_x, align 4
  %idxprom61 = sext i32 %44 to i64
  %arrayidx62 = getelementptr inbounds float, float* %43, i64 %idxprom61
  %45 = load float, float* %arrayidx62, align 4
  %conv63 = fpext float %45 to double
  %mul64 = fmul contract double 3.000000e-01, %conv63
  %46 = load float*, float** %oldw.addr, align 8
  %47 = load i32, i32* %index_x, align 4
  %idxprom65 = sext i32 %47 to i64
  %arrayidx66 = getelementptr inbounds float, float* %46, i64 %idxprom65
  %48 = load float, float* %arrayidx66, align 4
  %conv67 = fpext float %48 to double
  %mul68 = fmul contract double 3.000000e-01, %conv67
  %add69 = fadd contract double %mul64, %mul68
  %conv70 = fptrunc double %add69 to float
  %49 = load float*, float** %oldw.addr, align 8
  %50 = load i32, i32* %index_x, align 4
  %idxprom71 = sext i32 %50 to i64
  %arrayidx72 = getelementptr inbounds float, float* %49, i64 %idxprom71
  store float %conv70, float* %arrayidx72, align 4
  br label %if.end
 if.end:                                           ; preds = %if.then, %land.lhs.true, %entry
  ret void
 }
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #3
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #3
 ; Function Attrs: alwaysinline convergent inlinehint nounwind
 ; Approximate base-2 logarithm of %a.
 ; Queries __nvvm_reflect with the 11-byte string @"$str" and picks the
 ; flush-to-zero variant of lg2.approx when the answer is non-zero, otherwise
 ; the non-ftz variant.
 ; NOTE(review): @"$str" is defined outside this view; presumably it is the
 ; "__CUDA_FTZ" query string -- confirm against the module's globals.
 define internal float @__nv_fast_log2f(float %a) #4 {
  %call.i = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*))
  %1 = icmp ne i32 %call.i, 0
  br i1 %1, label %2, label %4
 ; reflect != 0: flush-to-zero approximation
 2:                                                ; preds = %0
  %3 = call float @llvm.nvvm.lg2.approx.ftz.f(float %a)
  br label %__nvvm_builtin_log2f.exit
 ; reflect == 0: non-ftz approximation
 4:                                                ; preds = %0
  %5 = call float @llvm.nvvm.lg2.approx.f(float %a)
  br label %__nvvm_builtin_log2f.exit
 ; merge the result of whichever variant ran
 __nvvm_builtin_log2f.exit:                        ; preds = %4, %2
  %retval.0.i = phi float [ %3, %2 ], [ %5, %4 ]
  ret float %retval.0.i
 }
 ; Function Attrs: convergent nounwind
 declare i32 @__nvvm_reflect(i8*) #5
 ; Function Attrs: nounwind readnone
 declare float @llvm.nvvm.lg2.approx.ftz.f(float) #3
 ; Function Attrs: nounwind readnone
 declare float @llvm.nvvm.lg2.approx.f(float) #3
 ; Function Attrs: alwaysinline convergent inlinehint nounwind
 ; Approximate %a raised to %b, computed as ex2(%b * lg2(%a)).
 ; Both the lg2 and the ex2 step independently query __nvvm_reflect with
 ; @"$str" and use the .ftz intrinsic when the answer is non-zero, otherwise
 ; the non-ftz intrinsic.
 ; NOTE(review): @"$str" is defined outside this view; presumably it is the
 ; "__CUDA_FTZ" query string -- confirm against the module's globals.
 define internal float @__nv_fast_powf(float %a, float %b) #4 {
  %call.i.i = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*))
  %1 = icmp ne i32 %call.i.i, 0
  br i1 %1, label %2, label %4
 ; step 1: lg2(%a), ftz variant
 2:                                                ; preds = %0
  %3 = call float @llvm.nvvm.lg2.approx.ftz.f(float %a)
  br label %__nv_fast_log2f.exit
 ; step 1: lg2(%a), non-ftz variant
 4:                                                ; preds = %0
  %5 = call float @llvm.nvvm.lg2.approx.f(float %a)
  br label %__nv_fast_log2f.exit
 ; step 2: scale the logarithm by the exponent %b, then re-query reflect
 __nv_fast_log2f.exit:                             ; preds = %4, %2
  %retval.0.i.i = phi float [ %3, %2 ], [ %5, %4 ]
  %6 = fmul float %b, %retval.0.i.i
  %call.i.i1 = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*))
  %7 = icmp ne i32 %call.i.i1, 0
  br i1 %7, label %8, label %10
 ; step 3: ex2 of the product, ftz variant
 8:                                                ; preds = %__nv_fast_log2f.exit
  %9 = call float @llvm.nvvm.ex2.approx.ftz.f(float %6)
  br label %__nv_exp2f.exit
 ; step 3: ex2 of the product, non-ftz variant
 10:                                               ; preds = %__nv_fast_log2f.exit
  %11 = call float @llvm.nvvm.ex2.approx.f(float %6)
  br label %__nv_exp2f.exit
 ; merge and return the exponentiation result
 __nv_exp2f.exit:                                  ; preds = %10, %8
  %retval.0.i.i2 = phi float [ %9, %8 ], [ %11, %10 ]
  ret float %retval.0.i.i2
 }
 ; Function Attrs: nounwind readnone
 declare float @llvm.nvvm.ex2.approx.ftz.f(float) #3
 ; Function Attrs: nounwind readnone
 declare float @llvm.nvvm.ex2.approx.f(float) #3
 attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #2 = { convergent nounwind }
 attributes #3 = { nounwind readnone }
 attributes #4 = { alwaysinline convergent inlinehint nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #5 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 !llvm.module.flags = !{!0, !1, !2}
 !nvvm.annotations = !{!3, !4, !5, !6, !5, !7, !7, !7, !7, !8, !8, !7}
 !llvm.ident = !{!9}
 !nvvmir.version = !{!10}
 !0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
 !1 = !{i32 1, !"wchar_size", i32 4}
 !2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
 !3 = !{void (float*, float*, float*, float*, i32, i32)* @_Z22bpnn_layerforward_CUDAPfS_S_S_ii, !"kernel", i32 1}
 !4 = !{void (float*, i32, float*, i32, float*, float*)* @_Z24bpnn_adjust_weights_cudaPfiS_iS_S_, !"kernel", i32 1}
 !5 = !{null, !"align", i32 8}
 !6 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
 !7 = !{null, !"align", i32 16}
 !8 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
 !9 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
 !10 = !{i32 1, i32 4}
--- a/examples/backprop/backprop_cuda-host-x86_64-unknown-linux-gnu.ll
+++ b/examples/backprop/backprop_cuda-host-x86_64-unknown-linux-gnu.ll
--- a/examples/backprop/backprop_cuda.cu
+++ b/examples/backprop/backprop_cuda.cu
@ -1,195 +0,0 @@
 #include <cuda.h>
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/time.h>
 // includes, kernels
 #include "backprop.h"
 #include "backprop_cuda_kernel.cu"
 ////////////////////////////////////////////////////////////////////////////////
 extern "C" void bpnn_layerforward(float *l1, float *l2, float **conn, int n1,
                                  int n2);
 extern "C" void bpnn_output_error(float *delta, float *target, float *output,
                                  int nj, float *err);
 extern "C" void bpnn_hidden_error(float *delta_h, int nh, float *delta_o,
                                  int no, float **who, float *hidden,
                                  float *err);
 extern "C" void bpnn_adjust_weights(float *delta, int ndelta, float *ly,
                                    int nly, float **w, float **oldw);
 extern "C" int setup(int argc, char **argv);
 extern "C" float **alloc_2d_dbl(int m, int n);
 extern "C" float squash(float x);
 // Returns the current wall-clock time, as reported by gettimeofday(), in
 // seconds with the microsecond field folded in as the fractional part.
 double gettime() {
  struct timeval now;
  gettimeofday(&now, NULL);
  const double whole_seconds = now.tv_sec;
  const double fraction = now.tv_usec * 1e-6;
  return whole_seconds + fraction;
 }
 unsigned int num_threads = 0;
 unsigned int num_blocks = 0;
 ////////////////////////////////////////////////////////////////////////////////
 // Program main
 ////////////////////////////////////////////////////////////////////////////////
 int main(int argc, char **argv) {
  cudaSetDevice(0);
  setup(argc, argv);
 }
 #ifdef GPU
 // Aborts the program with a diagnostic when a CUDA runtime call or kernel
 // launch reported an error. `what` names the failing stage in the message.
 static void bpnn_cuda_check(cudaError_t status, const char *what) {
  if (status != cudaSuccess) {
    printf("bpnn %s error: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
 }
 // Allocates `count` floats on the host, aborting if the allocation fails.
 // A zero-sized request is allowed to yield NULL.
 static float *bpnn_host_floats(size_t count) {
  float *buf = (float *)malloc(count * sizeof(float));
  if (buf == NULL && count != 0) {
    printf("bpnn host allocation of %lu floats failed\n",
           (unsigned long)count);
    exit(EXIT_FAILURE);
  }
  return buf;
 }
 #endif
 // Runs one training iteration on `net`.
 //
 // The input->hidden forward pass and the input->hidden weight adjustment run
 // on the device when GPU is defined and on the host when CPU is defined; the
 // hidden->output pass, both error computations and the hidden->output weight
 // adjustment always run on the host.
 //
 //   net - network to train (read and updated in place)
 //   eo  - if non-NULL, receives the output-layer error from bpnn_output_error
 //   eh  - if non-NULL, receives the hidden-layer error from bpnn_hidden_error
 //
 // In the GPU build the adjusted input->hidden weights are printed to stdout
 // (space separated, one line); they are not copied back into
 // net->input_weights.
 extern "C" void bpnn_train_cuda(BPNN *net, float *eo, float *eh) {
  int in, hid, out;
  float out_err, hid_err;
  in = net->input_n;
  hid = net->hidden_n;
  out = net->output_n;
 #ifdef GPU
  int m = 0;
  float *input_hidden_cuda;
  float *input_cuda;
  float *output_hidden_cuda;
  float *partial_sum;
  float *hidden_partial_sum;
  float *hidden_delta_cuda;
  float *input_prev_weights_cuda;
  float sum;
  float *input_weights_one_dim;
  float *input_weights_prev_one_dim;
  // Do the size arithmetic in size_t so large layers cannot overflow int.
  const size_t weight_count = (size_t)(in + 1) * (size_t)(hid + 1);
  const size_t weight_bytes = weight_count * sizeof(float);
  const size_t input_bytes = (size_t)(in + 1) * sizeof(float);
  const size_t hidden_bytes = (size_t)(hid + 1) * sizeof(float);
  num_blocks = in / 16;
  const size_t partial_count = (size_t)num_blocks * WIDTH;
  const size_t partial_bytes = partial_count * sizeof(float);
  dim3 grid(1, num_blocks);
  dim3 threads(16, 16);
  input_weights_one_dim = bpnn_host_floats(weight_count);
  input_weights_prev_one_dim = bpnn_host_floats(weight_count);
  partial_sum = bpnn_host_floats(partial_count);
  // this preprocessing stage is added to correct the bugs of wrong memcopy
  // using two-dimensional net->inputweights
  for (int k = 0; k <= in; k++) {
    for (int j = 0; j <= hid; j++) {
      input_weights_one_dim[m] = net->input_weights[k][j];
      input_weights_prev_one_dim[m] = net->input_prev_weights[k][j];
      m++;
    }
  }
  bpnn_cuda_check(cudaMalloc((void **)&input_cuda, input_bytes), "malloc");
  bpnn_cuda_check(cudaMalloc((void **)&output_hidden_cuda, hidden_bytes),
                  "malloc");
  bpnn_cuda_check(cudaMalloc((void **)&input_hidden_cuda, weight_bytes),
                  "malloc");
  bpnn_cuda_check(cudaMalloc((void **)&hidden_partial_sum, partial_bytes),
                  "malloc");
 #endif
 #ifdef CPU
  printf("Performing CPU computation\n");
  bpnn_layerforward(net->input_units, net->hidden_units, net->input_weights, in,
                    hid);
 #endif
 #ifdef GPU
  printf("Performing GPU computation\n");
  // printf("in= %d, hid = %d, numblocks = %d\n", in, hid, num_blocks);
  bpnn_cuda_check(cudaMemcpy(input_cuda, net->input_units, input_bytes,
                             cudaMemcpyHostToDevice),
                  "memcpy");
  bpnn_cuda_check(cudaMemcpy(input_hidden_cuda, input_weights_one_dim,
                             weight_bytes, cudaMemcpyHostToDevice),
                  "memcpy");
  bpnn_layerforward_CUDA<<<grid, threads>>>(input_cuda, output_hidden_cuda,
                                            input_hidden_cuda,
                                            hidden_partial_sum, in, hid);
  // cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is its
  // documented replacement. The sync result is deliberately not checked here
  // so the last-error query below reports failures as before.
  cudaDeviceSynchronize();
  bpnn_cuda_check(cudaGetLastError(), "kernel");
  bpnn_cuda_check(cudaMemcpy(partial_sum, hidden_partial_sum, partial_bytes,
                             cudaMemcpyDeviceToHost),
                  "memcpy");
  // Finish the reduction on the host: add the per-block partial sums and the
  // bias weight, then apply the sigmoid.
  for (int j = 1; j <= hid; j++) {
    sum = 0.0;
    for (unsigned int k = 0; k < num_blocks; k++) {
      sum += partial_sum[k * hid + j - 1];
    }
    sum += net->input_weights[0][j];
    net->hidden_units[j] = float(1.0 / (1.0 + exp(-sum)));
  }
 #endif
  bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights,
                    hid, out);
  bpnn_output_error(net->output_delta, net->target, net->output_units, out,
                    &out_err);
  bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out,
                    net->hidden_weights, net->hidden_units, &hid_err);
  bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid,
                      net->hidden_weights, net->hidden_prev_weights);
  // Hand the computed errors back to the caller; previously they were
  // computed and then silently dropped.
  if (eo != NULL) {
    *eo = out_err;
  }
  if (eh != NULL) {
    *eh = hid_err;
  }
 #ifdef CPU
  bpnn_adjust_weights(net->hidden_delta, hid, net->input_units, in,
                      net->input_weights, net->input_prev_weights);
 #endif
 #ifdef GPU
  bpnn_cuda_check(cudaMalloc((void **)&hidden_delta_cuda, hidden_bytes),
                  "malloc");
  bpnn_cuda_check(cudaMalloc((void **)&input_prev_weights_cuda, weight_bytes),
                  "malloc");
  bpnn_cuda_check(cudaMemcpy(hidden_delta_cuda, net->hidden_delta,
                             hidden_bytes, cudaMemcpyHostToDevice),
                  "memcpy");
  bpnn_cuda_check(cudaMemcpy(input_prev_weights_cuda,
                             input_weights_prev_one_dim, weight_bytes,
                             cudaMemcpyHostToDevice),
                  "memcpy");
  // Re-upload the untouched weights: the forward kernel overwrote the device
  // copy with its intermediate products.
  bpnn_cuda_check(cudaMemcpy(input_hidden_cuda, input_weights_one_dim,
                             weight_bytes, cudaMemcpyHostToDevice),
                  "memcpy");
  bpnn_adjust_weights_cuda<<<grid, threads>>>(hidden_delta_cuda, hid,
                                              input_cuda, in, input_hidden_cuda,
                                              input_prev_weights_cuda);
  bpnn_cuda_check(cudaGetLastError(), "kernel");
  bpnn_cuda_check(cudaMemcpy(net->input_units, input_cuda, input_bytes,
                             cudaMemcpyDeviceToHost),
                  "memcpy");
  bpnn_cuda_check(cudaMemcpy(input_weights_one_dim, input_hidden_cuda,
                             weight_bytes, cudaMemcpyDeviceToHost),
                  "memcpy");
  for (size_t i = 0; i < weight_count; i++) {
    printf("%f ", input_weights_one_dim[i]);
  }
  printf("\n");
  cudaFree(input_cuda);
  cudaFree(output_hidden_cuda);
  cudaFree(input_hidden_cuda);
  cudaFree(hidden_partial_sum);
  cudaFree(input_prev_weights_cuda);
  cudaFree(hidden_delta_cuda);
  free(partial_sum);
  free(input_weights_one_dim);
  free(input_weights_prev_one_dim);
 #endif
 }
--- a/examples/backprop/backprop_cuda_kernel.cu
+++ b/examples/backprop/backprop_cuda_kernel.cu
@ -1,96 +0,0 @@
 #ifndef _BACKPROP_CUDA_KERNEL_H_
 #define _BACKPROP_CUDA_KERNEL_H_
 #include "backprop.h"
 #include "cuda.h"
 #include "math.h"
 #include <stdio.h>
 // Input->hidden forward pass: each block multiplies a HEIGHT x WIDTH tile of
 // the weight matrix by the matching input activations and reduces the tile
 // along its rows, producing one partial sum per tile column.
 //
 // Launch layout used by the host in this project: dim3 threads(16, 16) and
 // dim3 grid(1, in / 16); blockIdx.y selects the tile.
 // NOTE(review): this assumes HEIGHT and WIDTH (from backprop.h) both equal
 // the 16x16 block shape -- confirm against the header.
 //
 //   input_cuda          input activations; element 0 is skipped (indices
 //                       start at 1)
 //   output_hidden_cuda  not referenced by this kernel
 //   input_hidden_cuda   flattened weights with row stride (hid + 1); each
 //                       thread's element is overwritten with the value left
 //                       in its shared-memory slot after the reduction
 //   hidden_partial_sum  receives the per-block column sums at
 //                       [by * hid + column]
 //   in                  not referenced by this kernel
 //   hid                 number of hidden units
 __global__ void bpnn_layerforward_CUDA(float *input_cuda,
                                        float *output_hidden_cuda,
                                        float *input_hidden_cuda,
                                        float *hidden_partial_sum, int in,
                                        int hid) {
  int by = blockIdx.y;
  int tx = threadIdx.x;
  int ty = threadIdx.y;
  // Skip the first weight row and the first column of every row.
  int index = (hid + 1) * HEIGHT * by + (hid + 1) * ty + tx + 1 + (hid + 1);
  int index_in = HEIGHT * by + ty + 1;
  __shared__ float input_node[HEIGHT];
  __shared__ float weight_matrix[HEIGHT][WIDTH];
  // One thread per tile row stages that row's input activation.
  if (tx == 0)
    input_node[ty] = input_cuda[index_in];
  __syncthreads();
  weight_matrix[ty][tx] = input_hidden_cuda[index];
  __syncthreads();
  weight_matrix[ty][tx] = weight_matrix[ty][tx] * input_node[ty];
  __syncthreads();
  // Tree reduction down the rows. The stride doubles exactly in integer
  // arithmetic (2, 4, ..., <= HEIGHT); the previous code derived it from the
  // approximate __powf/__log2f intrinsics truncated to int, which can be off
  // by one whenever the approximation lands just below the exact value.
  for (int stride = 2; stride <= HEIGHT; stride <<= 1) {
    if (ty % stride == 0)
      weight_matrix[ty][tx] =
          weight_matrix[ty][tx] + weight_matrix[ty + stride / 2][tx];
    __syncthreads();
  }
  input_hidden_cuda[index] = weight_matrix[ty][tx];
  __syncthreads();
  // Row 0 now holds the column sums; the tx == 0 threads publish them, with
  // ty selecting the column.
  if (tx == 0) {
    hidden_partial_sum[by * hid + ty] = weight_matrix[tx][ty];
  }
 }
 // Applies one momentum update to the flattened weight matrix `w` and records
 // the applied step in `oldw`. Each thread handles one weight outside the
 // first row and first column; the threads with threadIdx.y == 0 in block
 // row 0 additionally update the first-row entry for their column.
 //
 //   delta  per-column error terms, read at [threadIdx.x + 1]
 //   hid    row stride of w/oldw is (hid + 1)
 //   ly     per-row activations, read at [HEIGHT * blockIdx.y + threadIdx.y + 1]
 //   in     not referenced by this kernel
 //   w      weights, updated in place
 //   oldw   previous steps on entry, the steps just applied on exit
 // NOTE(review): assumes w, oldw, delta and ly do not alias one another, as
 // in the host caller -- confirm for any new caller.
 __global__ void bpnn_adjust_weights_cuda(float *delta, int hid, float *ly,
                                          int in, float *w, float *oldw) {
  const int block_row = blockIdx.y;
  const int col = threadIdx.x;
  const int row = threadIdx.y;
  const int weight_at =
      (hid + 1) * HEIGHT * block_row + (hid + 1) * row + col + 1 + (hid + 1);
  const int layer_at = HEIGHT * block_row + row + 1;
  const int delta_at = col + 1;
  // step = learning-rate term + momentum term; `auto` keeps whatever
  // arithmetic type the ETA / MOMENTUM macros produce.
  const auto step =
      ((ETA * delta[delta_at] * ly[layer_at]) + (MOMENTUM * oldw[weight_at]));
  w[weight_at] += step;
  oldw[weight_at] = step;
  __syncthreads();
  if (row == 0 && block_row == 0) {
    // First-row entries carry no ly factor.
    const auto first_row_step =
        ((ETA * delta[delta_at]) + (MOMENTUM * oldw[delta_at]));
    w[delta_at] += first_row_step;
    oldw[delta_at] = first_row_step;
  }
 }
 #endif
--- a/examples/backprop/facetrain.c
+++ b/examples/backprop/facetrain.c
@ -1,48 +0,0 @@
 #include "backprop.h"
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 extern char *strcpy();
 extern void exit();
 int layer_size = 0;
 /* Creates a network with `layer_size` inputs, 16 hidden units and 1 output,
  * loads its input units, runs a single training iteration and frees it.
  * Returns 0 on success, 1 if the network could not be created. */
 int backprop_face(void) {
  BPNN *net;
  float out_err, hid_err;
  net = bpnn_create(layer_size, 16, 1); // (16, 1 can not be changed)
  /* NOTE(review): assumes bpnn_create reports failure by returning NULL --
   * confirm against backprop.c. */
  if (net == NULL) {
    fprintf(stderr, "Could not create the network\n");
    return 1;
  }
  printf("Input layer size : %d\n", layer_size);
  load(net);
  // entering the training kernel, only one iteration
  printf("Starting training kernel\n");
  bpnn_train_cuda(net, &out_err, &hid_err);
  bpnn_free(net);
  printf("Training done\n");
  return 0;
 }
 /* Parses the command line, seeds the network library and runs one training
  * pass.
  *
  *   argc, argv - as passed to main; argv[1] is the number of input elements,
  *                which must be a positive multiple of 16.
  *
  * Never returns: the process exits with status 0 after training, or with
  * EXIT_FAILURE when the arguments are invalid. */
 int setup(int argc, char *argv[]) {
  int seed;
  if (argc != 2) {
    fprintf(stderr, "usage: backprop <num of input elements>\n");
    /* A usage error must not look like success to `set -e` scripts. */
    exit(EXIT_FAILURE);
  }
  layer_size = atoi(argv[1]);
  /* atoi yields 0 for non-numeric text, and 0 would pass the divisibility
   * test, so reject non-positive sizes explicitly. */
  if (layer_size <= 0 || layer_size % 16 != 0) {
    fprintf(stderr,
            "The number of input points must be a positive multiple of 16\n");
    exit(EXIT_FAILURE);
  }
  seed = 7;
  bpnn_initialize(seed);
  backprop_face();
  exit(0);
 }
--- a/examples/backprop/imagenet.c
+++ b/examples/backprop/imagenet.c
@ -1,22 +0,0 @@
 #include "backprop.h"
 #include <stdio.h>
 #include <stdlib.h>
 extern int layer_size;
 /* Fills input units 1..layer_size of `net` with pseudo-random values in
  * [0, 1] drawn from rand(); unit 0 is left untouched. Always returns 0. */
 int load(BPNN *net) {
  float *units = net->input_units;
  int i;
  for (i = 1; i <= layer_size; i++) {
    units[i] = (float)rand() / RAND_MAX;
  }
  return 0;
 }
--- a/examples/backprop/run.sh
+++ b/examples/backprop/run.sh
@ -1,28 +0,0 @@
 #!/bin/bash
 # Builds the backprop demo from the C sources and the pre-generated CUDA
 # host/device IR, runs it with 1024 inputs, and checks the printed weights.
 # Must be run from this directory: all paths are relative.
 set -e
 # Plain C sources -> LLVM bitcode.
 for c_source in backprop.c facetrain.c imagenet.c; do
   clang -c -emit-llvm "$c_source"
 done
 # Textual IR -> bitcode for the device and host halves of backprop_cuda.
 for ir_text in backprop_cuda-cuda-nvptx64-nvidia-cuda-sm_61.ll \
                backprop_cuda-host-x86_64-unknown-linux-gnu.ll; do
   llvm-as "$ir_text"
 done
 # Translate both halves for the x86 runtime.
 ../../build/compilation/kernelTranslator backprop_cuda-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
 ../../build/compilation/hostTranslator backprop_cuda-host-x86_64-unknown-linux-gnu.bc host.bc
 # Bitcode -> position-independent object files.
 for bitcode in kernel.bc host.bc backprop.bc facetrain.bc imagenet.bc; do
   llc --relocation-model=pic --filetype=obj "$bitcode"
 done
 export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH
 g++ -Wall -L../../build/runtime -L../../build/runtime/threadPool -o demo \
    -fPIC -no-pie host.o kernel.o backprop.o facetrain.o imagenet.o \
    -lc -lx86Runtime -lthreadPool -lpthread
 ./demo 1024 > res.log
 # The run passes only if the expected weight triple appears in the output.
 if ! grep -q -e "0.173289 0.259645 0.350836" res.log; then
    echo "Error result"
    exit 1
 fi
 echo "Pass"
--- a/examples/bfs/bfs-cuda-nvptx64-nvidia-cuda-sm_61.ll
+++ b/examples/bfs/bfs-cuda-nvptx64-nvidia-cuda-sm_61.ll
@ -1,307 +0,0 @@
 ; ModuleID = 'bfs-cuda-nvptx64-nvidia-cuda-sm_61.bc'
 source_filename = "bfs.cu"
 target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"
 %struct.__cuda_builtin_blockIdx_t = type { i8 }
 %struct.__cuda_builtin_threadIdx_t = type { i8 }
 %struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
 %struct.Node = type { i32, i32 }
 $_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
 $_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
 ; Function Attrs: convergent noinline nounwind optnone
 ; Weak device-module definition of cudaMalloc: stores both arguments to stack
 ; slots, performs no allocation, and always returns the constant 999.
 ; NOTE(review): 999 presumably corresponds to cudaErrorUnknown -- confirm
 ; against the CUDA headers this module was built with.
 define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
 entry:
  %p.addr = alloca i8**, align 8
  %s.addr = alloca i64, align 8
  store i8** %p, i8*** %p.addr, align 8
  store i64 %s, i64* %s.addr, align 8
  ret i32 999
 }
 ; Function Attrs: convergent noinline nounwind optnone
 ; Weak device-module definition of cudaFuncGetAttributes: stores both
 ; arguments to stack slots, never writes through %p, and always returns 999.
 define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
 entry:
  %p.addr = alloca %struct.cudaFuncAttributes*, align 8
  %c.addr = alloca i8*, align 8
  store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
  store i8* %c, i8** %c.addr, align 8
  ret i32 999
 }
 ; Function Attrs: convergent noinline nounwind optnone
 ; Weak device-module definition of cudaDeviceGetAttribute: stores its three
 ; arguments to stack slots, never writes through %value, and always
 ; returns 999.
 define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
 entry:
  %value.addr = alloca i32*, align 8
  %attr.addr = alloca i32, align 4
  %device.addr = alloca i32, align 4
  store i32* %value, i32** %value.addr, align 8
  store i32 %attr, i32* %attr.addr, align 4
  store i32 %device, i32* %device.addr, align 4
  ret i32 999
 }
 ; Function Attrs: convergent noinline nounwind optnone
 ; Weak device-module definition of cudaGetDevice: stores the argument to a
 ; stack slot, never writes through %device, and always returns 999.
 define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
 entry:
  %device.addr = alloca i32*, align 8
  store i32* %device, i32** %device.addr, align 8
  ret i32 999
 }
 ; Function Attrs: convergent noinline nounwind optnone
 ; Weak device-module definition of
 ; cudaOccupancyMaxActiveBlocksPerMultiprocessor: stores its four arguments to
 ; stack slots, never writes through %numBlocks, and always returns 999.
 define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
 entry:
  %numBlocks.addr = alloca i32*, align 8
  %func.addr = alloca i8*, align 8
  %blockSize.addr = alloca i32, align 4
  %dynamicSmemSize.addr = alloca i64, align 8
  store i32* %numBlocks, i32** %numBlocks.addr, align 8
  store i8* %func, i8** %func.addr, align 8
  store i32 %blockSize, i32* %blockSize.addr, align 4
  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
  ret i32 999
 }
 ; Function Attrs: convergent noinline nounwind optnone
 ; Weak device-module definition of
 ; cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags: stores its five
 ; arguments to stack slots, never writes through %numBlocks, and always
 ; returns 999.
 define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
 entry:
  %numBlocks.addr = alloca i32*, align 8
  %func.addr = alloca i8*, align 8
  %blockSize.addr = alloca i32, align 4
  %dynamicSmemSize.addr = alloca i64, align 8
  %flags.addr = alloca i32, align 4
  store i32* %numBlocks, i32** %numBlocks.addr, align 8
  store i8* %func, i8** %func.addr, align 8
  store i32 %blockSize, i32* %blockSize.addr, align 4
  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
  store i32 %flags, i32* %flags.addr, align 4
  ret i32 999
 }
 ; Function Attrs: convergent noinline nounwind optnone
 ; Unoptimized IR for Kernel(Node*, int*, bool*, bool*, bool*, int*, int).
 ; Each thread computes tid = blockIdx.x * 512 + threadIdx.x. When
 ; tid < no_of_nodes and g_graph_mask[tid] is set, the thread clears that mask
 ; entry and walks the edge indices
 ; [nodes[tid].field0, nodes[tid].field0 + nodes[tid].field1); for every edge
 ; target `id` with g_graph_visited[id] clear it stores g_cost[tid] + 1 into
 ; g_cost[id] and sets g_updating_graph_mask[id].
 ; All loads and stores are plain (non-atomic).
 define dso_local void @_Z6KernelP4NodePiPbS2_S2_S1_i(%struct.Node* %g_graph_nodes, i32* %g_graph_edges, i8* %g_graph_mask, i8* %g_updating_graph_mask, i8* %g_graph_visited, i32* %g_cost, i32 %no_of_nodes) #0 {
 entry:
  ; spill every parameter to a stack slot, plus locals tid, i and id
  %g_graph_nodes.addr = alloca %struct.Node*, align 8
  %g_graph_edges.addr = alloca i32*, align 8
  %g_graph_mask.addr = alloca i8*, align 8
  %g_updating_graph_mask.addr = alloca i8*, align 8
  %g_graph_visited.addr = alloca i8*, align 8
  %g_cost.addr = alloca i32*, align 8
  %no_of_nodes.addr = alloca i32, align 4
  %tid = alloca i32, align 4
  %i = alloca i32, align 4
  %id = alloca i32, align 4
  store %struct.Node* %g_graph_nodes, %struct.Node** %g_graph_nodes.addr, align 8
  store i32* %g_graph_edges, i32** %g_graph_edges.addr, align 8
  store i8* %g_graph_mask, i8** %g_graph_mask.addr, align 8
  store i8* %g_updating_graph_mask, i8** %g_updating_graph_mask.addr, align 8
  store i8* %g_graph_visited, i8** %g_graph_visited.addr, align 8
  store i32* %g_cost, i32** %g_cost.addr, align 8
  store i32 %no_of_nodes, i32* %no_of_nodes.addr, align 4
  ; tid = blockIdx.x * 512 + threadIdx.x (block size is the literal 512)
  %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
  %mul = mul i32 %call, 512
  %call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
  %add = add i32 %mul, %call1
  store i32 %add, i32* %tid, align 4
  ; bounds guard: tid < no_of_nodes (signed compare)
  %0 = load i32, i32* %tid, align 4
  %1 = load i32, i32* %no_of_nodes.addr, align 4
  %cmp = icmp slt i32 %0, %1
  br i1 %cmp, label %land.lhs.true, label %if.end26
 ; second half of the && : g_graph_mask[tid]
 land.lhs.true:                                    ; preds = %entry
  %2 = load i8*, i8** %g_graph_mask.addr, align 8
  %3 = load i32, i32* %tid, align 4
  %idxprom = sext i32 %3 to i64
  %arrayidx = getelementptr inbounds i8, i8* %2, i64 %idxprom
  %4 = load i8, i8* %arrayidx, align 1
  %tobool = trunc i8 %4 to i1
  br i1 %tobool, label %if.then, label %if.end26
 ; clear g_graph_mask[tid] and initialise i from field 0 of nodes[tid]
 if.then:                                          ; preds = %land.lhs.true
  %5 = load i8*, i8** %g_graph_mask.addr, align 8
  %6 = load i32, i32* %tid, align 4
  %idxprom2 = sext i32 %6 to i64
  %arrayidx3 = getelementptr inbounds i8, i8* %5, i64 %idxprom2
  store i8 0, i8* %arrayidx3, align 1
  %7 = load %struct.Node*, %struct.Node** %g_graph_nodes.addr, align 8
  %8 = load i32, i32* %tid, align 4
  %idxprom4 = sext i32 %8 to i64
  %arrayidx5 = getelementptr inbounds %struct.Node, %struct.Node* %7, i64 %idxprom4
  %starting = getelementptr inbounds %struct.Node, %struct.Node* %arrayidx5, i32 0, i32 0
  %9 = load i32, i32* %starting, align 4
  store i32 %9, i32* %i, align 4
  br label %for.cond
 ; loop test: i < nodes[tid].field1 + nodes[tid].field0, reloaded every pass
 for.cond:                                         ; preds = %for.inc, %if.then
  %10 = load i32, i32* %i, align 4
  %11 = load %struct.Node*, %struct.Node** %g_graph_nodes.addr, align 8
  %12 = load i32, i32* %tid, align 4
  %idxprom6 = sext i32 %12 to i64
  %arrayidx7 = getelementptr inbounds %struct.Node, %struct.Node* %11, i64 %idxprom6
  %no_of_edges = getelementptr inbounds %struct.Node, %struct.Node* %arrayidx7, i32 0, i32 1
  %13 = load i32, i32* %no_of_edges, align 4
  %14 = load %struct.Node*, %struct.Node** %g_graph_nodes.addr, align 8
  %15 = load i32, i32* %tid, align 4
  %idxprom8 = sext i32 %15 to i64
  %arrayidx9 = getelementptr inbounds %struct.Node, %struct.Node* %14, i64 %idxprom8
  %starting10 = getelementptr inbounds %struct.Node, %struct.Node* %arrayidx9, i32 0, i32 0
  %16 = load i32, i32* %starting10, align 4
  %add11 = add nsw i32 %13, %16
  %cmp12 = icmp slt i32 %10, %add11
  br i1 %cmp12, label %for.body, label %for.end
 ; id = g_graph_edges[i]; skip the update when g_graph_visited[id] is set
 for.body:                                         ; preds = %for.cond
  %17 = load i32*, i32** %g_graph_edges.addr, align 8
  %18 = load i32, i32* %i, align 4
  %idxprom13 = sext i32 %18 to i64
  %arrayidx14 = getelementptr inbounds i32, i32* %17, i64 %idxprom13
  %19 = load i32, i32* %arrayidx14, align 4
  store i32 %19, i32* %id, align 4
  %20 = load i8*, i8** %g_graph_visited.addr, align 8
  %21 = load i32, i32* %id, align 4
  %idxprom15 = sext i32 %21 to i64
  %arrayidx16 = getelementptr inbounds i8, i8* %20, i64 %idxprom15
  %22 = load i8, i8* %arrayidx16, align 1
  %tobool17 = trunc i8 %22 to i1
  br i1 %tobool17, label %if.end, label %if.then18
 ; g_cost[id] = g_cost[tid] + 1; g_updating_graph_mask[id] = 1
 if.then18:                                        ; preds = %for.body
  %23 = load i32*, i32** %g_cost.addr, align 8
  %24 = load i32, i32* %tid, align 4
  %idxprom19 = sext i32 %24 to i64
  %arrayidx20 = getelementptr inbounds i32, i32* %23, i64 %idxprom19
  %25 = load i32, i32* %arrayidx20, align 4
  %add21 = add nsw i32 %25, 1
  %26 = load i32*, i32** %g_cost.addr, align 8
  %27 = load i32, i32* %id, align 4
  %idxprom22 = sext i32 %27 to i64
  %arrayidx23 = getelementptr inbounds i32, i32* %26, i64 %idxprom22
  store i32 %add21, i32* %arrayidx23, align 4
  %28 = load i8*, i8** %g_updating_graph_mask.addr, align 8
  %29 = load i32, i32* %id, align 4
  %idxprom24 = sext i32 %29 to i64
  %arrayidx25 = getelementptr inbounds i8, i8* %28, i64 %idxprom24
  store i8 1, i8* %arrayidx25, align 1
  br label %if.end
 if.end:                                           ; preds = %if.then18, %for.body
  br label %for.inc
 ; i++
 for.inc:                                          ; preds = %if.end
  %30 = load i32, i32* %i, align 4
  %inc = add nsw i32 %30, 1
  store i32 %inc, i32* %i, align 4
  br label %for.cond
 for.end:                                          ; preds = %for.cond
  br label %if.end26
 ; common exit for threads that were out of range, unmasked, or finished
 if.end26:                                         ; preds = %for.end, %land.lhs.true, %entry
  ret void
 }
 ; Function Attrs: alwaysinline convergent nounwind
 ; Accessor behind blockIdx.x: returns the value of the ctaid.x special
 ; register via the NVVM read intrinsic.
 define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
 entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
  ret i32 %0
 }
 ; Function Attrs: alwaysinline convergent nounwind
 ; Accessor behind threadIdx.x: returns the value of the tid.x special
 ; register via the NVVM read intrinsic.
 define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
 entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  ret i32 %0
 }
 ; Function Attrs: convergent noinline nounwind optnone
 ; Kernel2(bool*, bool*, bool*, bool*, int): unoptimized (optnone) device code.
 ; For the element index tid = ctaid.x * 512 + tid.x, when tid < no_of_nodes and
 ; g_updating_graph_mask[tid] is set, it stores 1 to g_graph_mask[tid],
 ; g_graph_visited[tid] and *g_over, then stores 0 to g_updating_graph_mask[tid].
 define dso_local void @_Z7Kernel2PbS_S_S_i(i8* %g_graph_mask, i8* %g_updating_graph_mask, i8* %g_graph_visited, i8* %g_over, i32 %no_of_nodes) #0 {
 entry:
  ; Spill every argument to a stack slot (no optimization has been applied).
  %g_graph_mask.addr = alloca i8*, align 8
  %g_updating_graph_mask.addr = alloca i8*, align 8
  %g_graph_visited.addr = alloca i8*, align 8
  %g_over.addr = alloca i8*, align 8
  %no_of_nodes.addr = alloca i32, align 4
  %tid = alloca i32, align 4
  store i8* %g_graph_mask, i8** %g_graph_mask.addr, align 8
  store i8* %g_updating_graph_mask, i8** %g_updating_graph_mask.addr, align 8
  store i8* %g_graph_visited, i8** %g_graph_visited.addr, align 8
  store i8* %g_over, i8** %g_over.addr, align 8
  store i32 %no_of_nodes, i32* %no_of_nodes.addr, align 4
  ; tid = blockIdx.x * 512 + threadIdx.x (the block size is a literal constant here).
  %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
  %mul = mul i32 %call, 512
  %call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
  %add = add i32 %mul, %call1
  store i32 %add, i32* %tid, align 4
  ; First half of the && condition: signed comparison tid < no_of_nodes.
  %0 = load i32, i32* %tid, align 4
  %1 = load i32, i32* %no_of_nodes.addr, align 4
  %cmp = icmp slt i32 %0, %1
  br i1 %cmp, label %land.lhs.true, label %if.end
 land.lhs.true:                                    ; preds = %entry
  ; Second half of the && condition: g_updating_graph_mask[tid], read as a byte
  ; and truncated to i1.
  %2 = load i8*, i8** %g_updating_graph_mask.addr, align 8
  %3 = load i32, i32* %tid, align 4
  %idxprom = sext i32 %3 to i64
  %arrayidx = getelementptr inbounds i8, i8* %2, i64 %idxprom
  %4 = load i8, i8* %arrayidx, align 1
  %tobool = trunc i8 %4 to i1
  br i1 %tobool, label %if.then, label %if.end
 if.then:                                          ; preds = %land.lhs.true
  ; g_graph_mask[tid] = true
  %5 = load i8*, i8** %g_graph_mask.addr, align 8
  %6 = load i32, i32* %tid, align 4
  %idxprom2 = sext i32 %6 to i64
  %arrayidx3 = getelementptr inbounds i8, i8* %5, i64 %idxprom2
  store i8 1, i8* %arrayidx3, align 1
  ; g_graph_visited[tid] = true
  %7 = load i8*, i8** %g_graph_visited.addr, align 8
  %8 = load i32, i32* %tid, align 4
  %idxprom4 = sext i32 %8 to i64
  %arrayidx5 = getelementptr inbounds i8, i8* %7, i64 %idxprom4
  store i8 1, i8* %arrayidx5, align 1
  ; *g_over = true (single shared flag, not indexed by tid)
  %9 = load i8*, i8** %g_over.addr, align 8
  store i8 1, i8* %9, align 1
  ; g_updating_graph_mask[tid] = false
  %10 = load i8*, i8** %g_updating_graph_mask.addr, align 8
  %11 = load i32, i32* %tid, align 4
  %idxprom6 = sext i32 %11 to i64
  %arrayidx7 = getelementptr inbounds i8, i8* %10, i64 %idxprom6
  store i8 0, i8* %arrayidx7, align 1
  br label %if.end
 if.end:                                           ; preds = %if.then, %land.lhs.true, %entry
  ret void
 }
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
 attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #2 = { nounwind readnone }
 attributes #3 = { convergent nounwind }
 !llvm.module.flags = !{!0, !1, !2}
 !nvvm.annotations = !{!3, !4, !5, !6, !5, !7, !7, !7, !7, !8, !8, !7}
 !llvm.ident = !{!9}
 !nvvmir.version = !{!10}
 !0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
 !1 = !{i32 1, !"wchar_size", i32 4}
 !2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
 !3 = !{void (%struct.Node*, i32*, i8*, i8*, i8*, i32*, i32)* @_Z6KernelP4NodePiPbS2_S2_S1_i, !"kernel", i32 1}
 !4 = !{void (i8*, i8*, i8*, i8*, i32)* @_Z7Kernel2PbS_S_S_i, !"kernel", i32 1}
 !5 = !{null, !"align", i32 8}
 !6 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
 !7 = !{null, !"align", i32 16}
 !8 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
 !9 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
 !10 = !{i32 1, i32 4}
--- a/examples/bfs/bfs-host-x86_64-unknown-linux-gnu.ll
+++ b/examples/bfs/bfs-host-x86_64-unknown-linux-gnu.ll
--- a/examples/bfs/bfs.cu
+++ b/examples/bfs/bfs.cu
@ -1,213 +0,0 @@
 #include <cuda.h>
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #define MAX_THREADS_PER_BLOCK 512
 int no_of_nodes;
 int edge_list_size;
 FILE *fp;
 // Structure to hold a node's information: the slice of the flat edge array
 // that lists the node's outgoing edges.
 struct Node {
  // Index into the edge array of this node's first edge.
  int starting;
  // Number of consecutive edge-array entries, beginning at `starting`, that
  // belong to this node.
  int no_of_edges;
 };
 #include "kernel.cu"
 #include "kernel2.cu"
 void BFSGraph(int argc, char **argv);
 ////////////////////////////////////////////////////////////////////////////////
 // Main Program
 ////////////////////////////////////////////////////////////////////////////////
 // Program entry point: clears the global graph sizes, selects CUDA device 0,
 // and hands the command line to BFSGraph, which does all of the work.
 int main(int argc, char **argv) {
  // The size globals are filled in later while the graph file is parsed.
  edge_list_size = 0;
  no_of_nodes = 0;
  cudaSetDevice(0);
  BFSGraph(argc, argv);
  return 0;
 }
 // Prints the one-line command synopsis to stderr, using argv[0] as the
 // program name. argc is accepted for signature symmetry and is not read.
 void Usage(int argc, char **argv) {
  const char *program_name = argv[0];
  fprintf(stderr, "Usage: %s <input_file>\n", program_name);
 }
 // Terminates the program with a diagnostic when a CUDA runtime call reports
 // anything other than cudaSuccess. `what` names the failed step.
 static void checkCuda(cudaError_t status, const char *what) {
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error %d while %s\n", (int)status, what);
    exit(1);
  }
 }
 // Reports a malformed graph file, closes it if open, and terminates.
 static void failInput(const char *what) {
  fprintf(stderr, "Malformed graph file: %s\n", what);
  if (fp)
    fclose(fp);
  exit(1);
 }
 // Terminates the program when a host allocation returned NULL.
 static void *checkHostAlloc(void *ptr, const char *what) {
  if (!ptr) {
    fprintf(stderr, "Host allocation failed for %s\n", what);
    exit(1);
  }
  return ptr;
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Apply BFS on a Graph using CUDA
 //
 // Expects exactly one command-line argument: the graph file. The file is
 // parsed as
 //   <node count>
 //   <starting> <no_of_edges>      (one pair per node)
 //   <source>                      (read, then ignored: traversal starts at 0)
 //   <edge count>
 //   <target id> <cost>            (one pair per edge; the cost is discarded)
 // The traversal alternates Kernel and Kernel2 until Kernel2 stops raising the
 // device-side "over" flag, then writes "<i>) cost:<c>" lines to result.txt.
 //
 // Prints usage and exits when argc != 2; prints a message and returns when
 // the input cannot be opened. Malformed input, allocation failures and CUDA
 // errors terminate the process with exit status 1.
 ////////////////////////////////////////////////////////////////////////////////
 void BFSGraph(int argc, char **argv) {
  if (argc != 2) {
    Usage(argc, argv);
    exit(0);
  }
  const char *input_f = argv[1];
  printf("Reading File\n");
  // Read in Graph from a file
  fp = fopen(input_f, "r");
  if (!fp) {
    printf("Error Reading graph file\n");
    return;
  }
  int source = 0;
  // A non-positive node count would produce zero-sized buffers, a launch with
  // zero threads, and an out-of-bounds write when marking the source node.
  if (fscanf(fp, "%d", &no_of_nodes) != 1 || no_of_nodes <= 0)
    failInput("missing or non-positive node count");
  // Make execution Parameters according to the number of nodes
  // Distribute threads across multiple Blocks if necessary
  int num_of_blocks = 1;
  int num_of_threads_per_block = no_of_nodes;
  if (no_of_nodes > MAX_THREADS_PER_BLOCK) {
    // Integer ceil-div; written this way so it cannot overflow for n > 0.
    num_of_blocks = (no_of_nodes - 1) / MAX_THREADS_PER_BLOCK + 1;
    num_of_threads_per_block = MAX_THREADS_PER_BLOCK;
  }
  // allocate host memory
  Node *h_graph_nodes =
      (Node *)checkHostAlloc(malloc(sizeof(Node) * no_of_nodes), "node list");
  bool *h_graph_mask =
      (bool *)checkHostAlloc(malloc(sizeof(bool) * no_of_nodes), "mask");
  bool *h_updating_graph_mask = (bool *)checkHostAlloc(
      malloc(sizeof(bool) * no_of_nodes), "updating mask");
  bool *h_graph_visited =
      (bool *)checkHostAlloc(malloc(sizeof(bool) * no_of_nodes), "visited");
  int start, edgeno;
  // initialize the memory
  for (int i = 0; i < no_of_nodes; i++) {
    if (fscanf(fp, "%d %d", &start, &edgeno) != 2)
      failInput("truncated node table");
    h_graph_nodes[i].starting = start;
    h_graph_nodes[i].no_of_edges = edgeno;
    h_graph_mask[i] = false;
    h_updating_graph_mask[i] = false;
    h_graph_visited[i] = false;
  }
  // read the source node from the file
  if (fscanf(fp, "%d", &source) != 1)
    failInput("missing source node");
  // The value stored in the file is deliberately overridden: the traversal
  // always starts from node 0.
  source = 0;
  // set the source node as true in the mask
  h_graph_mask[source] = true;
  h_graph_visited[source] = true;
  if (fscanf(fp, "%d", &edge_list_size) != 1 || edge_list_size < 0)
    failInput("missing or negative edge count");
  int id, cost;
  int *h_graph_edges = (int *)malloc(sizeof(int) * edge_list_size);
  // malloc(0) may legitimately return NULL, so only a non-empty list is checked.
  if (edge_list_size > 0)
    checkHostAlloc(h_graph_edges, "edge list");
  for (int i = 0; i < edge_list_size; i++) {
    // The per-edge cost is part of the file format but is not used by BFS.
    if (fscanf(fp, "%d", &id) != 1 || fscanf(fp, "%d", &cost) != 1)
      failInput("truncated edge list");
    // Kernel indexes the per-node arrays with this value, so it must be a
    // valid node index.
    if (id < 0 || id >= no_of_nodes)
      failInput("edge target out of range");
    h_graph_edges[i] = id;
  }
  fclose(fp);
  fp = NULL;
  printf("Read File\n");
  // Copy the Node list to device memory
  Node *d_graph_nodes;
  checkCuda(cudaMalloc((void **)&d_graph_nodes, sizeof(Node) * no_of_nodes),
            "allocating device node list");
  checkCuda(cudaMemcpy(d_graph_nodes, h_graph_nodes,
                       sizeof(Node) * no_of_nodes, cudaMemcpyHostToDevice),
            "copying node list to device");
  // Copy the Edge List to device Memory
  int *d_graph_edges;
  checkCuda(cudaMalloc((void **)&d_graph_edges, sizeof(int) * edge_list_size),
            "allocating device edge list");
  checkCuda(cudaMemcpy(d_graph_edges, h_graph_edges,
                       sizeof(int) * edge_list_size, cudaMemcpyHostToDevice),
            "copying edge list to device");
  // Copy the Mask to device memory
  bool *d_graph_mask;
  checkCuda(cudaMalloc((void **)&d_graph_mask, sizeof(bool) * no_of_nodes),
            "allocating device mask");
  checkCuda(cudaMemcpy(d_graph_mask, h_graph_mask, sizeof(bool) * no_of_nodes,
                       cudaMemcpyHostToDevice),
            "copying mask to device");
  bool *d_updating_graph_mask;
  checkCuda(
      cudaMalloc((void **)&d_updating_graph_mask, sizeof(bool) * no_of_nodes),
      "allocating device updating mask");
  checkCuda(cudaMemcpy(d_updating_graph_mask, h_updating_graph_mask,
                       sizeof(bool) * no_of_nodes, cudaMemcpyHostToDevice),
            "copying updating mask to device");
  // Copy the Visited nodes array to device memory
  bool *d_graph_visited;
  checkCuda(cudaMalloc((void **)&d_graph_visited, sizeof(bool) * no_of_nodes),
            "allocating device visited array");
  checkCuda(cudaMemcpy(d_graph_visited, h_graph_visited,
                       sizeof(bool) * no_of_nodes, cudaMemcpyHostToDevice),
            "copying visited array to device");
  // allocate mem for the result on host side
  int *h_cost =
      (int *)checkHostAlloc(malloc(sizeof(int) * no_of_nodes), "cost array");
  // -1 marks a node that has not been reached.
  for (int i = 0; i < no_of_nodes; i++)
    h_cost[i] = -1;
  h_cost[source] = 0;
  // allocate device memory for result
  int *d_cost;
  checkCuda(cudaMalloc((void **)&d_cost, sizeof(int) * no_of_nodes),
            "allocating device cost array");
  checkCuda(cudaMemcpy(d_cost, h_cost, sizeof(int) * no_of_nodes,
                       cudaMemcpyHostToDevice),
            "copying cost array to device");
  // make a bool to check if the execution is over
  bool *d_over;
  checkCuda(cudaMalloc((void **)&d_over, sizeof(bool)),
            "allocating device termination flag");
  printf("Copied Everything to GPU memory\n");
  // setup execution parameters
  dim3 grid(num_of_blocks, 1, 1);
  dim3 threads(num_of_threads_per_block, 1, 1);
  int k = 0;
  printf("Start traversing the tree\n");
  bool stop;
  // Call the kernels until no frontier element is set any more
  do {
    // if no thread changes this value then the loop stops
    stop = false;
    checkCuda(cudaMemcpy(d_over, &stop, sizeof(bool), cudaMemcpyHostToDevice),
              "resetting termination flag");
    Kernel<<<grid, threads, 0>>>(d_graph_nodes, d_graph_edges, d_graph_mask,
                                 d_updating_graph_mask, d_graph_visited, d_cost,
                                 no_of_nodes);
    // Faults raised while the kernel ran surface at this synchronization.
    checkCuda(cudaDeviceSynchronize(), "running Kernel");
    Kernel2<<<grid, threads, 0>>>(d_graph_mask, d_updating_graph_mask,
                                  d_graph_visited, d_over, no_of_nodes);
    checkCuda(cudaDeviceSynchronize(), "running Kernel2");
    checkCuda(cudaMemcpy(&stop, d_over, sizeof(bool), cudaMemcpyDeviceToHost),
              "reading termination flag");
    k++;
  } while (stop);
  printf("Kernel Executed %d times\n", k);
  // copy result from device to host
  checkCuda(cudaMemcpy(h_cost, d_cost, sizeof(int) * no_of_nodes,
                       cudaMemcpyDeviceToHost),
            "copying cost array to host");
  // Store the result into a file
  FILE *fpo = fopen("result.txt", "w");
  if (fpo) {
    for (int i = 0; i < no_of_nodes; i++)
      fprintf(fpo, "%d) cost:%d\n", i, h_cost[i]);
    fclose(fpo);
    printf("Result stored in result.txt\n");
  } else {
    // Previously a failed open led to fprintf/fclose on a NULL stream.
    fprintf(stderr, "Could not open result.txt for writing\n");
  }
  // cleanup memory
  free(h_graph_nodes);
  free(h_graph_edges);
  free(h_graph_mask);
  free(h_updating_graph_mask);
  free(h_graph_visited);
  free(h_cost);
  cudaFree(d_graph_nodes);
  cudaFree(d_graph_edges);
  cudaFree(d_graph_mask);
  cudaFree(d_updating_graph_mask);
  cudaFree(d_graph_visited);
  cudaFree(d_cost);
  // The termination flag was previously leaked.
  cudaFree(d_over);
 }
--- a/examples/bfs/kernel.cu
+++ b/examples/bfs/kernel.cu
@ -1,23 +0,0 @@
 #ifndef _KERNEL_H_
 #define _KERNEL_H_
 // Frontier-expansion step of the BFS. Each thread handles the node with index
 // blockIdx.x*MAX_THREADS_PER_BLOCK + threadIdx.x. A thread whose node is set
 // in g_graph_mask clears that bit, then walks the node's slice of
 // g_graph_edges; every target not yet marked in g_graph_visited receives
 // cost = (this node's cost) + 1 and is flagged in g_updating_graph_mask.
 // Threads past no_of_nodes, or whose node is not in the mask, do nothing.
 __global__ void
 Kernel( Node* g_graph_nodes, int* g_graph_edges, bool* g_graph_mask, bool* g_updating_graph_mask, bool *g_graph_visited, int* g_cost, int no_of_nodes)
 {
 	const int tid = blockIdx.x*MAX_THREADS_PER_BLOCK + threadIdx.x;
 	// Nothing to do for padding threads or nodes outside the current frontier.
 	if( tid>=no_of_nodes || !g_graph_mask[tid])
 		return;
 	// Take this node out of the frontier before expanding it.
 	g_graph_mask[tid]=false;
 	int edge = g_graph_nodes[tid].starting;
 	while( edge<(g_graph_nodes[tid].no_of_edges + g_graph_nodes[tid].starting))
 	{
 		const int neighbor = g_graph_edges[edge];
 		if(!g_graph_visited[neighbor])
 		{
 			g_cost[neighbor]=g_cost[tid]+1;
 			g_updating_graph_mask[neighbor]=true;
 		}
 		edge++;
 	}
 }
 #endif
--- a/examples/bfs/kernel2.cu
+++ b/examples/bfs/kernel2.cu
@ -1,18 +0,0 @@
 #ifndef _KERNEL2_H_
 #define _KERNEL2_H_
 // Frontier-commit step of the BFS. Each thread handles the node with index
 // blockIdx.x*MAX_THREADS_PER_BLOCK + threadIdx.x. A node flagged in
 // g_updating_graph_mask is moved into g_graph_mask, marked in
 // g_graph_visited, and has its updating flag cleared; *g_over is set to true
 // so the host can tell that at least one node was promoted in this pass.
 __global__ void
 Kernel2( bool* g_graph_mask, bool *g_updating_graph_mask, bool* g_graph_visited, bool *g_over, int no_of_nodes)
 {
 	const int tid = blockIdx.x*MAX_THREADS_PER_BLOCK + threadIdx.x;
 	// Skip padding threads and nodes that were not discovered in this pass.
 	if( tid>=no_of_nodes || !g_updating_graph_mask[tid])
 		return;
 	g_graph_mask[tid]=true;
 	g_graph_visited[tid]=true;
 	// Shared flag: every promoting thread writes the same value.
 	*g_over=true;
 	g_updating_graph_mask[tid]=false;
 }
 #endif
--- a/examples/bfs/run.sh
+++ b/examples/bfs/run.sh
@ -1,21 +0,0 @@
 #!/bin/bash
 # Builds the BFS example through the kernel/host translators, runs it on the
 # 65536-node input graph, and verifies the result file.
 # Prints "Pass" on success; prints "Error result" and exits 1 otherwise.
 set -e
 # Assemble the textual LLVM IR for the device and host modules into bitcode.
 llvm-as bfs-cuda-nvptx64-nvidia-cuda-sm_61.ll
 llvm-as bfs-host-x86_64-unknown-linux-gnu.ll
 # Translate both modules, then compile each to a position-independent object.
 ../../build/compilation/kernelTranslator bfs-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
 ../../build/compilation/hostTranslator bfs-host-x86_64-unknown-linux-gnu.bc host.bc
 llc --relocation-model=pic --filetype=obj  kernel.bc
 llc --relocation-model=pic --filetype=obj  host.bc
 g++ -Wall -L../../build/runtime  -L../../build/runtime/threadPool \
    -o bfs.out -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread
 export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH
 # Drop output left over from an earlier run: bfs.out exits 0 even when it
 # cannot open its input, so a stale result.txt would otherwise be accepted.
 rm -f result.txt
 ./bfs.out ../../rodinia-data/bfs/graph65536.txt
 # -x -F: require a whole line equal to the fixed string, so that lines such as
 # "10) cost:0" cannot satisfy the check.
 if grep -qxF "0) cost:0" result.txt; then
    echo "Pass"
 else
    echo "Error result"
    exit 1
 fi
--- a/examples/btree/common.h
+++ b/examples/btree/common.h
@ -1,343 +0,0 @@
 // # ifdef __cplusplus
 // extern "C" {
 // # endif
 // #ifndef LIST_H
 // # define LIST_H
 //===============================================================================================================================================================================================================200
 //	DEFINE/INCLUDE
 //===============================================================================================================================================================================================================200
 //======================================================================================================================================================150
 //	INCLUDE (for some reason these are not recognized when defined in main
 // file before this one is included)
 //======================================================================================================================================================150
 #include <stdbool.h> // (in path known to compiler)			needed by true/false, bool
 #include <stdint.h>  // (in path known to compiler)			needed by uint32_t
 #include <stdlib.h>  // (in path known to compiler)			needed by malloc
 //======================================================================================================================================================150
 //	DEFINE
 //======================================================================================================================================================150
 #define fp float
 #define Version "1.5"
 #ifdef WINDOWS
 #define bool char
 #define false 0
 #define true 1
 #endif
 /* #define DEFAULT_ORDER 256 */
 #ifdef RD_WG_SIZE_0_0
 #define DEFAULT_ORDER RD_WG_SIZE_0_0
 #elif defined(RD_WG_SIZE_0)
 #define DEFAULT_ORDER RD_WG_SIZE_0
 #elif defined(RD_WG_SIZE)
 #define DEFAULT_ORDER RD_WG_SIZE
 #else
 #define DEFAULT_ORDER 256
 #endif
 /* #ifdef RD_WG_SIZE_1_0 */
 /*         #define  DEFAULT_ORDER_2 RD_WG_SIZE_1_0 */
 /* #elif defined(RD_WG_SIZE_1) */
 /*         #define  DEFAULT_ORDER_2 RD_WG_SIZE_1 */
 /* #elif defined(RD_WG_SIZE) */
 /*         #define  DEFAULT_ORDER_2 RD_WG_SIZE */
 /* #else */
 /*         #define  DEFAULT_ORDER_2 256 */
 /* #endif */
 /* #define DEFAULT_ORDER 508 */
 /* Checked allocation: shadows malloc() with a statement expression ( ({ ... })
  * is a GNU C extension) that calls the real malloc and, when it returns NULL,
  * prints the failing file and line to stderr and calls exit(-1). The inner
  * `malloc` names the real function because a macro is not re-expanded inside
  * its own replacement list. The expression evaluates to the allocated pointer.
  * NOTE(review): the expansion uses fprintf/stderr, but this header does not
  * include <stdio.h>; every includer must provide it before using malloc. */
 #define malloc(size)                                                           \
  ({                                                                           \
    void *_tmp;                                                                \
                                                                               \
    if (!(_tmp = malloc(size))) {                                              \
      fprintf(stderr, "Allocation failed at %s:%d!\n", __FILE__, __LINE__);    \
      exit(-1);                                                                \
    }                                                                          \
                                                                               \
    _tmp;                                                                      \
  })
 //======================================================================================================================================================150
 //	STRUCTURES
 //======================================================================================================================================================150
 // struct list_item;
 typedef struct list_item list_item_t;
 /* List header: end pointers, element count, and two user-supplied callbacks. */
 typedef struct list_t {
  /* First and last items of the list. */
  list_item_t *head, *tail;
  /* Number of items; presumably maintained by the list_* functions (their
   * definitions are not in this header). */
  uint32_t length;
  /* Comparison callback taking a key and an item datum and returning an
   * int32_t. NOTE(review): sign convention not visible here -- confirm. */
  int32_t (*compare)(const void *key, const void *with);
  /* Callback invoked on a datum pointer; presumably releases it -- confirm. */
  void (*datum_delete)(void *);
 } list_t;
 typedef list_item_t *list_iterator_t;
 typedef list_item_t *list_reverse_iterator_t;
 /* Type representing the record
 * to which a given key refers.
 * In a real B+ tree system, the
 * record would hold data (in a database)
 * or a file (in an operating system)
 * or some other information.
 * Users can rewrite this part of the code
 * to change the type and content
 * of the value field.
 */
 typedef struct record {
  /* Payload stored for a key; a single int in this implementation. */
  int value;
 } record;
 /* Type representing a node in the B+ tree.
 * This type is general enough to serve for both
 * the leaf and the internal node.
 * The heart of the node is the array
 * of keys and the array of corresponding
 * pointers.  The relation between keys
 * and pointers differs between leaves and
 * internal nodes.  In a leaf, the index
 * of each key equals the index of its corresponding
 * pointer, with a maximum of order - 1 key-pointer
 * pairs.  The last pointer points to the
 * leaf to the right (or NULL in the case
 * of the rightmost leaf).
 * In an internal node, the first pointer
 * refers to lower nodes with keys less than
 * the smallest key in the keys array.  Then,
 * with indices i starting at 0, the pointer
 * at i + 1 points to the subtree with keys
 * greater than or equal to the key in this
 * node at index i.
 * The num_keys field is used to keep
 * track of the number of valid keys.
 * In an internal node, the number of valid
 * pointers is always num_keys + 1.
 * In a leaf, the number of valid pointers
 * to data is always num_keys.  The
 * last leaf pointer points to the next leaf.
 */
 typedef struct node {
  /* Child subtrees (internal node) or records plus the next-leaf link (leaf). */
  void **pointers;
  /* Keys held by this node; only the first num_keys entries are valid. */
  int *keys;
  /* Link to the parent node. NOTE(review): root convention not shown here. */
  struct node *parent;
  /* True for a leaf, false for an internal node. */
  bool is_leaf;
  /* Number of valid keys currently stored. */
  int num_keys;
  struct node *next; // Used for queue.
 } node;
 // Fixed-size, pointer-free node layout consumed by the GPU kernels: links are
 // stored as integer indices instead of pointers, and both arrays have
 // DEFAULT_ORDER + 1 slots. The kernels read keys[t] and keys[t + 1] for
 // thread t, and use indices[t] to index either the knode array (while
 // descending) or the record array (at the final level).
 typedef struct knode {
  // NOTE(review): presumably this node's own position in the knode array;
  // not read by the kernels visible here -- confirm.
  int location;
  // Per-slot index of the child knode, or of the record at the last level.
  int indices[DEFAULT_ORDER + 1];
  // Per-slot key values.
  int keys[DEFAULT_ORDER + 1];
  bool is_leaf;
  int num_keys;
 } knode;
 /* One list element: links to the neighbouring items plus an untyped payload. */
 struct list_item {
  /* Predecessor and successor items in the list. */
  struct list_item *pred, *next;
  /* Caller-owned payload pointer; its type is not known to the list. */
  void *datum;
 };
 //===============================================================================================================================================================================================================200
 //	PROTOTYPES
 //===============================================================================================================================================================================================================200
 //======================================================================================================================================================150
 // Other
 //======================================================================================================================================================150
 void list_item_init(list_item_t *li, void *datum);
 void list_item_delete(list_item_t *li, void (*datum_delete)(void *datum));
 void list_insert_item_tail(list_t *l, list_item_t *i);
 void list_insert_item_before(list_t *l, list_item_t *next, list_item_t *i);
 void list_insert_item_after(list_t *l, list_item_t *pred, list_item_t *i);
 void list_insert_item_sorted(list_t *l, list_item_t *i);
 //======================================================================================================================================================150
 // List construction, insertion/removal, lookup, and iteration
 //======================================================================================================================================================150
 void list_init(list_t *l, int32_t (*compare)(const void *key, const void *with),
               void (*datum_delete)(void *datum));
 void list_delete(list_t *l);
 void list_reset(list_t *l);
 void list_insert_head(list_t *l, void *v);
 void list_insert_tail(list_t *l, void *v);
 void list_insert_before(list_t *l, list_item_t *next, void *v);
 void list_insert_after(list_t *l, list_item_t *pred, void *v);
 void list_insert_sorted(list_t *l, void *v);
 void list_insert_item_head(list_t *l, list_item_t *i);
 void list_remove_item(list_t *l, list_item_t *i);
 void list_remove_head(list_t *l);
 void list_remove_tail(list_t *l);
 list_item_t *list_find_item(list_t *l, void *datum);
 list_item_t *list_get_head_item(list_t *l);
 list_item_t *list_get_tail_item(list_t *l);
 void *list_find(list_t *l, void *datum);
 void *list_get_head(list_t *l);
 void *list_get_tail(list_t *l);
 uint32_t list_get_length(list_t *l);
 bool list_is_empty(list_t *l);
 bool list_not_empty(list_t *l);
 void list_visit_items(list_t *l, void (*visitor)(void *v));
 void *list_item_get_datum(list_item_t *li);
 void list_iterator_init(list_t *l, list_iterator_t *li);
 void list_iterator_delete(list_iterator_t *li);
 void list_iterator_next(list_iterator_t *li);
 void list_iterator_prev(list_iterator_t *li);
 void *list_iterator_get_datum(list_iterator_t *li);
 bool list_iterator_is_valid(list_iterator_t *li);
 void list_reverse_iterator_init(list_t *l, list_iterator_t *li);
 void list_reverse_iterator_delete(list_iterator_t *li);
 void list_reverse_iterator_next(list_iterator_t *li);
 void list_reverse_iterator_prev(list_iterator_t *li);
 void *list_reverse_iterator_get_datum(list_iterator_t *li);
 bool list_reverse_iterator_is_valid(list_reverse_iterator_t *li);
 //======================================================================================================================================================150
 // Output and utility
 //======================================================================================================================================================150
 void *kmalloc(int size);
 long transform_to_cuda(node *n,
                       bool verbose); // returns actual mem used in a long
 void usage_1(void);
 void usage_2(void);
 void enqueue(node *new_node);
 node *dequeue(void);
 int height(node *root);
 int path_to_root(node *root, node *child);
 void print_leaves(node *root);
 void print_tree(node *root);
 node *find_leaf(node *root, int key, bool verbose);
 record *find(node *root, int key, bool verbose);
 int cut(int length);
 //======================================================================================================================================================150
 // Insertion
 //======================================================================================================================================================150
 record *make_record(int value);
 node *make_node(void);
 node *make_leaf(void);
 int get_left_index(node *parent, node *left);
 node *insert_into_leaf(node *leaf, int key, record *pointer);
 node *insert_into_leaf_after_splitting(node *root, node *leaf, int key,
                                       record *pointer);
 node *insert_into_node(node *root, node *parent, int left_index, int key,
                       node *right);
 node *insert_into_node_after_splitting(node *root, node *parent, int left_index,
                                       int key, node *right);
 node *insert_into_parent(node *root, node *left, int key, node *right);
 node *insert_into_new_root(node *left, int key, node *right);
 node *start_new_tree(int key, record *pointer);
 node *insert(node *root, int key, int value);
 //======================================================================================================================================================150
 // Deletion
 //======================================================================================================================================================150
 int get_neighbor_index(node *n);
 node *adjust_root(node *root);
 node *coalesce_nodes(node *root, node *n, node *neighbor, int neighbor_index,
                     int k_prime);
 node *redistribute_nodes(node *root, node *n, node *neighbor,
                         int neighbor_index, int k_prime_index, int k_prime);
 node *delete_entry(node *root, node *n, int key, void *pointer);
 node *deleteVal(node *root, int key);
 //===============================================================================================================================================================================================================200
 //	HEADER
 //===============================================================================================================================================================================================================200
 // int main(	int argc,
 // char *argv []);
 //===============================================================================================================================================================================================================200
 //	END
 //===============================================================================================================================================================================================================200
 // #endif
 // # ifdef __cplusplus
 // }
 // # endif
--- a/examples/btree/kernel/kernel_gpu_cuda.cu
+++ b/examples/btree/kernel/kernel_gpu_cuda.cu
@ -1,54 +0,0 @@
 //========================================================================================================================================================================================================200
 //	findK function
 //========================================================================================================================================================================================================200
 // Point lookup in the flattened B+ tree. Each block (bid) resolves the query
 // key keysD[bid]; each thread (thid) inspects one key slot of the block's
 // current knode. The tree is descended for `height` levels, after which the
 // thread whose slot equals the query key copies the matching record's value
 // into ansD[bid].
 // The code indexes keys[thid + 1], so the launch is assumed to use no more
 // threads per block than the key array can serve -- TODO confirm in the host
 // wrapper.
 __global__ void
 findK(	long height,
 		knode *knodesD,
 		long knodes_elem,
 		record *recordsD,
 		long *currKnodeD,
 		long *offsetD,
 		int *keysD,
 		record *ansD)
 {
 	// private thread IDs
 	const int thid = threadIdx.x;
 	const int bid = blockIdx.x;
 	// process tree levels, one per pass
 	for(int level = 0; level < height; level++){
 		// currKnodeD[bid] is only written between the two barriers below,
 		// so it is stable while the keys are inspected here.
 		const knode *cur = &knodesD[currKnodeD[bid]];
 		// The query key lies between this slot's key and the next one, and
 		// the child index stays inside the knode array. The range test was
 		// added to avoid a crash caused by a bug in the original code:
 		// indices stored in knodes by the main function can point outside
 		// the knode array they address.
 		// NOTE(review): the child index is fetched through offsetD[bid]
 		// here, while findRangeK uses currKnodeD[bid] -- confirm intended.
 		if(cur->keys[thid] <= keysD[bid]
 			&& cur->keys[thid+1] > keysD[bid]
 			&& knodesD[offsetD[bid]].indices[thid] < knodes_elem){
 			offsetD[bid] = knodesD[offsetD[bid]].indices[thid];
 		}
 		__syncthreads();
 		// set for next tree level
 		if(thid==0){
 			currKnodeD[bid] = offsetD[bid];
 		}
 		__syncthreads();
 	}
 	// At this point, we have a candidate leaf node which may contain
 	// the target record.  Check each key to hopefully find the record
 	const knode *leaf = &knodesD[currKnodeD[bid]];
 	if(leaf->keys[thid] == keysD[bid]){
 		ansD[bid].value = recordsD[leaf->indices[thid]].value;
 	}
 }
 //========================================================================================================================================================================================================200
 //	End
 //========================================================================================================================================================================================================200
--- a/examples/btree/kernel/kernel_gpu_cuda_2.cu
+++ b/examples/btree/kernel/kernel_gpu_cuda_2.cu
@ -1,70 +0,0 @@
 //========================================================================================================================================================================================================200
 //	findRangeK function
 //========================================================================================================================================================================================================200
 // Range lookup in the flattened B+ tree. Each block (bid) resolves the range
 // [startD[bid], endD[bid]] by descending the tree twice in lock-step: once
 // for the start key (currKnodeD/offsetD) and once for the end key
 // (lastKnodeD/offset_2D). Each thread (thid) inspects one key slot. After
 // `height` levels, RecstartD[bid] receives the index stored for the start
 // key and ReclenD[bid] the inclusive length up to the end key's index.
 __global__ void
 findRangeK(	long height,
 			knode *knodesD,
 			long knodes_elem,
 			long *currKnodeD,
 			long *offsetD,
 			long *lastKnodeD,
 			long *offset_2D,
 			int *startD,
 			int *endD,
 			int *RecstartD,
 			int *ReclenD)
 {
 	// private thread IDs
 	const int thid = threadIdx.x;
 	const int bid = blockIdx.x;
 	// descend one tree level per pass, for both ends of the range
 	for(int level = 0; level < height; level++){
 		// Both cursors are only written between the two barriers below, so
 		// they are stable while the keys are inspected here.
 		const knode *lo = &knodesD[currKnodeD[bid]];
 		const knode *hi = &knodesD[lastKnodeD[bid]];
 		// Start key falls into this slot, and the child index stays inside
 		// the knode array. The range test was added to avoid a crash caused
 		// by a bug in the original code: indices stored in knodes by the
 		// main function can point outside the knode array they address.
 		if(lo->keys[thid] <= startD[bid]
 			&& lo->keys[thid+1] > startD[bid]
 			&& lo->indices[thid] < knodes_elem){
 			offsetD[bid] = lo->indices[thid];
 		}
 		// Same test for the end key, with the same out-of-range guard.
 		if(hi->keys[thid] <= endD[bid]
 			&& hi->keys[thid+1] > endD[bid]
 			&& hi->indices[thid] < knodes_elem){
 			offset_2D[bid] = hi->indices[thid];
 		}
 		__syncthreads();
 		// set for next tree level
 		if(thid==0){
 			currKnodeD[bid] = offsetD[bid];
 			lastKnodeD[bid] = offset_2D[bid];
 		}
 		__syncthreads();
 	}
 	// Find the index of the starting record
 	const knode *first = &knodesD[currKnodeD[bid]];
 	if(first->keys[thid] == startD[bid]){
 		RecstartD[bid] = first->indices[thid];
 	}
 	// RecstartD[bid] must be visible before the length is derived from it.
 	__syncthreads();
 	// Find the index of the ending record
 	const knode *last = &knodesD[lastKnodeD[bid]];
 	if(last->keys[thid] == endD[bid]){
 		ReclenD[bid] = last->indices[thid] - RecstartD[bid]+1;
 	}
 }
 //========================================================================================================================================================================================================200
 //	End
 //========================================================================================================================================================================================================200
--- a/examples/btree/kernel/kernel_gpu_cuda_wrapper.cu
+++ b/examples/btree/kernel/kernel_gpu_cuda_wrapper.cu
@ -1,292 +0,0 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
 //========================================================================================================================================================================================================200
 //	DEFINE/INCLUDE
 //========================================================================================================================================================================================================200
 //======================================================================================================================================================150
 //	COMMON
 //======================================================================================================================================================150
 #include "../common.h"								// (in main program directory)			needed to recognized input variables
 //======================================================================================================================================================150
 //	UTILITIES
 //======================================================================================================================================================150
 #include "../util/cuda/cuda.h"					// (in path specified to compiler)	needed by for device functions
 #include "../util/timer/timer.h"					// (in path specified to compiler)	needed by timer
 //======================================================================================================================================================150
 //	KERNEL
 //======================================================================================================================================================150
 #include "./kernel_gpu_cuda.cu"						// (in current directory)	GPU kernel, cannot include with header file because of complications with passing of constant memory variables
 //======================================================================================================================================================150
 //	HEADER
 //======================================================================================================================================================150
 #include "./kernel_gpu_cuda_wrapper.h"				// (in current directory)
 //========================================================================================================================================================================================================200
 //	KERNEL_GPU_CUDA_WRAPPER FUNCTION
 //========================================================================================================================================================================================================200
 // Host-side driver for the findK kernel.
 //
 // Stages (each one is timed with get_time() and reported on stdout):
 //   1. allocate device buffers for records, knodes, currKnode, offset, keys and ans
 //   2. copy all six host buffers to the device
 //   3. launch findK with `count` blocks of min(order, 1024) threads
 //   4. copy ans back to the host
 //   5. free every device buffer
 //
 // Parameters:
 //   records, records_mem   host record array and its size in bytes
 //   knodes, knodes_mem     host knode array and its size in bytes
 //   knodes_elem            forwarded unchanged to findK
 //   order                  requested threads per block (capped at 1024)
 //   maxheight              forwarded unchanged to findK
 //   count                  grid size; also the element count of currKnode,
 //                          offset, keys and ans
 //   currKnode, offset      host arrays of `count` longs, copied to the device
 //   keys                   host array of `count` ints, copied to the device
 //   ans                    in/out host array of `count` records; overwritten with
 //                          the device contents once the kernel has finished
 //
 // checkCUDAError() is called after every CUDA runtime call with a label naming
 // the operation and buffer that was just touched.
 void
 kernel_gpu_cuda_wrapper(record *records,
 						long records_mem,
 						knode *knodes,
 						long knodes_elem,
 						long knodes_mem,
 						int order,
 						long maxheight,
 						int count,
 						long *currKnode,
 						long *offset,
 						int *keys,
 						record *ans)
 {
 	//======================================================================================================================================================150
 	//	CPU VARIABLES
 	//======================================================================================================================================================150
 	// time stamps taken at the boundary of every stage
 	long long time0;
 	long long time1;
 	long long time2;
 	long long time3;
 	long long time4;
 	long long time5;
 	long long time6;
 	time0 = get_time();
 	//======================================================================================================================================================150
 	//	GPU SETUP
 	//======================================================================================================================================================150
 	// first runtime call, so the driver start-up cost lands in the first timed stage
 	// NOTE(review): cudaThreadSynchronize() is deprecated in the CUDA runtime in favour of
 	// cudaDeviceSynchronize(); kept as-is because the runtime this example links against is not visible here
 	cudaThreadSynchronize();
 	// execution parameters: one block per entry, at most 1024 threads per block
 	int numBlocks = count;								// max # of blocks can be 65,535
 	int threadsPerBlock = order < 1024 ? order : 1024;
 	printf("# of blocks = %d, # of threads/block = %d (ensure that device can handle)\n", numBlocks, threadsPerBlock);
 	time1 = get_time();
 	//======================================================================================================================================================150
 	//	GPU MEMORY				(MALLOC)
 	//======================================================================================================================================================150
 	// device inputs
 	record *recordsD;
 	cudaMalloc((void**)&recordsD, records_mem);
 	checkCUDAError("cudaMalloc  recordsD");
 	knode *knodesD;
 	cudaMalloc((void**)&knodesD, knodes_mem);
 	checkCUDAError("cudaMalloc  knodesD");				// label used to say recordsD
 	long *currKnodeD;
 	cudaMalloc((void**)&currKnodeD, count*sizeof(long));
 	checkCUDAError("cudaMalloc  currKnodeD");
 	long *offsetD;
 	cudaMalloc((void**)&offsetD, count*sizeof(long));
 	checkCUDAError("cudaMalloc  offsetD");
 	int *keysD;
 	cudaMalloc((void**)&keysD, count*sizeof(int));
 	checkCUDAError("cudaMalloc  keysD");
 	// device input/output
 	record *ansD;
 	cudaMalloc((void**)&ansD, count*sizeof(record));
 	checkCUDAError("cudaMalloc ansD");
 	time2 = get_time();
 	//======================================================================================================================================================150
 	//	GPU MEMORY			COPY IN
 	//======================================================================================================================================================150
 	cudaMemcpy(recordsD, records, records_mem, cudaMemcpyHostToDevice);
 	checkCUDAError("cudaMalloc cudaMemcpy recordsD");	// label used to say memD
 	cudaMemcpy(knodesD, knodes, knodes_mem, cudaMemcpyHostToDevice);
 	checkCUDAError("cudaMalloc cudaMemcpy knodesD");	// label used to say memD
 	cudaMemcpy(currKnodeD, currKnode, count*sizeof(long), cudaMemcpyHostToDevice);
 	checkCUDAError("cudaMalloc cudaMemcpy currKnodeD");
 	cudaMemcpy(offsetD, offset, count*sizeof(long), cudaMemcpyHostToDevice);
 	checkCUDAError("cudaMalloc cudaMemcpy offsetD");
 	cudaMemcpy(keysD, keys, count*sizeof(int), cudaMemcpyHostToDevice);
 	checkCUDAError("cudaMalloc cudaMemcpy keysD");
 	// ans is in/out, so its current host contents are uploaded as well
 	cudaMemcpy(ansD, ans, count*sizeof(record), cudaMemcpyHostToDevice);
 	checkCUDAError("cudaMalloc cudaMemcpy ansD");
 	time3 = get_time();
 	//======================================================================================================================================================150
 	// findK kernel
 	//======================================================================================================================================================150
 	findK<<<numBlocks, threadsPerBlock>>>(	maxheight,
 											knodesD,
 											knodes_elem,
 											recordsD,
 											currKnodeD,
 											offsetD,
 											keysD,
 											ansD);
 	// wait for the kernel so that time4 covers its execution and errors surface here
 	cudaThreadSynchronize();
 	checkCUDAError("findK");
 	time4 = get_time();
 	//======================================================================================================================================================150
 	//	GPU MEMORY			COPY OUT
 	//======================================================================================================================================================150
 	cudaMemcpy(ans, ansD, count*sizeof(record), cudaMemcpyDeviceToHost);
 	checkCUDAError("cudaMemcpy ansD");
 	time5 = get_time();
 	//======================================================================================================================================================150
 	//	GPU MEMORY DEALLOCATION
 	//======================================================================================================================================================150
 	cudaFree(recordsD);
 	cudaFree(knodesD);
 	cudaFree(currKnodeD);
 	cudaFree(offsetD);
 	cudaFree(keysD);
 	cudaFree(ansD);
 	time6 = get_time();
 	//======================================================================================================================================================150
 	//	DISPLAY TIMING
 	//======================================================================================================================================================150
 	// total elapsed time, used as the denominator of every percentage below
 	float total = (float) (time6-time0);
 	// a literal percent sign must be written "%%"; the previous "% :" was an invalid conversion specification
 	printf("Time spent in different stages of GPU_CUDA KERNEL:\n");
 	printf("%15.12f s, %15.12f %% : GPU: SET DEVICE / DRIVER INIT\n",	(float) (time1-time0) / 1000000, (float) (time1-time0) / total * 100);
 	printf("%15.12f s, %15.12f %% : GPU MEM: ALO\n", 					(float) (time2-time1) / 1000000, (float) (time2-time1) / total * 100);
 	printf("%15.12f s, %15.12f %% : GPU MEM: COPY IN\n",					(float) (time3-time2) / 1000000, (float) (time3-time2) / total * 100);
 	printf("%15.12f s, %15.12f %% : GPU: KERNEL\n",						(float) (time4-time3) / 1000000, (float) (time4-time3) / total * 100);
 	printf("%15.12f s, %15.12f %% : GPU MEM: COPY OUT\n",				(float) (time5-time4) / 1000000, (float) (time5-time4) / total * 100);
 	printf("%15.12f s, %15.12f %% : GPU MEM: FRE\n", 					(float) (time6-time5) / 1000000, (float) (time6-time5) / total * 100);
 	printf("Total time:\n");
 	printf("%.12f s\n", 												total / 1000000);
 }
 //========================================================================================================================================================================================================200
 //	END
 //========================================================================================================================================================================================================200
 #ifdef __cplusplus
 }
 #endif
--- a/examples/btree/kernel/kernel_gpu_cuda_wrapper.h
+++ b/examples/btree/kernel/kernel_gpu_cuda_wrapper.h
@ -1,23 +0,0 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
 //========================================================================================================================================================================================================200
 //	KERNEL_GPU_CUDA_WRAPPER HEADER
 //========================================================================================================================================================================================================200
 // Host-side driver for the findK kernel: allocates device buffers, uploads
 // records, knodes, currKnode, offset, keys and ans, launches findK with
 // `count` blocks of min(order, 1024) threads, downloads ans, frees the device
 // buffers and prints per-stage timings.
 //   records_mem / knodes_mem  sizes in bytes of the records / knodes arrays
 //   knodes_elem, maxheight    forwarded unchanged to the kernel
 //   count                     grid size and element count of currKnode, offset, keys and ans
 //   ans                       in/out: uploaded before the launch, overwritten afterwards
 void kernel_gpu_cuda_wrapper(record *records, long records_mem, knode *knodes,
                             long knodes_elem, long knodes_mem,
                             int order, long maxheight, int count,
                             long *currKnode, long *offset, int *keys,
                             record *ans);
 //========================================================================================================================================================================================================200
 //	End
 //========================================================================================================================================================================================================200
 #ifdef __cplusplus
 }
 #endif
--- a/examples/btree/kernel/kernel_gpu_cuda_wrapper_2.cu
+++ b/examples/btree/kernel/kernel_gpu_cuda_wrapper_2.cu
@ -1,347 +0,0 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
 //========================================================================================================================================================================================================200
 //	INCLUDE
 //========================================================================================================================================================================================================200
 //======================================================================================================================================================150
 //	COMMON
 //======================================================================================================================================================150
 #include "../common.h"									// (in the main program folder)	needed to recognized input parameters
 //======================================================================================================================================================150
 //	UTILITIES
 //======================================================================================================================================================150
 #include "../util/cuda/cuda.h"							// (in library path specified to compiler)	needed by for device functions
 #include "../util/timer/timer.h"						// (in library path specified to compiler)	needed by timer
 //======================================================================================================================================================150
 //	KERNEL
 //======================================================================================================================================================150
 #include "./kernel_gpu_cuda_2.cu"						// (in the current directory)	GPU kernel, cannot include with header file because of complications with passing of constant memory variables
 //======================================================================================================================================================150
 //	HEADER
 //======================================================================================================================================================150
 #include "./kernel_gpu_cuda_wrapper_2.h"				// (in the current directory)
 //========================================================================================================================================================================================================200
 //	FUNCTION
 //========================================================================================================================================================================================================200
 // Host-side driver for the findRangeK kernel.
 //
 // Stages (each one is timed with get_time() and reported on stdout):
 //   1. allocate device buffers for knodes, currKnode, offset, lastKnode,
 //      offset_2, start, end, recstart and reclength
 //   2. copy all nine host buffers to the device
 //   3. launch findRangeK with `count` blocks of min(order, 1024) threads
 //   4. copy recstart and reclength back to the host
 //   5. free every device buffer
 //
 // Parameters:
 //   knodes, knodes_mem       host knode array and its size in bytes
 //   knodes_elem              forwarded unchanged to findRangeK
 //   order                    requested threads per block (capped at 1024)
 //   maxheight                forwarded unchanged to findRangeK
 //   count                    grid size; also the element count of every
 //                            per-block array below
 //   currKnode, offset        host arrays of `count` longs, copied to the device
 //   lastKnode, offset_2      host arrays of `count` longs, copied to the device
 //   start, end               host arrays of `count` ints, copied to the device
 //   recstart, reclength      in/out host arrays of `count` ints; overwritten with
 //                            the device contents once the kernel has finished
 //
 // checkCUDAError() is called after every CUDA runtime call with a label naming
 // the operation and buffer that was just touched.
 void
 kernel_gpu_cuda_wrapper_2(	knode *knodes,
 							long knodes_elem,
 							long knodes_mem,
 							int order,
 							long maxheight,
 							int count,
 							long *currKnode,
 							long *offset,
 							long *lastKnode,
 							long *offset_2,
 							int *start,
 							int *end,
 							int *recstart,
 							int *reclength)
 {
 	//======================================================================================================================================================150
 	//	CPU VARIABLES
 	//======================================================================================================================================================150
 	// time stamps taken at the boundary of every stage
 	long long time0;
 	long long time1;
 	long long time2;
 	long long time3;
 	long long time4;
 	long long time5;
 	long long time6;
 	time0 = get_time();
 	//======================================================================================================================================================150
 	//	GPU SETUP
 	//======================================================================================================================================================150
 	// first runtime call, so the driver start-up cost lands in the first timed stage
 	// NOTE(review): cudaThreadSynchronize() is deprecated in the CUDA runtime in favour of
 	// cudaDeviceSynchronize(); kept as-is because the runtime this example links against is not visible here
 	cudaThreadSynchronize();
 	// execution parameters: one block per entry, at most 1024 threads per block
 	int numBlocks = count;
 	int threadsPerBlock = order < 1024 ? order : 1024;
 	printf("# of blocks = %d, # of threads/block = %d (ensure that device can handle)\n", numBlocks, threadsPerBlock);
 	time1 = get_time();
 	//======================================================================================================================================================150
 	//	GPU MEMORY				MALLOC
 	//======================================================================================================================================================150
 	// device inputs
 	knode *knodesD;
 	cudaMalloc((void**)&knodesD, knodes_mem);
 	checkCUDAError("cudaMalloc  knodesD");				// label used to say recordsD
 	long *currKnodeD;
 	cudaMalloc((void**)&currKnodeD, count*sizeof(long));
 	checkCUDAError("cudaMalloc  currKnodeD");
 	long *offsetD;
 	cudaMalloc((void**)&offsetD, count*sizeof(long));
 	checkCUDAError("cudaMalloc  offsetD");
 	long *lastKnodeD;
 	cudaMalloc((void**)&lastKnodeD, count*sizeof(long));
 	checkCUDAError("cudaMalloc  lastKnodeD");
 	long *offset_2D;
 	cudaMalloc((void**)&offset_2D, count*sizeof(long));
 	checkCUDAError("cudaMalloc  offset_2D");
 	int *startD;
 	cudaMalloc((void**)&startD, count*sizeof(int));
 	checkCUDAError("cudaMalloc startD");
 	int *endD;
 	cudaMalloc((void**)&endD, count*sizeof(int));
 	checkCUDAError("cudaMalloc endD");
 	// device input/output
 	int *ansDStart;
 	cudaMalloc((void**)&ansDStart, count*sizeof(int));
 	checkCUDAError("cudaMalloc ansDStart");
 	int *ansDLength;
 	cudaMalloc((void**)&ansDLength, count*sizeof(int));
 	checkCUDAError("cudaMalloc ansDLength");
 	time2 = get_time();
 	//======================================================================================================================================================150
 	//	GPU MEMORY			COPY IN
 	//======================================================================================================================================================150
 	cudaMemcpy(knodesD, knodes, knodes_mem, cudaMemcpyHostToDevice);
 	checkCUDAError("cudaMalloc cudaMemcpy knodesD");	// label used to say memD
 	cudaMemcpy(currKnodeD, currKnode, count*sizeof(long), cudaMemcpyHostToDevice);
 	checkCUDAError("cudaMalloc cudaMemcpy currKnodeD");
 	cudaMemcpy(offsetD, offset, count*sizeof(long), cudaMemcpyHostToDevice);
 	checkCUDAError("cudaMalloc cudaMemcpy offsetD");
 	cudaMemcpy(lastKnodeD, lastKnode, count*sizeof(long), cudaMemcpyHostToDevice);
 	checkCUDAError("cudaMalloc cudaMemcpy lastKnodeD");
 	cudaMemcpy(offset_2D, offset_2, count*sizeof(long), cudaMemcpyHostToDevice);
 	checkCUDAError("cudaMalloc cudaMemcpy offset_2D");
 	cudaMemcpy(startD, start, count*sizeof(int), cudaMemcpyHostToDevice);
 	checkCUDAError("cudaMemcpy startD");
 	cudaMemcpy(endD, end, count*sizeof(int), cudaMemcpyHostToDevice);
 	checkCUDAError("cudaMemcpy endD");
 	// recstart / reclength are in/out, so their current host contents are uploaded as well
 	cudaMemcpy(ansDStart, recstart, count*sizeof(int), cudaMemcpyHostToDevice);
 	checkCUDAError("cudaMemcpy ansDStart");
 	cudaMemcpy(ansDLength, reclength, count*sizeof(int), cudaMemcpyHostToDevice);
 	checkCUDAError("cudaMemcpy ansDLength");
 	time3 = get_time();
 	//======================================================================================================================================================150
 	//	KERNEL
 	//======================================================================================================================================================150
 	// [GPU] findRangeK kernel
 	findRangeK<<<numBlocks, threadsPerBlock>>>(	maxheight,
 												knodesD,
 												knodes_elem,
 												currKnodeD,
 												offsetD,
 												lastKnodeD,
 												offset_2D,
 												startD,
 												endD,
 												ansDStart,
 												ansDLength);
 	// wait for the kernel so that time4 covers its execution and errors surface here
 	cudaThreadSynchronize();
 	checkCUDAError("findRangeK");
 	time4 = get_time();
 	//======================================================================================================================================================150
 	//	GPU MEMORY			COPY OUT
 	//======================================================================================================================================================150
 	cudaMemcpy(recstart, ansDStart, count*sizeof(int), cudaMemcpyDeviceToHost);
 	checkCUDAError("cudaMemcpy ansDStart");
 	cudaMemcpy(reclength, ansDLength, count*sizeof(int), cudaMemcpyDeviceToHost);
 	checkCUDAError("cudaMemcpy ansDLength");
 	time5 = get_time();
 	//======================================================================================================================================================150
 	//	GPU MEMORY DEALLOCATION
 	//======================================================================================================================================================150
 	cudaFree(knodesD);
 	cudaFree(currKnodeD);
 	cudaFree(offsetD);
 	cudaFree(lastKnodeD);
 	cudaFree(offset_2D);
 	cudaFree(startD);
 	cudaFree(endD);
 	cudaFree(ansDStart);
 	cudaFree(ansDLength);
 	time6 = get_time();
 	//======================================================================================================================================================150
 	//	DISPLAY TIMING
 	//======================================================================================================================================================150
 	// total elapsed time, used as the denominator of every percentage below
 	float total = (float) (time6-time0);
 	// a literal percent sign must be written "%%"; the previous "% :" was an invalid conversion specification
 	printf("Time spent in different stages of GPU_CUDA KERNEL:\n");
 	printf("%15.12f s, %15.12f %% : GPU: SET DEVICE / DRIVER INIT\n",	(float) (time1-time0) / 1000000, (float) (time1-time0) / total * 100);
 	printf("%15.12f s, %15.12f %% : GPU MEM: ALO\n", 					(float) (time2-time1) / 1000000, (float) (time2-time1) / total * 100);
 	printf("%15.12f s, %15.12f %% : GPU MEM: COPY IN\n",					(float) (time3-time2) / 1000000, (float) (time3-time2) / total * 100);
 	printf("%15.12f s, %15.12f %% : GPU: KERNEL\n",						(float) (time4-time3) / 1000000, (float) (time4-time3) / total * 100);
 	printf("%15.12f s, %15.12f %% : GPU MEM: COPY OUT\n",				(float) (time5-time4) / 1000000, (float) (time5-time4) / total * 100);
 	printf("%15.12f s, %15.12f %% : GPU MEM: FRE\n", 					(float) (time6-time5) / 1000000, (float) (time6-time5) / total * 100);
 	printf("Total time:\n");
 	printf("%.12f s\n", 												total / 1000000);
 }
 //========================================================================================================================================================================================================200
 //	END
 //========================================================================================================================================================================================================200
 #ifdef __cplusplus
 }
 #endif
--- a/examples/btree/kernel/kernel_gpu_cuda_wrapper_2.h
+++ b/examples/btree/kernel/kernel_gpu_cuda_wrapper_2.h
@ -1,23 +0,0 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
 //========================================================================================================================================================================================================200
 //	KERNEL_GPU_CUDA_WRAPPER_2 HEADER
 //========================================================================================================================================================================================================200
 // Host-side driver for the findRangeK kernel: allocates device buffers,
 // uploads knodes and the per-block arrays, launches findRangeK with `count`
 // blocks of min(order, 1024) threads, downloads recstart and reclength, frees
 // the device buffers and prints per-stage timings.
 //   knodes_mem              size in bytes of the knodes array
 //   knodes_elem, maxheight  forwarded unchanged to the kernel
 //   count                   grid size and element count of every per-block array
 //   recstart, reclength     in/out: uploaded before the launch, overwritten afterwards
 void kernel_gpu_cuda_wrapper_2(knode *knodes, long knodes_elem, long knodes_mem,
                               int order, long maxheight, int count,
                               long *currKnode, long *offset, long *lastKnode,
                               long *offset_2, int *start, int *end,
                               int *recstart, int *reclength);
 //========================================================================================================================================================================================================200
 //	End
 //========================================================================================================================================================================================================200
 #ifdef __cplusplus
 }
 #endif
--- a/examples/btree/kernel_gpu_cuda_wrapper-cuda-nvptx64-nvidia-cuda-sm_61.ll
+++ b/examples/btree/kernel_gpu_cuda_wrapper-cuda-nvptx64-nvidia-cuda-sm_61.ll
@ -1,332 +0,0 @@
 ; Compiler-emitted LLVM IR for the device side (nvptx64-nvidia-cuda triple)
 ; of kernel/kernel_gpu_cuda_wrapper.cu. The text below is generated output;
 ; regenerate it from the .cu source instead of editing it by hand.
 ; ModuleID = 'kernel_gpu_cuda_wrapper-cuda-nvptx64-nvidia-cuda-sm_61.bc'
 source_filename = "kernel/kernel_gpu_cuda_wrapper.cu"
 target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"
 ; One-byte placeholder types backing the threadIdx / blockIdx globals below.
 %struct.__cuda_builtin_threadIdx_t = type { i8 }
 %struct.__cuda_builtin_blockIdx_t = type { i8 }
 %struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
 ; knode layout: in @findK, field 1 (first [257 x i32]) is accessed as
 ; "indices" and field 2 (second [257 x i32]) as "keys".
 %struct.knode = type { i32, [257 x i32], [257 x i32], i8, i32 }
 ; record wraps a single i32, accessed as "value" in @findK.
 %struct.record = type { i32 }
 ; COMDAT groups for the two linkonce_odr builtin fetch helpers.
 $_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
 $_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
 ; Weak-linkage stand-in for cudaMalloc inside the device module: both
 ; arguments are spilled to stack slots and the constant 999 is returned.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
 entry:
  ; slot + spill for %p
  %p.slot = alloca i8**, align 8
  store i8** %p, i8*** %p.slot, align 8
  ; slot + spill for %s
  %s.slot = alloca i64, align 8
  store i64 %s, i64* %s.slot, align 8
  ret i32 999
 }
 ; Weak-linkage stand-in for cudaFuncGetAttributes inside the device module:
 ; both arguments are spilled to stack slots and the constant 999 is returned.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
 entry:
  ; slot + spill for %p
  %p.slot = alloca %struct.cudaFuncAttributes*, align 8
  store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.slot, align 8
  ; slot + spill for %c
  %c.slot = alloca i8*, align 8
  store i8* %c, i8** %c.slot, align 8
  ret i32 999
 }
 ; Weak-linkage stand-in for cudaDeviceGetAttribute inside the device module:
 ; the three arguments are spilled to stack slots and 999 is returned.
 ; %value is never written through.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
 entry:
  ; slot + spill for %value
  %value.slot = alloca i32*, align 8
  store i32* %value, i32** %value.slot, align 8
  ; slot + spill for %attr
  %attr.slot = alloca i32, align 4
  store i32 %attr, i32* %attr.slot, align 4
  ; slot + spill for %device
  %device.slot = alloca i32, align 4
  store i32 %device, i32* %device.slot, align 4
  ret i32 999
 }
 ; Weak-linkage stand-in for cudaGetDevice inside the device module: the
 ; pointer argument is spilled to a stack slot (never written through) and
 ; the constant 999 is returned.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
 entry:
  ; slot + spill for %device
  %device.slot = alloca i32*, align 8
  store i32* %device, i32** %device.slot, align 8
  ret i32 999
 }
 ; Weak-linkage stand-in for cudaOccupancyMaxActiveBlocksPerMultiprocessor
 ; inside the device module: the four arguments are spilled to stack slots
 ; and 999 is returned. %numBlocks is never written through.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
 entry:
  ; slot + spill for %numBlocks
  %numBlocks.slot = alloca i32*, align 8
  store i32* %numBlocks, i32** %numBlocks.slot, align 8
  ; slot + spill for %func
  %func.slot = alloca i8*, align 8
  store i8* %func, i8** %func.slot, align 8
  ; slot + spill for %blockSize
  %blockSize.slot = alloca i32, align 4
  store i32 %blockSize, i32* %blockSize.slot, align 4
  ; slot + spill for %dynamicSmemSize
  %dynamicSmemSize.slot = alloca i64, align 8
  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.slot, align 8
  ret i32 999
 }
 ; Weak-linkage stand-in for
 ; cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags inside the device
 ; module: the five arguments are spilled to stack slots and 999 is returned.
 ; %numBlocks is never written through.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
 entry:
  ; slot + spill for %numBlocks
  %numBlocks.slot = alloca i32*, align 8
  store i32* %numBlocks, i32** %numBlocks.slot, align 8
  ; slot + spill for %func
  %func.slot = alloca i8*, align 8
  store i8* %func, i8** %func.slot, align 8
  ; slot + spill for %blockSize
  %blockSize.slot = alloca i32, align 4
  store i32 %blockSize, i32* %blockSize.slot, align 4
  ; slot + spill for %dynamicSmemSize
  %dynamicSmemSize.slot = alloca i64, align 8
  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.slot, align 8
  ; slot + spill for %flags
  %flags.slot = alloca i32, align 4
  store i32 %flags, i32* %flags.slot, align 4
  ret i32 999
 }
 ; findK -- registered as a kernel entry point by the !nvvm.annotations
 ; metadata of this module. Unoptimized (optnone) IR: every argument and local
 ; lives in a stack slot and is reloaded before each use.
 ;
 ; Logic as expressed by the instructions below (thid = tid.x, bid = ctaid.x):
 ;   for (i = 0; i < height; i++) {
 ;     if (knodesD[currKnodeD[bid]].keys[thid]   <= keysD[bid] &&
 ;         knodesD[currKnodeD[bid]].keys[thid+1] >  keysD[bid]) {
 ;       if (knodesD[offsetD[bid]].indices[thid] < knodes_elem)
 ;         offsetD[bid] = knodesD[offsetD[bid]].indices[thid];
 ;     }
 ;     barrier0;
 ;     if (thid == 0) currKnodeD[bid] = offsetD[bid];
 ;     barrier0;
 ;   }
 ;   if (knodesD[currKnodeD[bid]].keys[thid] == keysD[bid])
 ;     ansD[bid].value = recordsD[knodesD[currKnodeD[bid]].indices[thid]].value;
 ;
 ; NOTE(review): keys[thid+1] is read from a 257-element array, so this
 ; presumably expects at most 256 threads per block -- confirm against the
 ; launch configuration in the host wrapper.
 ; Function Attrs: convergent noinline nounwind optnone
 define dso_local void @findK(i64 %height, %struct.knode* %knodesD, i64 %knodes_elem, %struct.record* %recordsD, i64* %currKnodeD, i64* %offsetD, i32* %keysD, %struct.record* %ansD) #0 {
 entry:
  ; Stack slots for the eight arguments and the three locals (thid, bid, i).
  %height.addr = alloca i64, align 8
  %knodesD.addr = alloca %struct.knode*, align 8
  %knodes_elem.addr = alloca i64, align 8
  %recordsD.addr = alloca %struct.record*, align 8
  %currKnodeD.addr = alloca i64*, align 8
  %offsetD.addr = alloca i64*, align 8
  %keysD.addr = alloca i32*, align 8
  %ansD.addr = alloca %struct.record*, align 8
  %thid = alloca i32, align 4
  %bid = alloca i32, align 4
  %i = alloca i32, align 4
  store i64 %height, i64* %height.addr, align 8
  store %struct.knode* %knodesD, %struct.knode** %knodesD.addr, align 8
  store i64 %knodes_elem, i64* %knodes_elem.addr, align 8
  store %struct.record* %recordsD, %struct.record** %recordsD.addr, align 8
  store i64* %currKnodeD, i64** %currKnodeD.addr, align 8
  store i64* %offsetD, i64** %offsetD.addr, align 8
  store i32* %keysD, i32** %keysD.addr, align 8
  store %struct.record* %ansD, %struct.record** %ansD.addr, align 8
  ; thid = tid.x, bid = ctaid.x (via the builtin fetch helpers); i = 0.
  %call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
  store i32 %call, i32* %thid, align 4
  %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
  store i32 %call1, i32* %bid, align 4
  store i32 0, i32* %i, align 4
  br label %for.cond
 ; Loop test: continue while (i64)i < height.
 for.cond:                                         ; preds = %for.inc, %entry
  %0 = load i32, i32* %i, align 4
  %conv = sext i32 %0 to i64
  %1 = load i64, i64* %height.addr, align 8
  %cmp = icmp slt i64 %conv, %1
  br i1 %cmp, label %for.body, label %for.end
 ; First half of the range test: knodesD[currKnodeD[bid]].keys[thid] <= keysD[bid].
 for.body:                                         ; preds = %for.cond
  %2 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
  %3 = load i64*, i64** %currKnodeD.addr, align 8
  %4 = load i32, i32* %bid, align 4
  %idxprom = sext i32 %4 to i64
  %arrayidx = getelementptr inbounds i64, i64* %3, i64 %idxprom
  %5 = load i64, i64* %arrayidx, align 8
  %arrayidx2 = getelementptr inbounds %struct.knode, %struct.knode* %2, i64 %5
  %keys = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx2, i32 0, i32 2
  %6 = load i32, i32* %thid, align 4
  %idxprom3 = sext i32 %6 to i64
  %arrayidx4 = getelementptr inbounds [257 x i32], [257 x i32]* %keys, i64 0, i64 %idxprom3
  %7 = load i32, i32* %arrayidx4, align 4
  %8 = load i32*, i32** %keysD.addr, align 8
  %9 = load i32, i32* %bid, align 4
  %idxprom5 = sext i32 %9 to i64
  %arrayidx6 = getelementptr inbounds i32, i32* %8, i64 %idxprom5
  %10 = load i32, i32* %arrayidx6, align 4
  %cmp7 = icmp sle i32 %7, %10
  br i1 %cmp7, label %land.lhs.true, label %if.end34
 ; Second half of the range test: knodesD[currKnodeD[bid]].keys[thid+1] > keysD[bid].
 land.lhs.true:                                    ; preds = %for.body
  %11 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
  %12 = load i64*, i64** %currKnodeD.addr, align 8
  %13 = load i32, i32* %bid, align 4
  %idxprom8 = sext i32 %13 to i64
  %arrayidx9 = getelementptr inbounds i64, i64* %12, i64 %idxprom8
  %14 = load i64, i64* %arrayidx9, align 8
  %arrayidx10 = getelementptr inbounds %struct.knode, %struct.knode* %11, i64 %14
  %keys11 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx10, i32 0, i32 2
  %15 = load i32, i32* %thid, align 4
  %add = add nsw i32 %15, 1
  %idxprom12 = sext i32 %add to i64
  %arrayidx13 = getelementptr inbounds [257 x i32], [257 x i32]* %keys11, i64 0, i64 %idxprom12
  %16 = load i32, i32* %arrayidx13, align 4
  %17 = load i32*, i32** %keysD.addr, align 8
  %18 = load i32, i32* %bid, align 4
  %idxprom14 = sext i32 %18 to i64
  %arrayidx15 = getelementptr inbounds i32, i32* %17, i64 %idxprom14
  %19 = load i32, i32* %arrayidx15, align 4
  %cmp16 = icmp sgt i32 %16, %19
  br i1 %cmp16, label %if.then, label %if.end34
 ; Bounds guard: knodesD[offsetD[bid]].indices[thid] < knodes_elem.
 ; Note the node is selected through offsetD here, not currKnodeD.
 if.then:                                          ; preds = %land.lhs.true
  %20 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
  %21 = load i64*, i64** %offsetD.addr, align 8
  %22 = load i32, i32* %bid, align 4
  %idxprom17 = sext i32 %22 to i64
  %arrayidx18 = getelementptr inbounds i64, i64* %21, i64 %idxprom17
  %23 = load i64, i64* %arrayidx18, align 8
  %arrayidx19 = getelementptr inbounds %struct.knode, %struct.knode* %20, i64 %23
  %indices = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx19, i32 0, i32 1
  %24 = load i32, i32* %thid, align 4
  %idxprom20 = sext i32 %24 to i64
  %arrayidx21 = getelementptr inbounds [257 x i32], [257 x i32]* %indices, i64 0, i64 %idxprom20
  %25 = load i32, i32* %arrayidx21, align 4
  %conv22 = sext i32 %25 to i64
  %26 = load i64, i64* %knodes_elem.addr, align 8
  %cmp23 = icmp slt i64 %conv22, %26
  br i1 %cmp23, label %if.then24, label %if.end
 ; offsetD[bid] = (i64) knodesD[offsetD[bid]].indices[thid].
 if.then24:                                        ; preds = %if.then
  %27 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
  %28 = load i64*, i64** %offsetD.addr, align 8
  %29 = load i32, i32* %bid, align 4
  %idxprom25 = sext i32 %29 to i64
  %arrayidx26 = getelementptr inbounds i64, i64* %28, i64 %idxprom25
  %30 = load i64, i64* %arrayidx26, align 8
  %arrayidx27 = getelementptr inbounds %struct.knode, %struct.knode* %27, i64 %30
  %indices28 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx27, i32 0, i32 1
  %31 = load i32, i32* %thid, align 4
  %idxprom29 = sext i32 %31 to i64
  %arrayidx30 = getelementptr inbounds [257 x i32], [257 x i32]* %indices28, i64 0, i64 %idxprom29
  %32 = load i32, i32* %arrayidx30, align 4
  %conv31 = sext i32 %32 to i64
  %33 = load i64*, i64** %offsetD.addr, align 8
  %34 = load i32, i32* %bid, align 4
  %idxprom32 = sext i32 %34 to i64
  %arrayidx33 = getelementptr inbounds i64, i64* %33, i64 %idxprom32
  store i64 %conv31, i64* %arrayidx33, align 8
  br label %if.end
 if.end:                                           ; preds = %if.then24, %if.then
  br label %if.end34
 ; Every path through the loop body joins here before the first barrier0 call.
 if.end34:                                         ; preds = %if.end, %land.lhs.true, %for.body
  call void @llvm.nvvm.barrier0()
  %35 = load i32, i32* %thid, align 4
  %cmp35 = icmp eq i32 %35, 0
  br i1 %cmp35, label %if.then36, label %if.end41
 ; Only thid == 0 executes this: currKnodeD[bid] = offsetD[bid].
 if.then36:                                        ; preds = %if.end34
  %36 = load i64*, i64** %offsetD.addr, align 8
  %37 = load i32, i32* %bid, align 4
  %idxprom37 = sext i32 %37 to i64
  %arrayidx38 = getelementptr inbounds i64, i64* %36, i64 %idxprom37
  %38 = load i64, i64* %arrayidx38, align 8
  %39 = load i64*, i64** %currKnodeD.addr, align 8
  %40 = load i32, i32* %bid, align 4
  %idxprom39 = sext i32 %40 to i64
  %arrayidx40 = getelementptr inbounds i64, i64* %39, i64 %idxprom39
  store i64 %38, i64* %arrayidx40, align 8
  br label %if.end41
 ; Second barrier0 call, placed after the thid == 0 update and before the next iteration.
 if.end41:                                         ; preds = %if.then36, %if.end34
  call void @llvm.nvvm.barrier0()
  br label %for.inc
 ; i++
 for.inc:                                          ; preds = %if.end41
  %41 = load i32, i32* %i, align 4
  %inc = add nsw i32 %41, 1
  store i32 %inc, i32* %i, align 4
  br label %for.cond
 ; After the loop: test knodesD[currKnodeD[bid]].keys[thid] == keysD[bid].
 for.end:                                          ; preds = %for.cond
  %42 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
  %43 = load i64*, i64** %currKnodeD.addr, align 8
  %44 = load i32, i32* %bid, align 4
  %idxprom42 = sext i32 %44 to i64
  %arrayidx43 = getelementptr inbounds i64, i64* %43, i64 %idxprom42
  %45 = load i64, i64* %arrayidx43, align 8
  %arrayidx44 = getelementptr inbounds %struct.knode, %struct.knode* %42, i64 %45
  %keys45 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx44, i32 0, i32 2
  %46 = load i32, i32* %thid, align 4
  %idxprom46 = sext i32 %46 to i64
  %arrayidx47 = getelementptr inbounds [257 x i32], [257 x i32]* %keys45, i64 0, i64 %idxprom46
  %47 = load i32, i32* %arrayidx47, align 4
  %48 = load i32*, i32** %keysD.addr, align 8
  %49 = load i32, i32* %bid, align 4
  %idxprom48 = sext i32 %49 to i64
  %arrayidx49 = getelementptr inbounds i32, i32* %48, i64 %idxprom48
  %50 = load i32, i32* %arrayidx49, align 4
  %cmp50 = icmp eq i32 %47, %50
  br i1 %cmp50, label %if.then51, label %if.end63
 ; Match: ansD[bid].value = recordsD[knodesD[currKnodeD[bid]].indices[thid]].value.
 if.then51:                                        ; preds = %for.end
  %51 = load %struct.record*, %struct.record** %recordsD.addr, align 8
  %52 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
  %53 = load i64*, i64** %currKnodeD.addr, align 8
  %54 = load i32, i32* %bid, align 4
  %idxprom52 = sext i32 %54 to i64
  %arrayidx53 = getelementptr inbounds i64, i64* %53, i64 %idxprom52
  %55 = load i64, i64* %arrayidx53, align 8
  %arrayidx54 = getelementptr inbounds %struct.knode, %struct.knode* %52, i64 %55
  %indices55 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx54, i32 0, i32 1
  %56 = load i32, i32* %thid, align 4
  %idxprom56 = sext i32 %56 to i64
  %arrayidx57 = getelementptr inbounds [257 x i32], [257 x i32]* %indices55, i64 0, i64 %idxprom56
  %57 = load i32, i32* %arrayidx57, align 4
  %idxprom58 = sext i32 %57 to i64
  %arrayidx59 = getelementptr inbounds %struct.record, %struct.record* %51, i64 %idxprom58
  %value = getelementptr inbounds %struct.record, %struct.record* %arrayidx59, i32 0, i32 0
  %58 = load i32, i32* %value, align 4
  %59 = load %struct.record*, %struct.record** %ansD.addr, align 8
  %60 = load i32, i32* %bid, align 4
  %idxprom60 = sext i32 %60 to i64
  %arrayidx61 = getelementptr inbounds %struct.record, %struct.record* %59, i64 %idxprom60
  %value62 = getelementptr inbounds %struct.record, %struct.record* %arrayidx61, i32 0, i32 0
  store i32 %58, i32* %value62, align 4
  br label %if.end63
 if.end63:                                         ; preds = %if.then51, %for.end
  ret void
 }
 ; Always-inline helper that returns the value of the
 ; llvm.nvvm.read.ptx.sreg.tid.x intrinsic (used by the kernel to obtain its
 ; thread index in x).
 ; Function Attrs: alwaysinline convergent nounwind
 define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
 entry:
  %tid.x = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  ret i32 %tid.x
 }
 ; Always-inline helper that returns the value of the
 ; llvm.nvvm.read.ptx.sreg.ctaid.x intrinsic (used by the kernel to obtain its
 ; block index in x).
 ; Function Attrs: alwaysinline convergent nounwind
 define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
 entry:
  %ctaid.x = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
  ret i32 %ctaid.x
 }
 ; NVVM intrinsics referenced by this module: the barrier called from @findK
 ; and the two special-register reads behind the builtin fetch helpers.
 ; Function Attrs: convergent nounwind
 declare void @llvm.nvvm.barrier0() #2
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #3
 ; Attribute groups: #0 is used by the weak stubs and @findK (optnone,
 ; noinline), #1 by the alwaysinline fetch helpers; both target sm_61.
 attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #2 = { convergent nounwind }
 attributes #3 = { nounwind readnone }
 ; Module-level metadata. !3 is the entry in !nvvm.annotations that tags
 ; @findK as a "kernel"; the remaining annotation nodes carry "align" entries
 ; with a null target.
 !llvm.module.flags = !{!0, !1, !2}
 !nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
 !llvm.ident = !{!8}
 !nvvmir.version = !{!9}
 !0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
 !1 = !{i32 1, !"wchar_size", i32 4}
 !2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
 !3 = !{void (i64, %struct.knode*, i64, %struct.record*, i64*, i64*, i32*, %struct.record*)* @findK, !"kernel", i32 1}
 !4 = !{null, !"align", i32 8}
 !5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
 !6 = !{null, !"align", i32 16}
 !7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
 ; Producer: clang 10.0.1.
 !8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
 !9 = !{i32 1, i32 4}
--- a/examples/btree/kernel_gpu_cuda_wrapper-host-x86_64-unknown-linux-gnu.ll
+++ b/examples/btree/kernel_gpu_cuda_wrapper-host-x86_64-unknown-linux-gnu.ll
--- a/examples/btree/kernel_gpu_cuda_wrapper_2-cuda-nvptx64-nvidia-cuda-sm_61.ll
+++ b/examples/btree/kernel_gpu_cuda_wrapper_2-cuda-nvptx64-nvidia-cuda-sm_61.ll
@ -1,475 +0,0 @@
 ; Compiler-emitted LLVM IR for the device side (nvptx64-nvidia-cuda triple)
 ; of kernel/kernel_gpu_cuda_wrapper_2.cu. The text below is generated output;
 ; regenerate it from the .cu source instead of editing it by hand.
 ; ModuleID = 'kernel_gpu_cuda_wrapper_2-cuda-nvptx64-nvidia-cuda-sm_61.bc'
 source_filename = "kernel/kernel_gpu_cuda_wrapper_2.cu"
 target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"
 ; One-byte placeholder types backing the threadIdx / blockIdx globals below.
 %struct.__cuda_builtin_threadIdx_t = type { i8 }
 %struct.__cuda_builtin_blockIdx_t = type { i8 }
 %struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
 ; knode layout: in @findRangeK, field 1 (first [257 x i32]) is accessed as
 ; "indices" and field 2 (second [257 x i32]) as "keys".
 %struct.knode = type { i32, [257 x i32], [257 x i32], i8, i32 }
 ; COMDAT groups for the two builtin fetch helpers called by @findRangeK.
 $_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
 $_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
 ; Weak-linkage stand-in for cudaMalloc inside the device module: both
 ; arguments are spilled to stack slots and the constant 999 is returned.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
 entry:
  ; slot + spill for %p
  %p.slot = alloca i8**, align 8
  store i8** %p, i8*** %p.slot, align 8
  ; slot + spill for %s
  %s.slot = alloca i64, align 8
  store i64 %s, i64* %s.slot, align 8
  ret i32 999
 }
 ; Weak-linkage stand-in for cudaFuncGetAttributes inside the device module:
 ; both arguments are spilled to stack slots and the constant 999 is returned.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
 entry:
  ; slot + spill for %p
  %p.slot = alloca %struct.cudaFuncAttributes*, align 8
  store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.slot, align 8
  ; slot + spill for %c
  %c.slot = alloca i8*, align 8
  store i8* %c, i8** %c.slot, align 8
  ret i32 999
 }
 ; Weak-linkage stand-in for cudaDeviceGetAttribute inside the device module:
 ; the three arguments are spilled to stack slots and 999 is returned.
 ; %value is never written through.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
 entry:
  ; slot + spill for %value
  %value.slot = alloca i32*, align 8
  store i32* %value, i32** %value.slot, align 8
  ; slot + spill for %attr
  %attr.slot = alloca i32, align 4
  store i32 %attr, i32* %attr.slot, align 4
  ; slot + spill for %device
  %device.slot = alloca i32, align 4
  store i32 %device, i32* %device.slot, align 4
  ret i32 999
 }
 ; Weak-linkage stand-in for cudaGetDevice inside the device module: the
 ; pointer argument is spilled to a stack slot (never written through) and
 ; the constant 999 is returned.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
 entry:
  ; slot + spill for %device
  %device.slot = alloca i32*, align 8
  store i32* %device, i32** %device.slot, align 8
  ret i32 999
 }
 ; Weak-linkage stand-in for cudaOccupancyMaxActiveBlocksPerMultiprocessor
 ; inside the device module: the four arguments are spilled to stack slots
 ; and 999 is returned. %numBlocks is never written through.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
 entry:
  ; slot + spill for %numBlocks
  %numBlocks.slot = alloca i32*, align 8
  store i32* %numBlocks, i32** %numBlocks.slot, align 8
  ; slot + spill for %func
  %func.slot = alloca i8*, align 8
  store i8* %func, i8** %func.slot, align 8
  ; slot + spill for %blockSize
  %blockSize.slot = alloca i32, align 4
  store i32 %blockSize, i32* %blockSize.slot, align 4
  ; slot + spill for %dynamicSmemSize
  %dynamicSmemSize.slot = alloca i64, align 8
  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.slot, align 8
  ret i32 999
 }
 ; Weak-linkage stand-in for
 ; cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags inside the device
 ; module: the five arguments are spilled to stack slots and 999 is returned.
 ; %numBlocks is never written through.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
 entry:
  ; slot + spill for %numBlocks
  %numBlocks.slot = alloca i32*, align 8
  store i32* %numBlocks, i32** %numBlocks.slot, align 8
  ; slot + spill for %func
  %func.slot = alloca i8*, align 8
  store i8* %func, i8** %func.slot, align 8
  ; slot + spill for %blockSize
  %blockSize.slot = alloca i32, align 4
  store i32 %blockSize, i32* %blockSize.slot, align 4
  ; slot + spill for %dynamicSmemSize
  %dynamicSmemSize.slot = alloca i64, align 8
  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.slot, align 8
  ; slot + spill for %flags
  %flags.slot = alloca i32, align 4
  store i32 %flags, i32* %flags.slot, align 4
  ret i32 999
 }
 ; Function Attrs: convergent noinline nounwind optnone
 define dso_local void @findRangeK(i64 %height, %struct.knode* %knodesD, i64 %knodes_elem, i64* %currKnodeD, i64* %offsetD, i64* %lastKnodeD, i64* %offset_2D, i32* %startD, i32* %endD, i32* %RecstartD, i32* %ReclenD) #0 {
 entry:
  %height.addr = alloca i64, align 8
  %knodesD.addr = alloca %struct.knode*, align 8
  %knodes_elem.addr = alloca i64, align 8
  %currKnodeD.addr = alloca i64*, align 8
  %offsetD.addr = alloca i64*, align 8
  %lastKnodeD.addr = alloca i64*, align 8
  %offset_2D.addr = alloca i64*, align 8
  %startD.addr = alloca i32*, align 8
  %endD.addr = alloca i32*, align 8
  %RecstartD.addr = alloca i32*, align 8
  %ReclenD.addr = alloca i32*, align 8
  %thid = alloca i32, align 4
  %bid = alloca i32, align 4
  %i = alloca i32, align 4
  store i64 %height, i64* %height.addr, align 8
  store %struct.knode* %knodesD, %struct.knode** %knodesD.addr, align 8
  store i64 %knodes_elem, i64* %knodes_elem.addr, align 8
  store i64* %currKnodeD, i64** %currKnodeD.addr, align 8
  store i64* %offsetD, i64** %offsetD.addr, align 8
  store i64* %lastKnodeD, i64** %lastKnodeD.addr, align 8
  store i64* %offset_2D, i64** %offset_2D.addr, align 8
  store i32* %startD, i32** %startD.addr, align 8
  store i32* %endD, i32** %endD.addr, align 8
  store i32* %RecstartD, i32** %RecstartD.addr, align 8
  store i32* %ReclenD, i32** %ReclenD.addr, align 8
  %call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
  store i32 %call, i32* %thid, align 4
  %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
  store i32 %call1, i32* %bid, align 4
  store i32 0, i32* %i, align 4
  br label %for.cond
 for.cond:                                         ; preds = %for.inc, %entry
  %0 = load i32, i32* %i, align 4
  %conv = sext i32 %0 to i64
  %1 = load i64, i64* %height.addr, align 8
  %cmp = icmp slt i64 %conv, %1
  br i1 %cmp, label %for.body, label %for.end
 for.body:                                         ; preds = %for.cond
  %2 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
  %3 = load i64*, i64** %currKnodeD.addr, align 8
  %4 = load i32, i32* %bid, align 4
  %idxprom = sext i32 %4 to i64
  %arrayidx = getelementptr inbounds i64, i64* %3, i64 %idxprom
  %5 = load i64, i64* %arrayidx, align 8
  %arrayidx2 = getelementptr inbounds %struct.knode, %struct.knode* %2, i64 %5
  %keys = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx2, i32 0, i32 2
  %6 = load i32, i32* %thid, align 4
  %idxprom3 = sext i32 %6 to i64
  %arrayidx4 = getelementptr inbounds [257 x i32], [257 x i32]* %keys, i64 0, i64 %idxprom3
  %7 = load i32, i32* %arrayidx4, align 4
  %8 = load i32*, i32** %startD.addr, align 8
  %9 = load i32, i32* %bid, align 4
  %idxprom5 = sext i32 %9 to i64
  %arrayidx6 = getelementptr inbounds i32, i32* %8, i64 %idxprom5
  %10 = load i32, i32* %arrayidx6, align 4
  %cmp7 = icmp sle i32 %7, %10
  br i1 %cmp7, label %land.lhs.true, label %if.end34
 land.lhs.true:                                    ; preds = %for.body
  %11 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
  %12 = load i64*, i64** %currKnodeD.addr, align 8
  %13 = load i32, i32* %bid, align 4
  %idxprom8 = sext i32 %13 to i64
  %arrayidx9 = getelementptr inbounds i64, i64* %12, i64 %idxprom8
  %14 = load i64, i64* %arrayidx9, align 8
  %arrayidx10 = getelementptr inbounds %struct.knode, %struct.knode* %11, i64 %14
  %keys11 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx10, i32 0, i32 2
  %15 = load i32, i32* %thid, align 4
  %add = add nsw i32 %15, 1
  %idxprom12 = sext i32 %add to i64
  %arrayidx13 = getelementptr inbounds [257 x i32], [257 x i32]* %keys11, i64 0, i64 %idxprom12
  %16 = load i32, i32* %arrayidx13, align 4
  %17 = load i32*, i32** %startD.addr, align 8
  %18 = load i32, i32* %bid, align 4
  %idxprom14 = sext i32 %18 to i64
  %arrayidx15 = getelementptr inbounds i32, i32* %17, i64 %idxprom14
  %19 = load i32, i32* %arrayidx15, align 4
  %cmp16 = icmp sgt i32 %16, %19
  br i1 %cmp16, label %if.then, label %if.end34
 if.then:                                          ; preds = %land.lhs.true
  %20 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
  %21 = load i64*, i64** %currKnodeD.addr, align 8
  %22 = load i32, i32* %bid, align 4
  %idxprom17 = sext i32 %22 to i64
  %arrayidx18 = getelementptr inbounds i64, i64* %21, i64 %idxprom17
  %23 = load i64, i64* %arrayidx18, align 8
  %arrayidx19 = getelementptr inbounds %struct.knode, %struct.knode* %20, i64 %23
  %indices = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx19, i32 0, i32 1
  %24 = load i32, i32* %thid, align 4
  %idxprom20 = sext i32 %24 to i64
  %arrayidx21 = getelementptr inbounds [257 x i32], [257 x i32]* %indices, i64 0, i64 %idxprom20
  %25 = load i32, i32* %arrayidx21, align 4
  %conv22 = sext i32 %25 to i64
  %26 = load i64, i64* %knodes_elem.addr, align 8
  %cmp23 = icmp slt i64 %conv22, %26
  br i1 %cmp23, label %if.then24, label %if.end
 if.then24:                                        ; preds = %if.then
  %27 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
  %28 = load i64*, i64** %currKnodeD.addr, align 8
  %29 = load i32, i32* %bid, align 4
  %idxprom25 = sext i32 %29 to i64
  %arrayidx26 = getelementptr inbounds i64, i64* %28, i64 %idxprom25
  %30 = load i64, i64* %arrayidx26, align 8
  %arrayidx27 = getelementptr inbounds %struct.knode, %struct.knode* %27, i64 %30
  %indices28 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx27, i32 0, i32 1
  %31 = load i32, i32* %thid, align 4
  %idxprom29 = sext i32 %31 to i64
  %arrayidx30 = getelementptr inbounds [257 x i32], [257 x i32]* %indices28, i64 0, i64 %idxprom29
  %32 = load i32, i32* %arrayidx30, align 4
  %conv31 = sext i32 %32 to i64
  %33 = load i64*, i64** %offsetD.addr, align 8
  %34 = load i32, i32* %bid, align 4
  %idxprom32 = sext i32 %34 to i64
  %arrayidx33 = getelementptr inbounds i64, i64* %33, i64 %idxprom32
  store i64 %conv31, i64* %arrayidx33, align 8
  br label %if.end
 if.end:                                           ; preds = %if.then24, %if.then
  br label %if.end34
 if.end34:                                         ; preds = %if.end, %land.lhs.true, %for.body
  %35 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
  %36 = load i64*, i64** %lastKnodeD.addr, align 8
  %37 = load i32, i32* %bid, align 4
  %idxprom35 = sext i32 %37 to i64
  %arrayidx36 = getelementptr inbounds i64, i64* %36, i64 %idxprom35
  %38 = load i64, i64* %arrayidx36, align 8
  %arrayidx37 = getelementptr inbounds %struct.knode, %struct.knode* %35, i64 %38
  %keys38 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx37, i32 0, i32 2
  %39 = load i32, i32* %thid, align 4
  %idxprom39 = sext i32 %39 to i64
  %arrayidx40 = getelementptr inbounds [257 x i32], [257 x i32]* %keys38, i64 0, i64 %idxprom39
  %40 = load i32, i32* %arrayidx40, align 4
  %41 = load i32*, i32** %endD.addr, align 8
  %42 = load i32, i32* %bid, align 4
  %idxprom41 = sext i32 %42 to i64
  %arrayidx42 = getelementptr inbounds i32, i32* %41, i64 %idxprom41
  %43 = load i32, i32* %arrayidx42, align 4
  %cmp43 = icmp sle i32 %40, %43
  br i1 %cmp43, label %land.lhs.true44, label %if.end75
 land.lhs.true44:                                  ; preds = %if.end34
  %44 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
  %45 = load i64*, i64** %lastKnodeD.addr, align 8
  %46 = load i32, i32* %bid, align 4
  %idxprom45 = sext i32 %46 to i64
  %arrayidx46 = getelementptr inbounds i64, i64* %45, i64 %idxprom45
  %47 = load i64, i64* %arrayidx46, align 8
  %arrayidx47 = getelementptr inbounds %struct.knode, %struct.knode* %44, i64 %47
  %keys48 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx47, i32 0, i32 2
  %48 = load i32, i32* %thid, align 4
  %add49 = add nsw i32 %48, 1
  %idxprom50 = sext i32 %add49 to i64
  %arrayidx51 = getelementptr inbounds [257 x i32], [257 x i32]* %keys48, i64 0, i64 %idxprom50
  %49 = load i32, i32* %arrayidx51, align 4
  %50 = load i32*, i32** %endD.addr, align 8
  %51 = load i32, i32* %bid, align 4
  %idxprom52 = sext i32 %51 to i64
  %arrayidx53 = getelementptr inbounds i32, i32* %50, i64 %idxprom52
  %52 = load i32, i32* %arrayidx53, align 4
  %cmp54 = icmp sgt i32 %49, %52
  br i1 %cmp54, label %if.then55, label %if.end75
 if.then55:                                        ; preds = %land.lhs.true44
  %53 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
  %54 = load i64*, i64** %lastKnodeD.addr, align 8
  %55 = load i32, i32* %bid, align 4
  %idxprom56 = sext i32 %55 to i64
  %arrayidx57 = getelementptr inbounds i64, i64* %54, i64 %idxprom56
  %56 = load i64, i64* %arrayidx57, align 8
  %arrayidx58 = getelementptr inbounds %struct.knode, %struct.knode* %53, i64 %56
  %indices59 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx58, i32 0, i32 1
  %57 = load i32, i32* %thid, align 4
  %idxprom60 = sext i32 %57 to i64
  %arrayidx61 = getelementptr inbounds [257 x i32], [257 x i32]* %indices59, i64 0, i64 %idxprom60
  %58 = load i32, i32* %arrayidx61, align 4
  %conv62 = sext i32 %58 to i64
  %59 = load i64, i64* %knodes_elem.addr, align 8
  %cmp63 = icmp slt i64 %conv62, %59
  br i1 %cmp63, label %if.then64, label %if.end74
 if.then64:                                        ; preds = %if.then55
  %60 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
  %61 = load i64*, i64** %lastKnodeD.addr, align 8
  %62 = load i32, i32* %bid, align 4
  %idxprom65 = sext i32 %62 to i64
  %arrayidx66 = getelementptr inbounds i64, i64* %61, i64 %idxprom65
  %63 = load i64, i64* %arrayidx66, align 8
  %arrayidx67 = getelementptr inbounds %struct.knode, %struct.knode* %60, i64 %63
  %indices68 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx67, i32 0, i32 1
  %64 = load i32, i32* %thid, align 4
  %idxprom69 = sext i32 %64 to i64
  %arrayidx70 = getelementptr inbounds [257 x i32], [257 x i32]* %indices68, i64 0, i64 %idxprom69
  %65 = load i32, i32* %arrayidx70, align 4
  %conv71 = sext i32 %65 to i64
  %66 = load i64*, i64** %offset_2D.addr, align 8
  %67 = load i32, i32* %bid, align 4
  %idxprom72 = sext i32 %67 to i64
  %arrayidx73 = getelementptr inbounds i64, i64* %66, i64 %idxprom72
  store i64 %conv71, i64* %arrayidx73, align 8
  br label %if.end74
 if.end74:                                         ; preds = %if.then64, %if.then55
  br label %if.end75
 if.end75:                                         ; preds = %if.end74, %land.lhs.true44, %if.end34
  call void @llvm.nvvm.barrier0()
  %68 = load i32, i32* %thid, align 4
  %cmp76 = icmp eq i32 %68, 0
  br i1 %cmp76, label %if.then77, label %if.end86
 if.then77:                                        ; preds = %if.end75
  %69 = load i64*, i64** %offsetD.addr, align 8
  %70 = load i32, i32* %bid, align 4
  %idxprom78 = sext i32 %70 to i64
  %arrayidx79 = getelementptr inbounds i64, i64* %69, i64 %idxprom78
  %71 = load i64, i64* %arrayidx79, align 8
  %72 = load i64*, i64** %currKnodeD.addr, align 8
  %73 = load i32, i32* %bid, align 4
  %idxprom80 = sext i32 %73 to i64
  %arrayidx81 = getelementptr inbounds i64, i64* %72, i64 %idxprom80
  store i64 %71, i64* %arrayidx81, align 8
  %74 = load i64*, i64** %offset_2D.addr, align 8
  %75 = load i32, i32* %bid, align 4
  %idxprom82 = sext i32 %75 to i64
  %arrayidx83 = getelementptr inbounds i64, i64* %74, i64 %idxprom82
  %76 = load i64, i64* %arrayidx83, align 8
  %77 = load i64*, i64** %lastKnodeD.addr, align 8
  %78 = load i32, i32* %bid, align 4
  %idxprom84 = sext i32 %78 to i64
  %arrayidx85 = getelementptr inbounds i64, i64* %77, i64 %idxprom84
  store i64 %76, i64* %arrayidx85, align 8
  br label %if.end86
 if.end86:                                         ; preds = %if.then77, %if.end75
  call void @llvm.nvvm.barrier0()
  br label %for.inc
 for.inc:                                          ; preds = %if.end86
  %79 = load i32, i32* %i, align 4
  %inc = add nsw i32 %79, 1
  store i32 %inc, i32* %i, align 4
  br label %for.cond
 for.end:                                          ; preds = %for.cond
  %80 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
  %81 = load i64*, i64** %currKnodeD.addr, align 8
  %82 = load i32, i32* %bid, align 4
  %idxprom87 = sext i32 %82 to i64
  %arrayidx88 = getelementptr inbounds i64, i64* %81, i64 %idxprom87
  %83 = load i64, i64* %arrayidx88, align 8
  %arrayidx89 = getelementptr inbounds %struct.knode, %struct.knode* %80, i64 %83
  %keys90 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx89, i32 0, i32 2
  %84 = load i32, i32* %thid, align 4
  %idxprom91 = sext i32 %84 to i64
  %arrayidx92 = getelementptr inbounds [257 x i32], [257 x i32]* %keys90, i64 0, i64 %idxprom91
  %85 = load i32, i32* %arrayidx92, align 4
  %86 = load i32*, i32** %startD.addr, align 8
  %87 = load i32, i32* %bid, align 4
  %idxprom93 = sext i32 %87 to i64
  %arrayidx94 = getelementptr inbounds i32, i32* %86, i64 %idxprom93
  %88 = load i32, i32* %arrayidx94, align 4
  %cmp95 = icmp eq i32 %85, %88
  br i1 %cmp95, label %if.then96, label %if.end105
 if.then96:                                        ; preds = %for.end
  %89 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
  %90 = load i64*, i64** %currKnodeD.addr, align 8
  %91 = load i32, i32* %bid, align 4
  %idxprom97 = sext i32 %91 to i64
  %arrayidx98 = getelementptr inbounds i64, i64* %90, i64 %idxprom97
  %92 = load i64, i64* %arrayidx98, align 8
  %arrayidx99 = getelementptr inbounds %struct.knode, %struct.knode* %89, i64 %92
  %indices100 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx99, i32 0, i32 1
  %93 = load i32, i32* %thid, align 4
  %idxprom101 = sext i32 %93 to i64
  %arrayidx102 = getelementptr inbounds [257 x i32], [257 x i32]* %indices100, i64 0, i64 %idxprom101
  %94 = load i32, i32* %arrayidx102, align 4
  %95 = load i32*, i32** %RecstartD.addr, align 8
  %96 = load i32, i32* %bid, align 4
  %idxprom103 = sext i32 %96 to i64
  %arrayidx104 = getelementptr inbounds i32, i32* %95, i64 %idxprom103
  store i32 %94, i32* %arrayidx104, align 4
  br label %if.end105
 if.end105:                                        ; preds = %if.then96, %for.end
  call void @llvm.nvvm.barrier0()
  %97 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
  %98 = load i64*, i64** %lastKnodeD.addr, align 8
  %99 = load i32, i32* %bid, align 4
  %idxprom106 = sext i32 %99 to i64
  %arrayidx107 = getelementptr inbounds i64, i64* %98, i64 %idxprom106
  %100 = load i64, i64* %arrayidx107, align 8
  %arrayidx108 = getelementptr inbounds %struct.knode, %struct.knode* %97, i64 %100
  %keys109 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx108, i32 0, i32 2
  %101 = load i32, i32* %thid, align 4
  %idxprom110 = sext i32 %101 to i64
  %arrayidx111 = getelementptr inbounds [257 x i32], [257 x i32]* %keys109, i64 0, i64 %idxprom110
  %102 = load i32, i32* %arrayidx111, align 4
  %103 = load i32*, i32** %endD.addr, align 8
  %104 = load i32, i32* %bid, align 4
  %idxprom112 = sext i32 %104 to i64
  %arrayidx113 = getelementptr inbounds i32, i32* %103, i64 %idxprom112
  %105 = load i32, i32* %arrayidx113, align 4
  %cmp114 = icmp eq i32 %102, %105
  br i1 %cmp114, label %if.then115, label %if.end127
 if.then115:                                       ; preds = %if.end105
  %106 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
  %107 = load i64*, i64** %lastKnodeD.addr, align 8
  %108 = load i32, i32* %bid, align 4
  %idxprom116 = sext i32 %108 to i64
  %arrayidx117 = getelementptr inbounds i64, i64* %107, i64 %idxprom116
  %109 = load i64, i64* %arrayidx117, align 8
  %arrayidx118 = getelementptr inbounds %struct.knode, %struct.knode* %106, i64 %109
  %indices119 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx118, i32 0, i32 1
  %110 = load i32, i32* %thid, align 4
  %idxprom120 = sext i32 %110 to i64
  %arrayidx121 = getelementptr inbounds [257 x i32], [257 x i32]* %indices119, i64 0, i64 %idxprom120
  %111 = load i32, i32* %arrayidx121, align 4
  %112 = load i32*, i32** %RecstartD.addr, align 8
  %113 = load i32, i32* %bid, align 4
  %idxprom122 = sext i32 %113 to i64
  %arrayidx123 = getelementptr inbounds i32, i32* %112, i64 %idxprom122
  %114 = load i32, i32* %arrayidx123, align 4
  %sub = sub nsw i32 %111, %114
  %add124 = add nsw i32 %sub, 1
  %115 = load i32*, i32** %ReclenD.addr, align 8
  %116 = load i32, i32* %bid, align 4
  %idxprom125 = sext i32 %116 to i64
  %arrayidx126 = getelementptr inbounds i32, i32* %115, i64 %idxprom125
  store i32 %add124, i32* %arrayidx126, align 4
  br label %if.end127
 if.end127:                                        ; preds = %if.then115, %if.end105
  ret void
 }
 ; Function Attrs: alwaysinline convergent nounwind
 ; Accessor with the mangled name __cuda_builtin_threadIdx_t::__fetch_builtin_x().
 ; Takes no arguments and returns the i32 produced by the NVVM tid.x intrinsic.
 define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
 entry:
  ; Read the tid.x special register through the intrinsic and return it unchanged.
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  ret i32 %0
 }
 ; Function Attrs: alwaysinline convergent nounwind
 ; Accessor with the mangled name __cuda_builtin_blockIdx_t::__fetch_builtin_x().
 ; Takes no arguments and returns the i32 produced by the NVVM ctaid.x intrinsic.
 define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
 entry:
  ; Read the ctaid.x special register through the intrinsic and return it unchanged.
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
  ret i32 %0
 }
 ; Function Attrs: convergent nounwind
 declare void @llvm.nvvm.barrier0() #2
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #3
 attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #2 = { convergent nounwind }
 attributes #3 = { nounwind readnone }
 !llvm.module.flags = !{!0, !1, !2}
 !nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
 !llvm.ident = !{!8}
 !nvvmir.version = !{!9}
 !0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
 !1 = !{i32 1, !"wchar_size", i32 4}
 !2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
 !3 = !{void (i64, %struct.knode*, i64, i64*, i64*, i64*, i64*, i32*, i32*, i32*, i32*)* @findRangeK, !"kernel", i32 1}
 !4 = !{null, !"align", i32 8}
 !5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
 !6 = !{null, !"align", i32 16}
 !7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
 !8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
 !9 = !{i32 1, i32 4}
--- a/examples/btree/kernel_gpu_cuda_wrapper_2-host-x86_64-unknown-linux-gnu.ll
+++ b/examples/btree/kernel_gpu_cuda_wrapper_2-host-x86_64-unknown-linux-gnu.ll
--- a/examples/btree/main.c
+++ b/examples/btree/main.c
--- a/examples/btree/run.sh
+++ b/examples/btree/run.sh
@ -1,40 +0,0 @@
 #!/bin/bash
 # Builds the b+tree example from pre-generated LLVM IR, links it against the
 # project's runtime libraries, runs it and checks one line of output.txt.
 # Abort on the first failing command.
 set -e
 # Compile the plain C utilities to LLVM bitcode (timer.bc, num.bc).
 clang -c -emit-llvm util/timer/timer.c
 clang -c -emit-llvm util/num/num.c
 # The commands that would regenerate the CUDA bitcode / .ll files are disabled;
 # the .ll files assembled below are expected to be present already.
 #clang -c -emit-llvm util/cuda/cuda.cu --cuda-gpu-arch=sm_61
 #clang -c -emit-llvm kernel/kernel_gpu_cuda_wrapper.cu --cuda-gpu-arch=sm_61
 #clang++ kernel/kernel_gpu_cuda_wrapper.cu  --cuda-path=/usr/local/cuda-10.1 --cuda-gpu-arch=sm_61 -L/usr/local/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v
 #clang++ kernel/kernel_gpu_cuda_wrapper_2.cu  --cuda-path=/usr/local/cuda-10.1 --cuda-gpu-arch=sm_61 -L/usr/local/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v
 clang -c -emit-llvm main.c
 # Assemble the textual IR (device and host side of both wrappers) into bitcode.
 llvm-as kernel_gpu_cuda_wrapper-cuda-nvptx64-nvidia-cuda-sm_61.ll
 llvm-as kernel_gpu_cuda_wrapper_2-cuda-nvptx64-nvidia-cuda-sm_61.ll
 llvm-as kernel_gpu_cuda_wrapper-host-x86_64-unknown-linux-gnu.ll
 llvm-as kernel_gpu_cuda_wrapper_2-host-x86_64-unknown-linux-gnu.ll
 # Run the project's translators: device bitcode -> kernelN.bc, host bitcode -> hostN.bc.
 ../../build/compilation/kernelTranslator kernel_gpu_cuda_wrapper-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel1.bc
 ../../build/compilation/kernelTranslator kernel_gpu_cuda_wrapper_2-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel2.bc
 ../../build/compilation/hostTranslator kernel_gpu_cuda_wrapper-host-x86_64-unknown-linux-gnu.bc host1.bc
 ../../build/compilation/hostTranslator kernel_gpu_cuda_wrapper_2-host-x86_64-unknown-linux-gnu.bc host2.bc
 # Lower every bitcode file to a position-independent object file.
 llc --relocation-model=pic --filetype=obj  main.bc
 # NOTE(review): cuda.bc is consumed here, but the only command in this script
 # that would produce it is commented out above - confirm it is built elsewhere.
 llc --relocation-model=pic --filetype=obj  cuda.bc
 llc --relocation-model=pic --filetype=obj  num.bc
 llc --relocation-model=pic --filetype=obj  timer.bc
 llc --relocation-model=pic --filetype=obj  kernel1.bc
 llc --relocation-model=pic --filetype=obj  kernel2.bc
 llc --relocation-model=pic --filetype=obj  host1.bc
 llc --relocation-model=pic --filetype=obj  host2.bc
 # Make the runtime shared libraries from the build tree visible to the loader.
 export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH
 # Link all objects against the x86Runtime and threadPool libraries.
 g++ -Wall -L../../build/runtime  -L../../build/runtime/threadPool -o b+tree.out \
    -fPIC -no-pie main.o host1.o host2.o kernel1.o kernel2.o cuda.o num.o timer.o \
    -lc -lx86Runtime -lthreadPool -lpthread
 # Run the benchmark on the Rodinia data set and command file.
 ./b+tree.out file ../../rodinia-data/b+tree/mil.txt \
    command ../../rodinia-data/b+tree/command.txt
 # The run passes only if output.txt contains this exact reference line.
 if grep -q "0    840187    6001" output.txt; then
    echo "Pass"
 else
    echo "Error result"
    exit 1
 fi
--- a/examples/btree/util/cuda/cuda.cu
+++ b/examples/btree/util/cuda/cuda.cu
@ -1,75 +0,0 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
 //===============================================================================================================================================================================================================200
 //	SET_DEVICE CODE
 //===============================================================================================================================================================================================================200
 //======================================================================================================================================================150
 //	INCLUDE/DEFINE
 //======================================================================================================================================================150
 #include "cuda.h"					// (in library path specified to compiler)
 //======================================================================================================================================================150
 //	FUNCTIONS
 //======================================================================================================================================================150
 //====================================================================================================100
 //	SET DEVICE
 //====================================================================================================100
 // Makes the device with the largest multiProcessorCount the current CUDA
 // device. Nothing is changed when fewer than two devices are reported or when
 // the device count cannot be queried; devices whose properties cannot be read
 // are skipped. Selection is best effort: the function never aborts.
 void setdevice(void){
 	// zero-initialised so a failed count query is treated as "no devices"
 	int num_devices = 0;
 	int device;
 	int max_multiprocessors = 0;
 	int max_device = 0;
 	cudaDeviceProp properties;
 	// with zero or one device (or a failed query) the default device is kept
 	if (cudaGetDeviceCount(&num_devices) != cudaSuccess || num_devices <= 1) {
 		return;
 	}
 	for (device = 0; device < num_devices; device++) {
 		// do not read 'properties' when the query for this device failed
 		if (cudaGetDeviceProperties(&properties, device) != cudaSuccess) {
 			continue;
 		}
 		if (max_multiprocessors < properties.multiProcessorCount) {
 			max_multiprocessors = properties.multiProcessorCount;
 			max_device = device;
 		}
 	}
 	cudaSetDevice(max_device);
 }
 //====================================================================================================100
 //	GET LAST ERROR
 //====================================================================================================100
 // Terminates the program when the CUDA runtime reports a pending error.
 //   msg: caller-supplied context that is printed in front of the CUDA error text.
 // Returns normally only when cudaGetLastError() yields cudaSuccess.
 void checkCUDAError(const char *msg)
 {
 	const cudaError_t status = cudaGetLastError();
 	if (status == cudaSuccess) {
 		return;
 	}
 	// report on stdout, flush every open stream, then exit with a failure code
 	printf("Cuda error: %s: %s.\n", msg, cudaGetErrorString(status));
 	fflush(NULL);
 	exit(EXIT_FAILURE);
 }
 //===============================================================================================================================================================================================================200
 //	END
 //===============================================================================================================================================================================================================200
 #ifdef __cplusplus
 }
 #endif
--- a/examples/btree/util/cuda/cuda.h
+++ b/examples/btree/util/cuda/cuda.h
@ -1,37 +0,0 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
 //===============================================================================================================================================================================================================200
 //	SET_DEVICE HEADER
 //===============================================================================================================================================================================================================200
 //======================================================================================================================================================150
 //	INCLUDE/DEFINE
 //======================================================================================================================================================150
 #include <stdio.h> // (in library path known to compiler)		needed by printf
 //======================================================================================================================================================150
 //	FUNCTION PROTOTYPES
 //======================================================================================================================================================150
 //====================================================================================================100
 //	SET DEVICE
 //====================================================================================================100
 void setdevice(void);
 //====================================================================================================100
 //	GET LAST ERROR
 //====================================================================================================100
 void checkCUDAError(const char *msg);
 //===============================================================================================================================================================================================================200
 //	END SET_DEVICE HEADER
 //===============================================================================================================================================================================================================200
 #ifdef __cplusplus
 }
 #endif
--- a/examples/btree/util/num/num.c
+++ b/examples/btree/util/num/num.c
@ -1,55 +0,0 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
 //===============================================================================================================================================================================================================200
 //	DESCRIPTION
 //===============================================================================================================================================================================================================200
 // Returns:	0 if string does not represent integer
 //			1 if string represents integer
 //===============================================================================================================================================================================================================200
 //	NUM CODE
 //===============================================================================================================================================================================================================200
 //======================================================================================================================================================150
 //	ISINTEGER FUNCTION
 //======================================================================================================================================================150
 // Tests whether a string consists solely of ASCII decimal digits.
 //   str: NUL-terminated string to examine; a null pointer is accepted.
 // Returns 1 when str is non-empty and every character is in '0'..'9',
 // otherwise 0 (null pointer, empty string, signs, blanks, decimal points, ...).
 int isInteger(char *str) {
  // a null pointer or an empty string cannot represent an integer
  if (!str || *str == '\0') {
    return 0;
  }
  // reject on the first character outside '0'..'9'
  // (a '.' would have to be allowed here to accept floats)
  for (; *str != '\0'; str++) {
    if (*str < '0' || *str > '9') {
      return 0;
    }
  }
  // every character was a digit
  return 1;
 }
 //===============================================================================================================================================================================================================200
 //	END NUM CODE
 //===============================================================================================================================================================================================================200
 #ifdef __cplusplus
 }
 #endif
--- a/examples/btree/util/num/num.h
+++ b/examples/btree/util/num/num.h
@ -1,21 +0,0 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
 //===============================================================================================================================================================================================================200
 //	FILE HEADER
 //===============================================================================================================================================================================================================200
 //======================================================================================================================================================150
 //	ISINTEGER FUNCTION PROTOTYPE
 //======================================================================================================================================================150
 int isInteger(char *str);
 //===============================================================================================================================================================================================================200
 //	END FILE HEADER
 //===============================================================================================================================================================================================================200
 #ifdef __cplusplus
 }
 #endif
--- a/examples/btree/util/timer/timer.c
+++ b/examples/btree/util/timer/timer.c
@ -1,36 +0,0 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
 //===============================================================================================================================================================================================================200
 //	TIMER CODE
 //===============================================================================================================================================================================================================200
 //======================================================================================================================================================150
 //	INCLUDE/DEFINE
 //======================================================================================================================================================150
 #include <stdlib.h>
 #include <sys/time.h> // gettimeofday, struct timeval
 //======================================================================================================================================================150
 //	FUNCTIONS
 //======================================================================================================================================================150
 //====================================================================================================100
 //	DISPLAY TIME
 //====================================================================================================100
 // Returns the current system time in microseconds, as reported by
 // gettimeofday() (seconds and microseconds combined into one value).
 long long get_time() {
  struct timeval tv;
  gettimeofday(&tv, NULL);
  // widen before scaling so the product cannot overflow where time_t is 32-bit
  return ((long long)tv.tv_sec * 1000000LL) + (long long)tv.tv_usec;
 }
 //===============================================================================================================================================================================================================200
 //	END TIMER CODE
 //===============================================================================================================================================================================================================200
 #ifdef __cplusplus
 }
 #endif
--- a/examples/btree/util/timer/timer.h
+++ b/examples/btree/util/timer/timer.h
@ -1,21 +0,0 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
 //===============================================================================================================================================================================================================200
 //	TIMER HEADER
 //===============================================================================================================================================================================================================200
 //======================================================================================================================================================150
 //	FUNCTION PROTOTYPES
 //======================================================================================================================================================150
 long long get_time();
 //===============================================================================================================================================================================================================200
 //	END TIMER HEADER
 //===============================================================================================================================================================================================================200
 #ifdef __cplusplus
 }
 #endif
--- a/examples/cfd/euler3d.cu
+++ b/examples/cfd/euler3d.cu
@ -1,662 +0,0 @@
 #include <fstream>
 #include <helper_cuda.h>
 #include <helper_timer.h>
 #include <iostream>
 /*
 * Options
 *
 */
 #define GAMMA 1.4f
 #define iterations 2
 // #ifndef block_length
 // 	#define block_length 192
 // #endif
 #define NDIM 3
 #define NNB 4
 #define RK 3 // 3rd order RK
 #define ff_mach 1.2f
 #define deg_angle_of_attack 0.0f
 /*
 * not options
 */
 #ifdef RD_WG_SIZE_0_0
 #define BLOCK_SIZE_0 RD_WG_SIZE_0_0
 #elif defined(RD_WG_SIZE_0)
 #define BLOCK_SIZE_0 RD_WG_SIZE_0
 #elif defined(RD_WG_SIZE)
 #define BLOCK_SIZE_0 RD_WG_SIZE
 #else
 #define BLOCK_SIZE_0 192
 #endif
 #ifdef RD_WG_SIZE_1_0
 #define BLOCK_SIZE_1 RD_WG_SIZE_1_0
 #elif defined(RD_WG_SIZE_1)
 #define BLOCK_SIZE_1 RD_WG_SIZE_1
 #elif defined(RD_WG_SIZE)
 #define BLOCK_SIZE_1 RD_WG_SIZE
 #else
 #define BLOCK_SIZE_1 192
 #endif
 // BLOCK_SIZE_2: first match of RD_WG_SIZE_2_0, RD_WG_SIZE_2, RD_WG_SIZE,
 // else 192. The second test must check RD_WG_SIZE_2 (the macro it expands to),
 // not RD_WG_SIZE_1.
 #ifdef RD_WG_SIZE_2_0
 #define BLOCK_SIZE_2 RD_WG_SIZE_2_0
 #elif defined(RD_WG_SIZE_2)
 #define BLOCK_SIZE_2 RD_WG_SIZE_2
 #elif defined(RD_WG_SIZE)
 #define BLOCK_SIZE_2 RD_WG_SIZE
 #else
 #define BLOCK_SIZE_2 192
 #endif
 #ifdef RD_WG_SIZE_3_0
 #define BLOCK_SIZE_3 RD_WG_SIZE_3_0
 #elif defined(RD_WG_SIZE_3)
 #define BLOCK_SIZE_3 RD_WG_SIZE_3
 #elif defined(RD_WG_SIZE)
 #define BLOCK_SIZE_3 RD_WG_SIZE
 #else
 #define BLOCK_SIZE_3 192
 #endif
 #ifdef RD_WG_SIZE_4_0
 #define BLOCK_SIZE_4 RD_WG_SIZE_4_0
 #elif defined(RD_WG_SIZE_4)
 #define BLOCK_SIZE_4 RD_WG_SIZE_4
 #elif defined(RD_WG_SIZE)
 #define BLOCK_SIZE_4 RD_WG_SIZE
 #else
 #define BLOCK_SIZE_4 192
 #endif
 // #if block_length > 128
 // #warning "the kernels may fail to launch on some systems if the block length
 // is too large"
 // #endif
 #define VAR_DENSITY 0
 #define VAR_MOMENTUM 1
 #define VAR_DENSITY_ENERGY (VAR_MOMENTUM + NDIM)
 #define NVAR (VAR_DENSITY_ENERGY + 1)
 /*
 * Generic functions
 */
 // Requests sizeof(T) * N bytes from cudaMalloc and returns the resulting
 // pointer typed as T*.
 //   N: number of elements of type T to reserve.
 // The call is wrapped in checkCudaErrors, which is not defined in this file
 // (presumably it comes from helper_cuda.h and aborts on failure - confirm).
 template <typename T> T *alloc(int N) {
  T *t;
  checkCudaErrors(cudaMalloc((void **)&t, sizeof(T) * N));
  return t;
 }
 // Passes the given pointer to cudaFree.
 //   array: pointer to release (expected to come from cudaMalloc - confirm at
 //          the call sites).
 // The result is checked by checkCudaErrors, which is defined outside this file.
 template <typename T> void dealloc(T *array) {
  checkCudaErrors(cudaFree((void *)array));
 }
 // Copies N elements of type T from src to dst with cudaMemcpy using
 // cudaMemcpyDeviceToDevice, so both pointers are treated as device addresses.
 //   dst: destination buffer, src: source buffer, N: element count (not bytes).
 // The result is checked by checkCudaErrors, which is defined outside this file.
 template <typename T> void copy(T *dst, T *src, int N) {
  checkCudaErrors(cudaMemcpy((void *)dst, (void *)src, N * sizeof(T),
                             cudaMemcpyDeviceToDevice));
 }
 // Copies N elements of type T from src to dst with cudaMemcpy using
 // cudaMemcpyHostToDevice: src is treated as a host address, dst as a device
 // address.
 //   N: element count (not bytes).
 // The result is checked by checkCudaErrors, which is defined outside this file.
 template <typename T> void upload(T *dst, T *src, int N) {
  checkCudaErrors(cudaMemcpy((void *)dst, (void *)src, N * sizeof(T),
                             cudaMemcpyHostToDevice));
 }
 // Copies N elements of type T from src to dst with cudaMemcpy using
 // cudaMemcpyDeviceToHost: src is treated as a device address, dst as a host
 // address.
 //   N: element count (not bytes).
 // The result is checked by checkCudaErrors, which is defined outside this file.
 template <typename T> void download(T *dst, T *src, int N) {
  checkCudaErrors(cudaMemcpy((void *)dst, (void *)src, N * sizeof(T),
                             cudaMemcpyDeviceToHost));
 }
 // Downloads nelr * NVAR floats from `variables` and writes three text files
 // into the working directory: "density", "momentum" and "density_energy".
 // Each file starts with a "<nel> <nelr>" header line followed by one line per
 // element for the first nel elements; the momentum file lists NDIM values per
 // line, each followed by a blank.
 //   variables: buffer handed to download() as the copy source.
 //   nel:       number of elements written to the files.
 //   nelr:      stride between consecutive variables in the buffer.
 void dump(float *variables, int nel, int nelr) {
  const int total = nelr * NVAR;
  float *host_copy = new float[total];
  download(host_copy, variables, total);
  {
    std::ofstream out("density");
    out << nel << " " << nelr << std::endl;
    const float *density = host_copy + VAR_DENSITY * nelr;
    for (int e = 0; e < nel; ++e)
      out << density[e] << std::endl;
  }
  {
    std::ofstream out("momentum");
    out << nel << " " << nelr << std::endl;
    for (int e = 0; e < nel; ++e) {
      for (int d = 0; d < NDIM; ++d)
        out << host_copy[e + (VAR_MOMENTUM + d) * nelr] << " ";
      out << std::endl;
    }
  }
  {
    std::ofstream out("density_energy");
    out << nel << " " << nelr << std::endl;
    const float *energy = host_copy + VAR_DENSITY_ENERGY * nelr;
    for (int e = 0; e < nel; ++e)
      out << energy[e] << std::endl;
  }
  delete[] host_copy;
 }
 /*
 * Element-based Cell-centered FVM solver functions
 */
 __constant__ float ff_variable[NVAR];
 __constant__ float3 ff_flux_contribution_momentum_x[1];
 __constant__ float3 ff_flux_contribution_momentum_y[1];
 __constant__ float3 ff_flux_contribution_momentum_z[1];
 __constant__ float3 ff_flux_contribution_density_energy[1];
 // Kernel: thread i copies the NVAR entries of the __constant__ array
 // ff_variable into variables[i + j * nelr] for j = 0 .. NVAR-1.
 // Launch layout: 1-D grid of 1-D blocks, one thread per element; no shared
 // memory. `variables` must hold at least NVAR * nelr floats.
 __global__ void cuda_initialize_variables(int nelr, float *variables) {
  const int i = (blockDim.x * blockIdx.x + threadIdx.x);
  // threads of a partially filled last block have no element to initialise
  if (i >= nelr)
    return;
  for (int j = 0; j < NVAR; j++)
    variables[i + j * nelr] = ff_variable[j];
 }
 // Host wrapper: launches cuda_initialize_variables over all nelr elements with
 // BLOCK_SIZE_1 threads per block and reports launch errors through
 // getLastCudaError (defined outside this file).
 void initialize_variables(int nelr, float *variables) {
  // nothing to initialise; an empty grid would be an invalid launch
  if (nelr <= 0)
    return;
  // round the grid up so that a partial final block is still launched
  dim3 Dg((nelr + BLOCK_SIZE_1 - 1) / BLOCK_SIZE_1), Db(BLOCK_SIZE_1);
  cuda_initialize_variables<<<Dg, Db>>>(nelr, variables);
  getLastCudaError("initialize_variables failed");
 }
 // Fills the four flux-contribution vectors from velocity, momentum, pressure
 // and density_energy.
 //   Inputs read:  momentum, density_energy, pressure, velocity.
 //   Not read:     density (kept in the parameter list only).
 //   Outputs:      fc_momentum_x / _y / _z and fc_density_energy.
 // Everything is passed by reference and later statements read values written
 // by earlier ones, so the statement order below is significant.
 __device__ __host__ inline void compute_flux_contribution(
    float &density, float3 &momentum, float &density_energy, float &pressure,
    float3 &velocity, float3 &fc_momentum_x, float3 &fc_momentum_y,
    float3 &fc_momentum_z, float3 &fc_density_energy) {
  // x row: velocity.x * momentum, with pressure added on the diagonal term
  fc_momentum_x.x = velocity.x * momentum.x + pressure;
  fc_momentum_x.y = velocity.x * momentum.y;
  fc_momentum_x.z = velocity.x * momentum.z;
  // y row: the .x entry is copied from the x row instead of being recomputed
  fc_momentum_y.x = fc_momentum_x.y;
  fc_momentum_y.y = velocity.y * momentum.y + pressure;
  fc_momentum_y.z = velocity.y * momentum.z;
  // z row: the two off-diagonal entries are copied from the rows above
  fc_momentum_z.x = fc_momentum_x.z;
  fc_momentum_z.y = fc_momentum_y.z;
  fc_momentum_z.z = velocity.z * momentum.z + pressure;
  // energy flux: velocity scaled by (density_energy + pressure)
  float de_p = density_energy + pressure;
  fc_density_energy.x = velocity.x * de_p;
  fc_density_energy.y = velocity.y * de_p;
  fc_density_energy.z = velocity.z * de_p;
 }
 // Writes momentum / density, component by component, into velocity.
 //   density:  divisor; it is not checked against zero.
 //   momentum: input vector.
 //   velocity: output vector; all three components are overwritten.
 __device__ inline void compute_velocity(float &density, float3 &momentum,
                                         float3 &velocity) {
  velocity.x = momentum.x / density;
  velocity.y = momentum.y / density;
  velocity.z = momentum.z / density;
 }
 // Returns the squared length of `velocity`: x*x + y*y + z*z.
 __device__ inline float compute_speed_sqd(float3 &velocity) {
  const float3 &v = velocity;
  return v.x * v.x + v.y * v.y + v.z * v.z;
 }
 // Returns (GAMMA - 1) * (density_energy - 0.5 * density * speed_sqd).
 //   density, density_energy, speed_sqd: inputs, read only.
 __device__ inline float compute_pressure(float &density, float &density_energy,
                                          float &speed_sqd) {
  // GAMMA and the literals are already float, so no conversions are needed
  return (GAMMA - 1.0f) * (density_energy - 0.5f * density * speed_sqd);
 }
 // Returns sqrtf(GAMMA * pressure / density).
 //   density:  divisor; it is not checked against zero.
 //   pressure: input, read only.
 __device__ inline float compute_speed_of_sound(float &density,
                                                float &pressure) {
  return sqrtf(GAMMA * pressure / density);
 }
 // Kernel: thread i reads the density, momentum and density-energy values of
 // element i from `variables` (stride nelr between variables), derives velocity,
 // pressure and speed of sound, and stores the step factor in step_factors[i].
 // Launch layout: 1-D grid of 1-D blocks, one thread per element; no shared
 // memory. `variables` must hold at least NVAR * nelr floats; `areas` and
 // `step_factors` at least nelr floats each.
 __global__ void cuda_compute_step_factor(int nelr, float *variables,
                                          float *areas, float *step_factors) {
  const int i = (blockDim.x * blockIdx.x + threadIdx.x);
  // threads of a partially filled last block have no element to process
  if (i >= nelr)
    return;
  float density = variables[i + VAR_DENSITY * nelr];
  float3 momentum;
  momentum.x = variables[i + (VAR_MOMENTUM + 0) * nelr];
  momentum.y = variables[i + (VAR_MOMENTUM + 1) * nelr];
  momentum.z = variables[i + (VAR_MOMENTUM + 2) * nelr];
  float density_energy = variables[i + VAR_DENSITY_ENERGY * nelr];
  float3 velocity;
  compute_velocity(density, momentum, velocity);
  float speed_sqd = compute_speed_sqd(velocity);
  float pressure = compute_pressure(density, density_energy, speed_sqd);
  float speed_of_sound = compute_speed_of_sound(density, pressure);
  // dt = float(0.5f) * sqrtf(areas[i]) /  (||v|| + c).... but when we do time
  // stepping, this later would need to be divided by the area, so we just do it
  // all at once
  step_factors[i] =
      float(0.5f) / (sqrtf(areas[i]) * (sqrtf(speed_sqd) + speed_of_sound));
 }
 // Host wrapper: launches cuda_compute_step_factor over all nelr elements with
 // BLOCK_SIZE_2 threads per block and reports launch errors through
 // getLastCudaError (defined outside this file).
 void compute_step_factor(int nelr, float *variables, float *areas,
                          float *step_factors) {
  // nothing to compute; an empty grid would be an invalid launch
  if (nelr <= 0)
    return;
  // round the grid up so that a partial final block is still launched
  dim3 Dg((nelr + BLOCK_SIZE_2 - 1) / BLOCK_SIZE_2), Db(BLOCK_SIZE_2);
  cuda_compute_step_factor<<<Dg, Db>>>(nelr, variables, areas, step_factors);
  getLastCudaError("compute_step_factor failed");
 }
 /*
 *
 *
 */
 __global__ void cuda_compute_flux(int nelr, int *elements_surrounding_elements,
                                  float *normals, float *variables,
                                  float *fluxes) {
  const float smoothing_coefficient = float(0.2f);
  const int i = (blockDim.x * blockIdx.x + threadIdx.x);
  int j, nb;
  float3 normal;
  float normal_len;
  float factor;
  float density_i = variables[i + VAR_DENSITY * nelr];
  float3 momentum_i;
  momentum_i.x = variables[i + (VAR_MOMENTUM + 0) * nelr];
  momentum_i.y = variables[i + (VAR_MOMENTUM + 1) * nelr];
  momentum_i.z = variables[i + (VAR_MOMENTUM + 2) * nelr];
  float density_energy_i = variables[i + VAR_DENSITY_ENERGY * nelr];
  float3 velocity_i;
  compute_velocity(density_i, momentum_i, velocity_i);
  float speed_sqd_i = compute_speed_sqd(velocity_i);
  float speed_i = sqrtf(speed_sqd_i);
  float pressure_i = compute_pressure(density_i, density_energy_i, speed_sqd_i);
  float speed_of_sound_i = compute_speed_of_sound(density_i, pressure_i);
  float3 flux_contribution_i_momentum_x, flux_contribution_i_momentum_y,
      flux_contribution_i_momentum_z;
  float3 flux_contribution_i_density_energy;
  compute_flux_contribution(
      density_i, momentum_i, density_energy_i, pressure_i, velocity_i,
      flux_contribution_i_momentum_x, flux_contribution_i_momentum_y,
      flux_contribution_i_momentum_z, flux_contribution_i_density_energy);
  float flux_i_density = float(0.0f);
  float3 flux_i_momentum;
  flux_i_momentum.x = float(0.0f);
  flux_i_momentum.y = float(0.0f);
  flux_i_momentum.z = float(0.0f);
  float flux_i_density_energy = float(0.0f);
  float3 velocity_nb;
  float density_nb, density_energy_nb;
  float3 momentum_nb;
  float3 flux_contribution_nb_momentum_x, flux_contribution_nb_momentum_y,
      flux_contribution_nb_momentum_z;
  float3 flux_contribution_nb_density_energy;
  float speed_sqd_nb, speed_of_sound_nb, pressure_nb;
 #pragma unroll
  for (j = 0; j < NNB; j++) {
    nb = elements_surrounding_elements[i + j * nelr];
    normal.x = normals[i + (j + 0 * NNB) * nelr];
    normal.y = normals[i + (j + 1 * NNB) * nelr];
    normal.z = normals[i + (j + 2 * NNB) * nelr];
    normal_len =
        sqrtf(normal.x * normal.x + normal.y * normal.y + normal.z * normal.z);
    if (nb >= 0) // a legitimate neighbor
    {
      density_nb = variables[nb + VAR_DENSITY * nelr];
      momentum_nb.x = variables[nb + (VAR_MOMENTUM + 0) * nelr];
      momentum_nb.y = variables[nb + (VAR_MOMENTUM + 1) * nelr];
      momentum_nb.z = variables[nb + (VAR_MOMENTUM + 2) * nelr];
      density_energy_nb = variables[nb + VAR_DENSITY_ENERGY * nelr];
      compute_velocity(density_nb, momentum_nb, velocity_nb);
      speed_sqd_nb = compute_speed_sqd(velocity_nb);
      pressure_nb =
          compute_pressure(density_nb, density_energy_nb, speed_sqd_nb);
      speed_of_sound_nb = compute_speed_of_sound(density_nb, pressure_nb);
      compute_flux_contribution(
          density_nb, momentum_nb, density_energy_nb, pressure_nb, velocity_nb,
          flux_contribution_nb_momentum_x, flux_contribution_nb_momentum_y,
          flux_contribution_nb_momentum_z, flux_contribution_nb_density_energy);
      // artificial viscosity
      factor = -normal_len * smoothing_coefficient * float(0.5f) *
               (speed_i + sqrtf(speed_sqd_nb) + speed_of_sound_i +
                speed_of_sound_nb);
      flux_i_density += factor * (density_i - density_nb);
      flux_i_density_energy += factor * (density_energy_i - density_energy_nb);
      flux_i_momentum.x += factor * (momentum_i.x - momentum_nb.x);
      flux_i_momentum.y += factor * (momentum_i.y - momentum_nb.y);
      flux_i_momentum.z += factor * (momentum_i.z - momentum_nb.z);
      // accumulate cell-centered fluxes
      factor = float(0.5f) * normal.x;
      flux_i_density += factor * (momentum_nb.x + momentum_i.x);
      flux_i_density_energy += factor * (flux_contribution_nb_density_energy.x +
                                         flux_contribution_i_density_energy.x);
      flux_i_momentum.x += factor * (flux_contribution_nb_momentum_x.x +
                                     flux_contribution_i_momentum_x.x);
      flux_i_momentum.y += factor * (flux_contribution_nb_momentum_y.x +
                                     flux_contribution_i_momentum_y.x);
      flux_i_momentum.z += factor * (flux_contribution_nb_momentum_z.x +
                                     flux_contribution_i_momentum_z.x);
      factor = float(0.5f) * normal.y;
      flux_i_density += factor * (momentum_nb.y + momentum_i.y);
      flux_i_density_energy += factor * (flux_contribution_nb_density_energy.y +
                                         flux_contribution_i_density_energy.y);
      flux_i_momentum.x += factor * (flux_contribution_nb_momentum_x.y +
                                     flux_contribution_i_momentum_x.y);
      flux_i_momentum.y += factor * (flux_contribution_nb_momentum_y.y +
                                     flux_contribution_i_momentum_y.y);
      flux_i_momentum.z += factor * (flux_contribution_nb_momentum_z.y +
                                     flux_contribution_i_momentum_z.y);
      factor = float(0.5f) * normal.z;
      flux_i_density += factor * (momentum_nb.z + momentum_i.z);
      flux_i_density_energy += factor * (flux_contribution_nb_density_energy.z +
                                         flux_contribution_i_density_energy.z);
      flux_i_momentum.x += factor * (flux_contribution_nb_momentum_x.z +
                                     flux_contribution_i_momentum_x.z);
      flux_i_momentum.y += factor * (flux_contribution_nb_momentum_y.z +
                                     flux_contribution_i_momentum_y.z);
      flux_i_momentum.z += factor * (flux_contribution_nb_momentum_z.z +
                                     flux_contribution_i_momentum_z.z);
    } else if (nb == -1) // a wing boundary
    {
      flux_i_momentum.x += normal.x * pressure_i;
      flux_i_momentum.y += normal.y * pressure_i;
      flux_i_momentum.z += normal.z * pressure_i;
    } else if (nb == -2) // a far field boundary
    {
      factor = float(0.5f) * normal.x;
      flux_i_density += factor * (ff_variable[VAR_MOMENTUM + 0] + momentum_i.x);
      flux_i_density_energy +=
          factor * (ff_flux_contribution_density_energy[0].x +
                    flux_contribution_i_density_energy.x);
      flux_i_momentum.x += factor * (ff_flux_contribution_momentum_x[0].x +
                                     flux_contribution_i_momentum_x.x);
      flux_i_momentum.y += factor * (ff_flux_contribution_momentum_y[0].x +
                                     flux_contribution_i_momentum_y.x);
      flux_i_momentum.z += factor * (ff_flux_contribution_momentum_z[0].x +
                                     flux_contribution_i_momentum_z.x);
      factor = float(0.5f) * normal.y;
      flux_i_density += factor * (ff_variable[VAR_MOMENTUM + 1] + momentum_i.y);
      flux_i_density_energy +=
          factor * (ff_flux_contribution_density_energy[0].y +
                    flux_contribution_i_density_energy.y);
      flux_i_momentum.x += factor * (ff_flux_contribution_momentum_x[0].y +
                                     flux_contribution_i_momentum_x.y);
      flux_i_momentum.y += factor * (ff_flux_contribution_momentum_y[0].y +
                                     flux_contribution_i_momentum_y.y);
      flux_i_momentum.z += factor * (ff_flux_contribution_momentum_z[0].y +
                                     flux_contribution_i_momentum_z.y);
      factor = float(0.5f) * normal.z;
      flux_i_density += factor * (ff_variable[VAR_MOMENTUM + 2] + momentum_i.z);
      flux_i_density_energy +=
          factor * (ff_flux_contribution_density_energy[0].z +
                    flux_contribution_i_density_energy.z);
      flux_i_momentum.x += factor * (ff_flux_contribution_momentum_x[0].z +
                                     flux_contribution_i_momentum_x.z);
      flux_i_momentum.y += factor * (ff_flux_contribution_momentum_y[0].z +
                                     flux_contribution_i_momentum_y.z);
      flux_i_momentum.z += factor * (ff_flux_contribution_momentum_z[0].z +
                                     flux_contribution_i_momentum_z.z);
    }
  }
  fluxes[i + VAR_DENSITY * nelr] = flux_i_density;
  fluxes[i + (VAR_MOMENTUM + 0) * nelr] = flux_i_momentum.x;
  fluxes[i + (VAR_MOMENTUM + 1) * nelr] = flux_i_momentum.y;
  fluxes[i + (VAR_MOMENTUM + 2) * nelr] = flux_i_momentum.z;
  fluxes[i + VAR_DENSITY_ENERGY * nelr] = flux_i_density_energy;
 }
 // Host-side launcher for the cuda_compute_flux kernel.
 // Launches nelr / BLOCK_SIZE_3 blocks of BLOCK_SIZE_3 threads; the division
 // truncates, so nelr is expected to be a multiple of BLOCK_SIZE_3. All pointer
 // arguments are forwarded to the kernel unchanged.
 void compute_flux(int nelr, int *elements_surrounding_elements, float *normals,
                  float *variables, float *fluxes) {
  const dim3 block_shape(BLOCK_SIZE_3);
  const dim3 grid_shape(nelr / BLOCK_SIZE_3);
  cuda_compute_flux<<<grid_shape, block_shape>>>(
      nelr, elements_surrounding_elements, normals, variables, fluxes);
  // report a failed launch with the same message as before
  getLastCudaError("compute_flux failed");
 }
 /*
  * Time-step update kernel: for element i, every conserved variable is set to
  *   old_variables + (step_factors[i] / (RK + 1 - j)) * fluxes.
  *
  * Launch layout: 1D grid, one thread per element. Threads whose global index
  * is >= nelr do nothing, so the grid may cover more threads than elements.
  *
  * j             index of the current sub-step (0 .. RK-1 in the caller)
  * nelr          number of elements; also the stride between variable planes
  * old_variables values saved at the start of the iteration (read only)
  * variables     values being advanced (written)
  * step_factors  per-element step factor (read only)
  * fluxes        per-element fluxes, same layout as variables (read only)
  */
 __global__ void cuda_time_step(int j, int nelr, float *old_variables,
                               float *variables, float *step_factors,
                               float *fluxes) {
  const int i = (blockDim.x * blockIdx.x + threadIdx.x);
  // Guard the grid tail: without it any launch that covers more threads than
  // elements reads and writes past the end of every array.
  if (i >= nelr)
    return;
  const float factor = step_factors[i] / float(RK + 1 - j);
  variables[i + VAR_DENSITY * nelr] = old_variables[i + VAR_DENSITY * nelr] +
                                      factor * fluxes[i + VAR_DENSITY * nelr];
  variables[i + VAR_DENSITY_ENERGY * nelr] =
      old_variables[i + VAR_DENSITY_ENERGY * nelr] +
      factor * fluxes[i + VAR_DENSITY_ENERGY * nelr];
  variables[i + (VAR_MOMENTUM + 0) * nelr] =
      old_variables[i + (VAR_MOMENTUM + 0) * nelr] +
      factor * fluxes[i + (VAR_MOMENTUM + 0) * nelr];
  variables[i + (VAR_MOMENTUM + 1) * nelr] =
      old_variables[i + (VAR_MOMENTUM + 1) * nelr] +
      factor * fluxes[i + (VAR_MOMENTUM + 1) * nelr];
  variables[i + (VAR_MOMENTUM + 2) * nelr] =
      old_variables[i + (VAR_MOMENTUM + 2) * nelr] +
      factor * fluxes[i + (VAR_MOMENTUM + 2) * nelr];
 }
 // Host-side launcher for the cuda_time_step kernel.
 // Launches nelr / BLOCK_SIZE_4 blocks of BLOCK_SIZE_4 threads; the division
 // truncates, so nelr is expected to be a multiple of BLOCK_SIZE_4. j and all
 // pointer arguments are forwarded to the kernel unchanged.
 void time_step(int j, int nelr, float *old_variables, float *variables,
               float *step_factors, float *fluxes) {
  const dim3 block_shape(BLOCK_SIZE_4);
  const dim3 grid_shape(nelr / BLOCK_SIZE_4);
  cuda_time_step<<<grid_shape, block_shape>>>(j, nelr, old_variables,
                                              variables, step_factors, fluxes);
  // report a failed launch with the same message as before
  getLastCudaError("update failed");
 }
 /*
  * Main function
  *
  * Usage: <program> <domain data file>
  *
  * Steps:
  *   1. compute the far field state on the host and copy it to the device
  *      symbols (ff_variable, ff_flux_contribution_*)
  *   2. read the element areas, neighbor indices and face normals from the
  *      data file, padding the element count up to a multiple of BLOCK_SIZE_0
  *   3. run `iterations` outer iterations of RK sub-steps
  *      (compute_step_factor, then compute_flux + time_step per sub-step)
  *   4. print the average time per iteration, dump the solution, free memory
  *
  * Returns 0 on success (and, as before, when no file name is given);
  * returns 1 when the element count cannot be read from the data file.
  */
 int main(int argc, char **argv) {
  printf("WG size of kernel:initialize = %d, WG size of "
         "kernel:compute_step_factor = %d, WG size of kernel:compute_flux = "
         "%d, WG size of kernel:time_step = %d\n",
         BLOCK_SIZE_1, BLOCK_SIZE_2, BLOCK_SIZE_3, BLOCK_SIZE_4);
  if (argc < 2) {
    std::cout << "specify data file name" << std::endl;
    return 0;
  }
  const char *data_file_name = argv[1];
  checkCudaErrors(cudaSetDevice(0));
  // set far field conditions and load them into constant memory on the gpu
  {
    float h_ff_variable[NVAR];
    const float angle_of_attack =
        float(3.1415926535897931 / 180.0f) * float(deg_angle_of_attack);
    h_ff_variable[VAR_DENSITY] = float(1.4);
    float ff_pressure = float(1.0f);
    float ff_speed_of_sound =
        sqrt(GAMMA * ff_pressure / h_ff_variable[VAR_DENSITY]);
    float ff_speed = float(ff_mach) * ff_speed_of_sound;
    float3 ff_velocity;
    ff_velocity.x = ff_speed * float(cos((float)angle_of_attack));
    ff_velocity.y = ff_speed * float(sin((float)angle_of_attack));
    ff_velocity.z = 0.0f;
    h_ff_variable[VAR_MOMENTUM + 0] =
        h_ff_variable[VAR_DENSITY] * ff_velocity.x;
    h_ff_variable[VAR_MOMENTUM + 1] =
        h_ff_variable[VAR_DENSITY] * ff_velocity.y;
    h_ff_variable[VAR_MOMENTUM + 2] =
        h_ff_variable[VAR_DENSITY] * ff_velocity.z;
    h_ff_variable[VAR_DENSITY_ENERGY] =
        h_ff_variable[VAR_DENSITY] * (float(0.5f) * (ff_speed * ff_speed)) +
        (ff_pressure / float(GAMMA - 1.0f));
    float3 h_ff_momentum;
    h_ff_momentum.x = *(h_ff_variable + VAR_MOMENTUM + 0);
    h_ff_momentum.y = *(h_ff_variable + VAR_MOMENTUM + 1);
    h_ff_momentum.z = *(h_ff_variable + VAR_MOMENTUM + 2);
    float3 h_ff_flux_contribution_momentum_x;
    float3 h_ff_flux_contribution_momentum_y;
    float3 h_ff_flux_contribution_momentum_z;
    float3 h_ff_flux_contribution_density_energy;
    compute_flux_contribution(h_ff_variable[VAR_DENSITY], h_ff_momentum,
                              h_ff_variable[VAR_DENSITY_ENERGY], ff_pressure,
                              ff_velocity, h_ff_flux_contribution_momentum_x,
                              h_ff_flux_contribution_momentum_y,
                              h_ff_flux_contribution_momentum_z,
                              h_ff_flux_contribution_density_energy);
    // copy far field conditions to the gpu
    checkCudaErrors(
        cudaMemcpyToSymbol(ff_variable, h_ff_variable, NVAR * sizeof(float)));
    checkCudaErrors(cudaMemcpyToSymbol(ff_flux_contribution_momentum_x,
                                       &h_ff_flux_contribution_momentum_x,
                                       sizeof(float3)));
    checkCudaErrors(cudaMemcpyToSymbol(ff_flux_contribution_momentum_y,
                                       &h_ff_flux_contribution_momentum_y,
                                       sizeof(float3)));
    checkCudaErrors(cudaMemcpyToSymbol(ff_flux_contribution_momentum_z,
                                       &h_ff_flux_contribution_momentum_z,
                                       sizeof(float3)));
    checkCudaErrors(cudaMemcpyToSymbol(ff_flux_contribution_density_energy,
                                       &h_ff_flux_contribution_density_energy,
                                       sizeof(float3)));
  }
  int nel;
  int nelr;
  // read in domain geometry
  float *areas;
  int *elements_surrounding_elements;
  float *normals;
  {
    std::ifstream file(data_file_name);
    // Stop here if the file cannot be opened or holds no usable element
    // count; otherwise nel would be used uninitialized to size the arrays.
    if (!file || !(file >> nel) || nel <= 0) {
      std::cerr << "could not read element count from " << data_file_name
                << std::endl;
      return 1;
    }
    // round the element count up to the next multiple of BLOCK_SIZE_0
    nelr =
        BLOCK_SIZE_0 * ((nel / BLOCK_SIZE_0) + std::min(1, nel % BLOCK_SIZE_0));
    float *h_areas = new float[nelr];
    int *h_elements_surrounding_elements = new int[nelr * NNB];
    float *h_normals = new float[nelr * NDIM * NNB];
    // read in data
    for (int i = 0; i < nel; i++) {
      file >> h_areas[i];
      for (int j = 0; j < NNB; j++) {
        file >> h_elements_surrounding_elements[i + j * nelr];
        // Negative entries are clamped to -1 and then decremented with all
        // the others, so every negative value ends up as -2 and a 0 in the
        // file ends up as -1.
        if (h_elements_surrounding_elements[i + j * nelr] < 0)
          h_elements_surrounding_elements[i + j * nelr] = -1;
        h_elements_surrounding_elements[i + j * nelr]--; // it's coming in with
                                                         // Fortran numbering
        for (int k = 0; k < NDIM; k++) {
          file >> h_normals[i + (j + k * NNB) * nelr];
          // normals are stored with the opposite sign to the file
          h_normals[i + (j + k * NNB) * nelr] =
              -h_normals[i + (j + k * NNB) * nelr];
        }
      }
    }
    // fill in remaining data
    int last = nel - 1;
    for (int i = nel; i < nelr; i++) {
      h_areas[i] = h_areas[last];
      for (int j = 0; j < NNB; j++) {
        // duplicate the last element
        h_elements_surrounding_elements[i + j * nelr] =
            h_elements_surrounding_elements[last + j * nelr];
        // The destination must be padding element i; writing back to `last`
        // left the padded normals uninitialized.
        for (int k = 0; k < NDIM; k++)
          h_normals[i + (j + k * NNB) * nelr] =
              h_normals[last + (j + k * NNB) * nelr];
      }
    }
    areas = alloc<float>(nelr);
    upload<float>(areas, h_areas, nelr);
    elements_surrounding_elements = alloc<int>(nelr * NNB);
    upload<int>(elements_surrounding_elements, h_elements_surrounding_elements,
                nelr * NNB);
    normals = alloc<float>(nelr * NDIM * NNB);
    upload<float>(normals, h_normals, nelr * NDIM * NNB);
    delete[] h_areas;
    delete[] h_elements_surrounding_elements;
    delete[] h_normals;
  }
  // Create arrays and set initial conditions
  float *variables = alloc<float>(nelr * NVAR);
  initialize_variables(nelr, variables);
  float *old_variables = alloc<float>(nelr * NVAR);
  float *fluxes = alloc<float>(nelr * NVAR);
  float *step_factors = alloc<float>(nelr);
  // make sure all memory is really allocated (touched) before we start timing
  initialize_variables(nelr, old_variables);
  initialize_variables(nelr, fluxes);
  checkCudaErrors(cudaMemset((void *)step_factors, 0, sizeof(float) * nelr));
  // make sure CUDA isn't still doing something before we start timing
  checkCudaErrors(cudaDeviceSynchronize());
  // these need to be computed the first time in order to compute time step
  std::cout << "Starting..." << std::endl;
  StopWatchInterface *timer = 0;
  sdkCreateTimer(&timer);
  sdkStartTimer(&timer);
  // Begin iterations
  for (int i = 0; i < iterations; i++) {
    copy<float>(old_variables, variables, nelr * NVAR);
    // for the first iteration we compute the time step
    compute_step_factor(nelr, variables, areas, step_factors);
    getLastCudaError("compute_step_factor failed");
    for (int j = 0; j < RK; j++) {
      compute_flux(nelr, elements_surrounding_elements, normals, variables,
                   fluxes);
      getLastCudaError("compute_flux failed");
      time_step(j, nelr, old_variables, variables, step_factors, fluxes);
      getLastCudaError("time_step failed");
    }
  }
  // wait for all queued kernels so the timer covers the real execution time
  checkCudaErrors(cudaDeviceSynchronize());
  sdkStopTimer(&timer);
  std::cout << (sdkGetAverageTimerValue(&timer) / 1000.0) / iterations
            << " seconds per iteration" << std::endl;
  std::cout << "Saving solution..." << std::endl;
  dump(variables, nel, nelr);
  std::cout << "Saved solution..." << std::endl;
  std::cout << "Cleaning up..." << std::endl;
  dealloc<float>(areas);
  dealloc<int>(elements_surrounding_elements);
  dealloc<float>(normals);
  dealloc<float>(variables);
  dealloc<float>(old_variables);
  dealloc<float>(fluxes);
  dealloc<float>(step_factors);
  std::cout << "Done..." << std::endl;
  return 0;
 }
--- a/examples/cfd/run.sh
+++ b/examples/cfd/run.sh
@ -1,15 +0,0 @@
 # # #!/bin/bash
 # Build-and-run recipe for the cfd (euler3d) example.
 # NOTE(review): the CUDA install path, the /home/robinhan/... tool paths and
 # the data file location are hard-coded for one machine; adjust before use.
 #
 # Step 1: compile euler3d.cu with clang++ for sm_50. -save-temps keeps the
 # intermediate files; the two .bc files consumed below are presumably among
 # them (TODO confirm the exact file names for your clang version).
 clang++ euler3d.cu -I/usr/local/cuda-10.1/samples/common/inc --cuda-path=/usr/local/cuda-10.1 --cuda-gpu-arch=sm_50 -L/usr/local/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v
 # Step 2: run the project's kernelTranslator on the device bitcode -> kernel.bc
 /home/robinhan/repo/open_source_template/build/compilation/kernelTranslator euler3d-cuda-nvptx64-nvidia-cuda-sm_50.bc kernel.bc
 # Step 3: run the project's hostTranslator on the host bitcode -> host.bc
 /home/robinhan/repo/open_source_template/build/compilation/hostTranslator euler3d-host-x86_64-unknown-linux-gnu.bc  host.bc
 # Step 4: lower both translated bitcode files to position-independent objects
 llc --relocation-model=pic --filetype=obj  kernel.bc
 llc --relocation-model=pic --filetype=obj  host.bc
 # Step 5: link host.o and kernel.o against the x86Runtime and threadPool libraries
 g++ -Wall -L/home/robinhan/repo/open_source_template/build/runtime  -L/home/robinhan/repo/open_source_template/build/runtime/threadPool -o a.out -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread
 # Step 6: run the resulting binary on the 097K domain data file
 ./a.out ../rodinia-data/cfd/fvcorr.domn.097K
 # Leftover, disabled invocations (they name ./demo and other data files):
 # ./demo 1024
 # # # ./demo -f ../../data/matrix3.txt
 # # # run -f ../../data/gaussian/matrix3.txt
--- a/examples/dwt2d/common.h
+++ b/examples/dwt2d/common.h
@ -1,64 +0,0 @@
 /*
 * Copyright (c) 2009, Jiri Matela
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
 #ifndef _COMMON_H
 #define _COMMON_H
 // The error-checking macros below use fprintf/stderr and exit; include their
 // declarations here so the header does not depend on the includer's order.
 #include <stdio.h>
 #include <stdlib.h>
 // 24-bit multiplication is faster on G80,
 // but we must be sure to multiply integers
 // only within [-8M, 8M - 1] range
 #define IMUL(a, b) __mul24(a, b)
 ////cuda timing macros
 //#define CTIMERINIT  cudaEvent_t cstart, cstop; \
 //                    cudaEventCreate(&cstart); \
 //                    cudaEventCreate(&cstop); \
 //                    float elapsedTime
 //#define CTIMERSTART(cstart) cudaEventRecord(cstart,0)
 //#define CTIMERSTOP(cstop) cudaEventRecord(cstop,0); \
 //                          cudaEventSynchronize(cstop); \
 //                          cudaEventElapsedTime(&elapsedTime, cstart, cstop)
 // divide and round up macro
 // (both arguments are evaluated more than once: pass side-effect-free values)
 #define DIVANDRND(a, b) ((((a) % (b)) != 0) ? ((a) / (b) + 1) : ((a) / (b)))
 // Fetch (and clear) the last CUDA runtime error; if there is one, print
 // file, line, msg and the CUDA error string to stderr and exit(-1).
 // Expands to a braced block rather than do { } while (0) on purpose:
 // existing call sites invoke it without a trailing semicolon.
 #define cudaCheckError(msg)                                                    \
  {                                                                            \
    cudaError_t err = cudaGetLastError();                                      \
    if (cudaSuccess != err) {                                                  \
      fprintf(stderr, "%s: %i: %s: %s.\n", __FILE__, __LINE__, msg,            \
              cudaGetErrorString(err));                                        \
      exit(-1);                                                                \
    }                                                                          \
  }
 // Same check, but first wait for the device to finish so errors from
 // asynchronous work are reported too. Uses cudaDeviceSynchronize, the
 // replacement for the deprecated cudaThreadSynchronize.
 #define cudaCheckAsyncError(msg)                                               \
  {                                                                            \
    cudaDeviceSynchronize();                                                   \
    cudaCheckError(msg);                                                       \
  }
 #endif
--- a/examples/dwt2d/components.cu
+++ b/examples/dwt2d/components.cu
@ -1,193 +0,0 @@
 /*
 * Copyright (c) 2009, Jiri Matela
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
 #include <unistd.h>
 #include <error.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <errno.h>
 #include <assert.h>
 #include "components.h"
 #include "common.h"
 #define THREADS 256
 /* Store one RGB triple into the three float planes at index pos.
    Each channel value is divided by 255 and shifted by -0.5 first. */
 __device__ void storeComponents(float *d_r, float *d_g, float *d_b, float r, float g, float b, int pos)
 {
    float *const dstR = d_r + pos;
    float *const dstG = d_g + pos;
    float *const dstB = d_b + pos;
    *dstR = r / 255.0f - 0.5f;
    *dstG = g / 255.0f - 0.5f;
    *dstB = b / 255.0f - 0.5f;
 }
 /* Store one RGB triple into the three integer planes at index pos.
    Each channel value is shifted down by 128 first. */
 __device__ void storeComponents(int *d_r, int *d_g, int *d_b, int r, int g, int b, int pos)
 {
    const int bias = 128;
    d_r[pos] = r - bias;
    d_g[pos] = g - bias;
    d_b[pos] = b - bias;
 }
 /* Store a single float component at index pos: c divided by 255, minus 0.5 */
 __device__ void storeComponent(float *d_c, float c, int pos)
 {
    float *const dst = d_c + pos;
    *dst = c / 255.0f - 0.5f;
 }
 /* Store a single integer component at index pos: c shifted down by 128 */
 __device__ void storeComponent(int *d_c, int c, int pos)
 {
    const int bias = 128;
    d_c[pos] = c - bias;
 }
 /* Copy interleaved 8-bit RGB source data into three separate component
    buffers of type T.

    Launch layout: 1D grid of 1D blocks. The shared staging buffer holds
    THREADS*3 bytes, so the kernel expects blockDim.x == THREADS (this is how
    rgbToComponents launches it). d_src must hold a whole number of
    THREADS*3-byte chunks, one per block; THREADS*3 must be a multiple of 4.

    d_r, d_g, d_b  output planes, written at the pixel's global index
    d_src          interleaved R,G,B source bytes
    pixels         number of valid pixels; threads past it store nothing */
 template<typename T>
 __global__ void c_CopySrcToComponents(T *d_r, T *d_g, T *d_b,
                                  unsigned char * d_src,
                                  int pixels)
 {
    int x  = threadIdx.x;
    int gX = blockDim.x*blockIdx.x;
    /* Staging buffer for this block's THREADS*3 source bytes. It is declared
       as 32-bit words so the word-wide copy below is properly aligned (an
       unsigned char array gives no such guarantee), and the raw bytes are
       moved as integers rather than through float lvalues. */
    __shared__ unsigned int sWords[(THREADS*3)/4];
    const unsigned char *sData = (const unsigned char *)sWords;
    /* Copy data to shared mem by 4bytes
       other checks are not necessary, since
       d_src buffer is aligned to sharedDataSize */
    if ( (x*4) < THREADS*3 ) {
        const unsigned int *s = (const unsigned int *)d_src;
        sWords[x] = s[((gX*3)>>2) + x];
    }
    /* every thread reaches the barrier; the staged bytes are read only after it */
    __syncthreads();
    T r, g, b;
    int offset = x*3;
    r = (T)(sData[offset]);
    g = (T)(sData[offset+1]);
    b = (T)(sData[offset+2]);
    int globalOutputPosition = gX + x;
    if (globalOutputPosition < pixels) {
        storeComponents(d_r, d_g, d_b, r, g, b, globalOutputPosition);
    }
 }
 /* Copy 8-bit single-channel source data into one component buffer of type T.

    Launch layout: 1D grid of 1D blocks. The shared staging buffer holds
    THREADS bytes, so the kernel expects blockDim.x == THREADS (this is how
    bwToComponent launches it). d_src must hold a whole number of
    THREADS-byte chunks, one per block; THREADS must be a multiple of 4.

    d_c     output plane, written at the pixel's global index
    d_src   source bytes, one per pixel
    pixels  number of valid pixels; threads past it store nothing */
 template<typename T>
 __global__ void c_CopySrcToComponent(T *d_c, unsigned char * d_src, int pixels)
 {
    int x  = threadIdx.x;
    int gX = blockDim.x*blockIdx.x;
    /* Staging buffer for this block's THREADS source bytes. It is declared
       as 32-bit words so the word-wide copy below is properly aligned (an
       unsigned char array gives no such guarantee), and the raw bytes are
       moved as integers rather than through float lvalues. */
    __shared__ unsigned int sWords[THREADS/4];
    const unsigned char *sData = (const unsigned char *)sWords;
    /* Copy data to shared mem by 4bytes
       other checks are not necessary, since
       d_src buffer is aligned to sharedDataSize */
    if ( (x*4) < THREADS) {
        const unsigned int *s = (const unsigned int *)d_src;
        sWords[x] = s[(gX>>2) + x];
    }
    /* every thread reaches the barrier; the staged bytes are read only after it */
    __syncthreads();
    T c;
    c = (T)(sData[x]);
    int globalOutputPosition = gX + x;
    if (globalOutputPosition < pixels) {
        storeComponent(d_c, c, globalOutputPosition);
    }
 }
 /* Separate the components of an 8-bit RGB source image.
    src is a host buffer of width*height interleaved RGB pixels; d_r, d_g and
    d_b are the device planes the c_CopySrcToComponents kernel writes to.
    The image is staged in a temporary, zero-filled device buffer whose size
    is rounded up to whole thread blocks of THREADS pixels. */
 template<typename T>
 void rgbToComponents(T *d_r, T *d_g, T *d_b, unsigned char * src, int width, int height)
 {
    const int pixelCount  = width*height;
    /* staging size in bytes: pixel count rounded up to a multiple of THREADS, 3 bytes each */
    const int paddedBytes = DIVANDRND(pixelCount, THREADS) * THREADS * 3;
    /* Allocate and clear the staging buffer */
    unsigned char *devSrc;
    cudaMalloc((void **)&devSrc, paddedBytes);
    cudaCheckAsyncError("Cuda malloc")
    cudaMemset(devSrc, 0, paddedBytes);
    /* Upload the image */
    cudaMemcpy(devSrc, src, pixelCount*3, cudaMemcpyHostToDevice);
    cudaCheckError("Copy data to device")
    /* Launch one block per THREADS pixels */
    assert(paddedBytes%(THREADS*3) == 0);
    const dim3 blockShape(THREADS);
    const dim3 gridShape(paddedBytes/(THREADS*3));
    c_CopySrcToComponents<<<gridShape, blockShape>>>(d_r, d_g, d_b, devSrc, pixelCount);
    cudaCheckAsyncError("CopySrcToComponents kernel")
    /* Release the staging buffer */
    cudaFree(devSrc);
    cudaCheckAsyncError("Free memory")
 }
 /* Explicit instantiations for the two supported component types */
 template void rgbToComponents<float>(float *d_r, float *d_g, float *d_b, unsigned char * src, int width, int height);
 template void rgbToComponents<int>(int *d_r, int *d_g, int *d_b, unsigned char * src, int width, int height);
 /* Copy 8-bit single-channel source image data into a color component of type T.
    src is a host buffer of width*height bytes; d_c is the device plane the
    c_CopySrcToComponent kernel writes to. The image is staged in a temporary,
    zero-filled device buffer whose size is rounded up to a multiple of THREADS. */
 template<typename T>
 void bwToComponent(T *d_c, unsigned char * src, int width, int height)
 {
    const int pixelCount  = width*height;
    /* staging size in bytes: pixel count rounded up to a multiple of THREADS */
    const int paddedBytes = DIVANDRND(pixelCount, THREADS) * THREADS;
    /* Allocate and clear the staging buffer */
    unsigned char *devSrc;
    cudaMalloc((void **)&devSrc, paddedBytes);
    cudaCheckAsyncError("Cuda malloc")
    cudaMemset(devSrc, 0, paddedBytes);
    /* Upload the image */
    cudaMemcpy(devSrc, src, pixelCount, cudaMemcpyHostToDevice);
    cudaCheckError("Copy data to device")
    /* Launch one block per THREADS pixels */
    assert(paddedBytes%(THREADS) == 0);
    const dim3 blockShape(THREADS);
    const dim3 gridShape(paddedBytes/(THREADS));
    c_CopySrcToComponent<<<gridShape, blockShape>>>(d_c, devSrc, pixelCount);
    cudaCheckAsyncError("CopySrcToComponent kernel")
    /* Release the staging buffer */
    cudaFree(devSrc);
    cudaCheckAsyncError("Free memory")
 }
 /* Explicit instantiations for the two supported component types */
 template void bwToComponent<float>(float *d_c, unsigned char *src, int width, int height);
 template void bwToComponent<int>(int *d_c, unsigned char *src, int width, int height);
--- a/examples/dwt2d/components.h
+++ b/examples/dwt2d/components.h
@ -1,39 +0,0 @@
 /*
 * Copyright (c) 2009, Jiri Matela
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
 #ifndef _COMPONENTS_H
 #define _COMPONENTS_H
 /* Separate the components of a source 8-bit RGB image.
  *   d_r, d_g, d_b  device output planes, one value of type T per pixel
  *   src            host buffer of width*height interleaved RGB pixels
  *   width, height  image dimensions in pixels
  * Defined in components.cu, which explicitly instantiates T = float and
  * T = int. */
 template <typename T>
 void rgbToComponents(T *d_r, T *d_g, T *d_b, unsigned char *src, int width,
                     int height);
 /* Copy 8-bit single-channel source image data into a color component of type T.
  *   d_c            device output plane, one value of type T per pixel
  *   src            host buffer of width*height bytes
  *   width, height  image dimensions in pixels
  * Defined in components.cu, which explicitly instantiates T = float and
  * T = int. */
 template <typename T>
 void bwToComponent(T *d_c, unsigned char *src, int width, int height);
 #endif
--- a/examples/dwt2d/dwt.cu
+++ b/examples/dwt2d/dwt.cu
@ -1,385 +0,0 @@
 /*
 * Copyright (c) 2009, Jiri Matela
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
 #include <stdio.h>
 #include <fcntl.h>
 #include <assert.h>
 #include <errno.h>
 #include <sys/time.h>
 #include <unistd.h>
 #include <error.h>
 #include "dwt_cuda/dwt.h"
 #include "dwt_cuda/common.h"
 #include "dwt.h"
 #include "common.h"
 #include <iostream>
 #include <fstream>
 #include <string>
 /* Forward transform for float samples: announces the variant on stdout and
  * hands the buffers to the 9/7 implementation in dwt_cuda. */
 inline void fdwt(float *in, float *out, int width, int height, int levels)
 {
        fputs(" Running fdwt97 Float \n", stdout);
        dwt_cuda::fdwt97(in, out, width, height, levels);
 }
 /*
 inline void fdwt(float *in, float *out, int width, int height, int levels, float *diffOut)
 {
        dwt_cuda::fdwt97(in, out, width, height, levels, diffOut);
 }
 */
 /* Forward transform for int samples: announces the variant on stdout and
  * hands the buffers to the 5/3 implementation in dwt_cuda. */
 inline void fdwt(int *in, int *out, int width, int height, int levels)
 {
        fputs(" Running fdwt53 Int \n", stdout);
        dwt_cuda::fdwt53(in, out, width, height, levels);
 }
 /*
 inline void fdwt(int *in, int *out, int width, int height, int levels, int *diffOut)
 {
        dwt_cuda::fdwt53(in, out, width, height, levels, diffOut);
 }
 */
 /* Reverse transform for float samples: announces the variant on stdout and
  * hands the buffers to the 9/7 implementation in dwt_cuda. */
 inline void rdwt(float *in, float *out, int width, int height, int levels)
 {
        fputs(" Running rdwt97 Float \n", stdout);
        dwt_cuda::rdwt97(in, out, width, height, levels);
 }
 /* Reverse transform for int samples: announces the variant on stdout and
  * hands the buffers to the 5/3 implementation in dwt_cuda. */
 inline void rdwt(int *in, int *out, int width, int height, int levels)
 {
        fputs(" Running rdwt53 Int \n", stdout);
        dwt_cuda::rdwt53(in, out, width, height, levels);
 }
 /*
  * Runs one multi-level 2D DWT on device buffers.
  *
  * The input is first copied into `backup` (device to device) because the
  * transform overwrites its input. Depending on `forward`, the matching fdwt()
  * or rdwt() overload for T is then called with `stages` levels.
  *
  * in, out, backup  device buffers of pixWidth * pixHeight samples of type T
  * forward          true for the forward transform, false for the reverse one
  *
  * Always returns 0; errors of the copy are handled by cudaCheckError
  * (NOTE(review): its exact behaviour is defined in the project's common.h).
  */
 template<typename T>
 int nStage2dDWT(T * in, T * out, T * backup, int pixWidth, int pixHeight, int stages, bool forward)
 {
    /* report the direction that is actually executed */
    printf("\n*** %d stages of 2D %s DWT:\n", stages, forward ? "forward" : "reverse");
    /* create backup of input, because each test iteration overwrites it */
    /* computed in size_t so that large images do not overflow int */
    const size_t size = (size_t)pixHeight * (size_t)pixWidth * sizeof(T);
    cudaMemcpy(backup, in, size, cudaMemcpyDeviceToDevice);
    cudaCheckError("Memcopy device to device");
    /* Measure time of individual levels. */
    if(forward)
        fdwt(in, out, pixWidth, pixHeight, stages);
    else
        rdwt(in, out, pixWidth, pixHeight, stages);
    // Measure overall time of DWT (currently disabled).
 /*    #ifdef GPU_DWT_TESTING_1
    dwt_cuda::CudaDWTTester tester;
    for(int i = tester.getNumIterations(); i--; ) {
        // Recover input and measure one overall DWT run.
        cudaMemcpy(in, backup, size, cudaMemcpyDeviceToDevice);
        cudaCheckError("Memcopy device to device");
        tester.beginTestIteration();
        if(forward)
            fdwt(in, out, pixWidth, pixHeight, stages);
        else
            rdwt(in, out, pixWidth, pixHeight, stages);
        tester.endTestIteration();
    }
    tester.showPerformance("   Overall DWT", pixWidth, pixHeight);
    #endif  // GPU_DWT_TESTING
    cudaCheckAsyncError("DWT Kernel calls");
 */
    return 0;
 }
 template int nStage2dDWT<float>(float*, float*, float*, int, int, int, bool);
 template int nStage2dDWT<int>(int*, int*, int*, int, int, int, bool);
 /*
 template<typename T>
 int nStage2dDWT(T * in, T * out, T * backup, int pixWidth, int pixHeight, int stages, bool forward, T * diffOut)
 {
    printf("*** %d stages of 2D forward DWT:\n", stages);
    // create backup of input, because each test iteration overwrites it
    const int size = pixHeight * pixWidth * sizeof(T);
    cudaMemcpy(backup, in, size, cudaMemcpyDeviceToDevice);
    cudaCheckError("Memcopy device to device");
    // Measure time of individual levels.
    if(forward)
        fdwt(in, out, pixWidth, pixHeight, stages, diffOut);
    else
        rdwt(in, out, pixWidth, pixHeight, stages);
    // Measure overall time of DWT.
    #ifdef GPU_DWT_TESTING_1
    dwt_cuda::CudaDWTTester tester;
    for(int i = tester.getNumIterations(); i--; ) {
        // Recover input and measure one overall DWT run.
        cudaMemcpy(in, backup, size, cudaMemcpyDeviceToDevice);
        cudaCheckError("Memcopy device to device");
        tester.beginTestIteration();
        if(forward)
            fdwt(in, out, pixWidth, pixHeight, stages, diffOut);
        else
            rdwt(in, out, pixWidth, pixHeight, stages);
        tester.endTestIteration();
    }
    tester.showPerformance("   Overall DWT", pixWidth, pixHeight);
    #endif  // GPU_DWT_TESTING
    cudaCheckAsyncError("DWT Kernel calls");
    return 0;
 }
 template int nStage2dDWT<float>(float*, float*, float*, int, int, int, bool, float*);
 template int nStage2dDWT<int>(int*, int*, int*, int, int, int, bool, int*);
 */
 /*
  * Converts float samples to 8-bit values and logs them to "<filename>.txt".
  *
  * Each sample is mapped with (value + 0.5) * 255, clamped to [0, 255] and
  * truncated to unsigned char. One line "index: <i> val: <r> " is written per
  * sample, where <r> is the clamped float value.
  *
  * dst         receives samplesNum converted bytes
  * src         samplesNum input samples
  * filename    base name of the text log; ".txt" is appended
  */
 void samplesToChar(unsigned char * dst, float * src, int samplesNum, const char * filename)
 {
    // std::string sizes itself; the former fixed-size char array had no room
    // for the terminating NUL that strcpy writes.
    const std::string outfile = std::string(filename) + ".txt";
    std::ofstream outputFile;
    outputFile.open(outfile.c_str());
    for(int i = 0; i < samplesNum; i++) {
        float r = (src[i]+0.5f) * 255;
        if (r > 255) r = 255;
        if (r < 0)   r = 0;
        dst[i] = (unsigned char)r;
        outputFile << "index: " << i  << " val: "<< r <<" \n";
    }
    outputFile.close();
 }
 /*
  * Converts int samples to 8-bit values and logs them to "<filename>.txt".
  *
  * Each sample is shifted by +128, clamped to [0, 255] and stored as unsigned
  * char. One line "index: <i> val: <r> " is written per sample, where <r> is
  * the clamped value.
  *
  * dst         receives samplesNum converted bytes
  * src         samplesNum input samples
  * filename    base name of the text log; ".txt" is appended
  */
 void samplesToChar(unsigned char * dst, int * src, int samplesNum, const char * filename)
 {
    // std::string sizes itself; the former fixed-size char array had no room
    // for the terminating NUL that strcpy writes.
    const std::string outfile = std::string(filename) + ".txt";
    std::ofstream outputFile;
    outputFile.open(outfile.c_str());
    for(int i = 0; i < samplesNum; i++) {
        int r = src[i]+128;
        if (r > 255) r = 255;
        if (r < 0)   r = 0;
        dst[i] = (unsigned char)r;
        // log every converted value so the output can be checked
        outputFile << "index: " << i  << " val: "<< r <<" \n";
    }
    outputFile.close();
 }
 /*
  * Write output linearly ordered.
  *
  * Copies one component from device memory to the host, converts it to 8-bit
  * samples with samplesToChar (which also writes "<filename>.txt") and writes
  * the raw bytes to "<filename><suffix>".
  *
  * component_cuda  device buffer holding pixWidth * pixHeight samples
  * filename        base name of the output files
  * suffix          appended to filename for the raw output file
  *
  * Returns 0 on success, 1 if the data could not be written completely and
  * -1 if a host buffer cannot be allocated or the file cannot be opened.
  */
 template<typename T>
 int writeLinear(T *component_cuda, int pixWidth, int pixHeight,
                const char * filename, const char * suffix)
 {
    unsigned char * result;
    T *gpu_output = NULL;
    int fd;
    int status = 0;
    int samplesNum = pixWidth*pixHeight;
    const size_t size = (size_t)samplesNum*sizeof(T);
    cudaMallocHost((void **)&gpu_output, size);
    cudaCheckError("Malloc host");
    memset(gpu_output, 0, size);
    result = (unsigned char *)malloc(samplesNum);
    if (result == NULL) {
        error(0,errno,"cannot allocate %d bytes", samplesNum);
        cudaFreeHost(gpu_output);
        return -1;
    }
    cudaMemcpy(gpu_output, component_cuda, size, cudaMemcpyDeviceToHost);
    cudaCheckError("Memcopy device to host");
    /* T to char */
    samplesToChar(result, gpu_output, samplesNum, filename);
    /* Write component */
    // std::string sizes itself; the former fixed-size char array had no room
    // for the terminating NUL that strcpy writes.
    const std::string outfile = std::string(filename) + suffix;
    // O_TRUNC: do not keep the tail of a longer, pre-existing file
    fd = open(outfile.c_str(), O_CREAT|O_WRONLY|O_TRUNC, 0644);
    if (fd == -1) {
        error(0,errno,"cannot access %s", outfile.c_str());
        status = -1;
    } else {
        printf("\nWriting to %s (%d x %d)\n", outfile.c_str(), pixWidth, pixHeight);
        ssize_t x = write(fd, result, samplesNum);
        close(fd);
        // a failed (-1) or short write is a failure as well, not only x == 0
        if (x <= 0 || x < (ssize_t)samplesNum) status = 1;
    }
    /* Clean up (also on the error path, which used to leak both buffers) */
    cudaFreeHost(gpu_output);
    cudaCheckError("Cuda free host memory");
    free(result);
    return status;
 }
 template int writeLinear<float>(float *component_cuda, int pixWidth, int pixHeight, const char * filename, const char * suffix);
 template int writeLinear<int>(int *component_cuda, int pixWidth, int pixHeight, const char * filename, const char * suffix);
 /*
  * Write output visually ordered.
  *
  * Copies the DWT coefficients of one component from device memory, moves each
  * band from the consecutive per-band layout into its 2D position inside a
  * pixWidth x pixHeight image, converts the samples to 8 bits with
  * samplesToChar (which also writes "<filename>.txt") and writes the raw bytes
  * to "<filename><suffix>".
  *
  * component_cuda  device buffer holding pixWidth * pixHeight coefficients
  * stages          number of DWT levels stored in the buffer (must be >= 1)
  * filename        base name of the output files
  * suffix          appended to filename for the raw output file
  *
  * Returns 0 on success, 1 if the data could not be written completely and
  * -1 on an invalid stage count, a failed allocation or if the output file
  * cannot be opened.
  */
 template<typename T>
 int writeNStage2DDWT(T *component_cuda, int pixWidth, int pixHeight,
                     int stages, const char * filename, const char * suffix)
 {
    struct band {
        int dimX;
        int dimY;
    };
    struct dimensions {
        struct band LL;
        struct band HL;
        struct band LH;
        struct band HH;
    };
    unsigned char * result;
    T *src = NULL;
    T *dst;
    int i,s;
    int fd;
    int status = 0;
    size_t size;
    size_t rowBytes;
    int offset;
    int yOffset;
    int samplesNum = pixWidth*pixHeight;
    struct dimensions * bandDims;
    /* bandDims[0] and bandDims[stages-1] are accessed unconditionally below */
    if (stages < 1) {
        error(0,0,"invalid number of DWT stages: %d", stages);
        return -1;
    }
    bandDims = (struct dimensions *)malloc(stages * sizeof(struct dimensions));
    if (bandDims == NULL) {
        error(0,errno,"cannot allocate band dimensions");
        return -1;
    }
    /* Band sizes of the first level are derived from the image size ... */
    bandDims[0].LL.dimX = DIVANDRND(pixWidth,2);
    bandDims[0].LL.dimY = DIVANDRND(pixHeight,2);
    bandDims[0].HL.dimX = pixWidth - bandDims[0].LL.dimX;
    bandDims[0].HL.dimY = bandDims[0].LL.dimY;
    bandDims[0].LH.dimX = bandDims[0].LL.dimX;
    bandDims[0].LH.dimY = pixHeight - bandDims[0].LL.dimY;
    bandDims[0].HH.dimX = bandDims[0].HL.dimX;
    bandDims[0].HH.dimY = bandDims[0].LH.dimY;
    /* ... and every further level splits the LL band of the previous one */
    for (i = 1; i < stages; i++) {
        bandDims[i].LL.dimX = DIVANDRND(bandDims[i-1].LL.dimX,2);
        bandDims[i].LL.dimY = DIVANDRND(bandDims[i-1].LL.dimY,2);
        bandDims[i].HL.dimX = bandDims[i-1].LL.dimX - bandDims[i].LL.dimX;
        bandDims[i].HL.dimY = bandDims[i].LL.dimY;
        bandDims[i].LH.dimX = bandDims[i].LL.dimX;
        bandDims[i].LH.dimY = bandDims[i-1].LL.dimY - bandDims[i].LL.dimY;
        bandDims[i].HH.dimX = bandDims[i].HL.dimX;
        bandDims[i].HH.dimY = bandDims[i].LH.dimY;
    }
 #if 0
    printf("Original image pixWidth x pixHeight: %d x %d\n", pixWidth, pixHeight);
    for (i = 0; i < stages; i++) {
        printf("Stage %d: LL: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].LL.dimX, bandDims[i].LL.dimY);
        printf("Stage %d: HL: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].HL.dimX, bandDims[i].HL.dimY);
        printf("Stage %d: LH: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].LH.dimX, bandDims[i].LH.dimY);
        printf("Stage %d: HH: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].HH.dimX, bandDims[i].HH.dimY);
    }
 #endif
    size = (size_t)samplesNum*sizeof(T);
    cudaMallocHost((void **)&src, size);
    cudaCheckError("Malloc host");
    dst = (T*)malloc(size);
    result = (unsigned char *)malloc(samplesNum);
    if (dst == NULL || result == NULL) {
        error(0,errno,"cannot allocate host buffers");
        free(dst);
        free(result);
        free(bandDims);
        cudaFreeHost(src);
        return -1;
    }
    memset(src, 0, size);
    memset(dst, 0, size);
    cudaMemcpy(src, component_cuda, size, cudaMemcpyDeviceToHost);
    cudaCheckError("Memcopy device to host");
    // LL Band
    rowBytes = bandDims[stages-1].LL.dimX * sizeof(T);
    for (i = 0; i < bandDims[stages-1].LL.dimY; i++) {
        memcpy(dst+i*pixWidth, src+i*bandDims[stages-1].LL.dimX, rowBytes);
    }
    for (s = stages - 1; s >= 0; s--) {
        // HL Band
        rowBytes = bandDims[s].HL.dimX * sizeof(T);
        offset = bandDims[s].LL.dimX * bandDims[s].LL.dimY;
        for (i = 0; i < bandDims[s].HL.dimY; i++) {
            memcpy(dst+i*pixWidth+bandDims[s].LL.dimX,
                src+offset+i*bandDims[s].HL.dimX,
                rowBytes);
        }
        // LH band
        rowBytes = bandDims[s].LH.dimX * sizeof(T);
        offset += bandDims[s].HL.dimX * bandDims[s].HL.dimY;
        yOffset = bandDims[s].LL.dimY;
        // The LH band has LH.dimY rows. Iterating HL.dimY rows (as before)
        // copies one row too many for odd heights and writes past dst.
        for (i = 0; i < bandDims[s].LH.dimY; i++) {
            memcpy(dst+(yOffset+i)*pixWidth,
                src+offset+i*bandDims[s].LH.dimX,
                rowBytes);
        }
        //HH band
        rowBytes = bandDims[s].HH.dimX * sizeof(T);
        offset += bandDims[s].LH.dimX * bandDims[s].LH.dimY;
        yOffset = bandDims[s].HL.dimY;
        for (i = 0; i < bandDims[s].HH.dimY; i++) {
            memcpy(dst+(yOffset+i)*pixWidth+bandDims[s].LH.dimX,
                src+offset+i*bandDims[s].HH.dimX,
                rowBytes);
        }
    }
    /* Write component */
    samplesToChar(result, dst, samplesNum, filename);
    // std::string sizes itself; the former fixed-size char array had no room
    // for the terminating NUL that strcpy writes.
    const std::string outfile = std::string(filename) + suffix;
    // O_TRUNC: do not keep the tail of a longer, pre-existing file
    fd = open(outfile.c_str(), O_CREAT|O_WRONLY|O_TRUNC, 0644);
    if (fd == -1) {
        error(0,errno,"cannot access %s", outfile.c_str());
        status = -1;
    } else {
        printf("\nWriting to %s (%d x %d)\n", outfile.c_str(), pixWidth, pixHeight);
        ssize_t x = write(fd, result, samplesNum);
        close(fd);
        // a failed (-1) or short write is a failure as well, not only x == 0
        if (x <= 0 || x < (ssize_t)samplesNum) status = 1;
    }
    /* Clean up (also on the error path, which used to leak all buffers) */
    cudaFreeHost(src);
    cudaCheckError("Cuda free host memory");
    free(dst);
    free(result);
    free(bandDims);
    return status;
 }
 template int writeNStage2DDWT<float>(float *component_cuda, int pixWidth, int pixHeight, int stages, const char * filename, const char * suffix);
 template int writeNStage2DDWT<int>(int *component_cuda, int pixWidth, int pixHeight, int stages, const char * filename, const char * suffix);
--- a/examples/dwt2d/dwt.h
+++ b/examples/dwt2d/dwt.h
@ -1,41 +0,0 @@
 /*
 * Copyright (c) 2009, Jiri Matela
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
 #ifndef _DWT_H
 #define _DWT_H
 /* Copy `in` to `backup` (device to device), then run the forward (forward ==
  * true) or reverse DWT with `stages` levels from `in` to `out`.
  * The definition in dwt.cu always returns 0 and is explicitly instantiated
  * for float and int. */
 template <typename T>
 int nStage2dDWT(T *in, T *out, T *backup, int pixWidth, int pixHeight,
                 int stages, bool forward);
 /* Copy the component from device memory, arrange the DWT bands of all
  * `stages` levels at their 2D positions, convert to 8-bit samples and write
  * them to the file named filename + suffix.
  * Returns -1 if the file cannot be opened, 1 if write() reports 0 bytes,
  * otherwise 0. */
 template <typename T>
 int writeNStage2DDWT(T *component_cuda, int width, int height, int stages,
                      const char *filename, const char *suffix);
 /* Copy the component from device memory, convert it to 8-bit samples in
  * buffer order and write them to the file named filename + suffix.
  * Returns -1 if the file cannot be opened, 1 if write() reports 0 bytes,
  * otherwise 0. */
 template <typename T>
 int writeLinear(T *component_cuda, int width, int height, const char *filename,
                 const char *suffix);
 #endif
--- a/examples/dwt2d/dwt_cuda/common.cu
+++ b/examples/dwt2d/dwt_cuda/common.cu
@ -1,35 +0,0 @@
 ///
 /// @file    common.cu
 /// @author  Martin Jirman (207962@mail.muni.cz)
 /// @date    2011-01-20 14:37
 ///
 /// Copyright (c) 2011 Martin Jirman
 /// All rights reserved.
 ///
 /// Redistribution and use in source and binary forms, with or without
 /// modification, are permitted provided that the following conditions are met:
 ///
 ///     * Redistributions of source code must retain the above copyright
 ///       notice, this list of conditions and the following disclaimer.
 ///     * Redistributions in binary form must reproduce the above copyright
 ///       notice, this list of conditions and the following disclaimer in the
 ///       documentation and/or other materials provided with the distribution.
 ///
 /// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 /// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 /// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 /// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 /// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 /// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 /// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 /// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 /// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 /// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 /// POSSIBILITY OF SUCH DAMAGE.
 ///
 #include "common.h"
 namespace dwt_cuda {
  /// Storage for the static flag declared in CudaDWTTester (common.h);
  /// initially false, i.e. no test iteration is running.
  bool CudaDWTTester::testRunning = false;
 }
--- a/examples/dwt2d/dwt_cuda/common.h
+++ b/examples/dwt2d/dwt_cuda/common.h
@ -1,232 +0,0 @@
 ///
 /// @file    common.h
 /// @author  Martin Jirman (207962@mail.muni.cz)
 /// @brief   Common stuff for all CUDA dwt functions.
 /// @date    2011-01-20 14:19
 ///
 /// Copyright (c) 2011 Martin Jirman
 /// All rights reserved.
 ///
 /// Redistribution and use in source and binary forms, with or without
 /// modification, are permitted provided that the following conditions are met:
 ///
 ///     * Redistributions of source code must retain the above copyright
 ///       notice, this list of conditions and the following disclaimer.
 ///     * Redistributions in binary form must reproduce the above copyright
 ///       notice, this list of conditions and the following disclaimer in the
 ///       documentation and/or other materials provided with the distribution.
 ///
 /// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 /// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 /// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 /// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 /// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 /// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 /// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 /// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 /// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 /// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 /// POSSIBILITY OF SUCH DAMAGE.
 ///
 #ifndef DWT_COMMON_H
 #define DWT_COMMON_H
 #include <algorithm>
 #include <cstdio>
 #include <vector>
 // compile time minimum macro
 // Note: each argument appears twice in the expansion, so arguments with side
 // effects would be evaluated more than once.
 #define CTMIN(a, b) (((a) < (b)) ? (a) : (b))
 // performance testing macros
 // With GPU_DWT_TESTING defined, PERF_BEGIN opens a scope, creates a
 // CudaDWTTester and starts a loop over getNumIterations() iterations, each
 // begun with beginTestIteration(). PERF_END ends the iteration, closes the
 // loop, prints the statistics via showPerformance() and closes the scope.
 // The two macros must therefore always be used as a pair around the measured
 // statements. Without GPU_DWT_TESTING both expand to nothing, so the enclosed
 // statements run exactly once.
 #if defined(GPU_DWT_TESTING)
 #define PERF_BEGIN                                                             \
  {                                                                            \
    dwt_cuda::CudaDWTTester PERF_TESTER;                                       \
    for (int PERF_N = PERF_TESTER.getNumIterations(); PERF_N--;) {             \
      PERF_TESTER.beginTestIteration();
 #define PERF_END(PERF_NAME, PERF_W, PERF_H)                                    \
  PERF_TESTER.endTestIteration();                                              \
  }                                                                            \
  PERF_TESTER.showPerformance(PERF_NAME, PERF_W, PERF_H);                      \
  }
 #else // GPU_DWT_TESTING
 #define PERF_BEGIN
 #define PERF_END(PERF_NAME, PERF_W, PERF_H)
 #endif // GPU_DWT_TESTING
 namespace dwt_cuda {
 /// Divides n by d and adds one whenever the division leaves a remainder.
 /// @param n  dividend
 /// @param d  divisor
 /// @return quotient, incremented if n is not a multiple of d
 template <typename T>
 __device__ __host__ inline T divRndUp(const T &n, const T &d) {
  const T quotient = n / d;
  const bool hasRemainder = (n % d) != 0;
  return hasRemainder ? quotient + 1 : quotient;
 }
 // 9/7 forward DWT lifting schema coefficients
 const float f97Predict1 = -1.586134342;  ///< forward 9/7 predict 1
 const float f97Update1 = -0.05298011854; ///< forward 9/7 update 1
 const float f97Predict2 = 0.8829110762;  ///< forward 9/7 predict 2
 const float f97Update2 = 0.4435068522;   ///< forward 9/7 update 2
 // 9/7 reverse DWT lifting schema coefficients
 // (each one is the negated forward coefficient)
 const float r97update2 = -f97Update2;   ///< undo 9/7 update 2
 const float r97predict2 = -f97Predict2; ///< undo 9/7 predict 2
 const float r97update1 = -f97Update1;   ///< undo 9/7 update 1
 const float r97Predict1 = -f97Predict1; ///< undo 9/7 predict 1
 // FDWT 9/7 scaling coefficients
 const float scale97Mul = 1.23017410491400f;
 const float scale97Div = 1.0 / scale97Mul; ///< reciprocal of scale97Mul
 // 5/3 forward DWT lifting schema coefficients
 const float forward53Predict = -0.5f; /// forward 5/3 predict
 const float forward53Update = 0.25f;  /// forward 5/3 update
 // 5/3 reverse DWT lifting schema coefficients
 const float reverse53Update = -forward53Update;   /// undo 5/3 update
 const float reverse53Predict = -forward53Predict; /// undo 5/3 predict
 /// Functor that adds the scaled sum of the two neighbors to the central
 /// pixel: c becomes c + scale * (p + n).
 struct AddScaledSum {
  const float scale; // scale of neighbors
  __device__ AddScaledSum(const float scale) : scale(scale) {}
  /// @param p  previous neighbor
  /// @param c  central pixel, updated in place
  /// @param n  next neighbor
  __device__ void operator()(const float p, float &c, const float n) const {
    const float neighborSum = p + n;
    c = c + scale * neighborSum;
  }
 };
 /// Maps threadIdx.x to a parity-separated index: threads of the first half
 /// receive the even indices, threads of the second half the odd ones, and no
 /// two threads share an index.
 /// Example: (for 8 threads)   threadIdx.x:   0  1  2  3  4  5  6  7
 ///                              parityIdx:   0  2  4  6  1  3  5  7
 /// @tparam THREADS  total count of participating threads
 /// @return parity-separated index of thread
 template <int THREADS> __device__ inline int parityIdx() {
  const unsigned int tid = threadIdx.x;
  // 0 for threads of the first half, 1 for threads of the second half
  const unsigned int half = tid / (THREADS / 2);
  const unsigned int doubled = tid * 2;
  return doubled - (THREADS - 1) * half;
 }
 /// size of shared memory: 48 KB when __CUDA_ARCH__ is defined and >= 200,
 /// 16 KB otherwise.
 /// NOTE(review): __CUDA_ARCH__ is normally defined only while compiling device
 /// code, so host code would see the 16 KB value - confirm this is intended.
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
 const int SHM_SIZE = 48 * 1024;
 #else
 const int SHM_SIZE = 16 * 1024;
 #endif
 /// Performance and return code tester.
 /// Measures the run time of repeated test iterations with CUDA events and
 /// offers static helpers for checking CUDA return codes. While any test
 /// iteration is running, newly constructed testers are disabled and the
 /// check helpers report success without inspecting anything.
 class CudaDWTTester {
 private:
  static bool testRunning;  ///< true if any test is currently running
  cudaEvent_t beginEvent;   ///< begin CUDA event
  cudaEvent_t endEvent;     ///< end CUDA event
  std::vector<float> times; ///< collected times
  const bool disabled;      ///< true if this object is disabled
 public:
  /// Checks CUDA related error. The check is only active when
  /// GPU_DWT_TESTING is defined; otherwise true is always returned.
  /// @param status   return code to be checked
  /// @param message  message to be shown if there was an error
  /// @return true if there was no error, false otherwise
  static bool check(const cudaError_t &status, const char *message) {
 #if defined(GPU_DWT_TESTING)
    if ((!testRunning) && status != cudaSuccess) {
      const char *errorString = cudaGetErrorString(status);
      fprintf(stderr, "CUDA ERROR: '%s': %s\n", message, errorString);
      fflush(stderr);
      return false;
    }
 #endif // GPU_DWT_TESTING
    return true;
  }
  /// Checks last kernel call for errors. Only active when GPU_DWT_TESTING
  /// is defined and no test iteration is running.
  /// @param message  description of the kernel call
  /// @return true if there was no error, false otherwise
  static bool checkLastKernelCall(const char *message) {
 #if defined(GPU_DWT_TESTING)
    // cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize
    return testRunning ? true : check(cudaDeviceSynchronize(), message);
 #else  // GPU_DWT_TESTING
    return true;
 #endif // GPU_DWT_TESTING
  }
  /// Initializes DWT tester for time measurement. The tester is disabled
  /// if another test is already running at construction time.
  CudaDWTTester() : disabled(testRunning) {}
  /// Gets preferred number of iterations (1 if disabled, 31 otherwise).
  int getNumIterations() { return disabled ? 1 : 31; }
  /// Starts one test iteration.
  void beginTestIteration() {
    if (!disabled) {
      cudaEventCreate(&beginEvent);
      cudaEventCreate(&endEvent);
      cudaEventRecord(beginEvent, 0);
      testRunning = true;
    }
  }
  /// Ends one test iteration and stores its elapsed time.
  void endTestIteration() {
    if (!disabled) {
      float time;
      testRunning = false;
      cudaEventRecord(endEvent, 0);
      cudaEventSynchronize(endEvent);
      cudaEventElapsedTime(&time, beginEvent, endEvent);
      cudaEventDestroy(beginEvent);
      cudaEventDestroy(endEvent);
      times.push_back(time);
    }
  }
  /// Shows brief info about all iterations. Prints nothing if the tester is
  /// disabled or if no iteration has been recorded.
  /// @param name   name of processing method
  /// @param sizeX  width of processed image
  /// @param sizeY  height of processed image
  void showPerformance(const char *name, const int sizeX, const int sizeY) {
    // without samples the median lookup would index an empty vector and the
    // mean would divide by zero
    if (!disabled && !times.empty()) {
      // compute mean and median
      std::sort(times.begin(), times.end());
      double sum = 0;
      for (int i = times.size(); i--;) {
        sum += times[i];
      }
      const double median =
          (times[times.size() / 2] + times[(times.size() - 1) / 2]) * 0.5f;
      printf("  %s:   %7.3f ms (mean)   %7.3f ms (median)   %7.3f ms (max)  "
             "(%d x %d)\n",
             name, (sum / times.size()), median, times[times.size() - 1], sizeX,
             sizeY);
    }
  }
 };
 /// Device-to-device cudaMemcpy of an sx * sy image of type T, wrapped in the
 /// performance testing macros; the resulting status is passed to
 /// CudaDWTTester::check.
 /// @param dest  destination buffer
 /// @param src   source buffer
 /// @param sx    width of copied image
 /// @param sy    height of copied image
 template <typename T>
 inline void memCopy(T *const dest, const T *const src, const size_t sx,
                     const size_t sy) {
  const size_t byteCount = sx * sy * sizeof(T);
  cudaError_t status;
  PERF_BEGIN
  status = cudaMemcpy(dest, src, byteCount, cudaMemcpyDeviceToDevice);
  PERF_END("        memcpy", sx, sy)
  CudaDWTTester::check(status, "memcpy device > device");
 }
 } // end of namespace dwt_cuda
 #endif // DWT_COMMON_H
--- a/examples/dwt2d/dwt_cuda/dwt.h
+++ b/examples/dwt2d/dwt_cuda/dwt.h
@ -1,103 +0,0 @@
 ///
 /// @file    dwt.h
 /// @author  Martin Jirman (207962@mail.muni.cz)
 /// @brief   Entry points for CUDA implementation of 9/7 and 5/3 DWT.
 /// @date    2011-01-20 11:41
 ///
 ///
 ///
 /// Copyright (c) 2011 Martin Jirman
 /// All rights reserved.
 ///
 /// Redistribution and use in source and binary forms, with or without
 /// modification, are permitted provided that the following conditions are met:
 ///
 ///     * Redistributions of source code must retain the above copyright
 ///       notice, this list of conditions and the following disclaimer.
 ///     * Redistributions in binary form must reproduce the above copyright
 ///       notice, this list of conditions and the following disclaimer in the
 ///       documentation and/or other materials provided with the distribution.
 ///
 /// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 /// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 /// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 /// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 /// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 /// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 /// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 /// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 /// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 /// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 /// POSSIBILITY OF SUCH DAMAGE.
 ///
 ///
 ///
 /// Following conditions are common for all four DWT functions:
 /// - Both input and output images are stored in GPU memory with no padding
 ///   of lines or interleaving of pixels.
 /// - DWT coefficients are stored as follows: Each band is saved as one
 ///   consecutive chunk (no padding/stride/interleaving). Deepest level bands
 ///   (smallest ones) are stored first (at the beginning of the input/output
 ///   buffers), less deep bands follow. There is no padding between stored
 ///   bands in the buffer. Order of bands of the same level in the buffer is
 ///   following: Low-low band (or deeper level subbands) is stored first.
 ///   Vertical-low/horizontal-high band follows. Vertical-high/horizontal-low
 ///   band is saved next and finally, the high-high band is saved. Out of all
 ///   low-low bands, only the deepest one is saved (right at the beginning of
 ///   the buffer), others are replaced with deeper level subbands.
 /// - Input images of all functions won't be preserved (will be overwritten).
 /// - Input and output buffers can't overlap.
 /// - Size of output buffer must be greater or equal to size of input buffer.
 ///
 /// There are no common compile time settings (buffer size, etc...) for
 /// all DWTs, because each DWT type needs different amount of GPU resources.
 /// Instead, each DWT type has its own compile time settings, which can be
 /// found in *.cu file, where it is implemented.
 ///
 #ifndef DWT_CUDA_H
 #define DWT_CUDA_H
 namespace dwt_cuda {
 /// Forward 5/3 2D DWT. See common rules (above) for more details.
 /// @param in      Expected to be normalized into range [-128, 127].
 ///                Will not be preserved (will be overwritten).
 /// @param out     output buffer on GPU
 /// @param sizeX   width of input image (in pixels)
 /// @param sizeY   height of input image (in pixels)
 /// @param levels  number of recursive DWT levels
 void fdwt53(int *in, int *out, int sizeX, int sizeY, int levels);
 /// Reverse 5/3 2D DWT. See common rules (above) for more details.
 /// @param in      Input DWT coefficients. Format described in common rules.
 ///                Will not be preserved (will be overwritten).
 /// @param out     output buffer on GPU - will contain original image
 ///                in normalized range [-128, 127].
 /// @param sizeX   width of input image (in pixels)
 /// @param sizeY   height of input image (in pixels)
 /// @param levels  number of recursive DWT levels
 void rdwt53(int *in, int *out, int sizeX, int sizeY, int levels);
 /// Forward 9/7 2D DWT. See common rules (above) for more details.
 /// @param in      Input image samples. Should be normalized (in range
 ///                [-0.5, 0.5]). Will not be preserved (will be overwritten).
 /// @param out     output buffer on GPU - format specified in common rules
 /// @param sizeX   width of input image (in pixels)
 /// @param sizeY   height of input image (in pixels)
 /// @param levels  number of recursive DWT levels
 void fdwt97(float *in, float *out, int sizeX, int sizeY, int levels);
 /// Reverse 9/7 2D DWT. See common rules (above) for more details.
 /// @param in      Input DWT coefficients. Format described in common rules.
 ///                Will not be preserved (will be overwritten).
 /// @param out     output buffer on GPU - will contain original image
 ///                in normalized range [-0.5, 0.5].
 /// @param sizeX   width of input image (in pixels)
 /// @param sizeY   height of input image (in pixels)
 /// @param levels  number of recursive DWT levels
 void rdwt97(float *in, float *out, int sizeX, int sizeY, int levels);
 } // namespace dwt_cuda
 #endif // DWT_CUDA_H
--- a/examples/dwt2d/dwt_cuda/fdwt53.cu
+++ b/examples/dwt2d/dwt_cuda/fdwt53.cu
@ -1,400 +0,0 @@
 /// @file    fdwt53.cu
 /// @brief   CUDA implementation of forward 5/3 2D DWT.
 /// @author  Martin Jirman (207962@mail.muni.cz)
 /// @date    2011-02-04 13:23
 ///
 ///
 /// Copyright (c) 2011 Martin Jirman
 /// All rights reserved.
 ///
 /// Redistribution and use in source and binary forms, with or without
 /// modification, are permitted provided that the following conditions are met:
 ///
 ///     * Redistributions of source code must retain the above copyright
 ///       notice, this list of conditions and the following disclaimer.
 ///     * Redistributions in binary form must reproduce the above copyright
 ///       notice, this list of conditions and the following disclaimer in the
 ///       documentation and/or other materials provided with the distribution.
 ///
 /// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 /// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 /// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 /// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 /// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 /// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 /// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 /// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 /// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 /// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 /// POSSIBILITY OF SUCH DAMAGE.
 ///
 #include "common.h"
 #include "transform_buffer.h"
 #include "io.h"
 namespace dwt_cuda {
  /// Wraps buffer and methods needed for computing one level of 5/3 FDWT
  /// using sliding window approach.
  /// @tparam WIN_SIZE_X  width of sliding window
  /// @tparam WIN_SIZE_Y  height of sliding window
  template <int WIN_SIZE_X, int WIN_SIZE_Y>
  class FDWT53 {
  private:
    /// State which a thread keeps for one input column it processes.
    /// @tparam CHECKED_LOADER  true if the column's loader has to check image
    ///                         boundaries, false if no boundary is nearby
    template <bool CHECKED_LOADER>
    struct FDWT53Column {
      /// pixel loader used for reading this column
      VerticalDWTPixelLoader<int, CHECKED_LOADER> loader;
      /// position of this column in the shared transform buffer
      int offset;
      /// copies of the first 3 loaded pixels (not transformed)
      int pixel0, pixel1, pixel2;
      /// Gives every field some value. Exists only to prevent
      /// 'uninitialized' warnings.
      __device__ void clear() {
        pixel2 = 0;
        pixel1 = 0;
        pixel0 = 0;
        offset = 0;
        loader.clear();
      }
    };
    /// Type of shared memory buffer for 5/3 FDWT transforms.
    /// It has WIN_SIZE_Y + 3 rows: in each sliding window step, every column
    /// holds 3 pixels kept from the previous step followed by WIN_SIZE_Y
    /// newly loaded pixels.
    /// NOTE(review): the last template argument (2) presumably is the number
    /// of boundary columns reserved at the side of the buffer - confirm
    /// against transform_buffer.h.
    typedef TransformBuffer<int, WIN_SIZE_X, WIN_SIZE_Y + 3, 2> FDWT53Buffer;
    /// Actual shared buffer used for forward 5/3 DWT. (The whole FDWT53
    /// object is declared __shared__ in run(), so the buffer is shared by all
    /// threads of the threadblock.)
    FDWT53Buffer buffer;
    /// Difference between indices of two vertical neighbors in buffer.
    enum { STRIDE = FDWT53Buffer::VERTICAL_STRIDE };
    /// Predict step of the forward 5/3 lifting scheme.
    struct Forward53Predict {
      /// Subtracts half of the sum of both neighbors from the center sample.
      /// The division is an integer division (it truncates towards zero).
      /// @param p  first neighbor of the center sample
      /// @param c  center sample (modified in place)
      /// @param n  second neighbor of the center sample
      __device__ void operator() (const int p, int & c, const int n) const {
        // F.8, page 126, ITU-T Rec. T.800 final draft
        const int neighborSum = p + n;
        c = c - neighborSum / 2;
      }
    };
    /// Update step of the forward 5/3 lifting scheme.
    struct Forward53Update {
      /// Adds a quarter of (sum of both neighbors + 2) to the center sample.
      /// The division is an integer division (it truncates towards zero).
      /// @param p  first neighbor of the center sample
      /// @param c  center sample (modified in place)
      /// @param n  second neighbor of the center sample
      __device__ void operator() (const int p, int & c, const int n) const {
        // F.9, page 126, ITU-T Rec. T.800 final draft
        const int roundedSum = p + n + 2;
        c = c + roundedSum / 4;
      }
    };
    /// Prepares one column for the sliding window: stores the column's offset
    /// in the shared buffer, sets up its loader and preloads the first 3
    /// pixels into the column's pixel0 ... pixel2 fields.
    /// @tparam CHECKED  true if loader of the column checks boundaries
    /// @param column    (uninitialized) column info to be initialized
    /// @param input     input image
    /// @param sizeX     width of the input image
    /// @param sizeY     height of the input image
    /// @param colIndex  x-axis coordinate of the column (relative to the left
    ///                  side of this threadblock's block of input pixels)
    /// @param firstY    y-axis coordinate of first image row to be transformed
    template <bool CHECKED>
    __device__ void initColumn(FDWT53Column<CHECKED> & column,
                               const int * const input,
                               const int sizeX, const int sizeY,
                               const int colIndex, const int firstY) {
      // x-axis coordinate of the column within the whole image
      const int firstX = blockIdx.x * WIN_SIZE_X + colIndex;
      // offset of the column with index 'colIndex' in the shared buffer
      column.offset = buffer.getColumnOffset(colIndex);
      if(blockIdx.y != 0) {
        // not the topmost block - regular loading: start 2 rows above firstY
        // and load 3 rows in their natural order
        column.loader.init(sizeX, sizeY, firstX, firstY - 2);
        column.pixel0 = column.loader.loadFrom(input);
        column.pixel1 = column.loader.loadFrom(input);
        column.pixel2 = column.loader.loadFrom(input);
        // Now, the next pixel, which will be loaded by loader, is pixel #1.
      } else {
        // topmost block - the first 3 rows are stored in reverse order
        // (mirroring rules)
        column.loader.init(sizeX, sizeY, firstX, firstY);
        column.pixel2 = column.loader.loadFrom(input);  // loaded pixel #0
        column.pixel1 = column.loader.loadFrom(input);  // loaded pixel #1
        column.pixel0 = column.loader.loadFrom(input);  // loaded pixel #2
        // restart the loader so that it continues with pixel #1
        column.loader.init(sizeX, sizeY, firstX, firstY + 1);
      }
    }
    /// Fills one column of the transform buffer (3 remembered pixels followed
    /// by WIN_SIZE_Y newly loaded ones) and transforms it vertically. Expects
    /// the first 3 pixels in column fields pixel0 ... pixel2 and leaves the
    /// last 3 loaded (untransformed) pixels there for the next call.
    /// @tparam CHECKED  true if loader of the column checks boundaries
    /// @param column    column to be loaded and vertically transformed
    /// @param input     pointer to input image data
    template <bool CHECKED>
    __device__ void loadAndVerticallyTransform(FDWT53Column<CHECKED> & column,
                                               const int * const input) {
      // buffer index of the column's topmost item
      const int top = column.offset;
      // rows #0 to #2: pixels remembered from the previous step
      buffer[top] = column.pixel0;
      buffer[top + STRIDE] = column.pixel1;
      buffer[top + 2 * STRIDE] = column.pixel2;
      // rows #3 to #(WIN_SIZE_Y + 2): newly loaded pixels
      int row = 3;
      while(row < 3 + WIN_SIZE_Y) {
        buffer[top + row * STRIDE] = column.loader.loadFrom(input);
        row++;
      }
      // back up the last 3 rows before the transform modifies them
      const int tail = top + WIN_SIZE_Y * STRIDE;
      column.pixel0 = buffer[tail];
      column.pixel1 = buffer[tail + STRIDE];
      column.pixel2 = buffer[tail + 2 * STRIDE];
      // vertical lifting: predict on odd items, then update on even items
      buffer.forEachVerticalOdd(top, Forward53Predict());
      buffer.forEachVerticalEven(top, Forward53Update());
    }
    /// Actual implementation of 5/3 FDWT, executed by a whole threadblock.
    /// Each thread handles the column with index threadIdx.x; threads 0 to 2
    /// additionally handle one boundary column each (thread 0 the column with
    /// index WIN_SIZE_X, threads 1 and 2 the columns with indices -2 and -1).
    /// The method contains __syncthreads() barriers, so it has to be called by
    /// all threads of the block.
    /// @tparam CHECK_LOADS   true if input loader must check boundaries
    /// @tparam CHECK_WRITES  true if output writer must check boundaries
    /// @param in        input image
    /// @param out       output buffer
    /// @param sizeX     width of the input image
    /// @param sizeY     height of the input image
    /// @param winSteps  number of sliding window steps
    template <bool CHECK_LOADS, bool CHECK_WRITES>
    __device__ void transform(const int * const in, int * const out,
                              const int sizeX, const int sizeY,
                              const int winSteps) {
      // info about one main and one boundary column processed by this thread
      FDWT53Column<CHECK_LOADS> column;
      FDWT53Column<CHECK_LOADS> boundaryColumn;  // only first 3 threads use this
      // Initialize all column info: initialize loaders, compute offset of
      // column in shared buffer and load first 3 pixels of the column.
      // (firstY is the first image row transformed by this threadblock.)
      const int firstY = blockIdx.y * WIN_SIZE_Y * winSteps;
 	    initColumn(column, in, sizeX, sizeY, threadIdx.x, firstY); // main column
      // first 3 threads initialize boundary columns, others do not use them
      // (the column is cleared anyway to prevent 'uninitialized' warnings)
      boundaryColumn.clear();
      if(threadIdx.x < 3) {
        // index of boundary column (relative x-axis coordinate of the column):
        // WIN_SIZE_X for thread 0, -2 and -1 for threads 1 and 2
        const int colId = threadIdx.x + ((threadIdx.x == 0) ? WIN_SIZE_X : -3);
        // initialize the column
        initColumn(boundaryColumn, in, sizeX, sizeY, colId, firstY);
      }
      // index of column which will be written into output by this thread
 	  const int outColumnIndex = parityIdx<WIN_SIZE_X>();
      // offset of column which will be written by this thread into output
      const int outColumnOffset = buffer.getColumnOffset(outColumnIndex);
      // initialize output writer for this thread
      const int outputFirstX = blockIdx.x * WIN_SIZE_X + outColumnIndex;
      VerticalDWTBandWriter<int, CHECK_WRITES> writer;
 	    writer.init(sizeX, sizeY, outputFirstX, firstY);
 			__syncthreads();
      // Sliding window iterations:
      // Each iteration assumes that first 3 pixels of each column are loaded.
     for(int w = 0; w < winSteps; w++) {
 	 // For each column (including boundary columns): load and vertically
        // transform another WIN_SIZE_Y lines.
        loadAndVerticallyTransform(column, in);
        if(threadIdx.x < 3) {
          loadAndVerticallyTransform(boundaryColumn, in);
        }
        // wait for all columns to be vertically transformed and transform all
        // output rows (WIN_SIZE_Y rows starting with row #2) horizontally;
        // the barrier between both horizontal steps makes sure that the
        // predict step is finished in the whole block before the update step
        __syncthreads();
 		buffer.forEachHorizontalOdd(2, WIN_SIZE_Y, Forward53Predict());
        __syncthreads();
        buffer.forEachHorizontalEven(2, WIN_SIZE_Y, Forward53Update());
        // wait for all output rows to be transformed horizontally and write
        // them into output buffer (two rows per iteration: r and r + 1)
        __syncthreads();
        for(int r = 2; r < (2 + WIN_SIZE_Y); r += 2) {
          // Write low coefficients from output column into low band ...
 			writer.writeLowInto(out, buffer[outColumnOffset + r * STRIDE]);
          // ... and high coefficients into the high band.
 			writer.writeHighInto(out, buffer[outColumnOffset + (r+1) * STRIDE]);
        }
        // before proceeding to next iteration, wait for all output columns
        // to be written into the output (next iteration overwrites the buffer)
        __syncthreads();
 	    }
    }
  public:
    /// Checks whether this threadblock's pixels are close to the right or to
    /// the bottom image boundary and runs the matching variant of the
    /// transform. Variants without boundary checks are slightly faster and
    /// are used wherever no boundary is nearby.
    /// @param in     input image
    /// @param out    output buffer
    /// @param sx     width of the input image
    /// @param sy     height of the input image
    /// @param steps  number of sliding window steps
    __device__ static void run(const int * const in, int * const out,
                               const int sx, const int sy, const int steps) {
      // object with transform buffer in shared memory
      __shared__ FDWT53<WIN_SIZE_X, WIN_SIZE_Y> fdwt53;
      // Limits of this threadblock's block of pixels, extended by a margin
      // of 1 pixel.
      const int limitX = (blockIdx.x + 1) * WIN_SIZE_X + 1;
      const int limitY = (blockIdx.y + 1) * WIN_SIZE_Y * steps + 1;
      // Both conditions depend on blockIdx only, so all threads of the block
      // select the same variant.
      if(limitY >= sy) {
        // near bottom boundary => check both writing and reading
        fdwt53.transform<true, true>(in, out, sx, sy, steps);
        return;
      }
      if(limitX >= sx) {
        // near right boundary only => check writing only
        fdwt53.transform<false, true>(in, out, sx, sy, steps);
        return;
      }
      // no nearby boundary => check nothing
      fdwt53.transform<false, false>(in, out, sx, sy, steps);
    }
  }; // end of class FDWT53
  /// GPU kernel computing one level of 5/3 FDWT.
  /// Launched with WIN_SX threads per block; the transform object (including
  /// its buffer) is a static __shared__ variable of FDWT53::run.
  /// @tparam WIN_SX   width of sliding window to be used
  /// @tparam WIN_SY   height of sliding window to be used
  /// @param input     input image
  /// @param output    output buffer
  /// @param sizeX     width of the input image
  /// @param sizeY     height of the input image
  /// @param winSteps  number of sliding window steps
  template <int WIN_SX, int WIN_SY>
  __launch_bounds__(WIN_SX, CTMIN(SHM_SIZE/sizeof(FDWT53<WIN_SX, WIN_SY>), 8))
  __global__ void fdwt53Kernel(const int * const input, int * const output,
                               const int sizeX, const int sizeY,
                               const int winSteps) {
    // all the work is done by the class which wraps the shared buffer
    typedef FDWT53<WIN_SX, WIN_SY> Transform;
    Transform::run(input, output, sizeX, sizeY, winSteps);
  }
  /// Computes number of sliding window steps and number of threadblocks,
  /// launches the 5/3 FDWT kernel and checks that the launch was accepted.
  /// Does nothing (except for reporting it on stderr) if the image is empty.
  /// @tparam WIN_SX  width of sliding window
  /// @tparam WIN_SY  height of sliding window
  /// @param in       input image
  /// @param out      output buffer
  /// @param sx       width of the input image
  /// @param sy       height of the input image
  template <int WIN_SX, int WIN_SY>
  void launchFDWT53Kernel (int * in, int * out, int sx, int sy) {
    // An empty image would result in 0 sliding window steps and thus in
    // a zero divisor in the computation of the grid height below.
    if(sx <= 0 || sy <= 0) {
      fprintf(stderr, "launchFDWT53Kernel: invalid image size %d x %d\n",
              sx, sy);
      return;
    }
    // compute optimal number of steps of each sliding window
    const int steps = divRndUp(sy, 15 * WIN_SY);
    // grid size: one threadblock per WIN_SX columns and WIN_SY * steps rows
    const int gx = divRndUp(sx, WIN_SX);
    const int gy = divRndUp(sy, WIN_SY * steps);
    printf("\n sliding steps = %d , gx = %d , gy = %d \n", steps, gx, gy);
    dim3 gSize(gx, gy);
    // run kernel (one thread per column of the sliding window)
    fdwt53Kernel<WIN_SX, WIN_SY><<<gSize, WIN_SX>>>(in, out, sx, sy, steps);
    // A kernel launch does not return an error itself: invalid launch
    // configurations have to be queried explicitly.
    const cudaError_t launchStatus = cudaGetLastError();
    if(launchStatus != cudaSuccess) {
      fprintf(stderr, "FDWT 5/3 kernel launch failed: %s\n",
              cudaGetErrorString(launchStatus));
      return;
    }
    // NOTE: the launch is asynchronous - at this point the kernel has been
    // submitted, it need not have finished yet.
    printf("fdwt53Kernel in launchFDWT53Kernel has finished");
  }
  /// Forward 5/3 2D DWT. See common rules (above) for more details.
  /// Levels are processed one after another: each level transforms the
  /// current image and the next level then works on its LL band only.
  /// @param in      Expected to be normalized into range [-128, 127].
  ///                Will not be preserved (will be overwritten).
  /// @param out     output buffer on GPU
  /// @param sizeX   width of input image (in pixels)
  /// @param sizeY   height of input image (in pixels)
  /// @param levels  number of recursive DWT levels
  void fdwt53(int * in, int * out, int sizeX, int sizeY, int levels) {
    for(;;) {
      // select right width of kernel for the current size of the image
      if(sizeX < 480) {
        launchFDWT53Kernel<64, 8>(in, out, sizeX, sizeY);
      } else if(sizeX < 960) {
        launchFDWT53Kernel<128, 8>(in, out, sizeX, sizeY);
      } else {
        launchFDWT53Kernel<192, 8>(in, out, sizeX, sizeY);
      }
      // done if this was the last level
      if(levels <= 1) {
        return;
      }
      // The next level transforms the LL band, which has half the size
      // (rounded up) in both dimensions: copy it from the output back into
      // the input buffer and continue with it.
      sizeX = divRndUp(sizeX, 2);
      sizeY = divRndUp(sizeY, 2);
      memCopy(in, out, sizeX, sizeY);
      levels--;
    }
  }
 } // end of namespace dwt_cuda
--- a/examples/dwt2d/dwt_cuda/fdwt97.cu
+++ b/examples/dwt2d/dwt_cuda/fdwt97.cu
@ -1,383 +0,0 @@
 ///
 /// @file    fdwt97.cu
 /// @brief   CUDA implementation of forward 9/7 2D DWT.
 /// @author  Martin Jirman (207962@mail.muni.cz)
 /// @date    2011-01-20 13:18
 ///
 ///
 /// Copyright (c) 2011 Martin Jirman
 /// All rights reserved.
 ///
 /// Redistribution and use in source and binary forms, with or without
 /// modification, are permitted provided that the following conditions are met:
 ///
 ///     * Redistributions of source code must retain the above copyright
 ///       notice, this list of conditions and the following disclaimer.
 ///     * Redistributions in binary form must reproduce the above copyright
 ///       notice, this list of conditions and the following disclaimer in the
 ///       documentation and/or other materials provided with the distribution.
 ///
 /// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 /// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 /// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 /// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 /// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 /// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 /// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 /// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 /// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 /// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 /// POSSIBILITY OF SUCH DAMAGE.
 ///
 #include "common.h"
 #include "transform_buffer.h"
 #include "io.h"
 namespace dwt_cuda {
  /// Wraps a buffer and methods for computing 9/7 FDWT with sliding window
  /// of specified size. Template arguments specify this size.
  /// @tparam WIN_SIZE_X  width of sliding window
  /// @tparam WIN_SIZE_Y  height of sliding window
  template <int WIN_SIZE_X, int WIN_SIZE_Y>
  class FDWT97 {
  private:
    /// Type of shared memory buffer used for 9/7 DWT.
    /// It has WIN_SIZE_Y + 7 rows: in each sliding window step, every column
    /// holds 7 rows kept from the previous step followed by WIN_SIZE_Y newly
    /// loaded rows.
    /// NOTE(review): the last template argument (4) presumably is the number
    /// of boundary columns reserved at the side of the buffer - confirm
    /// against transform_buffer.h.
    typedef TransformBuffer<float, WIN_SIZE_X, WIN_SIZE_Y + 7, 4> FDWT97Buffer;
    /// Actual shared buffer used for forward 9/7 DWT. (The whole FDWT97
    /// object is declared __shared__ in run(), so the buffer is shared by all
    /// threads of the threadblock.)
    FDWT97Buffer buffer;
    /// Difference of indices of two vertically neighboring items in buffer.
    enum { STRIDE = FDWT97Buffer::VERTICAL_STRIDE };
    /// One thread's info about loading one column of the input image: the
    /// loader of the column and the position of the column in the shared
    /// buffer. Each thread has one instance for its main column and one for
    /// a boundary column (used by some threads only).
    /// @tparam CHECKED  true if loader should check for image boundaries
    template <bool CHECKED>
    struct FDWT97ColumnLoadingInfo {
      /// Loader of pixels from some input image.
      VerticalDWTPixelLoader<float, CHECKED> loader;
      /// Offset of column loaded by loader. (Offset in shared buffer.)
      int offset;
    };
    /// Horizontal 9/7 FDWT on specified lines of transform buffer.
    /// Runs two predict and two update steps followed by scaling. A barrier
    /// precedes and follows every step, thus the method must be called by
    /// all threads of the threadblock.
    /// @param lines      number of lines to be transformed
    /// @param firstLine  index of the first line to be transformed
    __device__ void horizontalFDWT97(const int lines, const int firstLine) {
      // wait until all threads have the lines ready in the buffer
      __syncthreads();
      // each step reads items modified by the previous one, possibly by
      // another thread, therefore the steps are separated by barriers
      buffer.forEachHorizontalOdd(firstLine, lines, AddScaledSum(f97Predict1));
      __syncthreads();
      buffer.forEachHorizontalEven(firstLine, lines, AddScaledSum(f97Update1));
      __syncthreads();
      buffer.forEachHorizontalOdd(firstLine, lines, AddScaledSum(f97Predict2));
      __syncthreads();
      buffer.forEachHorizontalEven(firstLine, lines, AddScaledSum(f97Update2));
      __syncthreads();
      // finally scale the transformed lines
      buffer.scaleHorizontal(scale97Div, scale97Mul, firstLine, lines);
      // make the results visible to all threads before returning
      __syncthreads();
    }
    /// Fills first 7 rows of one column of the shared transform buffer with
    /// input pixels (they are not transformed here) and sets up the loader
    /// of the column so that it continues with the following pixel.
    /// @tparam CHECKED     true if loader should check for image boundaries
    /// @param column       (uninitialized) object for loading input pixels
    /// @param columnIndex  index (not offset!) of the column to be loaded
    ///                     (relative to threadblock's first column)
    /// @param input        pointer to input image in GPU memory
    /// @param sizeX        width of the input image
    /// @param sizeY        height of the input image
    /// @param firstY       index of first row to be loaded from image
    template <bool CHECKED>
    __device__ void initColumn(FDWT97ColumnLoadingInfo<CHECKED> & column,
                              const int columnIndex, const float * const input,
                              const int sizeX, const int sizeY,
                              const int firstY) {
      // x-coordinate of the first pixel to be loaded by given loader
      const int firstX = blockIdx.x * WIN_SIZE_X + columnIndex;
      // offset of the column with index 'columnIndex' in the shared buffer
      column.offset = buffer.getColumnOffset(columnIndex);
      const int top = column.offset;
      if(blockIdx.y != 0) {
        // not the topmost block - regular loading: start 4 rows above firstY
        // and load 7 rows into the transform buffer in their natural order
        column.loader.init(sizeX, sizeY, firstX, firstY - 4);
        int row = 0;
        while(row < 7) {
          buffer[top + row * STRIDE] = column.loader.loadFrom(input);
          row++;
        }
        return;
      }
      // topmost block - apply mirroring rules when loading first 7 rows:
      // pixel #0 goes into row #4 and following pixels are mirrored around it
      column.loader.init(sizeX, sizeY, firstX, firstY);
      buffer[top + 4 * STRIDE] = column.loader.loadFrom(input);
      const float pixel1 = column.loader.loadFrom(input);
      buffer[top + 5 * STRIDE] = pixel1;
      buffer[top + 3 * STRIDE] = pixel1;
      const float pixel2 = column.loader.loadFrom(input);
      buffer[top + 6 * STRIDE] = pixel2;
      buffer[top + 2 * STRIDE] = pixel2;
      buffer[top + 1 * STRIDE] = column.loader.loadFrom(input);
      buffer[top + 0 * STRIDE] = column.loader.loadFrom(input);
      // restart the loader so that it continues with pixel #3
      column.loader.init(sizeX, sizeY, firstX, firstY + 3);
    }
    /// Reads next WIN_SIZE_Y pixels with given column's loader and stores
    /// them into rows #7 to #(WIN_SIZE_Y + 6) of the column in shared buffer.
    /// @tparam CHECKED  true if loader should check for image boundaries
    /// @param input     input image to load from
    /// @param column    loader and offset of loaded column in shared buffer
    template <bool CHECKED>
    inline __device__ void loadWindowIntoColumn(const float * const input,
                                  FDWT97ColumnLoadingInfo<CHECKED> & column) {
      // first 7 rows of the column are already occupied - skip them
      int row = 7;
      while(row < 7 + WIN_SIZE_Y) {
        const int index = column.offset + row * STRIDE;
        buffer[index] = column.loader.loadFrom(input);
        row++;
      }
    }
    /// Actual implementation of 9/7 FDWT, executed by a whole threadblock.
    /// Each thread loads the column with index threadIdx.x; threads 0 to 6
    /// additionally load one boundary column each (threads 0 to 2 the columns
    /// with indices WIN_SIZE_X to WIN_SIZE_X + 2, threads 3 to 6 the columns
    /// with indices -4 to -1). The method contains barriers (directly and in
    /// horizontalFDWT97), so it has to be called by all threads of the block.
    /// @tparam CHECK_LOADS   true if boundaries should be checked when loading
    /// @tparam CHECK_WRITES  true if boundaries should be checked when writing
    /// @param in        input image
    /// @param out       output buffer
    /// @param sizeX     width of the input image
    /// @param sizeY     height of the input image
    /// @param winSteps  number of steps of sliding window
    template <bool CHECK_LOADS, bool CHECK_WRITES>
    __device__ void transform(const float * const in, float * const out,
                              const int sizeX, const int sizeY,
                              const int winSteps) {
      // info about columns loaded by this thread: one main column and possibly
      // one boundary column. (Only some threads load some boundary column.)
      FDWT97ColumnLoadingInfo<CHECK_LOADS> loadedColumn;
      FDWT97ColumnLoadingInfo<CHECK_LOADS> boundaryColumn;
      // Initialize first 7 lines of transform buffer.
      // (firstY is the first image row transformed by this threadblock.)
      const int firstY = blockIdx.y * WIN_SIZE_Y * winSteps;
      initColumn(loadedColumn, threadIdx.x, in, sizeX, sizeY, firstY);
      // Some threads initialize boundary columns. (Others only set the fields
      // to some defined values.)
      boundaryColumn.offset = 0;
      boundaryColumn.loader.clear();
      if(threadIdx.x < 7) {
        // each thread among first 7 ones gets index of one of boundary columns
        const int colId = threadIdx.x + ((threadIdx.x < 3) ? WIN_SIZE_X : -7);
        // Thread initializes offset of the boundary column (in shared buffer),
        // first 7 pixels of the column and a loader for this column.
        initColumn(boundaryColumn, colId, in, sizeX, sizeY, firstY);
      }
      // horizontally transform first 7 rows in all columns
      horizontalFDWT97(7, 0);
      // Index of column handled by this thread. (First half of threads handle
      // even columns and others handle odd columns.)
      const int outColumnIndex = parityIdx<WIN_SIZE_X>();
      // writer of output linear bands - initialize it
      const int firstX = blockIdx.x * WIN_SIZE_X + outColumnIndex;
      VerticalDWTBandWriter<float, CHECK_WRITES> writer;
      writer.init(sizeX, sizeY, firstX, firstY);
      // transform buffer offset of column transformed and saved by this thread
      const int outColumnOffset = buffer.getColumnOffset(outColumnIndex);
      // (Each iteration of this loop assumes that first 7 rows of transform
      // buffer are already loaded with horizontally transformed coefficients.)
      for(int w = 0; w < winSteps; w++) {
        // Load another WIN_SIZE_Y lines of thread's column into the buffer.
        loadWindowIntoColumn(in, loadedColumn);
        // some threads also load boundary columns
        if(threadIdx.x < 7) {
          loadWindowIntoColumn(in, boundaryColumn);
        }
        // horizontally transform all newly loaded lines (the call begins and
        // ends with a barrier, so all columns are complete afterwards)
        horizontalFDWT97(WIN_SIZE_Y, 7);
        // Using 7 registers, remember current values of last 7 rows of
        // transform buffer. These rows are transformed horizontally only
        // and will be used in next iteration.
        float last7Lines[7];
        for(int i = 0; i < 7; i++) {
          last7Lines[i] = buffer[outColumnOffset + (WIN_SIZE_Y + i) * STRIDE];
        }
        // vertically transform all central columns (do not scale yet);
        // each thread works on its own output column only, so no barriers
        // are placed between the steps
        buffer.forEachVerticalOdd(outColumnOffset, AddScaledSum(f97Predict1));
        buffer.forEachVerticalEven(outColumnOffset, AddScaledSum(f97Update1));
        buffer.forEachVerticalOdd(outColumnOffset, AddScaledSum(f97Predict2));
        buffer.forEachVerticalEven(outColumnOffset, AddScaledSum(f97Update2));
        // Save all results of current window. Results are in transform buffer
        // at rows from #4 to #(4 + WIN_SIZE_Y). Other rows are invalid now.
        // (They only served as a boundary for vertical FDWT.)
        // Vertical scaling is applied here, while writing.
        for(int i = 4; i < (4 + WIN_SIZE_Y); i += 2) {
          const int index = outColumnOffset + i * STRIDE;
          // Write low coefficients from column into low band ...
          writer.writeLowInto(out, buffer[index] * scale97Div);
          // ... and high coefficients into the high band.
          writer.writeHighInto(out, buffer[index + STRIDE] * scale97Mul);
        }
        // Use last 7 remembered lines as first 7 lines for next iteration.
        // As expected, these lines are already horizontally transformed.
        for(int i = 0; i < 7; i++) {
          buffer[outColumnOffset + i * STRIDE] = last7Lines[i];
        }
        // Wait for all writing threads before proceeding to loading new
        // pixels in next iteration. (Not to overwrite those which
        // are not written yet.)
        __syncthreads();
      }
    }
  public:
    /// Checks how close this threadblock's pixels are to the right and to
    /// the bottom image boundary and runs the matching variant of 9/7 FDWT.
    /// Variants without boundary checks are slightly faster and are used
    /// wherever no boundary is nearby.
    /// @param input   input image
    /// @param output  output buffer
    /// @param sx      width of the input image
    /// @param sy      height of the input image
    /// @param steps   number of steps of sliding window
    __device__ static void run(const float * const input, float * const output,
                               const int sx, const int sy, const int steps) {
      // object with transform buffer in shared memory
      __shared__ FDWT97<WIN_SIZE_X, WIN_SIZE_Y> fdwt97;
      // Limits of this threadblock's block of pixels, extended by 3 pixels.
      // (3 is for radius of impulse response of 9/7 FDWT.)
      const int limitX = (blockIdx.x + 1) * WIN_SIZE_X + 3;
      const int limitY = (blockIdx.y + 1) * WIN_SIZE_Y * steps + 3;
      // Both conditions depend on blockIdx only, so all threads of the block
      // select the same variant.
      if(limitY >= sy) {
        // near bottom boundary => check both writing and reading
        fdwt97.transform<true, true>(input, output, sx, sy, steps);
        return;
      }
      if(limitX >= sx) {
        // near right boundary only => check writing only
        fdwt97.transform<false, true>(input, output, sx, sy, steps);
        return;
      }
      // no nearby boundary => check nothing
      fdwt97.transform<false, false>(input, output, sx, sy, steps);
    }
  }; // end of class FDWT97
  /// Main GPU 9/7 FDWT entry point: kernel computing one level of 9/7 FDWT.
  /// Launched with WIN_SX threads per block; the transform object (including
  /// its buffer) is a static __shared__ variable of FDWT97::run.
  /// @tparam WIN_SX  width of sliding window
  /// @tparam WIN_SY  height of sliding window
  /// @param input   input image
  /// @param output  output buffer
  /// @param sx      width of the input image
  /// @param sy      height of the input image
  /// @param steps   number of steps of sliding window
  template <int WIN_SX, int WIN_SY>
  __launch_bounds__(WIN_SX, CTMIN(SHM_SIZE/sizeof(FDWT97<WIN_SX, WIN_SY>), 8))
  __global__ void fdwt97Kernel(const float * const input, float * const output,
                               const int sx, const int sy, const int steps) {
    // Excuse me, dear reader of this code - this call has to be here. If you
    // try to simply put contents of the following method right here, CUDA
    // compiler (version 3.2) will spit tons of nonsense messy errors ...
    // Hope they will not break it even more in future releases.
    FDWT97<WIN_SX, WIN_SY>::run(input, output, sx, sy, steps);
  }
  /// Computes number of sliding window steps and number of threadblocks,
  /// launches the 9/7 FDWT kernel and checks the call.
  /// Does nothing (except for reporting it on stderr) if the image is empty.
  /// @tparam WIN_SX  width of sliding window
  /// @tparam WIN_SY  height of sliding window
  /// @param in       input image
  /// @param out      output buffer
  /// @param sx       width of the input image
  /// @param sy       height of the input image
  template <int WIN_SX, int WIN_SY>
  void launchFDWT97Kernel (float * in, float * out, int sx, int sy) {
    // An empty image would result in 0 sliding window steps and thus in
    // a zero divisor in the computation of the grid height below.
    if(sx <= 0 || sy <= 0) {
      fprintf(stderr, "launchFDWT97Kernel: invalid image size %d x %d\n",
              sx, sy);
      return;
    }
    // compute optimal number of steps of each sliding window
    const int steps = divRndUp(sy, 15 * WIN_SY);
    // prepare grid size: one threadblock per WIN_SX columns and
    // WIN_SY * steps rows
    dim3 gSize(divRndUp(sx, WIN_SX), divRndUp(sy, WIN_SY * steps));
    // (members of dim3 are unsigned, hence %u)
    printf("\n globalx=%u, globaly=%u, blocksize=%d\n", gSize.x, gSize.y, WIN_SX);
    // run kernel, possibly measure time and finally check the call
    PERF_BEGIN
    fdwt97Kernel<WIN_SX, WIN_SY><<<gSize, WIN_SX>>>(in, out, sx, sy, steps);
    PERF_END("        FDWT97", sx, sy)
    CudaDWTTester::checkLastKernelCall("FDWT 9/7 kernel");
  }
  /// Forward 9/7 2D DWT. See common rules (dwt.h) for more details.
  /// Levels are processed one after another: each level transforms the
  /// current image and the next level then works on its LL band only.
  /// @param in      Input image. Should be normalized (in range
  ///                [-0.5, 0.5]). Will not be preserved (will be overwritten).
  /// @param out     output buffer on GPU - format specified in common rules
  /// @param sizeX   width of input image (in pixels)
  /// @param sizeY   height of input image (in pixels)
  /// @param levels  number of recursive DWT levels
  void fdwt97(float * in, float * out, int sizeX, int sizeY, int levels) {
    for(;;) {
      // select right size of kernel for the current size of the image
      if(sizeX < 480) {
        launchFDWT97Kernel<64, 6>(in, out, sizeX, sizeY);
      } else if(sizeX < 960) {
        launchFDWT97Kernel<128, 6>(in, out, sizeX, sizeY);
      } else {
        launchFDWT97Kernel<192, 8>(in, out, sizeX, sizeY);
      }
      // done if this was the last level
      if(levels <= 1) {
        return;
      }
      // The next level transforms the LL band, which has half the size
      // (rounded up) in both dimensions: copy it from the output back into
      // the input buffer and continue with it.
      sizeX = divRndUp(sizeX, 2);
      sizeY = divRndUp(sizeY, 2);
      memCopy(in, out, sizeX, sizeY);
      levels--;
    }
  }
 } // end of namespace dwt_cuda
--- a/examples/dwt2d/dwt_cuda/io.h
+++ b/examples/dwt2d/dwt_cuda/io.h
@ -1,440 +0,0 @@
 ///
 /// @file:   io.h
 /// @brief   Manages loading and saving linearly stored bands and input images.
 /// @author  Martin Jirman (207962@mail.muni.cz)
 /// @date    2011-01-20 22:38
 ///
 ///
 /// Copyright (c) 2011 Martin Jirman
 /// All rights reserved.
 ///
 /// Redistribution and use in source and binary forms, with or without
 /// modification, are permitted provided that the following conditions are met:
 ///
 ///     * Redistributions of source code must retain the above copyright
 ///       notice, this list of conditions and the following disclaimer.
 ///     * Redistributions in binary form must reproduce the above copyright
 ///       notice, this list of conditions and the following disclaimer in the
 ///       documentation and/or other materials provided with the distribution.
 ///
 /// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 /// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 /// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 /// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 /// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 /// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 /// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 /// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 /// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 /// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 /// POSSIBILITY OF SUCH DAMAGE.
 ///
 #ifndef IO_H
 #define IO_H
 #include "common.h"
 namespace dwt_cuda {
 /// Common base of all DWT input/output helpers - provides index mirroring.
 class DWTIO {
 protected:
  /// Reflects an out-of-range coordinate back towards the image the way the
  /// DWT expects at image edges. Only a single reflection is applied.
  /// @param d      a position in the image (overwritten with the mirrored one)
  /// @param sizeD  size of the image along the dimension of 'd'
  __device__ static void mirror(int &d, const int &sizeD) {
    // TODO: enable multiple mirroring (reference implementation kept below):
    //      if(sizeD > 1) {
    //        if(d < 0) {
    //          const int underflow = -1 - d;
    //          const int phase = (underflow / (sizeD - 1)) & 1;
    //          const int remainder = underflow % (sizeD - 1);
    //          if(phase == 0) {
    //            d = remainder + 1;
    //          } else {
    //            d = sizeD - 2 - remainder;
    //          }
    //        } else if(d >= sizeD) {
    //          const int overflow = d - sizeD;
    //          const int phase = (overflow / (sizeD - 1)) & 1;
    //          const int remainder = overflow % (sizeD - 1);
    //          if(phase == 0) {
    //            d = sizeD - 2 - remainder;
    //          } else {
    //            d = remainder + 1;
    //          }
    //        }
    //      } else {
    //        d = 0;
    //      }
    // Single-reflection variant currently in use. The result may still lie
    // outside [0, sizeD) if 'd' is too far from the image.
    d = (d >= sizeD) ? (2 * sizeD - 2 - d) // past the end: reflect at sizeD - 1
                     : ((d < 0) ? -d : d); // before the start: reflect at 0
  }
 };
 /// Shared state of the pixel loader and writer: for one image column it
 /// keeps the increment between vertically neighboring pixels and the index
 /// which marks the end of the column.
 /// @tparam T        type of image pixels
 /// @tparam CHECKED  true = be prepared to image boundary, false = don't care
 template <typename T, bool CHECKED> class VerticalDWTPixelIO : protected DWTIO {
 protected:
  int end;    ///< index just below the last pixel of the column (0 if !CHECKED)
  int stride; ///< index increment leading to the next pixel of the column
  /// Sets up 'end' and 'stride' for the column which contains the pixel
  /// [firstX, firstY] of an image stored row by row.
  /// @param sizeX   width of the image
  /// @param sizeY   height of the image
  /// @param firstX  x-coordinate of first pixel to use
  /// @param firstY  y-coordinate of first pixel to use
  /// @return index of pixel at position [firstX, firstY] in the image
  __device__ int initialize(const int sizeX, const int sizeY, int firstX,
                            int firstY) {
    // moving one row down means skipping one whole image row
    stride = sizeX;
    // the end marker is only maintained by boundary-checking variants
    if (CHECKED) {
      end = sizeY * sizeX + firstX;
    } else {
      end = 0;
    }
    return firstX + sizeX * firstY;
  }
 };
 /// Writes reverse transformed pixels of one column directly into the output
 /// image, one pixel per call, from top to bottom.
 /// @tparam T        type of output pixels
 /// @tparam CHECKED  true = be prepared to image boundary, false = don't care
 template <typename T, bool CHECKED>
 class VerticalDWTPixelWriter : VerticalDWTPixelIO<T, CHECKED> {
 private:
  int next; ///< index of the next pixel to be written
 public:
  /// Initializes writer - sets the position of the first written pixel.
  /// A column right of the image (firstX >= sizeX) gets all fields zeroed, so
  /// a CHECKED writer never writes anything for it.
  /// @param sizeX   width of the image
  /// @param sizeY   height of the image
  /// @param firstX  x-coordinate of first pixel to write into
  /// @param firstY  y-coordinate of first pixel to write into
  __device__ void init(const int sizeX, const int sizeY, int firstX,
                       int firstY) {
    if (firstX >= sizeX) {
      // column outside of the image: make 'next' equal to 'end'
      this->end = 0;
      this->stride = 0;
      next = 0;
      return;
    }
    next = this->initialize(sizeX, sizeY, firstX, firstY);
  }
  /// Stores given value at the current position and moves one row down.
  /// A CHECKED writer which has reached the end of its column does nothing.
  /// @param output  output image to write pixel into
  /// @param value   value of the pixel to be written
  __device__ void writeInto(T *const output, const T &value) {
    if (CHECKED && (next == this->end)) {
      return; // below the last row of the image
    }
    output[next] = value;
    next += this->stride;
  }
 };
 /// Loads pixels of one image column, one pixel per call, from top to bottom.
 /// A CHECKED loader turns around when it reaches the end of the column.
 /// @tparam T        type of image input pixels
 /// @tparam CHECKED  true = be prepared to image boundary, false = don't care
 template <typename T, bool CHECKED>
 class VerticalDWTPixelLoader : protected VerticalDWTPixelIO<const T, CHECKED> {
 private:
  int last; ///< index of last loaded pixel
 public:
  // ----- accessors marked "FOR TEST" in the original code -----
  __device__ int getlast() { return last; }
  __device__ int getend() { return this->end; }
  __device__ int getstride() { return this->stride; }
  __device__ void setend(int a) { this->end = a; }
  // -------------------------------------------------------------
  /// Initializes loader - sets input size and a position of first pixel.
  /// The x-coordinate is mirrored into the image first.
  /// @param sizeX   width of the image
  /// @param sizeY   height of the image
  /// @param firstX  x-coordinate of first pixel to load
  /// @param firstY  y-coordinate of first pixel to load
  __device__ void init(const int sizeX, const int sizeY, int firstX,
                       int firstY) {
    // correctly mirror x coordinate
    this->mirror(firstX, sizeX);
    const int firstIndex = this->initialize(sizeX, sizeY, firstX, firstY);
    // 'last' refers to an already loaded pixel, so start one row (= sizeX
    // items) above the first pixel which is going to be loaded
    last = firstIndex - sizeX;
  }
  /// Sets all fields to zeros, for compiler not to complain about
  /// uninitialized stuff.
  __device__ void clear() {
    this->last = 0;
    this->stride = 0;
    this->end = 0;
  }
  /// Advances the internal index by one row and returns the pixel found there.
  /// Returns 0 instead of reading if the resulting index is negative.
  /// @param input  input image to load next pixel from
  /// @return next pixel from given image
  __device__ T loadFrom(const T *const input) {
    int index = last + this->stride;
    if (CHECKED && (index == this->end)) {
      // stepped just below the image: go back to the mirrored row and keep
      // walking in the opposite direction from now on
      index -= 2 * this->stride;
      this->stride = -this->stride;
    }
    last = index;
    // TODO: restrict this guard to the CHECKED variant later
    if (index < 0) {
      return 0;
    }
    return input[index];
  }
 };
 /// Common base of the band loader and writer. Translates image coordinates
 /// into indices of a buffer with linearly stored bands and keeps the strides
 /// needed to walk down one column of coefficients.
 /// @tparam T        type of band coefficients
 /// @tparam CHECKED  true = be prepared to image boundary, false = don't care
 template <typename T, bool CHECKED> class VerticalDWTBandIO : protected DWTIO {
 protected:
  /// index of bottom neighbor of last pixel of loaded column (0 if !CHECKED)
  int end;
  /// increment of index to get from highpass band to the lowpass one
  int strideHighToLow;
  /// increment of index to get from the lowpass band to the highpass one
  int strideLowToHigh;
  /// Initializes IO - sets both strides and the end index and computes the
  /// index of the first item.
  /// @param imageSizeX   width of the image
  /// @param imageSizeY   height of the image
  /// @param firstX       x-coordinate of first pixel to use
  ///                     (odd = one of the right bands, even = left bands)
  /// @param firstY       y-coordinate of first pixel to use
  ///                     (odd rows are offset by strideLowToHigh)
  /// @return index of first item specified by firstX and firstY
  __device__ int initialize(const int imageSizeX, const int imageSizeY,
                            int firstX, int firstY) {
    const bool inRightBand = (firstX & 1) != 0;
    const int halfSizeX = imageSizeX / 2;
    // difference between indices of two vertically neighboring items of the
    // same band (left bands get the extra column of odd-width images)
    const int verticalStride =
        inRightBand ? halfSizeX : (halfSizeX + (imageSizeX & 1));
    // index of the topmost item of the column with index firstX
    int columnOffset = firstX / 2;
    if (inRightBand) {
      // right bands begin after all items of the top-left band
      columnOffset += divRndUp(imageSizeX, 2) * divRndUp(imageSizeY, 2);
      strideLowToHigh = (imageSizeX * imageSizeY) / 2;
    } else {
      strideLowToHigh = divRndUp(imageSizeY, 2) * imageSizeX;
    }
    // the other stride follows from the two values above
    strideHighToLow = verticalStride - strideLowToHigh;
    // index of the coefficient which indicates the end of the image
    end = CHECKED ? (columnOffset                           // right column
                     + (imageSizeY / 2) * verticalStride    // right row
                     + (imageSizeY & 1) * strideLowToHigh)  // maybe high band
                  : 0;
    // finally, the index of the first item
    const int rowOffset = (firstY / 2) * verticalStride;
    const int bandOffset = (firstY & 1) * strideLowToHigh;
    return columnOffset + rowOffset + bandOffset;
  }
 };
 /// Loads coefficients of one column directly from the linearly stored
 /// transformed bands, alternating between lowpass and highpass band.
 /// @tparam T        type of input band coefficients
 /// @tparam CHECKED  true = be prepared to image boundary, false = don't care
 template <typename T, bool CHECKED>
 class VerticalDWTBandLoader : public VerticalDWTBandIO<const T, CHECKED> {
 private:
  int last; ///< index of last loaded pixel
  /// Moves the internal index by given stride, possibly reverses direction of
  /// the loader (mirroring at the bottom of the image) and loads the item.
  /// Returns 0 instead of reading if the resulting index is negative.
  /// @param input  input image to load next coefficient from
  /// @param step   stride to use now (one of two loader's strides)
  /// @return loaded coefficient
  __device__ T updateAndLoad(const T *const input, const int step) {
    last += step;
    if (CHECKED && (last == this->end)) {
      const int lowToHigh = this->strideLowToHigh;
      const int highToLow = this->strideHighToLow;
      // undo last two updates of index (to get to previous mirrored item)
      last -= (lowToHigh + highToLow);
      // swap and negate both strides: the column is walked upwards from now
      this->strideLowToHigh = -highToLow;
      this->strideHighToLow = -lowToHigh;
    }
    // TODO: restrict this guard to the CHECKED variant later
    if (last < 0) {
      return 0;
    }
    return input[last];
  }
 public:
  /// Initializes loader - sets input size and a position of first pixel.
  /// The x-coordinate is mirrored into the image first.
  /// @param imageSizeX   width of the image
  /// @param imageSizeY   height of the image
  /// @param firstX       x-coordinate of first pixel to load
  ///                     (odd = one of the right bands, even = left bands)
  /// @param firstY       y-coordinate of first pixel to load
  ///                     (odd = highpass band is loaded first)
  __device__ void init(const int imageSizeX, const int imageSizeY, int firstX,
                       const int firstY) {
    this->mirror(firstX, imageSizeX);
    const int first = this->initialize(imageSizeX, imageSizeY, firstX, firstY);
    // step back by the stride which the very first load is going to add, so
    // that 'last' points to the item preceding the first one
    const int firstStep =
        (firstY & 1) ? this->strideLowToHigh : this->strideHighToLow;
    last = first - firstStep;
  }
  /// Sets all fields to zeros, for compiler not to complain about
  /// uninitialized stuff.
  __device__ void clear() {
    this->last = 0;
    this->strideLowToHigh = 0;
    this->strideHighToLow = 0;
    this->end = 0;
  }
  /// Gets another coefficient from lowpass band and advances internal index.
  /// Call this method first if position of first pixel passed to init
  /// was in low band (even firstY).
  /// @param input   input image to load next coefficient from
  /// @return next coefficient from the lowpass band of the given image
  __device__ T loadLowFrom(const T *const input) {
    return updateAndLoad(input, this->strideHighToLow);
  }
  /// Gets another coefficient from the highpass band and advances index.
  /// Call this method first if position of first pixel passed to init
  /// was in high band (odd firstY).
  /// @param input   input image to load next coefficient from
  /// @return next coefficient from the highpass band of the given image
  __device__ T loadHighFrom(const T *const input) {
    return updateAndLoad(input, this->strideLowToHigh);
  }
 };
 /// Saves coefficients of one column directly into the linearly stored
 /// transformed bands, alternating between lowpass and highpass band.
 /// @tparam T        type of output band coefficients
 /// @tparam CHECKED  true = be prepared to image boundary, false = don't care
 template <typename T, bool CHECKED>
 class VerticalDWTBandWriter : public VerticalDWTBandIO<T, CHECKED> {
 private:
  int next; ///< index of the next coefficient to be written
  /// Stores the item at the current position and advances the index, unless
  /// a CHECKED writer has already reached the end of its column.
  /// @param output  output buffer
  /// @param item    item to put into the output
  /// @param stride  increment of the index to get to next output position
  /// @return index of the next output position (after the update, if any)
  __device__ int saveAndUpdate(T *const output, const T &item,
                               const int &stride) {
    const bool finished = CHECKED && (next == this->end);
    if (!finished) {
      output[next] = item;
      next += stride;
    }
    return next;
  }
 public:
  /// Initializes writer - sets output size and a position of first pixel.
  /// A column right of the image (firstX >= imageSizeX) gets all fields
  /// zeroed, so a CHECKED writer never writes anything for it.
  /// @param imageSizeX   width of the image
  /// @param imageSizeY   height of the image
  /// @param firstX       x-coordinate of first pixel to write
  ///                     (odd = one of the right bands, even = left bands)
  /// @param firstY       y-coordinate of first pixel to write
  ///                     (odd rows are offset by strideLowToHigh)
  __device__ void init(const int imageSizeX, const int imageSizeY,
                       const int firstX, const int firstY) {
    if (firstX >= imageSizeX) {
      clear();
      return;
    }
    next = this->initialize(imageSizeX, imageSizeY, firstX, firstY);
  }
  /// Sets all fields to zeros, for compiler not to complain about
  /// uninitialized stuff.
  __device__ void clear() {
    this->next = 0;
    this->strideLowToHigh = 0;
    this->strideHighToLow = 0;
    this->end = 0;
  }
  /// Writes a lowpass coefficient at the current position and moves on to the
  /// position of the following highpass coefficient.
  /// Call this method first if position of first pixel passed to init
  /// was in lowpass band.
  /// @param output   output image
  /// @param primary  lowpass coefficient to save into the lowpass band
  /// @return index of the next output position
  __device__ int writeLowInto(T *const output, const T &primary) {
    return saveAndUpdate(output, primary, this->strideLowToHigh);
  }
  /// Writes a highpass coefficient at the current position and moves on to
  /// the position of the following lowpass coefficient.
  /// Call this method first if position of first pixel passed to init
  /// was in highpass band.
  /// @param output  output image
  /// @param other   highpass coefficient to save into the highpass band
  /// @return index of the next output position
  __device__ int writeHighInto(T *const output, const T &other) {
    return saveAndUpdate(output, other, this->strideHighToLow);
  }
  // ----- read-only accessors to the internal state of the writer -----
  __device__ int getnext() { return next; }
  __device__ int getend() { return this->end; }
  __device__ int getstrideHighToLow() { return this->strideHighToLow; }
  __device__ int getstrideLowToHigh() { return this->strideLowToHigh; }
  // --------------------------------------------------------------------
 };
 } // namespace dwt_cuda
 #endif // IO_H
--- a/examples/dwt2d/dwt_cuda/rdwt53.cu
+++ b/examples/dwt2d/dwt_cuda/rdwt53.cu
@ -1,360 +0,0 @@
 ///
 /// @file    rdwt53.cu
 /// @brief   CUDA implementation of reverse 5/3 2D DWT.
 /// @author  Martin Jirman (207962@mail.muni.cz)
 /// @date    2011-02-04 14:19
 ///
 ///
 /// Copyright (c) 2011 Martin Jirman
 /// All rights reserved.
 ///
 /// Redistribution and use in source and binary forms, with or without
 /// modification, are permitted provided that the following conditions are met:
 ///
 ///     * Redistributions of source code must retain the above copyright
 ///       notice, this list of conditions and the following disclaimer.
 ///     * Redistributions in binary form must reproduce the above copyright
 ///       notice, this list of conditions and the following disclaimer in the
 ///       documentation and/or other materials provided with the distribution.
 ///
 /// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 /// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 /// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 /// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 /// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 /// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 /// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 /// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 /// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 /// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 /// POSSIBILITY OF SUCH DAMAGE.
 ///
 #include "common.h"
 #include "transform_buffer.h"
 #include "io.h"
 namespace dwt_cuda {
  /// Wraps shared memory buffer and algorithms needed for computing 5/3 RDWT
  /// using sliding window and lifting schema.
  /// @tparam WIN_SIZE_X  width of sliding window
  /// @tparam WIN_SIZE_Y  height of sliding window
  template <int WIN_SIZE_X, int WIN_SIZE_Y>
  class RDWT53 {
  private:
    /// Shared memory buffer used for 5/3 DWT transforms.
    /// (WIN_SIZE_Y rows of the window plus 3 rows kept between iterations.)
    typedef TransformBuffer<int, WIN_SIZE_X, WIN_SIZE_Y + 3, 2> RDWT53Buffer;
    /// Shared buffer used for reverse 5/3 DWT.
    RDWT53Buffer buffer;
    /// Difference between indices of two vertically neighboring items in buffer.
    enum { STRIDE = RDWT53Buffer::VERTICAL_STRIDE };
    /// Info needed for loading of one input column from input image.
    /// @tparam CHECKED  true if loader should check boundaries
    template <bool CHECKED>
    struct RDWT53Column {
      /// loader of pixels from column in input image
      VerticalDWTBandLoader<int, CHECKED> loader;
      /// Offset of corresponding column in shared buffer.
      int offset;
      /// Sets all fields to some values to avoid 'uninitialized' warnings.
      __device__ void clear() {
        offset = 0;
        loader.clear();
      }
    };
    /// 5/3 DWT reverse update operation.
    /// Adjusts coefficient 'c' using its previous ('p') and next ('n')
    /// neighbors.
    struct Reverse53Update {
      __device__ void operator() (const int p, int & c, const int n) const {
        c -= (p + n + 2) / 4;  // F.3, page 118, ITU-T Rec. T.800 final draft
      }
    };
    /// 5/3 DWT reverse predict operation.
    /// Adjusts coefficient 'c' using its previous ('p') and next ('n')
    /// neighbors.
    struct Reverse53Predict {
      __device__ void operator() (const int p, int & c, const int n) const {
        c += (p + n) / 2;      // F.4, page 118, ITU-T Rec. T.800 final draft
      }
    };
    /// Horizontal 5/3 RDWT on specified lines of transform buffer.
    /// Contains block-wide barriers, so it must be called by all threads
    /// of the threadblock.
    /// @param lines      number of lines to be transformed
    /// @param firstLine  index of the first line to be transformed
    __device__ void horizontalTransform(const int lines, const int firstLine) {
      // wait until all threads have stored their items into the buffer
      __syncthreads();
      buffer.forEachHorizontalEven(firstLine, lines, Reverse53Update());
      // the predict step reads items modified by the update step
      __syncthreads();
      buffer.forEachHorizontalOdd(firstLine, lines, Reverse53Predict());
      // make the transformed lines visible to all threads
      __syncthreads();
    }
    /// Using given loader, it loads another WIN_SIZE_Y coefficients
    /// into specified column (rows #3 to #(WIN_SIZE_Y + 2) of the buffer).
    /// Coefficients are loaded in low/high pairs, so WIN_SIZE_Y is expected
    /// to be even.
    /// @tparam CHECKED  true if loader should check image boundaries
    /// @param input     input coefficients to load from
    /// @param col       info about loaded column
    template <bool CHECKED>
    inline __device__ void loadWindowIntoColumn(const int * const input,
                                                RDWT53Column<CHECKED> & col) {
      for(int i = 3; i < (3 + WIN_SIZE_Y); i += 2) {
        buffer[col.offset + i * STRIDE] = col.loader.loadLowFrom(input);
        buffer[col.offset + (i + 1) * STRIDE] = col.loader.loadHighFrom(input);
      }
    }
    /// Initializes first 3 rows of one column of shared transform buffer
    /// with input coefficients. Also initializes loader of given column.
    /// @tparam CHECKED  true if loader should check image boundaries
    /// @param columnX   x coordinate of column in shared transform buffer
    /// @param input     input image
    /// @param sizeX     width of the input image
    /// @param sizeY     height of the input image
    /// @param column    (uninitialized) info about loaded column
    /// @param firstY    index of first row transformed by this threadblock
    template <bool CHECKED>
    __device__ void initColumn(const int columnX, const int * const input,
                               const int sizeX, const int sizeY,
                               RDWT53Column<CHECKED> & column,
                               const int firstY) {
      // coordinates of the first coefficient to be loaded
      const int firstX = blockIdx.x * WIN_SIZE_X + columnX;
      // offset of the column with index 'columnX' in the transform buffer
      column.offset = buffer.getColumnOffset(columnX);
      if(blockIdx.y == 0) {
        // topmost block - apply mirroring rules when loading first 3 rows
        column.loader.init(sizeX, sizeY, firstX, firstY);
        // load pixels in mirrored way: the same highpass coefficient is
        // stored both above (row #0) and below (row #2) the lowpass one
        buffer[column.offset + 1 * STRIDE] = column.loader.loadLowFrom(input);
        buffer[column.offset + 0 * STRIDE] =
        buffer[column.offset + 2 * STRIDE] = column.loader.loadHighFrom(input);
      } else {
        // non-topmost row - regular loading (begins one row above firstY):
        column.loader.init(sizeX, sizeY, firstX, firstY - 1);
        buffer[column.offset + 0 * STRIDE] = column.loader.loadHighFrom(input);
        buffer[column.offset + 1 * STRIDE] = column.loader.loadLowFrom(input);
        buffer[column.offset + 2 * STRIDE] = column.loader.loadHighFrom(input);
      }
      // Now, the next coefficient, which will be loaded by loader, is #2
      // (counted from firstY).
    }
    /// Actual GPU 5/3 RDWT implementation.
    /// Contains block-wide barriers, so it must be called by all threads
    /// of the threadblock.
    /// @tparam CHECKED_LOADS   true if boundaries must be checked when reading
    /// @tparam CHECKED_WRITES  true if boundaries must be checked when writing
    /// @param in        input image (5/3 transformed coefficients)
    /// @param out       output buffer (for reverse transformed image)
    /// @param sizeX     width of the output image
    /// @param sizeY     height of the output image
    /// @param winSteps  number of sliding window steps
    template<bool CHECKED_LOADS, bool CHECKED_WRITES>
    __device__ void transform(const int * const in, int * const out,
                              const int sizeX, const int sizeY,
                              const int winSteps) {
      // info about one main and one boundary column
      RDWT53Column<CHECKED_LOADS> column, boundaryColumn;
      // index of first row to be transformed
      const int firstY = blockIdx.y * WIN_SIZE_Y * winSteps;
      // some threads initialize boundary columns
      boundaryColumn.clear();
      if(threadIdx.x < 3) {
        // First 3 threads also handle boundary columns. Thread #0 gets right
        // column #0, thread #1 get right column #1 and thread #2 left column.
        const int colId = threadIdx.x + ((threadIdx.x != 2) ? WIN_SIZE_X : -3);
        // Thread initializes offset of the boundary column (in shared
        // buffer), first 3 pixels of the column and a loader for this column.
        initColumn(colId, in, sizeX, sizeY, boundaryColumn, firstY);
      }
      // All threads initialize central columns.
      initColumn(parityIdx<WIN_SIZE_X>(), in, sizeX, sizeY, column, firstY);
      // horizontally transform first 3 rows
      horizontalTransform(3, 0);
      // writer of output pixels - initialize it
      const int outX = blockIdx.x * WIN_SIZE_X + threadIdx.x;
      VerticalDWTPixelWriter<int, CHECKED_WRITES> writer;
      writer.init(sizeX, sizeY, outX, firstY);
      // offset of column (in transform buffer) saved by this thread
      const int outputColumnOffset = buffer.getColumnOffset(threadIdx.x);
      // (Each iteration assumes that first 3 rows of transform buffer are
      // already loaded with horizontally transformed pixels.)
      for(int w = 0; w < winSteps; w++) {
        // Load another WIN_SIZE_Y lines of this thread's column
        // into the transform buffer.
        loadWindowIntoColumn(in, column);
        // possibly load boundary columns
        if(threadIdx.x < 3) {
          loadWindowIntoColumn(in, boundaryColumn);
        }
        // horizontally transform all newly loaded lines
        horizontalTransform(WIN_SIZE_Y, 3);
        // Using 3 registers, remember current values of last 3 rows
        // of transform buffer. These rows are transformed horizontally
        // only and will be used in next iteration.
        int last3Lines[3];
        last3Lines[0] = buffer[outputColumnOffset + (WIN_SIZE_Y + 0) * STRIDE];
        last3Lines[1] = buffer[outputColumnOffset + (WIN_SIZE_Y + 1) * STRIDE];
        last3Lines[2] = buffer[outputColumnOffset + (WIN_SIZE_Y + 2) * STRIDE];
        // vertically transform all central columns
        // (each thread only touches its own column here)
        buffer.forEachVerticalOdd(outputColumnOffset, Reverse53Update());
        buffer.forEachVerticalEven(outputColumnOffset, Reverse53Predict());
        // Save all results of current window. Results are in transform buffer
        // at rows from #1 to #(WIN_SIZE_Y). Other rows are invalid now.
        // (They only served as a boundary for vertical RDWT.)
        for(int i = 1; i < (1 + WIN_SIZE_Y); i++) {
          writer.writeInto(out, buffer[outputColumnOffset + i * STRIDE]);
        }
        // Use last 3 remembered lines as first 3 lines for next iteration.
        // As expected, these lines are already horizontally transformed.
        buffer[outputColumnOffset + 0 * STRIDE] = last3Lines[0];
        buffer[outputColumnOffset + 1 * STRIDE] = last3Lines[1];
        buffer[outputColumnOffset + 2 * STRIDE] = last3Lines[2];
        // Wait for all writing threads before proceeding to loading new
        // coefficients in next iteration. (Not to overwrite those which
        // are not written yet.)
        __syncthreads();
      }
    }
  public:
    /// Main GPU 5/3 RDWT entry point.
    /// Selects the variant of 'transform' with the boundary checks needed
    /// by this threadblock and runs it on an instance in shared memory.
    /// @param input   input image (5/3 transformed coefficients)
    /// @param output  output buffer (for reverse transformed image)
    /// @param sx      width of the output image
    /// @param sy      height of the output image
    /// @param steps   number of sliding window steps
    __device__ static void run(const int * const input, int * const output,
                               const int sx, const int sy, const int steps) {
      // prepare instance with buffer in shared memory
      __shared__ RDWT53<WIN_SIZE_X, WIN_SIZE_Y> rdwt53;
      // Compute limits of this threadblock's block of pixels and use them to
      // determine, whether this threadblock will have to deal with boundary.
      // (1 in next expressions is for radius of impulse response of 5/3 RDWT.)
      const int maxX = (blockIdx.x + 1) * WIN_SIZE_X + 1;
      const int maxY = (blockIdx.y + 1) * WIN_SIZE_Y * steps + 1;
      const bool atRightBoudary = maxX >= sx;
      const bool atBottomBoudary = maxY >= sy;
      // Select specialized version of code according to distance of this
      // threadblock's pixels from image boundary. (Both conditions depend on
      // blockIdx only, so all threads of a block take the same branch.)
      if(atBottomBoudary) {
        // near bottom boundary => check both writing and reading
        rdwt53.transform<true, true>(input, output, sx, sy, steps);
      } else if(atRightBoudary) {
        // near right boundary only => check writing only
        rdwt53.transform<false, true>(input, output, sx, sy, steps);
      } else {
        // no nearby boundary => check nothing
        rdwt53.transform<false, false>(input, output, sx, sy, steps);
      }
    }
  }; // end of class RDWT53
  /// Kernel of the reverse 5/3 DWT - only forwards its arguments to
  /// RDWT53<WIN_SX, WIN_SY>::run. The launch bounds declare at most WIN_SX
  /// threads per threadblock.
  /// @tparam WIN_SX  width of sliding window
  /// @tparam WIN_SY  height of sliding window
  /// @param in     input image (5/3 transformed coefficients)
  /// @param out    output buffer (for reverse transformed image)
  /// @param sx     width of the output image
  /// @param sy     height of the output image
  /// @param steps  number of sliding window steps
  template <int WIN_SX, int WIN_SY>
  __launch_bounds__(WIN_SX, CTMIN(SHM_SIZE/sizeof(RDWT53<WIN_SX, WIN_SY>), 8))
  __global__ void rdwt53Kernel(const int * const in, int * const out,
                               const int sx, const int sy, const int steps) {
    // transform implementation matching the size of the sliding window
    typedef RDWT53<WIN_SX, WIN_SY> Transform;
    Transform::run(in, out, sx, sy, steps);
  }
  /// Only computes optimal number of sliding window steps,
  /// number of threadblocks and then launches the 5/3 RDWT kernel.
  /// Does nothing for an empty image (non-positive width or height).
  /// @tparam WIN_SX  width of sliding window
  /// @tparam WIN_SY  height of sliding window
  /// @param in       input image
  /// @param out      output buffer
  /// @param sx       width of the input image
  /// @param sy       height of the input image
  template <int WIN_SX, int WIN_SY>
  void launchRDWT53Kernel (int * in, int * out, const int sx, const int sy) {
    // An empty image has nothing to transform. Return before sizing the grid:
    // for sy == 0 the step count below comes out as 0 (assuming divRndUp is a
    // rounding-up integer division - TODO confirm), which would make the grid
    // height computation divide by zero on the host; sx == 0 would request a
    // grid with zero blocks.
    if(sx <= 0 || sy <= 0) {
      return;
    }

    // compute optimal number of steps of each sliding window
    const int steps = divRndUp(sy, 15 * WIN_SY);

    // prepare grid size: one threadblock per WIN_SX columns and per
    // (WIN_SY * steps) rows of the image
    dim3 gSize(divRndUp(sx, WIN_SX), divRndUp(sy, WIN_SY * steps));

    // finally transform this level (WIN_SX threads per block)
    PERF_BEGIN
    rdwt53Kernel<WIN_SX, WIN_SY><<<gSize, WIN_SX>>>(in, out, sx, sy, steps);
    PERF_END("        RDWT53", sx, sy)
    CudaDWTTester::checkLastKernelCall("RDWT 5/3 kernel");
  }
  /// Reverse 5/3 2D DWT. See common rules (above) for more details.
  /// Deeper levels are reverse transformed first (recursively); the result of
  /// each of them is copied back into the input as the LL band of the level
  /// above it.
  /// @param in      Input DWT coefficients. Format described in common rules.
  ///                Will not be preserved (will be overwritten).
  /// @param out     output buffer on GPU - will contain original image
  ///                in normalized range [-128, 127].
  /// @param sizeX   width of input image (in pixels)
  /// @param sizeY   height of input image (in pixels)
  /// @param levels  number of recursive DWT levels (at least one level is
  ///                always computed)
  void rdwt53(int * in, int * out, int sizeX, int sizeY, int levels) {
    const bool hasDeeperLevels = levels > 1;
    if(hasDeeperLevels) {
      // size of the LL band = size of the image of the next deeper level
      const int llSizeX = divRndUp(sizeX, 2);
      const int llSizeY = divRndUp(sizeY, 2);
      // reverse transform all deeper levels first ...
      rdwt53(in, out, llSizeX, llSizeY, levels - 1);
      // ... and use their result as the LL band of this level's input
      memCopy(in, out, llSizeX, llSizeY);
    }

    // pick the kernel variant whose window width suits the image width
    if(sizeX < 480) {
      launchRDWT53Kernel<64, 8>(in, out, sizeX, sizeY);
    } else if(sizeX < 960) {
      launchRDWT53Kernel<128, 8>(in, out, sizeX, sizeY);
    } else {
      launchRDWT53Kernel<192, 8>(in, out, sizeX, sizeY);
    }
  }
 } // end of namespace dwt_cuda
--- a/examples/dwt2d/dwt_cuda/rdwt97.cu
+++ b/examples/dwt2d/dwt_cuda/rdwt97.cu
@ -1,363 +0,0 @@
 ///
 /// @file    rdwt97.cu
 /// @brief   CUDA implementation of reverse 9/7 2D DWT.
 /// @author  Martin Jirman (207962@mail.muni.cz)
 /// @date    2011-02-03 21:59
 ///
 ///
 /// Copyright (c) 2011 Martin Jirman
 /// All rights reserved.
 ///
 /// Redistribution and use in source and binary forms, with or without
 /// modification, are permitted provided that the following conditions are met:
 ///
 ///     * Redistributions of source code must retain the above copyright
 ///       notice, this list of conditions and the following disclaimer.
 ///     * Redistributions in binary form must reproduce the above copyright
 ///       notice, this list of conditions and the following disclaimer in the
 ///       documentation and/or other materials provided with the distribution.
 ///
 /// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 /// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 /// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 /// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 /// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 /// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 /// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 /// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 /// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 /// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 /// POSSIBILITY OF SUCH DAMAGE.
 ///
 #include "common.h"
 #include "transform_buffer.h"
 #include "io.h"
 namespace dwt_cuda {
  /// Wraps shared memory buffer and methods for computing 9/7 RDWT using
  /// lifting schema and sliding window.
  /// @tparam WIN_SIZE_X  width of the sliding window
  /// @tparam WIN_SIZE_Y  height of the sliding window
  template <int WIN_SIZE_X, int WIN_SIZE_Y>
  class RDWT97 {
  private:
    /// Info related to loading of one input column.
    /// @tparam CHECKED true if boundary should be checked,
    ///                 false if there is no near boundary
    template <bool CHECKED>
    struct RDWT97Column  {
      /// loader of input pixels for given column.
      VerticalDWTBandLoader<float, CHECKED> loader;
      /// Offset of loaded column in shared memory buffer.
      int offset;
      /// Sets all fields to some values to avoid 'uninitialized' warnings.
      __device__ void clear() {
        loader.clear();
        offset = 0;
      }
    };
    /// Shared memory buffer used for 9/7 DWT transforms.
    /// It is WIN_SIZE_Y + 7 lines high: 7 lines carried over from the
    /// previous window step plus WIN_SIZE_Y newly loaded lines, and it has
    /// 4 boundary columns on each side.
    typedef TransformBuffer<float, WIN_SIZE_X, WIN_SIZE_Y + 7, 4> RDWT97Buffer;
    /// Shared buffer used for reverse 9/7 DWT.
    RDWT97Buffer buffer;
    /// Difference between indices of two vertical neighbors in buffer.
    enum { STRIDE = RDWT97Buffer::VERTICAL_STRIDE };
    /// Horizontal 9/7 RDWT on specified lines of transform buffer.
    /// All threads of the threadblock must call this together: every step
    /// is followed by a barrier, because each step reads values written by
    /// other threads in the previous step.
    /// @param lines      number of lines to be transformed
    /// @param firstLine  index of the first line to be transformed
    __device__ void horizontalRDWT97(int lines, int firstLine) {
      __syncthreads();
      // undo scaling first, then undo lifting steps in reverse order
      buffer.scaleHorizontal(scale97Mul, scale97Div, firstLine, lines);
      __syncthreads();
      buffer.forEachHorizontalEven(firstLine, lines, AddScaledSum(r97update2));
      __syncthreads();
      buffer.forEachHorizontalOdd(firstLine, lines, AddScaledSum(r97predict2));
      __syncthreads();
      buffer.forEachHorizontalEven(firstLine, lines, AddScaledSum(r97update1));
      __syncthreads();
      buffer.forEachHorizontalOdd(firstLine, lines, AddScaledSum(r97Predict1));
      __syncthreads();
    }
    /// Initializes one column of shared transform buffer with 7 input pixels.
    /// Those 7 pixels will not be transformed. Also initializes given loader.
    /// @tparam CHECKED  true if there are near image boundaries
    /// @param colIndex  index of column in shared transform buffer
    /// @param input     input image
    /// @param sizeX     width of the input image
    /// @param sizeY     height of the input image
    /// @param column    (uninitialized) info about loading one column
    /// @param firstY    index of first image row to be transformed
    template <bool CHECKED>
    __device__ void initColumn(const int colIndex, const float * const input,
                               const int sizeX, const int sizeY,
                               RDWT97Column<CHECKED> & column,
                               const int firstY) {
      // coordinates of the first coefficient to be loaded
      const int firstX = blockIdx.x * WIN_SIZE_X + colIndex;
      // offset of the column with index 'colIndex' in the transform buffer
      column.offset = buffer.getColumnOffset(colIndex);
      if(blockIdx.y == 0) {
        // topmost block - apply mirroring rules when loading first 7 rows
        column.loader.init(sizeX, sizeY, firstX, firstY);
        // load pixels in mirrored way: only 4 coefficients are read and
        // buffer row 3 is the axis of symmetry (rows 2/4, 1/5 and 0/6
        // receive the same value)
        buffer[column.offset + 3 * STRIDE] = column.loader.loadLowFrom(input);
        buffer[column.offset + 4 * STRIDE] =
        buffer[column.offset + 2 * STRIDE] = column.loader.loadHighFrom(input);
        buffer[column.offset + 5 * STRIDE] =
        buffer[column.offset + 1 * STRIDE] = column.loader.loadLowFrom(input);
        buffer[column.offset + 6 * STRIDE] =
        buffer[column.offset + 0 * STRIDE] = column.loader.loadHighFrom(input);
      } else {
        // non-topmost row - regular loading, starting 3 rows above firstY:
        column.loader.init(sizeX, sizeY, firstX, firstY - 3);
        buffer[column.offset + 0 * STRIDE] = column.loader.loadHighFrom(input);
        buffer[column.offset + 1 * STRIDE] = column.loader.loadLowFrom(input);
        buffer[column.offset + 2 * STRIDE] = column.loader.loadHighFrom(input);
        buffer[column.offset + 3 * STRIDE] = column.loader.loadLowFrom(input);
        buffer[column.offset + 4 * STRIDE] = column.loader.loadHighFrom(input);
        buffer[column.offset + 5 * STRIDE] = column.loader.loadLowFrom(input);
        buffer[column.offset + 6 * STRIDE] = column.loader.loadHighFrom(input);
      }
      // Now, the next coefficient, which will be loaded by loader, is #4.
    }
    /// Using given loader, it loads another WIN_SIZE_Y coefficients
    /// into specified column (buffer rows 7 to 7 + WIN_SIZE_Y - 1).
    /// Two rows are written per iteration, so WIN_SIZE_Y is expected to be
    /// even; an odd value would write one row past the end of the buffer.
    /// @tparam CHECKED  true if there are near image boundaries
    /// @param col       info about loaded column
    /// @param input     buffer with input coefficients
    template <bool CHECKED>
    inline __device__ void loadWindowIntoColumn(RDWT97Column<CHECKED> & col,
                                                const float * const input) {
      for(int i = 7; i < (7 + WIN_SIZE_Y); i += 2) {
        buffer[col.offset + i * STRIDE] = col.loader.loadLowFrom(input);
        buffer[col.offset + (i + 1) * STRIDE] = col.loader.loadHighFrom(input);
      }
    }
    /// Actual GPU 9/7 RDWT sliding window lifting schema implementation.
    /// @tparam CHECKED_LOADS   true if loader should check boundaries
    /// @tparam CHECKED_WRITES  true if boundaries should be taken into account
    ///                         when writing into output buffer
    /// @param in        input image (9/7 transformed coefficients)
    /// @param out       output buffer (for reverse transformed image)
    /// @param sizeX     width of the output image
    /// @param sizeY     height of the output image
    /// @param winSteps  number of steps of sliding window
    template <bool CHECKED_LOADS, bool CHECKED_WRITES>
    __device__ void transform(const float * const in, float * const out,
                              const int sizeX, const int sizeY,
                              const int winSteps) {
      // info about one main column and one boundary column
      RDWT97Column<CHECKED_LOADS> column;
      RDWT97Column<CHECKED_LOADS> boundaryColumn;
      // index of first image row to be transformed
      const int firstY = blockIdx.y * WIN_SIZE_Y * winSteps;
      // initialize boundary columns
      boundaryColumn.clear();
      if(threadIdx.x < 7) {
        // each thread among first 7 ones gets index of one of boundary columns:
        // threads 0-3 take the 4 columns right of the window (WIN_SIZE_X and
        // above), threads 4-6 take the 3 columns left of it (-3 to -1)
        const int colId = threadIdx.x + ((threadIdx.x < 4) ? WIN_SIZE_X : -7);
        // Thread initializes offset of the boundary column (in shared
        // buffer), first 7 pixels of the column and a loader for this column.
        initColumn(colId, in, sizeX, sizeY, boundaryColumn, firstY);
      }
      // All threads initialize central columns.
      initColumn(parityIdx<WIN_SIZE_X>(), in, sizeX, sizeY, column, firstY);
      // horizontally transform first 7 rows
      horizontalRDWT97(7, 0);
      // writer of output pixels - initialize it
      const int outputX = blockIdx.x * WIN_SIZE_X + threadIdx.x;
      VerticalDWTPixelWriter<float, CHECKED_WRITES> writer;
      writer.init(sizeX, sizeY, outputX, firstY);
      // offset of column (in transform buffer) saved by this thread
      const int outColumnOffset = buffer.getColumnOffset(threadIdx.x);
      // (Each iteration assumes that first 7 rows of transform buffer are
      // already loaded with horizontally transformed pixels.)
      for(int w = 0; w < winSteps; w++) {
        // Load another WIN_SIZE_Y lines of this thread's column
        // into the transform buffer.
        loadWindowIntoColumn(column, in);
        // possibly load boundary columns
        if(threadIdx.x < 7) {
          loadWindowIntoColumn(boundaryColumn, in);
        }
        // horizontally transform all newly loaded lines
        // (contains the barriers which make the loads above visible)
        horizontalRDWT97(WIN_SIZE_Y, 7);
        // Using 7 registers, remember current values of last 7 rows
        // of transform buffer. These rows are transformed horizontally
        // only and will be used in next iteration.
        float last7Lines[7];
        for(int i = 0; i < 7; i++) {
          last7Lines[i] = buffer[outColumnOffset + (WIN_SIZE_Y + i) * STRIDE];
        }
        // vertically transform all central columns
        // (each thread only touches its own column here, so no barrier is
        // placed between the vertical steps)
        buffer.scaleVertical(scale97Div, scale97Mul, outColumnOffset,
                             WIN_SIZE_Y + 7, 0);
        buffer.forEachVerticalOdd(outColumnOffset, AddScaledSum(r97update2));
        buffer.forEachVerticalEven(outColumnOffset, AddScaledSum(r97predict2));
        buffer.forEachVerticalOdd(outColumnOffset, AddScaledSum(r97update1));
        buffer.forEachVerticalEven(outColumnOffset, AddScaledSum(r97Predict1));
        // Save all results of current window. Results are in transform buffer
        // at rows from #3 to #(3 + WIN_SIZE_Y - 1). Other rows are invalid now.
        // (They only served as a boundary for vertical RDWT.)
        for(int i = 3; i < (3 + WIN_SIZE_Y); i++) {
          writer.writeInto(out, buffer[outColumnOffset + i * STRIDE]);
        }
        // Use last 7 remembered lines as first 7 lines for next iteration.
        // As expected, these lines are already horizontally transformed.
        for(int i = 0; i < 7; i++) {
          buffer[outColumnOffset + i * STRIDE] = last7Lines[i];
        }
        // Wait for all writing threads before proceeding to loading new
        // coefficients in next iteration. (Not to overwrite those which
        // are not written yet.)
        __syncthreads();
      }
    }
  public:
    /// Main GPU 9/7 RDWT entry point. Places one instance of this class
    /// (including its transform buffer) into shared memory and runs the
    /// variant of the transform specialized for this threadblock's distance
    /// from the image boundary.
    /// @param input   input image (9/7 transformed coefficients)
    /// @param output  output buffer (for reverse transformed image)
    /// @param sx      width of the output image
    /// @param sy      height of the output image
    /// @param steps   number of steps of sliding window
    __device__ static void run(const float * const input, float * const output,
                               const int sx, const int sy, const int steps) {
      // prepare instance with buffer in shared memory
      __shared__ RDWT97<WIN_SIZE_X, WIN_SIZE_Y> rdwt97;
      // Compute limits of this threadblock's block of pixels and use them to
      // determine, whether this threadblock will have to deal with boundary.
      // (3 in next expressions is for radius of impulse response of 9/7 RDWT.)
      const int maxX = (blockIdx.x + 1) * WIN_SIZE_X + 3;
      const int maxY = (blockIdx.y + 1) * WIN_SIZE_Y * steps + 3;
      const bool atRightBoudary = maxX >= sx;
      const bool atBottomBoudary = maxY >= sy;
      // Select specialized version of code according to distance of this
      // threadblock's pixels from image boundary.
      if(atBottomBoudary) {
        // near bottom boundary => check both writing and reading
        rdwt97.transform<true, true>(input, output, sx, sy, steps);
      } else if(atRightBoudary) {
        // near right boundary only => check writing only
        rdwt97.transform<false, true>(input, output, sx, sy, steps);
      } else {
        // no nearby boundary => check nothing
        rdwt97.transform<false, false>(input, output, sx, sy, steps);
      }
    }
  }; // end of class RDWT97
  /// 9/7 RDWT kernel. Only forwards all of its arguments to RDWT97::run.
  /// Expects to be launched with WIN_SX threads per threadblock.
  /// @tparam WIN_SX  width of sliding window
  /// @tparam WIN_SY  height of sliding window
  /// @param in     input image (9/7 transformed coefficients)
  /// @param out    output buffer (for reverse transformed image)
  /// @param sx     width of the output image
  /// @param sy     height of the output image
  /// @param steps  number of steps of sliding window
  template <int WIN_SX, int WIN_SY>
  __launch_bounds__(WIN_SX, CTMIN(SHM_SIZE/sizeof(RDWT97<WIN_SX, WIN_SY>), 8))
  __global__ void rdwt97Kernel(const float * const in, float * const out,
                               const int sx, const int sy, const int steps) {
    // transform specialized for this kernel's sliding window size
    typedef RDWT97<WIN_SX, WIN_SY> Transform;
    Transform::run(in, out, sx, sy, steps);
  }
  /// Only computes optimal number of sliding window steps,
  /// number of threadblocks and then launches the 9/7 RDWT kernel.
  /// Does nothing if the image is empty (sx <= 0 or sy <= 0).
  /// @tparam WIN_SX  width of sliding window
  /// @tparam WIN_SY  height of sliding window
  /// @param in       input image
  /// @param out      output buffer
  /// @param sx       width of the input image
  /// @param sy       height of the input image
  template <int WIN_SX, int WIN_SY>
  void launchRDWT97Kernel (float * in, float * out, int sx, int sy) {
    // Nothing to transform for an empty image. Without this guard 'steps'
    // below evaluates to 0 for sy == 0, and the grid height computation
    // would then ask divRndUp to divide by (WIN_SY * 0) on the host.
    if(sx <= 0 || sy <= 0) {
      return;
    }
    // compute optimal number of steps of each sliding window
    // (sy > 0 here, so at least one step is always used)
    const int steps = divRndUp(sy, 15 * WIN_SY);
    // prepare grid size: one threadblock per WIN_SX columns and
    // per (WIN_SY * steps) rows of the image
    dim3 gSize(divRndUp(sx, WIN_SX), divRndUp(sy, WIN_SY * steps));
    // finally launch kernel (WIN_SX threads per threadblock)
    PERF_BEGIN
    rdwt97Kernel<WIN_SX, WIN_SY><<<gSize, WIN_SX>>>(in, out, sx, sy, steps);
    PERF_END("        RDWT97", sx, sy)
    CudaDWTTester::checkLastKernelCall("RDWT 9/7 kernel");
  }
  /// Reverse 9/7 2D DWT. See common rules (dwt.h) for more details.
  /// Deeper levels are reversed first (recursively), then this level.
  /// @param in      Input DWT coefficients. Format described in common rules.
  ///                Will not be preserved (will be overwritten).
  /// @param out     output buffer on GPU - will contain original image
  ///                in normalized range [-0.5, 0.5].
  /// @param sizeX   width of input image (in pixels)
  /// @param sizeY   height of input image (in pixels)
  /// @param levels  number of recursive DWT levels
  void rdwt97(float * in, float * out, int sizeX, int sizeY, int levels) {
    if(levels > 1) {
      // the LL band of this level has half the size (rounded up)
      const int halfX = divRndUp(sizeX, 2);
      const int halfY = divRndUp(sizeY, 2);
      // reverse all deeper levels before touching this one
      rdwt97(in, out, halfX, halfY, levels - 1);
      // the reconstructed LL band is now in 'out' - move it back into 'in'
      memCopy(in, out, halfX, halfY);
    }
    // pick the sliding window size according to the image width,
    // narrowest window first
    if(sizeX < 480) {
      launchRDWT97Kernel<64, 6>(in, out, sizeX, sizeY);
      return;
    }
    if(sizeX < 960) {
      launchRDWT97Kernel<128, 6>(in, out, sizeX, sizeY);
      return;
    }
    launchRDWT97Kernel<192, 8>(in, out, sizeX, sizeY);
  }
 } // end of namespace dwt_cuda
--- a/examples/dwt2d/dwt_cuda/transform_buffer.h
+++ b/examples/dwt2d/dwt_cuda/transform_buffer.h
@ -1,338 +0,0 @@
 /// line 248 the index
 /// @file    transform_buffer.h
 /// @brief   Buffer with separated even and odd columns and related algorithms.
 /// @author  Martin Jirman (207962@mail.muni.cz)
 /// @date    2011-01-20 18:33
 ///
 ///
 /// Copyright (c) 2011 Martin Jirman
 /// All rights reserved.
 ///
 /// Redistribution and use in source and binary forms, with or without
 /// modification, are permitted provided that the following conditions are met:
 ///
 ///     * Redistributions of source code must retain the above copyright
 ///       notice, this list of conditions and the following disclaimer.
 ///     * Redistributions in binary form must reproduce the above copyright
 ///       notice, this list of conditions and the following disclaimer in the
 ///       documentation and/or other materials provided with the distribution.
 ///
 /// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 /// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 /// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 /// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 /// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 /// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 /// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 /// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 /// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 /// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 /// POSSIBILITY OF SUCH DAMAGE.
 ///
 #ifndef TRANSFORM_BUFFER_H
 #define TRANSFORM_BUFFER_H
 namespace dwt_cuda {
 /// Buffer (in shared memory of GPU) where block of input image is stored,
 /// but odd and even columns are separated. (Generates less bank conflicts
 /// when using lifting schema.) All operations expect SIZE_X threads.
 /// Also implements basic building blocks of lifting schema.
 /// @tparam T           type of buffer items
 /// @tparam SIZE_X      width of the buffer excluding two boundaries (Also
 ///                     a number of threads participating on all operations.)
 ///                     Must be divisible by 4.
 /// @tparam SIZE_Y      height of buffer (total number of lines)
 /// @tparam BOUNDARY_X  number of extra pixels at the left and right side
 ///                     boundary is expected to be smaller than half SIZE_X
 ///                     Must be divisible by 2.
 template <typename T, int SIZE_X, int SIZE_Y, int BOUNDARY_X>
 class TransformBuffer {
 public:
  enum {
    /// difference between indices of two vertical neighbors
    VERTICAL_STRIDE = BOUNDARY_X + (SIZE_X / 2)
  };
 private:
  enum {
 /// number of shared memory banks - needed for correct padding
 #ifdef __CUDA_ARCH__
    SHM_BANKS = ((__CUDA_ARCH__ >= 200) ? 32 : 16),
 #else
    SHM_BANKS = 16, // for host code only - can be anything, won't be used
 #endif
    /// size of one of two buffers (odd or even)
    BUFFER_SIZE = VERTICAL_STRIDE * SIZE_Y,
    /// unused space between two buffers
    PADDING = SHM_BANKS - ((BUFFER_SIZE + SHM_BANKS / 2) % SHM_BANKS),
    /// offset of the odd columns buffer from the beginning of data buffer
    ODD_OFFSET = BUFFER_SIZE + PADDING,
  };
  /// buffer for both even and odd columns
  T data[2 * BUFFER_SIZE + PADDING];
  /// Applies specified function to all central elements while also passing
  /// previous and next elements as parameters.
  /// @param count         count of central elements to apply function to
  /// @param prevOffset    offset of first central element's predecessor
  /// @param midOffset     offset of first central element
  /// @param nextOffset    offset of first central element's successor
  /// @param function      the function itself; called as
  ///                      function(previous, center, next), where center
  ///                      is a reference into the buffer
  template <typename FUNC>
  __device__ void horizontalStep(const int count, const int prevOffset,
                                 const int midOffset, const int nextOffset,
                                 const FUNC &function) {
    // number of unchecked iterations
    const int STEPS = count / SIZE_X;
    // items remaining after last unchecked iteration
    const int finalCount = count % SIZE_X;
    // offset of items processed in last (checked) iteration
    const int finalOffset = count - finalCount;
    // all threads perform fixed number of iterations ...
    for (int i = 0; i < STEPS; i++) {
      const T previous = data[prevOffset + i * SIZE_X + threadIdx.x];
      const T next = data[nextOffset + i * SIZE_X + threadIdx.x];
      T &center = data[midOffset + i * SIZE_X + threadIdx.x];
      function(previous, center, next);
    }
    // ... but not all threads participate on final iteration
    if (threadIdx.x < finalCount) {
      const T previous = data[prevOffset + finalOffset + threadIdx.x];
      const T next = data[nextOffset + finalOffset + threadIdx.x];
      T &center = data[midOffset + finalOffset + threadIdx.x];
      function(previous, center, next);
    }
  }
 public:
  /// Debugging aid: prints index and value of every item of the underlying
  /// data array (including the padding between even and odd buffers).
  /// Every calling thread prints the whole array.
  __device__ void getPrintData() {
    for (int i = 0; i < 2 * BUFFER_SIZE + PADDING; i++) {
      // '%f' consumes a double; the explicit conversion keeps the call
      // well defined for integer instantiations of T as well.
      printf(" index: %d  data: %f \n ", i, static_cast<double>(data[i]));
    }
  }
  /// Gets offset of the column with given index. Central columns have
  /// indices from 0 to SIZE_X - 1, left boundary columns have negative
  /// indices and right boundary columns indices start with SIZE_X.
  /// @param columnIndex  index of column to get pointer to
  /// @return  offset of the first item of column with specified index
  __device__ int getColumnOffset(int columnIndex) {
    columnIndex += BOUNDARY_X;               // skip boundary
    return columnIndex / 2                   // select right column
           + (columnIndex & 1) * ODD_OFFSET; // select odd or even buffer
  }
  /// Provides access to data of the transform buffer.
  /// @param index  index of the item to work with
  /// @return reference to item at given index
  __device__ T &operator[](const int index) { return data[index]; }
  /// Applies specified function to all horizontally even elements in
  /// specified lines. (Including even elements in boundaries except
  /// first even element in first left boundary.) SIZE_X threads participate
  /// and synchronization is needed before result can be used.
  /// @param firstLine  index of first line
  /// @param numLines   count of lines
  /// @param func       function to be applied on all even elements
  ///                   parameters: previous (odd) element, the even
  ///                   element itself and finally next (odd) element
  template <typename FUNC>
  __device__ void forEachHorizontalEven(const int firstLine, const int numLines,
                                        const FUNC &func) {
    // number of even elements to apply function to
    const int count = numLines * VERTICAL_STRIDE - 1;
    // offset of first even element
    const int centerOffset = firstLine * VERTICAL_STRIDE + 1;
    // offset of odd predecessor of first even element
    const int prevOffset = firstLine * VERTICAL_STRIDE + ODD_OFFSET;
    // offset of odd successor of first even element
    const int nextOffset = prevOffset + 1;
    // call generic horizontal step function
    horizontalStep(count, prevOffset, centerOffset, nextOffset, func);
  }
  /// Applies given function to all horizontally odd elements in specified
  /// lines. (Including odd elements in boundaries except last odd element
  /// in last right boundary.) SIZE_X threads participate and synchronization
  /// is needed before result can be used.
  /// @param firstLine  index of first line
  /// @param numLines   count of lines
  /// @param func       function to be applied on all odd elements
  ///                   parameters: previous (even) element, the odd
  ///                   element itself and finally next (even) element
  template <typename FUNC>
  __device__ void forEachHorizontalOdd(const int firstLine, const int numLines,
                                       const FUNC &func) {
    // number of odd elements to apply function to
    const int count = numLines * VERTICAL_STRIDE - 1;
    // offset of even predecessor of first odd element
    const int prevOffset = firstLine * VERTICAL_STRIDE;
    // offset of first odd element
    const int centerOffset = prevOffset + ODD_OFFSET;
    // offset of even successor of first odd element
    const int nextOffset = prevOffset + 1;
    // call generic horizontal step function
    horizontalStep(count, prevOffset, centerOffset, nextOffset, func);
  }
  /// Applies specified function to all even elements (except element #0)
  /// of given column. Each thread takes care of one column, so there's
  /// no need for synchronization.
  /// @param columnOffset  offset of thread's column
  /// @param f             function to be applied on all even elements
  ///                      parameters: previous (odd) element, the even
  ///                      element itself and finally next (odd) element
  template <typename F>
  __device__ void forEachVerticalEven(const int columnOffset, const F &f) {
    if (SIZE_Y > 3) { // makes no sense otherwise
      const int steps = SIZE_Y / 2 - 1;
      for (int i = 0; i < steps; i++) {
        // even rows 2, 4, 6, ...
        const int row = 2 + i * 2;
        const T prev = data[columnOffset + (row - 1) * VERTICAL_STRIDE];
        const T next = data[columnOffset + (row + 1) * VERTICAL_STRIDE];
        f(prev, data[columnOffset + row * VERTICAL_STRIDE], next);
      }
    }
  }
  /// Applies specified function to all odd elements of given column.
  /// Each thread takes care of one column, so there's no need for
  /// synchronization.
  /// @param columnOffset  offset of thread's column
  /// @param f             function to be applied on all odd elements
  ///                      parameters: previous (even) element, the odd
  ///                      element itself and finally next (even) element
  template <typename F>
  __device__ void forEachVerticalOdd(const int columnOffset, const F &f) {
    const int steps = (SIZE_Y - 1) / 2;
    for (int i = 0; i < steps; i++) {
      // odd rows 1, 3, 5, ...
      const int row = i * 2 + 1;
      const T prev = data[columnOffset + (row - 1) * VERTICAL_STRIDE];
      const T next = data[columnOffset + (row + 1) * VERTICAL_STRIDE];
      f(prev, data[columnOffset + row * VERTICAL_STRIDE], next);
    }
  }
  /// Scales elements at specified lines. SIZE_X threads participate.
  /// @param evenScale  scaling factor for horizontally even elements
  /// @param oddScale   scaling factor for horizontally odd elements
  /// @param firstLine  index of first line to scale elements in
  /// @param numLines   number of lines, whose elements should be scaled
  __device__ void scaleHorizontal(const T evenScale, const T oddScale,
                                  const int firstLine, const int numLines) {
    const int offset = firstLine * VERTICAL_STRIDE;
    const int count = numLines * VERTICAL_STRIDE;
    const int steps = count / SIZE_X;
    const int finalCount = count % SIZE_X;
    const int finalOffset = count - finalCount;
    // run iterations, where all threads participate
    for (int i = 0; i < steps; i++) {
      data[threadIdx.x + i * SIZE_X + offset] *= evenScale;
      data[threadIdx.x + i * SIZE_X + offset + ODD_OFFSET] *= oddScale;
    }
    // some threads also finish remaining unscaled items
    if (threadIdx.x < finalCount) {
      data[threadIdx.x + finalOffset + offset] *= evenScale;
      data[threadIdx.x + finalOffset + offset + ODD_OFFSET] *= oddScale;
    }
  }
  /// Scales elements in specified column.
  /// @param evenScale     scaling factor for vertically even elements
  /// @param oddScale      scaling factor for vertically odd elements
  /// @param columnOffset  offset of the column to work with
  /// @param numLines      number of lines, whose elements should be scaled
  /// @param firstLine     index of first line to scale elements in
  __device__ void scaleVertical(const T evenScale, const T oddScale,
                                const int columnOffset, const int numLines,
                                const int firstLine) {
    for (int i = firstLine; i < (numLines + firstLine); i++) {
      if (i & 1) {
        data[columnOffset + i * VERTICAL_STRIDE] *= oddScale;
      } else {
        data[columnOffset + i * VERTICAL_STRIDE] *= evenScale;
      }
    }
  }
  //****************For Test(Feb23), test inter parameters*************
  // Accessors exposing the compile-time layout constants of the buffer.
  __device__ int getVERTICAL_STRIDE() { return VERTICAL_STRIDE; }
  __device__ int getSHM_BANKS() { return SHM_BANKS; }
  __device__ int getBuffersize() { return BUFFER_SIZE; }
  __device__ int getPADDING() { return PADDING; }
  __device__ int getODD_OFFSET() { return ODD_OFFSET; }
  //****************For Test(Feb23), test inter parameters*************
 }; // end of class TransformBuffer
 } // namespace dwt_cuda
 #endif // TRANSFORM_BUFFER_H
--- a/examples/dwt2d/main.cu
+++ b/examples/dwt2d/main.cu
@ -1,401 +0,0 @@
 /*
 * Copyright (c) 2009, Jiri Matela
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
 #include <unistd.h>
 #include <error.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <fcntl.h>
 #include <errno.h>
 #include <string.h>
 #include <assert.h>
 #include <sys/time.h>
 #include <getopt.h>
 #include "common.h"
 #include "components.h"
 #include "dwt.h"
 /// Bundle of everything one DWT run needs: file names, the raw source
 /// image and the transform parameters. Filled in by main().
 struct dwt {
    char *srcFilename;      ///< name of the source image file
    char *outFilename;      ///< base name used for the output file(s)
    unsigned char *srcImg;  ///< raw source image bytes (host buffer)
    int pixWidth;           ///< image width in pixels
    int pixHeight;          ///< image height in pixels
    int components;         ///< number of color components in the image
    int dwtLvls;            ///< number of DWT levels to compute
 };
 /// Reads raw image bytes from "../../data/dwt2d/<srcFilename>" into srcImg.
 ///
 /// If the prefixed path cannot be allocated, srcFilename is used as given.
 /// Reading continues until inputSize bytes have arrived or end of file is
 /// reached; a file shorter than inputSize is not treated as an error (the
 /// byte count actually read is printed).
 ///
 /// @param srcFilename  file name, looked up below ../../data/dwt2d/
 /// @param srcImg       destination buffer, at least inputSize bytes large
 /// @param inputSize    number of bytes to read
 /// @return 0 on success, -1 if the file cannot be opened or a read fails
 int getImg(char * srcFilename, unsigned char *srcImg, int inputSize)
 {
    const char *path = "../../data/dwt2d/";
    // Build "<path><srcFilename>"; the buffer is released before returning.
    char *newSrc = (char *)malloc(strlen(srcFilename) + strlen(path) + 1);
    if (newSrc != NULL) {
        strcpy(newSrc, path);
        strcat(newSrc, srcFilename);
        srcFilename = newSrc;
    }
    printf("Loading input: %s\n", srcFilename);

    int fd = open(srcFilename, O_RDONLY);
    if (fd == -1) {
        error(0, errno, "cannot access %s", srcFilename);
        free(newSrc);
        return -1;
    }

    // read() may deliver fewer bytes than requested, so keep going until the
    // buffer is full, the file ends, or a real error occurs.
    int status = 0;
    int total = 0;
    while (total < inputSize) {
        ssize_t got = read(fd, srcImg + total, inputSize - total);
        if (got < 0) {
            if (errno == EINTR)
                continue; // interrupted before any data arrived: retry
            error(0, errno, "cannot read %s", srcFilename);
            status = -1;
            break;
        }
        if (got == 0)
            break; // end of file
        total += (int)got;
    }
    if (status == 0)
        printf("precteno %d, inputsize %d\n", total, inputSize);

    close(fd);
    free(newSrc); // free(NULL) is a no-op when the fallback name was used
    return status;
 }
 /// Prints the command-line synopsis and the list of supported options to
 /// standard output. The option list mirrors the table parsed in main().
 void usage() {
    printf("dwt [options] src_img.rgb <out_img.dwt>\n"
           "  -d, --dimension\t\tdimensions of src img, e.g. 1920x1080\n"
           "  -c, --components\t\tnumber of color components, default 3\n"
           "  -b, --depth\t\t\tbit depth, default 8\n"
           "  -l, --level\t\t\tDWT level, default 3\n"
           "  -D, --device\t\t\tcuda device\n"
           "  -f, --forward\t\t\tforward transform\n"
           "  -r, --reverse\t\t\treverse transform\n"
           "  -9, --97\t\t\t9/7 transform\n"
           "  -5, --53\t\t\t5/3 transform\n"
           "  -w, --write-visual\t\twrite output in visual (tiled) fashion instead of the linear\n"
           "  -h, --help\t\t\tprint this help\n");
 }
 /// Allocates `bytes` bytes of device memory and clears them to zero.
 /// Each CUDA call is followed by the project's cudaCheckError macro.
 /// @return the device pointer produced by cudaMalloc
 template <typename T>
 static T *allocZeroedDevice(int bytes)
 {
    T *ptr;
    cudaMalloc((void**)&ptr, bytes);
    cudaCheckError("Alloc device memory");
    cudaMemset(ptr, 0, bytes);
    cudaCheckError("Memset device memory");
    return ptr;
 }
 /// Runs the n-stage 2D DWT on the image described by `d` and writes the
 /// result to file(s) derived from d->outFilename.
 ///
 /// Only images with exactly 3 or exactly 1 component are processed; for any
 /// other component count the function just allocates and releases its two
 /// common buffers.
 ///
 /// @tparam T           element type of one component plane
 /// @param d            image description and source pixels
 /// @param forward      passed through to nStage2dDWT
 /// @param writeVisual  non-zero selects writeNStage2DDWT instead of writeLinear
 template <typename T>
 void processDWT(struct dwt *d, int forward, int writeVisual)
 {
    int componentSize = d->pixWidth*d->pixHeight*sizeof(T);

    // Buffers needed by both paths: output plane of the first component and
    // the scratch buffer handed to nStage2dDWT.
    T *c_r_out = allocZeroedDevice<T>(componentSize);
    T *backup  = allocZeroedDevice<T>(componentSize);

    switch (d->components) {
    case 3: {
        // Output planes for G and B, then the three input planes.
        T *c_g_out = allocZeroedDevice<T>(componentSize);
        T *c_b_out = allocZeroedDevice<T>(componentSize);
        T *c_r = allocZeroedDevice<T>(componentSize);
        T *c_g = allocZeroedDevice<T>(componentSize);
        T *c_b = allocZeroedDevice<T>(componentSize);

        // Split the interleaved source image into separate planes.
        rgbToComponents(c_r, c_g, c_b, d->srcImg, d->pixWidth, d->pixHeight);

        // Transform each plane; all three share the same scratch buffer.
        nStage2dDWT(c_r, c_r_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
        nStage2dDWT(c_g, c_g_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
        nStage2dDWT(c_b, c_b_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);

        // The R plane is always written; G and B only when OUTPUT is defined.
        writeLinear(c_r_out, d->pixWidth, d->pixHeight, d->outFilename, ".r");
 #ifdef OUTPUT
        if (writeVisual) {
            writeNStage2DDWT(c_r_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".r");
            writeNStage2DDWT(c_g_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".g");
            writeNStage2DDWT(c_b_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".b");
        } else {
            writeLinear(c_r_out, d->pixWidth, d->pixHeight, d->outFilename, ".r");
            writeLinear(c_g_out, d->pixWidth, d->pixHeight, d->outFilename, ".g");
            writeLinear(c_b_out, d->pixWidth, d->pixHeight, d->outFilename, ".b");
        }
 #endif

        cudaFree(c_r);
        cudaCheckError("Cuda free");
        cudaFree(c_g);
        cudaCheckError("Cuda free");
        cudaFree(c_b);
        cudaCheckError("Cuda free");
        cudaFree(c_g_out);
        cudaCheckError("Cuda free");
        cudaFree(c_b_out);
        cudaCheckError("Cuda free");
        break;
    }
    case 1: {
        // Single input plane.
        T *c_r = allocZeroedDevice<T>(componentSize);
        bwToComponent(c_r, d->srcImg, d->pixWidth, d->pixHeight);

        nStage2dDWT(c_r, c_r_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);

        // The result is always written in this path (not guarded by OUTPUT).
        if (writeVisual) {
            writeNStage2DDWT(c_r_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".out");
        } else {
            writeLinear(c_r_out, d->pixWidth, d->pixHeight, d->outFilename, ".lin.out");
        }

        cudaFree(c_r);
        cudaCheckError("Cuda free");
        break;
    }
    default:
        break;
    }

    cudaFree(c_r_out);
    cudaCheckError("Cuda free device");
    cudaFree(backup);
    cudaCheckError("Cuda free device");
 }
 /// Command-line driver of the 2D DWT example.
 ///
 /// Parses the options listed by usage(), validates the requested CUDA
 /// device, loads the raw source image into a host buffer obtained from
 /// cudaMallocHost and runs processDWT<float> (9/7) or processDWT<int> (5/3).
 ///
 /// @param argc  argument count
 /// @param argv  options followed by the source file name and, optionally,
 ///              the output file name
 /// @return 0 on success or after printing help; -1 on invalid arguments,
 ///         missing/invalid CUDA device, allocation failure or unreadable input
 int main(int argc, char **argv)
 {
    int optindex = 0;
    // getopt_long() returns int. A plain char may be unsigned, in which case
    // the comparison with -1 below would never succeed.
    int ch;
    struct option longopts[] = {
        {"dimension",   required_argument, 0, 'd'}, //dimensions of src img
        {"components",  required_argument, 0, 'c'}, //number of components of src img
        {"depth",       required_argument, 0, 'b'}, //bit depth of src img
        {"level",       required_argument, 0, 'l'}, //level of dwt
        {"device",      required_argument, 0, 'D'}, //cuda device
        {"forward",     no_argument,       0, 'f'}, //forward transform
        {"reverse",     no_argument,       0, 'r'}, //reverse transform
        {"97",          no_argument,       0, '9'}, //9/7 transform
        {"53",          no_argument,       0, '5'}, //5/3 transform
        {"write-visual",no_argument,       0, 'w'}, //write output (subbands) in visual (tiled) order instead of linear
        {"help",        no_argument,       0, 'h'},
        {0,             0,                 0,  0 }  //all-zero terminator required by getopt_long()
    };
    int pixWidth    = 0; //<real pixWidth
    int pixHeight   = 0; //<real pixHeight
    int compCount   = 3; //number of components; 3 for RGB or YUV, 4 for RGBA
    int bitDepth    = 8;
    int dwtLvls     = 3; //default number of DWT levels
    int device      = 0;
    int forward     = 1; //forward transform
    int dwt97       = 1; //1=dwt9/7, 0=dwt5/3 transform
    int writeVisual = 0; //write output (subbands) in visual (tiled) order instead of linear
    char * pos;
    while ((ch = getopt_long(argc, argv, "d:c:b:l:D:fr95wh", longopts, &optindex)) != -1) {
        switch (ch) {
        case 'd':
            // Expected form is <width>x<height>
            pixWidth = atoi(optarg);
            pos = strstr(optarg, "x");
            if (pos == NULL || pixWidth == 0 || (strlen(pos) >= strlen(optarg))) {
                usage();
                return -1;
            }
            pixHeight = atoi(pos+1);
            break;
        case 'c':
            compCount = atoi(optarg);
            break;
        case 'b':
            bitDepth = atoi(optarg);
            break;
        case 'l':
            dwtLvls = atoi(optarg);
            break;
        case 'D':
            device = atoi(optarg);
            break;
        case 'f':
            forward = 1;
            break;
        case 'r':
            forward = 0;
            break;
        case '9':
            dwt97 = 1;
            break;
        case '5':
            dwt97 = 0;
            break;
        case 'w':
            writeVisual = 1;
            break;
        case 'h':
            usage();
            return 0;
        case '?':
            return -1;
        default :
            usage();
            return -1;
        }
    }
    // Skip the parsed options; what remains are the file names.
    argc -= optind;
    argv += optind;
    if (argc == 0) { // at least one filename is expected
        printf("Please supply src file name\n");
        usage();
        return -1;
    }
    if (pixWidth <= 0 || pixHeight <=0) {
        printf("Wrong or missing dimensions\n");
        usage();
        return -1;
    }
    if (forward == 0) {
        writeVisual = 0; //do not write visual when RDWT
    }
    // device init
    int devCount;
    cudaSetDevice(0);
    cudaGetDeviceCount(&devCount);
    cudaCheckError("Get device count");
    if (devCount == 0) {
        printf("No CUDA enabled device\n");
        return -1;
    }
    if (device < 0 || device > devCount -1) {
        printf("Selected device %d is out of bound. Devices on your system are in range %d - %d\n",
               device, 0, devCount -1);
        return -1;
    }
    cudaDeviceProp devProp;
    cudaGetDeviceProperties(&devProp, device);
    cudaCheckError("Get device properties");
    printf("Using device %d: %s\n", device, devProp.name);
    cudaSetDevice(device);
    cudaCheckError("Set selected device");
    struct dwt *d;
    d = (struct dwt *)malloc(sizeof(struct dwt));
    if (d == NULL) {
        printf("Out of memory\n");
        return -1;
    }
    d->srcImg = NULL;
    d->pixWidth = pixWidth;
    d->pixHeight = pixHeight;
    d->components = compCount;
    d->dwtLvls  = dwtLvls;
    // file names
    // strdup() reserves room for the terminating NUL, which a buffer of
    // strlen(argv[0]) bytes filled by strcpy() does not have.
    d->srcFilename = strdup(argv[0]);
    d->outFilename = NULL;
    if (d->srcFilename != NULL) {
        if (argc == 1) { // only one filename supplied
            // sizeof(".dwt") counts the suffix and its terminating NUL
            size_t srcLen = strlen(d->srcFilename);
            d->outFilename = (char *)malloc(srcLen + sizeof(".dwt"));
            if (d->outFilename != NULL) {
                strcpy(d->outFilename, d->srcFilename);
                strcpy(d->outFilename + srcLen, ".dwt");
            }
        } else {
            d->outFilename = strdup(argv[1]);
        }
    }
    if (d->srcFilename == NULL || d->outFilename == NULL) {
        printf("Out of memory\n");
        free(d->outFilename);
        free(d->srcFilename);
        free(d);
        return -1;
    }
    //Input review
    printf("Source file:\t\t%s\n", d->srcFilename);
    printf(" Dimensions:\t\t%dx%d\n", pixWidth, pixHeight);
    printf(" Components count:\t%d\n", compCount);
    printf(" Bit depth:\t\t%d\n", bitDepth);
    printf(" DWT levels:\t\t%d\n", dwtLvls);
    printf(" Forward transform:\t%d\n", forward);
    printf(" 9/7 transform:\t\t%d\n", dwt97);
    //data sizes
    int inputSize = pixWidth*pixHeight*compCount; //<amount of data (in bytes) to process
    //load img source image
    cudaMallocHost((void **)&d->srcImg, inputSize);
    cudaCheckError("Alloc host memory");
    int status = 0;
    if (getImg(d->srcFilename, d->srcImg, inputSize) == -1) {
        status = -1;
    } else if (dwt97 == 1) {
        /* DWT: the direction is carried by `forward`, so forward and reverse
           runs use the same call. */
        processDWT<float>(d, forward, writeVisual);
    } else { // 5/3
        processDWT<int>(d, forward, writeVisual);
    }
    // Release everything on both the success and the failure path.
    cudaFreeHost(d->srcImg);
    cudaCheckError("Cuda free host");
    free(d->outFilename);
    free(d->srcFilename);
    free(d);
    return status;
 }
--- a/examples/dwt2d/run.sh
+++ b/examples/dwt2d/run.sh
@ -1,8 +0,0 @@
 # Runs the dwt2d binary on the 4x4 sample image: forward (-f) 5/3 (-5)
 # transform with 3 DWT levels (-l 3), using z.dwt as the output file name.
 ./dwt2d 4.bmp z.dwt -d 4x4 -f -5 -l 3
 # Other sample sizes; uncomment one line to run it instead.
 # ./dwt2d 8.bmp -d 8x8 -f -5 -l 3
 # ./dwt2d 16.bmp -d 16x16 -f -5 -l 3
 # ./dwt2d 64.bmp -d 64x64 -f -5 -l 3
 # ./dwt2d 192.bmp -d 192x192 -f -5 -l 3
 # ls
 # ./dwt2d rgb.bmp -d 1024x1024 -f -5 -l 3
--- a/examples/dwt2d/run_cpu.sh
+++ b/examples/dwt2d/run_cpu.sh
@ -1,7 +0,0 @@
 # Runs the dwt2d binary on the 4x4 sample image: reverse (-r) 5/3 (-5)
 # transform with 3 DWT levels (-l 3). The commented lines are alternative
 # invocations; uncomment one to run it instead.
 # ./dwt2d 192.bmp -d 192x192 -f -5 -l 3
 # ls
 # ./dwt2d rgb.bmp -d 1024x1024 -f -5 -l 3
 # ./dwt2d 16.bmp -d 16x16 -f -9 -l 3
 ./dwt2d 4.bmp  -d 4x4 -r -5 -l 3
 # ./dwt2d 4.bmp  -d 4x4 -r -9 -l 3
 # ./dwt2d 8.bmp  -d 8x8 -f -9 -l 3
--- a/examples/dwt2d/run_nvcc.sh
+++ b/examples/dwt2d/run_nvcc.sh
@ -1,14 +0,0 @@
 # Runs the nvcc_dwt2d binary on the 4x4 sample image: forward (-f) 5/3 (-5)
 # transform with 3 DWT levels (-l 3). The commented lines are alternative
 # invocations; uncomment one to run it instead.
 # ./nvcc_dwt2d 192.bmp -d 192x192 -f -5 -l 3
 # ls
 # ./nvcc_dwt2d rgb.bmp -d 1024x1024 -f -5 -l 3
 # ./nvcc_dwt2d 4.bmp -d 4x4 -f -9 -l 3
 ./nvcc_dwt2d 4.bmp  -d 4x4 -f -5 -l 3
 # ./nvcc_dwt2d 8.bmp -d 8x8 -f -9 -l 3
 # ./nvcc_dwt2d 16.bmp -d 16x16 -f -5 -l 3
 # ./nvcc_dwt2d 16.bmp -d 16x16 -r -5 -l 3
 # ./nvcc_dwt2d  16.bmp -d 16x16 -f -9 -l 3
 # ./nvcc_dwt2d 4.bmp  -d 4x4 -r -9 -l 3
 # ./nvcc_dwt2d 64.bmp -d 64x64 -f -5 -l 3
 # ./nvcc_dwt2d 192.bmp -d 192x192 -f -5 -l 3
 # ls
 # ./nvcc_dwt2d rgb.bmp -d 1024x1024 -f -5 -l 3
--- a/examples/dwt2d/test_compile_cpu.sh
+++ b/examples/dwt2d/test_compile_cpu.sh
@ -1,51 +0,0 @@
 #!/bin/bash
 # Builds the dwt2d example for the CPU backend:
 #   1. clang++ emits device and host bitcode for every .cu file (-save-temps)
 #   2. kernelTranslator / hostTranslator rewrite the bitcode
 #   3. llc turns each translated bitcode file into an object file
 #   4. g++ links the objects against the x86 runtime and the thread pool
 clang++ -I. -I/include -fno-strict-aliasing dwt_cuda/fdwt53.cu dwt_cuda/fdwt97.cu  dwt_cuda/common.cu  dwt_cuda/rdwt97.cu  dwt_cuda/rdwt53.cu components.cu dwt.cu main.cu -c  --cuda-path=/usr/local/cuda-10.1 --cuda-gpu-arch=sm_50 -I. -I/include -L/usr/local/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v
 export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH

 TRANSLATORS=../../build/compilation

 # Device-side bitcode, first group
 for unit in common components fdwt53 dwt; do
     $TRANSLATORS/kernelTranslator "$unit-cuda-nvptx64-nvidia-cuda-sm_50.bc" "$unit.bc"
 done

 # Host-side bitcode; main is the only unit whose output is named host.bc
 $TRANSLATORS/hostTranslator main-host-x86_64-unknown-linux-gnu.bc host.bc
 for unit in common components dwt fdwt53 fdwt97 rdwt53 rdwt97; do
     $TRANSLATORS/hostTranslator "$unit-host-x86_64-unknown-linux-gnu.bc" "${unit}_host.bc"
 done

 # Device-side bitcode, remaining kernels
 for unit in fdwt97 rdwt97 rdwt53; do
     $TRANSLATORS/kernelTranslator "$unit-cuda-nvptx64-nvidia-cuda-sm_50.bc" "$unit.bc"
 done

 # Translated bitcode -> position-independent object files
 for bitcode in common components fdwt53 dwt host common_host components_host fdwt53_host dwt_host fdwt97_host rdwt97_host rdwt53_host fdwt97 rdwt97 rdwt53; do
     llc --relocation-model=pic --filetype=obj "$bitcode.bc"
 done

 g++ -g -Wall -L../../build/runtime  -L../../build/runtime/threadPool -o dwt2d -fPIC -no-pie common.o components.o dwt.o fdwt53.o fdwt97.o rdwt97.o rdwt53.o host.o common_host.o components_host.o dwt_host.o fdwt53_host.o fdwt97_host.o rdwt97_host.o rdwt53_host.o -lc -lx86Runtime -lthreadPool -lpthread
--- a/examples/dwt2d/test_compile_nvcc.sh
+++ b/examples/dwt2d/test_compile_nvcc.sh
@ -1,9 +0,0 @@
 # Builds the reference nvcc_dwt2d binary: every .cu file is compiled to
 # <file>.cu.o with nvcc, then all objects are linked with g++ against cudart.
 NVCC=/usr/local/cuda/bin/nvcc
 for src in main.cu dwt.cu components.cu dwt_cuda/fdwt53.cu dwt_cuda/fdwt97.cu dwt_cuda/common.cu dwt_cuda/rdwt97.cu dwt_cuda/rdwt53.cu; do
     $NVCC -arch sm_50 -I. -I/include  -O2  --compiler-options -fno-strict-aliasing -c "$src" -o "$src.o"
 done
 g++ -fPIC -o nvcc_dwt2d main.cu.o dwt.cu.o components.cu.o dwt_cuda/fdwt53.cu.o dwt_cuda/fdwt97.cu.o dwt_cuda/common.cu.o dwt_cuda/rdwt97.cu.o dwt_cuda/rdwt53.cu.o -L/usr/local/cuda/lib64 -lcudart
--- a/examples/gauss/gaussian-cuda-nvptx64-nvidia-cuda-sm_61.ll
+++ b/examples/gauss/gaussian-cuda-nvptx64-nvidia-cuda-sm_61.ll
@ -1,396 +0,0 @@
 ; ModuleID = 'gaussian-cuda-nvptx64-nvidia-cuda-sm_61.bc'
 ; Device-side LLVM IR for gaussian.cu, targeting nvptx64-nvidia-cuda.
 source_filename = "gaussian.cu"
 target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"
 ; Placeholder types behind the threadIdx / blockIdx / blockDim globals below.
 %struct.__cuda_builtin_threadIdx_t = type { i8 }
 %struct.__cuda_builtin_blockIdx_t = type { i8 }
 %struct.__cuda_builtin_blockDim_t = type { i8 }
 ; Layout used by the cudaFuncGetAttributes stub's first parameter.
 %struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
 ; COMDAT groups for the linkonce_odr builtin-index accessor functions.
 $_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
 $_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
 $_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv = comdat any
 $_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv = comdat any
 $_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv = comdat any
 $_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv = comdat any
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
@blockDim = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockDim_t, align 1
 ; Weak device-side stub for cudaMalloc. It only spills both arguments to
 ; stack slots (optnone code) and returns the constant 999; nothing is
 ; allocated.
 ; NOTE(review): 999 presumably stands for an "unknown error" status - confirm
 ; against the CUDA headers this was compiled with.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
 entry:
  %p.addr = alloca i8**, align 8
  %s.addr = alloca i64, align 8
  store i8** %p, i8*** %p.addr, align 8
  store i64 %s, i64* %s.addr, align 8
  ret i32 999
 }
 ; Weak device-side stub for cudaFuncGetAttributes. The arguments are spilled
 ; to stack slots and the constant 999 is returned; the attribute struct is
 ; never written.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
 entry:
  %p.addr = alloca %struct.cudaFuncAttributes*, align 8
  %c.addr = alloca i8*, align 8
  store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
  store i8* %c, i8** %c.addr, align 8
  ret i32 999
 }
 ; Weak device-side stub for cudaDeviceGetAttribute. The three arguments are
 ; spilled to stack slots and the constant 999 is returned; *value is never
 ; written.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
 entry:
  %value.addr = alloca i32*, align 8
  %attr.addr = alloca i32, align 4
  %device.addr = alloca i32, align 4
  store i32* %value, i32** %value.addr, align 8
  store i32 %attr, i32* %attr.addr, align 4
  store i32 %device, i32* %device.addr, align 4
  ret i32 999
 }
 ; Weak device-side stub for cudaGetDevice. The pointer argument is spilled to
 ; a stack slot and the constant 999 is returned; *device is never written.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
 entry:
  %device.addr = alloca i32*, align 8
  store i32* %device, i32** %device.addr, align 8
  ret i32 999
 }
 ; Weak device-side stub for cudaOccupancyMaxActiveBlocksPerMultiprocessor.
 ; All four arguments are spilled to stack slots and the constant 999 is
 ; returned; *numBlocks is never written.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
 entry:
  %numBlocks.addr = alloca i32*, align 8
  %func.addr = alloca i8*, align 8
  %blockSize.addr = alloca i32, align 4
  %dynamicSmemSize.addr = alloca i64, align 8
  store i32* %numBlocks, i32** %numBlocks.addr, align 8
  store i8* %func, i8** %func.addr, align 8
  store i32 %blockSize, i32* %blockSize.addr, align 4
  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
  ret i32 999
 }
 ; Weak device-side stub for
 ; cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags. All five arguments
 ; are spilled to stack slots and the constant 999 is returned; *numBlocks is
 ; never written.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
 entry:
  %numBlocks.addr = alloca i32*, align 8
  %func.addr = alloca i8*, align 8
  %blockSize.addr = alloca i32, align 4
  %dynamicSmemSize.addr = alloca i64, align 8
  %flags.addr = alloca i32, align 4
  store i32* %numBlocks, i32** %numBlocks.addr, align 8
  store i8* %func, i8** %func.addr, align 8
  store i32 %blockSize, i32* %blockSize.addr, align 4
  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
  store i32 %flags, i32* %flags.addr, align 4
  ret i32 999
 }
 ; Fan1(float* m_cuda, float* a_cuda, int Size, int t)
 ; With gid = threadIdx.x + blockIdx.x * blockDim.x and row = gid + t + 1:
 ;   if (gid >= Size - 1 - t) return;              (unsigned comparison)
 ;   m_cuda[Size * row + t] = a_cuda[Size * row + t] / a_cuda[Size * t + t];
 ; Function Attrs: convergent noinline nounwind optnone
 define dso_local void @_Z4Fan1PfS_ii(float* %m_cuda, float* %a_cuda, i32 %Size, i32 %t) #0 {
 entry:
  ; Spill the arguments to stack slots (optnone code).
  %m_cuda.addr = alloca float*, align 8
  %a_cuda.addr = alloca float*, align 8
  %Size.addr = alloca i32, align 4
  %t.addr = alloca i32, align 4
  store float* %m_cuda, float** %m_cuda.addr, align 8
  store float* %a_cuda, float** %a_cuda.addr, align 8
  store i32 %Size, i32* %Size.addr, align 4
  store i32 %t, i32* %t.addr, align 4
  ; Bounds guard: gid >= Size - 1 - t leaves the kernel.
  %call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
  %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
  %call2 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
  %mul = mul i32 %call1, %call2
  %add = add i32 %call, %mul
  %0 = load i32, i32* %Size.addr, align 4
  %sub = sub nsw i32 %0, 1
  %1 = load i32, i32* %t.addr, align 4
  %sub3 = sub nsw i32 %sub, %1
  %cmp = icmp uge i32 %add, %sub3
  br i1 %cmp, label %if.then, label %if.end
 if.then:                                          ; preds = %entry
  br label %return
 if.end:                                           ; preds = %entry
  ; Numerator: a_cuda[Size * (gid + t + 1) + t]
  %2 = load float*, float** %a_cuda.addr, align 8
  %3 = load i32, i32* %Size.addr, align 4
  %call4 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
  %call5 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
  %mul6 = mul i32 %call4, %call5
  %call7 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
  %add8 = add i32 %mul6, %call7
  %4 = load i32, i32* %t.addr, align 4
  %add9 = add i32 %add8, %4
  %add10 = add i32 %add9, 1
  %mul11 = mul i32 %3, %add10
  %idx.ext = zext i32 %mul11 to i64
  %add.ptr = getelementptr inbounds float, float* %2, i64 %idx.ext
  %5 = load i32, i32* %t.addr, align 4
  %idx.ext12 = sext i32 %5 to i64
  %add.ptr13 = getelementptr inbounds float, float* %add.ptr, i64 %idx.ext12
  %6 = load float, float* %add.ptr13, align 4
  ; Denominator: a_cuda[Size * t + t]
  %7 = load float*, float** %a_cuda.addr, align 8
  %8 = load i32, i32* %Size.addr, align 4
  %9 = load i32, i32* %t.addr, align 4
  %mul14 = mul nsw i32 %8, %9
  %idx.ext15 = sext i32 %mul14 to i64
  %add.ptr16 = getelementptr inbounds float, float* %7, i64 %idx.ext15
  %10 = load i32, i32* %t.addr, align 4
  %idx.ext17 = sext i32 %10 to i64
  %add.ptr18 = getelementptr inbounds float, float* %add.ptr16, i64 %idx.ext17
  %11 = load float, float* %add.ptr18, align 4
  %div = fdiv float %6, %11
  ; Destination: m_cuda[Size * (gid + t + 1) + t]
  %12 = load float*, float** %m_cuda.addr, align 8
  %13 = load i32, i32* %Size.addr, align 4
  %call19 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
  %call20 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
  %mul21 = mul i32 %call19, %call20
  %call22 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
  %add23 = add i32 %mul21, %call22
  %14 = load i32, i32* %t.addr, align 4
  %add24 = add i32 %add23, %14
  %add25 = add i32 %add24, 1
  %mul26 = mul i32 %13, %add25
  %idx.ext27 = zext i32 %mul26 to i64
  %add.ptr28 = getelementptr inbounds float, float* %12, i64 %idx.ext27
  %15 = load i32, i32* %t.addr, align 4
  %idx.ext29 = sext i32 %15 to i64
  %add.ptr30 = getelementptr inbounds float, float* %add.ptr28, i64 %idx.ext29
  store float %div, float* %add.ptr30, align 4
  br label %return
 return:                                           ; preds = %if.end, %if.then
  ret void
 }
 ; Accessor behind threadIdx.x: returns the value of the
 ; llvm.nvvm.read.ptx.sreg.tid.x intrinsic.
 ; Function Attrs: alwaysinline convergent nounwind
 define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
 entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  ret i32 %0
 }
 ; Accessor behind blockIdx.x: returns the value of the
 ; llvm.nvvm.read.ptx.sreg.ctaid.x intrinsic.
 ; Function Attrs: alwaysinline convergent nounwind
 define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
 entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
  ret i32 %0
 }
 ; Accessor behind blockDim.x: returns the value of the
 ; llvm.nvvm.read.ptx.sreg.ntid.x intrinsic.
 ; Function Attrs: alwaysinline convergent nounwind
 define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #1 comdat align 2 {
 entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
  ret i32 %0
 }
 ; Fan2(float* m_cuda, float* a_cuda, float* b_cuda, int Size, int j1, int t)
 ; With xidx = blockIdx.x * blockDim.x + threadIdx.x and
 ;      yidx = blockIdx.y * blockDim.y + threadIdx.y:
 ;   if (xidx >= Size - 1 - t) return;             (unsigned comparison)
 ;   if (yidx >= Size - t)     return;             (unsigned comparison)
 ;   a_cuda[Size*(xidx+1+t) + (yidx+t)] -=
 ;       m_cuda[Size*(xidx+1+t) + t] * a_cuda[Size*t + (yidx+t)];
 ;   if (yidx == 0)
 ;     b_cuda[xidx+1+t] -= m_cuda[Size*(xidx+1+t) + (yidx+t)] * b_cuda[t];
 ; The j1 argument is stored to its stack slot but never read afterwards.
 ; Function Attrs: convergent noinline nounwind optnone
 define dso_local void @_Z4Fan2PfS_S_iii(float* %m_cuda, float* %a_cuda, float* %b_cuda, i32 %Size, i32 %j1, i32 %t) #0 {
 entry:
  ; Stack slots for the arguments and the two locals xidx / yidx.
  %m_cuda.addr = alloca float*, align 8
  %a_cuda.addr = alloca float*, align 8
  %b_cuda.addr = alloca float*, align 8
  %Size.addr = alloca i32, align 4
  %j1.addr = alloca i32, align 4
  %t.addr = alloca i32, align 4
  %xidx = alloca i32, align 4
  %yidx = alloca i32, align 4
  store float* %m_cuda, float** %m_cuda.addr, align 8
  store float* %a_cuda, float** %a_cuda.addr, align 8
  store float* %b_cuda, float** %b_cuda.addr, align 8
  store i32 %Size, i32* %Size.addr, align 4
  store i32 %j1, i32* %j1.addr, align 4
  store i32 %t, i32* %t.addr, align 4
  ; Guard on x: threadIdx.x + blockIdx.x * blockDim.x >= Size - 1 - t
  %call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
  %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
  %call2 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
  %mul = mul i32 %call1, %call2
  %add = add i32 %call, %mul
  %0 = load i32, i32* %Size.addr, align 4
  %sub = sub nsw i32 %0, 1
  %1 = load i32, i32* %t.addr, align 4
  %sub3 = sub nsw i32 %sub, %1
  %cmp = icmp uge i32 %add, %sub3
  br i1 %cmp, label %if.then, label %if.end
 if.then:                                          ; preds = %entry
  br label %if.end58
 if.end:                                           ; preds = %entry
  ; Guard on y: threadIdx.y + blockIdx.y * blockDim.y >= Size - t
  %call4 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #3
  %call5 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #3
  %call6 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #3
  %mul7 = mul i32 %call5, %call6
  %add8 = add i32 %call4, %mul7
  %2 = load i32, i32* %Size.addr, align 4
  %3 = load i32, i32* %t.addr, align 4
  %sub9 = sub nsw i32 %2, %3
  %cmp10 = icmp uge i32 %add8, %sub9
  br i1 %cmp10, label %if.then11, label %if.end12
 if.then11:                                        ; preds = %if.end
  br label %if.end58
 if.end12:                                         ; preds = %if.end
  ; xidx = blockIdx.x * blockDim.x + threadIdx.x
  %call13 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
  %call14 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
  %mul15 = mul i32 %call13, %call14
  %call16 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
  %add17 = add i32 %mul15, %call16
  store i32 %add17, i32* %xidx, align 4
  ; yidx = blockIdx.y * blockDim.y + threadIdx.y
  %call18 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #3
  %call19 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #3
  %mul20 = mul i32 %call18, %call19
  %call21 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #3
  %add22 = add i32 %mul20, %call21
  store i32 %add22, i32* %yidx, align 4
  ; %9 = m_cuda[Size * (xidx + 1 + t) + t]
  %4 = load float*, float** %m_cuda.addr, align 8
  %5 = load i32, i32* %Size.addr, align 4
  %6 = load i32, i32* %xidx, align 4
  %add23 = add nsw i32 %6, 1
  %7 = load i32, i32* %t.addr, align 4
  %add24 = add nsw i32 %add23, %7
  %mul25 = mul nsw i32 %5, %add24
  %8 = load i32, i32* %t.addr, align 4
  %add26 = add nsw i32 %mul25, %8
  %idxprom = sext i32 %add26 to i64
  %arrayidx = getelementptr inbounds float, float* %4, i64 %idxprom
  %9 = load float, float* %arrayidx, align 4
  ; %15 = a_cuda[Size * t + (yidx + t)]
  %10 = load float*, float** %a_cuda.addr, align 8
  %11 = load i32, i32* %Size.addr, align 4
  %12 = load i32, i32* %t.addr, align 4
  %mul27 = mul nsw i32 %11, %12
  %13 = load i32, i32* %yidx, align 4
  %14 = load i32, i32* %t.addr, align 4
  %add28 = add nsw i32 %13, %14
  %add29 = add nsw i32 %mul27, %add28
  %idxprom30 = sext i32 %add29 to i64
  %arrayidx31 = getelementptr inbounds float, float* %10, i64 %idxprom30
  %15 = load float, float* %arrayidx31, align 4
  %mul32 = fmul contract float %9, %15
  ; a_cuda[Size * (xidx + 1 + t) + (yidx + t)] -= %mul32
  %16 = load float*, float** %a_cuda.addr, align 8
  %17 = load i32, i32* %Size.addr, align 4
  %18 = load i32, i32* %xidx, align 4
  %add33 = add nsw i32 %18, 1
  %19 = load i32, i32* %t.addr, align 4
  %add34 = add nsw i32 %add33, %19
  %mul35 = mul nsw i32 %17, %add34
  %20 = load i32, i32* %yidx, align 4
  %21 = load i32, i32* %t.addr, align 4
  %add36 = add nsw i32 %20, %21
  %add37 = add nsw i32 %mul35, %add36
  %idxprom38 = sext i32 %add37 to i64
  %arrayidx39 = getelementptr inbounds float, float* %16, i64 %idxprom38
  %22 = load float, float* %arrayidx39, align 4
  %sub40 = fsub contract float %22, %mul32
  store float %sub40, float* %arrayidx39, align 4
  ; Only threads with yidx == 0 go on to update b_cuda.
  %23 = load i32, i32* %yidx, align 4
  %cmp41 = icmp eq i32 %23, 0
  br i1 %cmp41, label %if.then42, label %if.end58
 if.then42:                                        ; preds = %if.end12
  ; %30 = m_cuda[Size * (xidx + 1 + t) + (yidx + t)]
  %24 = load float*, float** %m_cuda.addr, align 8
  %25 = load i32, i32* %Size.addr, align 4
  %26 = load i32, i32* %xidx, align 4
  %add43 = add nsw i32 %26, 1
  %27 = load i32, i32* %t.addr, align 4
  %add44 = add nsw i32 %add43, %27
  %mul45 = mul nsw i32 %25, %add44
  %28 = load i32, i32* %yidx, align 4
  %29 = load i32, i32* %t.addr, align 4
  %add46 = add nsw i32 %28, %29
  %add47 = add nsw i32 %mul45, %add46
  %idxprom48 = sext i32 %add47 to i64
  %arrayidx49 = getelementptr inbounds float, float* %24, i64 %idxprom48
  %30 = load float, float* %arrayidx49, align 4
  ; %33 = b_cuda[t]
  %31 = load float*, float** %b_cuda.addr, align 8
  %32 = load i32, i32* %t.addr, align 4
  %idxprom50 = sext i32 %32 to i64
  %arrayidx51 = getelementptr inbounds float, float* %31, i64 %idxprom50
  %33 = load float, float* %arrayidx51, align 4
  %mul52 = fmul contract float %30, %33
  ; b_cuda[xidx + 1 + t] -= %mul52
  %34 = load float*, float** %b_cuda.addr, align 8
  %35 = load i32, i32* %xidx, align 4
  %add53 = add nsw i32 %35, 1
  %36 = load i32, i32* %t.addr, align 4
  %add54 = add nsw i32 %add53, %36
  %idxprom55 = sext i32 %add54 to i64
  %arrayidx56 = getelementptr inbounds float, float* %34, i64 %idxprom55
  %37 = load float, float* %arrayidx56, align 4
  %sub57 = fsub contract float %37, %mul52
  store float %sub57, float* %arrayidx56, align 4
  br label %if.end58
 if.end58:                                         ; preds = %if.then, %if.then11, %if.then42, %if.end12
  ret void
 }
 ; Function Attrs: alwaysinline convergent nounwind
 define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
 entry:
  ; Accessor __cuda_builtin_threadIdx_t::__fetch_builtin_y(): returns the value
  ; of the PTX special register %tid.y read through the NVVM intrinsic.
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
  ret i32 %0
 }
 ; Function Attrs: alwaysinline convergent nounwind
 define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
 entry:
  ; Accessor __cuda_builtin_blockIdx_t::__fetch_builtin_y(): returns the value
  ; of the PTX special register %ctaid.y read through the NVVM intrinsic.
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
  ret i32 %0
 }
 ; Function Attrs: alwaysinline convergent nounwind
 define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #1 comdat align 2 {
 entry:
  ; Accessor __cuda_builtin_blockDim_t::__fetch_builtin_y(): returns the value
  ; of the PTX special register %ntid.y read through the NVVM intrinsic.
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
  ret i32 %0
 }
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #2
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #2
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y() #2
 attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #2 = { nounwind readnone }
 attributes #3 = { convergent nounwind }
 !llvm.module.flags = !{!0, !1, !2}
 !nvvm.annotations = !{!3, !4, !5, !6, !5, !7, !7, !7, !7, !8, !8, !7}
 !llvm.ident = !{!9}
 !nvvmir.version = !{!10}
 !0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
 !1 = !{i32 1, !"wchar_size", i32 4}
 !2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
 !3 = !{void (float*, float*, i32, i32)* @_Z4Fan1PfS_ii, !"kernel", i32 1}
 !4 = !{void (float*, float*, float*, i32, i32, i32)* @_Z4Fan2PfS_S_iii, !"kernel", i32 1}
 !5 = !{null, !"align", i32 8}
 !6 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
 !7 = !{null, !"align", i32 16}
 !8 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
 !9 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
 !10 = !{i32 1, i32 4}
--- a/examples/gauss/gaussian-host-x86_64-unknown-linux-gnu.ll
+++ b/examples/gauss/gaussian-host-x86_64-unknown-linux-gnu.ll
--- a/examples/gauss/gaussian.cu
+++ b/examples/gauss/gaussian.cu
@ -1,522 +0,0 @@
 /*-----------------------------------------------------------
 ** gaussian.cu -- The program is to solve a linear system Ax = b
 **   by using Gaussian Elimination. The algorithm on page 101
 **   ("Foundations of Parallel Programming") is used.
 **   The sequential version is gaussian.c.  This parallel
 **   implementation converts three independent for() loops
 **   into three Fans.  Use the data file ge_3.dat to verify
 **   the correctness of the output.
 **
 ** Written by Andreas Kura, 02/15/95
 ** Modified by Chong-wei Xu, 04/20/95
 ** Modified by Chris Gregg for CUDA, 07/20/2009
 **-----------------------------------------------------------
 */
 #include "cuda_runtime.h"
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/time.h>
 #ifdef TIMING
 #include "timing.h"
 #endif
 // 1D work-group size. The most specific RD_WG_SIZE_* macro that is defined
 // wins; 512 is used when none is. ForwardSub() uses this value as the block
 // size of the Fan1 launch.
 #ifdef RD_WG_SIZE_0_0
 #define MAXBLOCKSIZE RD_WG_SIZE_0_0
 #elif defined(RD_WG_SIZE_0)
 #define MAXBLOCKSIZE RD_WG_SIZE_0
 #elif defined(RD_WG_SIZE)
 #define MAXBLOCKSIZE RD_WG_SIZE
 #else
 #define MAXBLOCKSIZE 512
 #endif
 // 2D defines. Go from specific to general. ForwardSub() uses this value for
 // both dimensions of the Fan2 block; the default is 1 (a 1x1 block).
 #ifdef RD_WG_SIZE_1_0
 #define BLOCK_SIZE_XY RD_WG_SIZE_1_0
 #elif defined(RD_WG_SIZE_1)
 #define BLOCK_SIZE_XY RD_WG_SIZE_1
 #elif defined(RD_WG_SIZE)
 #define BLOCK_SIZE_XY RD_WG_SIZE
 #else
 #define BLOCK_SIZE_XY 1
 #endif
 // Optional instrumentation, compiled only with -DTIMING. In the code visible
 // here only tv, tv_kernel_start and kernel_time are referenced (ForwardSub()
 // and main()); the remaining variables appear to be unused placeholders.
 #ifdef TIMING
 struct timeval tv;
 struct timeval tv_total_start, tv_total_end;
 struct timeval tv_h2d_start, tv_h2d_end;
 struct timeval tv_d2h_start, tv_d2h_end;
 struct timeval tv_kernel_start, tv_kernel_end;
 struct timeval tv_mem_alloc_start, tv_mem_alloc_end;
 struct timeval tv_close_start, tv_close_end;
 float init_time = 0, mem_alloc_time = 0, h2d_time = 0, kernel_time = 0,
      d2h_time = 0, close_time = 0, total_time = 0;
 #endif
 // Dimension of the linear system; set from the -s argument or read from the
 // input file.
 int Size;
 // Host buffers: a holds Size*Size matrix entries (row-major), b holds the
 // Size-element right-hand side, finalVec is allocated by BackSub() for the
 // solution.
 float *a, *b, *finalVec;
 // Host copy of the Size*Size multiplier matrix filled in by the Fan1 kernel.
 float *m;
 // Input data file; opened in InitProblemOnce() and read by InitMat()/InitAry().
 FILE *fp;
 void InitProblemOnce(char *filename);
 void InitPerRun();
 void ForwardSub();
 void BackSub();
 __global__ void Fan1(float *m, float *a, int Size, int t);
 __global__ void Fan2(float *m, float *a, float *b, int Size, int j1, int t);
 void InitMat(float *ary, int nrow, int ncol);
 void InitAry(float *ary, int ary_size);
 void PrintMat(float *ary, int nrow, int ncolumn);
 void PrintAry(float *ary, int ary_size);
 void PrintDeviceProperties();
 void checkCUDAError(const char *msg);
 // Wall-clock time of the kernel loop in ForwardSub(), in microseconds.
 unsigned int totalKernelTime = 0;
 /*------------------------------------------------------
 ** create_matrix() -- Fill the size x size row-major
 ** matrix m so that m[i][j] = 10 * exp(lamda * |i - j|)
 ** with lamda = -0.01 (the value depends only on the
 ** distance from the diagonal).
 ** Original version: Ke Wang 2013/08/12 11:51:06
 **
 ** m    : caller-allocated buffer of size*size floats
 ** size : matrix dimension; nothing is written when
 **        size <= 0 or m is NULL
 **------------------------------------------------------
 */
 void create_matrix(float *m, int size) {
  // Guard against invalid input; the coefficient table below would otherwise
  // get a non-positive length.
  if (m == NULL || size <= 0)
    return;
  const float lamda = -0.01;
  // The table of 2*size-1 coefficients lives on the heap: a stack array of
  // run-time length is not standard C++ and can overflow the stack for large
  // sizes.
  const size_t coe_len = 2 * (size_t)size - 1;
  float *coe = (float *)malloc(coe_len * sizeof(float));
  if (coe == NULL) {
    fprintf(stderr, "create_matrix: out of memory for %d coefficients\n",
            2 * size - 1);
    exit(EXIT_FAILURE);
  }
  // coe[size-1 +/- i] holds the value for diagonal offset i.
  for (int i = 0; i < size; i++) {
    const float coe_i = 10 * exp(lamda * i);
    coe[size - 1 + i] = coe_i;
    coe[size - 1 - i] = coe_i;
  }
  for (int i = 0; i < size; i++) {
    for (int j = 0; j < size; j++) {
      m[i * size + j] = coe[size - 1 - i + j];
    }
  }
  free(coe);
 }
 /*------------------------------------------------------
 ** main() -- Parse the command line, build or load the
 ** linear system, run forward elimination on the GPU and
 ** back substitution on the host, then print timings.
 **
 ** Options:
 **   -s size      create the matrix and right-hand side
 **                inside the program
 **   -f filename  read the system from a data file
 **   -q           quiet: do not print matrices/vectors
 **------------------------------------------------------
 */
 int main(int argc, char *argv[]) {
  printf("WG size of kernel 1 = %d, WG size of kernel 2= %d X %d\n",
         MAXBLOCKSIZE, BLOCK_SIZE_XY, BLOCK_SIZE_XY);
  int verbose = 1;
  int i, j;
  char flag;
  if (argc < 2) {
    printf("Usage: gaussian -f filename / -s size [-q]\n\n");
    printf("-q (quiet) suppresses printing the matrix and result values.\n");
    printf("-f (filename) path of input file\n");
    printf(
        "-s (size) size of matrix. Create matrix and rhs in this program \n");
    printf(
        "The first line of the file contains the dimension of the matrix, n.");
    printf("The second line of the file is a newline.\n");
    printf("The next n lines contain n tab separated values for the matrix.");
    printf("The next line of the file is a newline.\n");
    printf("The next line of the file is a 1xn vector with tab separated "
           "values.\n");
    printf("The next line of the file is a newline. (optional)\n");
    printf("The final line of the file is the pre-computed solution. "
           "(optional)\n");
    printf("Example: matrix4.txt:\n");
    printf("4\n");
    printf("\n");
    printf("-0.6	-0.5	0.7	0.3\n");
    printf("-0.3	-0.9	0.3	0.7\n");
    printf("-0.4	-0.5	-0.3	-0.8\n");
    printf("0.0	-0.1	0.2	0.9\n");
    printf("\n");
    printf("-0.85	-0.68	0.24	-0.53\n");
    printf("\n");
    printf("0.7	0.0	-0.4	-0.5\n");
    exit(0);
  }
  cudaSetDevice(0);
  PrintDeviceProperties();
  for (i = 1; i < argc; i++) {
    if (argv[i][0] == '-') { // flag
      flag = argv[i][1];
      switch (flag) {
      case 's': // size: create the system inside the program
        // The option value is the next argument; make sure it exists before
        // reading it.
        if (i + 1 >= argc) {
          fprintf(stderr, "Option -s requires a size argument\n");
          exit(EXIT_FAILURE);
        }
        i++;
        Size = atoi(argv[i]);
        if (Size <= 0) {
          fprintf(stderr, "Invalid matrix size: %s\n", argv[i]);
          exit(EXIT_FAILURE);
        }
        printf("Create matrix internally in parse, size = %d \n", Size);
        a = (float *)malloc((size_t)Size * Size * sizeof(float));
        b = (float *)malloc((size_t)Size * sizeof(float));
        m = (float *)malloc((size_t)Size * Size * sizeof(float));
        if (a == NULL || b == NULL || m == NULL) {
          fprintf(stderr, "Out of memory for a system of size %d\n", Size);
          exit(EXIT_FAILURE);
        }
        create_matrix(a, Size);
        for (j = 0; j < Size; j++)
          b[j] = 1.0;
        break;
      case 'f': // filename: read the system from a data file
        if (i + 1 >= argc) {
          fprintf(stderr, "Option -f requires a filename argument\n");
          exit(EXIT_FAILURE);
        }
        i++;
        printf("Read file from %s \n", argv[i]);
        InitProblemOnce(argv[i]);
        break;
      case 'q': // quiet
        // Quiet mode has to switch printing off; it used to assign 1, which
        // left the output unchanged.
        verbose = 0;
        break;
      }
    }
  }
  // Without -s or -f there is nothing to solve.
  if (a == NULL || b == NULL || m == NULL) {
    fprintf(stderr, "No linear system loaded: pass -f filename or -s size\n");
    exit(EXIT_FAILURE);
  }
  InitPerRun();
  // begin timing
  struct timeval time_start;
  gettimeofday(&time_start, NULL);
  // run kernels
  ForwardSub();
  // end timing
  struct timeval time_end;
  gettimeofday(&time_end, NULL);
  unsigned int time_total = (time_end.tv_sec * 1000000 + time_end.tv_usec) -
                            (time_start.tv_sec * 1000000 + time_start.tv_usec);
  if (verbose) {
    printf("Matrix m is: \n");
    PrintMat(m, Size, Size);
    printf("Matrix a is: \n");
    PrintMat(a, Size, Size);
    printf("Array b is: \n");
    PrintAry(b, Size);
  }
  BackSub();
  if (verbose) {
    printf("The final solution is: \n");
    PrintAry(finalVec, Size);
  }
  printf("\nTime total (including memory transfers)\t%f sec\n",
         time_total * 1e-6);
  printf("Time for CUDA kernels:\t%f sec\n", totalKernelTime * 1e-6);
  free(m);
  free(a);
  free(b);
  // finalVec is allocated by BackSub() and was never released.
  free(finalVec);
 #ifdef TIMING
  printf("Exec: %f\n", kernel_time);
 #endif
  return 0;
 }
 /*------------------------------------------------------
 ** PrintDeviceProperties() -- Print the number of CUDA
 ** devices and, for each one, a summary of the fields of
 ** its cudaDeviceProp record. When the properties of a
 ** device cannot be queried, the CUDA error string is
 ** printed instead.
 **-----------------------------------------------------
 */
 void PrintDeviceProperties() {
  cudaDeviceProp deviceProp;
  int nDevCount = 0;
  cudaGetDeviceCount(&nDevCount);
  printf("Total Device found: %d", nDevCount);
  for (int nDeviceIdx = 0; nDeviceIdx < nDevCount; ++nDeviceIdx) {
    memset(&deviceProp, 0, sizeof(deviceProp));
    if (cudaSuccess == cudaGetDeviceProperties(&deviceProp, nDeviceIdx)) {
      printf("\nDevice Name \t\t - %s ", deviceProp.name);
      printf("\n**************************************");
      // totalGlobalMem and sharedMemPerBlock are size_t, so they are printed
      // with %zu like the other size_t fields below (they used %lu before).
      printf("\nTotal Global Memory\t\t\t - %zu KB",
             deviceProp.totalGlobalMem / 1024);
      printf("\nShared memory available per block \t - %zu KB",
             deviceProp.sharedMemPerBlock / 1024);
      printf("\nNumber of registers per thread block \t - %d",
             deviceProp.regsPerBlock);
      printf("\nWarp size in threads \t\t\t - %d", deviceProp.warpSize);
      printf("\nMemory Pitch \t\t\t\t - %zu bytes", deviceProp.memPitch);
      printf("\nMaximum threads per block \t\t - %d",
             deviceProp.maxThreadsPerBlock);
      printf("\nMaximum Thread Dimension (block) \t - %d %d %d",
             deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1],
             deviceProp.maxThreadsDim[2]);
      printf("\nMaximum Thread Dimension (grid) \t - %d %d %d",
             deviceProp.maxGridSize[0], deviceProp.maxGridSize[1],
             deviceProp.maxGridSize[2]);
      printf("\nTotal constant memory \t\t\t - %zu bytes",
             deviceProp.totalConstMem);
      printf("\nCUDA ver \t\t\t\t - %d.%d", deviceProp.major, deviceProp.minor);
      printf("\nClock rate \t\t\t\t - %d KHz", deviceProp.clockRate);
      printf("\nTexture Alignment \t\t\t - %zu bytes",
             deviceProp.textureAlignment);
      printf("\nDevice Overlap \t\t\t\t - %s",
             deviceProp.deviceOverlap ? "Allowed" : "Not Allowed");
      printf("\nNumber of Multi processors \t\t - %d\n\n",
             deviceProp.multiProcessorCount);
    } else
      // cudaGetLastError() also clears the recorded error, so it does not leak
      // into later checkCUDAError() calls.
      printf("\n%s", cudaGetErrorString(cudaGetLastError()));
  }
 }
 /*------------------------------------------------------
 ** InitProblemOnce -- Initialize all of matrices and
 ** vectors by opening a data file specified by the user.
 **
 ** The file is read in this order: the dimension Size,
 ** Size*Size matrix values into a, then Size values into
 ** b. The multiplier matrix m is allocated but not
 ** initialized here (see InitPerRun()).
 **
 ** The program exits with an error message when the file
 ** cannot be opened, the dimension is missing or not
 ** positive, or memory cannot be allocated.
 **------------------------------------------------------
 */
 void InitProblemOnce(char *filename) {
  printf("The file name is: %s\n", filename);
  fp = fopen(filename, "r");
  if (fp == NULL) {
    fprintf(stderr, "Cannot open input file %s\n", filename);
    exit(EXIT_FAILURE);
  }
  if (fscanf(fp, "%d", &Size) != 1 || Size <= 0) {
    fprintf(stderr, "Invalid matrix dimension in %s\n", filename);
    fclose(fp);
    exit(EXIT_FAILURE);
  }
  a = (float *)malloc((size_t)Size * Size * sizeof(float));
  if (a == NULL) {
    fprintf(stderr, "Out of memory for matrix a (size %d)\n", Size);
    exit(EXIT_FAILURE);
  }
  InitMat(a, Size, Size);
  printf("The input matrix a is:\n");
  PrintMat(a, Size, Size);
  b = (float *)malloc((size_t)Size * sizeof(float));
  if (b == NULL) {
    fprintf(stderr, "Out of memory for vector b (size %d)\n", Size);
    exit(EXIT_FAILURE);
  }
  InitAry(b, Size);
  printf("The input array b is:\n");
  PrintAry(b, Size);
  m = (float *)malloc((size_t)Size * Size * sizeof(float));
  if (m == NULL) {
    fprintf(stderr, "Out of memory for matrix m (size %d)\n", Size);
    exit(EXIT_FAILURE);
  }
  // All input has been consumed; the file used to be left open.
  fclose(fp);
  fp = NULL;
 }
 /*------------------------------------------------------
 ** InitPerRun() -- Reset every entry of the Size*Size
 ** multiplier matrix m to zero.
 **------------------------------------------------------
 */
 void InitPerRun() {
  float *cell = m;
  // Walk the buffer once, clearing Size*Size consecutive entries.
  for (int remaining = Size * Size; remaining > 0; --remaining)
    *cell++ = 0.0;
 }
 /*-------------------------------------------------------
 ** Fan1() -- Calculate multiplier matrix
 ** For elimination step t, the thread with global x index
 ** g handles row (g + t + 1) and stores
 **   m[row][t] = a[row][t] / a[t][t]
 ** Threads whose index is >= Size - 1 - t have no row to
 ** process and exit immediately. Only the x dimension of
 ** the launch is used. Matrices are row-major with row
 ** length Size.
 **-------------------------------------------------------
 */
 __global__ void Fan1(float *m_cuda, float *a_cuda, int Size, int t) {
  const unsigned int gid = threadIdx.x + blockIdx.x * blockDim.x;
  // Rows t+1 .. Size-1 remain, i.e. Size-1-t rows in total.
  if (gid >= Size - 1 - t)
    return;
  const unsigned int row = gid + t + 1;
  float *multiplier = m_cuda + Size * row + t;
  const float *below = a_cuda + Size * row + t;
  const float *pivot = a_cuda + Size * t + t;
  *multiplier = *below / *pivot;
 }
 /*-------------------------------------------------------
 ** Fan2() -- Modify the matrix A into LUD
 ** For elimination step t, the thread at global position
 ** (x, y) updates
 **   a[x+1+t][y+t] -= m[x+1+t][t] * a[t][y+t]
 ** and the threads with y == 0 additionally update
 **   b[x+1+t] -= m[x+1+t][t] * b[t]
 ** Threads with x >= Size-1-t or y >= Size-t exit without
 ** doing any work. The parameter j1 is not used.
 **-------------------------------------------------------
 */
 __global__ void Fan2(float *m_cuda, float *a_cuda, float *b_cuda, int Size,
                     int j1, int t) {
  const unsigned int gx = threadIdx.x + blockIdx.x * blockDim.x;
  const unsigned int gy = threadIdx.y + blockIdx.y * blockDim.y;
  if (gx >= Size - 1 - t || gy >= Size - t)
    return;
  const int xidx = gx;
  const int yidx = gy;
  const int row = xidx + 1 + t; // row being eliminated
  const int col = yidx + t;     // column being updated
  // a_cuda[row][col] -= m_cuda[row][t] * a_cuda[t][col];
  a_cuda[Size * row + col] -= m_cuda[Size * row + t] * a_cuda[Size * t + col];
  if (yidx != 0)
    return;
  // One thread per row (the y == 0 one, so col == t) also updates the
  // right-hand side.
  b_cuda[row] -= m_cuda[Size * row + col] * b_cuda[t];
 }
 /*------------------------------------------------------
 ** ForwardSub() -- Forward substitution of Gaussian
 ** elimination.
 **
 ** Copies m, a and b to the device, runs Fan1 (1D launch,
 ** MAXBLOCKSIZE threads per block) and Fan2 (2D launch,
 ** BLOCK_SIZE_XY x BLOCK_SIZE_XY threads per block) once
 ** per elimination step t = 0 .. Size-2, then copies the
 ** three buffers back to the host and releases the device
 ** memory. The wall-clock time of the kernel loop is
 ** stored in totalKernelTime (microseconds).
 **
 ** Every CUDA call is followed by checkCUDAError(), which
 ** terminates the program on failure.
 **------------------------------------------------------
 */
 void ForwardSub() {
  int t;
  float *m_cuda, *a_cuda, *b_cuda;
  // Compute the sizes in size_t so that Size * Size cannot overflow int.
  const size_t matBytes = (size_t)Size * Size * sizeof(float);
  const size_t vecBytes = (size_t)Size * sizeof(float);
  // allocate memory on GPU
  cudaMalloc((void **)&m_cuda, matBytes);
  checkCUDAError("cudaMalloc m_cuda");
  cudaMalloc((void **)&a_cuda, matBytes);
  checkCUDAError("cudaMalloc a_cuda");
  cudaMalloc((void **)&b_cuda, vecBytes);
  checkCUDAError("cudaMalloc b_cuda");
  // copy memory to GPU
  cudaMemcpy(m_cuda, m, matBytes, cudaMemcpyHostToDevice);
  checkCUDAError("cudaMemcpy m to device");
  cudaMemcpy(a_cuda, a, matBytes, cudaMemcpyHostToDevice);
  checkCUDAError("cudaMemcpy a to device");
  cudaMemcpy(b_cuda, b, vecBytes, cudaMemcpyHostToDevice);
  checkCUDAError("cudaMemcpy b to device");
  // 1D launch configuration for Fan1: ceil(Size / block_size) blocks.
  int block_size, grid_size;
  block_size = MAXBLOCKSIZE;
  grid_size = (Size + block_size - 1) / block_size;
  printf("1d grid size: %d\n", grid_size);
  dim3 dimBlock(block_size);
  dim3 dimGrid(grid_size);
  // 2D launch configuration for Fan2: ceil(Size / blockSize2d) blocks in each
  // dimension.
  int blockSize2d, gridSize2d;
  blockSize2d = BLOCK_SIZE_XY;
  gridSize2d = (Size + blockSize2d - 1) / blockSize2d;
  dim3 dimBlockXY(blockSize2d, blockSize2d);
  printf("BlockXY: %d \n", blockSize2d);
  dim3 dimGridXY(gridSize2d, gridSize2d);
 #ifdef TIMING
  gettimeofday(&tv_kernel_start, NULL);
 #endif
  printf("first grid size: %d second: %d\n", grid_size, gridSize2d);
  // begin timing kernels
  struct timeval time_start;
  gettimeofday(&time_start, NULL);
  for (t = 0; t < (Size - 1); t++) {
    Fan1<<<dimGrid, dimBlock>>>(m_cuda, a_cuda, Size, t);
    cudaDeviceSynchronize();
    checkCUDAError("Fan1");
    Fan2<<<dimGridXY, dimBlockXY>>>(m_cuda, a_cuda, b_cuda, Size, Size - t, t);
    cudaDeviceSynchronize();
    checkCUDAError("Fan2");
  }
  // end timing kernels
  struct timeval time_end;
  gettimeofday(&time_end, NULL);
  totalKernelTime = (time_end.tv_sec * 1000000 + time_end.tv_usec) -
                    (time_start.tv_sec * 1000000 + time_start.tv_usec);
 #ifdef TIMING
  tvsub(&time_end, &tv_kernel_start, &tv);
  kernel_time += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
 #endif
  // copy memory back to CPU
  cudaMemcpy(m, m_cuda, matBytes, cudaMemcpyDeviceToHost);
  checkCUDAError("cudaMemcpy m to host");
  cudaMemcpy(a, a_cuda, matBytes, cudaMemcpyDeviceToHost);
  checkCUDAError("cudaMemcpy a to host");
  cudaMemcpy(b, b_cuda, vecBytes, cudaMemcpyDeviceToHost);
  checkCUDAError("cudaMemcpy b to host");
  cudaFree(m_cuda);
  cudaFree(a_cuda);
  cudaFree(b_cuda);
 }
 /*------------------------------------------------------
 ** BackSub() -- Backward substitution
 ** Allocates finalVec (Size floats) and solves the system
 ** held in a and b from the last row up to the first:
 **   finalVec[row] = (b[row] - sum over col > row of
 **                    a[row][col] * finalVec[col])
 **                   / a[row][row]
 **------------------------------------------------------
 */
 void BackSub() {
  // vector that receives the final answer
  finalVec = (float *)malloc(Size * sizeof(float));
  for (int row = Size - 1; row >= 0; --row) {
    const float *coeffs = a + Size * row;
    finalVec[row] = b[row];
    // Subtract the contributions of the unknowns already solved, starting
    // from the last column.
    for (int col = Size - 1; col > row; --col)
      finalVec[row] -= coeffs[col] * finalVec[col];
    finalVec[row] = finalVec[row] / coeffs[row];
  }
 }
 /*------------------------------------------------------
 ** InitMat() -- Initialize the matrix by reading
 ** nrow * ncol values from the data file fp. Element
 ** (i, j) is stored at ary[Size * i + j], i.e. the row
 ** stride is the global Size, not ncol.
 ** The program exits with an error message when a value
 ** cannot be read.
 **------------------------------------------------------
 */
 void InitMat(float *ary, int nrow, int ncol) {
  int i, j;
  for (i = 0; i < nrow; i++) {
    for (j = 0; j < ncol; j++) {
      // A short or malformed file used to leave the element uninitialized.
      if (fscanf(fp, "%f", ary + Size * i + j) != 1) {
        fprintf(stderr, "InitMat: cannot read matrix element (%d, %d)\n", i,
                j);
        exit(EXIT_FAILURE);
      }
    }
  }
 }
 /*------------------------------------------------------
 ** PrintMat() -- Print the contents of the matrix
 ** Printing is currently switched off: the function
 ** returns before reaching the print loop.
 **------------------------------------------------------
 */
 void PrintMat(float *ary, int nrow, int ncol) {
  // Unconditional return: everything below is unreachable.
  return;
  for (int r = 0; r < nrow; ++r) {
    const float *rowStart = ary + Size * r;
    for (int c = 0; c < ncol; ++c)
      printf("%8.2f ", *(rowStart + c));
    printf("\n");
  }
  printf("\n");
 }
 /*------------------------------------------------------
 ** InitAry() -- Initialize the array (vector) by reading
 ** ary_size values from the data file fp.
 ** The program exits with an error message when a value
 ** cannot be read.
 **------------------------------------------------------
 */
 void InitAry(float *ary, int ary_size) {
  int i;
  for (i = 0; i < ary_size; i++) {
    // A short or malformed file used to leave the element uninitialized.
    if (fscanf(fp, "%f", &ary[i]) != 1) {
      fprintf(stderr, "InitAry: cannot read vector element %d\n", i);
      exit(EXIT_FAILURE);
    }
  }
 }
 /*------------------------------------------------------
 ** PrintAry() -- Print the ary_size elements of the array
 ** (vector) on one line with two decimals each, followed
 ** by a blank line.
 **------------------------------------------------------
 */
 void PrintAry(float *ary, int ary_size) {
  const float *item = ary;
  for (int left = ary_size; left > 0; --left)
    printf("%.2f ", *item++);
  printf("\n\n");
 }
 /*------------------------------------------------------
 ** checkCUDAError() -- Fetch the last CUDA runtime error;
 ** when there is one, report it on stderr prefixed with
 ** msg and terminate the program.
 **------------------------------------------------------
 */
 void checkCUDAError(const char *msg) {
  const cudaError_t status = cudaGetLastError();
  if (status == cudaSuccess)
    return;
  fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(status));
  exit(EXIT_FAILURE);
 }
--- a/examples/gauss/run.sh
+++ b/examples/gauss/run.sh
@ -1,23 +0,0 @@
 #!/bin/bash
 # Build the gaussian example from its LLVM IR, run it on the 4x4 sample
 # matrix and check the printed solution.
 set -e
 # Assemble the device and host IR into bitcode.
 llvm-as gaussian-cuda-nvptx64-nvidia-cuda-sm_61.ll
 llvm-as gaussian-host-x86_64-unknown-linux-gnu.ll
 # Translate kernel and host bitcode, then compile both to object files.
 ../../build/compilation/kernelTranslator gaussian-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
 ../../build/compilation/hostTranslator gaussian-host-x86_64-unknown-linux-gnu.bc host.bc
 llc --relocation-model=pic --filetype=obj  kernel.bc
 llc --relocation-model=pic --filetype=obj  host.bc
 g++ -Wall -L../../build/runtime \
     -L../../build/runtime/threadPool \
     -o gaussian -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread
 export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH
 # Overwrite the log instead of appending: with ">>" a passing line left over
 # from an earlier run would satisfy the grep below and hide a regression.
 ./gaussian -f ../../rodinia-data/gaussian/matrix4.txt > res.log
 if grep -q "0.70 0.00 -0.40 -0.50" res.log; then
    echo "Pass"
 else
    echo "Error result"
    exit 1
 fi
--- a/examples/heartwall/AVI/avilib.c
+++ b/examples/heartwall/AVI/avilib.c
--- a/examples/heartwall/AVI/avilib.h
+++ b/examples/heartwall/AVI/avilib.h
@ -1,317 +0,0 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
 /*
 *  avilib.h
 *
 *  Copyright (C) Thomas Östreich - June 2001
 *  multiple audio track support Copyright (C) 2002 Thomas Östreich
 *
 *  Original code:
 *  Copyright (C) 1999 Rainer Johanni <Rainer@Johanni.de>
 *
 *  This file is part of transcode, a linux video stream processing tool
 *
 *  transcode is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2, or (at your option)
 *  any later version.
 *
 *  transcode is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with GNU Make; see the file COPYING.  If not, write to
 *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 */
 #include <fcntl.h>
 #include <stdio.h>
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <unistd.h>
 // #include <windows.h>
 #include <errno.h>
 #include <inttypes.h>
 #include <limits.h>
 #include <stdlib.h>
 #include <string.h>
 #ifndef AVILIB_H
 #define AVILIB_H
 #define AVI_MAX_TRACKS 8
 /* One entry of the video index array referenced by avi_t.video_index.
  * NOTE(review): field meanings are not documented in this header; presumably
  * key = keyframe flag, pos = chunk position in the file, len = chunk length
  * -- confirm against avilib.c. */
 typedef struct {
  unsigned long key;
  unsigned long pos;
  unsigned long len;
 } video_index_entry;
 /* One entry of the audio index array referenced by track_t.audio_index.
  * NOTE(review): field meanings are not documented in this header; presumably
  * pos = chunk position in the file, len = chunk length, tot = running total
  * of audio bytes -- confirm against avilib.c. */
 typedef struct {
  unsigned long pos;
  unsigned long len;
  unsigned long tot;
 } audio_index_entry;
 /* Per-audio-track state; avi_t holds an array of AVI_MAX_TRACKS of these. */
 typedef struct track_s {
  long a_fmt;   /* Audio format, see #defines below */
  long a_chans; /* Audio channels, 0 for no audio */
  long a_rate;  /* Rate in Hz */
  long a_bits;  /* bits per audio sample */
  long mp3rate; /* mp3 bitrate kbs*/
  long audio_strn;   /* Audio stream number */
  long audio_bytes;  /* Total number of bytes of audio data */
  long audio_chunks; /* Chunks of audio data in the file */
  char audio_tag[4]; /* Tag of audio data */
  long audio_posc;   /* Audio position: chunk */
  long audio_posb;   /* Audio position: byte within chunk */
  /* NOTE(review): the two offsets below carried the same comment; by analogy
   * with avi_t.v_codech_off / v_codecf_off they presumably refer to the strh
   * and strf chunks respectively -- confirm. */
  long a_codech_off; /* absolute offset of audio codec information */
  long a_codecf_off; /* absolute offset of audio codec information */
  audio_index_entry *audio_index; /* audio index entries of this track */
 } track_t;
 /* State of one open AVI file: video parameters, per-track audio state and
  * the index tables. */
 typedef struct {
  long fdes; /* File descriptor of AVI file */
  /* NOTE(review): the original comment read "0 for reading, 1 for writing",
   * which contradicts AVI_MODE_WRITE (0) and AVI_MODE_READ (1) defined later
   * in this header -- presumably the macros are authoritative; confirm. */
  long mode;
  long width;          /* Width  of a video frame */
  long height;         /* Height of a video frame */
  double fps;          /* Frames per second */
  char compressor[8];  /* Type of compressor, 4 bytes + padding for 0 byte */
  char compressor2[8]; /* Type of compressor, 4 bytes + padding for 0 byte */
  long video_strn;     /* Video stream number */
  long video_frames;   /* Number of video frames */
  char video_tag[4];   /* Tag of video data */
  long video_pos;      /* Number of next frame to be read
                              (if index present) */
  unsigned long max_len; /* maximum video chunk present */
  track_t track[AVI_MAX_TRACKS]; // up to AVI_MAX_TRACKS audio tracks supported
  unsigned long pos; /* position in file */
  long n_idx;        /* number of index entries actually filled */
  long max_idx;      /* number of index entries actually allocated */
  long v_codech_off; /* absolute offset of video codec (strh) info */
  long v_codecf_off; /* absolute offset of video codec (strf) info */
  unsigned char (*idx)[16]; /* index entries (AVI idx1 tag) */
  video_index_entry *video_index; /* video index entries */
  unsigned long last_pos; /* Position of last frame written */
  unsigned long last_len; /* Length of last frame written */
  int must_use_index;     /* Flag if frames are duplicated */
  unsigned long movi_start; /* presumably start of the movi list -- confirm */
  int anum; // total number of audio tracks
  int aptr; // current audio working track
 } avi_t;
 /* File access modes; presumably the values stored in avi_t.mode -- confirm
  * against avilib.c. */
 #define AVI_MODE_WRITE 0
 #define AVI_MODE_READ 1
 /* The error codes delivered by avi_open_input_file */
 #define AVI_ERR_SIZELIM                                                        \
  1 /* The write of the data would exceed                                      \
                                           the maximum size of the AVI file.   \
                                           This is more a warning than an      \
       error since the file may be closed safely */
 #define AVI_ERR_OPEN                                                           \
  2 /* Error opening the AVI file - wrong path                                 \
                                           name or file nor readable/writable  \
     */
 #define AVI_ERR_READ 3 /* Error reading from AVI File */
 #define AVI_ERR_WRITE                                                          \
  4 /* Error writing to AVI File,                                              \
                                           disk full ??? */
 #define AVI_ERR_WRITE_INDEX                                                    \
  5 /* Could not write index to AVI file                                       \
                                           during close, file may still be     \
                                           usable */
 #define AVI_ERR_CLOSE                                                          \
  6 /* Could not write header to AVI file                                      \
                                           or not truncate the file during     \
       close, file is most probably corrupted */
 #define AVI_ERR_NOT_PERM                                                       \
  7 /* Operation not permitted:                                                \
                                           trying to read from a file open     \
                                           for writing or vice versa */
 #define AVI_ERR_NO_MEM 8 /* malloc failed */
 #define AVI_ERR_NO_AVI 9 /* Not an AVI file */
 #define AVI_ERR_NO_HDRL                                                        \
  10 /* AVI file has no has no header list,                                    \
                                            corrupted ??? */
 #define AVI_ERR_NO_MOVI                                                        \
  11 /* AVI file has no has no MOVI list,                                      \
                                            corrupted ??? */
 #define AVI_ERR_NO_VIDS 12 /* AVI file contains no video data */
 #define AVI_ERR_NO_IDX                                                         \
  13 /* The file has been opened with                                          \
                                            getIndex==0, but an operation has  \
        been performed that needs an index */
 /* Possible Audio formats */
 #ifndef WAVE_FORMAT_PCM
 #define WAVE_FORMAT_UNKNOWN (0x0000)
 #define WAVE_FORMAT_PCM (0x0001)
 #define WAVE_FORMAT_ADPCM (0x0002)
 #define WAVE_FORMAT_IBM_CVSD (0x0005)
 #define WAVE_FORMAT_ALAW (0x0006)
 #define WAVE_FORMAT_MULAW (0x0007)
 #define WAVE_FORMAT_OKI_ADPCM (0x0010)
 #define WAVE_FORMAT_DVI_ADPCM (0x0011)
 #define WAVE_FORMAT_DIGISTD (0x0015)
 #define WAVE_FORMAT_DIGIFIX (0x0016)
 #define WAVE_FORMAT_YAMAHA_ADPCM (0x0020)
 #define WAVE_FORMAT_DSP_TRUESPEECH (0x0022)
 #define WAVE_FORMAT_GSM610 (0x0031)
 #define IBM_FORMAT_MULAW (0x0101)
 #define IBM_FORMAT_ALAW (0x0102)
 #define IBM_FORMAT_ADPCM (0x0103)
 #endif
 avi_t *AVI_open_output_file(char *filename);
 void AVI_set_video(avi_t *AVI, int width, int height, double fps,
                   char *compressor);
 void AVI_set_audio(avi_t *AVI, int channels, long rate, int bits, int format,
                   long mp3rate);
 int AVI_write_frame(avi_t *AVI, char *data, long bytes, int keyframe);
 int AVI_dup_frame(avi_t *AVI);
 int AVI_write_audio(avi_t *AVI, char *data, long bytes);
 int AVI_append_audio(avi_t *AVI, char *data, long bytes);
 long AVI_bytes_remain(avi_t *AVI);
 int AVI_close(avi_t *AVI);
 long AVI_bytes_written(avi_t *AVI);
 avi_t *AVI_open_input_file(char *filename, int getIndex);
 avi_t *AVI_open_fd(int fd, int getIndex);
 int avi_parse_input_file(avi_t *AVI, int getIndex);
 long AVI_audio_mp3rate(avi_t *AVI);
 long AVI_video_frames(avi_t *AVI);
 int AVI_video_width(avi_t *AVI);
 int AVI_video_height(avi_t *AVI);
 double AVI_frame_rate(avi_t *AVI);
 char *AVI_video_compressor(avi_t *AVI);
 int AVI_audio_channels(avi_t *AVI);
 int AVI_audio_bits(avi_t *AVI);
 int AVI_audio_format(avi_t *AVI);
 long AVI_audio_rate(avi_t *AVI);
 long AVI_audio_bytes(avi_t *AVI);
 long AVI_audio_chunks(avi_t *AVI);
 long AVI_max_video_chunk(avi_t *AVI);
 long AVI_frame_size(avi_t *AVI, long frame);
 long AVI_audio_size(avi_t *AVI, long frame);
 int AVI_seek_start(avi_t *AVI);
 int AVI_set_video_position(avi_t *AVI, long frame);
 long AVI_get_video_position(avi_t *AVI, long frame);
 long AVI_read_frame(avi_t *AVI, char *vidbuf, int *keyframe);
 int AVI_set_audio_position(avi_t *AVI, long byte);
 int AVI_set_audio_bitrate(avi_t *AVI, long bitrate);
 long AVI_read_audio(avi_t *AVI, char *audbuf, long bytes);
 long AVI_audio_codech_offset(avi_t *AVI);
 long AVI_audio_codecf_offset(avi_t *AVI);
 long AVI_video_codech_offset(avi_t *AVI);
 long AVI_video_codecf_offset(avi_t *AVI);
 int AVI_read_data(avi_t *AVI, char *vidbuf, long max_vidbuf, char *audbuf,
                  long max_audbuf, long *len);
 void AVI_print_error(char *str);
 char *AVI_strerror();
 char *AVI_syserror();
 int AVI_scan(char *name);
 int AVI_dump(char *name, int mode);
 char *AVI_codec2str(short cc);
 int AVI_file_check(char *import_file);
 void AVI_info(avi_t *avifile);
 uint64_t AVI_max_size();
 int avi_update_header(avi_t *AVI);
 int AVI_set_audio_track(avi_t *AVI, int track);
 int AVI_get_audio_track(avi_t *AVI);
 int AVI_audio_tracks(avi_t *AVI);
 /* Leading part of a RIFF/WAVE header: two 4-byte tags around a length.
  * NOTE(review): len is unsigned long, which is 8 bytes on LP64 platforms;
  * if this struct is read or written directly as the on-disk layout, a
  * fixed 32-bit type would be needed -- confirm how it is used. */
 struct riff_struct {
  unsigned char id[4]; /* RIFF */
  unsigned long len;
  unsigned char wave_id[4]; /* WAVE */
 };
 /* Generic chunk header: a 4-byte tag followed by a length.
  * NOTE(review): same unsigned long width caveat as riff_struct -- confirm. */
 struct chunk_struct {
  unsigned char id[4];
  unsigned long len;
 };
 /* Audio format description embedded in wave_header as its "common" member.
  * NOTE(review): the dw* fields are unsigned long (8 bytes on LP64) -- confirm
  * this matches how the struct is serialized. */
 struct common_struct {
  unsigned short wFormatTag;
  unsigned short wChannels;
  unsigned long dwSamplesPerSec;
  unsigned long dwAvgBytesPerSec;
  unsigned short wBlockAlign;
  unsigned short wBitsPerSample; /* Only for PCM */
 };
 /* Complete WAVE file header assembled from the pieces above: the RIFF
  * header, the format chunk header, the format description and the data
  * chunk header, in that order. */
 struct wave_header {
  struct riff_struct riff;
  struct chunk_struct format;
  struct common_struct common;
  struct chunk_struct data;
 };
 /* Stream header record with twelve long fields.
  * NOTE(review): presumably mirrors the AVI 'strh' chunk; every field is
  * declared long, which is 8 bytes on LP64 platforms -- confirm whether the
  * struct is ever mapped directly onto file data. */
 struct AVIStreamHeader {
  long fccType;
  long fccHandler;
  long dwFlags;
  long dwPriority;
  long dwInitialFrames;
  long dwScale;
  long dwRate;
  long dwStart;
  long dwLength;
  long dwSuggestedBufferSize;
  long dwQuality;
  long dwSampleSize;
 };
 #endif
 #ifdef __cplusplus
 }
 #endif
--- a/examples/heartwall/AVI/avimod.c
+++ b/examples/heartwall/AVI/avimod.c
@ -1,130 +0,0 @@
 // #ifdef __cplusplus
 // extern "C" {
 // #endif
 //===============================================================================================================================================================================================================
 //	DEFINE / INCLUDE
 //===============================================================================================================================================================================================================
 #include "avimod.h"
 #include <stdio.h>
 #include <stdlib.h>
 //===============================================================================================================================================================================================================
 //	FUNCTIONS
 //===============================================================================================================================================================================================================
 // Flips the specified image and crops it to the specified dimensions
 // If scaled == true, all values are scaled to the range [0.0, 1.0
 fp *chop_flip_image(char *image, int height, int width, int cropped, int scaled,
                    int converted) {
  // fixed dimensions for cropping or not cropping, square vertices starting
  // from initial point in top left corner going down and right
  int top;
  int bottom;
  int left;
  int right;
  if (cropped == 1) {
    top = 0;
    bottom = 0;
    left = 0;
    right = 0;
  } else {
    top = 0;
    bottom = height - 1;
    left = 0;
    right = width - 1;
  }
  // dimensions of new cropped image
  int height_new = bottom - top + 1;
  int width_new = right - left + 1;
  // counters
  int i, j;
  // allocate memory for cropped/flipped frame
  fp *result = (fp *)malloc(height_new * width_new * sizeof(fp));
  // crop/flip and scale frame
  fp temp;
  if (scaled) {
    fp scale = 1.0 / 255.0;
    for (i = 0; i < height_new; i++) {  // rows
      for (j = 0; j < width_new; j++) { // colums
        temp =
            (fp)image[((height - 1 - (i + top)) * width) + (j + left)] * scale;
        if (temp < 0) {
          result[i * width_new + j] = temp + 256;
        } else {
          result[i * width_new + j] = temp;
        }
      }
    }
  } else {
    for (i = 0; i < height_new; i++) {  // rows
      for (j = 0; j < width_new; j++) { // colums
        temp = (fp)image[((height - 1 - (i + top)) * width) + (j + left)];
        if (temp < 0) {
          result[i * width_new + j] = temp + 256;
        } else {
          result[i * width_new + j] = temp;
        }
      }
    }
  }
  // convert storage method (from row-major to column-major)
  fp *result_converted = (fp *)malloc(height_new * width_new * sizeof(fp));
  if (converted == 1) {
    for (i = 0; i < width_new; i++) {    // rows
      for (j = 0; j < height_new; j++) { // colums
        result_converted[i * height_new + j] = result[j * width_new + i];
      }
    }
  } else {
    result_converted = result;
  }
  free(result);
  // return
  return result_converted;
 }
 // Returns the specified frame from the specified video file
 //
 //   cell_file - open AVI handle to read from
 //   frame_num - index of the frame to read
 //   cropped, scaled, converted - passed unchanged to chop_flip_image()
 //
 // The raw frame is read into a temporary width * height byte buffer, handed
 // to chop_flip_image() and then released. The return value is whatever
 // chop_flip_image() returns. The process is terminated with exit(-1) if the
 // temporary buffer cannot be allocated or if AVI_read_frame() reports -1.
 fp *get_frame(avi_t *cell_file, int frame_num, int cropped, int scaled,
               int converted) {
  // variable
  int keyframe; // filled in by AVI_read_frame(), not used afterwards
  int width = AVI_video_width(cell_file);
  int height = AVI_video_height(cell_file);
  long status; // AVI_read_frame() returns long
  // There are 600 frames in this file (i.e. frame_num = 600 causes an error)
  AVI_set_video_position(cell_file, frame_num);
  // Read in the frame from the AVI
  char *image_buf = (char *)malloc((size_t)width * (size_t)height * sizeof(char));
  if (image_buf == NULL) {
    // never hand a NULL buffer to AVI_read_frame()
    fprintf(stderr, "Error allocating frame buffer in get_frame\n");
    exit(-1);
  }
  status = AVI_read_frame(cell_file, image_buf, &keyframe);
  if (status == -1) {
    AVI_print_error((char *)"Error with AVI_read_frame");
    exit(-1);
  }
  // The image is read in upside-down, so we need to flip it
  fp *image_chopped =
      chop_flip_image(image_buf, height, width, cropped, scaled, converted);
  // free image buffer
  free(image_buf);
  // return
  return image_chopped;
 }
 // #ifdef __cplusplus
 // }
 // #endif
--- a/examples/heartwall/AVI/avimod.h
+++ b/examples/heartwall/AVI/avimod.h
@ -1,24 +0,0 @@
 // Declarations for the frame helpers implemented in avimod.c. Wrapped in
 // extern "C" so the functions keep C linkage when included from C++.
 #ifdef __cplusplus
 extern "C" {
 #endif
 //=============================================================================
 // DEFINE / INCLUDE
 //=============================================================================
 // floating-point type used for the frame data returned by this module
 #define fp float
 #include "avilib.h"
 //=============================================================================
 // FUNCTION PROTOTYPES
 //=============================================================================
 // Flips `image` (height x width chars) vertically and returns the result as
 // an array of fp. `cropped`, `scaled` and `converted` select cropping,
 // scaling and column-major storage; see avimod.c for details.
 fp *chop_flip_image(char *image, int height, int width, int cropped, int scaled,
                     int converted);
 // Reads frame `frame_num` from `cell_file` and returns it after processing
 // with chop_flip_image(); exits the process if the frame cannot be read.
 fp *get_frame(avi_t *cell_file, int frame_num, int cropped, int scaled,
               int converted);
 #ifdef __cplusplus
 }
 #endif
--- a/examples/heartwall/define.c
+++ b/examples/heartwall/define.c
@ -1,396 +0,0 @@
 //===============================================================================================================================================================================================================
 //===============================================================================================================================================================================================================
 //	DEFINE / INCLUDE
 //===============================================================================================================================================================================================================
 //===============================================================================================================================================================================================================
 // floating-point type used by the parameter structures below
 #define fp float
 // NUMBER_THREADS is taken from the first of RD_WG_SIZE_0_0, RD_WG_SIZE_0 and
 // RD_WG_SIZE that is defined (e.g. on the compiler command line) and falls
 // back to 256 when none of them is.
 /* #define NUMBER_THREADS 512 */
 #if defined(RD_WG_SIZE_0_0)
 #define NUMBER_THREADS RD_WG_SIZE_0_0
 #elif defined(RD_WG_SIZE_0)
 #define NUMBER_THREADS RD_WG_SIZE_0
 #elif defined(RD_WG_SIZE)
 #define NUMBER_THREADS RD_WG_SIZE
 #else
 #define NUMBER_THREADS 256
 #endif
 // Point counts. ALL_POINTS is derived from the two partial counts so the
 // three values cannot drift apart (20 + 31 = 51).
 #define ENDO_POINTS 20
 #define EPI_POINTS 31
 #define ALL_POINTS (ENDO_POINTS + EPI_POINTS)
 //===============================================================================================================================================================================================================
 //===============================================================================================================================================================================================================
 //	PARAMS_COMMON_CHANGE STRUCT
 //===============================================================================================================================================================================================================
 //===============================================================================================================================================================================================================
 // Frame parameters: a pointer to the frame data and the frame number.
 // main.cu declares one plain instance (common_change) and one __constant__
 // instance (d_common_change) of this type.
 typedef struct params_common_change {
  //===========================================================================
  // FRAME
  //===========================================================================
  fp *d_frame; // presumably a device pointer ("d_" prefix) - TODO confirm
  int frame_no;
 } params_common_change;
 //===============================================================================================================================================================================================================
 //===============================================================================================================================================================================================================
 //	PARAMS_COMMON STRUCTURE
 //===============================================================================================================================================================================================================
 //===============================================================================================================================================================================================================
 // Parameters shared by all points. main.cu declares one plain instance
 // (common) and one __constant__ instance (d_common) of this type, so the
 // member order and types must stay exactly as they are.
 // NOTE(review): the recurring *_rows / *_cols / *_elem / *_mem groups
 // presumably hold row count, column count, element count and size in bytes
 // of one intermediate array each; pointers with a "d_" prefix presumably
 // refer to device memory - confirm against the initialisation code.
 typedef struct params_common {
  //===========================================================================
  // HARDCODED INPUTS FROM MATLAB
  //===========================================================================
  //---------------------------------------------------------------------------
  // CONSTANTS
  //---------------------------------------------------------------------------
  int sSize;
  int tSize;
  int maxMove;
  fp alpha;
  //---------------------------------------------------------------------------
  // FRAME
  //---------------------------------------------------------------------------
  int no_frames;
  int frame_rows;
  int frame_cols;
  int frame_elem;
  int frame_mem;
  //---------------------------------------------------------------------------
  // ENDO POINTS
  //---------------------------------------------------------------------------
  int endoPoints;
  int endo_mem;
  int *endoRow;
  int *endoCol;
  int *tEndoRowLoc;
  int *tEndoColLoc;
  int *d_endoRow;
  int *d_endoCol;
  int *d_tEndoRowLoc;
  int *d_tEndoColLoc;
  fp *d_endoT;
  //---------------------------------------------------------------------------
  // EPI POINTS
  //---------------------------------------------------------------------------
  int epiPoints;
  int epi_mem;
  int *epiRow;
  int *epiCol;
  int *tEpiRowLoc;
  int *tEpiColLoc;
  int *d_epiRow;
  int *d_epiCol;
  int *d_tEpiRowLoc;
  int *d_tEpiColLoc;
  fp *d_epiT;
  //---------------------------------------------------------------------------
  // ALL POINTS
  //---------------------------------------------------------------------------
  int allPoints;
  //===========================================================================
  // RIGHT TEMPLATE FROM TEMPLATE ARRAY
  //===========================================================================
  int in_rows;
  int in_cols;
  int in_elem;
  int in_mem;
  //===========================================================================
  // AREA AROUND POINT FROM FRAME
  //===========================================================================
  int in2_rows;
  int in2_cols;
  int in2_elem;
  int in2_mem;
  //===========================================================================
  // CONVOLUTION
  //===========================================================================
  int conv_rows;
  int conv_cols;
  int conv_elem;
  int conv_mem;
  int ioffset;
  int joffset;
  //===========================================================================
  // CUMULATIVE SUM 1
  //===========================================================================
  //---------------------------------------------------------------------------
  // PAD ARRAY, VERTICAL CUMULATIVE SUM
  //---------------------------------------------------------------------------
  int in2_pad_add_rows;
  int in2_pad_add_cols;
  int in2_pad_cumv_rows;
  int in2_pad_cumv_cols;
  int in2_pad_cumv_elem;
  int in2_pad_cumv_mem;
  //---------------------------------------------------------------------------
  // SELECTION
  //---------------------------------------------------------------------------
  int in2_pad_cumv_sel_rows;
  int in2_pad_cumv_sel_cols;
  int in2_pad_cumv_sel_elem;
  int in2_pad_cumv_sel_mem;
  int in2_pad_cumv_sel_rowlow;
  int in2_pad_cumv_sel_rowhig;
  int in2_pad_cumv_sel_collow;
  int in2_pad_cumv_sel_colhig;
  //---------------------------------------------------------------------------
  // SELECTION 2, SUBTRACTION, HORIZONTAL CUMULATIVE SUM
  //---------------------------------------------------------------------------
  int in2_pad_cumv_sel2_rowlow;
  int in2_pad_cumv_sel2_rowhig;
  int in2_pad_cumv_sel2_collow;
  int in2_pad_cumv_sel2_colhig;
  int in2_sub_cumh_rows;
  int in2_sub_cumh_cols;
  int in2_sub_cumh_elem;
  int in2_sub_cumh_mem;
  //---------------------------------------------------------------------------
  // SELECTION
  //---------------------------------------------------------------------------
  int in2_sub_cumh_sel_rows;
  int in2_sub_cumh_sel_cols;
  int in2_sub_cumh_sel_elem;
  int in2_sub_cumh_sel_mem;
  int in2_sub_cumh_sel_rowlow;
  int in2_sub_cumh_sel_rowhig;
  int in2_sub_cumh_sel_collow;
  int in2_sub_cumh_sel_colhig;
  //---------------------------------------------------------------------------
  // SELECTION 2, SUBTRACTION
  //---------------------------------------------------------------------------
  int in2_sub_cumh_sel2_rowlow;
  int in2_sub_cumh_sel2_rowhig;
  int in2_sub_cumh_sel2_collow;
  int in2_sub_cumh_sel2_colhig;
  int in2_sub2_rows;
  int in2_sub2_cols;
  int in2_sub2_elem;
  int in2_sub2_mem;
  //===========================================================================
  // CUMULATIVE SUM 2
  //===========================================================================
  //---------------------------------------------------------------------------
  // MULTIPLICATION
  //---------------------------------------------------------------------------
  int in2_sqr_rows;
  int in2_sqr_cols;
  int in2_sqr_elem;
  int in2_sqr_mem;
  //---------------------------------------------------------------------------
  // SELECTION 2, SUBTRACTION
  //---------------------------------------------------------------------------
  int in2_sqr_sub2_rows;
  int in2_sqr_sub2_cols;
  int in2_sqr_sub2_elem;
  int in2_sqr_sub2_mem;
  //===========================================================================
  // FINAL
  //===========================================================================
  int in_sqr_rows;
  int in_sqr_cols;
  int in_sqr_elem;
  int in_sqr_mem;
  //===========================================================================
  // TEMPLATE MASK CREATE
  //===========================================================================
  int tMask_rows;
  int tMask_cols;
  int tMask_elem;
  int tMask_mem;
  //===========================================================================
  // POINT MASK INITIALIZE
  //===========================================================================
  int mask_rows;
  int mask_cols;
  int mask_elem;
  int mask_mem;
  //===========================================================================
  // MASK CONVOLUTION
  //===========================================================================
  int mask_conv_rows;
  int mask_conv_cols;
  int mask_conv_elem;
  int mask_conv_mem;
  int mask_conv_ioffset;
  int mask_conv_joffset;
 } params_common;
 //===============================================================================================================================================================================================================
 //===============================================================================================================================================================================================================
 //	PARAMS_UNIQUE STRUCTURE
 //===============================================================================================================================================================================================================
 //===============================================================================================================================================================================================================
 // Per-point parameters. main.cu declares arrays of ALL_POINTS entries of this
 // type, one plain (unique) and one __constant__ (d_unique), so the member
 // order and types must stay exactly as they are.
 // NOTE(review): every pointer here carries a "d_" prefix and presumably
 // refers to device memory; the section names match those in params_common,
 // so each pointer presumably addresses the array whose dimensions are stored
 // there - confirm against the allocation code.
 typedef struct params_unique {
  //===========================================================================
  // POINT ROW / COLUMN / TEMPLATE POINTERS
  //===========================================================================
  int *d_Row;
  int *d_Col;
  int *d_tRowLoc;
  int *d_tColLoc;
  fp *d_T;
  //===========================================================================
  // POINT NUMBER
  //===========================================================================
  int point_no;
  //===========================================================================
  // RIGHT TEMPLATE FROM TEMPLATE ARRAY
  //===========================================================================
  int in_pointer;
  //===========================================================================
  // AREA AROUND POINT FROM FRAME
  //===========================================================================
  fp *d_in2;
  //===========================================================================
  // CONVOLUTION
  //===========================================================================
  fp *d_conv;
  fp *d_in_mod;
  //===========================================================================
  // CUMULATIVE SUM
  //===========================================================================
  //---------------------------------------------------------------------------
  // PAD ARRAY, VERTICAL CUMULATIVE SUM
  //---------------------------------------------------------------------------
  fp *d_in2_pad_cumv;
  //---------------------------------------------------------------------------
  // SELECTION
  //---------------------------------------------------------------------------
  fp *d_in2_pad_cumv_sel;
  //---------------------------------------------------------------------------
  // SELECTION 2, SUBTRACTION, HORIZONTAL CUMULATIVE SUM
  //---------------------------------------------------------------------------
  fp *d_in2_sub_cumh;
  //---------------------------------------------------------------------------
  // SELECTION
  //---------------------------------------------------------------------------
  fp *d_in2_sub_cumh_sel;
  //---------------------------------------------------------------------------
  // SELECTION 2, SUBTRACTION
  //---------------------------------------------------------------------------
  fp *d_in2_sub2;
  //===========================================================================
  // CUMULATIVE SUM 2
  //===========================================================================
  //---------------------------------------------------------------------------
  // MULTIPLICATION
  //---------------------------------------------------------------------------
  fp *d_in2_sqr;
  //---------------------------------------------------------------------------
  // SELECTION 2, SUBTRACTION
  //---------------------------------------------------------------------------
  fp *d_in2_sqr_sub2;
  //===========================================================================
  // FINAL
  //===========================================================================
  fp *d_in_sqr;
  //===========================================================================
  // TEMPLATE MASK
  //===========================================================================
  fp *d_tMask;
  //===========================================================================
  // POINT MASK INITIALIZE
  //===========================================================================
  fp *d_mask;
  //===========================================================================
  // MASK CONVOLUTION
  //===========================================================================
  fp *d_mask_conv;
 } params_unique;
 //===============================================================================================================================================================================================================
 //===============================================================================================================================================================================================================
 //	END OF STRUCTURE
 //===============================================================================================================================================================================================================
 //===============================================================================================================================================================================================================
--- a/examples/heartwall/kernel.cu
+++ b/examples/heartwall/kernel.cu
--- a/examples/heartwall/main.cu
+++ b/examples/heartwall/main.cu
@ -1,795 +0,0 @@
 //===============================================================================================================================================================================================================
 //===============================================================================================================================================================================================================
 //	DEFINE / INCLUDE
 //===============================================================================================================================================================================================================
 //===============================================================================================================================================================================================================
 //======================================================================================================================================================
 //	LIBRARIES
 //======================================================================================================================================================
 #include <math.h>
 #include <stdlib.h>
 #include <string.h>
 #include <avilib.h>
 #include <avimod.h>
 #include <cuda.h>
 //======================================================================================================================================================
 //	STRUCTURES, GLOBAL STRUCTURE VARIABLES
 //======================================================================================================================================================
 #include "define.c"
 // Host-side copy of the per-frame parameters (frame number and device frame
 // pointer). main() re-uploads it to d_common_change before every launch.
 params_common_change common_change;
 // Constant-memory image of common_change.
 __constant__ params_common_change d_common_change;
 // Host-side copy of the parameters shared by all tracked points (sizes,
 // offsets, device pointers). Uploaded once to d_common before the frame loop.
 params_common common;
 // Constant-memory image of common.
 __constant__ params_common d_common;
 // Host-side per-point parameters, one entry per tracked point. The array is
 // statically sized to ALL_POINTS because the count cannot be determined
 // dynamically here.
 params_unique unique[ALL_POINTS]; // cannot determine size dynamically so choose
                                   // more than usually needed
 // Constant-memory image of unique[]; filled with a single cudaMemcpyToSymbol.
 __constant__ params_unique d_unique[ALL_POINTS];
 //======================================================================================================================================================
 // KERNEL CODE
 //======================================================================================================================================================
 #include "kernel.cu"
 //	WRITE DATA FUNCTION
 //===============================================================================================================================================================================================================200
 // Writes one tab-separated row: for each of `points` points, the value stored
 // for frame `frame` in `values`, where a point's samples are laid out
 // contiguously with a stride of `frameNo` entries per point.
 static void write_point_row(FILE *fid, const int *values, int points,
                             int frame, int frameNo) {
   int p;
   for (p = 0; p < points; p++) {
     fprintf(fid, "%d\t", values[frame + p * frameNo]);
   }
 }
 //================================================================================80
 //	write_data
 //
 //	Dumps the tracked point locations to a text file.
 //
 //	filename          path of the file to create (truncated if it exists)
 //	frameNo           total number of frames in the video; also the per-point
 //	                  stride of the four input arrays
 //	frames_processed  number of leading frames to write
 //	endoPoints        number of endo points
 //	input_a, input_b  endo values, indexed as [frame + point * frameNo]
 //	epiPoints         number of epi points
 //	input_2a, input_2b epi values, indexed as [frame + point * frameNo]
 //
 //	Prints a message and returns without writing if the file cannot be opened.
 //================================================================================80
 void write_data(char *filename, int frameNo, int frames_processed,
                 int endoPoints, int *input_a, int *input_b, int epiPoints,
                 int *input_2a, int *input_2b) {
   FILE *fid;
   int j;
   //================================================================================80
   //	OPEN FILE FOR WRITING
   //================================================================================80
   fid = fopen(filename, "w+");
   if (fid == NULL) {
     printf("The file was not opened for writing\n");
     return;
   }
   //================================================================================80
   //	WRITE VALUES TO THE FILE
   //================================================================================80
   fprintf(fid, "Total AVI Frames: %d\n", frameNo);
   fprintf(fid, "Frames Processed: %d\n", frames_processed);
   fprintf(fid, "endoPoints: %d\n", endoPoints);
   fprintf(fid, "epiPoints: %d", epiPoints);
   for (j = 0; j < frames_processed; j++) {
     fprintf(fid, "\n---Frame %d---", j);
     // The section headers take no arguments (the original passed a stray `j`).
     fprintf(fid, "\n--endo--\n");
     write_point_row(fid, input_a, endoPoints, j, frameNo);
     fprintf(fid, "\n");
     write_point_row(fid, input_b, endoPoints, j, frameNo);
     fprintf(fid, "\n--epi--\n");
     write_point_row(fid, input_2a, epiPoints, j, frameNo);
     fprintf(fid, "\n");
     write_point_row(fid, input_2b, epiPoints, j, frameNo);
   }
   //================================================================================80
   //	CLOSE FILE
   //================================================================================80
   // fclose flushes buffered output, so a failure here means data was lost.
   if (fclose(fid) != 0) {
     printf("The file was not closed correctly\n");
   }
 }
 //===============================================================================================================================================================================================================
 //===============================================================================================================================================================================================================
 //	MAIN FUNCTION
 //===============================================================================================================================================================================================================
 //===============================================================================================================================================================================================================
 // Allocates `bytes` bytes of host memory for int data. Terminates the program
 // with an error message if a non-empty allocation fails, so callers never
 // write through a NULL pointer.
 static int *alloc_host_ints(size_t bytes) {
   int *buf = (int *)malloc(bytes);
   if (buf == NULL && bytes != 0) {
     printf("ERROR: failed to allocate %lu bytes of host memory\n",
            (unsigned long)bytes);
     exit(1);
   }
   return buf;
 }
 // Copies the hard-coded coordinates in `src` into `dst`, never writing more
 // than `dst_count` entries nor reading more than `src_count`.
 static void fill_points(int *dst, int dst_count, const int *src,
                         int src_count) {
   int k;
   int count = dst_count < src_count ? dst_count : src_count;
   for (k = 0; k < count; k++) {
     dst[k] = src[k];
   }
 }
 //======================================================================================================================================================
 //	main
 //
 //	Usage: heartwall <inputfile> <num of frames>
 //
 //	Opens the AVI file, fills the global `common` / `unique` parameter
 //	structures (sizes, hard-coded starting points, device buffers), uploads
 //	them to constant memory, then for each requested frame copies the frame
 //	to the device and launches `kernel`. Afterwards the tracked locations are
 //	copied back, optionally written to result.txt (when OUTPUT is defined),
 //	and all host/device buffers are released.
 //
 //	Returns 0 on success, -1 if the video cannot be opened, 1 if the frame
 //	count argument is invalid; exits with status 1 on wrong usage.
 //
 //	NOTE(review): CUDA API return codes are not checked anywhere in this
 //	function, and the AVI handle is never closed.
 //======================================================================================================================================================
 int main(int argc, char *argv[]) {
   cudaSetDevice(0);
   printf("WG size of kernel = %d \n", NUMBER_THREADS);
   //======================================================================================================================================================
   //	VARIABLES
   //======================================================================================================================================================
   // CUDA kernel execution parameters
   dim3 threads;
   dim3 blocks;
   // counter
   int i;
   int frames_processed;
   // frames
   char *video_file_name;
   avi_t *frames;
   fp *frame;
   // parsing of the <num of frames> argument
   char *frames_arg_end;
   long frames_requested;
   // hard-coded starting locations (inputs from MATLAB)
   static const int endo_row_init[] = {369, 400, 429, 452, 476, 486, 479,
                                       458, 433, 404, 374, 346, 318, 294,
                                       277, 269, 275, 287, 311, 339};
   static const int endo_col_init[] = {408, 406, 397, 383, 354, 322, 294,
                                       270, 250, 237, 235, 241, 254, 273,
                                       300, 328, 356, 383, 401, 411};
   static const int epi_row_init[] = {
       390, 419, 448, 474, 501, 519, 535, 542, 543, 538, 528,
       511, 491, 466, 438, 406, 376, 347, 318, 291, 275, 259,
       256, 252, 252, 257, 266, 283, 305, 331, 360};
   static const int epi_col_init[] = {
       457, 454, 446, 431, 411, 388, 361, 331, 301, 273, 243,
       218, 196, 178, 166, 157, 155, 165, 177, 197, 218, 248,
       276, 304, 333, 361, 391, 415, 434, 448, 455};
   //======================================================================================================================================================
   // 	FRAME
   //======================================================================================================================================================
   if (argc != 3) {
     printf("ERROR: usage: heartwall <inputfile> <num of frames>\n");
     exit(1);
   }
   // open movie file
   video_file_name = argv[1];
   frames = (avi_t *)AVI_open_input_file(video_file_name, 1); // added casting
   if (frames == NULL) {
     AVI_print_error((char *)"Error with AVI_open_input_file");
     return -1;
   }
   // common
   common.no_frames = AVI_video_frames(frames);
   common.frame_rows = AVI_video_height(frames);
   common.frame_cols = AVI_video_width(frames);
   common.frame_elem = common.frame_rows * common.frame_cols;
   common.frame_mem = sizeof(fp) * common.frame_elem;
   //======================================================================================================================================================
   // 	CHECK INPUT ARGUMENTS
   //======================================================================================================================================================
   // Validate before any device allocation so the error path has nothing to
   // release. strtol (unlike atoi) lets us reject non-numeric input instead of
   // silently treating it as 0 frames.
   frames_requested = strtol(argv[2], &frames_arg_end, 10);
   if (frames_arg_end == argv[2] || *frames_arg_end != '\0' ||
       frames_requested < 0 || frames_requested > common.no_frames) {
     printf("ERROR: %s is an incorrect number of frames specified, select in "
            "the range of 0-%d\n",
            argv[2], common.no_frames);
     return 1;
   }
   frames_processed = (int)frames_requested;
   // pointers
   cudaMalloc((void **)&common_change.d_frame, common.frame_mem);
   //======================================================================================================================================================
   //	HARDCODED INPUTS FROM MATLAB
   //======================================================================================================================================================
   //====================================================================================================
   //	CONSTANTS
   //====================================================================================================
   common.sSize = 40;
   common.tSize = 25;
   common.maxMove = 10;
   common.alpha = 0.87;
   //====================================================================================================
   //	ENDO POINTS
   //====================================================================================================
   common.endoPoints = ENDO_POINTS;
   common.endo_mem = sizeof(int) * common.endoPoints;
   common.endoRow = alloc_host_ints(common.endo_mem);
   fill_points(common.endoRow, common.endoPoints, endo_row_init,
               (int)(sizeof(endo_row_init) / sizeof(endo_row_init[0])));
   cudaMalloc((void **)&common.d_endoRow, common.endo_mem);
   cudaMemcpy(common.d_endoRow, common.endoRow, common.endo_mem,
              cudaMemcpyHostToDevice);
   common.endoCol = alloc_host_ints(common.endo_mem);
   fill_points(common.endoCol, common.endoPoints, endo_col_init,
               (int)(sizeof(endo_col_init) / sizeof(endo_col_init[0])));
   cudaMalloc((void **)&common.d_endoCol, common.endo_mem);
   cudaMemcpy(common.d_endoCol, common.endoCol, common.endo_mem,
              cudaMemcpyHostToDevice);
   // per-frame tracked locations: one int per endo point per frame
   common.tEndoRowLoc = alloc_host_ints(common.endo_mem * common.no_frames);
   cudaMalloc((void **)&common.d_tEndoRowLoc,
              common.endo_mem * common.no_frames);
   common.tEndoColLoc = alloc_host_ints(common.endo_mem * common.no_frames);
   cudaMalloc((void **)&common.d_tEndoColLoc,
              common.endo_mem * common.no_frames);
   //====================================================================================================
   //	EPI POINTS
   //====================================================================================================
   common.epiPoints = EPI_POINTS;
   common.epi_mem = sizeof(int) * common.epiPoints;
   common.epiRow = alloc_host_ints(common.epi_mem);
   fill_points(common.epiRow, common.epiPoints, epi_row_init,
               (int)(sizeof(epi_row_init) / sizeof(epi_row_init[0])));
   cudaMalloc((void **)&common.d_epiRow, common.epi_mem);
   cudaMemcpy(common.d_epiRow, common.epiRow, common.epi_mem,
              cudaMemcpyHostToDevice);
   common.epiCol = alloc_host_ints(common.epi_mem);
   fill_points(common.epiCol, common.epiPoints, epi_col_init,
               (int)(sizeof(epi_col_init) / sizeof(epi_col_init[0])));
   cudaMalloc((void **)&common.d_epiCol, common.epi_mem);
   cudaMemcpy(common.d_epiCol, common.epiCol, common.epi_mem,
              cudaMemcpyHostToDevice);
   // per-frame tracked locations: one int per epi point per frame
   common.tEpiRowLoc = alloc_host_ints(common.epi_mem * common.no_frames);
   cudaMalloc((void **)&common.d_tEpiRowLoc, common.epi_mem * common.no_frames);
   common.tEpiColLoc = alloc_host_ints(common.epi_mem * common.no_frames);
   cudaMalloc((void **)&common.d_tEpiColLoc, common.epi_mem * common.no_frames);
   //====================================================================================================
   //	ALL POINTS
   //====================================================================================================
   common.allPoints = ALL_POINTS;
   //======================================================================================================================================================
   // 	TEMPLATE SIZES
   //======================================================================================================================================================
   // common
   common.in_rows = common.tSize + 1 + common.tSize;
   common.in_cols = common.in_rows;
   common.in_elem = common.in_rows * common.in_cols;
   common.in_mem = sizeof(fp) * common.in_elem;
   //======================================================================================================================================================
   // 	CREATE ARRAY OF TEMPLATES FOR ALL POINTS
   //======================================================================================================================================================
   // common
   cudaMalloc((void **)&common.d_endoT, common.in_mem * common.endoPoints);
   cudaMalloc((void **)&common.d_epiT, common.in_mem * common.epiPoints);
   //======================================================================================================================================================
   //	SPECIFIC TO ENDO OR EPI TO BE SET HERE
   //======================================================================================================================================================
   // entries [0, endoPoints) describe endo points ...
   for (i = 0; i < common.endoPoints; i++) {
     unique[i].point_no = i;
     unique[i].d_Row = common.d_endoRow;
     unique[i].d_Col = common.d_endoCol;
     unique[i].d_tRowLoc = common.d_tEndoRowLoc;
     unique[i].d_tColLoc = common.d_tEndoColLoc;
     unique[i].d_T = common.d_endoT;
   }
   // ... and entries [endoPoints, allPoints) describe epi points, numbered
   // from 0 within the epi arrays
   for (i = common.endoPoints; i < common.allPoints; i++) {
     unique[i].point_no = i - common.endoPoints;
     unique[i].d_Row = common.d_epiRow;
     unique[i].d_Col = common.d_epiCol;
     unique[i].d_tRowLoc = common.d_tEpiRowLoc;
     unique[i].d_tColLoc = common.d_tEpiColLoc;
     unique[i].d_T = common.d_epiT;
   }
   //======================================================================================================================================================
   // 	RIGHT TEMPLATE 	FROM 	TEMPLATE ARRAY
   //======================================================================================================================================================
   // pointers: element offset of each point's template inside its d_T array
   for (i = 0; i < common.allPoints; i++) {
     unique[i].in_pointer = unique[i].point_no * common.in_elem;
   }
   //======================================================================================================================================================
   // 	AREA AROUND POINT		FROM	FRAME
   //======================================================================================================================================================
   // common
   common.in2_rows = 2 * common.sSize + 1;
   common.in2_cols = 2 * common.sSize + 1;
   common.in2_elem = common.in2_rows * common.in2_cols;
   common.in2_mem = sizeof(float) * common.in2_elem;
   // pointers
   for (i = 0; i < common.allPoints; i++) {
     cudaMalloc((void **)&unique[i].d_in2, common.in2_mem);
   }
   //======================================================================================================================================================
   // 	CONVOLUTION
   //======================================================================================================================================================
   // common
   common.conv_rows =
       common.in_rows + common.in2_rows - 1; // number of rows in I
   common.conv_cols =
       common.in_cols + common.in2_cols - 1; // number of columns in I
   common.conv_elem = common.conv_rows * common.conv_cols; // number of elements
   common.conv_mem = sizeof(float) * common.conv_elem;
   common.ioffset = 0;
   common.joffset = 0;
   // pointers
   for (i = 0; i < common.allPoints; i++) {
     cudaMalloc((void **)&unique[i].d_conv, common.conv_mem);
   }
   //======================================================================================================================================================
   // 	CUMULATIVE SUM
   //======================================================================================================================================================
   //====================================================================================================
   // 	PADDING OF ARRAY, VERTICAL CUMULATIVE SUM
   //====================================================================================================
   // common
   common.in2_pad_add_rows = common.in_rows;
   common.in2_pad_add_cols = common.in_cols;
   common.in2_pad_cumv_rows = common.in2_rows + 2 * common.in2_pad_add_rows;
   common.in2_pad_cumv_cols = common.in2_cols + 2 * common.in2_pad_add_cols;
   common.in2_pad_cumv_elem =
       common.in2_pad_cumv_rows * common.in2_pad_cumv_cols;
   common.in2_pad_cumv_mem = sizeof(float) * common.in2_pad_cumv_elem;
   // pointers
   for (i = 0; i < common.allPoints; i++) {
     cudaMalloc((void **)&unique[i].d_in2_pad_cumv, common.in2_pad_cumv_mem);
   }
   //====================================================================================================
   // 	SELECTION
   //====================================================================================================
   // common
   common.in2_pad_cumv_sel_rowlow = 1 + common.in_rows; // (1 to n+1)
   common.in2_pad_cumv_sel_rowhig = common.in2_pad_cumv_rows - 1;
   common.in2_pad_cumv_sel_collow = 1;
   common.in2_pad_cumv_sel_colhig = common.in2_pad_cumv_cols;
   common.in2_pad_cumv_sel_rows =
       common.in2_pad_cumv_sel_rowhig - common.in2_pad_cumv_sel_rowlow + 1;
   common.in2_pad_cumv_sel_cols =
       common.in2_pad_cumv_sel_colhig - common.in2_pad_cumv_sel_collow + 1;
   common.in2_pad_cumv_sel_elem =
       common.in2_pad_cumv_sel_rows * common.in2_pad_cumv_sel_cols;
   common.in2_pad_cumv_sel_mem = sizeof(float) * common.in2_pad_cumv_sel_elem;
   // pointers
   for (i = 0; i < common.allPoints; i++) {
     cudaMalloc((void **)&unique[i].d_in2_pad_cumv_sel,
                common.in2_pad_cumv_sel_mem);
   }
   //====================================================================================================
   // 	SELECTION	2, SUBTRACTION, HORIZONTAL CUMULATIVE SUM
   //====================================================================================================
   // common
   common.in2_pad_cumv_sel2_rowlow = 1;
   common.in2_pad_cumv_sel2_rowhig =
       common.in2_pad_cumv_rows - common.in_rows - 1;
   common.in2_pad_cumv_sel2_collow = 1;
   common.in2_pad_cumv_sel2_colhig = common.in2_pad_cumv_cols;
   common.in2_sub_cumh_rows =
       common.in2_pad_cumv_sel2_rowhig - common.in2_pad_cumv_sel2_rowlow + 1;
   common.in2_sub_cumh_cols =
       common.in2_pad_cumv_sel2_colhig - common.in2_pad_cumv_sel2_collow + 1;
   common.in2_sub_cumh_elem =
       common.in2_sub_cumh_rows * common.in2_sub_cumh_cols;
   common.in2_sub_cumh_mem = sizeof(float) * common.in2_sub_cumh_elem;
   // pointers
   for (i = 0; i < common.allPoints; i++) {
     cudaMalloc((void **)&unique[i].d_in2_sub_cumh, common.in2_sub_cumh_mem);
   }
   //====================================================================================================
   // 	SELECTION
   //====================================================================================================
   // common
   common.in2_sub_cumh_sel_rowlow = 1;
   common.in2_sub_cumh_sel_rowhig = common.in2_sub_cumh_rows;
   common.in2_sub_cumh_sel_collow = 1 + common.in_cols;
   common.in2_sub_cumh_sel_colhig = common.in2_sub_cumh_cols - 1;
   common.in2_sub_cumh_sel_rows =
       common.in2_sub_cumh_sel_rowhig - common.in2_sub_cumh_sel_rowlow + 1;
   common.in2_sub_cumh_sel_cols =
       common.in2_sub_cumh_sel_colhig - common.in2_sub_cumh_sel_collow + 1;
   common.in2_sub_cumh_sel_elem =
       common.in2_sub_cumh_sel_rows * common.in2_sub_cumh_sel_cols;
   common.in2_sub_cumh_sel_mem = sizeof(float) * common.in2_sub_cumh_sel_elem;
   // pointers
   for (i = 0; i < common.allPoints; i++) {
     cudaMalloc((void **)&unique[i].d_in2_sub_cumh_sel,
                common.in2_sub_cumh_sel_mem);
   }
   //====================================================================================================
   //	SELECTION 2, SUBTRACTION
   //====================================================================================================
   // common
   common.in2_sub_cumh_sel2_rowlow = 1;
   common.in2_sub_cumh_sel2_rowhig = common.in2_sub_cumh_rows;
   common.in2_sub_cumh_sel2_collow = 1;
   common.in2_sub_cumh_sel2_colhig =
       common.in2_sub_cumh_cols - common.in_cols - 1;
   common.in2_sub2_rows =
       common.in2_sub_cumh_sel2_rowhig - common.in2_sub_cumh_sel2_rowlow + 1;
   common.in2_sub2_cols =
       common.in2_sub_cumh_sel2_colhig - common.in2_sub_cumh_sel2_collow + 1;
   common.in2_sub2_elem = common.in2_sub2_rows * common.in2_sub2_cols;
   common.in2_sub2_mem = sizeof(float) * common.in2_sub2_elem;
   // pointers
   for (i = 0; i < common.allPoints; i++) {
     cudaMalloc((void **)&unique[i].d_in2_sub2, common.in2_sub2_mem);
   }
   //======================================================================================================================================================
   //	CUMULATIVE SUM 2
   //======================================================================================================================================================
   //====================================================================================================
   //	MULTIPLICATION
   //====================================================================================================
   // common
   common.in2_sqr_rows = common.in2_rows;
   common.in2_sqr_cols = common.in2_cols;
   common.in2_sqr_elem = common.in2_elem;
   common.in2_sqr_mem = common.in2_mem;
   // pointers
   for (i = 0; i < common.allPoints; i++) {
     cudaMalloc((void **)&unique[i].d_in2_sqr, common.in2_sqr_mem);
   }
   //====================================================================================================
   //	SELECTION 2, SUBTRACTION
   //====================================================================================================
   // common
   common.in2_sqr_sub2_rows = common.in2_sub2_rows;
   common.in2_sqr_sub2_cols = common.in2_sub2_cols;
   common.in2_sqr_sub2_elem = common.in2_sub2_elem;
   common.in2_sqr_sub2_mem = common.in2_sub2_mem;
   // pointers
   for (i = 0; i < common.allPoints; i++) {
     cudaMalloc((void **)&unique[i].d_in2_sqr_sub2, common.in2_sqr_sub2_mem);
   }
   //======================================================================================================================================================
   //	FINAL
   //======================================================================================================================================================
   // common
   common.in_sqr_rows = common.in_rows;
   common.in_sqr_cols = common.in_cols;
   common.in_sqr_elem = common.in_elem;
   common.in_sqr_mem = common.in_mem;
   // pointers
   for (i = 0; i < common.allPoints; i++) {
     cudaMalloc((void **)&unique[i].d_in_sqr, common.in_sqr_mem);
   }
   //======================================================================================================================================================
   //	TEMPLATE MASK CREATE
   //======================================================================================================================================================
   // common
   common.tMask_rows = common.in_rows + (common.sSize + 1 + common.sSize) - 1;
   common.tMask_cols = common.tMask_rows;
   common.tMask_elem = common.tMask_rows * common.tMask_cols;
   common.tMask_mem = sizeof(float) * common.tMask_elem;
   // pointers
   for (i = 0; i < common.allPoints; i++) {
     cudaMalloc((void **)&unique[i].d_tMask, common.tMask_mem);
   }
   //======================================================================================================================================================
   //	POINT MASK INITIALIZE
   //======================================================================================================================================================
   // common
   // NOTE(review): mask_mem is computed but no device buffer is allocated for
   // unique[i].d_mask in this function — confirm the kernel does not use it.
   common.mask_rows = common.maxMove;
   common.mask_cols = common.mask_rows;
   common.mask_elem = common.mask_rows * common.mask_cols;
   common.mask_mem = sizeof(float) * common.mask_elem;
   //======================================================================================================================================================
   //	MASK CONVOLUTION
   //======================================================================================================================================================
   // common
   common.mask_conv_rows = common.tMask_rows; // number of rows in I
   common.mask_conv_cols = common.tMask_cols; // number of columns in I
   common.mask_conv_elem =
       common.mask_conv_rows * common.mask_conv_cols; // number of elements
   common.mask_conv_mem = sizeof(float) * common.mask_conv_elem;
   // offsets are (mask size - 1) / 2, rounded up when the remainder is positive
   common.mask_conv_ioffset = (common.mask_rows - 1) / 2;
   if ((common.mask_rows - 1) % 2 > 0) {
     common.mask_conv_ioffset = common.mask_conv_ioffset + 1;
   }
   common.mask_conv_joffset = (common.mask_cols - 1) / 2;
   if ((common.mask_cols - 1) % 2 > 0) {
     common.mask_conv_joffset = common.mask_conv_joffset + 1;
   }
   // pointers
   for (i = 0; i < common.allPoints; i++) {
     cudaMalloc((void **)&unique[i].d_mask_conv, common.mask_conv_mem);
   }
   //======================================================================================================================================================
   //	KERNEL
   //======================================================================================================================================================
   //====================================================================================================
   //	THREAD BLOCK
   //====================================================================================================
   // All kernels operations within kernel use same max size of threads. Size of
   // block size is set to the size appropriate for max size operation (on padded
   // matrix). Other use subsets of that.
   threads.x = NUMBER_THREADS; // define the number of threads in the block
   threads.y = 1;
   blocks.x = common.allPoints; // define the number of blocks in the grid
   blocks.y = 1;
   //====================================================================================================
   //	COPY ARGUMENTS
   //====================================================================================================
   cudaMemcpyToSymbol(d_common, &common, sizeof(params_common));
   cudaMemcpyToSymbol(d_unique, &unique, sizeof(params_unique) * ALL_POINTS);
   //====================================================================================================
   //	PRINT FRAME PROGRESS START
   //====================================================================================================
   printf("frame progress: ");
   fflush(NULL);
   //====================================================================================================
   //	LAUNCH
   //====================================================================================================
   for (common_change.frame_no = 0; common_change.frame_no < frames_processed;
        common_change.frame_no++) {
     printf("get frame\n");
     // Extract a cropped version of the first frame from the video file
     frame = get_frame(
         frames,                 // pointer to video file
         common_change.frame_no, // number of frame that needs to be returned
         0,                      // cropped?
         0,                      // scaled?
         1);                     // converted
     printf("memcpy\n");
     // copy frame to GPU memory
     cudaMemcpy(common_change.d_frame, frame, common.frame_mem,
                cudaMemcpyHostToDevice);
     printf("toSymbol\n");
     // frame_no changes every iteration, so the struct is re-uploaded each time
     cudaMemcpyToSymbol(d_common_change, &common_change,
                        sizeof(params_common_change));
     // launch GPU kernel
     // NOTE(review): the launch uses a fixed <<<1, 32>>> configuration and
     // ignores the `blocks` / `threads` values computed above — confirm this is
     // intentional before changing it.
     printf("launch\n");
     kernel<<<1, 32>>>();
     cudaDeviceSynchronize();
     printf("return\n");
     // free frame after each loop iteration, since AVI library allocates memory
     // for every frame fetched
     printf("free\n");
     free(frame);
     // print frame progress
     printf("%d ", common_change.frame_no);
     fflush(NULL);
   }
   //====================================================================================================
   //	PRINT FRAME PROGRESS END
   //====================================================================================================
   printf("\n");
   fflush(NULL);
   //====================================================================================================
   //	OUTPUT
   //====================================================================================================
   cudaMemcpy(common.tEndoRowLoc, common.d_tEndoRowLoc,
              common.endo_mem * common.no_frames, cudaMemcpyDeviceToHost);
   cudaMemcpy(common.tEndoColLoc, common.d_tEndoColLoc,
              common.endo_mem * common.no_frames, cudaMemcpyDeviceToHost);
   cudaMemcpy(common.tEpiRowLoc, common.d_tEpiRowLoc,
              common.epi_mem * common.no_frames, cudaMemcpyDeviceToHost);
   cudaMemcpy(common.tEpiColLoc, common.d_tEpiColLoc,
              common.epi_mem * common.no_frames, cudaMemcpyDeviceToHost);
 #ifdef OUTPUT
   //==================================================50
   //	DUMP DATA TO FILE
   //==================================================50
   write_data((char *)"result.txt", common.no_frames, frames_processed,
              common.endoPoints, common.tEndoRowLoc, common.tEndoColLoc,
              common.epiPoints, common.tEpiRowLoc, common.tEpiColLoc);
   //==================================================50
   //	End
   //==================================================50
 #endif
   //======================================================================================================================================================
   //	DEALLOCATION
   //======================================================================================================================================================
   //====================================================================================================
   //	COMMON
   //====================================================================================================
   // frame
   cudaFree(common_change.d_frame);
   // endo points
   free(common.endoRow);
   free(common.endoCol);
   free(common.tEndoRowLoc);
   free(common.tEndoColLoc);
   cudaFree(common.d_endoRow);
   cudaFree(common.d_endoCol);
   cudaFree(common.d_tEndoRowLoc);
   cudaFree(common.d_tEndoColLoc);
   cudaFree(common.d_endoT);
   // epi points
   free(common.epiRow);
   free(common.epiCol);
   free(common.tEpiRowLoc);
   free(common.tEpiColLoc);
   cudaFree(common.d_epiRow);
   cudaFree(common.d_epiCol);
   cudaFree(common.d_tEpiRowLoc);
   cudaFree(common.d_tEpiColLoc);
   cudaFree(common.d_epiT);
   //====================================================================================================
   //	POINTERS
   //====================================================================================================
   for (i = 0; i < common.allPoints; i++) {
     cudaFree(unique[i].d_in2);
     cudaFree(unique[i].d_conv);
     cudaFree(unique[i].d_in2_pad_cumv);
     cudaFree(unique[i].d_in2_pad_cumv_sel);
     cudaFree(unique[i].d_in2_sub_cumh);
     cudaFree(unique[i].d_in2_sub_cumh_sel);
     cudaFree(unique[i].d_in2_sub2);
     cudaFree(unique[i].d_in2_sqr);
     cudaFree(unique[i].d_in2_sqr_sub2);
     cudaFree(unique[i].d_in_sqr);
     cudaFree(unique[i].d_tMask);
     cudaFree(unique[i].d_mask_conv);
   }
   return 0;
 }
 //===============================================================================================================================================================================================================
 //===============================================================================================================================================================================================================
 //	END OF MAIN FUNCTION
 //===============================================================================================================================================================================================================
 //===============================================================================================================================================================================================================
--- a/examples/heartwall/run.sh
+++ b/examples/heartwall/run.sh
@ -1,17 +0,0 @@
 #!/bin/bash
 # Build the heartwall example, pass its bitcode through the kernel/host
 # translators, link against the x86 runtime and run it on the test video.
 repo_root=/home/robinhan/repo/open_source_template
 cuda_home=/usr/local/cuda-10.1
 translators=$repo_root/build/compilation
 runtime_lib=$repo_root/build/runtime

 # Build the AVI helper objects that are linked in below.
 cd AVI
 make
 cd ..

 # Compile main.cu with clang; the .bc files consumed by the translators are
 # presumably the intermediates kept by -save-temps.
 clang++ -DOUTPUT main.cu -I./AVI --cuda-path=$cuda_home --cuda-gpu-arch=sm_61 \
   -L$cuda_home/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v

 # Translate device and host bitcode.
 $translators/kernelTranslator main-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
 $translators/hostTranslator main-host-x86_64-unknown-linux-gnu.bc host.bc

 # Lower both translated modules to position-independent object files.
 for module in kernel host; do
   llc --relocation-model=pic --filetype=obj $module.bc
 done

 # Link everything into the final executable.
 g++ -Wall -L$runtime_lib -L$runtime_lib/threadPool -o heartwall -fPIC -no-pie \
   host.o kernel.o ./AVI/avilib.o ./AVI/avimod.o \
   -lc -lx86Runtime -lthreadPool -lpthread

 # Run on the test video with argument 20.
 ./heartwall $repo_root/runtime/examples/rodinia-data/heartwall/test.avi 20
--- a/examples/heartwall/setdevice.cu
+++ b/examples/heartwall/setdevice.cu
@ -1,5 +0,0 @@
 ////////////////////////////////////////////////////////////////////////////////
 // Set Device
 ////////////////////////////////////////////////////////////////////////////////
 // Selects CUDA device 0 through cudaSetDevice.
 // The status code returned by cudaSetDevice is discarded.
 void setdevice(void) {
  cudaSetDevice(0);
 }
--- a/examples/hotspot/hotspot-cuda-nvptx64-nvidia-cuda-sm_61.ll
+++ b/examples/hotspot/hotspot-cuda-nvptx64-nvidia-cuda-sm_61.ll
@ -1,719 +0,0 @@
 ; ModuleID = 'hotspot-cuda-nvptx64-nvidia-cuda-sm_61.bc'
 source_filename = "hotspot.cu"
 target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"
 %struct.__cuda_builtin_blockIdx_t = type { i8 }
 %struct.__cuda_builtin_threadIdx_t = type { i8 }
 %struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
 $_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
 $_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv = comdat any
 $_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
 $_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv = comdat any
@_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda = internal addrspace(3) global [16 x [16 x float]] undef, align 4
@_ZZ14calculate_tempiPfS_S_iiiiffffffE13power_on_cuda = internal addrspace(3) global [16 x [16 x float]] undef, align 4
@_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t = internal addrspace(3) global [16 x [16 x float]] undef, align 4
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
 ; Function Attrs: convergent noinline nounwind optnone
 ; Weak device-side definition of cudaMalloc(i8** %p, i64 %s).
 ; It only spills both arguments to stack slots and always returns the
 ; constant 999; nothing is written through %p.
 ; NOTE(review): 999 is presumably an error/"unsupported" status code - confirm.
 define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
 entry:
  %p.addr = alloca i8**, align 8
  %s.addr = alloca i64, align 8
  store i8** %p, i8*** %p.addr, align 8
  store i64 %s, i64* %s.addr, align 8
  ret i32 999
 }
 ; Function Attrs: convergent noinline nounwind optnone
 ; Weak device-side definition of cudaFuncGetAttributes(%p, %c).
 ; The arguments are spilled to stack slots and the constant 999 is returned;
 ; the cudaFuncAttributes struct pointed to by %p is never written.
 define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
 entry:
  %p.addr = alloca %struct.cudaFuncAttributes*, align 8
  %c.addr = alloca i8*, align 8
  store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
  store i8* %c, i8** %c.addr, align 8
  ret i32 999
 }
 ; Function Attrs: convergent noinline nounwind optnone
 ; Weak device-side definition of cudaDeviceGetAttribute(%value, %attr, %device).
 ; The arguments are spilled to stack slots and the constant 999 is returned;
 ; nothing is stored through %value.
 define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
 entry:
  %value.addr = alloca i32*, align 8
  %attr.addr = alloca i32, align 4
  %device.addr = alloca i32, align 4
  store i32* %value, i32** %value.addr, align 8
  store i32 %attr, i32* %attr.addr, align 4
  store i32 %device, i32* %device.addr, align 4
  ret i32 999
 }
 ; Function Attrs: convergent noinline nounwind optnone
 ; Weak device-side definition of cudaGetDevice(%device).
 ; The pointer argument is spilled to a stack slot and the constant 999 is
 ; returned; nothing is stored through %device.
 define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
 entry:
  %device.addr = alloca i32*, align 8
  store i32* %device, i32** %device.addr, align 8
  ret i32 999
 }
 ; Function Attrs: convergent noinline nounwind optnone
 ; Weak device-side definition of cudaOccupancyMaxActiveBlocksPerMultiprocessor.
 ; All four arguments are spilled to stack slots and the constant 999 is
 ; returned; nothing is stored through %numBlocks.
 define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
 entry:
  %numBlocks.addr = alloca i32*, align 8
  %func.addr = alloca i8*, align 8
  %blockSize.addr = alloca i32, align 4
  %dynamicSmemSize.addr = alloca i64, align 8
  store i32* %numBlocks, i32** %numBlocks.addr, align 8
  store i8* %func, i8** %func.addr, align 8
  store i32 %blockSize, i32* %blockSize.addr, align 4
  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
  ret i32 999
 }
 ; Function Attrs: convergent noinline nounwind optnone
 ; Weak device-side definition of
 ; cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.
 ; All five arguments are spilled to stack slots and the constant 999 is
 ; returned; nothing is stored through %numBlocks.
 define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
 entry:
  %numBlocks.addr = alloca i32*, align 8
  %func.addr = alloca i8*, align 8
  %blockSize.addr = alloca i32, align 4
  %dynamicSmemSize.addr = alloca i64, align 8
  %flags.addr = alloca i32, align 4
  store i32* %numBlocks, i32** %numBlocks.addr, align 8
  store i8* %func, i8** %func.addr, align 8
  store i32 %blockSize, i32* %blockSize.addr, align 4
  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
  store i32 %flags, i32* %flags.addr, align 4
  ret i32 999
 }
 ; Function Attrs: convergent noinline nounwind optnone
 ; Unoptimized (optnone) body of calculate_temp(int, float*, float*, float*,
 ; int, int, int, int, float, float, float, float, float, float); the module
 ; metadata marks this function as a "kernel".
 ;
 ; Outline of what the code below does:
 ;   1. Each thread derives a grid position (yidx, xidx) from its block and
 ;      thread indices and, if that position is inside the grid, copies
 ;      temp_src[index] and power[index] into the 16x16 addrspace(3) arrays
 ;      temp_on_cuda and power_on_cuda at [ty][tx].
 ;   2. For i = 0 .. iteration-1, threads inside a window that shrinks by one
 ;      cell per side each step write an updated value for their cell into
 ;      temp_t; on every step except the last, that value is copied back into
 ;      temp_on_cuda.  Barriers separate these phases.
 ;   3. Threads whose last step computed a value store temp_t[ty][tx] to
 ;      temp_dst[index].
 ;
 ; The constant 16 appears throughout as the tile edge length, matching the
 ; [16 x [16 x float]] array types.
 define dso_local void @_Z14calculate_tempiPfS_S_iiiiffffff(i32 %iteration, float* %power, float* %temp_src, float* %temp_dst, i32 %grid_cols, i32 %grid_rows, i32 %border_cols, i32 %border_rows, float %Cap, float %Rx, float %Ry, float %Rz, float %step, float %time_elapsed) #0 {
 entry:
  ; Stack slots for the incoming arguments.
  %iteration.addr = alloca i32, align 4
  %power.addr = alloca float*, align 8
  %temp_src.addr = alloca float*, align 8
  %temp_dst.addr = alloca float*, align 8
  %grid_cols.addr = alloca i32, align 4
  %grid_rows.addr = alloca i32, align 4
  %border_cols.addr = alloca i32, align 4
  %border_rows.addr = alloca i32, align 4
  %Cap.addr = alloca float, align 4
  %Rx.addr = alloca float, align 4
  %Ry.addr = alloca float, align 4
  %Rz.addr = alloca float, align 4
  %step.addr = alloca float, align 4
  %time_elapsed.addr = alloca float, align 4
  ; Stack slots for the function's local variables.
  %amb_temp = alloca float, align 4
  %step_div_Cap = alloca float, align 4
  %Rx_1 = alloca float, align 4
  %Ry_1 = alloca float, align 4
  %Rz_1 = alloca float, align 4
  %bx = alloca i32, align 4
  %by = alloca i32, align 4
  %tx = alloca i32, align 4
  %ty = alloca i32, align 4
  %small_block_rows = alloca i32, align 4
  %small_block_cols = alloca i32, align 4
  %blkY = alloca i32, align 4
  %blkX = alloca i32, align 4
  %blkYmax = alloca i32, align 4
  %blkXmax = alloca i32, align 4
  %yidx = alloca i32, align 4
  %xidx = alloca i32, align 4
  %loadYidx = alloca i32, align 4
  %loadXidx = alloca i32, align 4
  %index = alloca i32, align 4
  %validYmin = alloca i32, align 4
  %validYmax = alloca i32, align 4
  %validXmin = alloca i32, align 4
  %validXmax = alloca i32, align 4
  %N = alloca i32, align 4
  %S = alloca i32, align 4
  %W = alloca i32, align 4
  %E = alloca i32, align 4
  %computed = alloca i8, align 1
  %i = alloca i32, align 4
  ; Spill every argument into its slot.
  store i32 %iteration, i32* %iteration.addr, align 4
  store float* %power, float** %power.addr, align 8
  store float* %temp_src, float** %temp_src.addr, align 8
  store float* %temp_dst, float** %temp_dst.addr, align 8
  store i32 %grid_cols, i32* %grid_cols.addr, align 4
  store i32 %grid_rows, i32* %grid_rows.addr, align 4
  store i32 %border_cols, i32* %border_cols.addr, align 4
  store i32 %border_rows, i32* %border_rows.addr, align 4
  store float %Cap, float* %Cap.addr, align 4
  store float %Rx, float* %Rx.addr, align 4
  store float %Ry, float* %Ry.addr, align 4
  store float %Rz, float* %Rz.addr, align 4
  store float %step, float* %step.addr, align 4
  store float %time_elapsed, float* %time_elapsed.addr, align 4
  ; amb_temp = 80.0
  store float 8.000000e+01, float* %amb_temp, align 4
  ; bx/by = block index x/y, tx/ty = thread index x/y.
  %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
  store i32 %call, i32* %bx, align 4
  %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #2
  store i32 %call1, i32* %by, align 4
  %call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
  store i32 %call2, i32* %tx, align 4
  %call3 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #2
  store i32 %call3, i32* %ty, align 4
  ; step_div_Cap = step / Cap
  %0 = load float, float* %step.addr, align 4
  %1 = load float, float* %Cap.addr, align 4
  %div = fdiv float %0, %1
  store float %div, float* %step_div_Cap, align 4
  ; Rx_1 = 1/Rx, Ry_1 = 1/Ry, Rz_1 = 1/Rz
  %2 = load float, float* %Rx.addr, align 4
  %div4 = fdiv float 1.000000e+00, %2
  store float %div4, float* %Rx_1, align 4
  %3 = load float, float* %Ry.addr, align 4
  %div5 = fdiv float 1.000000e+00, %3
  store float %div5, float* %Ry_1, align 4
  %4 = load float, float* %Rz.addr, align 4
  %div6 = fdiv float 1.000000e+00, %4
  store float %div6, float* %Rz_1, align 4
  ; small_block_rows = small_block_cols = 16 - 2*iteration
  %5 = load i32, i32* %iteration.addr, align 4
  %mul = mul nsw i32 %5, 2
  %sub = sub nsw i32 16, %mul
  store i32 %sub, i32* %small_block_rows, align 4
  %6 = load i32, i32* %iteration.addr, align 4
  %mul7 = mul nsw i32 %6, 2
  %sub8 = sub nsw i32 16, %mul7
  store i32 %sub8, i32* %small_block_cols, align 4
  ; blkY = small_block_rows*by - border_rows
  %7 = load i32, i32* %small_block_rows, align 4
  %8 = load i32, i32* %by, align 4
  %mul9 = mul nsw i32 %7, %8
  %9 = load i32, i32* %border_rows.addr, align 4
  %sub10 = sub nsw i32 %mul9, %9
  store i32 %sub10, i32* %blkY, align 4
  ; blkX = small_block_cols*bx - border_cols
  %10 = load i32, i32* %small_block_cols, align 4
  %11 = load i32, i32* %bx, align 4
  %mul11 = mul nsw i32 %10, %11
  %12 = load i32, i32* %border_cols.addr, align 4
  %sub12 = sub nsw i32 %mul11, %12
  store i32 %sub12, i32* %blkX, align 4
  ; blkYmax = blkY + 16 - 1, blkXmax = blkX + 16 - 1
  %13 = load i32, i32* %blkY, align 4
  %add = add nsw i32 %13, 16
  %sub13 = sub nsw i32 %add, 1
  store i32 %sub13, i32* %blkYmax, align 4
  %14 = load i32, i32* %blkX, align 4
  %add14 = add nsw i32 %14, 16
  %sub15 = sub nsw i32 %add14, 1
  store i32 %sub15, i32* %blkXmax, align 4
  ; yidx = blkY + ty, xidx = blkX + tx; loadYidx/loadXidx are copies of them.
  %15 = load i32, i32* %blkY, align 4
  %16 = load i32, i32* %ty, align 4
  %add16 = add nsw i32 %15, %16
  store i32 %add16, i32* %yidx, align 4
  %17 = load i32, i32* %blkX, align 4
  %18 = load i32, i32* %tx, align 4
  %add17 = add nsw i32 %17, %18
  store i32 %add17, i32* %xidx, align 4
  %19 = load i32, i32* %yidx, align 4
  store i32 %19, i32* %loadYidx, align 4
  %20 = load i32, i32* %xidx, align 4
  store i32 %20, i32* %loadXidx, align 4
  ; index = grid_cols*loadYidx + loadXidx (row-major offset into the grid)
  %21 = load i32, i32* %grid_cols.addr, align 4
  %22 = load i32, i32* %loadYidx, align 4
  %mul18 = mul nsw i32 %21, %22
  %23 = load i32, i32* %loadXidx, align 4
  %add19 = add nsw i32 %mul18, %23
  store i32 %add19, i32* %index, align 4
  ; Short-circuit test: 0 <= loadYidx <= grid_rows-1 && 0 <= loadXidx <= grid_cols-1
  %24 = load i32, i32* %loadYidx, align 4
  %cmp = icmp sge i32 %24, 0
  br i1 %cmp, label %land.lhs.true, label %if.end
 land.lhs.true:                                    ; preds = %entry
  %25 = load i32, i32* %loadYidx, align 4
  %26 = load i32, i32* %grid_rows.addr, align 4
  %sub20 = sub nsw i32 %26, 1
  %cmp21 = icmp sle i32 %25, %sub20
  br i1 %cmp21, label %land.lhs.true22, label %if.end
 land.lhs.true22:                                  ; preds = %land.lhs.true
  %27 = load i32, i32* %loadXidx, align 4
  %cmp23 = icmp sge i32 %27, 0
  br i1 %cmp23, label %land.lhs.true24, label %if.end
 land.lhs.true24:                                  ; preds = %land.lhs.true22
  %28 = load i32, i32* %loadXidx, align 4
  %29 = load i32, i32* %grid_cols.addr, align 4
  %sub25 = sub nsw i32 %29, 1
  %cmp26 = icmp sle i32 %28, %sub25
  br i1 %cmp26, label %if.then, label %if.end
 if.then:                                          ; preds = %land.lhs.true24
  ; In-range thread: temp_on_cuda[ty][tx] = temp_src[index]
  %30 = load float*, float** %temp_src.addr, align 8
  %31 = load i32, i32* %index, align 4
  %idxprom = sext i32 %31 to i64
  %arrayidx = getelementptr inbounds float, float* %30, i64 %idxprom
  %32 = load float, float* %arrayidx, align 4
  %33 = load i32, i32* %ty, align 4
  %idxprom27 = sext i32 %33 to i64
  %arrayidx28 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom27
  %34 = load i32, i32* %tx, align 4
  %idxprom29 = sext i32 %34 to i64
  %arrayidx30 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx28, i64 0, i64 %idxprom29
  store float %32, float* %arrayidx30, align 4
  ; power_on_cuda[ty][tx] = power[index]
  %35 = load float*, float** %power.addr, align 8
  %36 = load i32, i32* %index, align 4
  %idxprom31 = sext i32 %36 to i64
  %arrayidx32 = getelementptr inbounds float, float* %35, i64 %idxprom31
  %37 = load float, float* %arrayidx32, align 4
  %38 = load i32, i32* %ty, align 4
  %idxprom33 = sext i32 %38 to i64
  %arrayidx34 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE13power_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom33
  %39 = load i32, i32* %tx, align 4
  %idxprom35 = sext i32 %39 to i64
  %arrayidx36 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx34, i64 0, i64 %idxprom35
  store float %37, float* %arrayidx36, align 4
  br label %if.end
 if.end:                                           ; preds = %if.then, %land.lhs.true24, %land.lhs.true22, %land.lhs.true, %entry
  ; Barrier reached on every path, after the tile load above.
  call void @llvm.nvvm.barrier0()
  ; validYmin = (blkY < 0) ? -blkY : 0
  %40 = load i32, i32* %blkY, align 4
  %cmp37 = icmp slt i32 %40, 0
  br i1 %cmp37, label %cond.true, label %cond.false
 cond.true:                                        ; preds = %if.end
  %41 = load i32, i32* %blkY, align 4
  %sub38 = sub nsw i32 0, %41
  br label %cond.end
 cond.false:                                       ; preds = %if.end
  br label %cond.end
 cond.end:                                         ; preds = %cond.false, %cond.true
  %cond = phi i32 [ %sub38, %cond.true ], [ 0, %cond.false ]
  store i32 %cond, i32* %validYmin, align 4
  ; validYmax = (blkYmax > grid_rows-1) ? 15 - (blkYmax - grid_rows + 1) : 15
  %42 = load i32, i32* %blkYmax, align 4
  %43 = load i32, i32* %grid_rows.addr, align 4
  %sub39 = sub nsw i32 %43, 1
  %cmp40 = icmp sgt i32 %42, %sub39
  br i1 %cmp40, label %cond.true41, label %cond.false45
 cond.true41:                                      ; preds = %cond.end
  %44 = load i32, i32* %blkYmax, align 4
  %45 = load i32, i32* %grid_rows.addr, align 4
  %sub42 = sub nsw i32 %44, %45
  %add43 = add nsw i32 %sub42, 1
  %sub44 = sub nsw i32 15, %add43
  br label %cond.end46
 cond.false45:                                     ; preds = %cond.end
  br label %cond.end46
 cond.end46:                                       ; preds = %cond.false45, %cond.true41
  %cond47 = phi i32 [ %sub44, %cond.true41 ], [ 15, %cond.false45 ]
  store i32 %cond47, i32* %validYmax, align 4
  ; validXmin = (blkX < 0) ? -blkX : 0
  %46 = load i32, i32* %blkX, align 4
  %cmp48 = icmp slt i32 %46, 0
  br i1 %cmp48, label %cond.true49, label %cond.false51
 cond.true49:                                      ; preds = %cond.end46
  %47 = load i32, i32* %blkX, align 4
  %sub50 = sub nsw i32 0, %47
  br label %cond.end52
 cond.false51:                                     ; preds = %cond.end46
  br label %cond.end52
 cond.end52:                                       ; preds = %cond.false51, %cond.true49
  %cond53 = phi i32 [ %sub50, %cond.true49 ], [ 0, %cond.false51 ]
  store i32 %cond53, i32* %validXmin, align 4
  ; validXmax = (blkXmax > grid_cols-1) ? 15 - (blkXmax - grid_cols + 1) : 15
  %48 = load i32, i32* %blkXmax, align 4
  %49 = load i32, i32* %grid_cols.addr, align 4
  %sub54 = sub nsw i32 %49, 1
  %cmp55 = icmp sgt i32 %48, %sub54
  br i1 %cmp55, label %cond.true56, label %cond.false60
 cond.true56:                                      ; preds = %cond.end52
  %50 = load i32, i32* %blkXmax, align 4
  %51 = load i32, i32* %grid_cols.addr, align 4
  %sub57 = sub nsw i32 %50, %51
  %add58 = add nsw i32 %sub57, 1
  %sub59 = sub nsw i32 15, %add58
  br label %cond.end61
 cond.false60:                                     ; preds = %cond.end52
  br label %cond.end61
 cond.end61:                                       ; preds = %cond.false60, %cond.true56
  %cond62 = phi i32 [ %sub59, %cond.true56 ], [ 15, %cond.false60 ]
  store i32 %cond62, i32* %validXmax, align 4
  ; Neighbour indices inside the tile: N = ty-1, S = ty+1, W = tx-1, E = tx+1
  %52 = load i32, i32* %ty, align 4
  %sub63 = sub nsw i32 %52, 1
  store i32 %sub63, i32* %N, align 4
  %53 = load i32, i32* %ty, align 4
  %add64 = add nsw i32 %53, 1
  store i32 %add64, i32* %S, align 4
  %54 = load i32, i32* %tx, align 4
  %sub65 = sub nsw i32 %54, 1
  store i32 %sub65, i32* %W, align 4
  %55 = load i32, i32* %tx, align 4
  %add66 = add nsw i32 %55, 1
  store i32 %add66, i32* %E, align 4
  ; Clamp: N = max(N, validYmin)
  %56 = load i32, i32* %N, align 4
  %57 = load i32, i32* %validYmin, align 4
  %cmp67 = icmp slt i32 %56, %57
  br i1 %cmp67, label %cond.true68, label %cond.false69
 cond.true68:                                      ; preds = %cond.end61
  %58 = load i32, i32* %validYmin, align 4
  br label %cond.end70
 cond.false69:                                     ; preds = %cond.end61
  %59 = load i32, i32* %N, align 4
  br label %cond.end70
 cond.end70:                                       ; preds = %cond.false69, %cond.true68
  %cond71 = phi i32 [ %58, %cond.true68 ], [ %59, %cond.false69 ]
  store i32 %cond71, i32* %N, align 4
  ; Clamp: S = min(S, validYmax)
  %60 = load i32, i32* %S, align 4
  %61 = load i32, i32* %validYmax, align 4
  %cmp72 = icmp sgt i32 %60, %61
  br i1 %cmp72, label %cond.true73, label %cond.false74
 cond.true73:                                      ; preds = %cond.end70
  %62 = load i32, i32* %validYmax, align 4
  br label %cond.end75
 cond.false74:                                     ; preds = %cond.end70
  %63 = load i32, i32* %S, align 4
  br label %cond.end75
 cond.end75:                                       ; preds = %cond.false74, %cond.true73
  %cond76 = phi i32 [ %62, %cond.true73 ], [ %63, %cond.false74 ]
  store i32 %cond76, i32* %S, align 4
  ; Clamp: W = max(W, validXmin)
  %64 = load i32, i32* %W, align 4
  %65 = load i32, i32* %validXmin, align 4
  %cmp77 = icmp slt i32 %64, %65
  br i1 %cmp77, label %cond.true78, label %cond.false79
 cond.true78:                                      ; preds = %cond.end75
  %66 = load i32, i32* %validXmin, align 4
  br label %cond.end80
 cond.false79:                                     ; preds = %cond.end75
  %67 = load i32, i32* %W, align 4
  br label %cond.end80
 cond.end80:                                       ; preds = %cond.false79, %cond.true78
  %cond81 = phi i32 [ %66, %cond.true78 ], [ %67, %cond.false79 ]
  store i32 %cond81, i32* %W, align 4
  ; Clamp: E = min(E, validXmax)
  %68 = load i32, i32* %E, align 4
  %69 = load i32, i32* %validXmax, align 4
  %cmp82 = icmp sgt i32 %68, %69
  br i1 %cmp82, label %cond.true83, label %cond.false84
 cond.true83:                                      ; preds = %cond.end80
  %70 = load i32, i32* %validXmax, align 4
  br label %cond.end85
 cond.false84:                                     ; preds = %cond.end80
  %71 = load i32, i32* %E, align 4
  br label %cond.end85
 cond.end85:                                       ; preds = %cond.false84, %cond.true83
  %cond86 = phi i32 [ %70, %cond.true83 ], [ %71, %cond.false84 ]
  store i32 %cond86, i32* %E, align 4
  ; for (i = 0; i < iteration; i++)
  store i32 0, i32* %i, align 4
  br label %for.cond
 for.cond:                                         ; preds = %for.inc, %cond.end85
  %72 = load i32, i32* %i, align 4
  %73 = load i32, i32* %iteration.addr, align 4
  %cmp87 = icmp slt i32 %72, %73
  br i1 %cmp87, label %for.body, label %for.end
 for.body:                                         ; preds = %for.cond
  ; computed = false, then the short-circuit chain
  ;   i+1 <= tx <= 16-i-2 && i+1 <= ty <= 16-i-2 &&
  ;   validXmin <= tx <= validXmax && validYmin <= ty <= validYmax
  store i8 0, i8* %computed, align 1
  %74 = load i32, i32* %tx, align 4
  %75 = load i32, i32* %i, align 4
  %add88 = add nsw i32 %75, 1
  %cmp89 = icmp sge i32 %74, %add88
  br i1 %cmp89, label %land.lhs.true90, label %if.end175
 land.lhs.true90:                                  ; preds = %for.body
  %76 = load i32, i32* %tx, align 4
  %77 = load i32, i32* %i, align 4
  %sub91 = sub nsw i32 16, %77
  %sub92 = sub nsw i32 %sub91, 2
  %cmp93 = icmp sle i32 %76, %sub92
  br i1 %cmp93, label %land.lhs.true94, label %if.end175
 land.lhs.true94:                                  ; preds = %land.lhs.true90
  %78 = load i32, i32* %ty, align 4
  %79 = load i32, i32* %i, align 4
  %add95 = add nsw i32 %79, 1
  %cmp96 = icmp sge i32 %78, %add95
  br i1 %cmp96, label %land.lhs.true97, label %if.end175
 land.lhs.true97:                                  ; preds = %land.lhs.true94
  %80 = load i32, i32* %ty, align 4
  %81 = load i32, i32* %i, align 4
  %sub98 = sub nsw i32 16, %81
  %sub99 = sub nsw i32 %sub98, 2
  %cmp100 = icmp sle i32 %80, %sub99
  br i1 %cmp100, label %land.lhs.true101, label %if.end175
 land.lhs.true101:                                 ; preds = %land.lhs.true97
  %82 = load i32, i32* %tx, align 4
  %83 = load i32, i32* %validXmin, align 4
  %cmp102 = icmp sge i32 %82, %83
  br i1 %cmp102, label %land.lhs.true103, label %if.end175
 land.lhs.true103:                                 ; preds = %land.lhs.true101
  %84 = load i32, i32* %tx, align 4
  %85 = load i32, i32* %validXmax, align 4
  %cmp104 = icmp sle i32 %84, %85
  br i1 %cmp104, label %land.lhs.true105, label %if.end175
 land.lhs.true105:                                 ; preds = %land.lhs.true103
  %86 = load i32, i32* %ty, align 4
  %87 = load i32, i32* %validYmin, align 4
  %cmp106 = icmp sge i32 %86, %87
  br i1 %cmp106, label %land.lhs.true107, label %if.end175
 land.lhs.true107:                                 ; preds = %land.lhs.true105
  %88 = load i32, i32* %ty, align 4
  %89 = load i32, i32* %validYmax, align 4
  %cmp108 = icmp sle i32 %88, %89
  br i1 %cmp108, label %if.then109, label %if.end175
 if.then109:                                       ; preds = %land.lhs.true107
  ; computed = true, then with T = temp_on_cuda and P = power_on_cuda:
  ;   temp_t[ty][tx] = T[ty][tx] + step_div_Cap *
  ;       ( P[ty][tx]
  ;       + (T[S][tx] + T[N][tx] - 2.0*T[ty][tx]) * Ry_1
  ;       + (T[ty][E] + T[ty][W] - 2.0*T[ty][tx]) * Rx_1
  ;       + (amb_temp - T[ty][tx]) * Rz_1 )
  ; The neighbour sums and the amb_temp term are formed in float; the rest is
  ; evaluated in double (fpext) and the result is truncated back to float.
  store i8 1, i8* %computed, align 1
  %90 = load i32, i32* %ty, align 4
  %idxprom110 = sext i32 %90 to i64
  %arrayidx111 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom110
  %91 = load i32, i32* %tx, align 4
  %idxprom112 = sext i32 %91 to i64
  %arrayidx113 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx111, i64 0, i64 %idxprom112
  %92 = load float, float* %arrayidx113, align 4
  %conv = fpext float %92 to double
  %93 = load float, float* %step_div_Cap, align 4
  %conv114 = fpext float %93 to double
  %94 = load i32, i32* %ty, align 4
  %idxprom115 = sext i32 %94 to i64
  %arrayidx116 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE13power_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom115
  %95 = load i32, i32* %tx, align 4
  %idxprom117 = sext i32 %95 to i64
  %arrayidx118 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx116, i64 0, i64 %idxprom117
  %96 = load float, float* %arrayidx118, align 4
  %conv119 = fpext float %96 to double
  ; Vertical term: T[S][tx] + T[N][tx] - 2.0*T[ty][tx], scaled by Ry_1
  %97 = load i32, i32* %S, align 4
  %idxprom120 = sext i32 %97 to i64
  %arrayidx121 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom120
  %98 = load i32, i32* %tx, align 4
  %idxprom122 = sext i32 %98 to i64
  %arrayidx123 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx121, i64 0, i64 %idxprom122
  %99 = load float, float* %arrayidx123, align 4
  %100 = load i32, i32* %N, align 4
  %idxprom124 = sext i32 %100 to i64
  %arrayidx125 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom124
  %101 = load i32, i32* %tx, align 4
  %idxprom126 = sext i32 %101 to i64
  %arrayidx127 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx125, i64 0, i64 %idxprom126
  %102 = load float, float* %arrayidx127, align 4
  %add128 = fadd contract float %99, %102
  %conv129 = fpext float %add128 to double
  %103 = load i32, i32* %ty, align 4
  %idxprom130 = sext i32 %103 to i64
  %arrayidx131 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom130
  %104 = load i32, i32* %tx, align 4
  %idxprom132 = sext i32 %104 to i64
  %arrayidx133 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx131, i64 0, i64 %idxprom132
  %105 = load float, float* %arrayidx133, align 4
  %conv134 = fpext float %105 to double
  %mul135 = fmul contract double 2.000000e+00, %conv134
  %sub136 = fsub contract double %conv129, %mul135
  %106 = load float, float* %Ry_1, align 4
  %conv137 = fpext float %106 to double
  %mul138 = fmul contract double %sub136, %conv137
  %add139 = fadd contract double %conv119, %mul138
  ; Horizontal term: T[ty][E] + T[ty][W] - 2.0*T[ty][tx], scaled by Rx_1
  %107 = load i32, i32* %ty, align 4
  %idxprom140 = sext i32 %107 to i64
  %arrayidx141 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom140
  %108 = load i32, i32* %E, align 4
  %idxprom142 = sext i32 %108 to i64
  %arrayidx143 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx141, i64 0, i64 %idxprom142
  %109 = load float, float* %arrayidx143, align 4
  %110 = load i32, i32* %ty, align 4
  %idxprom144 = sext i32 %110 to i64
  %arrayidx145 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom144
  %111 = load i32, i32* %W, align 4
  %idxprom146 = sext i32 %111 to i64
  %arrayidx147 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx145, i64 0, i64 %idxprom146
  %112 = load float, float* %arrayidx147, align 4
  %add148 = fadd contract float %109, %112
  %conv149 = fpext float %add148 to double
  %113 = load i32, i32* %ty, align 4
  %idxprom150 = sext i32 %113 to i64
  %arrayidx151 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom150
  %114 = load i32, i32* %tx, align 4
  %idxprom152 = sext i32 %114 to i64
  %arrayidx153 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx151, i64 0, i64 %idxprom152
  %115 = load float, float* %arrayidx153, align 4
  %conv154 = fpext float %115 to double
  %mul155 = fmul contract double 2.000000e+00, %conv154
  %sub156 = fsub contract double %conv149, %mul155
  %116 = load float, float* %Rx_1, align 4
  %conv157 = fpext float %116 to double
  %mul158 = fmul contract double %sub156, %conv157
  %add159 = fadd contract double %add139, %mul158
  ; Ambient term: (amb_temp - T[ty][tx]) * Rz_1, computed in float
  %117 = load float, float* %amb_temp, align 4
  %118 = load i32, i32* %ty, align 4
  %idxprom160 = sext i32 %118 to i64
  %arrayidx161 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom160
  %119 = load i32, i32* %tx, align 4
  %idxprom162 = sext i32 %119 to i64
  %arrayidx163 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx161, i64 0, i64 %idxprom162
  %120 = load float, float* %arrayidx163, align 4
  %sub164 = fsub contract float %117, %120
  %121 = load float, float* %Rz_1, align 4
  %mul165 = fmul contract float %sub164, %121
  %conv166 = fpext float %mul165 to double
  %add167 = fadd contract double %add159, %conv166
  %mul168 = fmul contract double %conv114, %add167
  %add169 = fadd contract double %conv, %mul168
  %conv170 = fptrunc double %add169 to float
  ; Store the new value into temp_t[ty][tx].
  %122 = load i32, i32* %ty, align 4
  %idxprom171 = sext i32 %122 to i64
  %arrayidx172 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t to [16 x [16 x float]]*), i64 0, i64 %idxprom171
  %123 = load i32, i32* %tx, align 4
  %idxprom173 = sext i32 %123 to i64
  %arrayidx174 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx172, i64 0, i64 %idxprom173
  store float %conv170, float* %arrayidx174, align 4
  br label %if.end175
 if.end175:                                        ; preds = %if.then109, %land.lhs.true107, %land.lhs.true105, %land.lhs.true103, %land.lhs.true101, %land.lhs.true97, %land.lhs.true94, %land.lhs.true90, %for.body
  ; Barrier after the compute phase; then leave the loop early when
  ; i == iteration-1 (the last step skips the copy-back below).
  call void @llvm.nvvm.barrier0()
  %124 = load i32, i32* %i, align 4
  %125 = load i32, i32* %iteration.addr, align 4
  %sub176 = sub nsw i32 %125, 1
  %cmp177 = icmp eq i32 %124, %sub176
  br i1 %cmp177, label %if.then178, label %if.end179
 if.then178:                                       ; preds = %if.end175
  br label %for.end
 if.end179:                                        ; preds = %if.end175
  ; if (computed) temp_on_cuda[ty][tx] = temp_t[ty][tx]
  %126 = load i8, i8* %computed, align 1
  %tobool = trunc i8 %126 to i1
  br i1 %tobool, label %if.then180, label %if.end189
 if.then180:                                       ; preds = %if.end179
  %127 = load i32, i32* %ty, align 4
  %idxprom181 = sext i32 %127 to i64
  %arrayidx182 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t to [16 x [16 x float]]*), i64 0, i64 %idxprom181
  %128 = load i32, i32* %tx, align 4
  %idxprom183 = sext i32 %128 to i64
  %arrayidx184 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx182, i64 0, i64 %idxprom183
  %129 = load float, float* %arrayidx184, align 4
  %130 = load i32, i32* %ty, align 4
  %idxprom185 = sext i32 %130 to i64
  %arrayidx186 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom185
  %131 = load i32, i32* %tx, align 4
  %idxprom187 = sext i32 %131 to i64
  %arrayidx188 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx186, i64 0, i64 %idxprom187
  store float %129, float* %arrayidx188, align 4
  br label %if.end189
 if.end189:                                        ; preds = %if.then180, %if.end179
  ; Barrier after the copy-back, before the next step reads temp_on_cuda.
  call void @llvm.nvvm.barrier0()
  br label %for.inc
 for.inc:                                          ; preds = %if.end189
  %132 = load i32, i32* %i, align 4
  %inc = add nsw i32 %132, 1
  store i32 %inc, i32* %i, align 4
  br label %for.cond
 for.end:                                          ; preds = %if.then178, %for.cond
  ; if (computed) temp_dst[index] = temp_t[ty][tx]
  ; NOTE(review): %computed is only stored inside the loop body, so when
  ; iteration <= 0 this load reads a slot that was never written - confirm
  ; callers always pass iteration >= 1.
  %133 = load i8, i8* %computed, align 1
  %tobool190 = trunc i8 %133 to i1
  br i1 %tobool190, label %if.then191, label %if.end198
 if.then191:                                       ; preds = %for.end
  %134 = load i32, i32* %ty, align 4
  %idxprom192 = sext i32 %134 to i64
  %arrayidx193 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t to [16 x [16 x float]]*), i64 0, i64 %idxprom192
  %135 = load i32, i32* %tx, align 4
  %idxprom194 = sext i32 %135 to i64
  %arrayidx195 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx193, i64 0, i64 %idxprom194
  %136 = load float, float* %arrayidx195, align 4
  %137 = load float*, float** %temp_dst.addr, align 8
  %138 = load i32, i32* %index, align 4
  %idxprom196 = sext i32 %138 to i64
  %arrayidx197 = getelementptr inbounds float, float* %137, i64 %idxprom196
  store float %136, float* %arrayidx197, align 4
  br label %if.end198
 if.end198:                                        ; preds = %if.then191, %for.end
  ret void
 }
 ; Function Attrs: alwaysinline convergent nounwind
 ; __cuda_builtin_blockIdx_t::__fetch_builtin_x(): returns the value of the
 ; llvm.nvvm.read.ptx.sreg.ctaid.x intrinsic (the x component of the block index).
 define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
 entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
  ret i32 %0
 }
 ; Function Attrs: alwaysinline convergent nounwind
 ; __cuda_builtin_blockIdx_t::__fetch_builtin_y(): returns the value of the
 ; llvm.nvvm.read.ptx.sreg.ctaid.y intrinsic (the y component of the block index).
 define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
 entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
  ret i32 %0
 }
 ; Function Attrs: alwaysinline convergent nounwind
 ; __cuda_builtin_threadIdx_t::__fetch_builtin_x(): returns the value of the
 ; llvm.nvvm.read.ptx.sreg.tid.x intrinsic (the x component of the thread index).
 define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
 entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  ret i32 %0
 }
 ; Function Attrs: alwaysinline convergent nounwind
 ; __cuda_builtin_threadIdx_t::__fetch_builtin_y(): returns the value of the
 ; llvm.nvvm.read.ptx.sreg.tid.y intrinsic (the y component of the thread index).
 define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
 entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
  ret i32 %0
 }
 ; Function Attrs: convergent nounwind
 declare void @llvm.nvvm.barrier0() #2
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #3
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #3
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #3
 attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #2 = { convergent nounwind }
 attributes #3 = { nounwind readnone }
 !llvm.module.flags = !{!0, !1, !2}
 !nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
 !llvm.ident = !{!8}
 !nvvmir.version = !{!9}
 !0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
 !1 = !{i32 1, !"wchar_size", i32 4}
 !2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
 !3 = !{void (i32, float*, float*, float*, i32, i32, i32, i32, float, float, float, float, float, float)* @_Z14calculate_tempiPfS_S_iiiiffffff, !"kernel", i32 1}
 !4 = !{null, !"align", i32 8}
 !5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
 !6 = !{null, !"align", i32 16}
 !7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
 !8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
 !9 = !{i32 1, i32 4}
--- a/examples/hotspot/hotspot-host-x86_64-unknown-linux-gnu.ll
+++ b/examples/hotspot/hotspot-host-x86_64-unknown-linux-gnu.ll
--- a/examples/hotspot/hotspot.cu
+++ b/examples/hotspot/hotspot.cu
@ -1,353 +0,0 @@
 #include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <time.h>
 /* Edge length of the square thread block (and of the shared-memory tiles in
    calculate_temp). Can be overridden at build time through RD_WG_SIZE_0_0,
    RD_WG_SIZE_0 or RD_WG_SIZE, checked in that order; defaults to 16. */
 #ifdef RD_WG_SIZE_0_0
 #define BLOCK_SIZE RD_WG_SIZE_0_0
 #elif defined(RD_WG_SIZE_0)
 #define BLOCK_SIZE RD_WG_SIZE_0
 #elif defined(RD_WG_SIZE)
 #define BLOCK_SIZE RD_WG_SIZE
 #else
 #define BLOCK_SIZE 16
 #endif
 /* size of the line buffer used when reading/writing the text data files */
 #define STR_SIZE 256
 /* maximum power density possible (say 300W for a 10mm x 10mm chip) */
 #define MAX_PD (3.0e6)
 /* required precision in degrees */
 #define PRECISION 0.001
 /* NOTE(review): presumably the specific heat of the chip material in SI
    units -- confirm against the HotSpot model documentation */
 #define SPEC_HEAT_SI 1.75e6
 /* NOTE(review): presumably the thermal conductivity in SI units -- confirm */
 #define K_SI 100
 /* capacitance fitting factor */
 #define FACTOR_CHIP 0.5
 /* chip parameters (NOTE(review): units look like metres -- confirm) */
 float t_chip = 0.0005;
 float chip_height = 0.016;
 float chip_width = 0.016;
 /* ambient temperature, assuming no package at all; host-side global only --
    the kernel declares its own local amb_temp with the same value */
 float amb_temp = 80.0;
 /* forward declaration: run() is defined after main() */
 void run(int argc, char **argv);
 /* define timer macros; startCycle()/stopCycle() are not defined in this file,
    so these only compile if a timing header provides them */
 #define pin_stats_reset() startCycle()
 #define pin_stats_pause(cycles) stopCycle(cycles)
 #define pin_stats_dump(cycles) printf("timer: %Lu\n", cycles)
 /*
  * Report an unrecoverable error and terminate the process.
  *
  * s: human-readable description, printed to stderr as "error: <s>".
  *
  * The function does not return: every call site reports a condition
  * (allocation failure, truncated or malformed input) after which continuing
  * would operate on invalid data. The parameter is const-qualified so string
  * literals can be passed without a deprecated conversion.
  */
 void fatal(const char *s) {
  fprintf(stderr, "error: %s\n", s);
  exit(EXIT_FAILURE);
 }
 /*
  * Write a grid to a text file, one cell per line in the form
  * "<index>\t<value>\n", where index counts cells in row-major order from 0.
  *
  * vect:      grid_rows * grid_cols values, row-major.
  * grid_rows: number of rows to write.
  * grid_cols: number of columns to write.
  * file:      path of the output file (created or truncated).
  *
  * If the file cannot be opened a message is printed and nothing is written;
  * the NULL stream is never passed on to the stdio calls.
  */
 void writeoutput(float *vect, int grid_rows, int grid_cols, char *file) {
  FILE *fp = fopen(file, "w");
  if (fp == NULL) {
    printf("The file was not opened\n");
    return;
  }
  int index = 0;
  for (int i = 0; i < grid_rows; i++)
    for (int j = 0; j < grid_cols; j++) {
      fprintf(fp, "%d\t%g\n", index, vect[i * grid_cols + j]);
      index++;
    }
  fclose(fp);
 }
 /*
  * Read grid_rows * grid_cols floating-point values from a text file, one
  * value per line, into vect in row-major order.
  *
  * vect:      destination buffer with room for grid_rows * grid_cols floats.
  * grid_rows: number of rows to read.
  * grid_cols: number of columns to read.
  * file:      path of the input file.
  *
  * Reading stops at the first problem (file cannot be opened, too few lines,
  * a line that does not start with a number); cells not yet read are left
  * untouched. End of input is detected from the return value of fgets, so a
  * final line without a trailing newline is still accepted.
  */
 void readinput(float *vect, int grid_rows, int grid_cols, char *file) {
  char str[STR_SIZE];
  float val;
  FILE *fp = fopen(file, "r");
  if (fp == NULL) {
    printf("The file was not opened\n");
    return;
  }
  for (int i = 0; i < grid_rows; i++)
    for (int j = 0; j < grid_cols; j++) {
      if (fgets(str, STR_SIZE, fp) == NULL) {
        fatal("not enough lines in file");
        // Stop reading regardless of whether fatal() returns.
        fclose(fp);
        return;
      }
      if (sscanf(str, "%f", &val) != 1) {
        fatal("invalid file format");
        // Never store a value that was not parsed from this line.
        fclose(fp);
        return;
      }
      vect[i * grid_cols + j] = val;
    }
  fclose(fp);
 }
 /* Nonzero when min <= x <= max (both bounds inclusive). Arguments may be
    evaluated more than once, so they must be free of side effects. */
 #define IN_RANGE(x, min, max) ((x) >= (min) && (x) <= (max))
 /* Clamp the lvalue x into [min, max] in place and yield the clamped value.
    Every argument and the whole expansion are parenthesized so the macro can
    be used inside a larger expression; arguments may be evaluated more than
    once. */
 #define CLAMP_RANGE(x, min, max)                                               \
  ((x) = ((x) < (min)) ? (min) : (((x) > (max)) ? (max) : (x)))
 /* Smaller of a and b; arguments may be evaluated more than once. */
 #define MIN(a, b) ((a) <= (b) ? (a) : (b))
 /*
  * HotSpot transient-temperature stencil. Each thread block loads one
  * BLOCK_SIZE x BLOCK_SIZE tile of the grid into shared memory, advances it by
  * `iteration` time steps there, and writes the cells updated in the last step
  * to temp_dst.
  *
  * Launch layout: 2D grid of 2D blocks. The shared tiles are indexed directly
  * with threadIdx, so blockDim must be (BLOCK_SIZE, BLOCK_SIZE).
  * Shared memory: three static BLOCK_SIZE x BLOCK_SIZE float tiles.
  *
  * Tiles of neighbouring blocks overlap: a block's tile starts border_rows /
  * border_cols cells before the first cell of the (BLOCK_SIZE - 2 * iteration)
  * wide region it is responsible for. With every step the ring of cells that
  * is still updated shrinks by one cell on each side.
  *
  * power, temp_src: inputs, grid_rows * grid_cols floats in row-major order.
  * temp_dst:        output with the same layout; only updated cells are written.
  * Cap, Rx, Ry, Rz, step: model coefficients (only step / Cap and the
  *                  reciprocals of Rx, Ry, Rz are used).
  * time_elapsed:    not used by the kernel.
  *
  * NOTE: the literal 2.0 below is a double, so the stencil expression is
  * evaluated in double precision and rounded to float on store. It is kept
  * as-is so results stay bit-identical to the reference output.
  */
 __global__ void calculate_temp(int iteration,   // number of iteration
                                float *power,    // power input
                                float *temp_src, // temperature input/output
                                float *temp_dst, // temperature input/output
                                int grid_cols,   // Col of grid
                                int grid_rows,   // Row of grid
                                int border_cols, // border offset
                                int border_rows, // border offset
                                float Cap,       // Capacitance
                                float Rx, float Ry, float Rz, float step,
                                float time_elapsed) {
  __shared__ float temp_on_cuda[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ float power_on_cuda[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ float temp_t[BLOCK_SIZE]
                         [BLOCK_SIZE]; // saving temporary temperature result
  float amb_temp = 80.0;
  float step_div_Cap;
  float Rx_1, Ry_1, Rz_1;
  int bx = blockIdx.x;
  int by = blockIdx.y;
  int tx = threadIdx.x;
  int ty = threadIdx.y;
  step_div_Cap = step / Cap;
  Rx_1 = 1 / Rx;
  Ry_1 = 1 / Ry;
  Rz_1 = 1 / Rz;
  // each block finally computes result for a small block
  // after N iterations.
  // it is the non-overlapping small blocks that cover
  // all the input data
  // calculate the small block size
  int small_block_rows = BLOCK_SIZE - iteration * 2; // EXPAND_RATE
  int small_block_cols = BLOCK_SIZE - iteration * 2; // EXPAND_RATE
  // calculate the boundary for the block according to
  // the boundary of its small block
  int blkY = small_block_rows * by - border_rows;
  int blkX = small_block_cols * bx - border_cols;
  int blkYmax = blkY + BLOCK_SIZE - 1;
  int blkXmax = blkX + BLOCK_SIZE - 1;
  // calculate the global thread coordination
  int yidx = blkY + ty;
  int xidx = blkX + tx;
  // load data if it is within the valid input range
  int loadYidx = yidx, loadXidx = xidx;
  int index = grid_cols * loadYidx + loadXidx;
  if (IN_RANGE(loadYidx, 0, grid_rows - 1) &&
      IN_RANGE(loadXidx, 0, grid_cols - 1)) {
    temp_on_cuda[ty][tx] = temp_src[index]; // Load the temperature data from
                                            // global memory to shared memory
    power_on_cuda[ty][tx] =
        power[index]; // Load the power data from global memory to shared memory
  }
  __syncthreads();
  // effective range within this block that falls within
  // the valid range of the input data
  // used to rule out computation outside the boundary.
  int validYmin = (blkY < 0) ? -blkY : 0;
  int validYmax = (blkYmax > grid_rows - 1)
                      ? BLOCK_SIZE - 1 - (blkYmax - grid_rows + 1)
                      : BLOCK_SIZE - 1;
  int validXmin = (blkX < 0) ? -blkX : 0;
  int validXmax = (blkXmax > grid_cols - 1)
                      ? BLOCK_SIZE - 1 - (blkXmax - grid_cols + 1)
                      : BLOCK_SIZE - 1;
  // neighbour coordinates inside the tile, clamped to the valid range so that
  // tile cells which were never loaded are never read
  int N = ty - 1;
  int S = ty + 1;
  int W = tx - 1;
  int E = tx + 1;
  N = (N < validYmin) ? validYmin : N;
  S = (S > validYmax) ? validYmax : S;
  W = (W < validXmin) ? validXmin : W;
  E = (E > validXmax) ? validXmax : E;
  // Initialised so that a launch with iteration <= 0 (loop body never runs)
  // writes nothing instead of reading an indeterminate flag below.
  bool computed = false;
  for (int i = 0; i < iteration; i++) {
    computed = false;
    if (IN_RANGE(tx, i + 1, BLOCK_SIZE - i - 2) &&
        IN_RANGE(ty, i + 1, BLOCK_SIZE - i - 2) &&
        IN_RANGE(tx, validXmin, validXmax) &&
        IN_RANGE(ty, validYmin, validYmax)) {
      computed = true;
      temp_t[ty][tx] =
          temp_on_cuda[ty][tx] +
          step_div_Cap * (power_on_cuda[ty][tx] +
                          (temp_on_cuda[S][tx] + temp_on_cuda[N][tx] -
                           2.0 * temp_on_cuda[ty][tx]) *
                              Ry_1 +
                          (temp_on_cuda[ty][E] + temp_on_cuda[ty][W] -
                           2.0 * temp_on_cuda[ty][tx]) *
                              Rx_1 +
                          (amb_temp - temp_on_cuda[ty][tx]) * Rz_1);
    }
    // all reads of temp_on_cuda for this step are done before it is overwritten
    __syncthreads();
    // the condition is uniform across the block, so leaving the loop here
    // does not skip a barrier for only part of the threads
    if (i == iteration - 1)
      break;
    if (computed) // Assign the computation range
      temp_on_cuda[ty][tx] = temp_t[ty][tx];
    __syncthreads();
  }
  // update the global memory
  // after the last iteration, only threads coordinated within the
  // small block perform the calculation and switch on ``computed''
  if (computed) {
    temp_dst[index] = temp_t[ty][tx];
  }
 }
 /*
   compute N time steps
 */
 /*
  * Run the transient simulation on the device.
  *
  * MatrixPower:      device buffer with the power grid (row * col floats).
  * MatrixTemp:       two device buffers of row * col floats; MatrixTemp[0]
  *                   holds the initial temperatures. The buffers alternate as
  *                   source and destination from launch to launch.
  * col, row:         grid dimensions.
  * total_iterations: total number of time steps to simulate.
  * num_iterations:   time steps per kernel launch; must be positive,
  *                   otherwise the loop below would never advance.
  * blockCols, blockRows:   launch grid dimensions.
  * borderCols, borderRows: tile overlap forwarded to the kernel.
  *
  * Returns the index (0 or 1) of the MatrixTemp buffer written by the last
  * launch. If a launch fails, the error is reported on stderr, no further
  * launches are issued, and the index of the failed launch's destination is
  * returned.
  */
 int compute_tran_temp(float *MatrixPower, float *MatrixTemp[2], int col,
                       int row, int total_iterations, int num_iterations,
                       int blockCols, int blockRows, int borderCols,
                       int borderRows) {
  dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
  dim3 dimGrid(blockCols, blockRows);
  float grid_height = chip_height / row;
  float grid_width = chip_width / col;
  float Cap = FACTOR_CHIP * SPEC_HEAT_SI * t_chip * grid_width * grid_height;
  float Rx = grid_width / (2.0 * K_SI * t_chip * grid_height);
  float Ry = grid_height / (2.0 * K_SI * t_chip * grid_width);
  float Rz = t_chip / (K_SI * grid_height * grid_width);
  float max_slope = MAX_PD / (FACTOR_CHIP * t_chip * SPEC_HEAT_SI);
  float step = PRECISION / max_slope;
  float time_elapsed = 0.001;
  int src = 1, dst = 0;
  // Integer step counter: a float counter stops being exact for large
  // iteration counts and forces a float -> int conversion of the kernel
  // argument.
  for (int t = 0; t < total_iterations; t += num_iterations) {
    // ping-pong: the previous destination becomes the new source
    int temp = src;
    src = dst;
    dst = temp;
    // the last launch may cover fewer than num_iterations steps
    calculate_temp<<<dimGrid, dimBlock>>>(
        MIN(num_iterations, total_iterations - t), MatrixPower, MatrixTemp[src],
        MatrixTemp[dst], col, row, borderCols, borderRows, Cap, Rx, Ry, Rz,
        step, time_elapsed);
    cudaError_t status = cudaDeviceSynchronize();
    if (status != cudaSuccess) {
      fprintf(stderr, "error: calculate_temp failed (CUDA error %d)\n",
              (int)status);
      break;
    }
  }
  return dst;
 }
 /*
  * Print the command-line synopsis and a description of every argument to
  * stderr, then terminate the process with exit status 1.
  *
  * argc: unused.
  * argv: argv[0] is printed as the program name.
  */
 void usage(int argc, char **argv) {
  static const char *const arg_help[] = {
      "\t<grid_rows/grid_cols>  - number of rows/cols in the grid "
      "(positive integer)\n",
      "\t<pyramid_height> - pyramid heigh(positive integer)\n",
      "\t<sim_time>   - number of iterations\n",
      "\t<temp_file>  - name of the file containing the initial "
      "temperature values of each cell\n",
      "\t<power_file> - name of the file containing the dissipated "
      "power values of each cell\n",
      "\t<output_file> - name of the output file\n",
  };
  const char *prog = argv[0];
  fprintf(stderr,
          "Usage: %s <grid_rows/grid_cols> <pyramid_height> <sim_time> "
          "<temp_file> <power_file> <output_file>\n",
          prog);
  // the argument descriptions contain no conversions, so emit them verbatim
  for (size_t k = 0; k < sizeof(arg_help) / sizeof(arg_help[0]); ++k)
    fputs(arg_help[k], stderr);
  exit(1);
 }
 /*
  * Program entry point: selects CUDA device 0, prints the work-group (thread
  * block) dimensions and hands the command line to run().
  * Always returns EXIT_SUCCESS.
  */
 int main(int argc, char **argv) {
  const int wg_edge = BLOCK_SIZE;
  cudaSetDevice(0);
  printf("WG size of kernel = %d X %d\n", wg_edge, wg_edge);
  run(argc, argv);
  return EXIT_SUCCESS;
 }
 /*
  * Parse the command line, load the temperature and power grids, run the
  * simulation on the device and write the resulting temperatures.
  *
  * Expected arguments (argc must be 7):
  *   argv[1] grid size, used for both rows and columns (square grid)
  *   argv[2] pyramid height = time steps per kernel launch
  *   argv[3] total number of time steps
  *   argv[4] temperature input file
  *   argv[5] power input file
  *   argv[6] output file
  *
  * Invalid arguments end in usage(). Allocation and CUDA failures are
  * reported through fatal(); the function then releases what it allocated
  * and returns without writing the output file.
  */
 void run(int argc, char **argv) {
  int size;
  int grid_rows, grid_cols;
  float *FilesavingTemp, *FilesavingPower, *MatrixOut;
  char *tfile, *pfile, *ofile;
  int total_iterations = 60;
  int pyramid_height = 1; // number of iterations
  if (argc != 7)
    usage(argc, argv);
  // argv[1] is deliberately parsed twice: the grid is square
  if ((grid_rows = atoi(argv[1])) <= 0 || (grid_cols = atoi(argv[1])) <= 0 ||
      (pyramid_height = atoi(argv[2])) <= 0 ||
      (total_iterations = atoi(argv[3])) <= 0)
    usage(argc, argv);
  tfile = argv[4];
  pfile = argv[5];
  ofile = argv[6];
  size = grid_rows * grid_cols;
 /* --------------- pyramid parameters --------------- */
 #define EXPAND_RATE                                                            \
  2 // add one iteration will extend the pyramid base by 2 per each borderline
  int borderCols = (pyramid_height)*EXPAND_RATE / 2;
  int borderRows = (pyramid_height)*EXPAND_RATE / 2;
  int smallBlockCol = BLOCK_SIZE - (pyramid_height)*EXPAND_RATE;
  int smallBlockRow = BLOCK_SIZE - (pyramid_height)*EXPAND_RATE;
  // A pyramid that consumes the whole tile leaves no cells to produce and
  // would make the block-count divisions below divide by zero (or go negative).
  if (smallBlockCol <= 0 || smallBlockRow <= 0) {
    fprintf(stderr,
            "error: pyramid_height %d is too large for BLOCK_SIZE %d\n",
            pyramid_height, BLOCK_SIZE);
    usage(argc, argv);
    return;
  }
  // ceil(grid / smallBlock)
  int blockCols =
      grid_cols / smallBlockCol + ((grid_cols % smallBlockCol == 0) ? 0 : 1);
  int blockRows =
      grid_rows / smallBlockRow + ((grid_rows % smallBlockRow == 0) ? 0 : 1);
  FilesavingTemp = (float *)malloc(size * sizeof(float));
  FilesavingPower = (float *)malloc(size * sizeof(float));
  MatrixOut = (float *)calloc(size, sizeof(float));
  if (!FilesavingPower || !FilesavingTemp || !MatrixOut) {
    fatal("unable to allocate memory");
    // free(NULL) is a no-op, so partial allocations are released safely
    free(FilesavingTemp);
    free(FilesavingPower);
    free(MatrixOut);
    return;
  }
  printf("pyramidHeight: %d\ngridSize: [%d, %d]\nborder:[%d, "
         "%d]\nblockGrid:[%d, %d]\ntargetBlock:[%d, %d]\n",
         pyramid_height, grid_cols, grid_rows, borderCols, borderRows,
         blockCols, blockRows, smallBlockCol, smallBlockRow);
  readinput(FilesavingTemp, grid_rows, grid_cols, tfile);
  readinput(FilesavingPower, grid_rows, grid_cols, pfile);
  float *MatrixTemp[2] = {NULL, NULL}, *MatrixPower = NULL;
  const size_t bytes = sizeof(float) * size;
  // || short-circuits, so nothing is copied into a buffer that failed to
  // allocate
  bool device_ready =
      cudaMalloc((void **)&MatrixTemp[0], bytes) == cudaSuccess &&
      cudaMalloc((void **)&MatrixTemp[1], bytes) == cudaSuccess &&
      cudaMalloc((void **)&MatrixPower, bytes) == cudaSuccess &&
      cudaMemcpy(MatrixTemp[0], FilesavingTemp, bytes,
                 cudaMemcpyHostToDevice) == cudaSuccess &&
      cudaMemcpy(MatrixPower, FilesavingPower, bytes,
                 cudaMemcpyHostToDevice) == cudaSuccess;
  if (!device_ready) {
    fatal("unable to set up device buffers");
  } else {
    printf("Start computing the transient temperature\n");
    int ret = compute_tran_temp(MatrixPower, MatrixTemp, grid_cols, grid_rows,
                                total_iterations, pyramid_height, blockCols,
                                blockRows, borderCols, borderRows);
    printf("Ending simulation\n");
    if (cudaMemcpy(MatrixOut, MatrixTemp[ret], bytes,
                   cudaMemcpyDeviceToHost) == cudaSuccess)
      writeoutput(MatrixOut, grid_rows, grid_cols, ofile);
    else
      fatal("unable to copy the result back to the host");
  }
  // cudaFree(NULL) is a no-op, so this is safe after a partial set-up
  cudaFree(MatrixPower);
  cudaFree(MatrixTemp[0]);
  cudaFree(MatrixTemp[1]);
  free(FilesavingTemp);
  free(FilesavingPower);
  free(MatrixOut);
 }
--- a/examples/hotspot/run.sh
+++ b/examples/hotspot/run.sh
@ -1,21 +0,0 @@
 #!/bin/bash
 # Build the hotspot example from its LLVM IR and check its output.
 # Must be run from the example directory: all paths are relative to it.
 set -e
 # assemble the textual IR of the device and host modules into bitcode
 llvm-as hotspot-cuda-nvptx64-nvidia-cuda-sm_61.ll
 llvm-as hotspot-host-x86_64-unknown-linux-gnu.ll
 # run the project's translators over the device and host bitcode
 ../../build/compilation/kernelTranslator hotspot-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
 ../../build/compilation/hostTranslator hotspot-host-x86_64-unknown-linux-gnu.bc host.bc
 # compile both translated modules to position-independent objects and link
 llc --relocation-model=pic --filetype=obj  kernel.bc
 llc --relocation-model=pic --filetype=obj  host.bc
 g++ -Wall -L../../build/runtime  -L../../build/runtime/threadPool \
    -o hotspot -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread
 # Append the previous value only when it is non-empty: a trailing ':' would
 # add the current directory to the library search path.
 export LD_LIBRARY_PATH="../../build/runtime:../../build/runtime/threadPool${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
 ./hotspot 512 2 2 ../../rodinia-data/hotspot/temp_512 ../../rodinia-data/hotspot/power_512 output.out
 # -F matches the expected value literally; as a regex the '.' would match any
 # character and could accept a wrong result.
 if head output.out | grep -qF "323.829"; then
    echo "Pass"
 else
    echo "Error result"
    exit 1
 fi
--- a/examples/hotspot3D/3D-cuda-nvptx64-nvidia-cuda-sm_61.ll
+++ b/examples/hotspot3D/3D-cuda-nvptx64-nvidia-cuda-sm_61.ll
@ -1,587 +0,0 @@
 ; ModuleID = '3D-cuda-nvptx64-nvidia-cuda-sm_61.bc'
 ; Device-side module compiled from 3D.cu for the nvptx64-nvidia-cuda target.
 source_filename = "3D.cu"
 target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"
 ; Placeholder types behind the CUDA built-in variables blockDim, blockIdx and
 ; threadIdx; each is a single byte, the coordinates are obtained through the
 ; __fetch_builtin_* accessor functions rather than by loading fields.
 %struct.__cuda_builtin_blockDim_t = type { i8 }
 %struct.__cuda_builtin_blockIdx_t = type { i8 }
 %struct.__cuda_builtin_threadIdx_t = type { i8 }
 ; Out-parameter struct of the cudaFuncGetAttributes stub declared below.
 %struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
 ; COMDAT groups for the linkonce_odr x/y accessor functions of the built-ins.
 $_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv = comdat any
 $_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
 $_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
 $_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv = comdat any
 $_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv = comdat any
 $_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv = comdat any
 ; The built-in variables themselves: extern_weak globals in address space 1.
@blockDim = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockDim_t, align 1
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
 ; Function Attrs: convergent noinline nounwind optnone
 ; Weak device-side stub for cudaMalloc: it only spills both arguments to
 ; stack slots and unconditionally returns 999; nothing is allocated.
 ; NOTE(review): 999 presumably stands for a generic CUDA error code -- confirm.
 define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
 entry:
  %p.addr = alloca i8**, align 8
  %s.addr = alloca i64, align 8
  store i8** %p, i8*** %p.addr, align 8
  store i64 %s, i64* %s.addr, align 8
  ret i32 999
 }
 ; Function Attrs: convergent noinline nounwind optnone
 ; Weak device-side stub for cudaFuncGetAttributes: spills both arguments to
 ; stack slots and unconditionally returns 999; the attribute struct pointed to
 ; by %p is never written.
 define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
 entry:
  %p.addr = alloca %struct.cudaFuncAttributes*, align 8
  %c.addr = alloca i8*, align 8
  store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
  store i8* %c, i8** %c.addr, align 8
  ret i32 999
 }
 ; Function Attrs: convergent noinline nounwind optnone
 ; Weak device-side stub for cudaDeviceGetAttribute: spills its three arguments
 ; to stack slots and unconditionally returns 999; *%value is never written.
 define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
 entry:
  %value.addr = alloca i32*, align 8
  %attr.addr = alloca i32, align 4
  %device.addr = alloca i32, align 4
  store i32* %value, i32** %value.addr, align 8
  store i32 %attr, i32* %attr.addr, align 4
  store i32 %device, i32* %device.addr, align 4
  ret i32 999
 }
 ; Function Attrs: convergent noinline nounwind optnone
 ; Weak device-side stub for cudaGetDevice: spills the argument to a stack slot
 ; and unconditionally returns 999; *%device is never written.
 define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
 entry:
  %device.addr = alloca i32*, align 8
  store i32* %device, i32** %device.addr, align 8
  ret i32 999
 }
 ; Function Attrs: convergent noinline nounwind optnone
 ; Weak device-side stub for cudaOccupancyMaxActiveBlocksPerMultiprocessor:
 ; spills its four arguments to stack slots and unconditionally returns 999;
 ; *%numBlocks is never written.
 define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
 entry:
  %numBlocks.addr = alloca i32*, align 8
  %func.addr = alloca i8*, align 8
  %blockSize.addr = alloca i32, align 4
  %dynamicSmemSize.addr = alloca i64, align 8
  store i32* %numBlocks, i32** %numBlocks.addr, align 8
  store i8* %func, i8** %func.addr, align 8
  store i32 %blockSize, i32* %blockSize.addr, align 4
  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
  ret i32 999
 }
 ; Function Attrs: convergent noinline nounwind optnone
 ; Weak device-side stub for
 ; cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags: spills its five
 ; arguments to stack slots and unconditionally returns 999; *%numBlocks is
 ; never written.
 define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
 entry:
  %numBlocks.addr = alloca i32*, align 8
  %func.addr = alloca i8*, align 8
  %blockSize.addr = alloca i32, align 4
  %dynamicSmemSize.addr = alloca i64, align 8
  %flags.addr = alloca i32, align 4
  store i32* %numBlocks, i32** %numBlocks.addr, align 8
  store i8* %func, i8** %func.addr, align 8
  store i32 %blockSize, i32* %blockSize.addr, align 4
  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
  store i32 %flags, i32* %flags.addr, align 4
  ret i32 999
 }
 ; Function Attrs: convergent noinline nounwind optnone
 ; hotspotOpt1(p, tIn, tOut, sdc, nx, ny, nz, ce, cw, cn, cs, ct, cb, cc):
 ; unoptimized (optnone) code for a 3D 7-point stencil. Each thread owns one
 ; (i, j) column of the nx * ny * nz volume and walks it along z, keeping the
 ; values below / at / above the current cell in temp1 / temp2 / temp3.
 ; For every cell c it stores
 ;   tOut[c] = cc*temp2 + cw*tIn[W] + ce*tIn[E] + cs*tIn[S] + cn*tIn[N]
 ;           + cb*temp1 + ct*temp3 + sdc*p[c] + ct*amb_temp
 ; where W/E/N/S are the in-plane neighbour indices, replaced by c itself on
 ; the corresponding face of the volume.
 ; There is no bounds check on i or j against nx / ny in this function.
 define dso_local void @_Z11hotspotOpt1PfS_S_fiiifffffff(float* %p, float* %tIn, float* %tOut, float %sdc, i32 %nx, i32 %ny, i32 %nz, float %ce, float %cw, float %cn, float %cs, float %ct, float %cb, float %cc) #0 {
 entry:
  ; stack slots for the arguments and for every local variable
  %p.addr = alloca float*, align 8
  %tIn.addr = alloca float*, align 8
  %tOut.addr = alloca float*, align 8
  %sdc.addr = alloca float, align 4
  %nx.addr = alloca i32, align 4
  %ny.addr = alloca i32, align 4
  %nz.addr = alloca i32, align 4
  %ce.addr = alloca float, align 4
  %cw.addr = alloca float, align 4
  %cn.addr = alloca float, align 4
  %cs.addr = alloca float, align 4
  %ct.addr = alloca float, align 4
  %cb.addr = alloca float, align 4
  %cc.addr = alloca float, align 4
  %amb_temp = alloca float, align 4
  %i = alloca i32, align 4
  %j = alloca i32, align 4
  %c = alloca i32, align 4
  %xy = alloca i32, align 4
  %W = alloca i32, align 4
  %E = alloca i32, align 4
  %N = alloca i32, align 4
  %S = alloca i32, align 4
  %temp1 = alloca float, align 4
  %temp2 = alloca float, align 4
  %temp3 = alloca float, align 4
  %k = alloca i32, align 4
  store float* %p, float** %p.addr, align 8
  store float* %tIn, float** %tIn.addr, align 8
  store float* %tOut, float** %tOut.addr, align 8
  store float %sdc, float* %sdc.addr, align 4
  store i32 %nx, i32* %nx.addr, align 4
  store i32 %ny, i32* %ny.addr, align 4
  store i32 %nz, i32* %nz.addr, align 4
  store float %ce, float* %ce.addr, align 4
  store float %cw, float* %cw.addr, align 4
  store float %cn, float* %cn.addr, align 4
  store float %cs, float* %cs.addr, align 4
  store float %ct, float* %ct.addr, align 4
  store float %cb, float* %cb.addr, align 4
  store float %cc, float* %cc.addr, align 4
  ; amb_temp = 80.0
  store float 8.000000e+01, float* %amb_temp, align 4
  ; i = blockDim.x * blockIdx.x + threadIdx.x
  %call = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
  %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
  %mul = mul i32 %call, %call1
  %call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
  %add = add i32 %mul, %call2
  store i32 %add, i32* %i, align 4
  ; j = blockDim.y * blockIdx.y + threadIdx.y
  %call3 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #3
  %call4 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #3
  %mul5 = mul i32 %call3, %call4
  %call6 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #3
  %add7 = add i32 %mul5, %call6
  store i32 %add7, i32* %j, align 4
  ; c = i + j * nx  (flat index of the cell in the first z-plane)
  %0 = load i32, i32* %i, align 4
  %1 = load i32, i32* %j, align 4
  %2 = load i32, i32* %nx.addr, align 4
  %mul8 = mul nsw i32 %1, %2
  %add9 = add nsw i32 %0, %mul8
  store i32 %add9, i32* %c, align 4
  ; xy = nx * ny  (number of cells per z-plane)
  %3 = load i32, i32* %nx.addr, align 4
  %4 = load i32, i32* %ny.addr, align 4
  %mul10 = mul nsw i32 %3, %4
  store i32 %mul10, i32* %xy, align 4
  ; W = (i == 0) ? c : c - 1
  %5 = load i32, i32* %i, align 4
  %cmp = icmp eq i32 %5, 0
  br i1 %cmp, label %cond.true, label %cond.false
 cond.true:                                        ; preds = %entry
  %6 = load i32, i32* %c, align 4
  br label %cond.end
 cond.false:                                       ; preds = %entry
  %7 = load i32, i32* %c, align 4
  %sub = sub nsw i32 %7, 1
  br label %cond.end
 cond.end:                                         ; preds = %cond.false, %cond.true
  %cond = phi i32 [ %6, %cond.true ], [ %sub, %cond.false ]
  store i32 %cond, i32* %W, align 4
  ; E = (i == nx - 1) ? c : c + 1
  %8 = load i32, i32* %i, align 4
  %9 = load i32, i32* %nx.addr, align 4
  %sub11 = sub nsw i32 %9, 1
  %cmp12 = icmp eq i32 %8, %sub11
  br i1 %cmp12, label %cond.true13, label %cond.false14
 cond.true13:                                      ; preds = %cond.end
  %10 = load i32, i32* %c, align 4
  br label %cond.end16
 cond.false14:                                     ; preds = %cond.end
  %11 = load i32, i32* %c, align 4
  %add15 = add nsw i32 %11, 1
  br label %cond.end16
 cond.end16:                                       ; preds = %cond.false14, %cond.true13
  %cond17 = phi i32 [ %10, %cond.true13 ], [ %add15, %cond.false14 ]
  store i32 %cond17, i32* %E, align 4
  ; N = (j == 0) ? c : c - nx
  %12 = load i32, i32* %j, align 4
  %cmp18 = icmp eq i32 %12, 0
  br i1 %cmp18, label %cond.true19, label %cond.false20
 cond.true19:                                      ; preds = %cond.end16
  %13 = load i32, i32* %c, align 4
  br label %cond.end22
 cond.false20:                                     ; preds = %cond.end16
  %14 = load i32, i32* %c, align 4
  %15 = load i32, i32* %nx.addr, align 4
  %sub21 = sub nsw i32 %14, %15
  br label %cond.end22
 cond.end22:                                       ; preds = %cond.false20, %cond.true19
  %cond23 = phi i32 [ %13, %cond.true19 ], [ %sub21, %cond.false20 ]
  store i32 %cond23, i32* %N, align 4
  ; S = (j == ny - 1) ? c : c + nx
  %16 = load i32, i32* %j, align 4
  %17 = load i32, i32* %ny.addr, align 4
  %sub24 = sub nsw i32 %17, 1
  %cmp25 = icmp eq i32 %16, %sub24
  br i1 %cmp25, label %cond.true26, label %cond.false27
 cond.true26:                                      ; preds = %cond.end22
  %18 = load i32, i32* %c, align 4
  br label %cond.end29
 cond.false27:                                     ; preds = %cond.end22
  %19 = load i32, i32* %c, align 4
  %20 = load i32, i32* %nx.addr, align 4
  %add28 = add nsw i32 %19, %20
  br label %cond.end29
 cond.end29:                                       ; preds = %cond.false27, %cond.true26
  %cond30 = phi i32 [ %18, %cond.true26 ], [ %add28, %cond.false27 ]
  store i32 %cond30, i32* %S, align 4
  ; First z-plane: there is no plane below, so temp1 = temp2 = tIn[c];
  ; temp3 = tIn[c + xy] is the cell one plane up.
  %21 = load float*, float** %tIn.addr, align 8
  %22 = load i32, i32* %c, align 4
  %idxprom = sext i32 %22 to i64
  %arrayidx = getelementptr inbounds float, float* %21, i64 %idxprom
  %23 = load float, float* %arrayidx, align 4
  store float %23, float* %temp2, align 4
  store float %23, float* %temp1, align 4
  %24 = load float*, float** %tIn.addr, align 8
  %25 = load i32, i32* %c, align 4
  %26 = load i32, i32* %xy, align 4
  %add31 = add nsw i32 %25, %26
  %idxprom32 = sext i32 %add31 to i64
  %arrayidx33 = getelementptr inbounds float, float* %24, i64 %idxprom32
  %27 = load float, float* %arrayidx33, align 4
  store float %27, float* %temp3, align 4
  ; tOut[c] for the first plane (stencil formula from the function header)
  %28 = load float, float* %cc.addr, align 4
  %29 = load float, float* %temp2, align 4
  %mul34 = fmul contract float %28, %29
  %30 = load float, float* %cw.addr, align 4
  %31 = load float*, float** %tIn.addr, align 8
  %32 = load i32, i32* %W, align 4
  %idxprom35 = sext i32 %32 to i64
  %arrayidx36 = getelementptr inbounds float, float* %31, i64 %idxprom35
  %33 = load float, float* %arrayidx36, align 4
  %mul37 = fmul contract float %30, %33
  %add38 = fadd contract float %mul34, %mul37
  %34 = load float, float* %ce.addr, align 4
  %35 = load float*, float** %tIn.addr, align 8
  %36 = load i32, i32* %E, align 4
  %idxprom39 = sext i32 %36 to i64
  %arrayidx40 = getelementptr inbounds float, float* %35, i64 %idxprom39
  %37 = load float, float* %arrayidx40, align 4
  %mul41 = fmul contract float %34, %37
  %add42 = fadd contract float %add38, %mul41
  %38 = load float, float* %cs.addr, align 4
  %39 = load float*, float** %tIn.addr, align 8
  %40 = load i32, i32* %S, align 4
  %idxprom43 = sext i32 %40 to i64
  %arrayidx44 = getelementptr inbounds float, float* %39, i64 %idxprom43
  %41 = load float, float* %arrayidx44, align 4
  %mul45 = fmul contract float %38, %41
  %add46 = fadd contract float %add42, %mul45
  %42 = load float, float* %cn.addr, align 4
  %43 = load float*, float** %tIn.addr, align 8
  %44 = load i32, i32* %N, align 4
  %idxprom47 = sext i32 %44 to i64
  %arrayidx48 = getelementptr inbounds float, float* %43, i64 %idxprom47
  %45 = load float, float* %arrayidx48, align 4
  %mul49 = fmul contract float %42, %45
  %add50 = fadd contract float %add46, %mul49
  %46 = load float, float* %cb.addr, align 4
  %47 = load float, float* %temp1, align 4
  %mul51 = fmul contract float %46, %47
  %add52 = fadd contract float %add50, %mul51
  %48 = load float, float* %ct.addr, align 4
  %49 = load float, float* %temp3, align 4
  %mul53 = fmul contract float %48, %49
  %add54 = fadd contract float %add52, %mul53
  %50 = load float, float* %sdc.addr, align 4
  %51 = load float*, float** %p.addr, align 8
  %52 = load i32, i32* %c, align 4
  %idxprom55 = sext i32 %52 to i64
  %arrayidx56 = getelementptr inbounds float, float* %51, i64 %idxprom55
  %53 = load float, float* %arrayidx56, align 4
  %mul57 = fmul contract float %50, %53
  %add58 = fadd contract float %add54, %mul57
  %54 = load float, float* %ct.addr, align 4
  %55 = load float, float* %amb_temp, align 4
  %mul59 = fmul contract float %54, %55
  %add60 = fadd contract float %add58, %mul59
  %56 = load float*, float** %tOut.addr, align 8
  %57 = load i32, i32* %c, align 4
  %idxprom61 = sext i32 %57 to i64
  %arrayidx62 = getelementptr inbounds float, float* %56, i64 %idxprom61
  store float %add60, float* %arrayidx62, align 4
  ; advance c, W, E, N, S by one z-plane (+= xy)
  %58 = load i32, i32* %xy, align 4
  %59 = load i32, i32* %c, align 4
  %add63 = add nsw i32 %59, %58
  store i32 %add63, i32* %c, align 4
  %60 = load i32, i32* %xy, align 4
  %61 = load i32, i32* %W, align 4
  %add64 = add nsw i32 %61, %60
  store i32 %add64, i32* %W, align 4
  %62 = load i32, i32* %xy, align 4
  %63 = load i32, i32* %E, align 4
  %add65 = add nsw i32 %63, %62
  store i32 %add65, i32* %E, align 4
  %64 = load i32, i32* %xy, align 4
  %65 = load i32, i32* %N, align 4
  %add66 = add nsw i32 %65, %64
  store i32 %add66, i32* %N, align 4
  %66 = load i32, i32* %xy, align 4
  %67 = load i32, i32* %S, align 4
  %add67 = add nsw i32 %67, %66
  store i32 %add67, i32* %S, align 4
  ; interior planes: for (k = 1; k < nz - 1; ++k)
  store i32 1, i32* %k, align 4
  br label %for.cond
 for.cond:                                         ; preds = %for.inc, %cond.end29
  %68 = load i32, i32* %k, align 4
  %69 = load i32, i32* %nz.addr, align 4
  %sub68 = sub nsw i32 %69, 1
  %cmp69 = icmp slt i32 %68, %sub68
  br i1 %cmp69, label %for.body, label %for.end
 for.body:                                         ; preds = %for.cond
  ; slide the z window: temp1 = temp2; temp2 = temp3; temp3 = tIn[c + xy]
  %70 = load float, float* %temp2, align 4
  store float %70, float* %temp1, align 4
  %71 = load float, float* %temp3, align 4
  store float %71, float* %temp2, align 4
  %72 = load float*, float** %tIn.addr, align 8
  %73 = load i32, i32* %c, align 4
  %74 = load i32, i32* %xy, align 4
  %add70 = add nsw i32 %73, %74
  %idxprom71 = sext i32 %add70 to i64
  %arrayidx72 = getelementptr inbounds float, float* %72, i64 %idxprom71
  %75 = load float, float* %arrayidx72, align 4
  store float %75, float* %temp3, align 4
  ; tOut[c] for this plane (same formula as for the first plane)
  %76 = load float, float* %cc.addr, align 4
  %77 = load float, float* %temp2, align 4
  %mul73 = fmul contract float %76, %77
  %78 = load float, float* %cw.addr, align 4
  %79 = load float*, float** %tIn.addr, align 8
  %80 = load i32, i32* %W, align 4
  %idxprom74 = sext i32 %80 to i64
  %arrayidx75 = getelementptr inbounds float, float* %79, i64 %idxprom74
  %81 = load float, float* %arrayidx75, align 4
  %mul76 = fmul contract float %78, %81
  %add77 = fadd contract float %mul73, %mul76
  %82 = load float, float* %ce.addr, align 4
  %83 = load float*, float** %tIn.addr, align 8
  %84 = load i32, i32* %E, align 4
  %idxprom78 = sext i32 %84 to i64
  %arrayidx79 = getelementptr inbounds float, float* %83, i64 %idxprom78
  %85 = load float, float* %arrayidx79, align 4
  %mul80 = fmul contract float %82, %85
  %add81 = fadd contract float %add77, %mul80
  %86 = load float, float* %cs.addr, align 4
  %87 = load float*, float** %tIn.addr, align 8
  %88 = load i32, i32* %S, align 4
  %idxprom82 = sext i32 %88 to i64
  %arrayidx83 = getelementptr inbounds float, float* %87, i64 %idxprom82
  %89 = load float, float* %arrayidx83, align 4
  %mul84 = fmul contract float %86, %89
  %add85 = fadd contract float %add81, %mul84
  %90 = load float, float* %cn.addr, align 4
  %91 = load float*, float** %tIn.addr, align 8
  %92 = load i32, i32* %N, align 4
  %idxprom86 = sext i32 %92 to i64
  %arrayidx87 = getelementptr inbounds float, float* %91, i64 %idxprom86
  %93 = load float, float* %arrayidx87, align 4
  %mul88 = fmul contract float %90, %93
  %add89 = fadd contract float %add85, %mul88
  %94 = load float, float* %cb.addr, align 4
  %95 = load float, float* %temp1, align 4
  %mul90 = fmul contract float %94, %95
  %add91 = fadd contract float %add89, %mul90
  %96 = load float, float* %ct.addr, align 4
  %97 = load float, float* %temp3, align 4
  %mul92 = fmul contract float %96, %97
  %add93 = fadd contract float %add91, %mul92
  %98 = load float, float* %sdc.addr, align 4
  %99 = load float*, float** %p.addr, align 8
  %100 = load i32, i32* %c, align 4
  %idxprom94 = sext i32 %100 to i64
  %arrayidx95 = getelementptr inbounds float, float* %99, i64 %idxprom94
  %101 = load float, float* %arrayidx95, align 4
  %mul96 = fmul contract float %98, %101
  %add97 = fadd contract float %add93, %mul96
  %102 = load float, float* %ct.addr, align 4
  %103 = load float, float* %amb_temp, align 4
  %mul98 = fmul contract float %102, %103
  %add99 = fadd contract float %add97, %mul98
  %104 = load float*, float** %tOut.addr, align 8
  %105 = load i32, i32* %c, align 4
  %idxprom100 = sext i32 %105 to i64
  %arrayidx101 = getelementptr inbounds float, float* %104, i64 %idxprom100
  store float %add99, float* %arrayidx101, align 4
  ; advance c, W, E, N, S by one z-plane (+= xy)
  %106 = load i32, i32* %xy, align 4
  %107 = load i32, i32* %c, align 4
  %add102 = add nsw i32 %107, %106
  store i32 %add102, i32* %c, align 4
  %108 = load i32, i32* %xy, align 4
  %109 = load i32, i32* %W, align 4
  %add103 = add nsw i32 %109, %108
  store i32 %add103, i32* %W, align 4
  %110 = load i32, i32* %xy, align 4
  %111 = load i32, i32* %E, align 4
  %add104 = add nsw i32 %111, %110
  store i32 %add104, i32* %E, align 4
  %112 = load i32, i32* %xy, align 4
  %113 = load i32, i32* %N, align 4
  %add105 = add nsw i32 %113, %112
  store i32 %add105, i32* %N, align 4
  %114 = load i32, i32* %xy, align 4
  %115 = load i32, i32* %S, align 4
  %add106 = add nsw i32 %115, %114
  store i32 %add106, i32* %S, align 4
  br label %for.inc
 for.inc:                                          ; preds = %for.body
  ; ++k
  %116 = load i32, i32* %k, align 4
  %inc = add nsw i32 %116, 1
  store i32 %inc, i32* %k, align 4
  br label %for.cond
 for.end:                                          ; preds = %for.cond
  ; Last z-plane: temp1 = temp2; temp2 = temp3. temp3 is not reloaded, so
  ; the "above" term reuses the value of the current cell.
  %117 = load float, float* %temp2, align 4
  store float %117, float* %temp1, align 4
  %118 = load float, float* %temp3, align 4
  store float %118, float* %temp2, align 4
  ; tOut[c] for the last plane (same formula)
  %119 = load float, float* %cc.addr, align 4
  %120 = load float, float* %temp2, align 4
  %mul107 = fmul contract float %119, %120
  %121 = load float, float* %cw.addr, align 4
  %122 = load float*, float** %tIn.addr, align 8
  %123 = load i32, i32* %W, align 4
  %idxprom108 = sext i32 %123 to i64
  %arrayidx109 = getelementptr inbounds float, float* %122, i64 %idxprom108
  %124 = load float, float* %arrayidx109, align 4
  %mul110 = fmul contract float %121, %124
  %add111 = fadd contract float %mul107, %mul110
  %125 = load float, float* %ce.addr, align 4
  %126 = load float*, float** %tIn.addr, align 8
  %127 = load i32, i32* %E, align 4
  %idxprom112 = sext i32 %127 to i64
  %arrayidx113 = getelementptr inbounds float, float* %126, i64 %idxprom112
  %128 = load float, float* %arrayidx113, align 4
  %mul114 = fmul contract float %125, %128
  %add115 = fadd contract float %add111, %mul114
  %129 = load float, float* %cs.addr, align 4
  %130 = load float*, float** %tIn.addr, align 8
  %131 = load i32, i32* %S, align 4
  %idxprom116 = sext i32 %131 to i64
  %arrayidx117 = getelementptr inbounds float, float* %130, i64 %idxprom116
  %132 = load float, float* %arrayidx117, align 4
  %mul118 = fmul contract float %129, %132
  %add119 = fadd contract float %add115, %mul118
  %133 = load float, float* %cn.addr, align 4
  %134 = load float*, float** %tIn.addr, align 8
  %135 = load i32, i32* %N, align 4
  %idxprom120 = sext i32 %135 to i64
  %arrayidx121 = getelementptr inbounds float, float* %134, i64 %idxprom120
  %136 = load float, float* %arrayidx121, align 4
  %mul122 = fmul contract float %133, %136
  %add123 = fadd contract float %add119, %mul122
  %137 = load float, float* %cb.addr, align 4
  %138 = load float, float* %temp1, align 4
  %mul124 = fmul contract float %137, %138
  %add125 = fadd contract float %add123, %mul124
  %139 = load float, float* %ct.addr, align 4
  %140 = load float, float* %temp3, align 4
  %mul126 = fmul contract float %139, %140
  %add127 = fadd contract float %add125, %mul126
  %141 = load float, float* %sdc.addr, align 4
  %142 = load float*, float** %p.addr, align 8
  %143 = load i32, i32* %c, align 4
  %idxprom128 = sext i32 %143 to i64
  %arrayidx129 = getelementptr inbounds float, float* %142, i64 %idxprom128
  %144 = load float, float* %arrayidx129, align 4
  %mul130 = fmul contract float %141, %144
  %add131 = fadd contract float %add127, %mul130
  %145 = load float, float* %ct.addr, align 4
  %146 = load float, float* %amb_temp, align 4
  %mul132 = fmul contract float %145, %146
  %add133 = fadd contract float %add131, %mul132
  %147 = load float*, float** %tOut.addr, align 8
  %148 = load i32, i32* %c, align 4
  %idxprom134 = sext i32 %148 to i64
  %arrayidx135 = getelementptr inbounds float, float* %147, i64 %idxprom134
  store float %add133, float* %arrayidx135, align 4
  ret void
 }
 ; Function Attrs: alwaysinline convergent nounwind
 ; __cuda_builtin_blockDim_t::__fetch_builtin_x(): takes no arguments and
 ; returns the i32 produced by the llvm.nvvm.read.ptx.sreg.ntid.x intrinsic
 ; (read of the PTX special register ntid.x).
 define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #1 comdat align 2 {
 entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
  ret i32 %0
 }
 ; Function Attrs: alwaysinline convergent nounwind
 ; __cuda_builtin_blockIdx_t::__fetch_builtin_x(): takes no arguments and
 ; returns the i32 produced by the llvm.nvvm.read.ptx.sreg.ctaid.x intrinsic
 ; (read of the PTX special register ctaid.x).
 define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
 entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
  ret i32 %0
 }
 ; Function Attrs: alwaysinline convergent nounwind
 ; __cuda_builtin_threadIdx_t::__fetch_builtin_x(): takes no arguments and
 ; returns the i32 produced by the llvm.nvvm.read.ptx.sreg.tid.x intrinsic
 ; (read of the PTX special register tid.x).
 define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
 entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  ret i32 %0
 }
 ; Function Attrs: alwaysinline convergent nounwind
 ; __cuda_builtin_blockDim_t::__fetch_builtin_y(): takes no arguments and
 ; returns the i32 produced by the llvm.nvvm.read.ptx.sreg.ntid.y intrinsic
 ; (read of the PTX special register ntid.y).
 define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #1 comdat align 2 {
 entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
  ret i32 %0
 }
 ; Function Attrs: alwaysinline convergent nounwind
 ; __cuda_builtin_blockIdx_t::__fetch_builtin_y(): takes no arguments and
 ; returns the i32 produced by the llvm.nvvm.read.ptx.sreg.ctaid.y intrinsic
 ; (read of the PTX special register ctaid.y).
 define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
 entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
  ret i32 %0
 }
 ; Function Attrs: alwaysinline convergent nounwind
 ; __cuda_builtin_threadIdx_t::__fetch_builtin_y(): takes no arguments and
 ; returns the i32 produced by the llvm.nvvm.read.ptx.sreg.tid.y intrinsic
 ; (read of the PTX special register tid.y).
 define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
 entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
  ret i32 %0
 }
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y() #2
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #2
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #2
 attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #2 = { nounwind readnone }
 attributes #3 = { convergent nounwind }
 !llvm.module.flags = !{!0, !1, !2}
 !nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
 !llvm.ident = !{!8}
 !nvvmir.version = !{!9}
 !0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
 !1 = !{i32 1, !"wchar_size", i32 4}
 !2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
 !3 = !{void (float*, float*, float*, float, i32, i32, i32, float, float, float, float, float, float, float)* @_Z11hotspotOpt1PfS_S_fiiifffffff, !"kernel", i32 1}
 !4 = !{null, !"align", i32 8}
 !5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
 !6 = !{null, !"align", i32 16}
 !7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
 !8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
 !9 = !{i32 1, i32 4}
--- a/examples/hotspot3D/3D-host-x86_64-unknown-linux-gnu.ll
+++ b/examples/hotspot3D/3D-host-x86_64-unknown-linux-gnu.ll
--- a/examples/hotspot3D/3D.cu
+++ b/examples/hotspot3D/3D.cu
@ -1,205 +0,0 @@
 #include <assert.h>
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <sys/time.h>
 #include <time.h>
 #define BLOCK_SIZE 16
 #define STR_SIZE 256
 #define block_x_ 128
 #define block_y_ 2
 #define block_z_ 1
 #define MAX_PD (3.0e6)
 /* required precision in degrees	*/
 #define PRECISION 0.001
 #define SPEC_HEAT_SI 1.75e6
 #define K_SI 100
 /* capacitance fitting factor	*/
 #define FACTOR_CHIP 0.5
 #include "opt1.cu"
 /* chip parameters	*/
 float t_chip = 0.0005;
 float chip_height = 0.016;
 float chip_width = 0.016; /* ambient temperature, assuming no package at all
                           */
 float amb_temp = 80.0;
 /* Print "Error: <s>" followed by a newline on stderr.
  * Despite the name, this routine returns to its caller; it does not
  * terminate the program. */
 void fatal(const char *s) {
  fprintf(stderr, "Error: %s\n", s);
 }
 /* Read grid_rows * grid_cols * layers values from the text file `file` into
  * vect, one "%f" value per line.  Lines are consumed in row / column / layer
  * loop order and stored at vect[i * grid_cols + j + k * grid_rows * grid_cols].
  *
  * fatal() only prints a message and returns, so each failure path below stops
  * reading explicitly instead of using a NULL stream or parsing a stale line
  * buffer.  On failure the elements not yet read keep their previous contents.
  */
 void readinput(float *vect, int grid_rows, int grid_cols, int layers,
                char *file) {
  int i, j, k;
  FILE *fp;
  char str[STR_SIZE];
  float val;
  if ((fp = fopen(file, "r")) == 0) {
    fatal("The file was not opened");
    return;
  }
  for (i = 0; i <= grid_rows - 1; i++)
    for (j = 0; j <= grid_cols - 1; j++)
      for (k = 0; k <= layers - 1; k++) {
        if (fgets(str, STR_SIZE, fp) == NULL) {
          /* NULL means end-of-file or a read error; report which one.  A
           * final line without a trailing newline is still returned by
           * fgets(), so it is no longer misreported as a short file. */
          if (feof(fp))
            fatal("not enough lines in file");
          else
            fatal("Error reading file\n");
          fclose(fp);
          return;
        }
        if ((sscanf(str, "%f", &val) != 1)) {
          /* Do not store an unparsed (stale or uninitialised) value. */
          fatal("invalid file format");
          fclose(fp);
          return;
        }
        vect[i * grid_cols + j + k * grid_rows * grid_cols] = val;
      }
  fclose(fp);
 }
 /* Write the grid to the text file `file`, one "<index>\t<value>\n" line per
  * cell.  Cells are visited in row / column / layer loop order, reading
  * vect[i * grid_cols + j + k * grid_rows * grid_cols]; <index> is a running
  * counter starting at 0.
  *
  * If the file cannot be opened a message is printed on stdout and nothing is
  * written (previously the NULL stream was passed on to fputs()/fclose()). */
 void writeoutput(float *vect, int grid_rows, int grid_cols, int layers,
                  char *file) {
  int i, j, k, index = 0;
  FILE *fp = fopen(file, "w");
  if (fp == NULL) {
    printf("The file was not opened\n");
    return;
  }
  for (i = 0; i < grid_rows; i++)
    for (j = 0; j < grid_cols; j++)
      for (k = 0; k < layers; k++) {
        /* Format straight into the stream; no scratch buffer is needed. */
        fprintf(fp, "%d\t%g\n", index,
                vect[i * grid_cols + j + k * grid_rows * grid_cols]);
        index++;
      }
  fclose(fp);
 }
 /* CPU reference stencil over an nx * ny * nz grid stored with x fastest, then
  * y, then z.  Each pass computes, for every cell, a weighted sum of the cell
  * itself, its six axis neighbours (a missing neighbour at a grid face is
  * replaced by the cell itself), the power term pIn[c] and the file-level
  * amb_temp.
  *
  * The pass count is numiter, but at least one pass always runs.  The local
  * tIn/tOut pointers are exchanged after every pass, so the last pass writes
  * into the caller's tOut buffer when the pass count is odd and into the
  * caller's tIn buffer when it is even. */
 void computeTempCPU(float *pIn, float *tIn, float *tOut, int nx, int ny, int nz,
                     float Cap, float Rx, float Ry, float Rz, float dt,
                     int numiter) {
  const float stepDivCap = dt / Cap;
  const float ce = stepDivCap / Rx;
  const float cw = ce;
  const float cn = stepDivCap / Ry;
  const float cs = cn;
  const float ct = stepDivCap / Rz;
  const float cb = ct;
  const float cc = 1.0 - (2.0 * ce + 2.0 * cn + 3.0 * ct);
  int pass = 0;
  do {
    for (int z = 0; z < nz; z++) {
      for (int y = 0; y < ny; y++) {
        for (int x = 0; x < nx; x++) {
          const int c = x + y * nx + z * nx * ny;
          /* Start with every neighbour clamped to the centre cell, then move
           * the ones that actually exist. */
          int w = c, e = c, n = c, s = c, b = c, t = c;
          if (x != 0)
            w = c - 1;
          if (x != nx - 1)
            e = c + 1;
          if (y != 0)
            n = c - nx;
          if (y != ny - 1)
            s = c + nx;
          if (z != 0)
            b = c - nx * ny;
          if (z != nz - 1)
            t = c + nx * ny;
          tOut[c] = tIn[c] * cc + tIn[n] * cn + tIn[s] * cs + tIn[e] * ce +
                    tIn[w] * cw + tIn[t] * ct + tIn[b] * cb +
                    (dt / Cap) * pIn[c] + ct * amb_temp;
        }
      }
    }
    /* The freshly written buffer becomes the input of the next pass. */
    float *previous = tIn;
    tIn = tOut;
    tOut = previous;
    pass++;
  } while (pass < numiter);
 }
 /* Root-mean-square difference between the first len elements of arr1 and
  * arr2: sqrt(sum((arr1[i] - arr2[i])^2) / len). */
 float accuracy(float *arr1, float *arr2, int len) {
  float sumSq = 0.0;
  for (int idx = 0; idx < len; ++idx) {
    float diff = arr1[idx] - arr2[idx];
    sumSq += diff * diff;
  }
  float meanSq = sumSq / len;
  return (float)sqrt(meanSq);
 }
 /* Print the command-line synopsis on stderr and terminate with exit status 1.
  * argv[0] is used as the program name; argc is accepted for signature
  * compatibility and is not otherwise used. */
 void usage(int argc, char **argv) {
  (void)argc;
  fprintf(stderr,
          "Usage: %s <rows/cols> <layers> <iterations> <powerFile> <tempFile> "
          "<outputFile>\n",
          argv[0]);
  fprintf(
      stderr,
      "\t<rows/cols>  - number of rows/cols in the grid (positive integer)\n");
  fprintf(stderr,
          "\t<layers>  - number of layers in the grid (positive integer)\n");
  /* Placeholder names below now match the synopsis line exactly. */
  fprintf(stderr, "\t<iterations> - number of iterations\n");
  fprintf(stderr, "\t<powerFile>  - name of the file containing the initial "
                  "power values of each cell\n");
  fprintf(stderr, "\t<tempFile>  - name of the file containing the initial "
                  "temperature values of each cell\n");
  fprintf(stderr, "\t<outputFile> - output file\n");
  exit(1);
 }
 /* Program entry point.
  * Arguments: <rows/cols> <layers> <iterations> <powerFile> <tempFile>
  * <outputFile>.  Reads the power and temperature grids, runs hotspot_opt1()
  * and the CPU reference computeTempCPU() on the same input, prints the
  * accuracy() of tempOut against the reference buffer, and writes tempOut to
  * the output file.
  * Returns 0 on success and 1 if the working buffers cannot be allocated;
  * invalid arguments end in usage(), which exits with status 1. */
 int main(int argc, char **argv) {
  cudaSetDevice(0);
  if (argc != 7) {
    usage(argc, argv);
  }
  char *pfile, *tfile, *ofile;
  int iterations = atoi(argv[3]);
  pfile = argv[4];
  tfile = argv[5];
  ofile = argv[6];
  int numCols = atoi(argv[1]);
  int numRows = atoi(argv[1]);
  int layers = atoi(argv[2]);
  /* atoi() yields 0 for non-numeric text; a non-positive extent would give
   * divisions by zero below and zero/negative-sized buffers. */
  if (numRows <= 0 || layers <= 0) {
    usage(argc, argv);
  }
  /* calculating parameters*/
  float dx = chip_height / numRows;
  float dy = chip_width / numCols;
  float dz = t_chip / layers;
  float Cap = FACTOR_CHIP * SPEC_HEAT_SI * t_chip * dx * dy;
  float Rx = dy / (2.0 * K_SI * t_chip * dx);
  float Ry = dx / (2.0 * K_SI * t_chip * dy);
  float Rz = dz / (K_SI * dx * dy);
  float max_slope = MAX_PD / (FACTOR_CHIP * t_chip * SPEC_HEAT_SI);
  float dt = PRECISION / max_slope;
  float *powerIn, *tempOut, *tempIn, *tempCopy;
  int size = numCols * numRows * layers;
  powerIn = (float *)calloc(size, sizeof(float));
  tempCopy = (float *)malloc(size * sizeof(float));
  tempIn = (float *)calloc(size, sizeof(float));
  tempOut = (float *)calloc(size, sizeof(float));
  float *answer = (float *)calloc(size, sizeof(float));
  if (powerIn == NULL || tempCopy == NULL || tempIn == NULL ||
      tempOut == NULL || answer == NULL) {
    fatal("unable to allocate memory");
    /* free(NULL) is a no-op, so releasing all five is safe. */
    free(answer);
    free(tempOut);
    free(tempIn);
    free(tempCopy);
    free(powerIn);
    return 1;
  }
  readinput(powerIn, numRows, numCols, layers, pfile);
  readinput(tempIn, numRows, numCols, layers, tfile);
  /* Keep an untouched copy of the initial temperatures for the CPU run. */
  memcpy(tempCopy, tempIn, size * sizeof(float));
  hotspot_opt1(powerIn, tempIn, tempOut, numCols, numRows, layers, Cap, Rx, Ry,
               Rz, dt, iterations);
  computeTempCPU(powerIn, tempCopy, answer, numCols, numRows, layers, Cap, Rx,
                 Ry, Rz, dt, iterations);
  float acc = accuracy(tempOut, answer, numRows * numCols * layers);
  printf("Accuracy: %e\n", acc);
  writeoutput(tempOut, numRows, numCols, layers, ofile);
  free(tempIn);
  free(tempOut);
  free(powerIn);
  /* These two buffers were previously leaked. */
  free(tempCopy);
  free(answer);
  return 0;
 }
--- a/examples/hotspot3D/run.sh
+++ b/examples/hotspot3D/run.sh
@ -1,22 +0,0 @@
 # # #!/bin/bash
 # Build and run the hotspot3D example, then check its output.
 # Abort on the first failing command.
 set -e
 # Assemble the device and host LLVM IR text files (.ll) into bitcode (.bc).
 llvm-as 3D-cuda-nvptx64-nvidia-cuda-sm_61.ll
 llvm-as 3D-host-x86_64-unknown-linux-gnu.ll
 # Run the project's translators: device bitcode -> kernel.bc, host bitcode -> host.bc.
 ../../build/compilation/kernelTranslator 3D-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
 ../../build/compilation/hostTranslator 3D-host-x86_64-unknown-linux-gnu.bc host.bc
 # Compile both translated modules to position-independent object files.
 llc --relocation-model=pic --filetype=obj  kernel.bc
 llc --relocation-model=pic --filetype=obj  host.bc
 # Link against the runtime and thread-pool libraries from ../../build/runtime.
 g++ -g -Wall -L../../build/runtime  -L../../build/runtime/threadPool -o 3D \
    -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread
 # Make the shared runtime libraries visible to the dynamic loader.
 export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH
 # Arguments: <rows/cols> <layers> <iterations> <powerFile> <tempFile> <outputFile>
 ./3D 512 8 100 ../../rodinia-data/hotspot3D/power_512x8 ../../rodinia-data/hotspot3D/temp_512x8 output.out
 # Pass if the first lines of the output (head default) contain "334.017".
 if head output.out | grep -q "334.017"; then
    echo "Pass"
 else
    echo "Error result"
    exit 1
 fi
--- a/examples/huffman/comparison_helpers.h
+++ b/examples/huffman/comparison_helpers.h
@ -1,24 +0,0 @@
 #ifndef _COMPARISON_HELPERS_H_
 #define _COMPARISON_HELPERS_H_
 #include <stdio.h>
 // Compare the first `size` elements of data1 and data2 element by element.
 // Every mismatch is reported on stdout.  Returns 0 when all elements match;
 // on any mismatch the process is terminated with exit(1), so the trailing
 // `return -1` is never reached.
 // NOTE: element values are printed with "%d", which is only well-formed for
 // int-sized integer T.
 template <typename T>
 __inline int compare_vectors(T *data1, T *data2, unsigned int size) {
  printf("Comparing vectors: \n");
  bool match = true;
  for (unsigned int i = 0; i < size; i++)
    if (data1[i] != data2[i]) {
      match = false;
      // The second value comes from data2 (it used to be labelled data1), and
      // the unsigned index is printed with %u.
      printf("Diff: data1[%u]=%d,  data2[%u]=%d.\n", i, data1[i], i, data2[i]);
    }
  if (match) {
    printf("PASS! vectors are matching!\n");
    return 0;
  } else {
    printf("FAIL! vectors are NOT matching!\n");
    exit(1);
    return -1;
  }
 }
 #endif
--- a/examples/huffman/cpuencode.cpp
+++ b/examples/huffman/cpuencode.cpp
@ -1,116 +0,0 @@
 #include "stdafx.h"
 #include "cpuencode.h"
 #include "print_helpers.h"
 using namespace std;
 #if 1
 // The max. codeword length for each byte symbol is 32-bits
 // CPU variable-length encoder.
 // For each of the num_elements 32-bit words in indata, the four bytes are
 // taken most-significant first; each byte `symbol` is replaced by the low
 // codewordlens[symbol] bits of codewords[symbol].  Codewords are packed
 // MSB-first into consecutive 32-bit words of outdata.  *outsize receives the
 // stream length in bytes, rounded up to a whole byte.
 //
 // Codeword lengths up to 32 bits are supported.  The previous mask expression
 // `(1 << numbits) - 1` is undefined for numbits == 32 and dropped such
 // codewords on x86.
 //
 // Every time a word is completed the following word is zeroed, so outdata
 // needs room for one word beyond the last full one.
 extern "C" void cpu_vlc_encode(unsigned int *indata, unsigned int num_elements,
                                unsigned int *outdata, unsigned int *outsize,
                                unsigned int *codewords,
                                unsigned int *codewordlens) {
  unsigned int *bitstreamPt = outdata; /* word currently being filled */
  *bitstreamPt = 0x00000000U;
  unsigned int startbit = 0; /* bits already used in *bitstreamPt */
  unsigned int totalBytes = 0;
  for (unsigned int k = 0; k < num_elements; k++) {
    unsigned int val32 = indata[k];
    for (unsigned int i = 0; i < 4; i++) {
      unsigned char symbol = (unsigned char)(val32 >> (8 * (3 - i)));
      unsigned int cw32 = codewords[symbol];
      unsigned int numbits = codewordlens[symbol];
      while (numbits > 0) {
        unsigned int room = 32 - startbit; /* free bits in the current word */
        unsigned int writebits = (numbits < room) ? numbits : room;
        // Keep only the numbits low bits still to be written; a full-width
        // mask is special-cased because shifting by 32 is undefined.
        unsigned int lowmask =
            (numbits >= 32) ? 0xFFFFFFFFU : ((1U << numbits) - 1U);
        unsigned int pending = cw32 & lowmask;
        unsigned int mask32;
        if (numbits == writebits)
          mask32 = pending << (room - numbits); // fits: align below startbit
        else
          mask32 = pending >> (numbits - writebits); // emit the top part only
        *bitstreamPt = (*bitstreamPt) | mask32;
        numbits = numbits - writebits;
        startbit = (startbit + writebits) % 32;
        if (startbit == 0) {
          // Current word is full: advance and clear the next one.
          bitstreamPt++;
          *bitstreamPt = 0x00000000;
          totalBytes += 4;
        }
      }
    }
  }
  totalBytes += (startbit / 8) +
                ((startbit % 8 == 0) ? 0 : 1); // return aligned to 8-bits
  *outsize = totalBytes;
 }
 //////////////////////////////////////////////////////////////////////
 /// ALTERNATIVE CODER
 /// ASSUMPTION: The max. length of 4 combined codewords can be 2x original data,
 /// i.e. g 64 bits
 ///////////////////////////////////////////////////////////////////////
 #else
 // Alternative CPU variable-length encoder (the #else variant).
 // The four codewords of one input word are first concatenated into a 64-bit
 // value (first byte in the most significant position) and then packed
 // MSB-first into outdata.  *outsize receives the stream length in bytes,
 // rounded up to a whole byte.
 //
 // Assumes the four codeword lengths of one input word sum to at most 64.
 // The leftover-bit mask is now built from a 64-bit one; the previous
 // `1 << (numbits - writebits)` used an int and was undefined once 32 or more
 // bits remained.
 //
 // Every time a word is completed the following word is zeroed, so outdata
 // needs room for one word beyond the last full one.
 extern "C" void cpu_vlc_encode(unsigned int *indata, unsigned int num_elements,
                                unsigned int *outdata, unsigned int *outsize,
                                unsigned int *codewords,
                                unsigned int *codewordlens) {
  unsigned int *bitstreamPt = outdata; /* word currently being filled */
  // assume memset is done.
  *bitstreamPt = 0x00000000U;
  unsigned int startbit = 0; /* bits already used in *bitstreamPt */
  unsigned int totalBytes = 0;
  for (unsigned int k = 0; k < num_elements; k++) {
    unsigned long long cw64 = 0;
    unsigned int val32 = indata[k];
    unsigned int numbits = 0;
    for (unsigned int i = 0; i < 4; i++) {
      unsigned char symbol = (unsigned char)(val32 >> (8 * (3 - i)));
      cw64 = (cw64 << codewordlens[symbol]) | codewords[symbol];
      numbits += codewordlens[symbol];
    }
    while (numbits > 0) {
      unsigned int room = 32 - startbit; /* free bits in the current word */
      unsigned int writebits = (numbits < room) ? numbits : room;
      unsigned int mask32;
      if (numbits == writebits) {
        // Everything left fits: align it just below the used bits.
        mask32 = (unsigned int)cw64 << (room - numbits);
      } else {
        // Emit the top writebits bits now and keep the rest for later.
        unsigned int remaining = numbits - writebits;
        mask32 = (unsigned int)(cw64 >> remaining);
        cw64 = cw64 & ((1ULL << remaining) - 1ULL);
      }
      *bitstreamPt = (*bitstreamPt) | mask32;
      numbits = numbits - writebits;
      startbit = (startbit + writebits) % 32;
      if (startbit == 0) {
        // Current word is full: advance and clear the next one.
        bitstreamPt++;
        *bitstreamPt = 0x00000000;
        totalBytes += 4;
      }
    }
  }
  totalBytes += (startbit / 8) +
                ((startbit % 8 == 0) ? 0 : 1); // return aligned to 8-bits
  *outsize = totalBytes;
 }
 #endif
--- a/examples/huffman/cpuencode.h
+++ b/examples/huffman/cpuencode.h
@ -1,8 +0,0 @@
 #ifndef _CE_H_
 #define _CE_H_
 /* CPU variable-length encoder (defined in cpuencode.cpp).
  *   indata        input words
  *   num_elements  number of 32-bit words in indata
  *   outdata       receives the packed bitstream as 32-bit words
  *   outsize       receives the bitstream length in bytes
  *   codewords     codeword for each byte value (indexed by the byte)
  *   codewordlens  length in bits of each codeword (indexed by the byte) */
 extern "C" void cpu_vlc_encode(unsigned int *indata, unsigned int num_elements,
                                unsigned int *outdata, unsigned int *outsize,
                                unsigned int *codewords,
                                unsigned int *codewordlens);
 #endif
--- a/examples/huffman/cuda_helpers.h
+++ b/examples/huffman/cuda_helpers.h
@ -1,20 +0,0 @@
 #ifndef __CUDA_HELPERS__
 #define __CUDA_HELPERS__
 #include <stdio.h>
 /************************************************************************/
 /* Init CUDA                                                            */
 /************************************************************************/
 #if __DEVICE_EMULATION__
 /* Variant built when __DEVICE_EMULATION__ is set: performs no device setup
  * and always reports success. */
 bool InitCUDA(void) {
  return true;
 }
 #else
 /* Select CUDA device 0.
  * Returns true and prints "CUDA initialized." when cudaSetDevice() succeeds;
  * returns false after printing a failure message otherwise.  The status was
  * previously ignored, so success was reported unconditionally. */
 bool InitCUDA(void) {
  /* cudaSetDevice() returns a cudaError_t; the value 0 is cudaSuccess. */
  if (cudaSetDevice(0) != 0) {
    printf("CUDA initialization failed.\n");
    return false;
  }
  printf("CUDA initialized.\n");
  return true;
 }
 #endif
 #endif
--- a/examples/huffman/cutil.h
+++ b/examples/huffman/cutil.h
@ -1,931 +0,0 @@
 /*
 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */
 /*
 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */
 /* CUda UTility Library */
 #ifndef _CUTIL_H_
 #define _CUTIL_H_
 #ifdef _WIN32
 #pragma warning(disable : 4996) // disable deprecated warning
 #endif
 #include <stdio.h>
 #include <stdlib.h>
 // helper typedefs for building DLL
 #ifdef _WIN32
 #ifdef BUILD_DLL
 #define DLL_MAPPING __declspec(dllexport)
 #else
 #define DLL_MAPPING __declspec(dllimport)
 #endif
 #else
 #define DLL_MAPPING
 #endif
 #ifdef _WIN32
 #define CUTIL_API __stdcall
 #else
 #define CUTIL_API
 #endif
 ////////////////////////////////////////////////////////////////////////////
 //! CUT bool type
 ////////////////////////////////////////////////////////////////////////////
 //! Return type of the cutReadFile*, cutWriteFile* and cutLoad* routines
 //! declared in this header; CUTFalse is 0 and CUTTrue is 1.
 enum CUTBoolean { CUTFalse = 0, CUTTrue = 1 };
 ////////////////////////////////////////////////////////////////////////////
 //! Deallocate memory allocated within Cutil
 //! @param  pointer to memory
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 void CUTIL_API cutFree(void *ptr);
 ////////////////////////////////////////////////////////////////////////////
 //! Helper for bank conflict checking (should only be used with the
 //! CUT_BANK_CHECKER macro)
 //! @param tidx  thread id in x dimension of block
 //! @param tidy  thread id in y dimension of block
 //! @param tidz  thread id in z dimension of block
 //! @param bdimx block size in x dimension
 //! @param bdimy block size in y dimension
 //! @param bdimz block size in z dimension
 //! @param file  name of the source file where the access takes place
 //! @param line  line in the source file where the access takes place
 //! @param aname name of the array which is accessed
 //! @param index index into the array
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 void CUTIL_API cutCheckBankAccess(unsigned int tidx, unsigned int tidy,
                                  unsigned int tidz, unsigned int bdimx,
                                  unsigned int bdimy, unsigned int bdimz,
                                  const char *file, const int line,
                                  const char *aname, const int index);
 ////////////////////////////////////////////////////////////////////////////
 //! Find the path for a filename
 //! @return the path if succeeded, otherwise 0
 //! @param filename        name of the file
 //! @param executablePath  optional absolute path of the executable
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 char *CUTIL_API cutFindFilePath(const char *filename,
                                const char *executablePath);
 ////////////////////////////////////////////////////////////////////////////
 //! Read file \filename containing single precision floating point data
 //! @return CUTTrue if reading the file succeeded, otherwise false
 //! @param filename name of the source file
 //! @param data  uninitialized pointer, returned initialized and pointing to
 //!        the data read
 //! @param len  number of data elements in data, -1 on error
 //! @note If a NULL pointer is passed to this function and it is
 //!       initialized within Cutil then cutFree() has to be used to
 //!       deallocate the memory
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutReadFilef(const char *filename, float **data,
                                  unsigned int *len, bool verbose = false);
 ////////////////////////////////////////////////////////////////////////////
 //! Read file \filename containing double precision floating point data
 //! @return CUTTrue if reading the file succeeded, otherwise false
 //! @param filename name of the source file
 //! @param data  uninitialized pointer, returned initialized and pointing to
 //!        the data read
 //! @param len  number of data elements in data, -1 on error
 //! @note If a NULL pointer is passed to this function and it is
 //!       initialized within Cutil then cutFree() has to be used to
 //!       deallocate the memory
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutReadFiled(const char *filename, double **data,
                                  unsigned int *len, bool verbose = false);
 ////////////////////////////////////////////////////////////////////////////
 //! Read file \filename containing integer data
 //! @return CUTTrue if reading the file succeeded, otherwise false
 //! @param filename name of the source file
 //! @param data  uninitialized pointer, returned initialized and pointing to
 //!        the data read
 //! @param len  number of data elements in data, -1 on error
 //! @note If a NULL pointer is passed to this function and it is
 //!       initialized within Cutil then cutFree() has to be used to
 //!       deallocate the memory
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutReadFilei(const char *filename, int **data,
                                  unsigned int *len, bool verbose = false);
 ////////////////////////////////////////////////////////////////////////////
 //! Read file \filename containing unsigned integer data
 //! @return CUTTrue if reading the file succeeded, otherwise false
 //! @param filename name of the source file
 //! @param data  uninitialized pointer, returned initialized and pointing to
 //!        the data read
 //! @param len  number of data elements in data, -1 on error
 //! @note If a NULL pointer is passed to this function and it is
 //!       initialized within Cutil then cutFree() has to be used to
 //!       deallocate the memory
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutReadFileui(const char *filename, unsigned int **data,
                                   unsigned int *len, bool verbose = false);
 ////////////////////////////////////////////////////////////////////////////
 //! Read file \filename containing char / byte data
 //! @return CUTTrue if reading the file succeeded, otherwise false
 //! @param filename name of the source file
 //! @param data  uninitialized pointer, returned initialized and pointing to
 //!        the data read
 //! @param len  number of data elements in data, -1 on error
 //! @note If a NULL pointer is passed to this function and it is
 //!       initialized within Cutil then cutFree() has to be used to
 //!       deallocate the memory
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutReadFileb(const char *filename, char **data,
                                  unsigned int *len, bool verbose = false);
 ////////////////////////////////////////////////////////////////////////////
 //! Read file \filename containing unsigned char / byte data
 //! @return CUTTrue if reading the file succeeded, otherwise false
 //! @param filename name of the source file
 //! @param data  uninitialized pointer, returned initialized and pointing to
 //!        the data read
 //! @param len  number of data elements in data, -1 on error
 //! @note If a NULL pointer is passed to this function and it is
 //!       initialized within Cutil then cutFree() has to be used to
 //!       deallocate the memory
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutReadFileub(const char *filename, unsigned char **data,
                                   unsigned int *len, bool verbose = false);
 ////////////////////////////////////////////////////////////////////////////
 //! Write a data file \filename containing single precision floating point
 //! data
 //! @return CUTTrue if writing the file succeeded, otherwise false
 //! @param filename name of the file to write
 //! @param data  pointer to data to write
 //! @param len  number of data elements in data, -1 on error
 //! @param epsilon  epsilon for comparison
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutWriteFilef(const char *filename, const float *data,
                                   unsigned int len, const float epsilon,
                                   bool verbose = false);
 ////////////////////////////////////////////////////////////////////////////
 //! Write a data file \filename containing double precision floating point
 //! data
 //! @return CUTTrue if writing the file succeeded, otherwise false
 //! @param filename name of the file to write
 //! @param data  pointer to data to write
 //! @param len  number of data elements in data, -1 on error
 //! @param epsilon  epsilon for comparison
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutWriteFiled(const char *filename, const float *data,
                                   unsigned int len, const double epsilon,
                                   bool verbose = false);
 ////////////////////////////////////////////////////////////////////////////
 //! Write a data file \filename containing integer data
 //! @return CUTTrue if writing the file succeeded, otherwise false
 //! @param filename name of the file to write
 //! @param data  pointer to data to write
 //! @param len  number of data elements in data, -1 on error
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutWriteFilei(const char *filename, const int *data,
                                   unsigned int len, bool verbose = false);
 ////////////////////////////////////////////////////////////////////////////
 //! Write a data file \filename containing unsigned integer data
 //! @return CUTTrue if writing the file succeeded, otherwise false
 //! @param filename name of the file to write
 //! @param data  pointer to data to write
 //! @param len  number of data elements in data, -1 on error
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutWriteFileui(const char *filename,
                                    const unsigned int *data, unsigned int len,
                                    bool verbose = false);
 ////////////////////////////////////////////////////////////////////////////
 //! Write a data file \filename containing char / byte data
 //! @return CUTTrue if writing the file succeeded, otherwise false
 //! @param filename name of the file to write
 //! @param data  pointer to data to write
 //! @param len  number of data elements in data, -1 on error
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutWriteFileb(const char *filename, const char *data,
                                   unsigned int len, bool verbose = false);
 ////////////////////////////////////////////////////////////////////////////
 //! Write a data file \filename containing unsigned char / byte data
 //! @return CUTTrue if writing the file succeeded, otherwise false
 //! @param filename name of the file to write
 //! @param data  pointer to data to write
 //! @param len  number of data elements in data, -1 on error
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutWriteFileub(const char *filename,
                                    const unsigned char *data, unsigned int len,
                                    bool verbose = false);
 ////////////////////////////////////////////////////////////////////////////
 //! Load PGM image file (with unsigned char as data element type)
 //! @return CUTTrue if reading the file succeeded, otherwise false
 //! @param file  name of the image file
 //! @param data  handle to the data read
 //! @param w     width of the image
 //! @param h     height of the image
 //! @note If a NULL pointer is passed to this function and it is
 //!       initialized within Cutil then cutFree() has to be used to
 //!       deallocate the memory
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutLoadPGMub(const char *file, unsigned char **data,
                                  unsigned int *w, unsigned int *h);
 ////////////////////////////////////////////////////////////////////////////
 //! Load PPM image file (with unsigned char as data element type)
 //! @return CUTTrue if reading the file succeeded, otherwise false
 //! @param file  name of the image file
 //! @param data  handle to the data read
 //! @param w     width of the image
 //! @param h     height of the image
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutLoadPPMub(const char *file, unsigned char **data,
                                  unsigned int *w, unsigned int *h);
 ////////////////////////////////////////////////////////////////////////////
 //! Load PPM image file (with unsigned char as data element type), padding
 //! 4th component
 //! @return CUTTrue if reading the file succeeded, otherwise false
 //! @param file  name of the image file
 //! @param data  handle to the data read
 //! @param w     width of the image
 //! @param h     height of the image
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutLoadPPM4ub(const char *file, unsigned char **data,
                                   unsigned int *w, unsigned int *h);
 ////////////////////////////////////////////////////////////////////////////
 //! Load PGM image file (with unsigned int as data element type)
 //! @return CUTTrue if reading the file succeeded, otherwise false
 //! @param file  name of the image file
 //! @param data  handle to the data read
 //! @param w     width of the image
 //! @param h     height of the image
 //! @note If a NULL pointer is passed to this function and it is
 //!       initialized within Cutil then cutFree() has to be used to
 //!       deallocate the memory
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutLoadPGMi(const char *file, unsigned int **data,
                                 unsigned int *w, unsigned int *h);
 ////////////////////////////////////////////////////////////////////////////
 //! Load PGM image file (with unsigned short as data element type)
 //! @return CUTTrue if reading the file succeeded, otherwise false
 //! @param file  name of the image file
 //! @param data  handle to the data read
 //! @param w     width of the image
 //! @param h     height of the image
 //! @note If a NULL pointer is passed to this function and it is
 //!       initialized within Cutil then cutFree() has to be used to
 //!       deallocate the memory
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutLoadPGMs(const char *file, unsigned short **data,
                                 unsigned int *w, unsigned int *h);
 ////////////////////////////////////////////////////////////////////////////
 //! Load PGM image file (with float as data element type)
 //! @param file  name of the image file
 //! @param data  handle to the data read
 //! @param w     width of the image
 //! @param h     height of the image
 //! @note If a NULL pointer is passed to this function and it is
 //!       initialized within Cutil then cutFree() has to be used to
 //!       deallocate the memory
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutLoadPGMf(const char *file, float **data,
                                 unsigned int *w, unsigned int *h);
 ////////////////////////////////////////////////////////////////////////////
 //! Save PGM image file (with unsigned char as data element type)
 //! @param file  name of the image file
 //! @param data  pointer to the data to write
 //! @param w     width of the image
 //! @param h     height of the image
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutSavePGMub(const char *file, unsigned char *data,
                                  unsigned int w, unsigned int h);
 ////////////////////////////////////////////////////////////////////////////
 //! Save PPM image file (with unsigned char as data element type)
 //! @param file  name of the image file
 //! @param data  pointer to the data to write
 //! @param w     width of the image
 //! @param h     height of the image
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutSavePPMub(const char *file, unsigned char *data,
                                  unsigned int w, unsigned int h);
 ////////////////////////////////////////////////////////////////////////////
 //! Save PPM image file (with unsigned char as data element type, padded to
 //! 4 bytes)
 //! @param file  name of the image file
 //! @param data  pointer to the data to write
 //! @param w     width of the image
 //! @param h     height of the image
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutSavePPM4ub(const char *file, unsigned char *data,
                                   unsigned int w, unsigned int h);
 ////////////////////////////////////////////////////////////////////////////
 //! Save PGM image file (with unsigned int as data element type)
 //! @param file  name of the image file
 //! @param data  pointer to the data to write
 //! @param w     width of the image
 //! @param h     height of the image
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutSavePGMi(const char *file, unsigned int *data,
                                 unsigned int w, unsigned int h);
 ////////////////////////////////////////////////////////////////////////////
 //! Save PGM image file (with unsigned short as data element type)
 //! @param file  name of the image file
 //! @param data  pointer to the data to write
 //! @param w     width of the image
 //! @param h     height of the image
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutSavePGMs(const char *file, unsigned short *data,
                                 unsigned int w, unsigned int h);
 ////////////////////////////////////////////////////////////////////////////
 //! Save PGM image file (with float as data element type)
 //! @param file  name of the image file
 //! @param data  pointer to the data to write
 //! @param w     width of the image
 //! @param h     height of the image
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutSavePGMf(const char *file, float *data, unsigned int w,
                                 unsigned int h);
 ////////////////////////////////////////////////////////////////////////////
 // Command line arguments: General notes
 // * All command line arguments begin with '--' followed by the token;
 //   token and value are separated by '='; example --samples=50
 // * Arrays have the form --model=[one.obj,two.obj,three.obj]
 //   (without whitespaces)
 ////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////
 //! Check if command line argument \a flag_name is given
 //! @return CUTTrue if command line argument \a flag_name has been given,
 //!         otherwise 0
 //! @param argc  argc as passed to main()
 //! @param argv  argv as passed to main()
 //! @param flag_name  name of command line flag
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutCheckCmdLineFlag(const int argc, const char **argv,
                                         const char *flag_name);
 ////////////////////////////////////////////////////////////////////////////
 //! Get the value of a command line argument of type int
 //! @return CUTTrue if command line argument \a arg_name has been given and
 //!         is of the requested type, otherwise CUTFalse
 //! @param argc  argc as passed to main()
 //! @param argv  argv as passed to main()
 //! @param arg_name  name of the command line argument
 //! @param val  value of the command line argument
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutGetCmdLineArgumenti(const int argc, const char **argv,
                                            const char *arg_name, int *val);
 ////////////////////////////////////////////////////////////////////////////
 //! Get the value of a command line argument of type float
 //! @return CUTTrue if command line argument \a arg_name has been given and
 //!         is of the requested type, otherwise CUTFalse
 //! @param argc  argc as passed to main()
 //! @param argv  argv as passed to main()
 //! @param arg_name  name of the command line argument
 //! @param val  value of the command line argument
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutGetCmdLineArgumentf(const int argc, const char **argv,
                                            const char *arg_name, float *val);
 ////////////////////////////////////////////////////////////////////////////
 //! Get the value of a command line argument of type string
 //! @return CUTTrue if command line argument \a arg_name has been given and
 //!         is of the requested type, otherwise CUTFalse
 //! @param argc  argc as passed to main()
 //! @param argv  argv as passed to main()
 //! @param arg_name  name of the command line argument
 //! @param val  value of the command line argument
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutGetCmdLineArgumentstr(const int argc, const char **argv,
                                              const char *arg_name, char **val);
 ////////////////////////////////////////////////////////////////////////////
 //! Get the value of a command line argument list whose elements are strings
 //! @return CUTTrue if command line argument \a arg_name has been given and
 //!         is of the requested type, otherwise CUTFalse
 //! @param argc  argc as passed to main()
 //! @param argv  argv as passed to main()
 //! @param arg_name  name of the command line argument
 //! @param val  command line argument list
 //! @param len  length of the list / number of elements
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutGetCmdLineArgumentListstr(const int argc,
                                                  const char **argv,
                                                  const char *arg_name,
                                                  char **val,
                                                  unsigned int *len);
 ////////////////////////////////////////////////////////////////////////////
 //! Extended assert
 //! @return CUTTrue if the condition \a val holds, otherwise CUTFalse
 //! @param val  condition to test
 //! @param file  __FILE__ macro
 //! @param line  __LINE__ macro
 //! @note This function should be used via the CONDITION(val) macro
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutCheckCondition(int val, const char *file,
                                       const int line);
 ////////////////////////////////////////////////////////////////////////////
 //! Compare two float arrays
 //! @return  CUTTrue if \a reference and \a data are identical,
 //!          otherwise CUTFalse
 //! @param reference  handle to the reference data / gold image
 //! @param data       handle to the computed data
 //! @param len        number of elements in reference and data
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutComparef(const float *reference, const float *data,
                                 const unsigned int len);
 ////////////////////////////////////////////////////////////////////////////
 //! Compare two integer arrays
 //! @return  CUTTrue if \a reference and \a data are identical,
 //!          otherwise CUTFalse
 //! @param reference  handle to the reference data / gold image
 //! @param data       handle to the computed data
 //! @param len        number of elements in reference and data
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutComparei(const int *reference, const int *data,
                                 const unsigned int len);
 ////////////////////////////////////////////////////////////////////////////////
 //! Compare two unsigned integer arrays, with epsilon and threshold
 //! @return  CUTTrue if \a reference and \a data are identical,
 //!          otherwise CUTFalse
 //! @param reference  handle to the reference data / gold image
 //! @param data       handle to the computed data
 //! @param len        number of elements in reference and data
 //! @param epsilon    epsilon to use for the comparison
 //! @param threshold  tolerance % # of comparison errors (0.15f = 15%)
 ////////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutCompareuit(const unsigned int *reference,
                                   const unsigned int *data,
                                   const unsigned int len, const float epsilon,
                                   const float threshold);
 ////////////////////////////////////////////////////////////////////////////
 //! Compare two unsigned char arrays
 //! @return  CUTTrue if \a reference and \a data are identical,
 //!          otherwise CUTFalse
 //! @param reference  handle to the reference data / gold image
 //! @param data       handle to the computed data
 //! @param len        number of elements in reference and data
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutCompareub(const unsigned char *reference,
                                  const unsigned char *data,
                                  const unsigned int len);
 ////////////////////////////////////////////////////////////////////////////////
 //! Compare two unsigned char arrays with a tolerance for # of byte errors
 //! @return  CUTTrue if \a reference and \a data are identical,
 //!          otherwise CUTFalse
 //! @param reference  handle to the reference data / gold image
 //! @param data       handle to the computed data
 //! @param len        number of elements in reference and data
 //! @param epsilon    epsilon to use for the comparison
 //! @param threshold  tolerance % # of comparison errors (0.15f = 15%)
 ////////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutCompareubt(const unsigned char *reference,
                                   const unsigned char *data,
                                   const unsigned int len, const float epsilon,
                                   const float threshold);
 ////////////////////////////////////////////////////////////////////////////////
 //! Compare two unsigned char arrays with an epsilon tolerance for equality
 //! @return  CUTTrue if \a reference and \a data are identical,
 //!          otherwise CUTFalse
 //! @param reference  handle to the reference data / gold image
 //! @param data       handle to the computed data
 //! @param len        number of elements in reference and data
 //! @param epsilon    epsilon to use for the comparison
 ////////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutCompareube(const unsigned char *reference,
                                   const unsigned char *data,
                                   const unsigned int len, const float epsilon);
 ////////////////////////////////////////////////////////////////////////////
 //! Compare two float arrays with an epsilon tolerance for equality
 //! @return  CUTTrue if \a reference and \a data are identical,
 //!          otherwise CUTFalse
 //! @param reference  handle to the reference data / gold image
 //! @param data       handle to the computed data
 //! @param len        number of elements in reference and data
 //! @param epsilon    epsilon to use for the comparison
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutComparefe(const float *reference, const float *data,
                                  const unsigned int len, const float epsilon);
 ////////////////////////////////////////////////////////////////////////////////
 //! Compare two float arrays with an epsilon tolerance for equality and a
 //!     threshold for # pixel errors
 //! @return  CUTTrue if \a reference and \a data are identical,
 //!          otherwise CUTFalse
 //! @param reference  handle to the reference data / gold image
 //! @param data       handle to the computed data
 //! @param len        number of elements in reference and data
 //! @param epsilon    epsilon to use for the comparison
 //! @param threshold  tolerance % # of comparison errors (0.15f = 15%)
 ////////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutComparefet(const float *reference, const float *data,
                                   const unsigned int len, const float epsilon,
                                   const float threshold);
 ////////////////////////////////////////////////////////////////////////////
 //! Compare two float arrays using L2-norm with an epsilon tolerance for
 //! equality
 //! @return  CUTTrue if \a reference and \a data are identical,
 //!          otherwise CUTFalse
 //! @param reference  handle to the reference data / gold image
 //! @param data       handle to the computed data
 //! @param len        number of elements in reference and data
 //! @param epsilon    epsilon to use for the comparison
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutCompareL2fe(const float *reference, const float *data,
                                    const unsigned int len,
                                    const float epsilon);
 ////////////////////////////////////////////////////////////////////////////////
 //! Compare two PPM image files with an epsilon tolerance for equality
 //! @return  CUTTrue if \a reference and \a data are identical,
 //!          otherwise CUTFalse
 //! @param src_file   filename for the image to be compared
 //! @param ref_file   filename for the reference data / gold image
 //! @param epsilon    epsilon to use for the comparison
 //! @param threshold  threshold of pixels that can still mismatch to pass (i.e.
 //!                   0.15f = 15% must pass)
 //! @param verboseErrors  output details of image mismatch to std::err
 ////////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutComparePPM(const char *src_file, const char *ref_file,
                                   const float epsilon, const float threshold,
                                   bool verboseErrors = false);
 ////////////////////////////////////////////////////////////////////////////
 //! Timer functionality
 ////////////////////////////////////////////////////////////////////////////
 //! Create a new timer
 //! @return CUTTrue if a timer has been created, otherwise false
 //! @param  name of the new timer, 0 if the creation failed
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutCreateTimer(unsigned int *name);
 ////////////////////////////////////////////////////////////////////////////
 //! Delete a timer
 //! @return CUTTrue if a timer has been deleted, otherwise false
 //! @param  name of the timer to delete
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutDeleteTimer(unsigned int name);
 ////////////////////////////////////////////////////////////////////////////
 //! Start the timer with name \a name
 //! @param name  name of the timer to start
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutStartTimer(const unsigned int name);
 ////////////////////////////////////////////////////////////////////////////
 //! Stop the timer with name \a name. Does not reset.
 //! @param name  name of the timer to stop
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutStopTimer(const unsigned int name);
 ////////////////////////////////////////////////////////////////////////////
 //! Resets the timer's counter.
 //! @param name  name of the timer to reset.
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 CUTBoolean CUTIL_API cutResetTimer(const unsigned int name);
 ////////////////////////////////////////////////////////////////////////////
 //! Returns total execution time in milliseconds for the timer over all
 //! runs since the last reset or timer creation.
 //! @param name  name of the timer to return the time of
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 float CUTIL_API cutGetTimerValue(const unsigned int name);
 ////////////////////////////////////////////////////////////////////////////
 //! Return the average time in milliseconds for timer execution as the
 //! total time for the timer divided by the number of completed (stopped)
 //! runs the timer has made.
 //! Excludes the current running time if the timer is currently running.
 //! @param name  name of the timer to return the time of
 ////////////////////////////////////////////////////////////////////////////
 DLL_MAPPING
 float CUTIL_API cutGetAverageTimerValue(const unsigned int name);
 ////////////////////////////////////////////////////////////////////////////
 //! Macros
 //! Device-synchronization wrapper selected by the CUDA runtime version:
 //! expands to cudaDeviceSynchronize() when CUDART_VERSION >= 4000 and to
 //! cudaThreadSynchronize() otherwise.
 //! @note The expansion already ends with ';', so a call site that adds its
 //!       own ';' produces an extra empty statement.
 #if CUDART_VERSION >= 4000
 #define CUT_DEVICE_SYNCHRONIZE() cudaDeviceSynchronize();
 #else
 #define CUT_DEVICE_SYNCHRONIZE() cudaThreadSynchronize();
 #endif
 //! Device-reset wrapper selected by the CUDA runtime version: expands to
 //! cudaDeviceReset() when CUDART_VERSION >= 4000 and to cudaThreadExit()
 //! otherwise.
 //! @note The expansion already ends with ';'.
 #if CUDART_VERSION >= 4000
 #define CUT_DEVICE_RESET() cudaDeviceReset();
 #else
 #define CUT_DEVICE_RESET() cudaThreadExit();
 #endif
 // This is for the CUTIL bank checker
 // CUT_BANK_CHECKER(array, index) always evaluates to array[index].
 // Only when both _DEBUG is defined and __DEVICE_EMULATION__ is non-zero does
 // it first call cutCheckBankAccess() with the thread index, block dimensions,
 // source location, the array's name as a string and the index; the comma
 // operator then yields array[index] as the value of the expression.
 #ifdef _DEBUG
 #if __DEVICE_EMULATION__
 // Interface for bank conflict checker
 #define CUT_BANK_CHECKER(array, index)                                         \
  (cutCheckBankAccess(threadIdx.x, threadIdx.y, threadIdx.z, blockDim.x,       \
                      blockDim.y, blockDim.z, __FILE__, __LINE__, #array,      \
                      index),                                                  \
   array[index])
 #else
 // Debug build without device emulation: plain element access.
 #define CUT_BANK_CHECKER(array, index) array[index]
 #endif
 #else
 // Non-debug build: plain element access.
 #define CUT_BANK_CHECKER(array, index) array[index]
 #endif
 //! Evaluate \a call once, storing its result in a block-local CUresult.
 //! If the result is not CUDA_SUCCESS, print the error code (in hex) together
 //! with __FILE__ and __LINE__ to stderr and terminate with
 //! exit(EXIT_FAILURE).
 //! @param call  expression yielding a CUresult
 #define CU_SAFE_CALL_NO_SYNC(call)                                             \
  {                                                                            \
    CUresult err = call;                                                       \
    if (CUDA_SUCCESS != err) {                                                 \
      fprintf(stderr, "Cuda driver error %x in file '%s' in line %i.\n", err,  \
              __FILE__, __LINE__);                                             \
      exit(EXIT_FAILURE);                                                      \
    }                                                                          \
  }
 //! Same check as CU_SAFE_CALL_NO_SYNC(call); the expansion simply appends a
 //! ';' after it and performs no additional synchronization.
 #define CU_SAFE_CALL(call) CU_SAFE_CALL_NO_SYNC(call);
 //! Call cuCtxSynchronize() and check its CUresult. On anything other than
 //! CUDA_SUCCESS, print the error code (in hex) with __FILE__ and __LINE__ to
 //! stderr and terminate with exit(EXIT_FAILURE).
 #define CU_SAFE_CTX_SYNC()                                                     \
  {                                                                            \
    CUresult err = cuCtxSynchronize();                                         \
    if (CUDA_SUCCESS != err) {                                                 \
      fprintf(stderr, "Cuda driver error %x in file '%s' in line %i.\n", err,  \
              __FILE__, __LINE__);                                             \
      exit(EXIT_FAILURE);                                                      \
    }                                                                          \
  }
 //! Evaluate \a call once, storing its result in a block-local cudaError.
 //! If the result is not cudaSuccess, print __FILE__, __LINE__ and the text
 //! from cudaGetErrorString() to stderr and terminate with
 //! exit(EXIT_FAILURE).
 //! @param call  expression yielding a cudaError
 #define CUDA_SAFE_CALL_NO_SYNC(call)                                           \
  {                                                                            \
    cudaError err = call;                                                      \
    if (cudaSuccess != err) {                                                  \
      fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", __FILE__,  \
              __LINE__, cudaGetErrorString(err));                              \
      exit(EXIT_FAILURE);                                                      \
    }                                                                          \
  }
 //! Same check as CUDA_SAFE_CALL_NO_SYNC(call); the expansion simply appends
 //! a ';' after it and performs no additional synchronization.
 #define CUDA_SAFE_CALL(call) CUDA_SAFE_CALL_NO_SYNC(call);
 //! Run CUT_DEVICE_SYNCHRONIZE() and check the returned cudaError. On
 //! failure, print __FILE__, __LINE__ and the cudaGetErrorString() text to
 //! stderr.
 //! @note This macro only reports the failure; it does not call exit(), so
 //!       execution continues after the message.
 #define CUDA_SAFE_THREAD_SYNC()                                                \
  {                                                                            \
    cudaError err = CUT_DEVICE_SYNCHRONIZE();                                  \
    if (cudaSuccess != err) {                                                  \
      fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", __FILE__,  \
              __LINE__, cudaGetErrorString(err));                              \
    }                                                                          \
  }
 //! Evaluate \a call once, storing its result in a block-local cufftResult.
 //! If the result is not CUFFT_SUCCESS, print __FILE__ and __LINE__ to stderr
 //! and terminate with exit(EXIT_FAILURE).
 //! @param call  expression yielding a cufftResult
 //! @note The message does not include the numeric cufftResult value.
 #define CUFFT_SAFE_CALL(call)                                                  \
  {                                                                            \
    cufftResult err = call;                                                    \
    if (CUFFT_SUCCESS != err) {                                                \
      fprintf(stderr, "CUFFT error in file '%s' in line %i.\n", __FILE__,      \
              __LINE__);                                                       \
      exit(EXIT_FAILURE);                                                      \
    }                                                                          \
  }
 //! Evaluate \a call and terminate the program if it does not yield CUTTrue.
 //! On failure, print __FILE__ and __LINE__ to stderr and call
 //! exit(EXIT_FAILURE).
 //! @param call  expression yielding a CUTBoolean; it is parenthesized in the
 //!              comparison so that arguments containing lower-precedence
 //!              operators (e.g. ?: or ||) are compared as a whole
 //! @note The expansion is a bare if-statement, so an `else` written directly
 //!       after the macro would attach to this if.
 #define CUT_SAFE_CALL(call)                                                    \
  if (CUTTrue != (call)) {                                                     \
    fprintf(stderr, "Cut error in file '%s' in line %i.\n", __FILE__,          \
            __LINE__);                                                         \
    exit(EXIT_FAILURE);                                                        \
  }
 //! Check for CUDA error
 //! Reads cudaGetLastError(); if it is not cudaSuccess, prints
 //! \a errorMessage, __FILE__, __LINE__ and the cudaGetErrorString() text to
 //! stderr and terminates with exit(EXIT_FAILURE).
 //! When _DEBUG is defined the macro additionally runs
 //! CUT_DEVICE_SYNCHRONIZE() and applies the same check to its result; the
 //! non-debug variant performs only the cudaGetLastError() check.
 //! @param errorMessage  C string inserted into the diagnostic via "%s"
 #ifdef _DEBUG
 #define CUT_CHECK_ERROR(errorMessage)                                          \
  {                                                                            \
    cudaError_t err = cudaGetLastError();                                      \
    if (cudaSuccess != err) {                                                  \
      fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",        \
              errorMessage, __FILE__, __LINE__, cudaGetErrorString(err));      \
      exit(EXIT_FAILURE);                                                      \
    }                                                                          \
    err = CUT_DEVICE_SYNCHRONIZE();                                            \
    if (cudaSuccess != err) {                                                  \
      fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",        \
              errorMessage, __FILE__, __LINE__, cudaGetErrorString(err));      \
      exit(EXIT_FAILURE);                                                      \
    }                                                                          \
  }
 #else
 #define CUT_CHECK_ERROR(errorMessage)                                          \
  {                                                                            \
    cudaError_t err = cudaGetLastError();                                      \
    if (cudaSuccess != err) {                                                  \
      fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",        \
              errorMessage, __FILE__, __LINE__, cudaGetErrorString(err));      \
      exit(EXIT_FAILURE);                                                      \
    }                                                                          \
  }
 #endif
 //! Check for malloc error
 //! Evaluates \a mallocCall once; if the result is false/NULL, prints
 //! __FILE__ and __LINE__ to stderr and terminates with exit(EXIT_FAILURE).
 //! @param mallocCall  expression whose value is tested, typically an
 //!                    assignment such as `p = malloc(n)`
 //! @note Wrapped in do { ... } while (0) so the macro expands to exactly one
 //!       statement and is safe inside un-braced if/else. The call site
 //!       supplies the terminating ';'.
 #define CUT_SAFE_MALLOC(mallocCall)                                            \
  do {                                                                         \
    if (!(mallocCall)) {                                                       \
      fprintf(stderr, "Host malloc failure in file '%s' in line %i\n",         \
              __FILE__, __LINE__);                                             \
      exit(EXIT_FAILURE);                                                      \
    }                                                                          \
  } while (0)
 //! Check if condition is true (flexible assert)
 //! Passes \a val with __FILE__ and __LINE__ to cutCheckCondition() and
 //! terminates with exit(EXIT_FAILURE) when that call returns CUTFalse.
 //! @param val  condition to test
 //! @note The expansion is a bare if-statement, so an `else` written directly
 //!       after the macro would attach to this if.
 #define CUT_CONDITION(val)                                                     \
  if (CUTFalse == cutCheckCondition(val, __FILE__, __LINE__)) {                \
    exit(EXIT_FAILURE);                                                        \
  }
 //! Select the CUDA device to use (runtime API).
 //! Under __DEVICE_EMULATION__ the macro expands to nothing. Otherwise it:
 //!  - queries cudaGetDeviceCount() and exits with EXIT_FAILURE if it is 0;
 //!  - reads the "device" command line argument via cutGetCmdLineArgumenti()
 //!    (the index stays 0 if that call does not write to it) and clamps it
 //!    to the range [0, deviceCount - 1];
 //!  - fetches the device properties and, unless the "quiet" flag is given,
 //!    prints the chosen index and device name to stderr;
 //!  - calls cudaSetDevice() on the chosen index.
 //! @param ARGC  argc as passed to main()
 //! @param ARGV  argv as passed to main(); cast to (const char **) internally
 #if __DEVICE_EMULATION__
 #define CUT_DEVICE_INIT(ARGC, ARGV)
 #else
 #define CUT_DEVICE_INIT(ARGC, ARGV)                                            \
  {                                                                            \
    int deviceCount;                                                           \
    CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceCount(&deviceCount));                  \
    if (deviceCount == 0) {                                                    \
      fprintf(stderr, "cutil error: no devices supporting CUDA.\n");           \
      exit(EXIT_FAILURE);                                                      \
    }                                                                          \
    int dev = 0;                                                               \
    cutGetCmdLineArgumenti(ARGC, (const char **)ARGV, "device", &dev);         \
    if (dev < 0)                                                               \
      dev = 0;                                                                 \
    if (dev > deviceCount - 1)                                                 \
      dev = deviceCount - 1;                                                   \
    cudaDeviceProp deviceProp;                                                 \
    CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceProperties(&deviceProp, dev));         \
    if (cutCheckCmdLineFlag(ARGC, (const char **)ARGV, "quiet") == CUTFalse)   \
      fprintf(stderr, "Using device %d: %s\n", dev, deviceProp.name);          \
    CUDA_SAFE_CALL(cudaSetDevice(dev));                                        \
  }
 //! Check for CUDA context lost
 //! Reads cudaGetLastError(), then runs CUT_DEVICE_SYNCHRONIZE() and checks
 //! its result as well. If either value is not cudaSuccess, prints
 //! \a errorMessage, __FILE__, __LINE__ and the cudaGetErrorString() text to
 //! stderr and terminates with exit(EXIT_FAILURE).
 //! @param errorMessage  C string inserted into the diagnostic via "%s"
 //! @note Defined only in the non-emulation branch of __DEVICE_EMULATION__.
 #define CUDA_CHECK_CTX_LOST(errorMessage)                                      \
  {                                                                            \
    cudaError_t err = cudaGetLastError();                                      \
    if (cudaSuccess != err) {                                                  \
      fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",        \
              errorMessage, __FILE__, __LINE__, cudaGetErrorString(err));      \
      exit(EXIT_FAILURE);                                                      \
    }                                                                          \
    err = CUT_DEVICE_SYNCHRONIZE();                                            \
    if (cudaSuccess != err) {                                                  \
      fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",        \
              errorMessage, __FILE__, __LINE__, cudaGetErrorString(err));      \
      exit(EXIT_FAILURE);                                                      \
    }                                                                          \
  }
 //! Check for CUDA context lost
 //! Reads cudaGetLastError(), then runs CUT_DEVICE_SYNCHRONIZE() and checks
 //! its result as well. If either value is not cudaSuccess, prints
 //! \a errorMessage, __FILE__, __LINE__ and the cudaGetErrorString() text to
 //! stderr and terminates with exit(EXIT_FAILURE).
 //! @param errorMessage  C string inserted into the diagnostic via "%s"
 //! @note The first check previously compared the cudaError_t against the
 //!       driver-API constant CUDA_ERROR_INVALID_CONTEXT using '!=', which
 //!       made the macro report an error and exit even when the last error
 //!       was cudaSuccess. It now compares against cudaSuccess.
 //! @note Defined only in the non-emulation branch of __DEVICE_EMULATION__.
 #define CU_CHECK_CTX_LOST(errorMessage)                                        \
  {                                                                            \
    cudaError_t err = cudaGetLastError();                                      \
    if (cudaSuccess != err) {                                                  \
      fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",        \
              errorMessage, __FILE__, __LINE__, cudaGetErrorString(err));      \
      exit(EXIT_FAILURE);                                                      \
    }                                                                          \
    err = CUT_DEVICE_SYNCHRONIZE();                                            \
    if (cudaSuccess != err) {                                                  \
      fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",        \
              errorMessage, __FILE__, __LINE__, cudaGetErrorString(err));      \
      exit(EXIT_FAILURE);                                                      \
    }                                                                          \
  }
 #endif
 //! Select the CUDA device to use (driver API) and store it in \a cuDevice.
 //! Steps performed:
 //!  - sets \a cuDevice to 0 and calls cuInit(0);
 //!  - only if cuInit() returned CUDA_SUCCESS, queries cuDeviceGetCount();
 //!    otherwise the count stays 0;
 //!  - exits with EXIT_FAILURE when the count is 0 (this also covers a failed
 //!    cuInit(), which is reported with the same "no devices" message);
 //!  - reads the "device" command line argument via cutGetCmdLineArgumenti()
 //!    and clamps it to the range [0, deviceCount - 1];
 //!  - obtains the device handle with cuDeviceGet() and, unless the "quiet"
 //!    flag is given, prints the index and device name to stderr.
 //! @param cuDevice  lvalue that receives the device handle
 //! @param ARGC      argc as passed to main()
 //! @param ARGV      argv as passed to main(); cast to (const char **)
 //! @note The return value of cuDeviceGetName() is not checked.
 #define CUT_DEVICE_INIT_DRV(cuDevice, ARGC, ARGV)                              \
  {                                                                            \
    cuDevice = 0;                                                              \
    int deviceCount = 0;                                                       \
    CUresult err = cuInit(0);                                                  \
    if (CUDA_SUCCESS == err)                                                   \
      CU_SAFE_CALL_NO_SYNC(cuDeviceGetCount(&deviceCount));                    \
    if (deviceCount == 0) {                                                    \
      fprintf(stderr, "cutil error: no devices supporting CUDA\n");            \
      exit(EXIT_FAILURE);                                                      \
    }                                                                          \
    int dev = 0;                                                               \
    cutGetCmdLineArgumenti(ARGC, (const char **)ARGV, "device", &dev);         \
    if (dev < 0)                                                               \
      dev = 0;                                                                 \
    if (dev > deviceCount - 1)                                                 \
      dev = deviceCount - 1;                                                   \
    CU_SAFE_CALL_NO_SYNC(cuDeviceGet(&cuDevice, dev));                         \
    char name[100];                                                            \
    cuDeviceGetName(name, 100, cuDevice);                                      \
    if (cutCheckCmdLineFlag(ARGC, (const char **)ARGV, "quiet") == CUTFalse)   \
      fprintf(stderr, "Using device %d: %s\n", dev, name);                     \
  }
 //! Terminate the program with EXIT_SUCCESS, optionally waiting for ENTER.
 //! Unless the "noprompt" command line flag is given, prints a prompt,
 //! flushes stdout and stderr, and blocks in getchar(). The
 //! exit(EXIT_SUCCESS) call sits after the if-block, so it runs in both
 //! cases.
 //! @param argc  argc as passed to main()
 //! @param argv  argv as passed to main(); cast to (const char **) internally
 //! @note The expansion is two statements (an if-block followed by exit), so
 //!       placing it as the body of an un-braced if/loop would guard only the
 //!       prompt, not the exit.
 #define CUT_EXIT(argc, argv)                                                   \
  if (!cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) {           \
    printf("\nPress ENTER to exit...\n");                                      \
    fflush(stdout);                                                            \
    fflush(stderr);                                                            \
    getchar();                                                                 \
  }                                                                            \
  exit(EXIT_SUCCESS);
 #endif // #ifndef _CUTIL_H_
--- a/examples/huffman/hist.cu
+++ b/examples/huffman/hist.cu
@ -1,104 +0,0 @@
 /*
 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and *
 * proprietary rights in and to this software and related documentation. Any
 * use, reproduction, disclosure, or distribution of this software and related
 * documentation without an express license agreement from NVIDIA Corporation is
 * strictly prohibited.
 *
 * Please refer to the applicable NVIDIA end user license agreement (EULA)
 * associated with this source code for terms and conditions that govern
 * your use of this NVIDIA software.
 *
 */
 #include <iostream>
 #include <stdio.h>
 // CHECK(ans): forwards the value of `ans` to gpuAssert() together with the
 // __FILE__ and __LINE__ of the call site.
 // NOTE(review): expands to a brace block, so `CHECK(x);` used as the unbraced
 // body of an if that has an else leaves a stray ';' before the else.
 #define CHECK(ans)                                                             \
  { gpuAssert((ans), __FILE__, __LINE__); }
 // Reports a failed CUDA call. Does nothing when `code` is cudaSuccess;
 // otherwise prints the error string with the given file/line to stderr and,
 // if `abort` is set, terminates the process using `code` as the exit status.
 inline void gpuAssert(cudaError_t code, const char *file, int line,
                       bool abort = true) {
  if (code == cudaSuccess)
    return;
  const char *description = cudaGetErrorString(code);
  fprintf(stderr, "GPUassert: %s %s %d\n", description, file, line);
  if (!abort)
    return;
  exit(code);
 }
 using namespace std;
 #define SIZE (100 * 1024 * 1024)
 // Accumulates a 256-bin histogram of the bytes in buffer[0..size) into histo.
 //
 // Each block builds a private histogram in shared memory and then adds it to
 // the global `histo` array with atomics, so `histo` must be zeroed by the
 // caller before the first launch and may be accumulated into across launches.
 // Works for any 1-D block size: bins are initialised and flushed with a
 // block-stride loop instead of assuming blockDim.x == 256.
 __global__ void histo_kernel(unsigned char *buffer, long size,
                              unsigned int *histo) {
  __shared__ unsigned int temp[256];
  // Clear every bin, whatever the block size is.
  for (unsigned int bin = threadIdx.x; bin < 256; bin += blockDim.x)
    temp[bin] = 0;
  __syncthreads();
  // 64-bit-capable indices: `size` is a long, so an int index could overflow.
  long i = threadIdx.x + (long)blockIdx.x * blockDim.x;
  long offset = (long)blockDim.x * gridDim.x;
  while (i < size) {
    atomicAdd(&temp[buffer[i]], 1);
    i += offset;
  }
  // All threads of the block must finish counting before the flush.
  __syncthreads();
  for (unsigned int bin = threadIdx.x; bin < 256; bin += blockDim.x)
    atomicAdd(&(histo[bin]), temp[bin]);
 }
 // Reads up to memSize bytes of `file` into `source` and computes a 256-bin
 // histogram of those bytes on the GPU.
 //
 //   file    - path of the input file (the process exits if it cannot be
 //             opened)
 //   freq    - host array of at least 256 counters that receives the result
 //   memSize - number of bytes to read and histogram
 //   source  - host buffer of at least memSize bytes that receives the file
 //
 // A short read is reported on stderr but processing continues with whatever
 // `source` holds. Any failing CUDA call aborts through CHECK. Returns 0.
 int runHisto(char *file, unsigned int *freq, unsigned int memSize,
              unsigned int *source) {
  FILE *f = fopen(file, "rb");
  if (!f) {
    perror(file);
    exit(1);
  }
  fseek(f, 0, SEEK_SET);
  size_t result = fread(source, 1, memSize, f);
  if (result != memSize)
    fputs("Cannot read input file", stderr);
  fclose(f);
  unsigned char *buffer = (unsigned char *)source;
  int blocks = 2;
  // The input is streamed through the device in chunks of about 1/32 of its
  // size. Inputs shorter than 32 bytes are sent as one chunk so the chunk
  // size is never zero while there is data left to process.
  size_t partSize = memSize / 32;
  if (partSize == 0)
    partSize = memSize;
  unsigned char *dev_buffer = NULL;
  unsigned int *dev_histo = NULL;
  CHECK(cudaMalloc((void **)&dev_histo, 256 * sizeof(int)));
  CHECK(cudaMemset(dev_histo, 0, 256 * sizeof(int)));
  if (partSize > 0)
    CHECK(cudaMalloc((void **)&dev_buffer, partSize));
  // All offsets are in BYTES: `buffer` is an unsigned char pointer, so it
  // must not be advanced by word counts. The last chunk may be shorter.
  for (size_t offset = 0; offset < memSize; offset += partSize) {
    size_t bytes = memSize - offset;
    if (bytes > partSize)
      bytes = partSize;
    CHECK(cudaMemcpy(dev_buffer, buffer + offset, bytes,
                     cudaMemcpyHostToDevice));
    // kernel launch - 2x the number of mps gave best timing
    histo_kernel<<<blocks * 2, 256>>>(dev_buffer, (long)bytes, dev_histo);
    // The chunk buffer is reused, so wait for the kernel before overwriting.
    CHECK(cudaDeviceSynchronize());
  }
  CHECK(cudaMemcpy(freq, dev_histo, 256 * sizeof(int), cudaMemcpyDeviceToHost));
  cudaFree(dev_histo);
  cudaFree(dev_buffer);
  return 0;
 }
--- a/examples/huffman/huffTree.h
+++ b/examples/huffman/huffTree.h
@ -1,90 +0,0 @@
 #include "stdio.h"
 #include <algorithm>
 #include <climits> // for CHAR_BIT
 #include <iostream>
 #include <iterator>
 #include <map>
 #include <math.h>
 #include <queue>
 using namespace std;
 const int UniqueSymbols = 1 << CHAR_BIT;
 // Writes the lowest `numbits` bits of `val` to stdout as '0'/'1' characters,
 // most significant of those bits first. Prints nothing when numbits <= 0.
 void printBits(unsigned int val, int numbits) {
  int bit = numbits;
  while (bit-- > 0)
    putchar(((val >> bit) & 1) ? '1' : '0');
 }
 typedef vector<bool> HuffCode;
 typedef map<unsigned char, HuffCode> HuffCodeMap;
 // Polymorphic base of the Huffman tree nodes. Holds the node weight `f`;
 // the constructor is protected, so only derived node types can be created.
 class INode {
 public:
  const int f; // weight of this node (set once at construction)
  virtual ~INode() {}
 protected:
  INode(int weight) : f(weight) {}
 };
 // Inner tree node with two children. Its weight is the sum of the children's
 // weights, and it owns both children: they are deleted with the node.
 class InternalNode : public INode {
 public:
  INode *const left;
  INode *const right;
  InternalNode(INode *firstChild, INode *secondChild)
      : INode(firstChild->f + secondChild->f), left(firstChild),
        right(secondChild) {}
  ~InternalNode() {
    // Left subtree is released before the right one.
    delete left;
    delete right;
  }
 };
 // Leaf of the Huffman tree: one symbol `c` together with its weight.
 class LeafNode : public INode {
 public:
  const char c; // the symbol stored in this leaf
  LeafNode(int weight, char symbol) : INode(weight), c(symbol) {}
 };
 // Ordering functor for the priority queue of tree nodes: reports true when
 // lhs is heavier than rhs, which makes std::priority_queue surface the
 // lightest node first.
 struct NodeCmp {
  bool operator()(const INode *lhs, const INode *rhs) const {
    const bool lhsIsHeavier = rhs->f < lhs->f;
    return lhsIsHeavier;
  }
 };
 // Builds a Huffman tree from a table of symbol frequencies.
 //
 // A leaf is created for every symbol with a non-zero frequency; the two
 // lightest trees are then merged repeatedly until one tree remains.
 // Returns the root, which the caller owns and must delete. Returns NULL
 // when every frequency is zero (previously this called top() on an empty
 // queue, which is undefined behaviour).
 INode *BuildTree(unsigned int (&frequencies)[UniqueSymbols]) {
  std::priority_queue<INode *, std::vector<INode *>, NodeCmp> trees;
  for (int i = 0; i < UniqueSymbols; ++i) {
    if (frequencies[i] != 0)
      trees.push(new LeafNode(frequencies[i], (char)i));
  }
  // No symbols at all: there is no tree to return.
  if (trees.empty())
    return NULL;
  while (trees.size() > 1) {
    // The lightest tree becomes the left child, the next one the right child.
    INode *childR = trees.top();
    trees.pop();
    INode *childL = trees.top();
    trees.pop();
    INode *parent = new InternalNode(childR, childL);
    trees.push(parent);
  }
  return trees.top();
 }
 void GenerateCodes(const INode *node, const HuffCode &prefix,
                   HuffCodeMap &outCodes) {
  if (const LeafNode *lf = dynamic_cast<const LeafNode *>(node)) {
    outCodes[lf->c] = prefix;
  } else if (const InternalNode *in =
                 dynamic_cast<const InternalNode *>(node)) {
    HuffCode leftPrefix = prefix;
    leftPrefix.push_back(false);
    GenerateCodes(in->left, leftPrefix, outCodes);
    HuffCode rightPrefix = prefix;
    rightPrefix.push_back(true);
    GenerateCodes(in->right, rightPrefix, outCodes);
  }
 }
--- a/examples/huffman/load_data.h
+++ b/examples/huffman/load_data.h
@ -1,65 +0,0 @@
 #ifndef _LOADTESTDATA_H_
 #define _LOADTESTDATA_H_
 //#include "testdatagen.h"
 #include "hist.cu"
 #include "huffTree.h"
 // Derives the problem size for a test run.
 //
 // With file_name == NULL the size comes from the launch shape:
 //   num_elements = num_blocks * num_block_threads
 //   mem_size     = num_elements * symbol_type_size
 // Otherwise mem_size is the size of the file in bytes, num_elements is
 // mem_size / symbol_type_size, and num_blocks is overwritten with
 // num_elements / num_block_threads (rounded down).
 //
 // Exits the process with status 1 if the file cannot be opened or its size
 // cannot be determined.
 inline void initParams(char *file_name, uint num_block_threads,
                        uint &num_blocks, uint &num_elements, uint &mem_size,
                        uint symbol_type_size) {
  if (file_name == NULL) {
    num_elements = num_blocks * num_block_threads;
    mem_size = num_elements * symbol_type_size;
    return;
  }
  FILE *f = fopen(file_name, "rb");
  if (!f) {
    perror(file_name);
    exit(1);
  }
  // ftell() returns -1 on failure; without this check that value would be
  // converted into an enormous unsigned mem_size.
  long file_size = -1;
  if (fseek(f, 0, SEEK_END) == 0)
    file_size = ftell(f);
  if (file_size < 0) {
    perror(file_name);
    fclose(f);
    exit(1);
  }
  fclose(f);
  mem_size = (uint)file_size;
  num_elements = mem_size / symbol_type_size;
  // todo add check if we need 1 more block!
  num_blocks = num_elements / num_block_threads;
 }
 inline void loadData(char *file_name, uint *sourceData, uint *codewords,
                     uint *codewordlens, uint num_elements, uint mem_size,
                     double &H) {
  if (file_name == NULL) {
    printf("No input file\n");
    exit(-1);
  } else {
    unsigned int freqs[UniqueSymbols] = {0};
    runHisto(file_name, freqs, mem_size, sourceData);
    INode *root = BuildTree(freqs);
    HuffCodeMap codes;
    GenerateCodes(root, HuffCode(), codes);
    delete root;
    for (HuffCodeMap::const_iterator it = codes.begin(); it != codes.end();
         ++it) {
      unsigned int count = distance(it->second.begin(), it->second.end());
      for (int i = 0; i < count; i++)
        if (it->second[i])
          codewords[(unsigned int)(it->first)] +=
              (uint)pow(2.0f, (int)count - i - 1);
      codewordlens[(unsigned int)(it->first)] = count;
    }
    H = 0.0;
    for (unsigned int i = 0; i < 256; i++)
      if (freqs[i] > 0) {
        double p = (double)freqs[i] / (double)mem_size;
        H += p * log(p) / log(2.0);
      }
    H = -H;
    printf("\n%s, %u bytes, entropy %f\n\n", file_name, mem_size, H);
  }
 }
 #endif
--- a/examples/huffman/main_test_cu-cuda-nvptx64-nvidia-cuda-sm_61.ll
+++ b/examples/huffman/main_test_cu-cuda-nvptx64-nvidia-cuda-sm_61.ll
--- a/examples/huffman/main_test_cu-host-x86_64-unknown-linux-gnu.ll
+++ b/examples/huffman/main_test_cu-host-x86_64-unknown-linux-gnu.ll
--- a/examples/huffman/main_test_cu.cu
+++ b/examples/huffman/main_test_cu.cu
@ -1,225 +0,0 @@
 /*
 * PAVLE - Parallel Variable-Length Encoder for CUDA. Main file.
 *
 * Copyright (C) 2009 Ana Balevic <ana.balevic@gmail.com>
 * All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the MIT License. Read the full licence:
 * http://www.opensource.org/licenses/mit-license.php
 *
 * If you find this program useful, please contact me and reference PAVLE home
 * page in your work.
 *
 */
 #include "comparison_helpers.h"
 #include "cuda_helpers.h"
 #include "load_data.h"
 #include "print_helpers.h"
 #include "stats_logger.h"
 #include "stdafx.h"
 #include <cuda_runtime.h>
 #include <sys/time.h>
 //#include "vlc_kernel_gm32.cu"
 //#include "vlc_kernel_sm32.cu"
 #include "vlc_kernel_sm64huff.cu"
 //#include "vlc_kernel_dpt.cu"
 //#include "vlc_kernel_dptt.cu"
 //#include "scan_kernel.cu"
 #include "cpuencode.h"
 #include "pack_kernels.cu"
 #include "scan.cu"
 // Returns the current wall-clock time from gettimeofday() in microseconds.
 // The seconds are widened to long long before scaling so the multiplication
 // cannot overflow on platforms where time_t is 32 bits wide.
 long long get_time() {
  struct timeval tv;
  gettimeofday(&tv, NULL);
  return ((long long)tv.tv_sec * 1000000LL) + tv.tv_usec;
 }
 void runVLCTest(char *file_name, uint num_block_threads, uint num_blocks = 1);
 extern "C" void cpu_vlc_encode(unsigned int *indata, unsigned int num_elements,
                               unsigned int *outdata, unsigned int *outsize,
                               unsigned int *codewords,
                               unsigned int *codewordlens);
 // Entry point. After InitCUDA() succeeds, runs the VLC test once for every
 // command-line argument (each is passed as a file name) using 256 threads
 // per block. With no arguments it runs once with a NULL file name and 1024
 // blocks. Always returns 0, including when InitCUDA() fails.
 int main(int argc, char *argv[]) {
  if (!InitCUDA())
    return 0;
  const unsigned int num_block_threads = 256;
  if (argc <= 1) {
    runVLCTest(NULL, num_block_threads, 1024);
    return 0;
  }
  for (int arg = 1; arg < argc; ++arg)
    runVLCTest(argv[arg], num_block_threads);
  return 0;
 }
 // Runs one variable-length-encoding test.
 //
 //   file_name         - input file (NULL makes loadData() exit)
 //   num_block_threads - threads per block for the encode kernel
 //   num_blocks        - grid size; overwritten by initParams() when a file
 //                       name is given
 //
 // Steps: size the problem, allocate and zero the host buffers, load the data
 // and code tables, copy them to the device, time the CPU reference encoder,
 // launch the SM64HUFF kernel NT times and, when TESTING is defined, scan and
 // pack the per-block output and compare it with the CPU result. All host and
 // device buffers are released before returning.
 void runVLCTest(char *file_name, uint num_block_threads, uint num_blocks) {
  printf("CUDA! Starting VLC Tests!\n");
  unsigned int
      num_elements;      // uint num_elements = num_blocks * num_block_threads;
  unsigned int mem_size; // uint mem_size = num_elements * sizeof(int);
  unsigned int symbol_type_size = sizeof(int);
  //////// LOAD DATA ///////////////
  double H; // entropy
  initParams(file_name, num_block_threads, num_blocks, num_elements, mem_size,
             symbol_type_size);
  printf("Parameters: num_elements: %d, num_blocks: %d, num_block_threads: "
         "%d\n----------------------------\n",
         num_elements, num_blocks, num_block_threads);
  ////////LOAD DATA ///////////////
  uint *sourceData = (uint *)malloc(mem_size);
  uint *destData = (uint *)malloc(mem_size);
  uint *crefData = (uint *)malloc(mem_size);
  uint *codewords = (uint *)malloc(NUM_SYMBOLS * symbol_type_size);
  uint *codewordlens = (uint *)malloc(NUM_SYMBOLS * symbol_type_size);
  uint *cw32 = (uint *)malloc(mem_size);
  uint *cw32len = (uint *)malloc(mem_size);
  uint *cw32idx = (uint *)malloc(mem_size);
  uint *cindex2 = (uint *)malloc(num_blocks * sizeof(int));
  // loadData() accumulates into codewords, so the tables must start at zero.
  memset(sourceData, 0, mem_size);
  memset(destData, 0, mem_size);
  memset(crefData, 0, mem_size);
  memset(cw32, 0, mem_size);
  memset(cw32len, 0, mem_size);
  memset(cw32idx, 0, mem_size);
  memset(codewords, 0, NUM_SYMBOLS * symbol_type_size);
  memset(codewordlens, 0, NUM_SYMBOLS * symbol_type_size);
  memset(cindex2, 0, num_blocks * sizeof(int));
  //////// LOAD DATA ///////////////
  loadData(file_name, sourceData, codewords, codewordlens, num_elements,
           mem_size, H);
  //////// LOAD DATA ///////////////
  unsigned int *d_sourceData, *d_destData, *d_destDataPacked;
  unsigned int *d_codewords, *d_codewordlens;
  unsigned int *d_cw32, *d_cw32len, *d_cw32idx, *d_cindex, *d_cindex2;
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_sourceData, mem_size));
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_destData, mem_size));
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_destDataPacked, mem_size));
  CUDA_SAFE_CALL(
      cudaMalloc((void **)&d_codewords, NUM_SYMBOLS * symbol_type_size));
  CUDA_SAFE_CALL(
      cudaMalloc((void **)&d_codewordlens, NUM_SYMBOLS * symbol_type_size));
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_cw32, mem_size));
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_cw32len, mem_size));
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_cw32idx, mem_size));
  CUDA_SAFE_CALL(
      cudaMalloc((void **)&d_cindex, num_blocks * sizeof(unsigned int)));
  CUDA_SAFE_CALL(
      cudaMalloc((void **)&d_cindex2, num_blocks * sizeof(unsigned int)));
  // printf("source data\n");
  // for (int i = 0; i < 200; i++) {
  //   printf("%d ", sourceData[i]);
  // }
  // printf("\n");
  // printf("codewords\n");
  // for (int i = 0; i < 200; i++) {
  //   printf("%d ", codewords[i]);
  // }
  // printf("\n");
  // printf("codeword lens\n");
  // for (int i = 0; i < 200; i++) {
  //   printf("%d ", codewordlens[i]);
  // }
  // printf("\n");
  // return;
  CUDA_SAFE_CALL(
      cudaMemcpy(d_sourceData, sourceData, mem_size, cudaMemcpyHostToDevice));
  CUDA_SAFE_CALL(cudaMemcpy(d_codewords, codewords,
                            NUM_SYMBOLS * symbol_type_size,
                            cudaMemcpyHostToDevice));
  CUDA_SAFE_CALL(cudaMemcpy(d_codewordlens, codewordlens,
                            NUM_SYMBOLS * symbol_type_size,
                            cudaMemcpyHostToDevice));
  CUDA_SAFE_CALL(
      cudaMemcpy(d_destData, destData, mem_size, cudaMemcpyHostToDevice));
  dim3 grid_size(num_blocks, 1, 1);
  dim3 block_size(num_block_threads, 1, 1);
  unsigned int sm_size;
  unsigned int NT = 10; // number of runs for each execution time
  //////////////////* CPU ENCODER *///////////////////////////////////
  unsigned int refbytesize;
  long long timer = get_time();
  cpu_vlc_encode((unsigned int *)sourceData, num_elements,
                 (unsigned int *)crefData, &refbytesize, codewords,
                 codewordlens);
  float msec = (float)((get_time() - timer) / 1000.0);
  printf("CPU Encoding time (CPU): %f (ms)\n", msec);
  printf("CPU Encoded to %d [B]\n", refbytesize);
  // Encoded size in whole 32-bit words, rounded up.
  unsigned int num_ints = refbytesize / 4 + ((refbytesize % 4 == 0) ? 0 : 1);
  //////////////////* END CPU *///////////////////////////////////
  //////////////////* SM64HUFF KERNEL *///////////////////////////////////
  grid_size.x = num_blocks;
  block_size.x = num_block_threads;
  // NOTE(review): sm_size is computed here but is not passed to the kernel
  // launch below.
  sm_size = block_size.x * sizeof(unsigned int);
 #ifdef CACHECWLUT
  sm_size = 2 * NUM_SYMBOLS * sizeof(int) + block_size.x * sizeof(unsigned int);
 #endif
  for (unsigned int i = 0; i < NT; i++) {
    vlc_encode_kernel_sm64huff<<<grid_size, block_size>>>(
        d_sourceData, d_codewords, d_codewordlens,
 #ifdef TESTING
        d_cw32, d_cw32len, d_cw32idx,
 #endif
        d_destData, d_cindex); // testedOK2
    // cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is the
    // replacement and is already used by runHisto().
    cudaDeviceSynchronize();
  }
  //   //////////////////* END KERNEL *///////////////////////////////////
 #ifdef TESTING
  unsigned int num_scan_elements = grid_size.x;
  preallocBlockSums(num_scan_elements);
  cudaMemset(d_destDataPacked, 0, mem_size);
  printf("Num_blocks to be passed to scan is %d.\n", num_scan_elements);
  prescanArray(d_cindex2, d_cindex, num_scan_elements);
  pack2<<<num_scan_elements / 32, 32>>>(
      (unsigned int *)d_destData, d_cindex, d_cindex2,
      (unsigned int *)d_destDataPacked, num_elements / num_scan_elements);
  cudaDeviceSynchronize();
  CUT_CHECK_ERROR("Pack2 Kernel execution failed\n");
  deallocBlockSums();
  // return;
  CUDA_SAFE_CALL(
      cudaMemcpy(destData, d_destDataPacked, mem_size, cudaMemcpyDeviceToHost));
  compare_vectors((unsigned int *)crefData, (unsigned int *)destData, num_ints);
 #endif
  free(sourceData);
  free(destData);
  free(codewords);
  free(codewordlens);
  free(cw32);
  free(cw32len);
  free(cw32idx); // was leaked: allocated above but never released
  free(crefData);
  CUDA_SAFE_CALL(cudaFree(d_sourceData));
  CUDA_SAFE_CALL(cudaFree(d_destData));
  CUDA_SAFE_CALL(cudaFree(d_destDataPacked));
  CUDA_SAFE_CALL(cudaFree(d_codewords));
  CUDA_SAFE_CALL(cudaFree(d_codewordlens));
  CUDA_SAFE_CALL(cudaFree(d_cw32));
  CUDA_SAFE_CALL(cudaFree(d_cw32len));
  CUDA_SAFE_CALL(cudaFree(d_cw32idx));
  CUDA_SAFE_CALL(cudaFree(d_cindex));
  CUDA_SAFE_CALL(cudaFree(d_cindex2));
  free(cindex2);
 }
--- a/examples/huffman/pabio_kernels_v2.cu
+++ b/examples/huffman/pabio_kernels_v2.cu
@ -1,62 +0,0 @@
 /*
 * Copyright Ana Balevic, 2008-2009. All rights reserved.
 */
 #ifndef _PABIO_KERNEL2_H_
 #define _PABIO_KERNEL2_H_
 #include "parameters.h"
 /* PARALLEL PUT BITS IMPLEMENTATION (CUDA1.1+ compatible)
  *  Writes `codeword` into a field of `numbits` bits of the destination word
  *  out[kc]; the field starts `startbit` bits below the most significant bit.
  *  Requires startbit + numbits <= 32, and the codeword is expected to fit in
  *  numbits bits (it is shifted and OR-ed in without masking).
  *  Implementation comments:
  *  1. Unless MEMSET0 is defined, a first atomic operation (atomicAnd with the
  *     inverted field mask) clears only the bits of the field; all other bits
  *     are left untouched.
  *  2. A second atomic operation (atomicOr) then sets the field to the value
  *     of the codeword, which only works if the field bits are already 0.
  *  TODOs: benchmark performance 1) gm atomics vs sm atomics; 2) memset at init time vs. atomicOr
  */
 __device__ void static put_bits_atomic2(unsigned int* out, unsigned int kc,
 								unsigned int startbit, unsigned int numbits,
 								unsigned int codeword) {
 	unsigned int cw32 = codeword;
 	unsigned int restbits = 32-startbit-numbits;	// free bits below the field
 	/* 1. Prepare the memory location */
 #ifndef MEMSET0 //Can remove this part if the contents of the memory are already set to all 0s
 	// Unsigned arithmetic, and a special case for a full 32-bit field:
 	// shifting a 32-bit value by 32 is undefined.
 	unsigned int mask = (numbits >= 32) ? 0xFFFFFFFFu : ((1u<<numbits)-1u);  // -> 0000...001111
 	mask<<=restbits;  // move the ones onto the field -> 0000...001111000
 	atomicAnd(&out[kc], ~mask);		// clear the field, keep every other bit
 #endif
 	/* 2. Write the codeword */
 	cw32 = cw32<<restbits;
 	atomicOr(&out[kc], cw32);
 }
 /* PARALLEL PUT BITS IMPLEMENTATION (CUDA1.1+ compatible)
  *  Same contract as put_bits_atomic2: writes `codeword` into a field of
  *  `numbits` bits of out[kc] that starts `startbit` bits below the most
  *  significant bit (requires startbit + numbits <= 32).
  *  Additionally checks whether the field covers the whole memory location
  *  and, if so, stores the word directly instead of using an atomic.
  *  Experience: no benefits, even a bit slower on CUDA.
  */
 __device__ void static put_bits_atomic2a(unsigned int* out, unsigned int kc,
 								unsigned int startbit, unsigned int numbits,
 								unsigned int codeword) {
 	unsigned int cw32 = codeword;
 	unsigned int restbits = 32-startbit-numbits;	// free bits below the field
 	/* 1. Prepare the memory location */
 #ifndef MEMSET0 //Can remove this part if the contents of the memory are already set to all 0s
 	// The full-word case (numbits == 32) is reachable here, so the mask must
 	// not be built with a shift by 32, which is undefined.
 	unsigned int mask = (numbits >= 32) ? 0xFFFFFFFFu : ((1u<<numbits)-1u);  // -> 0000...001111
 	mask<<=restbits;  // move the ones onto the field -> 0000...001111000
 	atomicAnd(&out[kc], ~mask);		// clear the field, keep every other bit
 #endif
 	/* 2. Write the codeword */
 	if (startbit == 0 && restbits == 0) {
 		// The field is the entire word: a plain store replaces it.
 		out[kc] = cw32;
 	} else {
 		cw32 = cw32<<restbits;
 		atomicOr(&out[kc], cw32);
 	}
 }
 #endif //ifndef _PABIO_KERNEL2_H_
--- a/examples/huffman/pack_kernels.cu
+++ b/examples/huffman/pack_kernels.cu
@ -1,43 +0,0 @@
 #ifndef _PACK_KERNELS_H_
 #define _PACK_KERNELS_H_
 #include "parameters.h"
 // Packs per-thread bit streams from srcData into one contiguous bit stream
 // in dstData.
 //
 // Thread `tid` (global 1-D index) reads its stream starting at word
 // tid * original_num_block_elements of srcData, takes the stream length in
 // bits from cindex[tid] and the destination bit position from cindex2[tid].
 // Destination words that may be shared with a neighbouring stream are
 // updated with atomicOr; words in between are written with plain stores.
 // There is no bounds check on tid: every launched thread indexes cindex and
 // cindex2. NOTE(review): because of the atomicOr updates dstData presumably
 // has to be zeroed before the launch - confirm against the caller.
 __global__ static void pack2(unsigned int *srcData, unsigned int *cindex,
                              unsigned int *cindex2, unsigned int *dstData,
                              unsigned int original_num_block_elements) {
  unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;
  // source index
  unsigned int offset = tid * original_num_block_elements; // DPB,
  unsigned int bitsize = cindex[tid];
  // destination index: word number and bit offset inside that word
  unsigned int pos = cindex2[tid], dword = pos / 32, bit = pos % 32;
  unsigned int i, dw, tmp;
  dw = srcData[offset]; // load the first dword from srcData[]
  tmp = dw >> bit;      // cut off those bits that do not fit into the initial
                        // location in destData[]
  atomicOr(&dstData[dword], tmp); // fill up this initial location
  // Carry the bits that were cut off into the next destination word. The
  // bit == 0 case is handled separately because a shift by 32 is undefined.
  tmp = (bit == 0) ? 0 : (dw << 32 - bit);
  for (i = 1; i < bitsize / 32;
       i++) { // from now on, we have exclusive access to destData[]
    dw = srcData[offset + i]; // load next dword from srcData[]
    tmp |= dw >> bit;         // fill up tmp
    dstData[dword + i] = tmp; // write complete dword to destData[]
    tmp = (bit == 0) ? 0 : (dw << 32 - bit);
  }
  // exclusive access to dstData[] ends here
  // the remaining block can, or rather should be further optimized
  // write the remaining bits in tmp, UNLESS bit is 0 and bitsize is divisible
  // by 32, in this case do nothing
  if (bit != 0 || bitsize % 32 != 0)
    atomicOr(&dstData[dword + i], tmp);
  // A partial last source word still has to be split over two destination
  // words.
  if (bitsize % 32 != 0) {
    dw = srcData[offset + i];
    atomicOr(&dstData[dword + i], dw >> bit);
    atomicOr(&dstData[dword + i + 1], (bit == 0) ? 0 : (dw << 32 - bit));
  }
 }
 #endif
--- a/examples/huffman/parameters.h
+++ b/examples/huffman/parameters.h
@ -1,27 +0,0 @@
 #ifndef _PARAMS_H_
 #define _PARAMS_H_
 // Short aliases used throughout the huffman example.
 typedef unsigned int uint;
 typedef unsigned char uint8;
 #define BENCH 0
 /* 0 - MEASURE TIME, NO TESTING
 ** 1 - TEST
 ** 2 - TEST & VERBOSE
 */
 // When defined, runVLCTest() passes the extra debug buffers to the encode
 // kernel and runs the scan/pack/compare verification path.
 #define TESTING
 #define DPT 4 // data (dwords) per thread
 // When defined, runVLCTest() adds 2 * NUM_SYMBOLS ints to its sm_size value.
 #define CACHECWLUT // MAX DPT = 8
 //#define CACHESRCDATA		// MAX DPT = 4
 #define SMATOMICS
 // When defined, put_bits_atomic2/2a skip the atomicAnd that clears the
 // destination bits, i.e. the output memory is assumed to be zeroed already.
 #define MEMSET0
 #define MAX_SM_BLOCK_SIZE_GPU 16384 // B
 // Size of the codeword and codeword-length tables.
 #define NUM_SYMBOLS 256 // fixed to 256.
 #endif
--- a/examples/huffman/print_helpers.h
+++ b/examples/huffman/print_helpers.h
@ -1,217 +0,0 @@
 #ifndef _PRINT_HELPERS_H_
 #define _PRINT_HELPERS_H_
 #include "parameters.h"
 #include <stdio.h>
 // Dumps num_ints words of `data` to the text file `filename`, one word per
 // line, as 32 '0'/'1' characters with the most significant bit first.
 // If the file cannot be opened the error is reported with perror() and
 // nothing is written.
 __inline void printdbg_data_bin(const char *filename, unsigned int *data,
                                 unsigned int num_ints) {
  FILE *dump = fopen((const char *)filename, "wt");
  if (dump == NULL) {
    perror(filename);
    return;
  }
  for (unsigned int i = 0; i < num_ints; i++) {
    unsigned int mask = 0x80000000;
    for (unsigned int j = 0; j < 32; j++) {
      if (data[i] & mask)
        fprintf(dump, "1"); // printf("1");
      else
        fprintf(dump, "0"); // printf("0");
      mask = mask >> 1;
    }
    fprintf(dump, "\n");
  }
  fclose(dump);
 }
 // Dumps num_ints words of `data` to the text file `filename` as lines of the
 // form "<index>: <value>". If the file cannot be opened the error is
 // reported with perror() and nothing is written.
 __inline void printdbg_data_int(const char *filename, unsigned int *data,
                                 unsigned int num_ints) {
  FILE *dump = fopen((const char *)filename, "wt");
  if (dump == NULL) {
    perror(filename);
    return;
  }
  for (unsigned int i = 0; i < num_ints; i++) {
    fprintf(dump, "%d: %d\n", i, data[i]);
  }
  fclose(dump);
 }
 // Writes one line per element to `gpudump`: the bit position cw32idx[i],
 // the word (position / 32) and start bit (position % 32) derived from it,
 // the codeword length cw32len[i], and then the lowest cw32len[i] bits of
 // cw32[i] as '0'/'1' characters, most significant first.
 // Lengths of 0 (no bits printed) and above 32 (the positions beyond the
 // 32-bit word print as '0') are handled without an out-of-range shift.
 __inline void printdbg_gpu_data_detailed(FILE *gpudump, unsigned int *cw32,
                                          unsigned int *cw32len,
                                          unsigned int *cw32idx,
                                          unsigned int num_elements) {
  for (unsigned int i = 0; i < num_elements; i++) {
    fprintf(gpudump, "bp: %d, kc: %d, startbit: %d, cwlen: %d, cw:\t\t",
            cw32idx[i], cw32idx[i] / 32, cw32idx[i] % 32, cw32len[i]);
    // print codeword: j counts the remaining bits, so bit j-1 is printed
    for (unsigned int j = cw32len[i]; j > 0; j--) {
      unsigned int bit = (j <= 32) ? ((cw32[i] >> (j - 1)) & 1u) : 0u;
      fputc(bit ? '1' : '0', gpudump);
    }
    fprintf(gpudump, "\n");
  }
 }
 __inline void printdbg_gpu_data_detailed2(const char *filename,
                                          unsigned int *cw32,
                                          unsigned int *cw32len,
                                          unsigned int *cw32idx,
                                          unsigned int num_elements) {
  FILE *gpudump = fopen((const char *)filename, "wt");
  for (unsigned int i = 0; i < num_elements; i++) {
    fprintf(gpudump, "bp: %d, kc: %d, startbit: %d, cwlen: %d, cw:\t\t",
            cw32idx[i], cw32idx[i] / 32, cw32idx[i] % 32, cw32len[i]);
    // print codeword:
    unsigned int mask = 0x80000000;
    mask = mask >> (32 - cw32len[i]);
    for (unsigned int j = 0; j < cw32len[i]; j++) {
      if (cw32[i] & mask)
        fprintf(gpudump, "1"); // printf("1");
      else
        fprintf(gpudump, "0"); // printf("0");
      mask = mask >> 1;
    }
    fprintf(gpudump, "\n");
  }
  fclose(gpudump);
 }
 /************************************************************************/
 /* BIT PRINTS                                                         */
 /************************************************************************/
 // Prints the 8 bits of `number` to stdout, most significant first, followed
 // by a single space.
 __inline void printBits(unsigned char number) {
  for (int bit = 7; bit >= 0; --bit)
    putchar(((number >> bit) & 1) ? '1' : '0');
  putchar(' ');
 }
 // Prints the 32 bits of `number` to stdout, most significant first, followed
 // by a newline.
 __inline void print32Bits(unsigned int number) {
  for (int bit = 31; bit >= 0; --bit)
    putchar(((number >> bit) & 1u) ? '1' : '0');
  putchar('\n');
 }
 // Prints a 32-character ruler to stdout: '|' at the 1-based column equal to
 // `marker` and '.' everywhere else, followed by a newline.
 __inline void print32BitsM(unsigned int marker) {
  for (unsigned int column = 1; column <= 32; ++column)
    putchar(column == marker ? '|' : '.');
  putchar('\n');
 }
 // Prints a banner and the element count, then one line per byte of `a`
 // holding its index, decimal value and 8-bit pattern, then a closing banner.
 __inline void print_array_char_as_bits(unsigned char *a, unsigned int len) {
  printf(
      " ========================= Printing vector =======================\n");
  printf("Total number of elements is %d\n", len);
  unsigned int idx = 0;
  while (idx < len) {
    const unsigned char value = a[idx];
    printf("a[%d]=%d \t", idx, value);
    printBits(value);
    printf("\n");
    ++idx;
  }
  printf("\n");
  printf(
      " ==================================================================\n");
 }
 // Prints a banner, then every word of `a` as a 32-bit pattern followed by an
 // empty line, then a closing banner.
 __inline void print_array_ints_as_bits(unsigned int *a, unsigned int len) {
  printf(
      " ========================= Printing vector =======================\n");
  const unsigned int *const end = a + len;
  for (const unsigned int *word = a; word != end; ++word) {
    print32Bits(*word);
    printf("\n");
  }
  printf("\n");
  printf(
      " ==================================================================\n");
 }
 // Prints a banner, then for every index the 32-bit patterns of a[i] and b[i]
 // on consecutive lines followed by an empty line, then a closing banner.
 __inline void print_compare_array_ints_as_bits(unsigned int *a, unsigned int *b,
                                                unsigned int len) {
  printf(
      " ========================= Printing vector =======================\n");
  unsigned int idx = 0;
  while (idx < len) {
    print32Bits(a[idx]);
    print32Bits(b[idx]);
    printf("\n");
    ++idx;
  }
  printf("\n");
  printf(
      " ==================================================================\n");
 }
 // Prints a banner, then all words of `a` on one line in "%#X" form separated
 // by tabs, then a closing banner.
 __inline void print_array_in_hex(unsigned int *a, unsigned int len) {
  printf(
      " ========================= Printing vector =======================\n");
  // printf("Total number of elements is %d\n", len);
  const unsigned int *const end = a + len;
  for (const unsigned int *word = a; word != end; ++word)
    printf("%#X\t", *word);
  printf("\n");
  printf(
      " ==================================================================\n");
 }
 /************************************************************************/
 /* ARRAY PRINTS                                                        */
 /***********************************************************************/
 // Prints a banner and the element count, then every element of `a` on one
 // line as "a[i]=value" separated by tabs, then a closing banner.
 // Elements are passed to printf with "%d", so T is expected to be an
 // int-compatible type.
 template <typename T> __inline void print_array(T *a, unsigned int len) {
  printf(
      " ========================= Printing vector =======================\n");
  printf("Total number of elements is %d\n", len);
  unsigned int idx = 0;
  while (idx < len) {
    printf("a[%d]=%d \t", idx, a[idx]);
    ++idx;
  }
  printf("\n");
  printf(
      " ==================================================================\n");
 }
 // Prints a banner and the pair count, then every (symbol, count) run-length
 // pair taken from the two parallel arrays, all on one line.
 // Both values are passed to printf with "%d", so ST and CT are expected to
 // be int-compatible types.
 template <typename ST, typename CT>
 __inline void print_rled_arrays(ST *rle_symbols, CT *rle_counts,
                                 unsigned int rle_len) {
  printf(" ========================= Printing RLE vector "
         "=======================\n");
  printf(" Total number of RL Pairs is %d\n", rle_len);
  unsigned int pair = 0;
  while (pair < rle_len) {
    ST symbol = rle_symbols[pair];
    CT count = rle_counts[pair];
    printf("(%d,%d) ,\t", symbol, count);
    ++pair;
  }
  printf("\n");
 }
 // Prints a banner and the pair count, then every packed run-length pair of
 // `rle` as "(symbol,count)": the symbol is the upper 16 bits of the word and
 // the count is the lower 16 bits.
 __inline void print_packed_rle_array(unsigned int *rle, unsigned int rle_len) {
  unsigned short current_symbol;
  unsigned short current_count;
  printf(" ========================= Printing RLE vector "
         "=======================\n");
  printf(" Total number of RL Pairs is %d\n", rle_len);
  for (unsigned int k = 0; k < rle_len; k++) {
    current_symbol = (unsigned short)(rle[k] >> 16); // get the higher half-word
    // Mask first, then narrow. The previous literal had five F digits and the
    // cast was applied before the mask; the printed value is unchanged.
    current_count = (unsigned short)(rle[k] & 0x0000FFFFu); // lower half-word
    printf("(%d,%d) ,\t", current_symbol, current_count);
  }
  printf("\n");
 }
 #endif // _PRINT_HELPERS_H_
--- a/examples/huffman/run.sh
+++ b/examples/huffman/run.sh
@ -1,20 +0,0 @@
 #!/bin/bash
 # Builds the huffman example from its pre-generated LLVM IR and runs it on
 # one input file. Stops at the first failing command.
 set -e
 # clang++ main_test_cu.cu  --cuda-path=/usr/local/cuda-10.1 --cuda-gpu-arch=sm_61 -L/usr/local/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v
 # Compile the CPU reference encoder to LLVM bitcode.
 clang -c -emit-llvm cpuencode.cpp
 # Assemble the textual device and host IR into bitcode (.bc).
 llvm-as main_test_cu-cuda-nvptx64-nvidia-cuda-sm_61.ll
 llvm-as main_test_cu-host-x86_64-unknown-linux-gnu.ll
 # Run the project's translators on the device and host bitcode; the second
 # argument is presumably the output file, since llc consumes kernel.bc and
 # host.bc next.
 ../../build/compilation/kernelTranslator main_test_cu-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
 ../../build/compilation/hostTranslator main_test_cu-host-x86_64-unknown-linux-gnu.bc host.bc
 # Lower each bitcode file to a position-independent object file.
 llc --relocation-model=pic --filetype=obj  kernel.bc
 llc --relocation-model=pic --filetype=obj  host.bc
 llc --relocation-model=pic --filetype=obj  cpuencode.bc
 # Link the objects against the project's runtime libraries into ./pavle.
 g++ -Wall -L../../build/runtime \
     -L../../build/runtime/threadPool -o pavle \
     -fPIC -no-pie host.o kernel.o cpuencode.o -lc -lx86Runtime -lthreadPool -lpthread
 # Make the runtime shared libraries visible to the loader, then run.
 export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH
 ./pavle ../../rodinia-data/huffman/test1024_H2.206587175259.in
--- a/examples/huffman/scan.cu
+++ b/examples/huffman/scan.cu
@ -1,216 +0,0 @@
 /*
 * Copyright 1993-2006 NVIDIA Corporation.  All rights reserved.
 *
 * NOTICE TO USER:
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and
 * international Copyright laws.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
 * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
 * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
 * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
 * OR PERFORMANCE OF THIS SOURCE CODE.
 *
 * U.S. Government End Users.  This source code is a "commercial item" as
 * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting  of
 * "commercial computer software" and "commercial computer software
 * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
 * and is provided to the U.S. Government only as a commercial end item.
 * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
 * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
 * source code with only those rights set forth herein.
 */
 #ifndef _PRESCAN_CU_
 #define _PRESCAN_CU_
 // includes, kernels
 #include "cutil.h"
 #include "scanLargeArray_kernel.cu"
 #include <assert.h>
 #include <stdio.h>
 #define max(a, b) (a > b ? a : b)
 // True when n has at most one bit set; note that 0 is therefore reported as
 // a power of two as well.
 inline bool isPowerOfTwo(int n) {
  // n & (n - 1) clears the lowest set bit of n
  const int withoutLowestBit = n & (n - 1);
  return withoutLowestBit == 0;
 }
 // Returns the largest power of two that does not exceed n.
 // n is expected to be >= 1.
 // The value is converted to double, not float: float keeps only 24 mantissa
 // bits, so an int such as 2^25 - 1 would round up to 2^25 and the returned
 // power of two would be larger than n.
 inline int floorPow2(int n) {
 #ifdef WIN32
  // method 2
  return 1 << (int)logb((double)n);
 #else
  // method 1
  int exponent;
  // frexp splits n into m * 2^exponent with m in [0.5, 1)
  frexp((double)n, &exponent);
  return 1 << (exponent - 1);
 #endif
 }
 // Maximum number of threads per block used by the host-side scan code below;
 // block counts are derived from 2 * BLOCK_SIZE elements per block.
 #define BLOCK_SIZE 256
 // Host table of per-level block-sum buffers; each entry is filled by
 // cudaMalloc in preallocBlockSums().
 static unsigned int **g_scanBlockSums;
 // Element count given to preallocBlockSums(); 0 means nothing is allocated.
 static unsigned int g_numEltsAllocated = 0;
 // Number of entries in g_scanBlockSums.
 static unsigned int g_numLevelsAllocated = 0;
 // Allocates one device buffer of block sums for every recursion level that a
 // scan of up to maxNumElements elements needs, and records the bookkeeping in
 // the g_* globals. Asserts that nothing is currently allocated.
 static void preallocBlockSums(unsigned int maxNumElements) {
  assert(g_numEltsAllocated == 0); // shouldn't be called
  g_numEltsAllocated = maxNumElements;
  const unsigned int threadsPerBlock = BLOCK_SIZE; // max size of the thread blocks
  // First walk: count the levels that produce more than one block.
  int levelCount = 0;
  unsigned int remaining = maxNumElements;
  for (;;) {
    const unsigned int blocks =
        max(1, (int)ceil((float)remaining / (2.f * threadsPerBlock)));
    if (blocks > 1)
      ++levelCount;
    remaining = blocks;
    if (remaining <= 1)
      break;
  }
  g_scanBlockSums =
      (unsigned int **)malloc(levelCount * sizeof(unsigned int *));
  g_numLevelsAllocated = levelCount;
  // Second walk: same sequence of sizes, now allocating a buffer per level.
  int nextLevel = 0;
  remaining = maxNumElements;
  for (;;) {
    const unsigned int blocks =
        max(1, (int)ceil((float)remaining / (2.f * threadsPerBlock)));
    if (blocks > 1) {
      CUDA_SAFE_CALL(cudaMalloc((void **)&g_scanBlockSums[nextLevel],
                                blocks * sizeof(unsigned int)));
      ++nextLevel;
    }
    remaining = blocks;
    if (remaining <= 1)
      break;
  }
  CUT_CHECK_ERROR("preallocBlockSums");
 }
 // Frees every device buffer listed in g_scanBlockSums, then the host table
 // itself, and resets the bookkeeping globals to their "nothing allocated"
 // state.
 static void deallocBlockSums() {
  unsigned int level = 0;
  while (level < g_numLevelsAllocated) {
    cudaFree(g_scanBlockSums[level]);
    ++level;
  }
  CUT_CHECK_ERROR("deallocBlockSums");
  free((void **)g_scanBlockSums);
  g_numLevelsAllocated = 0;
  g_numEltsAllocated = 0;
  g_scanBlockSums = 0;
 }
 // Scans numElements values of inArray into outArray using the prescan
 // kernels. When more than one block is needed, each block's total is written
 // to g_scanBlockSums[level], that array is scanned by a recursive call with
 // level + 1, and uniformAdd then adds the scanned totals back onto outArray.
 //   outArray    - destination buffer passed to the kernels
 //   inArray     - source buffer passed to the kernels
 //   numElements - number of elements to scan
 //   level       - index into g_scanBlockSums for this recursion depth
 // NOTE(review): the multi-block path uses g_scanBlockSums[level], so it
 // presumably requires preallocBlockSums() to have been called — confirm.
 static void prescanArrayRecursive(unsigned int *outArray,
                                  const unsigned int *inArray, int numElements,
                                  int level) {
  unsigned int blockSize = BLOCK_SIZE; // max size of the thread blocks
  unsigned int numBlocks =
      max(1, (int)ceil((float)numElements / (2.f * blockSize)));
  // threads per block: full blocks when several are needed, otherwise just
  // enough for the single (possibly non-power-of-2) block
  unsigned int numThreads;
  if (numBlocks > 1)
    numThreads = blockSize;
  else if (isPowerOfTwo(numElements))
    numThreads = numElements / 2;
  else
    numThreads = floorPow2(numElements);
  unsigned int numEltsPerBlock = numThreads * 2;
  // if this is a non-power-of-2 array, the last block will be non-full
  // compute the smallest power of 2 able to compute its scan.
  unsigned int numEltsLastBlock =
      numElements - (numBlocks - 1) * numEltsPerBlock;
  unsigned int numThreadsLastBlock = max(1, numEltsLastBlock / 2);
  unsigned int np2LastBlock = 0;
  // NOTE(review): sharedMemLastBlock is computed here but is not passed to any
  // kernel launch in this function.
  unsigned int sharedMemLastBlock = 0;
  if (numEltsLastBlock != numEltsPerBlock) {
    np2LastBlock = 1;
    if (!isPowerOfTwo(numEltsLastBlock))
      numThreadsLastBlock = floorPow2(numEltsLastBlock);
    unsigned int extraSpace = (2 * numThreadsLastBlock) / NUM_BANKS;
    sharedMemLastBlock =
        sizeof(unsigned int) * (2 * numThreadsLastBlock + extraSpace);
  }
  // padding space is used to avoid shared memory bank conflicts
  // NOTE(review): sharedMemSize is likewise computed but not passed to any
  // launch below; the launches specify only grid and block dimensions.
  unsigned int extraSpace = numEltsPerBlock / NUM_BANKS;
  unsigned int sharedMemSize =
      sizeof(unsigned int) * (numEltsPerBlock + extraSpace);
 #ifdef DEBUG
  if (numBlocks > 1) {
    assert(g_numEltsAllocated >= numElements);
  }
 #endif
  // setup execution parameters
  // if NP2, we process the last block separately
  dim3 grid(max(1, numBlocks - np2LastBlock), 1, 1);
  dim3 threads(numThreads, 1, 1);
  // make sure there are no CUDA errors before we start
  CUT_CHECK_ERROR("prescanArrayRecursive before kernels");
  // execute the scan
  if (numBlocks > 1) {
    // all full blocks: scan each block and store its total in the block sums
    prescan<true, false><<<grid, threads>>>(
        outArray, inArray, g_scanBlockSums[level], numThreads * 2, 0, 0);
    cudaThreadSynchronize();
    CUT_CHECK_ERROR("prescanWithBlockSums");
    if (np2LastBlock) {
      // the non-full last block is handled by its own single-block launch
      prescan<true, true><<<1, numThreadsLastBlock>>>(
          outArray, inArray, g_scanBlockSums[level], numEltsLastBlock,
          numBlocks - 1, numElements - numEltsLastBlock);
      cudaThreadSynchronize();
      CUT_CHECK_ERROR("prescanNP2WithBlockSums");
    }
    // After scanning all the sub-blocks, we are mostly done.  But now we
    // need to take all of the last values of the sub-blocks and scan those.
    // This will give us a new value that must be added to each block to
    // get the final results.
    // recursive (CPU) call
    prescanArrayRecursive(g_scanBlockSums[level], g_scanBlockSums[level],
                          numBlocks, level + 1);
    uniformAdd<<<grid, threads>>>(outArray, g_scanBlockSums[level],
                                  numElements - numEltsLastBlock, 0, 0);
    cudaThreadSynchronize();
    CUT_CHECK_ERROR("uniformAdd");
    if (np2LastBlock) {
      uniformAdd<<<1, numThreadsLastBlock>>>(outArray, g_scanBlockSums[level],
                                             numEltsLastBlock, numBlocks - 1,
                                             numElements - numEltsLastBlock);
      cudaThreadSynchronize();
      CUT_CHECK_ERROR("uniformAdd");
    }
  } else if (isPowerOfTwo(numElements)) {
    // single block, power-of-2 size: no block sums and no padding checks
    prescan<false, false>
        <<<grid, threads>>>(outArray, inArray, 0, numThreads * 2, 0, 0);
    cudaThreadSynchronize();
    CUT_CHECK_ERROR("prescan");
  } else {
    // single block, non-power-of-2 size
    prescan<false, true>
        <<<grid, threads>>>(outArray, inArray, 0, numElements, 0, 0);
    cudaThreadSynchronize();
    CUT_CHECK_ERROR("prescanNP2");
  }
 }
 // Entry point of the scan: forwards to prescanArrayRecursive() starting at
 // recursion level 0.
 //   outArray    - destination buffer handed to the scan kernels
 //   inArray     - source buffer handed to the scan kernels
 //   numElements - number of elements to scan
 static void prescanArray(unsigned int *outArray, unsigned int *inArray,
                         int numElements) {
  prescanArrayRecursive(outArray, inArray, numElements, 0);
 }
 #endif // _PRESCAN_CU_
--- a/examples/huffman/scanLargeArray_kernel.cu
+++ b/examples/huffman/scanLargeArray_kernel.cu
@ -1,237 +0,0 @@
 /*
 * Copyright 1993-2006 NVIDIA Corporation.  All rights reserved.
 *
 * NOTICE TO USER:
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and
 * international Copyright laws.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
 * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
 * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
 * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
 * OR PERFORMANCE OF THIS SOURCE CODE.
 *
 * U.S. Government End Users.  This source code is a "commercial item" as
 * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting  of
 * "commercial computer software" and "commercial computer software
 * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
 * and is provided to the U.S. Government only as a commercial end item.
 * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
 * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
 * source code with only those rights set forth herein.
 */
 #ifndef _SCAN_BEST_KERNEL_CU_
 #define _SCAN_BEST_KERNEL_CU_
 // Define this to more rigorously avoid bank conflicts,
 // even at the lower (root) levels of the tree
 // Note that due to the higher addressing overhead, performance
 // is lower with ZERO_BANK_CONFLICTS enabled.  It is provided
 // as an example.
 //#define ZERO_BANK_CONFLICTS
 // 16 banks on G80
 #define NUM_BANKS 16
 #define LOG_NUM_BANKS 4
 // CONFLICT_FREE_OFFSET(index): number of padding words added to a shared
 // memory index. The default variant adds one word per NUM_BANKS elements;
 // the ZERO_BANK_CONFLICTS variant adds a further word per NUM_BANKS^2
 // elements.
 #ifdef ZERO_BANK_CONFLICTS
 // Both shifts are parenthesized: '+' binds tighter than '>>', so without the
 // inner parentheses the expression would shift by (LOG_NUM_BANKS + index).
 #define CONFLICT_FREE_OFFSET(index)                                            \
  (((index) >> LOG_NUM_BANKS) + ((index) >> (2 * LOG_NUM_BANKS)))
 #else
 #define CONFLICT_FREE_OFFSET(index) ((index) >> LOG_NUM_BANKS)
 #endif
 ///////////////////////////////////////////////////////////////////////////////
 // Work-efficient compute implementation of scan, one thread per 2 elements
 // Work-efficient: O(log(n)) steps, and O(n) adds.
 // Also shared storage efficient: Uses n + n/NUM_BANKS shared memory -- no
 // ping-ponging. Also avoids most bank conflicts using single-element offsets
 // every NUM_BANKS elements.
 //
 // In addition, If ZERO_BANK_CONFLICTS is defined, uses
 //     n + n/NUM_BANKS + n/(NUM_BANKS*NUM_BANKS)
 // shared memory. If ZERO_BANK_CONFLICTS is defined, avoids ALL bank conflicts
 // using single-element offsets every NUM_BANKS elements, plus additional
 // single-element offsets after every NUM_BANKS^2 elements.
 //
 // Uses a balanced tree type algorithm.  See Blelloch, 1990 "Prefix Sums
 // and Their Applications", or Prins and Chatterjee PRAM course notes:
 // http://www.cs.unc.edu/~prins/Classes/203/Handouts/pram.pdf
 //
 // This work-efficient version is based on the algorithm presented in Guy
 // Blelloch's excellent paper "Prefix sums and their applications".
 // http://www-2.cs.cmu.edu/afs/cs.cmu.edu/project/scandal/public/papers/CMU-CS-90-190.html
 //
 // Pro: Work Efficient, very few bank conflicts (or zero if ZERO_BANK_CONFLICTS
 // is defined).
 // Con: More instructions to compute bank-conflict-free shared memory
 // addressing, and slightly more shared memory storage used.
 //
 // Loads this thread's two input elements from global memory into the padded
 // shared-memory window and hands the indices it used back to the caller.
 //   s_data       - shared-memory window (written)
 //   g_idata      - global input array (read)
 //   n            - element count used for the bounds check when isNP2
 //   baseIndex    - global index of the block's first element
 //   ai, bi       - out: unpadded shared indices of the two elements
 //   mem_ai/bi    - out: global indices of the two elements
 //   bankOffsetA/B- out: padding added to ai / bi
 // With isNP2 the second element is replaced by 0 when bi is not below n.
 template <bool isNP2>
 __device__ static void
 loadSharedChunkFromMem(unsigned int *s_data, const unsigned int *g_idata, int n,
                       int baseIndex, int &ai, int &bi, int &mem_ai,
                       int &mem_bi, int &bankOffsetA, int &bankOffsetB) {
  const int lane = threadIdx.x;
  // shared-memory slots: one element per thread, the second blockDim.x later
  ai = lane;
  bi = lane + blockDim.x;
  // matching positions in global memory
  mem_ai = baseIndex + threadIdx.x;
  mem_bi = mem_ai + blockDim.x;
  // compute spacing to avoid bank conflicts
  bankOffsetA = CONFLICT_FREE_OFFSET(ai);
  bankOffsetB = CONFLICT_FREE_OFFSET(bi);
  // first element is always read from global memory
  s_data[ai + bankOffsetA] = g_idata[mem_ai];
  // second element: padded with zero beyond n in the non-power-of-2 case
  unsigned int second;
  if (isNP2) { // compile-time decision
    second = (bi < n) ? g_idata[mem_bi] : 0;
  } else {
    second = g_idata[mem_bi];
  }
  s_data[bi + bankOffsetB] = second;
 }
 // Writes this thread's two scanned elements from the shared-memory window
 // back to global memory, after a block-wide barrier. With isNP2 the second
 // element is written only when bi is below n.
 template <bool isNP2>
 __device__ static void
 storeSharedChunkToMem(unsigned int *g_odata, const unsigned int *s_data, int n,
                      int ai, int bi, int mem_ai, int mem_bi, int bankOffsetA,
                      int bankOffsetB) {
  // every thread must have finished the scan before results are read
  __syncthreads();
  g_odata[mem_ai] = s_data[ai + bankOffsetA];
  // isNP2 is a template constant, so this test is resolved per instantiation
  const bool writeSecond = !isNP2 || (bi < n);
  if (writeSecond)
    g_odata[mem_bi] = s_data[bi + bankOffsetB];
 }
 // Thread 0 zeroes the last (padded) element of the shared window; with
 // storeSum it first copies that element to g_blockSums[blockIndex].
 // All other threads do nothing.
 template <bool storeSum>
 __device__ static void clearLastElement(unsigned int *s_data,
                                        unsigned int *g_blockSums,
                                        int blockIndex) {
  if (threadIdx.x != 0)
    return;
  // padded position of element 2 * blockDim.x - 1
  int last = (blockDim.x << 1) - 1;
  last += CONFLICT_FREE_OFFSET(last);
  if (storeSum) { // compile-time decision
    // publish this block's total sum at its slot in the blockSums array
    g_blockSums[blockIndex] = s_data[last];
  }
  // zero the last element in the scan so it will propagate back to the front
  s_data[last] = 0;
 }
 // Up-sweep: builds partial sums in place in s_data, halving the number of
 // active threads and doubling the stride on every step. Returns the stride
 // reached after the last step.
 __device__ static unsigned int buildSum(unsigned int *s_data) {
  const unsigned int tid = threadIdx.x;
  unsigned int span = 1;
  int active = blockDim.x;
  while (active > 0) {
    // previous step's writes must be visible before this step reads them
    __syncthreads();
    if (tid < active) {
      const int base = __mul24(__mul24(2, span), tid);
      int lo = base + span - 1;
      int hi = lo + span;
      lo += CONFLICT_FREE_OFFSET(lo);
      hi += CONFLICT_FREE_OFFSET(hi);
      s_data[hi] += s_data[lo];
    }
    span *= 2;
    active >>= 1;
  }
  return span;
 }
 // Down-sweep: walks back down the tree built by the up-sweep, halving the
 // stride and doubling the number of active threads on every step, swapping
 // and accumulating pairs of elements in place.
 //   s_data - shared window holding the partial sums
 //   stride - stride returned by the up-sweep
 __device__ static void scanRootToLeaves(unsigned int *s_data,
                                        unsigned int stride) {
  const unsigned int tid = threadIdx.x;
  int active = 1;
  while (active <= blockDim.x) {
    stride >>= 1;
    // previous step's writes must be visible before this step reads them
    __syncthreads();
    if (tid < active) {
      const int base = __mul24(__mul24(2, stride), tid);
      int lo = base + stride - 1;
      int hi = lo + stride;
      lo += CONFLICT_FREE_OFFSET(lo);
      hi += CONFLICT_FREE_OFFSET(hi);
      // left child takes the parent's value, parent accumulates the old left
      const unsigned int oldLo = s_data[lo];
      s_data[lo] = s_data[hi];
      s_data[hi] += oldLo;
    }
    active *= 2;
  }
 }
 // Scans one block's shared window: up-sweep, clear (and optionally store)
 // the last element, then down-sweep.
 //   data       - shared window to scan in place
 //   blockIndex - slot in blockSums for this block; 0 selects blockIdx.x
 //   blockSums  - destination of the block total when storeSum is set
 template <bool storeSum>
 __device__ static void prescanBlock(unsigned int *data, int blockIndex,
                                    unsigned int *blockSums) {
  // build the sum in place up the tree
  const int stride = buildSum(data);
  const int sumSlot = (blockIndex == 0) ? blockIdx.x : blockIndex;
  clearLastElement<storeSum>(data, blockSums, sumSlot);
  // traverse down tree to build the scan
  scanRootToLeaves(data, stride);
 }
 // Scan kernel: each block loads 2 * blockDim.x elements into a statically
 // sized shared window, scans them in place and writes them back.
 //   g_odata     - output array
 //   g_idata     - input array
 //   g_blockSums - receives each block's total when storeSum is set
 //   n           - element count used for bounds checks when isNP2
 //   blockIndex  - block-sum slot override; 0 selects blockIdx.x
 //   baseIndex   - first element override; 0 selects blockIdx.x * 2 * blockDim.x
 template <bool storeSum, bool isNP2>
 __global__ static void
 prescan(unsigned int *g_odata, const unsigned int *g_idata,
        unsigned int *g_blockSums, int n, int blockIndex, int baseIndex) {
  __shared__ unsigned int s_data[3072];
  int ai, bi, mem_ai, mem_bi, bankOffsetA, bankOffsetB;
  // global index of the first element handled by this block
  const int firstElement =
      (baseIndex == 0) ? __mul24(blockIdx.x, (blockDim.x << 1)) : baseIndex;
  // load data into shared memory
  loadSharedChunkFromMem<isNP2>(s_data, g_idata, n, firstElement, ai, bi,
                                mem_ai, mem_bi, bankOffsetA, bankOffsetB);
  // scan the data in each block
  prescanBlock<storeSum>(s_data, blockIndex, g_blockSums);
  // write results to device memory
  storeSharedChunkToMem<isNP2>(g_odata, s_data, n, ai, bi, mem_ai, mem_bi,
                               bankOffsetA, bankOffsetB);
 }
 // Adds one per-block value from `uniforms` to the two elements of g_data
 // that each thread owns.
 //   g_data      - array updated in place
 //   uniforms    - per-block values; block b uses uniforms[b + blockOffset]
 //   n           - element count used to bound the second element
 //   blockOffset - offset added to blockIdx.x when indexing uniforms
 //   baseIndex   - offset added to the block's first element index
 // The second element is touched only when it lies below n, so no thread
 // reads or writes g_data past the range it was asked to update.
 __global__ static void uniformAdd(unsigned int *g_data, unsigned int *uniforms,
                                  int n, int blockOffset, int baseIndex) {
  __shared__ unsigned int uni;
  if (threadIdx.x == 0)
    uni = uniforms[blockIdx.x + blockOffset];
  unsigned int address =
      __mul24(blockIdx.x, (blockDim.x << 1)) + baseIndex + threadIdx.x;
  // make thread 0's load of uni visible to the whole block
  __syncthreads();
  // note two adds per thread
  g_data[address] += uni;
  if (threadIdx.x + blockDim.x < n)
    g_data[address + blockDim.x] += uni;
 }
 #endif // #ifndef _SCAN_BEST_KERNEL_CU_
--- a/examples/huffman/stats_logger.cpp
+++ b/examples/huffman/stats_logger.cpp
@ -1,43 +0,0 @@
 /*
 * Copyright 2009 Tjark Bringewat. All rights reserved.
 */
 #include "stats_logger.h"
 #include "stdafx.h"
 #include <cstdio>
 #include <map>
 #include <sstream>
 // Maps "<graphname>__<seriesname>" to the series number chosen the first
 // time that series was logged.
 std::map<std::string, unsigned int> filenames;
 // Appends one "x y" sample to the text file of a data series.
 // The file is named "<graphname>__<number>_<seriesname>.txt", where <number>
 // is the seriesnumber passed on the first call for that graph/series pair.
 // On that first call the file is created and a header describing the series
 // is written; later calls only append samples.
 // If the file cannot be opened, a message is printed to stderr and the
 // sample is dropped instead of writing through a null FILE pointer.
 void LogStats(const char *graphname, const char *seriesname, float xValue,
              float yValue, const char *xAxisQuantity,
              const char *yAxisQuantity, const char *xAxisUnit,
              const char *yAxisUnit, const char *xAxisScaleType,
              const char *yAxisScaleType, unsigned int seriesnumber,
              const char *description) {
  std::ostringstream keyStream;
  keyStream << graphname << "__" << seriesname;
  const std::string key = keyStream.str();
  const bool isNewSeries = (filenames.count(key) == 0);
  if (isNewSeries)
    filenames[key] = seriesnumber;
  std::ostringstream pathStream;
  pathStream << graphname << "__" << filenames[key] << "_" << seriesname
             << ".txt";
  const std::string path = pathStream.str();
  FILE *f = fopen(path.c_str(), isNewSeries ? "wt" : "at");
  if (f == NULL) {
    fprintf(stderr, "LogStats: could not open '%s'\n", path.c_str());
    // forget the series again so a later successful call still writes the header
    if (isNewSeries)
      filenames.erase(key);
    return;
  }
  if (isNewSeries) {
    fprintf(f, "SERIES_NAME\n%s\n", seriesname);
    fprintf(f, "X_AXIS_QUANTITY\n%s\n", xAxisQuantity);
    fprintf(f, "Y_AXIS_QUANTITY\n%s\n", yAxisQuantity);
    fprintf(f, "X_AXIS_UNIT\n%s\n", xAxisUnit);
    fprintf(f, "Y_AXIS_UNIT\n%s\n", yAxisUnit);
    fprintf(f, "X_AXIS_SCALE_TYPE\n%s\n", xAxisScaleType);
    fprintf(f, "Y_AXIS_SCALE_TYPE\n%s\n", yAxisScaleType);
    fprintf(f, "DESCRIPTION\n%s\n", description);
    fprintf(f, "__DATA__\n");
  }
  fprintf(f, "%f %f\n", xValue, yValue);
  fclose(f);
 }
--- a/examples/huffman/stats_logger.h
+++ b/examples/huffman/stats_logger.h
@ -1,45 +0,0 @@
 /*
 * Copyright Tjark Bringewat. All rights reserved.
 */
 #ifndef _STATS_LOGGER_H_
 #define _STATS_LOGGER_H_
 #include <cstring>
 #pragma warning(disable : 4996)
 // Logs one (xValue, yValue) sample for the series `seriesname` of the graph
 // `graphname`; the definition is in stats_logger.cpp. Units, scale types,
 // series number and description are optional and default as shown.
 extern "C" void
 LogStats(const char *graphname, const char *seriesname, float xValue,
         float yValue, const char *xAxisQuantity, const char *yAxisQuantity,
         const char *xAxisUnit = "", const char *yAxisUnit = "",
         const char *xAxisScaleType = "lin", const char *yAxisScaleType = "lin",
         unsigned int seriesnumber = 0, const char *description = "");
 // Convenience wrapper around LogStats() that takes the y value first and
 // defaults to a "Time [ms] over Data size [MB]" plot. When the units are
 // exactly "MB" and "ms", a second sample is logged to the graph
 // "<graph>_datarate" with the value (xValue * 1000) / (yValue * 1024),
 // labelled in GB/s.
 // The "<graph>_datarate" name is built in a buffer sized from strlen(graph),
 // so graph names of any length are handled without overflowing.
 inline void LogStats2(
    const char *graph, // Groups several functions into one graph. Only appears
                       // in the file name.
    const char *function, // Name of the particular function. Appears in file
                          // name and legend.
    float yValue, float xValue, const char *yAxisName = "Time",
    const char *yAxisUnit = "ms", const char *xAxisName = "Data size",
    const char *xAxisUnit = "MB",
    const char *yAxisScaleType = "lin", // Can be lin or log for linear or
                                        // logarithmic scale, respectively.
    const char *xAxisScaleType = "log",
    unsigned int fId =
        0, // Determines the order in which different functions are plotted to a
           // common graph. Only appears in the file name.
    const char *description = "") {
  LogStats(graph, function, xValue, yValue, xAxisName, yAxisName, xAxisUnit,
           yAxisUnit, xAxisScaleType, yAxisScaleType, fId, description);
  if (strcmp(xAxisUnit, "MB") == 0 && strcmp(yAxisUnit, "ms") == 0) {
    static const char kSuffix[] = "_datarate";
    const size_t graphLen = strlen(graph);
    // sizeof(kSuffix) already includes the terminating '\0'
    char *rateGraph = new char[graphLen + sizeof(kSuffix)];
    memcpy(rateGraph, graph, graphLen);
    memcpy(rateGraph + graphLen, kSuffix, sizeof(kSuffix));
    LogStats(rateGraph, function, xValue,
             (xValue * 1000.0f) / (yValue * 1024.0f), xAxisName, "Data rate",
             xAxisUnit, "GB/s", xAxisScaleType, yAxisScaleType, fId,
             description);
    delete[] rateGraph;
  }
 }
 #endif
--- a/examples/huffman/stdafx.h
+++ b/examples/huffman/stdafx.h
@ -1,11 +0,0 @@
 #pragma once
 #include "cutil.h"
 #include <iostream>
 #include <malloc.h>
 #include <math.h>
 #include <memory.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <time.h>
--- a/examples/huffman/testdatagen.h
+++ b/examples/huffman/testdatagen.h
@ -1,83 +0,0 @@
 #ifndef _TESTDATA_GEN_H_
 #define _TESTDATA_GEN_H_
 #include "parameters.h"
 // Fills `data` with num_blocks copies of one test block of num_block_threads
 // elements. The block repeats the 8-element pattern 1,2,3,3,3,4,4,5.
 // Elements are written one at a time, so a block length that is not a
 // multiple of 8 simply ends part-way through the pattern and never writes
 // past the end of the block.
 //   data              - output buffer of num_blocks * num_block_threads items
 //   num_blocks        - number of blocks to produce
 //   num_block_threads - number of elements per block
 template <typename T>
 __inline__ void generateRLETestData(T *data, unsigned int num_blocks,
                                    unsigned int num_block_threads) {
  static const int kPattern[8] = {1, 2, 3, 3, 3, 4, 4, 5};
  /* generate first block*/
  for (unsigned int i = 0; i < num_block_threads; i++)
    data[i] = kPattern[i % 8];
  /*  copy contents of the first block to all other blocks (for testing only)*/
  for (unsigned int j = 1; j < num_blocks; j++)
    for (unsigned int i = 0; i < num_block_threads; i++)
      data[j * num_block_threads + i] = data[i];
 }
 // Fills `data` with num_blocks copies of one test block made of runs of
 // avg_run_len equal values: run r consists of the value r. A final partial
 // run is filled as well when num_block_threads is not a multiple of
 // avg_run_len. Nothing is written when avg_run_len is 0 (a run length of
 // zero would otherwise divide by zero).
 //   data              - output buffer of num_blocks * num_block_threads items
 //   num_blocks        - number of blocks to produce
 //   num_block_threads - number of elements per block
 //   avg_run_len       - length of each run
 template <typename T>
 __inline__ void generateRLETestDataLongRuns1(T *data, unsigned int num_blocks,
                                             unsigned int num_block_threads,
                                             unsigned int avg_run_len) {
  if (avg_run_len == 0)
    return;
  /* generate first block*/
  for (unsigned int i = 0; i < num_block_threads; i++)
    data[i] = i / avg_run_len;
  /*  copy contents of the first block to all other blocks (for testing only)*/
  for (unsigned int j = 1; j < num_blocks; j++)
    for (unsigned int i = 0; i < num_block_threads; i++)
      data[j * num_block_threads + i] = data[i];
 }
 // VLE TEST DATA VER2.0
 // For testing only: generates codewords whose lengths repeat the pattern
 // 1, 2, 3, 4, 4, 5, 6, 7, with dummy codewords (in binary)
 // 1, 10, 100, 1000, 1000, 10000, 100000, 1000000,
 // i.e. 0x01, 0x02, 0x4, 0x8, 0x8, 0x10, 0x20, 0x40.
 // Every symbol gets its length from its own index, so num_symbols does not
 // have to be a multiple of 8: no length is left unset before it is used.
 //   codewords    - out: codeword of each symbol (a single 1 bit at the top
 //                  of a word of the symbol's length)
 //   codewordlens - out: codeword length in bits of each symbol
 //   num_symbols  - number of entries to fill in both arrays
 inline void generateCodewords(unsigned int *codewords,
                              unsigned int *codewordlens,
                              unsigned int num_symbols) {
  static const unsigned int kLengths[8] = {1, 2, 3, 4, 4, 5, 6, 7};
  for (unsigned int k = 0; k < num_symbols; k++) {
    const unsigned int numbits = kLengths[k % 8];
    codewordlens[k] = numbits;
    codewords[k] = 0x01u << (numbits - 1);
  }
 }
 // Fills `data` with num_elements pseudo-random symbol indices drawn from
 // rand(), each in the range [0, num_symbols).
 // The scaling is done in double: RAND_MAX + 1 overflows int on platforms
 // where RAND_MAX equals INT_MAX, and float rounding of rand() could reach
 // 1.0 and produce the out-of-range index num_symbols.
 //   data         - out: generated symbol indices
 //   num_elements - number of entries to write
 //   codewords    - unused; kept for interface compatibility
 //   codewordlens - unused; kept for interface compatibility
 //   num_symbols  - exclusive upper bound of the generated values
 inline void generateData(unsigned int *data, unsigned int num_elements,
                         unsigned int *codewords, unsigned int *codewordlens,
                         unsigned int num_symbols) {
  (void)codewords;
  (void)codewordlens;
  const double range = (double)RAND_MAX + 1.0;
  for (unsigned int i = 0; i < num_elements; i++) {
    // rand() / range lies in [0, 1), so the product lies in [0, num_symbols)
    data[i] = (unsigned int)(((double)rand() / range) * num_symbols);
  }
 }
 #endif
--- a/examples/huffman/vlc_kernel_sm64huff.cu
+++ b/examples/huffman/vlc_kernel_sm64huff.cu
@ -1,160 +0,0 @@
 #ifndef _VLC_SM64HUFF_KERNEL_H_
 #define _VLC_SM64HUFF_KERNEL_H_
 #include "pabio_kernels_v2.cu"
 #include "parameters.h"
 #include <cstdio>
 #ifdef SMATOMICS
 /* HUFFMAN-FRIENDLY PAVLE
   CHARACTERISTICS:
   1. CACHE CW_LUT INTO SM, LOAD AS 2 INT ARRAYS
   2. PARALLEL PREFIX SUM
   3. PARALLEL BIT I/O USING SHARED-MEMORY ATOMIC OPERATIONS (COMPATIBLE WITH
   CUDA1.3+)
   NOTES & ASSUMPTIONS:
   -	HUFFMAN-CODING FRIENDLY, SUPPORTS CODEWORDS OF 2X SIZE OF ORIGINAL
   SYMBOLS (BYTES).
   -	NUMBER OF THREADS PER BLOCK IS 256; IF YOU WANT TO PLAY WITH DIFFERENT
   NUMBERS, THE CW CACHING SHOULD BE MODIFIED (SEE DPT* KERNELS)
   -	SM usage: 1x size of the input data (REUSE) + size of CWLUT
                TURN ON CACHING FOR HIGH ENTROPY DATA!
 */
 // Variable-length encodes one 32-bit input word (four byte symbols) per
 // thread and packs the block's codewords into a contiguous bit stream.
 //   data            - input words, one per thread (indexed by global thread id)
 //   gm_codewords    - codeword lookup table indexed by byte value
 //   gm_codewordlens - codeword length (bits) lookup table indexed by byte value
 //   out             - packed output words of each block, written at the
 //                     block's own offset
 //   outidx          - per block: total number of bits produced
 // Steps: look up and concatenate the four codewords into cw64, prefix-sum the
 // per-thread bit lengths in shared memory, then OR each thread's bits into
 // the shared output words with atomicOr and copy the used words to `out`.
 // Assumptions visible in the code: the tree scan expects blockDim.x to be a
 // power of two, and with CACHECWLUT each thread loads one lookup-table entry,
 // so blockDim.x must cover the table.
 // The bit masks use 64-bit arithmetic because a 32-bit `1 << 32` is
 // undefined (it yields a zero mask on some targets), and nothing is written
 // for a zero-length codeword so no shift by 32 can occur.
 __global__ static void
 vlc_encode_kernel_sm64huff(unsigned int *data, const unsigned int *gm_codewords,
                           const unsigned int *gm_codewordlens,
 #ifdef TESTING
                           unsigned int *cw32, unsigned int *cw32len,
                           unsigned int *cw32idx,
 #endif
                           unsigned int *out, unsigned int *outidx) {
  unsigned int kn = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int k = threadIdx.x;
  unsigned int kc, startbit, wrbits;
  unsigned long long cw64 = 0;
  unsigned int val32, codewordlen = 0;
  unsigned char tmpbyte, tmpcwlen;
  unsigned int tmpcw32;
  __shared__ unsigned int sm[3072];
  __shared__ unsigned int kcmax;
 #ifdef CACHECWLUT
  // shared memory layout: codewords | codeword lengths | scan/output area
  unsigned int *codewords = (unsigned int *)sm;
  unsigned int *codewordlens = (unsigned int *)(sm + NUM_SYMBOLS);
  unsigned int *as = (unsigned int *)(sm + 2 * NUM_SYMBOLS);
  /* Load the codewords and the original data*/
  codewords[k] = gm_codewords[k];
  codewordlens[k] = gm_codewordlens[k];
  val32 = data[kn];
  __syncthreads();
  // concatenate the codewords of the four bytes, most significant byte first
  for (unsigned int i = 0; i < 4; i++) {
    tmpbyte = (unsigned char)(val32 >> ((3 - i) * 8));
    tmpcw32 = codewords[tmpbyte];
    tmpcwlen = codewordlens[tmpbyte];
    cw64 = (cw64 << tmpcwlen) | tmpcw32;
    codewordlen += tmpcwlen;
  }
 #else
  unsigned int *as = (unsigned int *)sm;
  val32 = data[kn];
  // concatenate the codewords of the four bytes, most significant byte first
  for (unsigned int i = 0; i < 4; i++) {
    tmpbyte = (unsigned char)(val32 >> ((3 - i) * 8));
    tmpcw32 = gm_codewords[tmpbyte];
    tmpcwlen = gm_codewordlens[tmpbyte];
    cw64 = (cw64 << tmpcwlen) | tmpcw32;
    codewordlen += tmpcwlen;
  }
 #endif
  as[k] = codewordlen;
  __syncthreads();
  /* Prefix sum of codeword lengths (denoted in bits) [inplace implementation]
   */
  unsigned int offset = 1;
  /* Build the sum in place up the tree */
  for (unsigned int d = (blockDim.x) >> 1; d > 0; d >>= 1) {
    __syncthreads();
    if (k < d) {
      // full-width indices: an 8-bit index would wrap for blocks > 256 threads
      unsigned int ai = offset * (2 * k + 1) - 1;
      unsigned int bi = offset * (2 * k + 2) - 1;
      as[bi] += as[ai];
    }
    offset *= 2;
  }
  /* scan back down the tree */
  /* clear the last element */
  if (k == 0)
    as[blockDim.x - 1] = 0;
  // traverse down the tree building the scan in place
  for (unsigned int d = 1; d < blockDim.x; d *= 2) {
    offset >>= 1;
    __syncthreads();
    if (k < d) {
      unsigned int ai = offset * (2 * k + 1) - 1;
      unsigned int bi = offset * (2 * k + 2) - 1;
      unsigned int t = as[ai];
      as[ai] = as[bi];
      as[bi] += t;
    }
  }
  __syncthreads();
  // the last thread knows the block's total bit count
  if (k == blockDim.x - 1) {
    outidx[blockIdx.x] = as[k] + codewordlen;
    kcmax = (as[k] + codewordlen) / 32;
    // printf("kcmax: %d\n", kcmax);
  }
  /* Write the codes */
  // as[k] is this thread's starting bit; split it into word index and bit
  kc = as[k] / 32;
  startbit = as[k] % 32;
  // the scan area is reused as the output buffer, so clear it first
  as[k] = 0U;
  __syncthreads();
  /* Part 1*/
  // bits that still fit into the word the codeword starts in
  wrbits = codewordlen > (32 - startbit) ? (32 - startbit) : codewordlen;
  if (wrbits) {
    tmpcw32 = (unsigned int)(cw64 >> (codewordlen - wrbits));
    // if (wrbits == 32) as[kc] = tmpcw32;
    // //unnecessary overhead; increases number of branches else
    atomicOr(&as[kc], tmpcw32 << (32 - startbit -
                                  wrbits)); // shift left in case it's shorter
                                            // then the available space
    codewordlen -= wrbits;
  }
  /*Part 2*/
  // next (up to) 32 bits go into the following word
  if (codewordlen) {
    wrbits = codewordlen > 32 ? 32 : codewordlen;
    tmpcw32 = (unsigned int)((cw64 >> (codewordlen - wrbits)) &
                             ((1ULL << wrbits) - 1ULL));
    // if (wrbits == 32) as[kc+1] = tmpcw32;
    // else
    atomicOr(&as[kc + 1], tmpcw32 << (32 - wrbits));
    codewordlen -= wrbits;
  }
  /*Part 3*/
  // whatever is left goes into the word after that
  if (codewordlen) {
    tmpcw32 = (unsigned int)(cw64 & ((1ULL << codewordlen) - 1ULL));
    // if (wrbits == 32) as[kc+2] = tmpcw32;
    // else
    atomicOr(&as[kc + 2], tmpcw32 << (32 - codewordlen));
  }
  __syncthreads();
  // copy only the words that received bits
  if (k <= kcmax)
    out[kn] = as[k];
 }
 //////////////////////////////////////////////////////////////////////////////
 #endif
 #endif
--- a/Show More
+++ b/Show More