remove useless examples

2022-09-15 11:31:58 -04:00 · 2022-09-15 11:31:58 -04:00 · 9152feb24f
parent 49adfd026c
commit 9152feb24f
140 changed files with 0 additions and 67741 deletions
--- a/examples/backprop/backprop.c
+++ b/examples/backprop/backprop.c
@ -1,454 +0,0 @@
-#include "backprop.h"
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-//#define OPEN
-
-#define ABS(x) (((x) > 0.0) ? (x) : (-(x)))
-
-#define fastcopy(to, from, len)                                                \
-  {                                                                            \
-    register char *_to, *_from;                                                \
-    register int _i, _l;                                                       \
-    _to = (char *)(to);                                                        \
-    _from = (char *)(from);                                                    \
-    _l = (len);                                                                \
-    for (_i = 0; _i < _l; _i++)                                                \
-      *_to++ = *_from++;                                                       \
-  }
-
-/*** Return random number between 0.0 and 1.0 ***/
-float drnd() { return ((float)rand() / (float)BIGRND); }
-
-/*** Return random number between -1.0 and 1.0 ***/
-float dpn1() { return ((drnd() * 2.0) - 1.0); }
-
-/*** The squashing function.  Currently, it's a sigmoid. ***/
-
-float squash(x)
-float x;
-{
-  float m;
-  // x = -x;
-  // m = 1 + x + x*x/2 + x*x*x/6 + x*x*x*x/24 + x*x*x*x*x/120;
-  // return(1.0 / (1.0 + m));
-  return (1.0 / (1.0 + exp(-x)));
-}
-
-/*** Allocate 1d array of floats ***/
-
-float *alloc_1d_dbl(n)
-int n;
-{
-  float *new;
-
-  new = (float *)malloc((unsigned)(n * sizeof(float)));
-  if (new == NULL) {
-    printf("ALLOC_1D_DBL: Couldn't allocate array of floats\n");
-    return (NULL);
-  }
-  return (new);
-}
-
-/*** Allocate 2d array of floats ***/
-
-float **alloc_2d_dbl(m, n)
-int m, n;
-{
-  int i;
-  float **new;
-
-  new = (float **)malloc((unsigned)(m * sizeof(float *)));
-  if (new == NULL) {
-    printf("ALLOC_2D_DBL: Couldn't allocate array of dbl ptrs\n");
-    return (NULL);
-  }
-
-  for (i = 0; i < m; i++) {
-    new[i] = alloc_1d_dbl(n);
-  }
-
-  return (new);
-}
-
-bpnn_randomize_weights(w, m, n) float **w;
-int m, n;
-{
-  int i, j;
-
-  for (i = 0; i <= m; i++) {
-    for (j = 0; j <= n; j++) {
-      w[i][j] = (float)rand() / RAND_MAX;
-      //  w[i][j] = dpn1();
-    }
-  }
-}
-
-bpnn_randomize_row(w, m) float *w;
-int m;
-{
-  int i;
-  for (i = 0; i <= m; i++) {
-    // w[i] = (float) rand()/RAND_MAX;
-    w[i] = 0.1;
-  }
-}
-
-bpnn_zero_weights(w, m, n) float **w;
-int m, n;
-{
-  int i, j;
-
-  for (i = 0; i <= m; i++) {
-    for (j = 0; j <= n; j++) {
-      w[i][j] = 0.0;
-    }
-  }
-}
-
-void bpnn_initialize(seed) {
-  printf("Random number generator seed: %d\n", seed);
-  srand(seed);
-}
-
-BPNN *bpnn_internal_create(n_in, n_hidden, n_out)
-int n_in, n_hidden, n_out;
-{
-  BPNN *newnet;
-
-  newnet = (BPNN *)malloc(sizeof(BPNN));
-  if (newnet == NULL) {
-    printf("BPNN_CREATE: Couldn't allocate neural network\n");
-    return (NULL);
-  }
-
-  newnet->input_n = n_in;
-  newnet->hidden_n = n_hidden;
-  newnet->output_n = n_out;
-  newnet->input_units = alloc_1d_dbl(n_in + 1);
-  newnet->hidden_units = alloc_1d_dbl(n_hidden + 1);
-  newnet->output_units = alloc_1d_dbl(n_out + 1);
-
-  newnet->hidden_delta = alloc_1d_dbl(n_hidden + 1);
-  newnet->output_delta = alloc_1d_dbl(n_out + 1);
-  newnet->target = alloc_1d_dbl(n_out + 1);
-
-  newnet->input_weights = alloc_2d_dbl(n_in + 1, n_hidden + 1);
-  newnet->hidden_weights = alloc_2d_dbl(n_hidden + 1, n_out + 1);
-
-  newnet->input_prev_weights = alloc_2d_dbl(n_in + 1, n_hidden + 1);
-  newnet->hidden_prev_weights = alloc_2d_dbl(n_hidden + 1, n_out + 1);
-
-  return (newnet);
-}
-
-void bpnn_free(net) BPNN *net;
-{
-  int n1, n2, i;
-
-  n1 = net->input_n;
-  n2 = net->hidden_n;
-
-  free((char *)net->input_units);
-  free((char *)net->hidden_units);
-  free((char *)net->output_units);
-
-  free((char *)net->hidden_delta);
-  free((char *)net->output_delta);
-  free((char *)net->target);
-
-  for (i = 0; i <= n1; i++) {
-    free((char *)net->input_weights[i]);
-    free((char *)net->input_prev_weights[i]);
-  }
-  free((char *)net->input_weights);
-  free((char *)net->input_prev_weights);
-
-  for (i = 0; i <= n2; i++) {
-    free((char *)net->hidden_weights[i]);
-    free((char *)net->hidden_prev_weights[i]);
-  }
-  free((char *)net->hidden_weights);
-  free((char *)net->hidden_prev_weights);
-
-  free((char *)net);
-}
-
-/*** Creates a new fully-connected network from scratch,
-     with the given numbers of input, hidden, and output units.
-     Threshold units are automatically included.  All weights are
-     randomly initialized.
-     Space is also allocated for temporary storage (momentum weights,
-     error computations, etc).
-***/
-
-BPNN *bpnn_create(n_in, n_hidden, n_out)
-int n_in, n_hidden, n_out;
-{
-
-  BPNN *newnet;
-
-  newnet = bpnn_internal_create(n_in, n_hidden, n_out);
-
-#ifdef INITZERO
-  bpnn_zero_weights(newnet->input_weights, n_in, n_hidden);
-#else
-  bpnn_randomize_weights(newnet->input_weights, n_in, n_hidden);
-#endif
-  bpnn_randomize_weights(newnet->hidden_weights, n_hidden, n_out);
-  bpnn_zero_weights(newnet->input_prev_weights, n_in, n_hidden);
-  bpnn_zero_weights(newnet->hidden_prev_weights, n_hidden, n_out);
-  bpnn_randomize_row(newnet->target, n_out);
-  return (newnet);
-}
-
-void bpnn_layerforward(l1, l2, conn, n1, n2) float *l1, *l2, **conn;
-int n1, n2;
-{
-  float sum;
-  int j, k;
-
-  /*** Set up thresholding unit ***/
-  l1[0] = 1.0;
-#ifdef OPEN
-  omp_set_num_threads(NUM_THREAD);
-#pragma omp parallel for shared(conn, n1, n2, l1) private(k, j) reduction(+: sum) schedule(static)
-#endif
-  /*** For each unit in second layer ***/
-  for (j = 1; j <= n2; j++) {
-
-    /*** Compute weighted sum of its inputs ***/
-    sum = 0.0;
-    for (k = 0; k <= n1; k++) {
-      sum += conn[k][j] * l1[k];
-    }
-    l2[j] = squash(sum);
-  }
-}
-
-// extern "C"
-void bpnn_output_error(delta, target, output, nj, err) float *delta, *target,
-    *output, *err;
-int nj;
-{
-  int j;
-  float o, t, errsum;
-  errsum = 0.0;
-  for (j = 1; j <= nj; j++) {
-    o = output[j];
-    t = target[j];
-    delta[j] = o * (1.0 - o) * (t - o);
-    errsum += ABS(delta[j]);
-  }
-  *err = errsum;
-}
-
-void bpnn_hidden_error(delta_h, nh, delta_o, no, who, hidden,
-                       err) float *delta_h,
-    *delta_o, *hidden, **who, *err;
-int nh, no;
-{
-  int j, k;
-  float h, sum, errsum;
-
-  errsum = 0.0;
-  for (j = 1; j <= nh; j++) {
-    h = hidden[j];
-    sum = 0.0;
-    for (k = 1; k <= no; k++) {
-      sum += delta_o[k] * who[j][k];
-    }
-    delta_h[j] = h * (1.0 - h) * sum;
-    errsum += ABS(delta_h[j]);
-  }
-  *err = errsum;
-}
-
-void bpnn_adjust_weights(delta, ndelta, ly, nly, w, oldw) float *delta, *ly,
-    **w, **oldw;
-{
-  float new_dw;
-  int k, j;
-  ly[0] = 1.0;
-  // eta = 0.3;
-  // momentum = 0.3;
-
-#ifdef OPEN
-  omp_set_num_threads(NUM_THREAD);
-#pragma omp parallel for shared(oldw, w, delta) private(j, k, new_dw)          \
-    firstprivate(ndelta, nly, momentum)
-#endif
-  for (j = 1; j <= ndelta; j++) {
-    for (k = 0; k <= nly; k++) {
-      new_dw = ((ETA * delta[j] * ly[k]) + (MOMENTUM * oldw[k][j]));
-      w[k][j] += new_dw;
-      oldw[k][j] = new_dw;
-    }
-  }
-}
-
-void bpnn_feedforward(net) BPNN *net;
-{
-  int in, hid, out;
-
-  in = net->input_n;
-  hid = net->hidden_n;
-  out = net->output_n;
-
-  /*** Feed forward input activations. ***/
-  bpnn_layerforward(net->input_units, net->hidden_units, net->input_weights, in,
-                    hid);
-  bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights,
-                    hid, out);
-}
-
-void bpnn_train(net, eo, eh) BPNN *net;
-float *eo, *eh;
-{
-  int in, hid, out;
-  float out_err, hid_err;
-
-  in = net->input_n;
-  hid = net->hidden_n;
-  out = net->output_n;
-
-  /*** Feed forward input activations. ***/
-  bpnn_layerforward(net->input_units, net->hidden_units, net->input_weights, in,
-                    hid);
-  bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights,
-                    hid, out);
-
-  /*** Compute error on output and hidden units. ***/
-  bpnn_output_error(net->output_delta, net->target, net->output_units, out,
-                    &out_err);
-  bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out,
-                    net->hidden_weights, net->hidden_units, &hid_err);
-  *eo = out_err;
-  *eh = hid_err;
-
-  /*** Adjust input and hidden weights. ***/
-  bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid,
-                      net->hidden_weights, net->hidden_prev_weights);
-  bpnn_adjust_weights(net->hidden_delta, hid, net->input_units, in,
-                      net->input_weights, net->input_prev_weights);
-}
-
-void bpnn_save(net, filename) BPNN *net;
-char *filename;
-{
-  int n1, n2, n3, i, j, memcnt;
-  float dvalue, **w;
-  char *mem;
-  /// add//
-  FILE *pFile;
-  pFile = fopen(filename, "w+");
-  ///////
-  /*
-  if ((fd = creat(filename, 0644)) == -1) {
-    printf("BPNN_SAVE: Cannot create '%s'\n", filename);
-    return;
-  }
-  */
-
-  n1 = net->input_n;
-  n2 = net->hidden_n;
-  n3 = net->output_n;
-  printf("Saving %dx%dx%d network to '%s'\n", n1, n2, n3, filename);
-  // fflush(stdout);
-
-  // write(fd, (char *) &n1, sizeof(int));
-  // write(fd, (char *) &n2, sizeof(int));
-  // write(fd, (char *) &n3, sizeof(int));
-
-  fwrite((char *)&n1, sizeof(char), sizeof(char), pFile);
-  fwrite((char *)&n2, sizeof(char), sizeof(char), pFile);
-  fwrite((char *)&n3, sizeof(char), sizeof(char), pFile);
-
-  memcnt = 0;
-  w = net->input_weights;
-  mem = (char *)malloc((unsigned)((n1 + 1) * (n2 + 1) * sizeof(float)));
-  for (i = 0; i <= n1; i++) {
-    for (j = 0; j <= n2; j++) {
-      dvalue = w[i][j];
-      fastcopy(&mem[memcnt], &dvalue, sizeof(float));
-      memcnt += sizeof(float);
-    }
-  }
-  // write(fd, mem, (n1+1) * (n2+1) * sizeof(float));
-  fwrite(mem, (unsigned)(sizeof(float)),
-         (unsigned)((n1 + 1) * (n2 + 1) * sizeof(float)), pFile);
-  free(mem);
-
-  memcnt = 0;
-  w = net->hidden_weights;
-  mem = (char *)malloc((unsigned)((n2 + 1) * (n3 + 1) * sizeof(float)));
-  for (i = 0; i <= n2; i++) {
-    for (j = 0; j <= n3; j++) {
-      dvalue = w[i][j];
-      fastcopy(&mem[memcnt], &dvalue, sizeof(float));
-      memcnt += sizeof(float);
-    }
-  }
-  // write(fd, mem, (n2+1) * (n3+1) * sizeof(float));
-  fwrite(mem, sizeof(float), (unsigned)((n2 + 1) * (n3 + 1) * sizeof(float)),
-         pFile);
-  free(mem);
-
-  fclose(pFile);
-  return;
-}
-
-BPNN *bpnn_read(filename)
-char *filename;
-{
-  char *mem;
-  BPNN *new;
-  int fd, n1, n2, n3, i, j, memcnt;
-
-  if ((fd = open(filename, 0, 0644)) == -1) {
-    return (NULL);
-  }
-
-  printf("Reading '%s'\n", filename); // fflush(stdout);
-
-  read(fd, (char *)&n1, sizeof(int));
-  read(fd, (char *)&n2, sizeof(int));
-  read(fd, (char *)&n3, sizeof(int));
-  new = bpnn_internal_create(n1, n2, n3);
-
-  printf("'%s' contains a %dx%dx%d network\n", filename, n1, n2, n3);
-  printf("Reading input weights..."); // fflush(stdout);
-
-  memcnt = 0;
-  mem = (char *)malloc((unsigned)((n1 + 1) * (n2 + 1) * sizeof(float)));
-  read(fd, mem, (n1 + 1) * (n2 + 1) * sizeof(float));
-  for (i = 0; i <= n1; i++) {
-    for (j = 0; j <= n2; j++) {
-      fastcopy(&(new->input_weights[i][j]), &mem[memcnt], sizeof(float));
-      memcnt += sizeof(float);
-    }
-  }
-  free(mem);
-
-  printf("Done\nReading hidden weights..."); // fflush(stdout);
-
-  memcnt = 0;
-  mem = (char *)malloc((unsigned)((n2 + 1) * (n3 + 1) * sizeof(float)));
-  read(fd, mem, (n2 + 1) * (n3 + 1) * sizeof(float));
-  for (i = 0; i <= n2; i++) {
-    for (j = 0; j <= n3; j++) {
-      fastcopy(&(new->hidden_weights[i][j]), &mem[memcnt], sizeof(float));
-      memcnt += sizeof(float);
-    }
-  }
-  free(mem);
-  close(fd);
-
-  printf("Done\n"); // fflush(stdout);
-
-  bpnn_zero_weights(new->input_prev_weights, n1, n2);
-  bpnn_zero_weights(new->hidden_prev_weights, n2, n3);
-
-  return (new);
-}
--- a/examples/backprop/backprop.h
+++ b/examples/backprop/backprop.h
@ -1,50 +0,0 @@
-#ifndef _BACKPROP_H_
-#define _BACKPROP_H_
-
-#define BIGRND 0x7fffffff
-
-#define GPU
-#define THREADS 256
-#define WIDTH 16  // shared memory width
-#define HEIGHT 16 // shared memory height
-
-#define ETA 0.3      // eta value
-#define MOMENTUM 0.3 // momentum value
-#define NUM_THREAD 4 // OpenMP threads
-
-typedef struct {
-  int input_n;  /* number of input units */
-  int hidden_n; /* number of hidden units */
-  int output_n; /* number of output units */
-
-  float *input_units;  /* the input units */
-  float *hidden_units; /* the hidden units */
-  float *output_units; /* the output units */
-
-  float *hidden_delta; /* storage for hidden unit error */
-  float *output_delta; /* storage for output unit error */
-
-  float *target; /* storage for target vector */
-
-  float **input_weights;  /* weights from input to hidden layer */
-  float **hidden_weights; /* weights from hidden to output layer */
-
-  /*** The next two are for momentum ***/
-  float **input_prev_weights;  /* previous change on input to hidden wgt */
-  float **hidden_prev_weights; /* previous change on hidden to output wgt */
-} BPNN;
-
-/*** User-level functions ***/
-
-void bpnn_initialize();
-
-BPNN *bpnn_create();
-void bpnn_free();
-
-void bpnn_train();
-void bpnn_feedforward();
-
-void bpnn_save();
-BPNN *bpnn_read();
-
-#endif
--- a/examples/backprop/backprop_cuda-cuda-nvptx64-nvidia-cuda-sm_61.ll
+++ b/examples/backprop/backprop_cuda-cuda-nvptx64-nvidia-cuda-sm_61.ll
@ -1,615 +0,0 @@
-; ModuleID = 'backprop_cuda-cuda-nvptx64-nvidia-cuda-sm_61.bc'
-source_filename = "backprop_cuda.cu"
-target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
-target triple = "nvptx64-nvidia-cuda"
-
-%struct.__cuda_builtin_blockIdx_t = type { i8 }
-%struct.__cuda_builtin_threadIdx_t = type { i8 }
-%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
-
-$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv = comdat any
-
-$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
-
-$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv = comdat any
-
-@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
-@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
-@_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE10input_node = internal addrspace(3) global [16 x float] undef, align 4
-@_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix = internal addrspace(3) global [16 x [16 x float]] undef, align 4
-@"$str" = private addrspace(1) constant [11 x i8] c"__CUDA_FTZ\00"
-
-; Function Attrs: convergent noinline nounwind optnone
-define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
-entry:
-  %p.addr = alloca i8**, align 8
-  %s.addr = alloca i64, align 8
-  store i8** %p, i8*** %p.addr, align 8
-  store i64 %s, i64* %s.addr, align 8
-  ret i32 999
-}
-
-; Function Attrs: convergent noinline nounwind optnone
-define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
-entry:
-  %p.addr = alloca %struct.cudaFuncAttributes*, align 8
-  %c.addr = alloca i8*, align 8
-  store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
-  store i8* %c, i8** %c.addr, align 8
-  ret i32 999
-}
-
-; Function Attrs: convergent noinline nounwind optnone
-define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
-entry:
-  %value.addr = alloca i32*, align 8
-  %attr.addr = alloca i32, align 4
-  %device.addr = alloca i32, align 4
-  store i32* %value, i32** %value.addr, align 8
-  store i32 %attr, i32* %attr.addr, align 4
-  store i32 %device, i32* %device.addr, align 4
-  ret i32 999
-}
-
-; Function Attrs: convergent noinline nounwind optnone
-define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
-entry:
-  %device.addr = alloca i32*, align 8
-  store i32* %device, i32** %device.addr, align 8
-  ret i32 999
-}
-
-; Function Attrs: convergent noinline nounwind optnone
-define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
-entry:
-  %numBlocks.addr = alloca i32*, align 8
-  %func.addr = alloca i8*, align 8
-  %blockSize.addr = alloca i32, align 4
-  %dynamicSmemSize.addr = alloca i64, align 8
-  store i32* %numBlocks, i32** %numBlocks.addr, align 8
-  store i8* %func, i8** %func.addr, align 8
-  store i32 %blockSize, i32* %blockSize.addr, align 4
-  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
-  ret i32 999
-}
-
-; Function Attrs: convergent noinline nounwind optnone
-define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
-entry:
-  %numBlocks.addr = alloca i32*, align 8
-  %func.addr = alloca i8*, align 8
-  %blockSize.addr = alloca i32, align 4
-  %dynamicSmemSize.addr = alloca i64, align 8
-  %flags.addr = alloca i32, align 4
-  store i32* %numBlocks, i32** %numBlocks.addr, align 8
-  store i8* %func, i8** %func.addr, align 8
-  store i32 %blockSize, i32* %blockSize.addr, align 4
-  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
-  store i32 %flags, i32* %flags.addr, align 4
-  ret i32 999
-}
-
-; Function Attrs: convergent noinline nounwind optnone
-define dso_local void @_Z22bpnn_layerforward_CUDAPfS_S_S_ii(float* %input_cuda, float* %output_hidden_cuda, float* %input_hidden_cuda, float* %hidden_partial_sum, i32 %in, i32 %hid) #0 {
-entry:
-  %input_cuda.addr = alloca float*, align 8
-  %output_hidden_cuda.addr = alloca float*, align 8
-  %input_hidden_cuda.addr = alloca float*, align 8
-  %hidden_partial_sum.addr = alloca float*, align 8
-  %in.addr = alloca i32, align 4
-  %hid.addr = alloca i32, align 4
-  %by = alloca i32, align 4
-  %tx = alloca i32, align 4
-  %ty = alloca i32, align 4
-  %index = alloca i32, align 4
-  %index_in = alloca i32, align 4
-  %i = alloca i32, align 4
-  %power_two = alloca i32, align 4
-  store float* %input_cuda, float** %input_cuda.addr, align 8
-  store float* %output_hidden_cuda, float** %output_hidden_cuda.addr, align 8
-  store float* %input_hidden_cuda, float** %input_hidden_cuda.addr, align 8
-  store float* %hidden_partial_sum, float** %hidden_partial_sum.addr, align 8
-  store i32 %in, i32* %in.addr, align 4
-  store i32 %hid, i32* %hid.addr, align 4
-  %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #2
-  store i32 %call, i32* %by, align 4
-  %call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
-  store i32 %call1, i32* %tx, align 4
-  %call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #2
-  store i32 %call2, i32* %ty, align 4
-  %0 = load i32, i32* %hid.addr, align 4
-  %add = add nsw i32 %0, 1
-  %mul = mul nsw i32 %add, 16
-  %1 = load i32, i32* %by, align 4
-  %mul3 = mul nsw i32 %mul, %1
-  %2 = load i32, i32* %hid.addr, align 4
-  %add4 = add nsw i32 %2, 1
-  %3 = load i32, i32* %ty, align 4
-  %mul5 = mul nsw i32 %add4, %3
-  %add6 = add nsw i32 %mul3, %mul5
-  %4 = load i32, i32* %tx, align 4
-  %add7 = add nsw i32 %add6, %4
-  %add8 = add nsw i32 %add7, 1
-  %5 = load i32, i32* %hid.addr, align 4
-  %add9 = add nsw i32 %5, 1
-  %add10 = add nsw i32 %add8, %add9
-  store i32 %add10, i32* %index, align 4
-  %6 = load i32, i32* %by, align 4
-  %mul11 = mul nsw i32 16, %6
-  %7 = load i32, i32* %ty, align 4
-  %add12 = add nsw i32 %mul11, %7
-  %add13 = add nsw i32 %add12, 1
-  store i32 %add13, i32* %index_in, align 4
-  %8 = load i32, i32* %tx, align 4
-  %cmp = icmp eq i32 %8, 0
-  br i1 %cmp, label %if.then, label %if.end
-
-if.then:                                          ; preds = %entry
-  %9 = load float*, float** %input_cuda.addr, align 8
-  %10 = load i32, i32* %index_in, align 4
-  %idxprom = sext i32 %10 to i64
-  %arrayidx = getelementptr inbounds float, float* %9, i64 %idxprom
-  %11 = load float, float* %arrayidx, align 4
-  %12 = load i32, i32* %ty, align 4
-  %idxprom14 = sext i32 %12 to i64
-  %arrayidx15 = getelementptr inbounds [16 x float], [16 x float]* addrspacecast ([16 x float] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE10input_node to [16 x float]*), i64 0, i64 %idxprom14
-  store float %11, float* %arrayidx15, align 4
-  br label %if.end
-
-if.end:                                           ; preds = %if.then, %entry
-  call void @llvm.nvvm.barrier0()
-  %13 = load float*, float** %input_hidden_cuda.addr, align 8
-  %14 = load i32, i32* %index, align 4
-  %idxprom16 = sext i32 %14 to i64
-  %arrayidx17 = getelementptr inbounds float, float* %13, i64 %idxprom16
-  %15 = load float, float* %arrayidx17, align 4
-  %16 = load i32, i32* %ty, align 4
-  %idxprom18 = sext i32 %16 to i64
-  %arrayidx19 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom18
-  %17 = load i32, i32* %tx, align 4
-  %idxprom20 = sext i32 %17 to i64
-  %arrayidx21 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx19, i64 0, i64 %idxprom20
-  store float %15, float* %arrayidx21, align 4
-  call void @llvm.nvvm.barrier0()
-  %18 = load i32, i32* %ty, align 4
-  %idxprom22 = sext i32 %18 to i64
-  %arrayidx23 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom22
-  %19 = load i32, i32* %tx, align 4
-  %idxprom24 = sext i32 %19 to i64
-  %arrayidx25 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx23, i64 0, i64 %idxprom24
-  %20 = load float, float* %arrayidx25, align 4
-  %21 = load i32, i32* %ty, align 4
-  %idxprom26 = sext i32 %21 to i64
-  %arrayidx27 = getelementptr inbounds [16 x float], [16 x float]* addrspacecast ([16 x float] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE10input_node to [16 x float]*), i64 0, i64 %idxprom26
-  %22 = load float, float* %arrayidx27, align 4
-  %mul28 = fmul contract float %20, %22
-  %23 = load i32, i32* %ty, align 4
-  %idxprom29 = sext i32 %23 to i64
-  %arrayidx30 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom29
-  %24 = load i32, i32* %tx, align 4
-  %idxprom31 = sext i32 %24 to i64
-  %arrayidx32 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx30, i64 0, i64 %idxprom31
-  store float %mul28, float* %arrayidx32, align 4
-  call void @llvm.nvvm.barrier0()
-  store i32 1, i32* %i, align 4
-  br label %for.cond
-
-for.cond:                                         ; preds = %for.inc, %if.end
-  %25 = load i32, i32* %i, align 4
-  %conv = sitofp i32 %25 to float
-  %call33 = call float @_ZL7__log2ff(float 1.600000e+01) #2
-  %cmp34 = fcmp ole float %conv, %call33
-  br i1 %cmp34, label %for.body, label %for.end
-
-for.body:                                         ; preds = %for.cond
-  %26 = load i32, i32* %i, align 4
-  %conv35 = sitofp i32 %26 to float
-  %call36 = call float @_ZL6__powfff(float 2.000000e+00, float %conv35) #2
-  %conv37 = fptosi float %call36 to i32
-  store i32 %conv37, i32* %power_two, align 4
-  %27 = load i32, i32* %ty, align 4
-  %28 = load i32, i32* %power_two, align 4
-  %rem = srem i32 %27, %28
-  %cmp38 = icmp eq i32 %rem, 0
-  br i1 %cmp38, label %if.then39, label %if.end54
-
-if.then39:                                        ; preds = %for.body
-  %29 = load i32, i32* %ty, align 4
-  %idxprom40 = sext i32 %29 to i64
-  %arrayidx41 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom40
-  %30 = load i32, i32* %tx, align 4
-  %idxprom42 = sext i32 %30 to i64
-  %arrayidx43 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx41, i64 0, i64 %idxprom42
-  %31 = load float, float* %arrayidx43, align 4
-  %32 = load i32, i32* %ty, align 4
-  %33 = load i32, i32* %power_two, align 4
-  %div = sdiv i32 %33, 2
-  %add44 = add nsw i32 %32, %div
-  %idxprom45 = sext i32 %add44 to i64
-  %arrayidx46 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom45
-  %34 = load i32, i32* %tx, align 4
-  %idxprom47 = sext i32 %34 to i64
-  %arrayidx48 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx46, i64 0, i64 %idxprom47
-  %35 = load float, float* %arrayidx48, align 4
-  %add49 = fadd contract float %31, %35
-  %36 = load i32, i32* %ty, align 4
-  %idxprom50 = sext i32 %36 to i64
-  %arrayidx51 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom50
-  %37 = load i32, i32* %tx, align 4
-  %idxprom52 = sext i32 %37 to i64
-  %arrayidx53 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx51, i64 0, i64 %idxprom52
-  store float %add49, float* %arrayidx53, align 4
-  br label %if.end54
-
-if.end54:                                         ; preds = %if.then39, %for.body
-  call void @llvm.nvvm.barrier0()
-  br label %for.inc
-
-for.inc:                                          ; preds = %if.end54
-  %38 = load i32, i32* %i, align 4
-  %inc = add nsw i32 %38, 1
-  store i32 %inc, i32* %i, align 4
-  br label %for.cond
-
-for.end:                                          ; preds = %for.cond
-  %39 = load i32, i32* %ty, align 4
-  %idxprom55 = sext i32 %39 to i64
-  %arrayidx56 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom55
-  %40 = load i32, i32* %tx, align 4
-  %idxprom57 = sext i32 %40 to i64
-  %arrayidx58 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx56, i64 0, i64 %idxprom57
-  %41 = load float, float* %arrayidx58, align 4
-  %42 = load float*, float** %input_hidden_cuda.addr, align 8
-  %43 = load i32, i32* %index, align 4
-  %idxprom59 = sext i32 %43 to i64
-  %arrayidx60 = getelementptr inbounds float, float* %42, i64 %idxprom59
-  store float %41, float* %arrayidx60, align 4
-  call void @llvm.nvvm.barrier0()
-  %44 = load i32, i32* %tx, align 4
-  %cmp61 = icmp eq i32 %44, 0
-  br i1 %cmp61, label %if.then62, label %if.end71
-
-if.then62:                                        ; preds = %for.end
-  %45 = load i32, i32* %tx, align 4
-  %idxprom63 = sext i32 %45 to i64
-  %arrayidx64 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom63
-  %46 = load i32, i32* %ty, align 4
-  %idxprom65 = sext i32 %46 to i64
-  %arrayidx66 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx64, i64 0, i64 %idxprom65
-  %47 = load float, float* %arrayidx66, align 4
-  %48 = load float*, float** %hidden_partial_sum.addr, align 8
-  %49 = load i32, i32* %by, align 4
-  %50 = load i32, i32* %hid.addr, align 4
-  %mul67 = mul nsw i32 %49, %50
-  %51 = load i32, i32* %ty, align 4
-  %add68 = add nsw i32 %mul67, %51
-  %idxprom69 = sext i32 %add68 to i64
-  %arrayidx70 = getelementptr inbounds float, float* %48, i64 %idxprom69
-  store float %47, float* %arrayidx70, align 4
-  br label %if.end71
-
-if.end71:                                         ; preds = %if.then62, %for.end
-  ret void
-}
-
-; Function Attrs: alwaysinline convergent nounwind
-define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
-entry:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
-  ret i32 %0
-}
-
-; Function Attrs: alwaysinline convergent nounwind
-define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
-entry:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  ret i32 %0
-}
-
-; Function Attrs: alwaysinline convergent nounwind
-define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
-entry:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
-  ret i32 %0
-}
-
-; Function Attrs: convergent nounwind
-declare void @llvm.nvvm.barrier0() #2
-
-; Function Attrs: alwaysinline convergent nounwind
-define internal float @_ZL7__log2ff(float %__a) #1 {
-entry:
-  %__a.addr = alloca float, align 4
-  store float %__a, float* %__a.addr, align 4
-  %0 = load float, float* %__a.addr, align 4
-  %call = call float @__nv_fast_log2f(float %0) #2
-  ret float %call
-}
-
-; Function Attrs: alwaysinline convergent nounwind
-define internal float @_ZL6__powfff(float %__a, float %__b) #1 {
-entry:
-  %__a.addr = alloca float, align 4
-  %__b.addr = alloca float, align 4
-  store float %__a, float* %__a.addr, align 4
-  store float %__b, float* %__b.addr, align 4
-  %0 = load float, float* %__a.addr, align 4
-  %1 = load float, float* %__b.addr, align 4
-  %call = call float @__nv_fast_powf(float %0, float %1) #2
-  ret float %call
-}
-
-; Kernel bpnn_adjust_weights_cuda(delta, hid, ly, in, w, oldw) in unoptimized
-; (optnone) form: every argument and local lives in an alloca and is reloaded
-; before each use. Both scale factors appear as the folded constant 0.3 and
-; the arithmetic is done in double before truncating back to float.
-; The %in argument is only spilled to its stack slot; it is never reloaded.
-; Function Attrs: convergent noinline nounwind optnone
-define dso_local void @_Z24bpnn_adjust_weights_cudaPfiS_iS_S_(float* %delta, i32 %hid, float* %ly, i32 %in, float* %w, float* %oldw) #0 {
-entry:
-  %delta.addr = alloca float*, align 8
-  %hid.addr = alloca i32, align 4
-  %ly.addr = alloca float*, align 8
-  %in.addr = alloca i32, align 4
-  %w.addr = alloca float*, align 8
-  %oldw.addr = alloca float*, align 8
-  %by = alloca i32, align 4
-  %tx = alloca i32, align 4
-  %ty = alloca i32, align 4
-  %index = alloca i32, align 4
-  %index_y = alloca i32, align 4
-  %index_x = alloca i32, align 4
-  store float* %delta, float** %delta.addr, align 8
-  store i32 %hid, i32* %hid.addr, align 4
-  store float* %ly, float** %ly.addr, align 8
-  store i32 %in, i32* %in.addr, align 4
-  store float* %w, float** %w.addr, align 8
-  store float* %oldw, float** %oldw.addr, align 8
-  ; by = blockIdx.y, tx = threadIdx.x, ty = threadIdx.y
-  %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #2
-  store i32 %call, i32* %by, align 4
-  %call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
-  store i32 %call1, i32* %tx, align 4
-  %call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #2
-  store i32 %call2, i32* %ty, align 4
-  ; index = (hid + 1) * 16 * by + (hid + 1) * ty + tx + 1 + (hid + 1)
-  %0 = load i32, i32* %hid.addr, align 4
-  %add = add nsw i32 %0, 1
-  %mul = mul nsw i32 %add, 16
-  %1 = load i32, i32* %by, align 4
-  %mul3 = mul nsw i32 %mul, %1
-  %2 = load i32, i32* %hid.addr, align 4
-  %add4 = add nsw i32 %2, 1
-  %3 = load i32, i32* %ty, align 4
-  %mul5 = mul nsw i32 %add4, %3
-  %add6 = add nsw i32 %mul3, %mul5
-  %4 = load i32, i32* %tx, align 4
-  %add7 = add nsw i32 %add6, %4
-  %add8 = add nsw i32 %add7, 1
-  %5 = load i32, i32* %hid.addr, align 4
-  %add9 = add nsw i32 %5, 1
-  %add10 = add nsw i32 %add8, %add9
-  store i32 %add10, i32* %index, align 4
-  ; index_y = 16 * by + ty + 1
-  %6 = load i32, i32* %by, align 4
-  %mul11 = mul nsw i32 16, %6
-  %7 = load i32, i32* %ty, align 4
-  %add12 = add nsw i32 %mul11, %7
-  %add13 = add nsw i32 %add12, 1
-  store i32 %add13, i32* %index_y, align 4
-  ; index_x = tx + 1
-  %8 = load i32, i32* %tx, align 4
-  %add14 = add nsw i32 %8, 1
-  store i32 %add14, i32* %index_x, align 4
-  ; w[index] += 0.3 * delta[index_x] * ly[index_y] + 0.3 * oldw[index]
-  %9 = load float*, float** %delta.addr, align 8
-  %10 = load i32, i32* %index_x, align 4
-  %idxprom = sext i32 %10 to i64
-  %arrayidx = getelementptr inbounds float, float* %9, i64 %idxprom
-  %11 = load float, float* %arrayidx, align 4
-  %conv = fpext float %11 to double
-  %mul15 = fmul contract double 3.000000e-01, %conv
-  %12 = load float*, float** %ly.addr, align 8
-  %13 = load i32, i32* %index_y, align 4
-  %idxprom16 = sext i32 %13 to i64
-  %arrayidx17 = getelementptr inbounds float, float* %12, i64 %idxprom16
-  %14 = load float, float* %arrayidx17, align 4
-  %conv18 = fpext float %14 to double
-  %mul19 = fmul contract double %mul15, %conv18
-  %15 = load float*, float** %oldw.addr, align 8
-  %16 = load i32, i32* %index, align 4
-  %idxprom20 = sext i32 %16 to i64
-  %arrayidx21 = getelementptr inbounds float, float* %15, i64 %idxprom20
-  %17 = load float, float* %arrayidx21, align 4
-  %conv22 = fpext float %17 to double
-  %mul23 = fmul contract double 3.000000e-01, %conv22
-  %add24 = fadd contract double %mul19, %mul23
-  %18 = load float*, float** %w.addr, align 8
-  %19 = load i32, i32* %index, align 4
-  %idxprom25 = sext i32 %19 to i64
-  %arrayidx26 = getelementptr inbounds float, float* %18, i64 %idxprom25
-  %20 = load float, float* %arrayidx26, align 4
-  %conv27 = fpext float %20 to double
-  %add28 = fadd contract double %conv27, %add24
-  %conv29 = fptrunc double %add28 to float
-  store float %conv29, float* %arrayidx26, align 4
-  ; oldw[index] = the same expression, recomputed from memory (oldw[index] is
-  ; still the previous value at this point)
-  %21 = load float*, float** %delta.addr, align 8
-  %22 = load i32, i32* %index_x, align 4
-  %idxprom30 = sext i32 %22 to i64
-  %arrayidx31 = getelementptr inbounds float, float* %21, i64 %idxprom30
-  %23 = load float, float* %arrayidx31, align 4
-  %conv32 = fpext float %23 to double
-  %mul33 = fmul contract double 3.000000e-01, %conv32
-  %24 = load float*, float** %ly.addr, align 8
-  %25 = load i32, i32* %index_y, align 4
-  %idxprom34 = sext i32 %25 to i64
-  %arrayidx35 = getelementptr inbounds float, float* %24, i64 %idxprom34
-  %26 = load float, float* %arrayidx35, align 4
-  %conv36 = fpext float %26 to double
-  %mul37 = fmul contract double %mul33, %conv36
-  %27 = load float*, float** %oldw.addr, align 8
-  %28 = load i32, i32* %index, align 4
-  %idxprom38 = sext i32 %28 to i64
-  %arrayidx39 = getelementptr inbounds float, float* %27, i64 %idxprom38
-  %29 = load float, float* %arrayidx39, align 4
-  %conv40 = fpext float %29 to double
-  %mul41 = fmul contract double 3.000000e-01, %conv40
-  %add42 = fadd contract double %mul37, %mul41
-  %conv43 = fptrunc double %add42 to float
-  %30 = load float*, float** %oldw.addr, align 8
-  %31 = load i32, i32* %index, align 4
-  %idxprom44 = sext i32 %31 to i64
-  %arrayidx45 = getelementptr inbounds float, float* %30, i64 %idxprom44
-  store float %conv43, float* %arrayidx45, align 4
-  ; barrier, then only threads with ty == 0 && by == 0 reach %if.then
-  call void @llvm.nvvm.barrier0()
-  %32 = load i32, i32* %ty, align 4
-  %cmp = icmp eq i32 %32, 0
-  br i1 %cmp, label %land.lhs.true, label %if.end
-
-land.lhs.true:                                    ; preds = %entry
-  %33 = load i32, i32* %by, align 4
-  %cmp46 = icmp eq i32 %33, 0
-  br i1 %cmp46, label %if.then, label %if.end
-
-if.then:                                          ; preds = %land.lhs.true
-  ; w[index_x] += 0.3 * delta[index_x] + 0.3 * oldw[index_x]
-  %34 = load float*, float** %delta.addr, align 8
-  %35 = load i32, i32* %index_x, align 4
-  %idxprom47 = sext i32 %35 to i64
-  %arrayidx48 = getelementptr inbounds float, float* %34, i64 %idxprom47
-  %36 = load float, float* %arrayidx48, align 4
-  %conv49 = fpext float %36 to double
-  %mul50 = fmul contract double 3.000000e-01, %conv49
-  %37 = load float*, float** %oldw.addr, align 8
-  %38 = load i32, i32* %index_x, align 4
-  %idxprom51 = sext i32 %38 to i64
-  %arrayidx52 = getelementptr inbounds float, float* %37, i64 %idxprom51
-  %39 = load float, float* %arrayidx52, align 4
-  %conv53 = fpext float %39 to double
-  %mul54 = fmul contract double 3.000000e-01, %conv53
-  %add55 = fadd contract double %mul50, %mul54
-  %40 = load float*, float** %w.addr, align 8
-  %41 = load i32, i32* %index_x, align 4
-  %idxprom56 = sext i32 %41 to i64
-  %arrayidx57 = getelementptr inbounds float, float* %40, i64 %idxprom56
-  %42 = load float, float* %arrayidx57, align 4
-  %conv58 = fpext float %42 to double
-  %add59 = fadd contract double %conv58, %add55
-  %conv60 = fptrunc double %add59 to float
-  store float %conv60, float* %arrayidx57, align 4
-  ; oldw[index_x] = 0.3 * delta[index_x] + 0.3 * oldw[index_x]
-  %43 = load float*, float** %delta.addr, align 8
-  %44 = load i32, i32* %index_x, align 4
-  %idxprom61 = sext i32 %44 to i64
-  %arrayidx62 = getelementptr inbounds float, float* %43, i64 %idxprom61
-  %45 = load float, float* %arrayidx62, align 4
-  %conv63 = fpext float %45 to double
-  %mul64 = fmul contract double 3.000000e-01, %conv63
-  %46 = load float*, float** %oldw.addr, align 8
-  %47 = load i32, i32* %index_x, align 4
-  %idxprom65 = sext i32 %47 to i64
-  %arrayidx66 = getelementptr inbounds float, float* %46, i64 %idxprom65
-  %48 = load float, float* %arrayidx66, align 4
-  %conv67 = fpext float %48 to double
-  %mul68 = fmul contract double 3.000000e-01, %conv67
-  %add69 = fadd contract double %mul64, %mul68
-  %conv70 = fptrunc double %add69 to float
-  %49 = load float*, float** %oldw.addr, align 8
-  %50 = load i32, i32* %index_x, align 4
-  %idxprom71 = sext i32 %50 to i64
-  %arrayidx72 = getelementptr inbounds float, float* %49, i64 %idxprom71
-  store float %conv70, float* %arrayidx72, align 4
-  br label %if.end
-
-if.end:                                           ; preds = %if.then, %land.lhs.true, %entry
-  ret void
-}
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #3
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #3
-
-; Returns the approximate base-2 logarithm of %a. The result of
-; __nvvm_reflect on the @"$str" string selects the variant: nonzero uses
-; llvm.nvvm.lg2.approx.ftz.f, zero uses llvm.nvvm.lg2.approx.f.
-; NOTE(review): the contents of @"$str" are not visible here; presumably it
-; names the flush-to-zero reflect key -- confirm against its definition.
-; Function Attrs: alwaysinline convergent inlinehint nounwind
-define internal float @__nv_fast_log2f(float %a) #4 {
-  %call.i = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*))
-  %1 = icmp ne i32 %call.i, 0
-  br i1 %1, label %2, label %4
-
-2:                                                ; preds = %0
-  %3 = call float @llvm.nvvm.lg2.approx.ftz.f(float %a)
-  br label %__nvvm_builtin_log2f.exit
-
-4:                                                ; preds = %0
-  %5 = call float @llvm.nvvm.lg2.approx.f(float %a)
-  br label %__nvvm_builtin_log2f.exit
-
-__nvvm_builtin_log2f.exit:                        ; preds = %4, %2
-  ; merge the result of whichever variant ran
-  %retval.0.i = phi float [ %3, %2 ], [ %5, %4 ]
-  ret float %retval.0.i
-}
-
-; Function Attrs: convergent nounwind
-declare i32 @__nvvm_reflect(i8*) #5
-
-; Function Attrs: nounwind readnone
-declare float @llvm.nvvm.lg2.approx.ftz.f(float) #3
-
-; Function Attrs: nounwind readnone
-declare float @llvm.nvvm.lg2.approx.f(float) #3
-
-; Approximates a^b as ex2(b * lg2(a)). Each of the two steps calls
-; __nvvm_reflect on @"$str" and uses the .ftz intrinsic when the result is
-; nonzero, the non-ftz intrinsic otherwise.
-; Function Attrs: alwaysinline convergent inlinehint nounwind
-define internal float @__nv_fast_powf(float %a, float %b) #4 {
-  ; step 1: lg2(a)
-  %call.i.i = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*))
-  %1 = icmp ne i32 %call.i.i, 0
-  br i1 %1, label %2, label %4
-
-2:                                                ; preds = %0
-  %3 = call float @llvm.nvvm.lg2.approx.ftz.f(float %a)
-  br label %__nv_fast_log2f.exit
-
-4:                                                ; preds = %0
-  %5 = call float @llvm.nvvm.lg2.approx.f(float %a)
-  br label %__nv_fast_log2f.exit
-
-__nv_fast_log2f.exit:                             ; preds = %4, %2
-  %retval.0.i.i = phi float [ %3, %2 ], [ %5, %4 ]
-  ; step 2: ex2(b * lg2(a))
-  %6 = fmul float %b, %retval.0.i.i
-  %call.i.i1 = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*))
-  %7 = icmp ne i32 %call.i.i1, 0
-  br i1 %7, label %8, label %10
-
-8:                                                ; preds = %__nv_fast_log2f.exit
-  %9 = call float @llvm.nvvm.ex2.approx.ftz.f(float %6)
-  br label %__nv_exp2f.exit
-
-10:                                               ; preds = %__nv_fast_log2f.exit
-  %11 = call float @llvm.nvvm.ex2.approx.f(float %6)
-  br label %__nv_exp2f.exit
-
-__nv_exp2f.exit:                                  ; preds = %10, %8
-  %retval.0.i.i2 = phi float [ %9, %8 ], [ %11, %10 ]
-  ret float %retval.0.i.i2
-}
-
-; Function Attrs: nounwind readnone
-declare float @llvm.nvvm.ex2.approx.ftz.f(float) #3
-
-; Function Attrs: nounwind readnone
-declare float @llvm.nvvm.ex2.approx.f(float) #3
-
-attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { convergent nounwind }
-attributes #3 = { nounwind readnone }
-attributes #4 = { alwaysinline convergent inlinehint nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #5 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-!llvm.module.flags = !{!0, !1, !2}
-!nvvm.annotations = !{!3, !4, !5, !6, !5, !7, !7, !7, !7, !8, !8, !7}
-!llvm.ident = !{!9}
-!nvvmir.version = !{!10}
-
-!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
-!1 = !{i32 1, !"wchar_size", i32 4}
-!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
-!3 = !{void (float*, float*, float*, float*, i32, i32)* @_Z22bpnn_layerforward_CUDAPfS_S_S_ii, !"kernel", i32 1}
-!4 = !{void (float*, i32, float*, i32, float*, float*)* @_Z24bpnn_adjust_weights_cudaPfiS_iS_S_, !"kernel", i32 1}
-!5 = !{null, !"align", i32 8}
-!6 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
-!7 = !{null, !"align", i32 16}
-!8 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
-!9 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
-!10 = !{i32 1, i32 4}
--- a/examples/backprop/backprop_cuda-host-x86_64-unknown-linux-gnu.ll
+++ b/examples/backprop/backprop_cuda-host-x86_64-unknown-linux-gnu.ll
--- a/examples/backprop/backprop_cuda.cu
+++ b/examples/backprop/backprop_cuda.cu
@ -1,195 +0,0 @@
-#include <cuda.h>
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/time.h>
-
-// includes, kernels
-#include "backprop.h"
-#include "backprop_cuda_kernel.cu"
-
-////////////////////////////////////////////////////////////////////////////////
-
-extern "C" void bpnn_layerforward(float *l1, float *l2, float **conn, int n1,
-                                  int n2);
-
-extern "C" void bpnn_output_error(float *delta, float *target, float *output,
-                                  int nj, float *err);
-
-extern "C" void bpnn_hidden_error(float *delta_h, int nh, float *delta_o,
-                                  int no, float **who, float *hidden,
-                                  float *err);
-
-extern "C" void bpnn_adjust_weights(float *delta, int ndelta, float *ly,
-                                    int nly, float **w, float **oldw);
-
-extern "C" int setup(int argc, char **argv);
-
-extern "C" float **alloc_2d_dbl(int m, int n);
-
-extern "C" float squash(float x);
-
-// Current time of day from gettimeofday(), expressed in seconds as a double
-// (whole seconds plus the microsecond field scaled by 1e-6).
-double gettime() {
-  struct timeval now;
-  gettimeofday(&now, NULL);
-  const double fractional = now.tv_usec * 1e-6;
-  return now.tv_sec + fractional;
-}
-
-unsigned int num_threads = 0;
-unsigned int num_blocks = 0;
-
-////////////////////////////////////////////////////////////////////////////////
-// Program main
-////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv) {
-  // Select device 0 before any other CUDA call is made.
-  cudaSetDevice(0);
-
-  // Hand over to the C driver; its return value is not used here.
-  (void)setup(argc, argv);
-  return 0;
-}
-
-// Runs one training iteration on `net`.
-//
-// With GPU defined, the input->hidden forward pass and the input-weight
-// adjustment run as CUDA kernels on flattened copies of the weight matrices;
-// with CPU defined, the host routines are used instead. The hidden->output
-// layer is always processed on the host.
-//
-// net: network to train.
-// eo:  if non-NULL, receives the output-layer error from bpnn_output_error.
-// eh:  if non-NULL, receives the hidden-layer error from bpnn_hidden_error.
-extern "C" void bpnn_train_cuda(BPNN *net, float *eo, float *eh) {
-  int in, hid, out;
-  float out_err = 0.0f, hid_err = 0.0f;
-
-  in = net->input_n;
-  hid = net->hidden_n;
-  out = net->output_n;
-
-#ifdef GPU
-  int m = 0;
-  float *input_hidden_cuda;
-  float *input_cuda;
-  float *output_hidden_cuda;
-  float *partial_sum;
-  float *hidden_partial_sum;
-  float *hidden_delta_cuda;
-  float *input_prev_weights_cuda;
-  float sum;
-  float *input_weights_one_dim;
-  float *input_weights_prev_one_dim;
-  // One 16x16 thread block per group of 16 input units, stacked along grid y.
-  num_blocks = in / 16;
-  dim3 grid(1, num_blocks);
-  dim3 threads(16, 16);
-
-  input_weights_one_dim = (float *)malloc((in + 1) * (hid + 1) * sizeof(float));
-  input_weights_prev_one_dim =
-      (float *)malloc((in + 1) * (hid + 1) * sizeof(float));
-  partial_sum = (float *)malloc(num_blocks * WIDTH * sizeof(float));
-  if (input_weights_one_dim == NULL || input_weights_prev_one_dim == NULL ||
-      partial_sum == NULL) {
-    printf("bpnn_train_cuda: host allocation failed\n");
-    exit(EXIT_FAILURE);
-  }
-
-  // this preprocessing stage is added to correct the bugs of wrong memcopy
-  // using two-dimensional net->inputweights
-  for (int k = 0; k <= in; k++) {
-    for (int j = 0; j <= hid; j++) {
-      input_weights_one_dim[m] = net->input_weights[k][j];
-      input_weights_prev_one_dim[m] = net->input_prev_weights[k][j];
-      m++;
-    }
-  }
-
-  cudaMalloc((void **)&input_cuda, (in + 1) * sizeof(float));
-  cudaMalloc((void **)&output_hidden_cuda, (hid + 1) * sizeof(float));
-  cudaMalloc((void **)&input_hidden_cuda, (in + 1) * (hid + 1) * sizeof(float));
-  cudaMalloc((void **)&hidden_partial_sum, num_blocks * WIDTH * sizeof(float));
-
-#endif
-
-#ifdef CPU
-
-  printf("Performing CPU computation\n");
-  bpnn_layerforward(net->input_units, net->hidden_units, net->input_weights, in,
-                    hid);
-
-#endif
-
-#ifdef GPU
-
-  printf("Performing GPU computation\n");
-
-  cudaMemcpy(input_cuda, net->input_units, (in + 1) * sizeof(float),
-             cudaMemcpyHostToDevice);
-  cudaMemcpy(input_hidden_cuda, input_weights_one_dim,
-             (in + 1) * (hid + 1) * sizeof(float), cudaMemcpyHostToDevice);
-
-  bpnn_layerforward_CUDA<<<grid, threads>>>(input_cuda, output_hidden_cuda,
-                                            input_hidden_cuda,
-                                            hidden_partial_sum, in, hid);
-
-  cudaThreadSynchronize();
-
-  cudaError_t error = cudaGetLastError();
-  if (error != cudaSuccess) {
-    printf("bpnn kernel error: %s\n", cudaGetErrorString(error));
-    exit(EXIT_FAILURE);
-  }
-
-  cudaMemcpy(partial_sum, hidden_partial_sum,
-             num_blocks * WIDTH * sizeof(float), cudaMemcpyDeviceToHost);
-
-  // Finish the reduction on the host: add up the per-block partial sums for
-  // each hidden unit, add the weight from input row 0, then apply the sigmoid.
-  for (int j = 1; j <= hid; j++) {
-    sum = 0.0;
-    for (int k = 0; k < num_blocks; k++) {
-      sum += partial_sum[k * hid + j - 1];
-    }
-    sum += net->input_weights[0][j];
-    net->hidden_units[j] = float(1.0 / (1.0 + exp(-sum)));
-  }
-#endif
-
-  bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights,
-                    hid, out);
-  bpnn_output_error(net->output_delta, net->target, net->output_units, out,
-                    &out_err);
-  bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out,
-                    net->hidden_weights, net->hidden_units, &hid_err);
-  bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid,
-                      net->hidden_weights, net->hidden_prev_weights);
-
-#ifdef CPU
-
-  bpnn_adjust_weights(net->hidden_delta, hid, net->input_units, in,
-                      net->input_weights, net->input_prev_weights);
-
-#endif
-
-#ifdef GPU
-
-  cudaMalloc((void **)&hidden_delta_cuda, (hid + 1) * sizeof(float));
-  cudaMalloc((void **)&input_prev_weights_cuda,
-             (in + 1) * (hid + 1) * sizeof(float));
-
-  cudaMemcpy(hidden_delta_cuda, net->hidden_delta, (hid + 1) * sizeof(float),
-             cudaMemcpyHostToDevice);
-  cudaMemcpy(input_prev_weights_cuda, input_weights_prev_one_dim,
-             (in + 1) * (hid + 1) * sizeof(float), cudaMemcpyHostToDevice);
-  // Re-upload the original weights: the forward kernel overwrote the device
-  // copy with its intermediate products.
-  cudaMemcpy(input_hidden_cuda, input_weights_one_dim,
-             (in + 1) * (hid + 1) * sizeof(float), cudaMemcpyHostToDevice);
-
-  bpnn_adjust_weights_cuda<<<grid, threads>>>(hidden_delta_cuda, hid,
-                                              input_cuda, in, input_hidden_cuda,
-                                              input_prev_weights_cuda);
-
-  cudaMemcpy(net->input_units, input_cuda, (in + 1) * sizeof(float),
-             cudaMemcpyDeviceToHost);
-  cudaMemcpy(input_weights_one_dim, input_hidden_cuda,
-             (in + 1) * (hid + 1) * sizeof(float), cudaMemcpyDeviceToHost);
-
-  // Dump the adjusted weights to stdout.
-  for (int i = 0; i < (in + 1) * (hid + 1); i++) {
-    printf("%f ", input_weights_one_dim[i]);
-  }
-  printf("\n");
-
-  cudaFree(input_cuda);
-  cudaFree(output_hidden_cuda);
-  cudaFree(input_hidden_cuda);
-  cudaFree(hidden_partial_sum);
-  cudaFree(input_prev_weights_cuda);
-  cudaFree(hidden_delta_cuda);
-
-  free(partial_sum);
-  free(input_weights_one_dim);
-  free(input_weights_prev_one_dim);
-
-#endif
-
-  // Hand the layer errors back to the caller; previously they were computed
-  // and then dropped.
-  if (eo != NULL)
-    *eo = out_err;
-  if (eh != NULL)
-    *eh = hid_err;
-}
--- a/examples/backprop/backprop_cuda_kernel.cu
+++ b/examples/backprop/backprop_cuda_kernel.cu
@ -1,96 +0,0 @@
-#ifndef _BACKPROP_CUDA_KERNEL_H_
-#define _BACKPROP_CUDA_KERNEL_H_
-
-#include "backprop.h"
-#include "cuda.h"
-#include "math.h"
-#include <stdio.h>
-
-// Forward-pass kernel for the input->hidden layer.
-//
-// Each thread multiplies one weight by its input unit; the block then sums the
-// products over ty with a pairwise tree reduction in shared memory. The
-// intermediate values are written back into input_hidden_cuda, and threads
-// with tx == 0 store the block's sums to hidden_partial_sum[by * hid + ty].
-//
-// output_hidden_cuda and in are accepted for interface compatibility but are
-// not used by this kernel.
-__global__ void bpnn_layerforward_CUDA(float *input_cuda,
-                                       float *output_hidden_cuda,
-                                       float *input_hidden_cuda,
-                                       float *hidden_partial_sum, int in,
-                                       int hid) {
-  int by = blockIdx.y;
-  int tx = threadIdx.x;
-  int ty = threadIdx.y;
-
-  // Offset into the flattened weight array, whose rows are (hid + 1) wide;
-  // the trailing "+ 1 + (hid + 1)" skips column 0 and row 0.
-  int index = (hid + 1) * HEIGHT * by + (hid + 1) * ty + tx + 1 + (hid + 1);
-
-  int index_in = HEIGHT * by + ty + 1;
-
-  __shared__ float input_node[HEIGHT];
-  __shared__ float weight_matrix[HEIGHT][WIDTH];
-
-  if (tx == 0)
-    input_node[ty] = input_cuda[index_in];
-
-  __syncthreads();
-
-  weight_matrix[ty][tx] = input_hidden_cuda[index];
-
-  __syncthreads();
-
-  weight_matrix[ty][tx] = weight_matrix[ty][tx] * input_node[ty];
-
-  __syncthreads();
-
-  // Tree reduction over ty. The stride is kept in exact integer arithmetic;
-  // the approximate __log2f/__powf intrinsics are not guaranteed to produce
-  // exact powers of two once truncated to int.
-  for (int stride = 1; stride < HEIGHT; stride *= 2) {
-
-    if (ty % (2 * stride) == 0 && ty + stride < HEIGHT)
-      weight_matrix[ty][tx] =
-          weight_matrix[ty][tx] + weight_matrix[ty + stride][tx];
-
-    __syncthreads();
-  }
-
-  input_hidden_cuda[index] = weight_matrix[ty][tx];
-
-  __syncthreads();
-
-  if (tx == 0) {
-    // tx is 0 here, so this reads row 0, which holds the reduced sums.
-    hidden_partial_sum[by * hid + ty] = weight_matrix[tx][ty];
-  }
-}
-
-// Weight-adjustment kernel for the input->hidden layer.
-//
-// For this thread's weight it computes
-//   ETA * delta[index_x] * ly[index_y] + MOMENTUM * oldw[index],
-// adds that to w[index] and stores it as the new oldw[index]. After the
-// barrier, threads with ty == 0 in block row 0 apply the analogous update
-// (without the ly factor) to element index_x of w and oldw.
-//
-// The in parameter is accepted for interface compatibility but is not used.
-__global__ void bpnn_adjust_weights_cuda(float *delta, int hid, float *ly,
-                                         int in, float *w, float *oldw) {
-
-  int by = blockIdx.y;
-
-  int tx = threadIdx.x;
-  int ty = threadIdx.y;
-
-  int index = (hid + 1) * HEIGHT * by + (hid + 1) * ty + tx + 1 + (hid + 1);
-  int index_y = HEIGHT * by + ty + 1;
-  int index_x = tx + 1;
-
-  // Evaluate the update once instead of twice. It is held in double so the
-  // values stored to w and oldw round the same way as before.
-  double step =
-      ((ETA * delta[index_x] * ly[index_y]) + (MOMENTUM * oldw[index]));
-  w[index] += step;
-  oldw[index] = step;
-
-  __syncthreads();
-
-  if (ty == 0 && by == 0) {
-    double bias_step = ((ETA * delta[index_x]) + (MOMENTUM * oldw[index_x]));
-    w[index_x] += bias_step;
-    oldw[index_x] = bias_step;
-  }
-}
-#endif
--- a/examples/backprop/facetrain.c
+++ b/examples/backprop/facetrain.c
@ -1,48 +0,0 @@
-#include "backprop.h"
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-extern char *strcpy();
-extern void exit();
-
-int layer_size = 0;
-
-/*
- * Creates a network with layer_size inputs, 16 hidden units and 1 output,
- * fills its input units via load(), runs one training iteration through
- * bpnn_train_cuda() and frees the network.
- *
- * Always returns 0. The return type used to be an implicit int with no
- * return statement, which is not valid since C99.
- */
-int backprop_face() {
-  BPNN *net;
-  float out_err, hid_err;
-  net = bpnn_create(layer_size, 16, 1); // (16, 1 can not be changed)
-
-  printf("Input layer size : %d\n", layer_size);
-  load(net);
-  // entering the training kernel, only one iteration
-  printf("Starting training kernel\n");
-  bpnn_train_cuda(net, &out_err, &hid_err);
-  bpnn_free(net);
-  printf("Training done\n");
-  return 0;
-}
-
-/*
- * Program driver: parses the input-layer size from argv[1], seeds the
- * network library and runs one training pass.
- *
- * The size must be a positive decimal integer that is a multiple of 16.
- * Bad usage or a bad size prints a message to stderr and exits with status 1.
- * On success the process exits with status 0, so this function does not
- * return to its caller.
- */
-int setup(int argc, char *argv[]) {
-  int seed;
-  long requested;
-  char *end;
-
-  if (argc != 2) {
-    fprintf(stderr, "usage: backprop <num of input elements>\n");
-    exit(1);
-  }
-
-  /* strtol reports trailing garbage through `end`; atoi cannot. */
-  requested = strtol(argv[1], &end, 10);
-  if (end == argv[1] || *end != '\0' || requested <= 0 ||
-      (long)(int)requested != requested) {
-    fprintf(stderr, "usage: backprop <num of input elements>\n");
-    exit(1);
-  }
-  layer_size = (int)requested;
-  if (layer_size % 16 != 0) {
-    fprintf(stderr, "The number of input points must be divided by 16\n");
-    exit(1);
-  }
-
-  seed = 7;
-  bpnn_initialize(seed);
-  backprop_face();
-
-  exit(0);
-}
--- a/examples/backprop/imagenet.c
+++ b/examples/backprop/imagenet.c
@ -1,22 +0,0 @@
-#include "backprop.h"
-#include <stdio.h>
-#include <stdlib.h>
-
-extern int layer_size;
-
-/*
- * Fills net->input_units[1 .. layer_size] with pseudo-random values in
- * [0, 1] taken from rand(). Element 0 is left untouched.
- *
- * Always returns 0. The previous version read an uninitialized local (nc),
- * which is undefined behaviour, and relied on implicit int.
- */
-int load(BPNN *net) {
-  float *units = net->input_units;
-  int i;
-
-  for (i = 0; i < layer_size; i++) {
-    units[i + 1] = (float)rand() / RAND_MAX;
-  }
-  return 0;
-}
--- a/examples/backprop/run.sh
+++ b/examples/backprop/run.sh
@ -1,28 +0,0 @@
-#!/bin/bash
-# Builds the backprop example through the kernel/host translators, links it
-# against the x86 runtime, runs it and checks the output for known values.
-set -e
-
-# Host-side C sources -> LLVM bitcode.
-clang -c -emit-llvm backprop.c
-clang -c -emit-llvm facetrain.c
-clang -c -emit-llvm imagenet.c
-
-# Assemble the pre-generated CUDA IR and run it through the translators.
-llvm-as backprop_cuda-cuda-nvptx64-nvidia-cuda-sm_61.ll
-llvm-as backprop_cuda-host-x86_64-unknown-linux-gnu.ll
-../../build/compilation/kernelTranslator backprop_cuda-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
-../../build/compilation/hostTranslator backprop_cuda-host-x86_64-unknown-linux-gnu.bc host.bc
-
-# Bitcode -> position-independent object files.
-llc --relocation-model=pic --filetype=obj  kernel.bc
-llc --relocation-model=pic --filetype=obj  host.bc
-llc --relocation-model=pic --filetype=obj  backprop.bc
-llc --relocation-model=pic --filetype=obj  facetrain.bc
-llc --relocation-model=pic --filetype=obj  imagenet.bc
-
-# Append the previous value only when it is set and non-empty: a trailing
-# empty entry would make the loader search the current directory.
-export LD_LIBRARY_PATH="../../build/runtime:../../build/runtime/threadPool${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
-g++ -Wall -L../../build/runtime  -L../../build/runtime/threadPool -o demo \
-    -fPIC -no-pie host.o kernel.o backprop.o facetrain.o imagenet.o \
-    -lc -lx86Runtime -lthreadPool -lpthread
-
-./demo 1024 > res.log
-# -F matches the expected numbers literally; as a regex each '.' would match
-# any character.
-if grep -qF -e "0.173289 0.259645 0.350836" res.log; then
-    echo "Pass"
-else
-    echo "Error result"
-    exit 1
-fi
--- a/examples/bfs/bfs-cuda-nvptx64-nvidia-cuda-sm_61.ll
+++ b/examples/bfs/bfs-cuda-nvptx64-nvidia-cuda-sm_61.ll
@ -1,307 +0,0 @@
-; ModuleID = 'bfs-cuda-nvptx64-nvidia-cuda-sm_61.bc'
-source_filename = "bfs.cu"
-target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
-target triple = "nvptx64-nvidia-cuda"
-
-%struct.__cuda_builtin_blockIdx_t = type { i8 }
-%struct.__cuda_builtin_threadIdx_t = type { i8 }
-%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
-%struct.Node = type { i32, i32 }
-
-$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
-
-$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
-
-@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
-@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
-
-; Weak device-side definition of cudaMalloc: it only spills its two arguments
-; to stack slots and returns the constant 999.
-; Function Attrs: convergent noinline nounwind optnone
-define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
-entry:
-  %p.addr = alloca i8**, align 8
-  %s.addr = alloca i64, align 8
-  store i8** %p, i8*** %p.addr, align 8
-  store i64 %s, i64* %s.addr, align 8
-  ret i32 999
-}
-
-; Weak device-side definition of cudaFuncGetAttributes: it only spills its
-; arguments to stack slots and returns the constant 999.
-; Function Attrs: convergent noinline nounwind optnone
-define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
-entry:
-  %p.addr = alloca %struct.cudaFuncAttributes*, align 8
-  %c.addr = alloca i8*, align 8
-  store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
-  store i8* %c, i8** %c.addr, align 8
-  ret i32 999
-}
-
-; Weak device-side definition of cudaDeviceGetAttribute: it only spills its
-; arguments to stack slots and returns the constant 999; %value is not written.
-; Function Attrs: convergent noinline nounwind optnone
-define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
-entry:
-  %value.addr = alloca i32*, align 8
-  %attr.addr = alloca i32, align 4
-  %device.addr = alloca i32, align 4
-  store i32* %value, i32** %value.addr, align 8
-  store i32 %attr, i32* %attr.addr, align 4
-  store i32 %device, i32* %device.addr, align 4
-  ret i32 999
-}
-
-; Weak device-side definition of cudaGetDevice: it only spills its argument
-; to a stack slot and returns the constant 999; %device is not written.
-; Function Attrs: convergent noinline nounwind optnone
-define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
-entry:
-  %device.addr = alloca i32*, align 8
-  store i32* %device, i32** %device.addr, align 8
-  ret i32 999
-}
-
-; Weak device-side definition of cudaOccupancyMaxActiveBlocksPerMultiprocessor:
-; it only spills its arguments to stack slots and returns the constant 999;
-; %numBlocks is not written.
-; Function Attrs: convergent noinline nounwind optnone
-define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
-entry:
-  %numBlocks.addr = alloca i32*, align 8
-  %func.addr = alloca i8*, align 8
-  %blockSize.addr = alloca i32, align 4
-  %dynamicSmemSize.addr = alloca i64, align 8
-  store i32* %numBlocks, i32** %numBlocks.addr, align 8
-  store i8* %func, i8** %func.addr, align 8
-  store i32 %blockSize, i32* %blockSize.addr, align 4
-  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
-  ret i32 999
-}
-
-; Weak device-side definition of
-; cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags: it only spills its
-; arguments to stack slots and returns the constant 999; %numBlocks is not
-; written.
-; Function Attrs: convergent noinline nounwind optnone
-define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
-entry:
-  %numBlocks.addr = alloca i32*, align 8
-  %func.addr = alloca i8*, align 8
-  %blockSize.addr = alloca i32, align 4
-  %dynamicSmemSize.addr = alloca i64, align 8
-  %flags.addr = alloca i32, align 4
-  store i32* %numBlocks, i32** %numBlocks.addr, align 8
-  store i8* %func, i8** %func.addr, align 8
-  store i32 %blockSize, i32* %blockSize.addr, align 4
-  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
-  store i32 %flags, i32* %flags.addr, align 4
-  ret i32 999
-}
-
-; BFS kernel Kernel(g_graph_nodes, g_graph_edges, g_graph_mask,
-; g_updating_graph_mask, g_graph_visited, g_cost, no_of_nodes) in unoptimized
-; (optnone) form. For tid = blockIdx.x * 512 + threadIdx.x, when
-; tid < no_of_nodes and g_graph_mask[tid] is set: clears g_graph_mask[tid],
-; then walks the edge indices [starting, starting + no_of_edges) of node tid;
-; for each neighbour id = g_graph_edges[i] with g_graph_visited[id] clear it
-; stores g_cost[tid] + 1 into g_cost[id] and sets g_updating_graph_mask[id].
-; Function Attrs: convergent noinline nounwind optnone
-define dso_local void @_Z6KernelP4NodePiPbS2_S2_S1_i(%struct.Node* %g_graph_nodes, i32* %g_graph_edges, i8* %g_graph_mask, i8* %g_updating_graph_mask, i8* %g_graph_visited, i32* %g_cost, i32 %no_of_nodes) #0 {
-entry:
-  %g_graph_nodes.addr = alloca %struct.Node*, align 8
-  %g_graph_edges.addr = alloca i32*, align 8
-  %g_graph_mask.addr = alloca i8*, align 8
-  %g_updating_graph_mask.addr = alloca i8*, align 8
-  %g_graph_visited.addr = alloca i8*, align 8
-  %g_cost.addr = alloca i32*, align 8
-  %no_of_nodes.addr = alloca i32, align 4
-  %tid = alloca i32, align 4
-  %i = alloca i32, align 4
-  %id = alloca i32, align 4
-  store %struct.Node* %g_graph_nodes, %struct.Node** %g_graph_nodes.addr, align 8
-  store i32* %g_graph_edges, i32** %g_graph_edges.addr, align 8
-  store i8* %g_graph_mask, i8** %g_graph_mask.addr, align 8
-  store i8* %g_updating_graph_mask, i8** %g_updating_graph_mask.addr, align 8
-  store i8* %g_graph_visited, i8** %g_graph_visited.addr, align 8
-  store i32* %g_cost, i32** %g_cost.addr, align 8
-  store i32 %no_of_nodes, i32* %no_of_nodes.addr, align 4
-  ; tid = blockIdx.x * 512 + threadIdx.x
-  %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
-  %mul = mul i32 %call, 512
-  %call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
-  %add = add i32 %mul, %call1
-  store i32 %add, i32* %tid, align 4
-  ; guard, part 1: tid < no_of_nodes
-  %0 = load i32, i32* %tid, align 4
-  %1 = load i32, i32* %no_of_nodes.addr, align 4
-  %cmp = icmp slt i32 %0, %1
-  br i1 %cmp, label %land.lhs.true, label %if.end26
-
-land.lhs.true:                                    ; preds = %entry
-  ; guard, part 2: g_graph_mask[tid] is set
-  %2 = load i8*, i8** %g_graph_mask.addr, align 8
-  %3 = load i32, i32* %tid, align 4
-  %idxprom = sext i32 %3 to i64
-  %arrayidx = getelementptr inbounds i8, i8* %2, i64 %idxprom
-  %4 = load i8, i8* %arrayidx, align 1
-  %tobool = trunc i8 %4 to i1
-  br i1 %tobool, label %if.then, label %if.end26
-
-if.then:                                          ; preds = %land.lhs.true
-  ; g_graph_mask[tid] = false; i = g_graph_nodes[tid].starting
-  %5 = load i8*, i8** %g_graph_mask.addr, align 8
-  %6 = load i32, i32* %tid, align 4
-  %idxprom2 = sext i32 %6 to i64
-  %arrayidx3 = getelementptr inbounds i8, i8* %5, i64 %idxprom2
-  store i8 0, i8* %arrayidx3, align 1
-  %7 = load %struct.Node*, %struct.Node** %g_graph_nodes.addr, align 8
-  %8 = load i32, i32* %tid, align 4
-  %idxprom4 = sext i32 %8 to i64
-  %arrayidx5 = getelementptr inbounds %struct.Node, %struct.Node* %7, i64 %idxprom4
-  %starting = getelementptr inbounds %struct.Node, %struct.Node* %arrayidx5, i32 0, i32 0
-  %9 = load i32, i32* %starting, align 4
-  store i32 %9, i32* %i, align 4
-  br label %for.cond
-
-for.cond:                                         ; preds = %for.inc, %if.then
-  ; loop while i < g_graph_nodes[tid].no_of_edges + g_graph_nodes[tid].starting
-  %10 = load i32, i32* %i, align 4
-  %11 = load %struct.Node*, %struct.Node** %g_graph_nodes.addr, align 8
-  %12 = load i32, i32* %tid, align 4
-  %idxprom6 = sext i32 %12 to i64
-  %arrayidx7 = getelementptr inbounds %struct.Node, %struct.Node* %11, i64 %idxprom6
-  %no_of_edges = getelementptr inbounds %struct.Node, %struct.Node* %arrayidx7, i32 0, i32 1
-  %13 = load i32, i32* %no_of_edges, align 4
-  %14 = load %struct.Node*, %struct.Node** %g_graph_nodes.addr, align 8
-  %15 = load i32, i32* %tid, align 4
-  %idxprom8 = sext i32 %15 to i64
-  %arrayidx9 = getelementptr inbounds %struct.Node, %struct.Node* %14, i64 %idxprom8
-  %starting10 = getelementptr inbounds %struct.Node, %struct.Node* %arrayidx9, i32 0, i32 0
-  %16 = load i32, i32* %starting10, align 4
-  %add11 = add nsw i32 %13, %16
-  %cmp12 = icmp slt i32 %10, %add11
-  br i1 %cmp12, label %for.body, label %for.end
-
-for.body:                                         ; preds = %for.cond
-  ; id = g_graph_edges[i]; skip the update when g_graph_visited[id] is set
-  %17 = load i32*, i32** %g_graph_edges.addr, align 8
-  %18 = load i32, i32* %i, align 4
-  %idxprom13 = sext i32 %18 to i64
-  %arrayidx14 = getelementptr inbounds i32, i32* %17, i64 %idxprom13
-  %19 = load i32, i32* %arrayidx14, align 4
-  store i32 %19, i32* %id, align 4
-  %20 = load i8*, i8** %g_graph_visited.addr, align 8
-  %21 = load i32, i32* %id, align 4
-  %idxprom15 = sext i32 %21 to i64
-  %arrayidx16 = getelementptr inbounds i8, i8* %20, i64 %idxprom15
-  %22 = load i8, i8* %arrayidx16, align 1
-  %tobool17 = trunc i8 %22 to i1
-  br i1 %tobool17, label %if.end, label %if.then18
-
-if.then18:                                        ; preds = %for.body
-  ; g_cost[id] = g_cost[tid] + 1; g_updating_graph_mask[id] = true
-  %23 = load i32*, i32** %g_cost.addr, align 8
-  %24 = load i32, i32* %tid, align 4
-  %idxprom19 = sext i32 %24 to i64
-  %arrayidx20 = getelementptr inbounds i32, i32* %23, i64 %idxprom19
-  %25 = load i32, i32* %arrayidx20, align 4
-  %add21 = add nsw i32 %25, 1
-  %26 = load i32*, i32** %g_cost.addr, align 8
-  %27 = load i32, i32* %id, align 4
-  %idxprom22 = sext i32 %27 to i64
-  %arrayidx23 = getelementptr inbounds i32, i32* %26, i64 %idxprom22
-  store i32 %add21, i32* %arrayidx23, align 4
-  %28 = load i8*, i8** %g_updating_graph_mask.addr, align 8
-  %29 = load i32, i32* %id, align 4
-  %idxprom24 = sext i32 %29 to i64
-  %arrayidx25 = getelementptr inbounds i8, i8* %28, i64 %idxprom24
-  store i8 1, i8* %arrayidx25, align 1
-  br label %if.end
-
-if.end:                                           ; preds = %if.then18, %for.body
-  br label %for.inc
-
-for.inc:                                          ; preds = %if.end
-  ; i++
-  %30 = load i32, i32* %i, align 4
-  %inc = add nsw i32 %30, 1
-  store i32 %inc, i32* %i, align 4
-  br label %for.cond
-
-for.end:                                          ; preds = %for.cond
-  br label %if.end26
-
-if.end26:                                         ; preds = %for.end, %land.lhs.true, %entry
-  ret void
-}
-
-; Function Attrs: alwaysinline convergent nounwind
-define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 { ; accessor for blockIdx.x (demangled: __cuda_builtin_blockIdx_t::__fetch_builtin_x())
-entry:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() ; read the ctaid.x special register through the NVVM intrinsic
-  ret i32 %0 ; return the register value unchanged
-}
-
-; Function Attrs: alwaysinline convergent nounwind
-define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 { ; accessor for threadIdx.x (demangled: __cuda_builtin_threadIdx_t::__fetch_builtin_x())
-entry:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() ; read the tid.x special register through the NVVM intrinsic
-  ret i32 %0 ; return the register value unchanged
-}
-
-; Function Attrs: convergent noinline nounwind optnone
-define dso_local void @_Z7Kernel2PbS_S_S_i(i8* %g_graph_mask, i8* %g_updating_graph_mask, i8* %g_graph_visited, i8* %g_over, i32 %no_of_nodes) #0 { ; unoptimized IR for Kernel2(bool*, bool*, bool*, bool*, int)
-entry: ; spill every argument to its own stack slot, then compute tid
-  %g_graph_mask.addr = alloca i8*, align 8
-  %g_updating_graph_mask.addr = alloca i8*, align 8
-  %g_graph_visited.addr = alloca i8*, align 8
-  %g_over.addr = alloca i8*, align 8
-  %no_of_nodes.addr = alloca i32, align 4
-  %tid = alloca i32, align 4
-  store i8* %g_graph_mask, i8** %g_graph_mask.addr, align 8
-  store i8* %g_updating_graph_mask, i8** %g_updating_graph_mask.addr, align 8
-  store i8* %g_graph_visited, i8** %g_graph_visited.addr, align 8
-  store i8* %g_over, i8** %g_over.addr, align 8
-  store i32 %no_of_nodes, i32* %no_of_nodes.addr, align 4
-  %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
-  %mul = mul i32 %call, 512 ; blockIdx.x * 512 (512 is MAX_THREADS_PER_BLOCK in bfs.cu)
-  %call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
-  %add = add i32 %mul, %call1 ; ... + threadIdx.x
-  store i32 %add, i32* %tid, align 4 ; tid = blockIdx.x * 512 + threadIdx.x
-  %0 = load i32, i32* %tid, align 4
-  %1 = load i32, i32* %no_of_nodes.addr, align 4
-  %cmp = icmp slt i32 %0, %1 ; tid < no_of_nodes (signed compare)
-  br i1 %cmp, label %land.lhs.true, label %if.end
-
-land.lhs.true:                                    ; preds = %entry
-  %2 = load i8*, i8** %g_updating_graph_mask.addr, align 8
-  %3 = load i32, i32* %tid, align 4
-  %idxprom = sext i32 %3 to i64
-  %arrayidx = getelementptr inbounds i8, i8* %2, i64 %idxprom
-  %4 = load i8, i8* %arrayidx, align 1 ; g_updating_graph_mask[tid]
-  %tobool = trunc i8 %4 to i1 ; second operand of the && in the source condition
-  br i1 %tobool, label %if.then, label %if.end
-
-if.then:                                          ; preds = %land.lhs.true
-  %5 = load i8*, i8** %g_graph_mask.addr, align 8
-  %6 = load i32, i32* %tid, align 4
-  %idxprom2 = sext i32 %6 to i64
-  %arrayidx3 = getelementptr inbounds i8, i8* %5, i64 %idxprom2
-  store i8 1, i8* %arrayidx3, align 1 ; g_graph_mask[tid] = true
-  %7 = load i8*, i8** %g_graph_visited.addr, align 8
-  %8 = load i32, i32* %tid, align 4
-  %idxprom4 = sext i32 %8 to i64
-  %arrayidx5 = getelementptr inbounds i8, i8* %7, i64 %idxprom4
-  store i8 1, i8* %arrayidx5, align 1 ; g_graph_visited[tid] = true
-  %9 = load i8*, i8** %g_over.addr, align 8
-  store i8 1, i8* %9, align 1 ; *g_over = true
-  %10 = load i8*, i8** %g_updating_graph_mask.addr, align 8
-  %11 = load i32, i32* %tid, align 4
-  %idxprom6 = sext i32 %11 to i64
-  %arrayidx7 = getelementptr inbounds i8, i8* %10, i64 %idxprom6
-  store i8 0, i8* %arrayidx7, align 1 ; g_updating_graph_mask[tid] = false
-  br label %if.end
-
-if.end:                                           ; preds = %if.then, %land.lhs.true, %entry
-  ret void
-}
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 ; declaration only; called from the blockIdx.x accessor in this module
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 ; declaration only; called from the threadIdx.x accessor in this module
-
-attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { convergent nounwind }
-
-!llvm.module.flags = !{!0, !1, !2}
-!nvvm.annotations = !{!3, !4, !5, !6, !5, !7, !7, !7, !7, !8, !8, !7} ; !3 and !4 carry the kernel markers; the remaining entries are align records with a null target
-!llvm.ident = !{!9}
-!nvvmir.version = !{!10}
-
-!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
-!1 = !{i32 1, !"wchar_size", i32 4}
-!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
-!3 = !{void (%struct.Node*, i32*, i8*, i8*, i8*, i32*, i32)* @_Z6KernelP4NodePiPbS2_S2_S1_i, !"kernel", i32 1} ; marks @_Z6KernelP4NodePiPbS2_S2_S1_i as a kernel
-!4 = !{void (i8*, i8*, i8*, i8*, i32)* @_Z7Kernel2PbS_S_S_i, !"kernel", i32 1} ; marks @_Z7Kernel2PbS_S_S_i as a kernel
-!5 = !{null, !"align", i32 8}
-!6 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
-!7 = !{null, !"align", i32 16}
-!8 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
-!9 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
-!10 = !{i32 1, i32 4}
--- a/examples/bfs/bfs-host-x86_64-unknown-linux-gnu.ll
+++ b/examples/bfs/bfs-host-x86_64-unknown-linux-gnu.ll
--- a/examples/bfs/bfs.cu
+++ b/examples/bfs/bfs.cu
@ -1,213 +0,0 @@
-#include <cuda.h>
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#define MAX_THREADS_PER_BLOCK 512 // cap on threads per block in BFSGraph, and the block stride in both kernels' tid computation
-
-int no_of_nodes; // number of graph nodes; read from the input file in BFSGraph
-int edge_list_size; // number of entries in the edge list; read from the input file in BFSGraph
-FILE *fp; // input graph file; opened and closed inside BFSGraph
-
-// One graph node: its edges are the no_of_edges consecutive entries of the edge array that begin at index starting
-struct Node {
-  int starting; // index of this node's first entry in the edge array
-  int no_of_edges; // how many consecutive edge-array entries belong to this node
-};
-
-#include "kernel.cu"
-#include "kernel2.cu"
-
-void BFSGraph(int argc, char **argv);
-
-////////////////////////////////////////////////////////////////////////////////
-// Main Program: selects CUDA device 0, zeroes the global counters, then hands argc/argv to BFSGraph
-////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv) {
-  cudaSetDevice(0); // return code is not inspected
-  no_of_nodes = 0;
-  edge_list_size = 0;
-  BFSGraph(argc, argv); // argument checking and all of the work happen inside BFSGraph
-}
-
-void Usage(int argc, char **argv) { // prints the one-line usage string to stderr; argc is not used
-
-  fprintf(stderr, "Usage: %s <input_file>\n", argv[0]); // argv[0] supplies the program name
-}
-////////////////////////////////////////////////////////////////////////////////
-// Apply BFS on a Graph using CUDA: read the graph from argv[1], run Kernel and Kernel2 until no node is updated, then write every node's cost to result.txt
-////////////////////////////////////////////////////////////////////////////////
-void BFSGraph(int argc, char **argv) {
-
-  char *input_f;
-  if (argc != 2) {
-    Usage(argc, argv);
-    exit(0); // NOTE(review): a wrong argument count still exits with status 0
-  }
-
-  input_f = argv[1];
-  printf("Reading File\n");
-  // Read in Graph from a file
-  fp = fopen(input_f, "r");
-  if (!fp) {
-    printf("Error Reading graph file\n");
-    return;
-  }
-
-  int source = 0;
-
-  fscanf(fp, "%d", &no_of_nodes); // NOTE(review): fscanf results are not checked anywhere in this function
-
-  int num_of_blocks = 1;
-  int num_of_threads_per_block = no_of_nodes;
-
-  // Make execution Parameters according to the number of nodes
-  // Distribute threads across multiple Blocks if necessary
-  if (no_of_nodes > MAX_THREADS_PER_BLOCK) {
-    num_of_blocks = (int)ceil(no_of_nodes / (double)MAX_THREADS_PER_BLOCK);
-    num_of_threads_per_block = MAX_THREADS_PER_BLOCK;
-  }
-
-  // allocate host memory (NOTE(review): the malloc results are used without a NULL check)
-  Node *h_graph_nodes = (Node *)malloc(sizeof(Node) * no_of_nodes);
-  bool *h_graph_mask = (bool *)malloc(sizeof(bool) * no_of_nodes);
-  bool *h_updating_graph_mask = (bool *)malloc(sizeof(bool) * no_of_nodes);
-  bool *h_graph_visited = (bool *)malloc(sizeof(bool) * no_of_nodes);
-
-  int start, edgeno;
-  // initialize the memory: one starting/no_of_edges pair is read per node and all three masks start out false
-  for (unsigned int i = 0; i < no_of_nodes; i++) { // NOTE(review): unsigned index compared against the signed no_of_nodes
-    fscanf(fp, "%d %d", &start, &edgeno);
-    h_graph_nodes[i].starting = start;
-    h_graph_nodes[i].no_of_edges = edgeno;
-    h_graph_mask[i] = false;
-    h_updating_graph_mask[i] = false;
-    h_graph_visited[i] = false;
-  }
-
-  // read the source node from the file
-  fscanf(fp, "%d", &source);
-  source = 0; // the value read above is discarded; the traversal always starts at node 0
-
-  // set the source node as true in the mask
-  h_graph_mask[source] = true;
-  h_graph_visited[source] = true;
-
-  fscanf(fp, "%d", &edge_list_size);
-
-  int id, cost;
-  int *h_graph_edges = (int *)malloc(sizeof(int) * edge_list_size);
-  for (int i = 0; i < edge_list_size; i++) {
-    fscanf(fp, "%d", &id);
-    fscanf(fp, "%d", &cost); // the per-edge cost is parsed but never stored
-    h_graph_edges[i] = id;
-  }
-
-  if (fp) // always true here: a failed fopen already returned above
-    fclose(fp);
-
-  printf("Read File\n");
-
-  // Copy the Node list to device memory
-  Node *d_graph_nodes;
-  cudaMalloc((void **)&d_graph_nodes, sizeof(Node) * no_of_nodes);
-  cudaMemcpy(d_graph_nodes, h_graph_nodes, sizeof(Node) * no_of_nodes,
-             cudaMemcpyHostToDevice);
-
-  // Copy the Edge List to device Memory
-  int *d_graph_edges;
-  cudaMalloc((void **)&d_graph_edges, sizeof(int) * edge_list_size);
-  cudaMemcpy(d_graph_edges, h_graph_edges, sizeof(int) * edge_list_size,
-             cudaMemcpyHostToDevice);
-
-  // Copy the Mask to device memory
-  bool *d_graph_mask;
-  cudaMalloc((void **)&d_graph_mask, sizeof(bool) * no_of_nodes);
-  cudaMemcpy(d_graph_mask, h_graph_mask, sizeof(bool) * no_of_nodes,
-             cudaMemcpyHostToDevice);
-
-  bool *d_updating_graph_mask;
-  cudaMalloc((void **)&d_updating_graph_mask, sizeof(bool) * no_of_nodes);
-  cudaMemcpy(d_updating_graph_mask, h_updating_graph_mask,
-             sizeof(bool) * no_of_nodes, cudaMemcpyHostToDevice);
-
-  // Copy the Visited nodes array to device memory
-  bool *d_graph_visited;
-  cudaMalloc((void **)&d_graph_visited, sizeof(bool) * no_of_nodes);
-  cudaMemcpy(d_graph_visited, h_graph_visited, sizeof(bool) * no_of_nodes,
-             cudaMemcpyHostToDevice);
-
-  // allocate mem for the result on host side: every cost starts at -1 except the source, which is 0
-  int *h_cost = (int *)malloc(sizeof(int) * no_of_nodes);
-  for (int i = 0; i < no_of_nodes; i++)
-    h_cost[i] = -1;
-  h_cost[source] = 0;
-
-  // allocate device memory for result
-  int *d_cost;
-  cudaMalloc((void **)&d_cost, sizeof(int) * no_of_nodes);
-  cudaMemcpy(d_cost, h_cost, sizeof(int) * no_of_nodes, cudaMemcpyHostToDevice);
-
-  // device-side flag that Kernel2 sets to true whenever it updates a node; it drives the loop below
-  bool *d_over;
-  cudaMalloc((void **)&d_over, sizeof(bool));
-
-  printf("Copied Everything to GPU memory\n");
-
-  // setup execution parameters
-  dim3 grid(num_of_blocks, 1, 1);
-  dim3 threads(num_of_threads_per_block, 1, 1);
-
-  int k = 0;
-  printf("Start traversing the tree\n");
-  bool stop;
-  // Run both kernels repeatedly until an iteration leaves d_over false, i.e. Kernel2 updated no node
-  do {
-    // if no thread changes this value then the loop stops
-    stop = false;
-    cudaMemcpy(d_over, &stop, sizeof(bool), cudaMemcpyHostToDevice);
-
-    Kernel<<<grid, threads, 0>>>(d_graph_nodes, d_graph_edges, d_graph_mask,
-                                 d_updating_graph_mask, d_graph_visited, d_cost,
-                                 no_of_nodes);
-    cudaDeviceSynchronize();
-    // NOTE(review): the original comment promised a kernel error check here, but none is performed
-
-    Kernel2<<<grid, threads, 0>>>(d_graph_mask, d_updating_graph_mask,
-                                  d_graph_visited, d_over, no_of_nodes);
-    cudaDeviceSynchronize();
-    // NOTE(review): the original comment promised a kernel error check here, but none is performed
-
-    cudaMemcpy(&stop, d_over, sizeof(bool), cudaMemcpyDeviceToHost);
-
-    k++;
-  } while (stop);
-
-  printf("Kernel Executed %d times\n", k);
-
-  // copy result from device to host
-  cudaMemcpy(h_cost, d_cost, sizeof(int) * no_of_nodes, cudaMemcpyDeviceToHost);
-
-  // Store the result into a file, one line per node
-  FILE *fpo = fopen("result.txt", "w"); // NOTE(review): fpo is used without a NULL check
-  for (int i = 0; i < no_of_nodes; i++)
-    fprintf(fpo, "%d) cost:%d\n", i, h_cost[i]);
-  fclose(fpo);
-  printf("Result stored in result.txt\n");
-
-  // cleanup memory (NOTE(review): d_over is allocated above but has no matching cudaFree below)
-  free(h_graph_nodes);
-  free(h_graph_edges);
-  free(h_graph_mask);
-  free(h_updating_graph_mask);
-  free(h_graph_visited);
-  free(h_cost);
-
-  cudaFree(d_graph_nodes);
-  cudaFree(d_graph_edges);
-  cudaFree(d_graph_mask);
-  cudaFree(d_updating_graph_mask);
-  cudaFree(d_graph_visited);
-  cudaFree(d_cost);
-}
--- a/examples/bfs/kernel.cu
+++ b/examples/bfs/kernel.cu
@ -1,23 +0,0 @@
-#ifndef _KERNEL_H_
-#define _KERNEL_H_
-
-__global__ void
-Kernel( Node* g_graph_nodes, int* g_graph_edges, bool* g_graph_mask, bool* g_updating_graph_mask, bool *g_graph_visited, int* g_cost, int no_of_nodes)
-{ // expansion step: each thread handles the node whose index equals its tid
-	int tid = blockIdx.x*MAX_THREADS_PER_BLOCK + threadIdx.x;
-	if( tid<no_of_nodes && g_graph_mask[tid]) // only in-range nodes currently flagged in g_graph_mask do any work
-	{
-		g_graph_mask[tid]=false; // clear this node's flag before expanding it
-		for(int i=g_graph_nodes[tid].starting; i<(g_graph_nodes[tid].no_of_edges + g_graph_nodes[tid].starting); i++) // walk this node's slice of the edge array
-			{
-			int id = g_graph_edges[i]; // index of the neighbouring node
-			if(!g_graph_visited[id]) // skip neighbours already marked visited
-				{
-				g_cost[id]=g_cost[tid]+1; // neighbour cost is this node's cost plus one
-				g_updating_graph_mask[id]=true; // flag the neighbour; Kernel2 reads this array
-				}
-			}
-	}
-}
-
-#endif
--- a/examples/bfs/kernel2.cu
+++ b/examples/bfs/kernel2.cu
@ -1,18 +0,0 @@
-#ifndef _KERNEL2_H_
-#define _KERNEL2_H_
-
-__global__ void
-Kernel2( bool* g_graph_mask, bool *g_updating_graph_mask, bool* g_graph_visited, bool *g_over, int no_of_nodes)
-{ // promotion step: each thread handles the node whose index equals its tid
-	int tid = blockIdx.x*MAX_THREADS_PER_BLOCK + threadIdx.x;
-	if( tid<no_of_nodes && g_updating_graph_mask[tid]) // only in-range nodes flagged in g_updating_graph_mask do any work
-	{
-
-		g_graph_mask[tid]=true; // flag the node in g_graph_mask, which Kernel tests
-		g_graph_visited[tid]=true; // mark it visited so Kernel skips it as a neighbour
-		*g_over=true; // single shared flag: tells the host loop that something changed
-		g_updating_graph_mask[tid]=false; // reset the pending flag
-	}
-}
-
-#endif
--- a/examples/bfs/run.sh
+++ b/examples/bfs/run.sh
@ -1,21 +0,0 @@
-#!/bin/bash
-set -e # stop at the first command that fails
-llvm-as bfs-cuda-nvptx64-nvidia-cuda-sm_61.ll # assemble the device-side IR into bitcode
-llvm-as bfs-host-x86_64-unknown-linux-gnu.ll # assemble the host-side IR into bitcode
-../../build/compilation/kernelTranslator bfs-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc # project tool; presumably <input> <output> -- confirm
-../../build/compilation/hostTranslator bfs-host-x86_64-unknown-linux-gnu.bc host.bc # project tool; presumably <input> <output> -- confirm
-
-llc --relocation-model=pic --filetype=obj  kernel.bc # compile the translated kernel bitcode to a PIC object file
-llc --relocation-model=pic --filetype=obj  host.bc # compile the translated host bitcode to a PIC object file
-
-g++ -Wall -L../../build/runtime  -L../../build/runtime/threadPool \
-    -o bfs.out -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread # link both objects against the project runtime libraries
-
-export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH # same two directories that were passed with -L above
-./bfs.out ../../rodinia-data/bfs/graph65536.txt # bfs.cu writes its output to result.txt
-if grep -q "0) cost:0" result.txt; then # substring match: passes when the text 0) cost:0 occurs anywhere in result.txt
-    echo "Pass"
-else
-    echo "Error result"
-    exit 1
-fi
--- a/examples/btree/common.h
+++ b/examples/btree/common.h
@ -1,343 +0,0 @@
-// # ifdef __cplusplus
-// extern "C" {
-// # endif
-
-// #ifndef LIST_H
-// # define LIST_H
-
-//===============================================================================================================================================================================================================200
-//	DEFINE/INCLUDE
-//===============================================================================================================================================================================================================200
-
-//======================================================================================================================================================150
-//	INCLUDE (for some reason these are not recognized when defined in main
-// file before this one is included)
-//======================================================================================================================================================150
-
-#include <stdbool.h> // (in path known to compiler)			needed by true/false, bool
-#include <stdint.h>  // (in path known to compiler)			needed by uint32_t
-#include <stdlib.h>  // (in path known to compiler)			needed by malloc
-
-//======================================================================================================================================================150
-//	DEFINE
-//======================================================================================================================================================150
-
-#define fp float
-
-#define Version "1.5"
-
-#ifdef WINDOWS
-#define bool char
-#define false 0
-#define true 1
-#endif
-
-/* DEFAULT_ORDER: taken from the first defined of RD_WG_SIZE_0_0, RD_WG_SIZE_0, RD_WG_SIZE; otherwise 256 */
-
-#ifdef RD_WG_SIZE_0_0
-#define DEFAULT_ORDER RD_WG_SIZE_0_0
-#elif defined(RD_WG_SIZE_0)
-#define DEFAULT_ORDER RD_WG_SIZE_0
-#elif defined(RD_WG_SIZE)
-#define DEFAULT_ORDER RD_WG_SIZE
-#else
-#define DEFAULT_ORDER 256
-#endif
-
-/* #ifdef RD_WG_SIZE_1_0 */
-/*         #define  DEFAULT_ORDER_2 RD_WG_SIZE_1_0 */
-/* #elif defined(RD_WG_SIZE_1) */
-/*         #define  DEFAULT_ORDER_2 RD_WG_SIZE_1 */
-/* #elif defined(RD_WG_SIZE) */
-/*         #define  DEFAULT_ORDER_2 RD_WG_SIZE */
-/* #else */
-/*         #define  DEFAULT_ORDER_2 256 */
-/* #endif */
-
-/* #define DEFAULT_ORDER 508 */
-
-#define malloc(size)                                                           \
-  ({ /* GNU statement expression: the whole macro evaluates to _tmp */         \
-    void *_tmp;                                                                \
-    /* the inner malloc is the real function: a macro is not re-expanded */    \
-    if (!(_tmp = malloc(size))) {                                              \
-      fprintf(stderr, "Allocation failed at %s:%d!\n", __FILE__, __LINE__);    \
-      exit(-1); /* allocation failure terminates the program */                \
-    }                                                                          \
-    /* NOTE(review): uses fprintf, but this header never includes stdio.h */   \
-    _tmp;                                                                      \
-  })
-
-//======================================================================================================================================================150
-//	STRUCTURES
-//======================================================================================================================================================150
-
-// struct list_item;
-typedef struct list_item list_item_t;
-
-typedef struct list_t {
-  list_item_t *head, *tail;
-  uint32_t length;
-  int32_t (*compare)(const void *key, const void *with);
-  void (*datum_delete)(void *);
-} list_t;
-
-typedef list_item_t *list_iterator_t;
-typedef list_item_t *list_reverse_iterator_t;
-
-/* Type representing the record
- * to which a given key refers.
- * In a real B+ tree system, the
- * record would hold data (in a database)
- * or a file (in an operating system)
- * or some other information.
- * Users can rewrite this part of the code
- * to change the type and content
- * of the value field.
- */
-typedef struct record {
-  int value;
-} record;
-
-/* Type representing a node in the B+ tree.
- * This type is general enough to serve for both
- * the leaf and the internal node.
- * The heart of the node is the array
- * of keys and the array of corresponding
- * pointers.  The relation between keys
- * and pointers differs between leaves and
- * internal nodes.  In a leaf, the index
- * of each key equals the index of its corresponding
- * pointer, with a maximum of order - 1 key-pointer
- * pairs.  The last pointer points to the
- * leaf to the right (or NULL in the case
- * of the rightmost leaf).
- * In an internal node, the first pointer
- * refers to lower nodes with keys less than
- * the smallest key in the keys array.  Then,
- * with indices i starting at 0, the pointer
- * at i + 1 points to the subtree with keys
- * greater than or equal to the key in this
- * node at index i.
- * The num_keys field is used to keep
- * track of the number of valid keys.
- * In an internal node, the number of valid
- * pointers is always num_keys + 1.
- * In a leaf, the number of valid pointers
- * to data is always num_keys.  The
- * last leaf pointer points to the next leaf.
- */
-typedef struct node {
-  void **pointers;
-  int *keys;
-  struct node *parent;
-  bool is_leaf;
-  int num_keys;
-  struct node *next; // Used for queue.
-} node;
-
-// Fixed-size node record indexed by the findK and findRangeK kernels (knode *knodesD); keys and indices each hold DEFAULT_ORDER + 1 entries
-typedef struct knode {
-  int location;
-  int indices[DEFAULT_ORDER + 1]; // the kernels use an entry either as an index into the knode array or as a record index
-  int keys[DEFAULT_ORDER + 1]; // the kernels compare entries thid and thid + 1 against the search key
-  bool is_leaf;
-  int num_keys;
-} knode;
-
-struct list_item {
-  struct list_item *pred, *next;
-  void *datum;
-};
-
-//===============================================================================================================================================================================================================200
-//	PROTOTYPES
-//===============================================================================================================================================================================================================200
-
-//======================================================================================================================================================150
-// Other
-//======================================================================================================================================================150
-
-void list_item_init(list_item_t *li, void *datum);
-
-void list_item_delete(list_item_t *li, void (*datum_delete)(void *datum));
-
-void list_insert_item_tail(list_t *l, list_item_t *i);
-
-void list_insert_item_before(list_t *l, list_item_t *next, list_item_t *i);
-
-void list_insert_item_after(list_t *l, list_item_t *pred, list_item_t *i);
-
-void list_insert_item_sorted(list_t *l, list_item_t *i);
-
-//======================================================================================================================================================150
-// List, item lookup and iterator operations
-//======================================================================================================================================================150
-
-void list_init(list_t *l, int32_t (*compare)(const void *key, const void *with),
-               void (*datum_delete)(void *datum));
-
-void list_delete(list_t *l);
-
-void list_reset(list_t *l);
-
-void list_insert_head(list_t *l, void *v);
-
-void list_insert_tail(list_t *l, void *v);
-
-void list_insert_before(list_t *l, list_item_t *next, void *v);
-
-void list_insert_after(list_t *l, list_item_t *pred, void *v);
-
-void list_insert_sorted(list_t *l, void *v);
-
-void list_insert_item_head(list_t *l, list_item_t *i);
-
-void list_remove_item(list_t *l, list_item_t *i);
-
-void list_remove_head(list_t *l);
-
-void list_remove_tail(list_t *l);
-
-list_item_t *list_find_item(list_t *l, void *datum);
-
-list_item_t *list_get_head_item(list_t *l);
-
-list_item_t *list_get_tail_item(list_t *l);
-
-void *list_find(list_t *l, void *datum);
-
-void *list_get_head(list_t *l);
-
-void *list_get_tail(list_t *l);
-
-uint32_t list_get_length(list_t *l);
-
-bool list_is_empty(list_t *l);
-
-bool list_not_empty(list_t *l);
-
-void list_visit_items(list_t *l, void (*visitor)(void *v));
-
-void *list_item_get_datum(list_item_t *li);
-
-void list_iterator_init(list_t *l, list_iterator_t *li);
-
-void list_iterator_delete(list_iterator_t *li);
-
-void list_iterator_next(list_iterator_t *li);
-
-void list_iterator_prev(list_iterator_t *li);
-
-void *list_iterator_get_datum(list_iterator_t *li);
-
-bool list_iterator_is_valid(list_iterator_t *li);
-
-void list_reverse_iterator_init(list_t *l, list_iterator_t *li);
-
-void list_reverse_iterator_delete(list_iterator_t *li);
-
-void list_reverse_iterator_next(list_iterator_t *li);
-
-void list_reverse_iterator_prev(list_iterator_t *li);
-
-void *list_reverse_iterator_get_datum(list_iterator_t *li);
-
-bool list_reverse_iterator_is_valid(list_reverse_iterator_t *li);
-
-//======================================================================================================================================================150
-// Output and utility
-//======================================================================================================================================================150
-
-void *kmalloc(int size);
-
-long transform_to_cuda(node *n,
-                       bool verbose); // returns actual mem used in a long
-
-void usage_1(void);
-
-void usage_2(void);
-
-void enqueue(node *new_node);
-
-node *dequeue(void);
-
-int height(node *root);
-
-int path_to_root(node *root, node *child);
-
-void print_leaves(node *root);
-
-void print_tree(node *root);
-
-node *find_leaf(node *root, int key, bool verbose);
-
-record *find(node *root, int key, bool verbose);
-
-int cut(int length);
-
-//======================================================================================================================================================150
-// Insertion
-//======================================================================================================================================================150
-
-record *make_record(int value);
-
-node *make_node(void);
-
-node *make_leaf(void);
-
-int get_left_index(node *parent, node *left);
-
-node *insert_into_leaf(node *leaf, int key, record *pointer);
-
-node *insert_into_leaf_after_splitting(node *root, node *leaf, int key,
-                                       record *pointer);
-
-node *insert_into_node(node *root, node *parent, int left_index, int key,
-                       node *right);
-
-node *insert_into_node_after_splitting(node *root, node *parent, int left_index,
-                                       int key, node *right);
-
-node *insert_into_parent(node *root, node *left, int key, node *right);
-
-node *insert_into_new_root(node *left, int key, node *right);
-
-node *start_new_tree(int key, record *pointer);
-
-node *insert(node *root, int key, int value);
-
-//======================================================================================================================================================150
-// Deletion
-//======================================================================================================================================================150
-
-int get_neighbor_index(node *n);
-
-node *adjust_root(node *root);
-
-node *coalesce_nodes(node *root, node *n, node *neighbor, int neighbor_index,
-                     int k_prime);
-
-node *redistribute_nodes(node *root, node *n, node *neighbor,
-                         int neighbor_index, int k_prime_index, int k_prime);
-
-node *delete_entry(node *root, node *n, int key, void *pointer);
-
-node *deleteVal(node *root, int key);
-
-//===============================================================================================================================================================================================================200
-//	HEADER
-//===============================================================================================================================================================================================================200
-
-// int main(	int argc,
-// char *argv []);
-
-//===============================================================================================================================================================================================================200
-//	END
-//===============================================================================================================================================================================================================200
-
-// #endif
-
-// # ifdef __cplusplus
-// }
-// # endif
--- a/examples/btree/kernel/kernel_gpu_cuda.cu
+++ b/examples/btree/kernel/kernel_gpu_cuda.cu
@ -1,54 +0,0 @@
-//========================================================================================================================================================================================================200
-//	findK function
-//========================================================================================================================================================================================================200
-
-__global__ void
-findK(	long height,
-		knode *knodesD,
-		long knodes_elem,
-		record *recordsD,
-
-		long *currKnodeD,
-		long *offsetD,
-		int *keysD,
-		record *ansD)
-{
-
-	// per-thread indices: thid selects a key slot inside a knode, bid selects the query (keysD[bid], ansD[bid])
-	int thid = threadIdx.x;
-	int bid = blockIdx.x;
-
-	// process tree levels: one loop iteration per level, height iterations in total
-	int i;
-	for(i = 0; i < height; i++){
-
-		// if value is between the two keys
-		if((knodesD[currKnodeD[bid]].keys[thid]) <= keysD[bid] && (knodesD[currKnodeD[bid]].keys[thid+1] > keysD[bid])){
-			// This bounds check was inserted to avoid a crash caused by a bug in the original code:
-			// the indices stored in knodes by the main function can lie outside the knodes array, so using one as "offset[bid]" in the next iteration would cause a segmentation fault.
-			// NOTE(review): this reads via offsetD[bid], while the key test above (and findRangeK) index via currKnodeD[bid] -- confirm the two are always equal here
-			if(knodesD[offsetD[bid]].indices[thid] < knodes_elem){
-				offsetD[bid] = knodesD[offsetD[bid]].indices[thid];
-			}
-		}
-		__syncthreads(); // all threads of the block finish writing offsetD[bid] before it is read below
-
-		// set for next tree level
-		if(thid==0){
-			currKnodeD[bid] = offsetD[bid];
-		}
-		__syncthreads(); // the new currKnodeD[bid] is in place before the next iteration reads it
-
-	}
-
-	//At this point, we have a candidate leaf node which may contain
-	//the target record.  Check each key to hopefully find the record
-	if(knodesD[currKnodeD[bid]].keys[thid] == keysD[bid]){
-		ansD[bid].value = recordsD[knodesD[currKnodeD[bid]].indices[thid]].value;
-	}
-
-}
-
-//========================================================================================================================================================================================================200
-//	End
-//========================================================================================================================================================================================================200
--- a/examples/btree/kernel/kernel_gpu_cuda_2.cu
+++ b/examples/btree/kernel/kernel_gpu_cuda_2.cu
@ -1,70 +0,0 @@
-//========================================================================================================================================================================================================200
-//	findRangeK function
-//========================================================================================================================================================================================================200
-
-__global__ void
-findRangeK(	long height,
-
-			knode *knodesD,
-			long knodes_elem,
-
-			long *currKnodeD,
-			long *offsetD,
-			long *lastKnodeD,
-			long *offset_2D,
-			int *startD,
-			int *endD,
-			int *RecstartD,
-			int *ReclenD)
-{
-
-	// per-thread indices: thid selects a key slot inside a knode, bid selects the query (startD[bid], endD[bid])
-	int thid = threadIdx.x;
-	int bid = blockIdx.x;
-
-	// process tree levels: each of the height iterations advances currKnodeD (start key) and lastKnodeD (end key)
-	int i;
-	for(i = 0; i < height; i++){
-
-		if((knodesD[currKnodeD[bid]].keys[thid] <= startD[bid]) && (knodesD[currKnodeD[bid]].keys[thid+1] > startD[bid])){
-			// This bounds check was inserted to avoid a crash caused by a bug in the original code:
-			// the indices stored in knodes by the main function can lie outside the knodes array,
-			// so using one as "offset[bid]" to address knodes later would cause a segmentation fault.
-			if(knodesD[currKnodeD[bid]].indices[thid] < knodes_elem){
-				offsetD[bid] = knodesD[currKnodeD[bid]].indices[thid];
-			}
-		}
-		if((knodesD[lastKnodeD[bid]].keys[thid] <= endD[bid]) && (knodesD[lastKnodeD[bid]].keys[thid+1] > endD[bid])){
-			// This bounds check was inserted to avoid a crash caused by a bug in the original code:
-			// the indices stored in knodes by the main function can lie outside the knodes array,
-			// so using one as "offset_2[bid]" to address knodes later would cause a segmentation fault.
-			if(knodesD[lastKnodeD[bid]].indices[thid] < knodes_elem){
-				offset_2D[bid] = knodesD[lastKnodeD[bid]].indices[thid];
-			}
-		}
-		__syncthreads(); // all threads of the block finish writing offsetD[bid] and offset_2D[bid] before they are read below
-
-		// set for next tree level
-		if(thid==0){
-			currKnodeD[bid] = offsetD[bid];
-			lastKnodeD[bid] = offset_2D[bid];
-		}
-		__syncthreads(); // both node indices are in place before the next iteration reads them
-	}
-
-	// Find the index of the starting record
-	if(knodesD[currKnodeD[bid]].keys[thid] == startD[bid]){
-		RecstartD[bid] = knodesD[currKnodeD[bid]].indices[thid];
-	}
-	__syncthreads(); // RecstartD[bid] is read in the length computation below
-
-	// Find the index of the ending record and store the range length (end index - start index + 1)
-	if(knodesD[lastKnodeD[bid]].keys[thid] == endD[bid]){
-		ReclenD[bid] = knodesD[lastKnodeD[bid]].indices[thid] - RecstartD[bid]+1;
-	}
-
-}
-
-//========================================================================================================================================================================================================200
-//	End
-//========================================================================================================================================================================================================200
--- a/examples/btree/kernel/kernel_gpu_cuda_wrapper.cu
+++ b/examples/btree/kernel/kernel_gpu_cuda_wrapper.cu
@ -1,292 +0,0 @@
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-//========================================================================================================================================================================================================200
-//	DEFINE/INCLUDE
-//========================================================================================================================================================================================================200
-
-//======================================================================================================================================================150
-//	COMMON
-//======================================================================================================================================================150
-
-#include "../common.h"								// (in main program directory)			needed to recognized input variables
-
-//======================================================================================================================================================150
-//	UTILITIES
-//======================================================================================================================================================150
-
-#include "../util/cuda/cuda.h"					// (in path specified to compiler)	needed by for device functions
-#include "../util/timer/timer.h"					// (in path specified to compiler)	needed by timer
-
-//======================================================================================================================================================150
-//	KERNEL
-//======================================================================================================================================================150
-
-#include "./kernel_gpu_cuda.cu"						// (in current directory)	GPU kernel, cannot include with header file because of complications with passing of constant memory variables
-
-//======================================================================================================================================================150
-//	HEADER
-//======================================================================================================================================================150
-
-#include "./kernel_gpu_cuda_wrapper.h"				// (in current directory)
-
-//========================================================================================================================================================================================================200
-//	KERNEL_GPU_CUDA_WRAPPER FUNCTION
-//========================================================================================================================================================================================================200
-
-/*
- * kernel_gpu_cuda_wrapper - host-side driver for the findK kernel.
- *
- * Allocates the device buffers, copies every input to the device, launches
- * findK with `count` blocks of min(order, 1024) threads, copies `ans` back to
- * the host, frees the device buffers and prints a per-stage timing report to
- * stdout. Every CUDA call is followed by checkCUDAError() (declared elsewhere).
- *
- *   records      host buffer copied to the device (records_mem bytes)
- *   records_mem  size of `records` in bytes
- *   knodes       host buffer copied to the device (knodes_mem bytes)
- *   knodes_elem  forwarded unchanged to findK
- *   knodes_mem   size of `knodes` in bytes
- *   order        requested threads per block, capped at 1024
- *   maxheight    forwarded unchanged to findK
- *   count        number of blocks; also the element count of currKnode,
- *                offset, keys and ans
- *   currKnode    host array of `count` longs, copied to the device
- *   offset       host array of `count` longs, copied to the device
- *   keys         host array of `count` ints, copied to the device
- *   ans          host array of `count` records; copied to the device before
- *                the launch and overwritten with the device copy afterwards
- */
-void
-kernel_gpu_cuda_wrapper(record *records,
-						long records_mem,
-						knode *knodes,
-						long knodes_elem,
-						long knodes_mem,
-
-						int order,
-						long maxheight,
-						int count,
-
-						long *currKnode,
-						long *offset,
-						int *keys,
-						record *ans)
-{
-
-	//======================================================================================================================================================150
-	//	CPU VARIABLES
-	//======================================================================================================================================================150
-
-	// Stage-boundary timestamps from get_time(). The report at the end divides
-	// differences by 1000000 and labels them "s" (presumably get_time() returns
-	// microseconds -- confirm against timer.h).
-	long long time0;
-	long long time1;
-	long long time2;
-	long long time3;
-	long long time4;
-	long long time5;
-	long long time6;
-
-	// byte sizes of the per-block arrays
-	size_t longsMem = count*sizeof(long);
-	size_t intsMem = count*sizeof(int);
-	size_t ansMem = count*sizeof(record);
-
-	time0 = get_time();
-
-	//======================================================================================================================================================150
-	//	GPU SETUP
-	//======================================================================================================================================================150
-
-	// Absorb the initial driver overhead before timing the real work.
-	// NOTE(review): cudaThreadSynchronize() is deprecated in the CUDA runtime in
-	// favor of cudaDeviceSynchronize(); kept as-is to match the rest of the project.
-	cudaThreadSynchronize();
-
-	// execution parameters
-	int numBlocks = count;									// max # of blocks can be 65,535
-	int threadsPerBlock = order < 1024 ? order : 1024;
-
-	printf("# of blocks = %d, # of threads/block = %d (ensure that device can handle)\n", numBlocks, threadsPerBlock);
-
-	time1 = get_time();
-
-	//======================================================================================================================================================150
-	//	GPU MEMORY				(MALLOC)
-	//======================================================================================================================================================150
-
-	// device in
-	record *recordsD;
-	cudaMalloc((void**)&recordsD, records_mem);
-	checkCUDAError("cudaMalloc  recordsD");
-
-	knode *knodesD;
-	cudaMalloc((void**)&knodesD, knodes_mem);
-	checkCUDAError("cudaMalloc  knodesD");
-
-	long *currKnodeD;
-	cudaMalloc((void**)&currKnodeD, longsMem);
-	checkCUDAError("cudaMalloc  currKnodeD");
-
-	long *offsetD;
-	cudaMalloc((void**)&offsetD, longsMem);
-	checkCUDAError("cudaMalloc  offsetD");
-
-	int *keysD;
-	cudaMalloc((void**)&keysD, intsMem);
-	checkCUDAError("cudaMalloc  keysD");
-
-	// device in/out
-	record *ansD;
-	cudaMalloc((void**)&ansD, ansMem);
-	checkCUDAError("cudaMalloc ansD");
-
-	time2 = get_time();
-
-	//======================================================================================================================================================150
-	//	GPU MEMORY			COPY IN
-	//======================================================================================================================================================150
-
-	// device in
-	cudaMemcpy(recordsD, records, records_mem, cudaMemcpyHostToDevice);
-	checkCUDAError("cudaMemcpy recordsD");
-
-	cudaMemcpy(knodesD, knodes, knodes_mem, cudaMemcpyHostToDevice);
-	checkCUDAError("cudaMemcpy knodesD");
-
-	cudaMemcpy(currKnodeD, currKnode, longsMem, cudaMemcpyHostToDevice);
-	checkCUDAError("cudaMemcpy currKnodeD");
-
-	cudaMemcpy(offsetD, offset, longsMem, cudaMemcpyHostToDevice);
-	checkCUDAError("cudaMemcpy offsetD");
-
-	cudaMemcpy(keysD, keys, intsMem, cudaMemcpyHostToDevice);
-	checkCUDAError("cudaMemcpy keysD");
-
-	// device in/out: the host contents of ans are uploaded before the launch
-	cudaMemcpy(ansD, ans, ansMem, cudaMemcpyHostToDevice);
-	checkCUDAError("cudaMemcpy ansD");
-
-	time3 = get_time();
-
-	//======================================================================================================================================================150
-	// findK kernel
-	//======================================================================================================================================================150
-
-	findK<<<numBlocks, threadsPerBlock>>>(	maxheight,
-
-											knodesD,
-											knodes_elem,
-
-											recordsD,
-
-											currKnodeD,
-											offsetD,
-											keysD,
-											ansD);
-	// wait for the kernel so that time4 covers its execution
-	cudaThreadSynchronize();
-	checkCUDAError("findK");
-
-	time4 = get_time();
-
-	//======================================================================================================================================================150
-	//	GPU MEMORY			COPY OUT
-	//======================================================================================================================================================150
-
-	cudaMemcpy(ans, ansD, ansMem, cudaMemcpyDeviceToHost);
-	checkCUDAError("cudaMemcpy ansD");
-
-	time5 = get_time();
-
-	//======================================================================================================================================================150
-	//	GPU MEMORY DEALLOCATION
-	//======================================================================================================================================================150
-
-	cudaFree(recordsD);
-	cudaFree(knodesD);
-
-	cudaFree(currKnodeD);
-	cudaFree(offsetD);
-	cudaFree(keysD);
-	cudaFree(ansD);
-
-	time6 = get_time();
-
-	//======================================================================================================================================================150
-	//	DISPLAY TIMING
-	//======================================================================================================================================================150
-
-	// A literal percent sign must be written "%%"; the previous "% :" was an
-	// invalid conversion specification.
-	float total = (float) (time6-time0);
-
-	printf("Time spent in different stages of GPU_CUDA KERNEL:\n");
-
-	printf("%15.12f s, %15.12f %% : GPU: SET DEVICE / DRIVER INIT\n",	(float) (time1-time0) / 1000000, (float) (time1-time0) / total * 100);
-	printf("%15.12f s, %15.12f %% : GPU MEM: ALO\n", 					(float) (time2-time1) / 1000000, (float) (time2-time1) / total * 100);
-	printf("%15.12f s, %15.12f %% : GPU MEM: COPY IN\n",				(float) (time3-time2) / 1000000, (float) (time3-time2) / total * 100);
-
-	printf("%15.12f s, %15.12f %% : GPU: KERNEL\n",						(float) (time4-time3) / 1000000, (float) (time4-time3) / total * 100);
-
-	printf("%15.12f s, %15.12f %% : GPU MEM: COPY OUT\n",				(float) (time5-time4) / 1000000, (float) (time5-time4) / total * 100);
-	printf("%15.12f s, %15.12f %% : GPU MEM: FRE\n", 					(float) (time6-time5) / 1000000, (float) (time6-time5) / total * 100);
-
-	printf("Total time:\n");
-	printf("%.12f s\n", 												total / 1000000);
-
-}
-
-//========================================================================================================================================================================================================200
-//	END
-//========================================================================================================================================================================================================200
-
-#ifdef __cplusplus
-}
-#endif
--- a/examples/btree/kernel/kernel_gpu_cuda_wrapper.h
+++ b/examples/btree/kernel/kernel_gpu_cuda_wrapper.h
@ -1,23 +0,0 @@
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-//========================================================================================================================================================================================================200
-//	KERNEL_GPU_CUDA_WRAPPER HEADER
-//========================================================================================================================================================================================================200
-
-/*
- * Declaration of the host-side wrapper defined in kernel_gpu_cuda_wrapper.cu.
- *
- * records_mem and knodes_mem are the byte sizes used when copying records and
- * knodes to the device; knodes_elem and maxheight are passed through to the
- * findK kernel. count is the number of blocks launched and the element count
- * of currKnode, offset, keys and ans; order is the requested threads per
- * block. ans is uploaded before the launch and overwritten with the device
- * copy afterwards.
- *
- * The types record and knode are not declared here; they must already be
- * visible at the point of inclusion.
- */
-void kernel_gpu_cuda_wrapper(record *records, long records_mem, knode *knodes,
-                             long knodes_elem, long knodes_mem,
-
-                             int order, long maxheight, int count,
-
-                             long *currKnode, long *offset, int *keys,
-                             record *ans);
-
-//========================================================================================================================================================================================================200
-//	End
-//========================================================================================================================================================================================================200
-
-#ifdef __cplusplus
-}
-#endif
--- a/examples/btree/kernel/kernel_gpu_cuda_wrapper_2.cu
+++ b/examples/btree/kernel/kernel_gpu_cuda_wrapper_2.cu
@ -1,347 +0,0 @@
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-//========================================================================================================================================================================================================200
-//	INCLUDE
-//========================================================================================================================================================================================================200
-
-//======================================================================================================================================================150
-//	COMMON
-//======================================================================================================================================================150
-
-#include "../common.h"									// (in the main program folder)	needed to recognized input parameters
-
-//======================================================================================================================================================150
-//	UTILITIES
-//======================================================================================================================================================150
-
-#include "../util/cuda/cuda.h"							// (in library path specified to compiler)	needed by for device functions
-#include "../util/timer/timer.h"						// (in library path specified to compiler)	needed by timer
-
-//======================================================================================================================================================150
-//	KERNEL
-//======================================================================================================================================================150
-
-#include "./kernel_gpu_cuda_2.cu"						// (in the current directory)	GPU kernel, cannot include with header file because of complications with passing of constant memory variables
-
-//======================================================================================================================================================150
-//	HEADER
-//======================================================================================================================================================150
-
-#include "./kernel_gpu_cuda_wrapper_2.h"				// (in the current directory)
-
-//========================================================================================================================================================================================================200
-//	FUNCTION
-//========================================================================================================================================================================================================200
-
-/*
- * kernel_gpu_cuda_wrapper_2 - host-side driver for the findRangeK kernel.
- *
- * Allocates the device buffers, copies every input to the device, launches
- * findRangeK with `count` blocks of min(order, 1024) threads, copies the two
- * result arrays back to the host, frees the device buffers and prints a
- * per-stage timing report to stdout. Every CUDA call is followed by
- * checkCUDAError() (declared elsewhere).
- *
- *   knodes       host buffer copied to the device (knodes_mem bytes)
- *   knodes_elem  forwarded unchanged to findRangeK
- *   knodes_mem   size of `knodes` in bytes
- *   order        requested threads per block, capped at 1024
- *   maxheight    forwarded unchanged to findRangeK
- *   count        number of blocks; also the element count of every array
- *                argument below
- *   currKnode, offset, lastKnode, offset_2
- *                host arrays of `count` longs, copied to the device
- *   start, end   host arrays of `count` ints, copied to the device
- *   recstart, reclength
- *                host arrays of `count` ints; uploaded before the launch and
- *                overwritten with the device copies afterwards
- */
-void
-kernel_gpu_cuda_wrapper_2(	knode *knodes,
-							long knodes_elem,
-							long knodes_mem,
-
-							int order,
-							long maxheight,
-							int count,
-
-							long *currKnode,
-							long *offset,
-							long *lastKnode,
-							long *offset_2,
-							int *start,
-							int *end,
-							int *recstart,
-							int *reclength)
-{
-
-	//======================================================================================================================================================150
-	//	CPU VARIABLES
-	//======================================================================================================================================================150
-
-	// Stage-boundary timestamps from get_time(). The report at the end divides
-	// differences by 1000000 and labels them "s" (presumably get_time() returns
-	// microseconds -- confirm against timer.h).
-	long long time0;
-	long long time1;
-	long long time2;
-	long long time3;
-	long long time4;
-	long long time5;
-	long long time6;
-
-	// byte sizes of the per-block arrays
-	size_t longsMem = count*sizeof(long);
-	size_t intsMem = count*sizeof(int);
-
-	time0 = get_time();
-
-	//======================================================================================================================================================150
-	//	GPU SETUP
-	//======================================================================================================================================================150
-
-	// Absorb the initial driver overhead before timing the real work.
-	// NOTE(review): cudaThreadSynchronize() is deprecated in the CUDA runtime in
-	// favor of cudaDeviceSynchronize(); kept as-is to match the rest of the project.
-	cudaThreadSynchronize();
-
-	// execution parameters
-	int numBlocks = count;
-	int threadsPerBlock = order < 1024 ? order : 1024;
-
-	printf("# of blocks = %d, # of threads/block = %d (ensure that device can handle)\n", numBlocks, threadsPerBlock);
-
-	time1 = get_time();
-
-	//======================================================================================================================================================150
-	//	GPU MEMORY				MALLOC
-	//======================================================================================================================================================150
-
-	// device in
-	knode *knodesD;
-	cudaMalloc((void**)&knodesD, knodes_mem);
-	checkCUDAError("cudaMalloc  knodesD");
-
-	long *currKnodeD;
-	cudaMalloc((void**)&currKnodeD, longsMem);
-	checkCUDAError("cudaMalloc  currKnodeD");
-
-	long *offsetD;
-	cudaMalloc((void**)&offsetD, longsMem);
-	checkCUDAError("cudaMalloc  offsetD");
-
-	long *lastKnodeD;
-	cudaMalloc((void**)&lastKnodeD, longsMem);
-	checkCUDAError("cudaMalloc  lastKnodeD");
-
-	long *offset_2D;
-	cudaMalloc((void**)&offset_2D, longsMem);
-	checkCUDAError("cudaMalloc  offset_2D");
-
-	int *startD;
-	cudaMalloc((void**)&startD, intsMem);
-	checkCUDAError("cudaMalloc startD");
-
-	int *endD;
-	cudaMalloc((void**)&endD, intsMem);
-	checkCUDAError("cudaMalloc endD");
-
-	// device in/out
-	int *ansDStart;
-	cudaMalloc((void**)&ansDStart, intsMem);
-	checkCUDAError("cudaMalloc ansDStart");
-
-	int *ansDLength;
-	cudaMalloc((void**)&ansDLength, intsMem);
-	checkCUDAError("cudaMalloc ansDLength");
-
-	time2 = get_time();
-
-	//======================================================================================================================================================150
-	//	GPU MEMORY			COPY IN
-	//======================================================================================================================================================150
-
-	// device in
-	cudaMemcpy(knodesD, knodes, knodes_mem, cudaMemcpyHostToDevice);
-	checkCUDAError("cudaMemcpy knodesD");
-
-	cudaMemcpy(currKnodeD, currKnode, longsMem, cudaMemcpyHostToDevice);
-	checkCUDAError("cudaMemcpy currKnodeD");
-
-	cudaMemcpy(offsetD, offset, longsMem, cudaMemcpyHostToDevice);
-	checkCUDAError("cudaMemcpy offsetD");
-
-	cudaMemcpy(lastKnodeD, lastKnode, longsMem, cudaMemcpyHostToDevice);
-	checkCUDAError("cudaMemcpy lastKnodeD");
-
-	cudaMemcpy(offset_2D, offset_2, longsMem, cudaMemcpyHostToDevice);
-	checkCUDAError("cudaMemcpy offset_2D");
-
-	cudaMemcpy(startD, start, intsMem, cudaMemcpyHostToDevice);
-	checkCUDAError("cudaMemcpy startD");
-
-	cudaMemcpy(endD, end, intsMem, cudaMemcpyHostToDevice);
-	checkCUDAError("cudaMemcpy endD");
-
-	// device in/out: the host contents of both result arrays are uploaded before the launch
-	cudaMemcpy(ansDStart, recstart, intsMem, cudaMemcpyHostToDevice);
-	checkCUDAError("cudaMemcpy ansDStart");
-
-	cudaMemcpy(ansDLength, reclength, intsMem, cudaMemcpyHostToDevice);
-	checkCUDAError("cudaMemcpy ansDLength");
-
-	time3 = get_time();
-
-	//======================================================================================================================================================150
-	//	KERNEL
-	//======================================================================================================================================================150
-
-	// [GPU] findRangeK kernel
-	findRangeK<<<numBlocks, threadsPerBlock>>>(	maxheight,
-												knodesD,
-												knodes_elem,
-
-												currKnodeD,
-												offsetD,
-												lastKnodeD,
-												offset_2D,
-												startD,
-												endD,
-												ansDStart,
-												ansDLength);
-	// wait for the kernel so that time4 covers its execution
-	cudaThreadSynchronize();
-	checkCUDAError("findRangeK");
-
-	time4 = get_time();
-
-	//======================================================================================================================================================150
-	//	GPU MEMORY			COPY OUT
-	//======================================================================================================================================================150
-
-	cudaMemcpy(recstart, ansDStart, intsMem, cudaMemcpyDeviceToHost);
-	checkCUDAError("cudaMemcpy ansDStart");
-
-	cudaMemcpy(reclength, ansDLength, intsMem, cudaMemcpyDeviceToHost);
-	checkCUDAError("cudaMemcpy ansDLength");
-
-	time5 = get_time();
-
-	//======================================================================================================================================================150
-	//	GPU MEMORY DEALLOCATION
-	//======================================================================================================================================================150
-
-	cudaFree(knodesD);
-
-	cudaFree(currKnodeD);
-	cudaFree(offsetD);
-	cudaFree(lastKnodeD);
-	cudaFree(offset_2D);
-	cudaFree(startD);
-	cudaFree(endD);
-	cudaFree(ansDStart);
-	cudaFree(ansDLength);
-
-	time6 = get_time();
-
-	//======================================================================================================================================================150
-	//	DISPLAY TIMING
-	//======================================================================================================================================================150
-
-	// A literal percent sign must be written "%%"; the previous "% :" was an
-	// invalid conversion specification.
-	float total = (float) (time6-time0);
-
-	printf("Time spent in different stages of GPU_CUDA KERNEL:\n");
-
-	printf("%15.12f s, %15.12f %% : GPU: SET DEVICE / DRIVER INIT\n",	(float) (time1-time0) / 1000000, (float) (time1-time0) / total * 100);
-	printf("%15.12f s, %15.12f %% : GPU MEM: ALO\n", 					(float) (time2-time1) / 1000000, (float) (time2-time1) / total * 100);
-	printf("%15.12f s, %15.12f %% : GPU MEM: COPY IN\n",				(float) (time3-time2) / 1000000, (float) (time3-time2) / total * 100);
-
-	printf("%15.12f s, %15.12f %% : GPU: KERNEL\n",						(float) (time4-time3) / 1000000, (float) (time4-time3) / total * 100);
-
-	printf("%15.12f s, %15.12f %% : GPU MEM: COPY OUT\n",				(float) (time5-time4) / 1000000, (float) (time5-time4) / total * 100);
-	printf("%15.12f s, %15.12f %% : GPU MEM: FRE\n", 					(float) (time6-time5) / 1000000, (float) (time6-time5) / total * 100);
-
-	printf("Total time:\n");
-	printf("%.12f s\n", 												total / 1000000);
-
-}
-
-//========================================================================================================================================================================================================200
-//	END
-//========================================================================================================================================================================================================200
-
-#ifdef __cplusplus
-}
-#endif
--- a/examples/btree/kernel/kernel_gpu_cuda_wrapper_2.h
+++ b/examples/btree/kernel/kernel_gpu_cuda_wrapper_2.h
@ -1,23 +0,0 @@
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-//========================================================================================================================================================================================================200
-//	KERNEL_GPU_CUDA_WRAPPER_2 HEADER
-//========================================================================================================================================================================================================200
-
-/*
- * Declaration of the host-side wrapper defined in kernel_gpu_cuda_wrapper_2.cu.
- *
- * knodes_mem is the byte size used when copying knodes to the device;
- * knodes_elem and maxheight are passed through to the findRangeK kernel.
- * count is the number of blocks launched and the element count of every array
- * argument; order is the requested threads per block. recstart and reclength
- * are uploaded before the launch and overwritten with the device copies
- * afterwards.
- *
- * The type knode is not declared here; it must already be visible at the
- * point of inclusion.
- */
-void kernel_gpu_cuda_wrapper_2(knode *knodes, long knodes_elem, long knodes_mem,
-
-                               int order, long maxheight, int count,
-
-                               long *currKnode, long *offset, long *lastKnode,
-                               long *offset_2, int *start, int *end,
-                               int *recstart, int *reclength);
-
-//========================================================================================================================================================================================================200
-//	End
-//========================================================================================================================================================================================================200
-
-#ifdef __cplusplus
-}
-#endif
--- a/examples/btree/kernel_gpu_cuda_wrapper-cuda-nvptx64-nvidia-cuda-sm_61.ll
+++ b/examples/btree/kernel_gpu_cuda_wrapper-cuda-nvptx64-nvidia-cuda-sm_61.ll
@ -1,332 +0,0 @@
-; ModuleID = 'kernel_gpu_cuda_wrapper-cuda-nvptx64-nvidia-cuda-sm_61.bc'
-source_filename = "kernel/kernel_gpu_cuda_wrapper.cu"
-target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
-target triple = "nvptx64-nvidia-cuda"
-
-%struct.__cuda_builtin_threadIdx_t = type { i8 }
-%struct.__cuda_builtin_blockIdx_t = type { i8 }
-%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
-%struct.knode = type { i32, [257 x i32], [257 x i32], i8, i32 }
-%struct.record = type { i32 }
-
-$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
-
-$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
-
-@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
-@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
-
-; Function Attrs: convergent noinline nounwind optnone
-define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
-entry:
-  %p.addr = alloca i8**, align 8
-  %s.addr = alloca i64, align 8
-  store i8** %p, i8*** %p.addr, align 8
-  store i64 %s, i64* %s.addr, align 8
-  ret i32 999
-}
-
-; Function Attrs: convergent noinline nounwind optnone
-define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
-entry:
-  %p.addr = alloca %struct.cudaFuncAttributes*, align 8
-  %c.addr = alloca i8*, align 8
-  store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
-  store i8* %c, i8** %c.addr, align 8
-  ret i32 999
-}
-
-; Function Attrs: convergent noinline nounwind optnone
-define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
-entry:
-  %value.addr = alloca i32*, align 8
-  %attr.addr = alloca i32, align 4
-  %device.addr = alloca i32, align 4
-  store i32* %value, i32** %value.addr, align 8
-  store i32 %attr, i32* %attr.addr, align 4
-  store i32 %device, i32* %device.addr, align 4
-  ret i32 999
-}
-
-; Function Attrs: convergent noinline nounwind optnone
-define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
-entry:
-  %device.addr = alloca i32*, align 8
-  store i32* %device, i32** %device.addr, align 8
-  ret i32 999
-}
-
-; Function Attrs: convergent noinline nounwind optnone
-define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
-entry:
-  %numBlocks.addr = alloca i32*, align 8
-  %func.addr = alloca i8*, align 8
-  %blockSize.addr = alloca i32, align 4
-  %dynamicSmemSize.addr = alloca i64, align 8
-  store i32* %numBlocks, i32** %numBlocks.addr, align 8
-  store i8* %func, i8** %func.addr, align 8
-  store i32 %blockSize, i32* %blockSize.addr, align 4
-  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
-  ret i32 999
-}
-
-; Function Attrs: convergent noinline nounwind optnone
-define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
-entry:
-  %numBlocks.addr = alloca i32*, align 8
-  %func.addr = alloca i8*, align 8
-  %blockSize.addr = alloca i32, align 4
-  %dynamicSmemSize.addr = alloca i64, align 8
-  %flags.addr = alloca i32, align 4
-  store i32* %numBlocks, i32** %numBlocks.addr, align 8
-  store i8* %func, i8** %func.addr, align 8
-  store i32 %blockSize, i32* %blockSize.addr, align 4
-  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
-  store i32 %flags, i32* %flags.addr, align 4
-  ret i32 999
-}
-
-; Function Attrs: convergent noinline nounwind optnone
-define dso_local void @findK(i64 %height, %struct.knode* %knodesD, i64 %knodes_elem, %struct.record* %recordsD, i64* %currKnodeD, i64* %offsetD, i32* %keysD, %struct.record* %ansD) #0 {
-entry:
-  %height.addr = alloca i64, align 8
-  %knodesD.addr = alloca %struct.knode*, align 8
-  %knodes_elem.addr = alloca i64, align 8
-  %recordsD.addr = alloca %struct.record*, align 8
-  %currKnodeD.addr = alloca i64*, align 8
-  %offsetD.addr = alloca i64*, align 8
-  %keysD.addr = alloca i32*, align 8
-  %ansD.addr = alloca %struct.record*, align 8
-  %thid = alloca i32, align 4
-  %bid = alloca i32, align 4
-  %i = alloca i32, align 4
-  store i64 %height, i64* %height.addr, align 8
-  store %struct.knode* %knodesD, %struct.knode** %knodesD.addr, align 8
-  store i64 %knodes_elem, i64* %knodes_elem.addr, align 8
-  store %struct.record* %recordsD, %struct.record** %recordsD.addr, align 8
-  store i64* %currKnodeD, i64** %currKnodeD.addr, align 8
-  store i64* %offsetD, i64** %offsetD.addr, align 8
-  store i32* %keysD, i32** %keysD.addr, align 8
-  store %struct.record* %ansD, %struct.record** %ansD.addr, align 8
-  %call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
-  store i32 %call, i32* %thid, align 4
-  %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
-  store i32 %call1, i32* %bid, align 4
-  store i32 0, i32* %i, align 4
-  br label %for.cond
-
-for.cond:                                         ; preds = %for.inc, %entry
-  %0 = load i32, i32* %i, align 4
-  %conv = sext i32 %0 to i64
-  %1 = load i64, i64* %height.addr, align 8
-  %cmp = icmp slt i64 %conv, %1
-  br i1 %cmp, label %for.body, label %for.end
-
-for.body:                                         ; preds = %for.cond
-  %2 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
-  %3 = load i64*, i64** %currKnodeD.addr, align 8
-  %4 = load i32, i32* %bid, align 4
-  %idxprom = sext i32 %4 to i64
-  %arrayidx = getelementptr inbounds i64, i64* %3, i64 %idxprom
-  %5 = load i64, i64* %arrayidx, align 8
-  %arrayidx2 = getelementptr inbounds %struct.knode, %struct.knode* %2, i64 %5
-  %keys = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx2, i32 0, i32 2
-  %6 = load i32, i32* %thid, align 4
-  %idxprom3 = sext i32 %6 to i64
-  %arrayidx4 = getelementptr inbounds [257 x i32], [257 x i32]* %keys, i64 0, i64 %idxprom3
-  %7 = load i32, i32* %arrayidx4, align 4
-  %8 = load i32*, i32** %keysD.addr, align 8
-  %9 = load i32, i32* %bid, align 4
-  %idxprom5 = sext i32 %9 to i64
-  %arrayidx6 = getelementptr inbounds i32, i32* %8, i64 %idxprom5
-  %10 = load i32, i32* %arrayidx6, align 4
-  %cmp7 = icmp sle i32 %7, %10
-  br i1 %cmp7, label %land.lhs.true, label %if.end34
-
-land.lhs.true:                                    ; preds = %for.body
-  %11 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
-  %12 = load i64*, i64** %currKnodeD.addr, align 8
-  %13 = load i32, i32* %bid, align 4
-  %idxprom8 = sext i32 %13 to i64
-  %arrayidx9 = getelementptr inbounds i64, i64* %12, i64 %idxprom8
-  %14 = load i64, i64* %arrayidx9, align 8
-  %arrayidx10 = getelementptr inbounds %struct.knode, %struct.knode* %11, i64 %14
-  %keys11 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx10, i32 0, i32 2
-  %15 = load i32, i32* %thid, align 4
-  %add = add nsw i32 %15, 1
-  %idxprom12 = sext i32 %add to i64
-  %arrayidx13 = getelementptr inbounds [257 x i32], [257 x i32]* %keys11, i64 0, i64 %idxprom12
-  %16 = load i32, i32* %arrayidx13, align 4
-  %17 = load i32*, i32** %keysD.addr, align 8
-  %18 = load i32, i32* %bid, align 4
-  %idxprom14 = sext i32 %18 to i64
-  %arrayidx15 = getelementptr inbounds i32, i32* %17, i64 %idxprom14
-  %19 = load i32, i32* %arrayidx15, align 4
-  %cmp16 = icmp sgt i32 %16, %19
-  br i1 %cmp16, label %if.then, label %if.end34
-
-if.then:                                          ; preds = %land.lhs.true
-  %20 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
-  %21 = load i64*, i64** %offsetD.addr, align 8
-  %22 = load i32, i32* %bid, align 4
-  %idxprom17 = sext i32 %22 to i64
-  %arrayidx18 = getelementptr inbounds i64, i64* %21, i64 %idxprom17
-  %23 = load i64, i64* %arrayidx18, align 8
-  %arrayidx19 = getelementptr inbounds %struct.knode, %struct.knode* %20, i64 %23
-  %indices = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx19, i32 0, i32 1
-  %24 = load i32, i32* %thid, align 4
-  %idxprom20 = sext i32 %24 to i64
-  %arrayidx21 = getelementptr inbounds [257 x i32], [257 x i32]* %indices, i64 0, i64 %idxprom20
-  %25 = load i32, i32* %arrayidx21, align 4
-  %conv22 = sext i32 %25 to i64
-  %26 = load i64, i64* %knodes_elem.addr, align 8
-  %cmp23 = icmp slt i64 %conv22, %26
-  br i1 %cmp23, label %if.then24, label %if.end
-
-if.then24:                                        ; preds = %if.then
-  %27 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
-  %28 = load i64*, i64** %offsetD.addr, align 8
-  %29 = load i32, i32* %bid, align 4
-  %idxprom25 = sext i32 %29 to i64
-  %arrayidx26 = getelementptr inbounds i64, i64* %28, i64 %idxprom25
-  %30 = load i64, i64* %arrayidx26, align 8
-  %arrayidx27 = getelementptr inbounds %struct.knode, %struct.knode* %27, i64 %30
-  %indices28 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx27, i32 0, i32 1
-  %31 = load i32, i32* %thid, align 4
-  %idxprom29 = sext i32 %31 to i64
-  %arrayidx30 = getelementptr inbounds [257 x i32], [257 x i32]* %indices28, i64 0, i64 %idxprom29
-  %32 = load i32, i32* %arrayidx30, align 4
-  %conv31 = sext i32 %32 to i64
-  %33 = load i64*, i64** %offsetD.addr, align 8
-  %34 = load i32, i32* %bid, align 4
-  %idxprom32 = sext i32 %34 to i64
-  %arrayidx33 = getelementptr inbounds i64, i64* %33, i64 %idxprom32
-  store i64 %conv31, i64* %arrayidx33, align 8
-  br label %if.end
-
-if.end:                                           ; preds = %if.then24, %if.then
-  br label %if.end34
-
-if.end34:                                         ; preds = %if.end, %land.lhs.true, %for.body
-  call void @llvm.nvvm.barrier0()
-  %35 = load i32, i32* %thid, align 4
-  %cmp35 = icmp eq i32 %35, 0
-  br i1 %cmp35, label %if.then36, label %if.end41
-
-if.then36:                                        ; preds = %if.end34
-  %36 = load i64*, i64** %offsetD.addr, align 8
-  %37 = load i32, i32* %bid, align 4
-  %idxprom37 = sext i32 %37 to i64
-  %arrayidx38 = getelementptr inbounds i64, i64* %36, i64 %idxprom37
-  %38 = load i64, i64* %arrayidx38, align 8
-  %39 = load i64*, i64** %currKnodeD.addr, align 8
-  %40 = load i32, i32* %bid, align 4
-  %idxprom39 = sext i32 %40 to i64
-  %arrayidx40 = getelementptr inbounds i64, i64* %39, i64 %idxprom39
-  store i64 %38, i64* %arrayidx40, align 8
-  br label %if.end41
-
-if.end41:                                         ; preds = %if.then36, %if.end34
-  call void @llvm.nvvm.barrier0()
-  br label %for.inc
-
-for.inc:                                          ; preds = %if.end41
-  %41 = load i32, i32* %i, align 4
-  %inc = add nsw i32 %41, 1
-  store i32 %inc, i32* %i, align 4
-  br label %for.cond
-
-for.end:                                          ; preds = %for.cond
-  %42 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
-  %43 = load i64*, i64** %currKnodeD.addr, align 8
-  %44 = load i32, i32* %bid, align 4
-  %idxprom42 = sext i32 %44 to i64
-  %arrayidx43 = getelementptr inbounds i64, i64* %43, i64 %idxprom42
-  %45 = load i64, i64* %arrayidx43, align 8
-  %arrayidx44 = getelementptr inbounds %struct.knode, %struct.knode* %42, i64 %45
-  %keys45 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx44, i32 0, i32 2
-  %46 = load i32, i32* %thid, align 4
-  %idxprom46 = sext i32 %46 to i64
-  %arrayidx47 = getelementptr inbounds [257 x i32], [257 x i32]* %keys45, i64 0, i64 %idxprom46
-  %47 = load i32, i32* %arrayidx47, align 4
-  %48 = load i32*, i32** %keysD.addr, align 8
-  %49 = load i32, i32* %bid, align 4
-  %idxprom48 = sext i32 %49 to i64
-  %arrayidx49 = getelementptr inbounds i32, i32* %48, i64 %idxprom48
-  %50 = load i32, i32* %arrayidx49, align 4
-  %cmp50 = icmp eq i32 %47, %50
-  br i1 %cmp50, label %if.then51, label %if.end63
-
-if.then51:                                        ; preds = %for.end
-  %51 = load %struct.record*, %struct.record** %recordsD.addr, align 8
-  %52 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
-  %53 = load i64*, i64** %currKnodeD.addr, align 8
-  %54 = load i32, i32* %bid, align 4
-  %idxprom52 = sext i32 %54 to i64
-  %arrayidx53 = getelementptr inbounds i64, i64* %53, i64 %idxprom52
-  %55 = load i64, i64* %arrayidx53, align 8
-  %arrayidx54 = getelementptr inbounds %struct.knode, %struct.knode* %52, i64 %55
-  %indices55 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx54, i32 0, i32 1
-  %56 = load i32, i32* %thid, align 4
-  %idxprom56 = sext i32 %56 to i64
-  %arrayidx57 = getelementptr inbounds [257 x i32], [257 x i32]* %indices55, i64 0, i64 %idxprom56
-  %57 = load i32, i32* %arrayidx57, align 4
-  %idxprom58 = sext i32 %57 to i64
-  %arrayidx59 = getelementptr inbounds %struct.record, %struct.record* %51, i64 %idxprom58
-  %value = getelementptr inbounds %struct.record, %struct.record* %arrayidx59, i32 0, i32 0
-  %58 = load i32, i32* %value, align 4
-  %59 = load %struct.record*, %struct.record** %ansD.addr, align 8
-  %60 = load i32, i32* %bid, align 4
-  %idxprom60 = sext i32 %60 to i64
-  %arrayidx61 = getelementptr inbounds %struct.record, %struct.record* %59, i64 %idxprom60
-  %value62 = getelementptr inbounds %struct.record, %struct.record* %arrayidx61, i32 0, i32 0
-  store i32 %58, i32* %value62, align 4
-  br label %if.end63
-
-if.end63:                                         ; preds = %if.then51, %for.end
-  ret void
-}
-
-; Function Attrs: alwaysinline convergent nounwind
-define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
-entry:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  ret i32 %0
-}
-
-; Function Attrs: alwaysinline convergent nounwind
-define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
-entry:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
-  ret i32 %0
-}
-
-; Function Attrs: convergent nounwind
-declare void @llvm.nvvm.barrier0() #2
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #3
-
-attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { convergent nounwind }
-attributes #3 = { nounwind readnone }
-
-!llvm.module.flags = !{!0, !1, !2}
-!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
-!llvm.ident = !{!8}
-!nvvmir.version = !{!9}
-
-!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
-!1 = !{i32 1, !"wchar_size", i32 4}
-!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
-!3 = !{void (i64, %struct.knode*, i64, %struct.record*, i64*, i64*, i32*, %struct.record*)* @findK, !"kernel", i32 1}
-!4 = !{null, !"align", i32 8}
-!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
-!6 = !{null, !"align", i32 16}
-!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
-!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
-!9 = !{i32 1, i32 4}
--- a/examples/btree/kernel_gpu_cuda_wrapper-host-x86_64-unknown-linux-gnu.ll
+++ b/examples/btree/kernel_gpu_cuda_wrapper-host-x86_64-unknown-linux-gnu.ll
--- a/examples/btree/kernel_gpu_cuda_wrapper_2-cuda-nvptx64-nvidia-cuda-sm_61.ll
+++ b/examples/btree/kernel_gpu_cuda_wrapper_2-cuda-nvptx64-nvidia-cuda-sm_61.ll
@ -1,475 +0,0 @@
-; ModuleID = 'kernel_gpu_cuda_wrapper_2-cuda-nvptx64-nvidia-cuda-sm_61.bc'
-source_filename = "kernel/kernel_gpu_cuda_wrapper_2.cu"
-target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
-target triple = "nvptx64-nvidia-cuda"
-
-%struct.__cuda_builtin_threadIdx_t = type { i8 }
-%struct.__cuda_builtin_blockIdx_t = type { i8 }
-%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
-%struct.knode = type { i32, [257 x i32], [257 x i32], i8, i32 }
-
-$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
-
-$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
-
-@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
-@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
-
-; Function Attrs: convergent noinline nounwind optnone
-define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
-entry:
-  %p.addr = alloca i8**, align 8
-  %s.addr = alloca i64, align 8
-  store i8** %p, i8*** %p.addr, align 8
-  store i64 %s, i64* %s.addr, align 8
-  ret i32 999
-}
-
-; Function Attrs: convergent noinline nounwind optnone
-define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
-entry:
-  %p.addr = alloca %struct.cudaFuncAttributes*, align 8
-  %c.addr = alloca i8*, align 8
-  store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
-  store i8* %c, i8** %c.addr, align 8
-  ret i32 999
-}
-
-; Function Attrs: convergent noinline nounwind optnone
-define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
-entry:
-  %value.addr = alloca i32*, align 8
-  %attr.addr = alloca i32, align 4
-  %device.addr = alloca i32, align 4
-  store i32* %value, i32** %value.addr, align 8
-  store i32 %attr, i32* %attr.addr, align 4
-  store i32 %device, i32* %device.addr, align 4
-  ret i32 999
-}
-
-; Function Attrs: convergent noinline nounwind optnone
-define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
-entry:
-  %device.addr = alloca i32*, align 8
-  store i32* %device, i32** %device.addr, align 8
-  ret i32 999
-}
-
-; Function Attrs: convergent noinline nounwind optnone
-define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
-entry:
-  %numBlocks.addr = alloca i32*, align 8
-  %func.addr = alloca i8*, align 8
-  %blockSize.addr = alloca i32, align 4
-  %dynamicSmemSize.addr = alloca i64, align 8
-  store i32* %numBlocks, i32** %numBlocks.addr, align 8
-  store i8* %func, i8** %func.addr, align 8
-  store i32 %blockSize, i32* %blockSize.addr, align 4
-  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
-  ret i32 999
-}
-
-; Function Attrs: convergent noinline nounwind optnone
-define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
-entry:
-  %numBlocks.addr = alloca i32*, align 8
-  %func.addr = alloca i8*, align 8
-  %blockSize.addr = alloca i32, align 4
-  %dynamicSmemSize.addr = alloca i64, align 8
-  %flags.addr = alloca i32, align 4
-  store i32* %numBlocks, i32** %numBlocks.addr, align 8
-  store i8* %func, i8** %func.addr, align 8
-  store i32 %blockSize, i32* %blockSize.addr, align 4
-  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
-  store i32 %flags, i32* %flags.addr, align 4
-  ret i32 999
-}
-
-; Function Attrs: convergent noinline nounwind optnone
-define dso_local void @findRangeK(i64 %height, %struct.knode* %knodesD, i64 %knodes_elem, i64* %currKnodeD, i64* %offsetD, i64* %lastKnodeD, i64* %offset_2D, i32* %startD, i32* %endD, i32* %RecstartD, i32* %ReclenD) #0 {
-entry:
-  %height.addr = alloca i64, align 8
-  %knodesD.addr = alloca %struct.knode*, align 8
-  %knodes_elem.addr = alloca i64, align 8
-  %currKnodeD.addr = alloca i64*, align 8
-  %offsetD.addr = alloca i64*, align 8
-  %lastKnodeD.addr = alloca i64*, align 8
-  %offset_2D.addr = alloca i64*, align 8
-  %startD.addr = alloca i32*, align 8
-  %endD.addr = alloca i32*, align 8
-  %RecstartD.addr = alloca i32*, align 8
-  %ReclenD.addr = alloca i32*, align 8
-  %thid = alloca i32, align 4
-  %bid = alloca i32, align 4
-  %i = alloca i32, align 4
-  store i64 %height, i64* %height.addr, align 8
-  store %struct.knode* %knodesD, %struct.knode** %knodesD.addr, align 8
-  store i64 %knodes_elem, i64* %knodes_elem.addr, align 8
-  store i64* %currKnodeD, i64** %currKnodeD.addr, align 8
-  store i64* %offsetD, i64** %offsetD.addr, align 8
-  store i64* %lastKnodeD, i64** %lastKnodeD.addr, align 8
-  store i64* %offset_2D, i64** %offset_2D.addr, align 8
-  store i32* %startD, i32** %startD.addr, align 8
-  store i32* %endD, i32** %endD.addr, align 8
-  store i32* %RecstartD, i32** %RecstartD.addr, align 8
-  store i32* %ReclenD, i32** %ReclenD.addr, align 8
-  %call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
-  store i32 %call, i32* %thid, align 4
-  %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
-  store i32 %call1, i32* %bid, align 4
-  store i32 0, i32* %i, align 4
-  br label %for.cond
-
-for.cond:                                         ; preds = %for.inc, %entry
-  %0 = load i32, i32* %i, align 4
-  %conv = sext i32 %0 to i64
-  %1 = load i64, i64* %height.addr, align 8
-  %cmp = icmp slt i64 %conv, %1
-  br i1 %cmp, label %for.body, label %for.end
-
-for.body:                                         ; preds = %for.cond
-  %2 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
-  %3 = load i64*, i64** %currKnodeD.addr, align 8
-  %4 = load i32, i32* %bid, align 4
-  %idxprom = sext i32 %4 to i64
-  %arrayidx = getelementptr inbounds i64, i64* %3, i64 %idxprom
-  %5 = load i64, i64* %arrayidx, align 8
-  %arrayidx2 = getelementptr inbounds %struct.knode, %struct.knode* %2, i64 %5
-  %keys = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx2, i32 0, i32 2
-  %6 = load i32, i32* %thid, align 4
-  %idxprom3 = sext i32 %6 to i64
-  %arrayidx4 = getelementptr inbounds [257 x i32], [257 x i32]* %keys, i64 0, i64 %idxprom3
-  %7 = load i32, i32* %arrayidx4, align 4
-  %8 = load i32*, i32** %startD.addr, align 8
-  %9 = load i32, i32* %bid, align 4
-  %idxprom5 = sext i32 %9 to i64
-  %arrayidx6 = getelementptr inbounds i32, i32* %8, i64 %idxprom5
-  %10 = load i32, i32* %arrayidx6, align 4
-  %cmp7 = icmp sle i32 %7, %10
-  br i1 %cmp7, label %land.lhs.true, label %if.end34
-
-land.lhs.true:                                    ; preds = %for.body
-  %11 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
-  %12 = load i64*, i64** %currKnodeD.addr, align 8
-  %13 = load i32, i32* %bid, align 4
-  %idxprom8 = sext i32 %13 to i64
-  %arrayidx9 = getelementptr inbounds i64, i64* %12, i64 %idxprom8
-  %14 = load i64, i64* %arrayidx9, align 8
-  %arrayidx10 = getelementptr inbounds %struct.knode, %struct.knode* %11, i64 %14
-  %keys11 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx10, i32 0, i32 2
-  %15 = load i32, i32* %thid, align 4
-  %add = add nsw i32 %15, 1
-  %idxprom12 = sext i32 %add to i64
-  %arrayidx13 = getelementptr inbounds [257 x i32], [257 x i32]* %keys11, i64 0, i64 %idxprom12
-  %16 = load i32, i32* %arrayidx13, align 4
-  %17 = load i32*, i32** %startD.addr, align 8
-  %18 = load i32, i32* %bid, align 4
-  %idxprom14 = sext i32 %18 to i64
-  %arrayidx15 = getelementptr inbounds i32, i32* %17, i64 %idxprom14
-  %19 = load i32, i32* %arrayidx15, align 4
-  %cmp16 = icmp sgt i32 %16, %19
-  br i1 %cmp16, label %if.then, label %if.end34
-
-if.then:                                          ; preds = %land.lhs.true
-  %20 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
-  %21 = load i64*, i64** %currKnodeD.addr, align 8
-  %22 = load i32, i32* %bid, align 4
-  %idxprom17 = sext i32 %22 to i64
-  %arrayidx18 = getelementptr inbounds i64, i64* %21, i64 %idxprom17
-  %23 = load i64, i64* %arrayidx18, align 8
-  %arrayidx19 = getelementptr inbounds %struct.knode, %struct.knode* %20, i64 %23
-  %indices = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx19, i32 0, i32 1
-  %24 = load i32, i32* %thid, align 4
-  %idxprom20 = sext i32 %24 to i64
-  %arrayidx21 = getelementptr inbounds [257 x i32], [257 x i32]* %indices, i64 0, i64 %idxprom20
-  %25 = load i32, i32* %arrayidx21, align 4
-  %conv22 = sext i32 %25 to i64
-  %26 = load i64, i64* %knodes_elem.addr, align 8
-  %cmp23 = icmp slt i64 %conv22, %26
-  br i1 %cmp23, label %if.then24, label %if.end
-
-if.then24:                                        ; preds = %if.then
-  %27 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
-  %28 = load i64*, i64** %currKnodeD.addr, align 8
-  %29 = load i32, i32* %bid, align 4
-  %idxprom25 = sext i32 %29 to i64
-  %arrayidx26 = getelementptr inbounds i64, i64* %28, i64 %idxprom25
-  %30 = load i64, i64* %arrayidx26, align 8
-  %arrayidx27 = getelementptr inbounds %struct.knode, %struct.knode* %27, i64 %30
-  %indices28 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx27, i32 0, i32 1
-  %31 = load i32, i32* %thid, align 4
-  %idxprom29 = sext i32 %31 to i64
-  %arrayidx30 = getelementptr inbounds [257 x i32], [257 x i32]* %indices28, i64 0, i64 %idxprom29
-  %32 = load i32, i32* %arrayidx30, align 4
-  %conv31 = sext i32 %32 to i64
-  %33 = load i64*, i64** %offsetD.addr, align 8
-  %34 = load i32, i32* %bid, align 4
-  %idxprom32 = sext i32 %34 to i64
-  %arrayidx33 = getelementptr inbounds i64, i64* %33, i64 %idxprom32
-  store i64 %conv31, i64* %arrayidx33, align 8
-  br label %if.end
-
-if.end:                                           ; preds = %if.then24, %if.then
-  br label %if.end34
-
-if.end34:                                         ; preds = %if.end, %land.lhs.true, %for.body
-  %35 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
-  %36 = load i64*, i64** %lastKnodeD.addr, align 8
-  %37 = load i32, i32* %bid, align 4
-  %idxprom35 = sext i32 %37 to i64
-  %arrayidx36 = getelementptr inbounds i64, i64* %36, i64 %idxprom35
-  %38 = load i64, i64* %arrayidx36, align 8
-  %arrayidx37 = getelementptr inbounds %struct.knode, %struct.knode* %35, i64 %38
-  %keys38 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx37, i32 0, i32 2
-  %39 = load i32, i32* %thid, align 4
-  %idxprom39 = sext i32 %39 to i64
-  %arrayidx40 = getelementptr inbounds [257 x i32], [257 x i32]* %keys38, i64 0, i64 %idxprom39
-  %40 = load i32, i32* %arrayidx40, align 4
-  %41 = load i32*, i32** %endD.addr, align 8
-  %42 = load i32, i32* %bid, align 4
-  %idxprom41 = sext i32 %42 to i64
-  %arrayidx42 = getelementptr inbounds i32, i32* %41, i64 %idxprom41
-  %43 = load i32, i32* %arrayidx42, align 4
-  %cmp43 = icmp sle i32 %40, %43
-  br i1 %cmp43, label %land.lhs.true44, label %if.end75
-
-land.lhs.true44:                                  ; preds = %if.end34
-  %44 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
-  %45 = load i64*, i64** %lastKnodeD.addr, align 8
-  %46 = load i32, i32* %bid, align 4
-  %idxprom45 = sext i32 %46 to i64
-  %arrayidx46 = getelementptr inbounds i64, i64* %45, i64 %idxprom45
-  %47 = load i64, i64* %arrayidx46, align 8
-  %arrayidx47 = getelementptr inbounds %struct.knode, %struct.knode* %44, i64 %47
-  %keys48 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx47, i32 0, i32 2
-  %48 = load i32, i32* %thid, align 4
-  %add49 = add nsw i32 %48, 1
-  %idxprom50 = sext i32 %add49 to i64
-  %arrayidx51 = getelementptr inbounds [257 x i32], [257 x i32]* %keys48, i64 0, i64 %idxprom50
-  %49 = load i32, i32* %arrayidx51, align 4
-  %50 = load i32*, i32** %endD.addr, align 8
-  %51 = load i32, i32* %bid, align 4
-  %idxprom52 = sext i32 %51 to i64
-  %arrayidx53 = getelementptr inbounds i32, i32* %50, i64 %idxprom52
-  %52 = load i32, i32* %arrayidx53, align 4
-  %cmp54 = icmp sgt i32 %49, %52
-  br i1 %cmp54, label %if.then55, label %if.end75
-
-if.then55:                                        ; preds = %land.lhs.true44
-  %53 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
-  %54 = load i64*, i64** %lastKnodeD.addr, align 8
-  %55 = load i32, i32* %bid, align 4
-  %idxprom56 = sext i32 %55 to i64
-  %arrayidx57 = getelementptr inbounds i64, i64* %54, i64 %idxprom56
-  %56 = load i64, i64* %arrayidx57, align 8
-  %arrayidx58 = getelementptr inbounds %struct.knode, %struct.knode* %53, i64 %56
-  %indices59 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx58, i32 0, i32 1
-  %57 = load i32, i32* %thid, align 4
-  %idxprom60 = sext i32 %57 to i64
-  %arrayidx61 = getelementptr inbounds [257 x i32], [257 x i32]* %indices59, i64 0, i64 %idxprom60
-  %58 = load i32, i32* %arrayidx61, align 4
-  %conv62 = sext i32 %58 to i64
-  %59 = load i64, i64* %knodes_elem.addr, align 8
-  %cmp63 = icmp slt i64 %conv62, %59
-  br i1 %cmp63, label %if.then64, label %if.end74
-
-if.then64:                                        ; preds = %if.then55
-  %60 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
-  %61 = load i64*, i64** %lastKnodeD.addr, align 8
-  %62 = load i32, i32* %bid, align 4
-  %idxprom65 = sext i32 %62 to i64
-  %arrayidx66 = getelementptr inbounds i64, i64* %61, i64 %idxprom65
-  %63 = load i64, i64* %arrayidx66, align 8
-  %arrayidx67 = getelementptr inbounds %struct.knode, %struct.knode* %60, i64 %63
-  %indices68 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx67, i32 0, i32 1
-  %64 = load i32, i32* %thid, align 4
-  %idxprom69 = sext i32 %64 to i64
-  %arrayidx70 = getelementptr inbounds [257 x i32], [257 x i32]* %indices68, i64 0, i64 %idxprom69
-  %65 = load i32, i32* %arrayidx70, align 4
-  %conv71 = sext i32 %65 to i64
-  %66 = load i64*, i64** %offset_2D.addr, align 8
-  %67 = load i32, i32* %bid, align 4
-  %idxprom72 = sext i32 %67 to i64
-  %arrayidx73 = getelementptr inbounds i64, i64* %66, i64 %idxprom72
-  store i64 %conv71, i64* %arrayidx73, align 8
-  br label %if.end74
-
-if.end74:                                         ; preds = %if.then64, %if.then55
-  br label %if.end75
-
-if.end75:                                         ; preds = %if.end74, %land.lhs.true44, %if.end34
-  call void @llvm.nvvm.barrier0()
-  %68 = load i32, i32* %thid, align 4
-  %cmp76 = icmp eq i32 %68, 0
-  br i1 %cmp76, label %if.then77, label %if.end86
-
-if.then77:                                        ; preds = %if.end75
-  %69 = load i64*, i64** %offsetD.addr, align 8
-  %70 = load i32, i32* %bid, align 4
-  %idxprom78 = sext i32 %70 to i64
-  %arrayidx79 = getelementptr inbounds i64, i64* %69, i64 %idxprom78
-  %71 = load i64, i64* %arrayidx79, align 8
-  %72 = load i64*, i64** %currKnodeD.addr, align 8
-  %73 = load i32, i32* %bid, align 4
-  %idxprom80 = sext i32 %73 to i64
-  %arrayidx81 = getelementptr inbounds i64, i64* %72, i64 %idxprom80
-  store i64 %71, i64* %arrayidx81, align 8
-  %74 = load i64*, i64** %offset_2D.addr, align 8
-  %75 = load i32, i32* %bid, align 4
-  %idxprom82 = sext i32 %75 to i64
-  %arrayidx83 = getelementptr inbounds i64, i64* %74, i64 %idxprom82
-  %76 = load i64, i64* %arrayidx83, align 8
-  %77 = load i64*, i64** %lastKnodeD.addr, align 8
-  %78 = load i32, i32* %bid, align 4
-  %idxprom84 = sext i32 %78 to i64
-  %arrayidx85 = getelementptr inbounds i64, i64* %77, i64 %idxprom84
-  store i64 %76, i64* %arrayidx85, align 8
-  br label %if.end86
-
-if.end86:                                         ; preds = %if.then77, %if.end75
-  call void @llvm.nvvm.barrier0()
-  br label %for.inc
-
-for.inc:                                          ; preds = %if.end86
-  %79 = load i32, i32* %i, align 4
-  %inc = add nsw i32 %79, 1
-  store i32 %inc, i32* %i, align 4
-  br label %for.cond
-
-for.end:                                          ; preds = %for.cond
-  %80 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
-  %81 = load i64*, i64** %currKnodeD.addr, align 8
-  %82 = load i32, i32* %bid, align 4
-  %idxprom87 = sext i32 %82 to i64
-  %arrayidx88 = getelementptr inbounds i64, i64* %81, i64 %idxprom87
-  %83 = load i64, i64* %arrayidx88, align 8
-  %arrayidx89 = getelementptr inbounds %struct.knode, %struct.knode* %80, i64 %83
-  %keys90 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx89, i32 0, i32 2
-  %84 = load i32, i32* %thid, align 4
-  %idxprom91 = sext i32 %84 to i64
-  %arrayidx92 = getelementptr inbounds [257 x i32], [257 x i32]* %keys90, i64 0, i64 %idxprom91
-  %85 = load i32, i32* %arrayidx92, align 4
-  %86 = load i32*, i32** %startD.addr, align 8
-  %87 = load i32, i32* %bid, align 4
-  %idxprom93 = sext i32 %87 to i64
-  %arrayidx94 = getelementptr inbounds i32, i32* %86, i64 %idxprom93
-  %88 = load i32, i32* %arrayidx94, align 4
-  %cmp95 = icmp eq i32 %85, %88
-  br i1 %cmp95, label %if.then96, label %if.end105
-
-if.then96:                                        ; preds = %for.end
-  %89 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
-  %90 = load i64*, i64** %currKnodeD.addr, align 8
-  %91 = load i32, i32* %bid, align 4
-  %idxprom97 = sext i32 %91 to i64
-  %arrayidx98 = getelementptr inbounds i64, i64* %90, i64 %idxprom97
-  %92 = load i64, i64* %arrayidx98, align 8
-  %arrayidx99 = getelementptr inbounds %struct.knode, %struct.knode* %89, i64 %92
-  %indices100 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx99, i32 0, i32 1
-  %93 = load i32, i32* %thid, align 4
-  %idxprom101 = sext i32 %93 to i64
-  %arrayidx102 = getelementptr inbounds [257 x i32], [257 x i32]* %indices100, i64 0, i64 %idxprom101
-  %94 = load i32, i32* %arrayidx102, align 4
-  %95 = load i32*, i32** %RecstartD.addr, align 8
-  %96 = load i32, i32* %bid, align 4
-  %idxprom103 = sext i32 %96 to i64
-  %arrayidx104 = getelementptr inbounds i32, i32* %95, i64 %idxprom103
-  store i32 %94, i32* %arrayidx104, align 4
-  br label %if.end105
-
-if.end105:                                        ; preds = %if.then96, %for.end
-  call void @llvm.nvvm.barrier0()
-  %97 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
-  %98 = load i64*, i64** %lastKnodeD.addr, align 8
-  %99 = load i32, i32* %bid, align 4
-  %idxprom106 = sext i32 %99 to i64
-  %arrayidx107 = getelementptr inbounds i64, i64* %98, i64 %idxprom106
-  %100 = load i64, i64* %arrayidx107, align 8
-  %arrayidx108 = getelementptr inbounds %struct.knode, %struct.knode* %97, i64 %100
-  %keys109 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx108, i32 0, i32 2
-  %101 = load i32, i32* %thid, align 4
-  %idxprom110 = sext i32 %101 to i64
-  %arrayidx111 = getelementptr inbounds [257 x i32], [257 x i32]* %keys109, i64 0, i64 %idxprom110
-  %102 = load i32, i32* %arrayidx111, align 4
-  %103 = load i32*, i32** %endD.addr, align 8
-  %104 = load i32, i32* %bid, align 4
-  %idxprom112 = sext i32 %104 to i64
-  %arrayidx113 = getelementptr inbounds i32, i32* %103, i64 %idxprom112
-  %105 = load i32, i32* %arrayidx113, align 4
-  %cmp114 = icmp eq i32 %102, %105
-  br i1 %cmp114, label %if.then115, label %if.end127
-
-if.then115:                                       ; preds = %if.end105
-  %106 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
-  %107 = load i64*, i64** %lastKnodeD.addr, align 8
-  %108 = load i32, i32* %bid, align 4
-  %idxprom116 = sext i32 %108 to i64
-  %arrayidx117 = getelementptr inbounds i64, i64* %107, i64 %idxprom116
-  %109 = load i64, i64* %arrayidx117, align 8
-  %arrayidx118 = getelementptr inbounds %struct.knode, %struct.knode* %106, i64 %109
-  %indices119 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx118, i32 0, i32 1
-  %110 = load i32, i32* %thid, align 4
-  %idxprom120 = sext i32 %110 to i64
-  %arrayidx121 = getelementptr inbounds [257 x i32], [257 x i32]* %indices119, i64 0, i64 %idxprom120
-  %111 = load i32, i32* %arrayidx121, align 4
-  %112 = load i32*, i32** %RecstartD.addr, align 8
-  %113 = load i32, i32* %bid, align 4
-  %idxprom122 = sext i32 %113 to i64
-  %arrayidx123 = getelementptr inbounds i32, i32* %112, i64 %idxprom122
-  %114 = load i32, i32* %arrayidx123, align 4
-  %sub = sub nsw i32 %111, %114
-  %add124 = add nsw i32 %sub, 1
-  %115 = load i32*, i32** %ReclenD.addr, align 8
-  %116 = load i32, i32* %bid, align 4
-  %idxprom125 = sext i32 %116 to i64
-  %arrayidx126 = getelementptr inbounds i32, i32* %115, i64 %idxprom125
-  store i32 %add124, i32* %arrayidx126, align 4
-  br label %if.end127
-
-if.end127:                                        ; preds = %if.then115, %if.end105
-  ret void
-}
-
-; Function Attrs: alwaysinline convergent nounwind
-define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
-entry:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  ret i32 %0
-}
-
-; Function Attrs: alwaysinline convergent nounwind
-define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
-entry:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
-  ret i32 %0
-}
-
-; Function Attrs: convergent nounwind
-declare void @llvm.nvvm.barrier0() #2
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #3
-
-attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { convergent nounwind }
-attributes #3 = { nounwind readnone }
-
-!llvm.module.flags = !{!0, !1, !2}
-!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
-!llvm.ident = !{!8}
-!nvvmir.version = !{!9}
-
-!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
-!1 = !{i32 1, !"wchar_size", i32 4}
-!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
-!3 = !{void (i64, %struct.knode*, i64, i64*, i64*, i64*, i64*, i32*, i32*, i32*, i32*)* @findRangeK, !"kernel", i32 1}
-!4 = !{null, !"align", i32 8}
-!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
-!6 = !{null, !"align", i32 16}
-!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
-!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
-!9 = !{i32 1, i32 4}
--- a/examples/btree/kernel_gpu_cuda_wrapper_2-host-x86_64-unknown-linux-gnu.ll
+++ b/examples/btree/kernel_gpu_cuda_wrapper_2-host-x86_64-unknown-linux-gnu.ll
--- a/examples/btree/main.c
+++ b/examples/btree/main.c
--- a/examples/btree/run.sh
+++ b/examples/btree/run.sh
@ -1,40 +0,0 @@
-#!/bin/bash
-set -e
-clang -c -emit-llvm util/timer/timer.c
-clang -c -emit-llvm util/num/num.c
-#clang -c -emit-llvm util/cuda/cuda.cu --cuda-gpu-arch=sm_61
-#clang -c -emit-llvm kernel/kernel_gpu_cuda_wrapper.cu --cuda-gpu-arch=sm_61
-#clang++ kernel/kernel_gpu_cuda_wrapper.cu  --cuda-path=/usr/local/cuda-10.1 --cuda-gpu-arch=sm_61 -L/usr/local/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v
-#clang++ kernel/kernel_gpu_cuda_wrapper_2.cu  --cuda-path=/usr/local/cuda-10.1 --cuda-gpu-arch=sm_61 -L/usr/local/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v
-clang -c -emit-llvm main.c
-
-llvm-as kernel_gpu_cuda_wrapper-cuda-nvptx64-nvidia-cuda-sm_61.ll
-llvm-as kernel_gpu_cuda_wrapper_2-cuda-nvptx64-nvidia-cuda-sm_61.ll
-llvm-as kernel_gpu_cuda_wrapper-host-x86_64-unknown-linux-gnu.ll
-llvm-as kernel_gpu_cuda_wrapper_2-host-x86_64-unknown-linux-gnu.ll
-../../build/compilation/kernelTranslator kernel_gpu_cuda_wrapper-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel1.bc
-../../build/compilation/kernelTranslator kernel_gpu_cuda_wrapper_2-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel2.bc
-../../build/compilation/hostTranslator kernel_gpu_cuda_wrapper-host-x86_64-unknown-linux-gnu.bc host1.bc
-../../build/compilation/hostTranslator kernel_gpu_cuda_wrapper_2-host-x86_64-unknown-linux-gnu.bc host2.bc
-
-llc --relocation-model=pic --filetype=obj  main.bc
-llc --relocation-model=pic --filetype=obj  cuda.bc
-llc --relocation-model=pic --filetype=obj  num.bc
-llc --relocation-model=pic --filetype=obj  timer.bc
-llc --relocation-model=pic --filetype=obj  kernel1.bc
-llc --relocation-model=pic --filetype=obj  kernel2.bc
-llc --relocation-model=pic --filetype=obj  host1.bc
-llc --relocation-model=pic --filetype=obj  host2.bc
-export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH
-g++ -Wall -L../../build/runtime  -L../../build/runtime/threadPool -o b+tree.out \
-    -fPIC -no-pie main.o host1.o host2.o kernel1.o kernel2.o cuda.o num.o timer.o \
-    -lc -lx86Runtime -lthreadPool -lpthread
-
-./b+tree.out file ../../rodinia-data/b+tree/mil.txt \
-    command ../../rodinia-data/b+tree/command.txt
-if grep -q "0    840187    6001" output.txt; then
-    echo "Pass"
-else
-    echo "Error result"
-    exit 1
-fi
--- a/examples/btree/util/cuda/cuda.cu
+++ b/examples/btree/util/cuda/cuda.cu
@ -1,75 +0,0 @@
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-//===============================================================================================================================================================================================================200
-//	SET_DEVICE CODE
-//===============================================================================================================================================================================================================200
-
-//======================================================================================================================================================150
-//	INCLUDE/DEFINE
-//======================================================================================================================================================150
-
-#include "cuda.h"					// (in library path specified to compiler)
-
-//======================================================================================================================================================150
-//	FUNCTIONS
-//======================================================================================================================================================150
-
-//====================================================================================================100
-//	SET DEVICE
-//====================================================================================================100
-
-void setdevice(void){
-
-	// variables
-	int num_devices;
-	int device;
-
-	// work
-	cudaGetDeviceCount(&num_devices);
-	if (num_devices > 1) {
-
-		// variables
-		int max_multiprocessors;
-		int max_device;
-		cudaDeviceProp properties;
-
-		// initialize variables
-		max_multiprocessors = 0;
-		max_device = 0;
-
-		for (device = 0; device < num_devices; device++) {
-			cudaGetDeviceProperties(&properties, device);
-			if (max_multiprocessors < properties.multiProcessorCount) {
-				max_multiprocessors = properties.multiProcessorCount;
-				max_device = device;
-			}
-		}
-		cudaSetDevice(max_device);
-	}
-
-}
-
-//====================================================================================================100
-//	GET LAST ERROR
-//====================================================================================================100
-
-void checkCUDAError(const char *msg)
-{
-	cudaError_t err = cudaGetLastError();
-	if( cudaSuccess != err) {
-		// fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
-		printf("Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
-		fflush(NULL);
-		exit(EXIT_FAILURE);
-	}
-}
-
-//===============================================================================================================================================================================================================200
-//	END
-//===============================================================================================================================================================================================================200
-
-#ifdef __cplusplus
-}
-#endif
--- a/examples/btree/util/cuda/cuda.h
+++ b/examples/btree/util/cuda/cuda.h
@ -1,37 +0,0 @@
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-//===============================================================================================================================================================================================================200
-//	SET_DEVICE HEADER
-//===============================================================================================================================================================================================================200
-
-//======================================================================================================================================================150
-//	INCLUDE/DEFINE
-//======================================================================================================================================================150
-
-#include <stdio.h> // (in library path known to compiler)		needed by printf
-
-//======================================================================================================================================================150
-//	FUNCTION PROTOTYPES
-//======================================================================================================================================================150
-
-//====================================================================================================100
-//	SET DEVICE
-//====================================================================================================100
-
-void setdevice(void);
-
-//====================================================================================================100
-//	GET LAST ERROR
-//====================================================================================================100
-
-void checkCUDAError(const char *msg);
-
-//===============================================================================================================================================================================================================200
-//	END SET_DEVICE HEADER
-//===============================================================================================================================================================================================================200
-
-#ifdef __cplusplus
-}
-#endif
--- a/examples/btree/util/num/num.c
+++ b/examples/btree/util/num/num.c
@ -1,55 +0,0 @@
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-//===============================================================================================================================================================================================================200
-//	DESCRIPTION
-//===============================================================================================================================================================================================================200
-
-// Returns:	0 if string does not represent integer
-//			1 if string represents integer
-
-//===============================================================================================================================================================================================================200
-//	NUM CODE
-//===============================================================================================================================================================================================================200
-
-//======================================================================================================================================================150
-//	ISINTEGER FUNCTION
-//======================================================================================================================================================150
-
-int isInteger(char *str) {
-
-  //====================================================================================================100
-  //	make sure it's not empty
-  //====================================================================================================100
-
-  if (*str == '\0') {
-    return 0;
-  }
-
-  //====================================================================================================100
-  //	if any digit is not a number, return false
-  //====================================================================================================100
-
-  for (; *str != '\0'; str++) {
-    if (*str < 48 ||
-        *str >
-            57) { // digit characters (need to include . if checking for float)
-      return 0;
-    }
-  }
-
-  //====================================================================================================100
-  //	it got past all my checks so I think it's a number
-  //====================================================================================================100
-
-  return 1;
-}
-
-//===============================================================================================================================================================================================================200
-//	END NUM CODE
-//===============================================================================================================================================================================================================200
-
-#ifdef __cplusplus
-}
-#endif
--- a/examples/btree/util/num/num.h
+++ b/examples/btree/util/num/num.h
@ -1,21 +0,0 @@
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-//===============================================================================================================================================================================================================200
-//	FILE HEADER
-//===============================================================================================================================================================================================================200
-
-//======================================================================================================================================================150
-//	ISINTEGER FUNCTION PROTOTYPE
-//======================================================================================================================================================150
-
-int isInteger(char *str);
-
-//===============================================================================================================================================================================================================200
-//	END FILE HEADER
-//===============================================================================================================================================================================================================200
-
-#ifdef __cplusplus
-}
-#endif
--- a/examples/btree/util/timer/timer.c
+++ b/examples/btree/util/timer/timer.c
@ -1,36 +0,0 @@
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-//===============================================================================================================================================================================================================200
-//	TIMER CODE
-//===============================================================================================================================================================================================================200
-
-//======================================================================================================================================================150
-//	INCLUDE/DEFINE
-//======================================================================================================================================================150
-
-#include <stdlib.h>
-
-//======================================================================================================================================================150
-//	FUNCTIONS
-//======================================================================================================================================================150
-
-//====================================================================================================100
-//	DISPLAY TIME
-//====================================================================================================100
-
-// Returns the current system time in microseconds
-long long get_time() {
-  struct timeval tv;
-  gettimeofday(&tv, NULL);
-  return (tv.tv_sec * 1000000) + tv.tv_usec;
-}
-
-//===============================================================================================================================================================================================================200
-//	END TIMER CODE
-//===============================================================================================================================================================================================================200
-
-#ifdef __cplusplus
-}
-#endif
--- a/examples/btree/util/timer/timer.h
+++ b/examples/btree/util/timer/timer.h
@ -1,21 +0,0 @@
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-//===============================================================================================================================================================================================================200
-//	TIMER HEADER
-//===============================================================================================================================================================================================================200
-
-//======================================================================================================================================================150
-//	FUNCTION PROTOTYPES
-//======================================================================================================================================================150
-
-long long get_time();
-
-//===============================================================================================================================================================================================================200
-//	END TIMER HEADER
-//===============================================================================================================================================================================================================200
-
-#ifdef __cplusplus
-}
-#endif
--- a/examples/cfd/euler3d.cu
+++ b/examples/cfd/euler3d.cu
@ -1,662 +0,0 @@
-#include <fstream>
-#include <helper_cuda.h>
-#include <helper_timer.h>
-#include <iostream>
-
-/*
- * Options
- *
- */
-#define GAMMA 1.4f
-#define iterations 2
-// #ifndef block_length
-// 	#define block_length 192
-// #endif
-
-#define NDIM 3
-#define NNB 4
-
-#define RK 3 // 3rd order RK
-#define ff_mach 1.2f
-#define deg_angle_of_attack 0.0f
-
-/*
- * not options
- */
-
-#ifdef RD_WG_SIZE_0_0
-#define BLOCK_SIZE_0 RD_WG_SIZE_0_0
-#elif defined(RD_WG_SIZE_0)
-#define BLOCK_SIZE_0 RD_WG_SIZE_0
-#elif defined(RD_WG_SIZE)
-#define BLOCK_SIZE_0 RD_WG_SIZE
-#else
-#define BLOCK_SIZE_0 192
-#endif
-
-#ifdef RD_WG_SIZE_1_0
-#define BLOCK_SIZE_1 RD_WG_SIZE_1_0
-#elif defined(RD_WG_SIZE_1)
-#define BLOCK_SIZE_1 RD_WG_SIZE_1
-#elif defined(RD_WG_SIZE)
-#define BLOCK_SIZE_1 RD_WG_SIZE
-#else
-#define BLOCK_SIZE_1 192
-#endif
-
-#ifdef RD_WG_SIZE_2_0
-#define BLOCK_SIZE_2 RD_WG_SIZE_2_0
-#elif defined(RD_WG_SIZE_1)
-#define BLOCK_SIZE_2 RD_WG_SIZE_2
-#elif defined(RD_WG_SIZE)
-#define BLOCK_SIZE_2 RD_WG_SIZE
-#else
-#define BLOCK_SIZE_2 192
-#endif
-
-#ifdef RD_WG_SIZE_3_0
-#define BLOCK_SIZE_3 RD_WG_SIZE_3_0
-#elif defined(RD_WG_SIZE_3)
-#define BLOCK_SIZE_3 RD_WG_SIZE_3
-#elif defined(RD_WG_SIZE)
-#define BLOCK_SIZE_3 RD_WG_SIZE
-#else
-#define BLOCK_SIZE_3 192
-#endif
-
-#ifdef RD_WG_SIZE_4_0
-#define BLOCK_SIZE_4 RD_WG_SIZE_4_0
-#elif defined(RD_WG_SIZE_4)
-#define BLOCK_SIZE_4 RD_WG_SIZE_4
-#elif defined(RD_WG_SIZE)
-#define BLOCK_SIZE_4 RD_WG_SIZE
-#else
-#define BLOCK_SIZE_4 192
-#endif
-
-// #if block_length > 128
-// #warning "the kernels may fail too launch on some systems if the block length
-// is too large" #endif
-
-#define VAR_DENSITY 0
-#define VAR_MOMENTUM 1
-#define VAR_DENSITY_ENERGY (VAR_MOMENTUM + NDIM)
-#define NVAR (VAR_DENSITY_ENERGY + 1)
-
-/*
- * Generic functions
- */
-template <typename T> T *alloc(int N) {
-  T *t;
-  checkCudaErrors(cudaMalloc((void **)&t, sizeof(T) * N));
-  return t;
-}
-
-template <typename T> void dealloc(T *array) {
-  checkCudaErrors(cudaFree((void *)array));
-}
-
-template <typename T> void copy(T *dst, T *src, int N) {
-  checkCudaErrors(cudaMemcpy((void *)dst, (void *)src, N * sizeof(T),
-                             cudaMemcpyDeviceToDevice));
-}
-
-template <typename T> void upload(T *dst, T *src, int N) {
-  checkCudaErrors(cudaMemcpy((void *)dst, (void *)src, N * sizeof(T),
-                             cudaMemcpyHostToDevice));
-}
-
-template <typename T> void download(T *dst, T *src, int N) {
-  checkCudaErrors(cudaMemcpy((void *)dst, (void *)src, N * sizeof(T),
-                             cudaMemcpyDeviceToHost));
-}
-
-void dump(float *variables, int nel, int nelr) {
-  float *h_variables = new float[nelr * NVAR];
-  download(h_variables, variables, nelr * NVAR);
-
-  {
-    std::ofstream file("density");
-    file << nel << " " << nelr << std::endl;
-    for (int i = 0; i < nel; i++)
-      file << h_variables[i + VAR_DENSITY * nelr] << std::endl;
-  }
-
-  {
-    std::ofstream file("momentum");
-    file << nel << " " << nelr << std::endl;
-    for (int i = 0; i < nel; i++) {
-      for (int j = 0; j != NDIM; j++)
-        file << h_variables[i + (VAR_MOMENTUM + j) * nelr] << " ";
-      file << std::endl;
-    }
-  }
-
-  {
-    std::ofstream file("density_energy");
-    file << nel << " " << nelr << std::endl;
-    for (int i = 0; i < nel; i++)
-      file << h_variables[i + VAR_DENSITY_ENERGY * nelr] << std::endl;
-  }
-  delete[] h_variables;
-}
-
-/*
- * Element-based Cell-centered FVM solver functions
- */
-__constant__ float ff_variable[NVAR];
-__constant__ float3 ff_flux_contribution_momentum_x[1];
-__constant__ float3 ff_flux_contribution_momentum_y[1];
-__constant__ float3 ff_flux_contribution_momentum_z[1];
-__constant__ float3 ff_flux_contribution_density_energy[1];
-
-__global__ void cuda_initialize_variables(int nelr, float *variables) {
-  const int i = (blockDim.x * blockIdx.x + threadIdx.x);
-  for (int j = 0; j < NVAR; j++)
-    variables[i + j * nelr] = ff_variable[j];
-}
-void initialize_variables(int nelr, float *variables) {
-  dim3 Dg(nelr / BLOCK_SIZE_1), Db(BLOCK_SIZE_1);
-  cuda_initialize_variables<<<Dg, Db>>>(nelr, variables);
-  getLastCudaError("initialize_variables failed");
-}
-
-__device__ __host__ inline void compute_flux_contribution(
-    float &density, float3 &momentum, float &density_energy, float &pressure,
-    float3 &velocity, float3 &fc_momentum_x, float3 &fc_momentum_y,
-    float3 &fc_momentum_z, float3 &fc_density_energy) {
-  fc_momentum_x.x = velocity.x * momentum.x + pressure;
-  fc_momentum_x.y = velocity.x * momentum.y;
-  fc_momentum_x.z = velocity.x * momentum.z;
-
-  fc_momentum_y.x = fc_momentum_x.y;
-  fc_momentum_y.y = velocity.y * momentum.y + pressure;
-  fc_momentum_y.z = velocity.y * momentum.z;
-
-  fc_momentum_z.x = fc_momentum_x.z;
-  fc_momentum_z.y = fc_momentum_y.z;
-  fc_momentum_z.z = velocity.z * momentum.z + pressure;
-
-  float de_p = density_energy + pressure;
-  fc_density_energy.x = velocity.x * de_p;
-  fc_density_energy.y = velocity.y * de_p;
-  fc_density_energy.z = velocity.z * de_p;
-}
-
-__device__ inline void compute_velocity(float &density, float3 &momentum,
-                                        float3 &velocity) {
-  velocity.x = momentum.x / density;
-  velocity.y = momentum.y / density;
-  velocity.z = momentum.z / density;
-}
-
-__device__ inline float compute_speed_sqd(float3 &velocity) {
-  return velocity.x * velocity.x + velocity.y * velocity.y +
-         velocity.z * velocity.z;
-}
-
-__device__ inline float compute_pressure(float &density, float &density_energy,
-                                         float &speed_sqd) {
-  return (float(GAMMA) - float(1.0f)) *
-         (density_energy - float(0.5f) * density * speed_sqd);
-}
-
-__device__ inline float compute_speed_of_sound(float &density,
-                                               float &pressure) {
-  return sqrtf(float(GAMMA) * pressure / density);
-}
-
-__global__ void cuda_compute_step_factor(int nelr, float *variables,
-                                         float *areas, float *step_factors) {
-  const int i = (blockDim.x * blockIdx.x + threadIdx.x);
-
-  float density = variables[i + VAR_DENSITY * nelr];
-  float3 momentum;
-  momentum.x = variables[i + (VAR_MOMENTUM + 0) * nelr];
-  momentum.y = variables[i + (VAR_MOMENTUM + 1) * nelr];
-  momentum.z = variables[i + (VAR_MOMENTUM + 2) * nelr];
-
-  float density_energy = variables[i + VAR_DENSITY_ENERGY * nelr];
-
-  float3 velocity;
-  compute_velocity(density, momentum, velocity);
-  float speed_sqd = compute_speed_sqd(velocity);
-  float pressure = compute_pressure(density, density_energy, speed_sqd);
-  float speed_of_sound = compute_speed_of_sound(density, pressure);
-
-  // dt = float(0.5f) * sqrtf(areas[i]) /  (||v|| + c).... but when we do time
-  // stepping, this later would need to be divided by the area, so we just do it
-  // all at once
-  step_factors[i] =
-      float(0.5f) / (sqrtf(areas[i]) * (sqrtf(speed_sqd) + speed_of_sound));
-}
-void compute_step_factor(int nelr, float *variables, float *areas,
-                         float *step_factors) {
-  dim3 Dg(nelr / BLOCK_SIZE_2), Db(BLOCK_SIZE_2);
-  cuda_compute_step_factor<<<Dg, Db>>>(nelr, variables, areas, step_factors);
-  getLastCudaError("compute_step_factor failed");
-}
-
-/*
- * Kernel: accumulates the flux of one element per thread and stores its five
- * components (density, momentum x/y/z, density-energy) in `fluxes`.
- *
- * The element index is i = blockDim.x * blockIdx.x + threadIdx.x and is used
- * without an `i < nelr` guard. `variables` and `fluxes` are indexed as
- * i + component * nelr, `normals` as i + (j + dim * NNB) * nelr and
- * `elements_surrounding_elements` as i + j * nelr.
- *
- * For each of the NNB neighbour slots j the neighbour id `nb` selects:
- *   nb >= 0  : neighbour element; adds the artificial-viscosity term plus the
- *              averaged flux contributions of element i and element nb
- *   nb == -1 : wing boundary; adds only normal * pressure_i to the momentum
- *   nb == -2 : far field; uses the ff_* symbols that main() uploads with
- *              cudaMemcpyToSymbol in place of a neighbour state
- * Any other negative value contributes nothing.
- */
-__global__ void cuda_compute_flux(int nelr, int *elements_surrounding_elements,
-                                  float *normals, float *variables,
-                                  float *fluxes) {
-  const float smoothing_coefficient = float(0.2f);
-  const int i = (blockDim.x * blockIdx.x + threadIdx.x); // element of this thread
-
-  int j, nb;
-  float3 normal;
-  float normal_len;
-  float factor;
-
-  float density_i = variables[i + VAR_DENSITY * nelr];
-  float3 momentum_i;
-  momentum_i.x = variables[i + (VAR_MOMENTUM + 0) * nelr];
-  momentum_i.y = variables[i + (VAR_MOMENTUM + 1) * nelr];
-  momentum_i.z = variables[i + (VAR_MOMENTUM + 2) * nelr];
-
-  float density_energy_i = variables[i + VAR_DENSITY_ENERGY * nelr];
-
-  float3 velocity_i;
-  compute_velocity(density_i, momentum_i, velocity_i);
-  float speed_sqd_i = compute_speed_sqd(velocity_i);
-  float speed_i = sqrtf(speed_sqd_i);
-  float pressure_i = compute_pressure(density_i, density_energy_i, speed_sqd_i);
-  float speed_of_sound_i = compute_speed_of_sound(density_i, pressure_i);
-  float3 flux_contribution_i_momentum_x, flux_contribution_i_momentum_y,
-      flux_contribution_i_momentum_z;
-  float3 flux_contribution_i_density_energy;
-  compute_flux_contribution(
-      density_i, momentum_i, density_energy_i, pressure_i, velocity_i,
-      flux_contribution_i_momentum_x, flux_contribution_i_momentum_y,
-      flux_contribution_i_momentum_z, flux_contribution_i_density_energy);
-
-  float flux_i_density = float(0.0f); // accumulators, summed over all NNB slots
-  float3 flux_i_momentum;
-  flux_i_momentum.x = float(0.0f);
-  flux_i_momentum.y = float(0.0f);
-  flux_i_momentum.z = float(0.0f);
-  float flux_i_density_energy = float(0.0f);
-
-  float3 velocity_nb;
-  float density_nb, density_energy_nb;
-  float3 momentum_nb;
-  float3 flux_contribution_nb_momentum_x, flux_contribution_nb_momentum_y,
-      flux_contribution_nb_momentum_z;
-  float3 flux_contribution_nb_density_energy;
-  float speed_sqd_nb, speed_of_sound_nb, pressure_nb;
-
-#pragma unroll
-  for (j = 0; j < NNB; j++) {
-    nb = elements_surrounding_elements[i + j * nelr];
-    normal.x = normals[i + (j + 0 * NNB) * nelr];
-    normal.y = normals[i + (j + 1 * NNB) * nelr];
-    normal.z = normals[i + (j + 2 * NNB) * nelr];
-    normal_len =
-        sqrtf(normal.x * normal.x + normal.y * normal.y + normal.z * normal.z);
-
-    if (nb >= 0) // a legitimate neighbor
-    {
-      density_nb = variables[nb + VAR_DENSITY * nelr];
-      momentum_nb.x = variables[nb + (VAR_MOMENTUM + 0) * nelr];
-      momentum_nb.y = variables[nb + (VAR_MOMENTUM + 1) * nelr];
-      momentum_nb.z = variables[nb + (VAR_MOMENTUM + 2) * nelr];
-      density_energy_nb = variables[nb + VAR_DENSITY_ENERGY * nelr];
-      compute_velocity(density_nb, momentum_nb, velocity_nb);
-      speed_sqd_nb = compute_speed_sqd(velocity_nb);
-      pressure_nb =
-          compute_pressure(density_nb, density_energy_nb, speed_sqd_nb);
-      speed_of_sound_nb = compute_speed_of_sound(density_nb, pressure_nb);
-      compute_flux_contribution(
-          density_nb, momentum_nb, density_energy_nb, pressure_nb, velocity_nb,
-          flux_contribution_nb_momentum_x, flux_contribution_nb_momentum_y,
-          flux_contribution_nb_momentum_z, flux_contribution_nb_density_energy);
-
-      // artificial viscosity: scaled by the face-normal length and the summed
-      // speeds / sound speeds of both elements, applied to the state difference
-      factor = -normal_len * smoothing_coefficient * float(0.5f) *
-               (speed_i + sqrtf(speed_sqd_nb) + speed_of_sound_i +
-                speed_of_sound_nb);
-      flux_i_density += factor * (density_i - density_nb);
-      flux_i_density_energy += factor * (density_energy_i - density_energy_nb);
-      flux_i_momentum.x += factor * (momentum_i.x - momentum_nb.x);
-      flux_i_momentum.y += factor * (momentum_i.y - momentum_nb.y);
-      flux_i_momentum.z += factor * (momentum_i.z - momentum_nb.z);
-
-      // accumulate cell-centered fluxes (x, then y, then z normal component)
-      factor = float(0.5f) * normal.x;
-      flux_i_density += factor * (momentum_nb.x + momentum_i.x);
-      flux_i_density_energy += factor * (flux_contribution_nb_density_energy.x +
-                                         flux_contribution_i_density_energy.x);
-      flux_i_momentum.x += factor * (flux_contribution_nb_momentum_x.x +
-                                     flux_contribution_i_momentum_x.x);
-      flux_i_momentum.y += factor * (flux_contribution_nb_momentum_y.x +
-                                     flux_contribution_i_momentum_y.x);
-      flux_i_momentum.z += factor * (flux_contribution_nb_momentum_z.x +
-                                     flux_contribution_i_momentum_z.x);
-
-      factor = float(0.5f) * normal.y;
-      flux_i_density += factor * (momentum_nb.y + momentum_i.y);
-      flux_i_density_energy += factor * (flux_contribution_nb_density_energy.y +
-                                         flux_contribution_i_density_energy.y);
-      flux_i_momentum.x += factor * (flux_contribution_nb_momentum_x.y +
-                                     flux_contribution_i_momentum_x.y);
-      flux_i_momentum.y += factor * (flux_contribution_nb_momentum_y.y +
-                                     flux_contribution_i_momentum_y.y);
-      flux_i_momentum.z += factor * (flux_contribution_nb_momentum_z.y +
-                                     flux_contribution_i_momentum_z.y);
-
-      factor = float(0.5f) * normal.z;
-      flux_i_density += factor * (momentum_nb.z + momentum_i.z);
-      flux_i_density_energy += factor * (flux_contribution_nb_density_energy.z +
-                                         flux_contribution_i_density_energy.z);
-      flux_i_momentum.x += factor * (flux_contribution_nb_momentum_x.z +
-                                     flux_contribution_i_momentum_x.z);
-      flux_i_momentum.y += factor * (flux_contribution_nb_momentum_y.z +
-                                     flux_contribution_i_momentum_y.z);
-      flux_i_momentum.z += factor * (flux_contribution_nb_momentum_z.z +
-                                     flux_contribution_i_momentum_z.z);
-    } else if (nb == -1) // a wing boundary
-    {
-      flux_i_momentum.x += normal.x * pressure_i;
-      flux_i_momentum.y += normal.y * pressure_i;
-      flux_i_momentum.z += normal.z * pressure_i;
-    } else if (nb == -2) // a far field boundary
-    {
-      factor = float(0.5f) * normal.x;
-      flux_i_density += factor * (ff_variable[VAR_MOMENTUM + 0] + momentum_i.x);
-      flux_i_density_energy +=
-          factor * (ff_flux_contribution_density_energy[0].x +
-                    flux_contribution_i_density_energy.x);
-      flux_i_momentum.x += factor * (ff_flux_contribution_momentum_x[0].x +
-                                     flux_contribution_i_momentum_x.x);
-      flux_i_momentum.y += factor * (ff_flux_contribution_momentum_y[0].x +
-                                     flux_contribution_i_momentum_y.x);
-      flux_i_momentum.z += factor * (ff_flux_contribution_momentum_z[0].x +
-                                     flux_contribution_i_momentum_z.x);
-
-      factor = float(0.5f) * normal.y;
-      flux_i_density += factor * (ff_variable[VAR_MOMENTUM + 1] + momentum_i.y);
-      flux_i_density_energy +=
-          factor * (ff_flux_contribution_density_energy[0].y +
-                    flux_contribution_i_density_energy.y);
-      flux_i_momentum.x += factor * (ff_flux_contribution_momentum_x[0].y +
-                                     flux_contribution_i_momentum_x.y);
-      flux_i_momentum.y += factor * (ff_flux_contribution_momentum_y[0].y +
-                                     flux_contribution_i_momentum_y.y);
-      flux_i_momentum.z += factor * (ff_flux_contribution_momentum_z[0].y +
-                                     flux_contribution_i_momentum_z.y);
-
-      factor = float(0.5f) * normal.z;
-      flux_i_density += factor * (ff_variable[VAR_MOMENTUM + 2] + momentum_i.z);
-      flux_i_density_energy +=
-          factor * (ff_flux_contribution_density_energy[0].z +
-                    flux_contribution_i_density_energy.z);
-      flux_i_momentum.x += factor * (ff_flux_contribution_momentum_x[0].z +
-                                     flux_contribution_i_momentum_x.z);
-      flux_i_momentum.y += factor * (ff_flux_contribution_momentum_y[0].z +
-                                     flux_contribution_i_momentum_y.z);
-      flux_i_momentum.z += factor * (ff_flux_contribution_momentum_z[0].z +
-                                     flux_contribution_i_momentum_z.z);
-    }
-  }
-
-  fluxes[i + VAR_DENSITY * nelr] = flux_i_density;
-  fluxes[i + (VAR_MOMENTUM + 0) * nelr] = flux_i_momentum.x;
-  fluxes[i + (VAR_MOMENTUM + 1) * nelr] = flux_i_momentum.y;
-  fluxes[i + (VAR_MOMENTUM + 2) * nelr] = flux_i_momentum.z;
-  fluxes[i + VAR_DENSITY_ENERGY * nelr] = flux_i_density_energy;
-}
-// Host-side launcher for cuda_compute_flux: BLOCK_SIZE_3 threads per block and
-// nelr / BLOCK_SIZE_3 blocks (integer division), followed by a check of the
-// last CUDA error.
-void compute_flux(int nelr, int *elements_surrounding_elements, float *normals,
-                  float *variables, float *fluxes) {
-  const dim3 threads_per_block(BLOCK_SIZE_3);
-  const dim3 blocks_in_grid(nelr / BLOCK_SIZE_3);
-
-  cuda_compute_flux<<<blocks_in_grid, threads_per_block>>>(
-      nelr, elements_surrounding_elements, normals, variables, fluxes);
-  getLastCudaError("compute_flux failed");
-}
-
-// Kernel: one thread per element. Writes
-//   variables = old_variables + (step_factors[i] / (RK + 1 - j)) * fluxes
-// for the element's density, density-energy and three momentum components.
-// Arrays are indexed as i + component * nelr; i is not bounds-checked.
-__global__ void cuda_time_step(int j, int nelr, float *old_variables,
-                               float *variables, float *step_factors,
-                               float *fluxes) {
-  const int i = (blockDim.x * blockIdx.x + threadIdx.x);
-
-  // Step factor of this element scaled for stage j.
-  const float stage_scale = step_factors[i] / float(RK + 1 - j);
-
-  // Flat offsets of the five components of element i.
-  const int density_at = i + VAR_DENSITY * nelr;
-  const int energy_at = i + VAR_DENSITY_ENERGY * nelr;
-  const int momentum_x_at = i + (VAR_MOMENTUM + 0) * nelr;
-  const int momentum_y_at = i + (VAR_MOMENTUM + 1) * nelr;
-  const int momentum_z_at = i + (VAR_MOMENTUM + 2) * nelr;
-
-  variables[density_at] =
-      old_variables[density_at] + stage_scale * fluxes[density_at];
-  variables[energy_at] =
-      old_variables[energy_at] + stage_scale * fluxes[energy_at];
-  variables[momentum_x_at] =
-      old_variables[momentum_x_at] + stage_scale * fluxes[momentum_x_at];
-  variables[momentum_y_at] =
-      old_variables[momentum_y_at] + stage_scale * fluxes[momentum_y_at];
-  variables[momentum_z_at] =
-      old_variables[momentum_z_at] + stage_scale * fluxes[momentum_z_at];
-}
-// Host-side launcher for cuda_time_step (stage index j): BLOCK_SIZE_4 threads
-// per block and nelr / BLOCK_SIZE_4 blocks (integer division), followed by a
-// check of the last CUDA error.
-void time_step(int j, int nelr, float *old_variables, float *variables,
-               float *step_factors, float *fluxes) {
-  const dim3 threads_per_block(BLOCK_SIZE_4);
-  const dim3 blocks_in_grid(nelr / BLOCK_SIZE_4);
-
-  cuda_time_step<<<blocks_in_grid, threads_per_block>>>(
-      j, nelr, old_variables, variables, step_factors, fluxes);
-  getLastCudaError("update failed");
-}
-
-/*
- * Main function
- *
- * Usage: <program> <domain file>
- *
- * Steps: upload the far-field state to the device symbols, read the mesh
- * (areas, neighbour ids, face normals) and pad it to a multiple of
- * BLOCK_SIZE_0 elements, run `iterations` outer iterations of RK flux /
- * time-step stages, print the mean time per iteration, dump the solution and
- * release the device buffers.
- *
- * Returns 0 on success and when no file name is given (after printing a
- * hint); returns 1 when the domain file cannot be opened or does not start
- * with a positive element count.
- */
-int main(int argc, char **argv) {
-  printf("WG size of kernel:initialize = %d, WG size of "
-         "kernel:compute_step_factor = %d, WG size of kernel:compute_flux = "
-         "%d, WG size of kernel:time_step = %d\n",
-         BLOCK_SIZE_1, BLOCK_SIZE_2, BLOCK_SIZE_3, BLOCK_SIZE_4);
-
-  if (argc < 2) {
-    std::cout << "specify data file name" << std::endl;
-    return 0;
-  }
-  const char *data_file_name = argv[1];
-
-  checkCudaErrors(cudaSetDevice(0));
-
-  // set far field conditions and load them into constant memory on the gpu
-  {
-    float h_ff_variable[NVAR];
-    const float angle_of_attack =
-        float(3.1415926535897931 / 180.0f) * float(deg_angle_of_attack);
-
-    h_ff_variable[VAR_DENSITY] = float(1.4);
-
-    float ff_pressure = float(1.0f);
-    float ff_speed_of_sound =
-        sqrt(GAMMA * ff_pressure / h_ff_variable[VAR_DENSITY]);
-    float ff_speed = float(ff_mach) * ff_speed_of_sound;
-
-    float3 ff_velocity;
-    ff_velocity.x = ff_speed * float(cos((float)angle_of_attack));
-    ff_velocity.y = ff_speed * float(sin((float)angle_of_attack));
-    ff_velocity.z = 0.0f;
-
-    h_ff_variable[VAR_MOMENTUM + 0] =
-        h_ff_variable[VAR_DENSITY] * ff_velocity.x;
-    h_ff_variable[VAR_MOMENTUM + 1] =
-        h_ff_variable[VAR_DENSITY] * ff_velocity.y;
-    h_ff_variable[VAR_MOMENTUM + 2] =
-        h_ff_variable[VAR_DENSITY] * ff_velocity.z;
-
-    h_ff_variable[VAR_DENSITY_ENERGY] =
-        h_ff_variable[VAR_DENSITY] * (float(0.5f) * (ff_speed * ff_speed)) +
-        (ff_pressure / float(GAMMA - 1.0f));
-
-    float3 h_ff_momentum;
-    h_ff_momentum.x = *(h_ff_variable + VAR_MOMENTUM + 0);
-    h_ff_momentum.y = *(h_ff_variable + VAR_MOMENTUM + 1);
-    h_ff_momentum.z = *(h_ff_variable + VAR_MOMENTUM + 2);
-    float3 h_ff_flux_contribution_momentum_x;
-    float3 h_ff_flux_contribution_momentum_y;
-    float3 h_ff_flux_contribution_momentum_z;
-    float3 h_ff_flux_contribution_density_energy;
-    compute_flux_contribution(h_ff_variable[VAR_DENSITY], h_ff_momentum,
-                              h_ff_variable[VAR_DENSITY_ENERGY], ff_pressure,
-                              ff_velocity, h_ff_flux_contribution_momentum_x,
-                              h_ff_flux_contribution_momentum_y,
-                              h_ff_flux_contribution_momentum_z,
-                              h_ff_flux_contribution_density_energy);
-
-    // copy far field conditions to the gpu
-    checkCudaErrors(
-        cudaMemcpyToSymbol(ff_variable, h_ff_variable, NVAR * sizeof(float)));
-    checkCudaErrors(cudaMemcpyToSymbol(ff_flux_contribution_momentum_x,
-                                       &h_ff_flux_contribution_momentum_x,
-                                       sizeof(float3)));
-    checkCudaErrors(cudaMemcpyToSymbol(ff_flux_contribution_momentum_y,
-                                       &h_ff_flux_contribution_momentum_y,
-                                       sizeof(float3)));
-    checkCudaErrors(cudaMemcpyToSymbol(ff_flux_contribution_momentum_z,
-                                       &h_ff_flux_contribution_momentum_z,
-                                       sizeof(float3)));
-
-    checkCudaErrors(cudaMemcpyToSymbol(ff_flux_contribution_density_energy,
-                                       &h_ff_flux_contribution_density_energy,
-                                       sizeof(float3)));
-  }
-  int nel;
-  int nelr;
-
-  // read in domain geometry
-  float *areas;
-  int *elements_surrounding_elements;
-  float *normals;
-  {
-    std::ifstream file(data_file_name);
-    if (!file) {
-      std::cerr << "cannot open data file " << data_file_name << std::endl;
-      return 1;
-    }
-
-    // the padding code below indexes element nel - 1, so nel must be positive
-    if (!(file >> nel) || nel <= 0) {
-      std::cerr << "invalid element count in " << data_file_name << std::endl;
-      return 1;
-    }
-    // round nel up to the next multiple of BLOCK_SIZE_0
-    nelr =
-        BLOCK_SIZE_0 * ((nel / BLOCK_SIZE_0) + std::min(1, nel % BLOCK_SIZE_0));
-
-    float *h_areas = new float[nelr];
-    int *h_elements_surrounding_elements = new int[nelr * NNB];
-    float *h_normals = new float[nelr * NDIM * NNB];
-
-    // read in data
-    for (int i = 0; i < nel; i++) {
-      file >> h_areas[i];
-      for (int j = 0; j < NNB; j++) {
-        file >> h_elements_surrounding_elements[i + j * nelr];
-        if (h_elements_surrounding_elements[i + j * nelr] < 0)
-          h_elements_surrounding_elements[i + j * nelr] = -1;
-        h_elements_surrounding_elements[i + j * nelr]--; // it's coming in with
-                                                         // Fortran numbering
-
-        // normals are stored with their sign flipped
-        for (int k = 0; k < NDIM; k++) {
-          file >> h_normals[i + (j + k * NNB) * nelr];
-          h_normals[i + (j + k * NNB) * nelr] =
-              -h_normals[i + (j + k * NNB) * nelr];
-        }
-      }
-    }
-
-    // fill in remaining data: pad elements nel..nelr-1 with copies of the
-    // last real element (area, neighbour ids and normals)
-    int last = nel - 1;
-    for (int i = nel; i < nelr; i++) {
-      h_areas[i] = h_areas[last];
-      for (int j = 0; j < NNB; j++) {
-        // duplicate the last element
-        h_elements_surrounding_elements[i + j * nelr] =
-            h_elements_surrounding_elements[last + j * nelr];
-        // destination must be element i; writing to `last` would leave the
-        // padded normals uninitialised
-        for (int k = 0; k < NDIM; k++)
-          h_normals[i + (j + k * NNB) * nelr] =
-              h_normals[last + (j + k * NNB) * nelr];
-      }
-    }
-
-    areas = alloc<float>(nelr);
-    upload<float>(areas, h_areas, nelr);
-
-    elements_surrounding_elements = alloc<int>(nelr * NNB);
-    upload<int>(elements_surrounding_elements, h_elements_surrounding_elements,
-                nelr * NNB);
-
-    normals = alloc<float>(nelr * NDIM * NNB);
-    upload<float>(normals, h_normals, nelr * NDIM * NNB);
-
-    delete[] h_areas;
-    delete[] h_elements_surrounding_elements;
-    delete[] h_normals;
-  }
-
-  // Create arrays and set initial conditions
-  float *variables = alloc<float>(nelr * NVAR);
-  initialize_variables(nelr, variables);
-
-  float *old_variables = alloc<float>(nelr * NVAR);
-  float *fluxes = alloc<float>(nelr * NVAR);
-  float *step_factors = alloc<float>(nelr);
-
-  // make sure all memory is really allocated (written once) before we start
-  // timing
-  initialize_variables(nelr, old_variables);
-  initialize_variables(nelr, fluxes);
-  cudaMemset((void *)step_factors, 0, sizeof(float) * nelr);
-  // make sure CUDA isn't still doing something before we start timing
-  cudaThreadSynchronize();
-
-  // these need to be computed the first time in order to compute time step
-  std::cout << "Starting..." << std::endl;
-
-  StopWatchInterface *timer = 0;
-
-  sdkCreateTimer(&timer);
-  sdkStartTimer(&timer);
-  // Begin iterations
-  for (int i = 0; i < iterations; i++) {
-    copy<float>(old_variables, variables, nelr * NVAR);
-
-    // for the first iteration we compute the time step
-    compute_step_factor(nelr, variables, areas, step_factors);
-    getLastCudaError("compute_step_factor failed");
-
-    for (int j = 0; j < RK; j++) {
-      compute_flux(nelr, elements_surrounding_elements, normals, variables,
-                   fluxes);
-      getLastCudaError("compute_flux failed");
-      time_step(j, nelr, old_variables, variables, step_factors, fluxes);
-      getLastCudaError("time_step failed");
-    }
-  }
-
-  cudaThreadSynchronize();
-  sdkStopTimer(&timer);
-
-  std::cout << (sdkGetAverageTimerValue(&timer) / 1000.0) / iterations
-            << " seconds per iteration" << std::endl;
-
-  std::cout << "Saving solution..." << std::endl;
-  dump(variables, nel, nelr);
-  std::cout << "Saved solution..." << std::endl;
-
-  std::cout << "Cleaning up..." << std::endl;
-  dealloc<float>(areas);
-  dealloc<int>(elements_surrounding_elements);
-  dealloc<float>(normals);
-
-  dealloc<float>(variables);
-  dealloc<float>(old_variables);
-  dealloc<float>(fluxes);
-  dealloc<float>(step_factors);
-
-  std::cout << "Done..." << std::endl;
-
-  return 0;
-}
--- a/examples/cfd/run.sh
+++ b/examples/cfd/run.sh
@ -1,15 +0,0 @@
-# # #!/bin/bash
-clang++ euler3d.cu -I/usr/local/cuda-10.1/samples/common/inc --cuda-path=/usr/local/cuda-10.1 --cuda-gpu-arch=sm_50 -L/usr/local/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v
-
-/home/robinhan/repo/open_source_template/build/compilation/kernelTranslator euler3d-cuda-nvptx64-nvidia-cuda-sm_50.bc kernel.bc
-/home/robinhan/repo/open_source_template/build/compilation/hostTranslator euler3d-host-x86_64-unknown-linux-gnu.bc  host.bc
-
-llc --relocation-model=pic --filetype=obj  kernel.bc
-llc --relocation-model=pic --filetype=obj  host.bc
-
-g++ -Wall -L/home/robinhan/repo/open_source_template/build/runtime  -L/home/robinhan/repo/open_source_template/build/runtime/threadPool -o a.out -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread
-
-./a.out ../rodinia-data/cfd/fvcorr.domn.097K
-# ./demo 1024
-# # # ./demo -f ../../data/matrix3.txt
-# # # run -f ../../data/gaussian/matrix3.txt
--- a/examples/dwt2d/common.h
+++ b/examples/dwt2d/common.h
@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2009, Jiri Matela
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef _COMMON_H
-#define _COMMON_H
-
-// 24-bit multiplication is faster on G80,
-// but we must be sure to multiply integers
-// only within [-8M, 8M - 1] range
-#define IMUL(a, b) __mul24(a, b)
-
-////cuda timing macros
-//#define CTIMERINIT  cudaEvent_t cstart, cstop; \
-//                    cudaEventCreate(&cstart); \
-//                    cudaEventCreate(&cstop); \
-//                    float elapsedTime
-//#define CTIMERSTART(cstart) cudaEventRecord(cstart,0)
-//#define CTIMERSTOP(cstop) cudaEventRecord(cstop,0); \
-//                          cudaEventSynchronize(cstop); \
-//                          cudaEventElapsedTime(&elapsedTime, cstart, cstop)
-
-/* divide and round up macro: a / b, plus one when a % b is non-zero (the
-   ceiling of a / b for non-negative a and positive b). Both arguments are
-   expanded more than once, so pass expressions without side effects. */
-#define DIVANDRND(a, b) ((((a) % (b)) != 0) ? ((a) / (b) + 1) : ((a) / (b)))
-/* Reads cudaGetLastError(); on failure prints file, line, msg and the CUDA
-   error string to stderr and calls exit(-1). Expands to a bare brace block;
-   the call sites in components.cu invoke it without a trailing semicolon. */
-#define cudaCheckError(msg)                                                    \
-  {                                                                            \
-    cudaError_t err = cudaGetLastError();                                      \
-    if (cudaSuccess != err) {                                                  \
-      fprintf(stderr, "%s: %i: %s: %s.\n", __FILE__, __LINE__, msg,            \
-              cudaGetErrorString(err));                                        \
-      exit(-1);                                                                \
-    }                                                                          \
-  }
-/* Same check as cudaCheckError, preceded by cudaThreadSynchronize(). */
-#define cudaCheckAsyncError(msg)                                               \
-  {                                                                            \
-    cudaThreadSynchronize();                                                   \
-    cudaCheckError(msg);                                                       \
-  }
-
-#endif
--- a/examples/dwt2d/components.cu
+++ b/examples/dwt2d/components.cu
@ -1,193 +0,0 @@
-/*
- * Copyright (c) 2009, Jiri Matela
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <unistd.h>
-#include <error.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <errno.h>
-#include <assert.h>
-
-#include "components.h"
-#include "common.h"
-
-#define THREADS 256
-
-/* Float overload: writes each of r, g, b as value/255 - 0.5 into slot `pos`
-   of the matching output buffer. */
-__device__ void storeComponents(float *d_r, float *d_g, float *d_b, float r, float g, float b, int pos)
-{
-    *(d_r + pos) = (r/255.0f) - 0.5f;
-    *(d_g + pos) = (g/255.0f) - 0.5f;
-    *(d_b + pos) = (b/255.0f) - 0.5f;
-}
-
-/* Integer overload: writes each of r, g, b as value - 128 into slot `pos` of
-   the matching output buffer. */
-__device__ void storeComponents(int *d_r, int *d_g, int *d_b, int r, int g, int b, int pos)
-{
-    *(d_r + pos) = r - 128;
-    *(d_g + pos) = g - 128;
-    *(d_b + pos) = b - 128;
-}
-
-/* Float overload: writes c/255 - 0.5 into slot `pos` of d_c. */
-__device__ void storeComponent(float *d_c, float c, int pos)
-{
-    *(d_c + pos) = (c/255.0f) - 0.5f;
-}
-
-/* Integer overload: writes c - 128 into slot `pos` of d_c. */
-__device__ void storeComponent(int *d_c, int c, int pos)
-{
-    *(d_c + pos) = c - 128;
-}
-
-/* Copy img src data into three separated component buffers.
-
-   Each block stages THREADS*3 source bytes (THREADS interleaved RGB pixels)
-   in shared memory, then thread x converts pixel x of the block and stores
-   it at index blockDim.x*blockIdx.x + x of d_r, d_g and d_b, provided that
-   index is below `pixels`. The staging layout assumes blockDim.x == THREADS,
-   which is how rgbToComponents launches it. */
-template<typename T>
-__global__ void c_CopySrcToComponents(T *d_r, T *d_g, T *d_b,
-                                  unsigned char * d_src,
-                                  int pixels)
-{
-    int x  = threadIdx.x;
-    int gX = blockDim.x*blockIdx.x;
-
-    /* Staging buffer declared as 32-bit words so the 4-byte copies below are
-       naturally aligned and move plain integers (the data are raw bytes, not
-       floats); it is read back per byte through sData. */
-    __shared__ unsigned int sWords[(THREADS*3)/4];
-    unsigned char *sData = (unsigned char *)sWords;
-
-    /* Copy data to shared mem by 4bytes
-       other checks are not necessary, since
-       d_src buffer is aligned to sharedDataSize */
-    if ( (x*4) < THREADS*3 ) {
-        const unsigned int *s = (const unsigned int *)d_src;
-        sWords[x] = s[((gX*3)>>2) + x];
-    }
-    __syncthreads();
-
-    T r, g, b;
-
-    /* Pixel x of this block occupies three consecutive staged bytes */
-    int offset = x*3;
-    r = (T)(sData[offset]);
-    g = (T)(sData[offset+1]);
-    b = (T)(sData[offset+2]);
-
-    int globalOutputPosition = gX + x;
-    if (globalOutputPosition < pixels) {
-        storeComponents(d_r, d_g, d_b, r, g, b, globalOutputPosition);
-    }
-}
-
-/* Copy single-channel img src data into one component buffer.
-
-   Each block stages THREADS source bytes in shared memory, then thread x
-   converts byte x of the block and stores it at index
-   blockDim.x*blockIdx.x + x of d_c, provided that index is below `pixels`.
-   The staging layout assumes blockDim.x == THREADS, which is how
-   bwToComponent launches it. */
-template<typename T>
-__global__ void c_CopySrcToComponent(T *d_c, unsigned char * d_src, int pixels)
-{
-    int x  = threadIdx.x;
-    int gX = blockDim.x*blockIdx.x;
-
-    /* Staging buffer declared as 32-bit words so the 4-byte copies below are
-       naturally aligned and move plain integers (the data are raw bytes, not
-       floats); it is read back per byte through sData. */
-    __shared__ unsigned int sWords[THREADS/4];
-    unsigned char *sData = (unsigned char *)sWords;
-
-    /* Copy data to shared mem by 4bytes
-       other checks are not necessary, since
-       d_src buffer is aligned to sharedDataSize */
-    if ( (x*4) < THREADS) {
-        const unsigned int *s = (const unsigned int *)d_src;
-        sWords[x] = s[(gX>>2) + x];
-    }
-    __syncthreads();
-
-    T c;
-
-    c = (T)(sData[x]);
-
-    int globalOutputPosition = gX + x;
-    if (globalOutputPosition < pixels) {
-        storeComponent(d_c, c, globalOutputPosition);
-    }
-}
-
-
-/* Separate components of 8bit RGB source image.
-
-   Copies the width*height*3 source bytes from host memory into a temporary
-   device buffer padded (with zeros) to whole thread blocks, launches
-   c_CopySrcToComponents with one block per THREADS pixels, and frees the
-   temporary buffer. d_r, d_g and d_b are passed straight to the kernel.
-   Any CUDA error terminates the process through the cudaCheck* macros. */
-template<typename T>
-void rgbToComponents(T *d_r, T *d_g, T *d_b, unsigned char * src, int width, int height)
-{
-    const int pixels      = width*height;
-    /* 3 bytes per pixel, pixel count rounded up to a multiple of THREADS */
-    const int alignedSize = DIVANDRND(pixels, THREADS) * THREADS * 3;
-    unsigned char * d_src;
-
-    /* Zero-filled staging buffer on the device */
-    cudaMalloc((void **)&d_src, alignedSize);
-    cudaCheckAsyncError("Cuda malloc");
-    cudaMemset(d_src, 0, alignedSize);
-
-    /* Upload only the real pixels; the padding stays zero */
-    cudaMemcpy(d_src, src, pixels*3, cudaMemcpyHostToDevice);
-    cudaCheckError("Copy data to device");
-
-    /* One block per THREADS pixels */
-    assert(alignedSize%(THREADS*3) == 0);
-    const dim3 grid(alignedSize/(THREADS*3));
-    const dim3 threads(THREADS);
-    c_CopySrcToComponents<<<grid, threads>>>(d_r, d_g, d_b, d_src, pixels);
-    cudaCheckAsyncError("CopySrcToComponents kernel");
-
-    /* Release the staging buffer */
-    cudaFree(d_src);
-    cudaCheckAsyncError("Free memory");
-}
-template void rgbToComponents<float>(float *d_r, float *d_g, float *d_b, unsigned char * src, int width, int height);
-template void rgbToComponents<int>(int *d_r, int *d_g, int *d_b, unsigned char * src, int width, int height);
-
-
-/* Copy 8bit source image data into a color component of type T.
-
-   Copies the width*height source bytes from host memory into a temporary
-   device buffer padded (with zeros) to whole thread blocks, launches
-   c_CopySrcToComponent with one block per THREADS pixels, and frees the
-   temporary buffer. d_c is passed straight to the kernel. Any CUDA error
-   terminates the process through the cudaCheck* macros. */
-template<typename T>
-void bwToComponent(T *d_c, unsigned char * src, int width, int height)
-{
-    const int pixels      = width*height;
-    /* 1 byte per pixel, pixel count rounded up to a multiple of THREADS */
-    const int alignedSize = DIVANDRND(pixels, THREADS) * THREADS;
-    unsigned char * d_src;
-
-    /* Zero-filled staging buffer on the device */
-    cudaMalloc((void **)&d_src, alignedSize);
-    cudaCheckAsyncError("Cuda malloc");
-    cudaMemset(d_src, 0, alignedSize);
-
-    /* Upload only the real pixels; the padding stays zero */
-    cudaMemcpy(d_src, src, pixels, cudaMemcpyHostToDevice);
-    cudaCheckError("Copy data to device");
-
-    /* One block per THREADS pixels */
-    assert(alignedSize%(THREADS) == 0);
-    const dim3 grid(alignedSize/(THREADS));
-    const dim3 threads(THREADS);
-    c_CopySrcToComponent<<<grid, threads>>>(d_c, d_src, pixels);
-    cudaCheckAsyncError("CopySrcToComponent kernel");
-
-    /* Release the staging buffer */
-    cudaFree(d_src);
-    cudaCheckAsyncError("Free memory");
-}
-
-template void bwToComponent<float>(float *d_c, unsigned char *src, int width, int height);
-template void bwToComponent<int>(int *d_c, unsigned char *src, int width, int height);
--- a/examples/dwt2d/components.h
+++ b/examples/dwt2d/components.h
@ -1,39 +0,0 @@
-/*
- * Copyright (c) 2009, Jiri Matela
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef _COMPONENTS_H
-#define _COMPONENTS_H
-
-/* Separate components of source 8bit RGB image.
-   src is a host buffer of width*height interleaved RGB pixels (3 bytes per
-   pixel); d_r, d_g and d_b are handed to the device kernel, which writes one
-   value per pixel into each: value/255 - 0.5 for T = float, value - 128 for
-   T = int. components.cu explicitly instantiates float and int. */
-template <typename T>
-void rgbToComponents(T *d_r, T *d_g, T *d_b, unsigned char *src, int width,
-                     int height);
-
-/* Copy 8bit source image data into a color component of type T.
-   src is a host buffer of width*height bytes; d_c is handed to the device
-   kernel, which writes one value per pixel: value/255 - 0.5 for T = float,
-   value - 128 for T = int. components.cu explicitly instantiates float and
-   int. */
-template <typename T>
-void bwToComponent(T *d_c, unsigned char *src, int width, int height);
-
-#endif
--- a/examples/dwt2d/dwt.cu
+++ b/examples/dwt2d/dwt.cu
@ -1,385 +0,0 @@
-/*
- * Copyright (c) 2009, Jiri Matela
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <stdio.h>
-#include <fcntl.h>
-#include <assert.h>
-#include <errno.h>
-#include <sys/time.h>
-#include <unistd.h>
-#include <error.h>
-#include "dwt_cuda/dwt.h"
-#include "dwt_cuda/common.h"
-#include "dwt.h"
-#include "common.h"
-#include <iostream>
-#include <fstream>
-
-/* Forward DWT for float buffers: announces the chosen kernel on stdout and
- * forwards all arguments unchanged to dwt_cuda::fdwt97. */
-inline void fdwt(float *in, float *out, int width, int height, int levels)
-{
-        fputs(" Running fdwt97 Float \n", stdout);
-        dwt_cuda::fdwt97(in, out, width, height, levels);
-}
-/*
-inline void fdwt(float *in, float *out, int width, int height, int levels, float *diffOut)
-{
-        dwt_cuda::fdwt97(in, out, width, height, levels, diffOut);
-}
-*/
-
-
-
-/* Forward DWT for int buffers: announces the chosen kernel on stdout and
- * forwards all arguments unchanged to dwt_cuda::fdwt53. */
-inline void fdwt(int *in, int *out, int width, int height, int levels)
-{
-        fputs(" Running fdwt53 Int \n", stdout);
-        dwt_cuda::fdwt53(in, out, width, height, levels);
-}
-/*
-inline void fdwt(int *in, int *out, int width, int height, int levels, int *diffOut)
-{
-        dwt_cuda::fdwt53(in, out, width, height, levels, diffOut);
-}
-*/
-
-
-
-/* Reverse DWT for float buffers: announces the chosen kernel on stdout and
- * forwards all arguments unchanged to dwt_cuda::rdwt97. */
-inline void rdwt(float *in, float *out, int width, int height, int levels)
-{
-        fputs(" Running rdwt97 Float \n", stdout);
-        dwt_cuda::rdwt97(in, out, width, height, levels);
-}
-
-/* Reverse DWT for int buffers: announces the chosen kernel on stdout and
- * forwards all arguments unchanged to dwt_cuda::rdwt53. */
-inline void rdwt(int *in, int *out, int width, int height, int levels)
-{
-        fputs(" Running rdwt53 Int \n", stdout);
-        dwt_cuda::rdwt53(in, out, width, height, levels);
-}
-
-/* Runs one multi-level 2D DWT on device buffers.
- *
- * in         device input buffer; copied into `backup` before the transform
- * out        device output buffer
- * backup     device buffer receiving pixWidth * pixHeight elements of `in`
- * pixWidth   image width in pixels
- * pixHeight  image height in pixels
- * stages     number of DWT levels, passed through to fdwt()/rdwt()
- * forward    true runs fdwt(), false runs rdwt()
- *
- * Always returns 0.
- */
-template<typename T>
-int nStage2dDWT(T * in, T * out, T * backup, int pixWidth, int pixHeight, int stages, bool forward)
-{
-    /* Report the direction that is actually executed. */
-    printf("\n*** %d stages of 2D %s DWT:\n", stages, forward ? "forward" : "reverse");
-
-    /* Create a backup of the input. The byte count is computed in size_t so
-     * that large images do not overflow int arithmetic. */
-    const size_t size = (size_t)pixHeight * (size_t)pixWidth * sizeof(T);
-    cudaMemcpy(backup, in, size, cudaMemcpyDeviceToDevice);
-    cudaCheckError("Memcopy device to device");
-
-    if(forward)
-        fdwt(in, out, pixWidth, pixHeight, stages);
-    else
-        rdwt(in, out, pixWidth, pixHeight, stages);
-
-    return 0;
-}
-template int nStage2dDWT<float>(float*, float*, float*, int, int, int, bool);
-template int nStage2dDWT<int>(int*, int*, int*, int, int, int, bool);
-
-
-
-/*
-template<typename T>
-int nStage2dDWT(T * in, T * out, T * backup, int pixWidth, int pixHeight, int stages, bool forward, T * diffOut)
-{
-    printf("*** %d stages of 2D forward DWT:\n", stages);
-
-    // create backup of input, because each test iteration overwrites it
-    const int size = pixHeight * pixWidth * sizeof(T);
-    cudaMemcpy(backup, in, size, cudaMemcpyDeviceToDevice);
-    cudaCheckError("Memcopy device to device");
-
-    // Measure time of individual levels.
-    if(forward)
-        fdwt(in, out, pixWidth, pixHeight, stages, diffOut);
-    else
-        rdwt(in, out, pixWidth, pixHeight, stages);
-
-    // Measure overall time of DWT.
-    #ifdef GPU_DWT_TESTING_1
-
-    dwt_cuda::CudaDWTTester tester;
-    for(int i = tester.getNumIterations(); i--; ) {
-        // Recover input and measure one overall DWT run.
-        cudaMemcpy(in, backup, size, cudaMemcpyDeviceToDevice);
-        cudaCheckError("Memcopy device to device");
-        tester.beginTestIteration();
-        if(forward)
-            fdwt(in, out, pixWidth, pixHeight, stages, diffOut);
-        else
-            rdwt(in, out, pixWidth, pixHeight, stages);
-        tester.endTestIteration();
-    }
-    tester.showPerformance("   Overall DWT", pixWidth, pixHeight);
-    #endif  // GPU_DWT_TESTING
-
-    cudaCheckAsyncError("DWT Kernel calls");
-    return 0;
-}
-template int nStage2dDWT<float>(float*, float*, float*, int, int, int, bool, float*);
-template int nStage2dDWT<int>(int*, int*, int*, int, int, int, bool, int*);
-
-*/
-
-/* Converts float samples to bytes and logs every converted value.
- *
- * Each sample is mapped as (src[i] + 0.5) * 255, clamped to [0, 255] and
- * stored in dst[i]; the clamped value is also written as a text line to the
- * file "<filename>.txt".
- *
- * dst         receives samplesNum bytes
- * src         samplesNum input samples
- * samplesNum  number of samples to convert
- * filename    base name of the text log (".txt" is appended)
- */
-void samplesToChar(unsigned char * dst, float * src, int samplesNum, const char * filename)
-{
-    int i;
-    std::ofstream outputFile;
-    /* +1 leaves room for the terminating NUL written by strcpy */
-    char outfile[strlen(filename)+strlen(".txt")+1];
-    strcpy(outfile, filename);
-    strcpy(outfile+strlen(filename), ".txt");
-    outputFile.open(outfile);
-
-    for(i = 0; i < samplesNum; i++) {
-        float r = (src[i]+0.5f) * 255;
-        if (r > 255) r = 255;
-        if (r < 0)   r = 0;
-        dst[i] = (unsigned char)r;
-        outputFile << "index: " << i  << " val: "<< r <<" \n";
-    }
-    outputFile.close();
-}
-
-/* Converts int samples to bytes and logs every converted value.
- *
- * Each sample is mapped as src[i] + 128, clamped to [0, 255] and stored in
- * dst[i]; the clamped value is also written as a text line to the file
- * "<filename>.txt".
- *
- * dst         receives samplesNum bytes
- * src         samplesNum input samples
- * samplesNum  number of samples to convert
- * filename    base name of the text log (".txt" is appended)
- */
-void samplesToChar(unsigned char * dst, int * src, int samplesNum, const char * filename)
-{
-    int i;
-    std::ofstream outputFile;
-    /* +1 leaves room for the terminating NUL written by strcpy */
-    char outfile[strlen(filename)+strlen(".txt")+1];
-    strcpy(outfile, filename);
-    strcpy(outfile+strlen(filename), ".txt");
-    outputFile.open(outfile);
-    for(i = 0; i < samplesNum; i++) {
-        int r = src[i]+128;
-        if (r > 255) r = 255;
-        if (r < 0)   r = 0;
-        dst[i] = (unsigned char)r;
-        /* text log used to check the output */
-        outputFile << "index: " << i  << " val: "<< r <<" \n";
-    }
-    outputFile.close();
-}
-
-/* Write output in linear order.
- *
- * Copies pixWidth * pixHeight samples from device memory to the host,
- * converts them to bytes with samplesToChar() and writes the bytes to the
- * file "<filename><suffix>" (created if missing, truncated if present).
- *
- * Returns 0 on success, -1 if the host byte buffer cannot be allocated or
- * the file cannot be opened, and 1 if not all bytes could be written.
- */
-template<typename T>
-int writeLinear(T *component_cuda, int pixWidth, int pixHeight,
-                const char * filename, const char * suffix)
-{
-    unsigned char * result;
-    T *gpu_output;
-    int fd;
-    ssize_t written;
-    int samplesNum = pixWidth*pixHeight;
-    size_t size = (size_t)samplesNum*sizeof(T);
-
-    cudaMallocHost((void **)&gpu_output, size);
-    cudaCheckError("Malloc host");
-    memset(gpu_output, 0, size);
-    result = (unsigned char *)malloc(samplesNum);
-    if (result == NULL) {
-        cudaFreeHost(gpu_output);
-        return -1;
-    }
-    cudaMemcpy(gpu_output, component_cuda, size, cudaMemcpyDeviceToHost);
-    cudaCheckError("Memcopy device to host");
-
-    /* T to char */
-    samplesToChar(result, gpu_output, samplesNum, filename);
-
-    /* Write component; +1 leaves room for the terminating NUL */
-    char outfile[strlen(filename)+strlen(suffix)+1];
-    strcpy(outfile, filename);
-    strcpy(outfile+strlen(filename), suffix);
-    /* O_TRUNC: do not keep stale bytes of a longer, pre-existing file */
-    fd = open(outfile, O_CREAT|O_WRONLY|O_TRUNC, 0644);
-    if (fd == -1) {
-        /* report first, so the cleanup calls cannot disturb errno */
-        error(0,errno,"cannot access %s", outfile);
-        cudaFreeHost(gpu_output);
-        free(result);
-        return -1;
-    }
-    printf("\nWriting to %s (%d x %d)\n", outfile, pixWidth, pixHeight);
-    written = write(fd, result, samplesNum);
-    close(fd);
-
-    /* Clean up */
-    cudaFreeHost(gpu_output);
-    cudaCheckError("Cuda free host memory");
-    free(result);
-    /* write() returns -1 on error and may write fewer bytes than asked */
-    if (written != (ssize_t)samplesNum) return 1;
-    return 0;
-}
-template int writeLinear<float>(float *component_cuda, int pixWidth, int pixHeight, const char * filename, const char * suffix);
-template int writeLinear<int>(int *component_cuda, int pixWidth, int pixHeight, const char * filename, const char * suffix);
-
-/* Write output in visual order.
- *
- * Copies the pixWidth x pixHeight coefficient buffer from the device and
- * rearranges its band-by-band layout into one image: for every stage the LL
- * area sits in the top-left corner, HL to its right, LH below it and HH
- * diagonally opposite. The image is converted to bytes with samplesToChar()
- * and written to the file "<filename><suffix>" (created if missing,
- * truncated if present).
- *
- * Returns 0 on success, -1 if stages < 1, a host allocation fails or the
- * file cannot be opened, and 1 if not all bytes could be written.
- */
-template<typename T>
-int writeNStage2DDWT(T *component_cuda, int pixWidth, int pixHeight,
-                     int stages, const char * filename, const char * suffix)
-{
-    struct band {
-        int dimX;
-        int dimY;
-    };
-    struct dimensions {
-        struct band LL;
-        struct band HL;
-        struct band LH;
-        struct band HH;
-    };
-
-    unsigned char * result;
-    T *src, *dst;
-    int i,s;
-    int fd;
-    int offset;
-    int yOffset;
-    int ret = 0;
-    ssize_t written;
-    size_t rowBytes;
-    int samplesNum = pixWidth*pixHeight;
-    size_t size = (size_t)samplesNum*sizeof(T);
-    struct dimensions * bandDims;
-
-    /* bandDims[stages-1] is read below, so at least one stage is required */
-    if (stages < 1)
-        return -1;
-
-    bandDims = (struct dimensions *)malloc(stages * sizeof(struct dimensions));
-    dst = (T*)malloc(size);
-    result = (unsigned char *)malloc(samplesNum);
-    if (bandDims == NULL || dst == NULL || result == NULL) {
-        free(bandDims);
-        free(dst);
-        free(result);
-        return -1;
-    }
-
-    /* Band sizes of the first stage are derived from the image size ... */
-    bandDims[0].LL.dimX = DIVANDRND(pixWidth,2);
-    bandDims[0].LL.dimY = DIVANDRND(pixHeight,2);
-    bandDims[0].HL.dimX = pixWidth - bandDims[0].LL.dimX;
-    bandDims[0].HL.dimY = bandDims[0].LL.dimY;
-    bandDims[0].LH.dimX = bandDims[0].LL.dimX;
-    bandDims[0].LH.dimY = pixHeight - bandDims[0].LL.dimY;
-    bandDims[0].HH.dimX = bandDims[0].HL.dimX;
-    bandDims[0].HH.dimY = bandDims[0].LH.dimY;
-
-    /* ... and those of every further stage from the previous LL band. */
-    for (i = 1; i < stages; i++) {
-        bandDims[i].LL.dimX = DIVANDRND(bandDims[i-1].LL.dimX,2);
-        bandDims[i].LL.dimY = DIVANDRND(bandDims[i-1].LL.dimY,2);
-        bandDims[i].HL.dimX = bandDims[i-1].LL.dimX - bandDims[i].LL.dimX;
-        bandDims[i].HL.dimY = bandDims[i].LL.dimY;
-        bandDims[i].LH.dimX = bandDims[i].LL.dimX;
-        bandDims[i].LH.dimY = bandDims[i-1].LL.dimY - bandDims[i].LL.dimY;
-        bandDims[i].HH.dimX = bandDims[i].HL.dimX;
-        bandDims[i].HH.dimY = bandDims[i].LH.dimY;
-    }
-
-    cudaMallocHost((void **)&src, size);
-    cudaCheckError("Malloc host");
-    memset(src, 0, size);
-    memset(dst, 0, size);
-    cudaMemcpy(src, component_cuda, size, cudaMemcpyDeviceToHost);
-    cudaCheckError("Memcopy device to host");
-
-    // LL Band
-    rowBytes = bandDims[stages-1].LL.dimX * sizeof(T);
-    for (i = 0; i < bandDims[stages-1].LL.dimY; i++) {
-        memcpy(dst+i*pixWidth, src+i*bandDims[stages-1].LL.dimX, rowBytes);
-    }
-
-    for (s = stages - 1; s >= 0; s--) {
-        // HL Band
-        rowBytes = bandDims[s].HL.dimX * sizeof(T);
-        offset = bandDims[s].LL.dimX * bandDims[s].LL.dimY;
-        for (i = 0; i < bandDims[s].HL.dimY; i++) {
-            memcpy(dst+i*pixWidth+bandDims[s].LL.dimX,
-                src+offset+i*bandDims[s].HL.dimX,
-                rowBytes);
-        }
-
-        // LH band: it has LH.dimY rows. Using HL.dimY here copied one row
-        // too many whenever the band height was odd.
-        rowBytes = bandDims[s].LH.dimX * sizeof(T);
-        offset += bandDims[s].HL.dimX * bandDims[s].HL.dimY;
-        yOffset = bandDims[s].LL.dimY;
-        for (i = 0; i < bandDims[s].LH.dimY; i++) {
-            memcpy(dst+(yOffset+i)*pixWidth,
-                src+offset+i*bandDims[s].LH.dimX,
-                rowBytes);
-        }
-
-        //HH band
-        rowBytes = bandDims[s].HH.dimX * sizeof(T);
-        offset += bandDims[s].LH.dimX * bandDims[s].LH.dimY;
-        yOffset = bandDims[s].HL.dimY;
-        for (i = 0; i < bandDims[s].HH.dimY; i++) {
-            memcpy(dst+(yOffset+i)*pixWidth+bandDims[s].LH.dimX,
-                src+offset+i*bandDims[s].HH.dimX,
-                rowBytes);
-        }
-    }
-
-    /* Write component */
-    samplesToChar(result, dst, samplesNum, filename);
-
-    /* +1 leaves room for the terminating NUL */
-    char outfile[strlen(filename)+strlen(suffix)+1];
-    strcpy(outfile, filename);
-    strcpy(outfile+strlen(filename), suffix);
-    /* O_TRUNC: do not keep stale bytes of a longer, pre-existing file */
-    fd = open(outfile, O_CREAT|O_WRONLY|O_TRUNC, 0644);
-    if (fd == -1) {
-        error(0,errno,"cannot access %s", outfile);
-        ret = -1;
-    } else {
-        printf("\nWriting to %s (%d x %d)\n", outfile, pixWidth, pixHeight);
-        written = write(fd, result, samplesNum);
-        close(fd);
-        /* write() returns -1 on error and may write fewer bytes than asked */
-        if (written != (ssize_t)samplesNum) ret = 1;
-    }
-
-    /* Buffers are released on every path, including the open() failure. */
-    cudaFreeHost(src);
-    cudaCheckError("Cuda free host memory");
-    free(dst);
-    free(result);
-    free(bandDims);
-    return ret;
-}
-template int writeNStage2DDWT<float>(float *component_cuda, int pixWidth, int pixHeight, int stages, const char * filename, const char * suffix);
-template int writeNStage2DDWT<int>(int *component_cuda, int pixWidth, int pixHeight, int stages, const char * filename, const char * suffix);
--- a/examples/dwt2d/dwt.h
+++ b/examples/dwt2d/dwt.h
@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2009, Jiri Matela
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef _DWT_H
-#define _DWT_H
-
-template <typename T>
-int nStage2dDWT(T *in, T *out, T *backup, int pixWidth, int pixHeight,
-                int stages, bool forward);
-
-template <typename T>
-int writeNStage2DDWT(T *component_cuda, int width, int height, int stages,
-                     const char *filename, const char *suffix);
-template <typename T>
-int writeLinear(T *component_cuda, int width, int height, const char *filename,
-                const char *suffix);
-
-#endif
--- a/examples/dwt2d/dwt_cuda/common.cu
+++ b/examples/dwt2d/dwt_cuda/common.cu
@ -1,35 +0,0 @@
-///
-/// @file    common.cu
-/// @author  Martin Jirman (207962@mail.muni.cz)
-/// @date    2011-01-20 14:37
-///
-/// Copyright (c) 2011 Martin Jirman
-/// All rights reserved.
-///
-/// Redistribution and use in source and binary forms, with or without
-/// modification, are permitted provided that the following conditions are met:
-///
-///     * Redistributions of source code must retain the above copyright
-///       notice, this list of conditions and the following disclaimer.
-///     * Redistributions in binary form must reproduce the above copyright
-///       notice, this list of conditions and the following disclaimer in the
-///       documentation and/or other materials provided with the distribution.
-///
-/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-/// POSSIBILITY OF SUCH DAMAGE.
-///
-
-#include "common.h"
-
-/// Storage for the flag shared by all CudaDWTTester objects; starts as false.
-bool dwt_cuda::CudaDWTTester::testRunning = false;
--- a/examples/dwt2d/dwt_cuda/common.h
+++ b/examples/dwt2d/dwt_cuda/common.h
@ -1,232 +0,0 @@
-///
-/// @file    common.h
-/// @author  Martin Jirman (207962@mail.muni.cz)
-/// @brief   Common stuff for all CUDA dwt functions.
-/// @date    2011-01-20 14:19
-///
-/// Copyright (c) 2011 Martin Jirman
-/// All rights reserved.
-///
-/// Redistribution and use in source and binary forms, with or without
-/// modification, are permitted provided that the following conditions are met:
-///
-///     * Redistributions of source code must retain the above copyright
-///       notice, this list of conditions and the following disclaimer.
-///     * Redistributions in binary form must reproduce the above copyright
-///       notice, this list of conditions and the following disclaimer in the
-///       documentation and/or other materials provided with the distribution.
-///
-/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-/// POSSIBILITY OF SUCH DAMAGE.
-///
-
-#ifndef DWT_COMMON_H
-#define DWT_COMMON_H
-
-#include <algorithm>
-#include <cstdio>
-#include <vector>
-
-// compile time minimum macro
-#define CTMIN(a, b) (((a) < (b)) ? (a) : (b))
-
-// Performance testing macros.
-// With GPU_DWT_TESTING defined, PERF_BEGIN opens a scope holding a
-// CudaDWTTester and a loop over getNumIterations() iterations, starting one
-// test iteration per pass; PERF_END ends the iteration, closes the loop,
-// prints the timings via showPerformance() and closes the scope. The two
-// macros must therefore always be used as a pair around the measured code.
-// Without GPU_DWT_TESTING both expand to nothing, so the enclosed code runs
-// exactly once.
-#if defined(GPU_DWT_TESTING)
-#define PERF_BEGIN                                                             \
-  {                                                                            \
-    dwt_cuda::CudaDWTTester PERF_TESTER;                                       \
-    for (int PERF_N = PERF_TESTER.getNumIterations(); PERF_N--;) {             \
-      PERF_TESTER.beginTestIteration();
-
-#define PERF_END(PERF_NAME, PERF_W, PERF_H)                                    \
-  PERF_TESTER.endTestIteration();                                              \
-  }                                                                            \
-  PERF_TESTER.showPerformance(PERF_NAME, PERF_W, PERF_H);                      \
-  }
-#else // GPU_DWT_TESTING
-#define PERF_BEGIN
-#define PERF_END(PERF_NAME, PERF_W, PERF_H)
-#endif // GPU_DWT_TESTING
-
-namespace dwt_cuda {
-
-/// Divides n by d and rounds the quotient up when the division leaves a
-/// remainder.
-/// @param n  dividend
-/// @param d  divisor
-/// @return n / d, plus one if n % d is nonzero
-template <typename T>
-__device__ __host__ inline T divRndUp(const T &n, const T &d) {
-  const bool hasRemainder = (n % d) ? true : false;
-  return (n / d) + (hasRemainder ? 1 : 0);
-}
-
-// 9/7 forward DWT lifting schema coefficients
-const float f97Predict1 = -1.586134342;  ///< forward 9/7 predict 1
-const float f97Update1 = -0.05298011854; ///< forward 9/7 update 1
-const float f97Predict2 = 0.8829110762;  ///< forward 9/7 predict 2
-const float f97Update2 = 0.4435068522;   ///< forward 9/7 update 2
-
-// 9/7 reverse DWT lifting schema coefficients
-const float r97update2 = -f97Update2;   ///< undo 9/7 update 2
-const float r97predict2 = -f97Predict2; ///< undo 9/7 predict 2
-const float r97update1 = -f97Update1;   ///< undo 9/7 update 1
-const float r97Predict1 = -f97Predict1; ///< undo 9/7 predict 1
-
-// FDWT 9/7 scaling coefficients
-const float scale97Mul = 1.23017410491400f;
-const float scale97Div = 1.0 / scale97Mul;
-
-// 5/3 forward DWT lifting schema coefficients
-const float forward53Predict = -0.5f; /// forward 5/3 predict
-const float forward53Update = 0.25f;  /// forward 5/3 update
-
-// 5/3 reverse DWT lifting schema coefficients
-const float reverse53Update = -forward53Update;   /// undo 5/3 update
-const float reverse53Predict = -forward53Predict; /// undo 5/3 predict
-
-/// Functor which adds the scaled sum of both neighbors to a central pixel.
-struct AddScaledSum {
-  const float scale; // factor applied to the sum of the two neighbors
-
-  /// @param neighborScale  factor stored in `scale`
-  __device__ AddScaledSum(const float neighborScale) : scale(neighborScale) {}
-
-  /// Updates the central pixel in place: c += scale * (p + n).
-  /// @param p  previous neighbor
-  /// @param c  central pixel (modified)
-  /// @param n  next neighbor
-  __device__ void operator()(const float p, float &c, const float n) const {
-    const float neighborSum = p + n;
-    c += scale * neighborSum;
-  }
-};
-
-/// Maps threadIdx.x to a parity-separated index: threads of the first half
-/// receive the even indices, threads of the second half the odd ones, and no
-/// two threads receive the same index.
-/// Example: (for 8 threads)   threadIdx.x:   0  1  2  3  4  5  6  7
-///                              parityIdx:   0  2  4  6  1  3  5  7
-/// @tparam THREADS  total count of participating threads
-/// @return parity-separated index of thread
-template <int THREADS> __device__ inline int parityIdx() {
-  // 0 for threads of the first half, 1 for threads of the second half
-  const unsigned int upperHalf = threadIdx.x / (THREADS / 2);
-  const unsigned int doubled = threadIdx.x * 2;
-  return doubled - (THREADS - 1) * upperHalf;
-}
-
-/// size of shared memory
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
-const int SHM_SIZE = 48 * 1024;
-#else
-const int SHM_SIZE = 16 * 1024;
-#endif
-
-/// Performance and return code tester.
-/// A tester constructed while another test is running is disabled: it asks
-/// for a single iteration and neither records nor prints timings.
-class CudaDWTTester {
-private:
-  static bool testRunning;  ///< true if any test is currently running
-  cudaEvent_t beginEvent;   ///< begin CUDA event
-  cudaEvent_t endEvent;     ///< end CUDA event
-  std::vector<float> times; ///< collected times
-  const bool disabled;      ///< true if this object is disabled
-public:
-  /// Checks CUDA related error. Errors are only reported when
-  /// GPU_DWT_TESTING is defined and no test is running.
-  /// @param status   return code to be checked
-  /// @param message  message to be shown if there was an error
-  /// @return true if there was no error, false otherwise
-  static bool check(const cudaError_t &status, const char *message) {
-#if defined(GPU_DWT_TESTING)
-    if ((!testRunning) && status != cudaSuccess) {
-      const char *errorString = cudaGetErrorString(status);
-      fprintf(stderr, "CUDA ERROR: '%s': %s\n", message, errorString);
-      fflush(stderr);
-      return false;
-    }
-#endif // GPU_DWT_TESTING
-    return true;
-  }
-
-  /// Checks last kernel call for errors.
-  /// @param message  description of the kernel call
-  /// @return true if there was no error, false otherwise
-  static bool checkLastKernelCall(const char *message) {
-#if defined(GPU_DWT_TESTING)
-    return testRunning ? true : check(cudaThreadSynchronize(), message);
-#else  // GPU_DWT_TESTING
-    return true;
-#endif // GPU_DWT_TESTING
-  }
-
-  /// Initializes DWT tester for time measurement
-  CudaDWTTester() : disabled(testRunning) {}
-
-  /// Gets preferred number of iterations
-  int getNumIterations() { return disabled ? 1 : 31; }
-
-  /// Starts one test iteration.
-  void beginTestIteration() {
-    if (!disabled) {
-      cudaEventCreate(&beginEvent);
-      cudaEventCreate(&endEvent);
-      cudaEventRecord(beginEvent, 0);
-      testRunning = true;
-    }
-  }
-
-  /// Ends one test iteration and stores its elapsed time.
-  void endTestIteration() {
-    if (!disabled) {
-      float time;
-      testRunning = false;
-      cudaEventRecord(endEvent, 0);
-      cudaEventSynchronize(endEvent);
-      cudaEventElapsedTime(&time, beginEvent, endEvent);
-      cudaEventDestroy(beginEvent);
-      cudaEventDestroy(endEvent);
-      times.push_back(time);
-    }
-  }
-
-  /// Shows brief info about all iterations. Prints nothing if this object
-  /// is disabled or no iteration has been recorded.
-  /// @param name   name of processing method
-  /// @param sizeX  width of processed image
-  /// @param sizeY  height of processed image
-  void showPerformance(const char *name, const int sizeX, const int sizeY) {
-    // Without recorded times the median/max lookups below would index an
-    // empty vector and the mean would divide by zero.
-    if (!disabled && !times.empty()) {
-      // compute mean and median
-      std::sort(times.begin(), times.end());
-      double sum = 0;
-      for (int i = times.size(); i--;) {
-        sum += times[i];
-      }
-      const double median =
-          (times[times.size() / 2] + times[(times.size() - 1) / 2]) * 0.5f;
-      printf("  %s:   %7.3f ms (mean)   %7.3f ms (median)   %7.3f ms (max)  "
-             "(%d x %d)\n",
-             name, (sum / times.size()), median, times[times.size() - 1], sizeX,
-             sizeY);
-    }
-  }
-};
-
-/// Device-to-device cudaMemcpy of an sx * sy image, wrapped in the
-/// PERF_BEGIN / PERF_END performance tester; the resulting status is passed
-/// to CudaDWTTester::check.
-/// @param dest  destination buffer
-/// @param src   source buffer
-/// @param sx    width of copied image
-/// @param sy    height of copied image
-template <typename T>
-inline void memCopy(T *const dest, const T *const src, const size_t sx,
-                    const size_t sy) {
-  const size_t byteCount = sx * sy * sizeof(T);
-  cudaError_t status;
-  PERF_BEGIN
-  status = cudaMemcpy(dest, src, byteCount, cudaMemcpyDeviceToDevice);
-  PERF_END("        memcpy", sx, sy)
-  CudaDWTTester::check(status, "memcpy device > device");
-}
-} // end of namespace dwt_cuda
-
-#endif // DWT_COMMON_H
--- a/examples/dwt2d/dwt_cuda/dwt.h
+++ b/examples/dwt2d/dwt_cuda/dwt.h
@ -1,103 +0,0 @@
-///
-/// @file    dwt.h
-/// @author  Martin Jirman (207962@mail.muni.cz)
-/// @brief   Entry points for CUDA implementation of 9/7 and 5/3 DWT.
-/// @date    2011-01-20 11:41
-///
-///
-///
-/// Copyright (c) 2011 Martin Jirman
-/// All rights reserved.
-///
-/// Redistribution and use in source and binary forms, with or without
-/// modification, are permitted provided that the following conditions are met:
-///
-///     * Redistributions of source code must retain the above copyright
-///       notice, this list of conditions and the following disclaimer.
-///     * Redistributions in binary form must reproduce the above copyright
-///       notice, this list of conditions and the following disclaimer in the
-///       documentation and/or other materials provided with the distribution.
-///
-/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-/// POSSIBILITY OF SUCH DAMAGE.
-///
-///
-///
-/// Following conditions are common for all four DWT functions:
-/// - Both input and output images are stored in GPU memory with no padding
-///   of lines or interleaving of pixels.
-/// - DWT coefficients are stored as follows: Each band is saved as one
-///   consecutive chunk (no padding/stride/interleaving). Deepest level bands
-///   (smallest ones) are stored first (at the beginning of the input/output
-///   buffers), less deep bands follow. There is no padding between stored
-///   bands in the buffer. Order of bands of the same level in the buffer is
-///   following: Low-low band (or deeper level subbands) is stored first.
-///   Vertical-low/horizontal-high band follows. Vertical-high/horizontal-low
-///   band is saved next and finally, the high-high band is saved. Out of all
-///   low-low bands, only the deepest one is saved (right at the beginning of
-///   the buffer), others are replaced with deeper level subbands.
-/// - Input images of all functions won't be preserved (will be overwritten).
-/// - Input and output buffers can't overlap.
-/// - Size of output buffer must be greater or equal to size of input buffer.
-///
-/// There are no common compile time settings (buffer size, etc...) for
-/// all DWTs, because each DWT type needs a different amount of GPU resources.
-/// Instead, each DWT type has its own compile time settings, which can be
-/// found in *.cu file, where it is implemented.
-///
-
-#ifndef DWT_CUDA_H
-#define DWT_CUDA_H
-
-namespace dwt_cuda {
-
-/// Forward 5/3 2D DWT. See common rules (above) for more details.
-/// @param in      Expected to be normalized into range [-128, 127].
-///                Will not be preserved (will be overwritten).
-/// @param out     output buffer on GPU
-/// @param sizeX   width of input image (in pixels)
-/// @param sizeY   height of input image (in pixels)
-/// @param levels  number of recursive DWT levels
-void fdwt53(int *in, int *out, int sizeX, int sizeY, int levels);
-
-/// Reverse 5/3 2D DWT. See common rules (above) for more details.
-/// @param in      Input DWT coefficients. Format described in common rules.
-///                Will not be preserved (will be overwritten).
-/// @param out     output buffer on GPU - will contain original image
-///                in normalized range [-128, 127].
-/// @param sizeX   width of input image (in pixels)
-/// @param sizeY   height of input image (in pixels)
-/// @param levels  number of recursive DWT levels
-void rdwt53(int *in, int *out, int sizeX, int sizeY, int levels);
-
-/// Forward 9/7 2D DWT. See common rules (above) for more details.
-/// @param in      Input image samples. Should be normalized (in range
-///                [-0.5, 0.5]). Will not be preserved (will be overwritten).
-/// @param out     output buffer on GPU - format specified in common rules
-/// @param sizeX   width of input image (in pixels)
-/// @param sizeY   height of input image (in pixels)
-/// @param levels  number of recursive DWT levels
-void fdwt97(float *in, float *out, int sizeX, int sizeY, int levels);
-
-/// Reverse 9/7 2D DWT. See common rules (above) for more details.
-/// @param in      Input DWT coefficients. Format described in common rules.
-///                Will not be preserved (will be overwritten).
-/// @param out     output buffer on GPU - will contain original image
-///                in normalized range [-0.5, 0.5].
-/// @param sizeX   width of input image (in pixels)
-/// @param sizeY   height of input image (in pixels)
-/// @param levels  number of recursive DWT levels
-void rdwt97(float *in, float *out, int sizeX, int sizeY, int levels);
-
-} // namespace dwt_cuda
-
-#endif // DWT_CUDA_H
--- a/examples/dwt2d/dwt_cuda/fdwt53.cu
+++ b/examples/dwt2d/dwt_cuda/fdwt53.cu
@ -1,400 +0,0 @@
-/// @file    fdwt53.cu
-/// @brief   CUDA implementation of forward 5/3 2D DWT.
-/// @author  Martin Jirman (207962@mail.muni.cz)
-/// @date    2011-02-04 13:23
-///
-///
-/// Copyright (c) 2011 Martin Jirman
-/// All rights reserved.
-///
-/// Redistribution and use in source and binary forms, with or without
-/// modification, are permitted provided that the following conditions are met:
-///
-///     * Redistributions of source code must retain the above copyright
-///       notice, this list of conditions and the following disclaimer.
-///     * Redistributions in binary form must reproduce the above copyright
-///       notice, this list of conditions and the following disclaimer in the
-///       documentation and/or other materials provided with the distribution.
-///
-/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-/// POSSIBILITY OF SUCH DAMAGE.
-///
-
-
-#include "common.h"
-#include "transform_buffer.h"
-#include "io.h"
-
-namespace dwt_cuda {
-
-
-  /// Wraps buffer and methods needed for computing one level of 5/3 FDWT
-  /// using sliding window approach.
-  /// @tparam WIN_SIZE_X  width of sliding window
-  /// @tparam WIN_SIZE_Y  height of sliding window
-  template <int WIN_SIZE_X, int WIN_SIZE_Y>
-  class FDWT53 {
-  private:
-
-    /// Info needed for processing of one input column.
-    /// @tparam CHECKED_LOADER  true if column's loader should check boundaries
-    ///                         false if there are no near boundaries to check
-    template <bool CHECKED_LOADER>
-    struct FDWT53Column {
-      /// loader for the column
-      VerticalDWTPixelLoader<int, CHECKED_LOADER> loader;
-
-      /// offset of the column in shared buffer
-      int offset;
-
-      /// backup of first 3 loaded pixels (not transformed)
-      int pixel0, pixel1, pixel2;
-
-      /// Zeroes the scalar fields and clears the loader. The values are
-      /// placeholders; this only prevents 'uninitialized' warnings.
-      __device__ void clear() {
-        offset = pixel0 = pixel1 = pixel2 = 0;
-        loader.clear();
-      }
-    };
-
-
-    /// Type of shared memory buffer for 5/3 FDWT transforms.
-    typedef TransformBuffer<int, WIN_SIZE_X, WIN_SIZE_Y + 3, 2> FDWT53Buffer;
-
-    /// Actual shared buffer used for forward 5/3 DWT.
-    FDWT53Buffer buffer;
-
-    /// Difference between indices of two vertical neighbors in buffer.
-    enum { STRIDE = FDWT53Buffer::VERTICAL_STRIDE };
-
-
-    /// Forward 5/3 DWT predict operation: subtracts half of (p + n) from c.
-    struct Forward53Predict {
-      __device__ void operator() (const int p, int & c, const int n) const {
-        // NOTE(review): integer '/' truncates toward zero, while T.800 writes
-        // this step with floor(); they differ when (p + n) is negative - confirm.
-        c -= (p + n) / 2;      // F.8, page 126, ITU-T Rec. T.800 final draft
-      }
-    };
-
-
-    /// Forward 5/3 DWT update operation: adds (p + n + 2) / 4 to c.
-    struct Forward53Update {
-      __device__ void operator() (const int p, int & c, const int n) const {
-        // NOTE(review): integer '/' truncates toward zero, while T.800 writes
-        // this step with floor(); they differ for negative sums - confirm.
-        c += (p + n + 2) / 4;  // F.9, page 126, ITU-T Rec. T.800 final draft
-      }
-    };
-
-
-    /// Initializes one column: computes offset of the column in shared memory
-    /// buffer, initializes loader and finally uses it to load first 3 pixels.
-    /// @tparam CHECKED  true if loader of the column checks boundaries
-    /// @param column    (uninitialized) column info to be initialized
-    /// @param input     input image
-    /// @param sizeX     width of the input image
-    /// @param sizeY     height of the input image
-    /// @param colIndex  x-axis coordinate of the column (relative to the left
-    ///                  side of this threadblock's block of input pixels)
-    /// @param firstY    y-axis coordinate of first image row to be transformed
-
-	template <bool CHECKED>
-    __device__ void initColumn(FDWT53Column<CHECKED> & column,
-                               const int * const input,
-                               const int sizeX, const int sizeY,
-                               const int colIndex, const int firstY) {
-      // get offset of the column with index 'colIndex'
-      column.offset = buffer.getColumnOffset(colIndex);
-
-      // x-coordinate (in the image) of the first pixel to be loaded
-      const int firstX = blockIdx.x * WIN_SIZE_X + colIndex;
-
-      if(blockIdx.y == 0) {
-        // topmost block - apply mirroring rules when loading first 3 rows
-        column.loader.init(sizeX, sizeY, firstX, firstY);
-
-        // load pixels in mirrored way (stored in reverse order: 2, 1, 0)
-        column.pixel2 = column.loader.loadFrom(input);  // loaded pixel #0
-        column.pixel1 = column.loader.loadFrom(input);  // loaded pixel #1
-        column.pixel0 = column.loader.loadFrom(input);  // loaded pixel #2
-
-        // reinitialize loader to start with pixel #1 again
-        column.loader.init(sizeX, sizeY, firstX, firstY + 1);
-      } else {
-        // non-topmost block - regular loading, starting 2 rows above firstY:
-        column.loader.init(sizeX, sizeY, firstX, firstY - 2);
-
-        // load 3 rows into the column
-        column.pixel0 = column.loader.loadFrom(input);
-        column.pixel1 = column.loader.loadFrom(input);
-        column.pixel2 = column.loader.loadFrom(input);
-        // Now, the next pixel, which will be loaded by loader, is pixel #1
-        // (row firstY + 1), same as in the topmost-block branch.
-      }
-
-	}
-
-
-    /// Loads and vertically transforms given column. Assumes that first 3
-    /// pixels are already loaded in column fields pixel0 ... pixel2.
-    /// On return, pixel0 ... pixel2 hold the (untransformed) last 3 rows of
-    /// the window, ready for the next call.
-    /// @tparam CHECKED  true if loader of the column checks boundaries
-    /// @param column    column to be loaded and vertically transformed
-    /// @param input     pointer to input image data
-    template <bool CHECKED>
-    __device__ void loadAndVerticallyTransform(FDWT53Column<CHECKED> & column,
-                                               const int * const input) {
-	  // take 3 loaded pixels and put them into shared memory transform buffer
-      buffer[column.offset + 0 * STRIDE] = column.pixel0;
-      buffer[column.offset + 1 * STRIDE] = column.pixel1;
-      buffer[column.offset + 2 * STRIDE] = column.pixel2;
-
-      // load remaining WIN_SIZE_Y pixels (buffer rows 3 .. WIN_SIZE_Y + 2)
-      // to be able to vertically transform the window
-
-      for(int i = 3; i < (3 + WIN_SIZE_Y); i++)
-      {
-        buffer[column.offset + i * STRIDE] = column.loader.loadFrom(input);
-      }
-
-      // remember last 3 pixels for use in next iteration (read before the
-      // transform below, so they are still untransformed)
-      column.pixel0 = buffer[column.offset + (WIN_SIZE_Y + 0) * STRIDE];
-      column.pixel1 = buffer[column.offset + (WIN_SIZE_Y + 1) * STRIDE];
-      column.pixel2 = buffer[column.offset + (WIN_SIZE_Y + 2) * STRIDE];
-
-      // vertically transform the column in transform buffer:
-      // predict on odd items first, then update on even items
-	  buffer.forEachVerticalOdd(column.offset, Forward53Predict());
-      buffer.forEachVerticalEven(column.offset, Forward53Update());
-
-    }
-
-
-    /// Actual implementation of 5/3 FDWT.
-    /// @tparam CHECK_LOADS   true if input loader must check boundaries
-    /// @tparam CHECK_WRITES  true if output writer must check boundaries
-    /// @param in        input image
-    /// @param out       output buffer
-    /// @param sizeX     width of the input image
-    /// @param sizeY     height of the input image
-    /// @param winSteps  number of sliding window steps
-    template <bool CHECK_LOADS, bool CHECK_WRITES>
-    __device__ void transform(const int * const in, int * const out,
-                              const int sizeX, const int sizeY,
-                              const int winSteps) {
-      // info about one main and one boundary columns processed by this thread
-      FDWT53Column<CHECK_LOADS> column;
-      FDWT53Column<CHECK_LOADS> boundaryColumn;  // only few threads use this
-
-      // Initialize all column info: initialize loaders, compute offset of
-      // column in shared buffer and load the first 3 pixels of the column.
-      const int firstY = blockIdx.y * WIN_SIZE_Y * winSteps;
-	    initColumn(column, in, sizeX, sizeY, threadIdx.x, firstY);
-
-
-      // first 3 threads initialize boundary columns, others do not use them
-      boundaryColumn.clear();
-      if(threadIdx.x < 3) {
-        // index of boundary column (relative x-axis coordinate of the column):
-        // thread 0 -> WIN_SIZE_X (right of the window),
-        // threads 1 and 2 -> -2 and -1 (left of the window)
-        const int colId = threadIdx.x + ((threadIdx.x == 0) ? WIN_SIZE_X : -3);
-
-        // initialize the column
-        initColumn(boundaryColumn, in, sizeX, sizeY, colId, firstY);
-
-      }
-
-
-      // index of column which will be written into output by this thread
-	  const int outColumnIndex = parityIdx<WIN_SIZE_X>();
-
-      // offset of column which will be written by this thread into output
-      const int outColumnOffset = buffer.getColumnOffset(outColumnIndex);
-
-      // initialize output writer for this thread
-      const int outputFirstX = blockIdx.x * WIN_SIZE_X + outColumnIndex;
-      VerticalDWTBandWriter<int, CHECK_WRITES> writer;
-	    writer.init(sizeX, sizeY, outputFirstX, firstY);
-			__syncthreads();
-
-
-      // Sliding window iterations:
-      // Each iteration assumes that first 3 pixels of each column are loaded.
-     for(int w = 0; w < winSteps; w++) {
-
-	 // For each column (including boundary columns): load and vertically
-        // transform another WIN_SIZE_Y lines.
-        loadAndVerticallyTransform(column, in);
-        if(threadIdx.x < 3) {
-          loadAndVerticallyTransform(boundaryColumn, in);
-        }
-
-        // wait for all columns to be vertically transformed and transform all
-        // output rows horizontally
-        __syncthreads();
-
-
-		buffer.forEachHorizontalOdd(2, WIN_SIZE_Y, Forward53Predict());
-        __syncthreads();
-
-        buffer.forEachHorizontalEven(2, WIN_SIZE_Y, Forward53Update());
-
-        // wait for all output rows to be transformed horizontally and write
-        // them into output buffer
-        __syncthreads();
-
-
-        // output rows are buffer rows 2 .. WIN_SIZE_Y + 1, taken in pairs
-        for(int r = 2; r < (2 + WIN_SIZE_Y); r += 2) {
-          // Write low coefficients from output column into low band ...
-			writer.writeLowInto(out, buffer[outColumnOffset + r * STRIDE]);
-          // ... and high coefficients into the high band.
-			writer.writeHighInto(out, buffer[outColumnOffset + (r+1) * STRIDE]);
-        }
-
-        // before proceeding to next iteration, wait for all output columns
-        // to be written into the output
-        __syncthreads();
-
-	    }
-
-    }
-
-
-  public:
-    /// Determines, whether this block's pixels touch boundary and selects
-    /// right version of algorithm according to it - for many threadblocks, it
-    /// selects version which does not deal with boundary mirroring and thus is
-    /// slightly faster.
-    /// @param in     input image
-    /// @param out    output buffer
-    /// @param sx     width of the input image
-    /// @param sy     height of the input image
-    /// @param steps  number of sliding window steps
-    __device__ static void run(const int * const in, int * const out,
-                               const int sx, const int sy, const int steps) {
-      // (__shared__: one instance per threadblock, used by all of its threads)
-      // object with transform buffer in shared memory
-      __shared__ FDWT53<WIN_SIZE_X, WIN_SIZE_Y> fdwt53;
-
-	  // Compute limits of this threadblock's block of pixels and use them to
-      // determine, whether this threadblock will have to deal with boundary.
-      // (1 in next expressions is the margin used for the 5/3 FDWT.)
-      const int maxX = (blockIdx.x + 1) * WIN_SIZE_X + 1;
-      const int maxY = (blockIdx.y + 1) * WIN_SIZE_Y * steps + 1;
-      const bool atRightBoudary = maxX >= sx;
-      const bool atBottomBoudary = maxY >= sy;
-
-      // Select specialized version of code according to distance of this
-      // threadblock's pixels from image boundary.
-
-      // if(threadIdx.x == 0) {
-      //   printf("fdwt53 run");
-      // }
-      if(atBottomBoudary)
-      {
-        // near bottom boundary => check both writing and reading
-        fdwt53.transform<true, true>(in, out, sx, sy, steps);
-      } else if(atRightBoudary)
-      {
-        // near right boundary only => check writing only
-        fdwt53.transform<false, true>(in, out, sx, sy, steps);
-      } else
-      {
-        // no nearby boundary => check nothing
-        fdwt53.transform<false, false>(in, out, sx, sy, steps);
-      }
-    }
-
-
-  }; // end of class FDWT53
-
-
-
-  /// Main GPU 5/3 FDWT entry point (kernel). Only forwards its arguments
-  /// to FDWT53<WIN_SX, WIN_SY>::run.
-  /// @tparam WIN_SX   width of sliding window to be used
-  /// @tparam WIN_SY   height of sliding window to be used
-  /// @param input     input image
-  /// @param output    output buffer
-  /// @param sizeX     width of the input image
-  /// @param sizeY     height of the input image
-  /// @param winSteps  number of sliding window steps
-  template <int WIN_SX, int WIN_SY>
-  __launch_bounds__(WIN_SX, CTMIN(SHM_SIZE/sizeof(FDWT53<WIN_SX, WIN_SY>), 8))
-  __global__ void fdwt53Kernel(const int * const input, int * const output,
-                               const int sizeX, const int sizeY,
-                               const int winSteps) {
-    FDWT53<WIN_SX, WIN_SY>::run(input, output, sizeX, sizeY, winSteps);
-  }
-
-
-
-  /// Only computes optimal number of sliding window steps,
-  /// number of threadblocks and then launches the 5/3 FDWT kernel.
-  /// Also prints the chosen step count and grid size to stdout.
-  /// @tparam WIN_SX  width of sliding window
-  /// @tparam WIN_SY  height of sliding window
-  /// @param in       input image
-  /// @param out      output buffer
-  /// @param sx       width of the input image
-  /// @param sy       height of the input image
-  template <int WIN_SX, int WIN_SY>
-  void launchFDWT53Kernel (int * in, int * out, int sx, int sy) {
-    // compute optimal number of steps of each sliding window
-
-    const int steps = divRndUp(sy, 15 * WIN_SY);
-
-	int gx = divRndUp(sx, WIN_SX);
-	int gy = divRndUp(sy, WIN_SY * steps);
-
-	printf("\n sliding steps = %d , gx = %d , gy = %d \n", steps, gx, gy);
-
-    // prepare grid size (same expressions as gx and gy above)
-    dim3 gSize(divRndUp(sx, WIN_SX), divRndUp(sy, WIN_SY * steps));
-    // printf("\n globalx=%d, globaly=%d, blocksize=%d\n", gSize.x, gSize.y, WIN_SX);
-
-    // run kernel with one thread per window column (WIN_SX threads per block);
-    // time measurement and the post-launch check are currently disabled
-    // PERF_BEGIN
-    fdwt53Kernel<WIN_SX, WIN_SY><<<gSize, WIN_SX>>>(in, out, sx, sy, steps);
-    // PERF_END("        FDWT53", sx, sy)
-    // CudaDWTTester::checkLastKernelCall("FDWT 5/3 kernel");
-    // NOTE(review): CUDA launches are asynchronous and nothing here syncs or
-    // checks for errors, so "has finished" is not guaranteed - confirm.
-    printf("fdwt53Kernel in launchFDWT53Kernel has finished");
-
-  }
-
-
-
-  /// Forward 5/3 2D DWT. See common rules (above) for more details.
-  /// @param in      Input image. Expected to be normalized into range
-  ///                [-128, 127]. Will not be preserved (will be overwritten).
-  /// @param out     output buffer on GPU
-  /// @param sizeX   width of input image (in pixels)
-  /// @param sizeY   height of input image (in pixels)
-  /// @param levels  number of recursive DWT levels
-  void fdwt53(int * in, int * out, int sizeX, int sizeY, int levels) {
-    // select right width of kernel for the size of the image
-
-    if(sizeX >= 960) {
-      launchFDWT53Kernel<192, 8>(in, out, sizeX, sizeY);
-    } else if (sizeX >= 480) {
-      launchFDWT53Kernel<128, 8>(in, out, sizeX, sizeY);
-    } else {
-      launchFDWT53Kernel<64, 8>(in, out, sizeX, sizeY);
-    }
-
-    // if this was not the last level, continue recursively with other levels
-    if(levels > 1) {
-      // copy output's LL band back into input buffer
-      const int llSizeX = divRndUp(sizeX, 2);
-      const int llSizeY = divRndUp(sizeY, 2);
-	 // printf("\n llSizeX = %d , llSizeY = %d \n", llSizeX, llSizeY);
-      memCopy(in, out, llSizeX, llSizeY); // memCopy: see dwt_cuda/common.h
-
-      // run remaining levels of FDWT on the LL band only
-      fdwt53(in, out, llSizeX, llSizeY, levels - 1);
-    }
-  }
-
-
-
-} // end of namespace dwt_cuda
--- a/examples/dwt2d/dwt_cuda/fdwt97.cu
+++ b/examples/dwt2d/dwt_cuda/fdwt97.cu
@ -1,383 +0,0 @@
-///
-/// @file    fdwt97.cu
-/// @brief   CUDA implementation of forward 9/7 2D DWT.
-/// @author  Martin Jirman (207962@mail.muni.cz)
-/// @date    2011-01-20 13:18
-///
-///
-/// Copyright (c) 2011 Martin Jirman
-/// All rights reserved.
-///
-/// Redistribution and use in source and binary forms, with or without
-/// modification, are permitted provided that the following conditions are met:
-///
-///     * Redistributions of source code must retain the above copyright
-///       notice, this list of conditions and the following disclaimer.
-///     * Redistributions in binary form must reproduce the above copyright
-///       notice, this list of conditions and the following disclaimer in the
-///       documentation and/or other materials provided with the distribution.
-///
-/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-/// POSSIBILITY OF SUCH DAMAGE.
-///
-
-
-#include "common.h"
-#include "transform_buffer.h"
-#include "io.h"
-
-
-namespace dwt_cuda {
-
-
-
-  /// Wraps a buffer and methods for computing 9/7 FDWT with sliding window
-  /// of specified size. Template arguments specify this size.
-  /// @tparam WIN_SIZE_X  width of sliding window
-  /// @tparam WIN_SIZE_Y  height of sliding window
-  template <int WIN_SIZE_X, int WIN_SIZE_Y>
-  class FDWT97 {
-  private:
-    /// Type of shared memory buffer used for 9/7 DWT.
-    typedef TransformBuffer<float, WIN_SIZE_X, WIN_SIZE_Y + 7, 4> FDWT97Buffer;
-
-    /// Actual shared buffer used for forward 9/7 DWT.
-    FDWT97Buffer buffer;
-
-    /// Difference of indices of two vertically neighboring items in buffer.
-    enum { STRIDE = FDWT97Buffer::VERTICAL_STRIDE };
-
-
-    /// One thread's info about loading input image
-    /// @tparam CHECKED  true if loader should check for image boundaries
-    template <bool CHECKED>
-    struct FDWT97ColumnLoadingInfo {
-      /// Loader of pixels from some input image.
-      VerticalDWTPixelLoader<float, CHECKED> loader;
-
-      /// Offset of column loaded by loader. (Offset in shared buffer.)
-      int offset;
-    };
-
-
-    /// Horizontal 9/7 FDWT on specified lines of transform buffer: two
-    /// predict/update pairs followed by scaling, with a threadblock barrier
-    /// before the first step and after every step.
-    /// @param lines      number of lines to be transformed
-    /// @param firstLine  index of the first line to be transformed
-    __device__ void horizontalFDWT97(const int lines, const int firstLine) {
-      __syncthreads();
-
-      buffer.forEachHorizontalOdd(firstLine, lines, AddScaledSum(f97Predict1));
-      __syncthreads();
-      buffer.forEachHorizontalEven(firstLine, lines, AddScaledSum(f97Update1));
-      __syncthreads();
-      buffer.forEachHorizontalOdd(firstLine, lines, AddScaledSum(f97Predict2));
-      __syncthreads();
-      buffer.forEachHorizontalEven(firstLine, lines, AddScaledSum(f97Update2));
-      __syncthreads();
-
-      buffer.scaleHorizontal(scale97Div, scale97Mul, firstLine, lines);
-
-      __syncthreads();
-
-    }
-
-
-    /// Initializes one column of shared transform buffer with 7 input pixels.
-    /// Those 7 pixels will not be transformed. Also initializes given loader.
-    /// @tparam CHECKED     true if loader should check for image boundaries
-    /// @param column       (uninitialized) object for loading input pixels
-    /// @param columnIndex  index (not offset!) of the column to be loaded
-    ///                     (relative to threadblock's first column)
-    /// @param input        pointer to input image in GPU memory
-    /// @param sizeX        width of the input image
-    /// @param sizeY        height of the input image
-    /// @param firstY       index of first row to be loaded from image
-    template <bool CHECKED>
-    __device__ void initColumn(FDWT97ColumnLoadingInfo<CHECKED> & column,
-                              const int columnIndex, const float * const input,
-                              const int sizeX, const int sizeY,
-                              const int firstY) {
-      // get offset of the column with index 'columnIndex'
-      column.offset = buffer.getColumnOffset(columnIndex);
-
-      // printf(" offset: %d  , threadIdx: %d, blockIdx.y %d\n ", column.offset, threadIdx.x, blockIdx.y);
-
-      // x-coordinate of the first pixel to be loaded by given loader
-      const int firstX = blockIdx.x * WIN_SIZE_X + columnIndex;
-
-      if(blockIdx.y == 0) {
-        // topmost block - apply mirroring rules when loading first 7 rows
-        column.loader.init(sizeX, sizeY, firstX, firstY);
-
-        // load pixels in mirrored way: pixel #0 goes to buffer row 4 and
-        // pixels #1 .. #4 are mirrored around it (rows 3/5, 2/6, 1, 0)
-        buffer[column.offset + 4 * STRIDE] = column.loader.loadFrom(input);
-        buffer[column.offset + 3 * STRIDE] =
-        buffer[column.offset + 5 * STRIDE] = column.loader.loadFrom(input);
-        buffer[column.offset + 2 * STRIDE] =
-        buffer[column.offset + 6 * STRIDE] = column.loader.loadFrom(input);
-        buffer[column.offset + 1 * STRIDE] = column.loader.loadFrom(input);
-        buffer[column.offset + 0 * STRIDE] = column.loader.loadFrom(input);
-
-        // reinitialize loader to start with pixel #3 again
-        column.loader.init(sizeX, sizeY, firstX, firstY + 3);
-
-      } else {
-        // non-topmost block - regular loading, starting 4 rows above firstY:
-        column.loader.init(sizeX, sizeY, firstX, firstY - 4);
-
-        // load 7 rows into the transform buffer
-        for(int i = 0; i < 7; i++) {
-          buffer[column.offset + i * STRIDE] = column.loader.loadFrom(input);
-
-        }
-      }
-      // Now, the next pixel, which will be loaded by loader, is pixel #3
-      // (row firstY + 3) in both branches.
-    }
-
-
-    /// Loads another WIN_SIZE_Y pixels into given column using given loader.
-    /// The pixels go to buffer rows 7 .. WIN_SIZE_Y + 6 of that column.
-    /// @tparam CHECKED  true if loader should check for image boundaries
-    /// @param input     input image to load from
-    /// @param column    loader and offset of loaded column in shared buffer
-    template <bool CHECKED>
-    inline __device__ void loadWindowIntoColumn(const float * const input,
-                                  FDWT97ColumnLoadingInfo<CHECKED> & column) {
-      for(int i = 7; i < (7 + WIN_SIZE_Y); i++) {
-        buffer[column.offset + i * STRIDE] = column.loader.loadFrom(input);
-      }
-    }
-
-
-    /// Actual implementation of 9/7 FDWT.
-    /// @tparam CHECK_LOADS   true if boundaries should be checked when loading
-    /// @tparam CHECK_WRITES  true if boundaries should be checked when writing
-    /// @param in        input image
-    /// @param out       output buffer
-    /// @param sizeX     width of the input image
-    /// @param sizeY     height of the input image
-    /// @param winSteps  number of steps of sliding window
-    template <bool CHECK_LOADS, bool CHECK_WRITES>
-    __device__ void transform(const float * const in, float * const out,
-                              const int sizeX, const int sizeY,
-                              const int winSteps) {
-      // info about columns loaded by this thread: one main column and possibly
-      // one boundary column. (Only some threads load some boundary column.)
-      FDWT97ColumnLoadingInfo<CHECK_LOADS> loadedColumn;
-      FDWT97ColumnLoadingInfo<CHECK_LOADS> boundaryColumn;
-
-      // Initialize first 7 lines of transform buffer.
-      const int firstY = blockIdx.y * WIN_SIZE_Y * winSteps;
-      initColumn(loadedColumn, threadIdx.x, in, sizeX, sizeY, firstY);
-
-      // Some threads initialize boundary columns.
-      boundaryColumn.offset = 0;
-      boundaryColumn.loader.clear();
-      if(threadIdx.x < 7) {
-        // each thread among first 7 ones gets index of one of boundary columns:
-        // threads 0..2 -> WIN_SIZE_X .. WIN_SIZE_X + 2 (right of the window),
-        // threads 3..6 -> -4 .. -1 (left of the window)
-        const int colId = threadIdx.x + ((threadIdx.x < 3) ? WIN_SIZE_X : -7);
-
-        // Thread initializes offset of the boundary column (in shared buffer),
-        // first 7 pixels of the column and a loader for this column.
-        initColumn(boundaryColumn, colId, in, sizeX, sizeY, firstY);
-      }
-
-      // horizontally transform first 7 rows in all columns
-      horizontalFDWT97(7, 0);
-
-      // Index of column handled by this thread. (First half of threads handle
-      // even columns and others handle odd columns.)
-      const int outColumnIndex = parityIdx<WIN_SIZE_X>();
-
-      // writer of output linear bands - initialize it
-      const int firstX = blockIdx.x * WIN_SIZE_X + outColumnIndex;
-      VerticalDWTBandWriter<float, CHECK_WRITES> writer;
-      writer.init(sizeX, sizeY, firstX, firstY);
-
-      // transform buffer offset of column transformed and saved by this thread
-      const int outColumnOffset = buffer.getColumnOffset(outColumnIndex);
-
-      // (Each iteration of this loop assumes that first 7 rows of transform
-      // buffer are already loaded with horizontally transformed coefficients.)
-      for(int w = 0; w < winSteps; w++) {
-        // Load another WIN_SIZE_Y lines of thread's column into the buffer.
-        loadWindowIntoColumn(in, loadedColumn);
-
-        // some threads also load boundary columns
-        if(threadIdx.x < 7) {
-          loadWindowIntoColumn(in, boundaryColumn);
-        }
-
-        // horizontally transform all newly loaded lines
-        horizontalFDWT97(WIN_SIZE_Y, 7);
-
-        // In a 7-element per-thread array, remember current values of last
-        // 7 rows of transform buffer. These rows are transformed horizontally
-        // only and will be used in next iteration.
-        float last7Lines[7];
-        for(int i = 0; i < 7; i++) {
-          last7Lines[i] = buffer[outColumnOffset + (WIN_SIZE_Y + i) * STRIDE];
-        }
-
-        // vertically transform all central columns (do not scale yet)
-        buffer.forEachVerticalOdd(outColumnOffset, AddScaledSum(f97Predict1));
-        buffer.forEachVerticalEven(outColumnOffset, AddScaledSum(f97Update1));
-        buffer.forEachVerticalOdd(outColumnOffset, AddScaledSum(f97Predict2));
-        buffer.forEachVerticalEven(outColumnOffset, AddScaledSum(f97Update2));
-
-        // Save all results of current window. Results are in transform buffer
-        // at rows from #4 to #(3 + WIN_SIZE_Y). Other rows are invalid now.
-        // (They only served as a boundary for vertical FDWT.)
-        // The vertical scaling is applied here, while writing.
-
-        for(int i = 4; i < (4 + WIN_SIZE_Y); i += 2) {
-          const int index = outColumnOffset + i * STRIDE;
-          // Write low coefficients from column into low band ...
-          writer.writeLowInto(out, buffer[index] * scale97Div);
-          // ... and high coefficients into the high band.
-          writer.writeHighInto(out, buffer[index + STRIDE] * scale97Mul);
-        }
-
-        // Use last 7 remembered lines as first 7 lines for next iteration.
-        // As expected, these lines are already horizontally transformed.
-        for(int i = 0; i < 7; i++) {
-          buffer[outColumnOffset + i * STRIDE] = last7Lines[i];
-
-        }
-
-        // Wait for all writing threads before proceeding to loading new
-        // pixels in next iteration. (Not to overwrite those which
-        // are not written yet.)
-        __syncthreads();
-      }
-
-    }
-
-
-  public:
-    /// Runs one of specialized variants of 9/7 FDWT according to distance of
-    /// processed pixels to image boudnary. Some variants do not check for
-    /// boudnary and thus are slightly faster.
-    /// @param in     input image
-    /// @param out    output buffer
-    /// @param sx     width of the input image
-    /// @param sy     height of the input image
-    /// @param steps  number of steps of sliding window
-    __device__ static void run(const float * const input, float * const output,
-                               const int sx, const int sy, const int steps) {
-      // object with transform buffer in shared memory
-      __shared__ FDWT97<WIN_SIZE_X, WIN_SIZE_Y> fdwt97;
-
-      // Compute limits of this threadblock's block of pixels and use them to
-      // determine, whether this threadblock will have to deal with boundary.
-      // (3 in next expressions is for radius of impulse response of 9/7 FDWT.)
-      const int maxX = (blockIdx.x + 1) * WIN_SIZE_X + 3;
-      const int maxY = (blockIdx.y + 1) * WIN_SIZE_Y * steps + 3;
-      const bool atRightBoudary = maxX >= sx;
-      const bool atBottomBoudary = maxY >= sy;
-
-      // Select specialized version of code according to distance of this
-      // threadblock's pixels from image boundary.
-      if(atBottomBoudary) {
-        // near bottom boundary => check both writing and reading
-        // printf("\n atBottomBoudary \n ");
-        fdwt97.transform<true, true>(input, output, sx, sy, steps);
-      } else if(atRightBoudary) {
-
-        // near right boundary only => check writing only
-        fdwt97.transform<false, true>(input, output, sx, sy, steps);
-      } else {
-
-        // no nearby boundary => check nothing
-        fdwt97.transform<false, false>(input, output, sx, sy, steps);
-      }
-    }
-
-  }; // end of class FDWT97
-
-
-
-  /// Main GPU 9/7 FDWT entry point (kernel).
-  /// @tparam WIN_SX  width of sliding window to be used
-  /// @tparam WIN_SY  height of sliding window to be used
-  /// @param input   input image
-  /// @param output  output buffer
-  /// @param sx      width of the input image
-  /// @param sy      height of the input image
-  /// @param steps   number of steps of sliding window
-  template <int WIN_SX, int WIN_SY>
-  __launch_bounds__(WIN_SX, CTMIN(SHM_SIZE/sizeof(FDWT97<WIN_SX, WIN_SY>), 8))
-  __global__ void fdwt97Kernel(const float * const input, float * const output,
-                               const int sx, const int sy, const int steps) {
-    // Excuse me, dear reader of this code - this call has to be here. If you
-    // try to simply put contents of following method right here, CUDA compiler
-    // (version 3.2) will spit tons of nonsense messy errors ...
-    // Hope they will not break it even more in future releases.
-    FDWT97<WIN_SX, WIN_SY>::run(input, output, sx, sy, steps);
-  }
-
-
-
-  /// Only computes optimal number of sliding window steps,
-  /// number of threadblocks and then launches the 9/7 FDWT kernel.
-  /// Also prints the chosen grid and block size to stdout.
-  /// @tparam WIN_SX  width of sliding window
-  /// @tparam WIN_SY  height of sliding window
-  /// @param in       input image
-  /// @param out      output buffer
-  /// @param sx       width of the input image
-  /// @param sy       height of the input image
-  template <int WIN_SX, int WIN_SY>
-  void launchFDWT97Kernel (float * in, float * out, int sx, int sy) {
-    // compute optimal number of steps of each sliding window
-    const int steps = divRndUp(sy, 15 * WIN_SY);
-
-    // prepare grid size
-    dim3 gSize(divRndUp(sx, WIN_SX), divRndUp(sy, WIN_SY * steps));
-    printf("\n globalx=%d, globaly=%d, blocksize=%d\n", gSize.x, gSize.y, WIN_SX);
-
-    // run kernel (one thread per window column), possibly measure time and
-    // finally check the call
-    PERF_BEGIN
-    fdwt97Kernel<WIN_SX, WIN_SY><<<gSize, WIN_SX>>>(in, out, sx, sy, steps);
-    PERF_END("        FDWT97", sx, sy)
-    CudaDWTTester::checkLastKernelCall("FDWT 9/7 kernel");
-  }
-
-
-
-  /// Forward 9/7 2D DWT. See common rules (dwt.h) for more details.
-  /// @param in      Input image. Should be normalized (in range
-  ///                [-0.5, 0.5]). Will not be preserved (will be overwritten).
-  /// @param out     output buffer on GPU - format specified in common rules
-  /// @param sizeX   width of input image (in pixels)
-  /// @param sizeY   height of input image (in pixels)
-  /// @param levels  number of recursive DWT levels
-  void fdwt97(float * in, float * out, int sizeX, int sizeY, int levels) {
-    // select right width of kernel for the size of the image
-    if(sizeX >= 960) {
-      launchFDWT97Kernel<192, 8>(in, out, sizeX, sizeY);
-    } else if (sizeX >= 480) {
-      launchFDWT97Kernel<128, 6>(in, out, sizeX, sizeY);
-    } else {
-      launchFDWT97Kernel<64, 6>(in, out, sizeX, sizeY);
-    }
-
-    // if this was not the last level, continue recursively with other levels
-    if(levels > 1) {
-      // copy output's LL band back into input buffer
-      const int llSizeX = divRndUp(sizeX, 2);
-      const int llSizeY = divRndUp(sizeY, 2);
-      memCopy(in, out, llSizeX, llSizeY);
-
-      // run remaining levels of FDWT on the LL band only
-      fdwt97(in, out, llSizeX, llSizeY, levels - 1);
-    }
-  }
-
-
-
-} // end of namespace dwt_cuda
--- a/examples/dwt2d/dwt_cuda/io.h
+++ b/examples/dwt2d/dwt_cuda/io.h
@ -1,440 +0,0 @@
-///
-/// @file:   io.h
-/// @brief   Manages loading and saving lineary stored bands and input images.
-/// @author  Martin Jirman (207962@mail.muni.cz)
-/// @date    2011-01-20 22:38
-///
-///
-/// Copyright (c) 2011 Martin Jirman
-/// All rights reserved.
-///
-/// Redistribution and use in source and binary forms, with or without
-/// modification, are permitted provided that the following conditions are met:
-///
-///     * Redistributions of source code must retain the above copyright
-///       notice, this list of conditions and the following disclaimer.
-///     * Redistributions in binary form must reproduce the above copyright
-///       notice, this list of conditions and the following disclaimer in the
-///       documentation and/or other materials provided with the distribution.
-///
-/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-/// POSSIBILITY OF SUCH DAMAGE.
-///
-
-#ifndef IO_H
-#define IO_H
-
-#include "common.h"
-
-namespace dwt_cuda {
-
-/// Base for all IO classes - manages mirroring.
-class DWTIO {
-protected:
-  /// Handles mirroring of image at edges in a DWT correct way.
-  /// @param d      a position in the image (will be replaced by mirrored d)
-  /// @param sizeD  size of the image along the dimension of 'd'
-  __device__ static void mirror(int &d, const int &sizeD) {
-    // TODO: enable multiple mirroring:
-    //      if(sizeD > 1) {
-    //        if(d < 0) {
-    //          const int underflow = -1 - d;
-    //          const int phase = (underflow / (sizeD - 1)) & 1;
-    //          const int remainder = underflow % (sizeD - 1);
-    //          if(phase == 0) {
-    //            d = remainder + 1;
-    //          } else {
-    //            d = sizeD - 2 - remainder;
-    //          }
-    //        } else if(d >= sizeD) {
-    //          const int overflow = d - sizeD;
-    //          const int phase = (overflow / (sizeD - 1)) & 1;
-    //          const int remainder = overflow % (sizeD - 1);
-    //          if(phase == 0) {
-    //            d = sizeD - 2 - remainder;
-    //          } else {
-    //            d = remainder + 1;
-    //          }
-    //        }
-    //      } else {
-    //        d = 0;
-    //      }
-    // for test the mirror's use Feb 17
-    if (d >= sizeD) {
-      d = 2 * sizeD - 2 - d;
-    } else if (d < 0) {
-      d = -d;
-    }
-  }
-};
-
-/// Base class for pixel loader and writer - manages computing start index,
-/// stride and end of image for loading column of pixels.
-/// @tparam T        type of image pixels
-/// @tparam CHECKED  true = be prepared to image boundary, false = don't care
-template <typename T, bool CHECKED> class VerticalDWTPixelIO : protected DWTIO {
-protected:
-  int end;    ///< index of bottom neightbor of last pixel of column
-  int stride; ///< increment of pointer to get to next pixel
-
-  /// Initializes pixel IO - sets end index and a position of first pixel.
-  /// @param sizeX   width of the image
-  /// @param sizeY   height of the image
-  /// @param firstX  x-coordinate of first pixel to use
-  /// @param firstY  y-coordinate of first pixel to use
-  /// @return index of pixel at position [x, y] in the image
-  __device__ int initialize(const int sizeX, const int sizeY, int firstX,
-                            int firstY) {
-    // initialize all pointers and stride
-    end = CHECKED ? (sizeY * sizeX + firstX) : 0;
-    stride = sizeX;
-    return firstX + sizeX * firstY;
-  }
-};
-
-/// Writes reverse transformed pixels directly into output image.
-/// @tparam T        type of output pixels
-/// @tparam CHECKED  true = be prepared to image boundary, false = don't care
-template <typename T, bool CHECKED>
-class VerticalDWTPixelWriter : VerticalDWTPixelIO<T, CHECKED> {
-private:
-  int next; // index of the next pixel to be loaded
-
-public:
-  /// Initializes writer - sets output buffer and a position of first pixel.
-  /// @param sizeX   width of the image
-  /// @param sizeY   height of the image
-  /// @param firstX  x-coordinate of first pixel to write into
-  /// @param firstY  y-coordinate of first pixel to write into
-  __device__ void init(const int sizeX, const int sizeY, int firstX,
-                       int firstY) {
-    if (firstX < sizeX) {
-      next = this->initialize(sizeX, sizeY, firstX, firstY);
-    } else {
-      this->end = 0;
-      this->stride = 0;
-      next = 0;
-    }
-  }
-
-  /// Writes given value at next position and advances internal pointer while
-  /// correctly handling mirroring.
-  /// @param output  output image to write pixel into
-  /// @param value   value of the pixel to be written
-  __device__ void writeInto(T *const output, const T &value) {
-    if ((!CHECKED) || (next != this->end)) {
-      output[next] = value;
-      next += this->stride;
-    }
-  }
-};
-
-/// Loads pixels from input image.
-/// @tparam T        type of image input pixels
-/// @tparam CHECKED  true = be prepared to image boundary, false = don't care
-template <typename T, bool CHECKED>
-class VerticalDWTPixelLoader : protected VerticalDWTPixelIO<const T, CHECKED> {
-private:
-  int last; ///< index of last loaded pixel
-public:
-  //******************* FOR TEST **********************
-  __device__ int getlast() { return last; }
-  __device__ int getend() { return this->end; }
-  __device__ int getstride() { return this->stride; }
-  __device__ void setend(int a) { this->end = a; }
-  //******************* FOR TEST **********************
-
-  /// Initializes loader - sets input size and a position of first pixel.
-  /// @param sizeX   width of the image
-  /// @param sizeY   height of the image
-  /// @param firstX  x-coordinate of first pixel to load
-  /// @param firstY  y-coordinate of first pixel to load
-  __device__ void init(const int sizeX, const int sizeY, int firstX,
-                       int firstY) {
-    // correctly mirror x coordinate
-    this->mirror(firstX, sizeX);
-
-    // 'last' always points to already loaded pixel (subtract sizeX = stride)
-    last = this->initialize(sizeX, sizeY, firstX, firstY) - sizeX;
-    // last = (FirstX + sizeX * FirstY) - sizeX
-  }
-
-  /// Sets all fields to zeros, for compiler not to complain about
-  /// uninitialized stuff.
-  __device__ void clear() {
-    this->end = 0;
-    this->stride = 0;
-    this->last = 0;
-  }
-
-  /// Gets another pixel and advancees internal pointer to following one.
-  /// @param input  input image to load next pixel from
-  /// @return next pixel from given image
-  __device__ T loadFrom(const T *const input) {
-    last += this->stride;
-    if (CHECKED && (last == this->end)) {
-      last -= 2 * this->stride;
-      this->stride = -this->stride; // reverse loader's direction
-    }
-    // avoid reading from negative indices if loader is checked
-    // return (CHECKED && (last < 0)) ? 0 : input[last];  // TODO: use this
-    // checked variant later
-    if (last < 0) {
-      return 0;
-    }
-
-    return input[last];
-    // return this->end;
-    // return last;
-    // return this->stride;
-  }
-};
-
-/// Base for band write and loader. Manages computing strides and pointers
-/// to first and last pixels in a linearly-stored-bands correct way.
-/// @tparam T        type of band coefficients
-/// @tparam CHECKED  true = be prepared to image boundary, false = don't care
-template <typename T, bool CHECKED> class VerticalDWTBandIO : protected DWTIO {
-protected:
-  /// index of bottom neighbor of last pixel of loaded column
-  int end;
-
-  /// increment of index to get from highpass band to the lowpass one
-  int strideHighToLow;
-
-  /// increment of index to get from the lowpass band to the highpass one
-  int strideLowToHigh;
-
-  /// Initializes IO - sets size of image and a position of first pixel.
-  /// @param imageSizeX   width of the image
-  /// @param imageSizeY   height of the image
-  /// @param firstX       x-coordinate of first pixel to use
-  ///                     (Parity determines vertically low or high band.)
-  /// @param firstY       y-coordinate of first pixel to use
-  ///                     (Parity determines horizontally low or high band.)
-  /// @return index of first item specified by firstX and firstY
-  __device__ int initialize(const int imageSizeX, const int imageSizeY,
-                            int firstX, int firstY) {
-    // index of first pixel (topmost one) of the column with index firstX
-    int columnOffset = firstX / 2;
-
-    // difference between indices of two vertically neighboring pixels
-    // in the same band
-    int verticalStride;
-
-    // resolve index of first pixel according to horizontal parity
-    if (firstX & 1) {
-      // first pixel in one of right bands
-      verticalStride = imageSizeX / 2;
-      columnOffset += divRndUp(imageSizeX, 2) * divRndUp(imageSizeY, 2);
-      strideLowToHigh = (imageSizeX * imageSizeY) / 2;
-    } else {
-      // first pixel in one of left bands
-      verticalStride = imageSizeX / 2 + (imageSizeX & 1);
-      strideLowToHigh = divRndUp(imageSizeY, 2) * imageSizeX;
-    }
-
-    // set the other stride
-    strideHighToLow = verticalStride - strideLowToHigh;
-
-    // compute index of coefficient which indicates end of image
-    if (CHECKED) {
-      end = columnOffset                          // right column
-            + (imageSizeY / 2) * verticalStride   // right row
-            + (imageSizeY & 1) * strideLowToHigh; // possibly in high band
-    } else {
-      end = 0;
-    }
-
-    //***********for test**************
-    //	end = CHECKED;
-    //***********for test**************
-
-    // finally, return index of the first item
-    return columnOffset                      // right column
-           + (firstY / 2) * verticalStride   // right row
-           + (firstY & 1) * strideLowToHigh; // possibly in high band
-  }
-};
-
-/// Directly loads coefficients from four consecutively stored transformed
-/// bands.
-/// @tparam T        type of input band coefficients
-/// @tparam CHECKED  true = be prepared to image boundary, false = don't care
-template <typename T, bool CHECKED>
-class VerticalDWTBandLoader : public VerticalDWTBandIO<const T, CHECKED> {
-private:
-  int last; ///< index of last loaded pixel
-
-  /// Checks internal index and possibly reverses direction of loader.
-  /// (Handles mirroring at the bottom of the image.)
-  /// @param input   input image to load next coefficient from
-  /// @param stride  stride to use now (one of two loader's strides)
-  /// @return loaded coefficient
-  __device__ T updateAndLoad(const T *const input, const int &stride) {
-    last += stride;
-    if (CHECKED && (last == this->end)) {
-      // undo last two updates of index (to get to previous mirrored item)
-      last -= (this->strideLowToHigh + this->strideHighToLow);
-
-      // swap and reverse strides (to move up in the loaded column now)
-      const int temp = this->strideLowToHigh;
-      this->strideLowToHigh = -this->strideHighToLow;
-      this->strideHighToLow = -temp;
-    }
-    if (last < 0) {
-      return 0;
-    }
-    // avoid reading from negative indices if loader is checked
-    // return (CHECKED && (last < 0)) ? 0 : input[last];  // TODO: use this
-    // checked variant later
-    return input[last];
-  }
-
-public:
-  /// Initializes loader - sets input size and a position of first pixel.
-  /// @param imageSizeX   width of the image
-  /// @param imageSizeY   height of the image
-  /// @param firstX       x-coordinate of first pixel to load
-  ///                     (Parity determines vertically low or high band.)
-  /// @param firstY       y-coordinate of first pixel to load
-  ///                     (Parity determines horizontally low or high band.)
-  __device__ void init(const int imageSizeX, const int imageSizeY, int firstX,
-                       const int firstY) {
-    this->mirror(firstX, imageSizeX);
-    last = this->initialize(imageSizeX, imageSizeY, firstX, firstY);
-
-    // adjust to point to previous item
-    last -= (firstY & 1) ? this->strideLowToHigh : this->strideHighToLow;
-  }
-
-  /// Sets all fields to zeros, for compiler not to complain about
-  /// uninitialized stuff.
-  __device__ void clear() {
-    this->end = 0;
-    this->strideHighToLow = 0;
-    this->strideLowToHigh = 0;
-    this->last = 0;
-  }
-
-  /// Gets another coefficient from lowpass band and advances internal index.
-  /// Call this method first if position of first pixel passed to init
-  /// was in high band.
-  /// @param input   input image to load next coefficient from
-  /// @return next coefficient from the lowpass band of the given image
-  __device__ T loadLowFrom(const T *const input) {
-    return updateAndLoad(input, this->strideHighToLow);
-  }
-
-  /// Gets another coefficient from the highpass band and advances index.
-  /// Call this method first if position of first pixel passed to init
-  /// was in high band.
-  /// @param input   input image to load next coefficient from
-  /// @return next coefficient from the highbass band of the given image
-  __device__ T loadHighFrom(const T *const input) {
-    return updateAndLoad(input, this->strideLowToHigh);
-  }
-};
-
-/// Directly saves coefficients into four transformed bands.
-/// @tparam T        type of output band coefficients
-/// @tparam CHECKED  true = be prepared to image boundary, false = don't care
-template <typename T, bool CHECKED>
-class VerticalDWTBandWriter : public VerticalDWTBandIO<T, CHECKED> {
-private:
-  int next; ///< index of last loaded pixel
-
-  /// Checks internal index and possibly stops the writer.
-  /// (Handles mirroring at edges of the image.)
-  /// @param output  output buffer
-  /// @param item    item to put into the output
-  /// @param stride  increment of the pointer to get to next output index
-  __device__ int saveAndUpdate(T *const output, const T &item,
-                               const int &stride) {
-    //	if(blockIdx.x == 0 && blockIdx.y == 11 && threadIdx.x == 0){
-    ////test, Mar 20
-    if ((!CHECKED) || (next != this->end)) {
-      // if(next == 4) {
-      //   printf(" next: %d  stride: %d val: %f \n", next, stride, item );
-      // }
-      output[next] = item;
-      next += stride;
-    }
-    //	}
-    // if((!CHECKED) || (next != this->end)) { //the real one
-    // output[next] = item;
-    // next += stride;  //stride has been test
-    // }
-    return next;
-  }
-
-public:
-  /// Initializes writer - sets output size and a position of first pixel.
-  /// @param output       output image
-  /// @param imageSizeX   width of the image
-  /// @param imageSizeY   height of the image
-  /// @param firstX       x-coordinate of first pixel to write
-  ///                     (Parity determines vertically low or high band.)
-  /// @param firstY       y-coordinate of first pixel to write
-  ///                     (Parity determines horizontally low or high band.)
-  __device__ void init(const int imageSizeX, const int imageSizeY,
-                       const int firstX, const int firstY) {
-    if (firstX < imageSizeX) {
-      next = this->initialize(imageSizeX, imageSizeY, firstX, firstY);
-    } else {
-      clear();
-    }
-  }
-
-  /// Sets all fields to zeros, for compiler not to complain about
-  /// uninitialized stuff.
-  __device__ void clear() {
-    this->end = 0;
-    this->strideHighToLow = 0;
-    this->strideLowToHigh = 0;
-    this->next = 0;
-  }
-
-  /// Writes another coefficient into the band which was specified using
-  /// init's firstX and firstY parameters and advances internal pointer.
-  /// Call this method first if position of first pixel passed to init
-  /// was in lowpass band.
-  /// @param output  output image
-  /// @param low     lowpass coefficient to save into the lowpass band
-  __device__ int writeLowInto(T *const output, const T &primary) {
-    return saveAndUpdate(output, primary, this->strideLowToHigh);
-  }
-
-  /// Writes another coefficient from the other band and advances pointer.
-  /// Call this method first if position of first pixel passed to init
-  /// was in highpass band.
-  /// @param output  output image
-  /// @param high    highpass coefficient to save into the highpass band
-  __device__ int writeHighInto(T *const output, const T &other) {
-    return saveAndUpdate(output, other, this->strideHighToLow);
-  }
-
-  //*******Add three functions to get private values*******
-  __device__ int getnext() { return next; }
-
-  __device__ int getend() { return this->end; }
-
-  __device__ int getstrideHighToLow() { return this->strideHighToLow; }
-
-  __device__ int getstrideLowToHigh() { return this->strideLowToHigh; }
-
-  //*******Add three functions to get private values*******
-};
-
-} // namespace dwt_cuda
-
-#endif // IO_H
--- a/examples/dwt2d/dwt_cuda/rdwt53.cu
+++ b/examples/dwt2d/dwt_cuda/rdwt53.cu
@ -1,360 +0,0 @@
-///
-/// @file    rdwt53.cu
-/// @brief   CUDA implementation of reverse 5/3 2D DWT.
-/// @author  Martin Jirman (207962@mail.muni.cz)
-/// @date    2011-02-04 14:19
-///
-///
-/// Copyright (c) 2011 Martin Jirman
-/// All rights reserved.
-///
-/// Redistribution and use in source and binary forms, with or without
-/// modification, are permitted provided that the following conditions are met:
-///
-///     * Redistributions of source code must retain the above copyright
-///       notice, this list of conditions and the following disclaimer.
-///     * Redistributions in binary form must reproduce the above copyright
-///       notice, this list of conditions and the following disclaimer in the
-///       documentation and/or other materials provided with the distribution.
-///
-/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-/// POSSIBILITY OF SUCH DAMAGE.
-///
-
-
-#include "common.h"
-#include "transform_buffer.h"
-#include "io.h"
-
-
-namespace dwt_cuda {
-
-
-
-  /// Wraps shared momory buffer and algorithms needed for computing 5/3 RDWT
-  /// using sliding window and lifting schema.
-  /// @tparam WIN_SIZE_X  width of sliding window
-  /// @tparam WIN_SIZE_Y  height of sliding window
-  template <int WIN_SIZE_X, int WIN_SIZE_Y>
-  class RDWT53 {
-  private:
-
-    /// Shared memory buffer used for 5/3 DWT transforms.
-    typedef TransformBuffer<int, WIN_SIZE_X, WIN_SIZE_Y + 3, 2> RDWT53Buffer;
-
-    /// Shared buffer used for reverse 5/3 DWT.
-    RDWT53Buffer buffer;
-
-    /// Difference between indices of two vertically neighboring items in buffer.
-    enum { STRIDE = RDWT53Buffer::VERTICAL_STRIDE };
-
-
-    /// Info needed for loading of one input column from input image.
-    /// @tparam CHECKED  true if loader should check boundaries
-    template <bool CHECKED>
-    struct RDWT53Column {
-      /// loader of pixels from column in input image
-      VerticalDWTBandLoader<int, CHECKED> loader;
-
-      /// Offset of corresponding column in shared buffer.
-      int offset;
-
-      /// Sets all fields to some values to avoid 'uninitialized' warnings.
-      __device__ void clear() {
-        offset = 0;
-        loader.clear();
-      }
-    };
-
-
-    /// 5/3 DWT reverse update operation.
-    struct Reverse53Update {
-      __device__ void operator() (const int p, int & c, const int n) const {
-        c -= (p + n + 2) / 4;  // F.3, page 118, ITU-T Rec. T.800 final draft
-      }
-    };
-
-
-    /// 5/3 DWT reverse predict operation.
-    struct Reverse53Predict {
-      __device__ void operator() (const int p, int & c, const int n) const {
-        c += (p + n) / 2;      // F.4, page 118, ITU-T Rec. T.800 final draft
-      }
-    };
-
-
-    /// Horizontal 5/3 RDWT on specified lines of transform buffer.
-    /// @param lines      number of lines to be transformed
-    /// @param firstLine  index of the first line to be transformed
-    __device__ void horizontalTransform(const int lines, const int firstLine) {
-      __syncthreads();
-      buffer.forEachHorizontalEven(firstLine, lines, Reverse53Update());
-      __syncthreads();
-      buffer.forEachHorizontalOdd(firstLine, lines, Reverse53Predict());
-      __syncthreads();
-    }
-
-
-    /// Using given loader, it loads another WIN_SIZE_Y coefficients
-    /// into specified column.
-    /// @tparam CHECKED  true if loader should check image boundaries
-    /// @param input     input coefficients to load from
-    /// @param col       info about loaded column
-    template <bool CHECKED>
-    inline __device__ void loadWindowIntoColumn(const int * const input,
-                                                RDWT53Column<CHECKED> & col) {
-      for(int i = 3; i < (3 + WIN_SIZE_Y); i += 2) {
-        buffer[col.offset + i * STRIDE] = col.loader.loadLowFrom(input);
-        buffer[col.offset + (i + 1) * STRIDE] = col.loader.loadHighFrom(input);
-      }
-    }
-
-
-    /// Initializes one column of shared transform buffer with 7 input pixels.
-    /// Those 7 pixels will not be transformed. Also initializes given loader.
-    /// @tparam CHECKED  true if loader should check image boundaries
-    /// @param columnX   x coordinate of column in shared transform buffer
-    /// @param input     input image
-    /// @param sizeX     width of the input image
-    /// @param sizeY     height of the input image
-    /// @param loader    (uninitialized) info about loaded column
-    template <bool CHECKED>
-    __device__ void initColumn(const int columnX, const int * const input,
-                               const int sizeX, const int sizeY,
-                               RDWT53Column<CHECKED> & column,
-                               const int firstY) {
-      // coordinates of the first coefficient to be loaded
-      const int firstX = blockIdx.x * WIN_SIZE_X + columnX;
-
-      // offset of the column with index 'colIndex' in the transform buffer
-      column.offset = buffer.getColumnOffset(columnX);
-
-      if(blockIdx.y == 0) {
-        // topmost block - apply mirroring rules when loading first 3 rows
-        column.loader.init(sizeX, sizeY, firstX, firstY);
-
-        // load pixels in mirrored way
-        buffer[column.offset + 1 * STRIDE] = column.loader.loadLowFrom(input);
-        buffer[column.offset + 0 * STRIDE] =
-        buffer[column.offset + 2 * STRIDE] = column.loader.loadHighFrom(input);
-      } else {
-        // non-topmost row - regular loading:
-        column.loader.init(sizeX, sizeY, firstX, firstY - 1);
-        buffer[column.offset + 0 * STRIDE] = column.loader.loadHighFrom(input);
-        buffer[column.offset + 1 * STRIDE] = column.loader.loadLowFrom(input);
-        buffer[column.offset + 2 * STRIDE] = column.loader.loadHighFrom(input);
-      }
-      // Now, the next coefficient, which will be loaded by loader, is #2.
-    }
-
-
-    /// Actual GPU 5/3 RDWT implementation.
-    /// @tparam CHECKED_LOADS   true if boundaries must be checked when reading
-    /// @tparam CHECKED_WRITES  true if boundaries must be checked when writing
-    /// @param in        input image (5/3 transformed coefficients)
-    /// @param out       output buffer (for reverse transformed image)
-    /// @param sizeX     width of the output image
-    /// @param sizeY     height of the output image
-    /// @param winSteps  number of sliding window steps
-    template<bool CHECKED_LOADS, bool CHECKED_WRITES>
-    __device__ void transform(const int * const in, int * const out,
-                              const int sizeX, const int sizeY,
-                              const int winSteps) {
-      // info about one main and one boundary column
-      RDWT53Column<CHECKED_LOADS> column, boundaryColumn;
-
-      // index of first row to be transformed
-      const int firstY = blockIdx.y * WIN_SIZE_Y * winSteps;
-
-      // some threads initialize boundary columns
-      boundaryColumn.clear();
-      if(threadIdx.x < 3) {
-        // First 3 threads also handle boundary columns. Thread #0 gets right
-        // column #0, thread #1 get right column #1 and thread #2 left column.
-        const int colId = threadIdx.x + ((threadIdx.x != 2) ? WIN_SIZE_X : -3);
-
-        // Thread initializes offset of the boundary column (in shared
-        // buffer), first 3 pixels of the column and a loader for this column.
-        initColumn(colId, in, sizeX, sizeY, boundaryColumn, firstY);
-      }
-
-      // All threads initialize central columns.
-      initColumn(parityIdx<WIN_SIZE_X>(), in, sizeX, sizeY, column, firstY);
-
-      // horizontally transform first 3 rows
-      horizontalTransform(3, 0);
-
-      // writer of output pixels - initialize it
-      const int outX = blockIdx.x * WIN_SIZE_X + threadIdx.x;
-      VerticalDWTPixelWriter<int, CHECKED_WRITES> writer;
-      writer.init(sizeX, sizeY, outX, firstY);
-
-      // offset of column (in transform buffer) saved by this thread
-      const int outputColumnOffset = buffer.getColumnOffset(threadIdx.x);
-
-      // (Each iteration assumes that first 3 rows of transform buffer are
-      // already loaded with horizontally transformed pixels.)
-      for(int w = 0; w < winSteps; w++) {
-        // Load another WIN_SIZE_Y lines of this thread's column
-        // into the transform buffer.
-        loadWindowIntoColumn(in, column);
-
-        // possibly load boundary columns
-        if(threadIdx.x < 3) {
-          loadWindowIntoColumn(in, boundaryColumn);
-        }
-
-        // horizontally transform all newly loaded lines
-        horizontalTransform(WIN_SIZE_Y, 3);
-
-        // Using 3 registers, remember current values of last 3 rows
-        // of transform buffer. These rows are transformed horizontally
-        // only and will be used in next iteration.
-        int last3Lines[3];
-        last3Lines[0] = buffer[outputColumnOffset + (WIN_SIZE_Y + 0) * STRIDE];
-        last3Lines[1] = buffer[outputColumnOffset + (WIN_SIZE_Y + 1) * STRIDE];
-        last3Lines[2] = buffer[outputColumnOffset + (WIN_SIZE_Y + 2) * STRIDE];
-
-        // vertically transform all central columns
-        buffer.forEachVerticalOdd(outputColumnOffset, Reverse53Update());
-        buffer.forEachVerticalEven(outputColumnOffset, Reverse53Predict());
-
-        // Save all results of current window. Results are in transform buffer
-        // at rows from #1 to #(1 + WIN_SIZE_Y). Other rows are invalid now.
-        // (They only served as a boundary for vertical RDWT.)
-        for(int i = 1; i < (1 + WIN_SIZE_Y); i++) {
-          writer.writeInto(out, buffer[outputColumnOffset + i * STRIDE]);
-        }
-
-        // Use last 3 remembered lines as first 3 lines for next iteration.
-        // As expected, these lines are already horizontally transformed.
-        buffer[outputColumnOffset + 0 * STRIDE] = last3Lines[0];
-        buffer[outputColumnOffset + 1 * STRIDE] = last3Lines[1];
-        buffer[outputColumnOffset + 2 * STRIDE] = last3Lines[2];
-
-        // Wait for all writing threads before proceeding to loading new
-        // coeficients in next iteration. (Not to overwrite those which
-        // are not written yet.)
-        __syncthreads();
-      }
-    }
-
-
-  public:
-    /// Main GPU 5/3 RDWT entry point.
-    /// @param in     input image (5/3 transformed coefficients)
-    /// @param out    output buffer (for reverse transformed image)
-    /// @param sizeX  width of the output image
-    /// @param sizeY  height of the output image
-    /// @param winSteps  number of sliding window steps
-    __device__ static void run(const int * const input, int * const output,
-                               const int sx, const int sy, const int steps) {
-      // prepare instance with buffer in shared memory
-      __shared__ RDWT53<WIN_SIZE_X, WIN_SIZE_Y> rdwt53;
-
-      // Compute limits of this threadblock's block of pixels and use them to
-      // determine, whether this threadblock will have to deal with boundary.
-      // (1 in next expressions is for radius of impulse response of 5/3 RDWT.)
-      const int maxX = (blockIdx.x + 1) * WIN_SIZE_X + 1;
-      const int maxY = (blockIdx.y + 1) * WIN_SIZE_Y * steps + 1;
-      const bool atRightBoudary = maxX >= sx;
-      const bool atBottomBoudary = maxY >= sy;
-
-      // Select specialized version of code according to distance of this
-      // threadblock's pixels from image boundary.
-      if(atBottomBoudary) {
-        // near bottom boundary => check both writing and reading
-        rdwt53.transform<true, true>(input, output, sx, sy, steps);
-      } else if(atRightBoudary) {
-        // near right boundary only => check writing only
-        rdwt53.transform<false, true>(input, output, sx, sy, steps);
-      } else {
-        // no nearby boundary => check nothing
-        rdwt53.transform<false, false>(input, output, sx, sy, steps);
-      }
-    }
-
-  }; // end of class RDWT53
-
-
-
-  /// Main GPU 5/3 RDWT entry point.
-  /// @param in     input image (5/3 transformed coefficients)
-  /// @param out    output buffer (for reverse transformed image)
-  /// @param sizeX  width of the output image
-  /// @param sizeY  height of the output image
-  /// @param winSteps  number of sliding window steps
-  template <int WIN_SX, int WIN_SY>
-  __launch_bounds__(WIN_SX, CTMIN(SHM_SIZE/sizeof(RDWT53<WIN_SX, WIN_SY>), 8))
-  __global__ void rdwt53Kernel(const int * const in, int * const out,
-                               const int sx, const int sy, const int steps) {
-    RDWT53<WIN_SX, WIN_SY>::run(in, out, sx, sy, steps);
-  }
-
-
-
-  /// Only computes optimal number of sliding window steps,
-  /// number of threadblocks and then lanches the 5/3 RDWT kernel.
-  /// @tparam WIN_SX  width of sliding window
-  /// @tparam WIN_SY  height of sliding window
-  /// @param in       input image
-  /// @param out      output buffer
-  /// @param sx       width of the input image
-  /// @param sy       height of the input image
-  template <int WIN_SX, int WIN_SY>
-  void launchRDWT53Kernel (int * in, int * out, const int sx, const int sy) {
-    // compute optimal number of steps of each sliding window
-    const int steps = divRndUp(sy, 15 * WIN_SY);
-
-    // prepare grid size
-    dim3 gSize(divRndUp(sx, WIN_SX), divRndUp(sy, WIN_SY * steps));
-
-    // finally transform this level
-    PERF_BEGIN
-    rdwt53Kernel<WIN_SX, WIN_SY><<<gSize, WIN_SX>>>(in, out, sx, sy, steps);
-    PERF_END("        RDWT53", sx, sy)
-    CudaDWTTester::checkLastKernelCall("RDWT 5/3 kernel");
-  }
-
-
-
-  /// Reverse 5/3 2D DWT. See common rules (above) for more details.
-  /// @param in      Input DWT coefficients. Format described in common rules.
-  ///                Will not be preserved (will be overwritten).
-  /// @param out     output buffer on GPU - will contain original image
-  ///                in normalized range [-128, 127].
-  /// @param sizeX   width of input image (in pixels)
-  /// @param sizeY   height of input image (in pixels)
-  /// @param levels  number of recursive DWT levels
-  void rdwt53(int * in, int * out, int sizeX, int sizeY, int levels) {
-    if(levels > 1) {
-      // let this function recursively reverse transform deeper levels first
-      const int llSizeX = divRndUp(sizeX, 2);
-      const int llSizeY = divRndUp(sizeY, 2);
-      rdwt53(in, out, llSizeX, llSizeY, levels - 1);
-
-      // copy reverse transformed LL band from output back into the input
-      memCopy(in, out, llSizeX, llSizeY);
-    }
-
-    // select right width of kernel for the size of the image
-    if(sizeX >= 960) {
-      launchRDWT53Kernel<192, 8>(in, out, sizeX, sizeY);
-    } else if (sizeX >= 480) {
-      launchRDWT53Kernel<128, 8>(in, out, sizeX, sizeY);
-    } else {
-      launchRDWT53Kernel<64, 8>(in, out, sizeX, sizeY);
-    }
-  }
-
-
-} // end of namespace dwt_cuda
--- a/examples/dwt2d/dwt_cuda/rdwt97.cu
+++ b/examples/dwt2d/dwt_cuda/rdwt97.cu
@ -1,363 +0,0 @@
-///
-/// @file    rdwt97.cu
-/// @brief   CUDA implementation of reverse 9/7 2D DWT.
-/// @author  Martin Jirman (207962@mail.muni.cz)
-/// @date    2011-02-03 21:59
-///
-///
-/// Copyright (c) 2011 Martin Jirman
-/// All rights reserved.
-///
-/// Redistribution and use in source and binary forms, with or without
-/// modification, are permitted provided that the following conditions are met:
-///
-///     * Redistributions of source code must retain the above copyright
-///       notice, this list of conditions and the following disclaimer.
-///     * Redistributions in binary form must reproduce the above copyright
-///       notice, this list of conditions and the following disclaimer in the
-///       documentation and/or other materials provided with the distribution.
-///
-/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-/// POSSIBILITY OF SUCH DAMAGE.
-///
-
-
-#include "common.h"
-#include "transform_buffer.h"
-#include "io.h"
-
-
-namespace dwt_cuda {
-
-
-  /// Wraps shared memory buffer and methods for computing 9/7 RDWT using
-  /// lifting schema and sliding window.
-  /// @tparam WIN_SIZE_X  width of the sliding window
-  /// @tparam WIN_SIZE_Y  height of the sliding window
-  template <int WIN_SIZE_X, int WIN_SIZE_Y>
-  class RDWT97 {
-  private:
-
-    /// Info related to loading of one input column.
-    /// @tparam CHECKED true if boundary chould be checked,
-    ///                 false if there is no near boudnary
-    template <bool CHECKED>
-    struct RDWT97Column  {
-      /// laoder of input pxels for given column.
-      VerticalDWTBandLoader<float, CHECKED> loader;
-
-      /// Offset of loaded column in shared memory buffer.
-      int offset;
-
-      /// Sets all fields to some values to avoid 'uninitialized' warnings.
-      __device__ void clear() {
-        loader.clear();
-        offset = 0;
-      }
-    };
-
-
-    /// Shared memory buffer used for 9/7 DWT transforms.
-    typedef TransformBuffer<float, WIN_SIZE_X, WIN_SIZE_Y + 7, 4> RDWT97Buffer;
-
-    /// Shared buffer used for reverse 9/7 DWT.
-    RDWT97Buffer buffer;
-
-    /// Difference between indices of two vertical neighbors in buffer.
-    enum { STRIDE = RDWT97Buffer::VERTICAL_STRIDE };
-
-
-    /// Horizontal 9/7 RDWT on specified lines of transform buffer.
-    /// @param lines      number of lines to be transformed
-    /// @param firstLine  index of the first line to be transformed
-    __device__ void horizontalRDWT97(int lines, int firstLine) {
-      __syncthreads();
-      buffer.scaleHorizontal(scale97Mul, scale97Div, firstLine, lines);
-      __syncthreads();
-      buffer.forEachHorizontalEven(firstLine, lines, AddScaledSum(r97update2));
-      __syncthreads();
-      buffer.forEachHorizontalOdd(firstLine, lines, AddScaledSum(r97predict2));
-      __syncthreads();
-      buffer.forEachHorizontalEven(firstLine, lines, AddScaledSum(r97update1));
-      __syncthreads();
-      buffer.forEachHorizontalOdd(firstLine, lines, AddScaledSum(r97Predict1));
-      __syncthreads();
-    }
-
-
-    /// Initializes one column of shared transform buffer with 7 input pixels.
-    /// Those 7 pixels will not be transformed. Also initializes given loader.
-    /// @tparam CHECKED  true if there are near image boundaries
-    /// @param colIndex  index of column in shared transform buffer
-    /// @param input     input image
-    /// @param sizeX     width of the input image
-    /// @param sizeY     height of the input image
-    /// @param column    (uninitialized) info about loading one column
-    /// @param firstY    index of first image row to be transformed
-    template <bool CHECKED>
-    __device__ void initColumn(const int colIndex, const float * const input,
-                               const int sizeX, const int sizeY,
-                               RDWT97Column<CHECKED> & column,
-                               const int firstY) {
-      // coordinates of the first coefficient to be loaded
-      const int firstX = blockIdx.x * WIN_SIZE_X + colIndex;
-
-      // offset of the column with index 'colIndex' in the transform buffer
-      column.offset = buffer.getColumnOffset(colIndex);
-
-      if(blockIdx.y == 0) {
-        // topmost block - apply mirroring rules when loading first 7 rows
-        column.loader.init(sizeX, sizeY, firstX, firstY);
-
-        // load pixels in mirrored way
-        buffer[column.offset + 3 * STRIDE] = column.loader.loadLowFrom(input);
-        buffer[column.offset + 4 * STRIDE] =
-        buffer[column.offset + 2 * STRIDE] = column.loader.loadHighFrom(input);
-        buffer[column.offset + 5 * STRIDE] =
-        buffer[column.offset + 1 * STRIDE] = column.loader.loadLowFrom(input);
-        buffer[column.offset + 6 * STRIDE] =
-        buffer[column.offset + 0 * STRIDE] = column.loader.loadHighFrom(input);
-      } else {
-        // non-topmost row - regular loading:
-        column.loader.init(sizeX, sizeY, firstX, firstY - 3);
-        buffer[column.offset + 0 * STRIDE] = column.loader.loadHighFrom(input);
-        buffer[column.offset + 1 * STRIDE] = column.loader.loadLowFrom(input);
-        buffer[column.offset + 2 * STRIDE] = column.loader.loadHighFrom(input);
-        buffer[column.offset + 3 * STRIDE] = column.loader.loadLowFrom(input);
-        buffer[column.offset + 4 * STRIDE] = column.loader.loadHighFrom(input);
-        buffer[column.offset + 5 * STRIDE] = column.loader.loadLowFrom(input);
-        buffer[column.offset + 6 * STRIDE] = column.loader.loadHighFrom(input);
-      }
-      // Now, the next coefficient, which will be loaded by loader, is #4.
-    }
-
-
-    /// Using given loader, it loads another WIN_SIZE_Y coefficients
-    /// into specified column.
-    /// @tparam CHECKED  true if there are near image boundaries
-    /// @param col       info about loaded column
-    /// @param input     buffer with input coefficients
-    template <bool CHECKED>
-    inline __device__ void loadWindowIntoColumn(RDWT97Column<CHECKED> & col,
-                                                const float * const input) {
-      for(int i = 7; i < (7 + WIN_SIZE_Y); i += 2) {
-        buffer[col.offset + i * STRIDE] = col.loader.loadLowFrom(input);
-        buffer[col.offset + (i + 1) * STRIDE] = col.loader.loadHighFrom(input);
-      }
-    }
-
-
-    /// Actual GPU 9/7 RDWT sliding window lifting schema implementation.
-    /// @tparam CHECKED_LOADS   true if loader should check boundaries
-    /// @tparam CHECKED_WRITES  true if boundaries should be taken into account
-    ///                         when writing into output buffer
-    /// @param in        input image (9/7 transformed coefficients)
-    /// @param out       output buffer (for reverse transformed image)
-    /// @param sizeX     width of the output image
-    /// @param sizeY     height of the output image
-    /// @param winSteps  number of steps of sliding window
-    template <bool CHECKED_LOADS, bool CHECKED_WRITES>
-    __device__ void transform(const float * const in, float * const out,
-                              const int sizeX, const int sizeY,
-                              const int winSteps) {
-      // info about one main column and one boundary column
-      RDWT97Column<CHECKED_LOADS> column;
-      RDWT97Column<CHECKED_LOADS> boundaryColumn;
-
-      // index of first image row to be transformed
-      const int firstY = blockIdx.y * WIN_SIZE_Y * winSteps;
-
-      // initialize boundary columns
-      boundaryColumn.clear();
-      if(threadIdx.x < 7) {
-        // each thread among first 7 ones gets index of one of boundary columns
-        const int colId = threadIdx.x + ((threadIdx.x < 4) ? WIN_SIZE_X : -7);
-
-        // Thread initializes offset of the boundary column (in shared
-        // buffer), first 7 pixels of the column and a loader for this column.
-        initColumn(colId, in, sizeX, sizeY, boundaryColumn, firstY);
-      }
-
-      // All threads initialize central columns.
-      initColumn(parityIdx<WIN_SIZE_X>(), in, sizeX, sizeY, column, firstY);
-
-      // horizontally transform first 7 rows
-      horizontalRDWT97(7, 0);
-
-      // writer of output pixels - initialize it
-      const int outputX = blockIdx.x * WIN_SIZE_X + threadIdx.x;
-      VerticalDWTPixelWriter<float, CHECKED_WRITES> writer;
-      writer.init(sizeX, sizeY, outputX, firstY);
-
-      // offset of column (in transform buffer) saved by this thread
-      const int outColumnOffset = buffer.getColumnOffset(threadIdx.x);
-
-      // (Each iteration assumes that first 7 rows of transform buffer are
-      // already loaded with horizontally transformed pixels.)
-      for(int w = 0; w < winSteps; w++) {
-        // Load another WIN_SIZE_Y lines of this thread's column
-        // into the transform buffer.
-        loadWindowIntoColumn(column, in);
-
-        // possibly load boundary columns
-        if(threadIdx.x < 7) {
-          loadWindowIntoColumn(boundaryColumn, in);
-        }
-
-        // horizontally transform all newly loaded lines
-        horizontalRDWT97(WIN_SIZE_Y, 7);
-
-        // Using 7 registers, remember current values of last 7 rows
-        // of transform buffer. These rows are transformed horizontally
-        // only and will be used in next iteration.
-        float last7Lines[7];
-        for(int i = 0; i < 7; i++) {
-          last7Lines[i] = buffer[outColumnOffset + (WIN_SIZE_Y + i) * STRIDE];
-        }
-
-        // vertically transform all central columns
-        buffer.scaleVertical(scale97Div, scale97Mul, outColumnOffset,
-                             WIN_SIZE_Y + 7, 0);
-        buffer.forEachVerticalOdd(outColumnOffset, AddScaledSum(r97update2));
-        buffer.forEachVerticalEven(outColumnOffset, AddScaledSum(r97predict2));
-        buffer.forEachVerticalOdd(outColumnOffset, AddScaledSum(r97update1));
-        buffer.forEachVerticalEven(outColumnOffset, AddScaledSum(r97Predict1));
-
-        // Save all results of current window. Results are in transform buffer
-        // at rows from #3 to #(3 + WIN_SIZE_Y). Other rows are invalid now.
-        // (They only served as a boundary for vertical RDWT.)
-        for(int i = 3; i < (3 + WIN_SIZE_Y); i++) {
-          writer.writeInto(out, buffer[outColumnOffset + i * STRIDE]);
-        }
-
-        // Use last 7 remembered lines as first 7 lines for next iteration.
-        // As expected, these lines are already horizontally transformed.
-        for(int i = 0; i < 7; i++) {
-          buffer[outColumnOffset + i * STRIDE] = last7Lines[i];
-        }
-
-        // Wait for all writing threads before proceeding to loading new
-        // coeficients in next iteration. (Not to overwrite those which
-        // are not written yet.)
-        __syncthreads();
-      }
-    }
-
-
-  public:
-    /// Main GPU 9/7 RDWT entry point.
-    /// @param in     input image (9/7 transformed coefficients)
-    /// @param out    output buffer (for reverse transformed image)
-    /// @param sizeX  width of the output image
-    /// @param sizeY  height of the output image
-    __device__ static void run(const float * const input, float * const output,
-                               const int sx, const int sy, const int steps) {
-      // prepare instance with buffer in shared memory
-      __shared__ RDWT97<WIN_SIZE_X, WIN_SIZE_Y> rdwt97;
-
-      // Compute limits of this threadblock's block of pixels and use them to
-      // determine, whether this threadblock will have to deal with boundary.
-      // (3 in next expressions is for radius of impulse response of 9/7 RDWT.)
-      const int maxX = (blockIdx.x + 1) * WIN_SIZE_X + 3;
-      const int maxY = (blockIdx.y + 1) * WIN_SIZE_Y * steps + 3;
-      const bool atRightBoudary = maxX >= sx;
-      const bool atBottomBoudary = maxY >= sy;
-
-      // Select specialized version of code according to distance of this
-      // threadblock's pixels from image boundary.
-      if(atBottomBoudary) {
-        // near bottom boundary => check both writing and reading
-        rdwt97.transform<true, true>(input, output, sx, sy, steps);
-      } else if(atRightBoudary) {
-        // near right boundary only => check writing only
-        rdwt97.transform<false, true>(input, output, sx, sy, steps);
-      } else {
-        // no nearby boundary => check nothing
-        rdwt97.transform<false, false>(input, output, sx, sy, steps);
-      }
-    }
-
-  }; // end of class RDWT97
-
-
-
-  /// Main GPU 9/7 RDWT entry point.
-  /// @param in     input image (9/7 transformed coefficients)
-  /// @param out    output buffer (for reverse transformed image)
-  /// @param sizeX  width of the output image
-  /// @param sizeY  height of the output image
-  template <int WIN_SX, int WIN_SY>
-  __launch_bounds__(WIN_SX, CTMIN(SHM_SIZE/sizeof(RDWT97<WIN_SX, WIN_SY>), 8))
-  __global__ void rdwt97Kernel(const float * const in, float * const out,
-                               const int sx, const int sy, const int steps) {
-    RDWT97<WIN_SX, WIN_SY>::run(in, out, sx, sy, steps);
-  }
-
-
-
-  /// Only computes optimal number of sliding window steps,
-  /// number of threadblocks and then lanches the 9/7 RDWT kernel.
-  /// @tparam WIN_SX  width of sliding window
-  /// @tparam WIN_SY  height of sliding window
-  /// @param in       input image
-  /// @param out      output buffer
-  /// @param sx       width of the input image
-  /// @param sy       height of the input image
-  template <int WIN_SX, int WIN_SY>
-  void launchRDWT97Kernel (float * in, float * out, int sx, int sy) {
-    // compute optimal number of steps of each sliding window
-    const int steps = divRndUp(sy, 15 * WIN_SY);
-
-    // prepare grid size
-    dim3 gSize(divRndUp(sx, WIN_SX), divRndUp(sy, WIN_SY * steps));
-
-    // finally launch kernel
-    PERF_BEGIN
-    rdwt97Kernel<WIN_SX, WIN_SY><<<gSize, WIN_SX>>>(in, out, sx, sy, steps);
-    PERF_END("        RDWT97", sx, sy)
-    CudaDWTTester::checkLastKernelCall("RDWT 9/7 kernel");
-  }
-
-
-
-  /// Reverse 9/7 2D DWT. See common rules (dwt.h) for more details.
-  /// @param in      Input DWT coefficients. Format described in common rules.
-  ///                Will not be preserved (will be overwritten).
-  /// @param out     output buffer on GPU - will contain original image
-  ///                in normalized range [-0.5, 0.5].
-  /// @param sizeX   width of input image (in pixels)
-  /// @param sizeY   height of input image (in pixels)
-  /// @param levels  number of recursive DWT levels
-  void rdwt97(float * in, float * out, int sizeX, int sizeY, int levels) {
-    if(levels > 1) {
-      // let this function recursively reverse transform deeper levels first
-      const int llSizeX = divRndUp(sizeX, 2);
-      const int llSizeY = divRndUp(sizeY, 2);
-      rdwt97(in, out, llSizeX, llSizeY, levels - 1);
-
-      // copy reverse transformed LL band from output back into the input
-      memCopy(in, out, llSizeX, llSizeY);
-    }
-
-    // select right width of kernel for the size of the image
-    if(sizeX >= 960) {
-      launchRDWT97Kernel<192, 8>(in, out, sizeX, sizeY);
-    } else if (sizeX >= 480) {
-      launchRDWT97Kernel<128, 6>(in, out, sizeX, sizeY);
-    } else {
-      launchRDWT97Kernel<64, 6>(in, out, sizeX, sizeY);
-    }
-  }
-
-
-
-} // end of namespace dwt_cuda
--- a/examples/dwt2d/dwt_cuda/transform_buffer.h
+++ b/examples/dwt2d/dwt_cuda/transform_buffer.h
@ -1,338 +0,0 @@
-/// line 248 the index
-/// @file    transform_buffer.h
-/// @brief   Buffer with separated even and odd columns and related algorithms.
-/// @author  Martin Jirman (207962@mail.muni.cz)
-/// @date    2011-01-20 18:33
-///
-///
-/// Copyright (c) 2011 Martin Jirman
-/// All rights reserved.
-///
-/// Redistribution and use in source and binary forms, with or without
-/// modification, are permitted provided that the following conditions are met:
-///
-///     * Redistributions of source code must retain the above copyright
-///       notice, this list of conditions and the following disclaimer.
-///     * Redistributions in binary form must reproduce the above copyright
-///       notice, this list of conditions and the following disclaimer in the
-///       documentation and/or other materials provided with the distribution.
-///
-/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-/// POSSIBILITY OF SUCH DAMAGE.
-///
-
-#ifndef TRANSFORM_BUFFER_H
-#define TRANSFORM_BUFFER_H
-
-namespace dwt_cuda {
-
-/// Buffer (in shared memory of GPU) where block of input image is stored,
-/// but odd and even lines are separated. (Generates less bank conflicts when
-/// using lifting schema.) All operations expect SIZE_X threads.
-/// Also implements basic building blocks of lifting schema.
-/// @tparam SIZE_X      width of the buffer excluding two boundaries (Also
-///                     a number of threads participating on all operations.)
-///                     Must be divisible by 4.
-/// @tparam SIZE_Y      height of buffer (total number of lines)
-/// @tparam BOUNDARY_X  number of extra pixels at the left and right side
-///                     boundary is expected to be smaller than half SIZE_X
-///                     Must be divisible by 2.
-template <typename T, int SIZE_X, int SIZE_Y, int BOUNDARY_X>
-class TransformBuffer {
-public:
-  enum {
-    /// difference between pointers to two vertical neigbors
-    VERTICAL_STRIDE = BOUNDARY_X + (SIZE_X / 2)
-  };
-
-private:
-  enum {
-/// number of shared memory banks - needed for correct padding
-#ifdef __CUDA_ARCH__
-    SHM_BANKS = ((__CUDA_ARCH__ >= 200) ? 32 : 16),
-#else
-    SHM_BANKS = 16, // for host code only - can be anything, won't be used
-#endif
-
-    /// size of one of two buffers (odd or even)
-    BUFFER_SIZE = VERTICAL_STRIDE * SIZE_Y,
-
-    /// unused space between two buffers
-    PADDING = SHM_BANKS - ((BUFFER_SIZE + SHM_BANKS / 2) % SHM_BANKS),
-
-    /// offset of the odd columns buffer from the beginning of data buffer
-    ODD_OFFSET = BUFFER_SIZE + PADDING,
-  };
-
-  /// buffer for both even and odd columns
-  T data[2 * BUFFER_SIZE + PADDING];
-
-  /// Applies specified function to all central elements while also passing
-  /// previous and next elements as parameters.
-  /// @param count         count of central elements to apply function to
-  /// @param prevOffset    offset of first central element
-  /// @param midOffset     offset of first central element's predecessor
-  /// @param nextOffset    offset of first central element's successor
-  /// @param function      the function itself
-  template <typename FUNC>
-  __device__ void horizontalStep(const int count, const int prevOffset,
-                                 const int midOffset, const int nextOffset,
-                                 const FUNC &function) {
-    // number of unchecked iterations
-    const int STEPS = count / SIZE_X;
-
-    // items remaining after last unchecked iteration
-    const int finalCount = count % SIZE_X;
-
-    // offset of items processed in last (checked) iteration
-    const int finalOffset = count - finalCount;
-
-    // all threads perform fixed number of iterations ...
-    for (int i = 0; i < STEPS; i++) {
-      // for(int i = 0; i < 3; i++) {
-      const T previous = data[prevOffset + i * SIZE_X + threadIdx.x];
-      const T next = data[nextOffset + i * SIZE_X + threadIdx.x];
-      T &center = data[midOffset + i * SIZE_X + threadIdx.x];
-      // function(previous, center, (nextOffset + i*SIZE_X+threadIdx.x));
-      function(previous, center, next); // the real one
-    }
-
-    // ... but not all threads participate on final iteration
-    if (threadIdx.x < finalCount) {
-      const T previous = data[prevOffset + finalOffset + threadIdx.x];
-      const T next = data[nextOffset + finalOffset + threadIdx.x];
-      T &center = data[midOffset + finalOffset + threadIdx.x];
-      // function(previous, center, (nextOffset+finalOffset+threadIdx.x));
-      // kaixi
-      function(previous, center, next); // the real one
-    }
-  }
-
-public:
-  __device__ void getPrintData() {
-    //
-    for (int i = 0; i < 2 * BUFFER_SIZE + PADDING; i++) {
-      printf(" index: %d  data: %f \n ", i, data[i]);
-    }
-  }
-
-  /// Gets offset of the column with given index. Central columns have
-  /// indices from 0 to NUM_LINES - 1, left boundary columns have negative
-  /// indices and right boundary columns indices start with NUM_LINES.
-  /// @param columnIndex  index of column to get pointer to
-  /// @return  offset of the first item of column with specified index
-  __device__ int getColumnOffset(int columnIndex) {
-    columnIndex += BOUNDARY_X;               // skip boundary
-    return columnIndex / 2                   // select right column
-           + (columnIndex & 1) * ODD_OFFSET; // select odd or even buffer
-  }
-
-  /// Provides access to data of the transform buffer.
-  /// @param index  index of the item to work with
-  /// @return reference to item at given index
-  __device__ T &operator[](const int index) { return data[index]; }
-
-  /// Applies specified function to all horizontally even elements in
-  /// specified lines. (Including even elements in boundaries except
-  /// first even element in first left boundary.) SIZE_X threads participate
-  /// and synchronization is needed before result can be used.
-  /// @param firstLine  index of first line
-  /// @param numLines   count of lines
-  /// @param func       function to be applied on all even elements
-  ///                   parameters: previous (odd) element, the even
-  ///                   element itself and finally next (odd) element
-  template <typename FUNC>
-  __device__ void forEachHorizontalEven(const int firstLine, const int numLines,
-                                        const FUNC &func) {
-    // number of even elemens to apply function to
-    const int count = numLines * VERTICAL_STRIDE - 1;
-    // offset of first even element
-    const int centerOffset = firstLine * VERTICAL_STRIDE + 1;
-    // offset of odd predecessor of first even element
-    const int prevOffset = firstLine * VERTICAL_STRIDE + ODD_OFFSET;
-    // offset of odd successor of first even element
-    const int nextOffset = prevOffset + 1;
-
-    // if(threadIdx.x == 0) {
-
-    //   printf("forEachHorizontalEven count  %d, centerOffset %d prevOffset %d
-    //   nextOffset %d \n", count, centerOffset, prevOffset, nextOffset);
-    // }
-
-    // call generic horizontal step function
-    horizontalStep(count, prevOffset, centerOffset, nextOffset, func);
-  }
-
-  /// Applies given function to all horizontally odd elements in specified
-  /// lines. (Including odd elements in boundaries except last odd element
-  /// in last right boundary.) SIZE_X threads participate and synchronization
-  /// is needed before result can be used.
-  /// @param firstLine  index of first line
-  /// @param numLines   count of lines
-  /// @param func       function to be applied on all odd elements
-  ///                   parameters: previous (even) element, the odd
-  ///                   element itself and finally next (even) element
-  template <typename FUNC>
-  __device__ void forEachHorizontalOdd(const int firstLine, const int numLines,
-                                       const FUNC &func) {
-    // numbet of odd elements to apply function to
-    const int count = numLines * VERTICAL_STRIDE - 1;
-    // offset of even predecessor of first odd element
-    const int prevOffset = firstLine * VERTICAL_STRIDE;
-    // offset of first odd element
-    const int centerOffset = prevOffset + ODD_OFFSET;
-    // offset of even successor of first odd element
-    const int nextOffset = prevOffset + 1;
-
-    //  if(threadIdx.x == 0) {
-    //   printf("forEachHorizontalOdd count  %d, centerOffset %d prevOffset %d
-    //   nextOffset %d \n", count, centerOffset, prevOffset, nextOffset);
-    // }
-
-    // call generic horizontal step function
-    horizontalStep(count, prevOffset, centerOffset, nextOffset, func);
-  }
-
-  /// Applies specified function to all even elements (except element #0)
-  /// of given column. Each thread takes care of one column, so there's
-  /// no need for synchronization.
-  /// @param columnOffset  offset of thread's column
-  /// @param f             function to be applied on all even elements
-  ///                      parameters: previous (odd) element, the even
-  ///                      element itself and finally next (odd) element
-  template <typename F>
-  __device__ void forEachVerticalEven(const int columnOffset, const F &f) {
-    if (SIZE_Y > 3) { // makes no sense otherwise
-      const int steps = SIZE_Y / 2 - 1;
-      for (int i = 0; i < steps; i++) {
-        const int row = 2 + i * 2;
-        const T prev = data[columnOffset + (row - 1) * VERTICAL_STRIDE];
-        const T next = data[columnOffset + (row + 1) * VERTICAL_STRIDE];
-        f(prev, data[columnOffset + row * VERTICAL_STRIDE], next);
-
-        //--------------- FOR TEST -----------------
-        /*		__syncthreads();
-                        if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){
-                                diffOut[2500]++;
-                                diffOut[diffOut[2500]] = 2;//data[columnOffset +
-           row * VERTICAL_STRIDE];
-                        }
-                        __syncthreads();
-        */		  //--------------- FOR TEST -----------------
-      }
-    }
-  }
-
-  /// Applies specified function to all odd elements of given column.
-  /// Each thread takes care of one column, so there's no need for
-  /// synchronization.
-  /// @param columnOffset  offset of thread's column
-  /// @param f             function to be applied on all odd elements
-  ///                      parameters: previous (even) element, the odd
-  ///                      element itself and finally next (even) element
-  template <typename F>
-  __device__ void forEachVerticalOdd(const int columnOffset, const F &f) {
-    const int steps = (SIZE_Y - 1) / 2;
-    for (int i = 0; i < steps; i++) {
-      const int row = i * 2 + 1;
-      const T prev = data[columnOffset + (row - 1) * VERTICAL_STRIDE];
-      const T next = data[columnOffset + (row + 1) * VERTICAL_STRIDE];
-
-      f(prev, data[columnOffset + row * VERTICAL_STRIDE], next);
-
-      //--------------- FOR TEST -----------------
-      /*		__syncthreads();
-                      if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){
-                              diffOut[2500]++;
-                              diffOut[diffOut[2500]] = 1; //data[columnOffset +
-         row * VERTICAL_STRIDE];
-                      }
-
-                      __syncthreads();
-      */		  //--------------- FOR TEST -----------------
-    }
-  }
-
-  /// Scales elements at specified lines.
-  /// @param evenScale  scaling factor for horizontally even elements
-  /// @param oddScale   scaling factor for horizontally odd elements
-  /// @param numLines   number of lines, whose elements should be scaled
-  /// @param firstLine  index of first line to scale elements in
-  __device__ void scaleHorizontal(const T evenScale, const T oddScale,
-                                  const int firstLine, const int numLines) {
-    const int offset = firstLine * VERTICAL_STRIDE;
-    const int count = numLines * VERTICAL_STRIDE;
-    const int steps = count / SIZE_X;
-    const int finalCount = count % SIZE_X;
-    const int finalOffset = count - finalCount;
-
-    // printf("scaleHorizontal sizeX: %d  offset %d, count, %d, steps, %d,
-    // finalCount %d, finalOffset %d \n", SIZE_X, offset, count, steps,
-    // finalCount, finalOffset);
-
-    // run iterations, whete all threads participate
-    for (int i = 0; i < steps; i++) {
-      data[threadIdx.x + i * SIZE_X + offset] *= evenScale;
-      // if(threadIdx.x + i * SIZE_X + offset == 531) {
-      //   printf("threadidx 531: %d \n", threadIdx.x);
-      // }
-      // if(threadIdx.x + i * SIZE_X + offset + ODD_OFFSET == 531) {
-      //   printf("threadidx 531: %d \n", threadIdx.x);
-      // }
-      data[threadIdx.x + i * SIZE_X + offset + ODD_OFFSET] *= oddScale;
-    }
-
-    // some threads also finish remaining unscaled items
-    if (threadIdx.x < finalCount) {
-      data[threadIdx.x + finalOffset + offset] *= evenScale;
-      // if(threadIdx.x + finalOffset + offset == 531) {
-      //   printf("threadidx 531: %d \n", threadIdx.x);
-      // }
-      //  if(threadIdx.x + finalOffset + offset + ODD_OFFSET == 531) {
-      //   printf("threadidx 531: %d \n", threadIdx.x);
-      // }
-      data[threadIdx.x + finalOffset + offset + ODD_OFFSET] *= oddScale;
-    }
-  }
-
-  /// Scales elements in specified column.
-  /// @param evenScale     scaling factor for vertically even elements
-  /// @param oddScale      scaling factor for vertically odd elements
-  /// @param columnOffset  offset of the column to work with
-  /// @param numLines      number of lines, whose elements should be scaled
-  /// @param firstLine     index of first line to scale elements in
-  __device__ void scaleVertical(const T evenScale, const T oddScale,
-                                const int columnOffset, const int numLines,
-                                const int firstLine) {
-    for (int i = firstLine; i < (numLines + firstLine); i++) {
-      if (i & 1) {
-        data[columnOffset + i * VERTICAL_STRIDE] *= oddScale;
-      } else {
-        data[columnOffset + i * VERTICAL_STRIDE] *= evenScale;
-      }
-    }
-  }
-
-  //****************For Test(Feb23), test inter parameters*************
-  __device__ int getVERTICAL_STRIDE() { return VERTICAL_STRIDE; }
-  __device__ int getSHM_BANKS() { return SHM_BANKS; }
-  __device__ int getBuffersize() { return BUFFER_SIZE; }
-  __device__ int getPADDING() { return PADDING; }
-  __device__ int getODD_OFFSET() { return ODD_OFFSET; }
-
-  //****************For Test(Feb23), test inter parameters*************
-
-}; // end of class TransformBuffer
-
-} // namespace dwt_cuda
-
-#endif // TRANSFORM_BUFFER_H
--- a/examples/dwt2d/main.cu
+++ b/examples/dwt2d/main.cu
@ -1,401 +0,0 @@
-/*
- * Copyright (c) 2009, Jiri Matela
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <unistd.h>
-#include <error.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <fcntl.h>
-#include <errno.h>
-#include <string.h>
-#include <assert.h>
-#include <sys/time.h>
-#include <getopt.h>
-
-#include "common.h"
-#include "components.h"
-#include "dwt.h"
-
-/// Parameters of one DWT run, filled in by main() and read by processDWT().
-struct dwt {
-    char * srcFilename;    // input file name as given on the command line
-    char * outFilename;    // base name handed to the write* helpers, which receive a suffix alongside it
-    unsigned char *srcImg; // host buffer with the raw input bytes (pixWidth*pixHeight*components bytes)
-    int pixWidth;          // image width in pixels
-    int pixHeight;         // image height in pixels
-    int components;        // number of color components; processDWT() handles 3 and 1
-    int dwtLvls;           // number of DWT levels passed to nStage2dDWT()
-};
-
-/// Loads raw image bytes from the example data directory into a caller buffer.
-///
-/// The file is opened as "../../data/dwt2d/" + srcFilename. The bytes read and
-/// the requested size are reported on stdout.
-///
-/// @param srcFilename  file name relative to the data directory
-/// @param srcImg       destination buffer, at least inputSize bytes large
-/// @param inputSize    number of bytes to read
-/// @return 0 when the file was opened and read (a file shorter than inputSize
-///         still returns 0; the short count is visible in the printed report),
-///         -1 when the path cannot be built, the file cannot be opened, or
-///         reading fails
-int getImg(char * srcFilename, unsigned char *srcImg, int inputSize)
-{
-    const char *path = "../../data/dwt2d/";
-    size_t fullLen = strlen(path) + strlen(srcFilename) + 1;
-    char *fullPath = (char *)malloc(fullLen);
-    if (fullPath == NULL) {
-        error(0, errno, "cannot allocate path for %s", srcFilename);
-        return -1;
-    }
-    snprintf(fullPath, fullLen, "%s%s", path, srcFilename);
-    printf("Loading ipnput: %s\n", fullPath);
-
-    //read image
-    int fd = open(fullPath, O_RDONLY);
-    if (fd == -1) {
-        error(0, errno, "cannot access %s", fullPath);
-        free(fullPath);
-        return -1;
-    }
-
-    // read() may return fewer bytes than requested, so keep reading until the
-    // buffer is full or end of file is reached.
-    int total = 0;
-    while (total < inputSize) {
-        ssize_t got = read(fd, srcImg + total, (size_t)(inputSize - total));
-        if (got == -1) {
-            if (errno == EINTR) {
-                continue; // interrupted before any data arrived: retry
-            }
-            // report before close()/free() get a chance to overwrite errno
-            error(0, errno, "cannot read %s", fullPath);
-            close(fd);
-            free(fullPath);
-            return -1;
-        }
-        if (got == 0) {
-            break; // end of file
-        }
-        total += (int)got;
-    }
-    printf("precteno %d, inputsize %d\n", total, inputSize);
-    close(fd);
-    free(fullPath);
-
-    return 0;
-}
-
-
-/// Prints the command-line synopsis and the list of supported options to
-/// stdout. The list mirrors the options parsed in main(), including -h.
-void usage() {
-    // Adjacent string literals instead of backslash continuations, so the
-    // source indentation can never leak into the printed text.
-    fputs("dwt [options] src_img.rgb <out_img.dwt>\n"
-          "  -d, --dimension\t\tdimensions of src img, e.g. 1920x1080\n"
-          "  -c, --components\t\tnumber of color components, default 3\n"
-          "  -b, --depth\t\t\tbit depth, default 8\n"
-          "  -l, --level\t\t\tDWT level, default 3\n"
-          "  -D, --device\t\t\tcuda device\n"
-          "  -f, --forward\t\t\tforward transform\n"
-          "  -r, --reverse\t\t\treverse transform\n"
-          "  -9, --97\t\t\t9/7 transform\n"
-          "  -5, --53\t\t\t5/3 transform\n"
-          "  -w, --write-visual\t\twrite output in visual (tiled) fashion instead of the linear\n"
-          "  -h, --help\t\t\tprint this help\n",
-          stdout);
-}
-
-/// Runs the DWT on the image described by d and writes the result to file(s).
-///
-/// T is the per-sample working type; main() instantiates it with float for the
-/// 9/7 transform and int for the 5/3 transform.
-///
-/// Only d->components == 3 and d->components == 1 are handled. For any other
-/// value the function allocates and frees the two shared buffers and does
-/// nothing else.
-///
-/// @param d            run parameters (image, dimensions, levels, output name)
-/// @param forward      passed through to nStage2dDWT(); main() uses 1 for the
-///                     forward and 0 for the reverse transform
-/// @param writeVisual  non-zero selects writeNStage2DDWT() instead of
-///                     writeLinear() where that choice is compiled in
-template <typename T>
-void processDWT(struct dwt *d, int forward, int writeVisual)
-{
-    // Size in bytes of one component plane.
-    // NOTE(review): the product is narrowed to int; confirm that very large
-    // images are out of scope.
-    int componentSize = d->pixWidth*d->pixHeight*sizeof(T);
-
-    // Buffers shared by both branches: output of the first component, and a
-    // scratch buffer handed to every nStage2dDWT() call.
-    T *c_r_out, *backup ;
-    cudaMalloc((void**)&c_r_out, componentSize); //< aligned component size
-    cudaCheckError("Alloc device memory");
-    cudaMemset(c_r_out, 0, componentSize);
-    cudaCheckError("Memset device memory");
-
-    cudaMalloc((void**)&backup, componentSize); //< aligned component size
-    cudaCheckError("Alloc device memory");
-    cudaMemset(backup, 0, componentSize);
-    cudaCheckError("Memset device memory");
-
-    if (d->components == 3) {
-        /* Alloc two more buffers for G and B */
-        T *c_g_out, *c_b_out;
-        cudaMalloc((void**)&c_g_out, componentSize); //< aligned component size
-        cudaCheckError("Alloc device memory");
-        cudaMemset(c_g_out, 0, componentSize);
-        cudaCheckError("Memset device memory");
-
-        cudaMalloc((void**)&c_b_out, componentSize); //< aligned component size
-        cudaCheckError("Alloc device memory");
-        cudaMemset(c_b_out, 0, componentSize);
-        cudaCheckError("Memset device memory");
-
-        /* Load components */
-        T *c_r, *c_g, *c_b;
-        cudaMalloc((void**)&c_r, componentSize); //< R, aligned component size
-        cudaCheckError("Alloc device memory");
-        cudaMemset(c_r, 0, componentSize);
-        cudaCheckError("Memset device memory");
-
-        cudaMalloc((void**)&c_g, componentSize); //< G, aligned component size
-        cudaCheckError("Alloc device memory");
-        cudaMemset(c_g, 0, componentSize);
-        cudaCheckError("Memset device memory");
-
-        cudaMalloc((void**)&c_b, componentSize); //< B, aligned component size
-        cudaCheckError("Alloc device memory");
-        cudaMemset(c_b, 0, componentSize);
-        cudaCheckError("Memset device memory");
-
-        // presumably splits the source image into separate R, G and B planes;
-        // verify against components.h
-        rgbToComponents(c_r, c_g, c_b, d->srcImg, d->pixWidth, d->pixHeight);
-
-
-        /* Compute DWT and always store into file */
-        // The three calls run one after another and share the backup buffer.
-        nStage2dDWT(c_r, c_r_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
-        nStage2dDWT(c_g, c_g_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
-        nStage2dDWT(c_b, c_b_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
-
-        // -------test----------
-        // T *h_r_out=(T*)malloc(componentSize);
-		// cudaMemcpy(h_r_out, c_g_out, componentSize, cudaMemcpyDeviceToHost);
-        // int ii;
-		// for(ii=0;ii<componentSize/sizeof(T);ii++) {
-			// fprintf(stderr, "%d ", h_r_out[ii]);
-			// if((ii+1) % (d->pixWidth) == 0) fprintf(stderr, "\n");
-        // }
-        // -------test----------
-
-
-        /* Store DWT to file */
-        // Only the first component is written unconditionally; the other two
-        // are written only when OUTPUT is defined.
-        writeLinear(c_r_out, d->pixWidth, d->pixHeight, d->outFilename, ".r");
-        // writeLinear(c_g_out, d->pixWidth, d->pixHeight, d->outFilename, ".g");
-        // writeLinear(c_b_out, d->pixWidth, d->pixHeight, d->outFilename, ".b");
-#ifdef OUTPUT
-        // NOTE(review): with OUTPUT defined and writeVisual == 0 the ".r"
-        // output is written a second time by the else-branch below.
-        if (writeVisual) {
-            writeNStage2DDWT(c_r_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".r");
-            writeNStage2DDWT(c_g_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".g");
-            writeNStage2DDWT(c_b_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".b");
-        } else {
-            writeLinear(c_r_out, d->pixWidth, d->pixHeight, d->outFilename, ".r");
-            writeLinear(c_g_out, d->pixWidth, d->pixHeight, d->outFilename, ".g");
-            writeLinear(c_b_out, d->pixWidth, d->pixHeight, d->outFilename, ".b");
-        }
-#endif
-
-
-        // Release the branch-local buffers; c_r_out and backup are freed at
-        // the end of the function.
-        cudaFree(c_r);
-        cudaCheckError("Cuda free");
-        cudaFree(c_g);
-        cudaCheckError("Cuda free");
-        cudaFree(c_b);
-        cudaCheckError("Cuda free");
-        cudaFree(c_g_out);
-        cudaCheckError("Cuda free");
-        cudaFree(c_b_out);
-        cudaCheckError("Cuda free");
-
-    }
-    else if (d->components == 1) {
-        //Load component
-        T *c_r;
-        cudaMalloc((void**)&(c_r), componentSize); //< R, aligned component size
-        cudaCheckError("Alloc device memory");
-        cudaMemset(c_r, 0, componentSize);
-        cudaCheckError("Memset device memory");
-
-        // presumably converts the single-channel source image into type T;
-        // verify against components.h
-        bwToComponent(c_r, d->srcImg, d->pixWidth, d->pixHeight);
-
-        // Compute DWT
-        nStage2dDWT(c_r, c_r_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
-
-        // Store DWT to file
-        // The OUTPUT guard is commented out here, so this branch always writes.
-// #ifdef OUTPUT
-        if (writeVisual) {
-            writeNStage2DDWT(c_r_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".out");
-        } else {
-            writeLinear(c_r_out, d->pixWidth, d->pixHeight, d->outFilename, ".lin.out");
-        }
-// #endif
-        cudaFree(c_r);
-        cudaCheckError("Cuda free");
-    }
-
-    // Buffers allocated before the branch are released on every path.
-    cudaFree(c_r_out);
-    cudaCheckError("Cuda free device");
-    cudaFree(backup);
-    cudaCheckError("Cuda free device");
-}
-
-/// Command-line driver of the 2D DWT example.
-///
-/// Parses the options listed by usage(), validates the image dimensions,
-/// selects the CUDA device, loads the source image into host memory obtained
-/// from cudaMallocHost() via getImg(), and runs processDWT<float> for the 9/7
-/// transform or processDWT<int> for the 5/3 transform.
-///
-/// @return 0 on success or after printing the help text; -1 on invalid
-///         arguments, when no usable CUDA device is selected, on allocation
-///         failure, or when the input image cannot be loaded
-int main(int argc, char **argv)
-{
-    int optindex = 0;
-    // getopt_long() returns int. Storing the result in a plain char breaks the
-    // comparison with -1 on platforms where char is unsigned.
-    int ch;
-    struct option longopts[] = {
-        {"dimension",   required_argument, 0, 'd'}, //dimensions of src img
-        {"components",  required_argument, 0, 'c'}, //number of components of src img
-        {"depth",       required_argument, 0, 'b'}, //bit depth of src img
-        {"level",       required_argument, 0, 'l'}, //level of dwt
-        {"device",      required_argument, 0, 'D'}, //cuda device
-        {"forward",     no_argument,       0, 'f'}, //forward transform
-        {"reverse",     no_argument,       0, 'r'}, //reverse transform
-        {"97",          no_argument,       0, '9'}, //9/7 transform
-        {"53",          no_argument,       0, '5'}, //5/3 transform
-        {"write-visual",no_argument,       0, 'w'}, //write output (subbands) in visual (tiled) order instead of linear
-        {"help",        no_argument,       0, 'h'}, //print usage
-        {0,             0,                 0, 0  }  //all-zero terminator required by getopt_long()
-    };
-
-    int pixWidth    = 0; //<real pixWidth
-    int pixHeight   = 0; //<real pixHeight
-    int compCount   = 3; //number of components; 3 for RGB or YUV, 4 for RGBA
-    int bitDepth    = 8;
-    int dwtLvls     = 3; //default number of DWT levels
-    int device      = 0;
-    int forward     = 1; //forward transform
-    int dwt97       = 1; //1=dwt9/7, 0=dwt5/3 transform
-    int writeVisual = 0; //write output (subbands) in visual (tiled) order instead of linear
-    char * pos;
-
-    while ((ch = getopt_long(argc, argv, "d:c:b:l:D:fr95wh", longopts, &optindex)) != -1) {
-        switch (ch) {
-        case 'd':
-            // expected form: <width>x<height>
-            pixWidth = atoi(optarg);
-            pos = strstr(optarg, "x");
-            if (pos == NULL || pixWidth == 0 || (strlen(pos) >= strlen(optarg))) {
-                usage();
-                return -1;
-            }
-            pixHeight = atoi(pos+1);
-            break;
-        case 'c':
-            compCount = atoi(optarg);
-            break;
-        case 'b':
-            bitDepth = atoi(optarg);
-            break;
-        case 'l':
-            dwtLvls = atoi(optarg);
-            break;
-        case 'D':
-            device = atoi(optarg);
-            break;
-        case 'f':
-            forward = 1;
-            break;
-        case 'r':
-            forward = 0;
-            break;
-        case '9':
-            dwt97 = 1;
-            break;
-        case '5':
-            dwt97 = 0;
-            break;
-        case 'w':
-            writeVisual = 1;
-            break;
-        case 'h':
-            usage();
-            return 0;
-        case '?':
-            return -1;
-        default :
-            usage();
-            return -1;
-        }
-    }
-    // skip the parsed options; argv[0] is now the first file name
-    argc -= optind;
-    argv += optind;
-
-    if (argc == 0) { // at least one filename is expected
-        printf("Please supply src file name\n");
-        usage();
-        return -1;
-    }
-
-    if (pixWidth <= 0 || pixHeight <=0) {
-        printf("Wrong or missing dimensions\n");
-        usage();
-        return -1;
-    }
-
-    if (forward == 0) {
-        writeVisual = 0; //do not write visual when RDWT
-    }
-
-    // device init
-    int devCount;
-    cudaSetDevice(0);
-    cudaGetDeviceCount(&devCount);
-    cudaCheckError("Get device count");
-    if (devCount == 0) {
-        printf("No CUDA enabled device\n");
-        return -1;
-    }
-    if (device < 0 || device > devCount -1) {
-        printf("Selected device %d is out of bound. Devices on your system are in range %d - %d\n",
-               device, 0, devCount -1);
-        return -1;
-    }
-    cudaDeviceProp devProp;
-    cudaGetDeviceProperties(&devProp, device);
-    cudaCheckError("Get device properties");
-    printf("Using device %d: %s\n", device, devProp.name);
-    cudaSetDevice(device);
-    cudaCheckError("Set selected device");
-
-    struct dwt *d = (struct dwt *)malloc(sizeof(struct dwt));
-    if (d == NULL) {
-        printf("Cannot allocate DWT parameters\n");
-        return -1;
-    }
-    d->srcImg = NULL;
-    d->pixWidth = pixWidth;
-    d->pixHeight = pixHeight;
-    d->components = compCount;
-    d->dwtLvls  = dwtLvls;
-
-    // file names
-    // strdup() sizes the copy including the terminating NUL; a buffer of
-    // strlen() bytes filled by strcpy() is one byte too small.
-    d->srcFilename = strdup(argv[0]);
-    if (argc == 1) { // only one filename supplied: derive "<src>.dwt"
-        // sizeof(".dwt") is 5: four characters plus the terminating NUL
-        size_t outLen = strlen(argv[0]) + sizeof(".dwt");
-        d->outFilename = (char *)malloc(outLen);
-        if (d->outFilename != NULL) {
-            snprintf(d->outFilename, outLen, "%s.dwt", argv[0]);
-        }
-    } else {
-        d->outFilename = strdup(argv[1]);
-    }
-    if (d->srcFilename == NULL || d->outFilename == NULL) {
-        printf("Cannot allocate file names\n");
-        free(d->srcFilename);
-        free(d->outFilename);
-        free(d);
-        return -1;
-    }
-
-    //Input review
-    printf("Source file:\t\t%s\n", d->srcFilename);
-    printf(" Dimensions:\t\t%dx%d\n", pixWidth, pixHeight);
-    printf(" Components count:\t%d\n", compCount);
-    printf(" Bit depth:\t\t%d\n", bitDepth);
-    printf(" DWT levels:\t\t%d\n", dwtLvls);
-    printf(" Forward transform:\t%d\n", forward);
-    printf(" 9/7 transform:\t\t%d\n", dwt97);
-
-    //data sizes
-    int inputSize = pixWidth*pixHeight*compCount; //<amount of data (in bytes) to process
-
-    //load img source image
-    cudaMallocHost((void **)&d->srcImg, inputSize);
-    cudaCheckError("Alloc host memory");
-
-    int status = 0;
-    if (getImg(d->srcFilename, d->srcImg, inputSize) == -1) {
-        status = -1;
-    } else {
-        /* DWT */
-        // Forward and reverse runs use the same call; the direction is carried
-        // by the forward flag.
-        if (dwt97 == 1) {
-            processDWT<float>(d, forward, writeVisual);
-        } else { // 5/3
-            processDWT<int>(d, forward, writeVisual);
-        }
-    }
-
-    cudaFreeHost(d->srcImg);
-    cudaCheckError("Cuda free host");
-    free(d->srcFilename);
-    free(d->outFilename);
-    free(d);
-
-    return status;
-}
--- a/examples/dwt2d/run.sh
+++ b/examples/dwt2d/run.sh
@ -1,8 +0,0 @@
-./dwt2d 4.bmp z.dwt -d 4x4 -f -5 -l 3
-# ./dwt2d 8.bmp -d 8x8 -f -5 -l 3
-# ./dwt2d 16.bmp -d 16x16 -f -5 -l 3
-# ./dwt2d 64.bmp -d 64x64 -f -5 -l 3
-
-# ./dwt2d 192.bmp -d 192x192 -f -5 -l 3
-# ls
-# ./dwt2d rgb.bmp -d 1024x1024 -f -5 -l 3
--- a/examples/dwt2d/run_cpu.sh
+++ b/examples/dwt2d/run_cpu.sh
@ -1,7 +0,0 @@
-# ./dwt2d 192.bmp -d 192x192 -f -5 -l 3
-# ls
-# ./dwt2d rgb.bmp -d 1024x1024 -f -5 -l 3
-# ./dwt2d 16.bmp -d 16x16 -f -9 -l 3\
-./dwt2d 4.bmp  -d 4x4 -r -5 -l 3
-# ./dwt2d 4.bmp  -d 4x4 -r -9 -l 3
-# ./dwt2d 8.bmp  -d 8x8 -f -9 -l 3
--- a/examples/dwt2d/run_nvcc.sh
+++ b/examples/dwt2d/run_nvcc.sh
@ -1,14 +0,0 @@
-# ./nvcc_dwt2d 192.bmp -d 192x192 -f -5 -l 3
-# ls
-# ./nvcc_dwt2d rgb.bmp -d 1024x1024 -f -5 -l 3
-# ./nvcc_dwt2d 4.bmp -d 4x4 -f -9 -l 3
-./nvcc_dwt2d 4.bmp  -d 4x4 -f -5 -l 3
-# ./nvcc_dwt2d 8.bmp -d 8x8 -f -9 -l 3
-# ./nvcc_dwt2d 16.bmp -d 16x16 -f -5 -l 3
-# ./nvcc_dwt2d 16.bmp -d 16x16 -r -5 -l 3
-# ./nvcc_dwt2d  16.bmp -d 16x16 -f -9 -l 3
-# ./nvcc_dwt2d 4.bmp  -d 4x4 -r -9 -l 3
-# ./nvcc_dwt2d 64.bmp -d 64x64 -f -5 -l 3
-# ./nvcc_dwt2d 192.bmp -d 192x192 -f -5 -l 3
-# ls
-# ./nvcc_dwt2d rgb.bmp -d 1024x1024 -f -5 -l 3
--- a/examples/dwt2d/test_compile_cpu.sh
+++ b/examples/dwt2d/test_compile_cpu.sh
@ -1,51 +0,0 @@
-
-
-#!/bin/bash
-
-clang++ -I. -I/include -fno-strict-aliasing dwt_cuda/fdwt53.cu dwt_cuda/fdwt97.cu  dwt_cuda/common.cu  dwt_cuda/rdwt97.cu  dwt_cuda/rdwt53.cu components.cu dwt.cu main.cu -c  --cuda-path=/usr/local/cuda-10.1 --cuda-gpu-arch=sm_50 -I. -I/include -L/usr/local/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v
-
-export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH
-
-../../build/compilation/kernelTranslator common-cuda-nvptx64-nvidia-cuda-sm_50.bc common.bc
-../../build/compilation/kernelTranslator components-cuda-nvptx64-nvidia-cuda-sm_50.bc components.bc
-../../build/compilation/kernelTranslator fdwt53-cuda-nvptx64-nvidia-cuda-sm_50.bc fdwt53.bc
-
-../../build/compilation/kernelTranslator dwt-cuda-nvptx64-nvidia-cuda-sm_50.bc dwt.bc
-
-../../build/compilation/hostTranslator main-host-x86_64-unknown-linux-gnu.bc host.bc
-../../build/compilation/hostTranslator common-host-x86_64-unknown-linux-gnu.bc common_host.bc
-../../build/compilation/hostTranslator components-host-x86_64-unknown-linux-gnu.bc components_host.bc
-../../build/compilation/hostTranslator dwt-host-x86_64-unknown-linux-gnu.bc dwt_host.bc
-../../build/compilation/hostTranslator fdwt53-host-x86_64-unknown-linux-gnu.bc fdwt53_host.bc
-
-../../build/compilation/hostTranslator fdwt97-host-x86_64-unknown-linux-gnu.bc fdwt97_host.bc
-../../build/compilation/hostTranslator rdwt53-host-x86_64-unknown-linux-gnu.bc rdwt53_host.bc
-../../build/compilation/hostTranslator rdwt97-host-x86_64-unknown-linux-gnu.bc rdwt97_host.bc
-../../build/compilation/kernelTranslator fdwt97-cuda-nvptx64-nvidia-cuda-sm_50.bc fdwt97.bc
-../../build/compilation/kernelTranslator rdwt97-cuda-nvptx64-nvidia-cuda-sm_50.bc rdwt97.bc
-../../build/compilation/kernelTranslator rdwt53-cuda-nvptx64-nvidia-cuda-sm_50.bc rdwt53.bc
-
-llc --relocation-model=pic --filetype=obj  common.bc
-llc --relocation-model=pic --filetype=obj  components.bc
-llc --relocation-model=pic --filetype=obj  fdwt53.bc
-
-llc --relocation-model=pic --filetype=obj  dwt.bc
-
-
-llc --relocation-model=pic --filetype=obj  host.bc
-
-llc --relocation-model=pic --filetype=obj  common_host.bc
-llc --relocation-model=pic --filetype=obj  components_host.bc
-llc --relocation-model=pic --filetype=obj  fdwt53_host.bc
-
-llc --relocation-model=pic --filetype=obj  dwt_host.bc
-
-
-llc --relocation-model=pic --filetype=obj  fdwt97_host.bc
-llc --relocation-model=pic --filetype=obj  rdwt97_host.bc
-llc --relocation-model=pic --filetype=obj  rdwt53_host.bc
-llc --relocation-model=pic --filetype=obj  fdwt97.bc
-llc --relocation-model=pic --filetype=obj  rdwt97.bc
-llc --relocation-model=pic --filetype=obj  rdwt53.bc
-
-g++ -g -Wall -L../../build/runtime  -L../../build/runtime/threadPool -o dwt2d -fPIC -no-pie common.o components.o dwt.o fdwt53.o fdwt97.o rdwt97.o rdwt53.o host.o common_host.o components_host.o dwt_host.o fdwt53_host.o fdwt97_host.o rdwt97_host.o rdwt53_host.o -lc -lx86Runtime -lthreadPool -lpthread
--- a/examples/dwt2d/test_compile_nvcc.sh
+++ b/examples/dwt2d/test_compile_nvcc.sh
@ -1,9 +0,0 @@
-/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include  -O2  --compiler-options -fno-strict-aliasing -c main.cu -o main.cu.o
-/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include  -O2  --compiler-options -fno-strict-aliasing -c dwt.cu -o dwt.cu.o
-/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include  -O2  --compiler-options -fno-strict-aliasing -c components.cu -o components.cu.o
-/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include  -O2  --compiler-options -fno-strict-aliasing -c dwt_cuda/fdwt53.cu -o dwt_cuda/fdwt53.cu.o
-/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include  -O2  --compiler-options -fno-strict-aliasing -c dwt_cuda/fdwt97.cu -o dwt_cuda/fdwt97.cu.o
-/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include  -O2  --compiler-options -fno-strict-aliasing -c dwt_cuda/common.cu -o dwt_cuda/common.cu.o
-/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include  -O2  --compiler-options -fno-strict-aliasing -c dwt_cuda/rdwt97.cu -o dwt_cuda/rdwt97.cu.o
-/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include  -O2  --compiler-options -fno-strict-aliasing -c dwt_cuda/rdwt53.cu -o dwt_cuda/rdwt53.cu.o
-g++ -fPIC -o nvcc_dwt2d main.cu.o dwt.cu.o components.cu.o dwt_cuda/fdwt53.cu.o dwt_cuda/fdwt97.cu.o dwt_cuda/common.cu.o dwt_cuda/rdwt97.cu.o dwt_cuda/rdwt53.cu.o -L/usr/local/cuda/lib64 -lcudart
--- a/examples/gauss/gaussian-cuda-nvptx64-nvidia-cuda-sm_61.ll
+++ b/examples/gauss/gaussian-cuda-nvptx64-nvidia-cuda-sm_61.ll
@ -1,396 +0,0 @@
-; ModuleID = 'gaussian-cuda-nvptx64-nvidia-cuda-sm_61.bc'
-source_filename = "gaussian.cu"
-target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
-target triple = "nvptx64-nvidia-cuda"
-
-%struct.__cuda_builtin_threadIdx_t = type { i8 }
-%struct.__cuda_builtin_blockIdx_t = type { i8 }
-%struct.__cuda_builtin_blockDim_t = type { i8 }
-%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
-
-$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
-
-$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
-
-$_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv = comdat any
-
-$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv = comdat any
-
-$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv = comdat any
-
-$_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv = comdat any
-
-@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
-@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
-@blockDim = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockDim_t, align 1
-
-; Weak device-side stub for cudaMalloc: spills both arguments to stack slots
-; and returns the constant 999 without doing anything else.
-; Function Attrs: convergent noinline nounwind optnone
-define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
-entry:
-  %p.addr = alloca i8**, align 8
-  %s.addr = alloca i64, align 8
-  store i8** %p, i8*** %p.addr, align 8
-  store i64 %s, i64* %s.addr, align 8
-  ret i32 999
-}
-
-; Weak device-side stub for cudaFuncGetAttributes: spills both arguments to
-; stack slots and returns the constant 999; %p is never written through.
-; Function Attrs: convergent noinline nounwind optnone
-define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
-entry:
-  %p.addr = alloca %struct.cudaFuncAttributes*, align 8
-  %c.addr = alloca i8*, align 8
-  store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
-  store i8* %c, i8** %c.addr, align 8
-  ret i32 999
-}
-
-; Weak device-side stub for cudaDeviceGetAttribute: spills its three arguments
-; to stack slots and returns the constant 999; %value is never written through.
-; Function Attrs: convergent noinline nounwind optnone
-define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
-entry:
-  %value.addr = alloca i32*, align 8
-  %attr.addr = alloca i32, align 4
-  %device.addr = alloca i32, align 4
-  store i32* %value, i32** %value.addr, align 8
-  store i32 %attr, i32* %attr.addr, align 4
-  store i32 %device, i32* %device.addr, align 4
-  ret i32 999
-}
-
-; Weak device-side stub for cudaGetDevice: spills its argument to a stack slot
-; and returns the constant 999; %device is never written through.
-; Function Attrs: convergent noinline nounwind optnone
-define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
-entry:
-  %device.addr = alloca i32*, align 8
-  store i32* %device, i32** %device.addr, align 8
-  ret i32 999
-}
-
-; Weak device-side stub for cudaOccupancyMaxActiveBlocksPerMultiprocessor:
-; spills its four arguments to stack slots and returns the constant 999;
-; %numBlocks is never written through.
-; Function Attrs: convergent noinline nounwind optnone
-define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
-entry:
-  %numBlocks.addr = alloca i32*, align 8
-  %func.addr = alloca i8*, align 8
-  %blockSize.addr = alloca i32, align 4
-  %dynamicSmemSize.addr = alloca i64, align 8
-  store i32* %numBlocks, i32** %numBlocks.addr, align 8
-  store i8* %func, i8** %func.addr, align 8
-  store i32 %blockSize, i32* %blockSize.addr, align 4
-  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
-  ret i32 999
-}
-
-; Weak device-side stub for
-; cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags: spills its five
-; arguments to stack slots and returns the constant 999; %numBlocks is never
-; written through.
-; Function Attrs: convergent noinline nounwind optnone
-define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
-entry:
-  %numBlocks.addr = alloca i32*, align 8
-  %func.addr = alloca i8*, align 8
-  %blockSize.addr = alloca i32, align 4
-  %dynamicSmemSize.addr = alloca i64, align 8
-  %flags.addr = alloca i32, align 4
-  store i32* %numBlocks, i32** %numBlocks.addr, align 8
-  store i8* %func, i8** %func.addr, align 8
-  store i32 %blockSize, i32* %blockSize.addr, align 4
-  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
-  store i32 %flags, i32* %flags.addr, align 4
-  ret i32 999
-}
-
-; Fan1(float* m_cuda, float* a_cuda, int Size, int t), unoptimized (optnone).
-; With gid = tid.x + ctaid.x * ntid.x, a thread returns early when
-; gid >= Size - 1 - t (unsigned compare); otherwise it stores
-;   m[Size * (gid + t + 1) + t] = a[Size * (gid + t + 1) + t] / a[Size * t + t]
-; Function Attrs: convergent noinline nounwind optnone
-define dso_local void @_Z4Fan1PfS_ii(float* %m_cuda, float* %a_cuda, i32 %Size, i32 %t) #0 {
-entry:
-  %m_cuda.addr = alloca float*, align 8
-  %a_cuda.addr = alloca float*, align 8
-  %Size.addr = alloca i32, align 4
-  %t.addr = alloca i32, align 4
-  store float* %m_cuda, float** %m_cuda.addr, align 8
-  store float* %a_cuda, float** %a_cuda.addr, align 8
-  store i32 %Size, i32* %Size.addr, align 4
-  store i32 %t, i32* %t.addr, align 4
-  ; bounds check: gid >= Size - 1 - t  ->  return
-  %call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
-  %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
-  %call2 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
-  %mul = mul i32 %call1, %call2
-  %add = add i32 %call, %mul
-  %0 = load i32, i32* %Size.addr, align 4
-  %sub = sub nsw i32 %0, 1
-  %1 = load i32, i32* %t.addr, align 4
-  %sub3 = sub nsw i32 %sub, %1
-  %cmp = icmp uge i32 %add, %sub3
-  br i1 %cmp, label %if.then, label %if.end
-
-if.then:                                          ; preds = %entry
-  br label %return
-
-if.end:                                           ; preds = %entry
-  ; numerator: a[Size * (gid + t + 1) + t]
-  %2 = load float*, float** %a_cuda.addr, align 8
-  %3 = load i32, i32* %Size.addr, align 4
-  %call4 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
-  %call5 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
-  %mul6 = mul i32 %call4, %call5
-  %call7 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
-  %add8 = add i32 %mul6, %call7
-  %4 = load i32, i32* %t.addr, align 4
-  %add9 = add i32 %add8, %4
-  %add10 = add i32 %add9, 1
-  %mul11 = mul i32 %3, %add10
-  %idx.ext = zext i32 %mul11 to i64
-  %add.ptr = getelementptr inbounds float, float* %2, i64 %idx.ext
-  %5 = load i32, i32* %t.addr, align 4
-  %idx.ext12 = sext i32 %5 to i64
-  %add.ptr13 = getelementptr inbounds float, float* %add.ptr, i64 %idx.ext12
-  %6 = load float, float* %add.ptr13, align 4
-  ; denominator: a[Size * t + t]
-  %7 = load float*, float** %a_cuda.addr, align 8
-  %8 = load i32, i32* %Size.addr, align 4
-  %9 = load i32, i32* %t.addr, align 4
-  %mul14 = mul nsw i32 %8, %9
-  %idx.ext15 = sext i32 %mul14 to i64
-  %add.ptr16 = getelementptr inbounds float, float* %7, i64 %idx.ext15
-  %10 = load i32, i32* %t.addr, align 4
-  %idx.ext17 = sext i32 %10 to i64
-  %add.ptr18 = getelementptr inbounds float, float* %add.ptr16, i64 %idx.ext17
-  %11 = load float, float* %add.ptr18, align 4
-  %div = fdiv float %6, %11
-  ; destination: m[Size * (gid + t + 1) + t]
-  %12 = load float*, float** %m_cuda.addr, align 8
-  %13 = load i32, i32* %Size.addr, align 4
-  %call19 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
-  %call20 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
-  %mul21 = mul i32 %call19, %call20
-  %call22 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
-  %add23 = add i32 %mul21, %call22
-  %14 = load i32, i32* %t.addr, align 4
-  %add24 = add i32 %add23, %14
-  %add25 = add i32 %add24, 1
-  %mul26 = mul i32 %13, %add25
-  %idx.ext27 = zext i32 %mul26 to i64
-  %add.ptr28 = getelementptr inbounds float, float* %12, i64 %idx.ext27
-  %15 = load i32, i32* %t.addr, align 4
-  %idx.ext29 = sext i32 %15 to i64
-  %add.ptr30 = getelementptr inbounds float, float* %add.ptr28, i64 %idx.ext29
-  store float %div, float* %add.ptr30, align 4
-  br label %return
-
-return:                                           ; preds = %if.end, %if.then
-  ret void
-}
-
-; threadIdx.x accessor: returns the value of the tid.x special register.
-; Function Attrs: alwaysinline convergent nounwind
-define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
-entry:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  ret i32 %0
-}
-
-; blockIdx.x accessor: returns the value of the ctaid.x special register.
-; Function Attrs: alwaysinline convergent nounwind
-define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
-entry:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
-  ret i32 %0
-}
-
-; blockDim.x accessor: returns the value of the ntid.x special register.
-; Function Attrs: alwaysinline convergent nounwind
-define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #1 comdat align 2 {
-entry:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-  ret i32 %0
-}
-
-; Function Attrs: convergent noinline nounwind optnone
-define dso_local void @_Z4Fan2PfS_S_iii(float* %m_cuda, float* %a_cuda, float* %b_cuda, i32 %Size, i32 %j1, i32 %t) #0 {
-entry:
-  %m_cuda.addr = alloca float*, align 8
-  %a_cuda.addr = alloca float*, align 8
-  %b_cuda.addr = alloca float*, align 8
-  %Size.addr = alloca i32, align 4
-  %j1.addr = alloca i32, align 4
-  %t.addr = alloca i32, align 4
-  %xidx = alloca i32, align 4
-  %yidx = alloca i32, align 4
-  store float* %m_cuda, float** %m_cuda.addr, align 8
-  store float* %a_cuda, float** %a_cuda.addr, align 8
-  store float* %b_cuda, float** %b_cuda.addr, align 8
-  store i32 %Size, i32* %Size.addr, align 4
-  store i32 %j1, i32* %j1.addr, align 4
-  store i32 %t, i32* %t.addr, align 4
-  %call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
-  %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
-  %call2 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
-  %mul = mul i32 %call1, %call2
-  %add = add i32 %call, %mul
-  %0 = load i32, i32* %Size.addr, align 4
-  %sub = sub nsw i32 %0, 1
-  %1 = load i32, i32* %t.addr, align 4
-  %sub3 = sub nsw i32 %sub, %1
-  %cmp = icmp uge i32 %add, %sub3
-  br i1 %cmp, label %if.then, label %if.end
-
-if.then:                                          ; preds = %entry
-  br label %if.end58
-
-if.end:                                           ; preds = %entry
-  %call4 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #3
-  %call5 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #3
-  %call6 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #3
-  %mul7 = mul i32 %call5, %call6
-  %add8 = add i32 %call4, %mul7
-  %2 = load i32, i32* %Size.addr, align 4
-  %3 = load i32, i32* %t.addr, align 4
-  %sub9 = sub nsw i32 %2, %3
-  %cmp10 = icmp uge i32 %add8, %sub9
-  br i1 %cmp10, label %if.then11, label %if.end12
-
-if.then11:                                        ; preds = %if.end
-  br label %if.end58
-
-if.end12:                                         ; preds = %if.end
-  %call13 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
-  %call14 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
-  %mul15 = mul i32 %call13, %call14
-  %call16 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
-  %add17 = add i32 %mul15, %call16
-  store i32 %add17, i32* %xidx, align 4
-  %call18 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #3
-  %call19 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #3
-  %mul20 = mul i32 %call18, %call19
-  %call21 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #3
-  %add22 = add i32 %mul20, %call21
-  store i32 %add22, i32* %yidx, align 4
-  %4 = load float*, float** %m_cuda.addr, align 8
-  %5 = load i32, i32* %Size.addr, align 4
-  %6 = load i32, i32* %xidx, align 4
-  %add23 = add nsw i32 %6, 1
-  %7 = load i32, i32* %t.addr, align 4
-  %add24 = add nsw i32 %add23, %7
-  %mul25 = mul nsw i32 %5, %add24
-  %8 = load i32, i32* %t.addr, align 4
-  %add26 = add nsw i32 %mul25, %8
-  %idxprom = sext i32 %add26 to i64
-  %arrayidx = getelementptr inbounds float, float* %4, i64 %idxprom
-  %9 = load float, float* %arrayidx, align 4
-  %10 = load float*, float** %a_cuda.addr, align 8
-  %11 = load i32, i32* %Size.addr, align 4
-  %12 = load i32, i32* %t.addr, align 4
-  %mul27 = mul nsw i32 %11, %12
-  %13 = load i32, i32* %yidx, align 4
-  %14 = load i32, i32* %t.addr, align 4
-  %add28 = add nsw i32 %13, %14
-  %add29 = add nsw i32 %mul27, %add28
-  %idxprom30 = sext i32 %add29 to i64
-  %arrayidx31 = getelementptr inbounds float, float* %10, i64 %idxprom30
-  %15 = load float, float* %arrayidx31, align 4
-  %mul32 = fmul contract float %9, %15
-  %16 = load float*, float** %a_cuda.addr, align 8
-  %17 = load i32, i32* %Size.addr, align 4
-  %18 = load i32, i32* %xidx, align 4
-  %add33 = add nsw i32 %18, 1
-  %19 = load i32, i32* %t.addr, align 4
-  %add34 = add nsw i32 %add33, %19
-  %mul35 = mul nsw i32 %17, %add34
-  %20 = load i32, i32* %yidx, align 4
-  %21 = load i32, i32* %t.addr, align 4
-  %add36 = add nsw i32 %20, %21
-  %add37 = add nsw i32 %mul35, %add36
-  %idxprom38 = sext i32 %add37 to i64
-  %arrayidx39 = getelementptr inbounds float, float* %16, i64 %idxprom38
-  %22 = load float, float* %arrayidx39, align 4
-  %sub40 = fsub contract float %22, %mul32
-  store float %sub40, float* %arrayidx39, align 4
-  %23 = load i32, i32* %yidx, align 4
-  %cmp41 = icmp eq i32 %23, 0
-  br i1 %cmp41, label %if.then42, label %if.end58
-
-if.then42:                                        ; preds = %if.end12
-  %24 = load float*, float** %m_cuda.addr, align 8
-  %25 = load i32, i32* %Size.addr, align 4
-  %26 = load i32, i32* %xidx, align 4
-  %add43 = add nsw i32 %26, 1
-  %27 = load i32, i32* %t.addr, align 4
-  %add44 = add nsw i32 %add43, %27
-  %mul45 = mul nsw i32 %25, %add44
-  %28 = load i32, i32* %yidx, align 4
-  %29 = load i32, i32* %t.addr, align 4
-  %add46 = add nsw i32 %28, %29
-  %add47 = add nsw i32 %mul45, %add46
-  %idxprom48 = sext i32 %add47 to i64
-  %arrayidx49 = getelementptr inbounds float, float* %24, i64 %idxprom48
-  %30 = load float, float* %arrayidx49, align 4
-  %31 = load float*, float** %b_cuda.addr, align 8
-  %32 = load i32, i32* %t.addr, align 4
-  %idxprom50 = sext i32 %32 to i64
-  %arrayidx51 = getelementptr inbounds float, float* %31, i64 %idxprom50
-  %33 = load float, float* %arrayidx51, align 4
-  %mul52 = fmul contract float %30, %33
-  %34 = load float*, float** %b_cuda.addr, align 8
-  %35 = load i32, i32* %xidx, align 4
-  %add53 = add nsw i32 %35, 1
-  %36 = load i32, i32* %t.addr, align 4
-  %add54 = add nsw i32 %add53, %36
-  %idxprom55 = sext i32 %add54 to i64
-  %arrayidx56 = getelementptr inbounds float, float* %34, i64 %idxprom55
-  %37 = load float, float* %arrayidx56, align 4
-  %sub57 = fsub contract float %37, %mul52
-  store float %sub57, float* %arrayidx56, align 4
-  br label %if.end58
-
-if.end58:                                         ; preds = %if.then, %if.then11, %if.then42, %if.end12
-  ret void
-}
-
-; Function Attrs: alwaysinline convergent nounwind
-define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
-entry:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
-  ret i32 %0
-}
-
-; Function Attrs: alwaysinline convergent nounwind
-define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
-entry:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
-  ret i32 %0
-}
-
-; Function Attrs: alwaysinline convergent nounwind
-define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #1 comdat align 2 {
-entry:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
-  ret i32 %0
-}
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #2
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #2
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y() #2
-
-attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { convergent nounwind }
-
-!llvm.module.flags = !{!0, !1, !2}
-!nvvm.annotations = !{!3, !4, !5, !6, !5, !7, !7, !7, !7, !8, !8, !7}
-!llvm.ident = !{!9}
-!nvvmir.version = !{!10}
-
-!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
-!1 = !{i32 1, !"wchar_size", i32 4}
-!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
-!3 = !{void (float*, float*, i32, i32)* @_Z4Fan1PfS_ii, !"kernel", i32 1}
-!4 = !{void (float*, float*, float*, i32, i32, i32)* @_Z4Fan2PfS_S_iii, !"kernel", i32 1}
-!5 = !{null, !"align", i32 8}
-!6 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
-!7 = !{null, !"align", i32 16}
-!8 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
-!9 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
-!10 = !{i32 1, i32 4}
--- a/examples/gauss/gaussian-host-x86_64-unknown-linux-gnu.ll
+++ b/examples/gauss/gaussian-host-x86_64-unknown-linux-gnu.ll
--- a/examples/gauss/gaussian.cu
+++ b/examples/gauss/gaussian.cu
@ -1,522 +0,0 @@
-/*-----------------------------------------------------------
- ** gaussian.cu -- The program is to solve a linear system Ax = b
- **   by using Gaussian Elimination. The algorithm on page 101
- **   ("Foundations of Parallel Programming") is used.
- **   The sequential version is gaussian.c.  This parallel
- **   implementation converts three independent for() loops
- **   into three Fans.  Use the data file ge_3.dat to verify
- **   the correction of the output.
- **
- ** Written by Andreas Kura, 02/15/95
- ** Modified by Chong-wei Xu, 04/20/95
- ** Modified by Chris Gregg for CUDA, 07/20/2009
- **-----------------------------------------------------------
- */
-#include "cuda_runtime.h"
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/time.h>
-
-#ifdef TIMING
-#include "timing.h"
-#endif
-
-#ifdef RD_WG_SIZE_0_0
-#define MAXBLOCKSIZE RD_WG_SIZE_0_0
-#elif defined(RD_WG_SIZE_0)
-#define MAXBLOCKSIZE RD_WG_SIZE_0
-#elif defined(RD_WG_SIZE)
-#define MAXBLOCKSIZE RD_WG_SIZE
-#else
-#define MAXBLOCKSIZE 512
-#endif
-
-// 2D defines. Go from specific to general
-#ifdef RD_WG_SIZE_1_0
-#define BLOCK_SIZE_XY RD_WG_SIZE_1_0
-#elif defined(RD_WG_SIZE_1)
-#define BLOCK_SIZE_XY RD_WG_SIZE_1
-#elif defined(RD_WG_SIZE)
-#define BLOCK_SIZE_XY RD_WG_SIZE
-#else
-#define BLOCK_SIZE_XY 1
-#endif
-
-#ifdef TIMING
-struct timeval tv;
-struct timeval tv_total_start, tv_total_end;
-struct timeval tv_h2d_start, tv_h2d_end;
-struct timeval tv_d2h_start, tv_d2h_end;
-struct timeval tv_kernel_start, tv_kernel_end;
-struct timeval tv_mem_alloc_start, tv_mem_alloc_end;
-struct timeval tv_close_start, tv_close_end;
-float init_time = 0, mem_alloc_time = 0, h2d_time = 0, kernel_time = 0,
-      d2h_time = 0, close_time = 0, total_time = 0;
-#endif
-
-int Size;
-float *a, *b, *finalVec;
-float *m;
-
-FILE *fp;
-
-void InitProblemOnce(char *filename);
-void InitPerRun();
-void ForwardSub();
-void BackSub();
-__global__ void Fan1(float *m, float *a, int Size, int t);
-__global__ void Fan2(float *m, float *a, float *b, int Size, int j1, int t);
-void InitMat(float *ary, int nrow, int ncol);
-void InitAry(float *ary, int ary_size);
-void PrintMat(float *ary, int nrow, int ncolumn);
-void PrintAry(float *ary, int ary_size);
-void PrintDeviceProperties();
-void checkCUDAError(const char *msg);
-
-unsigned int totalKernelTime = 0;
-
-// create both matrix and right hand side, Ke Wang 2013/08/12 11:51:06
-void create_matrix(float *m, int size) {
-  int i, j;
-  float lamda = -0.01;
-  float coe[2 * size - 1];
-  float coe_i = 0.0;
-
-  for (i = 0; i < size; i++) {
-    coe_i = 10 * exp(lamda * i);
-    j = size - 1 + i;
-    coe[j] = coe_i;
-    j = size - 1 - i;
-    coe[j] = coe_i;
-  }
-
-  for (i = 0; i < size; i++) {
-    for (j = 0; j < size; j++) {
-      m[i * size + j] = coe[size - 1 - i + j];
-    }
-  }
-}
-
-int main(int argc, char *argv[]) {
-  printf("WG size of kernel 1 = %d, WG size of kernel 2= %d X %d\n",
-         MAXBLOCKSIZE, BLOCK_SIZE_XY, BLOCK_SIZE_XY);
-  int verbose = 1;
-  int i, j;
-  char flag;
-  if (argc < 2) {
-    printf("Usage: gaussian -f filename / -s size [-q]\n\n");
-    printf("-q (quiet) suppresses printing the matrix and result values.\n");
-    printf("-f (filename) path of input file\n");
-    printf(
-        "-s (size) size of matrix. Create matrix and rhs in this program \n");
-    printf(
-        "The first line of the file contains the dimension of the matrix, n.");
-    printf("The second line of the file is a newline.\n");
-    printf("The next n lines contain n tab separated values for the matrix.");
-    printf("The next line of the file is a newline.\n");
-    printf("The next line of the file is a 1xn vector with tab separated "
-           "values.\n");
-    printf("The next line of the file is a newline. (optional)\n");
-    printf("The final line of the file is the pre-computed solution. "
-           "(optional)\n");
-    printf("Example: matrix4.txt:\n");
-    printf("4\n");
-    printf("\n");
-    printf("-0.6	-0.5	0.7	0.3\n");
-    printf("-0.3	-0.9	0.3	0.7\n");
-    printf("-0.4	-0.5	-0.3	-0.8\n");
-    printf("0.0	-0.1	0.2	0.9\n");
-    printf("\n");
-    printf("-0.85	-0.68	0.24	-0.53\n");
-    printf("\n");
-    printf("0.7	0.0	-0.4	-0.5\n");
-    exit(0);
-  }
-
-  cudaSetDevice(0);
-
-  PrintDeviceProperties();
-  // char filename[100];
-  // sprintf(filename,"matrices/matrix%d.txt",size);
-
-  for (i = 1; i < argc; i++) {
-    if (argv[i][0] == '-') { // flag
-      flag = argv[i][1];
-      switch (flag) {
-      case 's': // platform
-        i++;
-        Size = atoi(argv[i]);
-        printf("Create matrix internally in parse, size = %d \n", Size);
-
-        a = (float *)malloc(Size * Size * sizeof(float));
-        create_matrix(a, Size);
-
-        b = (float *)malloc(Size * sizeof(float));
-        for (j = 0; j < Size; j++)
-          b[j] = 1.0;
-
-        m = (float *)malloc(Size * Size * sizeof(float));
-        break;
-      case 'f': // platform
-        i++;
-        printf("Read file from %s \n", argv[i]);
-        InitProblemOnce(argv[i]);
-        break;
-      case 'q': // quiet
-        verbose = 1;
-        break;
-      }
-    }
-  }
-
-  // InitProblemOnce(filename);
-
-  InitPerRun();
-  // begin timing
-  struct timeval time_start;
-  gettimeofday(&time_start, NULL);
-
-  // run kernels
-  ForwardSub();
-
-  // end timing
-  struct timeval time_end;
-  gettimeofday(&time_end, NULL);
-  unsigned int time_total = (time_end.tv_sec * 1000000 + time_end.tv_usec) -
-                            (time_start.tv_sec * 1000000 + time_start.tv_usec);
-
-  if (verbose) {
-    printf("Matrix m is: \n");
-    PrintMat(m, Size, Size);
-
-    printf("Matrix a is: \n");
-    PrintMat(a, Size, Size);
-
-    printf("Array b is: \n");
-    PrintAry(b, Size);
-  }
-  BackSub();
-  if (verbose) {
-    printf("The final solution is: \n");
-    PrintAry(finalVec, Size);
-  }
-  printf("\nTime total (including memory transfers)\t%f sec\n",
-         time_total * 1e-6);
-  printf("Time for CUDA kernels:\t%f sec\n", totalKernelTime * 1e-6);
-
-  /*printf("%d,%d\n",size,time_total);
-  fprintf(stderr,"%d,%d\n",size,time_total);*/
-
-  free(m);
-  free(a);
-  free(b);
-
-#ifdef TIMING
-  printf("Exec: %f\n", kernel_time);
-#endif
-}
-/*------------------------------------------------------
- ** PrintDeviceProperties
- **-----------------------------------------------------
- */
-void PrintDeviceProperties() {
-  cudaDeviceProp deviceProp;
-  int nDevCount = 0;
-
-  cudaGetDeviceCount(&nDevCount);
-  printf("Total Device found: %d", nDevCount);
-  for (int nDeviceIdx = 0; nDeviceIdx < nDevCount; ++nDeviceIdx) {
-    memset(&deviceProp, 0, sizeof(deviceProp));
-    if (cudaSuccess == cudaGetDeviceProperties(&deviceProp, nDeviceIdx)) {
-      printf("\nDevice Name \t\t - %s ", deviceProp.name);
-      printf("\n**************************************");
-      printf("\nTotal Global Memory\t\t\t - %lu KB",
-             deviceProp.totalGlobalMem / 1024);
-      printf("\nShared memory available per block \t - %lu KB",
-             deviceProp.sharedMemPerBlock / 1024);
-      printf("\nNumber of registers per thread block \t - %d",
-             deviceProp.regsPerBlock);
-      printf("\nWarp size in threads \t\t\t - %d", deviceProp.warpSize);
-      printf("\nMemory Pitch \t\t\t\t - %zu bytes", deviceProp.memPitch);
-      printf("\nMaximum threads per block \t\t - %d",
-             deviceProp.maxThreadsPerBlock);
-      printf("\nMaximum Thread Dimension (block) \t - %d %d %d",
-             deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1],
-             deviceProp.maxThreadsDim[2]);
-      printf("\nMaximum Thread Dimension (grid) \t - %d %d %d",
-             deviceProp.maxGridSize[0], deviceProp.maxGridSize[1],
-             deviceProp.maxGridSize[2]);
-      printf("\nTotal constant memory \t\t\t - %zu bytes",
-             deviceProp.totalConstMem);
-      printf("\nCUDA ver \t\t\t\t - %d.%d", deviceProp.major, deviceProp.minor);
-      printf("\nClock rate \t\t\t\t - %d KHz", deviceProp.clockRate);
-      printf("\nTexture Alignment \t\t\t - %zu bytes",
-             deviceProp.textureAlignment);
-      printf("\nDevice Overlap \t\t\t\t - %s",
-             deviceProp.deviceOverlap ? "Allowed" : "Not Allowed");
-      printf("\nNumber of Multi processors \t\t - %d\n\n",
-             deviceProp.multiProcessorCount);
-    } else
-      printf("\n%s", cudaGetErrorString(cudaGetLastError()));
-  }
-}
-
-/*------------------------------------------------------
- ** InitProblemOnce -- Initialize all of matrices and
- ** vectors by opening a data file specified by the user.
- **
- ** We used dynamic array *a, *b, and *m to allocate
- ** the memory storages.
- **------------------------------------------------------
- */
-void InitProblemOnce(char *filename) {
-  // char *filename = argv[1];
-
-  // printf("Enter the data file name: ");
-  // scanf("%s", filename);
-  printf("The file name is: %s\n", filename);
-
-  fp = fopen(filename, "r");
-
-  fscanf(fp, "%d", &Size);
-
-  a = (float *)malloc(Size * Size * sizeof(float));
-
-  InitMat(a, Size, Size);
-  printf("The input matrix a is:\n");
-  PrintMat(a, Size, Size);
-  b = (float *)malloc(Size * sizeof(float));
-
-  InitAry(b, Size);
-  printf("The input array b is:\n");
-  PrintAry(b, Size);
-
-  m = (float *)malloc(Size * Size * sizeof(float));
-}
-
-/*------------------------------------------------------
- ** InitPerRun() -- Initialize the contents of the
- ** multipier matrix **m
- **------------------------------------------------------
- */
-void InitPerRun() {
-  int i;
-  for (i = 0; i < Size * Size; i++)
-    *(m + i) = 0.0;
-}
-
-/*-------------------------------------------------------
- ** Fan1() -- Calculate multiplier matrix
- ** Pay attention to the index.  Index i give the range
- ** which starts from 0 to range-1.  The real values of
- ** the index should be adjust and related with the value
- ** of t which is defined on the ForwardSub().
- **-------------------------------------------------------
- */
-__global__ void Fan1(float *m_cuda, float *a_cuda, int Size, int t) {
-  // if(threadIdx.x + blockIdx.x * blockDim.x >= Size-1-t) {
-  // 		printf("blockIDx.x: %d, threadIdx.x: %d, Size: %d, t:%d,
-  // Size-1-t: %d\n",blockIdx.x,threadIdx.x,Size,t,Size-1-t);
-  // }
-
-  if (threadIdx.x + blockIdx.x * blockDim.x >= Size - 1 - t)
-    return;
-  *(m_cuda + Size * (blockDim.x * blockIdx.x + threadIdx.x + t + 1) + t) =
-      *(a_cuda + Size * (blockDim.x * blockIdx.x + threadIdx.x + t + 1) + t) /
-      *(a_cuda + Size * t + t);
-}
-
-/*-------------------------------------------------------
- ** Fan2() -- Modify the matrix A into LUD
- **-------------------------------------------------------
- */
-
-__global__ void Fan2(float *m_cuda, float *a_cuda, float *b_cuda, int Size,
-                     int j1, int t) {
-  if (threadIdx.x + blockIdx.x * blockDim.x >= Size - 1 - t)
-    return;
-  if (threadIdx.y + blockIdx.y * blockDim.y >= Size - t)
-    return;
-
-  int xidx = blockIdx.x * blockDim.x + threadIdx.x;
-  int yidx = blockIdx.y * blockDim.y + threadIdx.y;
-  // printf("blockIdx.x: %d, threadIdx.x: %d, blockIdx.y: %d, threadIdx.y: %d,
-  // blockDim.x: %d, blockDim.y:
-  // %d\n",blockIdx.x,threadIdx.x,blockIdx.y,threadIdx.y,blockDim.x,blockDim.y);
-
-  a_cuda[Size * (xidx + 1 + t) + (yidx + t)] -=
-      m_cuda[Size * (xidx + 1 + t) + t] * a_cuda[Size * t + (yidx + t)];
-  // a_cuda[xidx+1+t][yidx+t] -= m_cuda[xidx+1+t][t] * a_cuda[t][yidx+t];
-  if (yidx == 0) {
-    // printf("blockIdx.x:%d,threadIdx.x:%d,blockIdx.y:%d,threadIdx.y:%d,blockDim.x:%d,blockDim.y:%d\n",blockIdx.x,threadIdx.x,blockIdx.y,threadIdx.y,blockDim.x,blockDim.y);
-    // printf("xidx:%d,yidx:%d\n",xidx,yidx);
-    b_cuda[xidx + 1 + t] -=
-        m_cuda[Size * (xidx + 1 + t) + (yidx + t)] * b_cuda[t];
-  }
-}
-
-/*------------------------------------------------------
- ** ForwardSub() -- Forward substitution of Gaussian
- ** elimination.
- **------------------------------------------------------
- */
-void ForwardSub() {
-  int t;
-  float *m_cuda, *a_cuda, *b_cuda;
-
-  int A = 1;
-  int B = 2;
-  int C = 3;
-  int D = 4;
-  int E = 5;
-  int F = 6;
-  // printf("blockIDx.x: %d, threadIdx.x: %d, Size: %d, t: %d, Size-1-t: %d\n",
-  // A, B, C, D, E); printf("blockIdx.x: %d, threadIdx.x: %d, blockIdx.y: %d,
-  // threadIdx.y: %d, blockDim.x: %d, blockDim.y: %d\n", A , B, C, D, E, F);
-
-  // allocate memory on GPU
-  cudaMalloc((void **)&m_cuda, Size * Size * sizeof(float));
-
-  cudaMalloc((void **)&a_cuda, Size * Size * sizeof(float));
-
-  cudaMalloc((void **)&b_cuda, Size * sizeof(float));
-
-  // copy memory to GPU
-  cudaMemcpy(m_cuda, m, Size * Size * sizeof(float), cudaMemcpyHostToDevice);
-  cudaMemcpy(a_cuda, a, Size * Size * sizeof(float), cudaMemcpyHostToDevice);
-  cudaMemcpy(b_cuda, b, Size * sizeof(float), cudaMemcpyHostToDevice);
-
-  int block_size, grid_size;
-
-  block_size = MAXBLOCKSIZE;
-  grid_size = (Size / block_size) + (!(Size % block_size) ? 0 : 1);
-  printf("1d grid size: %d\n", grid_size);
-
-  dim3 dimBlock(block_size);
-  dim3 dimGrid(grid_size);
-  // dim3 dimGrid( (N/dimBlock.x) + (!(N%dimBlock.x)?0:1) );
-
-  int blockSize2d, gridSize2d;
-  blockSize2d = BLOCK_SIZE_XY;
-  gridSize2d = (Size / blockSize2d) + (!(Size % blockSize2d ? 0 : 1));
-
-  dim3 dimBlockXY(blockSize2d, blockSize2d);
-
-  printf("BlockXY: %d \n", blockSize2d);
-  dim3 dimGridXY(gridSize2d, gridSize2d);
-
-#ifdef TIMING
-  gettimeofday(&tv_kernel_start, NULL);
-#endif
-  printf("first grid size: %d second: %d\n", grid_size, gridSize2d);
-  // begin timing kernels
-  struct timeval time_start;
-  gettimeofday(&time_start, NULL);
-  for (t = 0; t < (Size - 1); t++) {
-    Fan1<<<dimGrid, dimBlock>>>(m_cuda, a_cuda, Size, t);
-    cudaDeviceSynchronize();
-    Fan2<<<dimGridXY, dimBlockXY>>>(m_cuda, a_cuda, b_cuda, Size, Size - t, t);
-    cudaDeviceSynchronize();
-    checkCUDAError("Fan2");
-  }
-  // end timing kernels
-  struct timeval time_end;
-  gettimeofday(&time_end, NULL);
-  totalKernelTime = (time_end.tv_sec * 1000000 + time_end.tv_usec) -
-                    (time_start.tv_sec * 1000000 + time_start.tv_usec);
-
-#ifdef TIMING
-  tvsub(&time_end, &tv_kernel_start, &tv);
-  kernel_time += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
-#endif
-
-  // copy memory back to CPU
-  cudaMemcpy(m, m_cuda, Size * Size * sizeof(float), cudaMemcpyDeviceToHost);
-  cudaMemcpy(a, a_cuda, Size * Size * sizeof(float), cudaMemcpyDeviceToHost);
-  cudaMemcpy(b, b_cuda, Size * sizeof(float), cudaMemcpyDeviceToHost);
-  cudaFree(m_cuda);
-  cudaFree(a_cuda);
-  cudaFree(b_cuda);
-}
-
-/*------------------------------------------------------
- ** BackSub() -- Backward substitution
- **------------------------------------------------------
- */
-
-void BackSub() {
-  // create a new vector to hold the final answer
-  finalVec = (float *)malloc(Size * sizeof(float));
-  // solve "bottom up"
-  int i, j;
-  for (i = 0; i < Size; i++) {
-    finalVec[Size - i - 1] = b[Size - i - 1];
-    for (j = 0; j < i; j++) {
-      finalVec[Size - i - 1] -= *(a + Size * (Size - i - 1) + (Size - j - 1)) *
-                                finalVec[Size - j - 1];
-    }
-    finalVec[Size - i - 1] =
-        finalVec[Size - i - 1] / *(a + Size * (Size - i - 1) + (Size - i - 1));
-  }
-}
-
-void InitMat(float *ary, int nrow, int ncol) {
-  int i, j;
-
-  for (i = 0; i < nrow; i++) {
-    for (j = 0; j < ncol; j++) {
-      fscanf(fp, "%f", ary + Size * i + j);
-    }
-  }
-}
-
-/*------------------------------------------------------
- ** PrintMat() -- Print the contents of the matrix
- **------------------------------------------------------
- */
-void PrintMat(float *ary, int nrow, int ncol) {
-  return;
-  int i, j;
-
-  for (i = 0; i < nrow; i++) {
-    for (j = 0; j < ncol; j++) {
-      printf("%8.2f ", *(ary + Size * i + j));
-    }
-    printf("\n");
-  }
-  printf("\n");
-}
-
-/*------------------------------------------------------
- ** InitAry() -- Initialize the array (vector) by reading
- ** data from the data file
- **------------------------------------------------------
- */
-void InitAry(float *ary, int ary_size) {
-  int i;
-
-  for (i = 0; i < ary_size; i++) {
-    fscanf(fp, "%f", &ary[i]);
-  }
-}
-
-/*------------------------------------------------------
- ** PrintAry() -- Print the contents of the array (vector)
- **------------------------------------------------------
- */
-void PrintAry(float *ary, int ary_size) {
-  int i;
-  for (i = 0; i < ary_size; i++) {
-    printf("%.2f ", ary[i]);
-  }
-  printf("\n\n");
-}
-void checkCUDAError(const char *msg) {
-  cudaError_t err = cudaGetLastError();
-  if (cudaSuccess != err) {
-    fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
-    exit(EXIT_FAILURE);
-  }
-}
--- a/examples/gauss/run.sh
+++ b/examples/gauss/run.sh
@ -1,23 +0,0 @@
-#!/bin/bash
-set -e
-llvm-as gaussian-cuda-nvptx64-nvidia-cuda-sm_61.ll
-llvm-as gaussian-host-x86_64-unknown-linux-gnu.ll
-../../build/compilation/kernelTranslator gaussian-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
-../../build/compilation/hostTranslator gaussian-host-x86_64-unknown-linux-gnu.bc host.bc
-
-llc --relocation-model=pic --filetype=obj  kernel.bc
-llc --relocation-model=pic --filetype=obj  host.bc
-
-g++ -Wall -L../../build/runtime \
-     -L../../build/runtime/threadPool \
-     -o gaussian -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread
-
-export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH
-./gaussian -f ../../rodinia-data/gaussian/matrix4.txt >> res.log
-
-if grep -q "0.70 0.00 -0.40 -0.50" res.log; then
-    echo "Pass"
-else
-    echo "Error result"
-    exit 1
-fi
--- a/examples/heartwall/AVI/avilib.c
+++ b/examples/heartwall/AVI/avilib.c
--- a/examples/heartwall/AVI/avilib.h
+++ b/examples/heartwall/AVI/avilib.h
@ -1,317 +0,0 @@
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- *  avilib.h
- *
- *  Copyright (C) Thomas Östreich - June 2001
- *  multiple audio track support Copyright (C) 2002 Thomas Östreich
- *
- *  Original code:
- *  Copyright (C) 1999 Rainer Johanni <Rainer@Johanni.de>
- *
- *  This file is part of transcode, a linux video stream processing tool
- *
- *  transcode is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2, or (at your option)
- *  any later version.
- *
- *  transcode is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with GNU Make; see the file COPYING.  If not, write to
- *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- */
-
-#include <fcntl.h>
-#include <stdio.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <unistd.h>
-// #include <windows.h>
-#include <errno.h>
-#include <inttypes.h>
-#include <limits.h>
-#include <stdlib.h>
-#include <string.h>
-
-#ifndef AVILIB_H
-#define AVILIB_H
-
-#define AVI_MAX_TRACKS 8
-
-typedef struct {
-  unsigned long key;
-  unsigned long pos;
-  unsigned long len;
-} video_index_entry;
-
-typedef struct {
-  unsigned long pos;
-  unsigned long len;
-  unsigned long tot;
-} audio_index_entry;
-
-typedef struct track_s {
-
-  long a_fmt;   /* Audio format, see #defines below */
-  long a_chans; /* Audio channels, 0 for no audio */
-  long a_rate;  /* Rate in Hz */
-  long a_bits;  /* bits per audio sample */
-  long mp3rate; /* mp3 bitrate kbs*/
-
-  long audio_strn;   /* Audio stream number */
-  long audio_bytes;  /* Total number of bytes of audio data */
-  long audio_chunks; /* Chunks of audio data in the file */
-
-  char audio_tag[4]; /* Tag of audio data */
-  long audio_posc;   /* Audio position: chunk */
-  long audio_posb;   /* Audio position: byte within chunk */
-
-  long a_codech_off; /* absolut offset of audio codec information */
-  long a_codecf_off; /* absolut offset of audio codec information */
-
-  audio_index_entry *audio_index;
-
-} track_t;
-
-typedef struct {
-
-  long fdes; /* File descriptor of AVI file */
-  long mode; /* 0 for reading, 1 for writing */
-
-  long width;          /* Width  of a video frame */
-  long height;         /* Height of a video frame */
-  double fps;          /* Frames per second */
-  char compressor[8];  /* Type of compressor, 4 bytes + padding for 0 byte */
-  char compressor2[8]; /* Type of compressor, 4 bytes + padding for 0 byte */
-  long video_strn;     /* Video stream number */
-  long video_frames;   /* Number of video frames */
-  char video_tag[4];   /* Tag of video data */
-  long video_pos;      /* Number of next frame to be read
-                              (if index present) */
-
-  unsigned long max_len; /* maximum video chunk present */
-
-  track_t track[AVI_MAX_TRACKS]; // up to AVI_MAX_TRACKS audio tracks supported
-
-  unsigned long pos; /* position in file */
-  long n_idx;        /* number of index entries actually filled */
-  long max_idx;      /* number of index entries actually allocated */
-
-  long v_codech_off; /* absolut offset of video codec (strh) info */
-  long v_codecf_off; /* absolut offset of video codec (strf) info */
-
-  unsigned char (*idx)[16]; /* index entries (AVI idx1 tag) */
-  video_index_entry *video_index;
-
-  unsigned long last_pos; /* Position of last frame written */
-  unsigned long last_len; /* Length of last frame written */
-  int must_use_index;     /* Flag if frames are duplicated */
-  unsigned long movi_start;
-
-  int anum; // total number of audio tracks
-  int aptr; // current audio working track
-
-} avi_t;
-
-#define AVI_MODE_WRITE 0
-#define AVI_MODE_READ 1
-
-/* The error codes delivered by avi_open_input_file */
-
-#define AVI_ERR_SIZELIM                                                        \
-  1 /* The write of the data would exceed                                      \
-                                           the maximum size of the AVI file.   \
-                                           This is more a warning than an      \
-       error since the file may be closed safely */
-
-#define AVI_ERR_OPEN                                                           \
-  2 /* Error opening the AVI file - wrong path                                 \
-                                           name or file nor readable/writable  \
-     */
-
-#define AVI_ERR_READ 3 /* Error reading from AVI File */
-
-#define AVI_ERR_WRITE                                                          \
-  4 /* Error writing to AVI File,                                              \
-                                           disk full ??? */
-
-#define AVI_ERR_WRITE_INDEX                                                    \
-  5 /* Could not write index to AVI file                                       \
-                                           during close, file may still be     \
-                                           usable */
-
-#define AVI_ERR_CLOSE                                                          \
-  6 /* Could not write header to AVI file                                      \
-                                           or not truncate the file during     \
-       close, file is most probably corrupted */
-
-#define AVI_ERR_NOT_PERM                                                       \
-  7 /* Operation not permitted:                                                \
-                                           trying to read from a file open     \
-                                           for writing or vice versa */
-
-#define AVI_ERR_NO_MEM 8 /* malloc failed */
-
-#define AVI_ERR_NO_AVI 9 /* Not an AVI file */
-
-#define AVI_ERR_NO_HDRL                                                        \
-  10 /* AVI file has no has no header list,                                    \
-                                            corrupted ??? */
-
-#define AVI_ERR_NO_MOVI                                                        \
-  11 /* AVI file has no has no MOVI list,                                      \
-                                            corrupted ??? */
-
-#define AVI_ERR_NO_VIDS 12 /* AVI file contains no video data */
-
-#define AVI_ERR_NO_IDX                                                         \
-  13 /* The file has been opened with                                          \
-                                            getIndex==0, but an operation has  \
-        been performed that needs an index */
-
-/* Possible Audio formats */
-
-#ifndef WAVE_FORMAT_PCM
-#define WAVE_FORMAT_UNKNOWN (0x0000)
-#define WAVE_FORMAT_PCM (0x0001)
-#define WAVE_FORMAT_ADPCM (0x0002)
-#define WAVE_FORMAT_IBM_CVSD (0x0005)
-#define WAVE_FORMAT_ALAW (0x0006)
-#define WAVE_FORMAT_MULAW (0x0007)
-#define WAVE_FORMAT_OKI_ADPCM (0x0010)
-#define WAVE_FORMAT_DVI_ADPCM (0x0011)
-#define WAVE_FORMAT_DIGISTD (0x0015)
-#define WAVE_FORMAT_DIGIFIX (0x0016)
-#define WAVE_FORMAT_YAMAHA_ADPCM (0x0020)
-#define WAVE_FORMAT_DSP_TRUESPEECH (0x0022)
-#define WAVE_FORMAT_GSM610 (0x0031)
-#define IBM_FORMAT_MULAW (0x0101)
-#define IBM_FORMAT_ALAW (0x0102)
-#define IBM_FORMAT_ADPCM (0x0103)
-#endif
-
-avi_t *AVI_open_output_file(char *filename);
-void AVI_set_video(avi_t *AVI, int width, int height, double fps,
-                   char *compressor);
-void AVI_set_audio(avi_t *AVI, int channels, long rate, int bits, int format,
-                   long mp3rate);
-int AVI_write_frame(avi_t *AVI, char *data, long bytes, int keyframe);
-int AVI_dup_frame(avi_t *AVI);
-int AVI_write_audio(avi_t *AVI, char *data, long bytes);
-int AVI_append_audio(avi_t *AVI, char *data, long bytes);
-long AVI_bytes_remain(avi_t *AVI);
-int AVI_close(avi_t *AVI);
-long AVI_bytes_written(avi_t *AVI);
-
-avi_t *AVI_open_input_file(char *filename, int getIndex);
-avi_t *AVI_open_fd(int fd, int getIndex);
-int avi_parse_input_file(avi_t *AVI, int getIndex);
-long AVI_audio_mp3rate(avi_t *AVI);
-long AVI_video_frames(avi_t *AVI);
-int AVI_video_width(avi_t *AVI);
-int AVI_video_height(avi_t *AVI);
-double AVI_frame_rate(avi_t *AVI);
-char *AVI_video_compressor(avi_t *AVI);
-
-int AVI_audio_channels(avi_t *AVI);
-int AVI_audio_bits(avi_t *AVI);
-int AVI_audio_format(avi_t *AVI);
-long AVI_audio_rate(avi_t *AVI);
-long AVI_audio_bytes(avi_t *AVI);
-long AVI_audio_chunks(avi_t *AVI);
-
-long AVI_max_video_chunk(avi_t *AVI);
-
-long AVI_frame_size(avi_t *AVI, long frame);
-long AVI_audio_size(avi_t *AVI, long frame);
-int AVI_seek_start(avi_t *AVI);
-int AVI_set_video_position(avi_t *AVI, long frame);
-long AVI_get_video_position(avi_t *AVI, long frame);
-long AVI_read_frame(avi_t *AVI, char *vidbuf, int *keyframe);
-
-int AVI_set_audio_position(avi_t *AVI, long byte);
-int AVI_set_audio_bitrate(avi_t *AVI, long bitrate);
-
-long AVI_read_audio(avi_t *AVI, char *audbuf, long bytes);
-
-long AVI_audio_codech_offset(avi_t *AVI);
-long AVI_audio_codecf_offset(avi_t *AVI);
-long AVI_video_codech_offset(avi_t *AVI);
-long AVI_video_codecf_offset(avi_t *AVI);
-
-int AVI_read_data(avi_t *AVI, char *vidbuf, long max_vidbuf, char *audbuf,
-                  long max_audbuf, long *len);
-
-void AVI_print_error(char *str);
-char *AVI_strerror();
-char *AVI_syserror();
-
-int AVI_scan(char *name);
-int AVI_dump(char *name, int mode);
-
-char *AVI_codec2str(short cc);
-int AVI_file_check(char *import_file);
-
-void AVI_info(avi_t *avifile);
-uint64_t AVI_max_size();
-int avi_update_header(avi_t *AVI);
-
-int AVI_set_audio_track(avi_t *AVI, int track);
-int AVI_get_audio_track(avi_t *AVI);
-int AVI_audio_tracks(avi_t *AVI);
-
-struct riff_struct {
-  unsigned char id[4]; /* RIFF */
-  unsigned long len;
-  unsigned char wave_id[4]; /* WAVE */
-};
-
-struct chunk_struct {
-  unsigned char id[4];
-  unsigned long len;
-};
-
-struct common_struct {
-  unsigned short wFormatTag;
-  unsigned short wChannels;
-  unsigned long dwSamplesPerSec;
-  unsigned long dwAvgBytesPerSec;
-  unsigned short wBlockAlign;
-  unsigned short wBitsPerSample; /* Only for PCM */
-};
-
-struct wave_header {
-  struct riff_struct riff;
-  struct chunk_struct format;
-  struct common_struct common;
-  struct chunk_struct data;
-};
-
-struct AVIStreamHeader {
-  long fccType;
-  long fccHandler;
-  long dwFlags;
-  long dwPriority;
-  long dwInitialFrames;
-  long dwScale;
-  long dwRate;
-  long dwStart;
-  long dwLength;
-  long dwSuggestedBufferSize;
-  long dwQuality;
-  long dwSampleSize;
-};
-
-#endif
-
-#ifdef __cplusplus
-}
-#endif
--- a/examples/heartwall/AVI/avimod.c
+++ b/examples/heartwall/AVI/avimod.c
@ -1,130 +0,0 @@
-// #ifdef __cplusplus
-// extern "C" {
-// #endif
-
-//===============================================================================================================================================================================================================
-//	DEFINE / INCLUDE
-//===============================================================================================================================================================================================================
-#include "avimod.h"
-#include <stdio.h>
-#include <stdlib.h>
-
-//===============================================================================================================================================================================================================
-//	FUNCTIONS
-//===============================================================================================================================================================================================================
-
-// Flips the specified image vertically and crops it to the selected region.
-//
-// Parameters:
-//   image     - source pixels, one byte per pixel, row-major, height * width
-//               elements; the byte values are interpreted as 0..255
-//   height    - number of rows in image
-//   width     - number of columns in image
-//   cropped   - 1 selects the hard-coded crop rectangle; any other value keeps
-//               the full frame
-//   scaled    - non-zero scales every value to the range [0.0, 1.0]
-//   converted - 1 returns the data in column-major order; any other value
-//               returns it in row-major order
-//
-// Returns a newly allocated array of height_new * width_new values owned by
-// the caller (release it with free()), or NULL if the region is empty or
-// memory could not be allocated.
-fp *chop_flip_image(char *image, int height, int width, int cropped, int scaled,
-                    int converted) {
-
-  // fixed dimensions for cropping or not cropping, square vertices starting
-  // from initial point in top left corner going down and right
-  int top = 0;
-  int left = 0;
-  int bottom;
-  int right;
-  if (cropped == 1) {
-    // NOTE(review): the hard-coded crop bounds are all zero, so this selects
-    // a single pixel; kept as in the original
-    bottom = 0;
-    right = 0;
-  } else {
-    bottom = height - 1;
-    right = width - 1;
-  }
-
-  // dimensions of new cropped image
-  int height_new = bottom - top + 1;
-  int width_new = right - left + 1;
-  if (height_new <= 0 || width_new <= 0) {
-    return NULL;
-  }
-
-  // element count computed in size_t so large frames cannot overflow int
-  size_t count = (size_t)height_new * (size_t)width_new;
-
-  // counters
-  int i, j;
-
-  // allocate memory for cropped/flipped frame
-  fp *result = (fp *)malloc(count * sizeof(fp));
-  if (result == NULL) {
-    return NULL;
-  }
-
-  // crop/flip and scale frame
-  // Each byte is read as unsigned char, which yields 0..255 whether or not
-  // plain char is signed. The scale factor is applied to that value, so the
-  // scaled output stays inside [0.0, 1.0] (previously 256 was added after
-  // scaling, pushing bright pixels to roughly 255.5..256).
-  fp scale = scaled ? (fp)(1.0 / 255.0) : (fp)1.0;
-  for (i = 0; i < height_new; i++) {  // rows
-    for (j = 0; j < width_new; j++) { // columns
-      unsigned char pixel =
-          (unsigned char)image[((height - 1 - (i + top)) * width) + (j + left)];
-      result[i * width_new + j] = (fp)pixel * scale;
-    }
-  }
-
-  // row-major output requested: hand the buffer straight to the caller
-  // (previously it was freed before being returned)
-  if (converted != 1) {
-    return result;
-  }
-
-  // convert storage method (from row-major to column-major)
-  fp *result_converted = (fp *)malloc(count * sizeof(fp));
-  if (result_converted == NULL) {
-    free(result);
-    return NULL;
-  }
-  for (i = 0; i < width_new; i++) {    // columns of the source
-    for (j = 0; j < height_new; j++) { // rows of the source
-      result_converted[i * height_new + j] = result[j * width_new + i];
-    }
-  }
-  free(result);
-
-  // return
-  return result_converted;
-}
-
-// Returns the specified frame from the specified video file
-// If cropped == true, the frame is cropped to pre-determined dimensions
-//  (hardcoded to the boundaries of the blood vessel in the test video)
-// If scaled == true, all values are scaled to the range [0.0, 1.0]
-// If converted == true, the result is stored column-major
-//
-// The returned buffer comes from chop_flip_image(). The process exits with
-// status -1 if the frame buffer cannot be allocated or the frame cannot be
-// read.
-fp *get_frame(avi_t *cell_file, int frame_num, int cropped, int scaled,
-              int converted) {
-
-  // variable
-  int keyframe; // written by AVI_read_frame, not used here
-  int width = AVI_video_width(cell_file);
-  int height = AVI_video_height(cell_file);
-  long status; // AVI_read_frame is declared to return long
-
-  // There are 600 frames in this file (i.e. frame_num = 600 causes an error)
-  // NOTE(review): the result of this call is not checked; a bad frame_num is
-  // only detected by the read below
-  AVI_set_video_position(cell_file, frame_num);
-
-  // Read in the frame from the AVI
-  // NOTE(review): assumes one byte per pixel, i.e. the stored frame is at
-  // most width * height bytes -- confirm for other input videos
-  char *image_buf = (char *)malloc((size_t)width * (size_t)height);
-  if (image_buf == NULL) {
-    fprintf(stderr, "get_frame: cannot allocate %d x %d frame buffer\n", width,
-            height);
-    exit(-1);
-  }
-  status = AVI_read_frame(cell_file, image_buf, &keyframe);
-  if (status == -1) {
-    AVI_print_error((char *)"Error with AVI_read_frame");
-    exit(-1);
-  }
-
-  // The image is read in upside-down, so we need to flip it
-  fp *image_chopped =
-      chop_flip_image(image_buf, height, width, cropped, scaled, converted);
-
-  // free image buffer
-  free(image_buf);
-
-  // return
-  return image_chopped;
-}
-
-// #ifdef __cplusplus
-// }
-// #endif
--- a/examples/heartwall/AVI/avimod.h
+++ b/examples/heartwall/AVI/avimod.h
@ -1,24 +0,0 @@
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-//===============================================================================================================================================================================================================
-//	DEFINE / INCLUDE
-//===============================================================================================================================================================================================================
-#define fp float
-
-#include "avilib.h"
-
-//===============================================================================================================================================================================================================
-//	FUNCTION PROTOTYPES
-//===============================================================================================================================================================================================================
-
-fp *chop_flip_image(char *image, int height, int width, int cropped, int scaled,
-                    int converted); // defined in avimod.c: flips a frame and optionally crops, scales and reorders it
-
-fp *get_frame(avi_t *cell_file, int frame_num, int cropped, int scaled,
-              int converted); // defined in avimod.c: reads frame frame_num and passes it through chop_flip_image()
-
-#ifdef __cplusplus
-}
-#endif
--- a/examples/heartwall/define.c
+++ b/examples/heartwall/define.c
@ -1,396 +0,0 @@
-//===============================================================================================================================================================================================================
-//===============================================================================================================================================================================================================
-//	DEFINE / INCLUDE
-//===============================================================================================================================================================================================================
-//===============================================================================================================================================================================================================
-
-// Floating-point type used by the parameter structures in this file.
-#define fp float
-
-// NUMBER_THREADS takes the first of RD_WG_SIZE_0_0, RD_WG_SIZE_0 and
-// RD_WG_SIZE that is defined at compile time, and falls back to 256.
-// NOTE(review): presumably the CUDA thread-block size -- confirm at the
-// kernel launch site.
-#ifdef RD_WG_SIZE_0_0
-#define NUMBER_THREADS RD_WG_SIZE_0_0
-#elif defined(RD_WG_SIZE_0)
-#define NUMBER_THREADS RD_WG_SIZE_0
-#elif defined(RD_WG_SIZE)
-#define NUMBER_THREADS RD_WG_SIZE
-#else
-#define NUMBER_THREADS 256
-#endif
-
-#define ENDO_POINTS 20
-#define EPI_POINTS 31
-// Derived from its two parts (20 + 31 = 51) so the total cannot drift.
-#define ALL_POINTS (ENDO_POINTS + EPI_POINTS)
-
-//===============================================================================================================================================================================================================
-//===============================================================================================================================================================================================================
-//	PARAMS_COMMON_CHANGE STRUCT
-//===============================================================================================================================================================================================================
-//===============================================================================================================================================================================================================
-
-typedef struct params_common_change { // main.cu declares one plain global and one __constant__ global of this type
-
-  //======================================================================================================================================================
-  //	FRAME
-  //======================================================================================================================================================
-
-  fp *d_frame;  // presumably a device pointer to the current frame (d_ prefix) -- TODO confirm
-  int frame_no; // presumably the index of the frame being processed -- TODO confirm
-
-} params_common_change;
-
-//===============================================================================================================================================================================================================
-//===============================================================================================================================================================================================================
-//	PARAMS_COMMON STRUCTURE
-//===============================================================================================================================================================================================================
-//===============================================================================================================================================================================================================
-
-typedef struct params_common { // main.cu declares one plain global (common) and one __constant__ global (d_common) of this type
-
-  //======================================================================================================================================================
-  //	HARDCODED INPUTS FROM MATLAB
-  //======================================================================================================================================================
-
-  //====================================================================================================
-  //	CONSTANTS
-  //====================================================================================================
-
-  int sSize;
-  int tSize;
-  int maxMove;
-  fp alpha;
-
-  //====================================================================================================
-  //	FRAME
-  //====================================================================================================
-
-  int no_frames;
-  int frame_rows;
-  int frame_cols;
-  int frame_elem; // presumably frame_rows * frame_cols -- TODO confirm where it is set
-  int frame_mem;  // presumably the frame size in bytes -- TODO confirm where it is set
-
-  //====================================================================================================
-  //	ENDO POINTS
-  //====================================================================================================
-
-  int endoPoints;
-  int endo_mem;
-
-  int *endoRow;
-  int *endoCol;
-  int *tEndoRowLoc;
-  int *tEndoColLoc;
-
-  int *d_endoRow; // d_ members presumably hold device copies of the arrays above -- TODO confirm
-  int *d_endoCol;
-  int *d_tEndoRowLoc;
-  int *d_tEndoColLoc;
-
-  fp *d_endoT;
-
-  //====================================================================================================
-  //	EPI POINTS
-  //====================================================================================================
-  int epiPoints;
-  int epi_mem;
-
-  int *epiRow;
-  int *epiCol;
-  int *tEpiRowLoc;
-  int *tEpiColLoc;
-
-  int *d_epiRow; // same layout as the endo group: plain pointers first, then d_ counterparts
-  int *d_epiCol;
-  int *d_tEpiRowLoc;
-  int *d_tEpiColLoc;
-
-  fp *d_epiT;
-
-  //====================================================================================================
-  //	ALL POINTS
-  //====================================================================================================
-
-  int allPoints;
-
-  //======================================================================================================================================================
-  //	RIGHT TEMPLATE 	FROM 	TEMPLATE ARRAY
-  //======================================================================================================================================================
-
-  int in_rows; // each group below repeats the *_rows / *_cols / *_elem / *_mem naming used for the frame
-  int in_cols;
-  int in_elem;
-  int in_mem;
-
-  //======================================================================================================================================================
-  // 	AREA AROUND POINT		FROM	FRAME
-  //======================================================================================================================================================
-
-  int in2_rows;
-  int in2_cols;
-  int in2_elem;
-  int in2_mem;
-
-  //======================================================================================================================================================
-  //	CONVOLUTION
-  //======================================================================================================================================================
-
-  int conv_rows;
-  int conv_cols;
-  int conv_elem;
-  int conv_mem;
-  int ioffset;
-  int joffset;
-
-  //======================================================================================================================================================
-  //	CUMULATIVE SUM 1
-  //======================================================================================================================================================
-
-  //====================================================================================================
-  //	PAD ARRAY, VERTICAL CUMULATIVE SUM
-  //====================================================================================================
-
-  int in2_pad_add_rows;
-  int in2_pad_add_cols;
-  int in2_pad_cumv_rows;
-  int in2_pad_cumv_cols;
-  int in2_pad_cumv_elem;
-  int in2_pad_cumv_mem;
-
-  //====================================================================================================
-  //	SELECTION
-  //====================================================================================================
-
-  int in2_pad_cumv_sel_rows;
-  int in2_pad_cumv_sel_cols;
-  int in2_pad_cumv_sel_elem;
-  int in2_pad_cumv_sel_mem;
-  int in2_pad_cumv_sel_rowlow; // *low / *hig pairs presumably bound the selected rows and columns -- TODO confirm inclusivity
-  int in2_pad_cumv_sel_rowhig;
-  int in2_pad_cumv_sel_collow;
-  int in2_pad_cumv_sel_colhig;
-
-  //====================================================================================================
-  //	SELECTION 2, SUBTRACTION, HORIZONTAL CUMULATIVE SUM
-  //====================================================================================================
-
-  int in2_pad_cumv_sel2_rowlow;
-  int in2_pad_cumv_sel2_rowhig;
-  int in2_pad_cumv_sel2_collow;
-  int in2_pad_cumv_sel2_colhig;
-  int in2_sub_cumh_rows;
-  int in2_sub_cumh_cols;
-  int in2_sub_cumh_elem;
-  int in2_sub_cumh_mem;
-
-  //====================================================================================================
-  //	SELECTION
-  //====================================================================================================
-
-  int in2_sub_cumh_sel_rows;
-  int in2_sub_cumh_sel_cols;
-  int in2_sub_cumh_sel_elem;
-  int in2_sub_cumh_sel_mem;
-  int in2_sub_cumh_sel_rowlow;
-  int in2_sub_cumh_sel_rowhig;
-  int in2_sub_cumh_sel_collow;
-  int in2_sub_cumh_sel_colhig;
-
-  //====================================================================================================
-  //	SELECTION 2, SUBTRACTION
-  //====================================================================================================
-
-  int in2_sub_cumh_sel2_rowlow;
-  int in2_sub_cumh_sel2_rowhig;
-  int in2_sub_cumh_sel2_collow;
-  int in2_sub_cumh_sel2_colhig;
-  int in2_sub2_rows;
-  int in2_sub2_cols;
-  int in2_sub2_elem;
-  int in2_sub2_mem;
-
-  //======================================================================================================================================================
-  //	CUMULATIVE SUM 2
-  //======================================================================================================================================================
-
-  //====================================================================================================
-  //	MULTIPLICATION
-  //====================================================================================================
-
-  int in2_sqr_rows;
-  int in2_sqr_cols;
-  int in2_sqr_elem;
-  int in2_sqr_mem;
-
-  //====================================================================================================
-  //	SELECTION 2, SUBTRACTION
-  //====================================================================================================
-
-  int in2_sqr_sub2_rows;
-  int in2_sqr_sub2_cols;
-  int in2_sqr_sub2_elem;
-  int in2_sqr_sub2_mem;
-
-  //======================================================================================================================================================
-  //	FINAL
-  //======================================================================================================================================================
-
-  int in_sqr_rows;
-  int in_sqr_cols;
-  int in_sqr_elem;
-  int in_sqr_mem;
-
-  //======================================================================================================================================================
-  //	TEMPLATE MASK CREATE
-  //======================================================================================================================================================
-
-  int tMask_rows;
-  int tMask_cols;
-  int tMask_elem;
-  int tMask_mem;
-
-  //======================================================================================================================================================
-  //	POINT MASK INITIALIZE
-  //======================================================================================================================================================
-
-  int mask_rows;
-  int mask_cols;
-  int mask_elem;
-  int mask_mem;
-
-  //======================================================================================================================================================
-  //	MASK CONVOLUTION
-  //======================================================================================================================================================
-
-  int mask_conv_rows;
-  int mask_conv_cols;
-  int mask_conv_elem;
-  int mask_conv_mem;
-  int mask_conv_ioffset;
-  int mask_conv_joffset;
-
-} params_common;
-
-//===============================================================================================================================================================================================================
-//===============================================================================================================================================================================================================
-//	PARAMS_UNIQUE STRUCTURE
-//===============================================================================================================================================================================================================
-//===============================================================================================================================================================================================================
-
-typedef struct params_unique { // main.cu declares an array of ALL_POINTS of these (unique[])
-
-  //======================================================================================================================================================
-  //	ROW / COLUMN / TEMPLATE POINTERS
-  //======================================================================================================================================================
-
-  int *d_Row; // d_ prefix presumably marks device pointers -- TODO confirm at the allocation site
-  int *d_Col;
-  int *d_tRowLoc;
-  int *d_tColLoc;
-  fp *d_T;
-
-  //======================================================================================================================================================
-  //	POINT NUMBER
-  //======================================================================================================================================================
-
-  int point_no;
-
-  //======================================================================================================================================================
-  // 	RIGHT TEMPLATE 	FROM 	TEMPLATE ARRAY
-  //======================================================================================================================================================
-
-  int in_pointer; // an int, not a pointer; presumably an offset into the template array -- TODO confirm
-
-  //======================================================================================================================================================
-  //	AREA AROUND POINT		FROM	FRAME
-  //======================================================================================================================================================
-
-  fp *d_in2;
-
-  //======================================================================================================================================================
-  //	CONVOLUTION
-  //======================================================================================================================================================
-
-  fp *d_conv;
-  fp *d_in_mod;
-
-  //======================================================================================================================================================
-  //	CUMULATIVE SUM
-  //======================================================================================================================================================
-
-  //====================================================================================================
-  //	PAD ARRAY, VERTICAL CUMULATIVE SUM
-  //====================================================================================================
-
-  fp *d_in2_pad_cumv; // the buffers below share name stems with the size fields in params_common
-
-  //====================================================================================================
-  //	SELECTION
-  //====================================================================================================
-
-  fp *d_in2_pad_cumv_sel;
-
-  //====================================================================================================
-  //	SELECTION 2, SUBTRACTION, HORIZONTAL CUMULATIVE SUM
-  //====================================================================================================
-
-  fp *d_in2_sub_cumh;
-
-  //====================================================================================================
-  //	SELECTION
-  //====================================================================================================
-
-  fp *d_in2_sub_cumh_sel;
-
-  //====================================================================================================
-  //	SELECTION 2, SUBTRACTION
-  //====================================================================================================
-
-  fp *d_in2_sub2;
-
-  //======================================================================================================================================================
-  //	CUMULATIVE SUM 2
-  //======================================================================================================================================================
-
-  //====================================================================================================
-  //	MULTIPLICATION
-  //====================================================================================================
-
-  fp *d_in2_sqr;
-
-  //====================================================================================================
-  //	SELECTION 2, SUBTRACTION
-  //====================================================================================================
-
-  fp *d_in2_sqr_sub2;
-
-  //======================================================================================================================================================
-  //	FINAL
-  //======================================================================================================================================================
-
-  fp *d_in_sqr;
-
-  //======================================================================================================================================================
-  //	TEMPLATE MASK
-  //======================================================================================================================================================
-
-  fp *d_tMask;
-
-  //======================================================================================================================================================
-  //	POINT MASK INITIALIZE
-  //======================================================================================================================================================
-
-  fp *d_mask;
-
-  //======================================================================================================================================================
-  //	MASK CONVOLUTION
-  //======================================================================================================================================================
-
-  fp *d_mask_conv;
-
-} params_unique;
-
-//===============================================================================================================================================================================================================
-//===============================================================================================================================================================================================================
-//	END OF STRUCTURE
-//===============================================================================================================================================================================================================
-//===============================================================================================================================================================================================================
--- a/examples/heartwall/kernel.cu
+++ b/examples/heartwall/kernel.cu
--- a/examples/heartwall/main.cu
+++ b/examples/heartwall/main.cu
@ -1,795 +0,0 @@
-//===============================================================================================================================================================================================================
-//===============================================================================================================================================================================================================
-//	DEFINE / INCLUDE
-//===============================================================================================================================================================================================================
-//===============================================================================================================================================================================================================
-
-//======================================================================================================================================================
-//	LIBRARIES
-//======================================================================================================================================================
-
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include <avilib.h>
-#include <avimod.h>
-#include <cuda.h>
-
-//======================================================================================================================================================
-//	STRUCTURES, GLOBAL STRUCTURE VARIABLES
-//======================================================================================================================================================
-
-#include "define.c"
-
-params_common_change common_change; // host copy of the per-frame state (d_frame, frame_no)
-__constant__ params_common_change d_common_change; // device copy; main re-uploads it every frame with cudaMemcpyToSymbol
-
-params_common common; // host copy of the sizes, counts and device pointers filled in by main
-__constant__ params_common d_common; // device copy; main uploads it once before the frame loop
-
-params_unique unique[ALL_POINTS]; // cannot determine size dynamically so choose
-                                  // more than usually needed
-__constant__ params_unique d_unique[ALL_POINTS]; // device copy of unique[]; uploaded once before the frame loop
-
-//======================================================================================================================================================
-// KERNEL CODE
-//======================================================================================================================================================
-
-#include "kernel.cu"
-
-//	WRITE DATA FUNCTION
-//===============================================================================================================================================================================================================200
-
-// Writes one tab-separated row to fid: the value stored for frame `frame`
-// for each of the `points` tracked points. Entry (point i, frame j) is read
-// from values[j + i * frameNo].
-static void write_point_row(FILE *fid, const int *values, int points,
-                            int frame, int frameNo) {
-  for (int i = 0; i < points; i++) {
-    fprintf(fid, "%d\t", values[frame + i * frameNo]);
-  }
-}
-
-// Dumps tracking results to a text file (created, or truncated if present).
-//
-//   filename            path of the output file
-//   frameNo             total number of frames in the AVI; also the stride
-//                       between consecutive points in the input arrays
-//   frames_processed    number of frames written (0 .. frames_processed-1)
-//   endoPoints          number of endo points
-//   input_a, input_b    endo arrays, indexed as [frame + point * frameNo];
-//                       main passes the row and column locations here
-//   epiPoints           number of epi points
-//   input_2a, input_2b  epi arrays with the same layout
-//
-// If the file cannot be opened a message is printed to stdout and nothing
-// is written.
-void write_data(char *filename, int frameNo, int frames_processed,
-                int endoPoints, int *input_a, int *input_b, int epiPoints,
-                int *input_2a, int *input_2b) {
-
-  //================================================================================80
-  //	VARIABLES
-  //================================================================================80
-
-  FILE *fid;
-  int j;
-
-  //================================================================================80
-  //	OPEN FILE FOR WRITING
-  //================================================================================80
-
-  fid = fopen(filename, "w+");
-  if (fid == NULL) {
-    printf("The file was not opened for writing\n");
-    return;
-  }
-
-  //================================================================================80
-  //	WRITE VALUES TO THE FILE
-  //================================================================================80
-  fprintf(fid, "Total AVI Frames: %d\n", frameNo);
-  fprintf(fid, "Frames Processed: %d\n", frames_processed);
-  fprintf(fid, "endoPoints: %d\n", endoPoints);
-  fprintf(fid, "epiPoints: %d", epiPoints);
-  for (j = 0; j < frames_processed; j++) {
-    fprintf(fid, "\n---Frame %d---", j);
-    // The section headers take no arguments; the stray `j` that used to be
-    // passed here had no matching conversion specifier.
-    fprintf(fid, "\n--endo--\n");
-    write_point_row(fid, input_a, endoPoints, j, frameNo);
-    fprintf(fid, "\n");
-    write_point_row(fid, input_b, endoPoints, j, frameNo);
-    fprintf(fid, "\n--epi--\n");
-    write_point_row(fid, input_2a, epiPoints, j, frameNo);
-    fprintf(fid, "\n");
-    write_point_row(fid, input_2b, epiPoints, j, frameNo);
-  }
-  // 	================================================================================80
-  //		CLOSE FILE
-  //	================================================================================80
-
-  // fclose flushes buffered output, so a failure here means data may be lost.
-  if (fclose(fid) != 0) {
-    printf("The file was not closed properly\n");
-  }
-}
-
-//===============================================================================================================================================================================================================
-//	MAIN FUNCTION
-//	Usage: heartwall <inputfile> <num of frames>. Opens the AVI file, fills the host-side parameter structs, allocates the device buffers,
-//	launches the kernel once per requested frame, copies the tracked endo/epi point locations back to the host and frees the buffers.
-//===============================================================================================================================================================================================================
-int main(int argc, char *argv[]) {
-  cudaSetDevice(0);
-  printf("WG size of kernel = %d \n", NUMBER_THREADS);
-  //======================================================================================================================================================
-  //	VARIABLES
-  //======================================================================================================================================================
-
-  // CUDA kernel execution parameters
-  dim3 threads;
-  dim3 blocks;
-
-  // counter
-  int i;
-  int frames_processed;
-
-  // frames
-  char *video_file_name;
-  avi_t *frames;
-  fp *frame;
-
-  //======================================================================================================================================================
-  // 	FRAME
-  //======================================================================================================================================================
-
-  if (argc != 3) {
-    printf("ERROR: usage: heartwall <inputfile> <num of frames>\n");
-    exit(1);
-  }
-
-  // open movie file
-  video_file_name = argv[1];
-  frames = (avi_t *)AVI_open_input_file(video_file_name, 1); // added casting
-  if (frames == NULL) {
-    AVI_print_error((char *)"Error with AVI_open_input_file");
-    return -1;
-  }
-
-  // common
-  common.no_frames = AVI_video_frames(frames);
-  common.frame_rows = AVI_video_height(frames);
-  common.frame_cols = AVI_video_width(frames);
-  common.frame_elem = common.frame_rows * common.frame_cols;
-  common.frame_mem = sizeof(fp) * common.frame_elem;
-
-  // pointers
-  cudaMalloc((void **)&common_change.d_frame, common.frame_mem); // NOTE(review): allocated before argv[2] is validated; the early return below does not free it
-
-  //======================================================================================================================================================
-  // 	CHECK INPUT ARGUMENTS
-  //======================================================================================================================================================
-
-  frames_processed = atoi(argv[2]);
-  if (frames_processed < 0 || frames_processed > common.no_frames) {
-    printf("ERROR: %d is an incorrect number of frames specified, select in "
-           "the range of 0-%d\n",
-           frames_processed, common.no_frames);
-    return 0; // NOTE(review): exit status 0 on invalid input, unlike exit(1) / return -1 above
-  }
-
-  //======================================================================================================================================================
-  //	HARDCODED INPUTS FROM MATLAB
-  //======================================================================================================================================================
-
-  //====================================================================================================
-  //	CONSTANTS
-  //====================================================================================================
-
-  common.sSize = 40;
-  common.tSize = 25;
-  common.maxMove = 10;
-  common.alpha = 0.87;
-
-  //====================================================================================================
-  //	ENDO POINTS
-  //====================================================================================================
-
-  common.endoPoints = ENDO_POINTS;
-  common.endo_mem = sizeof(int) * common.endoPoints;
-
-  common.endoRow = (int *)malloc(common.endo_mem); // NOTE(review): indices 0..19 are written below, so this presumably relies on ENDO_POINTS >= 20 - confirm in define.c
-  common.endoRow[0] = 369;
-  common.endoRow[1] = 400;
-  common.endoRow[2] = 429;
-  common.endoRow[3] = 452;
-  common.endoRow[4] = 476;
-  common.endoRow[5] = 486;
-  common.endoRow[6] = 479;
-  common.endoRow[7] = 458;
-  common.endoRow[8] = 433;
-  common.endoRow[9] = 404;
-  common.endoRow[10] = 374;
-  common.endoRow[11] = 346;
-  common.endoRow[12] = 318;
-  common.endoRow[13] = 294;
-  common.endoRow[14] = 277;
-  common.endoRow[15] = 269;
-  common.endoRow[16] = 275;
-  common.endoRow[17] = 287;
-  common.endoRow[18] = 311;
-  common.endoRow[19] = 339;
-  cudaMalloc((void **)&common.d_endoRow, common.endo_mem);
-  cudaMemcpy(common.d_endoRow, common.endoRow, common.endo_mem,
-             cudaMemcpyHostToDevice);
-
-  common.endoCol = (int *)malloc(common.endo_mem);
-  common.endoCol[0] = 408;
-  common.endoCol[1] = 406;
-  common.endoCol[2] = 397;
-  common.endoCol[3] = 383;
-  common.endoCol[4] = 354;
-  common.endoCol[5] = 322;
-  common.endoCol[6] = 294;
-  common.endoCol[7] = 270;
-  common.endoCol[8] = 250;
-  common.endoCol[9] = 237;
-  common.endoCol[10] = 235;
-  common.endoCol[11] = 241;
-  common.endoCol[12] = 254;
-  common.endoCol[13] = 273;
-  common.endoCol[14] = 300;
-  common.endoCol[15] = 328;
-  common.endoCol[16] = 356;
-  common.endoCol[17] = 383;
-  common.endoCol[18] = 401;
-  common.endoCol[19] = 411;
-  cudaMalloc((void **)&common.d_endoCol, common.endo_mem);
-  cudaMemcpy(common.d_endoCol, common.endoCol, common.endo_mem,
-             cudaMemcpyHostToDevice);
-
-  common.tEndoRowLoc = (int *)malloc(common.endo_mem * common.no_frames);
-  cudaMalloc((void **)&common.d_tEndoRowLoc,
-             common.endo_mem * common.no_frames);
-
-  common.tEndoColLoc = (int *)malloc(common.endo_mem * common.no_frames);
-  cudaMalloc((void **)&common.d_tEndoColLoc,
-             common.endo_mem * common.no_frames);
-
-  //====================================================================================================
-  //	EPI POINTS
-  //====================================================================================================
-
-  common.epiPoints = EPI_POINTS;
-  common.epi_mem = sizeof(int) * common.epiPoints;
-
-  common.epiRow = (int *)malloc(common.epi_mem); // NOTE(review): indices 0..30 are written below, so this presumably relies on EPI_POINTS >= 31 - confirm in define.c
-  common.epiRow[0] = 390;
-  common.epiRow[1] = 419;
-  common.epiRow[2] = 448;
-  common.epiRow[3] = 474;
-  common.epiRow[4] = 501;
-  common.epiRow[5] = 519;
-  common.epiRow[6] = 535;
-  common.epiRow[7] = 542;
-  common.epiRow[8] = 543;
-  common.epiRow[9] = 538;
-  common.epiRow[10] = 528;
-  common.epiRow[11] = 511;
-  common.epiRow[12] = 491;
-  common.epiRow[13] = 466;
-  common.epiRow[14] = 438;
-  common.epiRow[15] = 406;
-  common.epiRow[16] = 376;
-  common.epiRow[17] = 347;
-  common.epiRow[18] = 318;
-  common.epiRow[19] = 291;
-  common.epiRow[20] = 275;
-  common.epiRow[21] = 259;
-  common.epiRow[22] = 256;
-  common.epiRow[23] = 252;
-  common.epiRow[24] = 252;
-  common.epiRow[25] = 257;
-  common.epiRow[26] = 266;
-  common.epiRow[27] = 283;
-  common.epiRow[28] = 305;
-  common.epiRow[29] = 331;
-  common.epiRow[30] = 360;
-  cudaMalloc((void **)&common.d_epiRow, common.epi_mem);
-  cudaMemcpy(common.d_epiRow, common.epiRow, common.epi_mem,
-             cudaMemcpyHostToDevice);
-
-  common.epiCol = (int *)malloc(common.epi_mem);
-  common.epiCol[0] = 457;
-  common.epiCol[1] = 454;
-  common.epiCol[2] = 446;
-  common.epiCol[3] = 431;
-  common.epiCol[4] = 411;
-  common.epiCol[5] = 388;
-  common.epiCol[6] = 361;
-  common.epiCol[7] = 331;
-  common.epiCol[8] = 301;
-  common.epiCol[9] = 273;
-  common.epiCol[10] = 243;
-  common.epiCol[11] = 218;
-  common.epiCol[12] = 196;
-  common.epiCol[13] = 178;
-  common.epiCol[14] = 166;
-  common.epiCol[15] = 157;
-  common.epiCol[16] = 155;
-  common.epiCol[17] = 165;
-  common.epiCol[18] = 177;
-  common.epiCol[19] = 197;
-  common.epiCol[20] = 218;
-  common.epiCol[21] = 248;
-  common.epiCol[22] = 276;
-  common.epiCol[23] = 304;
-  common.epiCol[24] = 333;
-  common.epiCol[25] = 361;
-  common.epiCol[26] = 391;
-  common.epiCol[27] = 415;
-  common.epiCol[28] = 434;
-  common.epiCol[29] = 448;
-  common.epiCol[30] = 455;
-  cudaMalloc((void **)&common.d_epiCol, common.epi_mem);
-  cudaMemcpy(common.d_epiCol, common.epiCol, common.epi_mem,
-             cudaMemcpyHostToDevice);
-
-  common.tEpiRowLoc = (int *)malloc(common.epi_mem * common.no_frames);
-  cudaMalloc((void **)&common.d_tEpiRowLoc, common.epi_mem * common.no_frames);
-
-  common.tEpiColLoc = (int *)malloc(common.epi_mem * common.no_frames);
-  cudaMalloc((void **)&common.d_tEpiColLoc, common.epi_mem * common.no_frames);
-
-  //====================================================================================================
-  //	ALL POINTS
-  //====================================================================================================
-
-  common.allPoints = ALL_POINTS;
-
-  //======================================================================================================================================================
-  // 	TEMPLATE SIZES
-  //======================================================================================================================================================
-
-  // common: the template is a square of side 2 * tSize + 1
-  common.in_rows = common.tSize + 1 + common.tSize;
-  common.in_cols = common.in_rows;
-  common.in_elem = common.in_rows * common.in_cols;
-  common.in_mem = sizeof(fp) * common.in_elem;
-
-  //======================================================================================================================================================
-  // 	CREATE ARRAY OF TEMPLATES FOR ALL POINTS
-  //======================================================================================================================================================
-
-  // common: one template-sized slot per endo point and per epi point
-  cudaMalloc((void **)&common.d_endoT, common.in_mem * common.endoPoints);
-  cudaMalloc((void **)&common.d_epiT, common.in_mem * common.epiPoints);
-
-  //======================================================================================================================================================
-  //	SPECIFIC TO ENDO OR EPI TO BE SET HERE
-  //======================================================================================================================================================
-
-  // entries [0, endoPoints) of unique[] point at the endo arrays
-  for (i = 0; i < common.endoPoints; i++) {
-    unique[i].point_no = i;
-    unique[i].d_Row = common.d_endoRow;
-    unique[i].d_Col = common.d_endoCol;
-    unique[i].d_tRowLoc = common.d_tEndoRowLoc;
-    unique[i].d_tColLoc = common.d_tEndoColLoc;
-    unique[i].d_T = common.d_endoT;
-  }
-  // entries [endoPoints, allPoints) point at the epi arrays; point_no restarts at 0
-  for (i = common.endoPoints; i < common.allPoints; i++) {
-    unique[i].point_no = i - common.endoPoints;
-    unique[i].d_Row = common.d_epiRow;
-    unique[i].d_Col = common.d_epiCol;
-    unique[i].d_tRowLoc = common.d_tEpiRowLoc;
-    unique[i].d_tColLoc = common.d_tEpiColLoc;
-    unique[i].d_T = common.d_epiT;
-  }
-
-  //======================================================================================================================================================
-  // 	RIGHT TEMPLATE 	FROM 	TEMPLATE ARRAY
-  //======================================================================================================================================================
-
-  // pointers: element offset of each point's template inside its d_T array
-  for (i = 0; i < common.allPoints; i++) {
-    unique[i].in_pointer = unique[i].point_no * common.in_elem;
-  }
-
-  //======================================================================================================================================================
-  // 	AREA AROUND POINT		FROM	FRAME
-  //======================================================================================================================================================
-
-  // common
-  common.in2_rows = 2 * common.sSize + 1;
-  common.in2_cols = 2 * common.sSize + 1;
-  common.in2_elem = common.in2_rows * common.in2_cols;
-  common.in2_mem = sizeof(float) * common.in2_elem; // NOTE(review): sizeof(float) here and below, sizeof(fp) for in_mem/frame_mem - equal only if fp is float; confirm in define.c
-
-  // pointers
-  for (i = 0; i < common.allPoints; i++) {
-    cudaMalloc((void **)&unique[i].d_in2, common.in2_mem);
-  }
-
-  //======================================================================================================================================================
-  // 	CONVOLUTION
-  //======================================================================================================================================================
-
-  // common
-  common.conv_rows =
-      common.in_rows + common.in2_rows - 1; // number of rows in I
-  common.conv_cols =
-      common.in_cols + common.in2_cols - 1; // number of columns in I
-  common.conv_elem = common.conv_rows * common.conv_cols; // number of elements
-  common.conv_mem = sizeof(float) * common.conv_elem;
-  common.ioffset = 0;
-  common.joffset = 0;
-
-  // pointers
-  for (i = 0; i < common.allPoints; i++) {
-    cudaMalloc((void **)&unique[i].d_conv, common.conv_mem);
-  }
-
-  //======================================================================================================================================================
-  // 	CUMULATIVE SUM
-  //======================================================================================================================================================
-
-  //====================================================================================================
-  // 	PADDING OF ARRAY, VERTICAL CUMULATIVE SUM
-  //====================================================================================================
-
-  // common
-  common.in2_pad_add_rows = common.in_rows;
-  common.in2_pad_add_cols = common.in_cols;
-
-  common.in2_pad_cumv_rows = common.in2_rows + 2 * common.in2_pad_add_rows;
-  common.in2_pad_cumv_cols = common.in2_cols + 2 * common.in2_pad_add_cols;
-  common.in2_pad_cumv_elem =
-      common.in2_pad_cumv_rows * common.in2_pad_cumv_cols;
-  common.in2_pad_cumv_mem = sizeof(float) * common.in2_pad_cumv_elem;
-
-  // pointers
-  for (i = 0; i < common.allPoints; i++) {
-    cudaMalloc((void **)&unique[i].d_in2_pad_cumv, common.in2_pad_cumv_mem);
-  }
-
-  //====================================================================================================
-  // 	SELECTION
-  //====================================================================================================
-
-  // common
-  common.in2_pad_cumv_sel_rowlow = 1 + common.in_rows; // (1 to n+1)
-  common.in2_pad_cumv_sel_rowhig = common.in2_pad_cumv_rows - 1;
-  common.in2_pad_cumv_sel_collow = 1;
-  common.in2_pad_cumv_sel_colhig = common.in2_pad_cumv_cols;
-  common.in2_pad_cumv_sel_rows =
-      common.in2_pad_cumv_sel_rowhig - common.in2_pad_cumv_sel_rowlow + 1;
-  common.in2_pad_cumv_sel_cols =
-      common.in2_pad_cumv_sel_colhig - common.in2_pad_cumv_sel_collow + 1;
-  common.in2_pad_cumv_sel_elem =
-      common.in2_pad_cumv_sel_rows * common.in2_pad_cumv_sel_cols;
-  common.in2_pad_cumv_sel_mem = sizeof(float) * common.in2_pad_cumv_sel_elem;
-
-  // pointers
-  for (i = 0; i < common.allPoints; i++) {
-    cudaMalloc((void **)&unique[i].d_in2_pad_cumv_sel,
-               common.in2_pad_cumv_sel_mem);
-  }
-
-  //====================================================================================================
-  // 	SELECTION	2, SUBTRACTION, HORIZONTAL CUMULATIVE SUM
-  //====================================================================================================
-
-  // common
-  common.in2_pad_cumv_sel2_rowlow = 1;
-  common.in2_pad_cumv_sel2_rowhig =
-      common.in2_pad_cumv_rows - common.in_rows - 1;
-  common.in2_pad_cumv_sel2_collow = 1;
-  common.in2_pad_cumv_sel2_colhig = common.in2_pad_cumv_cols;
-  common.in2_sub_cumh_rows =
-      common.in2_pad_cumv_sel2_rowhig - common.in2_pad_cumv_sel2_rowlow + 1;
-  common.in2_sub_cumh_cols =
-      common.in2_pad_cumv_sel2_colhig - common.in2_pad_cumv_sel2_collow + 1;
-  common.in2_sub_cumh_elem =
-      common.in2_sub_cumh_rows * common.in2_sub_cumh_cols;
-  common.in2_sub_cumh_mem = sizeof(float) * common.in2_sub_cumh_elem;
-
-  // pointers
-  for (i = 0; i < common.allPoints; i++) {
-    cudaMalloc((void **)&unique[i].d_in2_sub_cumh, common.in2_sub_cumh_mem);
-  }
-
-  //====================================================================================================
-  // 	SELECTION
-  //====================================================================================================
-
-  // common
-  common.in2_sub_cumh_sel_rowlow = 1;
-  common.in2_sub_cumh_sel_rowhig = common.in2_sub_cumh_rows;
-  common.in2_sub_cumh_sel_collow = 1 + common.in_cols;
-  common.in2_sub_cumh_sel_colhig = common.in2_sub_cumh_cols - 1;
-  common.in2_sub_cumh_sel_rows =
-      common.in2_sub_cumh_sel_rowhig - common.in2_sub_cumh_sel_rowlow + 1;
-  common.in2_sub_cumh_sel_cols =
-      common.in2_sub_cumh_sel_colhig - common.in2_sub_cumh_sel_collow + 1;
-  common.in2_sub_cumh_sel_elem =
-      common.in2_sub_cumh_sel_rows * common.in2_sub_cumh_sel_cols;
-  common.in2_sub_cumh_sel_mem = sizeof(float) * common.in2_sub_cumh_sel_elem;
-
-  // pointers
-  for (i = 0; i < common.allPoints; i++) {
-    cudaMalloc((void **)&unique[i].d_in2_sub_cumh_sel,
-               common.in2_sub_cumh_sel_mem);
-  }
-
-  //====================================================================================================
-  //	SELECTION 2, SUBTRACTION
-  //====================================================================================================
-
-  // common
-  common.in2_sub_cumh_sel2_rowlow = 1;
-  common.in2_sub_cumh_sel2_rowhig = common.in2_sub_cumh_rows;
-  common.in2_sub_cumh_sel2_collow = 1;
-  common.in2_sub_cumh_sel2_colhig =
-      common.in2_sub_cumh_cols - common.in_cols - 1;
-  common.in2_sub2_rows =
-      common.in2_sub_cumh_sel2_rowhig - common.in2_sub_cumh_sel2_rowlow + 1;
-  common.in2_sub2_cols =
-      common.in2_sub_cumh_sel2_colhig - common.in2_sub_cumh_sel2_collow + 1;
-  common.in2_sub2_elem = common.in2_sub2_rows * common.in2_sub2_cols;
-  common.in2_sub2_mem = sizeof(float) * common.in2_sub2_elem;
-
-  // pointers
-  for (i = 0; i < common.allPoints; i++) {
-    cudaMalloc((void **)&unique[i].d_in2_sub2, common.in2_sub2_mem);
-  }
-
-  //======================================================================================================================================================
-  //	CUMULATIVE SUM 2
-  //======================================================================================================================================================
-
-  //====================================================================================================
-  //	MULTIPLICATION
-  //====================================================================================================
-
-  // common: same dimensions as the in2 buffer
-  common.in2_sqr_rows = common.in2_rows;
-  common.in2_sqr_cols = common.in2_cols;
-  common.in2_sqr_elem = common.in2_elem;
-  common.in2_sqr_mem = common.in2_mem;
-
-  // pointers
-  for (i = 0; i < common.allPoints; i++) {
-    cudaMalloc((void **)&unique[i].d_in2_sqr, common.in2_sqr_mem);
-  }
-
-  //====================================================================================================
-  //	SELECTION 2, SUBTRACTION
-  //====================================================================================================
-
-  // common: same dimensions as the in2_sub2 buffer
-  common.in2_sqr_sub2_rows = common.in2_sub2_rows;
-  common.in2_sqr_sub2_cols = common.in2_sub2_cols;
-  common.in2_sqr_sub2_elem = common.in2_sub2_elem;
-  common.in2_sqr_sub2_mem = common.in2_sub2_mem;
-
-  // pointers
-  for (i = 0; i < common.allPoints; i++) {
-    cudaMalloc((void **)&unique[i].d_in2_sqr_sub2, common.in2_sqr_sub2_mem);
-  }
-
-  //======================================================================================================================================================
-  //	FINAL
-  //======================================================================================================================================================
-
-  // common: same dimensions as the template
-  common.in_sqr_rows = common.in_rows;
-  common.in_sqr_cols = common.in_cols;
-  common.in_sqr_elem = common.in_elem;
-  common.in_sqr_mem = common.in_mem;
-
-  // pointers
-  for (i = 0; i < common.allPoints; i++) {
-    cudaMalloc((void **)&unique[i].d_in_sqr, common.in_sqr_mem);
-  }
-
-  //======================================================================================================================================================
-  //	TEMPLATE MASK CREATE
-  //======================================================================================================================================================
-
-  // common
-  common.tMask_rows = common.in_rows + (common.sSize + 1 + common.sSize) - 1;
-  common.tMask_cols = common.tMask_rows;
-  common.tMask_elem = common.tMask_rows * common.tMask_cols;
-  common.tMask_mem = sizeof(float) * common.tMask_elem;
-
-  // pointers
-  for (i = 0; i < common.allPoints; i++) {
-    cudaMalloc((void **)&unique[i].d_tMask, common.tMask_mem);
-  }
-
-  //======================================================================================================================================================
-  //	POINT MASK INITIALIZE
-  //======================================================================================================================================================
-
-  // common: only the sizes are computed; this function allocates no buffer of mask_mem bytes
-  common.mask_rows = common.maxMove;
-  common.mask_cols = common.mask_rows;
-  common.mask_elem = common.mask_rows * common.mask_cols;
-  common.mask_mem = sizeof(float) * common.mask_elem;
-
-  //======================================================================================================================================================
-  //	MASK CONVOLUTION
-  //======================================================================================================================================================
-
-  // common
-  common.mask_conv_rows = common.tMask_rows; // number of rows in I
-  common.mask_conv_cols = common.tMask_cols; // number of columns in I
-  common.mask_conv_elem =
-      common.mask_conv_rows * common.mask_conv_cols; // number of elements
-  common.mask_conv_mem = sizeof(float) * common.mask_conv_elem;
-  common.mask_conv_ioffset = (common.mask_rows - 1) / 2;
-  if ((common.mask_rows - 1) % 2 > 0.5) { // remainder is 1: bump the halved offset by one
-    common.mask_conv_ioffset = common.mask_conv_ioffset + 1;
-  }
-  common.mask_conv_joffset = (common.mask_cols - 1) / 2;
-  if ((common.mask_cols - 1) % 2 > 0.5) { // same adjustment for the column offset
-    common.mask_conv_joffset = common.mask_conv_joffset + 1;
-  }
-
-  // pointers
-  for (i = 0; i < common.allPoints; i++) {
-    cudaMalloc((void **)&unique[i].d_mask_conv, common.mask_conv_mem);
-  }
-
-  //======================================================================================================================================================
-  //	KERNEL
-  //======================================================================================================================================================
-
-  //====================================================================================================
-  //	THREAD BLOCK
-  //====================================================================================================
-
-  // All kernels operations within kernel use same max size of threads. Size of
-  // block size is set to the size appropriate for max size operation (on padded
-  // matrix). Other use subsets of that.
-  threads.x = NUMBER_THREADS; // define the number of threads in the block
-  threads.y = 1;
-  blocks.x = common.allPoints; // define the number of blocks in the grid
-  blocks.y = 1;
-
-  //====================================================================================================
-  //	COPY ARGUMENTS
-  //====================================================================================================
-
-  cudaMemcpyToSymbol(d_common, &common, sizeof(params_common));
-  cudaMemcpyToSymbol(d_unique, &unique, sizeof(params_unique) * ALL_POINTS);
-
-  //====================================================================================================
-  //	PRINT FRAME PROGRESS START
-  //====================================================================================================
-
-  printf("frame progress: ");
-  fflush(NULL);
-
-  //====================================================================================================
-  //	LAUNCH
-  //====================================================================================================
-
-  for (common_change.frame_no = 0; common_change.frame_no < frames_processed;
-       common_change.frame_no++) {
-    printf("get frame\n");
-    // Fetch frame number frame_no from the video file (crop and scale flags
-    // are 0, convert flag is 1)
-    frame = get_frame( // NOTE(review): the result is not checked for NULL before the copy below
-        frames,                 // pointer to video file
-        common_change.frame_no, // number of frame that needs to be returned
-        0,                      // cropped?
-        0,                      // scaled?
-        1);                     // converted
-    printf("memcpy\n");
-    // copy frame to GPU memory
-    cudaMemcpy(common_change.d_frame, frame, common.frame_mem,
-               cudaMemcpyHostToDevice);
-    printf("toSymbol\n");
-    cudaMemcpyToSymbol(d_common_change, &common_change,
-                       sizeof(params_common_change));
-
-    // launch GPU kernel
-    printf("launch\n");
-    kernel<<<1, 32>>>(); // NOTE(review): launches 1 block of 32 threads; the blocks/threads dims set above are not used - confirm this is intended
-    cudaDeviceSynchronize();
-    printf("return\n");
-    // free frame after each loop iteration, since AVI library allocates memory
-    // for every frame fetched
-    printf("free\n");
-    free(frame);
-
-    // print frame progress
-    printf("%d ", common_change.frame_no);
-    fflush(NULL);
-  }
-
-  //====================================================================================================
-  //	PRINT FRAME PROGRESS END
-  //====================================================================================================
-
-  printf("\n");
-  fflush(NULL);
-
-  //====================================================================================================
-  //	OUTPUT
-  //====================================================================================================
-
-  // copy the full location arrays (sized for no_frames frames) back to the host
-  cudaMemcpy(common.tEndoRowLoc, common.d_tEndoRowLoc,
-             common.endo_mem * common.no_frames, cudaMemcpyDeviceToHost);
-  cudaMemcpy(common.tEndoColLoc, common.d_tEndoColLoc,
-             common.endo_mem * common.no_frames, cudaMemcpyDeviceToHost);
-
-  cudaMemcpy(common.tEpiRowLoc, common.d_tEpiRowLoc,
-             common.epi_mem * common.no_frames, cudaMemcpyDeviceToHost);
-  cudaMemcpy(common.tEpiColLoc, common.d_tEpiColLoc,
-             common.epi_mem * common.no_frames, cudaMemcpyDeviceToHost);
-
-#ifdef OUTPUT
-
-  //==================================================50
-  //	DUMP DATA TO FILE
-  //==================================================50
-  write_data("result.txt", common.no_frames, frames_processed,
-             common.endoPoints, common.tEndoRowLoc, common.tEndoColLoc,
-             common.epiPoints, common.tEpiRowLoc, common.tEpiColLoc);
-
-  //==================================================50
-  //	End
-  //==================================================50
-
-#endif
-
-  //======================================================================================================================================================
-  //	DEALLOCATION (NOTE(review): the AVI handle `frames` is not released anywhere in this function)
-  //======================================================================================================================================================
-
-  //====================================================================================================
-  //	COMMON
-  //====================================================================================================
-
-  // frame
-  cudaFree(common_change.d_frame);
-
-  // endo points
-  free(common.endoRow);
-  free(common.endoCol);
-  free(common.tEndoRowLoc);
-  free(common.tEndoColLoc);
-
-  cudaFree(common.d_endoRow);
-  cudaFree(common.d_endoCol);
-  cudaFree(common.d_tEndoRowLoc);
-  cudaFree(common.d_tEndoColLoc);
-
-  cudaFree(common.d_endoT);
-
-  // epi points
-  free(common.epiRow);
-  free(common.epiCol);
-  free(common.tEpiRowLoc);
-  free(common.tEpiColLoc);
-
-  cudaFree(common.d_epiRow);
-  cudaFree(common.d_epiCol);
-  cudaFree(common.d_tEpiRowLoc);
-  cudaFree(common.d_tEpiColLoc);
-
-  cudaFree(common.d_epiT);
-
-  //====================================================================================================
-  //	POINTERS
-  //====================================================================================================
-
-  // release the per-point device buffers allocated above
-  for (i = 0; i < common.allPoints; i++) {
-    cudaFree(unique[i].d_in2);
-
-    cudaFree(unique[i].d_conv);
-    cudaFree(unique[i].d_in2_pad_cumv);
-    cudaFree(unique[i].d_in2_pad_cumv_sel);
-    cudaFree(unique[i].d_in2_sub_cumh);
-    cudaFree(unique[i].d_in2_sub_cumh_sel);
-    cudaFree(unique[i].d_in2_sub2);
-    cudaFree(unique[i].d_in2_sqr);
-    cudaFree(unique[i].d_in2_sqr_sub2);
-    cudaFree(unique[i].d_in_sqr);
-
-    cudaFree(unique[i].d_tMask);
-    cudaFree(unique[i].d_mask_conv);
-  }
-}
-
-//===============================================================================================================================================================================================================
-//===============================================================================================================================================================================================================
-//	END OF MAIN FUNCTION
-//===============================================================================================================================================================================================================
-//===============================================================================================================================================================================================================
--- a/examples/heartwall/run.sh
+++ b/examples/heartwall/run.sh
@ -1,17 +0,0 @@
-#!/bin/bash
-
-cd AVI; make; cd ..;
-
-clang++ -DOUTPUT main.cu -I./AVI  --cuda-path=/usr/local/cuda-10.1 --cuda-gpu-arch=sm_61 -L/usr/local/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v
-
-
-/home/robinhan/repo/open_source_template/build/compilation/kernelTranslator main-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
-/home/robinhan/repo/open_source_template/build/compilation/hostTranslator main-host-x86_64-unknown-linux-gnu.bc host.bc
-
-llc --relocation-model=pic --filetype=obj  kernel.bc
-llc --relocation-model=pic --filetype=obj  host.bc
-
-
-g++ -Wall -L/home/robinhan/repo/open_source_template/build/runtime  -L/home/robinhan/repo/open_source_template/build/runtime/threadPool -o heartwall -fPIC -no-pie host.o kernel.o  ./AVI/avilib.o ./AVI/avimod.o -lc -lx86Runtime -lthreadPool -lpthread
-
-./heartwall /home/robinhan/repo/open_source_template/runtime/examples/rodinia-data/heartwall/test.avi 20
--- a/examples/heartwall/setdevice.cu
+++ b/examples/heartwall/setdevice.cu
@ -1,5 +0,0 @@
-////////////////////////////////////////////////////////////////////////////////
-// Set Device
-////////////////////////////////////////////////////////////////////////////////
-
-void setdevice(void) { cudaSetDevice(0); }
--- a/examples/hotspot/hotspot-cuda-nvptx64-nvidia-cuda-sm_61.ll
+++ b/examples/hotspot/hotspot-cuda-nvptx64-nvidia-cuda-sm_61.ll
@ -1,719 +0,0 @@
-; ModuleID = 'hotspot-cuda-nvptx64-nvidia-cuda-sm_61.bc'
-source_filename = "hotspot.cu"
-target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
-target triple = "nvptx64-nvidia-cuda"
-
-%struct.__cuda_builtin_blockIdx_t = type { i8 }
-%struct.__cuda_builtin_threadIdx_t = type { i8 }
-%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
-
-$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
-
-$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv = comdat any
-
-$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
-
-$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv = comdat any
-
-@_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda = internal addrspace(3) global [16 x [16 x float]] undef, align 4
-@_ZZ14calculate_tempiPfS_S_iiiiffffffE13power_on_cuda = internal addrspace(3) global [16 x [16 x float]] undef, align 4
-@_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t = internal addrspace(3) global [16 x [16 x float]] undef, align 4
-@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
-@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
-
-; Function Attrs: convergent noinline nounwind optnone
-define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
-entry:
-  %p.addr = alloca i8**, align 8
-  %s.addr = alloca i64, align 8
-  store i8** %p, i8*** %p.addr, align 8
-  store i64 %s, i64* %s.addr, align 8
-  ret i32 999
-}
-
-; Function Attrs: convergent noinline nounwind optnone
-define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
-entry:
-  %p.addr = alloca %struct.cudaFuncAttributes*, align 8
-  %c.addr = alloca i8*, align 8
-  store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
-  store i8* %c, i8** %c.addr, align 8
-  ret i32 999
-}
-
-; Function Attrs: convergent noinline nounwind optnone
-define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
-entry:
-  %value.addr = alloca i32*, align 8
-  %attr.addr = alloca i32, align 4
-  %device.addr = alloca i32, align 4
-  store i32* %value, i32** %value.addr, align 8
-  store i32 %attr, i32* %attr.addr, align 4
-  store i32 %device, i32* %device.addr, align 4
-  ret i32 999
-}
-
-; Function Attrs: convergent noinline nounwind optnone
-define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
-entry:
-  %device.addr = alloca i32*, align 8
-  store i32* %device, i32** %device.addr, align 8
-  ret i32 999
-}
-
-; Function Attrs: convergent noinline nounwind optnone
-define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
-entry:
-  %numBlocks.addr = alloca i32*, align 8
-  %func.addr = alloca i8*, align 8
-  %blockSize.addr = alloca i32, align 4
-  %dynamicSmemSize.addr = alloca i64, align 8
-  store i32* %numBlocks, i32** %numBlocks.addr, align 8
-  store i8* %func, i8** %func.addr, align 8
-  store i32 %blockSize, i32* %blockSize.addr, align 4
-  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
-  ret i32 999
-}
-
-; Function Attrs: convergent noinline nounwind optnone
-define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
-entry:
-  %numBlocks.addr = alloca i32*, align 8
-  %func.addr = alloca i8*, align 8
-  %blockSize.addr = alloca i32, align 4
-  %dynamicSmemSize.addr = alloca i64, align 8
-  %flags.addr = alloca i32, align 4
-  store i32* %numBlocks, i32** %numBlocks.addr, align 8
-  store i8* %func, i8** %func.addr, align 8
-  store i32 %blockSize, i32* %blockSize.addr, align 4
-  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
-  store i32 %flags, i32* %flags.addr, align 4
-  ret i32 999
-}
-
-; Function Attrs: convergent noinline nounwind optnone
-define dso_local void @_Z14calculate_tempiPfS_S_iiiiffffff(i32 %iteration, float* %power, float* %temp_src, float* %temp_dst, i32 %grid_cols, i32 %grid_rows, i32 %border_cols, i32 %border_rows, float %Cap, float %Rx, float %Ry, float %Rz, float %step, float %time_elapsed) #0 {
-entry:
-  %iteration.addr = alloca i32, align 4
-  %power.addr = alloca float*, align 8
-  %temp_src.addr = alloca float*, align 8
-  %temp_dst.addr = alloca float*, align 8
-  %grid_cols.addr = alloca i32, align 4
-  %grid_rows.addr = alloca i32, align 4
-  %border_cols.addr = alloca i32, align 4
-  %border_rows.addr = alloca i32, align 4
-  %Cap.addr = alloca float, align 4
-  %Rx.addr = alloca float, align 4
-  %Ry.addr = alloca float, align 4
-  %Rz.addr = alloca float, align 4
-  %step.addr = alloca float, align 4
-  %time_elapsed.addr = alloca float, align 4
-  %amb_temp = alloca float, align 4
-  %step_div_Cap = alloca float, align 4
-  %Rx_1 = alloca float, align 4
-  %Ry_1 = alloca float, align 4
-  %Rz_1 = alloca float, align 4
-  %bx = alloca i32, align 4
-  %by = alloca i32, align 4
-  %tx = alloca i32, align 4
-  %ty = alloca i32, align 4
-  %small_block_rows = alloca i32, align 4
-  %small_block_cols = alloca i32, align 4
-  %blkY = alloca i32, align 4
-  %blkX = alloca i32, align 4
-  %blkYmax = alloca i32, align 4
-  %blkXmax = alloca i32, align 4
-  %yidx = alloca i32, align 4
-  %xidx = alloca i32, align 4
-  %loadYidx = alloca i32, align 4
-  %loadXidx = alloca i32, align 4
-  %index = alloca i32, align 4
-  %validYmin = alloca i32, align 4
-  %validYmax = alloca i32, align 4
-  %validXmin = alloca i32, align 4
-  %validXmax = alloca i32, align 4
-  %N = alloca i32, align 4
-  %S = alloca i32, align 4
-  %W = alloca i32, align 4
-  %E = alloca i32, align 4
-  %computed = alloca i8, align 1
-  %i = alloca i32, align 4
-  store i32 %iteration, i32* %iteration.addr, align 4
-  store float* %power, float** %power.addr, align 8
-  store float* %temp_src, float** %temp_src.addr, align 8
-  store float* %temp_dst, float** %temp_dst.addr, align 8
-  store i32 %grid_cols, i32* %grid_cols.addr, align 4
-  store i32 %grid_rows, i32* %grid_rows.addr, align 4
-  store i32 %border_cols, i32* %border_cols.addr, align 4
-  store i32 %border_rows, i32* %border_rows.addr, align 4
-  store float %Cap, float* %Cap.addr, align 4
-  store float %Rx, float* %Rx.addr, align 4
-  store float %Ry, float* %Ry.addr, align 4
-  store float %Rz, float* %Rz.addr, align 4
-  store float %step, float* %step.addr, align 4
-  store float %time_elapsed, float* %time_elapsed.addr, align 4
-  store float 8.000000e+01, float* %amb_temp, align 4
-  %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
-  store i32 %call, i32* %bx, align 4
-  %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #2
-  store i32 %call1, i32* %by, align 4
-  %call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
-  store i32 %call2, i32* %tx, align 4
-  %call3 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #2
-  store i32 %call3, i32* %ty, align 4
-  %0 = load float, float* %step.addr, align 4
-  %1 = load float, float* %Cap.addr, align 4
-  %div = fdiv float %0, %1
-  store float %div, float* %step_div_Cap, align 4
-  %2 = load float, float* %Rx.addr, align 4
-  %div4 = fdiv float 1.000000e+00, %2
-  store float %div4, float* %Rx_1, align 4
-  %3 = load float, float* %Ry.addr, align 4
-  %div5 = fdiv float 1.000000e+00, %3
-  store float %div5, float* %Ry_1, align 4
-  %4 = load float, float* %Rz.addr, align 4
-  %div6 = fdiv float 1.000000e+00, %4
-  store float %div6, float* %Rz_1, align 4
-  %5 = load i32, i32* %iteration.addr, align 4
-  %mul = mul nsw i32 %5, 2
-  %sub = sub nsw i32 16, %mul
-  store i32 %sub, i32* %small_block_rows, align 4
-  %6 = load i32, i32* %iteration.addr, align 4
-  %mul7 = mul nsw i32 %6, 2
-  %sub8 = sub nsw i32 16, %mul7
-  store i32 %sub8, i32* %small_block_cols, align 4
-  %7 = load i32, i32* %small_block_rows, align 4
-  %8 = load i32, i32* %by, align 4
-  %mul9 = mul nsw i32 %7, %8
-  %9 = load i32, i32* %border_rows.addr, align 4
-  %sub10 = sub nsw i32 %mul9, %9
-  store i32 %sub10, i32* %blkY, align 4
-  %10 = load i32, i32* %small_block_cols, align 4
-  %11 = load i32, i32* %bx, align 4
-  %mul11 = mul nsw i32 %10, %11
-  %12 = load i32, i32* %border_cols.addr, align 4
-  %sub12 = sub nsw i32 %mul11, %12
-  store i32 %sub12, i32* %blkX, align 4
-  %13 = load i32, i32* %blkY, align 4
-  %add = add nsw i32 %13, 16
-  %sub13 = sub nsw i32 %add, 1
-  store i32 %sub13, i32* %blkYmax, align 4
-  %14 = load i32, i32* %blkX, align 4
-  %add14 = add nsw i32 %14, 16
-  %sub15 = sub nsw i32 %add14, 1
-  store i32 %sub15, i32* %blkXmax, align 4
-  %15 = load i32, i32* %blkY, align 4
-  %16 = load i32, i32* %ty, align 4
-  %add16 = add nsw i32 %15, %16
-  store i32 %add16, i32* %yidx, align 4
-  %17 = load i32, i32* %blkX, align 4
-  %18 = load i32, i32* %tx, align 4
-  %add17 = add nsw i32 %17, %18
-  store i32 %add17, i32* %xidx, align 4
-  %19 = load i32, i32* %yidx, align 4
-  store i32 %19, i32* %loadYidx, align 4
-  %20 = load i32, i32* %xidx, align 4
-  store i32 %20, i32* %loadXidx, align 4
-  %21 = load i32, i32* %grid_cols.addr, align 4
-  %22 = load i32, i32* %loadYidx, align 4
-  %mul18 = mul nsw i32 %21, %22
-  %23 = load i32, i32* %loadXidx, align 4
-  %add19 = add nsw i32 %mul18, %23
-  store i32 %add19, i32* %index, align 4
-  %24 = load i32, i32* %loadYidx, align 4
-  %cmp = icmp sge i32 %24, 0
-  br i1 %cmp, label %land.lhs.true, label %if.end
-
-land.lhs.true:                                    ; preds = %entry
-  %25 = load i32, i32* %loadYidx, align 4
-  %26 = load i32, i32* %grid_rows.addr, align 4
-  %sub20 = sub nsw i32 %26, 1
-  %cmp21 = icmp sle i32 %25, %sub20
-  br i1 %cmp21, label %land.lhs.true22, label %if.end
-
-land.lhs.true22:                                  ; preds = %land.lhs.true
-  %27 = load i32, i32* %loadXidx, align 4
-  %cmp23 = icmp sge i32 %27, 0
-  br i1 %cmp23, label %land.lhs.true24, label %if.end
-
-land.lhs.true24:                                  ; preds = %land.lhs.true22
-  %28 = load i32, i32* %loadXidx, align 4
-  %29 = load i32, i32* %grid_cols.addr, align 4
-  %sub25 = sub nsw i32 %29, 1
-  %cmp26 = icmp sle i32 %28, %sub25
-  br i1 %cmp26, label %if.then, label %if.end
-
-if.then:                                          ; preds = %land.lhs.true24
-  %30 = load float*, float** %temp_src.addr, align 8
-  %31 = load i32, i32* %index, align 4
-  %idxprom = sext i32 %31 to i64
-  %arrayidx = getelementptr inbounds float, float* %30, i64 %idxprom
-  %32 = load float, float* %arrayidx, align 4
-  %33 = load i32, i32* %ty, align 4
-  %idxprom27 = sext i32 %33 to i64
-  %arrayidx28 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom27
-  %34 = load i32, i32* %tx, align 4
-  %idxprom29 = sext i32 %34 to i64
-  %arrayidx30 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx28, i64 0, i64 %idxprom29
-  store float %32, float* %arrayidx30, align 4
-  %35 = load float*, float** %power.addr, align 8
-  %36 = load i32, i32* %index, align 4
-  %idxprom31 = sext i32 %36 to i64
-  %arrayidx32 = getelementptr inbounds float, float* %35, i64 %idxprom31
-  %37 = load float, float* %arrayidx32, align 4
-  %38 = load i32, i32* %ty, align 4
-  %idxprom33 = sext i32 %38 to i64
-  %arrayidx34 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE13power_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom33
-  %39 = load i32, i32* %tx, align 4
-  %idxprom35 = sext i32 %39 to i64
-  %arrayidx36 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx34, i64 0, i64 %idxprom35
-  store float %37, float* %arrayidx36, align 4
-  br label %if.end
-
-if.end:                                           ; preds = %if.then, %land.lhs.true24, %land.lhs.true22, %land.lhs.true, %entry
-  call void @llvm.nvvm.barrier0()
-  %40 = load i32, i32* %blkY, align 4
-  %cmp37 = icmp slt i32 %40, 0
-  br i1 %cmp37, label %cond.true, label %cond.false
-
-cond.true:                                        ; preds = %if.end
-  %41 = load i32, i32* %blkY, align 4
-  %sub38 = sub nsw i32 0, %41
-  br label %cond.end
-
-cond.false:                                       ; preds = %if.end
-  br label %cond.end
-
-cond.end:                                         ; preds = %cond.false, %cond.true
-  %cond = phi i32 [ %sub38, %cond.true ], [ 0, %cond.false ]
-  store i32 %cond, i32* %validYmin, align 4
-  %42 = load i32, i32* %blkYmax, align 4
-  %43 = load i32, i32* %grid_rows.addr, align 4
-  %sub39 = sub nsw i32 %43, 1
-  %cmp40 = icmp sgt i32 %42, %sub39
-  br i1 %cmp40, label %cond.true41, label %cond.false45
-
-cond.true41:                                      ; preds = %cond.end
-  %44 = load i32, i32* %blkYmax, align 4
-  %45 = load i32, i32* %grid_rows.addr, align 4
-  %sub42 = sub nsw i32 %44, %45
-  %add43 = add nsw i32 %sub42, 1
-  %sub44 = sub nsw i32 15, %add43
-  br label %cond.end46
-
-cond.false45:                                     ; preds = %cond.end
-  br label %cond.end46
-
-cond.end46:                                       ; preds = %cond.false45, %cond.true41
-  %cond47 = phi i32 [ %sub44, %cond.true41 ], [ 15, %cond.false45 ]
-  store i32 %cond47, i32* %validYmax, align 4
-  %46 = load i32, i32* %blkX, align 4
-  %cmp48 = icmp slt i32 %46, 0
-  br i1 %cmp48, label %cond.true49, label %cond.false51
-
-cond.true49:                                      ; preds = %cond.end46
-  %47 = load i32, i32* %blkX, align 4
-  %sub50 = sub nsw i32 0, %47
-  br label %cond.end52
-
-cond.false51:                                     ; preds = %cond.end46
-  br label %cond.end52
-
-cond.end52:                                       ; preds = %cond.false51, %cond.true49
-  %cond53 = phi i32 [ %sub50, %cond.true49 ], [ 0, %cond.false51 ]
-  store i32 %cond53, i32* %validXmin, align 4
-  %48 = load i32, i32* %blkXmax, align 4
-  %49 = load i32, i32* %grid_cols.addr, align 4
-  %sub54 = sub nsw i32 %49, 1
-  %cmp55 = icmp sgt i32 %48, %sub54
-  br i1 %cmp55, label %cond.true56, label %cond.false60
-
-cond.true56:                                      ; preds = %cond.end52
-  %50 = load i32, i32* %blkXmax, align 4
-  %51 = load i32, i32* %grid_cols.addr, align 4
-  %sub57 = sub nsw i32 %50, %51
-  %add58 = add nsw i32 %sub57, 1
-  %sub59 = sub nsw i32 15, %add58
-  br label %cond.end61
-
-cond.false60:                                     ; preds = %cond.end52
-  br label %cond.end61
-
-cond.end61:                                       ; preds = %cond.false60, %cond.true56
-  %cond62 = phi i32 [ %sub59, %cond.true56 ], [ 15, %cond.false60 ]
-  store i32 %cond62, i32* %validXmax, align 4
-  %52 = load i32, i32* %ty, align 4
-  %sub63 = sub nsw i32 %52, 1
-  store i32 %sub63, i32* %N, align 4
-  %53 = load i32, i32* %ty, align 4
-  %add64 = add nsw i32 %53, 1
-  store i32 %add64, i32* %S, align 4
-  %54 = load i32, i32* %tx, align 4
-  %sub65 = sub nsw i32 %54, 1
-  store i32 %sub65, i32* %W, align 4
-  %55 = load i32, i32* %tx, align 4
-  %add66 = add nsw i32 %55, 1
-  store i32 %add66, i32* %E, align 4
-  %56 = load i32, i32* %N, align 4
-  %57 = load i32, i32* %validYmin, align 4
-  %cmp67 = icmp slt i32 %56, %57
-  br i1 %cmp67, label %cond.true68, label %cond.false69
-
-cond.true68:                                      ; preds = %cond.end61
-  %58 = load i32, i32* %validYmin, align 4
-  br label %cond.end70
-
-cond.false69:                                     ; preds = %cond.end61
-  %59 = load i32, i32* %N, align 4
-  br label %cond.end70
-
-cond.end70:                                       ; preds = %cond.false69, %cond.true68
-  %cond71 = phi i32 [ %58, %cond.true68 ], [ %59, %cond.false69 ]
-  store i32 %cond71, i32* %N, align 4
-  %60 = load i32, i32* %S, align 4
-  %61 = load i32, i32* %validYmax, align 4
-  %cmp72 = icmp sgt i32 %60, %61
-  br i1 %cmp72, label %cond.true73, label %cond.false74
-
-cond.true73:                                      ; preds = %cond.end70
-  %62 = load i32, i32* %validYmax, align 4
-  br label %cond.end75
-
-cond.false74:                                     ; preds = %cond.end70
-  %63 = load i32, i32* %S, align 4
-  br label %cond.end75
-
-cond.end75:                                       ; preds = %cond.false74, %cond.true73
-  %cond76 = phi i32 [ %62, %cond.true73 ], [ %63, %cond.false74 ]
-  store i32 %cond76, i32* %S, align 4
-  %64 = load i32, i32* %W, align 4
-  %65 = load i32, i32* %validXmin, align 4
-  %cmp77 = icmp slt i32 %64, %65
-  br i1 %cmp77, label %cond.true78, label %cond.false79
-
-cond.true78:                                      ; preds = %cond.end75
-  %66 = load i32, i32* %validXmin, align 4
-  br label %cond.end80
-
-cond.false79:                                     ; preds = %cond.end75
-  %67 = load i32, i32* %W, align 4
-  br label %cond.end80
-
-cond.end80:                                       ; preds = %cond.false79, %cond.true78
-  %cond81 = phi i32 [ %66, %cond.true78 ], [ %67, %cond.false79 ]
-  store i32 %cond81, i32* %W, align 4
-  %68 = load i32, i32* %E, align 4
-  %69 = load i32, i32* %validXmax, align 4
-  %cmp82 = icmp sgt i32 %68, %69
-  br i1 %cmp82, label %cond.true83, label %cond.false84
-
-cond.true83:                                      ; preds = %cond.end80
-  %70 = load i32, i32* %validXmax, align 4
-  br label %cond.end85
-
-cond.false84:                                     ; preds = %cond.end80
-  %71 = load i32, i32* %E, align 4
-  br label %cond.end85
-
-cond.end85:                                       ; preds = %cond.false84, %cond.true83
-  %cond86 = phi i32 [ %70, %cond.true83 ], [ %71, %cond.false84 ]
-  store i32 %cond86, i32* %E, align 4
-  store i32 0, i32* %i, align 4
-  br label %for.cond
-
-for.cond:                                         ; preds = %for.inc, %cond.end85
-  %72 = load i32, i32* %i, align 4
-  %73 = load i32, i32* %iteration.addr, align 4
-  %cmp87 = icmp slt i32 %72, %73
-  br i1 %cmp87, label %for.body, label %for.end
-
-for.body:                                         ; preds = %for.cond
-  store i8 0, i8* %computed, align 1
-  %74 = load i32, i32* %tx, align 4
-  %75 = load i32, i32* %i, align 4
-  %add88 = add nsw i32 %75, 1
-  %cmp89 = icmp sge i32 %74, %add88
-  br i1 %cmp89, label %land.lhs.true90, label %if.end175
-
-land.lhs.true90:                                  ; preds = %for.body
-  %76 = load i32, i32* %tx, align 4
-  %77 = load i32, i32* %i, align 4
-  %sub91 = sub nsw i32 16, %77
-  %sub92 = sub nsw i32 %sub91, 2
-  %cmp93 = icmp sle i32 %76, %sub92
-  br i1 %cmp93, label %land.lhs.true94, label %if.end175
-
-land.lhs.true94:                                  ; preds = %land.lhs.true90
-  %78 = load i32, i32* %ty, align 4
-  %79 = load i32, i32* %i, align 4
-  %add95 = add nsw i32 %79, 1
-  %cmp96 = icmp sge i32 %78, %add95
-  br i1 %cmp96, label %land.lhs.true97, label %if.end175
-
-land.lhs.true97:                                  ; preds = %land.lhs.true94
-  %80 = load i32, i32* %ty, align 4
-  %81 = load i32, i32* %i, align 4
-  %sub98 = sub nsw i32 16, %81
-  %sub99 = sub nsw i32 %sub98, 2
-  %cmp100 = icmp sle i32 %80, %sub99
-  br i1 %cmp100, label %land.lhs.true101, label %if.end175
-
-land.lhs.true101:                                 ; preds = %land.lhs.true97
-  %82 = load i32, i32* %tx, align 4
-  %83 = load i32, i32* %validXmin, align 4
-  %cmp102 = icmp sge i32 %82, %83
-  br i1 %cmp102, label %land.lhs.true103, label %if.end175
-
-land.lhs.true103:                                 ; preds = %land.lhs.true101
-  %84 = load i32, i32* %tx, align 4
-  %85 = load i32, i32* %validXmax, align 4
-  %cmp104 = icmp sle i32 %84, %85
-  br i1 %cmp104, label %land.lhs.true105, label %if.end175
-
-land.lhs.true105:                                 ; preds = %land.lhs.true103
-  %86 = load i32, i32* %ty, align 4
-  %87 = load i32, i32* %validYmin, align 4
-  %cmp106 = icmp sge i32 %86, %87
-  br i1 %cmp106, label %land.lhs.true107, label %if.end175
-
-land.lhs.true107:                                 ; preds = %land.lhs.true105
-  %88 = load i32, i32* %ty, align 4
-  %89 = load i32, i32* %validYmax, align 4
-  %cmp108 = icmp sle i32 %88, %89
-  br i1 %cmp108, label %if.then109, label %if.end175
-
-if.then109:                                       ; preds = %land.lhs.true107
-  store i8 1, i8* %computed, align 1
-  %90 = load i32, i32* %ty, align 4
-  %idxprom110 = sext i32 %90 to i64
-  %arrayidx111 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom110
-  %91 = load i32, i32* %tx, align 4
-  %idxprom112 = sext i32 %91 to i64
-  %arrayidx113 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx111, i64 0, i64 %idxprom112
-  %92 = load float, float* %arrayidx113, align 4
-  %conv = fpext float %92 to double
-  %93 = load float, float* %step_div_Cap, align 4
-  %conv114 = fpext float %93 to double
-  %94 = load i32, i32* %ty, align 4
-  %idxprom115 = sext i32 %94 to i64
-  %arrayidx116 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE13power_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom115
-  %95 = load i32, i32* %tx, align 4
-  %idxprom117 = sext i32 %95 to i64
-  %arrayidx118 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx116, i64 0, i64 %idxprom117
-  %96 = load float, float* %arrayidx118, align 4
-  %conv119 = fpext float %96 to double
-  %97 = load i32, i32* %S, align 4
-  %idxprom120 = sext i32 %97 to i64
-  %arrayidx121 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom120
-  %98 = load i32, i32* %tx, align 4
-  %idxprom122 = sext i32 %98 to i64
-  %arrayidx123 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx121, i64 0, i64 %idxprom122
-  %99 = load float, float* %arrayidx123, align 4
-  %100 = load i32, i32* %N, align 4
-  %idxprom124 = sext i32 %100 to i64
-  %arrayidx125 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom124
-  %101 = load i32, i32* %tx, align 4
-  %idxprom126 = sext i32 %101 to i64
-  %arrayidx127 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx125, i64 0, i64 %idxprom126
-  %102 = load float, float* %arrayidx127, align 4
-  %add128 = fadd contract float %99, %102
-  %conv129 = fpext float %add128 to double
-  %103 = load i32, i32* %ty, align 4
-  %idxprom130 = sext i32 %103 to i64
-  %arrayidx131 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom130
-  %104 = load i32, i32* %tx, align 4
-  %idxprom132 = sext i32 %104 to i64
-  %arrayidx133 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx131, i64 0, i64 %idxprom132
-  %105 = load float, float* %arrayidx133, align 4
-  %conv134 = fpext float %105 to double
-  %mul135 = fmul contract double 2.000000e+00, %conv134
-  %sub136 = fsub contract double %conv129, %mul135
-  %106 = load float, float* %Ry_1, align 4
-  %conv137 = fpext float %106 to double
-  %mul138 = fmul contract double %sub136, %conv137
-  %add139 = fadd contract double %conv119, %mul138
-  %107 = load i32, i32* %ty, align 4
-  %idxprom140 = sext i32 %107 to i64
-  %arrayidx141 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom140
-  %108 = load i32, i32* %E, align 4
-  %idxprom142 = sext i32 %108 to i64
-  %arrayidx143 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx141, i64 0, i64 %idxprom142
-  %109 = load float, float* %arrayidx143, align 4
-  %110 = load i32, i32* %ty, align 4
-  %idxprom144 = sext i32 %110 to i64
-  %arrayidx145 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom144
-  %111 = load i32, i32* %W, align 4
-  %idxprom146 = sext i32 %111 to i64
-  %arrayidx147 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx145, i64 0, i64 %idxprom146
-  %112 = load float, float* %arrayidx147, align 4
-  %add148 = fadd contract float %109, %112
-  %conv149 = fpext float %add148 to double
-  %113 = load i32, i32* %ty, align 4
-  %idxprom150 = sext i32 %113 to i64
-  %arrayidx151 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom150
-  %114 = load i32, i32* %tx, align 4
-  %idxprom152 = sext i32 %114 to i64
-  %arrayidx153 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx151, i64 0, i64 %idxprom152
-  %115 = load float, float* %arrayidx153, align 4
-  %conv154 = fpext float %115 to double
-  %mul155 = fmul contract double 2.000000e+00, %conv154
-  %sub156 = fsub contract double %conv149, %mul155
-  %116 = load float, float* %Rx_1, align 4
-  %conv157 = fpext float %116 to double
-  %mul158 = fmul contract double %sub156, %conv157
-  %add159 = fadd contract double %add139, %mul158
-  %117 = load float, float* %amb_temp, align 4
-  %118 = load i32, i32* %ty, align 4
-  %idxprom160 = sext i32 %118 to i64
-  %arrayidx161 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom160
-  %119 = load i32, i32* %tx, align 4
-  %idxprom162 = sext i32 %119 to i64
-  %arrayidx163 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx161, i64 0, i64 %idxprom162
-  %120 = load float, float* %arrayidx163, align 4
-  %sub164 = fsub contract float %117, %120
-  %121 = load float, float* %Rz_1, align 4
-  %mul165 = fmul contract float %sub164, %121
-  %conv166 = fpext float %mul165 to double
-  %add167 = fadd contract double %add159, %conv166
-  %mul168 = fmul contract double %conv114, %add167
-  %add169 = fadd contract double %conv, %mul168
-  %conv170 = fptrunc double %add169 to float
-  %122 = load i32, i32* %ty, align 4
-  %idxprom171 = sext i32 %122 to i64
-  %arrayidx172 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t to [16 x [16 x float]]*), i64 0, i64 %idxprom171
-  %123 = load i32, i32* %tx, align 4
-  %idxprom173 = sext i32 %123 to i64
-  %arrayidx174 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx172, i64 0, i64 %idxprom173
-  store float %conv170, float* %arrayidx174, align 4
-  br label %if.end175
-
-if.end175:                                        ; preds = %if.then109, %land.lhs.true107, %land.lhs.true105, %land.lhs.true103, %land.lhs.true101, %land.lhs.true97, %land.lhs.true94, %land.lhs.true90, %for.body
-  call void @llvm.nvvm.barrier0()
-  %124 = load i32, i32* %i, align 4
-  %125 = load i32, i32* %iteration.addr, align 4
-  %sub176 = sub nsw i32 %125, 1
-  %cmp177 = icmp eq i32 %124, %sub176
-  br i1 %cmp177, label %if.then178, label %if.end179
-
-if.then178:                                       ; preds = %if.end175
-  br label %for.end
-
-if.end179:                                        ; preds = %if.end175
-  %126 = load i8, i8* %computed, align 1
-  %tobool = trunc i8 %126 to i1
-  br i1 %tobool, label %if.then180, label %if.end189
-
-if.then180:                                       ; preds = %if.end179
-  %127 = load i32, i32* %ty, align 4
-  %idxprom181 = sext i32 %127 to i64
-  %arrayidx182 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t to [16 x [16 x float]]*), i64 0, i64 %idxprom181
-  %128 = load i32, i32* %tx, align 4
-  %idxprom183 = sext i32 %128 to i64
-  %arrayidx184 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx182, i64 0, i64 %idxprom183
-  %129 = load float, float* %arrayidx184, align 4
-  %130 = load i32, i32* %ty, align 4
-  %idxprom185 = sext i32 %130 to i64
-  %arrayidx186 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom185
-  %131 = load i32, i32* %tx, align 4
-  %idxprom187 = sext i32 %131 to i64
-  %arrayidx188 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx186, i64 0, i64 %idxprom187
-  store float %129, float* %arrayidx188, align 4
-  br label %if.end189
-
-if.end189:                                        ; preds = %if.then180, %if.end179
-  call void @llvm.nvvm.barrier0()
-  br label %for.inc
-
-for.inc:                                          ; preds = %if.end189
-  %132 = load i32, i32* %i, align 4
-  %inc = add nsw i32 %132, 1
-  store i32 %inc, i32* %i, align 4
-  br label %for.cond
-
-for.end:                                          ; preds = %if.then178, %for.cond
-  %133 = load i8, i8* %computed, align 1
-  %tobool190 = trunc i8 %133 to i1
-  br i1 %tobool190, label %if.then191, label %if.end198
-
-if.then191:                                       ; preds = %for.end
-  %134 = load i32, i32* %ty, align 4
-  %idxprom192 = sext i32 %134 to i64
-  %arrayidx193 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t to [16 x [16 x float]]*), i64 0, i64 %idxprom192
-  %135 = load i32, i32* %tx, align 4
-  %idxprom194 = sext i32 %135 to i64
-  %arrayidx195 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx193, i64 0, i64 %idxprom194
-  %136 = load float, float* %arrayidx195, align 4
-  %137 = load float*, float** %temp_dst.addr, align 8
-  %138 = load i32, i32* %index, align 4
-  %idxprom196 = sext i32 %138 to i64
-  %arrayidx197 = getelementptr inbounds float, float* %137, i64 %idxprom196
-  store float %136, float* %arrayidx197, align 4
-  br label %if.end198
-
-if.end198:                                        ; preds = %if.then191, %for.end
-  ret void
-}
-
-; Function Attrs: alwaysinline convergent nounwind
-define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
-entry:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
-  ret i32 %0
-}
-
-; Function Attrs: alwaysinline convergent nounwind
-define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
-entry:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
-  ret i32 %0
-}
-
-; Function Attrs: alwaysinline convergent nounwind
-define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
-entry:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  ret i32 %0
-}
-
-; Function Attrs: alwaysinline convergent nounwind
-define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
-entry:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
-  ret i32 %0
-}
-
-; Function Attrs: convergent nounwind
-declare void @llvm.nvvm.barrier0() #2
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #3
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #3
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #3
-
-attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { convergent nounwind }
-attributes #3 = { nounwind readnone }
-
-!llvm.module.flags = !{!0, !1, !2}
-!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
-!llvm.ident = !{!8}
-!nvvmir.version = !{!9}
-
-!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
-!1 = !{i32 1, !"wchar_size", i32 4}
-!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
-!3 = !{void (i32, float*, float*, float*, i32, i32, i32, i32, float, float, float, float, float, float)* @_Z14calculate_tempiPfS_S_iiiiffffff, !"kernel", i32 1}
-!4 = !{null, !"align", i32 8}
-!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
-!6 = !{null, !"align", i32 16}
-!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
-!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
-!9 = !{i32 1, i32 4}
--- a/examples/hotspot/hotspot-host-x86_64-unknown-linux-gnu.ll
+++ b/examples/hotspot/hotspot-host-x86_64-unknown-linux-gnu.ll
--- a/examples/hotspot/hotspot.cu
+++ b/examples/hotspot/hotspot.cu
@ -1,353 +0,0 @@
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <time.h>
-
-/* Thread-block edge length (threads per block side). Taken from the first of
- * RD_WG_SIZE_0_0, RD_WG_SIZE_0 or RD_WG_SIZE that is defined on the command
- * line; defaults to 16 otherwise. */
-#ifdef RD_WG_SIZE_0_0
-#define BLOCK_SIZE RD_WG_SIZE_0_0
-#elif defined(RD_WG_SIZE_0)
-#define BLOCK_SIZE RD_WG_SIZE_0
-#elif defined(RD_WG_SIZE)
-#define BLOCK_SIZE RD_WG_SIZE
-#else
-#define BLOCK_SIZE 16
-#endif
-
-/* size of the line buffers used when reading and writing grid files */
-#define STR_SIZE 256
-
-/* maximum power density possible (say 300W for a 10mm x 10mm chip) */
-#define MAX_PD (3.0e6)
-/* required precision in degrees */
-#define PRECISION 0.001
-#define SPEC_HEAT_SI 1.75e6
-#define K_SI 100
-/* capacitance fitting factor */
-#define FACTOR_CHIP 0.5
-
-/* chip parameters (host side; consumed by compute_tran_temp) */
-float t_chip = 0.0005;
-float chip_height = 0.016;
-float chip_width = 0.016;
-/* ambient temperature, assuming no package at all.
- * Note: the kernel declares its own local amb_temp with the same value and
- * does not read this global. */
-float amb_temp = 80.0;
-
-/* forward declaration: run() is defined after main() */
-void run(int argc, char **argv);
-
-/* define timer macros.
- * NOTE(review): startCycle()/stopCycle() are not defined in this file and
- * none of these macros is used here. "%Lu" is not a standard conversion
- * for integer arguments; presumably "%llu" was intended -- confirm before
- * using pin_stats_dump. */
-#define pin_stats_reset() startCycle()
-#define pin_stats_pause(cycles) stopCycle(cycles)
-#define pin_stats_dump(cycles) printf("timer: %Lu\n", cycles)
-
-void fatal(char *s) { fprintf(stderr, "error: %s\n", s); }
-
-/* Write a grid to a text file, one cell per line, in row-major order.
- *
- * Each line has the form "<running cell index>\t<value>\n".
- *
- *   vect       grid values, read as vect[i * grid_cols + j]
- *   grid_rows  number of rows to write
- *   grid_cols  number of columns to write
- *   file       path of the output file (truncated/created)
- *
- * If the file cannot be opened, a message is printed on stdout and nothing
- * is written. */
-void writeoutput(float *vect, int grid_rows, int grid_cols, char *file) {
-
-  int i, j, index = 0;
-  FILE *fp;
-
-  if ((fp = fopen(file, "w")) == 0) {
-    printf("The file was not opened\n");
-    // Bail out: continuing would pass a null stream to the write calls.
-    return;
-  }
-
-  for (i = 0; i < grid_rows; i++)
-    for (j = 0; j < grid_cols; j++) {
-      // Format straight into the stream; no fixed-size staging buffer.
-      fprintf(fp, "%d\t%g\n", index, vect[i * grid_cols + j]);
-      index++;
-    }
-
-  fclose(fp);
-}
-
-/* Read grid_rows * grid_cols floating-point values from a text file, one
- * value per line, into vect in row-major order.
- *
- *   vect       destination, written as vect[i * grid_cols + j]
- *   grid_rows  number of rows expected
- *   grid_cols  number of columns expected
- *   file       path of the input file
- *
- * Error handling:
- *   - file cannot be opened: a message is printed on stdout and vect is left
- *     untouched;
- *   - file ends early: fatal() is called once and reading stops, leaving the
- *     remaining cells untouched;
- *   - a line does not start with a number: fatal() is called and the
- *     previously parsed value (0 for the first line) is stored. */
-void readinput(float *vect, int grid_rows, int grid_cols, char *file) {
-
-  int i, j;
-  FILE *fp;
-  char str[STR_SIZE];
-  float val = 0.0f; // defined even if the very first line fails to parse
-
-  if ((fp = fopen(file, "r")) == 0) {
-    printf("The file was not opened\n");
-    // Bail out: continuing would pass a null stream to fgets().
-    return;
-  }
-
-  for (i = 0; i <= grid_rows - 1; i++)
-    for (j = 0; j <= grid_cols - 1; j++) {
-      // Test the fgets() result rather than feof(): feof() is also set when
-      // the last line was read successfully but has no trailing newline.
-      if (fgets(str, STR_SIZE, fp) == NULL) {
-        fatal("not enough lines in file");
-        fclose(fp);
-        return;
-      }
-      if ((sscanf(str, "%f", &val) != 1))
-        fatal("invalid file format");
-      vect[i * grid_cols + j] = val;
-    }
-
-  fclose(fp);
-}
-
-/* True when min <= x <= max (x is evaluated twice). */
-#define IN_RANGE(x, min, max) ((x) >= (min) && (x) <= (max))
-/* Clamp the lvalue x into [min, max] in place. Every argument is
- * parenthesized so that compound expressions expand safely; x may be
- * evaluated more than once. */
-#define CLAMP_RANGE(x, min, max)                                               \
-  ((x) = ((x) < (min)) ? (min) : (((x) > (max)) ? (max) : (x)))
-/* Smaller of a and b (each argument may be evaluated twice). */
-#define MIN(a, b) ((a) <= (b) ? (a) : (b))
-
-/* Advance the temperature grid by `iteration` time steps.
- *
- * Each thread block loads a BLOCK_SIZE x BLOCK_SIZE tile of temp_src and power
- * into shared memory. The tile origin is shifted by (border_rows, border_cols)
- * so that neighbouring tiles overlap. The block then performs `iteration`
- * stencil updates in shared memory. At step i only threads whose tile
- * coordinates lie in [i + 1, BLOCK_SIZE - i - 2] (and inside the grid) update,
- * so the updated region shrinks by one cell per side per step. The threads
- * that updated in the last step write their value to temp_dst.
- *
- * Neighbour indices are clamped to the part of the tile that lies inside the
- * grid, so cells on the grid edge use themselves as the missing neighbour.
- *
- * time_elapsed is accepted but not used by the kernel. The local amb_temp is
- * independent of any host-side variable of the same name.
- */
-__global__ void calculate_temp(int iteration,   // number of iteration
-                               float *power,    // power input
-                               float *temp_src, // temperature input/output
-                               float *temp_dst, // temperature input/output
-                               int grid_cols,   // Col of grid
-                               int grid_rows,   // Row of grid
-                               int border_cols, // border offset
-                               int border_rows, // border offset
-                               float Cap,       // Capacitance
-                               float Rx, float Ry, float Rz, float step,
-                               float time_elapsed) {
-
-  __shared__ float temp_on_cuda[BLOCK_SIZE][BLOCK_SIZE];
-  __shared__ float power_on_cuda[BLOCK_SIZE][BLOCK_SIZE];
-  __shared__ float temp_t[BLOCK_SIZE]
-                         [BLOCK_SIZE]; // saving temporary temperature result
-
-  float amb_temp = 80.0;
-  float step_div_Cap;
-  float Rx_1, Ry_1, Rz_1;
-
-  int bx = blockIdx.x;
-  int by = blockIdx.y;
-
-  int tx = threadIdx.x;
-  int ty = threadIdx.y;
-
-  step_div_Cap = step / Cap;
-
-  // reciprocals of the resistances, computed once per thread
-  Rx_1 = 1 / Rx;
-  Ry_1 = 1 / Ry;
-  Rz_1 = 1 / Rz;
-
-  // each block finally computes result for a small block
-  // after N iterations.
-  // it is the non-overlapping small blocks that cover
-  // all the input data
-
-  // calculate the small block size
-  int small_block_rows = BLOCK_SIZE - iteration * 2; // EXPAND_RATE
-  int small_block_cols = BLOCK_SIZE - iteration * 2; // EXPAND_RATE
-
-  // calculate the boundary for the block according to
-  // the boundary of its small block
-  int blkY = small_block_rows * by - border_rows;
-  int blkX = small_block_cols * bx - border_cols;
-  int blkYmax = blkY + BLOCK_SIZE - 1;
-  int blkXmax = blkX + BLOCK_SIZE - 1;
-
-  // calculate the global thread coordination
-  int yidx = blkY + ty;
-  int xidx = blkX + tx;
-
-  // load data if it is within the valid input range
-  int loadYidx = yidx, loadXidx = xidx;
-  int index = grid_cols * loadYidx + loadXidx;
-
-  if (IN_RANGE(loadYidx, 0, grid_rows - 1) &&
-      IN_RANGE(loadXidx, 0, grid_cols - 1)) {
-    temp_on_cuda[ty][tx] = temp_src[index]; // Load the temperature data from
-                                            // global memory to shared memory
-    power_on_cuda[ty][tx] =
-        power[index]; // Load the power data from global memory to shared memory
-  }
-  __syncthreads(); // the whole tile must be loaded before any neighbour read
-
-  // effective range within this block that falls within
-  // the valid range of the input data
-  // used to rule out computation outside the boundary.
-  int validYmin = (blkY < 0) ? -blkY : 0;
-  int validYmax = (blkYmax > grid_rows - 1)
-                      ? BLOCK_SIZE - 1 - (blkYmax - grid_rows + 1)
-                      : BLOCK_SIZE - 1;
-  int validXmin = (blkX < 0) ? -blkX : 0;
-  int validXmax = (blkXmax > grid_cols - 1)
-                      ? BLOCK_SIZE - 1 - (blkXmax - grid_cols + 1)
-                      : BLOCK_SIZE - 1;
-
-  // tile-local neighbour coordinates, clamped to the valid range
-  int N = ty - 1;
-  int S = ty + 1;
-  int W = tx - 1;
-  int E = tx + 1;
-
-  N = (N < validYmin) ? validYmin : N;
-  S = (S > validYmax) ? validYmax : S;
-  W = (W < validXmin) ? validXmin : W;
-  E = (E > validXmax) ? validXmax : E;
-
-  // Initialized so that the final test is well defined even if the loop body
-  // never runs (iteration <= 0).
-  bool computed = false;
-  for (int i = 0; i < iteration; i++) {
-    computed = false;
-    if (IN_RANGE(tx, i + 1, BLOCK_SIZE - i - 2) &&
-        IN_RANGE(ty, i + 1, BLOCK_SIZE - i - 2) &&
-        IN_RANGE(tx, validXmin, validXmax) &&
-        IN_RANGE(ty, validYmin, validYmax)) {
-      computed = true;
-      temp_t[ty][tx] =
-          temp_on_cuda[ty][tx] +
-          step_div_Cap * (power_on_cuda[ty][tx] +
-                          (temp_on_cuda[S][tx] + temp_on_cuda[N][tx] -
-                           2.0 * temp_on_cuda[ty][tx]) *
-                              Ry_1 +
-                          (temp_on_cuda[ty][E] + temp_on_cuda[ty][W] -
-                           2.0 * temp_on_cuda[ty][tx]) *
-                              Rx_1 +
-                          (amb_temp - temp_on_cuda[ty][tx]) * Rz_1);
-    }
-    __syncthreads(); // all reads of temp_on_cuda finish before it is rewritten
-    if (i == iteration - 1)
-      break; // last step: keep the result in temp_t for the write-back below
-    if (computed) // Assign the computation range
-      temp_on_cuda[ty][tx] = temp_t[ty][tx];
-    __syncthreads(); // the copy-back is visible before the next step reads it
-  }
-
-  // update the global memory
-  // after the last iteration, only threads coordinated within the
-  // small block perform the calculation and switch on ``computed''
-  if (computed) {
-    temp_dst[index] = temp_t[ty][tx];
-  }
-}
-
-/*
-   compute N time steps
-*/
-
-/* Run the transient simulation on the device.
- *
- * Launches calculate_temp repeatedly, each launch advancing the grid by up to
- * num_iterations steps, until total_iterations steps have been requested.
- * The two buffers in MatrixTemp are swapped as source/destination before
- * every launch; the first launch reads MatrixTemp[0] and writes MatrixTemp[1].
- *
- *   MatrixPower       device buffer with the power grid
- *   MatrixTemp        two device buffers for the temperature grid
- *   col, row          grid dimensions
- *   total_iterations  total number of time steps
- *   num_iterations    time steps per kernel launch
- *   blockCols/Rows    launch grid dimensions
- *   borderCols/Rows   tile border offsets passed to the kernel
- *
- * Returns the index (0 or 1) of the MatrixTemp buffer written by the last
- * launch, or 0 if no launch happened.
- */
-int compute_tran_temp(float *MatrixPower, float *MatrixTemp[2], int col,
-                      int row, int total_iterations, int num_iterations,
-                      int blockCols, int blockRows, int borderCols,
-                      int borderRows) {
-  dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
-  dim3 dimGrid(blockCols, blockRows);
-
-  float grid_height = chip_height / row;
-  float grid_width = chip_width / col;
-
-  float Cap = FACTOR_CHIP * SPEC_HEAT_SI * t_chip * grid_width * grid_height;
-  float Rx = grid_width / (2.0 * K_SI * t_chip * grid_height);
-  float Ry = grid_height / (2.0 * K_SI * t_chip * grid_width);
-  float Rz = t_chip / (K_SI * grid_height * grid_width);
-
-  float max_slope = MAX_PD / (FACTOR_CHIP * t_chip * SPEC_HEAT_SI);
-  float step = PRECISION / max_slope;
-  float time_elapsed;
-  time_elapsed = 0.001;
-
-  int src = 1, dst = 0;
-
-  // Integer step counter: a float counter stops advancing once it exceeds
-  // 2^24 and would turn the per-launch step count into a float expression.
-  for (int t = 0; t < total_iterations; t += num_iterations) {
-    int temp = src;
-    src = dst;
-    dst = temp;
-    calculate_temp<<<dimGrid, dimBlock>>>(
-        MIN(num_iterations, total_iterations - t), MatrixPower, MatrixTemp[src],
-        MatrixTemp[dst], col, row, borderCols, borderRows, Cap, Rx, Ry, Rz,
-        step, time_elapsed);
-    cudaDeviceSynchronize();
-  }
-  return dst;
-}
-
-/* Print the command-line synopsis and a description of every argument on
- * stderr, then terminate the process with exit status 1. argc is unused. */
-void usage(int argc, char **argv) {
-  static const char *const argument_help[] = {
-      "\t<grid_rows/grid_cols>  - number of rows/cols in the grid "
-      "(positive integer)\n",
-      "\t<pyramid_height> - pyramid heigh(positive integer)\n",
-      "\t<sim_time>   - number of iterations\n",
-      "\t<temp_file>  - name of the file containing the initial "
-      "temperature values of each cell\n",
-      "\t<power_file> - name of the file containing the dissipated "
-      "power values of each cell\n",
-      "\t<output_file> - name of the output file\n",
-  };
-  const size_t help_count = sizeof(argument_help) / sizeof(argument_help[0]);
-
-  fprintf(stderr,
-          "Usage: %s <grid_rows/grid_cols> <pyramid_height> <sim_time> "
-          "<temp_file> <power_file> <output_file>\n",
-          argv[0]);
-
-  // The help lines contain no conversion specifiers, so emit them verbatim.
-  for (size_t k = 0; k < help_count; k++)
-    fputs(argument_help[k], stderr);
-
-  exit(1);
-}
-
-/* Entry point: select CUDA device 0, report the work-group size and hand the
- * command line to run(). Always returns EXIT_SUCCESS. */
-int main(int argc, char **argv) {
-  const int wg_edge = BLOCK_SIZE;
-
-  cudaSetDevice(0);
-  printf("WG size of kernel = %d X %d\n", wg_edge, wg_edge);
-
-  run(argc, argv);
-  return EXIT_SUCCESS;
-}
-
-/* Parse the command line, load the input grids, run the simulation on the
- * device and write the resulting temperature grid.
- *
- * Expected arguments (argc must be 7):
- *   argv[1]  grid size, used for both rows and columns (> 0)
- *   argv[2]  pyramid height, i.e. time steps per kernel launch (> 0)
- *   argv[3]  total number of time steps (> 0)
- *   argv[4]  temperature input file
- *   argv[5]  power input file
- *   argv[6]  output file
- *
- * Invalid arguments end the program through usage(). A pyramid height that
- * leaves no interior cells in a tile, or a host allocation failure, is
- * reported through fatal() and the function returns without computing.
- */
-void run(int argc, char **argv) {
-  int size;
-  int grid_rows, grid_cols;
-  float *FilesavingTemp, *FilesavingPower, *MatrixOut;
-  char *tfile, *pfile, *ofile;
-
-  int total_iterations = 60;
-  int pyramid_height = 1; // number of iterations
-
-  if (argc != 7)
-    usage(argc, argv);
-  if ((grid_rows = atoi(argv[1])) <= 0 || (grid_cols = atoi(argv[1])) <= 0 ||
-      (pyramid_height = atoi(argv[2])) <= 0 ||
-      (total_iterations = atoi(argv[3])) <= 0)
-    usage(argc, argv);
-
-  tfile = argv[4];
-  pfile = argv[5];
-  ofile = argv[6];
-
-  size = grid_rows * grid_cols;
-
-/* --------------- pyramid parameters --------------- */
-// add one iteration will extend the pyramid base by 2 per each borderline
-#define EXPAND_RATE 2
-  int borderCols = (pyramid_height)*EXPAND_RATE / 2;
-  int borderRows = (pyramid_height)*EXPAND_RATE / 2;
-  int smallBlockCol = BLOCK_SIZE - (pyramid_height)*EXPAND_RATE;
-  int smallBlockRow = BLOCK_SIZE - (pyramid_height)*EXPAND_RATE;
-
-  // The small block size is used as a divisor below; a pyramid that is too
-  // tall for the tile would make it zero or negative.
-  if (smallBlockCol <= 0 || smallBlockRow <= 0) {
-    fatal("pyramid_height is too large for BLOCK_SIZE");
-    return;
-  }
-
-  // number of tiles per dimension, rounded up
-  int blockCols =
-      grid_cols / smallBlockCol + ((grid_cols % smallBlockCol == 0) ? 0 : 1);
-  int blockRows =
-      grid_rows / smallBlockRow + ((grid_rows % smallBlockRow == 0) ? 0 : 1);
-
-  FilesavingTemp = (float *)malloc(size * sizeof(float));
-  FilesavingPower = (float *)malloc(size * sizeof(float));
-  MatrixOut = (float *)calloc(size, sizeof(float));
-
-  if (!FilesavingPower || !FilesavingTemp || !MatrixOut) {
-    fatal("unable to allocate memory");
-    // fatal() returns, so stop here instead of using null buffers.
-    free(FilesavingTemp);
-    free(FilesavingPower);
-    free(MatrixOut);
-    return;
-  }
-
-  printf("pyramidHeight: %d\ngridSize: [%d, %d]\nborder:[%d, "
-         "%d]\nblockGrid:[%d, %d]\ntargetBlock:[%d, %d]\n",
-         pyramid_height, grid_cols, grid_rows, borderCols, borderRows,
-         blockCols, blockRows, smallBlockCol, smallBlockRow);
-
-  readinput(FilesavingTemp, grid_rows, grid_cols, tfile);
-  readinput(FilesavingPower, grid_rows, grid_cols, pfile);
-
-  float *MatrixTemp[2], *MatrixPower;
-  cudaMalloc((void **)&MatrixTemp[0], sizeof(float) * size);
-  cudaMalloc((void **)&MatrixTemp[1], sizeof(float) * size);
-  cudaMemcpy(MatrixTemp[0], FilesavingTemp, sizeof(float) * size,
-             cudaMemcpyHostToDevice);
-
-  cudaMalloc((void **)&MatrixPower, sizeof(float) * size);
-  cudaMemcpy(MatrixPower, FilesavingPower, sizeof(float) * size,
-             cudaMemcpyHostToDevice);
-  printf("Start computing the transient temperature\n");
-  // ret selects which of the two temperature buffers holds the final result
-  int ret = compute_tran_temp(MatrixPower, MatrixTemp, grid_cols, grid_rows,
-                              total_iterations, pyramid_height, blockCols,
-                              blockRows, borderCols, borderRows);
-  printf("Ending simulation\n");
-  cudaMemcpy(MatrixOut, MatrixTemp[ret], sizeof(float) * size,
-             cudaMemcpyDeviceToHost);
-
-  writeoutput(MatrixOut, grid_rows, grid_cols, ofile);
-
-  cudaFree(MatrixPower);
-  cudaFree(MatrixTemp[0]);
-  cudaFree(MatrixTemp[1]);
-  free(MatrixOut);
-  // host staging buffers were previously never released
-  free(FilesavingTemp);
-  free(FilesavingPower);
-}
--- a/examples/hotspot/run.sh
+++ b/examples/hotspot/run.sh
@ -1,21 +0,0 @@
-#!/bin/bash
-set -e
-llvm-as hotspot-cuda-nvptx64-nvidia-cuda-sm_61.ll
-llvm-as hotspot-host-x86_64-unknown-linux-gnu.ll
-../../build/compilation/kernelTranslator hotspot-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
-../../build/compilation/hostTranslator hotspot-host-x86_64-unknown-linux-gnu.bc host.bc
-
-llc --relocation-model=pic --filetype=obj  kernel.bc
-llc --relocation-model=pic --filetype=obj  host.bc
-
-g++ -Wall -L../../build/runtime  -L../../build/runtime/threadPool \
-    -o hotspot -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread
-
-export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH
-./hotspot 512 2 2 ../../rodinia-data/hotspot/temp_512 ../../rodinia-data/hotspot/power_512 output.out
-if head output.out | grep -q "323.829"; then
-    echo "Pass"
-else
-    echo "Error result"
-    exit 1
-fi
--- a/examples/hotspot3D/3D-cuda-nvptx64-nvidia-cuda-sm_61.ll
+++ b/examples/hotspot3D/3D-cuda-nvptx64-nvidia-cuda-sm_61.ll
@ -1,587 +0,0 @@
-; ModuleID = '3D-cuda-nvptx64-nvidia-cuda-sm_61.bc'
-source_filename = "3D.cu"
-target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
-target triple = "nvptx64-nvidia-cuda"
-
-%struct.__cuda_builtin_blockDim_t = type { i8 }
-%struct.__cuda_builtin_blockIdx_t = type { i8 }
-%struct.__cuda_builtin_threadIdx_t = type { i8 }
-%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
-
-$_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv = comdat any
-
-$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
-
-$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
-
-$_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv = comdat any
-
-$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv = comdat any
-
-$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv = comdat any
-
-@blockDim = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockDim_t, align 1
-@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
-@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
-
-; Function Attrs: convergent noinline nounwind optnone
-; Weak stub definition of cudaMalloc in this module: it only spills both
-; arguments to stack slots and unconditionally returns 999. No memory is
-; allocated and %p is never written through.
-define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
-entry:
-  %p.addr = alloca i8**, align 8
-  %s.addr = alloca i64, align 8
-  store i8** %p, i8*** %p.addr, align 8
-  store i64 %s, i64* %s.addr, align 8
-  ret i32 999
-}
-
-; Function Attrs: convergent noinline nounwind optnone
-; Weak stub definition of cudaFuncGetAttributes: spills its two arguments to
-; stack slots and unconditionally returns 999. The attribute struct pointed
-; to by %p is not filled in.
-define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
-entry:
-  %p.addr = alloca %struct.cudaFuncAttributes*, align 8
-  %c.addr = alloca i8*, align 8
-  store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
-  store i8* %c, i8** %c.addr, align 8
-  ret i32 999
-}
-
-; Function Attrs: convergent noinline nounwind optnone
-; Weak stub definition of cudaDeviceGetAttribute: spills its three arguments
-; to stack slots and unconditionally returns 999. Nothing is stored through
-; %value.
-define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
-entry:
-  %value.addr = alloca i32*, align 8
-  %attr.addr = alloca i32, align 4
-  %device.addr = alloca i32, align 4
-  store i32* %value, i32** %value.addr, align 8
-  store i32 %attr, i32* %attr.addr, align 4
-  store i32 %device, i32* %device.addr, align 4
-  ret i32 999
-}
-
-; Function Attrs: convergent noinline nounwind optnone
-; Weak stub definition of cudaGetDevice: spills its argument to a stack slot
-; and unconditionally returns 999. Nothing is stored through %device.
-define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
-entry:
-  %device.addr = alloca i32*, align 8
-  store i32* %device, i32** %device.addr, align 8
-  ret i32 999
-}
-
-; Function Attrs: convergent noinline nounwind optnone
-; Weak stub definition of cudaOccupancyMaxActiveBlocksPerMultiprocessor:
-; spills its four arguments to stack slots and unconditionally returns 999.
-; Nothing is stored through %numBlocks.
-define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
-entry:
-  %numBlocks.addr = alloca i32*, align 8
-  %func.addr = alloca i8*, align 8
-  %blockSize.addr = alloca i32, align 4
-  %dynamicSmemSize.addr = alloca i64, align 8
-  store i32* %numBlocks, i32** %numBlocks.addr, align 8
-  store i8* %func, i8** %func.addr, align 8
-  store i32 %blockSize, i32* %blockSize.addr, align 4
-  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
-  ret i32 999
-}
-
-; Function Attrs: convergent noinline nounwind optnone
-; Weak stub definition of
-; cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags: spills its five
-; arguments to stack slots and unconditionally returns 999. Nothing is
-; stored through %numBlocks.
-define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
-entry:
-  %numBlocks.addr = alloca i32*, align 8
-  %func.addr = alloca i8*, align 8
-  %blockSize.addr = alloca i32, align 4
-  %dynamicSmemSize.addr = alloca i64, align 8
-  %flags.addr = alloca i32, align 4
-  store i32* %numBlocks, i32** %numBlocks.addr, align 8
-  store i8* %func, i8** %func.addr, align 8
-  store i32 %blockSize, i32* %blockSize.addr, align 4
-  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
-  store i32 %flags, i32* %flags.addr, align 4
-  ret i32 999
-}
-
-; Function Attrs: convergent noinline nounwind optnone
-define dso_local void @_Z11hotspotOpt1PfS_S_fiiifffffff(float* %p, float* %tIn, float* %tOut, float %sdc, i32 %nx, i32 %ny, i32 %nz, float %ce, float %cw, float %cn, float %cs, float %ct, float %cb, float %cc) #0 {
-entry:
-  %p.addr = alloca float*, align 8
-  %tIn.addr = alloca float*, align 8
-  %tOut.addr = alloca float*, align 8
-  %sdc.addr = alloca float, align 4
-  %nx.addr = alloca i32, align 4
-  %ny.addr = alloca i32, align 4
-  %nz.addr = alloca i32, align 4
-  %ce.addr = alloca float, align 4
-  %cw.addr = alloca float, align 4
-  %cn.addr = alloca float, align 4
-  %cs.addr = alloca float, align 4
-  %ct.addr = alloca float, align 4
-  %cb.addr = alloca float, align 4
-  %cc.addr = alloca float, align 4
-  %amb_temp = alloca float, align 4
-  %i = alloca i32, align 4
-  %j = alloca i32, align 4
-  %c = alloca i32, align 4
-  %xy = alloca i32, align 4
-  %W = alloca i32, align 4
-  %E = alloca i32, align 4
-  %N = alloca i32, align 4
-  %S = alloca i32, align 4
-  %temp1 = alloca float, align 4
-  %temp2 = alloca float, align 4
-  %temp3 = alloca float, align 4
-  %k = alloca i32, align 4
-  store float* %p, float** %p.addr, align 8
-  store float* %tIn, float** %tIn.addr, align 8
-  store float* %tOut, float** %tOut.addr, align 8
-  store float %sdc, float* %sdc.addr, align 4
-  store i32 %nx, i32* %nx.addr, align 4
-  store i32 %ny, i32* %ny.addr, align 4
-  store i32 %nz, i32* %nz.addr, align 4
-  store float %ce, float* %ce.addr, align 4
-  store float %cw, float* %cw.addr, align 4
-  store float %cn, float* %cn.addr, align 4
-  store float %cs, float* %cs.addr, align 4
-  store float %ct, float* %ct.addr, align 4
-  store float %cb, float* %cb.addr, align 4
-  store float %cc, float* %cc.addr, align 4
-  store float 8.000000e+01, float* %amb_temp, align 4
-  %call = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
-  %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
-  %mul = mul i32 %call, %call1
-  %call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
-  %add = add i32 %mul, %call2
-  store i32 %add, i32* %i, align 4
-  %call3 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #3
-  %call4 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #3
-  %mul5 = mul i32 %call3, %call4
-  %call6 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #3
-  %add7 = add i32 %mul5, %call6
-  store i32 %add7, i32* %j, align 4
-  %0 = load i32, i32* %i, align 4
-  %1 = load i32, i32* %j, align 4
-  %2 = load i32, i32* %nx.addr, align 4
-  %mul8 = mul nsw i32 %1, %2
-  %add9 = add nsw i32 %0, %mul8
-  store i32 %add9, i32* %c, align 4
-  %3 = load i32, i32* %nx.addr, align 4
-  %4 = load i32, i32* %ny.addr, align 4
-  %mul10 = mul nsw i32 %3, %4
-  store i32 %mul10, i32* %xy, align 4
-  %5 = load i32, i32* %i, align 4
-  %cmp = icmp eq i32 %5, 0
-  br i1 %cmp, label %cond.true, label %cond.false
-
-cond.true:                                        ; preds = %entry
-  %6 = load i32, i32* %c, align 4
-  br label %cond.end
-
-cond.false:                                       ; preds = %entry
-  %7 = load i32, i32* %c, align 4
-  %sub = sub nsw i32 %7, 1
-  br label %cond.end
-
-cond.end:                                         ; preds = %cond.false, %cond.true
-  %cond = phi i32 [ %6, %cond.true ], [ %sub, %cond.false ]
-  store i32 %cond, i32* %W, align 4
-  %8 = load i32, i32* %i, align 4
-  %9 = load i32, i32* %nx.addr, align 4
-  %sub11 = sub nsw i32 %9, 1
-  %cmp12 = icmp eq i32 %8, %sub11
-  br i1 %cmp12, label %cond.true13, label %cond.false14
-
-cond.true13:                                      ; preds = %cond.end
-  %10 = load i32, i32* %c, align 4
-  br label %cond.end16
-
-cond.false14:                                     ; preds = %cond.end
-  %11 = load i32, i32* %c, align 4
-  %add15 = add nsw i32 %11, 1
-  br label %cond.end16
-
-cond.end16:                                       ; preds = %cond.false14, %cond.true13
-  %cond17 = phi i32 [ %10, %cond.true13 ], [ %add15, %cond.false14 ]
-  store i32 %cond17, i32* %E, align 4
-  %12 = load i32, i32* %j, align 4
-  %cmp18 = icmp eq i32 %12, 0
-  br i1 %cmp18, label %cond.true19, label %cond.false20
-
-cond.true19:                                      ; preds = %cond.end16
-  %13 = load i32, i32* %c, align 4
-  br label %cond.end22
-
-cond.false20:                                     ; preds = %cond.end16
-  %14 = load i32, i32* %c, align 4
-  %15 = load i32, i32* %nx.addr, align 4
-  %sub21 = sub nsw i32 %14, %15
-  br label %cond.end22
-
-cond.end22:                                       ; preds = %cond.false20, %cond.true19
-  %cond23 = phi i32 [ %13, %cond.true19 ], [ %sub21, %cond.false20 ]
-  store i32 %cond23, i32* %N, align 4
-  %16 = load i32, i32* %j, align 4
-  %17 = load i32, i32* %ny.addr, align 4
-  %sub24 = sub nsw i32 %17, 1
-  %cmp25 = icmp eq i32 %16, %sub24
-  br i1 %cmp25, label %cond.true26, label %cond.false27
-
-cond.true26:                                      ; preds = %cond.end22
-  %18 = load i32, i32* %c, align 4
-  br label %cond.end29
-
-cond.false27:                                     ; preds = %cond.end22
-  %19 = load i32, i32* %c, align 4
-  %20 = load i32, i32* %nx.addr, align 4
-  %add28 = add nsw i32 %19, %20
-  br label %cond.end29
-
-cond.end29:                                       ; preds = %cond.false27, %cond.true26
-  %cond30 = phi i32 [ %18, %cond.true26 ], [ %add28, %cond.false27 ]
-  store i32 %cond30, i32* %S, align 4
-  %21 = load float*, float** %tIn.addr, align 8
-  %22 = load i32, i32* %c, align 4
-  %idxprom = sext i32 %22 to i64
-  %arrayidx = getelementptr inbounds float, float* %21, i64 %idxprom
-  %23 = load float, float* %arrayidx, align 4
-  store float %23, float* %temp2, align 4
-  store float %23, float* %temp1, align 4
-  %24 = load float*, float** %tIn.addr, align 8
-  %25 = load i32, i32* %c, align 4
-  %26 = load i32, i32* %xy, align 4
-  %add31 = add nsw i32 %25, %26
-  %idxprom32 = sext i32 %add31 to i64
-  %arrayidx33 = getelementptr inbounds float, float* %24, i64 %idxprom32
-  %27 = load float, float* %arrayidx33, align 4
-  store float %27, float* %temp3, align 4
-  %28 = load float, float* %cc.addr, align 4
-  %29 = load float, float* %temp2, align 4
-  %mul34 = fmul contract float %28, %29
-  %30 = load float, float* %cw.addr, align 4
-  %31 = load float*, float** %tIn.addr, align 8
-  %32 = load i32, i32* %W, align 4
-  %idxprom35 = sext i32 %32 to i64
-  %arrayidx36 = getelementptr inbounds float, float* %31, i64 %idxprom35
-  %33 = load float, float* %arrayidx36, align 4
-  %mul37 = fmul contract float %30, %33
-  %add38 = fadd contract float %mul34, %mul37
-  %34 = load float, float* %ce.addr, align 4
-  %35 = load float*, float** %tIn.addr, align 8
-  %36 = load i32, i32* %E, align 4
-  %idxprom39 = sext i32 %36 to i64
-  %arrayidx40 = getelementptr inbounds float, float* %35, i64 %idxprom39
-  %37 = load float, float* %arrayidx40, align 4
-  %mul41 = fmul contract float %34, %37
-  %add42 = fadd contract float %add38, %mul41
-  %38 = load float, float* %cs.addr, align 4
-  %39 = load float*, float** %tIn.addr, align 8
-  %40 = load i32, i32* %S, align 4
-  %idxprom43 = sext i32 %40 to i64
-  %arrayidx44 = getelementptr inbounds float, float* %39, i64 %idxprom43
-  %41 = load float, float* %arrayidx44, align 4
-  %mul45 = fmul contract float %38, %41
-  %add46 = fadd contract float %add42, %mul45
-  %42 = load float, float* %cn.addr, align 4
-  %43 = load float*, float** %tIn.addr, align 8
-  %44 = load i32, i32* %N, align 4
-  %idxprom47 = sext i32 %44 to i64
-  %arrayidx48 = getelementptr inbounds float, float* %43, i64 %idxprom47
-  %45 = load float, float* %arrayidx48, align 4
-  %mul49 = fmul contract float %42, %45
-  %add50 = fadd contract float %add46, %mul49
-  %46 = load float, float* %cb.addr, align 4
-  %47 = load float, float* %temp1, align 4
-  %mul51 = fmul contract float %46, %47
-  %add52 = fadd contract float %add50, %mul51
-  %48 = load float, float* %ct.addr, align 4
-  %49 = load float, float* %temp3, align 4
-  %mul53 = fmul contract float %48, %49
-  %add54 = fadd contract float %add52, %mul53
-  %50 = load float, float* %sdc.addr, align 4
-  %51 = load float*, float** %p.addr, align 8
-  %52 = load i32, i32* %c, align 4
-  %idxprom55 = sext i32 %52 to i64
-  %arrayidx56 = getelementptr inbounds float, float* %51, i64 %idxprom55
-  %53 = load float, float* %arrayidx56, align 4
-  %mul57 = fmul contract float %50, %53
-  %add58 = fadd contract float %add54, %mul57
-  %54 = load float, float* %ct.addr, align 4
-  %55 = load float, float* %amb_temp, align 4
-  %mul59 = fmul contract float %54, %55
-  %add60 = fadd contract float %add58, %mul59
-  %56 = load float*, float** %tOut.addr, align 8
-  %57 = load i32, i32* %c, align 4
-  %idxprom61 = sext i32 %57 to i64
-  %arrayidx62 = getelementptr inbounds float, float* %56, i64 %idxprom61
-  store float %add60, float* %arrayidx62, align 4
-  %58 = load i32, i32* %xy, align 4
-  %59 = load i32, i32* %c, align 4
-  %add63 = add nsw i32 %59, %58
-  store i32 %add63, i32* %c, align 4
-  %60 = load i32, i32* %xy, align 4
-  %61 = load i32, i32* %W, align 4
-  %add64 = add nsw i32 %61, %60
-  store i32 %add64, i32* %W, align 4
-  %62 = load i32, i32* %xy, align 4
-  %63 = load i32, i32* %E, align 4
-  %add65 = add nsw i32 %63, %62
-  store i32 %add65, i32* %E, align 4
-  %64 = load i32, i32* %xy, align 4
-  %65 = load i32, i32* %N, align 4
-  %add66 = add nsw i32 %65, %64
-  store i32 %add66, i32* %N, align 4
-  %66 = load i32, i32* %xy, align 4
-  %67 = load i32, i32* %S, align 4
-  %add67 = add nsw i32 %67, %66
-  store i32 %add67, i32* %S, align 4
-  store i32 1, i32* %k, align 4
-  br label %for.cond
-
-for.cond:                                         ; preds = %for.inc, %cond.end29
-  %68 = load i32, i32* %k, align 4
-  %69 = load i32, i32* %nz.addr, align 4
-  %sub68 = sub nsw i32 %69, 1
-  %cmp69 = icmp slt i32 %68, %sub68
-  br i1 %cmp69, label %for.body, label %for.end
-
-for.body:                                         ; preds = %for.cond
-  %70 = load float, float* %temp2, align 4
-  store float %70, float* %temp1, align 4
-  %71 = load float, float* %temp3, align 4
-  store float %71, float* %temp2, align 4
-  %72 = load float*, float** %tIn.addr, align 8
-  %73 = load i32, i32* %c, align 4
-  %74 = load i32, i32* %xy, align 4
-  %add70 = add nsw i32 %73, %74
-  %idxprom71 = sext i32 %add70 to i64
-  %arrayidx72 = getelementptr inbounds float, float* %72, i64 %idxprom71
-  %75 = load float, float* %arrayidx72, align 4
-  store float %75, float* %temp3, align 4
-  %76 = load float, float* %cc.addr, align 4
-  %77 = load float, float* %temp2, align 4
-  %mul73 = fmul contract float %76, %77
-  %78 = load float, float* %cw.addr, align 4
-  %79 = load float*, float** %tIn.addr, align 8
-  %80 = load i32, i32* %W, align 4
-  %idxprom74 = sext i32 %80 to i64
-  %arrayidx75 = getelementptr inbounds float, float* %79, i64 %idxprom74
-  %81 = load float, float* %arrayidx75, align 4
-  %mul76 = fmul contract float %78, %81
-  %add77 = fadd contract float %mul73, %mul76
-  %82 = load float, float* %ce.addr, align 4
-  %83 = load float*, float** %tIn.addr, align 8
-  %84 = load i32, i32* %E, align 4
-  %idxprom78 = sext i32 %84 to i64
-  %arrayidx79 = getelementptr inbounds float, float* %83, i64 %idxprom78
-  %85 = load float, float* %arrayidx79, align 4
-  %mul80 = fmul contract float %82, %85
-  %add81 = fadd contract float %add77, %mul80
-  %86 = load float, float* %cs.addr, align 4
-  %87 = load float*, float** %tIn.addr, align 8
-  %88 = load i32, i32* %S, align 4
-  %idxprom82 = sext i32 %88 to i64
-  %arrayidx83 = getelementptr inbounds float, float* %87, i64 %idxprom82
-  %89 = load float, float* %arrayidx83, align 4
-  %mul84 = fmul contract float %86, %89
-  %add85 = fadd contract float %add81, %mul84
-  %90 = load float, float* %cn.addr, align 4
-  %91 = load float*, float** %tIn.addr, align 8
-  %92 = load i32, i32* %N, align 4
-  %idxprom86 = sext i32 %92 to i64
-  %arrayidx87 = getelementptr inbounds float, float* %91, i64 %idxprom86
-  %93 = load float, float* %arrayidx87, align 4
-  %mul88 = fmul contract float %90, %93
-  %add89 = fadd contract float %add85, %mul88
-  %94 = load float, float* %cb.addr, align 4
-  %95 = load float, float* %temp1, align 4
-  %mul90 = fmul contract float %94, %95
-  %add91 = fadd contract float %add89, %mul90
-  %96 = load float, float* %ct.addr, align 4
-  %97 = load float, float* %temp3, align 4
-  %mul92 = fmul contract float %96, %97
-  %add93 = fadd contract float %add91, %mul92
-  %98 = load float, float* %sdc.addr, align 4
-  %99 = load float*, float** %p.addr, align 8
-  %100 = load i32, i32* %c, align 4
-  %idxprom94 = sext i32 %100 to i64
-  %arrayidx95 = getelementptr inbounds float, float* %99, i64 %idxprom94
-  %101 = load float, float* %arrayidx95, align 4
-  %mul96 = fmul contract float %98, %101
-  %add97 = fadd contract float %add93, %mul96
-  %102 = load float, float* %ct.addr, align 4
-  %103 = load float, float* %amb_temp, align 4
-  %mul98 = fmul contract float %102, %103
-  %add99 = fadd contract float %add97, %mul98
-  %104 = load float*, float** %tOut.addr, align 8
-  %105 = load i32, i32* %c, align 4
-  %idxprom100 = sext i32 %105 to i64
-  %arrayidx101 = getelementptr inbounds float, float* %104, i64 %idxprom100
-  store float %add99, float* %arrayidx101, align 4
-  %106 = load i32, i32* %xy, align 4
-  %107 = load i32, i32* %c, align 4
-  %add102 = add nsw i32 %107, %106
-  store i32 %add102, i32* %c, align 4
-  %108 = load i32, i32* %xy, align 4
-  %109 = load i32, i32* %W, align 4
-  %add103 = add nsw i32 %109, %108
-  store i32 %add103, i32* %W, align 4
-  %110 = load i32, i32* %xy, align 4
-  %111 = load i32, i32* %E, align 4
-  %add104 = add nsw i32 %111, %110
-  store i32 %add104, i32* %E, align 4
-  %112 = load i32, i32* %xy, align 4
-  %113 = load i32, i32* %N, align 4
-  %add105 = add nsw i32 %113, %112
-  store i32 %add105, i32* %N, align 4
-  %114 = load i32, i32* %xy, align 4
-  %115 = load i32, i32* %S, align 4
-  %add106 = add nsw i32 %115, %114
-  store i32 %add106, i32* %S, align 4
-  br label %for.inc
-
-for.inc:                                          ; preds = %for.body
-  %116 = load i32, i32* %k, align 4
-  %inc = add nsw i32 %116, 1
-  store i32 %inc, i32* %k, align 4
-  br label %for.cond
-
-for.end:                                          ; preds = %for.cond
-  %117 = load float, float* %temp2, align 4
-  store float %117, float* %temp1, align 4
-  %118 = load float, float* %temp3, align 4
-  store float %118, float* %temp2, align 4
-  %119 = load float, float* %cc.addr, align 4
-  %120 = load float, float* %temp2, align 4
-  %mul107 = fmul contract float %119, %120
-  %121 = load float, float* %cw.addr, align 4
-  %122 = load float*, float** %tIn.addr, align 8
-  %123 = load i32, i32* %W, align 4
-  %idxprom108 = sext i32 %123 to i64
-  %arrayidx109 = getelementptr inbounds float, float* %122, i64 %idxprom108
-  %124 = load float, float* %arrayidx109, align 4
-  %mul110 = fmul contract float %121, %124
-  %add111 = fadd contract float %mul107, %mul110
-  %125 = load float, float* %ce.addr, align 4
-  %126 = load float*, float** %tIn.addr, align 8
-  %127 = load i32, i32* %E, align 4
-  %idxprom112 = sext i32 %127 to i64
-  %arrayidx113 = getelementptr inbounds float, float* %126, i64 %idxprom112
-  %128 = load float, float* %arrayidx113, align 4
-  %mul114 = fmul contract float %125, %128
-  %add115 = fadd contract float %add111, %mul114
-  %129 = load float, float* %cs.addr, align 4
-  %130 = load float*, float** %tIn.addr, align 8
-  %131 = load i32, i32* %S, align 4
-  %idxprom116 = sext i32 %131 to i64
-  %arrayidx117 = getelementptr inbounds float, float* %130, i64 %idxprom116
-  %132 = load float, float* %arrayidx117, align 4
-  %mul118 = fmul contract float %129, %132
-  %add119 = fadd contract float %add115, %mul118
-  %133 = load float, float* %cn.addr, align 4
-  %134 = load float*, float** %tIn.addr, align 8
-  %135 = load i32, i32* %N, align 4
-  %idxprom120 = sext i32 %135 to i64
-  %arrayidx121 = getelementptr inbounds float, float* %134, i64 %idxprom120
-  %136 = load float, float* %arrayidx121, align 4
-  %mul122 = fmul contract float %133, %136
-  %add123 = fadd contract float %add119, %mul122
-  %137 = load float, float* %cb.addr, align 4
-  %138 = load float, float* %temp1, align 4
-  %mul124 = fmul contract float %137, %138
-  %add125 = fadd contract float %add123, %mul124
-  %139 = load float, float* %ct.addr, align 4
-  %140 = load float, float* %temp3, align 4
-  %mul126 = fmul contract float %139, %140
-  %add127 = fadd contract float %add125, %mul126
-  %141 = load float, float* %sdc.addr, align 4
-  %142 = load float*, float** %p.addr, align 8
-  %143 = load i32, i32* %c, align 4
-  %idxprom128 = sext i32 %143 to i64
-  %arrayidx129 = getelementptr inbounds float, float* %142, i64 %idxprom128
-  %144 = load float, float* %arrayidx129, align 4
-  %mul130 = fmul contract float %141, %144
-  %add131 = fadd contract float %add127, %mul130
-  %145 = load float, float* %ct.addr, align 4
-  %146 = load float, float* %amb_temp, align 4
-  %mul132 = fmul contract float %145, %146
-  %add133 = fadd contract float %add131, %mul132
-  %147 = load float*, float** %tOut.addr, align 8
-  %148 = load i32, i32* %c, align 4
-  %idxprom134 = sext i32 %148 to i64
-  %arrayidx135 = getelementptr inbounds float, float* %147, i64 %idxprom134
-  store float %add133, float* %arrayidx135, align 4
-  ret void
-}
-
-; Function Attrs: alwaysinline convergent nounwind
-define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #1 comdat align 2 {
-entry:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-  ret i32 %0
-}
-
-; Function Attrs: alwaysinline convergent nounwind
-define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
-entry:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
-  ret i32 %0
-}
-
-; Function Attrs: alwaysinline convergent nounwind
-define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
-entry:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  ret i32 %0
-}
-
-; Function Attrs: alwaysinline convergent nounwind
-define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #1 comdat align 2 {
-entry:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
-  ret i32 %0
-}
-
-; Function Attrs: alwaysinline convergent nounwind
-define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
-entry:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
-  ret i32 %0
-}
-
-; Function Attrs: alwaysinline convergent nounwind
-define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
-entry:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
-  ret i32 %0
-}
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y() #2
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #2
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #2
-
-attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { convergent nounwind }
-
-!llvm.module.flags = !{!0, !1, !2}
-!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
-!llvm.ident = !{!8}
-!nvvmir.version = !{!9}
-
-!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
-!1 = !{i32 1, !"wchar_size", i32 4}
-!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
-!3 = !{void (float*, float*, float*, float, i32, i32, i32, float, float, float, float, float, float, float)* @_Z11hotspotOpt1PfS_S_fiiifffffff, !"kernel", i32 1}
-!4 = !{null, !"align", i32 8}
-!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
-!6 = !{null, !"align", i32 16}
-!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
-!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
-!9 = !{i32 1, i32 4}
--- a/examples/hotspot3D/3D-host-x86_64-unknown-linux-gnu.ll
+++ b/examples/hotspot3D/3D-host-x86_64-unknown-linux-gnu.ll
--- a/examples/hotspot3D/3D.cu
+++ b/examples/hotspot3D/3D.cu
@ -1,205 +0,0 @@
-#include <assert.h>
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/time.h>
-#include <time.h>
-
-#define BLOCK_SIZE 16
-#define STR_SIZE 256
-
-#define block_x_ 128
-#define block_y_ 2
-#define block_z_ 1
-#define MAX_PD (3.0e6)
-/* required precision in degrees	*/
-#define PRECISION 0.001
-#define SPEC_HEAT_SI 1.75e6
-#define K_SI 100
-/* capacitance fitting factor	*/
-#define FACTOR_CHIP 0.5
-
-#include "opt1.cu"
-
-/* chip parameters	*/
-float t_chip = 0.0005;
-float chip_height = 0.016;
-float chip_width = 0.016; /* ambient temperature, assuming no package at all
-                           */
-float amb_temp = 80.0;
-
-void fatal(const char *s) { fprintf(stderr, "Error: %s\n", s); }
-
-void readinput(float *vect, int grid_rows, int grid_cols, int layers,
-               char *file) {
-  int i, j, k;
-  FILE *fp;
-  char str[STR_SIZE];
-  float val;
-
-  if ((fp = fopen(file, "r")) == 0)
-    fatal("The file was not opened");
-
-  for (i = 0; i <= grid_rows - 1; i++)
-    for (j = 0; j <= grid_cols - 1; j++)
-      for (k = 0; k <= layers - 1; k++) {
-        if (fgets(str, STR_SIZE, fp) == NULL)
-          fatal("Error reading file\n");
-        if (feof(fp))
-          fatal("not enough lines in file");
-        if ((sscanf(str, "%f", &val) != 1))
-          fatal("invalid file format");
-        vect[i * grid_cols + j + k * grid_rows * grid_cols] = val;
-      }
-
-  fclose(fp);
-}
-
-void writeoutput(float *vect, int grid_rows, int grid_cols, int layers,
-                 char *file) {
-
-  int i, j, k, index = 0;
-  FILE *fp;
-  char str[STR_SIZE];
-
-  if ((fp = fopen(file, "w")) == 0)
-    printf("The file was not opened\n");
-
-  for (i = 0; i < grid_rows; i++)
-    for (j = 0; j < grid_cols; j++)
-      for (k = 0; k < layers; k++) {
-        sprintf(str, "%d\t%g\n", index,
-                vect[i * grid_cols + j + k * grid_rows * grid_cols]);
-        fputs(str, fp);
-        index++;
-      }
-
-  fclose(fp);
-}
-
-void computeTempCPU(float *pIn, float *tIn, float *tOut, int nx, int ny, int nz,
-                    float Cap, float Rx, float Ry, float Rz, float dt,
-                    int numiter) {
-  float ce, cw, cn, cs, ct, cb, cc;
-  float stepDivCap = dt / Cap;
-  ce = cw = stepDivCap / Rx;
-  cn = cs = stepDivCap / Ry;
-  ct = cb = stepDivCap / Rz;
-
-  cc = 1.0 - (2.0 * ce + 2.0 * cn + 3.0 * ct);
-
-  int c, w, e, n, s, b, t;
-  int x, y, z;
-  int i = 0;
-  do {
-    for (z = 0; z < nz; z++)
-      for (y = 0; y < ny; y++)
-        for (x = 0; x < nx; x++) {
-          c = x + y * nx + z * nx * ny;
-
-          w = (x == 0) ? c : c - 1;
-          e = (x == nx - 1) ? c : c + 1;
-          n = (y == 0) ? c : c - nx;
-          s = (y == ny - 1) ? c : c + nx;
-          b = (z == 0) ? c : c - nx * ny;
-          t = (z == nz - 1) ? c : c + nx * ny;
-
-          tOut[c] = tIn[c] * cc + tIn[n] * cn + tIn[s] * cs + tIn[e] * ce +
-                    tIn[w] * cw + tIn[t] * ct + tIn[b] * cb +
-                    (dt / Cap) * pIn[c] + ct * amb_temp;
-        }
-    float *temp = tIn;
-    tIn = tOut;
-    tOut = temp;
-    i++;
-  } while (i < numiter);
-}
-
-float accuracy(float *arr1, float *arr2, int len) {
-  float err = 0.0;
-  int i;
-  for (i = 0; i < len; i++) {
-    err += (arr1[i] - arr2[i]) * (arr1[i] - arr2[i]);
-  }
-
-  return (float)sqrt(err / len);
-}
-
-void usage(int argc, char **argv) {
-  fprintf(stderr,
-          "Usage: %s <rows/cols> <layers> <iterations> <powerFile> <tempFile> "
-          "<outputFile>\n",
-          argv[0]);
-  fprintf(
-      stderr,
-      "\t<rows/cols>  - number of rows/cols in the grid (positive integer)\n");
-  fprintf(stderr,
-          "\t<layers>  - number of layers in the grid (positive integer)\n");
-
-  fprintf(stderr, "\t<iteration> - number of iterations\n");
-  fprintf(stderr, "\t<powerFile>  - name of the file containing the initial "
-                  "power values of each cell\n");
-  fprintf(stderr, "\t<tempFile>  - name of the file containing the initial "
-                  "temperature values of each cell\n");
-  fprintf(stderr, "\t<outputFile - output file\n");
-  exit(1);
-}
-
-int main(int argc, char **argv) {
-  cudaSetDevice(0);
-  if (argc != 7) {
-    usage(argc, argv);
-  }
-
-  char *pfile, *tfile, *ofile;
-  int iterations = atoi(argv[3]);
-
-  pfile = argv[4];
-  tfile = argv[5];
-  ofile = argv[6];
-  int numCols = atoi(argv[1]);
-  int numRows = atoi(argv[1]);
-  int layers = atoi(argv[2]);
-
-  /* calculating parameters*/
-
-  float dx = chip_height / numRows;
-  float dy = chip_width / numCols;
-  float dz = t_chip / layers;
-
-  float Cap = FACTOR_CHIP * SPEC_HEAT_SI * t_chip * dx * dy;
-  float Rx = dy / (2.0 * K_SI * t_chip * dx);
-  float Ry = dx / (2.0 * K_SI * t_chip * dy);
-  float Rz = dz / (K_SI * dx * dy);
-
-  float max_slope = MAX_PD / (FACTOR_CHIP * t_chip * SPEC_HEAT_SI);
-  float dt = PRECISION / max_slope;
-
-  float *powerIn, *tempOut, *tempIn, *tempCopy;
-  int size = numCols * numRows * layers;
-
-  powerIn = (float *)calloc(size, sizeof(float));
-  tempCopy = (float *)malloc(size * sizeof(float));
-  tempIn = (float *)calloc(size, sizeof(float));
-  tempOut = (float *)calloc(size, sizeof(float));
-  float *answer = (float *)calloc(size, sizeof(float));
-
-  readinput(powerIn, numRows, numCols, layers, pfile);
-  readinput(tempIn, numRows, numCols, layers, tfile);
-
-  memcpy(tempCopy, tempIn, size * sizeof(float));
-
-  hotspot_opt1(powerIn, tempIn, tempOut, numCols, numRows, layers, Cap, Rx, Ry,
-               Rz, dt, iterations);
-
-  computeTempCPU(powerIn, tempCopy, answer, numCols, numRows, layers, Cap, Rx,
-                 Ry, Rz, dt, iterations);
-
-  float acc = accuracy(tempOut, answer, numRows * numCols * layers);
-  printf("Accuracy: %e\n", acc);
-  writeoutput(tempOut, numRows, numCols, layers, ofile);
-  free(tempIn);
-  free(tempOut);
-  free(powerIn);
-  return 0;
-}
--- a/examples/hotspot3D/run.sh
+++ b/examples/hotspot3D/run.sh
@ -1,22 +0,0 @@
-# # #!/bin/bash
-set -e
-llvm-as 3D-cuda-nvptx64-nvidia-cuda-sm_61.ll
-llvm-as 3D-host-x86_64-unknown-linux-gnu.ll
-../../build/compilation/kernelTranslator 3D-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
-../../build/compilation/hostTranslator 3D-host-x86_64-unknown-linux-gnu.bc host.bc
-
-llc --relocation-model=pic --filetype=obj  kernel.bc
-llc --relocation-model=pic --filetype=obj  host.bc
-
-g++ -g -Wall -L../../build/runtime  -L../../build/runtime/threadPool -o 3D \
-    -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread
-
-export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH
-./3D 512 8 100 ../../rodinia-data/hotspot3D/power_512x8 ../../rodinia-data/hotspot3D/temp_512x8 output.out
-
-if head output.out | grep -q "334.017"; then
-    echo "Pass"
-else
-    echo "Error result"
-    exit 1
-fi
--- a/examples/huffman/comparison_helpers.h
+++ b/examples/huffman/comparison_helpers.h
@ -1,24 +0,0 @@
-#ifndef _COMPARISON_HELPERS_H_
-#define _COMPARISON_HELPERS_H_
-#include <stdio.h>
-template <typename T>
-__inline int compare_vectors(T *data1, T *data2, unsigned int size) {
-  printf("Comparing vectors: \n");
-  bool match = true;
-  for (unsigned int i = 0; i < size; i++)
-    if (data1[i] != data2[i]) {
-      match = false;
-      printf("Diff: data1[%d]=%d,  data1[%d]=%d.\n", i, data1[i], i, data2[i]);
-    }
-
-  if (match) {
-    printf("PASS! vectors are matching!\n");
-    return 0;
-  } else {
-    printf("FAIL! vectors are NOT matching!\n");
-    exit(1);
-    return -1;
-  }
-}
-
-#endif
--- a/examples/huffman/cpuencode.cpp
+++ b/examples/huffman/cpuencode.cpp
@ -1,116 +0,0 @@
-#include "stdafx.h"
-
-#include "cpuencode.h"
-#include "print_helpers.h"
-
-using namespace std;
-
-#if 1
-
-// The max. codeword length for each byte symbol is 32-bits
-
-extern "C" void cpu_vlc_encode(unsigned int *indata, unsigned int num_elements,
-                               unsigned int *outdata, unsigned int *outsize,
-                               unsigned int *codewords,
-                               unsigned int *codewordlens) {
-  unsigned int *bitstreamPt =
-      (unsigned int *)outdata; /* Pointer to current byte   */
-  *bitstreamPt = 0x00000000U;
-  unsigned int startbit = 0;
-  unsigned int totalBytes = 0;
-
-  for (unsigned int k = 0; k < num_elements; k++) {
-    unsigned int cw32 = 0;
-    unsigned int val32 = indata[k];
-    unsigned int numbits = 0;
-    unsigned int mask32;
-
-    for (unsigned int i = 0; i < 4; i++) {
-      unsigned char symbol = (unsigned char)(val32 >> (8 * (3 - i)));
-      cw32 = codewords[symbol];
-      numbits = codewordlens[symbol];
-
-      while (numbits > 0) {
-        int writebits = min(32 - startbit, numbits);
-        if (numbits == writebits)
-          mask32 = (cw32 & ((1 << numbits) - 1))
-                   << (32 - startbit -
-                       numbits); // first make sure that the start of the word
-                                 // is clean, then shift to the left as many
-                                 // places as you need
-        else
-          mask32 = cw32 >>
-                   (numbits - writebits); // shift out the bits that can not fit
-        *bitstreamPt = (*bitstreamPt) | mask32;
-        numbits = numbits - writebits;
-        startbit = (startbit + writebits) % 32;
-        if (startbit == 0) {
-          bitstreamPt++;
-          *bitstreamPt = 0x00000000;
-          totalBytes += 4;
-        }
-      }
-    }
-  }
-  totalBytes += (startbit / 8) +
-                ((startbit % 8 == 0) ? 0 : 1); // return aligned to 8-bits
-  *outsize = totalBytes;
-}
-
-//////////////////////////////////////////////////////////////////////
-/// ALTERNATIVE CODER
-/// ASSUMPTION: The max. length of 4 combined codewords can be 2x original data,
-/// i.e. g 64 bits
-///////////////////////////////////////////////////////////////////////
-
-#else
-
-extern "C" void cpu_vlc_encode(unsigned int *indata, unsigned int num_elements,
-                               unsigned int *outdata, unsigned int *outsize,
-                               unsigned int *codewords,
-                               unsigned int *codewordlens) {
-  unsigned int *bitstreamPt =
-      (unsigned int *)outdata; /* Pointer to current byte   */
-  // assume memset is done.
-  *bitstreamPt = 0x00000000U;
-  unsigned int startbit = 0;
-  unsigned int totalBytes = 0;
-
-  for (unsigned int k = 0; k < num_elements; k++) {
-    unsigned long long cw64 = 0, mask64 = 0;
-    unsigned int val32 = indata[k];
-    unsigned int numbits = 0;
-    unsigned int mask32, temp32;
-
-    for (unsigned int i = 0; i < 4; i++) {
-      unsigned char symbol = (unsigned char)(val32 >> (8 * (3 - i)));
-      cw64 = (cw64 << codewordlens[symbol]) | codewords[symbol];
-      numbits += codewordlens[symbol];
-      // if (numbits>32) printf("WARRNING! Element %d is combined into numbits =
-      // %d!!!!!!!\n", k, numbits);
-    }
-
-    while (numbits > 0) {
-      int writebits = min(32 - startbit, numbits);
-      if (numbits == writebits) {
-        temp32 = (unsigned int)cw64; //(cw64 & 0xFFFFFFFF);
-        mask32 = temp32 << (32 - startbit - numbits);
-      } else {
-        mask32 = (unsigned int)(cw64 >> (numbits - writebits));
-        cw64 = cw64 & ((1 << (numbits - writebits)) - 1);
-      }
-      *bitstreamPt = (*bitstreamPt) | mask32;
-      numbits = numbits - writebits;
-      startbit = (startbit + writebits) % 32;
-      if (startbit == 0) {
-        bitstreamPt++;
-        *bitstreamPt = 0x00000000;
-        totalBytes += 4;
-      }
-    }
-  }
-  totalBytes += (startbit / 8) +
-                ((startbit % 8 == 0) ? 0 : 1); // return aligned to 8-bits
-  *outsize = totalBytes;
-}
-#endif
--- a/examples/huffman/cpuencode.h
+++ b/examples/huffman/cpuencode.h
@ -1,8 +0,0 @@
-#ifndef _CE_H_
-#define _CE_H_
-
-extern "C" void cpu_vlc_encode(unsigned int *indata, unsigned int num_elements,
-                               unsigned int *outdata, unsigned int *outsize,
-                               unsigned int *codewords,
-                               unsigned int *codewordlens);
-#endif
--- a/examples/huffman/cuda_helpers.h
+++ b/examples/huffman/cuda_helpers.h
@ -1,20 +0,0 @@
-#ifndef __CUDA_HELPERS__
-#define __CUDA_HELPERS__
-#include <stdio.h>
-/************************************************************************/
-/* Init CUDA                                                            */
-/************************************************************************/
-#if __DEVICE_EMULATION__
-
-bool InitCUDA(void) { return true; }
-
-#else
-bool InitCUDA(void) {
-
-  cudaSetDevice(0);
-
-  printf("CUDA initialized.\n");
-  return true;
-}
-#endif
-#endif
--- a/examples/huffman/cutil.h
+++ b/examples/huffman/cutil.h
@ -1,931 +0,0 @@
-/*
- * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
- *
- * Please refer to the NVIDIA end user license agreement (EULA) associated
- * with this source code for terms and conditions that govern your use of
- * this software. Any use, reproduction, disclosure, or distribution of
- * this software and related documentation outside the terms of the EULA
- * is strictly prohibited.
- *
- */
-
-/*
- * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
- *
- * Please refer to the NVIDIA end user license agreement (EULA) associated
- * with this source code for terms and conditions that govern your use of
- * this software. Any use, reproduction, disclosure, or distribution of
- * this software and related documentation outside the terms of the EULA
- * is strictly prohibited.
- *
- */
-
-/* CUda UTility Library */
-
-#ifndef _CUTIL_H_
-#define _CUTIL_H_
-
-#ifdef _WIN32
-#pragma warning(disable : 4996) // disable deprecated warning
-#endif
-
-#include <stdio.h>
-#include <stdlib.h>
-
-// helper typedefs for building DLL
-#ifdef _WIN32
-#ifdef BUILD_DLL
-#define DLL_MAPPING __declspec(dllexport)
-#else
-#define DLL_MAPPING __declspec(dllimport)
-#endif
-#else
-#define DLL_MAPPING
-#endif
-
-#ifdef _WIN32
-#define CUTIL_API __stdcall
-#else
-#define CUTIL_API
-#endif
-
-////////////////////////////////////////////////////////////////////////////
-//! CUT bool type
-////////////////////////////////////////////////////////////////////////////
-enum CUTBoolean { CUTFalse = 0, CUTTrue = 1 };
-
-////////////////////////////////////////////////////////////////////////////
-//! Deallocate memory allocated within Cutil
-//! @param  pointer to memory
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-void CUTIL_API cutFree(void *ptr);
-
-////////////////////////////////////////////////////////////////////////////
-//! Helper for bank conflict checking (should only be used with the
-//! CUT_BANK_CHECKER macro)
-//! @param tidx  thread id in x dimension of block
-//! @param tidy  thread id in y dimension of block
-//! @param tidz  thread id in z dimension of block
-//! @param bdimx block size in x dimension
-//! @param bdimy block size in y dimension
-//! @param bdimz block size in z dimension
-//! @param file  name of the source file where the access takes place
-//! @param line  line in the source file where the access takes place
-//! @param aname name of the array which is accessed
-//! @param index index into the array
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-void CUTIL_API cutCheckBankAccess(unsigned int tidx, unsigned int tidy,
-                                  unsigned int tidz, unsigned int bdimx,
-                                  unsigned int bdimy, unsigned int bdimz,
-                                  const char *file, const int line,
-                                  const char *aname, const int index);
-
-////////////////////////////////////////////////////////////////////////////
-//! Find the path for a filename
-//! @return the path if succeeded, otherwise 0
-//! @param filename        name of the file
-//! @param executablePath  optional absolute path of the executable
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-char *CUTIL_API cutFindFilePath(const char *filename,
-                                const char *executablePath);
-
-////////////////////////////////////////////////////////////////////////////
-//! Read file \filename containing single precision floating point data
-//! @return CUTTrue if reading the file succeeded, otherwise false
-//! @param filename name of the source file
-//! @param data  uninitialized pointer, returned initialized and pointing to
-//!        the data read
-//! @param len  number of data elements in data, -1 on error
-//! @note If a NULL pointer is passed to this function and it is
-//!       initialized within Cutil then cutFree() has to be used to
-//!       deallocate the memory
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutReadFilef(const char *filename, float **data,
-                                  unsigned int *len, bool verbose = false);
-
-////////////////////////////////////////////////////////////////////////////
-//! Read file \filename containing double precision floating point data
-//! @return CUTTrue if reading the file succeeded, otherwise false
-//! @param filename name of the source file
-//! @param data  uninitialized pointer, returned initialized and pointing to
-//!        the data read
-//! @param len  number of data elements in data, -1 on error
-//! @note If a NULL pointer is passed to this function and it is
-//!       initialized within Cutil then cutFree() has to be used to
-//!       deallocate the memory
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutReadFiled(const char *filename, double **data,
-                                  unsigned int *len, bool verbose = false);
-
-////////////////////////////////////////////////////////////////////////////
-//! Read file \filename containing integer data
-//! @return CUTTrue if reading the file succeeded, otherwise false
-//! @param filename name of the source file
-//! @param data  uninitialized pointer, returned initialized and pointing to
-//!        the data read
-//! @param len  number of data elements in data, -1 on error
-//! @note If a NULL pointer is passed to this function and it is
-//!       initialized within Cutil then cutFree() has to be used to
-//!       deallocate the memory
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutReadFilei(const char *filename, int **data,
-                                  unsigned int *len, bool verbose = false);
-
-////////////////////////////////////////////////////////////////////////////
-//! Read file \filename containing unsigned integer data
-//! @return CUTTrue if reading the file succeeded, otherwise false
-//! @param filename name of the source file
-//! @param data  uninitialized pointer, returned initialized and pointing to
-//!        the data read
-//! @param len  number of data elements in data, -1 on error
-//! @note If a NULL pointer is passed to this function and it is
-//!       initialized within Cutil then cutFree() has to be used to
-//!       deallocate the memory
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutReadFileui(const char *filename, unsigned int **data,
-                                   unsigned int *len, bool verbose = false);
-
-////////////////////////////////////////////////////////////////////////////
-//! Read file \filename containing char / byte data
-//! @return CUTTrue if reading the file succeeded, otherwise false
-//! @param filename name of the source file
-//! @param data  uninitialized pointer, returned initialized and pointing to
-//!        the data read
-//! @param len  number of data elements in data, -1 on error
-//! @note If a NULL pointer is passed to this function and it is
-//!       initialized within Cutil then cutFree() has to be used to
-//!       deallocate the memory
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutReadFileb(const char *filename, char **data,
-                                  unsigned int *len, bool verbose = false);
-
-////////////////////////////////////////////////////////////////////////////
-//! Read file \filename containing unsigned char / byte data
-//! @return CUTTrue if reading the file succeeded, otherwise false
-//! @param filename name of the source file
-//! @param data  uninitialized pointer, returned initialized and pointing to
-//!        the data read
-//! @param len  number of data elements in data, -1 on error
-//! @note If a NULL pointer is passed to this function and it is
-//!       initialized within Cutil then cutFree() has to be used to
-//!       deallocate the memory
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutReadFileub(const char *filename, unsigned char **data,
-                                   unsigned int *len, bool verbose = false);
-
-////////////////////////////////////////////////////////////////////////////
-//! Write a data file \filename containing single precision floating point
-//! data
-//! @return CUTTrue if writing the file succeeded, otherwise false
-//! @param filename name of the file to write
-//! @param data  pointer to data to write
-//! @param len  number of data elements in data, -1 on error
-//! @param epsilon  epsilon for comparison
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutWriteFilef(const char *filename, const float *data,
-                                   unsigned int len, const float epsilon,
-                                   bool verbose = false);
-
-////////////////////////////////////////////////////////////////////////////
-//! Write a data file \filename containing double precision floating point
-//! data
-//! @return CUTTrue if writing the file succeeded, otherwise false
-//! @param filename name of the file to write
-//! @param data  pointer to data to write
-//! @param len  number of data elements in data, -1 on error
-//! @param epsilon  epsilon for comparison
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutWriteFiled(const char *filename, const float *data,
-                                   unsigned int len, const double epsilon,
-                                   bool verbose = false);
-
-////////////////////////////////////////////////////////////////////////////
-//! Write a data file \filename containing integer data
-//! @return CUTTrue if writing the file succeeded, otherwise false
-//! @param filename name of the file to write
-//! @param data  pointer to data to write
-//! @param len  number of data elements in data, -1 on error
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutWriteFilei(const char *filename, const int *data,
-                                   unsigned int len, bool verbose = false);
-
-////////////////////////////////////////////////////////////////////////////
-//! Write a data file \filename containing unsigned integer data
-//! @return CUTTrue if writing the file succeeded, otherwise false
-//! @param filename name of the file to write
-//! @param data  pointer to data to write
-//! @param len  number of data elements in data, -1 on error
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutWriteFileui(const char *filename,
-                                    const unsigned int *data, unsigned int len,
-                                    bool verbose = false);
-
-////////////////////////////////////////////////////////////////////////////
-//! Write a data file \filename containing char / byte data
-//! @return CUTTrue if writing the file succeeded, otherwise false
-//! @param filename name of the file to write
-//! @param data  pointer to data to write
-//! @param len  number of data elements in data, -1 on error
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutWriteFileb(const char *filename, const char *data,
-                                   unsigned int len, bool verbose = false);
-
-////////////////////////////////////////////////////////////////////////////
-//! Write a data file \filename containing unsigned char / byte data
-//! @return CUTTrue if writing the file succeeded, otherwise false
-//! @param filename name of the file to write
-//! @param data  pointer to data to write
-//! @param len  number of data elements in data, -1 on error
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutWriteFileub(const char *filename,
-                                    const unsigned char *data, unsigned int len,
-                                    bool verbose = false);
-
-////////////////////////////////////////////////////////////////////////////
-//! Load PGM image file (with unsigned char as data element type)
-//! @return CUTTrue if reading the file succeeded, otherwise false
-//! @param file  name of the image file
-//! @param data  handle to the data read
-//! @param w     width of the image
-//! @param h     height of the image
-//! @note If a NULL pointer is passed to this function and it is
-//!       initialized within Cutil then cutFree() has to be used to
-//!       deallocate the memory
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutLoadPGMub(const char *file, unsigned char **data,
-                                  unsigned int *w, unsigned int *h);
-
-////////////////////////////////////////////////////////////////////////////
-//! Load PPM image file (with unsigned char as data element type)
-//! @return CUTTrue if reading the file succeeded, otherwise false
-//! @param file  name of the image file
-//! @param data  handle to the data read
-//! @param w     width of the image
-//! @param h     height of the image
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutLoadPPMub(const char *file, unsigned char **data,
-                                  unsigned int *w, unsigned int *h);
-
-////////////////////////////////////////////////////////////////////////////
-//! Load PPM image file (with unsigned char as data element type), padding
-//! 4th component
-//! @return CUTTrue if reading the file succeeded, otherwise false
-//! @param file  name of the image file
-//! @param data  handle to the data read
-//! @param w     width of the image
-//! @param h     height of the image
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutLoadPPM4ub(const char *file, unsigned char **data,
-                                   unsigned int *w, unsigned int *h);
-
-////////////////////////////////////////////////////////////////////////////
-//! Load PGM image file (with unsigned int as data element type)
-//! @return CUTTrue if reading the file succeeded, otherwise false
-//! @param file  name of the image file
-//! @param data  handle to the data read
-//! @param w     width of the image
-//! @param h     height of the image
-//! @note If a NULL pointer is passed to this function and it is
-//!       initialized within Cutil then cutFree() has to be used to
-//!       deallocate the memory
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutLoadPGMi(const char *file, unsigned int **data,
-                                 unsigned int *w, unsigned int *h);
-
-////////////////////////////////////////////////////////////////////////////
-//! Load PGM image file (with unsigned short as data element type)
-//! @return CUTTrue if reading the file succeeded, otherwise false
-//! @param file  name of the image file
-//! @param data  handle to the data read
-//! @param w     width of the image
-//! @param h     height of the image
-//! @note If a NULL pointer is passed to this function and it is
-//!       initialized  withing Cutil then cutFree() has to be used to
-//!       deallocate the memory
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutLoadPGMs(const char *file, unsigned short **data,
-                                 unsigned int *w, unsigned int *h);
-
-////////////////////////////////////////////////////////////////////////////
-//! Load PGM image file (with float as data element type)
-//! @param file  name of the image file
-//! @param data  handle to the data read
-//! @param w     width of the image
-//! @param h     height of the image
-//! @note If a NULL pointer is passed to this function and it is
-//!       initialized withing Cutil then cutFree() has to be used to
-//!       deallocate the memory
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutLoadPGMf(const char *file, float **data,
-                                 unsigned int *w, unsigned int *h);
-
-////////////////////////////////////////////////////////////////////////////
-//! Save PGM image file (with unsigned char as data element type)
-//! @param file  name of the image file
-//! @param data  handle to the data read
-//! @param w     width of the image
-//! @param h     height of the image
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutSavePGMub(const char *file, unsigned char *data,
-                                  unsigned int w, unsigned int h);
-
-////////////////////////////////////////////////////////////////////////////
-//! Save PPM image file (with unsigned char as data element type)
-//! @param file  name of the image file
-//! @param data  handle to the data read
-//! @param w     width of the image
-//! @param h     height of the image
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutSavePPMub(const char *file, unsigned char *data,
-                                  unsigned int w, unsigned int h);
-
-////////////////////////////////////////////////////////////////////////////
-//! Save PPM image file (with unsigned char as data element type, padded to
-//! 4 bytes)
-//! @param file  name of the image file
-//! @param data  handle to the data read
-//! @param w     width of the image
-//! @param h     height of the image
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutSavePPM4ub(const char *file, unsigned char *data,
-                                   unsigned int w, unsigned int h);
-
-////////////////////////////////////////////////////////////////////////////
-//! Save PGM image file (with unsigned int as data element type)
-//! @param file  name of the image file
-//! @param data  handle to the data read
-//! @param w     width of the image
-//! @param h     height of the image
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutSavePGMi(const char *file, unsigned int *data,
-                                 unsigned int w, unsigned int h);
-
-////////////////////////////////////////////////////////////////////////////
-//! Save PGM image file (with unsigned short as data element type)
-//! @param file  name of the image file
-//! @param data  handle to the data read
-//! @param w     width of the image
-//! @param h     height of the image
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutSavePGMs(const char *file, unsigned short *data,
-                                 unsigned int w, unsigned int h);
-
-////////////////////////////////////////////////////////////////////////////
-//! Save PGM image file (with float as data element type)
-//! @param file  name of the image file
-//! @param data  handle to the data read
-//! @param w     width of the image
-//! @param h     height of the image
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutSavePGMf(const char *file, float *data, unsigned int w,
-                                 unsigned int h);
-
-////////////////////////////////////////////////////////////////////////////
-// Command line arguments: General notes
-// * All command line arguments begin with '--' followed by the token;
-//   token and value are seperated by '='; example --samples=50
-// * Arrays have the form --model=[one.obj,two.obj,three.obj]
-//   (without whitespaces)
-////////////////////////////////////////////////////////////////////////////
-
-////////////////////////////////////////////////////////////////////////////
-//! Check if command line argument \a flag-name is given
-//! @return CUTTrue if command line argument \a flag_name has been given,
-//!         otherwise 0
-//! @param argc  argc as passed to main()
-//! @param argv  argv as passed to main()
-//! @param flag_name  name of command line flag
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutCheckCmdLineFlag(const int argc, const char **argv,
-                                         const char *flag_name);
-
-////////////////////////////////////////////////////////////////////////////
-//! Get the value of a command line argument of type int
-//! @return CUTTrue if command line argument \a arg_name has been given and
-//!         is of the requested type, otherwise CUTFalse
-//! @param argc  argc as passed to main()
-//! @param argv  argv as passed to main()
-//! @param arg_name  name of the command line argument
-//! @param val  value of the command line argument
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutGetCmdLineArgumenti(const int argc, const char **argv,
-                                            const char *arg_name, int *val);
-
-////////////////////////////////////////////////////////////////////////////
-//! Get the value of a command line argument of type float
-//! @return CUTTrue if command line argument \a arg_name has been given and
-//!         is of the requested type, otherwise CUTFalse
-//! @param argc  argc as passed to main()
-//! @param argv  argv as passed to main()
-//! @param arg_name  name of the command line argument
-//! @param val  value of the command line argument
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutGetCmdLineArgumentf(const int argc, const char **argv,
-                                            const char *arg_name, float *val);
-
-////////////////////////////////////////////////////////////////////////////
-//! Get the value of a command line argument of type string
-//! @return CUTTrue if command line argument \a arg_name has been given and
-//!         is of the requested type, otherwise CUTFalse
-//! @param argc  argc as passed to main()
-//! @param argv  argv as passed to main()
-//! @param arg_name  name of the command line argument
-//! @param val  value of the command line argument
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutGetCmdLineArgumentstr(const int argc, const char **argv,
-                                              const char *arg_name, char **val);
-
-////////////////////////////////////////////////////////////////////////////
-//! Get the value of a command line argument list those element are strings
-//! @return CUTTrue if command line argument \a arg_name has been given and
-//!         is of the requested type, otherwise CUTFalse
-//! @param argc  argc as passed to main()
-//! @param argv  argv as passed to main()
-//! @param arg_name  name of the command line argument
-//! @param val  command line argument list
-//! @param len  length of the list / number of elements
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutGetCmdLineArgumentListstr(const int argc,
-                                                  const char **argv,
-                                                  const char *arg_name,
-                                                  char **val,
-                                                  unsigned int *len);
-
-////////////////////////////////////////////////////////////////////////////
-//! Extended assert
-//! @return CUTTrue if the condition \a val holds, otherwise CUTFalse
-//! @param val  condition to test
-//! @param file  __FILE__ macro
-//! @param line  __LINE__ macro
-//! @note This function should be used via the CONDITION(val) macro
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutCheckCondition(int val, const char *file,
-                                       const int line);
-
-////////////////////////////////////////////////////////////////////////////
-//! Compare two float arrays
-//! @return  CUTTrue if \a reference and \a data are identical,
-//!          otherwise CUTFalse
-//! @param reference  handle to the reference data / gold image
-//! @param data       handle to the computed data
-//! @param len        number of elements in reference and data
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutComparef(const float *reference, const float *data,
-                                 const unsigned int len);
-
-////////////////////////////////////////////////////////////////////////////
-//! Compare two integer arrays
-//! @return  CUTTrue if \a reference and \a data are identical,
-//!          otherwise CUTFalse
-//! @param reference  handle to the reference data / gold image
-//! @param data       handle to the computed data
-//! @param len        number of elements in reference and data
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutComparei(const int *reference, const int *data,
-                                 const unsigned int len);
-
-////////////////////////////////////////////////////////////////////////////////
-//! Compare two unsigned integer arrays, with epsilon and threshold
-//! @return  CUTTrue if \a reference and \a data are identical,
-//!          otherwise CUTFalse
-//! @param reference  handle to the reference data / gold image
-//! @param data       handle to the computed data
-//! @param len        number of elements in reference and data
-//! @param threshold  tolerance % # of comparison errors (0.15f = 15%)
-////////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutCompareuit(const unsigned int *reference,
-                                   const unsigned int *data,
-                                   const unsigned int len, const float epsilon,
-                                   const float threshold);
-
-////////////////////////////////////////////////////////////////////////////
-//! Compare two unsigned char arrays
-//! @return  CUTTrue if \a reference and \a data are identical,
-//!          otherwise CUTFalse
-//! @param reference  handle to the reference data / gold image
-//! @param data       handle to the computed data
-//! @param len        number of elements in reference and data
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutCompareub(const unsigned char *reference,
-                                  const unsigned char *data,
-                                  const unsigned int len);
-
-////////////////////////////////////////////////////////////////////////////////
-//! Compare two integers with a tolernance for # of byte errors
-//! @return  CUTTrue if \a reference and \a data are identical,
-//!          otherwise CUTFalse
-//! @param reference  handle to the reference data / gold image
-//! @param data       handle to the computed data
-//! @param len        number of elements in reference and data
-//! @param epsilon    epsilon to use for the comparison
-//! @param threshold  tolerance % # of comparison errors (0.15f = 15%)
-////////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutCompareubt(const unsigned char *reference,
-                                   const unsigned char *data,
-                                   const unsigned int len, const float epsilon,
-                                   const float threshold);
-
-////////////////////////////////////////////////////////////////////////////////
-//! Compare two integer arrays witha n epsilon tolerance for equality
-//! @return  CUTTrue if \a reference and \a data are identical,
-//!          otherwise CUTFalse
-//! @param reference  handle to the reference data / gold image
-//! @param data       handle to the computed data
-//! @param len        number of elements in reference and data
-//! @param epsilon    epsilon to use for the comparison
-////////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutCompareube(const unsigned char *reference,
-                                   const unsigned char *data,
-                                   const unsigned int len, const float epsilon);
-
-////////////////////////////////////////////////////////////////////////////
-//! Compare two float arrays with an epsilon tolerance for equality
-//! @return  CUTTrue if \a reference and \a data are identical,
-//!          otherwise CUTFalse
-//! @param reference  handle to the reference data / gold image
-//! @param data       handle to the computed data
-//! @param len        number of elements in reference and data
-//! @param epsilon    epsilon to use for the comparison
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutComparefe(const float *reference, const float *data,
-                                  const unsigned int len, const float epsilon);
-
-////////////////////////////////////////////////////////////////////////////////
-//! Compare two float arrays with an epsilon tolerance for equality and a
-//!     threshold for # pixel errors
-//! @return  CUTTrue if \a reference and \a data are identical,
-//!          otherwise CUTFalse
-//! @param reference  handle to the reference data / gold image
-//! @param data       handle to the computed data
-//! @param len        number of elements in reference and data
-//! @param epsilon    epsilon to use for the comparison
-////////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutComparefet(const float *reference, const float *data,
-                                   const unsigned int len, const float epsilon,
-                                   const float threshold);
-
-////////////////////////////////////////////////////////////////////////////
-//! Compare two float arrays using L2-norm with an epsilon tolerance for
-//! equality
-//! @return  CUTTrue if \a reference and \a data are identical,
-//!          otherwise CUTFalse
-//! @param reference  handle to the reference data / gold image
-//! @param data       handle to the computed data
-//! @param len        number of elements in reference and data
-//! @param epsilon    epsilon to use for the comparison
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutCompareL2fe(const float *reference, const float *data,
-                                    const unsigned int len,
-                                    const float epsilon);
-
-////////////////////////////////////////////////////////////////////////////////
-//! Compare two PPM image files with an epsilon tolerance for equality
-//! @return  CUTTrue if \a reference and \a data are identical,
-//!          otherwise CUTFalse
-//! @param src_file   filename for the image to be compared
-//! @param data       filename for the reference data / gold image
-//! @param epsilon    epsilon to use for the comparison
-//! @param threshold  threshold of pixels that can still mismatch to pass (i.e.
-//! 0.15f = 15% must pass) $param verboseErrors output details of image mismatch
-//! to std::err
-////////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutComparePPM(const char *src_file, const char *ref_file,
-                                   const float epsilon, const float threshold,
-                                   bool verboseErrors = false);
-
-////////////////////////////////////////////////////////////////////////////
-//! Timer functionality
-
-////////////////////////////////////////////////////////////////////////////
-//! Create a new timer
-//! @return CUTTrue if a time has been created, otherwise false
-//! @param  name of the new timer, 0 if the creation failed
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutCreateTimer(unsigned int *name);
-
-////////////////////////////////////////////////////////////////////////////
-//! Delete a timer
-//! @return CUTTrue if a time has been deleted, otherwise false
-//! @param  name of the timer to delete
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutDeleteTimer(unsigned int name);
-
-////////////////////////////////////////////////////////////////////////////
-//! Start the time with name \a name
-//! @param name  name of the timer to start
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutStartTimer(const unsigned int name);
-
-////////////////////////////////////////////////////////////////////////////
-//! Stop the time with name \a name. Does not reset.
-//! @param name  name of the timer to stop
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutStopTimer(const unsigned int name);
-
-////////////////////////////////////////////////////////////////////////////
-//! Resets the timer's counter.
-//! @param name  name of the timer to reset.
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-CUTBoolean CUTIL_API cutResetTimer(const unsigned int name);
-
-////////////////////////////////////////////////////////////////////////////
-//! Returns total execution time in milliseconds for the timer over all
-//! runs since the last reset or timer creation.
-//! @param name  name of the timer to return the time of
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-float CUTIL_API cutGetTimerValue(const unsigned int name);
-
-////////////////////////////////////////////////////////////////////////////
-//! Return the average time in milliseconds for timer execution as the
-//! total  time for the timer dividied by the number of completed (stopped)
-//! runs the timer has made.
-//! Excludes the current running time if the timer is currently running.
-//! @param name  name of the timer to return the time of
-////////////////////////////////////////////////////////////////////////////
-DLL_MAPPING
-float CUTIL_API cutGetAverageTimerValue(const unsigned int name);
-
-////////////////////////////////////////////////////////////////////////////
-//! Macros
-
-#if CUDART_VERSION >= 4000
-#define CUT_DEVICE_SYNCHRONIZE() cudaDeviceSynchronize();
-#else
-#define CUT_DEVICE_SYNCHRONIZE() cudaThreadSynchronize();
-#endif
-
-#if CUDART_VERSION >= 4000
-#define CUT_DEVICE_RESET() cudaDeviceReset();
-#else
-#define CUT_DEVICE_RESET() cudaThreadExit();
-#endif
-
-// This is for the CUTIL bank checker
-#ifdef _DEBUG
-#if __DEVICE_EMULATION__
-// Interface for bank conflict checker
-#define CUT_BANK_CHECKER(array, index)                                         \
-  (cutCheckBankAccess(threadIdx.x, threadIdx.y, threadIdx.z, blockDim.x,       \
-                      blockDim.y, blockDim.z, __FILE__, __LINE__, #array,      \
-                      index),                                                  \
-   array[index])
-#else
-#define CUT_BANK_CHECKER(array, index) array[index]
-#endif
-#else
-#define CUT_BANK_CHECKER(array, index) array[index]
-#endif
-
-#define CU_SAFE_CALL_NO_SYNC(call)                                             \
-  {                                                                            \
-    CUresult err = call;                                                       \
-    if (CUDA_SUCCESS != err) {                                                 \
-      fprintf(stderr, "Cuda driver error %x in file '%s' in line %i.\n", err,  \
-              __FILE__, __LINE__);                                             \
-      exit(EXIT_FAILURE);                                                      \
-    }                                                                          \
-  }
-
-#define CU_SAFE_CALL(call) CU_SAFE_CALL_NO_SYNC(call);
-
-#define CU_SAFE_CTX_SYNC()                                                     \
-  {                                                                            \
-    CUresult err = cuCtxSynchronize();                                         \
-    if (CUDA_SUCCESS != err) {                                                 \
-      fprintf(stderr, "Cuda driver error %x in file '%s' in line %i.\n", err,  \
-              __FILE__, __LINE__);                                             \
-      exit(EXIT_FAILURE);                                                      \
-    }                                                                          \
-  }
-
-#define CUDA_SAFE_CALL_NO_SYNC(call)                                           \
-  {                                                                            \
-    cudaError err = call;                                                      \
-    if (cudaSuccess != err) {                                                  \
-      fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", __FILE__,  \
-              __LINE__, cudaGetErrorString(err));                              \
-      exit(EXIT_FAILURE);                                                      \
-    }                                                                          \
-  }
-
-#define CUDA_SAFE_CALL(call) CUDA_SAFE_CALL_NO_SYNC(call);
-
-#define CUDA_SAFE_THREAD_SYNC()                                                \
-  {                                                                            \
-    cudaError err = CUT_DEVICE_SYNCHRONIZE();                                  \
-    if (cudaSuccess != err) {                                                  \
-      fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", __FILE__,  \
-              __LINE__, cudaGetErrorString(err));                              \
-    }                                                                          \
-  }
-
-#define CUFFT_SAFE_CALL(call)                                                  \
-  {                                                                            \
-    cufftResult err = call;                                                    \
-    if (CUFFT_SUCCESS != err) {                                                \
-      fprintf(stderr, "CUFFT error in file '%s' in line %i.\n", __FILE__,      \
-              __LINE__);                                                       \
-      exit(EXIT_FAILURE);                                                      \
-    }                                                                          \
-  }
-
-#define CUT_SAFE_CALL(call)                                                    \
-  if (CUTTrue != call) {                                                       \
-    fprintf(stderr, "Cut error in file '%s' in line %i.\n", __FILE__,          \
-            __LINE__);                                                         \
-    exit(EXIT_FAILURE);                                                        \
-  }
-
-//! Check for CUDA error
-#ifdef _DEBUG
-#define CUT_CHECK_ERROR(errorMessage)                                          \
-  {                                                                            \
-    cudaError_t err = cudaGetLastError();                                      \
-    if (cudaSuccess != err) {                                                  \
-      fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",        \
-              errorMessage, __FILE__, __LINE__, cudaGetErrorString(err));      \
-      exit(EXIT_FAILURE);                                                      \
-    }                                                                          \
-    err = CUT_DEVICE_SYNCHRONIZE();                                            \
-    if (cudaSuccess != err) {                                                  \
-      fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",        \
-              errorMessage, __FILE__, __LINE__, cudaGetErrorString(err));      \
-      exit(EXIT_FAILURE);                                                      \
-    }                                                                          \
-  }
-#else
-#define CUT_CHECK_ERROR(errorMessage)                                          \
-  {                                                                            \
-    cudaError_t err = cudaGetLastError();                                      \
-    if (cudaSuccess != err) {                                                  \
-      fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",        \
-              errorMessage, __FILE__, __LINE__, cudaGetErrorString(err));      \
-      exit(EXIT_FAILURE);                                                      \
-    }                                                                          \
-  }
-#endif
-
-//! Check for malloc error
-#define CUT_SAFE_MALLOC(mallocCall)                                            \
-  {                                                                            \
-    if (!(mallocCall)) {                                                       \
-      fprintf(stderr, "Host malloc failure in file '%s' in line %i\n",         \
-              __FILE__, __LINE__);                                             \
-      exit(EXIT_FAILURE);                                                      \
-    }                                                                          \
-  }                                                                            \
-  while (0)                                                                    \
-    ;
-
-//! Check if conditon is true (flexible assert)
-#define CUT_CONDITION(val)                                                     \
-  if (CUTFalse == cutCheckCondition(val, __FILE__, __LINE__)) {                \
-    exit(EXIT_FAILURE);                                                        \
-  }
-
-#if __DEVICE_EMULATION__
-
-#define CUT_DEVICE_INIT(ARGC, ARGV)
-
-#else
-
-//! Select a CUDA device through the runtime API and make it current.
-//! Exits with EXIT_FAILURE when cudaGetDeviceCount reports zero devices.
-//! The device index starts at 0, is passed to cutGetCmdLineArgumenti with the
-//! key "device" (presumably overwritten when that argument is present), and is
-//! then clamped to [0, deviceCount - 1]. Unless cutCheckCmdLineFlag finds the
-//! "quiet" flag, the chosen index and device name are printed to stderr.
-//! NOTE(review): the expansion is a brace block that declares the locals
-//! deviceCount, dev and deviceProp; it is not wrapped in do/while(0).
-#define CUT_DEVICE_INIT(ARGC, ARGV)                                            \
-  {                                                                            \
-    int deviceCount;                                                           \
-    CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceCount(&deviceCount));                  \
-    if (deviceCount == 0) {                                                    \
-      fprintf(stderr, "cutil error: no devices supporting CUDA.\n");           \
-      exit(EXIT_FAILURE);                                                      \
-    }                                                                          \
-    int dev = 0;                                                               \
-    cutGetCmdLineArgumenti(ARGC, (const char **)ARGV, "device", &dev);         \
-    if (dev < 0)                                                               \
-      dev = 0;                                                                 \
-    if (dev > deviceCount - 1)                                                 \
-      dev = deviceCount - 1;                                                   \
-    cudaDeviceProp deviceProp;                                                 \
-    CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceProperties(&deviceProp, dev));         \
-    if (cutCheckCmdLineFlag(ARGC, (const char **)ARGV, "quiet") == CUTFalse)   \
-      fprintf(stderr, "Using device %d: %s\n", dev, deviceProp.name);          \
-    CUDA_SAFE_CALL(cudaSetDevice(dev));                                        \
-  }
-
-//! Check for a lost CUDA context (runtime API).
-//! Reads cudaGetLastError() and then the result of CUT_DEVICE_SYNCHRONIZE();
-//! if either differs from cudaSuccess, prints errorMessage with the file,
-//! line and cudaGetErrorString text to stderr and exits with EXIT_FAILURE.
-//! Only defined in the branch taken when __DEVICE_EMULATION__ is zero.
-#define CUDA_CHECK_CTX_LOST(errorMessage)                                      \
-  {                                                                            \
-    cudaError_t err = cudaGetLastError();                                      \
-    if (cudaSuccess != err) {                                                  \
-      fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",        \
-              errorMessage, __FILE__, __LINE__, cudaGetErrorString(err));      \
-      exit(EXIT_FAILURE);                                                      \
-    }                                                                          \
-    err = CUT_DEVICE_SYNCHRONIZE();                                            \
-    if (cudaSuccess != err) {                                                  \
-      fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",        \
-              errorMessage, __FILE__, __LINE__, cudaGetErrorString(err));      \
-      exit(EXIT_FAILURE);                                                      \
-    }                                                                          \
-  }
-
-//! Check for a lost CUDA context (variant named after the driver API).
-//! Exits with EXIT_FAILURE when the value returned by cudaGetLastError()
-//! differs from CUDA_ERROR_INVALID_CONTEXT, and again when
-//! CUT_DEVICE_SYNCHRONIZE() does not return cudaSuccess.
-//! NOTE(review): as written the first test also exits when the last error is
-//! cudaSuccess, and it compares a runtime-API cudaError_t against a constant
-//! whose name suggests the driver API -- this looks inverted; confirm the
-//! intended condition before relying on this macro.
-#define CU_CHECK_CTX_LOST(errorMessage)                                        \
-  {                                                                            \
-    cudaError_t err = cudaGetLastError();                                      \
-    if (CUDA_ERROR_INVALID_CONTEXT != err) {                                   \
-      fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",        \
-              errorMessage, __FILE__, __LINE__, cudaGetErrorString(err));      \
-      exit(EXIT_FAILURE);                                                      \
-    }                                                                          \
-    err = CUT_DEVICE_SYNCHRONIZE();                                            \
-    if (cudaSuccess != err) {                                                  \
-      fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",        \
-              errorMessage, __FILE__, __LINE__, cudaGetErrorString(err));      \
-      exit(EXIT_FAILURE);                                                      \
-    }                                                                          \
-  }
-
-#endif
-
-//! Select a CUDA device through the driver API and store it in cuDevice.
-//! Calls cuInit(0); the device count is queried only when that call returns
-//! CUDA_SUCCESS, so a failed cuInit leaves deviceCount at 0 and ends in the
-//! "no devices" exit path. The index starts at 0, is passed to
-//! cutGetCmdLineArgumenti with the key "device", and is clamped to
-//! [0, deviceCount - 1]. Unless the "quiet" flag is found, the index and
-//! device name are printed to stderr.
-//! NOTE(review): the return value of cuDeviceGetName is not checked, so
-//! `name` may be printed uninitialised if that call fails.
-#define CUT_DEVICE_INIT_DRV(cuDevice, ARGC, ARGV)                              \
-  {                                                                            \
-    cuDevice = 0;                                                              \
-    int deviceCount = 0;                                                       \
-    CUresult err = cuInit(0);                                                  \
-    if (CUDA_SUCCESS == err)                                                   \
-      CU_SAFE_CALL_NO_SYNC(cuDeviceGetCount(&deviceCount));                    \
-    if (deviceCount == 0) {                                                    \
-      fprintf(stderr, "cutil error: no devices supporting CUDA\n");            \
-      exit(EXIT_FAILURE);                                                      \
-    }                                                                          \
-    int dev = 0;                                                               \
-    cutGetCmdLineArgumenti(ARGC, (const char **)ARGV, "device", &dev);         \
-    if (dev < 0)                                                               \
-      dev = 0;                                                                 \
-    if (dev > deviceCount - 1)                                                 \
-      dev = deviceCount - 1;                                                   \
-    CU_SAFE_CALL_NO_SYNC(cuDeviceGet(&cuDevice, dev));                         \
-    char name[100];                                                            \
-    cuDeviceGetName(name, 100, cuDevice);                                      \
-    if (cutCheckCmdLineFlag(ARGC, (const char **)ARGV, "quiet") == CUTFalse)   \
-      fprintf(stderr, "Using device %d: %s\n", dev, name);                     \
-  }
-
-//! Terminate the program with EXIT_SUCCESS, first waiting for ENTER on stdin
-//! unless cutCheckCmdLineFlag finds the "noprompt" flag.
-//! stdout and stderr are flushed before getchar() blocks.
-//! NOTE(review): the expansion is TWO statements (the `if` block and the
-//! exit call). Used as the body of an unbraced `if`/`else`/loop, only the
-//! prompt becomes conditional and exit(EXIT_SUCCESS) always runs.
-#define CUT_EXIT(argc, argv)                                                   \
-  if (!cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) {           \
-    printf("\nPress ENTER to exit...\n");                                      \
-    fflush(stdout);                                                            \
-    fflush(stderr);                                                            \
-    getchar();                                                                 \
-  }                                                                            \
-  exit(EXIT_SUCCESS);
-
-#endif // #ifndef _CUTIL_H_
--- a/examples/huffman/hist.cu
+++ b/examples/huffman/hist.cu
@ -1,104 +0,0 @@
-/*
- * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
- *
- * NVIDIA Corporation and its licensors retain all intellectual property and *
- * proprietary rights in and to this software and related documentation. Any
- * use, reproduction, disclosure, or distribution of this software and related
- * documentation without an express license agreement from NVIDIA Corporation is
- * strictly prohibited.
- *
- * Please refer to the applicable NVIDIA end user license agreement (EULA)
- * associated with this source code for terms and conditions that govern
- * your use of this NVIDIA software.
- *
- */
-
-#include <iostream>
-#include <stdio.h>
-
-// Evaluates a CUDA runtime call and hands its status, together with the call
-// site, to gpuAssert.
-#define CHECK(ans)                                                             \
-  { gpuAssert((ans), __FILE__, __LINE__); }
-
-// Reports a failed CUDA call.
-//   code  - status returned by the CUDA runtime call
-//   file  - source file of the call site
-//   line  - source line of the call site
-//   abort - when true (the default) the process exits with `code` as status
-// Does nothing when code is cudaSuccess.
-inline void gpuAssert(cudaError_t code, const char *file, int line,
-                      bool abort = true) {
-  if (code == cudaSuccess)
-    return;
-
-  const char *description = cudaGetErrorString(code);
-  fprintf(stderr, "GPUassert: %s %s %d\n", description, file, line);
-  if (abort) {
-    exit(code);
-  }
-}
-
-using namespace std;
-
-#define SIZE (100 * 1024 * 1024)
-
-// Byte histogram kernel.
-//   buffer - device pointer to `size` input bytes
-//   size   - number of bytes to count
-//   histo  - device array of 256 counters; this kernel ADDS to it, so the
-//            caller must zero it before the first launch
-// Each block accumulates a private histogram in shared memory and merges it
-// into `histo` with atomicAdd at the end. Works for any 1-D block size; the
-// original version was only correct when blockDim.x was exactly 256.
-__global__ void histo_kernel(unsigned char *buffer, long size,
-                             unsigned int *histo) {
-
-  // Per-block partial histogram, one bin per possible byte value.
-  __shared__ unsigned int temp[256];
-
-  // Clear the bins with a block-stride loop: every bin is initialised even
-  // when the block has fewer than 256 threads, and no thread indexes past the
-  // array when it has more.
-  for (unsigned int bin = threadIdx.x; bin < 256; bin += blockDim.x)
-    temp[bin] = 0;
-  __syncthreads();
-
-  // Grid-stride loop over the input. The index is a long, like `size`, so the
-  // comparison cannot wrap for inputs larger than INT_MAX bytes.
-  long i = threadIdx.x + (long)blockIdx.x * blockDim.x;
-  long stride = (long)blockDim.x * gridDim.x;
-  while (i < size) {
-    atomicAdd(&temp[buffer[i]], 1);
-    i += stride;
-  }
-
-  // All shared-memory updates must be finished before the bins are merged.
-  __syncthreads();
-  for (unsigned int bin = threadIdx.x; bin < 256; bin += blockDim.x)
-    atomicAdd(&(histo[bin]), temp[bin]);
-}
-
-// Reads `memSize` bytes of `file` into `source` and computes the byte
-// histogram of that data on the GPU.
-//   file    - path of the input file (opened in binary mode)
-//   freq    - output, 256 counters; freq[b] receives the number of bytes in
-//             the buffer whose value is b
-//   memSize - number of bytes to read and count
-//   source  - host buffer with room for at least memSize bytes
-// Returns 0. Exits the process when the file cannot be opened or a CUDA call
-// fails. A short read is reported on stderr but not treated as fatal (the
-// histogram then covers whatever `source` holds in its first memSize bytes).
-//
-// The buffer is processed in byte-sized chunks that together cover every byte
-// exactly once. The original loop mixed word-unit offsets with a byte
-// pointer, which histogrammed overlapping chunks from the start of the file,
-// dropped the tail when memSize was not a multiple of 32, and never advanced
-// when memSize was below 128.
-int runHisto(char *file, unsigned int *freq, unsigned int memSize,
-             unsigned int *source) {
-
-  FILE *f = fopen(file, "rb");
-  if (!f) {
-    perror(file);
-    exit(1);
-  }
-  size_t result = fread(source, 1, memSize, f);
-  if (result != memSize)
-    fputs("Cannot read input file", stderr);
-
-  fclose(f);
-
-  // Nothing to count: avoid zero-sized device allocations and launches.
-  if (memSize == 0) {
-    for (int bin = 0; bin < 256; bin++)
-      freq[bin] = 0;
-    return 0;
-  }
-
-  unsigned char *buffer = (unsigned char *)source;
-
-  int blocks = 2;
-
-  // Stage the input through the device in chunks of 1/32 of the file; files
-  // shorter than 32 bytes are sent as a single chunk.
-  unsigned int partSize = memSize / 32;
-  if (partSize == 0)
-    partSize = memSize;
-
-  unsigned char *dev_buffer;
-  unsigned int *dev_histo;
-  CHECK(cudaMalloc((void **)&dev_buffer, partSize));
-  CHECK(cudaMalloc((void **)&dev_histo, 256 * sizeof(int)));
-  CHECK(cudaMemset(dev_histo, 0, 256 * sizeof(int)));
-
-  unsigned int offset = 0; // byte offset of the next chunk
-  while (offset < memSize) {
-    // The last chunk may be shorter than partSize.
-    unsigned int chunk = memSize - offset;
-    if (chunk > partSize)
-      chunk = partSize;
-
-    CHECK(cudaMemcpy(dev_buffer, buffer + offset, chunk,
-                     cudaMemcpyHostToDevice));
-
-    // kernel launch - 2x the number of mps gave best timing
-    histo_kernel<<<blocks * 2, 256>>>(dev_buffer, chunk, dev_histo);
-    CHECK(cudaDeviceSynchronize());
-
-    offset += chunk;
-  }
-  CHECK(cudaMemcpy(freq, dev_histo, 256 * sizeof(int), cudaMemcpyDeviceToHost));
-
-  cudaFree(dev_histo);
-  cudaFree(dev_buffer);
-  return 0;
-}
--- a/examples/huffman/huffTree.h
+++ b/examples/huffman/huffTree.h
@ -1,90 +0,0 @@
-#include "stdio.h"
-#include <algorithm>
-#include <climits> // for CHAR_BIT
-#include <iostream>
-#include <iterator>
-#include <map>
-#include <math.h>
-#include <queue>
-
-using namespace std;
-
-const int UniqueSymbols = 1 << CHAR_BIT;
-// Writes the lowest `numbits` bits of `val` to stdout as '0'/'1' characters,
-// highest of those bits first. Writes nothing when numbits is not positive.
-void printBits(unsigned int val, int numbits) {
-  int shift = numbits;
-  while (shift-- > 0) {
-    const unsigned int bit = (val >> shift) & 1;
-    putchar('0' + bit);
-  }
-}
-
-typedef vector<bool> HuffCode;
-typedef map<unsigned char, HuffCode> HuffCodeMap;
-
-// Common base of the Huffman tree nodes. Only derived classes can construct
-// it (the constructor is protected); the virtual destructor lets a whole tree
-// be destroyed through an INode pointer.
-class INode {
-protected:
-  INode(int weight) : f(weight) {}
-
-public:
-  virtual ~INode() {}
-
-  // Weight of the node: leaves are created with a symbol frequency, inner
-  // nodes with the sum of their children's weights.
-  const int f;
-};
-
-// Inner tree node. Takes ownership of both children: they are deleted when
-// this node is destroyed. Its weight is the sum of the children's weights.
-class InternalNode : public INode {
-public:
-  INode *const left;
-  INode *const right;
-
-  // c0 becomes the left child, c1 the right child; both must be non-null
-  // because their weights are read here.
-  InternalNode(INode *c0, INode *c1)
-      : INode(c0->f + c1->f), left(c0), right(c1) {}
-
-  ~InternalNode() {
-    delete left;
-    delete right;
-  }
-};
-
-// Leaf of the Huffman tree: one symbol together with its weight.
-class LeafNode : public INode {
-public:
-  // The symbol this leaf stands for.
-  const char c;
-
-  // f is the symbol's weight (stored in INode::f), c the symbol itself.
-  LeafNode(int f, char c) : INode(f), c(c) {}
-};
-
-// Ordering for std::priority_queue: returns true when lhs is heavier than
-// rhs, which places the node with the SMALLEST weight at the top of the queue.
-struct NodeCmp {
-  bool operator()(const INode *lhs, const INode *rhs) const {
-    const bool lhsIsHeavier = rhs->f < lhs->f;
-    return lhsIsHeavier;
-  }
-};
-
-// Builds a Huffman tree from a table of symbol frequencies.
-//   frequencies - one counter per byte value; symbols with a count of zero
-//                 are left out of the tree
-// Returns the root of a heap-allocated tree that the caller must delete, or
-// NULL when every frequency is zero (the original called top() on an empty
-// queue in that case, which is undefined behaviour).
-// When exactly one symbol occurs, the returned root is a single LeafNode.
-INode *BuildTree(unsigned int (&frequencies)[UniqueSymbols]) {
-  std::priority_queue<INode *, std::vector<INode *>, NodeCmp> trees;
-
-  // One leaf per symbol that actually occurs.
-  for (int i = 0; i < UniqueSymbols; ++i) {
-    if (frequencies[i] != 0)
-      trees.push(new LeafNode(frequencies[i], (char)i));
-  }
-
-  // No symbols at all: there is no tree to return.
-  if (trees.empty())
-    return NULL;
-
-  // Repeatedly merge the two lightest subtrees until one tree remains. The
-  // lighter of the two (popped first) becomes the left child.
-  while (trees.size() > 1) {
-    INode *childR = trees.top();
-    trees.pop();
-
-    INode *childL = trees.top();
-    trees.pop();
-
-    INode *parent = new InternalNode(childR, childL);
-    trees.push(parent);
-  }
-  return trees.top();
-}
-
-// Walks the tree below `node` and records the code of every leaf.
-//   node     - subtree to visit; a node that is neither a LeafNode nor an
-//              InternalNode (including a null pointer) is ignored
-//   prefix   - bits collected on the path from the root to `node`
-//   outCodes - receives symbol -> code; a left edge appends `false`, a right
-//              edge appends `true`
-void GenerateCodes(const INode *node, const HuffCode &prefix,
-                   HuffCodeMap &outCodes) {
-  const LeafNode *leaf = dynamic_cast<const LeafNode *>(node);
-  if (leaf) {
-    outCodes[leaf->c] = prefix;
-    return;
-  }
-
-  const InternalNode *inner = dynamic_cast<const InternalNode *>(node);
-  if (!inner)
-    return;
-
-  HuffCode zeroBranch(prefix);
-  zeroBranch.push_back(false);
-  GenerateCodes(inner->left, zeroBranch, outCodes);
-
-  HuffCode oneBranch(prefix);
-  oneBranch.push_back(true);
-  GenerateCodes(inner->right, oneBranch, outCodes);
-}
--- a/examples/huffman/load_data.h
+++ b/examples/huffman/load_data.h
@ -1,65 +0,0 @@
-#ifndef _LOADTESTDATA_H_
-#define _LOADTESTDATA_H_
-
-//#include "testdatagen.h"
-#include "hist.cu"
-#include "huffTree.h"
-
-// Derives the problem size for one test run.
-//   file_name         - input file, or NULL to size the run from num_blocks
-//   num_block_threads - threads per block
-//   num_blocks        - in: block count (used when file_name is NULL);
-//                       out: block count derived from the file size otherwise
-//   num_elements      - out: number of symbols
-//   mem_size          - out: size of the data in bytes
-//   symbol_type_size  - size of one symbol in bytes
-// Exits the process when the file cannot be opened or its size cannot be
-// determined (the original stored a failed ftell() result of -1 as a huge
-// unsigned size).
-inline void initParams(char *file_name, uint num_block_threads,
-                       uint &num_blocks, uint &num_elements, uint &mem_size,
-                       uint symbol_type_size) {
-  if (file_name == NULL) {
-    num_elements = num_blocks * num_block_threads;
-    mem_size = num_elements * symbol_type_size;
-    return;
-  }
-
-  FILE *f = fopen(file_name, "rb");
-  if (!f) {
-    perror(file_name);
-    exit(1);
-  }
-
-  // The file size is the stream position after seeking to the end.
-  long file_size = -1;
-  if (fseek(f, 0, SEEK_END) == 0)
-    file_size = ftell(f);
-  if (file_size < 0) {
-    perror(file_name); // report before fclose() can overwrite errno
-    fclose(f);
-    exit(1);
-  }
-  fclose(f);
-
-  mem_size = (uint)file_size;
-  num_elements = mem_size / symbol_type_size;
-  // TODO: the division truncates, so elements beyond
-  // num_blocks * num_block_threads are not assigned to any block; a partial
-  // last block is not supported yet.
-  num_blocks = num_elements / num_block_threads;
-}
-
-// Loads the input file, builds its Huffman code and fills the code tables.
-//   file_name    - input file; the process exits when it is NULL
-//   sourceData   - host buffer of at least mem_size bytes that receives the
-//                  file contents (filled by runHisto)
-//   codewords    - table indexed by byte value; the code bits of each symbol
-//                  are ADDED to the entry, so the caller must pass it zeroed
-//   codewordlens - table indexed by byte value; receives the code length in
-//                  bits
-//   num_elements - not used by this function
-//   mem_size     - number of bytes in the input
-//   H            - out: entropy of the byte distribution in bits per byte
-// Code bits are packed with integer shifts, first code bit most significant.
-// This matches the original pow()-based sum for codes of up to 32 bits and
-// stays well defined for longer codes (which cannot fit a 32-bit codeword:
-// the high bits are shifted out).
-inline void loadData(char *file_name, uint *sourceData, uint *codewords,
-                     uint *codewordlens, uint num_elements, uint mem_size,
-                     double &H) {
-  if (file_name == NULL) {
-    printf("No input file\n");
-    exit(-1);
-  }
-
-  unsigned int freqs[UniqueSymbols] = {0};
-  runHisto(file_name, freqs, mem_size, sourceData);
-  INode *root = BuildTree(freqs);
-
-  HuffCodeMap codes;
-  GenerateCodes(root, HuffCode(), codes);
-  delete root;
-
-  for (HuffCodeMap::const_iterator it = codes.begin(); it != codes.end();
-       ++it) {
-    const unsigned int symbol = (unsigned int)(it->first);
-    const HuffCode &code = it->second;
-
-    // Pack the code bits into an integer, first bit in the highest position.
-    unsigned int word = 0;
-    for (HuffCode::const_iterator bit = code.begin(); bit != code.end(); ++bit)
-      word = (word << 1) | (*bit ? 1u : 0u);
-
-    codewords[symbol] += word;
-    codewordlens[symbol] = (unsigned int)code.size();
-  }
-
-  // Shannon entropy: H = -sum(p * log2(p)) over the symbols that occur.
-  H = 0.0;
-  for (unsigned int i = 0; i < 256; i++)
-    if (freqs[i] > 0) {
-      double p = (double)freqs[i] / (double)mem_size;
-      H += p * log(p) / log(2.0);
-    }
-  H = -H;
-  printf("\n%s, %u bytes, entropy %f\n\n", file_name, mem_size, H);
-}
-
-#endif
--- a/examples/huffman/main_test_cu-cuda-nvptx64-nvidia-cuda-sm_61.ll
+++ b/examples/huffman/main_test_cu-cuda-nvptx64-nvidia-cuda-sm_61.ll
--- a/examples/huffman/main_test_cu-host-x86_64-unknown-linux-gnu.ll
+++ b/examples/huffman/main_test_cu-host-x86_64-unknown-linux-gnu.ll
--- a/examples/huffman/main_test_cu.cu
+++ b/examples/huffman/main_test_cu.cu
@ -1,225 +0,0 @@
-/*
- * PAVLE - Parallel Variable-Length Encoder for CUDA. Main file.
- *
- * Copyright (C) 2009 Ana Balevic <ana.balevic@gmail.com>
- * All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify it under
- * the terms of the MIT License. Read the full licence:
- * http://www.opensource.org/licenses/mit-license.php
- *
- * If you find this program useful, please contact me and reference PAVLE home
- * page in your work.
- *
- */
-
-#include "comparison_helpers.h"
-#include "cuda_helpers.h"
-#include "load_data.h"
-#include "print_helpers.h"
-#include "stats_logger.h"
-#include "stdafx.h"
-#include <cuda_runtime.h>
-#include <sys/time.h>
-
-//#include "vlc_kernel_gm32.cu"
-//#include "vlc_kernel_sm32.cu"
-#include "vlc_kernel_sm64huff.cu"
-//#include "vlc_kernel_dpt.cu"
-//#include "vlc_kernel_dptt.cu"
-//#include "scan_kernel.cu"
-#include "cpuencode.h"
-#include "pack_kernels.cu"
-#include "scan.cu"
-
-// Returns the current wall-clock time from gettimeofday() in microseconds.
-// The seconds are widened to long long BEFORE the multiplication so the
-// product cannot overflow on platforms where time_t is 32 bits wide.
-long long get_time() {
-  struct timeval tv;
-  gettimeofday(&tv, NULL);
-  return (long long)tv.tv_sec * 1000000LL + (long long)tv.tv_usec;
-}
-void runVLCTest(char *file_name, uint num_block_threads, uint num_blocks = 1);
-
-extern "C" void cpu_vlc_encode(unsigned int *indata, unsigned int num_elements,
-                               unsigned int *outdata, unsigned int *outsize,
-                               unsigned int *codewords,
-                               unsigned int *codewordlens);
-
-// Entry point: runs one VLC test per file named on the command line, or a
-// single run with a NULL file name and 1024 blocks when no file is given.
-// Always returns 0, including when InitCUDA() reports failure.
-int main(int argc, char *argv[]) {
-  if (!InitCUDA())
-    return 0;
-
-  unsigned int num_block_threads = 256;
-
-  if (argc <= 1) {
-    runVLCTest(NULL, num_block_threads, 1024);
-    return 0;
-  }
-
-  for (int arg = 1; arg < argc; arg++)
-    runVLCTest(argv[arg], num_block_threads);
-  return 0;
-}
-
-// Runs one variable-length-coding experiment.
-//   file_name         - input file (loadData exits the process when NULL)
-//   num_block_threads - threads per block for the encoding kernel
-//   num_blocks        - block count; overwritten by initParams when a file
-//                       name is given
-// Steps: size the problem, load the data and its Huffman tables, encode on
-// the CPU as reference, run the sm64huff kernel NT times and, when TESTING is
-// defined, pack the GPU output and compare it with the CPU reference.
-// Every host and device buffer allocated here is released before returning
-// (the original leaked the host buffer cw32idx).
-void runVLCTest(char *file_name, uint num_block_threads, uint num_blocks) {
-  printf("CUDA! Starting VLC Tests!\n");
-  unsigned int num_elements; // set by initParams
-  unsigned int mem_size;     // set by initParams
-  unsigned int symbol_type_size = sizeof(int);
-  double H; // entropy
-  initParams(file_name, num_block_threads, num_blocks, num_elements, mem_size,
-             symbol_type_size);
-  printf("Parameters: num_elements: %d, num_blocks: %d, num_block_threads: "
-         "%d\n----------------------------\n",
-         num_elements, num_blocks, num_block_threads);
-
-  //////// HOST BUFFERS ///////////////
-  uint *sourceData = (uint *)malloc(mem_size);
-  uint *destData = (uint *)malloc(mem_size);
-  uint *crefData = (uint *)malloc(mem_size);
-
-  uint *codewords = (uint *)malloc(NUM_SYMBOLS * symbol_type_size);
-  uint *codewordlens = (uint *)malloc(NUM_SYMBOLS * symbol_type_size);
-
-  uint *cw32 = (uint *)malloc(mem_size);
-  uint *cw32len = (uint *)malloc(mem_size);
-  uint *cw32idx = (uint *)malloc(mem_size);
-
-  uint *cindex2 = (uint *)malloc(num_blocks * sizeof(int));
-
-  // loadData accumulates into codewords, so the tables must start zeroed.
-  memset(sourceData, 0, mem_size);
-  memset(destData, 0, mem_size);
-  memset(crefData, 0, mem_size);
-  memset(cw32, 0, mem_size);
-  memset(cw32len, 0, mem_size);
-  memset(cw32idx, 0, mem_size);
-  memset(codewords, 0, NUM_SYMBOLS * symbol_type_size);
-  memset(codewordlens, 0, NUM_SYMBOLS * symbol_type_size);
-  memset(cindex2, 0, num_blocks * sizeof(int));
-
-  //////// LOAD DATA ///////////////
-  loadData(file_name, sourceData, codewords, codewordlens, num_elements,
-           mem_size, H);
-
-  //////// DEVICE BUFFERS ///////////////
-  unsigned int *d_sourceData, *d_destData, *d_destDataPacked;
-  unsigned int *d_codewords, *d_codewordlens;
-  unsigned int *d_cw32, *d_cw32len, *d_cw32idx, *d_cindex, *d_cindex2;
-
-  CUDA_SAFE_CALL(cudaMalloc((void **)&d_sourceData, mem_size));
-  CUDA_SAFE_CALL(cudaMalloc((void **)&d_destData, mem_size));
-  CUDA_SAFE_CALL(cudaMalloc((void **)&d_destDataPacked, mem_size));
-
-  CUDA_SAFE_CALL(
-      cudaMalloc((void **)&d_codewords, NUM_SYMBOLS * symbol_type_size));
-  CUDA_SAFE_CALL(
-      cudaMalloc((void **)&d_codewordlens, NUM_SYMBOLS * symbol_type_size));
-
-  CUDA_SAFE_CALL(cudaMalloc((void **)&d_cw32, mem_size));
-  CUDA_SAFE_CALL(cudaMalloc((void **)&d_cw32len, mem_size));
-  CUDA_SAFE_CALL(cudaMalloc((void **)&d_cw32idx, mem_size));
-
-  CUDA_SAFE_CALL(
-      cudaMalloc((void **)&d_cindex, num_blocks * sizeof(unsigned int)));
-  CUDA_SAFE_CALL(
-      cudaMalloc((void **)&d_cindex2, num_blocks * sizeof(unsigned int)));
-
-  CUDA_SAFE_CALL(
-      cudaMemcpy(d_sourceData, sourceData, mem_size, cudaMemcpyHostToDevice));
-  CUDA_SAFE_CALL(cudaMemcpy(d_codewords, codewords,
-                            NUM_SYMBOLS * symbol_type_size,
-                            cudaMemcpyHostToDevice));
-  CUDA_SAFE_CALL(cudaMemcpy(d_codewordlens, codewordlens,
-                            NUM_SYMBOLS * symbol_type_size,
-                            cudaMemcpyHostToDevice));
-  CUDA_SAFE_CALL(
-      cudaMemcpy(d_destData, destData, mem_size, cudaMemcpyHostToDevice));
-
-  dim3 grid_size(num_blocks, 1, 1);
-  dim3 block_size(num_block_threads, 1, 1);
-  unsigned int sm_size;
-
-  unsigned int NT = 10; // number of runs for each execution time
-
-  //////////////////* CPU ENCODER *///////////////////////////////////
-  unsigned int refbytesize;
-  long long timer = get_time();
-  cpu_vlc_encode((unsigned int *)sourceData, num_elements,
-                 (unsigned int *)crefData, &refbytesize, codewords,
-                 codewordlens);
-  float msec = (float)((get_time() - timer) / 1000.0);
-  printf("CPU Encoding time (CPU): %f (ms)\n", msec);
-  printf("CPU Encoded to %d [B]\n", refbytesize);
-  // Size of the reference output in 32-bit words, rounded up.
-  unsigned int num_ints = refbytesize / 4 + ((refbytesize % 4 == 0) ? 0 : 1);
-  //////////////////* END CPU *///////////////////////////////////
-
-  //////////////////* SM64HUFF KERNEL *///////////////////////////////////
-  grid_size.x = num_blocks;
-  block_size.x = num_block_threads;
-  // NOTE(review): sm_size is computed but never passed to the launch below
-  // (no dynamic shared-memory argument); confirm whether the kernel needs it.
-  sm_size = block_size.x * sizeof(unsigned int);
-#ifdef CACHECWLUT
-  sm_size = 2 * NUM_SYMBOLS * sizeof(int) + block_size.x * sizeof(unsigned int);
-#endif
-
-  for (unsigned int run = 0; run < NT; run++) {
-    vlc_encode_kernel_sm64huff<<<grid_size, block_size>>>(
-        d_sourceData, d_codewords, d_codewordlens,
-#ifdef TESTING
-        d_cw32, d_cw32len, d_cw32idx,
-#endif
-        d_destData, d_cindex); // testedOK2
-    cudaThreadSynchronize();
-  }
-  //////////////////* END KERNEL *///////////////////////////////////
-
-#ifdef TESTING
-  unsigned int num_scan_elements = grid_size.x;
-  preallocBlockSums(num_scan_elements);
-  cudaMemset(d_destDataPacked, 0, mem_size);
-  printf("Num_blocks to be passed to scan is %d.\n", num_scan_elements);
-  prescanArray(d_cindex2, d_cindex, num_scan_elements);
-  pack2<<<num_scan_elements / 32, 32>>>(
-      (unsigned int *)d_destData, d_cindex, d_cindex2,
-      (unsigned int *)d_destDataPacked, num_elements / num_scan_elements);
-  cudaThreadSynchronize();
-  CUT_CHECK_ERROR("Pack2 Kernel execution failed\n");
-  deallocBlockSums();
-
-  CUDA_SAFE_CALL(
-      cudaMemcpy(destData, d_destDataPacked, mem_size, cudaMemcpyDeviceToHost));
-  compare_vectors((unsigned int *)crefData, (unsigned int *)destData, num_ints);
-#endif
-
-  //////// CLEAN UP ///////////////
-  free(sourceData);
-  free(destData);
-  free(codewords);
-  free(codewordlens);
-  free(cw32);
-  free(cw32len);
-  free(cw32idx); // was leaked by the original
-  free(crefData);
-  CUDA_SAFE_CALL(cudaFree(d_sourceData));
-  CUDA_SAFE_CALL(cudaFree(d_destData));
-  CUDA_SAFE_CALL(cudaFree(d_destDataPacked));
-  CUDA_SAFE_CALL(cudaFree(d_codewords));
-  CUDA_SAFE_CALL(cudaFree(d_codewordlens));
-  CUDA_SAFE_CALL(cudaFree(d_cw32));
-  CUDA_SAFE_CALL(cudaFree(d_cw32len));
-  CUDA_SAFE_CALL(cudaFree(d_cw32idx));
-  CUDA_SAFE_CALL(cudaFree(d_cindex));
-  CUDA_SAFE_CALL(cudaFree(d_cindex2));
-  free(cindex2);
-}
--- a/examples/huffman/pabio_kernels_v2.cu
+++ b/examples/huffman/pabio_kernels_v2.cu
@ -1,62 +0,0 @@
-/*
- * Copyright Ana Balevic, 2008-2009. All rights reserved.
- */
-#ifndef _PABIO_KERNEL2_H_
-#define _PABIO_KERNEL2_H_
-
-#include "parameters.h"
-
-/* PARALLEL PUT BITS IMPLEMENTATION (CUDA1.1+ compatible)
-*  Writes the low numbits bits of codeword into the destination word out[kc],
-*  starting startbit bits below the most significant bit.
-*  Implementation comments:
-*  The first atomic operation is a necessary preparation: it clears only the bits
-*  that the codeword is going to occupy, so that the following atomicOr can set them.
-*  It is compiled out when MEMSET0 is defined (destination already all 0s).
-*  The second atomic operation sets these bits to the value stored in the codeword;
-*  the other bits are left untouched.
-*  The caller must ensure startbit + numbits <= 32 and numbits >= 1.
-*  TODOs: benchmark performance 1) gm atomics vs sm atomics; 2) memset at init time vs. atomicOr
-*/
-__device__ void static put_bits_atomic2(unsigned int* out, unsigned int kc,
-								unsigned int startbit, unsigned int numbits,
-								unsigned int codeword) {
-	unsigned int cw32 = codeword;
-	unsigned int restbits = 32-startbit-numbits;	//unused bits below the codeword
-
-	/* 1. Prepare the memory location */
-#ifndef MEMSET0 //Can remove this part if the contents of the memory are already set to all 0s
-	//numbits == 32 needs the explicit all-ones mask: shifting by the full word width is undefined
-	unsigned int mask = (numbits >= 32) ? 0xFFFFFFFFu : ((1u<<numbits)-1u);  // -> 0000...001111
-	mask<<=restbits;  //move the ones over the target field -> 0000...001111000
-	atomicAnd(&out[kc], ~mask);		//clear numbits bits of the destination, starting at startbit
-#endif
-
-	/* 2. Write the codeword */
-	cw32 = cw32<<restbits;
-	atomicOr(&out[kc], cw32);
-}
-
-
-
-/* PARALLEL PUT BITS IMPLEMENTATION (CUDA1.1+ compatible)
-*  Same contract as put_bits_atomic2, but checks whether the field to be written
-*  covers the whole memory location and, if so, stores it without atomics.
-*  Experience: no benefits, even a bit slower on CUDA.
-*  The caller must ensure startbit + numbits <= 32 and numbits >= 1.
-*/
-__device__ void static put_bits_atomic2a(unsigned int* out, unsigned int kc,
-								unsigned int startbit, unsigned int numbits,
-								unsigned int codeword) {
-	unsigned int cw32 = codeword;
-	unsigned int restbits = 32-startbit-numbits;	//unused bits below the codeword
-
-	/* 1. Prepare the memory location */
-#ifndef MEMSET0 //Can remove this part if the contents of the memory are already set to all 0s
-	//numbits == 32 needs the explicit all-ones mask: shifting by the full word width is undefined
-	unsigned int mask = (numbits >= 32) ? 0xFFFFFFFFu : ((1u<<numbits)-1u);  // -> 0000...001111
-	mask<<=restbits;  //move the ones over the target field -> 0000...001111000
-	atomicAnd(&out[kc], ~mask);		//clear numbits bits of the destination, starting at startbit
-#endif
-
-	/* 2. Write the codeword */
-	if (startbit == 0 && restbits == 0) {
-		//the codeword fills the entire word: a plain store is enough
-		out[kc] = cw32;
-	} else {
-		cw32 = cw32<<restbits;
-		atomicOr(&out[kc], cw32);
-	}
-}
-#endif //ifndef _PABIO_KERNEL2_H_
--- a/examples/huffman/pack_kernels.cu
+++ b/examples/huffman/pack_kernels.cu
@ -1,43 +0,0 @@
-#ifndef _PACK_KERNELS_H_
-#define _PACK_KERNELS_H_
-#include "parameters.h"
-
-// Packs the per-block encoder output into one contiguous bit stream.
-//   srcData - encoded blocks; the block of thread `tid` starts at dword
-//             tid * original_num_block_elements
-//   cindex  - cindex[tid] is read as the number of bits produced for block tid
-//   cindex2 - cindex2[tid] is read as the destination bit position of block
-//             tid (presumably the exclusive prefix sum of cindex -- confirm
-//             against the caller)
-//   dstData - packed output; updated with atomicOr where a dword may be shared
-//             with a neighbouring block, so it is expected to be zeroed
-//             beforehand
-//   original_num_block_elements - dwords reserved per block in srcData
-// One thread handles one block. Each source dword is shifted right by
-// `bit` and its low `bit` bits are carried over into the next output dword.
-// NOTE(review): there is no bounds check on tid; the launch must create
-// exactly one thread per block.
-__global__ static void pack2(unsigned int *srcData, unsigned int *cindex,
-                             unsigned int *cindex2, unsigned int *dstData,
-                             unsigned int original_num_block_elements) {
-  unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;
-
-  // source index
-  unsigned int offset = tid * original_num_block_elements; // DPB,
-  unsigned int bitsize = cindex[tid];
-
-  // destination index
-  unsigned int pos = cindex2[tid], dword = pos / 32, bit = pos % 32;
-
-  unsigned int i, dw, tmp;
-  dw = srcData[offset]; // load the first dword from srcData[]
-  tmp = dw >> bit;      // cut off those bits that do not fit into the initial
-                        // location in destData[]
-  atomicOr(&dstData[dword], tmp); // fill up this initial location
-  tmp = (bit == 0) ? 0 : (dw << 32 - bit);
-  for (i = 1; i < bitsize / 32;
-       i++) { // from now on, we have exclusive access to destData[]
-    dw = srcData[offset + i]; // load next dword from srcData[]
-    tmp |= dw >> bit;         // fill up tmp
-    dstData[dword + i] = tmp; // write complete dword to destData[]
-    tmp = (bit == 0) ? 0 : (dw << 32 - bit);
-  }
-  // exclusive access to dstData[] ends here
-  // the remaining block can, or rather should be further optimized
-  // write the remaining bits in tmp, UNLESS bit is 0 and bitsize is divisible
-  // by 32, in this case do nothing
-  if (bit != 0 || bitsize % 32 != 0)
-    atomicOr(&dstData[dword + i], tmp);
-  if (bitsize % 32 != 0) {
-    dw = srcData[offset + i];
-    atomicOr(&dstData[dword + i], dw >> bit);
-    atomicOr(&dstData[dword + i + 1], (bit == 0) ? 0 : (dw << 32 - bit));
-  }
-}
-
-#endif
--- a/examples/huffman/parameters.h
+++ b/examples/huffman/parameters.h
@ -1,27 +0,0 @@
-#ifndef _PARAMS_H_
-#define _PARAMS_H_
-
-// Short aliases used throughout the encoder sources.
-typedef unsigned int uint;
-typedef unsigned char uint8;
-
-#define BENCH 0
-/* 0 - MEASURE TIME, NO TESTING
-** 1 - TEST
-** 2 - TEST & VERBOSE
-*/
-// NOTE(review): the level list above sits between BENCH and TESTING; it is
-// not clear from this header which of the two it documents -- confirm.
-// TESTING is a presence switch: code guarded by #ifdef TESTING is compiled in.
-#define TESTING
-
-#define DPT 4 // data (dwords) per thread
-
-// Presence switches for kernel variants; the trailing notes give the largest
-// DPT usable with each option.
-#define CACHECWLUT // MAX DPT = 8
-//#define CACHESRCDATA		// MAX DPT = 4
-
-#define SMATOMICS
-
-// When defined, the put-bits helpers skip clearing the destination bits
-// because the output memory is assumed to be zeroed already.
-#define MEMSET0
-
-#define MAX_SM_BLOCK_SIZE_GPU 16384 // B
-
-#define NUM_SYMBOLS 256 // fixed to 256.
-
-#endif
--- a/examples/huffman/print_helpers.h
+++ b/examples/huffman/print_helpers.h
@ -1,217 +0,0 @@
-#ifndef _PRINT_HELPERS_H_
-#define _PRINT_HELPERS_H_
-
-#include "parameters.h"
-#include <stdio.h>
-
-// Debug dump: writes each of the num_ints words in `data` to the text file
-// `filename` as one line of 32 '0'/'1' characters, most significant bit first.
-// If the file cannot be opened the error is reported with perror() and
-// nothing is written (the original passed the NULL stream on to fprintf).
-__inline void printdbg_data_bin(const char *filename, unsigned int *data,
-                                unsigned int num_ints) {
-  FILE *dump = fopen(filename, "wt");
-  if (dump == NULL) {
-    perror(filename);
-    return;
-  }
-  for (unsigned int i = 0; i < num_ints; i++) {
-    // Walk a single-bit mask from bit 31 down to bit 0.
-    for (unsigned int mask = 0x80000000; mask != 0; mask >>= 1)
-      fputc((data[i] & mask) ? '1' : '0', dump);
-    fputc('\n', dump);
-  }
-  fclose(dump);
-}
-// Debug dump: writes the num_ints words in `data` to the text file `filename`
-// as "index: value" lines. If the file cannot be opened the error is reported
-// with perror() and nothing is written (the original passed the NULL stream
-// on to fprintf).
-__inline void printdbg_data_int(const char *filename, unsigned int *data,
-                                unsigned int num_ints) {
-  FILE *dump = fopen(filename, "wt");
-  if (dump == NULL) {
-    perror(filename);
-    return;
-  }
-  for (unsigned int i = 0; i < num_ints; i++) {
-    fprintf(dump, "%d: %d\n", i, data[i]);
-  }
-  fclose(dump);
-}
-
-// Debug dump of per-element encoder output to an already open stream.
-// For each of the num_elements entries it prints the bit position
-// cw32idx[i], the derived dword index and start bit, the code length
-// cw32len[i], and the low cw32len[i] bits of cw32[i], highest bit first.
-// Bits are extracted with a bounded shift, so a length of 0 prints no bits
-// and a length above 32 prints leading zeros; the original shifted a mask by
-// 32 - length, which is undefined for both of those lengths.
-__inline void printdbg_gpu_data_detailed(FILE *gpudump, unsigned int *cw32,
-                                         unsigned int *cw32len,
-                                         unsigned int *cw32idx,
-                                         unsigned int num_elements) {
-  for (unsigned int i = 0; i < num_elements; i++) {
-    fprintf(gpudump, "bp: %d, kc: %d, startbit: %d, cwlen: %d, cw:\t\t",
-            cw32idx[i], cw32idx[i] / 32, cw32idx[i] % 32, cw32len[i]);
-    // print codeword:
-    for (unsigned int j = cw32len[i]; j-- > 0;) {
-      // Positions beyond the 32-bit word hold no data and print as 0.
-      const unsigned int bit = (j < 32) ? ((cw32[i] >> j) & 1u) : 0u;
-      if (bit)
-        fprintf(gpudump, "1");
-      else
-        fprintf(gpudump, "0");
-    }
-    fprintf(gpudump, "\n");
-  }
-}
-
-// Same dump as printdbg_gpu_data_detailed, written to the text file
-// `filename` (created or truncated). If the file cannot be opened the error
-// is reported with perror() and nothing is written (the original passed the
-// NULL stream on to fprintf). The formatting is done by
-// printdbg_gpu_data_detailed instead of a second copy of the same loop.
-__inline void printdbg_gpu_data_detailed2(const char *filename,
-                                          unsigned int *cw32,
-                                          unsigned int *cw32len,
-                                          unsigned int *cw32idx,
-                                          unsigned int num_elements) {
-  FILE *gpudump = fopen(filename, "wt");
-  if (gpudump == NULL) {
-    perror(filename);
-    return;
-  }
-  printdbg_gpu_data_detailed(gpudump, cw32, cw32len, cw32idx, num_elements);
-  fclose(gpudump);
-}
-
-/************************************************************************/
-/* BIT PRINTS                                                         */
-/************************************************************************/
-// Prints the 8 bits of `number` to stdout, most significant bit first,
-// followed by a single space.
-__inline void printBits(unsigned char number) {
-  for (int bit = 7; bit >= 0; --bit) {
-    if ((number >> bit) & 1)
-      printf("1");
-    else
-      printf("0");
-  }
-  printf(" ");
-}
-// Prints the 32 bits of `number` to stdout, most significant bit first,
-// followed by a newline.
-__inline void print32Bits(unsigned int number) {
-  for (int bit = 31; bit >= 0; --bit) {
-    if ((number >> bit) & 1u)
-      printf("1");
-    else
-      printf("0");
-  }
-  printf("\n");
-}
-// Prints a 32-character ruler line: '|' at the 1-based column `marker`, '.'
-// everywhere else (all dots when marker is outside 1..32), then a newline.
-__inline void print32BitsM(unsigned int marker) {
-  for (unsigned int column = 1; column <= 32; ++column) {
-    if (column == marker)
-      printf("|");
-    else
-      printf(".");
-  }
-  printf("\n");
-}
-// Prints `len` bytes of `a` to stdout between two banner lines; each byte is
-// shown as "a[index]=value" followed by its 8 bits.
-__inline void print_array_char_as_bits(unsigned char *a, unsigned int len) {
-
-  printf(
-      " ========================= Printing vector =======================\n");
-  printf("Total number of elements is %d\n", len);
-  unsigned int idx = 0;
-  while (idx < len) {
-    printf("a[%d]=%d \t", idx, a[idx]);
-    printBits(a[idx]);
-    printf("\n");
-    ++idx;
-  }
-  printf("\n");
-  printf(
-      " ==================================================================\n");
-}
-
-// Prints `len` words of `a` to stdout between two banner lines, each word as
-// a line of 32 bits followed by an empty line.
-__inline void print_array_ints_as_bits(unsigned int *a, unsigned int len) {
-
-  printf(
-      " ========================= Printing vector =======================\n");
-  unsigned int idx = 0;
-  while (idx < len) {
-    print32Bits(a[idx]);
-    printf("\n");
-    ++idx;
-  }
-  printf("\n");
-  printf(
-      " ==================================================================\n");
-}
-
-// Prints two arrays of `len` words side by side for visual comparison: for
-// every index the bits of a[i], then the bits of b[i], then an empty line.
-__inline void print_compare_array_ints_as_bits(unsigned int *a, unsigned int *b,
-                                               unsigned int len) {
-
-  printf(
-      " ========================= Printing vector =======================\n");
-  unsigned int idx = 0;
-  while (idx < len) {
-    print32Bits(a[idx]);
-    print32Bits(b[idx]);
-    printf("\n");
-    ++idx;
-  }
-  printf("\n");
-  printf(
-      " ==================================================================\n");
-}
-
-// Prints `len` words of `a` to stdout between two banner lines, each in
-// upper-case hexadecimal ("%#X") followed by a tab.
-__inline void print_array_in_hex(unsigned int *a, unsigned int len) {
-
-  printf(
-      " ========================= Printing vector =======================\n");
-  for (unsigned int *word = a; word != a + len; ++word) {
-    printf("%#X\t", *word);
-  }
-
-  printf("\n");
-  printf(
-      " ==================================================================\n");
-}
-
-/************************************************************************/
-/* ARRAY PRINTS                                                        */
-/***********************************************************************/
-
-// Prints `len` elements of `a` to stdout as "a[index]=value" between two
-// banner lines.
-// NOTE(review): every element is formatted with "%d", so T is expected to be
-// a type that printf accepts for %d -- confirm at the call sites.
-template <typename T> __inline void print_array(T *a, unsigned int len) {
-
-  printf(
-      " ========================= Printing vector =======================\n");
-  printf("Total number of elements is %d\n", len);
-  unsigned int idx = 0;
-  while (idx < len) {
-    printf("a[%d]=%d \t", idx, a[idx]);
-    ++idx;
-  }
-
-  printf("\n");
-  printf(
-      " ==================================================================\n");
-}
-
-// Prints `rle_len` run-length pairs to stdout as "(symbol,count)" entries;
-// symbols and counts come from two parallel arrays.
-// NOTE(review): both values are formatted with "%d", so ST and CT are
-// expected to be types that printf accepts for %d -- confirm at the call
-// sites.
-template <typename ST, typename CT>
-__inline void print_rled_arrays(ST *rle_symbols, CT *rle_counts,
-                                unsigned int rle_len) {
-  printf(" ========================= Printing RLE vector "
-         "=======================\n");
-  printf(" Total number of RL Pairs is %d\n", rle_len);
-  unsigned int pair = 0;
-  while (pair < rle_len) {
-    ST current_symbol = rle_symbols[pair];
-    CT current_count = rle_counts[pair];
-    printf("(%d,%d) ,\t", current_symbol, current_count);
-    ++pair;
-  }
-  printf("\n");
-}
-
-// Prints `rle_len` packed run-length pairs to stdout as "(symbol,count)"
-// entries. Each 32-bit word holds the symbol in its upper half-word and the
-// count in its lower half-word.
-__inline void print_packed_rle_array(unsigned int *rle, unsigned int rle_len) {
-  printf(" ========================= Printing RLE vector "
-         "=======================\n");
-  printf(" Total number of RL Pairs is %d\n", rle_len);
-  for (unsigned int k = 0; k < rle_len; k++) {
-    const unsigned int packed = rle[k];
-    const unsigned short current_symbol =
-        (unsigned short)(packed >> 16); // upper half-word
-    const unsigned short current_count =
-        (unsigned short)packed; // lower half-word (truncating cast)
-    printf("(%d,%d) ,\t", current_symbol, current_count);
-  }
-  printf("\n");
-}
-
-#endif // _PRINT_HELPERS_H_
--- a/examples/huffman/run.sh
+++ b/examples/huffman/run.sh
@ -1,20 +0,0 @@
-#!/bin/bash
-set -e
-# clang++ main_test_cu.cu  --cuda-path=/usr/local/cuda-10.1 --cuda-gpu-arch=sm_61 -L/usr/local/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v
-clang -c -emit-llvm cpuencode.cpp
-llvm-as main_test_cu-cuda-nvptx64-nvidia-cuda-sm_61.ll
-llvm-as main_test_cu-host-x86_64-unknown-linux-gnu.ll
-
-../../build/compilation/kernelTranslator main_test_cu-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
-../../build/compilation/hostTranslator main_test_cu-host-x86_64-unknown-linux-gnu.bc host.bc
-
-llc --relocation-model=pic --filetype=obj  kernel.bc
-llc --relocation-model=pic --filetype=obj  host.bc
-llc --relocation-model=pic --filetype=obj  cpuencode.bc
-
-g++ -Wall -L../../build/runtime \
-     -L../../build/runtime/threadPool -o pavle \
-     -fPIC -no-pie host.o kernel.o cpuencode.o -lc -lx86Runtime -lthreadPool -lpthread
-
-export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH
-./pavle ../../rodinia-data/huffman/test1024_H2.206587175259.in
--- a/examples/huffman/scan.cu
+++ b/examples/huffman/scan.cu
@ -1,216 +0,0 @@
-/*
- * Copyright 1993-2006 NVIDIA Corporation.  All rights reserved.
- *
- * NOTICE TO USER:
- *
- * This source code is subject to NVIDIA ownership rights under U.S. and
- * international Copyright laws.
- *
- * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
- * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
- * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
- * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
- * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
- * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
- * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
- * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
- * OR PERFORMANCE OF THIS SOURCE CODE.
- *
- * U.S. Government End Users.  This source code is a "commercial item" as
- * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting  of
- * "commercial computer software" and "commercial computer software
- * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
- * and is provided to the U.S. Government only as a commercial end item.
- * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
- * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
- * source code with only those rights set forth herein.
- */
-
-#ifndef _PRESCAN_CU_
-#define _PRESCAN_CU_
-
-// includes, kernels
-#include "cutil.h"
-#include "scanLargeArray_kernel.cu"
-#include <assert.h>
-#include <stdio.h>
-
-// Larger of a and b. Every argument is parenthesized so the macro expands
-// correctly when it is given compound expressions. Each argument may still be
-// evaluated twice, so do not pass expressions with side effects.
-#define max(a, b) ((a) > (b) ? (a) : (b))
-// True when n has at most one bit set; note that this also holds for n == 0.
-inline bool isPowerOfTwo(int n) {
-  // n & (n - 1) clears the lowest set bit of n.
-  const int withoutLowestBit = n & (n - 1);
-  return withoutLowestBit == 0;
-}
-
-// Returns 2 raised to the binary exponent of (float)n. For positive n that
-// float represents exactly, that is the largest power of two not above n.
-inline int floorPow2(int n) {
-#ifdef WIN32
-  // logb yields the unbiased binary exponent directly.
-  const int exponent = (int)logb((float)n);
-  return 1 << exponent;
-#else
-  // frexp stores e with n = m * 2^e and m in [0.5, 1), so the exponent that is
-  // wanted here is e - 1.
-  int e;
-  frexp((float)n, &e);
-  return 1 << (e - 1);
-#endif
-}
-
-#define BLOCK_SIZE 256
-
-static unsigned int **g_scanBlockSums;
-static unsigned int g_numEltsAllocated = 0;
-static unsigned int g_numLevelsAllocated = 0;
-
-// Allocate the per-level block-sum buffers used by the recursive scan.
-//
-// maxNumElements is the largest element count that will be scanned. One device
-// buffer is allocated for every level whose block count exceeds one; the
-// buffers are recorded in g_scanBlockSums and their number in
-// g_numLevelsAllocated. The function must not be called while buffers are
-// still allocated (checked by the assert below). The process exits if the
-// host-side pointer table cannot be allocated.
-static void preallocBlockSums(unsigned int maxNumElements) {
-  assert(g_numEltsAllocated == 0); // buffers must not be allocated already
-
-  g_numEltsAllocated = maxNumElements;
-
-  unsigned int blockSize = BLOCK_SIZE; // max size of the thread blocks
-  unsigned int numElts = maxNumElements;
-  int level = 0;
-
-  // First pass: count the levels that need a block-sum buffer.
-  do {
-    unsigned int numBlocks =
-        max(1, (int)ceil((float)numElts / (2.f * blockSize)));
-    if (numBlocks > 1)
-      level++;
-    numElts = numBlocks;
-  } while (numElts > 1);
-
-  g_scanBlockSums = (unsigned int **)malloc(level * sizeof(unsigned int *));
-  // malloc(0) may legitimately return NULL, so only a failed non-empty request
-  // is an error. Stop here instead of writing through NULL in the second pass.
-  if (g_scanBlockSums == NULL && level > 0) {
-    fprintf(stderr, "preallocBlockSums: out of host memory\n");
-    exit(EXIT_FAILURE);
-  }
-  g_numLevelsAllocated = level;
-  numElts = maxNumElements;
-  level = 0;
-
-  // Second pass: same walk, this time allocating one device buffer per level.
-  do {
-    unsigned int numBlocks =
-        max(1, (int)ceil((float)numElts / (2.f * blockSize)));
-    if (numBlocks > 1)
-      CUDA_SAFE_CALL(cudaMalloc((void **)&g_scanBlockSums[level++],
-                                numBlocks * sizeof(unsigned int)));
-    numElts = numBlocks;
-  } while (numElts > 1);
-
-  CUT_CHECK_ERROR("preallocBlockSums");
-}
-
-// Free every device block-sum buffer and the host table that holds them, then
-// zero the bookkeeping globals (preallocBlockSums asserts that
-// g_numEltsAllocated is 0 on entry).
-static void deallocBlockSums() {
-  unsigned int level = 0;
-  while (level < g_numLevelsAllocated) {
-    cudaFree(g_scanBlockSums[level]);
-    ++level;
-  }
-
-  CUT_CHECK_ERROR("deallocBlockSums");
-
-  free((void **)g_scanBlockSums);
-
-  g_numLevelsAllocated = 0;
-  g_numEltsAllocated = 0;
-  g_scanBlockSums = 0;
-}
-
-// Scan numElements values of inArray into outArray, recursing on the
-// per-block sums when the input needs more than one thread block.
-//
-// outArray    - receives the scanned values; passed straight to the kernels
-// inArray     - input values; passed straight to the kernels
-// numElements - number of elements to scan
-// level       - recursion depth; selects g_scanBlockSums[level] as the
-//               block-sum buffer used by this call
-static void prescanArrayRecursive(unsigned int *outArray,
-                                  const unsigned int *inArray, int numElements,
-                                  int level) {
-  unsigned int blockSize = BLOCK_SIZE; // max size of the thread blocks
-  unsigned int numBlocks =
-      max(1, (int)ceil((float)numElements / (2.f * blockSize)));
-  unsigned int numThreads;
-
-  // Each thread handles two elements, so a single block needs half as many
-  // threads as elements (rounded via floorPow2 for non-power-of-two sizes).
-  if (numBlocks > 1)
-    numThreads = blockSize;
-  else if (isPowerOfTwo(numElements))
-    numThreads = numElements / 2;
-  else
-    numThreads = floorPow2(numElements);
-
-  unsigned int numEltsPerBlock = numThreads * 2;
-
-  // if this is a non-power-of-2 array, the last block will be non-full
-  // compute the smallest power of 2 able to compute its scan.
-  unsigned int numEltsLastBlock =
-      numElements - (numBlocks - 1) * numEltsPerBlock;
-  unsigned int numThreadsLastBlock = max(1, numEltsLastBlock / 2);
-  unsigned int np2LastBlock = 0;
-  unsigned int sharedMemLastBlock = 0;
-
-  if (numEltsLastBlock != numEltsPerBlock) {
-    np2LastBlock = 1;
-
-    if (!isPowerOfTwo(numEltsLastBlock))
-      numThreadsLastBlock = floorPow2(numEltsLastBlock);
-
-    unsigned int extraSpace = (2 * numThreadsLastBlock) / NUM_BANKS;
-    sharedMemLastBlock =
-        sizeof(unsigned int) * (2 * numThreadsLastBlock + extraSpace);
-  }
-
-  // padding space is used to avoid shared memory bank conflicts
-  // NOTE(review): sharedMemSize and sharedMemLastBlock are computed here but
-  // are not passed to any of the kernel launches below, which specify only
-  // grid and block dimensions.
-  unsigned int extraSpace = numEltsPerBlock / NUM_BANKS;
-  unsigned int sharedMemSize =
-      sizeof(unsigned int) * (numEltsPerBlock + extraSpace);
-
-#ifdef DEBUG
-  if (numBlocks > 1) {
-    assert(g_numEltsAllocated >= numElements);
-  }
-#endif
-
-  // setup execution parameters
-  // if NP2, we process the last block separately
-  dim3 grid(max(1, numBlocks - np2LastBlock), 1, 1);
-  dim3 threads(numThreads, 1, 1);
-
-  // make sure there are no CUDA errors before we start
-  CUT_CHECK_ERROR("prescanArrayRecursive before kernels");
-
-  // execute the scan
-  if (numBlocks > 1) {
-    // Multi-block case: scan the full blocks and store each block's total.
-    prescan<true, false><<<grid, threads>>>(
-        outArray, inArray, g_scanBlockSums[level], numThreads * 2, 0, 0);
-    cudaThreadSynchronize();
-    CUT_CHECK_ERROR("prescanWithBlockSums");
-    if (np2LastBlock) {
-      // The partial last block is scanned by its own single-block launch.
-      prescan<true, true><<<1, numThreadsLastBlock>>>(
-          outArray, inArray, g_scanBlockSums[level], numEltsLastBlock,
-          numBlocks - 1, numElements - numEltsLastBlock);
-      cudaThreadSynchronize();
-      CUT_CHECK_ERROR("prescanNP2WithBlockSums");
-    }
-
-    // After scanning all the sub-blocks, we are mostly done.  But now we
-    // need to take all of the last values of the sub-blocks and scan those.
-    // This will give us a new value that must be added to each block to
-    // get the final results.
-    // recursive (CPU) call
-    prescanArrayRecursive(g_scanBlockSums[level], g_scanBlockSums[level],
-                          numBlocks, level + 1);
-
-    // Add each block's scanned total back onto that block's elements.
-    uniformAdd<<<grid, threads>>>(outArray, g_scanBlockSums[level],
-                                  numElements - numEltsLastBlock, 0, 0);
-    cudaThreadSynchronize();
-    CUT_CHECK_ERROR("uniformAdd");
-    if (np2LastBlock) {
-      uniformAdd<<<1, numThreadsLastBlock>>>(outArray, g_scanBlockSums[level],
-                                             numEltsLastBlock, numBlocks - 1,
-                                             numElements - numEltsLastBlock);
-      cudaThreadSynchronize();
-      CUT_CHECK_ERROR("uniformAdd");
-    }
-  } else if (isPowerOfTwo(numElements)) {
-    // Single block, power-of-two size: no block sums and no padding checks.
-    prescan<false, false>
-        <<<grid, threads>>>(outArray, inArray, 0, numThreads * 2, 0, 0);
-    cudaThreadSynchronize();
-    CUT_CHECK_ERROR("prescan");
-  } else {
-    // Single block, non-power-of-two size: kernel bounds-checks against n.
-    prescan<false, true>
-        <<<grid, threads>>>(outArray, inArray, 0, numElements, 0, 0);
-    cudaThreadSynchronize();
-    CUT_CHECK_ERROR("prescanNP2");
-  }
-}
-
-// Entry point for scanning a whole array: starts the recursive scan at
-// block-sum level 0.
-static void prescanArray(unsigned int *outArray, unsigned int *inArray,
-                         int numElements) {
-  const int rootLevel = 0;
-  prescanArrayRecursive(outArray, inArray, numElements, rootLevel);
-}
-
-#endif // _PRESCAN_CU_
--- a/examples/huffman/scanLargeArray_kernel.cu
+++ b/examples/huffman/scanLargeArray_kernel.cu
@ -1,237 +0,0 @@
-/*
- * Copyright 1993-2006 NVIDIA Corporation.  All rights reserved.
- *
- * NOTICE TO USER:
- *
- * This source code is subject to NVIDIA ownership rights under U.S. and
- * international Copyright laws.
- *
- * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
- * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
- * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
- * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
- * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
- * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
- * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
- * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
- * OR PERFORMANCE OF THIS SOURCE CODE.
- *
- * U.S. Government End Users.  This source code is a "commercial item" as
- * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting  of
- * "commercial computer software" and "commercial computer software
- * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
- * and is provided to the U.S. Government only as a commercial end item.
- * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
- * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
- * source code with only those rights set forth herein.
- */
-
-#ifndef _SCAN_BEST_KERNEL_CU_
-#define _SCAN_BEST_KERNEL_CU_
-
-// Define this to more rigorously avoid bank conflicts,
-// even at the lower (root) levels of the tree
-// Note that due to the higher addressing overhead, performance
-// is lower with ZERO_BANK_CONFLICTS enabled.  It is provided
-// as an example.
-//#define ZERO_BANK_CONFLICTS
-
-// 16 banks on G80
-#define NUM_BANKS 16
-#define LOG_NUM_BANKS 4
-
-// Padding, in elements, inserted before shared-memory slot `index`.
-#ifdef ZERO_BANK_CONFLICTS
-// One extra slot per NUM_BANKS elements plus one per NUM_BANKS^2 elements.
-// '+' binds tighter than '>>', so each shift needs its own parentheses for
-// the two terms to be summed as intended.
-#define CONFLICT_FREE_OFFSET(index)                                            \
-  (((index) >> LOG_NUM_BANKS) + ((index) >> (2 * LOG_NUM_BANKS)))
-#else
-// One extra slot per NUM_BANKS elements.
-#define CONFLICT_FREE_OFFSET(index) ((index) >> LOG_NUM_BANKS)
-#endif
-
-///////////////////////////////////////////////////////////////////////////////
-// Work-efficient compute implementation of scan, one thread per 2 elements
-// Work-efficient: O(log(n)) steps, and O(n) adds.
-// Also shared storage efficient: Uses n + n/NUM_BANKS shared memory -- no
-// ping-ponging. Also avoids most bank conflicts using single-element offsets
-// every NUM_BANKS elements.
-//
-// In addition, If ZERO_BANK_CONFLICTS is defined, uses
-//     n + n/NUM_BANKS + n/(NUM_BANKS*NUM_BANKS)
-// shared memory. If ZERO_BANK_CONFLICTS is defined, avoids ALL bank conflicts
-// using single-element offsets every NUM_BANKS elements, plus additional
-// single-element offsets after every NUM_BANKS^2 elements.
-//
-// Uses a balanced tree type algorithm.  See Blelloch, 1990 "Prefix Sums
-// and Their Applications", or Prins and Chatterjee PRAM course notes:
-// http://www.cs.unc.edu/~prins/Classes/203/Handouts/pram.pdf
-//
-// This work-efficient version is based on the algorithm presented in Guy
-// Blelloch's excellent paper "Prefix sums and their applications".
-// http://www-2.cs.cmu.edu/afs/cs.cmu.edu/project/scandal/public/papers/CMU-CS-90-190.html
-//
-// Pro: Work Efficient, very few bank conflicts (or zero if ZERO_BANK_CONFLICTS
-// is defined).
-// Con: More instructions to compute bank-conflict-free shared memory
-// addressing, and slightly more shared memory storage used.
-//
-
-// Copy this thread's two input elements from g_idata into the padded window
-// s_data, and hand back (through the reference parameters) the indices and
-// bank offsets that were used.
-//
-// ai / bi            - unpadded positions inside s_data
-// mem_ai / mem_bi    - matching positions inside g_idata
-// bankOffsetA / B    - padding added to ai / bi when indexing s_data
-// With isNP2 set, a second element at or beyond n is loaded as 0.
-template <bool isNP2>
-__device__ static void
-loadSharedChunkFromMem(unsigned int *s_data, const unsigned int *g_idata, int n,
-                       int baseIndex, int &ai, int &bi, int &mem_ai,
-                       int &mem_bi, int &bankOffsetA, int &bankOffsetB) {
-  const int tid = threadIdx.x;
-
-  // positions in global memory
-  mem_ai = baseIndex + threadIdx.x;
-  mem_bi = mem_ai + blockDim.x;
-
-  // positions in the shared window, before padding
-  ai = tid;
-  bi = tid + blockDim.x;
-
-  // padding that spreads the accesses over the banks
-  bankOffsetA = CONFLICT_FREE_OFFSET(ai);
-  bankOffsetB = CONFLICT_FREE_OFFSET(bi);
-
-  // first element is always loaded
-  s_data[ai + bankOffsetA] = g_idata[mem_ai];
-
-  // second element: zero-filled past n in the NP2 variant; the isNP2 part of
-  // the condition is a compile-time constant
-  if (isNP2 && bi >= n) {
-    s_data[bi + bankOffsetB] = 0;
-  } else {
-    s_data[bi + bankOffsetB] = g_idata[mem_bi];
-  }
-}
-
-// Write this thread's two results from the padded window s_data back to
-// g_odata, using the indices and bank offsets passed in. With isNP2 set, the
-// second element is written only when bi < n.
-template <bool isNP2>
-__device__ static void
-storeSharedChunkToMem(unsigned int *g_odata, const unsigned int *s_data, int n,
-                      int ai, int bi, int mem_ai, int mem_bi, int bankOffsetA,
-                      int bankOffsetB) {
-  // wait until every thread of the block has finished writing s_data
-  __syncthreads();
-
-  g_odata[mem_ai] = s_data[ai + bankOffsetA];
-
-  // the isNP2 part of the condition is a compile-time constant
-  if (!isNP2 || bi < n) {
-    g_odata[mem_bi] = s_data[bi + bankOffsetB];
-  }
-}
-
-// Executed by thread 0 only: optionally saves the last (padded) element of
-// s_data to g_blockSums[blockIndex], then sets that element to zero.
-template <bool storeSum>
-__device__ static void clearLastElement(unsigned int *s_data,
-                                        unsigned int *g_blockSums,
-                                        int blockIndex) {
-  if (threadIdx.x != 0)
-    return;
-
-  // slot of the last of the 2 * blockDim.x elements, including its padding
-  int last = (blockDim.x << 1) - 1;
-  last += CONFLICT_FREE_OFFSET(last);
-
-  if (storeSum) { // compile-time decision
-    // keep the block total for the caller before it is overwritten
-    g_blockSums[blockIndex] = s_data[last];
-  }
-
-  // the zero written here is what the down-sweep pushes back to the front
-  s_data[last] = 0;
-}
-
-// Up-sweep phase: adds pairs of partial sums in place in s_data, doubling the
-// stride every round. Returns the stride reached after the last round;
-// prescanBlock passes that value on to scanRootToLeaves.
-__device__ static unsigned int buildSum(unsigned int *s_data) {
-  unsigned int thid = threadIdx.x;
-  unsigned int stride = 1;
-
-  // build the sum in place up the tree
-  for (int d = blockDim.x; d > 0; d >>= 1) {
-    // all writes of the previous round must be visible before this one reads
-    __syncthreads();
-
-    // only the first d threads take part in this round
-    if (thid < d) {
-      int i = __mul24(__mul24(2, stride), thid);
-      int ai = i + stride - 1;
-      int bi = ai + stride;
-
-      // apply the bank-conflict padding to both slots
-      ai += CONFLICT_FREE_OFFSET(ai);
-      bi += CONFLICT_FREE_OFFSET(bi);
-
-      s_data[bi] += s_data[ai];
-    }
-
-    stride *= 2;
-  }
-
-  return stride;
-}
-
-// Down-sweep phase: starting from the given stride and halving it every
-// round, swaps and accumulates pairs of slots in s_data in place.
-//
-// s_data - padded shared window holding the up-sweep result
-// stride - starting stride (prescanBlock passes the value buildSum returned)
-__device__ static void scanRootToLeaves(unsigned int *s_data,
-                                        unsigned int stride) {
-  unsigned int thid = threadIdx.x;
-
-  // traverse down the tree building the scan in place
-  for (int d = 1; d <= blockDim.x; d *= 2) {
-    stride >>= 1;
-
-    // all writes of the previous round must be visible before this one reads
-    __syncthreads();
-
-    // only the first d threads take part in this round
-    if (thid < d) {
-      int i = __mul24(__mul24(2, stride), thid);
-      int ai = i + stride - 1;
-      int bi = ai + stride;
-
-      // apply the bank-conflict padding to both slots
-      ai += CONFLICT_FREE_OFFSET(ai);
-      bi += CONFLICT_FREE_OFFSET(bi);
-
-      // left slot takes the right slot's value; right slot gains the old left
-      unsigned int t = s_data[ai];
-      s_data[ai] = s_data[bi];
-      s_data[bi] += t;
-    }
-  }
-}
-
-// Scan one block's window in place: up-sweep, clear (and optionally store)
-// the last element, then down-sweep. A blockIndex of 0 means "use
-// blockIdx.x" as the slot in blockSums.
-template <bool storeSum>
-__device__ static void prescanBlock(unsigned int *data, int blockIndex,
-                                    unsigned int *blockSums) {
-  const int sumSlot = (blockIndex == 0) ? blockIdx.x : blockIndex;
-
-  int stride = buildSum(data);                           // up the tree
-  clearLastElement<storeSum>(data, blockSums, sumSlot);  // zero the root
-  scanRootToLeaves(data, stride);                        // back down the tree
-}
-
-// Scan kernel: every block loads 2 * blockDim.x elements of g_idata into
-// shared memory, scans them in place and writes the result to g_odata.
-//
-// g_blockSums - receives each block's total when storeSum is set
-// n           - element count used for the bounds checks of the NP2 variant
-// blockIndex  - slot in g_blockSums; 0 means "use blockIdx.x"
-// baseIndex   - first element handled by the block; 0 means "derive it from
-//               blockIdx.x"
-template <bool storeSum, bool isNP2>
-__global__ static void
-prescan(unsigned int *g_odata, const unsigned int *g_idata,
-        unsigned int *g_blockSums, int n, int blockIndex, int baseIndex) {
-  __shared__ unsigned int s_data[3072];
-  int ai, bi, mem_ai, mem_bi, bankOffsetA, bankOffsetB;
-
-  // first global element of this block's window
-  const int windowStart =
-      (baseIndex == 0) ? __mul24(blockIdx.x, (blockDim.x << 1)) : baseIndex;
-
-  // global -> shared
-  loadSharedChunkFromMem<isNP2>(s_data, g_idata, n, windowStart, ai, bi,
-                                mem_ai, mem_bi, bankOffsetA, bankOffsetB);
-  // scan inside the block
-  prescanBlock<storeSum>(s_data, blockIndex, g_blockSums);
-  // shared -> global
-  storeSharedChunkToMem<isNP2>(g_odata, s_data, n, ai, bi, mem_ai, mem_bi,
-                               bankOffsetA, bankOffsetB);
-}
-
-// Add one per-block value to the elements that block covers.
-//
-// g_data      - array updated in place
-// uniforms    - per-block values; block b uses uniforms[b + blockOffset]
-// n           - bound for the second element of each thread: it is updated
-//               only when threadIdx.x + blockDim.x < n
-// blockOffset - offset into uniforms
-// baseIndex   - offset of the first element handled by block 0
-__global__ static void uniformAdd(unsigned int *g_data, unsigned int *uniforms,
-                                  int n, int blockOffset, int baseIndex) {
-  __shared__ unsigned int uni;
-  // one thread fetches the block's value for everybody
-  if (threadIdx.x == 0)
-    uni = uniforms[blockIdx.x + blockOffset];
-
-  unsigned int address =
-      __mul24(blockIdx.x, (blockDim.x << 1)) + baseIndex + threadIdx.x;
-
-  // uni must be written before any thread reads it
-  __syncthreads();
-
-  // note two adds per thread
-  g_data[address] += uni;
-  // Skip the second element entirely when it lies past n: adding zero would
-  // still read and write a location outside the range being processed.
-  if (threadIdx.x + blockDim.x < n)
-    g_data[address + blockDim.x] += uni;
-}
-
-#endif // #ifndef _SCAN_BEST_KERNEL_CU_
--- a/examples/huffman/stats_logger.cpp
+++ b/examples/huffman/stats_logger.cpp
@ -1,43 +0,0 @@
-/*
- * Copyright 2009 Tjark Bringewat. All rights reserved.
- */
-
-#include "stats_logger.h"
-#include "stdafx.h"
-#include <cstdio>
-#include <map>
-#include <sstream>
-
-std::map<std::string, unsigned int> filenames;
-
-// Append one "xValue yValue" sample to the text file of a data series.
-//
-// The file is named "<graphname>__<number>_<seriesname>.txt", where <number>
-// is the seriesnumber given the first time this graph/series pair was logged.
-// On that first call the file is created and a header with the series name,
-// axis quantities, units, scale types and description is written; later calls
-// append to it. If the file cannot be opened, an error is reported on stderr
-// and the sample is dropped.
-void LogStats(const char *graphname, const char *seriesname, float xValue,
-              float yValue, const char *xAxisQuantity,
-              const char *yAxisQuantity, const char *xAxisUnit,
-              const char *yAxisUnit, const char *xAxisScaleType,
-              const char *yAxisScaleType, unsigned int seriesnumber,
-              const char *description) {
-  std::ostringstream keyStream;
-  keyStream << graphname << "__" << seriesname;
-  const std::string key = keyStream.str();
-
-  // Remember the series number of the first call for this graph/series pair.
-  const bool exists = filenames.count(key) != 0;
-  if (!exists)
-    filenames[key] = seriesnumber;
-
-  std::ostringstream pathStream;
-  pathStream << graphname << "__" << filenames[key] << "_" << seriesname
-             << ".txt";
-  const std::string path = pathStream.str();
-
-  FILE *f = fopen(path.c_str(), exists ? "at" : "wt");
-  if (f == NULL) {
-    perror(path.c_str());
-    // Forget the series again so that a later call retries creating the file
-    // together with its header.
-    if (!exists)
-      filenames.erase(key);
-    return;
-  }
-
-  if (!exists) {
-    fprintf(f, "SERIES_NAME\n%s\n", seriesname);
-    fprintf(f, "X_AXIS_QUANTITY\n%s\n", xAxisQuantity);
-    fprintf(f, "Y_AXIS_QUANTITY\n%s\n", yAxisQuantity);
-    fprintf(f, "X_AXIS_UNIT\n%s\n", xAxisUnit);
-    fprintf(f, "Y_AXIS_UNIT\n%s\n", yAxisUnit);
-    fprintf(f, "X_AXIS_SCALE_TYPE\n%s\n", xAxisScaleType);
-    fprintf(f, "Y_AXIS_SCALE_TYPE\n%s\n", yAxisScaleType);
-    fprintf(f, "DESCRIPTION\n%s\n", description);
-    fprintf(f, "__DATA__\n");
-  }
-  fprintf(f, "%f %f\n", xValue, yValue);
-  fclose(f);
-}
--- a/examples/huffman/stats_logger.h
+++ b/examples/huffman/stats_logger.h
@ -1,45 +0,0 @@
-/*
- * Copyright Tjark Bringewat. All rights reserved.
- */
-
-#ifndef _STATS_LOGGER_H_
-#define _STATS_LOGGER_H_
-
-#include <cstring>
-#pragma warning(disable : 4996)
-
-extern "C" void
-LogStats(const char *graphname, const char *seriesname, float xValue,
-         float yValue, const char *xAxisQuantity, const char *yAxisQuantity,
-         const char *xAxisUnit = "", const char *yAxisUnit = "",
-         const char *xAxisScaleType = "lin", const char *yAxisScaleType = "lin",
-         unsigned int seriesnumber = 0, const char *description = "");
-
-// Convenience wrapper around LogStats() that takes the y value first and
-// defaults to a "Time [ms]" over "Data size [MB]" plot. When the units are
-// exactly "MB" and "ms", a derived data-rate sample is logged as well, to the
-// graph "<graph>_datarate" (graph names longer than 90 characters are
-// truncated for that second graph).
-inline void LogStats2(
-    const char *graph, // Groups several functions into one graph. Only appears
-                       // in the file name.
-    const char *function, // Name of the particular function. Appears in file
-                          // name and legend.
-    float yValue, float xValue, const char *yAxisName = "Time",
-    const char *yAxisUnit = "ms", const char *xAxisName = "Data size",
-    const char *xAxisUnit = "MB",
-    const char *yAxisScaleType = "lin", // Can be lin or log for linear or
-                                        // logarithmic scale, respectively.
-    const char *xAxisScaleType = "log",
-    unsigned int fId =
-        0, // Determines the order in which different functions are plotted to a
-           // common graph. Only appears in the file name.
-    const char *description = "") {
-  LogStats(graph, function, xValue, yValue, xAxisName, yAxisName, xAxisUnit,
-           yAxisUnit, xAxisScaleType, yAxisScaleType, fId, description);
-  if (strcmp(xAxisUnit, "MB") == 0 && strcmp(yAxisUnit, "ms") == 0) {
-    static const char suffix[] = "_datarate";
-    char buffer[100];
-    // Bounded copy: the graph name is cut off where necessary so that the
-    // name, the suffix and the terminating NUL always fit into buffer.
-    buffer[0] = '\0';
-    strncat(buffer, graph, sizeof(buffer) - sizeof(suffix));
-    strcat(buffer, suffix);
-    // MB per ms, scaled by 1000/1024, is logged with the unit "GB/s".
-    LogStats(buffer, function, xValue, (xValue * 1000.0f) / (yValue * 1024.0f),
-             xAxisName, "Data rate", xAxisUnit, "GB/s", xAxisScaleType,
-             yAxisScaleType, fId, description);
-  }
-}
-
-#endif
--- a/examples/huffman/stdafx.h
+++ b/examples/huffman/stdafx.h
@ -1,11 +0,0 @@
-#pragma once
-
-#include "cutil.h"
-#include <iostream>
-#include <malloc.h>
-#include <math.h>
-#include <memory.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <time.h>
--- a/examples/huffman/testdatagen.h
+++ b/examples/huffman/testdatagen.h
@ -1,83 +0,0 @@
-#ifndef _TESTDATA_GEN_H_
-#define _TESTDATA_GEN_H_
-
-#include "parameters.h"
-
-// Fill data with num_blocks identical blocks of num_block_threads elements.
-// The first block repeats the 8-value pattern 1,2,3,3,3,4,4,5; all other
-// blocks are copies of it. The pattern is always written in whole groups of
-// eight starting at every index i < num_block_threads that is a multiple of 8.
-template <typename T>
-__inline__ void generateRLETestData(T *data, unsigned int num_blocks,
-                                    unsigned int num_block_threads) {
-  const int pattern[8] = {1, 2, 3, 3, 3, 4, 4, 5};
-
-  /* first block: repeat the pattern in groups of 8 */
-  for (unsigned int base = 0; base < num_block_threads; base += 8) {
-    for (unsigned int p = 0; p < 8; p++)
-      data[base + p] = pattern[p];
-  }
-
-  /* remaining blocks are copies of the first one (for testing only) */
-  for (unsigned int b = 1; b < num_blocks; b++) {
-    T *dst = data + b * num_block_threads;
-    for (unsigned int e = 0; e < num_block_threads; e++)
-      dst[e] = data[e];
-  }
-}
-
-// Fill data with num_blocks identical blocks of num_block_threads elements.
-// The first block consists of runs of avg_run_len equal values: run r holds
-// the value r. Only num_block_threads / avg_run_len complete runs are
-// written, so a remainder at the end of the first block is left untouched.
-// Nothing is written at all when avg_run_len is 0.
-template <typename T>
-__inline__ void generateRLETestDataLongRuns1(T *data, unsigned int num_blocks,
-                                             unsigned int num_block_threads,
-                                             unsigned int avg_run_len) {
-  unsigned int i, j;
-
-  /* a run length of 0 would divide by zero below */
-  if (avg_run_len == 0)
-    return;
-
-  /* generate first block*/
-  for (i = 0; i < num_block_threads / avg_run_len; i++)
-    for (j = 0; j < avg_run_len; j++)
-      data[i * avg_run_len + j] = i;
-
-  /*  copy contents of the first block to all other blocks (for testing only)*/
-  for (j = 1; j < num_blocks; j++)
-    for (i = 0; i < num_block_threads; i++)
-      *(data + j * num_block_threads + i) = data[i];
-}
-
-// VLE TEST DATA VER2.0
-
-// For testing only. Every complete group of 8 symbols gets the codeword
-// lengths 1, 2, 3, 4, 4, 5, 6, 7 and the dummy codewords
-// 1, 10, 100, 1000, 1000, 10000, 100000, 1000000 (binary), i.e.
-// 0x01, 0x02, 0x04, 0x08, 0x08, 0x10, 0x20, 0x40.
-// num_symbols (e.g. 256) must be a multiple of 8.
-inline void generateCodewords(unsigned int *codewords,
-                              unsigned int *codewordlens,
-                              unsigned int num_symbols) {
-  /* Codeword lengths, one group of 8 symbols at a time */
-  const unsigned int num_groups = num_symbols / 8;
-  for (unsigned int g = 0; g < num_groups; g++) {
-    unsigned int *group = codewordlens + g * 8;
-    for (unsigned int p = 0; p < 4; p++) {
-      group[p] = p + 1;     // first half of the group: 1, 2, 3, 4
-      group[4 + p] = p + 4; // second half of the group: 4, 5, 6, 7
-    }
-  }
-  /* Codewords: a single 1 bit followed by (length - 1) zero bits */
-  for (unsigned int s = 0; s < num_symbols; s++) {
-    codewords[s] = 0x01 << (codewordlens[s] - 1);
-  }
-}
-
-// Fill data[0 .. num_elements) with pseudo-random symbol indices in the range
-// [0, num_symbols), drawn with rand().
-//
-// codewords and codewordlens are accepted for interface compatibility but are
-// not used.
-inline void generateData(unsigned int *data, unsigned int num_elements,
-                         unsigned int *codewords, unsigned int *codewordlens,
-                         unsigned int num_symbols) {
-  (void)codewords;
-  (void)codewordlens;
-
-  // The divisor is formed in double: RAND_MAX + 1 overflows int wherever
-  // RAND_MAX equals INT_MAX, and in single precision large rand() results
-  // round up far enough to produce the out-of-range index num_symbols.
-  const double range = (double)RAND_MAX + 1.0;
-  for (unsigned int i = 0; i < num_elements; i++) {
-    data[i] = (unsigned int)(((double)rand() / range) * num_symbols);
-  }
-}
-
-#endif
--- a/examples/huffman/vlc_kernel_sm64huff.cu
+++ b/examples/huffman/vlc_kernel_sm64huff.cu
@ -1,160 +0,0 @@
-#ifndef _VLC_SM64HUFF_KERNEL_H_
-#define _VLC_SM64HUFF_KERNEL_H_
-
-#include "pabio_kernels_v2.cu"
-#include "parameters.h"
-#include <cstdio>
-
-#ifdef SMATOMICS
-
-/* HUFFMAN-FRIENDLY PAVLE
-   CHARACTERISTICS:
-   1. CACHE CW_LUT INTO SM, LOAD AS 2 INT ARRAYS
-   2. PARALLEL PREFIX SUM
-   3. PARALLEL BIT I/O USING SHARED-MEMORY ATOMIC OPERATIONS (COMPATIBLE WITH
-      CUDA1.3+)
-
-   NOTES & ASSUMPTIONS:
-   - HUFFMAN-CODING FRIENDLY, SUPPORTS CODEWORDS OF 2X SIZE OF ORIGINAL
-     SYMBOLS (BYTES).
-   - NUMBER OF THREADS PER BLOCK IS 256; IF YOU WANT TO PLAY WITH DIFFERENT
-     NUMBERS, THE CW CACHING SHOULD BE MODIFIED (SEE DPT* KERNELS)
-   - SM usage: 1x size of the input data (REUSE) + size of CWLUT
-     TURN ON CACHING FOR HIGH ENTROPY DATA!
-*/
-
-// Variable-length encoding kernel. Each thread reads one 32-bit word of
-// data, replaces its four bytes by their codewords, and the block packs the
-// resulting bit strings back to back into shared memory before copying the
-// packed words to out.
-//
-// data            - input words, one per thread (index kn)
-// gm_codewords    - codeword per byte value
-// gm_codewordlens - codeword length in bits per byte value
-// cw32, cw32len, cw32idx (TESTING builds only) - not referenced in this body
-// out             - packed output; block b writes starting at
-//                   out[b * blockDim.x]
-// outidx          - outidx[b] receives the total number of bits of block b
-__global__ static void
-vlc_encode_kernel_sm64huff(unsigned int *data, const unsigned int *gm_codewords,
-                           const unsigned int *gm_codewordlens,
-#ifdef TESTING
-                           unsigned int *cw32, unsigned int *cw32len,
-                           unsigned int *cw32idx,
-#endif
-                           unsigned int *out, unsigned int *outidx) {
-
-  unsigned int kn = blockIdx.x * blockDim.x + threadIdx.x; // global index
-  unsigned int k = threadIdx.x;                            // index in block
-  unsigned int kc, startbit, wrbits;
-
-  unsigned long long cw64 = 0; // concatenated codewords of this thread
-  unsigned int val32, codewordlen = 0;
-  unsigned char tmpbyte, tmpcwlen;
-  unsigned int tmpcw32;
-
-  __shared__ unsigned int sm[3072];
-  __shared__ unsigned int kcmax; // index of the last output word of the block
-
-#ifdef CACHECWLUT
-  // Shared memory layout: codewords | codeword lengths | scan/output area.
-  unsigned int *codewords = (unsigned int *)sm;
-  unsigned int *codewordlens = (unsigned int *)(sm + NUM_SYMBOLS);
-  unsigned int *as = (unsigned int *)(sm + 2 * NUM_SYMBOLS);
-
-  /* Load the codewords and the original data*/
-  // Thread k copies table entry k.
-  // NOTE(review): this fills the whole table only if the block has at least
-  // NUM_SYMBOLS threads -- confirm against the launch configuration.
-  codewords[k] = gm_codewords[k];
-  codewordlens[k] = gm_codewordlens[k];
-  val32 = data[kn];
-  __syncthreads();
-  // Encode the four bytes, most significant byte first.
-  for (unsigned int i = 0; i < 4; i++) {
-    tmpbyte = (unsigned char)(val32 >> ((3 - i) * 8));
-    tmpcw32 = codewords[tmpbyte];
-    tmpcwlen = codewordlens[tmpbyte];
-    cw64 = (cw64 << tmpcwlen) | tmpcw32;
-    codewordlen += tmpcwlen;
-  }
-#else
-  unsigned int *as = (unsigned int *)sm;
-  val32 = data[kn];
-  // Encode the four bytes, most significant byte first, reading the tables
-  // directly from the gm_* arrays.
-  for (unsigned int i = 0; i < 4; i++) {
-    tmpbyte = (unsigned char)(val32 >> ((3 - i) * 8));
-    tmpcw32 = gm_codewords[tmpbyte];
-    tmpcwlen = gm_codewordlens[tmpbyte];
-    cw64 = (cw64 << tmpcwlen) | tmpcw32;
-    codewordlen += tmpcwlen;
-  }
-#endif
-  as[k] = codewordlen;
-  __syncthreads();
-
-  /* Prefix sum of codeword lengths (denoted in bits) [inplace implementation]
-   */
-  unsigned int offset = 1;
-
-  /* Build the sum in place up the tree */
-  // NOTE(review): ai and bi are unsigned char, so the computed indices wrap
-  // above 255; this only works for blocks of at most 256 threads.
-  for (unsigned int d = (blockDim.x) >> 1; d > 0; d >>= 1) {
-    __syncthreads();
-    if (k < d) {
-      unsigned char ai = offset * (2 * k + 1) - 1;
-      unsigned char bi = offset * (2 * k + 2) - 1;
-      as[bi] += as[ai];
-    }
-    offset *= 2;
-  }
-
-  /* scan back down the tree */
-  /* clear the last element */
-  if (k == 0)
-    as[blockDim.x - 1] = 0;
-
-  // traverse down the tree building the scan in place
-  for (unsigned int d = 1; d < blockDim.x; d *= 2) {
-    offset >>= 1;
-    __syncthreads();
-    if (k < d) {
-      unsigned char ai = offset * (2 * k + 1) - 1;
-      unsigned char bi = offset * (2 * k + 2) - 1;
-      unsigned int t = as[ai];
-      as[ai] = as[bi];
-      as[bi] += t;
-    }
-  }
-  __syncthreads();
-
-  // as[k] now holds the bit position at which thread k starts writing. The
-  // last thread derives the block's total bit count from it.
-  if (k == blockDim.x - 1) {
-    outidx[blockIdx.x] = as[k] + codewordlen;
-    kcmax = (as[k] + codewordlen) / 32;
-    // printf("kcmax: %d\n", kcmax);
-  }
-
-  /* Write the codes */
-  kc = as[k] / 32;       // first output word touched by this thread
-  startbit = as[k] % 32; // bit offset inside that word
-  // The scan area is reused as the output buffer, so it is zeroed first.
-  as[k] = 0U;
-  __syncthreads();
-
-  /* Part 1*/
-  // Bits that still fit into word kc.
-  // NOTE(review): if codewordlen is 0 and startbit is 0, the shift count
-  // below is 32 -- confirm whether zero-length codes can occur.
-  wrbits = codewordlen > (32 - startbit) ? (32 - startbit) : codewordlen;
-  tmpcw32 = (unsigned int)(cw64 >> (codewordlen - wrbits));
-  // if (wrbits == 32) as[kc] = tmpcw32;
-  // //unnecessary overhead; increases number of branches else
-  atomicOr(&as[kc], tmpcw32 << (32 - startbit -
-                                wrbits)); // shift left in case it's shorter
-                                          // then the available space
-  codewordlen -= wrbits;
-
-  /*Part 2*/
-  // Up to 32 further bits go into word kc + 1.
-  // NOTE(review): with wrbits == 32 the mask expression (1 << wrbits) shifts
-  // a 32-bit int by its full width -- confirm that this case is handled.
-  if (codewordlen) {
-    wrbits = codewordlen > 32 ? 32 : codewordlen;
-    tmpcw32 =
-        (unsigned int)(cw64 >> (codewordlen - wrbits)) & ((1 << wrbits) - 1);
-    // if (wrbits == 32) as[kc+1] = tmpcw32;
-    // else
-    atomicOr(&as[kc + 1], tmpcw32 << (32 - wrbits));
-    codewordlen -= wrbits;
-  }
-
-  /*Part 3*/
-  // Whatever is left goes into word kc + 2.
-  if (codewordlen) {
-    tmpcw32 = (unsigned int)(cw64 & ((1 << codewordlen) - 1));
-    // if (wrbits == 32) as[kc+2] = tmpcw32;
-    // else
-    atomicOr(&as[kc + 2], tmpcw32 << (32 - codewordlen));
-  }
-
-  __syncthreads();
-
-  // Threads 0..kcmax each copy one packed word to the block's output slice.
-  if (k <= kcmax)
-    out[kn] = as[k];
-}
-//////////////////////////////////////////////////////////////////////////////
-#endif
-
-#endif
--- a/Show More
+++ b/Show More