add codebase for TACO submission

2022-05-04 08:59:38 -04:00 · 2022-05-04 08:59:38 -04:00 · f8e72916c1
parent 897af29748
commit f8e72916c1
164 changed files with 65421 additions and 1082 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -39,3 +39,4 @@ set(GCC_COVERAGE_LINK_FLAGS
    "-L${LLVM_LIB_PATH} ${LLVM_LINK_FLAG} -lz -lrt -ldl -ltinfo -lpthread -lm")

 add_subdirectory(compilation)
+add_subdirectory(runtime)
--- a/docs/CONTRIBUTING.md
+++ b/docs/CONTRIBUTING.md
@ -1,6 +1,6 @@
-# Contributing to CuPBoP
+# Contributing to COX

-Thank you for your interest in contributing to CuPBoP!
+Thank you for your interest in contributing to COX!
 We appreciate all contributions, including but not limited to:

 - Add documentation
@ -10,9 +10,9 @@ We appreciate all contributions, including but not limited to:
 ## How to contribute?

 0. (Optional) Open an issue and discuss your idea before start
-1. Fork the latest version CuPBoP
+1. Fork the latest version of COX
 2. Commit to the forked repo
-3. Create a Pull Request to CuPBoP main branch
+3. Create a Pull Request to the COX main branch

 ## Code style

@ -21,14 +21,13 @@ To make sure your contribution is following the correct style,
 we highly recommend you to install [pre-commit](https://pre-commit.com/) before development.

 ```bash
-# Python3 environment is required
+# Python environment is required
 pip install pre-commit
 ```

 Then, from the repository folder, execute the following instruction:

 ```bash
-# execute in CuPBoP's root folder
 pre-commit install
 ```

--- a/README.md
+++ b/README.md
@ -1,10 +1,10 @@
-# CuPBoP: Cuda for Parallelized and Broad-range Processors
+# COX: CUDA on X86

 ## Introduction

-CuPBoP (Cuda for parallelized and broad-range processors) is a framework
-aims to execute CUDA source code on non-NVIDIA devices,
-including CPU, GPU and other architectures.
+This project consists of two parts: a series of LLVM passes that
+take SPMD NVVM IR as input and output the corresponding
+MPMD+SIMD version of LLVM IR, which can be executed on CPU devices.

 ## Install

@ -22,8 +22,8 @@ including CPU, GPU and other architectures.
 1. Clone from github

    ```bash
-    git clone https://github.com/cupbop/CuPBoP
-    cd CuPBoP
+    git clone https://github.com/drcut/open_source_template
+    cd open_source_template
    ```

 2. Build the transformer for NVVM IR to LLVM IR for X86
@ -55,12 +55,8 @@ g++ ../compilation/examples/vecadd/host.cpp \
 ./vecadd_example
 ```

-## Contribution
-
-We sincerely appreciate all kinds of contributions.
-Please refer to [CONTRIBUTING](docs/CONTRIBUTING.md) for the contributing guideline.
-
 ## Author

-* [Ruobing Han](https://drcut.github.io/)
-* [Hyesoon Kim](https://www.cc.gatech.edu/~hyesoon/)
+[Ruobing Han](https://drcut.github.io/) is a CS PhD student at the
+Georgia Institute of Technology, under the supervision
+of Prof. [Hyesoon Kim](https://www.cc.gatech.edu/~hyesoon/).
--- a/compilation/HostTranslation.cpp
+++ b/compilation/HostTranslation.cpp
@ -1,25 +1,43 @@
-#include "ReplaceKernelLaunch.h"
+#include "RemoveCudaBuiltin.h"
+#include "ReplaceConstantMemory.h"
+#include "ReplaceCudaBuiltin.h"
+#include "ReplaceKernelArgs.h"
 #include "tool.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Verifier.h"
 #include <assert.h>
+#include <fstream>
 #include <iostream>
 #include <stdlib.h>

 using namespace llvm;

+std::string PATH = "kernel_meta.log";
+
 int main(int argc, char **argv) {
  assert(argc == 3 && "incorrect number of arguments\n");

  char *input_host_path = argv[1];
  char *output_host_path = argv[2];

+  std::ifstream fin;
+  fin.open(PATH); // PATH is "kernel_meta.log"; the open result is not checked
+
  // load LLVM module(s)
  llvm::Module *hostModule = LoadModuleFromFilr(input_host_path);
  VerifyModule(hostModule);
+  // replace constant-memory globals, using the name mapping read from fin
+  ReplaceConstantMemory(hostModule, fin);
  // process host module
-  ReplaceKernelLaunch(hostModule);
+  ReplaceCudaBuiltin(hostModule);
+  // remove the CUDA registration functions and llvm.global_ctors
+  RemoveCudaBuiltin(hostModule);
+  // replace alloca with malloc in functions that call cudaLaunchKernel
+  ReplaceKernelArg(hostModule);
+
  VerifyModule(hostModule);
  DumpModule(hostModule, output_host_path);
+
+  fin.close();
  return 0;
 }
--- a/compilation/HostTranslation/include/ReplaceKernelLaunch.h
+++ b/compilation/HostTranslation/include/ReplaceKernelLaunch.h
@ -1,11 +1,11 @@
-#ifndef __NVVM2x86_REPLACE_KERNEL_LAUNCH__
-#define __NVVM2x86_REPLACE_KERNEL_LAUNCH__
+#ifndef __NVVM2x86_REMOVE_CUDABUILTIN__
+#define __NVVM2x86_REMOVE_CUDABUILTIN__

 #include "llvm/IR/Module.h"
 /*
 * Change to i8* bitcast (i8* (i8*)* @_Z9vecPKiS0_Pii_wrapper to i8*)
 * Original: i8* bitcast (void (i32*, i32*, i32*, i32)* @_Z9vecPKiS0_Pii to i8*)
 */
-void ReplaceKernelLaunch(llvm::Module *M);
+void RemoveCudaBuiltin(llvm::Module *M);

 #endif
--- a/compilation/HostTranslation/include/ReplaceConstantMemory.h
+++ b/compilation/HostTranslation/include/ReplaceConstantMemory.h
@ -0,0 +1,12 @@
+#ifndef __NVVM2x86_REPLACE_CONSTANT_MEMORY__
+#define __NVVM2x86_REPLACE_CONSTANT_MEMORY__
+
+#include "llvm/IR/Module.h"
+#include <fstream>
+/*
+ * From: @ff_variable = internal global [5 x float] undef, align 16
+ * To: @wrapper_global_ff_variable = common global [5 x float] zeroinitializer
+ */
+void ReplaceConstantMemory(llvm::Module *M, std::ifstream &fin);
+
+#endif
--- a/compilation/HostTranslation/include/ReplaceCudaBuiltin.h
+++ b/compilation/HostTranslation/include/ReplaceCudaBuiltin.h
@ -0,0 +1,11 @@
+#ifndef __NVVM2x86_REPLACE_CUDA_BUILTIN__
+#define __NVVM2x86_REPLACE_CUDA_BUILTIN__
+
+#include "llvm/IR/Module.h"
+/*
+ * Change to i8* bitcast (i8* (i8*)* @_Z9vecPKiS0_Pii_wrapper to i8*)
+ * Original: i8* bitcast (void (i32*, i32*, i32*, i32)* @_Z9vecPKiS0_Pii to i8*)
+ */
+void ReplaceCudaBuiltin(llvm::Module *M);
+
+#endif
--- a/compilation/HostTranslation/include/ReplaceKernelArgs.h
+++ b/compilation/HostTranslation/include/ReplaceKernelArgs.h
@ -0,0 +1,14 @@
+#ifndef __NVVM2x86_REPLACE_KERNEL_ARGS__
+#define __NVVM2x86_REPLACE_KERNEL_ARGS__
+
+#include "llvm/IR/Module.h"
+/*
+ * before:
+ * %m_cuda.addr = alloca float*, align 8
+ * after:
+ * %m_cuda.addr_tmp = call i8* @malloc(i64 8)
+ * %m_cuda.addr = bitcast i8* %m_cuda.addr_tmp to float**
+ */
+void ReplaceKernelArg(llvm::Module *M);
+
+#endif
--- a/compilation/HostTranslation/lib/GenerateHostStub.cpp
+++ b/compilation/HostTranslation/lib/GenerateHostStub.cpp
@ -0,0 +1,7 @@
+/**
+ *  Generate a file for Cuda Kernel Function Attributes
+ *
+ *
+ *
+ *
+ */
--- a/compilation/HostTranslation/lib/InitializeDevice.cpp
+++ b/compilation/HostTranslation/lib/InitializeDevice.cpp
@ -0,0 +1,6 @@
+/*
+
+  Initialize the cudaDevice as first statements if not set by the User
+  (cudaSetDevice)
+
+*/
--- a/compilation/HostTranslation/lib/RemoveCudaBuiltin.cpp
+++ b/compilation/HostTranslation/lib/RemoveCudaBuiltin.cpp
@ -0,0 +1,59 @@
+/**
+ * Remove Clang cuda builtin functions and variables
+ */
+#include "RemoveCudaBuiltin.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/ToolOutputFile.h"
+#include <iostream>
+#include <map>
+#include <set>
+
+using namespace llvm;
+
+/**
+ * Strip the CUDA registration scaffolding from a host module.
+ *
+ * Erases the `llvm.global_ctors` global (when present) and then every
+ * function in the module whose name is listed in kBuiltinNames. Each erased
+ * value first has its references dropped, so a function body no longer keeps
+ * the functions it calls alive.
+ *
+ * No check is made that the erased functions are unused elsewhere in the
+ * module.
+ *
+ * @param M host module, modified in place; must not be null.
+ */
+void RemoveCudaBuiltin(llvm::Module *M) {
+  if (GlobalVariable *global_ctors =
+          M->getGlobalVariable("llvm.global_ctors")) {
+    global_ctors->dropAllReferences();
+    global_ctors->eraseFromParent();
+  }
+
+  // Erase order is significant: the three `__cuda_*` entries are handled
+  // first, exactly as before, so that their bodies (which presumably hold the
+  // only calls to the `__cudaRegister*` / `__cudaUnregister*` entries) are
+  // dropped before those entries are erased.
+  static const char *const kBuiltinNames[] = {
+      "__cuda_module_ctor",         "__cuda_module_dtor",
+      "__cuda_register_globals",    "__cudaRegisterFunction",
+      "__cudaRegisterVar",          "__cudaRegisterFatBinary",
+      "__cudaRegisterFatBinaryEnd", "__cudaUnregisterFatBinary"};
+
+  for (const char *builtin_name : kBuiltinNames) {
+    if (Function *builtin = M->getFunction(builtin_name)) {
+      builtin->dropAllReferences();
+      builtin->eraseFromParent();
+    }
+  }
+}
--- a/compilation/HostTranslation/lib/ReplaceConstantMemory.cpp
+++ b/compilation/HostTranslation/lib/ReplaceConstantMemory.cpp
@ -0,0 +1,93 @@
+#include "ReplaceConstantMemory.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include <assert.h>
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <set>
+
+using namespace llvm;
+
+/**
+ * Swap CUDA constant-memory globals for plain zero-initialized globals.
+ *
+ * The name mapping is read from `fin`: everything up to and including the
+ * first line containing "ConstMemory2GlobalMemory" is skipped, then one
+ * mapping is read per line until a line containing "END" (or end of input).
+ * In a mapping line the text before the first space is the constant's name;
+ * the three characters following that space are skipped and the remainder is
+ * the name of the replacement global.
+ * NOTE(review): the three skipped characters are presumably a fixed separator
+ * emitted by the tool that writes the log - confirm against the writer.
+ *
+ * For every global in M whose name has a mapping and whose value type is an
+ * array or a struct, a new common-linkage global of the same value type is
+ * created with a zero initializer, all uses of the old global are redirected
+ * to it, and the old global is erased. Globals of any other type trip an
+ * assert; when asserts are compiled out they are left untouched.
+ *
+ * @param M   module to rewrite in place; must not be null.
+ * @param fin metadata stream; consumed up to the end of the mapping section.
+ */
+void ReplaceConstantMemory(llvm::Module *M, std::ifstream &fin) {
+  std::string line;
+
+  // Skip ahead to the section header.
+  bool found_section = false;
+  while (getline(fin, line)) {
+    if (line.find("ConstMemory2GlobalMemory") != std::string::npos) {
+      found_section = true;
+      break;
+    }
+  }
+  if (!found_section) {
+    assert(0 && "Do not find constant to global mapping\n");
+    // Asserts compiled out: no mapping was read, so there is nothing to do.
+    return;
+  }
+
+  // Parse "<constant> <3 skipped chars><global>" lines.
+  const size_t kSeparatorLength = 3;
+  std::map<std::string, std::string> corresponding_global_memory;
+  while (getline(fin, line)) {
+    if (line.find("END") != std::string::npos) {
+      break;
+    }
+    const size_t space_pos = line.find(' ');
+    // Reject lines that previously threw std::out_of_range (remainder shorter
+    // than the separator) or were silently mis-parsed (no space at all).
+    if (space_pos == std::string::npos ||
+        line.length() < space_pos + 1 + kSeparatorLength) {
+      std::cerr << "ReplaceConstantMemory: skip malformed mapping line: "
+                << line << '\n';
+      continue;
+    }
+    corresponding_global_memory.insert(std::pair<std::string, std::string>(
+        line.substr(0, space_pos),
+        line.substr(space_pos + 1 + kSeparatorLength)));
+  }
+
+  std::set<llvm::GlobalVariable *> replaced_constant_memory;
+  // Globals created below are appended to the list being walked; they are
+  // visited too, and are rewritten only if their own name has a mapping.
+  for (llvm::GlobalVariable &constant_memory : M->globals()) {
+    auto mapping =
+        corresponding_global_memory.find(constant_memory.getName().str());
+    if (mapping == corresponding_global_memory.end()) {
+      continue;
+    }
+
+    llvm::Type *value_type = constant_memory.getValueType();
+    if (!value_type->isArrayTy() && !value_type->isStructTy()) {
+      assert(0 && "The required Constant Memory Type is not supported\n");
+      // Keep the original global: erasing it while it still has uses would
+      // leave dangling references in the module.
+      continue;
+    }
+
+    llvm::GlobalVariable *global_memory = new llvm::GlobalVariable(
+        *M, value_type, false, llvm::GlobalValue::CommonLinkage, nullptr,
+        mapping->second, nullptr, llvm::GlobalValue::NotThreadLocal, 0);
+    global_memory->setInitializer(
+        llvm::ConstantAggregateZero::get(value_type));
+    // The cast keeps users well-typed when the two pointer types differ.
+    constant_memory.replaceAllUsesWith(llvm::ConstantExpr::getPointerCast(
+        global_memory, cast<PointerType>(constant_memory.getType())));
+    replaced_constant_memory.insert(&constant_memory);
+  }
+
+  // Erase after the walk so the iteration above is never invalidated.
+  for (llvm::GlobalVariable *dead : replaced_constant_memory) {
+    dead->dropAllReferences();
+    dead->eraseFromParent();
+  }
+}
--- a/compilation/HostTranslation/lib/ReplaceCudaBuiltin.cpp
+++ b/compilation/HostTranslation/lib/ReplaceCudaBuiltin.cpp
@ -0,0 +1,292 @@
+#include "ReplaceCudaBuiltin.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/ToolOutputFile.h"
+#include <iostream>
+#include <map>
+#include <set>
+
+using namespace llvm;
+
+/*
+insert sync after cudaKernel launch
+  call void @_Z13staticReversePii(i32* %55, i32 64)
+  %57 = call i32 @cudaDeviceSynchronize()
+
+A "launch function" is any function in M that contains a direct call to a
+callee whose name starts with "cudaLaunchKernel". Every direct call to a
+launch function gets a call to `i32 cudaDeviceSynchronize()` inserted before
+the next non-debug instruction of its basic block. A call site that is the
+last instruction of its block (e.g. an invoke) gets no sync.
+
+cudaDeviceSynchronize is declared in M when it is not already present.
+*/
+void InsertSyncAfterKernelLaunch(llvm::Module *M) {
+  LLVMContext &context = M->getContext();
+
+  // i32 cudaDeviceSynchronize(); the second argument is isVarArg.
+  llvm::FunctionType *sync_type =
+      FunctionType::get(Type::getInt32Ty(context), false);
+  // Keep the FunctionCallee instead of casting to Function: if the symbol
+  // already exists with another type the callee is a cast expression, and
+  // cast<Function> on it would fail.
+  llvm::FunctionCallee sync_callee =
+      M->getOrInsertFunction("cudaDeviceSynchronize", sync_type);
+
+  // Pass 1: names of the functions that call cudaLaunchKernel* directly.
+  std::set<std::string> launch_function_name;
+  for (Function &candidate : *M) {
+    const std::string candidate_name = candidate.getName().str();
+    for (BasicBlock &block : candidate) {
+      for (Instruction &inst : block) {
+        auto *call = llvm::dyn_cast<llvm::CallBase>(&inst);
+        if (!call) {
+          continue;
+        }
+        Function *callee = call->getCalledFunction();
+        if (callee && callee->getName().startswith("cudaLaunchKernel")) {
+          launch_function_name.insert(candidate_name);
+        }
+      }
+    }
+  }
+
+  // Pass 2: add the sync after each direct call to a launch function. The
+  // freshly inserted call is visited next, but its callee is not a launch
+  // function, so it is skipped.
+  for (Function &caller : *M) {
+    for (BasicBlock &block : caller) {
+      for (Instruction &inst : block) {
+        auto *call = llvm::dyn_cast<llvm::CallBase>(&inst);
+        if (!call) {
+          continue;
+        }
+        Function *callee = call->getCalledFunction();
+        if (!callee ||
+            launch_function_name.find(callee->getName().str()) ==
+                launch_function_name.end()) {
+          continue;
+        }
+        if (Instruction *next = call->getNextNonDebugInstruction()) {
+          llvm::CallInst::Create(sync_callee, "", next);
+        }
+      }
+    }
+  }
+}
+
+// Collect the names of the kernels that __cuda_register_globals registers
+// through __cudaRegisterFunction, e.g.
+//
+//   define internal void @__cuda_register_globals(i8** %0) {
+//   entry:
+//     %1 = call i32 @__cudaRegisterFunction(i8** %0,
+//            i8* bitcast (void (float*, float*, i32, i32)* @_Z4Fan1PfS_ii
+//            to i8*), ...)
+//
+// The kernel is the second call argument with pointer casts stripped; calls
+// whose second argument is not a function are ignored. Each collected name is
+// also printed to stdout. Returns an empty set when the module has no
+// __cuda_register_globals.
+static std::set<std::string> CollectRegisteredKernelNames(llvm::Module *M) {
+  std::set<std::string> kernel_names;
+  Function *register_globals = M->getFunction("__cuda_register_globals");
+  if (!register_globals) {
+    return kernel_names;
+  }
+  for (BasicBlock &block : *register_globals) {
+    for (Instruction &inst : block) {
+      auto *call = llvm::dyn_cast<llvm::CallInst>(&inst);
+      if (!call) {
+        continue;
+      }
+      Function *callee = call->getCalledFunction();
+      if (!callee || callee->getName().str() != "__cudaRegisterFunction") {
+        continue;
+      }
+      auto *kernel =
+          dyn_cast<Function>(call->getArgOperand(1)->stripPointerCasts());
+      if (!kernel) {
+        continue;
+      }
+      kernel_names.insert(kernel->getName().str());
+      std::cout << "Cuda Register Global Kernel: " << kernel->getName().str()
+                << std::endl;
+    }
+  }
+  return kernel_names;
+}
+
+// Change to i8* bitcast (i8* (i8*)* @_Z9vecPKiS0_Pii_wrapper to i8*)
+// Original: i8* bitcast (void (i32*, i32*, i32*, i32)* @_Z9vecPKiS0_Pii to i8*)
+//
+// For every direct call to a function whose name starts with
+// "cudaLaunchKernel", the first argument (the kernel, looked through pointer
+// casts) is replaced by a bitcast of a `void (i8*)` declaration named
+// "<kernel>_wrapper". One declaration is created per kernel name and reused
+// by later launches of the same kernel. Launches whose first argument is not
+// a function are left untouched.
+//
+// When using << >>, clang generates cudaPushCallConfiguration with the same
+// function definition as the kernel definition in the kernel bitcode, so an
+// ordinary call to a function registered as a kernel renames that callee to
+// "<name>_host".
+//
+// Progress messages are printed to stdout.
+void ReplaceKernelLaunch(llvm::Module *M) {
+  LLVMContext &context = M->getContext();
+  auto VoidTy = llvm::Type::getVoidTy(context);
+  auto I8 = llvm::Type::getInt8PtrTy(context);
+
+  // Every wrapper is declared as: void <kernel>_wrapper(i8*)
+  std::vector<Type *> Params;
+  Params.push_back(I8);
+  FunctionType *wrapper_type = FunctionType::get(VoidTy, Params, false);
+
+  const std::set<std::string> cuda_register_kernel_names =
+      CollectRegisteredKernelNames(M);
+
+  // kernel name (as seen at the first launch) -> its wrapper declaration
+  std::map<std::string, Function *> kernels;
+  bool host_changed = false;
+
+  // Wrapper declarations created below are appended to the module and are
+  // visited by this loop as well; they have no body, so nothing happens.
+  for (Function &caller : *M) {
+    // Taken once per caller, before its body is walked: renames performed
+    // while walking do not change this snapshot.
+    const std::string func_name = caller.getName().str();
+
+    for (BasicBlock &block : caller) {
+      for (Instruction &inst : block) {
+        auto *callInst = llvm::dyn_cast<llvm::CallBase>(&inst);
+        if (!callInst) {
+          continue;
+        }
+        Function *calledFunction = callInst->getCalledFunction();
+        if (!calledFunction) {
+          continue;
+        }
+
+        if (calledFunction->getName().startswith("cudaLaunchKernel")) {
+          // The kernel may be passed directly or wrapped in a bitcast.
+          auto *kernel = dyn_cast<Function>(
+              callInst->getArgOperand(0)->stripPointerCasts());
+          if (!kernel) {
+            // Not a statically known function: nothing to redirect.
+            continue;
+          }
+
+          /*
+            Because of the TODO in the rename branch below, need to get the
+            prior name before _host is add
+          */
+          const std::string oldName = kernel->getName().str();
+          std::cout << " Parent (Caller) Function Name: " << func_name
+                    << ", cudaLaunchKernel Function: " << oldName << ", args "
+                    << kernel->arg_size() << std::endl;
+
+          Function *wrapper = nullptr;
+          auto known = kernels.find(oldName);
+          if (known != kernels.end()) {
+            wrapper = known->second;
+          } else {
+            // if parent function is __host and same as the cudaKernelLaunch,
+            // drop the last five characters (the "_host" suffix added by the
+            // rename branch) before appending "_wrapper"
+            std::string newName = oldName + "_wrapper";
+            if (func_name == oldName && host_changed &&
+                oldName.find("_host") != std::string::npos) {
+              newName = oldName.substr(0, oldName.length() - 5) + "_wrapper";
+            }
+            std::cout << "Change Kernel Name to: " << newName << std::endl;
+
+            wrapper = Function::Create(wrapper_type,
+                                       Function::ExternalLinkage, newName, M);
+            wrapper->setDSOLocal(true);
+            wrapper->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
+            kernels.insert({oldName, wrapper});
+          }
+
+          // The cast is inserted right before the launch call.
+          BitCastInst *wrapper_as_i8 =
+              new BitCastInst(wrapper, I8, "", callInst);
+          callInst->setArgOperand(0, wrapper_as_i8);
+        } else if (cuda_register_kernel_names.find(
+                       calledFunction->getName().str()) !=
+                   cuda_register_kernel_names.end()) {
+          // if the called function collides with kernel definiton
+          // TODO: some reason changes all occurences of the function name
+          // for both cudaKernelLaunch calls and regular function call
+          host_changed = true;
+          // Build the new name first; it must not alias the name being
+          // replaced.
+          const std::string host_name =
+              calledFunction->getName().str() + "_host";
+          calledFunction->setName(host_name);
+          std::cout << std::endl;
+          std::cout << "Change Host Function Name To: "
+                    << calledFunction->getName().str() << std::endl;
+        }
+      }
+    }
+  }
+}
+
+// Redirect every direct call to cudaMemcpyToSymbol to the function
+//   i32 @cudaMemcpyToSymbol_host(i8*, i8*, i64, i64, i32)
+// forwarding the original five arguments unchanged. The replacement call is
+// inserted in front of the original call, takes over all of its uses, and
+// the original call is erased once the whole module has been walked.
+// cudaMemcpyToSymbol_host is declared in M on first use.
+void ReplaceMemcpyToSymbol(llvm::Module *M) {
+  LLVMContext &ctx = M->getContext();
+  llvm::Type *i32_type = llvm::Type::getInt32Ty(ctx);
+  std::vector<llvm::Instruction *> dead_calls;
+
+  for (Function &fn : *M) {
+    for (BasicBlock &block : fn) {
+      for (Instruction &inst : block) {
+        auto *memcpy_call = dyn_cast<CallInst>(&inst);
+        if (!memcpy_call) {
+          continue;
+        }
+        Function *callee = memcpy_call->getCalledFunction();
+        if (!callee || callee->getName().str() != "cudaMemcpyToSymbol") {
+          continue;
+        }
+
+        // i32 @cudaMemcpyToSymbol(i8* %1, i8* %2, i64 %3, i64 %4, i32 %5)
+        std::vector<llvm::Type *> param_types;
+        param_types.push_back(llvm::Type::getInt8PtrTy(ctx));
+        param_types.push_back(llvm::Type::getInt8PtrTy(ctx));
+        param_types.push_back(llvm::Type::getInt64Ty(ctx));
+        param_types.push_back(llvm::Type::getInt64Ty(ctx));
+        param_types.push_back(llvm::Type::getInt32Ty(ctx));
+        llvm::FunctionType *host_type =
+            FunctionType::get(i32_type, param_types, false);
+
+        llvm::FunctionCallee host_callee =
+            M->getOrInsertFunction("cudaMemcpyToSymbol_host", host_type);
+        llvm::Function *host_fn =
+            llvm::cast<llvm::Function>(host_callee.getCallee());
+
+        // forward the five original operands in order
+        std::vector<Value *> forwarded;
+        for (unsigned idx = 0; idx < 5; ++idx) {
+          forwarded.push_back(memcpy_call->getArgOperand(idx));
+        }
+
+        auto *replacement =
+            llvm::CallInst::Create(host_fn, forwarded, "", memcpy_call);
+        memcpy_call->replaceAllUsesWith(replacement);
+        dead_calls.push_back(memcpy_call);
+      }
+    }
+  }
+
+  // deferred so the instruction walk above is never invalidated
+  for (llvm::Instruction *dead : dead_calls) {
+    dead->eraseFromParent();
+  }
+}
+void ReplaceCudaBuiltin(llvm::Module *M) { // runs three rewrites on M in turn
+  InsertSyncAfterKernelLaunch(M); // add cudaDeviceSynchronize after launches
+  ReplaceKernelLaunch(M); // retarget cudaLaunchKernel to *_wrapper functions
+  ReplaceMemcpyToSymbol(M); // cudaMemcpyToSymbol -> cudaMemcpyToSymbol_host
+}
--- a/compilation/HostTranslation/lib/ReplaceKernelArgs.cpp
+++ b/compilation/HostTranslation/lib/ReplaceKernelArgs.cpp
@ -0,0 +1,90 @@
+#include "ReplaceKernelArgs.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/ToolOutputFile.h"
+#include <iostream>
+#include <map>
+#include <set>
+
+using namespace llvm;
+
+/*
+ * before:
+ * %m_cuda.addr = alloca float*, align 8
+ * after:
+ * %m_cuda.addr_tmp = call i8* @malloc(i64 <bytes>)
+ * %m_cuda.addr = bitcast i8* %m_cuda.addr_tmp to float**
+ *
+ * Applied to every alloca in every function of M that contains a direct call
+ * to a callee whose name starts with "cudaLaunchKernel". `malloc` is declared
+ * in M as `i8* (i64)` when it is not already present. No matching `free` is
+ * emitted.
+ *
+ * <bytes> is at least 256 (the size that used to be hard-coded) and grows to
+ * the allocation size of the alloca when that is a compile-time constant and
+ * larger. An alloca with a non-constant element count still gets 256 bytes.
+ */
+// TODO: we use hard-code to implement this replacement,
+// to use use-analysis to find the arguments in the future
+void ReplaceKernelArg(llvm::Module *M) {
+  LLVMContext &context = M->getContext();
+  llvm::Type *Int64Ty = llvm::Type::getInt64Ty(context);
+
+  // Functions that launch a kernel directly.
+  std::set<llvm::Function *> need_replace;
+  for (Function &candidate : *M) {
+    for (BasicBlock &block : candidate) {
+      for (Instruction &inst : block) {
+        auto *call = llvm::dyn_cast<llvm::CallInst>(&inst);
+        if (!call) {
+          continue;
+        }
+        Function *callee = call->getCalledFunction();
+        if (callee && callee->getName().startswith("cudaLaunchKernel")) {
+          need_replace.insert(&candidate);
+        }
+      }
+    }
+  }
+
+  // find/create C's malloc function
+  llvm::FunctionType *mallocFuncType = FunctionType::get(
+      llvm::Type::getInt8PtrTy(context), {Int64Ty}, false);
+  // Used as a FunctionCallee: if malloc already exists with another type the
+  // callee is a cast expression, and cast<Function> on it would fail.
+  llvm::FunctionCallee malloc_callee =
+      M->getOrInsertFunction("malloc", mallocFuncType);
+
+  const DataLayout &layout = M->getDataLayout();
+  const uint64_t kMinBytes = 256;
+
+  for (llvm::Function *F : need_replace) {
+    std::vector<llvm::Instruction *> need_remove;
+    for (BasicBlock &block : *F) {
+      for (Instruction &inst : block) {
+        auto *alloc = llvm::dyn_cast<llvm::AllocaInst>(&inst);
+        if (!alloc) {
+          continue;
+        }
+
+        // Never hand out less than the alloca needs; writing through the
+        // replacement pointer would otherwise overrun the heap block.
+        uint64_t bytes = kMinBytes;
+        if (auto *count = dyn_cast<ConstantInt>(alloc->getArraySize())) {
+          const uint64_t element_bytes =
+              layout.getTypeAllocSize(alloc->getAllocatedType());
+          const uint64_t needed = element_bytes * count->getZExtValue();
+          if (needed > bytes) {
+            bytes = needed;
+          }
+        }
+
+        // just replace all alloc in that function
+        llvm::Value *size_arg = ConstantInt::get(Int64Ty, bytes);
+        auto c_malloc_inst =
+            llvm::CallInst::Create(malloc_callee, {size_arg}, "", alloc);
+        auto bit_cast = new BitCastInst(c_malloc_inst, alloc->getType(),
+                                        alloc->getName().str(), alloc);
+        alloc->replaceAllUsesWith(bit_cast);
+        need_remove.push_back(alloc);
+      }
+    }
+    // Erase after the walk so the instruction iteration stays valid.
+    for (llvm::Instruction *dead : need_remove) {
+      dead->eraseFromParent();
+    }
+  }
+}
--- a/compilation/HostTranslation/lib/ReplaceKernelLaunch.cpp
+++ b/compilation/HostTranslation/lib/ReplaceKernelLaunch.cpp
@ -1,94 +0,0 @@
-#include "ReplaceKernelLaunch.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include <iostream>
-#include <map>
-#include <set>
-
-using namespace llvm;
-
-// Change to i8* bitcast (i8* (i8*)* @_Z9vecPKiS0_Pii_wrapper to i8*)
-// Original: i8* bitcast (void (i32*, i32*, i32*, i32)* @_Z9vecPKiS0_Pii to i8*)
-void ReplaceKernelLaunch(llvm::Module *M) {
-  LLVMContext &context = M->getContext();
-  auto VoidTy = llvm::Type::getVoidTy(context);
-  auto I8 = llvm::Type::getInt8PtrTy(context);
-  std::map<std::string, BitCastInst *> kernels;
-
-  LLVMContext *C = &M->getContext();
-
-  llvm::Type *Int32T = Type::getInt32Ty(*C);
-  llvm::Type *Int8T = Type::getInt8Ty(*C);
-
-  llvm::FunctionType *LauncherFuncT =
-      FunctionType::get(Type::getVoidTy(*C), NULL);
-
-  llvm::FunctionType *LaunchFun2 =
-      FunctionType::get(PointerType::get(PointerType::get(Int32T, 0), 0), NULL);
-
-  bool done = false;
-
-  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
-    Function *F = &(*i);
-    auto func_name = F->getName().str();
-
-    for (Function::iterator b = F->begin(); b != F->end(); ++b) {
-      BasicBlock *B = &(*b);
-
-      for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) {
-        Instruction *inst = &(*i);
-
-        if (llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(inst)) {
-          if (Function *calledFunction = callInst->getCalledFunction()) {
-
-            if (calledFunction->getName().startswith("cudaLaunchKernel")) {
-
-              Value *callOperand = callInst->getArgOperand(0);
-
-              Function *functionOperand =
-                  dyn_cast<Function>(callInst->getArgOperand(0));
-
-              // call function is wrapped in a bitcast
-              if (functionOperand == NULL) {
-
-                std::vector<size_t> arg_sizes;
-                functionOperand =
-                    dyn_cast<Function>(callOperand->stripPointerCasts());
-
-                FunctionType *ft = calledFunction->getFunctionType();
-                std::cout << " Parent (Caller) Function Name: " << func_name
-                          << ", cudaLaunchKernel Function: "
-                          << functionOperand->getName().str() << ", args "
-                          << functionOperand->arg_size() << std::endl;
-                auto rep = kernels.find(functionOperand->getName().str());
-                if (rep != kernels.end()) {
-
-                  callInst->setArgOperand(0, rep->second);
-                  continue;
-                }
-
-                std::vector<Type *> Params;
-                Params.push_back(I8);
-                FunctionType *FT = FunctionType::get(VoidTy, Params, false);
-                std::string newName =
-                    functionOperand->getName().str() + "_wrapper";
-
-                Function *F =
-                    Function::Create(FT, Function::ExternalLinkage, newName, M);
-                F->setDSOLocal(true);
-
-                BitCastInst *BC = new BitCastInst(F, I8, "", callInst);
-                callInst->setArgOperand(0, BC);
-                kernels.insert({functionOperand->getName().str(), BC});
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
--- a/compilation/KernelTranslation.cpp
+++ b/compilation/KernelTranslation.cpp
@ -8,46 +8,66 @@
 #include "warp_func.h"
 #include "llvm/IR/Module.h"
 #include <assert.h>
+#include <fstream>
 #include <iostream>
+#include <llvm/Support/raw_ostream.h>
 #include <map>
 #include <set>
 #include <stdlib.h>

 using namespace llvm;

+std::string PATH = "kernel_meta.log";
+
 int main(int argc, char **argv) {
-  assert(argc == 9 && "incorrect number of arguments\n");
+  assert(argc == 3 && "incorrect number of arguments\n");
  llvm::Module *program = LoadModuleFromFilr(argv[1]);
-  // get size of grid and dim from input arguments
-  int *grid_dim = new int[3];
-  int *block_dim = new int[3];
-  grid_dim[0] = atoi(argv[3]);
-  grid_dim[1] = atoi(argv[4]);
-  grid_dim[2] = atoi(argv[5]);
-  block_dim[0] = atoi(argv[6]);
-  block_dim[1] = atoi(argv[7]);
-  block_dim[2] = atoi(argv[8]);
+
+  std::ofstream fout;
+  fout.open(PATH);

  // inline, and create auxiliary global variables
-  init_block(program);
+  init_block(program, fout);
  // insert sync before each vote, and replace the
  // original vote function to warp vote
  handle_warp_vote(program);
+
  // replace warp shuffle
+  // VerifyModule(program);
  handle_warp_shfl(program);
  // insert sync
+  // VerifyModule(program);
  insert_sync(program);
  // split block by sync
+  // VerifyModule(program);
+  std::cout << "split\n" << std::flush;
  split_block_by_sync(program);
  // add loop for intra&intera thread
-  insert_warp_loop(program);
-  // (TODO): replace this patch
-  replace_built_in_function(program, grid_dim, block_dim);
+
  // VerifyModule(program);
+  std::cout << "insert\n" << std::flush;
+  insert_warp_loop(program);
+
+  // VerifyModule(program);
+
+  // (TODO): replace this patch
+  std::cout << "replace\n" << std::flush;
+  replace_built_in_function(program);
+
+  // VerifyModule(program);
+  std::cout << "generate\n" << std::flush;
  generate_x86_format(program);
+
+  // VerifyModule(program);
+
  // performance optimization
  performance_optimization(program);

+  VerifyModule(program);
+
  DumpModule(program, argv[2]);
+
+  fout.close();
+
  return 0;
 }
--- a/compilation/KernelTranslation/include/generate_x86_format.h
+++ b/compilation/KernelTranslation/include/generate_x86_format.h
@ -5,4 +5,6 @@

 void generate_x86_format(llvm::Module *M);

+void set_meta_data(llvm::Module *M);
+
 #endif
--- a/compilation/KernelTranslation/include/init.h
+++ b/compilation/KernelTranslation/include/init.h
@ -2,6 +2,6 @@
 #define __NVVM2x86_INIT__

 #include "llvm/IR/Module.h"
-
-void init_block(llvm::Module *M);
+#include <fstream>
+void init_block(llvm::Module *M, std::ofstream &fout);
 #endif
--- a/compilation/KernelTranslation/include/memory_hierarchy.h
+++ b/compilation/KernelTranslation/include/memory_hierarchy.h
@ -1,9 +1,10 @@
 #ifndef __NVVM2x86_MEMORY_HIERARCHY__
 #define __NVVM2x86_MEMORY_HIERARCHY__
 #include "llvm/IR/Module.h"
-
+#include <fstream>
 using namespace llvm;

 void mem_share2global(llvm::Module *M);
+void mem_constant2global(llvm::Module *M, std::ofstream &fout);

 #endif
--- a/compilation/KernelTranslation/include/tool.h
+++ b/compilation/KernelTranslation/include/tool.h
@ -12,7 +12,7 @@ llvm::CallInst *CreateIntraWarpBarrier(llvm::Instruction *InsertBefore);
 void VerifyModule(llvm::Module *);
 void phi2alloc(llvm::Module *M);
 void remove_cuda_built_in(llvm::Module *M);
-void replace_built_in_function(llvm::Module *M, int *grid_dim, int *block_dim);
+void replace_built_in_function(llvm::Module *M);
 void replace_asm_call(llvm::Module *M);
 bool find_block_barrier_in_region(llvm::BasicBlock *start,
                                  llvm::BasicBlock *end);
@ -21,4 +21,5 @@ bool has_warp_barrier(llvm::BasicBlock *B);
 bool has_barrier(llvm::BasicBlock *B);
 bool has_block_barrier(llvm::BasicBlock *B);
 bool has_barrier(llvm::Function *F);
+void replace_dynamic_shared_memory(llvm::Module *M);
 #endif
--- a/compilation/KernelTranslation/lib/generate_x86_format.cpp
+++ b/compilation/KernelTranslation/lib/generate_x86_format.cpp
@ -18,6 +18,7 @@
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
+#include <iostream>

 using namespace llvm;

@ -40,6 +41,10 @@ void decode_input(llvm::Module *M) {
  llvm::FunctionType *LauncherFuncT = FunctionType::get(
      Type::getVoidTy(*C), {PointerType::get(Int8T, 0)}, false);

+  std::set<GlobalVariable *> dynmaic_memory;
+
+  std::map<GlobalVariable *, Value *> corres_dynamic_memory_load_address;
+
  // generate Wrapper Function type
  // now we only support a single int32*
  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
@ -64,6 +69,51 @@ void decode_input(llvm::Module *M) {
    // convert to int**
    input_arg = Builder.CreateBitOrPointerCast(
        input_arg, PointerType::get(PointerType::get(Int32T, 0), 0));
+
+    // dynamic memory load in the wrapper function
+    GlobalVariable *share_memory = M->getGlobalVariable("wrapper_global_data");
+    if (share_memory != NULL) {
+      dynmaic_memory.insert(share_memory);
+      llvm::GlobalVariable *global_mem = new llvm::GlobalVariable(
+          *M, Int32T, false, llvm::GlobalValue::ExternalLinkage, NULL,
+          "thread_memory_size", NULL, llvm::GlobalValue::GeneralDynamicTLSModel,
+          0, false);
+      Value *loadedValue = Builder.CreateLoad(global_mem);
+
+      llvm::FunctionType *LaunchFun2 = FunctionType::get(
+          PointerType::get(PointerType::get(Int32T, 0), 0), NULL);
+
+      FunctionCallee fc2 =
+          M->getOrInsertFunction("_wrapper_global_data", LaunchFun2);
+
+      Function *WorkGroup2 = dyn_cast<Function>(fc2.getCallee());
+
+      WorkGroup2->setLinkage(GlobalValue::WeakODRLinkage);
+      WorkGroup2->setVisibility(GlobalValue::HiddenVisibility);
+      Comdat *co = M->getOrInsertComdat("_wrapper_global_data");
+      co->setSelectionKind(Comdat::SelectionKind::Any);
+      WorkGroup2->setComdat(co);
+
+      BasicBlock *Block2 = BasicBlock::Create(M->getContext(), "", WorkGroup2);
+
+      llvm::IRBuilder<> Builder2(M->getContext());
+      Builder2.SetInsertPoint(Block2);
+      Builder2.CreateRet(share_memory);
+
+      auto PT = dyn_cast<PointerType>(share_memory->getType());
+      auto element_type = PT->getElementType();
+      // std::cout << element_type->getTypeID()  << " Got global memor $$$$$$"
+      // << share_memory->getName().str() << std::endl;
+
+      AllocaInst *new_arr = Builder.CreateAlloca(Int8T, loadedValue, "new_arr");
+      // new_arr->setAlignment(llvm::MaybeAlign(16));
+      Value *new_ar = new_arr;
+      Value *gptr = Builder.CreateBitOrPointerCast(
+          share_memory, PointerType::get(PointerType::get(Int8T, 0), 0));
+
+      Builder.CreateStore(new_ar, gptr);
+    }
+
    size_t idx = 0;
    for (Function::const_arg_iterator ii = F->arg_begin(), ee = F->arg_end();
         ii != ee; ++ii) {
@ -95,6 +145,8 @@ void remove_barrier(llvm::Module *M) {
    for (auto BB = F->begin(); BB != F->end(); ++BB) {
      for (auto BI = BB->begin(); BI != BB->end(); BI++) {
        if (auto Call = dyn_cast<CallInst>(BI)) {
+          if (Call->isInlineAsm())
+            continue;
          auto func_name = Call->getCalledFunction()->getName().str();
          if (func_name == "llvm.nvvm.bar.warp.sync" ||
              func_name == "llvm.nvvm.barrier0" ||
@ -109,6 +161,11 @@ void remove_barrier(llvm::Module *M) {
  }
 }

+// Erase the globals named "intra_warp_index" and "inter_warp_index" from M.
+// getGlobalVariable() returns a null pointer when no matching global is
+// found, so each lookup is checked before the variable is erased.
+void remove_useless_var(llvm::Module *M) {
+  if (auto *intra = M->getGlobalVariable("intra_warp_index"))
+    intra->eraseFromParent();
+  if (auto *inter = M->getGlobalVariable("inter_warp_index"))
+    inter->eraseFromParent();
+}
+
 void generate_x86_format(llvm::Module *M) {
  // change metadata
  set_meta_data(M);
@ -116,4 +173,6 @@ void generate_x86_format(llvm::Module *M) {
  decode_input(M);
  // remove barrier
  remove_barrier(M);
+  // remove useless func/variable
+  remove_useless_var(M);
 }
--- a/compilation/KernelTranslation/lib/handle_sync.cpp
+++ b/compilation/KernelTranslation/lib/handle_sync.cpp
@ -27,6 +27,8 @@ void split_block_by_sync(llvm::Function *F) {
      }
      llvm::CallInst *Call = llvm::dyn_cast<llvm::CallInst>(inst);
      if (Call) {
+        if (Call->isInlineAsm())
+          continue;
        auto func_name = Call->getCalledFunction()->getName().str();
        if (func_name == "llvm.nvvm.barrier0" ||
            func_name == "llvm.nvvm.bar.warp.sync" ||
--- a/compilation/KernelTranslation/lib/init.cpp
+++ b/compilation/KernelTranslation/lib/init.cpp
@ -1,6 +1,7 @@
 #include "init.h"
 #include "memory_hierarchy.h"
 #include "tool.h"
+#include <fstream>
 #include <iostream>
 #include <set>

@ -23,7 +24,8 @@

 using namespace llvm;

-void inline_func_vote(llvm::Module *M) {
+bool inline_warp_level_func(llvm::Module *M) {
+  bool changed = false;
  std::set<llvm::Function *> need_remove;

  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
@ -36,10 +38,13 @@ void inline_func_vote(llvm::Module *M) {
      for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
        if (CallInst *c = dyn_cast<CallInst>(BI++)) {
          if (c->getCalledFunction()) {
-            if (c->getCalledFunction()->getName().str() == "_Z10__any_syncji") {
+            auto func_name = c->getCalledFunction()->getName().str();
+            if (func_name == "_Z10__any_syncji" ||
+                func_name.find("shfl_down_sync") != std::string::npos) {
              InlineFunctionInfo IFI;
              InlineFunction(c, IFI);
              need_remove.insert(c->getCalledFunction());
+              changed = true;
            }
          }
        }
@ -50,6 +55,56 @@ void inline_func_vote(llvm::Module *M) {
    f->dropAllReferences();
    f->eraseFromParent();
  }
+  return changed;
+}
+
+// Report whether F contains a direct call to a function whose name contains
+// "llvm.nvvm.read.ptx.sreg.". Calls with no statically known callee are
+// ignored.
+bool find_sreg_inst(llvm::Function *F) {
+  for (BasicBlock &block : *F) {
+    for (Instruction &inst : block) {
+      auto *call = dyn_cast<CallInst>(&inst);
+      if (call == nullptr)
+        continue;
+      Function *callee = call->getCalledFunction();
+      if (callee == nullptr)
+        continue;
+      const auto callee_name = callee->getName().str();
+      if (callee_name.find("llvm.nvvm.read.ptx.sreg.") != std::string::npos)
+        return true;
+    }
+  }
+  return false;
+}
+// Inline every direct call in M whose callee satisfies find_sreg_inst().
+// Call sites are collected first and inlined afterwards, so no function is
+// modified while its instructions are being iterated.
+// Returns true only when at least one call was actually inlined. A caller
+// that loops until this returns false therefore cannot spin forever on a
+// call that InlineFunction declines to inline.
+bool inline_func_with_tid(llvm::Module *M) {
+  std::set<CallInst *> need_inline;
+  for (Function &F : *M) {
+    for (BasicBlock &BB : F) {
+      for (Instruction &I : BB) {
+        auto *c = dyn_cast<CallInst>(&I);
+        if (!c)
+          continue;
+        Function *callee = c->getCalledFunction();
+        if (callee && find_sreg_inst(callee)) {
+          printf("inline: %s\n", callee->getName().str().c_str());
+          need_inline.insert(c);
+        }
+      }
+    }
+  }
+  bool changed = false;
+  for (auto c : need_inline) {
+    InlineFunctionInfo IFI;
+    // count the call only if the inliner reports success
+    if (InlineFunction(c, IFI))
+      changed = true;
+  }
+  return changed;
 }

 void create_global_variable(llvm::Module *M) {
@ -70,21 +125,33 @@ void create_global_variable(llvm::Module *M) {
                           llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
  new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
                           NULL, "block_size", NULL,
-                           llvm::GlobalValue::NotThreadLocal, 0, false);
+                           llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
  new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
                           NULL, "block_size_x", NULL,
-                           llvm::GlobalValue::NotThreadLocal, 0, false);
+                           llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
  new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
                           NULL, "block_size_y", NULL,
-                           llvm::GlobalValue::NotThreadLocal, 0, false);
+                           llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
  new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
                           NULL, "block_size_z", NULL,
-                           llvm::GlobalValue::NotThreadLocal, 0, false);
+                           llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
  new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
-                           NULL, "grid_size", NULL,
-                           llvm::GlobalValue::NotThreadLocal, 0, false);
+                           NULL, "grid_size_x", NULL,
+                           llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
  new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
-                           NULL, "block_index", NULL,
+                           NULL, "grid_size_y", NULL,
+                           llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
+  new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
+                           NULL, "grid_size_z", NULL,
+                           llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
+  new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
+                           NULL, "block_index_x", NULL,
+                           llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
+  new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
+                           NULL, "block_index_y", NULL,
+                           llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
+  new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
+                           NULL, "block_index_z", NULL,
                           llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
  // TLS variable used for warp-level collective operators
  new llvm::GlobalVariable(
@ -224,24 +291,23 @@ bool lower_constant_expr(llvm::Module *M) {
          auto load_from = load_inst->getOperand(0);
          if (auto get_element_ptr = dyn_cast<llvm::ConstantExpr>(load_from)) {
            modified = true;
-            auto ReplInst = get_element_ptr->getAsInstruction();
-            ReplInst->insertBefore(load_inst);
            std::vector<Instruction *> Users;
-            // Do not replace use during iteration of use. Do it in another loop
            for (auto U : get_element_ptr->users()) {
              if (auto InstUser = dyn_cast<Instruction>(U)) {
                Users.push_back(InstUser);
              }
            }
-            for (auto &User : Users)
+            for (auto &User : Users) {
+              auto ReplInst = get_element_ptr->getAsInstruction();
+              ReplInst->insertBefore(User);
              User->replaceUsesOfWith(get_element_ptr, ReplInst);
            }
+          }
        } else if (auto store_inst = dyn_cast<llvm::StoreInst>(BI)) {
          auto store_to = store_inst->getOperand(1);
          if (auto addr_cast = dyn_cast<llvm::ConstantExpr>(store_to)) {
            modified = true;
-            auto ReplInst = addr_cast->getAsInstruction();
-            ReplInst->insertBefore(store_inst);
+
            std::vector<Instruction *> Users;
            // Do not replace use during iteration of use. Do it in another loop
            for (auto U : addr_cast->users()) {
@ -249,16 +315,19 @@ bool lower_constant_expr(llvm::Module *M) {
                Users.push_back(InstUser);
              }
            }
-            for (auto &User : Users)
+            for (auto &User : Users) {
+              auto ReplInst = addr_cast->getAsInstruction();
+              ReplInst->insertBefore(User);
              User->replaceUsesOfWith(addr_cast, ReplInst);
            }
+          }
        } else if (auto get_element_ptr =
                       dyn_cast<llvm::GetElementPtrInst>(BI)) {
          auto get_from = get_element_ptr->getOperand(0);
          if (auto addr_cast = dyn_cast<llvm::ConstantExpr>(get_from)) {
            modified = true;
-            auto ReplInst = addr_cast->getAsInstruction();
-            ReplInst->insertBefore(get_element_ptr);
+            // auto ReplInst = addr_cast->getAsInstruction();
+            // ReplInst->insertBefore(get_element_ptr);
            std::vector<Instruction *> Users;
            // Do not replace use during iteration of use. Do it in another loop
            for (auto U : addr_cast->users()) {
@ -266,21 +335,37 @@ bool lower_constant_expr(llvm::Module *M) {
                Users.push_back(InstUser);
              }
            }
-            for (auto &User : Users)
+            for (auto &User : Users) {
+              auto ReplInst = addr_cast->getAsInstruction();
+              ReplInst->insertBefore(User);
              User->replaceUsesOfWith(addr_cast, ReplInst);
            }
          }
        }
      }
    }
+  }
  return modified;
 }

-void init_block(llvm::Module *M) {
+// Delete the body of every function in M whose name contains "_ZL3expd".
+void replace_cuda_math_built_in(llvm::Module *M) {
+  for (Function &fn : *M) {
+    const bool is_expd =
+        fn.getName().str().find("_ZL3expd") != std::string::npos;
+    if (!is_expd)
+      continue;
+    fn.deleteBody();
+  }
+}
+
+void init_block(llvm::Module *M, std::ofstream &fout) {
  // using official llvm preprocess
  llvm_preprocess(M);
  // remove useles Cuda function
  remove_cuda_built_in(M);
+  // replace CUDA math function, like expf
+  replace_cuda_math_built_in(M);

  // lower ConstantExpression
  bool modified;
@ -289,14 +374,26 @@ void init_block(llvm::Module *M) {
  } while (modified);
  // remove useless metadata
  remove_metadata(M);
-  // inline vote function
-  inline_func_vote(M);
+  // inline warp-level function
+  while (1) {
+    if (!inline_warp_level_func(M))
+      break;
+  }
+  // TODO: remove the hardcode
+  while (1) {
+    if (!inline_func_with_tid(M))
+      break;
+  }
  // create global variable for warp and vote
  create_global_variable(M);
  // replace phi with data load
  phi2alloc(M);
  // replace share memory
  mem_share2global(M);
+  // replace constant memory
+  mem_constant2global(M, fout);
  // replace asm Inline
  replace_asm_call(M);
+  // replace dynamic shared memory
+  replace_dynamic_shared_memory(M);
 }
--- a/compilation/KernelTranslation/lib/insert_sync.cpp
+++ b/compilation/KernelTranslation/lib/insert_sync.cpp
@ -212,11 +212,22 @@ public:
      changed = true;

      // we may create a new conditional barrier after insert
-      if (!PDT->getPostDomTree().dominates(pred, &F.getEntryBlock()))
+      if (!PDT->getPostDomTree().dominates(pred, &F.getEntryBlock())) {
+        // if the block postdominates all its predecessor
+        // then it is not a conditional barriers
+        bool post_dominate_all = true;
+        for (auto I = pred_begin(pred); I != pred_end(pred); I++) {
+          if (!PDT->getPostDomTree().dominates(pred, *I)) {
+            post_dominate_all = false;
+            break;
+          }
+        }
+        if (!post_dominate_all)
          conditionalBarriers.push_back(pred);
+      }

      // find any block which are not dominated by header
-      // but be posdiminated by merge point
+      // but be postdominated by merge point
      std::queue<llvm::BasicBlock *> if_body;
      std::set<llvm::BasicBlock *> visited_block;
      for (int i = 0; i < pred->getTerminator()->getNumSuccessors(); i++) {
@ -234,19 +245,26 @@ public:
            PDT->getPostDomTree().dominates(merge_point, curr)) {
          // we should insert barrier at the beginning and
          // end of its predecessor
+          printf("insert [255]: %s\n", curr->getName().str().c_str());
          if (has_warp_barrier(b)) {
            CreateIntraWarpBarrier(&(*curr->begin()));
            for (BasicBlock *Pred : predecessors(curr)) {
+              printf("insert [262]: %s\n", Pred->getName().str().c_str());
              CreateIntraWarpBarrier(&(*Pred->getTerminator()));
            }
          } else {
            CreateInterWarpBarrier(&(*curr->begin()));
            for (BasicBlock *Pred : predecessors(curr)) {
+              printf("insert [268]: %s\n", Pred->getName().str().c_str());
              CreateInterWarpBarrier(&(*Pred->getTerminator()));
            }
          }
        }
        for (int i = 0; i < curr->getTerminator()->getNumSuccessors(); i++) {
+          // avoid backedge
+          if (DT->dominates(curr->getTerminator()->getSuccessor(i), pred)) {
+            continue;
+          }
          if_body.push(curr->getTerminator()->getSuccessor(i));
        }
      }
@ -266,6 +284,32 @@ public:
    AU.addRequired<DominatorTreeWrapperPass>();
  }

+  // Breadth-first search over the successors of `start` for the first block
+  // that dominates `start` in the given post-dominator tree.
+  // `start` is expected to end in a terminator with exactly two successors.
+  // If no such block is reached, the trailing assert fires; with asserts
+  // disabled the function returns NULL.
+  BasicBlock *find_merge_point(BasicBlock *start, PostDominatorTree &PDT) {
+    assert(start->getTerminator()->getNumSuccessors() == 2);
+    std::set<llvm::BasicBlock *> seen;
+    std::queue<llvm::BasicBlock *> worklist;
+    auto *start_term = start->getTerminator();
+    for (unsigned idx = 0; idx < start_term->getNumSuccessors(); ++idx)
+      worklist.push(start_term->getSuccessor(idx));
+
+    while (!worklist.empty()) {
+      BasicBlock *candidate = worklist.front();
+      worklist.pop();
+
+      // insert().second is false when the block was already visited
+      if (!seen.insert(candidate).second)
+        continue;
+
+      if (PDT.dominates(candidate, start))
+        return candidate;
+
+      auto *term = candidate->getTerminator();
+      for (unsigned idx = 0; idx < term->getNumSuccessors(); ++idx) {
+        BasicBlock *next = term->getSuccessor(idx);
+        if (seen.count(next) == 0)
+          worklist.push(next);
+      }
+    }
+    assert(0 && "Do not find merge point\n");
+    return NULL;
+  }
  virtual bool runOnFunction(Function &F) {
    if (!isKernelFunction(F.getParent(), &F))
      return 0;
@ -280,18 +324,8 @@ public:

    for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i) {
      BasicBlock *b = &*i;
-      BasicBlock *merge_point = NULL;
      if (b->getTerminator()->getNumSuccessors() == 2) {
-        auto b1 = b->getTerminator()->getSuccessor(0);
-        auto b2 = b->getTerminator()->getSuccessor(1);
-        if (PDT->getPostDomTree().dominates(b1, b2)) {
-          merge_point = b1;
-        } else if (PDT->getPostDomTree().dominates(b2, b2)) {
-          merge_point = b2;
-        } else {
-          assert(0 && "find complex if-else branch\n");
-        }
-        std::cout << std::flush;
+        auto merge_point = find_merge_point(b, PDT->getPostDomTree());
        for (BasicBlock *Pred : predecessors(merge_point)) {
          if (!DT->dominates(b, Pred)) {
            // we need to insert an extra block to be the merge point
@ -305,14 +339,8 @@ public:
    auto M = F.getParent();
    for (auto head : if_head) {
      assert(head->getTerminator()->getNumSuccessors() == 2);
-      BasicBlock *merge_point = NULL;
-      auto s1 = head->getTerminator()->getSuccessor(0);
-      auto s2 = head->getTerminator()->getSuccessor(1);
-      if (PDT->getPostDomTree().dominates(s1, s2)) {
-        merge_point = s1;
-      } else {
-        merge_point = s2;
-      }
+      BasicBlock *merge_point = find_merge_point(head, PDT->getPostDomTree());
+      assert(PDT->getPostDomTree().dominates(merge_point, head));
      if (!find_barrier_in_region(head, merge_point)) {
        printf("do not need to handle tri-income if: %s\n",
               merge_point->getName().str().c_str());
@ -368,6 +396,8 @@ public:
      for (BasicBlock::iterator j = (*i)->begin(), e = (*i)->end(); j != e;
           ++j) {
        if (auto Call = dyn_cast<CallInst>(j)) {
+          if (Call->isInlineAsm())
+            continue;
          auto func_name = Call->getCalledFunction()->getName().str();
          if (func_name == "llvm.nvvm.barrier0" ||
              func_name == "llvm.nvvm.bar.warp.sync" ||
@ -383,7 +413,7 @@ public:
    }
    if (!is_conditional_loop)
      return 0;
-    // insert barrier at the beginning of header
+    // insert barrier at the beginning of header (for_cond)
    // and the end of pre header, so that we can get a
    // single block connected with latch
    if (!is_warp) {
@ -399,17 +429,40 @@ public:
    }

    // as we assume all loops are rotated, we have to insert
-    // barrier before the condition jump of the loop exit
-
-    if (auto exit_block = L->getExitingBlock()) {
+    // barrier before the condition jump of the for_cond
+    if (auto for_cond = L->getExitingBlock()) {
+      assert(for_cond->getTerminator()->getNumSuccessors() == 2 &&
+             "has more than 2 successors of the for-cond\n");
      auto conditional_br =
-          dyn_cast<llvm::BranchInst>(exit_block->getTerminator());
+          dyn_cast<llvm::BranchInst>(for_cond->getTerminator());
      assert(conditional_br && conditional_br->isConditional());
-      // insert barrier at the beginning of successor of exit
+      // insert barrier before the condition jump of the loop cond
      if (!is_warp)
        CreateInterWarpBarrier(conditional_br);
      else
        CreateIntraWarpBarrier(conditional_br);
+      // insert barrier before the for_body
+      auto for_body = for_cond->getTerminator()->getSuccessor(0);
+      if (for_body == L->getExitBlock()) {
+        for_body = for_cond->getTerminator()->getSuccessor(1);
+      }
+      // insert at the beginning of for_body
+      if (!is_warp)
+        CreateInterWarpBarrier(&(*for_body->begin()));
+      else
+        CreateIntraWarpBarrier(&(*for_body->begin()));
+      // insert at the beginning and end in for_inc block
+      if (auto for_inc = L->getLoopLatch()) {
+        if (!is_warp) {
+          CreateInterWarpBarrier(&(*for_inc->begin()));
+          CreateInterWarpBarrier(for_inc->getTerminator());
+        } else {
+          CreateIntraWarpBarrier(&(*for_inc->begin()));
+          CreateIntraWarpBarrier(for_inc->getTerminator());
+        }
+      } else {
+        assert(0 && "has continue in a barrier loop\n");
+      }
    } else {
      // handle break in for-loop
      printf("loop has multiply exists\n");
--- a/compilation/KernelTranslation/lib/insert_warp_loop.cpp
+++ b/compilation/KernelTranslation/lib/insert_warp_loop.cpp
@ -67,9 +67,15 @@ std::map<std::string, llvm::Instruction *> contextArrays;
 int tempInstructionIndex = 0;
 int need_nested_loop;

+// support for multiple kernels in one file
+
 bool ShouldNotBeContextSaved(llvm::Instruction *instr) {
  if (isa<BranchInst>(instr))
    return true;
+  // if (isa<AddrSpaceCastInst>(instr))
+  //   return true;
+  // if (isa<CastInst>(instr))
+  //   return true;

  llvm::Module *M = instr->getParent()->getParent()->getParent();
  llvm::LoadInst *load = dyn_cast<llvm::LoadInst>(instr);
@ -111,6 +117,7 @@ llvm::Instruction *GetContextArray(llvm::Instruction *instruction,
    return contextArrays[varName];

  BasicBlock &bb = instruction->getParent()->getParent()->getEntryBlock();
+
  IRBuilder<> builder(&*(bb.getFirstInsertionPt()));
  Function *FF = instruction->getParent()->getParent();
  Module *M = instruction->getParent()->getParent()->getParent();
@ -127,6 +134,7 @@ llvm::Instruction *GetContextArray(llvm::Instruction *instruction,

  Type *AllocType = elementType;
  AllocaInst *InstCast = dyn_cast<AllocaInst>(instruction);
+  /*
  if (InstCast) {
    unsigned Alignment = InstCast->getAlignment();

@ -166,7 +174,7 @@ llvm::Instruction *GetContextArray(llvm::Instruction *instruction,
      }
    }
  }
-
+  */
  llvm::Value *ItemSize = nullptr;
  llvm::AllocaInst *Alloca = nullptr;

@ -354,14 +362,37 @@ void handle_local_variable_intra_warp(std::vector<ParallelRegion> PRs) {
    auto F = PRs[0].start_block->getParent();
    for (auto bb = F->begin(); bb != F->end(); bb++) {
      for (auto ii = bb->begin(); ii != bb->end(); ii++) {
-        if (isa<AllocaInst>(&(*ii)))
-          instruction_to_fix.push_back(&(*ii));
+        if (isa<AllocaInst>(&(*ii))) {
+          auto alloc = dyn_cast<AllocaInst>(&(*ii));
+          // Do not duplicate var used outside PRs
+          bool used_in_non_PR = false;
+          for (Instruction::use_iterator ui = alloc->use_begin(),
+                                         ue = alloc->use_end();
+               ui != ue; ++ui) {
+            llvm::Instruction *user = dyn_cast<Instruction>(ui->getUser());
+            auto user_block = user->getParent();
+            bool find_in_PR = false;
+            for (auto PR : PRs) {
+              if (PR.wrapped_block.find(user_block) != PR.wrapped_block.end()) {
+                find_in_PR = true;
+                break;
+              }
+            }
+            if (find_in_PR == false) {
+              used_in_non_PR = true;
+              break;
+            }
+          }
+          if (!used_in_non_PR) {
+            instruction_to_fix.push_back(alloc);
+          }
+        }
+      }
    }
    for (auto inst : instruction_to_fix) {
      AddContextSaveRestore(inst, intra_warp_loop);
    }
  }
-  }

  for (auto parallel_regions : PRs) {
    std::set<llvm::Instruction *> instruction_in_region;
@ -380,10 +411,8 @@ void handle_local_variable_intra_warp(std::vector<ParallelRegion> PRs) {
      for (llvm::BasicBlock::iterator instr = bb->begin(); instr != bb->end();
           ++instr) {
        llvm::Instruction *instruction = &*instr;
-
        if (ShouldNotBeContextSaved(instruction))
          continue;
-
        for (Instruction::use_iterator ui = instruction->use_begin(),
                                       ue = instruction->use_end();
             ui != ue; ++ui) {
@ -582,6 +611,8 @@ void remove_barrier(llvm::Function *F, bool intra_warp_loop) {
  for (auto BB = F->begin(); BB != F->end(); ++BB) {
    for (auto BI = BB->begin(); BI != BB->end(); BI++) {
      if (auto Call = dyn_cast<CallInst>(BI)) {
+        if (Call->isInlineAsm())
+          continue;
        auto func_name = Call->getCalledFunction()->getName().str();
        if (func_name == "llvm.nvvm.bar.warp.sync") {
          need_remove.push_back(Call);
@ -648,6 +679,8 @@ public:
      bool has_barrier = 0;
      for (auto i = current->begin(), e = current->end(); i != e; ++i) {
        if (llvm::CallInst *call_inst = llvm::dyn_cast<llvm::CallInst>(&(*i))) {
+          if (call_inst->isInlineAsm())
+            continue;
          auto func_name = call_inst->getCalledFunction()->getName().str();
          if (func_name == "llvm.nvvm.barrier0" ||
              func_name == "llvm.nvvm.barrier.sync")
@ -761,6 +794,8 @@ public:
    for (Function::iterator s = F->begin(); s != F->end(); s++) {
      if (llvm::CallInst *call_inst =
              llvm::dyn_cast<llvm::CallInst>(s->begin())) {
+        if (call_inst->isInlineAsm())
+          continue;
        auto func_name = call_inst->getCalledFunction()->getName().str();
        if (func_name == "llvm.nvvm.barrier0" ||
            func_name == "llvm.nvvm.barrier.sync") {
@ -787,6 +822,12 @@ public:
    if (!isKernelFunction(F.getParent(), &F))
      return 0;

+    auto func_name = (&F)->getName().str();
+    // clear context array, temp variables for new kernel function
+    contextArrays.clear();
+    tempInstructionIds.clear();
+    tempInstructionIndex = 0;
+
    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    PDT = &getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();

@ -794,11 +835,11 @@ public:
    auto parallel_regions = getParallelRegions(&F, intra_warp_loop);
    assert(!parallel_regions.empty() && "can not find any parallel regions\n");
    // print_parallel_region(parallel_regions);
-    add_warp_loop(parallel_regions, intra_warp_loop);

    if (intra_warp_loop) {
      handle_local_variable_intra_warp(parallel_regions);
    }
+    add_warp_loop(parallel_regions, intra_warp_loop);
    remove_barrier(&F, intra_warp_loop);
    return 1;
  }
@ -816,6 +857,8 @@ bool has_warp_barrier(llvm::Module *M) {
    for (auto BB = F->begin(); BB != F->end(); ++BB) {
      for (auto BI = BB->begin(); BI != BB->end(); BI++) {
        if (auto Call = dyn_cast<CallInst>(BI)) {
+          if (Call->isInlineAsm())
+            continue;
          auto func_name = Call->getCalledFunction()->getName().str();
          if (func_name == "llvm.nvvm.bar.warp.sync") {
            return true;
@ -841,8 +884,8 @@ void insert_warp_loop(llvm::Module *M) {
    // only need a single loop, with size=block_size
    Passes.add(new InsertWarpLoopPass(intra_warp));
    Passes.run(*M);
+  }
  // remove all barriers
  for (auto F = M->begin(); F != M->end(); ++F)
    remove_barrier(dyn_cast<llvm::Function>(F), false);
 }
-}
--- a/compilation/KernelTranslation/lib/memory_hierarchy.cpp
+++ b/compilation/KernelTranslation/lib/memory_hierarchy.cpp
@ -9,6 +9,8 @@
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
 #include <assert.h>
+#include <fstream>
+#include <iostream>
 #include <map>
 #include <set>
 #include <sstream>
@ -36,15 +38,35 @@ void mem_share2global(llvm::Module *M) {
          auto new_name = "wrapper_global_" + share_memory->getName().str();
          auto element_type = PT->getElementType();
          if (auto array_type = dyn_cast<ArrayType>(element_type)) {
+            if (share_memory->hasExternalLinkage() &&
+                array_type->getArrayNumElements() == 0) {
+              // external shared memory of []
+              // generate global type pointer
+              PointerType *PointerTy =
+                  PointerType::get(array_type->getElementType(), 0);
+              llvm::Constant *x1 = ConstantPointerNull::get(PointerTy);
+              llvm::GlobalVariable *global_ptr = new llvm::GlobalVariable(
+                  *M, PointerTy, false, llvm::GlobalValue::CommonLinkage, x1,
+                  "wrapper_global_data", NULL,
+                  llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
+
+              global_ptr->setDSOLocal(true);
+
+              corresponding_global_memory.insert(
+                  std::pair<GlobalVariable *, GlobalVariable *>(share_memory,
+                                                                global_ptr));
+            } else {
              llvm::GlobalVariable *global_memory = new llvm::GlobalVariable(
-                *M, array_type, false, llvm::GlobalValue::ExternalLinkage, NULL,
-                new_name, NULL, llvm::GlobalValue::GeneralDynamicTLSModel, 1);
+                  *M, array_type, false, llvm::GlobalValue::ExternalLinkage,
+                  NULL, new_name, NULL,
+                  llvm::GlobalValue::GeneralDynamicTLSModel, 1);
              ConstantAggregateZero *const_array =
                  ConstantAggregateZero::get(array_type);
              global_memory->setInitializer(const_array);
              corresponding_global_memory.insert(
                  std::pair<GlobalVariable *, GlobalVariable *>(share_memory,
                                                                global_memory));
+            }
          } else if (auto int_type = dyn_cast<IntegerType>(element_type)) {
            auto zero = llvm::ConstantInt::get(int_type, 0, true);
            llvm::GlobalVariable *global_memory = new llvm::GlobalVariable(
@ -54,6 +76,16 @@ void mem_share2global(llvm::Module *M) {
            corresponding_global_memory.insert(
                std::pair<GlobalVariable *, GlobalVariable *>(share_memory,
                                                              global_memory));
+          } else if (element_type->isFloatTy()) {
+            auto FP_type = llvm::Type::getFloatTy(*C);
+            auto zero = llvm::ConstantFP::get(FP_type, 0);
+            llvm::GlobalVariable *global_memory = new llvm::GlobalVariable(
+                *M, FP_type, false, llvm::GlobalValue::ExternalLinkage, zero,
+                new_name, NULL, llvm::GlobalValue::GeneralDynamicTLSModel, 0,
+                false);
+            corresponding_global_memory.insert(
+                std::pair<GlobalVariable *, GlobalVariable *>(share_memory,
+                                                              global_memory));
          } else {
            assert(0 && "The required Share Memory Type is not supported\n");
          }
@ -62,57 +94,11 @@ void mem_share2global(llvm::Module *M) {
    }
  }

-  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
-    Function *F = &(*i);
-    for (Function::iterator i = F->begin(), e = F->end(); i != e; ++i) {
-      BasicBlock *b = &*i;
-      for (BasicBlock::iterator i = b->begin(), e = b->end(); i != e; ++i) {
-        if (auto get_element_ptr = dyn_cast<llvm::GetElementPtrInst>(i)) {
-          auto read_array = get_element_ptr->getPointerOperand();
-          if (GlobalVariable *read_share_memory =
-                  dyn_cast<llvm::GlobalVariable>(read_array)) {
-            // find a GetElementPtr which read share memory
-            if (corresponding_global_memory.find(read_share_memory) !=
-                corresponding_global_memory.end()) {
-              std::vector<Value *> Indices;
-              for (int i = 0; i < get_element_ptr->getNumIndices(); i++)
-                Indices.push_back(get_element_ptr->getOperand(i + 1));
-
-              auto new_GEP = GetElementPtrInst::Create(
-                  NULL, // Pointee type
-                  corresponding_global_memory.find(read_share_memory)
-                      ->second, // Alloca
-                  Indices,      // Indices
-                  "", get_element_ptr);
-              // replace all get_element_ptr with new_GEP:
-              // we can not directly use:
-              // get_element_ptr->replaceAllUsesWith(new_GEP);
-              // as get_element_ptr and new_GEP have different return type
-              llvm::Type *original_type = get_element_ptr->getType();
-              auto FormatASC = CastInst::CreatePointerBitCastOrAddrSpaceCast(
-                  new_GEP, original_type, "", get_element_ptr);
-              get_element_ptr->replaceAllUsesWith(FormatASC);
-              need_remove.insert(get_element_ptr);
-            }
-          }
-        } else if (auto addr_cast = dyn_cast<llvm::CastInst>(i)) {
-          auto read_array = addr_cast->getOperand(0);
-          if (GlobalVariable *read_share_memory =
-                  dyn_cast<llvm::GlobalVariable>(read_array)) {
-            // find a GetElementPtr which read share memory
-            if (corresponding_global_memory.find(read_share_memory) !=
-                corresponding_global_memory.end()) {
-              llvm::Type *original_type = addr_cast->getType();
-              auto FormatASC = CastInst::CreatePointerBitCastOrAddrSpaceCast(
-                  corresponding_global_memory.find(read_share_memory)->second,
-                  original_type, "", addr_cast);
-              addr_cast->replaceAllUsesWith(FormatASC);
-              need_remove.insert(addr_cast);
-            }
-          }
-        }
-      }
-    }
+  for (auto k : corresponding_global_memory) {
+    auto share_addr = k.first;
+    auto global_addr = k.second;
+    share_addr->replaceAllUsesWith(ConstantExpr::getPointerCast(
+        global_addr, cast<PointerType>(share_addr->getType())));
  }

  for (auto i : need_remove) {
@ -124,3 +110,83 @@ void mem_share2global(llvm::Module *M) {
    i->eraseFromParent();
  }
 }
+
+/*
+  Re-home every global that lives in address space 4 (the space this pass
+  treats as CUDA constant memory) as an ordinary global in address space 0.
+
+  For each such variable a replacement is created:
+    - extern, zero-length array: a pointer global named "wrapper_global_data",
+      initialised to null and flagged as externally initialised;
+    - any other array, or a struct: an external declaration (no initializer)
+      named "wrapper_global_<original name>".
+  Any other element type trips the assert below.
+
+  Every use of the original is then redirected to a pointer-cast of the
+  replacement, and the original variable is erased from the module.
+
+  M    - module to rewrite in place.
+  fout - receives the old-name/new-name mapping: a "ConstMemory2GlobalMemory"
+         header line, one "<old> to <new>" line per variable, then "END".
+*/
+void mem_constant2global(llvm::Module *M, std::ofstream &fout) {
+  std::map<GlobalVariable *, GlobalVariable *> corresponding_global_memory;
+  std::set<GlobalVariable *> need_remove_constant_memory;
+
+  // find all constant memory and generate corresponding global memory
+  for (auto I = M->global_begin(), E = M->global_end(); I != E; ++I) {
+    if (GlobalVariable *constant_memory = dyn_cast<GlobalVariable>(I)) {
+      if (auto PT = dyn_cast<PointerType>(I->getType())) {
+        unsigned AS = PT->getAddressSpace();
+        if (AS == 4) { // found a constant memory variable
+          need_remove_constant_memory.insert(constant_memory);
+          // generate the corresponding global memory variable
+          auto new_name = "wrapper_global_" + constant_memory->getName().str();
+          auto element_type = PT->getElementType();
+          if (auto array_type = dyn_cast<ArrayType>(element_type)) {
+            if (constant_memory->hasExternalLinkage() &&
+                array_type->getArrayNumElements() == 0) {
+              // external constant memory declared as an unsized array ([]):
+              // the storage is not known here, so expose a pointer to the
+              // element type instead of an array
+              PointerType *PointerTy =
+                  PointerType::get(array_type->getElementType(), 0);
+              llvm::Constant *x1 = ConstantPointerNull::get(PointerTy);
+              llvm::GlobalVariable *global_ptr = new llvm::GlobalVariable(
+                  *M, PointerTy, false, llvm::GlobalValue::ExternalLinkage, x1,
+                  "wrapper_global_data", NULL,
+                  llvm::GlobalValue::NotThreadLocal, 0, true);
+
+              corresponding_global_memory.insert(
+                  std::pair<GlobalVariable *, GlobalVariable *>(constant_memory,
+                                                                global_ptr));
+            } else {
+              // sized array: same type, declared without an initializer
+              llvm::GlobalVariable *global_memory = new llvm::GlobalVariable(
+                  *M, array_type, false, llvm::GlobalValue::ExternalLinkage,
+                  NULL, new_name, NULL, llvm::GlobalValue::NotThreadLocal, 0);
+              corresponding_global_memory.insert(
+                  std::pair<GlobalVariable *, GlobalVariable *>(constant_memory,
+                                                                global_memory));
+            }
+          } else if (element_type->isStructTy()) {
+            // struct: same type, declared without an initializer
+            llvm::GlobalVariable *global_memory = new llvm::GlobalVariable(
+                *M, element_type, false, llvm::GlobalValue::ExternalLinkage,
+                NULL, new_name, NULL, llvm::GlobalValue::NotThreadLocal, 0);
+            corresponding_global_memory.insert(
+                std::pair<GlobalVariable *, GlobalVariable *>(constant_memory,
+                                                              global_memory));
+          } else {
+            assert(0 && "The required Constant Memory Type is not supported\n");
+          }
+        }
+      }
+    }
+  }
+  fout << "ConstMemory2GlobalMemory\n";
+  for (auto k : corresponding_global_memory) {
+    auto const_addr = k.first;
+    auto global_addr = k.second;
+    // the replacement lives in a different address space, so users get a
+    // pointer cast back to the type they were written against
+    const_addr->replaceAllUsesWith(ConstantExpr::getPointerCast(
+        global_addr, cast<PointerType>(const_addr->getType())));
+    // this file will be used by host translator
+    fout << const_addr->getName().str() << " to "
+         << global_addr->getName().str() << "\n";
+  }
+  fout << "END\n";
+
+  // the originals have no users left at this point
+  for (auto i : need_remove_constant_memory) {
+    i->dropAllReferences();
+    i->eraseFromParent();
+  }
+}
--- a/compilation/KernelTranslation/lib/tool.cpp
+++ b/compilation/KernelTranslation/lib/tool.cpp
@ -1,5 +1,6 @@
 #include "tool.h"
 #include "llvm/Bitcode/BitcodeWriter.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/IRBuilder.h"
@ -187,7 +188,52 @@ void remove_cuda_built_in(llvm::Module *M) {
  }
 }

-void replace_built_in_function(llvm::Module *M, int *grid_dim, int *block_dim) {
+// copied from POCL
+// Turns every ConstantExpr that uses Val (directly, or through another
+// ConstantExpr) into an instruction placed at the top of Func's entry block.
+static void breakConstantExpressions(llvm::Value *Val, llvm::Function *Func) {
+  // Iterate over a snapshot: destroying an expression below removes it from
+  // Val's user list.
+  std::vector<llvm::Value *> Snapshot(Val->user_begin(), Val->user_end());
+  for (llvm::Value *Candidate : Snapshot) {
+    auto *Expr = llvm::dyn_cast<llvm::ConstantExpr>(Candidate);
+    if (!Expr)
+      continue; // only constant expressions are rewritten
+
+    // Lower the users of this expression first, so that no constant
+    // expression still refers to it when it is destroyed.
+    breakConstantExpressions(Expr, Func);
+
+    // Materialise the expression as an instruction ahead of the current
+    // first instruction of the function, then retire the constant.
+    llvm::Instruction *Lowered = Expr->getAsInstruction();
+    Lowered->insertBefore(&*Func->begin()->begin());
+    Expr->replaceAllUsesWith(Lowered);
+    Expr->destroyConstant();
+  }
+}
+
+// For each kernel function: lower the constant expressions over every module
+// global into instructions of that kernel, then make users of the
+// "dynamic_shared_memory" global go through a load of that global followed by
+// a bitcast back to the global's own pointer type. Both new instructions are
+// placed at the top of the kernel's entry block.
+// Returns as soon as the module turns out to have no such global.
+// NOTE(review): the use filter below only excludes the two new instructions;
+// it does not limit replacement to users inside the current kernel - confirm
+// that a module never has more than one function using this global.
+void replace_dynamic_shared_memory(llvm::Module *M) {
+  for (auto &Fn : *M) {
+    Function *F = &Fn;
+    if (!isKernelFunction(M, F))
+      continue;
+
+    for (auto &GV : M->globals())
+      breakConstantExpressions(&GV, F);
+
+    auto *dsm_global = M->getGlobalVariable("dynamic_shared_memory");
+    if (dsm_global == nullptr)
+      return;
+
+    // Both instructions go in front of what is currently the first
+    // instruction, giving the order: load, bitcast, <old first instruction>.
+    Instruction *entry_front = &*F->begin()->begin();
+    auto *loaded_ptr = new LoadInst(dsm_global, "new_load");
+    auto *cast_back =
+        new BitCastInst(loaded_ptr, dsm_global->getType(), "new_bit_cast");
+    loaded_ptr->insertBefore(entry_front);
+    cast_back->insertBefore(entry_front);
+
+    // The load has to keep reading the global itself, so the freshly
+    // created instructions are exempt from the replacement.
+    dsm_global->replaceUsesWithIf(cast_back, [&](Use &U) {
+      auto *user_inst = dyn_cast<Instruction>(U.getUser());
+      const bool is_new_inst =
+          user_inst == cast_back || user_inst == loaded_ptr;
+      return !is_new_inst;
+    });
+  }
+}
+
+void replace_built_in_function(llvm::Module *M) {
  LLVMContext &context = M->getContext();
  auto I32 = llvm::Type::getInt32Ty(context);
  std::vector<llvm::Instruction *> need_remove;
@ -203,28 +249,60 @@ void replace_built_in_function(llvm::Module *M, int *grid_dim, int *block_dim) {
    auto local_intra_warp_idx =
        builder.CreateAlloca(global_intra_warp_idx->getType()->getElementType(),
                             0, "local_intra_warp_idx");
-    global_intra_warp_idx->replaceAllUsesWith(local_intra_warp_idx);
+    global_intra_warp_idx->replaceUsesWithIf(local_intra_warp_idx, [&](Use &U) {
+      auto *Instr = dyn_cast<Instruction>(U.getUser());
+      return Instr->getParent()->getParent()->getName().str() == func_name;
+    });
+
    auto global_inter_warp_idx =
        F->getParent()->getGlobalVariable("inter_warp_index");
+
    auto local_inter_warp_idx =
        builder.CreateAlloca(global_inter_warp_idx->getType()->getElementType(),
                             0, "local_inter_warp_idx");
-    global_inter_warp_idx->replaceAllUsesWith(local_inter_warp_idx);
+
+    builder.CreateStore(ConstantInt::get(I32, 0), local_inter_warp_idx);
+
+    global_inter_warp_idx->replaceUsesWithIf(local_inter_warp_idx, [&](Use &U) {
+      auto *Instr = dyn_cast<Instruction>(U.getUser());
+      return Instr->getParent()->getParent()->getName().str() == func_name;
+    });

    for (auto BB = F->begin(); BB != F->end(); ++BB) {
      for (auto BI = BB->begin(); BI != BB->end(); BI++) {
        if (auto Load = dyn_cast<LoadInst>(BI)) {
          auto load_from = Load->getOperand(0);
-          if (load_from == F->getParent()->getGlobalVariable("block_size")) {
-            Load->replaceAllUsesWith(ConstantInt::get(
-                I32, block_dim[0] * block_dim[1] * block_dim[2]));
-            need_remove.push_back(Load);
-          }
        } else if (auto Call = dyn_cast<CallInst>(BI)) {
          if (Call->getCalledFunction()) {
            auto func_name = Call->getCalledFunction()->getName().str();
-            if (func_name == "llvm.nvvm.read.ptx.sreg.tid.x") {
+            if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.x" ||
+                func_name ==
+                    "_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv") {
+              auto block_size_addr = M->getGlobalVariable("block_size_x");
+              IRBuilder<> builder(context);
+              builder.SetInsertPoint(Call);
+              auto val = builder.CreateLoad(block_size_addr);
+              Call->replaceAllUsesWith(val);
+              need_remove.push_back(Call);
+            } else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.y") {
+              auto block_size_addr = M->getGlobalVariable("block_size_y");
+              IRBuilder<> builder(context);
+              builder.SetInsertPoint(Call);
+              auto val = builder.CreateLoad(block_size_addr);
+              Call->replaceAllUsesWith(val);
+              need_remove.push_back(Call);
+            } else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.z") {
+              auto block_size_addr = M->getGlobalVariable("block_size_z");
+              IRBuilder<> builder(context);
+              builder.SetInsertPoint(Call);
+              auto val = builder.CreateLoad(block_size_addr);
+              Call->replaceAllUsesWith(val);
+              need_remove.push_back(Call);
+            } else if (func_name == "llvm.nvvm.read.ptx.sreg.tid.x" ||
+                       func_name == "_ZN26__cuda_builtin_threadIdx_t17__fetch_"
+                                    "builtin_xEv") {
              // replace it by warp_id
+
              IRBuilder<> builder(context);
              builder.SetInsertPoint(Call);

@ -234,12 +312,11 @@ void replace_built_in_function(llvm::Module *M, int *grid_dim, int *block_dim) {
              thread_idx = builder.CreateBinOp(
                  Instruction::Add, builder.CreateLoad(local_intra_warp_idx),
                  thread_idx, "thread_idx");
-              if (block_dim[1] != 1 || block_dim[2] != 1) {
-                printf("block y: %d block z: %d\n", block_dim[1], block_dim[2]);
+
              thread_idx = builder.CreateBinOp(
                  Instruction::SRem, thread_idx,
-                    ConstantInt::get(I32, block_dim[0]), "thread_id_x");
-              }
+                  builder.CreateLoad(M->getGlobalVariable("block_size_x")),
+                  "thread_id_x");

              Call->replaceAllUsesWith(thread_idx);
              need_remove.push_back(Call);
@ -257,63 +334,61 @@ void replace_built_in_function(llvm::Module *M, int *grid_dim, int *block_dim) {
              // tidy = tid / block_dim.x
              thread_idx = builder.CreateBinOp(
                  Instruction::SDiv, thread_idx,
-                  ConstantInt::get(I32, block_dim[0]),
-                  // builder.CreateLoad(M->getGlobalVariable("block_size_x")),
+                  builder.CreateLoad(M->getGlobalVariable("block_size_x")),
                  "thread_id_y");
-
              Call->replaceAllUsesWith(thread_idx);
              need_remove.push_back(Call);
            } else if (func_name == "llvm.nvvm.read.ptx.sreg.tid.z") {
-              printf("[WARNING] We DO NOT support multi-dim block\n");
+              printf("[WARNING] We DO NOT support triple-dim block\n");
+              exit(1);
              auto zero = ConstantInt::get(I32, 0);
              Call->replaceAllUsesWith(zero);
              need_remove.push_back(Call);
-            } else if (func_name == "llvm.nvvm.read.ptx.sreg.ctaid.x") {
-              auto block_index_addr = M->getGlobalVariable("block_index");
+            } else if (func_name == "llvm.nvvm.read.ptx.sreg.ctaid.x" ||
+                       func_name == "_ZN25__cuda_builtin_blockIdx_t17__fetch_"
+                                    "builtin_xEv") {
+              auto block_index_addr = M->getGlobalVariable("block_index_x");
              IRBuilder<> builder(context);
              builder.SetInsertPoint(Call);
              auto block_idx = builder.CreateLoad(block_index_addr);
              Call->replaceAllUsesWith(block_idx);
              need_remove.push_back(Call);
-            } else if (func_name == "llvm.nvvm.read.ptx.sreg.ctaid.y" ||
-                       func_name == "llvm.nvvm.read.ptx.sreg.ctaid.z") {
-              printf("[WARNING We DO NOT support multi-dim grid\n");
-              auto zero = ConstantInt::get(I32, 0);
-              Call->replaceAllUsesWith(zero);
-              need_remove.push_back(Call);
-            } else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.x") {
-              auto block_size_addr = M->getGlobalVariable("block_size_x");
+            } else if (func_name == "llvm.nvvm.read.ptx.sreg.ctaid.y") {
+              auto block_index_addr = M->getGlobalVariable("block_index_y");
              IRBuilder<> builder(context);
              builder.SetInsertPoint(Call);
-              auto block_size = ConstantInt::get(I32, block_dim[0]);
-              Call->replaceAllUsesWith(block_size);
+              auto block_idx = builder.CreateLoad(block_index_addr);
+              Call->replaceAllUsesWith(block_idx);
              need_remove.push_back(Call);
-            } else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.y") {
-              auto block_size_addr = M->getGlobalVariable("block_size_y");
+            } else if (func_name == "llvm.nvvm.read.ptx.sreg.ctaid.z") {
+              auto block_index_addr = M->getGlobalVariable("block_index_z");
              IRBuilder<> builder(context);
              builder.SetInsertPoint(Call);
-              auto block_size = ConstantInt::get(I32, block_dim[1]);
-              Call->replaceAllUsesWith(block_size);
+              auto block_idx = builder.CreateLoad(block_index_addr);
+              Call->replaceAllUsesWith(block_idx);
              need_remove.push_back(Call);
-            } else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.z") {
-              auto block_size_addr = M->getGlobalVariable("block_size_z");
+            } else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.x" ||
+                       func_name == "_ZN24__cuda_builtin_gridDim_t17__fetch_"
+                                    "builtin_xEv") {
+              auto grid_size_addr = M->getGlobalVariable("grid_size_x");
              IRBuilder<> builder(context);
              builder.SetInsertPoint(Call);
-              auto block_size = ConstantInt::get(I32, block_dim[2]);
-              Call->replaceAllUsesWith(block_size);
-              need_remove.push_back(Call);
-            } else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.x") {
-              auto grid_size_addr = M->getGlobalVariable("grid_size");
-              IRBuilder<> builder(context);
-              builder.SetInsertPoint(Call);
-              auto grid_size = ConstantInt::get(I32, grid_dim[0]);
+              auto grid_size = builder.CreateLoad(grid_size_addr);
              Call->replaceAllUsesWith(grid_size);
              need_remove.push_back(Call);
-            } else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.y" ||
-                       func_name == "llvm.nvvm.read.ptx.sreg.nctaid.z") {
-              printf("[WARNING We DO NOT support multi-dim grid\n");
-              auto one = ConstantInt::get(I32, 1);
-              Call->replaceAllUsesWith(one);
+            } else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.y") {
+              auto grid_size_addr = M->getGlobalVariable("grid_size_y");
+              IRBuilder<> builder(context);
+              builder.SetInsertPoint(Call);
+              auto grid_size = builder.CreateLoad(grid_size_addr);
+              Call->replaceAllUsesWith(grid_size);
+              need_remove.push_back(Call);
+            } else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.z") {
+              auto grid_size_addr = M->getGlobalVariable("grid_size_z");
+              IRBuilder<> builder(context);
+              builder.SetInsertPoint(Call);
+              auto grid_size = builder.CreateLoad(grid_size_addr);
+              Call->replaceAllUsesWith(grid_size);
              need_remove.push_back(Call);
            }
          }
@ -334,6 +409,98 @@ void replace_built_in_function(llvm::Module *M, int *grid_dim, int *block_dim) {
      }
    }
  }
+  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
+    Function *F = &(*i);
+    for (auto BB = F->begin(); BB != F->end(); ++BB) {
+      for (auto BI = BB->begin(); BI != BB->end(); BI++) {
+        if (auto Call = dyn_cast<CallInst>(BI)) {
+          if (Call->getCalledFunction()) {
+            auto func_name = Call->getCalledFunction()->getName().str();
+            auto callFn = Call->getCalledFunction();
+            if (func_name == "vprintf") {
+              /*
+               * replace CUDA's printf to C's printf
+               * CUDA:
+               * %0 = tail call i32 @vprintf(i8* getelementptr inbounds ([19 x
+               * i8], [19 x i8]* @.str, i64 0, i64 0), i8* null)
+               * C: %call1 = call i32 (i8*, ...) @printf(i8* getelementptr
+               * inbounds ([45 x i8], [45 x i8]* @.str.1, i64 0, i64 0))
+               */
+              // find/create C's printf function
+              std::vector<llvm::Type *> args;
+              args.push_back(llvm::Type::getInt8PtrTy(context));
+              llvm::FunctionType *printfType =
+                  FunctionType::get(I32, args, true);
+
+              llvm::FunctionCallee _f =
+                  M->getOrInsertFunction("printf", printfType);
+              llvm::Function *func_printf =
+                  llvm::cast<llvm::Function>(_f.getCallee());
+              // construct argument(s)
+              std::vector<Value *> printf_args;
+              // first argument is same between CUDA and C
+              auto placeholder = Call->getArgOperand(0);
+              printf_args.push_back(placeholder);
+              // insert arguments
+              auto compressed_args = Call->getArgOperand(1);
+              if (auto BC = dyn_cast<BitCastInst>(compressed_args)) {
+                auto src_alloc = BC->getOperand(0);
+                auto SrcPointTy =
+                    dyn_cast<PointerType>(BC->getOperand(0)->getType());
+                auto SrcTy = SrcPointTy->getElementType();
+                // reverse the bitcast
+                auto reverse_BC = new BitCastInst(BC, SrcPointTy, "", Call);
+                assert(SrcTy->isStructTy() == 1);
+                auto StructTy = dyn_cast<StructType>(SrcTy);
+                for (int i = 0; i < StructTy->getNumElements(); i++) {
+                  std::vector<Value *> Indices;
+                  Indices.push_back(ConstantInt::get(I32, 0));
+                  Indices.push_back(ConstantInt::get(I32, i));
+                  auto new_GEP = GetElementPtrInst::Create(NULL, // Pointee type
+                                                           src_alloc, // Alloca
+                                                           Indices,   // Indices
+                                                           "", Call);
+                  auto new_load = new LoadInst(new_GEP, "", Call);
+                  printf_args.push_back(new_load);
+                }
+              }
+              auto c_printf_inst =
+                  llvm::CallInst::Create(func_printf, printf_args, "", Call);
+              // insert
+              Call->replaceAllUsesWith(c_printf_inst);
+              need_remove.push_back(Call);
+            } else if (func_name == "__nv_fast_log2f" ||
+                       func_name == "__nv_log2f" ||
+                       func_name == "__nv_fast_powf" ||
+                       func_name == "__nv_powf" || func_name == "__nv_logf" ||
+                       func_name == "__nv_expf" || func_name == "__nv_fabsf" ||
+                       func_name == "__nv_log10f" ||
+                       func_name == "__nv_fmodf" || func_name == "__nv_sqrt" ||
+                       func_name == "__nv_sqrtf" || func_name == "__nv_exp" ||
+                       func_name == "__nv_isnanf" ||
+                       func_name == "__nv_isinff" || func_name == "__nv_powi" ||
+                       func_name == "__nv_powif") {
+              Call->getCalledFunction()->deleteBody();
+            } else if (func_name == "llvm.nvvm.fma.rn.d") {
+              Call->getCalledFunction()->setName("__nvvm_fma_rn_d");
+            } else if (func_name == "llvm.nvvm.d2i.lo") {
+              Call->getCalledFunction()->setName("__nvvm_d2i_lo");
+            } else if (func_name == "llvm.nvvm.d2i.hi") {
+              Call->getCalledFunction()->setName("__nvvm_d2i_hi");
+            } else if (func_name == "llvm.nvvm.add.rn.d") {
+              Call->getCalledFunction()->setName("__nvvm_add_rn_d");
+            } else if (func_name == "llvm.nvvm.lohi.i2d") {
+              Call->getCalledFunction()->setName("__nvvm_lohi_i2d");
+            } else if (func_name == "llvm.nvvm.fabs.f") {
+              Call->getCalledFunction()->setName("__nvvm_fabs_f");
+            } else if (func_name == "llvm.nvvm.mul24.i") {
+              Call->getCalledFunction()->setName("__nvvm_mul24_i");
+            }
+          }
+        }
+      }
+    }
+  }

  for (auto inst : need_remove) {
    inst->eraseFromParent();
@ -382,6 +549,8 @@ bool has_warp_barrier(llvm::BasicBlock *B) {
    Instruction *inst = &(*i);
    llvm::CallInst *Call = llvm::dyn_cast<llvm::CallInst>(inst);
    if (Call) {
+      if (Call->isInlineAsm())
+        continue;
      auto func_name = Call->getCalledFunction()->getName().str();
      if (func_name == "llvm.nvvm.bar.warp.sync") {
        return true;
@ -396,6 +565,8 @@ bool has_barrier(llvm::BasicBlock *B) {
    Instruction *inst = &(*i);
    llvm::CallInst *Call = llvm::dyn_cast<llvm::CallInst>(inst);
    if (Call) {
+      if (Call->isInlineAsm())
+        continue;
      auto func_name = Call->getCalledFunction()->getName().str();
      if (func_name == "llvm.nvvm.barrier0" ||
          func_name == "llvm.nvvm.bar.warp.sync" ||
@ -412,6 +583,8 @@ bool has_block_barrier(llvm::BasicBlock *B) {
    Instruction *inst = &(*i);
    llvm::CallInst *Call = llvm::dyn_cast<llvm::CallInst>(inst);
    if (Call) {
+      if (Call->isInlineAsm())
+        continue;
      auto func_name = Call->getCalledFunction()->getName().str();
      if (func_name == "llvm.nvvm.barrier0" ||
          func_name == "llvm.nvvm.barrier.sync") {
@ -478,3 +651,21 @@ bool find_barrier_in_region(llvm::BasicBlock *start, llvm::BasicBlock *end) {
  }
  return 0;
 }
+
+/*
+  Print IR to String Output for Debugging Purposes
+*/
+// void printModule(llvm::Module *M) {
+//   std::string str;
+//   llvm::raw_string_ostream ss(str);
+//   std::cout << "### Printing Module ###" << std::endl;
+//   for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
+//     Function *F = &(*i);
+//     auto func_name = F->getName().str();
+//     std::cout << func_name << std::endl;
+//     for (Function::iterator b = F->begin(); b != F->end(); ++b) {
+//       BasicBlock *B = &(*b);
+//       errs() << *B;
+//     }
+//   }
+// }
--- a/compilation/KernelTranslation/lib/warp_func.cpp
+++ b/compilation/KernelTranslation/lib/warp_func.cpp
@ -44,6 +44,8 @@ void handle_warp_vote(llvm::Module *M) {
    for (Function::iterator E = F->end(); I != E; ++I) {
      for (BasicBlock::iterator BI = I->begin(); BI != I->end(); BI++) {
        if (CallInst *vote_any_sync = dyn_cast<CallInst>(BI)) {
+          if (vote_any_sync->isInlineAsm())
+            continue;
          auto func_name = vote_any_sync->getCalledFunction()->getName();
          if (func_name == "llvm.nvvm.vote.any.sync" ||
              func_name == "llvm.nvvm.vote.all.sync") {
--- a/compilation/examples/reduce/host.cpp
+++ b/compilation/examples/reduce/host.cpp
@ -1,82 +0,0 @@
-#include <assert.h>
-#include <pthread.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/time.h>
-
-#define NUM_WARP 2
-#define NUM_BLOCK 1
-
-int block_size = 32 * NUM_WARP;
-int block_size_x = block_size;
-int block_size_y = 1;
-int block_size_z = 1;
-__thread int block_index = 0;
-int grid_size = NUM_BLOCK;
-
-extern "C" {
-void *_Z7reduce0PiS_j_wrapper(void *);
-__thread int warp_shfl[32];
-}
-
-void *wrap(void *p) {
-  int **res = (int **)p;
-  block_index = (*(int *)res[3]);
-  _Z7reduce0PiS_j_wrapper(p);
-  return NULL;
-}
-
-void *gen_input(int bid, int *g_idata, int *g_odata, unsigned int n) {
-  int **ret = new int *[4];
-
-  int **p0 = new int *;
-  *p0 = g_idata;
-  ret[0] = (int *)(p0);
-
-  int **p1 = new int *;
-  *p1 = g_odata;
-  ret[1] = (int *)(p1);
-
-  unsigned int *p2 = new unsigned int;
-  *p2 = n;
-  ret[2] = (int *)p2;
-
-  int *p3 = new int;
-  *p3 = bid;
-  ret[3] = (int *)p3;
-
-  return (void *)ret;
-}
-
-int main(int argc, char *argv[]) {
-  int *g_idata;
-
-  int size = block_size * NUM_BLOCK;
-  g_idata = new int[size * 2];
-  int *res = new int[size];
-
-  for (int i = 0; i < size; i++) {
-    g_idata[i] = i;
-  }
-
-  pthread_t threads[NUM_BLOCK];
-
-  void *inp[NUM_BLOCK];
-  for (long t = 0; t < NUM_BLOCK; t++) {
-    inp[t] = gen_input(t, g_idata, res, size);
-  }
-
-  for (long t = 0; t < NUM_BLOCK; t++) {
-    pthread_create(&threads[t], NULL, wrap, inp[t]);
-  }
-  for (long t = 0; t < NUM_BLOCK; t++)
-    pthread_join(threads[t], NULL);
-  int gold = 0;
-  for (int i = 0; i < size; i++) {
-    gold += g_idata[i];
-  }
-  assert(*res == gold && "Incorrect res\n");
-  printf("PASS\n");
-
-  pthread_exit(NULL);
-}
--- a/compilation/examples/reduce/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
+++ b/compilation/examples/reduce/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
@ -1,150 +0,0 @@
-; ModuleID = 'kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc'
-source_filename = "kernel.cu"
-target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
-target triple = "nvptx64-nvidia-cuda"
-
-%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
-
-@_ZZ7reduce0PiS_jE5sdata = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
-
-; Function Attrs: nounwind
-define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) local_unnamed_addr #0 {
-entry:
-  ret i32 999
-}
-
-; Function Attrs: nounwind
-define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) local_unnamed_addr #0 {
-entry:
-  ret i32 999
-}
-
-; Function Attrs: nounwind
-define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) local_unnamed_addr #0 {
-entry:
-  ret i32 999
-}
-
-; Function Attrs: nounwind
-define weak dso_local i32 @cudaGetDevice(i32* %device) local_unnamed_addr #0 {
-entry:
-  ret i32 999
-}
-
-; Function Attrs: nounwind
-define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) local_unnamed_addr #0 {
-entry:
-  ret i32 999
-}
-
-; Function Attrs: nounwind
-define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) local_unnamed_addr #0 {
-entry:
-  ret i32 999
-}
-
-; Function Attrs: convergent nounwind
-define dso_local void @_Z7reduce0PiS_j(i32* nocapture readonly %g_idata, i32* nocapture %g_odata, i32 %n) local_unnamed_addr #1 {
-entry:
-  %0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #4, !range !10
-  %1 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #4, !range !11
-  %2 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #4, !range !12
-  %mul = mul i32 %2, %1
-  %add = add i32 %mul, %0
-  %cmp = icmp ult i32 %add, %n
-  br i1 %cmp, label %cond.true, label %cond.end
-
-cond.true:                                        ; preds = %entry
-  %idxprom = zext i32 %add to i64
-  %arrayidx = getelementptr inbounds i32, i32* %g_idata, i64 %idxprom
-  %3 = load i32, i32* %arrayidx, align 4, !tbaa !13
-  br label %cond.end
-
-cond.end:                                         ; preds = %entry, %cond.true
-  %cond = phi i32 [ %3, %cond.true ], [ 0, %entry ]
-  %idxprom5 = zext i32 %0 to i64
-  %arrayidx635 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @_ZZ7reduce0PiS_jE5sdata, i64 0, i64 %idxprom5
-  %arrayidx6 = addrspacecast i32 addrspace(3)* %arrayidx635 to i32*
-  store i32 %cond, i32* %arrayidx6, align 4, !tbaa !13
-  tail call void @llvm.nvvm.barrier.sync(i32 0) #4
-  %cmp839 = icmp ugt i32 %2, 1
-  br i1 %cmp839, label %for.body, label %for.cond.cleanup
-
-for.cond.cleanup:                                 ; preds = %if.end, %cond.end
-  %cmp18 = icmp eq i32 %0, 0
-  br i1 %cmp18, label %if.then19, label %if.end23
-
-for.body:                                         ; preds = %cond.end, %if.end
-  %s.040 = phi i32 [ %mul9, %if.end ], [ 1, %cond.end ]
-  %mul9 = shl nuw nsw i32 %s.040, 1
-  %rem = urem i32 %0, %mul9
-  %cmp10 = icmp eq i32 %rem, 0
-  br i1 %cmp10, label %if.then, label %if.end
-
-if.then:                                          ; preds = %for.body
-  %add11 = add i32 %s.040, %0
-  %idxprom12 = zext i32 %add11 to i64
-  %arrayidx1336 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @_ZZ7reduce0PiS_jE5sdata, i64 0, i64 %idxprom12
-  %arrayidx13 = addrspacecast i32 addrspace(3)* %arrayidx1336 to i32*
-  %4 = load i32, i32* %arrayidx13, align 4, !tbaa !13
-  %5 = load i32, i32* %arrayidx6, align 4, !tbaa !13
-  %add16 = add nsw i32 %5, %4
-  store i32 %add16, i32* %arrayidx6, align 4, !tbaa !13
-  br label %if.end
-
-if.end:                                           ; preds = %if.then, %for.body
-  tail call void @llvm.nvvm.barrier.sync(i32 0) #4
-  %cmp8 = icmp ult i32 %mul9, %2
-  br i1 %cmp8, label %for.body, label %for.cond.cleanup
-
-if.then19:                                        ; preds = %for.cond.cleanup
-  %idxprom21 = zext i32 %1 to i64
-  %arrayidx22 = getelementptr inbounds i32, i32* %g_odata, i64 %idxprom21
-  %6 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* addrspacecast ([64 x i32] addrspace(3)* @_ZZ7reduce0PiS_jE5sdata to [64 x i32]*), i64 0, i64 0), align 4, !tbaa !13
-  store i32 %6, i32* %arrayidx22, align 4, !tbaa !13
-  br label %if.end23
-
-if.end23:                                         ; preds = %if.then19, %for.cond.cleanup
-  ret void
-}
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2
-
-; Function Attrs: convergent nounwind
-declare void @llvm.nvvm.barrier.sync(i32) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { convergent nounwind }
-attributes #4 = { nounwind }
-
-!llvm.module.flags = !{!0, !1, !2}
-!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
-!llvm.ident = !{!8}
-!nvvmir.version = !{!9}
-
-!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
-!1 = !{i32 1, !"wchar_size", i32 4}
-!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
-!3 = !{void (i32*, i32*, i32)* @_Z7reduce0PiS_j, !"kernel", i32 1}
-!4 = !{null, !"align", i32 8}
-!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
-!6 = !{null, !"align", i32 16}
-!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
-!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
-!9 = !{i32 1, i32 4}
-!10 = !{i32 0, i32 1024}
-!11 = !{i32 0, i32 2147483647}
-!12 = !{i32 1, i32 1025}
-!13 = !{!14, !14, i64 0}
-!14 = !{!"int", !15, i64 0}
-!15 = !{!"omnipotent char", !16, i64 0}
-!16 = !{!"Simple C++ TBAA"}
--- a/compilation/examples/reduce/run.sh
+++ b/compilation/examples/reduce/run.sh
@ -1,6 +0,0 @@
-#!/bin/bash
-llvm-as kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
-../../../build/compilation/kernelTranslator kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc 1 1 1 64 1 1
-llc --filetype=obj kernel.bc
-g++ host.cpp kernel.o -lpthread -o test
-./test
--- a/compilation/examples/reduce_shuffle/host.cpp
+++ b/compilation/examples/reduce_shuffle/host.cpp
@ -1,82 +0,0 @@
-#include <assert.h>
-#include <pthread.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/time.h>
-
-#define NUM_WARP 2
-#define NUM_BLOCK 1
-
-int block_size = 32 * NUM_WARP;
-int block_size_x = block_size;
-int block_size_y = 1;
-int block_size_z = 1;
-__thread int block_index = 0;
-int grid_size = NUM_BLOCK;
-
-extern "C" {
-void *_Z7reduce5PiS_j_wrapper(void *);
-__thread int warp_shfl[32];
-}
-
-void *wrap(void *p) {
-  int **res = (int **)p;
-  block_index = (*(int *)res[3]);
-  _Z7reduce5PiS_j_wrapper(p);
-  return NULL;
-}
-
-void *gen_input(int bid, int *g_idata, int *g_odata, unsigned int n) {
-  int **ret = new int *[4];
-
-  int **p0 = new int *;
-  *p0 = g_idata;
-  ret[0] = (int *)(p0);
-
-  int **p1 = new int *;
-  *p1 = g_odata;
-  ret[1] = (int *)(p1);
-
-  unsigned int *p2 = new unsigned int;
-  *p2 = n;
-  ret[2] = (int *)p2;
-
-  int *p3 = new int;
-  *p3 = bid;
-  ret[3] = (int *)p3;
-
-  return (void *)ret;
-}
-
-int main(int argc, char *argv[]) {
-  int *g_idata;
-
-  int size = block_size * NUM_BLOCK;
-  g_idata = new int[size * 2];
-  int *res = new int[size];
-
-  for (int i = 0; i < size; i++) {
-    g_idata[i] = i;
-  }
-
-  pthread_t threads[NUM_BLOCK];
-
-  void *inp[NUM_BLOCK];
-  for (long t = 0; t < NUM_BLOCK; t++) {
-    inp[t] = gen_input(t, g_idata, res, size);
-  }
-
-  for (long t = 0; t < NUM_BLOCK; t++) {
-    pthread_create(&threads[t], NULL, wrap, inp[t]);
-  }
-  for (long t = 0; t < NUM_BLOCK; t++)
-    pthread_join(threads[t], NULL);
-  int gold = 0;
-  for (int i = 0; i < size; i++) {
-    gold += g_idata[i];
-  }
-  assert(*res == gold && "Incorrect res\n");
-  printf("PASS\n");
-
-  pthread_exit(NULL);
-}
--- a/compilation/examples/reduce_shuffle/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
+++ b/compilation/examples/reduce_shuffle/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
@ -1,179 +0,0 @@
-; ModuleID = 'kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc'
-source_filename = "kernel.cu"
-target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
-target triple = "nvptx64-nvidia-cuda"
-
-%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
-
-@_ZZ7reduce5PiS_jE5sdata = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
-
-; Function Attrs: nounwind
-define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) local_unnamed_addr #0 {
-entry:
-  ret i32 999
-}
-
-; Function Attrs: nounwind
-define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) local_unnamed_addr #0 {
-entry:
-  ret i32 999
-}
-
-; Function Attrs: nounwind
-define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) local_unnamed_addr #0 {
-entry:
-  ret i32 999
-}
-
-; Function Attrs: nounwind
-define weak dso_local i32 @cudaGetDevice(i32* %device) local_unnamed_addr #0 {
-entry:
-  ret i32 999
-}
-
-; Function Attrs: nounwind
-define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) local_unnamed_addr #0 {
-entry:
-  ret i32 999
-}
-
-; Function Attrs: nounwind
-define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) local_unnamed_addr #0 {
-entry:
-  ret i32 999
-}
-
-; Function Attrs: convergent nounwind
-define dso_local void @_Z7reduce5PiS_j(i32* nocapture readonly %g_idata, i32* nocapture %g_odata, i32 %n) local_unnamed_addr #1 {
-entry:
-  %0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #5, !range !10
-  %1 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #5, !range !11
-  %mul = shl i32 %1, 7
-  %add = add i32 %mul, %0
-  %cmp = icmp ult i32 %add, %n
-  br i1 %cmp, label %cond.true, label %cond.end
-
-cond.true:                                        ; preds = %entry
-  %idxprom = zext i32 %add to i64
-  %arrayidx = getelementptr inbounds i32, i32* %g_idata, i64 %idxprom
-  %2 = load i32, i32* %arrayidx, align 4, !tbaa !12
-  br label %cond.end
-
-cond.end:                                         ; preds = %entry, %cond.true
-  %cond = phi i32 [ %2, %cond.true ], [ 0, %entry ]
-  %add4 = add i32 %add, 64
-  %cmp5 = icmp ult i32 %add4, %n
-  br i1 %cmp5, label %if.then, label %if.end
-
-if.then:                                          ; preds = %cond.end
-  %idxprom7 = zext i32 %add4 to i64
-  %arrayidx8 = getelementptr inbounds i32, i32* %g_idata, i64 %idxprom7
-  %3 = load i32, i32* %arrayidx8, align 4, !tbaa !12
-  %add9 = add nsw i32 %3, %cond
-  br label %if.end
-
-if.end:                                           ; preds = %if.then, %cond.end
-  %mySum.0 = phi i32 [ %add9, %if.then ], [ %cond, %cond.end ]
-  %idxprom10 = zext i32 %0 to i64
-  %arrayidx1150 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @_ZZ7reduce5PiS_jE5sdata, i64 0, i64 %idxprom10
-  %arrayidx11 = addrspacecast i32 addrspace(3)* %arrayidx1150 to i32*
-  store i32 %mySum.0, i32* %arrayidx11, align 4, !tbaa !12
-  tail call void @llvm.nvvm.barrier.sync(i32 0) #5
-  tail call void @llvm.nvvm.barrier.sync(i32 0) #5
-  tail call void @llvm.nvvm.barrier.sync(i32 0) #5
-  tail call void @llvm.nvvm.barrier.sync(i32 0) #5
-  %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.z() #5, !range !16
-  %5 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.y() #5, !range !17
-  %mul.i.i52 = mul nuw nsw i32 %5, %4
-  %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #5, !range !17
-  %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.y() #5, !range !10
-  %mul39.i.i53 = add nuw nsw i32 %7, %mul.i.i52
-  %add.i.i54 = mul nuw nsw i32 %mul39.i.i53, %6
-  %add8.i.i55 = add nuw nsw i32 %add.i.i54, %0
-  %cmp14 = icmp ult i32 %add8.i.i55, 32
-  br i1 %cmp14, label %if.then15, label %if.end32
-
-if.then15:                                        ; preds = %if.end
-  %add16 = add nuw nsw i32 %0, 32
-  %idxprom17 = zext i32 %add16 to i64
-  %arrayidx1851 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @_ZZ7reduce5PiS_jE5sdata, i64 0, i64 %idxprom17
-  %arrayidx18 = addrspacecast i32 addrspace(3)* %arrayidx1851 to i32*
-  %8 = load i32, i32* %arrayidx18, align 4, !tbaa !12
-  %add19 = add nsw i32 %8, %mySum.0
-  %9 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add19, i32 16, i32 31) #5
-  %add23 = add nsw i32 %9, %add19
-  %10 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add23, i32 8, i32 31) #5
-  %add23.1 = add nsw i32 %10, %add23
-  %11 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add23.1, i32 4, i32 31) #5
-  %add23.2 = add nsw i32 %11, %add23.1
-  %12 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add23.2, i32 2, i32 31) #5
-  %add23.3 = add nsw i32 %12, %add23.2
-  %13 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add23.3, i32 1, i32 31) #5
-  %cmp27 = icmp eq i32 %add8.i.i55, 0
-  br i1 %cmp27, label %if.then28, label %if.end32
-
-if.then28:                                        ; preds = %if.then15
-  %add23.4 = add nsw i32 %13, %add23.3
-  %idxprom30 = zext i32 %1 to i64
-  %arrayidx31 = getelementptr inbounds i32, i32* %g_odata, i64 %idxprom30
-  store i32 %add23.4, i32* %arrayidx31, align 4, !tbaa !12
-  br label %if.end32
-
-if.end32:                                         ; preds = %if.end, %if.then28, %if.then15
-  ret void
-}
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.nvvm.read.ptx.sreg.tid.z() #2
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y() #2
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #2
-
-; Function Attrs: convergent nounwind
-declare void @llvm.nvvm.barrier.sync(i32) #3
-
-; Function Attrs: convergent inaccessiblememonly nounwind
-declare i32 @llvm.nvvm.shfl.sync.down.i32(i32, i32, i32, i32) #4
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { convergent nounwind }
-attributes #4 = { convergent inaccessiblememonly nounwind }
-attributes #5 = { nounwind }
-
-!llvm.module.flags = !{!0, !1, !2}
-!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
-!llvm.ident = !{!8}
-!nvvmir.version = !{!9}
-
-!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
-!1 = !{i32 1, !"wchar_size", i32 4}
-!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
-!3 = !{void (i32*, i32*, i32)* @_Z7reduce5PiS_j, !"kernel", i32 1}
-!4 = !{null, !"align", i32 8}
-!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
-!6 = !{null, !"align", i32 16}
-!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
-!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
-!9 = !{i32 1, i32 4}
-!10 = !{i32 0, i32 1024}
-!11 = !{i32 0, i32 2147483647}
-!12 = !{!13, !13, i64 0}
-!13 = !{!"int", !14, i64 0}
-!14 = !{!"omnipotent char", !15, i64 0}
-!15 = !{!"Simple C++ TBAA"}
-!16 = !{i32 0, i32 64}
-!17 = !{i32 1, i32 1025}
--- a/compilation/examples/reduce_shuffle/run.sh
+++ b/compilation/examples/reduce_shuffle/run.sh
@ -1,6 +0,0 @@
-#!/bin/bash
-llvm-as kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
-../../../build/compilation/kernelTranslator kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc 1 1 1 64 1 1
-llc --filetype=obj kernel.bc
-g++ host.cpp kernel.o -lpthread -o test
-./test
--- a/compilation/examples/run_example.sh
+++ b/compilation/examples/run_example.sh
@ -1,11 +0,0 @@
-#!bin/sh
-for file in ./*
-do
-    if test -d $file
-    then
-        echo executing $file
-        cd $file
-        bash run.sh
-        cd ..
-    fi
-done
--- a/compilation/examples/vecadd/host.cpp
+++ b/compilation/examples/vecadd/host.cpp
@ -1,84 +0,0 @@
-#include <assert.h>
-#include <pthread.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <time.h>
-
-#define NUM_BLOCK 1
-int N = 32;
-
-int block_size = 32;
-int block_size_x = block_size;
-int block_size_y = 1;
-int block_size_z = 1;
-__thread int block_index = 0;
-int grid_size = NUM_BLOCK;
-
-extern "C" {
-void *_Z9vectorAddPKfS0_Pfi_wrapper(void *);
-}
-
-void *wrap(void *p) {
-  int **res = (int **)p;
-  block_index = (*(int *)res[4]);
-  _Z9vectorAddPKfS0_Pfi_wrapper(p);
-  return NULL;
-}
-
-void *gen_input(int bid, float *A, float *B, float *C, int N) {
-  int **ret = new int *[5];
-
-  float **p0 = new float *;
-  *p0 = A;
-  ret[0] = (int *)(p0);
-
-  float **p1 = new float *;
-  *p1 = B;
-  ret[1] = (int *)(p1);
-
-  float **p2 = new float *;
-  *p2 = C;
-  ret[2] = (int *)(p2);
-
-  int *p3 = new int;
-  *p3 = N;
-  ret[3] = (int *)p3;
-
-  int *p4 = new int;
-  *p4 = bid;
-  ret[4] = (int *)p4;
-
-  return (void *)ret;
-}
-
-int main() {
-  float *A, *B, *C;
-
-  A = new float[N];
-  B = new float[N];
-  C = new float[N];
-
-  for (int i = 0; i < N; i++) {
-    A[i] = i;
-    B[i] = 1;
-    C[i] = 0;
-  }
-
-  pthread_t threads[NUM_BLOCK];
-
-  int rc;
-  for (long t = 0; t < NUM_BLOCK; t++) {
-    void *inp = gen_input(t, A, B, C, N);
-    rc = pthread_create(&threads[t], NULL, wrap, inp);
-  }
-  clock_t t1 = clock();
-  /* Last thing that main() should do */
-  for (long t = 0; t < NUM_BLOCK; t++)
-    pthread_join(threads[t], NULL);
-
-  for (int i = 0; i < N; i++) {
-    assert(C[i] == (A[i] + B[i]));
-  }
-  printf("PASS\n");
-  pthread_exit(NULL);
-}
--- a/compilation/examples/vecadd/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
+++ b/compilation/examples/vecadd/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
@ -1,86 +0,0 @@
-; ModuleID = 'kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc'
-source_filename = "kernel.cu"
-target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
-target triple = "nvptx64-nvidia-cuda"
-
-%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
-
-; Function Attrs: nounwind
-define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) local_unnamed_addr #0 {
-entry:
-  ret i32 999
-}
-
-; Function Attrs: nounwind
-define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) local_unnamed_addr #0 {
-entry:
-  ret i32 999
-}
-
-; Function Attrs: nounwind
-define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) local_unnamed_addr #0 {
-entry:
-  ret i32 999
-}
-
-; Function Attrs: nounwind
-define weak dso_local i32 @cudaGetDevice(i32* %device) local_unnamed_addr #0 {
-entry:
-  ret i32 999
-}
-
-; Function Attrs: nounwind
-define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) local_unnamed_addr #0 {
-entry:
-  ret i32 999
-}
-
-; Function Attrs: nounwind
-define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) local_unnamed_addr #0 {
-entry:
-  ret i32 999
-}
-
-; Function Attrs: nofree nounwind
-define dso_local void @_Z9vectorAddPKfS0_Pfi(float* nocapture readonly %A, float* nocapture readonly %B, float* nocapture %C, i32 %numElements) local_unnamed_addr #1 {
-entry:
-  %0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3, !range !10
-  %idxprom8 = zext i32 %0 to i64
-  %arrayidx = getelementptr inbounds float, float* %A, i64 %idxprom8
-  %1 = load float, float* %arrayidx, align 4, !tbaa !11
-  %arrayidx2 = getelementptr inbounds float, float* %B, i64 %idxprom8
-  %2 = load float, float* %arrayidx2, align 4, !tbaa !11
-  %add = fadd contract float %1, %2
-  %arrayidx4 = getelementptr inbounds float, float* %C, i64 %idxprom8
-  store float %add, float* %arrayidx4, align 4, !tbaa !11
-  ret void
-}
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nofree nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind }
-
-!llvm.module.flags = !{!0, !1, !2}
-!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
-!llvm.ident = !{!8}
-!nvvmir.version = !{!9}
-
-!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
-!1 = !{i32 1, !"wchar_size", i32 4}
-!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
-!3 = !{void (float*, float*, float*, i32)* @_Z9vectorAddPKfS0_Pfi, !"kernel", i32 1}
-!4 = !{null, !"align", i32 8}
-!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
-!6 = !{null, !"align", i32 16}
-!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
-!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
-!9 = !{i32 1, i32 4}
-!10 = !{i32 0, i32 1024}
-!11 = !{!12, !12, i64 0}
-!12 = !{!"float", !13, i64 0}
-!13 = !{!"omnipotent char", !14, i64 0}
-!14 = !{!"Simple C++ TBAA"}
--- a/compilation/examples/vecadd/run.sh
+++ b/compilation/examples/vecadd/run.sh
@ -1,6 +0,0 @@
-#!/bin/bash
-llvm-as kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
-../../../build/compilation/kernelTranslator kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc 1 1 1 32 1 1
-llc --filetype=obj kernel.bc
-g++ host.cpp kernel.o -lpthread -o test
-./test
--- a/docs/figures/workflow.png
+++ b/docs/figures/workflow.png
--- a/docs/workflow.md
+++ b/docs/workflow.md
@ -1,11 +0,0 @@
-# The workflow of CuPBoP
-
-The workflow of CuPBoP is described as following:
-![The workflow of executing CUDA applications on CuPBoP.](figures/workflow.png)
-First, CuPBoP uses Clang to compile the CUDA source code into NVVM IR,
-which consists of two parts: Host part and Kernel Part.
-In the next step, CuPBoP-compilation parses and transforms these NVVM IRs
-to make it suitable for executing on specific architectures.
-The CuPBoP-runtime compiles the transformed Host IR and executes the generated programs,
-which will compile the transformed Kernel IR and
-upload the compiled kernel programs to specific architectures.
--- a/examples/backprop/backprop.c
+++ b/examples/backprop/backprop.c
@ -0,0 +1,454 @@
+#include "backprop.h"
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+//#define OPEN
+
+#define ABS(x) (((x) > 0.0) ? (x) : (-(x))) /* absolute value; x is expanded more than once, so pass an expression without side effects */
+/* fastcopy(to, from, len): copy len bytes from `from` to `to`, one char at a
+   time, walking both pointers upward.  Expands to a brace-enclosed block;
+   each argument is evaluated once and stored in a local before the loop. */
+#define fastcopy(to, from, len)                                                \
+  {                                                                            \
+    register char *_to, *_from;                                                \
+    register int _i, _l;                                                       \
+    _to = (char *)(to);                                                        \
+    _from = (char *)(from);                                                    \
+    _l = (len);                                                                \
+    for (_i = 0; _i < _l; _i++)                                                \
+      *_to++ = *_from++;                                                       \
+  }
+
+/*** Return random number between 0.0 and 1.0, computed as rand() / BIGRND
+     in float arithmetic.
+     NOTE(review): BIGRND is not defined in this file (presumably it comes
+     from backprop.h); the stated range only holds if BIGRND >= RAND_MAX --
+     confirm. ***/
+float drnd() { return ((float)rand() / (float)BIGRND); }
+
+/*** Return random number between -1.0 and 1.0: maps the value d returned by
+     drnd() to 2*d - 1, so the range inherits whatever range drnd() has. ***/
+float dpn1() { return ((drnd() * 2.0) - 1.0); }
+
+/*** The squashing function.  Currently, it's a sigmoid:
+     returns 1 / (1 + exp(-x)), evaluated with double-precision literals
+     and converted to float on return. ***/
+
+float squash(x)
+float x;
+{
+  float m; // NOTE(review): unused in live code; only the commented-out polynomial below assigns it
+  // x = -x;
+  // m = 1 + x + x*x/2 + x*x*x/6 + x*x*x*x/24 + x*x*x*x*x/120;
+  // return(1.0 / (1.0 + m));
+  return (1.0 / (1.0 + exp(-x)));
+}
+
+/*** Allocate 1d array of floats.
+     n: number of elements requested.
+     Returns a malloc'd block sized for n floats (contents are not
+     initialized), or NULL after printing a message when malloc fails.
+     Despite the "dbl" in the name, the element type is float. ***/
+
+float *alloc_1d_dbl(n)
+int n;
+{
+  float *new; // 'new' is an ordinary identifier in C but a keyword in C++
+
+  new = (float *)malloc((unsigned)(n * sizeof(float))); // NOTE(review): byte count is narrowed to unsigned before the call
+  if (new == NULL) {
+    printf("ALLOC_1D_DBL: Couldn't allocate array of floats\n");
+    return (NULL);
+  }
+  return (new);
+}
+
+/*** Allocate 2d array of floats as an array of m row pointers, each row
+     obtained from alloc_1d_dbl(n).
+     Returns the row-pointer array, or NULL (after printing a message) when
+     the row-pointer array itself cannot be allocated. ***/
+
+float **alloc_2d_dbl(m, n)
+int m, n;
+{
+  int i;
+  float **new;
+
+  new = (float **)malloc((unsigned)(m * sizeof(float *)));
+  if (new == NULL) {
+    printf("ALLOC_2D_DBL: Couldn't allocate array of dbl ptrs\n");
+    return (NULL);
+  }
+
+  for (i = 0; i < m; i++) {
+    new[i] = alloc_1d_dbl(n); // NOTE(review): a NULL row is stored as-is; the failure is not reported to the caller
+  }
+
+  return (new);
+}
+/* Set every w[i][j] with 0 <= i <= m and 0 <= j <= n to rand() / RAND_MAX.
+   Both bounds are inclusive, so w must provide at least (m + 1) rows of
+   (n + 1) floats.  No return type is written (implicit int in pre-C99 C)
+   and no value is returned. */
+bpnn_randomize_weights(w, m, n) float **w;
+int m, n;
+{
+  int i, j;
+
+  for (i = 0; i <= m; i++) {
+    for (j = 0; j <= n; j++) {
+      w[i][j] = (float)rand() / RAND_MAX;
+      //  w[i][j] = dpn1();
+    }
+  }
+}
+/* Set w[0] through w[m] (inclusive) to the constant 0.1.  Despite the name,
+   nothing random happens here: the rand() assignment is commented out.
+   No return type is written (implicit int in pre-C99 C) and no value is
+   returned. */
+bpnn_randomize_row(w, m) float *w;
+int m;
+{
+  int i;
+  for (i = 0; i <= m; i++) {
+    // w[i] = (float) rand()/RAND_MAX;
+    w[i] = 0.1;
+  }
+}
+/* Set every w[i][j] with 0 <= i <= m and 0 <= j <= n to 0.0.  Both bounds are
+   inclusive, so w must provide at least (m + 1) rows of (n + 1) floats.
+   No return type is written (implicit int in pre-C99 C) and no value is
+   returned. */
+bpnn_zero_weights(w, m, n) float **w;
+int m, n;
+{
+  int i, j;
+
+  for (i = 0; i <= m; i++) {
+    for (j = 0; j <= n; j++) {
+      w[i][j] = 0.0;
+    }
+  }
+}
+/* Print the given seed and pass it to srand().
+   NOTE(review): `seed` has no declared type, so it is an implicit int
+   (K&R-style parameter); newer C standards reject this form. */
+void bpnn_initialize(seed) {
+  printf("Random number generator seed: %d\n", seed);
+  srand(seed);
+}
+/* Allocate a BPNN and all of its arrays without initializing their contents.
+   n_in, n_hidden, n_out: unit counts stored in input_n, hidden_n, output_n.
+   Every 1d array gets (count + 1) elements and every weight matrix gets
+   (rows + 1) x (cols + 1) elements.
+   Returns the new network, or NULL (after printing a message) when the BPNN
+   struct itself cannot be allocated.
+   NOTE(review): results of the alloc_1d_dbl / alloc_2d_dbl calls are not
+   checked, so a partially allocated network can be returned. */
+BPNN *bpnn_internal_create(n_in, n_hidden, n_out)
+int n_in, n_hidden, n_out;
+{
+  BPNN *newnet;
+
+  newnet = (BPNN *)malloc(sizeof(BPNN));
+  if (newnet == NULL) {
+    printf("BPNN_CREATE: Couldn't allocate neural network\n");
+    return (NULL);
+  }
+
+  newnet->input_n = n_in;
+  newnet->hidden_n = n_hidden;
+  newnet->output_n = n_out;
+  newnet->input_units = alloc_1d_dbl(n_in + 1);
+  newnet->hidden_units = alloc_1d_dbl(n_hidden + 1);
+  newnet->output_units = alloc_1d_dbl(n_out + 1);
+
+  newnet->hidden_delta = alloc_1d_dbl(n_hidden + 1);
+  newnet->output_delta = alloc_1d_dbl(n_out + 1);
+  newnet->target = alloc_1d_dbl(n_out + 1);
+
+  newnet->input_weights = alloc_2d_dbl(n_in + 1, n_hidden + 1);
+  newnet->hidden_weights = alloc_2d_dbl(n_hidden + 1, n_out + 1);
+
+  newnet->input_prev_weights = alloc_2d_dbl(n_in + 1, n_hidden + 1);
+  newnet->hidden_prev_weights = alloc_2d_dbl(n_hidden + 1, n_out + 1);
+
+  return (newnet);
+}
+/* Release a network: the six 1d arrays, every row of the four weight
+   matrices, the row-pointer arrays, and finally the BPNN struct itself.
+   Row counts are taken from net->input_n and net->hidden_n with inclusive
+   upper bounds, i.e. (input_n + 1) and (hidden_n + 1) rows are freed.
+   net is dereferenced immediately, so it must not be NULL. */
+void bpnn_free(net) BPNN *net;
+{
+  int n1, n2, i;
+
+  n1 = net->input_n;
+  n2 = net->hidden_n;
+
+  free((char *)net->input_units);
+  free((char *)net->hidden_units);
+  free((char *)net->output_units);
+
+  free((char *)net->hidden_delta);
+  free((char *)net->output_delta);
+  free((char *)net->target);
+
+  for (i = 0; i <= n1; i++) {
+    free((char *)net->input_weights[i]);
+    free((char *)net->input_prev_weights[i]);
+  }
+  free((char *)net->input_weights);
+  free((char *)net->input_prev_weights);
+
+  for (i = 0; i <= n2; i++) {
+    free((char *)net->hidden_weights[i]);
+    free((char *)net->hidden_prev_weights[i]);
+  }
+  free((char *)net->hidden_weights);
+  free((char *)net->hidden_prev_weights);
+
+  free((char *)net);
+}
+
+/*** Creates a new fully-connected network from scratch,
+     with the given numbers of input, hidden, and output units.
+     Storage comes from bpnn_internal_create(), which adds one extra slot
+     per layer (index 0).
+     Initialization performed here:
+       - input weights: random, or all zero when INITZERO is defined;
+       - hidden weights: always random;
+       - both "previous weights" matrices: zero;
+       - target: filled by bpnn_randomize_row().
+     NOTE(review): the pointer returned by bpnn_internal_create() is used
+     without a NULL check.
+***/
+
+BPNN *bpnn_create(n_in, n_hidden, n_out)
+int n_in, n_hidden, n_out;
+{
+
+  BPNN *newnet;
+
+  newnet = bpnn_internal_create(n_in, n_hidden, n_out);
+
+#ifdef INITZERO
+  bpnn_zero_weights(newnet->input_weights, n_in, n_hidden);
+#else
+  bpnn_randomize_weights(newnet->input_weights, n_in, n_hidden);
+#endif
+  bpnn_randomize_weights(newnet->hidden_weights, n_hidden, n_out);
+  bpnn_zero_weights(newnet->input_prev_weights, n_in, n_hidden);
+  bpnn_zero_weights(newnet->hidden_prev_weights, n_hidden, n_out);
+  bpnn_randomize_row(newnet->target, n_out);
+  return (newnet);
+}
+
+/*** Forward-propagates one layer.
+     For every unit j in 1..n2 of the second layer, l2[j] receives
+     squash() of the sum over k in 0..n1 of conn[k][j] * l1[k].
+     l1[0] is overwritten with 1.0 first so that row 0 of conn acts as
+     the threshold weight. l2[0] is not written.
+***/
+void bpnn_layerforward(l1, l2, conn, n1, n2) float *l1, *l2, **conn;
+int n1, n2;
+{
+  float sum;
+  int j, k;
+
+  /*** Set up thresholding unit ***/
+  l1[0] = 1.0;
+#ifdef OPEN
+  omp_set_num_threads(NUM_THREAD);
+  /* sum is reset at the top of each iteration and consumed within it, so it
+     is per-thread scratch: private, not a reduction. A reduction would also
+     combine the thread copies into the outer sum, which is uninitialized. */
+#pragma omp parallel for shared(conn, n1, n2, l1, l2) private(k, j, sum) schedule(static)
+#endif
+  /*** For each unit in second layer ***/
+  for (j = 1; j <= n2; j++) {
+
+    /*** Compute weighted sum of its inputs ***/
+    sum = 0.0;
+    for (k = 0; k <= n1; k++) {
+      sum += conn[k][j] * l1[k];
+    }
+    l2[j] = squash(sum);
+  }
+}
+
+// extern "C"
+/*** Computes the output-layer error terms.
+     For each unit 1..nj: delta = o * (1 - o) * (t - o), where o is the
+     unit's output and t its target. The sum of ABS(delta) over all units
+     is stored through err.
+***/
+void bpnn_output_error(delta, target, output, nj, err) float *delta, *target,
+    *output, *err;
+int nj;
+{
+  float total = 0.0;
+  int unit = 1;
+
+  while (unit <= nj) {
+    float actual = output[unit];
+    float wanted = target[unit];
+
+    delta[unit] = actual * (1.0 - actual) * (wanted - actual);
+    total += ABS(delta[unit]);
+    unit++;
+  }
+  *err = total;
+}
+
+void bpnn_hidden_error(delta_h, nh, delta_o, no, who, hidden,
+                       err) float *delta_h,
+    *delta_o, *hidden, **who, *err;
+int nh, no;
+{
+  int j, k;
+  float h, sum, errsum;
+
+  errsum = 0.0;
+  for (j = 1; j <= nh; j++) {
+    h = hidden[j];
+    sum = 0.0;
+    for (k = 1; k <= no; k++) {
+      sum += delta_o[k] * who[j][k];
+    }
+    delta_h[j] = h * (1.0 - h) * sum;
+    errsum += ABS(delta_h[j]);
+  }
+  *err = errsum;
+}
+
+/*** Applies one momentum-based weight update.
+     For every destination unit j in 1..ndelta and source unit k in 0..nly:
+       new_dw     = ETA * delta[j] * ly[k] + MOMENTUM * oldw[k][j]
+       w[k][j]   += new_dw
+       oldw[k][j] = new_dw
+     ly[0] is overwritten with 1.0 first so that row 0 updates the
+     threshold weights.
+***/
+void bpnn_adjust_weights(delta, ndelta, ly, nly, w, oldw) float *delta, *ly,
+    **w, **oldw;
+int ndelta, nly; /* previously relied on implicit int */
+{
+  float new_dw;
+  int k, j;
+  ly[0] = 1.0;
+  // eta = 0.3;
+  // momentum = 0.3;
+
+#ifdef OPEN
+  omp_set_num_threads(NUM_THREAD);
+  /* The clause list names only variables this function declares; the update
+     itself uses the ETA/MOMENTUM macros, not a `momentum` variable. */
+#pragma omp parallel for shared(oldw, w, delta, ly) private(j, k, new_dw)      \
+    firstprivate(ndelta, nly)
+#endif
+  for (j = 1; j <= ndelta; j++) {
+    for (k = 0; k <= nly; k++) {
+      new_dw = ((ETA * delta[j] * ly[k]) + (MOMENTUM * oldw[k][j]));
+      w[k][j] += new_dw;
+      oldw[k][j] = new_dw;
+    }
+  }
+}
+
+/*** Runs a forward pass through the whole network: the input units drive
+     the hidden units, which in turn drive the output units.
+***/
+void bpnn_feedforward(net) BPNN *net;
+{
+  int n_in = net->input_n;
+  int n_hid = net->hidden_n;
+  int n_out = net->output_n;
+
+  /* input layer -> hidden layer */
+  bpnn_layerforward(net->input_units, net->hidden_units, net->input_weights,
+                    n_in, n_hid);
+
+  /* hidden layer -> output layer */
+  bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights,
+                    n_hid, n_out);
+}
+
+/*** Performs one training step on the network's current input and target:
+     forward pass, error computation for the output and hidden layers, then
+     a weight update for both weight matrices. The summed output-layer and
+     hidden-layer errors are stored through eo and eh respectively.
+***/
+void bpnn_train(net, eo, eh) BPNN *net;
+float *eo, *eh;
+{
+  float out_err, hid_err;
+  int n_in = net->input_n;
+  int n_hid = net->hidden_n;
+  int n_out = net->output_n;
+
+  /* Forward pass: input -> hidden -> output. */
+  bpnn_feedforward(net);
+
+  /* Error terms, output layer first because the hidden layer depends on it. */
+  bpnn_output_error(net->output_delta, net->target, net->output_units, n_out,
+                    &out_err);
+  bpnn_hidden_error(net->hidden_delta, n_hid, net->output_delta, n_out,
+                    net->hidden_weights, net->hidden_units, &hid_err);
+
+  /* Report the errors before the weights are touched. */
+  *eo = out_err;
+  *eh = hid_err;
+
+  /* Weight updates: hidden->output first, then input->hidden. */
+  bpnn_adjust_weights(net->output_delta, n_out, net->hidden_units, n_hid,
+                      net->hidden_weights, net->hidden_prev_weights);
+  bpnn_adjust_weights(net->hidden_delta, n_hid, net->input_units, n_in,
+                      net->input_weights, net->input_prev_weights);
+}
+
+/* Writes a rows x cols weight matrix to fp as raw floats in row-major order.
+   Each row w[i] is written directly, so no staging buffer is needed.
+   Returns 0 on success, -1 if any row could not be written completely. */
+static int bpnn_write_weights(FILE *fp, float **w, int rows, int cols) {
+  int i;
+
+  for (i = 0; i < rows; i++) {
+    if (fwrite(w[i], sizeof(float), (size_t)cols, fp) != (size_t)cols) {
+      return -1;
+    }
+  }
+  return 0;
+}
+
+/*** Saves a network to filename in the layout bpnn_read() expects:
+     three ints (input, hidden, output unit counts), then the
+     (input_n + 1) x (hidden_n + 1) input weights, then the
+     (hidden_n + 1) x (output_n + 1) hidden weights, all as raw
+     host-endian values. Failures are reported on stdout; the function
+     returns nothing.
+***/
+void bpnn_save(net, filename) BPNN *net;
+char *filename;
+{
+  int n1, n2, n3;
+  FILE *pFile;
+
+  /* Binary mode: the payload is raw ints/floats, not text. */
+  pFile = fopen(filename, "wb");
+  if (pFile == NULL) {
+    printf("BPNN_SAVE: Cannot create '%s'\n", filename);
+    return;
+  }
+
+  n1 = net->input_n;
+  n2 = net->hidden_n;
+  n3 = net->output_n;
+  printf("Saving %dx%dx%d network to '%s'\n", n1, n2, n3, filename);
+
+  /* Each dimension is written as a whole int (bpnn_read reads sizeof(int)),
+     and each matrix as exactly rows * cols floats. */
+  if (fwrite(&n1, sizeof(int), 1, pFile) != 1 ||
+      fwrite(&n2, sizeof(int), 1, pFile) != 1 ||
+      fwrite(&n3, sizeof(int), 1, pFile) != 1 ||
+      bpnn_write_weights(pFile, net->input_weights, n1 + 1, n2 + 1) != 0 ||
+      bpnn_write_weights(pFile, net->hidden_weights, n2 + 1, n3 + 1) != 0) {
+    printf("BPNN_SAVE: Error writing '%s'\n", filename);
+  }
+
+  /* fclose flushes buffered data, so a failure here also means a bad file. */
+  if (fclose(pFile) != 0) {
+    printf("BPNN_SAVE: Error closing '%s'\n", filename);
+  }
+  return;
+}
+
+/* Reads a rows x cols matrix of raw floats (row-major) from fd into w.
+   The matrix is fetched with a single read() and then scattered into the
+   rows. Returns 0 on success, -1 on allocation failure or a short read. */
+static int bpnn_read_weights(int fd, float **w, int rows, int cols) {
+  size_t count = (size_t)rows * (size_t)cols;
+  size_t nbytes = count * sizeof(float);
+  size_t pos = 0;
+  float *buf;
+  int i, j;
+
+  buf = (float *)malloc(nbytes);
+  if (buf == NULL) {
+    return -1;
+  }
+  if ((long)read(fd, (char *)buf, nbytes) != (long)nbytes) {
+    free(buf);
+    return -1;
+  }
+  for (i = 0; i < rows; i++) {
+    for (j = 0; j < cols; j++) {
+      w[i][j] = buf[pos++];
+    }
+  }
+  free(buf);
+  return 0;
+}
+
+/*** Loads a network from filename.
+     Expected layout: three ints (input, hidden, output unit counts)
+     followed by the input weights and then the hidden weights as raw
+     floats. The momentum (previous-change) matrices of the returned
+     network are zeroed.
+     Returns NULL if the file cannot be opened, is truncated, declares a
+     negative dimension, or if memory cannot be allocated.
+***/
+BPNN *bpnn_read(filename)
+char *filename;
+{
+  BPNN *net;
+  int dims[3];
+  int fd, n1, n2, n3;
+
+  /* NOTE(review): flag 0 is O_RDONLY on common POSIX systems; the mode
+     argument is only consulted when a file is created. */
+  if ((fd = open(filename, 0, 0644)) == -1) {
+    return (NULL);
+  }
+
+  printf("Reading '%s'\n", filename); // fflush(stdout);
+
+  /* The three dimensions are stored back to back, so one read covers them. */
+  if ((long)read(fd, (char *)dims, sizeof(dims)) != (long)sizeof(dims) ||
+      dims[0] < 0 || dims[1] < 0 || dims[2] < 0) {
+    printf("BPNN_READ: '%s' has a truncated or invalid header\n", filename);
+    close(fd);
+    return (NULL);
+  }
+  n1 = dims[0];
+  n2 = dims[1];
+  n3 = dims[2];
+
+  net = bpnn_internal_create(n1, n2, n3);
+  if (net == NULL) {
+    /* bpnn_internal_create() has already reported the failure. */
+    close(fd);
+    return (NULL);
+  }
+
+  printf("'%s' contains a %dx%dx%d network\n", filename, n1, n2, n3);
+  printf("Reading input weights..."); // fflush(stdout);
+
+  if (bpnn_read_weights(fd, net->input_weights, n1 + 1, n2 + 1) != 0) {
+    printf("\nBPNN_READ: Cannot read input weights from '%s'\n", filename);
+    bpnn_free(net);
+    close(fd);
+    return (NULL);
+  }
+
+  printf("Done\nReading hidden weights..."); // fflush(stdout);
+
+  if (bpnn_read_weights(fd, net->hidden_weights, n2 + 1, n3 + 1) != 0) {
+    printf("\nBPNN_READ: Cannot read hidden weights from '%s'\n", filename);
+    bpnn_free(net);
+    close(fd);
+    return (NULL);
+  }
+  close(fd);
+
+  printf("Done\n"); // fflush(stdout);
+
+  bpnn_zero_weights(net->input_prev_weights, n1, n2);
+  bpnn_zero_weights(net->hidden_prev_weights, n2, n3);
+
+  return (net);
+}
--- a/examples/backprop/backprop.h
+++ b/examples/backprop/backprop.h
@ -0,0 +1,50 @@
+/* Public interface of the back-propagation example: tuning constants,
+   the BPNN network structure, and the user-level entry points. */
+#ifndef _BACKPROP_H_
+#define _BACKPROP_H_
+
+#define BIGRND 0x7fffffff
+
+#define GPU
+#define THREADS 256
+#define WIDTH 16  // shared memory width
+#define HEIGHT 16 // shared memory height
+
+#define ETA 0.3      // eta value
+#define MOMENTUM 0.3 // momentum value
+#define NUM_THREAD 4 // OpenMP threads
+
+/* A three-layer network. The weight matrices are arrays of row pointers
+   (float **); which index is the source unit and which the destination is
+   determined by the code that allocates and indexes them. */
+typedef struct {
+  int input_n;  /* number of input units */
+  int hidden_n; /* number of hidden units */
+  int output_n; /* number of output units */
+
+  float *input_units;  /* the input units */
+  float *hidden_units; /* the hidden units */
+  float *output_units; /* the output units */
+
+  float *hidden_delta; /* storage for hidden unit error */
+  float *output_delta; /* storage for output unit error */
+
+  float *target; /* storage for target vector */
+
+  float **input_weights;  /* weights from input to hidden layer */
+  float **hidden_weights; /* weights from hidden to output layer */
+
+  /*** The next two are for momentum ***/
+  float **input_prev_weights;  /* previous change on input to hidden wgt */
+  float **hidden_prev_weights; /* previous change on hidden to output wgt */
+} BPNN;
+
+/*** User-level functions ***/
+/* NOTE: these are declared with empty parameter lists, so in C the compiler
+   does not check the arguments at call sites. */
+
+void bpnn_initialize();
+
+BPNN *bpnn_create();
+void bpnn_free();
+
+void bpnn_train();
+void bpnn_feedforward();
+
+void bpnn_save();
+BPNN *bpnn_read();
+
+#endif
--- a/examples/backprop/backprop_cuda-cuda-nvptx64-nvidia-cuda-sm_61.ll
+++ b/examples/backprop/backprop_cuda-cuda-nvptx64-nvidia-cuda-sm_61.ll
@ -0,0 +1,615 @@
+; Device-side LLVM IR for backprop_cuda.cu, targeting nvptx64-nvidia-cuda.
+; ModuleID = 'backprop_cuda-cuda-nvptx64-nvidia-cuda-sm_61.bc'
+source_filename = "backprop_cuda.cu"
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; Opaque one-byte placeholder types behind the blockIdx/threadIdx builtins.
+%struct.__cuda_builtin_blockIdx_t = type { i8 }
+%struct.__cuda_builtin_threadIdx_t = type { i8 }
+%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
+
+; COMDAT groups for the linkonce_odr builtin-fetch wrappers defined below.
+$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv = comdat any
+
+$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
+
+$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv = comdat any
+
+@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
+@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
+; Function-local statics of bpnn_layerforward_CUDA, placed in addrspace(3):
+; a 16-float input_node vector and a 16x16-float weight_matrix tile.
+@_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE10input_node = internal addrspace(3) global [16 x float] undef, align 4
+@_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix = internal addrspace(3) global [16 x [16 x float]] undef, align 4
+; Key string passed to __nvvm_reflect by __nv_fast_log2f.
+@"$str" = private addrspace(1) constant [11 x i8] c"__CUDA_FTZ\00"
+
+; Weak device-side stub for cudaMalloc: spills both arguments to stack slots,
+; performs no allocation, and always returns 999.
+; NOTE(review): 999 is presumably cudaErrorUnknown - confirm against the CUDA headers.
+; Function Attrs: convergent noinline nounwind optnone
+define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
+entry:
+  %p.addr = alloca i8**, align 8
+  %s.addr = alloca i64, align 8
+  store i8** %p, i8*** %p.addr, align 8
+  store i64 %s, i64* %s.addr, align 8
+  ret i32 999
+}
+
+; Weak device-side stub for cudaFuncGetAttributes: spills its arguments,
+; never writes through %p, and always returns 999.
+; Function Attrs: convergent noinline nounwind optnone
+define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
+entry:
+  %p.addr = alloca %struct.cudaFuncAttributes*, align 8
+  %c.addr = alloca i8*, align 8
+  store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
+  store i8* %c, i8** %c.addr, align 8
+  ret i32 999
+}
+
+; Weak device-side stub for cudaDeviceGetAttribute: spills its arguments,
+; never writes through %value, and always returns 999.
+; Function Attrs: convergent noinline nounwind optnone
+define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
+entry:
+  %value.addr = alloca i32*, align 8
+  %attr.addr = alloca i32, align 4
+  %device.addr = alloca i32, align 4
+  store i32* %value, i32** %value.addr, align 8
+  store i32 %attr, i32* %attr.addr, align 4
+  store i32 %device, i32* %device.addr, align 4
+  ret i32 999
+}
+
+; Weak device-side stub for cudaGetDevice: spills its argument, never writes
+; through %device, and always returns 999.
+; Function Attrs: convergent noinline nounwind optnone
+define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
+entry:
+  %device.addr = alloca i32*, align 8
+  store i32* %device, i32** %device.addr, align 8
+  ret i32 999
+}
+
+; Weak device-side stub for cudaOccupancyMaxActiveBlocksPerMultiprocessor:
+; spills its arguments, never writes through %numBlocks, and always returns 999.
+; Function Attrs: convergent noinline nounwind optnone
+define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
+entry:
+  %numBlocks.addr = alloca i32*, align 8
+  %func.addr = alloca i8*, align 8
+  %blockSize.addr = alloca i32, align 4
+  %dynamicSmemSize.addr = alloca i64, align 8
+  store i32* %numBlocks, i32** %numBlocks.addr, align 8
+  store i8* %func, i8** %func.addr, align 8
+  store i32 %blockSize, i32* %blockSize.addr, align 4
+  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
+  ret i32 999
+}
+
+; Weak device-side stub for
+; cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags: spills its
+; arguments, never writes through %numBlocks, and always returns 999.
+; Function Attrs: convergent noinline nounwind optnone
+define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
+entry:
+  %numBlocks.addr = alloca i32*, align 8
+  %func.addr = alloca i8*, align 8
+  %blockSize.addr = alloca i32, align 4
+  %dynamicSmemSize.addr = alloca i64, align 8
+  %flags.addr = alloca i32, align 4
+  store i32* %numBlocks, i32** %numBlocks.addr, align 8
+  store i8* %func, i8** %func.addr, align 8
+  store i32 %blockSize, i32* %blockSize.addr, align 4
+  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
+  store i32 %flags, i32* %flags.addr, align 4
+  ret i32 999
+}
+
+; Kernel bpnn_layerforward_CUDA(float *input_cuda, float *output_hidden_cuda,
+;                               float *input_hidden_cuda,
+;                               float *hidden_partial_sum, int in, int hid).
+; Unoptimized code: every local lives in an alloca slot. Per thread it
+;   1. loads one weight from input_hidden_cuda into the 16x16 weight_matrix
+;      tile and multiplies it by the matching input_node entry,
+;   2. folds the tile along its first (ty) index in log2(16) steps,
+;   3. writes its tile entry back to input_hidden_cuda[index], and
+;   4. for tx == 0, stores weight_matrix[tx][ty] into hidden_partial_sum.
+; %output_hidden_cuda and %in are spilled but not otherwise used here.
+; Function Attrs: convergent noinline nounwind optnone
+define dso_local void @_Z22bpnn_layerforward_CUDAPfS_S_S_ii(float* %input_cuda, float* %output_hidden_cuda, float* %input_hidden_cuda, float* %hidden_partial_sum, i32 %in, i32 %hid) #0 {
+entry:
+  %input_cuda.addr = alloca float*, align 8
+  %output_hidden_cuda.addr = alloca float*, align 8
+  %input_hidden_cuda.addr = alloca float*, align 8
+  %hidden_partial_sum.addr = alloca float*, align 8
+  %in.addr = alloca i32, align 4
+  %hid.addr = alloca i32, align 4
+  %by = alloca i32, align 4
+  %tx = alloca i32, align 4
+  %ty = alloca i32, align 4
+  %index = alloca i32, align 4
+  %index_in = alloca i32, align 4
+  %i = alloca i32, align 4
+  %power_two = alloca i32, align 4
+  store float* %input_cuda, float** %input_cuda.addr, align 8
+  store float* %output_hidden_cuda, float** %output_hidden_cuda.addr, align 8
+  store float* %input_hidden_cuda, float** %input_hidden_cuda.addr, align 8
+  store float* %hidden_partial_sum, float** %hidden_partial_sum.addr, align 8
+  store i32 %in, i32* %in.addr, align 4
+  store i32 %hid, i32* %hid.addr, align 4
+  ; by = blockIdx.y, tx = threadIdx.x, ty = threadIdx.y
+  %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #2
+  store i32 %call, i32* %by, align 4
+  %call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
+  store i32 %call1, i32* %tx, align 4
+  %call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #2
+  store i32 %call2, i32* %ty, align 4
+  ; index = (hid + 1) * 16 * by + (hid + 1) * ty + tx + 1 + (hid + 1)
+  %0 = load i32, i32* %hid.addr, align 4
+  %add = add nsw i32 %0, 1
+  %mul = mul nsw i32 %add, 16
+  %1 = load i32, i32* %by, align 4
+  %mul3 = mul nsw i32 %mul, %1
+  %2 = load i32, i32* %hid.addr, align 4
+  %add4 = add nsw i32 %2, 1
+  %3 = load i32, i32* %ty, align 4
+  %mul5 = mul nsw i32 %add4, %3
+  %add6 = add nsw i32 %mul3, %mul5
+  %4 = load i32, i32* %tx, align 4
+  %add7 = add nsw i32 %add6, %4
+  %add8 = add nsw i32 %add7, 1
+  %5 = load i32, i32* %hid.addr, align 4
+  %add9 = add nsw i32 %5, 1
+  %add10 = add nsw i32 %add8, %add9
+  store i32 %add10, i32* %index, align 4
+  ; index_in = 16 * by + ty + 1
+  %6 = load i32, i32* %by, align 4
+  %mul11 = mul nsw i32 16, %6
+  %7 = load i32, i32* %ty, align 4
+  %add12 = add nsw i32 %mul11, %7
+  %add13 = add nsw i32 %add12, 1
+  store i32 %add13, i32* %index_in, align 4
+  ; only threads with tx == 0 load the input value for their ty
+  %8 = load i32, i32* %tx, align 4
+  %cmp = icmp eq i32 %8, 0
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  ; input_node[ty] = input_cuda[index_in]
+  %9 = load float*, float** %input_cuda.addr, align 8
+  %10 = load i32, i32* %index_in, align 4
+  %idxprom = sext i32 %10 to i64
+  %arrayidx = getelementptr inbounds float, float* %9, i64 %idxprom
+  %11 = load float, float* %arrayidx, align 4
+  %12 = load i32, i32* %ty, align 4
+  %idxprom14 = sext i32 %12 to i64
+  %arrayidx15 = getelementptr inbounds [16 x float], [16 x float]* addrspacecast ([16 x float] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE10input_node to [16 x float]*), i64 0, i64 %idxprom14
+  store float %11, float* %arrayidx15, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  call void @llvm.nvvm.barrier0()
+  ; weight_matrix[ty][tx] = input_hidden_cuda[index]
+  %13 = load float*, float** %input_hidden_cuda.addr, align 8
+  %14 = load i32, i32* %index, align 4
+  %idxprom16 = sext i32 %14 to i64
+  %arrayidx17 = getelementptr inbounds float, float* %13, i64 %idxprom16
+  %15 = load float, float* %arrayidx17, align 4
+  %16 = load i32, i32* %ty, align 4
+  %idxprom18 = sext i32 %16 to i64
+  %arrayidx19 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom18
+  %17 = load i32, i32* %tx, align 4
+  %idxprom20 = sext i32 %17 to i64
+  %arrayidx21 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx19, i64 0, i64 %idxprom20
+  store float %15, float* %arrayidx21, align 4
+  call void @llvm.nvvm.barrier0()
+  ; weight_matrix[ty][tx] = weight_matrix[ty][tx] * input_node[ty]
+  %18 = load i32, i32* %ty, align 4
+  %idxprom22 = sext i32 %18 to i64
+  %arrayidx23 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom22
+  %19 = load i32, i32* %tx, align 4
+  %idxprom24 = sext i32 %19 to i64
+  %arrayidx25 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx23, i64 0, i64 %idxprom24
+  %20 = load float, float* %arrayidx25, align 4
+  %21 = load i32, i32* %ty, align 4
+  %idxprom26 = sext i32 %21 to i64
+  %arrayidx27 = getelementptr inbounds [16 x float], [16 x float]* addrspacecast ([16 x float] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE10input_node to [16 x float]*), i64 0, i64 %idxprom26
+  %22 = load float, float* %arrayidx27, align 4
+  %mul28 = fmul contract float %20, %22
+  %23 = load i32, i32* %ty, align 4
+  %idxprom29 = sext i32 %23 to i64
+  %arrayidx30 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom29
+  %24 = load i32, i32* %tx, align 4
+  %idxprom31 = sext i32 %24 to i64
+  %arrayidx32 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx30, i64 0, i64 %idxprom31
+  store float %mul28, float* %arrayidx32, align 4
+  call void @llvm.nvvm.barrier0()
+  ; reduction loop: for (i = 1; (float)i <= __log2f(16.0f); i++)
+  store i32 1, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %if.end
+  ; the loop bound is recomputed through _ZL7__log2ff on every iteration
+  %25 = load i32, i32* %i, align 4
+  %conv = sitofp i32 %25 to float
+  %call33 = call float @_ZL7__log2ff(float 1.600000e+01) #2
+  %cmp34 = fcmp ole float %conv, %call33
+  br i1 %cmp34, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  ; power_two = (int)__powf(2.0f, (float)i); active rows have ty % power_two == 0
+  %26 = load i32, i32* %i, align 4
+  %conv35 = sitofp i32 %26 to float
+  %call36 = call float @_ZL6__powfff(float 2.000000e+00, float %conv35) #2
+  %conv37 = fptosi float %call36 to i32
+  store i32 %conv37, i32* %power_two, align 4
+  %27 = load i32, i32* %ty, align 4
+  %28 = load i32, i32* %power_two, align 4
+  %rem = srem i32 %27, %28
+  %cmp38 = icmp eq i32 %rem, 0
+  br i1 %cmp38, label %if.then39, label %if.end54
+
+if.then39:                                        ; preds = %for.body
+  ; weight_matrix[ty][tx] += weight_matrix[ty + power_two / 2][tx]
+  %29 = load i32, i32* %ty, align 4
+  %idxprom40 = sext i32 %29 to i64
+  %arrayidx41 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom40
+  %30 = load i32, i32* %tx, align 4
+  %idxprom42 = sext i32 %30 to i64
+  %arrayidx43 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx41, i64 0, i64 %idxprom42
+  %31 = load float, float* %arrayidx43, align 4
+  %32 = load i32, i32* %ty, align 4
+  %33 = load i32, i32* %power_two, align 4
+  %div = sdiv i32 %33, 2
+  %add44 = add nsw i32 %32, %div
+  %idxprom45 = sext i32 %add44 to i64
+  %arrayidx46 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom45
+  %34 = load i32, i32* %tx, align 4
+  %idxprom47 = sext i32 %34 to i64
+  %arrayidx48 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx46, i64 0, i64 %idxprom47
+  %35 = load float, float* %arrayidx48, align 4
+  %add49 = fadd contract float %31, %35
+  %36 = load i32, i32* %ty, align 4
+  %idxprom50 = sext i32 %36 to i64
+  %arrayidx51 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom50
+  %37 = load i32, i32* %tx, align 4
+  %idxprom52 = sext i32 %37 to i64
+  %arrayidx53 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx51, i64 0, i64 %idxprom52
+  store float %add49, float* %arrayidx53, align 4
+  br label %if.end54
+
+if.end54:                                         ; preds = %if.then39, %for.body
+  ; the barrier sits outside the conditional, so every thread reaches it each step
+  call void @llvm.nvvm.barrier0()
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end54
+  %38 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %38, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ; input_hidden_cuda[index] = weight_matrix[ty][tx]
+  %39 = load i32, i32* %ty, align 4
+  %idxprom55 = sext i32 %39 to i64
+  %arrayidx56 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom55
+  %40 = load i32, i32* %tx, align 4
+  %idxprom57 = sext i32 %40 to i64
+  %arrayidx58 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx56, i64 0, i64 %idxprom57
+  %41 = load float, float* %arrayidx58, align 4
+  %42 = load float*, float** %input_hidden_cuda.addr, align 8
+  %43 = load i32, i32* %index, align 4
+  %idxprom59 = sext i32 %43 to i64
+  %arrayidx60 = getelementptr inbounds float, float* %42, i64 %idxprom59
+  store float %41, float* %arrayidx60, align 4
+  call void @llvm.nvvm.barrier0()
+  %44 = load i32, i32* %tx, align 4
+  %cmp61 = icmp eq i32 %44, 0
+  br i1 %cmp61, label %if.then62, label %if.end71
+
+if.then62:                                        ; preds = %for.end
+  ; hidden_partial_sum[by * hid + ty] = weight_matrix[tx][ty]
+  ; (indices are swapped relative to the accesses above; tx is 0 on this path)
+  %45 = load i32, i32* %tx, align 4
+  %idxprom63 = sext i32 %45 to i64
+  %arrayidx64 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom63
+  %46 = load i32, i32* %ty, align 4
+  %idxprom65 = sext i32 %46 to i64
+  %arrayidx66 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx64, i64 0, i64 %idxprom65
+  %47 = load float, float* %arrayidx66, align 4
+  %48 = load float*, float** %hidden_partial_sum.addr, align 8
+  %49 = load i32, i32* %by, align 4
+  %50 = load i32, i32* %hid.addr, align 4
+  %mul67 = mul nsw i32 %49, %50
+  %51 = load i32, i32* %ty, align 4
+  %add68 = add nsw i32 %mul67, %51
+  %idxprom69 = sext i32 %add68 to i64
+  %arrayidx70 = getelementptr inbounds float, float* %48, i64 %idxprom69
+  store float %47, float* %arrayidx70, align 4
+  br label %if.end71
+
+if.end71:                                         ; preds = %if.then62, %for.end
+  ret void
+}
+
+; blockIdx.y accessor: returns the ctaid.y special register.
+; Function Attrs: alwaysinline convergent nounwind
+define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
+entry:
+  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
+  ret i32 %0
+}
+
+; threadIdx.x accessor: returns the tid.x special register.
+; Function Attrs: alwaysinline convergent nounwind
+define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
+entry:
+  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  ret i32 %0
+}
+
+; threadIdx.y accessor: returns the tid.y special register.
+; Function Attrs: alwaysinline convergent nounwind
+define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
+entry:
+  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
+  ret i32 %0
+}
+
+; NVVM barrier intrinsic called by both kernels in this module.
+; Function Attrs: convergent nounwind
+declare void @llvm.nvvm.barrier0() #2
+
+; __log2f(float): thin wrapper that forwards its argument to __nv_fast_log2f.
+; Function Attrs: alwaysinline convergent nounwind
+define internal float @_ZL7__log2ff(float %__a) #1 {
+entry:
+  %__a.addr = alloca float, align 4
+  store float %__a, float* %__a.addr, align 4
+  %0 = load float, float* %__a.addr, align 4
+  %call = call float @__nv_fast_log2f(float %0) #2
+  ret float %call
+}
+
+; __powf(float, float): thin wrapper that forwards both arguments to
+; __nv_fast_powf.
+; Function Attrs: alwaysinline convergent nounwind
+define internal float @_ZL6__powfff(float %__a, float %__b) #1 {
+entry:
+  %__a.addr = alloca float, align 4
+  %__b.addr = alloca float, align 4
+  store float %__a, float* %__a.addr, align 4
+  store float %__b, float* %__b.addr, align 4
+  %0 = load float, float* %__a.addr, align 4
+  %1 = load float, float* %__b.addr, align 4
+  %call = call float @__nv_fast_powf(float %0, float %1) #2
+  ret float %call
+}
+
+; Kernel bpnn_adjust_weights_cuda(float *delta, int hid, float *ly, int in,
+;                                 float *w, float *oldw).
+; Unoptimized code: every local lives in an alloca slot. Each thread updates
+; one weight and its momentum entry at `index`; threads with ty == 0 in block
+; row by == 0 additionally update the entries at `index_x`. Arithmetic is
+; done in double (constant 0.3) and truncated back to float on store.
+; %in is spilled but not otherwise used here.
+; Function Attrs: convergent noinline nounwind optnone
+define dso_local void @_Z24bpnn_adjust_weights_cudaPfiS_iS_S_(float* %delta, i32 %hid, float* %ly, i32 %in, float* %w, float* %oldw) #0 {
+entry:
+  %delta.addr = alloca float*, align 8
+  %hid.addr = alloca i32, align 4
+  %ly.addr = alloca float*, align 8
+  %in.addr = alloca i32, align 4
+  %w.addr = alloca float*, align 8
+  %oldw.addr = alloca float*, align 8
+  %by = alloca i32, align 4
+  %tx = alloca i32, align 4
+  %ty = alloca i32, align 4
+  %index = alloca i32, align 4
+  %index_y = alloca i32, align 4
+  %index_x = alloca i32, align 4
+  store float* %delta, float** %delta.addr, align 8
+  store i32 %hid, i32* %hid.addr, align 4
+  store float* %ly, float** %ly.addr, align 8
+  store i32 %in, i32* %in.addr, align 4
+  store float* %w, float** %w.addr, align 8
+  store float* %oldw, float** %oldw.addr, align 8
+  ; by = blockIdx.y, tx = threadIdx.x, ty = threadIdx.y
+  %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #2
+  store i32 %call, i32* %by, align 4
+  %call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
+  store i32 %call1, i32* %tx, align 4
+  %call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #2
+  store i32 %call2, i32* %ty, align 4
+  ; index = (hid + 1) * 16 * by + (hid + 1) * ty + tx + 1 + (hid + 1)
+  %0 = load i32, i32* %hid.addr, align 4
+  %add = add nsw i32 %0, 1
+  %mul = mul nsw i32 %add, 16
+  %1 = load i32, i32* %by, align 4
+  %mul3 = mul nsw i32 %mul, %1
+  %2 = load i32, i32* %hid.addr, align 4
+  %add4 = add nsw i32 %2, 1
+  %3 = load i32, i32* %ty, align 4
+  %mul5 = mul nsw i32 %add4, %3
+  %add6 = add nsw i32 %mul3, %mul5
+  %4 = load i32, i32* %tx, align 4
+  %add7 = add nsw i32 %add6, %4
+  %add8 = add nsw i32 %add7, 1
+  %5 = load i32, i32* %hid.addr, align 4
+  %add9 = add nsw i32 %5, 1
+  %add10 = add nsw i32 %add8, %add9
+  store i32 %add10, i32* %index, align 4
+  ; index_y = 16 * by + ty + 1
+  %6 = load i32, i32* %by, align 4
+  %mul11 = mul nsw i32 16, %6
+  %7 = load i32, i32* %ty, align 4
+  %add12 = add nsw i32 %mul11, %7
+  %add13 = add nsw i32 %add12, 1
+  store i32 %add13, i32* %index_y, align 4
+  ; index_x = tx + 1
+  %8 = load i32, i32* %tx, align 4
+  %add14 = add nsw i32 %8, 1
+  store i32 %add14, i32* %index_x, align 4
+  ; w[index] += 0.3 * delta[index_x] * ly[index_y] + 0.3 * oldw[index]
+  %9 = load float*, float** %delta.addr, align 8
+  %10 = load i32, i32* %index_x, align 4
+  %idxprom = sext i32 %10 to i64
+  %arrayidx = getelementptr inbounds float, float* %9, i64 %idxprom
+  %11 = load float, float* %arrayidx, align 4
+  %conv = fpext float %11 to double
+  %mul15 = fmul contract double 3.000000e-01, %conv
+  %12 = load float*, float** %ly.addr, align 8
+  %13 = load i32, i32* %index_y, align 4
+  %idxprom16 = sext i32 %13 to i64
+  %arrayidx17 = getelementptr inbounds float, float* %12, i64 %idxprom16
+  %14 = load float, float* %arrayidx17, align 4
+  %conv18 = fpext float %14 to double
+  %mul19 = fmul contract double %mul15, %conv18
+  %15 = load float*, float** %oldw.addr, align 8
+  %16 = load i32, i32* %index, align 4
+  %idxprom20 = sext i32 %16 to i64
+  %arrayidx21 = getelementptr inbounds float, float* %15, i64 %idxprom20
+  %17 = load float, float* %arrayidx21, align 4
+  %conv22 = fpext float %17 to double
+  %mul23 = fmul contract double 3.000000e-01, %conv22
+  %add24 = fadd contract double %mul19, %mul23
+  %18 = load float*, float** %w.addr, align 8
+  %19 = load i32, i32* %index, align 4
+  %idxprom25 = sext i32 %19 to i64
+  %arrayidx26 = getelementptr inbounds float, float* %18, i64 %idxprom25
+  %20 = load float, float* %arrayidx26, align 4
+  %conv27 = fpext float %20 to double
+  %add28 = fadd contract double %conv27, %add24
+  %conv29 = fptrunc double %add28 to float
+  store float %conv29, float* %arrayidx26, align 4
+  ; oldw[index] = 0.3 * delta[index_x] * ly[index_y] + 0.3 * oldw[index]
+  ; (the same expression is recomputed from memory rather than reused)
+  %21 = load float*, float** %delta.addr, align 8
+  %22 = load i32, i32* %index_x, align 4
+  %idxprom30 = sext i32 %22 to i64
+  %arrayidx31 = getelementptr inbounds float, float* %21, i64 %idxprom30
+  %23 = load float, float* %arrayidx31, align 4
+  %conv32 = fpext float %23 to double
+  %mul33 = fmul contract double 3.000000e-01, %conv32
+  %24 = load float*, float** %ly.addr, align 8
+  %25 = load i32, i32* %index_y, align 4
+  %idxprom34 = sext i32 %25 to i64
+  %arrayidx35 = getelementptr inbounds float, float* %24, i64 %idxprom34
+  %26 = load float, float* %arrayidx35, align 4
+  %conv36 = fpext float %26 to double
+  %mul37 = fmul contract double %mul33, %conv36
+  %27 = load float*, float** %oldw.addr, align 8
+  %28 = load i32, i32* %index, align 4
+  %idxprom38 = sext i32 %28 to i64
+  %arrayidx39 = getelementptr inbounds float, float* %27, i64 %idxprom38
+  %29 = load float, float* %arrayidx39, align 4
+  %conv40 = fpext float %29 to double
+  %mul41 = fmul contract double 3.000000e-01, %conv40
+  %add42 = fadd contract double %mul37, %mul41
+  %conv43 = fptrunc double %add42 to float
+  %30 = load float*, float** %oldw.addr, align 8
+  %31 = load i32, i32* %index, align 4
+  %idxprom44 = sext i32 %31 to i64
+  %arrayidx45 = getelementptr inbounds float, float* %30, i64 %idxprom44
+  store float %conv43, float* %arrayidx45, align 4
+  call void @llvm.nvvm.barrier0()
+  ; extra update only for ty == 0 && by == 0
+  %32 = load i32, i32* %ty, align 4
+  %cmp = icmp eq i32 %32, 0
+  br i1 %cmp, label %land.lhs.true, label %if.end
+
+land.lhs.true:                                    ; preds = %entry
+  %33 = load i32, i32* %by, align 4
+  %cmp46 = icmp eq i32 %33, 0
+  br i1 %cmp46, label %if.then, label %if.end
+
+if.then:                                          ; preds = %land.lhs.true
+  ; w[index_x] += 0.3 * delta[index_x] + 0.3 * oldw[index_x]
+  %34 = load float*, float** %delta.addr, align 8
+  %35 = load i32, i32* %index_x, align 4
+  %idxprom47 = sext i32 %35 to i64
+  %arrayidx48 = getelementptr inbounds float, float* %34, i64 %idxprom47
+  %36 = load float, float* %arrayidx48, align 4
+  %conv49 = fpext float %36 to double
+  %mul50 = fmul contract double 3.000000e-01, %conv49
+  %37 = load float*, float** %oldw.addr, align 8
+  %38 = load i32, i32* %index_x, align 4
+  %idxprom51 = sext i32 %38 to i64
+  %arrayidx52 = getelementptr inbounds float, float* %37, i64 %idxprom51
+  %39 = load float, float* %arrayidx52, align 4
+  %conv53 = fpext float %39 to double
+  %mul54 = fmul contract double 3.000000e-01, %conv53
+  %add55 = fadd contract double %mul50, %mul54
+  %40 = load float*, float** %w.addr, align 8
+  %41 = load i32, i32* %index_x, align 4
+  %idxprom56 = sext i32 %41 to i64
+  %arrayidx57 = getelementptr inbounds float, float* %40, i64 %idxprom56
+  %42 = load float, float* %arrayidx57, align 4
+  %conv58 = fpext float %42 to double
+  %add59 = fadd contract double %conv58, %add55
+  %conv60 = fptrunc double %add59 to float
+  store float %conv60, float* %arrayidx57, align 4
+  ; oldw[index_x] = 0.3 * delta[index_x] + 0.3 * oldw[index_x]
+  %43 = load float*, float** %delta.addr, align 8
+  %44 = load i32, i32* %index_x, align 4
+  %idxprom61 = sext i32 %44 to i64
+  %arrayidx62 = getelementptr inbounds float, float* %43, i64 %idxprom61
+  %45 = load float, float* %arrayidx62, align 4
+  %conv63 = fpext float %45 to double
+  %mul64 = fmul contract double 3.000000e-01, %conv63
+  %46 = load float*, float** %oldw.addr, align 8
+  %47 = load i32, i32* %index_x, align 4
+  %idxprom65 = sext i32 %47 to i64
+  %arrayidx66 = getelementptr inbounds float, float* %46, i64 %idxprom65
+  %48 = load float, float* %arrayidx66, align 4
+  %conv67 = fpext float %48 to double
+  %mul68 = fmul contract double 3.000000e-01, %conv67
+  %add69 = fadd contract double %mul64, %mul68
+  %conv70 = fptrunc double %add69 to float
+  %49 = load float*, float** %oldw.addr, align 8
+  %50 = load i32, i32* %index_x, align 4
+  %idxprom71 = sext i32 %50 to i64
+  %arrayidx72 = getelementptr inbounds float, float* %49, i64 %idxprom71
+  store float %conv70, float* %arrayidx72, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %land.lhs.true, %entry
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #3
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #3
+
+; Function Attrs: alwaysinline convergent inlinehint nounwind
+; Approximate base-2 logarithm of %a.
+; Calls @__nvvm_reflect with the @"$str" query string (its contents are not
+; visible in this part of the module - presumably the FTZ query, to confirm)
+; and uses the result to choose between the two lg2.approx intrinsics.
+define internal float @__nv_fast_log2f(float %a) #4 {
+  %call.i = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*))
+  %1 = icmp ne i32 %call.i, 0
+  br i1 %1, label %2, label %4
+
+; Reflect query returned non-zero: use the .ftz variant of the intrinsic.
+2:                                                ; preds = %0
+  %3 = call float @llvm.nvvm.lg2.approx.ftz.f(float %a)
+  br label %__nvvm_builtin_log2f.exit
+
+; Reflect query returned zero: use the non-.ftz variant.
+4:                                                ; preds = %0
+  %5 = call float @llvm.nvvm.lg2.approx.f(float %a)
+  br label %__nvvm_builtin_log2f.exit
+
+; Merge point: return whichever intrinsic result was produced.
+__nvvm_builtin_log2f.exit:                        ; preds = %4, %2
+  %retval.0.i = phi float [ %3, %2 ], [ %5, %4 ]
+  ret float %retval.0.i
+}
+
+; Function Attrs: convergent nounwind
+declare i32 @__nvvm_reflect(i8*) #5
+
+; Function Attrs: nounwind readnone
+declare float @llvm.nvvm.lg2.approx.ftz.f(float) #3
+
+; Function Attrs: nounwind readnone
+declare float @llvm.nvvm.lg2.approx.f(float) #3
+
+; Function Attrs: alwaysinline convergent inlinehint nounwind
+; Approximate power function: returns ex2.approx(%b * lg2.approx(%a)).
+; The log2 and exp2 steps appear here already inlined (see the *.exit block
+; names); each step queries @__nvvm_reflect to choose between the .ftz and
+; non-.ftz variant of its intrinsic.
+define internal float @__nv_fast_powf(float %a, float %b) #4 {
+  %call.i.i = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*))
+  %1 = icmp ne i32 %call.i.i, 0
+  br i1 %1, label %2, label %4
+
+; Step 1a: lg2 of %a, .ftz variant (reflect query returned non-zero).
+2:                                                ; preds = %0
+  %3 = call float @llvm.nvvm.lg2.approx.ftz.f(float %a)
+  br label %__nv_fast_log2f.exit
+
+; Step 1b: lg2 of %a, non-.ftz variant.
+4:                                                ; preds = %0
+  %5 = call float @llvm.nvvm.lg2.approx.f(float %a)
+  br label %__nv_fast_log2f.exit
+
+; Step 2: scale the logarithm by the exponent %b, then repeat the reflect query
+; to select the ex2 variant.
+__nv_fast_log2f.exit:                             ; preds = %4, %2
+  %retval.0.i.i = phi float [ %3, %2 ], [ %5, %4 ]
+  %6 = fmul float %b, %retval.0.i.i
+  %call.i.i1 = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*))
+  %7 = icmp ne i32 %call.i.i1, 0
+  br i1 %7, label %8, label %10
+
+; Step 3a: ex2 of the product, .ftz variant.
+8:                                                ; preds = %__nv_fast_log2f.exit
+  %9 = call float @llvm.nvvm.ex2.approx.ftz.f(float %6)
+  br label %__nv_exp2f.exit
+
+; Step 3b: ex2 of the product, non-.ftz variant.
+10:                                               ; preds = %__nv_fast_log2f.exit
+  %11 = call float @llvm.nvvm.ex2.approx.f(float %6)
+  br label %__nv_exp2f.exit
+
+; Merge point: return the selected ex2 result.
+__nv_exp2f.exit:                                  ; preds = %10, %8
+  %retval.0.i.i2 = phi float [ %9, %8 ], [ %11, %10 ]
+  ret float %retval.0.i.i2
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.nvvm.ex2.approx.ftz.f(float) #3
+
+; Function Attrs: nounwind readnone
+declare float @llvm.nvvm.ex2.approx.f(float) #3
+
+attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { convergent nounwind }
+attributes #3 = { nounwind readnone }
+attributes #4 = { alwaysinline convergent inlinehint nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #5 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.module.flags = !{!0, !1, !2}
+!nvvm.annotations = !{!3, !4, !5, !6, !5, !7, !7, !7, !7, !8, !8, !7}
+!llvm.ident = !{!9}
+!nvvmir.version = !{!10}
+
+!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
+!1 = !{i32 1, !"wchar_size", i32 4}
+!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
+!3 = !{void (float*, float*, float*, float*, i32, i32)* @_Z22bpnn_layerforward_CUDAPfS_S_S_ii, !"kernel", i32 1}
+!4 = !{void (float*, i32, float*, i32, float*, float*)* @_Z24bpnn_adjust_weights_cudaPfiS_iS_S_, !"kernel", i32 1}
+!5 = !{null, !"align", i32 8}
+!6 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
+!7 = !{null, !"align", i32 16}
+!8 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
+!9 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
+!10 = !{i32 1, i32 4}
--- a/examples/backprop/backprop_cuda-host-x86_64-unknown-linux-gnu.ll
+++ b/examples/backprop/backprop_cuda-host-x86_64-unknown-linux-gnu.ll
--- a/examples/backprop/backprop_cuda.cu
+++ b/examples/backprop/backprop_cuda.cu
@ -0,0 +1,195 @@
+#include <cuda.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+
+// includes, kernels
+#include "backprop.h"
+#include "backprop_cuda_kernel.cu"
+
+////////////////////////////////////////////////////////////////////////////////
+
+extern "C" void bpnn_layerforward(float *l1, float *l2, float **conn, int n1,
+                                  int n2);
+
+extern "C" void bpnn_output_error(float *delta, float *target, float *output,
+                                  int nj, float *err);
+
+extern "C" void bpnn_hidden_error(float *delta_h, int nh, float *delta_o,
+                                  int no, float **who, float *hidden,
+                                  float *err);
+
+extern "C" void bpnn_adjust_weights(float *delta, int ndelta, float *ly,
+                                    int nly, float **w, float **oldw);
+
+extern "C" int setup(int argc, char **argv);
+
+extern "C" float **alloc_2d_dbl(int m, int n);
+
+extern "C" float squash(float x);
+
+// Returns the current wall-clock time reported by gettimeofday(), expressed
+// in seconds as a double (whole seconds plus the microsecond fraction).
+double gettime() {
+  struct timeval now;
+  gettimeofday(&now, NULL);
+  const double micros_as_seconds = now.tv_usec * 1e-6;
+  return now.tv_sec + micros_as_seconds;
+}
+
+unsigned int num_threads = 0;
+unsigned int num_blocks = 0;
+
+////////////////////////////////////////////////////////////////////////////////
+// Program main
+////////////////////////////////////////////////////////////////////////////////
+int main(int argc, char **argv) {
+  // Select device 0, then hand the command line to the C-side driver.
+  cudaSetDevice(0);
+  (void)setup(argc, argv); // result intentionally unused
+  return 0;
+}
+
+/**
+ * Runs one training pass over `net`.
+ *
+ * When GPU is defined, the input->hidden forward pass and the input-weight
+ * adjustment are executed by the CUDA kernels; when CPU is defined, the host
+ * routines bpnn_layerforward()/bpnn_adjust_weights() are used for those two
+ * steps instead. The hidden->output forward pass, both error computations and
+ * the hidden-weight adjustment always go through the host routines.
+ *
+ * In the GPU build the adjusted input weights are copied back into a flat
+ * host buffer and printed to stdout; they are not written back into
+ * net->input_weights.
+ *
+ * @param net network whose units, deltas and weights are read and updated.
+ * @param eo  if non-NULL, receives the value bpnn_output_error() reported
+ *            through its error argument.
+ * @param eh  if non-NULL, receives the value bpnn_hidden_error() reported
+ *            through its error argument.
+ */
+extern "C" void bpnn_train_cuda(BPNN *net, float *eo, float *eh) {
+  int in, hid, out;
+  // Zero-initialised so that a defined value is handed back through eo/eh.
+  float out_err = 0.0f, hid_err = 0.0f;
+
+  in = net->input_n;
+  hid = net->hidden_n;
+  out = net->output_n;
+
+#ifdef GPU
+  int m = 0;
+  float *input_hidden_cuda;
+  float *input_cuda;
+  float *output_hidden_cuda;
+  float *partial_sum;
+  float *hidden_partial_sum;
+  float *hidden_delta_cuda;
+  float *input_prev_weights_cuda;
+  float sum;
+  float *input_weights_one_dim;
+  float *input_weights_prev_one_dim;
+  // One block per group of 16 input units, stacked along the grid's y axis.
+  num_blocks = in / 16;
+  dim3 grid(1, num_blocks);
+  dim3 threads(16, 16);
+
+  input_weights_one_dim = (float *)malloc((in + 1) * (hid + 1) * sizeof(float));
+  input_weights_prev_one_dim =
+      (float *)malloc((in + 1) * (hid + 1) * sizeof(float));
+  partial_sum = (float *)malloc(num_blocks * WIDTH * sizeof(float));
+  if (input_weights_one_dim == NULL || input_weights_prev_one_dim == NULL ||
+      partial_sum == NULL) {
+    fprintf(stderr, "bpnn_train_cuda: host allocation failed\n");
+    exit(EXIT_FAILURE);
+  }
+
+  // this preprocessing stage is added to correct the bugs of wrong memcopy
+  // using two-dimensional net->inputweights
+  for (int k = 0; k <= in; k++) {
+    for (int j = 0; j <= hid; j++) {
+      input_weights_one_dim[m] = net->input_weights[k][j];
+      input_weights_prev_one_dim[m] = net->input_prev_weights[k][j];
+      m++;
+    }
+  }
+
+  cudaMalloc((void **)&input_cuda, (in + 1) * sizeof(float));
+  cudaMalloc((void **)&output_hidden_cuda, (hid + 1) * sizeof(float));
+  cudaMalloc((void **)&input_hidden_cuda, (in + 1) * (hid + 1) * sizeof(float));
+  cudaMalloc((void **)&hidden_partial_sum, num_blocks * WIDTH * sizeof(float));
+
+#endif
+
+#ifdef CPU
+
+  printf("Performing CPU computation\n");
+  bpnn_layerforward(net->input_units, net->hidden_units, net->input_weights, in,
+                    hid);
+
+#endif
+
+#ifdef GPU
+
+  printf("Performing GPU computation\n");
+
+  // printf("in= %d, hid = %d, numblocks = %d\n", in, hid, num_blocks);
+
+  cudaMemcpy(input_cuda, net->input_units, (in + 1) * sizeof(float),
+             cudaMemcpyHostToDevice);
+  cudaMemcpy(input_hidden_cuda, input_weights_one_dim,
+             (in + 1) * (hid + 1) * sizeof(float), cudaMemcpyHostToDevice);
+
+  bpnn_layerforward_CUDA<<<grid, threads>>>(input_cuda, output_hidden_cuda,
+                                            input_hidden_cuda,
+                                            hidden_partial_sum, in, hid);
+
+  cudaThreadSynchronize();
+
+  cudaError_t error = cudaGetLastError();
+  if (error != cudaSuccess) {
+    printf("bpnn kernel error: %s\n", cudaGetErrorString(error));
+    exit(EXIT_FAILURE);
+  }
+
+  cudaMemcpy(partial_sum, hidden_partial_sum,
+             num_blocks * WIDTH * sizeof(float), cudaMemcpyDeviceToHost);
+
+  // Finish the reduction on the host: add up the per-block partial sums for
+  // each hidden unit, add the weight stored in row 0, and apply the logistic
+  // function.
+  for (int j = 1; j <= hid; j++) {
+    sum = 0.0;
+    for (int k = 0; k < num_blocks; k++) {
+      sum += partial_sum[k * hid + j - 1];
+    }
+    sum += net->input_weights[0][j];
+    net->hidden_units[j] = float(1.0 / (1.0 + exp(-sum)));
+  }
+#endif
+
+  bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights,
+                    hid, out);
+  bpnn_output_error(net->output_delta, net->target, net->output_units, out,
+                    &out_err);
+  bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out,
+                    net->hidden_weights, net->hidden_units, &hid_err);
+  bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid,
+                      net->hidden_weights, net->hidden_prev_weights);
+
+  // Hand the error values back to the caller; previously they were computed
+  // and then dropped.
+  if (eo != NULL)
+    *eo = out_err;
+  if (eh != NULL)
+    *eh = hid_err;
+
+#ifdef CPU
+
+  bpnn_adjust_weights(net->hidden_delta, hid, net->input_units, in,
+                      net->input_weights, net->input_prev_weights);
+
+#endif
+
+#ifdef GPU
+
+  cudaMalloc((void **)&hidden_delta_cuda, (hid + 1) * sizeof(float));
+  cudaMalloc((void **)&input_prev_weights_cuda,
+             (in + 1) * (hid + 1) * sizeof(float));
+
+  cudaMemcpy(hidden_delta_cuda, net->hidden_delta, (hid + 1) * sizeof(float),
+             cudaMemcpyHostToDevice);
+  cudaMemcpy(input_prev_weights_cuda, input_weights_prev_one_dim,
+             (in + 1) * (hid + 1) * sizeof(float), cudaMemcpyHostToDevice);
+  // The forward kernel overwrote the device weights with its intermediate
+  // values, so upload the original weights again before adjusting them.
+  cudaMemcpy(input_hidden_cuda, input_weights_one_dim,
+             (in + 1) * (hid + 1) * sizeof(float), cudaMemcpyHostToDevice);
+
+  bpnn_adjust_weights_cuda<<<grid, threads>>>(hidden_delta_cuda, hid,
+                                              input_cuda, in, input_hidden_cuda,
+                                              input_prev_weights_cuda);
+
+  // Same launch check as for the forward kernel.
+  error = cudaGetLastError();
+  if (error != cudaSuccess) {
+    printf("bpnn kernel error: %s\n", cudaGetErrorString(error));
+    exit(EXIT_FAILURE);
+  }
+
+  cudaMemcpy(net->input_units, input_cuda, (in + 1) * sizeof(float),
+             cudaMemcpyDeviceToHost);
+  cudaMemcpy(input_weights_one_dim, input_hidden_cuda,
+             (in + 1) * (hid + 1) * sizeof(float), cudaMemcpyDeviceToHost);
+
+  // Dump the adjusted weights on one line of stdout.
+  for (int i = 0; i < (in + 1) * (hid + 1); i++) {
+    printf("%f ", input_weights_one_dim[i]);
+  }
+  printf("\n");
+
+  cudaFree(input_cuda);
+  cudaFree(output_hidden_cuda);
+  cudaFree(input_hidden_cuda);
+  cudaFree(hidden_partial_sum);
+  cudaFree(input_prev_weights_cuda);
+  cudaFree(hidden_delta_cuda);
+
+  free(partial_sum);
+  free(input_weights_one_dim);
+  free(input_weights_prev_one_dim);
+
+#endif
+}
--- a/examples/backprop/backprop_cuda_kernel.cu
+++ b/examples/backprop/backprop_cuda_kernel.cu
@ -0,0 +1,96 @@
+#ifndef _BACKPROP_CUDA_KERNEL_H_
+#define _BACKPROP_CUDA_KERNEL_H_
+
+#include "backprop.h"
+#include "cuda.h"
+#include "math.h"
+#include <stdio.h>
+
+/**
+ * Forward-pass kernel for one layer.
+ *
+ * Each block stages HEIGHT input values and a HEIGHT x WIDTH tile of weights
+ * in shared memory, multiplies every weight by its input value, and then sums
+ * the tile along the ty axis with a pairwise (stride-doubling) reduction.
+ * The thread's tile entry is written back to input_hidden_cuda[index] after
+ * the reduction, and the threads with tx == 0 store row 0 of the tile (the
+ * per-column sums) into hidden_partial_sum[by * hid + ty].
+ *
+ * The reduction stride is computed with integer arithmetic; the earlier
+ * __log2f/__powf form relied on approximate float intrinsics being exact
+ * after truncation to int.
+ *
+ * NOTE(review): the final read weight_matrix[0][ty] indexes the WIDTH
+ * dimension with ty, so it assumes blockDim.y <= WIDTH - confirm against
+ * the launch configuration.
+ *
+ * @param input_cuda          input values; element HEIGHT*by + ty + 1 is read.
+ * @param output_hidden_cuda  not referenced by this kernel.
+ * @param input_hidden_cuda   flattened weights with row stride (hid + 1);
+ *                            read and overwritten at `index`.
+ * @param hidden_partial_sum  receives the per-block column sums.
+ * @param in                  not referenced by this kernel.
+ * @param hid                 used for the row stride (hid + 1) and for the
+ *                            per-block offset into hidden_partial_sum.
+ */
+__global__ void bpnn_layerforward_CUDA(float *input_cuda,
+                                       float *output_hidden_cuda,
+                                       float *input_hidden_cuda,
+                                       float *hidden_partial_sum, int in,
+                                       int hid) {
+  int by = blockIdx.y;
+  int tx = threadIdx.x;
+  int ty = threadIdx.y;
+
+  // Skip row 0 and column 0 of the flattened weight matrix.
+  int index = (hid + 1) * HEIGHT * by + (hid + 1) * ty + tx + 1 + (hid + 1);
+
+  int index_in = HEIGHT * by + ty + 1;
+
+  __shared__ float input_node[HEIGHT];
+  __shared__ float weight_matrix[HEIGHT][WIDTH];
+
+  // One thread per row loads that row's input value.
+  if (tx == 0)
+    input_node[ty] = input_cuda[index_in];
+
+  __syncthreads();
+
+  weight_matrix[ty][tx] = input_hidden_cuda[index];
+
+  __syncthreads();
+
+  weight_matrix[ty][tx] = weight_matrix[ty][tx] * input_node[ty];
+
+  __syncthreads();
+
+  // Pairwise reduction along ty: power_two takes the values 2, 4, 8, ... up
+  // to HEIGHT, the same sequence the float-based loop was meant to produce.
+  for (int power_two = 2; power_two <= HEIGHT; power_two *= 2) {
+
+    if (ty % power_two == 0)
+      weight_matrix[ty][tx] =
+          weight_matrix[ty][tx] + weight_matrix[ty + power_two / 2][tx];
+
+    __syncthreads();
+  }
+
+  input_hidden_cuda[index] = weight_matrix[ty][tx];
+
+  __syncthreads();
+
+  // tx == 0 here, so this reads row 0 of the tile: the sum of column ty.
+  if (tx == 0) {
+    hidden_partial_sum[by * hid + ty] = weight_matrix[tx][ty];
+  }
+}
+
+/**
+ * Weight-adjustment kernel.
+ *
+ * Every thread updates one weight and its stored previous change:
+ *   change = ETA * delta[tx + 1] * ly[row] + MOMENTUM * oldw[index]
+ * The threads of block row 0 with ty == 0 additionally update the entries at
+ * positions 1..blockDim.x of `w`/`oldw`, using only the delta term.
+ *
+ * `hid` supplies the row stride (hid + 1); `in` is not referenced.
+ */
+__global__ void bpnn_adjust_weights_cuda(float *delta, int hid, float *ly,
+                                         int in, float *w, float *oldw) {
+
+  const int block_row = blockIdx.y;
+  const int col = threadIdx.x;
+  const int row = threadIdx.y;
+
+  const int weight_idx =
+      (hid + 1) * HEIGHT * block_row + (hid + 1) * row + col + 1 + (hid + 1);
+  const int unit_idx = HEIGHT * block_row + row + 1;
+  const int delta_idx = col + 1;
+
+  // The change is evaluated twice on purpose: once to accumulate into w and
+  // once to store as the new previous change.
+  w[weight_idx] +=
+      ((ETA * delta[delta_idx] * ly[unit_idx]) + (MOMENTUM * oldw[weight_idx]));
+  oldw[weight_idx] =
+      ((ETA * delta[delta_idx] * ly[unit_idx]) + (MOMENTUM * oldw[weight_idx]));
+
+  __syncthreads();
+
+  // Only the first thread row of the first block row continues.
+  if (row != 0 || block_row != 0)
+    return;
+
+  w[delta_idx] += ((ETA * delta[delta_idx]) + (MOMENTUM * oldw[delta_idx]));
+  oldw[delta_idx] = ((ETA * delta[delta_idx]) + (MOMENTUM * oldw[delta_idx]));
+}
+#endif
--- a/examples/backprop/facetrain.c
+++ b/examples/backprop/facetrain.c
@ -0,0 +1,48 @@
+#include "backprop.h"
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+extern char *strcpy();
+extern void exit();
+
+int layer_size = 0;
+
+/*
+ * Creates a network with layer_size input units, 16 hidden units and 1 output
+ * unit, fills its inputs via load(), runs a single bpnn_train_cuda() pass and
+ * frees the network. Progress messages are printed to stdout.
+ *
+ * Returns 0. The return type was previously an implicit int with no value
+ * returned.
+ */
+int backprop_face() {
+  BPNN *net;
+  float out_err, hid_err;
+  net = bpnn_create(layer_size, 16, 1); // (16, 1 can not be changed)
+
+  printf("Input layer size : %d\n", layer_size);
+  load(net);
+  // entering the training kernel, only one iteration
+  printf("Starting training kernel\n");
+  bpnn_train_cuda(net, &out_err, &hid_err);
+  bpnn_free(net);
+  printf("Training done\n");
+  return 0;
+}
+
+/*
+ * Command-line driver: expects exactly one argument, the number of input
+ * elements, which must be a positive multiple of 16. Stores it in layer_size,
+ * seeds the network library with a fixed seed and runs backprop_face().
+ *
+ * Invalid usage prints a message to stderr and exits with EXIT_FAILURE
+ * (previously exit status 0, which hid the failure from calling scripts).
+ * On success the process exits with status 0; the function does not return.
+ */
+int setup(int argc, char *argv[]) {
+  int seed;
+
+  if (argc != 2) {
+    fprintf(stderr, "usage: backprop <num of input elements>\n");
+    exit(EXIT_FAILURE);
+  }
+  layer_size = atoi(argv[1]);
+  /* atoi() yields 0 for non-numeric text, and 0 would pass the % 16 check. */
+  if (layer_size <= 0) {
+    fprintf(stderr, "The number of input points must be a positive integer\n");
+    exit(EXIT_FAILURE);
+  }
+  if (layer_size % 16 != 0) {
+    fprintf(stderr, "The number of input points must be divided by 16\n");
+    exit(EXIT_FAILURE);
+  }
+
+  seed = 7;
+  bpnn_initialize(seed);
+  backprop_face();
+
+  exit(0);
+}
--- a/examples/backprop/imagenet.c
+++ b/examples/backprop/imagenet.c
@ -0,0 +1,22 @@
+#include "backprop.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+extern int layer_size;
+
+/*
+ * Fills net->input_units[1 .. layer_size] with values (float)rand() / RAND_MAX.
+ * Element 0 is left untouched.
+ *
+ * Returns 0. The function used to be an implicit-int definition without a
+ * return value, and it also read an uninitialized local (nc) to compute an
+ * unused product; both have been removed.
+ */
+int load(BPNN *net) {
+  float *units = net->input_units;
+  int i;
+
+  for (i = 0; i < layer_size; i++) {
+    units[i + 1] = (float)rand() / RAND_MAX;
+  }
+  return 0;
+}
--- a/examples/backprop/run.sh
+++ b/examples/backprop/run.sh
@ -0,0 +1,28 @@
+#!/bin/bash
+set -e
+clang -c -emit-llvm backprop.c
+clang -c -emit-llvm facetrain.c
+clang -c -emit-llvm imagenet.c
+
+llvm-as backprop_cuda-cuda-nvptx64-nvidia-cuda-sm_61.ll
+llvm-as backprop_cuda-host-x86_64-unknown-linux-gnu.ll
+../../build/compilation/kernelTranslator backprop_cuda-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
+../../build/compilation/hostTranslator backprop_cuda-host-x86_64-unknown-linux-gnu.bc host.bc
+
+llc --relocation-model=pic --filetype=obj  kernel.bc
+llc --relocation-model=pic --filetype=obj  host.bc
+llc --relocation-model=pic --filetype=obj  backprop.bc
+llc --relocation-model=pic --filetype=obj  facetrain.bc
+llc --relocation-model=pic --filetype=obj  imagenet.bc
+export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH
+g++ -Wall -L../../build/runtime  -L../../build/runtime/threadPool -o demo \
+    -fPIC -no-pie host.o kernel.o backprop.o facetrain.o imagenet.o \
+    -lc -lx86Runtime -lthreadPool -lpthread
+
+./demo 1024 > res.log
+if grep -q -e "0.173289 0.259645 0.350836" res.log; then
+    echo "Pass"
+else
+    echo "Error result"
+    exit 1
+fi
--- a/examples/bfs/bfs-cuda-nvptx64-nvidia-cuda-sm_61.ll
+++ b/examples/bfs/bfs-cuda-nvptx64-nvidia-cuda-sm_61.ll
@ -0,0 +1,307 @@
+; ModuleID = 'bfs-cuda-nvptx64-nvidia-cuda-sm_61.bc'
+source_filename = "bfs.cu"
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+%struct.__cuda_builtin_blockIdx_t = type { i8 }
+%struct.__cuda_builtin_threadIdx_t = type { i8 }
+%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
+%struct.Node = type { i32, i32 }
+
+$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
+
+$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
+
+@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
+@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
+
+; Function Attrs: convergent noinline nounwind optnone
+; Weak device-side stub for cudaMalloc: spills both arguments to stack slots
+; and returns the constant 999 without allocating anything.
+; NOTE(review): 999 is presumably the runtime's "unknown error" code - confirm.
+define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
+entry:
+  %p.addr = alloca i8**, align 8
+  %s.addr = alloca i64, align 8
+  store i8** %p, i8*** %p.addr, align 8
+  store i64 %s, i64* %s.addr, align 8
+  ret i32 999
+}
+
+; Function Attrs: convergent noinline nounwind optnone
+; Weak device-side stub for cudaFuncGetAttributes: stores its arguments to
+; stack slots, never writes through %p, and returns the constant 999.
+define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
+entry:
+  %p.addr = alloca %struct.cudaFuncAttributes*, align 8
+  %c.addr = alloca i8*, align 8
+  store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
+  store i8* %c, i8** %c.addr, align 8
+  ret i32 999
+}
+
+; Function Attrs: convergent noinline nounwind optnone
+; Weak device-side stub for cudaDeviceGetAttribute: stores its arguments to
+; stack slots, never writes through %value, and returns the constant 999.
+define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
+entry:
+  %value.addr = alloca i32*, align 8
+  %attr.addr = alloca i32, align 4
+  %device.addr = alloca i32, align 4
+  store i32* %value, i32** %value.addr, align 8
+  store i32 %attr, i32* %attr.addr, align 4
+  store i32 %device, i32* %device.addr, align 4
+  ret i32 999
+}
+
+; Function Attrs: convergent noinline nounwind optnone
+; Weak device-side stub for cudaGetDevice: stores the argument to a stack
+; slot, never writes through %device, and returns the constant 999.
+define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
+entry:
+  %device.addr = alloca i32*, align 8
+  store i32* %device, i32** %device.addr, align 8
+  ret i32 999
+}
+
+; Function Attrs: convergent noinline nounwind optnone
+; Weak device-side stub for cudaOccupancyMaxActiveBlocksPerMultiprocessor:
+; stores its arguments to stack slots, never writes through %numBlocks, and
+; returns the constant 999.
+define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
+entry:
+  %numBlocks.addr = alloca i32*, align 8
+  %func.addr = alloca i8*, align 8
+  %blockSize.addr = alloca i32, align 4
+  %dynamicSmemSize.addr = alloca i64, align 8
+  store i32* %numBlocks, i32** %numBlocks.addr, align 8
+  store i8* %func, i8** %func.addr, align 8
+  store i32 %blockSize, i32* %blockSize.addr, align 4
+  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
+  ret i32 999
+}
+
+; Function Attrs: convergent noinline nounwind optnone
+; Weak device-side stub for
+; cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags: stores its arguments
+; to stack slots, never writes through %numBlocks, and returns the constant 999.
+define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
+entry:
+  %numBlocks.addr = alloca i32*, align 8
+  %func.addr = alloca i8*, align 8
+  %blockSize.addr = alloca i32, align 4
+  %dynamicSmemSize.addr = alloca i64, align 8
+  %flags.addr = alloca i32, align 4
+  store i32* %numBlocks, i32** %numBlocks.addr, align 8
+  store i8* %func, i8** %func.addr, align 8
+  store i32 %blockSize, i32* %blockSize.addr, align 4
+  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
+  store i32 %flags, i32* %flags.addr, align 4
+  ret i32 999
+}
+
+; Function Attrs: convergent noinline nounwind optnone
+; BFS expansion kernel (registered as a kernel in !nvvm.annotations).
+; For thread id tid = ctaid.x * 512 + tid.x: if tid < no_of_nodes and
+; g_graph_mask[tid] is set, the mask entry is cleared and the node's edge range
+; [starting, starting + no_of_edges) in g_graph_edges is walked. For every
+; neighbour id that is not yet marked in g_graph_visited, g_cost[id] is set to
+; g_cost[tid] + 1 and g_updating_graph_mask[id] is set to 1.
+define dso_local void @_Z6KernelP4NodePiPbS2_S2_S1_i(%struct.Node* %g_graph_nodes, i32* %g_graph_edges, i8* %g_graph_mask, i8* %g_updating_graph_mask, i8* %g_graph_visited, i32* %g_cost, i32 %no_of_nodes) #0 {
+entry:
+  %g_graph_nodes.addr = alloca %struct.Node*, align 8
+  %g_graph_edges.addr = alloca i32*, align 8
+  %g_graph_mask.addr = alloca i8*, align 8
+  %g_updating_graph_mask.addr = alloca i8*, align 8
+  %g_graph_visited.addr = alloca i8*, align 8
+  %g_cost.addr = alloca i32*, align 8
+  %no_of_nodes.addr = alloca i32, align 4
+  %tid = alloca i32, align 4
+  %i = alloca i32, align 4
+  %id = alloca i32, align 4
+  store %struct.Node* %g_graph_nodes, %struct.Node** %g_graph_nodes.addr, align 8
+  store i32* %g_graph_edges, i32** %g_graph_edges.addr, align 8
+  store i8* %g_graph_mask, i8** %g_graph_mask.addr, align 8
+  store i8* %g_updating_graph_mask, i8** %g_updating_graph_mask.addr, align 8
+  store i8* %g_graph_visited, i8** %g_graph_visited.addr, align 8
+  store i32* %g_cost, i32** %g_cost.addr, align 8
+  store i32 %no_of_nodes, i32* %no_of_nodes.addr, align 4
+  ; tid = blockIdx.x * 512 + threadIdx.x
+  %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
+  %mul = mul i32 %call, 512
+  %call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
+  %add = add i32 %mul, %call1
+  store i32 %add, i32* %tid, align 4
+  ; Bounds check: threads with tid >= no_of_nodes do nothing.
+  %0 = load i32, i32* %tid, align 4
+  %1 = load i32, i32* %no_of_nodes.addr, align 4
+  %cmp = icmp slt i32 %0, %1
+  br i1 %cmp, label %land.lhs.true, label %if.end26
+
+; Second half of the condition: g_graph_mask[tid] must be set.
+land.lhs.true:                                    ; preds = %entry
+  %2 = load i8*, i8** %g_graph_mask.addr, align 8
+  %3 = load i32, i32* %tid, align 4
+  %idxprom = sext i32 %3 to i64
+  %arrayidx = getelementptr inbounds i8, i8* %2, i64 %idxprom
+  %4 = load i8, i8* %arrayidx, align 1
+  %tobool = trunc i8 %4 to i1
+  br i1 %tobool, label %if.then, label %if.end26
+
+; Clear g_graph_mask[tid] and initialise i = g_graph_nodes[tid].starting.
+if.then:                                          ; preds = %land.lhs.true
+  %5 = load i8*, i8** %g_graph_mask.addr, align 8
+  %6 = load i32, i32* %tid, align 4
+  %idxprom2 = sext i32 %6 to i64
+  %arrayidx3 = getelementptr inbounds i8, i8* %5, i64 %idxprom2
+  store i8 0, i8* %arrayidx3, align 1
+  %7 = load %struct.Node*, %struct.Node** %g_graph_nodes.addr, align 8
+  %8 = load i32, i32* %tid, align 4
+  %idxprom4 = sext i32 %8 to i64
+  %arrayidx5 = getelementptr inbounds %struct.Node, %struct.Node* %7, i64 %idxprom4
+  %starting = getelementptr inbounds %struct.Node, %struct.Node* %arrayidx5, i32 0, i32 0
+  %9 = load i32, i32* %starting, align 4
+  store i32 %9, i32* %i, align 4
+  br label %for.cond
+
+; Loop condition: i < no_of_edges + starting of node tid (reloaded each pass).
+for.cond:                                         ; preds = %for.inc, %if.then
+  %10 = load i32, i32* %i, align 4
+  %11 = load %struct.Node*, %struct.Node** %g_graph_nodes.addr, align 8
+  %12 = load i32, i32* %tid, align 4
+  %idxprom6 = sext i32 %12 to i64
+  %arrayidx7 = getelementptr inbounds %struct.Node, %struct.Node* %11, i64 %idxprom6
+  %no_of_edges = getelementptr inbounds %struct.Node, %struct.Node* %arrayidx7, i32 0, i32 1
+  %13 = load i32, i32* %no_of_edges, align 4
+  %14 = load %struct.Node*, %struct.Node** %g_graph_nodes.addr, align 8
+  %15 = load i32, i32* %tid, align 4
+  %idxprom8 = sext i32 %15 to i64
+  %arrayidx9 = getelementptr inbounds %struct.Node, %struct.Node* %14, i64 %idxprom8
+  %starting10 = getelementptr inbounds %struct.Node, %struct.Node* %arrayidx9, i32 0, i32 0
+  %16 = load i32, i32* %starting10, align 4
+  %add11 = add nsw i32 %13, %16
+  %cmp12 = icmp slt i32 %10, %add11
+  br i1 %cmp12, label %for.body, label %for.end
+
+; id = g_graph_edges[i]; skip the update when g_graph_visited[id] is set.
+for.body:                                         ; preds = %for.cond
+  %17 = load i32*, i32** %g_graph_edges.addr, align 8
+  %18 = load i32, i32* %i, align 4
+  %idxprom13 = sext i32 %18 to i64
+  %arrayidx14 = getelementptr inbounds i32, i32* %17, i64 %idxprom13
+  %19 = load i32, i32* %arrayidx14, align 4
+  store i32 %19, i32* %id, align 4
+  %20 = load i8*, i8** %g_graph_visited.addr, align 8
+  %21 = load i32, i32* %id, align 4
+  %idxprom15 = sext i32 %21 to i64
+  %arrayidx16 = getelementptr inbounds i8, i8* %20, i64 %idxprom15
+  %22 = load i8, i8* %arrayidx16, align 1
+  %tobool17 = trunc i8 %22 to i1
+  br i1 %tobool17, label %if.end, label %if.then18
+
+; Unvisited neighbour: g_cost[id] = g_cost[tid] + 1 and
+; g_updating_graph_mask[id] = 1.
+if.then18:                                        ; preds = %for.body
+  %23 = load i32*, i32** %g_cost.addr, align 8
+  %24 = load i32, i32* %tid, align 4
+  %idxprom19 = sext i32 %24 to i64
+  %arrayidx20 = getelementptr inbounds i32, i32* %23, i64 %idxprom19
+  %25 = load i32, i32* %arrayidx20, align 4
+  %add21 = add nsw i32 %25, 1
+  %26 = load i32*, i32** %g_cost.addr, align 8
+  %27 = load i32, i32* %id, align 4
+  %idxprom22 = sext i32 %27 to i64
+  %arrayidx23 = getelementptr inbounds i32, i32* %26, i64 %idxprom22
+  store i32 %add21, i32* %arrayidx23, align 4
+  %28 = load i8*, i8** %g_updating_graph_mask.addr, align 8
+  %29 = load i32, i32* %id, align 4
+  %idxprom24 = sext i32 %29 to i64
+  %arrayidx25 = getelementptr inbounds i8, i8* %28, i64 %idxprom24
+  store i8 1, i8* %arrayidx25, align 1
+  br label %if.end
+
+if.end:                                           ; preds = %if.then18, %for.body
+  br label %for.inc
+
+; i++
+for.inc:                                          ; preds = %if.end
+  %30 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %30, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  br label %if.end26
+
+if.end26:                                         ; preds = %for.end, %land.lhs.true, %entry
+  ret void
+}
+
+; Function Attrs: alwaysinline convergent nounwind
+; Accessor behind blockIdx.x: returns the value of the ctaid.x special
+; register intrinsic.
+define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
+entry:
+  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
+  ret i32 %0
+}
+
+; Function Attrs: alwaysinline convergent nounwind
+; Accessor behind threadIdx.x: returns the value of the tid.x special
+; register intrinsic.
+define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
+entry:
+  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  ret i32 %0
+}
+
+; Function Attrs: convergent noinline nounwind optnone
+; Second BFS kernel (registered as a kernel in !nvvm.annotations).
+; For thread id tid = ctaid.x * 512 + tid.x: if tid < no_of_nodes and
+; g_updating_graph_mask[tid] is set, then g_graph_mask[tid] and
+; g_graph_visited[tid] are set to 1, the single byte at g_over is set to 1,
+; and g_updating_graph_mask[tid] is cleared.
+define dso_local void @_Z7Kernel2PbS_S_S_i(i8* %g_graph_mask, i8* %g_updating_graph_mask, i8* %g_graph_visited, i8* %g_over, i32 %no_of_nodes) #0 {
+entry:
+  %g_graph_mask.addr = alloca i8*, align 8
+  %g_updating_graph_mask.addr = alloca i8*, align 8
+  %g_graph_visited.addr = alloca i8*, align 8
+  %g_over.addr = alloca i8*, align 8
+  %no_of_nodes.addr = alloca i32, align 4
+  %tid = alloca i32, align 4
+  store i8* %g_graph_mask, i8** %g_graph_mask.addr, align 8
+  store i8* %g_updating_graph_mask, i8** %g_updating_graph_mask.addr, align 8
+  store i8* %g_graph_visited, i8** %g_graph_visited.addr, align 8
+  store i8* %g_over, i8** %g_over.addr, align 8
+  store i32 %no_of_nodes, i32* %no_of_nodes.addr, align 4
+  ; tid = blockIdx.x * 512 + threadIdx.x
+  %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
+  %mul = mul i32 %call, 512
+  %call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
+  %add = add i32 %mul, %call1
+  store i32 %add, i32* %tid, align 4
+  ; Bounds check: threads with tid >= no_of_nodes do nothing.
+  %0 = load i32, i32* %tid, align 4
+  %1 = load i32, i32* %no_of_nodes.addr, align 4
+  %cmp = icmp slt i32 %0, %1
+  br i1 %cmp, label %land.lhs.true, label %if.end
+
+; Second half of the condition: g_updating_graph_mask[tid] must be set.
+land.lhs.true:                                    ; preds = %entry
+  %2 = load i8*, i8** %g_updating_graph_mask.addr, align 8
+  %3 = load i32, i32* %tid, align 4
+  %idxprom = sext i32 %3 to i64
+  %arrayidx = getelementptr inbounds i8, i8* %2, i64 %idxprom
+  %4 = load i8, i8* %arrayidx, align 1
+  %tobool = trunc i8 %4 to i1
+  br i1 %tobool, label %if.then, label %if.end
+
+; Set mask[tid], visited[tid] and *g_over to 1; clear updating_mask[tid].
+if.then:                                          ; preds = %land.lhs.true
+  %5 = load i8*, i8** %g_graph_mask.addr, align 8
+  %6 = load i32, i32* %tid, align 4
+  %idxprom2 = sext i32 %6 to i64
+  %arrayidx3 = getelementptr inbounds i8, i8* %5, i64 %idxprom2
+  store i8 1, i8* %arrayidx3, align 1
+  %7 = load i8*, i8** %g_graph_visited.addr, align 8
+  %8 = load i32, i32* %tid, align 4
+  %idxprom4 = sext i32 %8 to i64
+  %arrayidx5 = getelementptr inbounds i8, i8* %7, i64 %idxprom4
+  store i8 1, i8* %arrayidx5, align 1
+  %9 = load i8*, i8** %g_over.addr, align 8
+  store i8 1, i8* %9, align 1
+  %10 = load i8*, i8** %g_updating_graph_mask.addr, align 8
+  %11 = load i32, i32* %tid, align 4
+  %idxprom6 = sext i32 %11 to i64
+  %arrayidx7 = getelementptr inbounds i8, i8* %10, i64 %idxprom6
+  store i8 0, i8* %arrayidx7, align 1
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %land.lhs.true, %entry
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
+
+attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone }
+attributes #3 = { convergent nounwind }
+
+!llvm.module.flags = !{!0, !1, !2}
+!nvvm.annotations = !{!3, !4, !5, !6, !5, !7, !7, !7, !7, !8, !8, !7}
+!llvm.ident = !{!9}
+!nvvmir.version = !{!10}
+
+!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
+!1 = !{i32 1, !"wchar_size", i32 4}
+!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
+!3 = !{void (%struct.Node*, i32*, i8*, i8*, i8*, i32*, i32)* @_Z6KernelP4NodePiPbS2_S2_S1_i, !"kernel", i32 1}
+!4 = !{void (i8*, i8*, i8*, i8*, i32)* @_Z7Kernel2PbS_S_S_i, !"kernel", i32 1}
+!5 = !{null, !"align", i32 8}
+!6 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
+!7 = !{null, !"align", i32 16}
+!8 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
+!9 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
+!10 = !{i32 1, i32 4}
--- a/examples/bfs/bfs-host-x86_64-unknown-linux-gnu.ll
+++ b/examples/bfs/bfs-host-x86_64-unknown-linux-gnu.ll
--- a/examples/bfs/bfs.cu
+++ b/examples/bfs/bfs.cu
@ -0,0 +1,213 @@
+#include <cuda.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define MAX_THREADS_PER_BLOCK 512
+
+int no_of_nodes;
+int edge_list_size;
+FILE *fp;
+
+// Per-node adjacency descriptor. A node's outgoing edges occupy the contiguous
+// index range [starting, starting + no_of_edges) of the edge array, which is
+// how Kernel walks them.
+struct Node {
+  // Index of this node's first entry in the edge array.
+  int starting;
+  // Number of consecutive edge-array entries that belong to this node.
+  int no_of_edges;
+};
+
+#include "kernel.cu"
+#include "kernel2.cu"
+
+void BFSGraph(int argc, char **argv);
+
+////////////////////////////////////////////////////////////////////////////////
+// Main Program
+////////////////////////////////////////////////////////////////////////////////
+int main(int argc, char **argv) {
+  // Start from empty graph sizes; BFSGraph overwrites both from the input file.
+  edge_list_size = 0;
+  no_of_nodes = 0;
+
+  // Select device 0, then hand the command line to the BFS driver.
+  cudaSetDevice(0);
+  BFSGraph(argc, argv);
+  return 0;
+}
+
+// Prints the command-line synopsis to stderr, using argv[0] as the program
+// name. argc is accepted for symmetry with main but is not consulted.
+void Usage(int argc, char **argv) {
+  (void)argc;
+  const char *program_name = argv[0];
+  fprintf(stderr, "Usage: %s <input_file>\n", program_name);
+}
+////////////////////////////////////////////////////////////////////////////////
+// Apply BFS on a Graph using CUDA
+//
+// Reads the graph file named by argv[1], runs a level-by-level BFS from node 0
+// by alternating Kernel and Kernel2 until Kernel2 reports no newly reached
+// node, and writes one "<node>) cost:<cost>" line per node to result.txt.
+//
+// Input layout, as consumed by the fscanf calls below:
+//   <no_of_nodes>
+//   <starting> <no_of_edges>     one pair per node
+//   <source>                     read but ignored; node 0 is always the source
+//   <edge_list_size>
+//   <id> <cost>                  one pair per edge; the cost is discarded
+//
+// On a wrong argument count the usage text is printed and the process exits.
+// On an unreadable file or an invalid node/edge count the function reports the
+// problem and returns without launching any kernel.
+////////////////////////////////////////////////////////////////////////////////
+void BFSGraph(int argc, char **argv) {
+
+  char *input_f;
+  if (argc != 2) {
+    Usage(argc, argv);
+    exit(0);
+  }
+
+  input_f = argv[1];
+  printf("Reading File\n");
+  // Read in Graph from a file
+  fp = fopen(input_f, "r");
+  if (!fp) {
+    printf("Error Reading graph file\n");
+    return;
+  }
+
+  int source = 0;
+
+  // A missing or non-positive node count would make the allocations below
+  // zero-sized (or wrap around) and turn the h_graph_mask[source] write into
+  // an out-of-bounds access, so reject it before allocating anything.
+  if (fscanf(fp, "%d", &no_of_nodes) != 1 || no_of_nodes <= 0) {
+    fprintf(stderr, "Invalid node count in graph file\n");
+    fclose(fp);
+    return;
+  }
+
+  int num_of_blocks = 1;
+  int num_of_threads_per_block = no_of_nodes;
+
+  // Make execution Parameters according to the number of nodes
+  // Distribute threads across multiple Blocks if necessary
+  if (no_of_nodes > MAX_THREADS_PER_BLOCK) {
+    num_of_blocks = (int)ceil(no_of_nodes / (double)MAX_THREADS_PER_BLOCK);
+    num_of_threads_per_block = MAX_THREADS_PER_BLOCK;
+  }
+
+  // allocate host memory
+  Node *h_graph_nodes = (Node *)malloc(sizeof(Node) * no_of_nodes);
+  bool *h_graph_mask = (bool *)malloc(sizeof(bool) * no_of_nodes);
+  bool *h_updating_graph_mask = (bool *)malloc(sizeof(bool) * no_of_nodes);
+  bool *h_graph_visited = (bool *)malloc(sizeof(bool) * no_of_nodes);
+
+  // Zero-initialised so a truncated file yields defined (empty) node entries
+  // instead of reading indeterminate values.
+  int start = 0, edgeno = 0;
+  // initialize the memory
+  for (int i = 0; i < no_of_nodes; i++) {
+    fscanf(fp, "%d %d", &start, &edgeno);
+    h_graph_nodes[i].starting = start;
+    h_graph_nodes[i].no_of_edges = edgeno;
+    h_graph_mask[i] = false;
+    h_updating_graph_mask[i] = false;
+    h_graph_visited[i] = false;
+  }
+
+  // The source node stored in the file is consumed to keep the read position
+  // correct, but the traversal always starts from node 0.
+  fscanf(fp, "%d", &source);
+  source = 0;
+
+  // set the source node as true in the mask
+  h_graph_mask[source] = true;
+  h_graph_visited[source] = true;
+
+  // A negative edge count would wrap the allocation size below.
+  if (fscanf(fp, "%d", &edge_list_size) != 1 || edge_list_size < 0) {
+    fprintf(stderr, "Invalid edge count in graph file\n");
+    fclose(fp);
+    free(h_graph_nodes);
+    free(h_graph_mask);
+    free(h_updating_graph_mask);
+    free(h_graph_visited);
+    return;
+  }
+
+  // Each edge record is "<destination id> <cost>"; only the id is kept.
+  int id = 0, cost = 0;
+  int *h_graph_edges = (int *)malloc(sizeof(int) * edge_list_size);
+  for (int i = 0; i < edge_list_size; i++) {
+    fscanf(fp, "%d", &id);
+    fscanf(fp, "%d", &cost);
+    h_graph_edges[i] = id;
+  }
+
+  fclose(fp);
+
+  printf("Read File\n");
+
+  // Copy the Node list to device memory
+  Node *d_graph_nodes;
+  cudaMalloc((void **)&d_graph_nodes, sizeof(Node) * no_of_nodes);
+  cudaMemcpy(d_graph_nodes, h_graph_nodes, sizeof(Node) * no_of_nodes,
+             cudaMemcpyHostToDevice);
+
+  // Copy the Edge List to device Memory
+  int *d_graph_edges;
+  cudaMalloc((void **)&d_graph_edges, sizeof(int) * edge_list_size);
+  cudaMemcpy(d_graph_edges, h_graph_edges, sizeof(int) * edge_list_size,
+             cudaMemcpyHostToDevice);
+
+  // Copy the Mask to device memory
+  bool *d_graph_mask;
+  cudaMalloc((void **)&d_graph_mask, sizeof(bool) * no_of_nodes);
+  cudaMemcpy(d_graph_mask, h_graph_mask, sizeof(bool) * no_of_nodes,
+             cudaMemcpyHostToDevice);
+
+  bool *d_updating_graph_mask;
+  cudaMalloc((void **)&d_updating_graph_mask, sizeof(bool) * no_of_nodes);
+  cudaMemcpy(d_updating_graph_mask, h_updating_graph_mask,
+             sizeof(bool) * no_of_nodes, cudaMemcpyHostToDevice);
+
+  // Copy the Visited nodes array to device memory
+  bool *d_graph_visited;
+  cudaMalloc((void **)&d_graph_visited, sizeof(bool) * no_of_nodes);
+  cudaMemcpy(d_graph_visited, h_graph_visited, sizeof(bool) * no_of_nodes,
+             cudaMemcpyHostToDevice);
+
+  // allocate mem for the result on host side; -1 marks "not reached"
+  int *h_cost = (int *)malloc(sizeof(int) * no_of_nodes);
+  for (int i = 0; i < no_of_nodes; i++)
+    h_cost[i] = -1;
+  h_cost[source] = 0;
+
+  // allocate device memory for result
+  int *d_cost;
+  cudaMalloc((void **)&d_cost, sizeof(int) * no_of_nodes);
+  cudaMemcpy(d_cost, h_cost, sizeof(int) * no_of_nodes, cudaMemcpyHostToDevice);
+
+  // device-side flag that Kernel2 sets when another iteration is needed
+  bool *d_over;
+  cudaMalloc((void **)&d_over, sizeof(bool));
+
+  printf("Copied Everything to GPU memory\n");
+
+  // setup execution parameters
+  dim3 grid(num_of_blocks, 1, 1);
+  dim3 threads(num_of_threads_per_block, 1, 1);
+
+  int k = 0;
+  printf("Start traversing the tree\n");
+  bool stop;
+  // Keep launching the kernel pair until an iteration reaches no new node
+  do {
+    // if no thread changes this value then the loop stops
+    stop = false;
+    cudaMemcpy(d_over, &stop, sizeof(bool), cudaMemcpyHostToDevice);
+
+    Kernel<<<grid, threads, 0>>>(d_graph_nodes, d_graph_edges, d_graph_mask,
+                                 d_updating_graph_mask, d_graph_visited, d_cost,
+                                 no_of_nodes);
+    // Kernel2 consumes the updating mask written by Kernel, so wait for it.
+    cudaDeviceSynchronize();
+
+    Kernel2<<<grid, threads, 0>>>(d_graph_mask, d_updating_graph_mask,
+                                  d_graph_visited, d_over, no_of_nodes);
+    // The flag copied back below is written by Kernel2, so wait for it.
+    cudaDeviceSynchronize();
+
+    cudaMemcpy(&stop, d_over, sizeof(bool), cudaMemcpyDeviceToHost);
+
+    k++;
+  } while (stop);
+
+  printf("Kernel Executed %d times\n", k);
+
+  // copy result from device to host
+  cudaMemcpy(h_cost, d_cost, sizeof(int) * no_of_nodes, cudaMemcpyDeviceToHost);
+
+  // Store the result into a file; fopen can fail (e.g. read-only directory),
+  // in which case writing through the NULL stream would crash.
+  FILE *fpo = fopen("result.txt", "w");
+  if (fpo) {
+    for (int i = 0; i < no_of_nodes; i++)
+      fprintf(fpo, "%d) cost:%d\n", i, h_cost[i]);
+    fclose(fpo);
+    printf("Result stored in result.txt\n");
+  } else {
+    fprintf(stderr, "Error opening result.txt for writing\n");
+  }
+
+  // cleanup memory
+  free(h_graph_nodes);
+  free(h_graph_edges);
+  free(h_graph_mask);
+  free(h_updating_graph_mask);
+  free(h_graph_visited);
+  free(h_cost);
+
+  cudaFree(d_graph_nodes);
+  cudaFree(d_graph_edges);
+  cudaFree(d_graph_mask);
+  cudaFree(d_updating_graph_mask);
+  cudaFree(d_graph_visited);
+  cudaFree(d_cost);
+  cudaFree(d_over);
+}
--- a/examples/bfs/kernel.cu
+++ b/examples/bfs/kernel.cu
@ -0,0 +1,23 @@
+#ifndef _KERNEL_H_
+#define _KERNEL_H_
+
+// BFS frontier-expansion step, one thread per node.
+// A thread whose node is flagged in g_graph_mask clears that flag, then for
+// every not-yet-visited neighbour sets the neighbour's cost to its own cost + 1
+// and flags the neighbour in g_updating_graph_mask.
+// The global index is built from MAX_THREADS_PER_BLOCK rather than blockDim.x.
+__global__ void
+Kernel( Node* g_graph_nodes, int* g_graph_edges, bool* g_graph_mask, bool* g_updating_graph_mask, bool *g_graph_visited, int* g_cost, int no_of_nodes)
+{
+	const int tid = blockIdx.x*MAX_THREADS_PER_BLOCK + threadIdx.x;
+
+	// Nothing to do for threads past the last node or for nodes outside the frontier.
+	if( tid>=no_of_nodes || !g_graph_mask[tid] )
+		return;
+
+	g_graph_mask[tid]=false;
+
+	// Half-open range of this node's entries in the edge array.
+	const int first_edge = g_graph_nodes[tid].starting;
+	const int edge_end = g_graph_nodes[tid].no_of_edges + first_edge;
+
+	for(int e=first_edge; e<edge_end; e++)
+	{
+		const int neighbour = g_graph_edges[e];
+		if(g_graph_visited[neighbour])
+			continue;
+
+		g_cost[neighbour]=g_cost[tid]+1;
+		g_updating_graph_mask[neighbour]=true;
+	}
+}
+
+#endif
--- a/examples/bfs/kernel2.cu
+++ b/examples/bfs/kernel2.cu
@ -0,0 +1,18 @@
+#ifndef _KERNEL2_H_
+#define _KERNEL2_H_
+
+// BFS frontier-commit step, one thread per node.
+// A node flagged in g_updating_graph_mask is moved into the frontier
+// (g_graph_mask), marked visited, and has its updating flag cleared. *g_over is
+// set to true so the host knows at least one node was reached this iteration.
+__global__ void
+Kernel2( bool* g_graph_mask, bool *g_updating_graph_mask, bool* g_graph_visited, bool *g_over, int no_of_nodes)
+{
+	const int tid = blockIdx.x*MAX_THREADS_PER_BLOCK + threadIdx.x;
+
+	// Skip threads past the last node and nodes that were not reached.
+	if( tid>=no_of_nodes || !g_updating_graph_mask[tid] )
+		return;
+
+	g_graph_mask[tid]=true;
+	g_graph_visited[tid]=true;
+	*g_over=true;
+	g_updating_graph_mask[tid]=false;
+}
+
+#endif
--- a/examples/bfs/run.sh
+++ b/examples/bfs/run.sh
@ -0,0 +1,21 @@
+#!/bin/bash
+# Builds the BFS example from its pre-generated LLVM IR, runs it, and
+# smoke-checks the output. Paths are relative, so run it from this directory.
+
+# Abort on the first failing command.
+set -e
+
+# Assemble the textual IR (.ll) for the device and host sides into bitcode (.bc).
+llvm-as bfs-cuda-nvptx64-nvidia-cuda-sm_61.ll
+llvm-as bfs-host-x86_64-unknown-linux-gnu.ll
+
+# Run the project's translators: device bitcode -> kernel.bc, host bitcode -> host.bc.
+../../build/compilation/kernelTranslator bfs-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
+../../build/compilation/hostTranslator bfs-host-x86_64-unknown-linux-gnu.bc host.bc
+
+# Lower both translated modules to position-independent object files.
+llc --relocation-model=pic --filetype=obj  kernel.bc
+llc --relocation-model=pic --filetype=obj  host.bc
+
+# Link against the x86 runtime and thread-pool libraries from the build tree.
+g++ -Wall -L../../build/runtime  -L../../build/runtime/threadPool \
+    -o bfs.out -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread
+
+# Add the runtime library directories to the loader search path, then run on the 65536-node graph.
+export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH
+./bfs.out ../../rodinia-data/bfs/graph65536.txt
+
+# Smoke check only: the source node (0) must be reported with cost 0.
+if grep -q "0) cost:0" result.txt; then
+    echo "Pass"
+else
+    echo "Error result"
+    exit 1
+fi
--- a/examples/btree/common.h
+++ b/examples/btree/common.h
@ -0,0 +1,343 @@
+// # ifdef __cplusplus
+// extern "C" {
+// # endif
+
+// #ifndef LIST_H
+// # define LIST_H
+
+//===============================================================================================================================================================================================================200
+//	DEFINE/INCLUDE
+//===============================================================================================================================================================================================================200
+
+//======================================================================================================================================================150
+//	INCLUDE (for some reason these are not recognized when defined in main
+// file before this one is included)
+//======================================================================================================================================================150
+
+#include <stdbool.h> // (in path known to compiler)			needed by true/false, bool
+#include <stdint.h>  // (in path known to compiler)			needed by uint32_t
+#include <stdlib.h>  // (in path known to compiler)			needed by malloc
+
+//======================================================================================================================================================150
+//	DEFINE
+//======================================================================================================================================================150
+
+// fp expands to float.
+// NOTE(review): presumably a single switch for the floating-point type; no use
+// of it is visible in this header, so confirm against the sources that use it.
+#define fp float
+
+// Version string of this benchmark code.
+#define Version "1.5"
+
+// When WINDOWS is defined, bool/true/false are (re)defined as plain macros.
+// Note that <stdbool.h> is still included unconditionally above.
+#ifdef WINDOWS
+#define bool char
+#define false 0
+#define true 1
+#endif
+
+/* #define DEFAULT_ORDER 256 */
+
+// DEFAULT_ORDER sizes the indices[] and keys[] arrays of struct knode
+// (DEFAULT_ORDER + 1 entries each). It is taken from the first of
+// RD_WG_SIZE_0_0, RD_WG_SIZE_0 or RD_WG_SIZE that is defined on the compiler
+// command line, and falls back to 256 when none is.
+#ifdef RD_WG_SIZE_0_0
+#define DEFAULT_ORDER RD_WG_SIZE_0_0
+#elif defined(RD_WG_SIZE_0)
+#define DEFAULT_ORDER RD_WG_SIZE_0
+#elif defined(RD_WG_SIZE)
+#define DEFAULT_ORDER RD_WG_SIZE
+#else
+#define DEFAULT_ORDER 256
+#endif
+
+/* #ifdef RD_WG_SIZE_1_0 */
+/*         #define  DEFAULT_ORDER_2 RD_WG_SIZE_1_0 */
+/* #elif defined(RD_WG_SIZE_1) */
+/*         #define  DEFAULT_ORDER_2 RD_WG_SIZE_1 */
+/* #elif defined(RD_WG_SIZE) */
+/*         #define  DEFAULT_ORDER_2 RD_WG_SIZE */
+/* #else */
+/*         #define  DEFAULT_ORDER_2 256 */
+/* #endif */
+
+/* #define DEFAULT_ORDER 508 */
+
+// Checked replacement for malloc: every malloc(size) call written after this
+// point evaluates to the allocated pointer, or, if the underlying malloc
+// returns NULL, prints "Allocation failed at <file>:<line>!" to stderr and
+// terminates the process with exit(-1). Callers therefore never see NULL.
+//
+// - The inner malloc is the real library function: a macro name is not
+//   expanded again inside its own replacement list.
+// - The ({ ... }) form is a statement expression, a GNU C extension accepted
+//   by GCC and Clang; the value of the whole expression is the final `_tmp;`.
+// - This header includes <stdlib.h> but not <stdio.h>, so any file that uses
+//   malloc through this macro must make fprintf and stderr visible itself.
+#define malloc(size)                                                           \
+  ({                                                                           \
+    void *_tmp;                                                                \
+                                                                               \
+    if (!(_tmp = malloc(size))) {                                              \
+      fprintf(stderr, "Allocation failed at %s:%d!\n", __FILE__, __LINE__);    \
+      exit(-1);                                                                \
+    }                                                                          \
+                                                                               \
+    _tmp;                                                                      \
+  })
+
+//======================================================================================================================================================150
+//	STRUCTURES
+//======================================================================================================================================================150
+
+// struct list_item;
+typedef struct list_item list_item_t;
+
+// List header: end pointers, an item count, and the two callbacks that
+// list_init() accepts as parameters.
+// NOTE(review): the list implementation is not part of this header; the field
+// descriptions below follow the declarations only.
+typedef struct list_t {
+  // First and last items of the list.
+  list_item_t *head, *tail;
+  // Number of items; presumably what list_get_length() reports (TODO confirm).
+  uint32_t length;
+  // Comparison callback taking a key and the datum to compare it with.
+  int32_t (*compare)(const void *key, const void *with);
+  // Callback applied to a datum; presumably releases it (TODO confirm).
+  void (*datum_delete)(void *);
+} list_t;
+
+typedef list_item_t *list_iterator_t;
+typedef list_item_t *list_reverse_iterator_t;
+
+/* Type representing the record
+ * to which a given key refers.
+ * In a real B+ tree system, the
+ * record would hold data (in a database)
+ * or a file (in an operating system)
+ * or some other information.
+ * Users can rewrite this part of the code
+ * to change the type and content
+ * of the value field.
+ */
+typedef struct record {
+  // Payload stored for a key; the lookup kernel copies this field into its
+  // answer array when it finds a matching key.
+  int value;
+} record;
+
+/* Type representing a node in the B+ tree.
+ * This type is general enough to serve for both
+ * the leaf and the internal node.
+ * The heart of the node is the array
+ * of keys and the array of corresponding
+ * pointers.  The relation between keys
+ * and pointers differs between leaves and
+ * internal nodes.  In a leaf, the index
+ * of each key equals the index of its corresponding
+ * pointer, with a maximum of order - 1 key-pointer
+ * pairs.  The last pointer points to the
+ * leaf to the right (or NULL in the case
+ * of the rightmost leaf).
+ * In an internal node, the first pointer
+ * refers to lower nodes with keys less than
+ * the smallest key in the keys array.  Then,
+ * with indices i starting at 0, the pointer
+ * at i + 1 points to the subtree with keys
+ * greater than or equal to the key in this
+ * node at index i.
+ * The num_keys field is used to keep
+ * track of the number of valid keys.
+ * In an internal node, the number of valid
+ * pointers is always num_keys + 1.
+ * In a leaf, the number of valid pointers
+ * to data is always num_keys.  The
+ * last leaf pointer points to the next leaf.
+ */
+typedef struct node {
+  // Child nodes (internal node) or records (leaf); in a leaf the last pointer
+  // links to the next leaf, as described above.
+  void **pointers;
+  // Keys of this node; num_keys of them are valid.
+  int *keys;
+  // Parent node (presumably NULL for the root; TODO confirm).
+  struct node *parent;
+  // True for a leaf, false for an internal node.
+  bool is_leaf;
+  // Number of valid keys currently stored in keys.
+  int num_keys;
+  struct node *next; // Used for queue.
+} node;
+
+// Fixed-size, pointer-free tree node in the layout the GPU kernels index
+// (knodesD[n].keys[...] / knodesD[n].indices[...]). Links are array indices
+// rather than pointers.
+typedef struct knode {
+  // NOTE(review): presumably this node's own position in the knode array;
+  // it is not read by the kernels in this directory. Confirm in the host code.
+  int location;
+  // For a key slot i: the index of the child knode to descend into, or in the
+  // final (leaf) level the index of the matching record.
+  int indices[DEFAULT_ORDER + 1];
+  // Keys of this node. The kernels read keys[i] and keys[i + 1] together to
+  // test whether a search key falls in slot i, hence DEFAULT_ORDER + 1 entries.
+  int keys[DEFAULT_ORDER + 1];
+  // True for a leaf, false for an internal node.
+  bool is_leaf;
+  // Number of valid keys.
+  int num_keys;
+} knode;
+
+// One element of a list_t: links to its neighbouring items plus an opaque
+// payload pointer.
+struct list_item {
+  // Neighbouring items (pred = predecessor, next = successor).
+  struct list_item *pred, *next;
+  // Caller-supplied payload; the list stores it as an untyped pointer.
+  void *datum;
+};
+
+//===============================================================================================================================================================================================================200
+//	PROTOTYPES
+//===============================================================================================================================================================================================================200
+
+//======================================================================================================================================================150
+// Other
+//======================================================================================================================================================150
+
+void list_item_init(list_item_t *li, void *datum);
+
+void list_item_delete(list_item_t *li, void (*datum_delete)(void *datum));
+
+void list_insert_item_tail(list_t *l, list_item_t *i);
+
+void list_insert_item_before(list_t *l, list_item_t *next, list_item_t *i);
+
+void list_insert_item_after(list_t *l, list_item_t *pred, list_item_t *i);
+
+void list_insert_item_sorted(list_t *l, list_item_t *i);
+
+//======================================================================================================================================================150
+// List operations
+//======================================================================================================================================================150
+
+void list_init(list_t *l, int32_t (*compare)(const void *key, const void *with),
+               void (*datum_delete)(void *datum));
+
+void list_delete(list_t *l);
+
+void list_reset(list_t *l);
+
+void list_insert_head(list_t *l, void *v);
+
+void list_insert_tail(list_t *l, void *v);
+
+void list_insert_before(list_t *l, list_item_t *next, void *v);
+
+void list_insert_after(list_t *l, list_item_t *pred, void *v);
+
+void list_insert_sorted(list_t *l, void *v);
+
+void list_insert_item_head(list_t *l, list_item_t *i);
+
+void list_remove_item(list_t *l, list_item_t *i);
+
+void list_remove_head(list_t *l);
+
+void list_remove_tail(list_t *l);
+
+list_item_t *list_find_item(list_t *l, void *datum);
+
+list_item_t *list_get_head_item(list_t *l);
+
+list_item_t *list_get_tail_item(list_t *l);
+
+void *list_find(list_t *l, void *datum);
+
+void *list_get_head(list_t *l);
+
+void *list_get_tail(list_t *l);
+
+uint32_t list_get_length(list_t *l);
+
+bool list_is_empty(list_t *l);
+
+bool list_not_empty(list_t *l);
+
+void list_visit_items(list_t *l, void (*visitor)(void *v));
+
+void *list_item_get_datum(list_item_t *li);
+
+void list_iterator_init(list_t *l, list_iterator_t *li);
+
+void list_iterator_delete(list_iterator_t *li);
+
+void list_iterator_next(list_iterator_t *li);
+
+void list_iterator_prev(list_iterator_t *li);
+
+void *list_iterator_get_datum(list_iterator_t *li);
+
+bool list_iterator_is_valid(list_iterator_t *li);
+
+void list_reverse_iterator_init(list_t *l, list_iterator_t *li);
+
+void list_reverse_iterator_delete(list_iterator_t *li);
+
+void list_reverse_iterator_next(list_iterator_t *li);
+
+void list_reverse_iterator_prev(list_iterator_t *li);
+
+void *list_reverse_iterator_get_datum(list_iterator_t *li);
+
+bool list_reverse_iterator_is_valid(list_reverse_iterator_t *li);
+
+//======================================================================================================================================================150
+// Output and utility
+//======================================================================================================================================================150
+
+void *kmalloc(int size);
+
+long transform_to_cuda(node *n,
+                       bool verbose); // returns actual mem used in a long
+
+void usage_1(void);
+
+void usage_2(void);
+
+void enqueue(node *new_node);
+
+node *dequeue(void);
+
+int height(node *root);
+
+int path_to_root(node *root, node *child);
+
+void print_leaves(node *root);
+
+void print_tree(node *root);
+
+node *find_leaf(node *root, int key, bool verbose);
+
+record *find(node *root, int key, bool verbose);
+
+int cut(int length);
+
+//======================================================================================================================================================150
+// Insertion
+//======================================================================================================================================================150
+
+record *make_record(int value);
+
+node *make_node(void);
+
+node *make_leaf(void);
+
+int get_left_index(node *parent, node *left);
+
+node *insert_into_leaf(node *leaf, int key, record *pointer);
+
+node *insert_into_leaf_after_splitting(node *root, node *leaf, int key,
+                                       record *pointer);
+
+node *insert_into_node(node *root, node *parent, int left_index, int key,
+                       node *right);
+
+node *insert_into_node_after_splitting(node *root, node *parent, int left_index,
+                                       int key, node *right);
+
+node *insert_into_parent(node *root, node *left, int key, node *right);
+
+node *insert_into_new_root(node *left, int key, node *right);
+
+node *start_new_tree(int key, record *pointer);
+
+node *insert(node *root, int key, int value);
+
+//======================================================================================================================================================150
+// Deletion
+//======================================================================================================================================================150
+
+int get_neighbor_index(node *n);
+
+node *adjust_root(node *root);
+
+node *coalesce_nodes(node *root, node *n, node *neighbor, int neighbor_index,
+                     int k_prime);
+
+node *redistribute_nodes(node *root, node *n, node *neighbor,
+                         int neighbor_index, int k_prime_index, int k_prime);
+
+node *delete_entry(node *root, node *n, int key, void *pointer);
+
+node *deleteVal(node *root, int key);
+
+//===============================================================================================================================================================================================================200
+//	HEADER
+//===============================================================================================================================================================================================================200
+
+// int main(	int argc,
+// char *argv []);
+
+//===============================================================================================================================================================================================================200
+//	END
+//===============================================================================================================================================================================================================200
+
+// #endif
+
+// # ifdef __cplusplus
+// }
+// # endif
--- a/examples/btree/kernel/kernel_gpu_cuda.cu
+++ b/examples/btree/kernel/kernel_gpu_cuda.cu
@ -0,0 +1,54 @@
+//========================================================================================================================================================================================================200
+//	findK function
+//========================================================================================================================================================================================================200
+
+// Point lookup in the flattened tree: block `bid` answers the query key
+// keysD[bid], and thread `thid` of that block inspects key slot `thid` of the
+// node currently being visited.
+//
+//   height       number of tree levels to descend
+//   knodesD      array of tree nodes
+//   knodes_elem  number of entries in knodesD (upper bound for child indices)
+//   recordsD     array of records addressed by the leaf indices
+//   currKnodeD   per-block index of the node being visited (in/out)
+//   offsetD      per-block index of the node to visit next (in/out)
+//   keysD        per-block query key
+//   ansD         per-block result; written only when the key is found
+//
+// The kernel reads keys[thid + 1], so the launch must not use more threads per
+// block than the keys array can serve (DEFAULT_ORDER + 1 entries).
+__global__ void
+findK(	long height,
+		knode *knodesD,
+		long knodes_elem,
+		record *recordsD,
+
+		long *currKnodeD,
+		long *offsetD,
+		int *keysD,
+		record *ansD)
+{
+
+	// private thread IDs
+	int thid = threadIdx.x;
+	int bid = blockIdx.x;
+
+	// process tree levels, one per iteration
+	int i;
+	for(i = 0; i < height; i++){
+
+		// if the query key falls in slot thid, i.e. keys[thid] <= key < keys[thid+1]
+		if((knodesD[currKnodeD[bid]].keys[thid]) <= keysD[bid] && (knodesD[currKnodeD[bid]].keys[thid+1] > keysD[bid])){
+			// This bounds check was added to avoid a crash caused by a bug in the original code:
+			// the child index stored in knodes->indices by the host code can lie outside knodes[],
+			// and using it as offset[bid] in the next iteration would cause a segmentation fault.
+			// NOTE(review): this reads the node through offsetD[bid], whereas the sibling kernel findRangeK
+			// reads it through currKnodeD[bid]. The two agree from the second level on (thread 0 copies
+			// offsetD into currKnodeD below); for the first level it relies on the host initialising both
+			// arrays identically -- confirm in the wrapper.
+			if(knodesD[offsetD[bid]].indices[thid] < knodes_elem){
+				offsetD[bid] = knodesD[offsetD[bid]].indices[thid];
+			}
+		}
+		// all slot checks of this level must finish before the next node is published
+		__syncthreads();
+
+		// set for next tree level
+		if(thid==0){
+			currKnodeD[bid] = offsetD[bid];
+		}
+		// make the new current node visible to every thread of the block
+		__syncthreads();
+
+	}
+
+	//At this point, we have a candidate leaf node which may contain
+	//the target record.  Check each key to hopefully find the record
+	if(knodesD[currKnodeD[bid]].keys[thid] == keysD[bid]){
+		ansD[bid].value = recordsD[knodesD[currKnodeD[bid]].indices[thid]].value;
+	}
+
+}
+
+//========================================================================================================================================================================================================200
+//	End
+//========================================================================================================================================================================================================200
--- a/examples/btree/kernel/kernel_gpu_cuda_2.cu
+++ b/examples/btree/kernel/kernel_gpu_cuda_2.cu
@ -0,0 +1,70 @@
+//========================================================================================================================================================================================================200
+//	findRangeK function
+//========================================================================================================================================================================================================200
+
+// Range lookup in the flattened tree: block `bid` resolves the range
+// [startD[bid], endD[bid]] by descending the tree twice in lock-step, once for
+// each endpoint, with thread `thid` inspecting key slot `thid` of both nodes.
+//
+//   height       number of tree levels to descend
+//   knodesD      array of tree nodes
+//   knodes_elem  number of entries in knodesD (upper bound for child indices)
+//   currKnodeD   per-block node index on the path to the start key (in/out)
+//   offsetD      per-block next node index on the start path (in/out)
+//   lastKnodeD   per-block node index on the path to the end key (in/out)
+//   offset_2D    per-block next node index on the end path (in/out)
+//   startD       per-block first key of the range
+//   endD         per-block last key of the range
+//   RecstartD    per-block output: index stored for the start key
+//   ReclenD      per-block output: (index stored for the end key) - RecstartD + 1
+//
+// Both outputs are written only when the corresponding key is present in the
+// node reached; otherwise they keep whatever the host stored in them.
+__global__ void
+findRangeK(	long height,
+
+			knode *knodesD,
+			long knodes_elem,
+
+			long *currKnodeD,
+			long *offsetD,
+			long *lastKnodeD,
+			long *offset_2D,
+			int *startD,
+			int *endD,
+			int *RecstartD,
+			int *ReclenD)
+{
+
+	// private thread IDs
+	int thid = threadIdx.x;
+	int bid = blockIdx.x;
+
+	// descend one tree level per iteration, following both range endpoints
+	int i;
+	for(i = 0; i < height; i++){
+
+		// start key falls in slot thid of the current start-path node
+		if((knodesD[currKnodeD[bid]].keys[thid] <= startD[bid]) && (knodesD[currKnodeD[bid]].keys[thid+1] > startD[bid])){
+			// This bounds check was added to avoid a crash caused by a bug in the original code:
+			// the child index stored in knodes->indices by the host code can lie outside knodes[],
+			// and using it as offset[bid] to address knodes later would cause a segmentation fault.
+			if(knodesD[currKnodeD[bid]].indices[thid] < knodes_elem){
+				offsetD[bid] = knodesD[currKnodeD[bid]].indices[thid];
+			}
+		}
+		// end key falls in slot thid of the current end-path node
+		if((knodesD[lastKnodeD[bid]].keys[thid] <= endD[bid]) && (knodesD[lastKnodeD[bid]].keys[thid+1] > endD[bid])){
+			// Same bounds check as above, for offset_2[bid] on the end path.
+			if(knodesD[lastKnodeD[bid]].indices[thid] < knodes_elem){
+				offset_2D[bid] = knodesD[lastKnodeD[bid]].indices[thid];
+			}
+		}
+		// all slot checks of this level must finish before the next nodes are published
+		__syncthreads();
+
+		// set for next tree level
+		if(thid==0){
+			currKnodeD[bid] = offsetD[bid];
+			lastKnodeD[bid] = offset_2D[bid];
+		}
+		// make the new current nodes visible to every thread of the block
+		__syncthreads();
+	}
+
+	// Find the index of the starting record
+	if(knodesD[currKnodeD[bid]].keys[thid] == startD[bid]){
+		RecstartD[bid] = knodesD[currKnodeD[bid]].indices[thid];
+	}
+	// RecstartD[bid] may be written by a different thread than the one that reads it below
+	__syncthreads();
+
+	// Find the index of the ending record
+	if(knodesD[lastKnodeD[bid]].keys[thid] == endD[bid]){
+		ReclenD[bid] = knodesD[lastKnodeD[bid]].indices[thid] - RecstartD[bid]+1;
+	}
+
+}
+
+//========================================================================================================================================================================================================200
+//	End
+//========================================================================================================================================================================================================200
--- a/examples/btree/kernel/kernel_gpu_cuda_wrapper.cu
+++ b/examples/btree/kernel/kernel_gpu_cuda_wrapper.cu
@ -0,0 +1,292 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//========================================================================================================================================================================================================200
+//	DEFINE/INCLUDE
+//========================================================================================================================================================================================================200
+
+//======================================================================================================================================================150
+//	COMMON
+//======================================================================================================================================================150
+
+#include "../common.h"								// (in main program directory)			needed to recognized input variables
+
+//======================================================================================================================================================150
+//	UTILITIES
+//======================================================================================================================================================150
+
+#include "../util/cuda/cuda.h"					// (in path specified to compiler)	needed by for device functions
+#include "../util/timer/timer.h"					// (in path specified to compiler)	needed by timer
+
+//======================================================================================================================================================150
+//	KERNEL
+//======================================================================================================================================================150
+
+#include "./kernel_gpu_cuda.cu"						// (in current directory)	GPU kernel, cannot include with header file because of complications with passing of constant memory variables
+
+//======================================================================================================================================================150
+//	HEADER
+//======================================================================================================================================================150
+
+#include "./kernel_gpu_cuda_wrapper.h"				// (in current directory)
+
+//========================================================================================================================================================================================================200
+//	KERNEL_GPU_CUDA_WRAPPER FUNCTION
+//========================================================================================================================================================================================================200
+
+//	kernel_gpu_cuda_wrapper: host-side driver for the findK kernel.
+//
+//	Allocates a device copy of every input, uploads the inputs, launches findK on a grid of
+//	`count` blocks with min(order, 1024) threads each, downloads the answers into `ans`,
+//	releases the device buffers and prints a per-stage timing breakdown to stdout.
+//
+//	records / records_mem		host record array and its size in bytes
+//	knodes / knodes_mem			host knode array and its size in bytes
+//	knodes_elem					forwarded unchanged to findK
+//	order						requested number of threads per block (capped at 1024)
+//	maxheight					forwarded unchanged to findK
+//	count						number of blocks launched; also the element count of currKnode, offset, keys and ans
+//	currKnode, offset, keys		host input arrays, `count` elements each
+//	ans							host in/out array of `count` records; overwritten with the device result
+//
+//	Every CUDA runtime call is followed by checkCUDAError() (defined elsewhere); nothing is returned.
+void
+kernel_gpu_cuda_wrapper(record *records,
+						long records_mem,
+						knode *knodes,
+						long knodes_elem,
+						long knodes_mem,
+
+						int order,
+						long maxheight,
+						int count,
+
+						long *currKnode,
+						long *offset,
+						int *keys,
+						record *ans)
+{
+
+	//======================================================================================================================================================150
+	//	GPU SETUP
+	//======================================================================================================================================================150
+
+	// time0..time6 bracket the stages reported at the end of this function
+	const long long time0 = get_time();
+
+	// absorb the initial driver overhead so that it is not charged to the allocation stage
+	// NOTE(review): cudaThreadSynchronize() is deprecated in the CUDA runtime in favour of cudaDeviceSynchronize();
+	// it is kept here because the runtime this example links against is not visible from this file
+	cudaThreadSynchronize();
+
+	// execution parameters: one block per element of the per-block arrays, at most 1024 threads per block
+	const int numBlocks = count;						// max # of blocks can be 65,535
+	const int threadsPerBlock = order < 1024 ? order : 1024;
+
+	printf("# of blocks = %d, # of threads/block = %d (ensure that device can handle)\n", numBlocks, threadsPerBlock);
+
+	const long long time1 = get_time();
+
+	//======================================================================================================================================================150
+	//	GPU MEMORY				(MALLOC)
+	//======================================================================================================================================================150
+
+	// device inputs
+	record *recordsD;
+	cudaMalloc((void**)&recordsD, records_mem);
+	checkCUDAError("cudaMalloc  recordsD");
+
+	knode *knodesD;
+	cudaMalloc((void**)&knodesD, knodes_mem);
+	checkCUDAError("cudaMalloc  knodesD");				// label used to say "recordsD" (copy-paste slip)
+
+	long *currKnodeD;
+	cudaMalloc((void**)&currKnodeD, count*sizeof(long));
+	checkCUDAError("cudaMalloc  currKnodeD");
+
+	long *offsetD;
+	cudaMalloc((void**)&offsetD, count*sizeof(long));
+	checkCUDAError("cudaMalloc  offsetD");
+
+	int *keysD;
+	cudaMalloc((void**)&keysD, count*sizeof(int));
+	checkCUDAError("cudaMalloc  keysD");
+
+	// device in/out
+	record *ansD;
+	cudaMalloc((void**)&ansD, count*sizeof(record));
+	checkCUDAError("cudaMalloc ansD");
+
+	const long long time2 = get_time();
+
+	//======================================================================================================================================================150
+	//	GPU MEMORY			COPY IN
+	//======================================================================================================================================================150
+
+	// the two labels below used to be the identical string "... memD", which made a failure impossible to attribute
+	cudaMemcpy(recordsD, records, records_mem, cudaMemcpyHostToDevice);
+	checkCUDAError("cudaMemcpy recordsD");
+
+	cudaMemcpy(knodesD, knodes, knodes_mem, cudaMemcpyHostToDevice);
+	checkCUDAError("cudaMemcpy knodesD");
+
+	cudaMemcpy(currKnodeD, currKnode, count*sizeof(long), cudaMemcpyHostToDevice);
+	checkCUDAError("cudaMalloc cudaMemcpy currKnodeD");
+
+	cudaMemcpy(offsetD, offset, count*sizeof(long), cudaMemcpyHostToDevice);
+	checkCUDAError("cudaMalloc cudaMemcpy offsetD");
+
+	cudaMemcpy(keysD, keys, count*sizeof(int), cudaMemcpyHostToDevice);
+	checkCUDAError("cudaMalloc cudaMemcpy keysD");
+
+	// ans is uploaded as well, so its host contents are what the device buffer holds before findK runs
+	cudaMemcpy(ansD, ans, count*sizeof(record), cudaMemcpyHostToDevice);
+	checkCUDAError("cudaMalloc cudaMemcpy ansD");
+
+	const long long time3 = get_time();
+
+	//======================================================================================================================================================150
+	// findK kernel
+	//======================================================================================================================================================150
+
+	findK<<<numBlocks, threadsPerBlock>>>(	maxheight,
+
+											knodesD,
+											knodes_elem,
+
+											recordsD,
+
+											currKnodeD,
+											offsetD,
+											keysD,
+											ansD);
+	// wait for the kernel so that time4 covers its execution and checkCUDAError sees its status
+	cudaThreadSynchronize();
+	checkCUDAError("findK");
+
+	const long long time4 = get_time();
+
+	//======================================================================================================================================================150
+	//	GPU MEMORY			COPY OUT
+	//======================================================================================================================================================150
+
+	cudaMemcpy(ans, ansD, count*sizeof(record), cudaMemcpyDeviceToHost);
+	checkCUDAError("cudaMemcpy ansD");
+
+	const long long time5 = get_time();
+
+	//======================================================================================================================================================150
+	//	GPU MEMORY DEALLOCATION
+	//======================================================================================================================================================150
+
+	cudaFree(recordsD);
+	cudaFree(knodesD);
+
+	cudaFree(currKnodeD);
+	cudaFree(offsetD);
+	cudaFree(keysD);
+	cudaFree(ansD);
+
+	const long long time6 = get_time();
+
+	//======================================================================================================================================================150
+	//	DISPLAY TIMING
+	//======================================================================================================================================================150
+
+	// NOTE(review): the divisor 1000000 presumably converts microseconds from get_time() to seconds -- confirm against util/timer
+	const float total = (float) (time6-time0);
+
+	printf("Time spent in different stages of GPU_CUDA KERNEL:\n");
+
+	// "%%" prints a literal percent sign; the former "% :" was an invalid conversion specification (undefined behaviour in printf)
+	printf("%15.12f s, %15.12f %% : GPU: SET DEVICE / DRIVER INIT\n",	(float) (time1-time0) / 1000000, (float) (time1-time0) / total * 100);
+	printf("%15.12f s, %15.12f %% : GPU MEM: ALO\n", 					(float) (time2-time1) / 1000000, (float) (time2-time1) / total * 100);
+	printf("%15.12f s, %15.12f %% : GPU MEM: COPY IN\n",				(float) (time3-time2) / 1000000, (float) (time3-time2) / total * 100);
+
+	printf("%15.12f s, %15.12f %% : GPU: KERNEL\n",						(float) (time4-time3) / 1000000, (float) (time4-time3) / total * 100);
+
+	printf("%15.12f s, %15.12f %% : GPU MEM: COPY OUT\n",				(float) (time5-time4) / 1000000, (float) (time5-time4) / total * 100);
+	printf("%15.12f s, %15.12f %% : GPU MEM: FRE\n", 					(float) (time6-time5) / 1000000, (float) (time6-time5) / total * 100);
+
+	printf("Total time:\n");
+	printf("%.12f s\n", 												total / 1000000);
+
+//========================================================================================================================================================================================================200
+//	End
+//========================================================================================================================================================================================================200
+
+}
+
+//========================================================================================================================================================================================================200
+//	END
+//========================================================================================================================================================================================================200
+
+#ifdef __cplusplus
+}
+#endif
--- a/examples/btree/kernel/kernel_gpu_cuda_wrapper.h
+++ b/examples/btree/kernel/kernel_gpu_cuda_wrapper.h
@ -0,0 +1,23 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//========================================================================================================================================================================================================200
+//	KERNEL_GPU_CUDA_WRAPPER HEADER
+//========================================================================================================================================================================================================200
+
+// Host-side driver for the findK kernel: uploads the inputs, launches findK on
+// `count` blocks of min(order, 1024) threads, and copies the result back into `ans`.
+void kernel_gpu_cuda_wrapper(
+    record *records,   // host record array
+    long records_mem,  // size of `records` in bytes
+    knode *knodes,     // host knode array
+    long knodes_elem,  // forwarded unchanged to findK
+    long knodes_mem,   // size of `knodes` in bytes
+
+    int order,         // requested threads per block (capped at 1024)
+    long maxheight,    // forwarded unchanged to findK
+    int count,         // number of blocks; element count of the arrays below
+
+    long *currKnode,   // host input, `count` elements
+    long *offset,      // host input, `count` elements
+    int *keys,         // host input, `count` elements
+    record *ans);      // host in/out, `count` records
+
+//========================================================================================================================================================================================================200
+//	End
+//========================================================================================================================================================================================================200
+
+#ifdef __cplusplus
+}
+#endif
--- a/examples/btree/kernel/kernel_gpu_cuda_wrapper_2.cu
+++ b/examples/btree/kernel/kernel_gpu_cuda_wrapper_2.cu
@ -0,0 +1,347 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//========================================================================================================================================================================================================200
+//	INCLUDE
+//========================================================================================================================================================================================================200
+
+//======================================================================================================================================================150
+//	COMMON
+//======================================================================================================================================================150
+
+#include "../common.h"									// (in the main program folder)	needed to recognized input parameters
+
+//======================================================================================================================================================150
+//	UTILITIES
+//======================================================================================================================================================150
+
+#include "../util/cuda/cuda.h"							// (in library path specified to compiler)	needed by for device functions
+#include "../util/timer/timer.h"						// (in library path specified to compiler)	needed by timer
+
+//======================================================================================================================================================150
+//	KERNEL
+//======================================================================================================================================================150
+
+#include "./kernel_gpu_cuda_2.cu"						// (in the current directory)	GPU kernel, cannot include with header file because of complications with passing of constant memory variables
+
+//======================================================================================================================================================150
+//	HEADER
+//======================================================================================================================================================150
+
+#include "./kernel_gpu_cuda_wrapper_2.h"				// (in the current directory)
+
+//========================================================================================================================================================================================================200
+//	FUNCTION
+//========================================================================================================================================================================================================200
+
+//	kernel_gpu_cuda_wrapper_2: host-side driver for the findRangeK kernel.
+//
+//	Allocates a device copy of every input, uploads the inputs, launches findRangeK on a grid of
+//	`count` blocks with min(order, 1024) threads each, downloads the two result arrays into
+//	`recstart` and `reclength`, releases the device buffers and prints a per-stage timing
+//	breakdown to stdout.
+//
+//	knodes / knodes_mem						host knode array and its size in bytes
+//	knodes_elem								forwarded unchanged to findRangeK
+//	order									requested number of threads per block (capped at 1024)
+//	maxheight								forwarded unchanged to findRangeK
+//	count									number of blocks launched; also the element count of every array below
+//	currKnode, offset, lastKnode, offset_2	host input arrays of long, `count` elements each
+//	start, end								host input arrays of int, `count` elements each
+//	recstart, reclength						host in/out arrays of int, `count` elements each; overwritten with the device result
+//
+//	Every CUDA runtime call is followed by checkCUDAError() (defined elsewhere); nothing is returned.
+void
+kernel_gpu_cuda_wrapper_2(	knode *knodes,
+							long knodes_elem,
+							long knodes_mem,
+
+							int order,
+							long maxheight,
+							int count,
+
+							long *currKnode,
+							long *offset,
+							long *lastKnode,
+							long *offset_2,
+							int *start,
+							int *end,
+							int *recstart,
+							int *reclength)
+{
+
+	//======================================================================================================================================================150
+	//	GPU SETUP
+	//======================================================================================================================================================150
+
+	// time0..time6 bracket the stages reported at the end of this function
+	const long long time0 = get_time();
+
+	// absorb the initial driver overhead so that it is not charged to the allocation stage
+	// NOTE(review): cudaThreadSynchronize() is deprecated in the CUDA runtime in favour of cudaDeviceSynchronize();
+	// it is kept here because the runtime this example links against is not visible from this file
+	cudaThreadSynchronize();
+
+	// execution parameters: one block per element of the per-block arrays, at most 1024 threads per block
+	const int numBlocks = count;
+	const int threadsPerBlock = order < 1024 ? order : 1024;
+
+	printf("# of blocks = %d, # of threads/block = %d (ensure that device can handle)\n", numBlocks, threadsPerBlock);
+
+	const long long time1 = get_time();
+
+	//======================================================================================================================================================150
+	//	GPU MEMORY				MALLOC
+	//======================================================================================================================================================150
+
+	// device inputs
+	knode *knodesD;
+	cudaMalloc((void**)&knodesD, knodes_mem);
+	checkCUDAError("cudaMalloc  knodesD");				// label used to say "recordsD" (copy-paste slip)
+
+	long *currKnodeD;
+	cudaMalloc((void**)&currKnodeD, count*sizeof(long));
+	checkCUDAError("cudaMalloc  currKnodeD");
+
+	long *offsetD;
+	cudaMalloc((void**)&offsetD, count*sizeof(long));
+	checkCUDAError("cudaMalloc  offsetD");
+
+	long *lastKnodeD;
+	cudaMalloc((void**)&lastKnodeD, count*sizeof(long));
+	checkCUDAError("cudaMalloc  lastKnodeD");
+
+	long *offset_2D;
+	cudaMalloc((void**)&offset_2D, count*sizeof(long));
+	checkCUDAError("cudaMalloc  offset_2D");
+
+	int *startD;
+	cudaMalloc((void**)&startD, count*sizeof(int));
+	checkCUDAError("cudaMalloc startD");
+
+	int *endD;
+	cudaMalloc((void**)&endD, count*sizeof(int));
+	checkCUDAError("cudaMalloc endD");
+
+	// device in/out
+	int *ansDStart;
+	cudaMalloc((void**)&ansDStart, count*sizeof(int));
+	checkCUDAError("cudaMalloc ansDStart");
+
+	int *ansDLength;
+	cudaMalloc((void**)&ansDLength, count*sizeof(int));
+	checkCUDAError("cudaMalloc ansDLength");
+
+	const long long time2 = get_time();
+
+	//======================================================================================================================================================150
+	//	GPU MEMORY			COPY IN
+	//======================================================================================================================================================150
+
+	cudaMemcpy(knodesD, knodes, knodes_mem, cudaMemcpyHostToDevice);
+	checkCUDAError("cudaMemcpy knodesD");				// label used to name a non-existent buffer "memD"
+
+	cudaMemcpy(currKnodeD, currKnode, count*sizeof(long), cudaMemcpyHostToDevice);
+	checkCUDAError("cudaMalloc cudaMemcpy currKnodeD");
+
+	cudaMemcpy(offsetD, offset, count*sizeof(long), cudaMemcpyHostToDevice);
+	checkCUDAError("cudaMalloc cudaMemcpy offsetD");
+
+	cudaMemcpy(lastKnodeD, lastKnode, count*sizeof(long), cudaMemcpyHostToDevice);
+	checkCUDAError("cudaMalloc cudaMemcpy lastKnodeD");
+
+	cudaMemcpy(offset_2D, offset_2, count*sizeof(long), cudaMemcpyHostToDevice);
+	checkCUDAError("cudaMalloc cudaMemcpy offset_2D");
+
+	cudaMemcpy(startD, start, count*sizeof(int), cudaMemcpyHostToDevice);
+	checkCUDAError("cudaMemcpy startD");
+
+	cudaMemcpy(endD, end, count*sizeof(int), cudaMemcpyHostToDevice);
+	checkCUDAError("cudaMemcpy endD");
+
+	// the result arrays are uploaded as well, so their host contents are what the device buffers hold before findRangeK runs
+	cudaMemcpy(ansDStart, recstart, count*sizeof(int), cudaMemcpyHostToDevice);
+	checkCUDAError("cudaMemcpy ansDStart");
+
+	cudaMemcpy(ansDLength, reclength, count*sizeof(int), cudaMemcpyHostToDevice);
+	checkCUDAError("cudaMemcpy ansDLength");
+
+	const long long time3 = get_time();
+
+	//======================================================================================================================================================150
+	//	KERNEL
+	//======================================================================================================================================================150
+
+	// [GPU] findRangeK kernel
+	findRangeK<<<numBlocks, threadsPerBlock>>>(	maxheight,
+												knodesD,
+												knodes_elem,
+
+												currKnodeD,
+												offsetD,
+												lastKnodeD,
+												offset_2D,
+												startD,
+												endD,
+												ansDStart,
+												ansDLength);
+	// wait for the kernel so that time4 covers its execution and checkCUDAError sees its status
+	cudaThreadSynchronize();
+	checkCUDAError("findRangeK");
+
+	const long long time4 = get_time();
+
+	//======================================================================================================================================================150
+	//	GPU MEMORY			COPY OUT
+	//======================================================================================================================================================150
+
+	cudaMemcpy(recstart, ansDStart, count*sizeof(int), cudaMemcpyDeviceToHost);
+	checkCUDAError("cudaMemcpy ansDStart");
+
+	cudaMemcpy(reclength, ansDLength, count*sizeof(int), cudaMemcpyDeviceToHost);
+	checkCUDAError("cudaMemcpy ansDLength");
+
+	const long long time5 = get_time();
+
+	//======================================================================================================================================================150
+	//	GPU MEMORY DEALLOCATION
+	//======================================================================================================================================================150
+
+	cudaFree(knodesD);
+
+	cudaFree(currKnodeD);
+	cudaFree(offsetD);
+	cudaFree(lastKnodeD);
+	cudaFree(offset_2D);
+	cudaFree(startD);
+	cudaFree(endD);
+	cudaFree(ansDStart);
+	cudaFree(ansDLength);
+
+	const long long time6 = get_time();
+
+	//======================================================================================================================================================150
+	//	DISPLAY TIMING
+	//======================================================================================================================================================150
+
+	// NOTE(review): the divisor 1000000 presumably converts microseconds from get_time() to seconds -- confirm against util/timer
+	const float total = (float) (time6-time0);
+
+	printf("Time spent in different stages of GPU_CUDA KERNEL:\n");
+
+	// "%%" prints a literal percent sign; the former "% :" was an invalid conversion specification (undefined behaviour in printf)
+	printf("%15.12f s, %15.12f %% : GPU: SET DEVICE / DRIVER INIT\n",	(float) (time1-time0) / 1000000, (float) (time1-time0) / total * 100);
+	printf("%15.12f s, %15.12f %% : GPU MEM: ALO\n", 					(float) (time2-time1) / 1000000, (float) (time2-time1) / total * 100);
+	printf("%15.12f s, %15.12f %% : GPU MEM: COPY IN\n",				(float) (time3-time2) / 1000000, (float) (time3-time2) / total * 100);
+
+	printf("%15.12f s, %15.12f %% : GPU: KERNEL\n",						(float) (time4-time3) / 1000000, (float) (time4-time3) / total * 100);
+
+	printf("%15.12f s, %15.12f %% : GPU MEM: COPY OUT\n",				(float) (time5-time4) / 1000000, (float) (time5-time4) / total * 100);
+	printf("%15.12f s, %15.12f %% : GPU MEM: FRE\n", 					(float) (time6-time5) / 1000000, (float) (time6-time5) / total * 100);
+
+	printf("Total time:\n");
+	printf("%.12f s\n", 												total / 1000000);
+
+}
+
+//========================================================================================================================================================================================================200
+//	END
+//========================================================================================================================================================================================================200
+
+#ifdef __cplusplus
+}
+#endif
--- a/examples/btree/kernel/kernel_gpu_cuda_wrapper_2.h
+++ b/examples/btree/kernel/kernel_gpu_cuda_wrapper_2.h
@ -0,0 +1,23 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//========================================================================================================================================================================================================200
+//	KERNEL_GPU_CUDA_WRAPPER_2 HEADER
+//========================================================================================================================================================================================================200
+
+// Host-side driver for the findRangeK kernel: uploads the inputs, launches
+// findRangeK on `count` blocks of min(order, 1024) threads, and copies the
+// results back into `recstart` and `reclength`.
+void kernel_gpu_cuda_wrapper_2(
+    knode *knodes,     // host knode array
+    long knodes_elem,  // forwarded unchanged to findRangeK
+    long knodes_mem,   // size of `knodes` in bytes
+
+    int order,         // requested threads per block (capped at 1024)
+    long maxheight,    // forwarded unchanged to findRangeK
+    int count,         // number of blocks; element count of the arrays below
+
+    long *currKnode,   // host input, `count` elements
+    long *offset,      // host input, `count` elements
+    long *lastKnode,   // host input, `count` elements
+    long *offset_2,    // host input, `count` elements
+    int *start,        // host input, `count` elements
+    int *end,          // host input, `count` elements
+    int *recstart,     // host in/out, `count` elements
+    int *reclength);   // host in/out, `count` elements
+
+//========================================================================================================================================================================================================200
+//	End
+//========================================================================================================================================================================================================200
+
+#ifdef __cplusplus
+}
+#endif
--- a/examples/btree/kernel_gpu_cuda_wrapper-cuda-nvptx64-nvidia-cuda-sm_61.ll
+++ b/examples/btree/kernel_gpu_cuda_wrapper-cuda-nvptx64-nvidia-cuda-sm_61.ll
@ -0,0 +1,332 @@
+; ModuleID = 'kernel_gpu_cuda_wrapper-cuda-nvptx64-nvidia-cuda-sm_61.bc'
+source_filename = "kernel/kernel_gpu_cuda_wrapper.cu"
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+%struct.__cuda_builtin_threadIdx_t = type { i8 }
+%struct.__cuda_builtin_blockIdx_t = type { i8 }
+%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
+%struct.knode = type { i32, [257 x i32], [257 x i32], i8, i32 }
+%struct.record = type { i32 }
+
+$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
+
+$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
+
+@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
+@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
+
+; Weak stub for cudaMalloc in this NVPTX module: it only spills %p and %s to
+; stack slots and returns the constant 999; nothing is allocated or written
+; through %p.
+; NOTE(review): 999 presumably corresponds to cudaErrorUnknown -- confirm
+; against the CUDA runtime headers.
+; Function Attrs: convergent noinline nounwind optnone
+define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
+entry:
+  %p.addr = alloca i8**, align 8
+  %s.addr = alloca i64, align 8
+  store i8** %p, i8*** %p.addr, align 8
+  store i64 %s, i64* %s.addr, align 8
+  ret i32 999
+}
+
+; Weak stub for cudaFuncGetAttributes: spills %p and %c to stack slots and
+; returns the constant 999; the cudaFuncAttributes struct behind %p is never
+; written.
+; Function Attrs: convergent noinline nounwind optnone
+define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
+entry:
+  %p.addr = alloca %struct.cudaFuncAttributes*, align 8
+  %c.addr = alloca i8*, align 8
+  store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
+  store i8* %c, i8** %c.addr, align 8
+  ret i32 999
+}
+
+; Weak stub for cudaDeviceGetAttribute: spills its three arguments to stack
+; slots and returns the constant 999; nothing is stored through %value.
+; Function Attrs: convergent noinline nounwind optnone
+define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
+entry:
+  %value.addr = alloca i32*, align 8
+  %attr.addr = alloca i32, align 4
+  %device.addr = alloca i32, align 4
+  store i32* %value, i32** %value.addr, align 8
+  store i32 %attr, i32* %attr.addr, align 4
+  store i32 %device, i32* %device.addr, align 4
+  ret i32 999
+}
+
+; Weak stub for cudaGetDevice: spills %device to a stack slot and returns the
+; constant 999; nothing is stored through %device.
+; Function Attrs: convergent noinline nounwind optnone
+define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
+entry:
+  %device.addr = alloca i32*, align 8
+  store i32* %device, i32** %device.addr, align 8
+  ret i32 999
+}
+
+; Weak stub for cudaOccupancyMaxActiveBlocksPerMultiprocessor: spills its four
+; arguments to stack slots and returns the constant 999; nothing is stored
+; through %numBlocks.
+; Function Attrs: convergent noinline nounwind optnone
+define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
+entry:
+  %numBlocks.addr = alloca i32*, align 8
+  %func.addr = alloca i8*, align 8
+  %blockSize.addr = alloca i32, align 4
+  %dynamicSmemSize.addr = alloca i64, align 8
+  store i32* %numBlocks, i32** %numBlocks.addr, align 8
+  store i8* %func, i8** %func.addr, align 8
+  store i32 %blockSize, i32* %blockSize.addr, align 4
+  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
+  ret i32 999
+}
+
+; Weak stub for cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags: spills
+; its five arguments to stack slots and returns the constant 999; nothing is
+; stored through %numBlocks.
+; Function Attrs: convergent noinline nounwind optnone
+define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
+entry:
+  %numBlocks.addr = alloca i32*, align 8
+  %func.addr = alloca i8*, align 8
+  %blockSize.addr = alloca i32, align 4
+  %dynamicSmemSize.addr = alloca i64, align 8
+  %flags.addr = alloca i32, align 4
+  store i32* %numBlocks, i32** %numBlocks.addr, align 8
+  store i8* %func, i8** %func.addr, align 8
+  store i32 %blockSize, i32* %blockSize.addr, align 4
+  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
+  store i32 %flags, i32* %flags.addr, align 4
+  ret i32 999
+}
+
+; Kernel findK (registered as a kernel by the !nvvm.annotations entry !3).
+; Unoptimized (optnone) code: every argument and local lives in an alloca.
+; Field names below follow the IR value names: in %struct.knode, field 1 is
+; "indices" and field 2 is "keys"; in %struct.record, field 0 is "value".
+;
+; Per thread, with thid = tid.x and bid = ctaid.x:
+;   for (i = 0; i < height; i++) {
+;     if (knodesD[currKnodeD[bid]].keys[thid]     <= keysD[bid] &&
+;         knodesD[currKnodeD[bid]].keys[thid + 1] >  keysD[bid])
+;       if (knodesD[offsetD[bid]].indices[thid] < knodes_elem)
+;         offsetD[bid] = knodesD[offsetD[bid]].indices[thid];
+;     barrier0();
+;     if (thid == 0) currKnodeD[bid] = offsetD[bid];
+;     barrier0();
+;   }
+;   if (knodesD[currKnodeD[bid]].keys[thid] == keysD[bid])
+;     ansD[bid].value = recordsD[knodesD[currKnodeD[bid]].indices[thid]].value;
+; Function Attrs: convergent noinline nounwind optnone
+define dso_local void @findK(i64 %height, %struct.knode* %knodesD, i64 %knodes_elem, %struct.record* %recordsD, i64* %currKnodeD, i64* %offsetD, i32* %keysD, %struct.record* %ansD) #0 {
+entry:
+  %height.addr = alloca i64, align 8
+  %knodesD.addr = alloca %struct.knode*, align 8
+  %knodes_elem.addr = alloca i64, align 8
+  %recordsD.addr = alloca %struct.record*, align 8
+  %currKnodeD.addr = alloca i64*, align 8
+  %offsetD.addr = alloca i64*, align 8
+  %keysD.addr = alloca i32*, align 8
+  %ansD.addr = alloca %struct.record*, align 8
+  %thid = alloca i32, align 4
+  %bid = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i64 %height, i64* %height.addr, align 8
+  store %struct.knode* %knodesD, %struct.knode** %knodesD.addr, align 8
+  store i64 %knodes_elem, i64* %knodes_elem.addr, align 8
+  store %struct.record* %recordsD, %struct.record** %recordsD.addr, align 8
+  store i64* %currKnodeD, i64** %currKnodeD.addr, align 8
+  store i64* %offsetD, i64** %offsetD.addr, align 8
+  store i32* %keysD, i32** %keysD.addr, align 8
+  store %struct.record* %ansD, %struct.record** %ansD.addr, align 8
+  %call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
+  store i32 %call, i32* %thid, align 4
+  %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
+  store i32 %call1, i32* %bid, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+; Loop header: continue while sext(i) < height.
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %conv = sext i32 %0 to i64
+  %1 = load i64, i64* %height.addr, align 8
+  %cmp = icmp slt i64 %conv, %1
+  br i1 %cmp, label %for.body, label %for.end
+
+; First half of the range test: knodesD[currKnodeD[bid]].keys[thid] <= keysD[bid].
+for.body:                                         ; preds = %for.cond
+  %2 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
+  %3 = load i64*, i64** %currKnodeD.addr, align 8
+  %4 = load i32, i32* %bid, align 4
+  %idxprom = sext i32 %4 to i64
+  %arrayidx = getelementptr inbounds i64, i64* %3, i64 %idxprom
+  %5 = load i64, i64* %arrayidx, align 8
+  %arrayidx2 = getelementptr inbounds %struct.knode, %struct.knode* %2, i64 %5
+  %keys = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx2, i32 0, i32 2
+  %6 = load i32, i32* %thid, align 4
+  %idxprom3 = sext i32 %6 to i64
+  %arrayidx4 = getelementptr inbounds [257 x i32], [257 x i32]* %keys, i64 0, i64 %idxprom3
+  %7 = load i32, i32* %arrayidx4, align 4
+  %8 = load i32*, i32** %keysD.addr, align 8
+  %9 = load i32, i32* %bid, align 4
+  %idxprom5 = sext i32 %9 to i64
+  %arrayidx6 = getelementptr inbounds i32, i32* %8, i64 %idxprom5
+  %10 = load i32, i32* %arrayidx6, align 4
+  %cmp7 = icmp sle i32 %7, %10
+  br i1 %cmp7, label %land.lhs.true, label %if.end34
+
+; Second half of the range test: knodesD[currKnodeD[bid]].keys[thid + 1] > keysD[bid].
+land.lhs.true:                                    ; preds = %for.body
+  %11 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
+  %12 = load i64*, i64** %currKnodeD.addr, align 8
+  %13 = load i32, i32* %bid, align 4
+  %idxprom8 = sext i32 %13 to i64
+  %arrayidx9 = getelementptr inbounds i64, i64* %12, i64 %idxprom8
+  %14 = load i64, i64* %arrayidx9, align 8
+  %arrayidx10 = getelementptr inbounds %struct.knode, %struct.knode* %11, i64 %14
+  %keys11 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx10, i32 0, i32 2
+  %15 = load i32, i32* %thid, align 4
+  %add = add nsw i32 %15, 1
+  %idxprom12 = sext i32 %add to i64
+  %arrayidx13 = getelementptr inbounds [257 x i32], [257 x i32]* %keys11, i64 0, i64 %idxprom12
+  %16 = load i32, i32* %arrayidx13, align 4
+  %17 = load i32*, i32** %keysD.addr, align 8
+  %18 = load i32, i32* %bid, align 4
+  %idxprom14 = sext i32 %18 to i64
+  %arrayidx15 = getelementptr inbounds i32, i32* %17, i64 %idxprom14
+  %19 = load i32, i32* %arrayidx15, align 4
+  %cmp16 = icmp sgt i32 %16, %19
+  br i1 %cmp16, label %if.then, label %if.end34
+
+; Bounds check: knodesD[offsetD[bid]].indices[thid] < knodes_elem.
+; NOTE(review): this block and if.then24 index knodesD through offsetD[bid],
+; whereas the key comparisons above index it through currKnodeD[bid] --
+; confirm that this is intended in the .cu source.
+if.then:                                          ; preds = %land.lhs.true
+  %20 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
+  %21 = load i64*, i64** %offsetD.addr, align 8
+  %22 = load i32, i32* %bid, align 4
+  %idxprom17 = sext i32 %22 to i64
+  %arrayidx18 = getelementptr inbounds i64, i64* %21, i64 %idxprom17
+  %23 = load i64, i64* %arrayidx18, align 8
+  %arrayidx19 = getelementptr inbounds %struct.knode, %struct.knode* %20, i64 %23
+  %indices = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx19, i32 0, i32 1
+  %24 = load i32, i32* %thid, align 4
+  %idxprom20 = sext i32 %24 to i64
+  %arrayidx21 = getelementptr inbounds [257 x i32], [257 x i32]* %indices, i64 0, i64 %idxprom20
+  %25 = load i32, i32* %arrayidx21, align 4
+  %conv22 = sext i32 %25 to i64
+  %26 = load i64, i64* %knodes_elem.addr, align 8
+  %cmp23 = icmp slt i64 %conv22, %26
+  br i1 %cmp23, label %if.then24, label %if.end
+
+; offsetD[bid] = sext(knodesD[offsetD[bid]].indices[thid]).
+if.then24:                                        ; preds = %if.then
+  %27 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
+  %28 = load i64*, i64** %offsetD.addr, align 8
+  %29 = load i32, i32* %bid, align 4
+  %idxprom25 = sext i32 %29 to i64
+  %arrayidx26 = getelementptr inbounds i64, i64* %28, i64 %idxprom25
+  %30 = load i64, i64* %arrayidx26, align 8
+  %arrayidx27 = getelementptr inbounds %struct.knode, %struct.knode* %27, i64 %30
+  %indices28 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx27, i32 0, i32 1
+  %31 = load i32, i32* %thid, align 4
+  %idxprom29 = sext i32 %31 to i64
+  %arrayidx30 = getelementptr inbounds [257 x i32], [257 x i32]* %indices28, i64 0, i64 %idxprom29
+  %32 = load i32, i32* %arrayidx30, align 4
+  %conv31 = sext i32 %32 to i64
+  %33 = load i64*, i64** %offsetD.addr, align 8
+  %34 = load i32, i32* %bid, align 4
+  %idxprom32 = sext i32 %34 to i64
+  %arrayidx33 = getelementptr inbounds i64, i64* %33, i64 %idxprom32
+  store i64 %conv31, i64* %arrayidx33, align 8
+  br label %if.end
+
+if.end:                                           ; preds = %if.then24, %if.then
+  br label %if.end34
+
+; All paths of the range test rejoin here and hit the first barrier before the
+; thid == 0 check.
+if.end34:                                         ; preds = %if.end, %land.lhs.true, %for.body
+  call void @llvm.nvvm.barrier0()
+  %35 = load i32, i32* %thid, align 4
+  %cmp35 = icmp eq i32 %35, 0
+  br i1 %cmp35, label %if.then36, label %if.end41
+
+; Thread 0 only: currKnodeD[bid] = offsetD[bid].
+if.then36:                                        ; preds = %if.end34
+  %36 = load i64*, i64** %offsetD.addr, align 8
+  %37 = load i32, i32* %bid, align 4
+  %idxprom37 = sext i32 %37 to i64
+  %arrayidx38 = getelementptr inbounds i64, i64* %36, i64 %idxprom37
+  %38 = load i64, i64* %arrayidx38, align 8
+  %39 = load i64*, i64** %currKnodeD.addr, align 8
+  %40 = load i32, i32* %bid, align 4
+  %idxprom39 = sext i32 %40 to i64
+  %arrayidx40 = getelementptr inbounds i64, i64* %39, i64 %idxprom39
+  store i64 %38, i64* %arrayidx40, align 8
+  br label %if.end41
+
+; Second barrier of the iteration, placed after thread 0's store.
+if.end41:                                         ; preds = %if.then36, %if.end34
+  call void @llvm.nvvm.barrier0()
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end41
+  %41 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %41, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+; After the loop: exact-match test knodesD[currKnodeD[bid]].keys[thid] == keysD[bid].
+for.end:                                          ; preds = %for.cond
+  %42 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
+  %43 = load i64*, i64** %currKnodeD.addr, align 8
+  %44 = load i32, i32* %bid, align 4
+  %idxprom42 = sext i32 %44 to i64
+  %arrayidx43 = getelementptr inbounds i64, i64* %43, i64 %idxprom42
+  %45 = load i64, i64* %arrayidx43, align 8
+  %arrayidx44 = getelementptr inbounds %struct.knode, %struct.knode* %42, i64 %45
+  %keys45 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx44, i32 0, i32 2
+  %46 = load i32, i32* %thid, align 4
+  %idxprom46 = sext i32 %46 to i64
+  %arrayidx47 = getelementptr inbounds [257 x i32], [257 x i32]* %keys45, i64 0, i64 %idxprom46
+  %47 = load i32, i32* %arrayidx47, align 4
+  %48 = load i32*, i32** %keysD.addr, align 8
+  %49 = load i32, i32* %bid, align 4
+  %idxprom48 = sext i32 %49 to i64
+  %arrayidx49 = getelementptr inbounds i32, i32* %48, i64 %idxprom48
+  %50 = load i32, i32* %arrayidx49, align 4
+  %cmp50 = icmp eq i32 %47, %50
+  br i1 %cmp50, label %if.then51, label %if.end63
+
+; ansD[bid].value = recordsD[knodesD[currKnodeD[bid]].indices[thid]].value.
+if.then51:                                        ; preds = %for.end
+  %51 = load %struct.record*, %struct.record** %recordsD.addr, align 8
+  %52 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
+  %53 = load i64*, i64** %currKnodeD.addr, align 8
+  %54 = load i32, i32* %bid, align 4
+  %idxprom52 = sext i32 %54 to i64
+  %arrayidx53 = getelementptr inbounds i64, i64* %53, i64 %idxprom52
+  %55 = load i64, i64* %arrayidx53, align 8
+  %arrayidx54 = getelementptr inbounds %struct.knode, %struct.knode* %52, i64 %55
+  %indices55 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx54, i32 0, i32 1
+  %56 = load i32, i32* %thid, align 4
+  %idxprom56 = sext i32 %56 to i64
+  %arrayidx57 = getelementptr inbounds [257 x i32], [257 x i32]* %indices55, i64 0, i64 %idxprom56
+  %57 = load i32, i32* %arrayidx57, align 4
+  %idxprom58 = sext i32 %57 to i64
+  %arrayidx59 = getelementptr inbounds %struct.record, %struct.record* %51, i64 %idxprom58
+  %value = getelementptr inbounds %struct.record, %struct.record* %arrayidx59, i32 0, i32 0
+  %58 = load i32, i32* %value, align 4
+  %59 = load %struct.record*, %struct.record** %ansD.addr, align 8
+  %60 = load i32, i32* %bid, align 4
+  %idxprom60 = sext i32 %60 to i64
+  %arrayidx61 = getelementptr inbounds %struct.record, %struct.record* %59, i64 %idxprom60
+  %value62 = getelementptr inbounds %struct.record, %struct.record* %arrayidx61, i32 0, i32 0
+  store i32 %58, i32* %value62, align 4
+  br label %if.end63
+
+if.end63:                                         ; preds = %if.then51, %for.end
+  ret void
+}
+
+; Accessor for the x component of the thread index: returns the result of the
+; llvm.nvvm.read.ptx.sreg.tid.x intrinsic unchanged. Marked alwaysinline
+; (attribute group #1) and emitted as linkonce_odr in a comdat.
+; Function Attrs: alwaysinline convergent nounwind
+define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
+entry:
+  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  ret i32 %0
+}
+
+; Accessor for the x component of the block index: returns the result of the
+; llvm.nvvm.read.ptx.sreg.ctaid.x intrinsic unchanged. Marked alwaysinline
+; (attribute group #1) and emitted as linkonce_odr in a comdat.
+; Function Attrs: alwaysinline convergent nounwind
+define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
+entry:
+  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
+  ret i32 %0
+}
+
+; Function Attrs: convergent nounwind
+declare void @llvm.nvvm.barrier0() #2
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #3
+
+attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { convergent nounwind }
+attributes #3 = { nounwind readnone }
+
+!llvm.module.flags = !{!0, !1, !2}
+!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
+!llvm.ident = !{!8}
+!nvvmir.version = !{!9}
+
+!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
+!1 = !{i32 1, !"wchar_size", i32 4}
+!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
+!3 = !{void (i64, %struct.knode*, i64, %struct.record*, i64*, i64*, i32*, %struct.record*)* @findK, !"kernel", i32 1}
+!4 = !{null, !"align", i32 8}
+!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
+!6 = !{null, !"align", i32 16}
+!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
+!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
+!9 = !{i32 1, i32 4}
--- a/examples/btree/kernel_gpu_cuda_wrapper-host-x86_64-unknown-linux-gnu.ll
+++ b/examples/btree/kernel_gpu_cuda_wrapper-host-x86_64-unknown-linux-gnu.ll
--- a/examples/btree/kernel_gpu_cuda_wrapper_2-cuda-nvptx64-nvidia-cuda-sm_61.ll
+++ b/examples/btree/kernel_gpu_cuda_wrapper_2-cuda-nvptx64-nvidia-cuda-sm_61.ll
@ -0,0 +1,475 @@
+; ModuleID = 'kernel_gpu_cuda_wrapper_2-cuda-nvptx64-nvidia-cuda-sm_61.bc'
+source_filename = "kernel/kernel_gpu_cuda_wrapper_2.cu"
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+%struct.__cuda_builtin_threadIdx_t = type { i8 }
+%struct.__cuda_builtin_blockIdx_t = type { i8 }
+%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
+%struct.knode = type { i32, [257 x i32], [257 x i32], i8, i32 }
+
+$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
+
+$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
+
+@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
+@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
+
+; Weak stub for cudaMalloc in this NVPTX module: it only spills %p and %s to
+; stack slots and returns the constant 999; nothing is allocated or written
+; through %p.
+; NOTE(review): 999 presumably corresponds to cudaErrorUnknown -- confirm
+; against the CUDA runtime headers.
+; Function Attrs: convergent noinline nounwind optnone
+define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
+entry:
+  %p.addr = alloca i8**, align 8
+  %s.addr = alloca i64, align 8
+  store i8** %p, i8*** %p.addr, align 8
+  store i64 %s, i64* %s.addr, align 8
+  ret i32 999
+}
+
+; Weak stub for cudaFuncGetAttributes: spills %p and %c to stack slots and
+; returns the constant 999; the cudaFuncAttributes struct behind %p is never
+; written.
+; Function Attrs: convergent noinline nounwind optnone
+define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
+entry:
+  %p.addr = alloca %struct.cudaFuncAttributes*, align 8
+  %c.addr = alloca i8*, align 8
+  store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
+  store i8* %c, i8** %c.addr, align 8
+  ret i32 999
+}
+
+; Weak stub for cudaDeviceGetAttribute: spills its three arguments to stack
+; slots and returns the constant 999; nothing is stored through %value.
+; Function Attrs: convergent noinline nounwind optnone
+define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
+entry:
+  %value.addr = alloca i32*, align 8
+  %attr.addr = alloca i32, align 4
+  %device.addr = alloca i32, align 4
+  store i32* %value, i32** %value.addr, align 8
+  store i32 %attr, i32* %attr.addr, align 4
+  store i32 %device, i32* %device.addr, align 4
+  ret i32 999
+}
+
+; Weak stub for cudaGetDevice: spills %device to a stack slot and returns the
+; constant 999; nothing is stored through %device.
+; Function Attrs: convergent noinline nounwind optnone
+define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
+entry:
+  %device.addr = alloca i32*, align 8
+  store i32* %device, i32** %device.addr, align 8
+  ret i32 999
+}
+
+; Weak stub for cudaOccupancyMaxActiveBlocksPerMultiprocessor: spills its four
+; arguments to stack slots and returns the constant 999; nothing is stored
+; through %numBlocks.
+; Function Attrs: convergent noinline nounwind optnone
+define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
+entry:
+  %numBlocks.addr = alloca i32*, align 8
+  %func.addr = alloca i8*, align 8
+  %blockSize.addr = alloca i32, align 4
+  %dynamicSmemSize.addr = alloca i64, align 8
+  store i32* %numBlocks, i32** %numBlocks.addr, align 8
+  store i8* %func, i8** %func.addr, align 8
+  store i32 %blockSize, i32* %blockSize.addr, align 4
+  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
+  ret i32 999
+}
+
+; Weak stub for cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags: spills
+; its five arguments to stack slots and returns the constant 999; nothing is
+; stored through %numBlocks.
+; Function Attrs: convergent noinline nounwind optnone
+define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
+entry:
+  %numBlocks.addr = alloca i32*, align 8
+  %func.addr = alloca i8*, align 8
+  %blockSize.addr = alloca i32, align 4
+  %dynamicSmemSize.addr = alloca i64, align 8
+  %flags.addr = alloca i32, align 4
+  store i32* %numBlocks, i32** %numBlocks.addr, align 8
+  store i8* %func, i8** %func.addr, align 8
+  store i32 %blockSize, i32* %blockSize.addr, align 4
+  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
+  store i32 %flags, i32* %flags.addr, align 4
+  ret i32 999
+}
+
+; Function Attrs: convergent noinline nounwind optnone
+define dso_local void @findRangeK(i64 %height, %struct.knode* %knodesD, i64 %knodes_elem, i64* %currKnodeD, i64* %offsetD, i64* %lastKnodeD, i64* %offset_2D, i32* %startD, i32* %endD, i32* %RecstartD, i32* %ReclenD) #0 {
+entry:
+  %height.addr = alloca i64, align 8
+  %knodesD.addr = alloca %struct.knode*, align 8
+  %knodes_elem.addr = alloca i64, align 8
+  %currKnodeD.addr = alloca i64*, align 8
+  %offsetD.addr = alloca i64*, align 8
+  %lastKnodeD.addr = alloca i64*, align 8
+  %offset_2D.addr = alloca i64*, align 8
+  %startD.addr = alloca i32*, align 8
+  %endD.addr = alloca i32*, align 8
+  %RecstartD.addr = alloca i32*, align 8
+  %ReclenD.addr = alloca i32*, align 8
+  %thid = alloca i32, align 4
+  %bid = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i64 %height, i64* %height.addr, align 8
+  store %struct.knode* %knodesD, %struct.knode** %knodesD.addr, align 8
+  store i64 %knodes_elem, i64* %knodes_elem.addr, align 8
+  store i64* %currKnodeD, i64** %currKnodeD.addr, align 8
+  store i64* %offsetD, i64** %offsetD.addr, align 8
+  store i64* %lastKnodeD, i64** %lastKnodeD.addr, align 8
+  store i64* %offset_2D, i64** %offset_2D.addr, align 8
+  store i32* %startD, i32** %startD.addr, align 8
+  store i32* %endD, i32** %endD.addr, align 8
+  store i32* %RecstartD, i32** %RecstartD.addr, align 8
+  store i32* %ReclenD, i32** %ReclenD.addr, align 8
+  %call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
+  store i32 %call, i32* %thid, align 4
+  %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
+  store i32 %call1, i32* %bid, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %conv = sext i32 %0 to i64
+  %1 = load i64, i64* %height.addr, align 8
+  %cmp = icmp slt i64 %conv, %1
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %2 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
+  %3 = load i64*, i64** %currKnodeD.addr, align 8
+  %4 = load i32, i32* %bid, align 4
+  %idxprom = sext i32 %4 to i64
+  %arrayidx = getelementptr inbounds i64, i64* %3, i64 %idxprom
+  %5 = load i64, i64* %arrayidx, align 8
+  %arrayidx2 = getelementptr inbounds %struct.knode, %struct.knode* %2, i64 %5
+  %keys = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx2, i32 0, i32 2
+  %6 = load i32, i32* %thid, align 4
+  %idxprom3 = sext i32 %6 to i64
+  %arrayidx4 = getelementptr inbounds [257 x i32], [257 x i32]* %keys, i64 0, i64 %idxprom3
+  %7 = load i32, i32* %arrayidx4, align 4
+  %8 = load i32*, i32** %startD.addr, align 8
+  %9 = load i32, i32* %bid, align 4
+  %idxprom5 = sext i32 %9 to i64
+  %arrayidx6 = getelementptr inbounds i32, i32* %8, i64 %idxprom5
+  %10 = load i32, i32* %arrayidx6, align 4
+  %cmp7 = icmp sle i32 %7, %10
+  br i1 %cmp7, label %land.lhs.true, label %if.end34
+
+land.lhs.true:                                    ; preds = %for.body
+  %11 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
+  %12 = load i64*, i64** %currKnodeD.addr, align 8
+  %13 = load i32, i32* %bid, align 4
+  %idxprom8 = sext i32 %13 to i64
+  %arrayidx9 = getelementptr inbounds i64, i64* %12, i64 %idxprom8
+  %14 = load i64, i64* %arrayidx9, align 8
+  %arrayidx10 = getelementptr inbounds %struct.knode, %struct.knode* %11, i64 %14
+  %keys11 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx10, i32 0, i32 2
+  %15 = load i32, i32* %thid, align 4
+  %add = add nsw i32 %15, 1
+  %idxprom12 = sext i32 %add to i64
+  %arrayidx13 = getelementptr inbounds [257 x i32], [257 x i32]* %keys11, i64 0, i64 %idxprom12
+  %16 = load i32, i32* %arrayidx13, align 4
+  %17 = load i32*, i32** %startD.addr, align 8
+  %18 = load i32, i32* %bid, align 4
+  %idxprom14 = sext i32 %18 to i64
+  %arrayidx15 = getelementptr inbounds i32, i32* %17, i64 %idxprom14
+  %19 = load i32, i32* %arrayidx15, align 4
+  %cmp16 = icmp sgt i32 %16, %19
+  br i1 %cmp16, label %if.then, label %if.end34
+
+if.then:                                          ; preds = %land.lhs.true
+  %20 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
+  %21 = load i64*, i64** %currKnodeD.addr, align 8
+  %22 = load i32, i32* %bid, align 4
+  %idxprom17 = sext i32 %22 to i64
+  %arrayidx18 = getelementptr inbounds i64, i64* %21, i64 %idxprom17
+  %23 = load i64, i64* %arrayidx18, align 8
+  %arrayidx19 = getelementptr inbounds %struct.knode, %struct.knode* %20, i64 %23
+  %indices = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx19, i32 0, i32 1
+  %24 = load i32, i32* %thid, align 4
+  %idxprom20 = sext i32 %24 to i64
+  %arrayidx21 = getelementptr inbounds [257 x i32], [257 x i32]* %indices, i64 0, i64 %idxprom20
+  %25 = load i32, i32* %arrayidx21, align 4
+  %conv22 = sext i32 %25 to i64
+  %26 = load i64, i64* %knodes_elem.addr, align 8
+  %cmp23 = icmp slt i64 %conv22, %26
+  br i1 %cmp23, label %if.then24, label %if.end
+
+if.then24:                                        ; preds = %if.then
+  %27 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
+  %28 = load i64*, i64** %currKnodeD.addr, align 8
+  %29 = load i32, i32* %bid, align 4
+  %idxprom25 = sext i32 %29 to i64
+  %arrayidx26 = getelementptr inbounds i64, i64* %28, i64 %idxprom25
+  %30 = load i64, i64* %arrayidx26, align 8
+  %arrayidx27 = getelementptr inbounds %struct.knode, %struct.knode* %27, i64 %30
+  %indices28 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx27, i32 0, i32 1
+  %31 = load i32, i32* %thid, align 4
+  %idxprom29 = sext i32 %31 to i64
+  %arrayidx30 = getelementptr inbounds [257 x i32], [257 x i32]* %indices28, i64 0, i64 %idxprom29
+  %32 = load i32, i32* %arrayidx30, align 4
+  %conv31 = sext i32 %32 to i64
+  %33 = load i64*, i64** %offsetD.addr, align 8
+  %34 = load i32, i32* %bid, align 4
+  %idxprom32 = sext i32 %34 to i64
+  %arrayidx33 = getelementptr inbounds i64, i64* %33, i64 %idxprom32
+  store i64 %conv31, i64* %arrayidx33, align 8
+  br label %if.end
+
+if.end:                                           ; preds = %if.then24, %if.then
+  br label %if.end34
+
+if.end34:                                         ; preds = %if.end, %land.lhs.true, %for.body
+  %35 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
+  %36 = load i64*, i64** %lastKnodeD.addr, align 8
+  %37 = load i32, i32* %bid, align 4
+  %idxprom35 = sext i32 %37 to i64
+  %arrayidx36 = getelementptr inbounds i64, i64* %36, i64 %idxprom35
+  %38 = load i64, i64* %arrayidx36, align 8
+  %arrayidx37 = getelementptr inbounds %struct.knode, %struct.knode* %35, i64 %38
+  %keys38 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx37, i32 0, i32 2
+  %39 = load i32, i32* %thid, align 4
+  %idxprom39 = sext i32 %39 to i64
+  %arrayidx40 = getelementptr inbounds [257 x i32], [257 x i32]* %keys38, i64 0, i64 %idxprom39
+  %40 = load i32, i32* %arrayidx40, align 4
+  %41 = load i32*, i32** %endD.addr, align 8
+  %42 = load i32, i32* %bid, align 4
+  %idxprom41 = sext i32 %42 to i64
+  %arrayidx42 = getelementptr inbounds i32, i32* %41, i64 %idxprom41
+  %43 = load i32, i32* %arrayidx42, align 4
+  %cmp43 = icmp sle i32 %40, %43
+  br i1 %cmp43, label %land.lhs.true44, label %if.end75
+
+land.lhs.true44:                                  ; preds = %if.end34
+  %44 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
+  %45 = load i64*, i64** %lastKnodeD.addr, align 8
+  %46 = load i32, i32* %bid, align 4
+  %idxprom45 = sext i32 %46 to i64
+  %arrayidx46 = getelementptr inbounds i64, i64* %45, i64 %idxprom45
+  %47 = load i64, i64* %arrayidx46, align 8
+  %arrayidx47 = getelementptr inbounds %struct.knode, %struct.knode* %44, i64 %47
+  %keys48 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx47, i32 0, i32 2
+  %48 = load i32, i32* %thid, align 4
+  %add49 = add nsw i32 %48, 1
+  %idxprom50 = sext i32 %add49 to i64
+  %arrayidx51 = getelementptr inbounds [257 x i32], [257 x i32]* %keys48, i64 0, i64 %idxprom50
+  %49 = load i32, i32* %arrayidx51, align 4
+  %50 = load i32*, i32** %endD.addr, align 8
+  %51 = load i32, i32* %bid, align 4
+  %idxprom52 = sext i32 %51 to i64
+  %arrayidx53 = getelementptr inbounds i32, i32* %50, i64 %idxprom52
+  %52 = load i32, i32* %arrayidx53, align 4
+  %cmp54 = icmp sgt i32 %49, %52
+  br i1 %cmp54, label %if.then55, label %if.end75
+
+if.then55:                                        ; preds = %land.lhs.true44
+  %53 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
+  %54 = load i64*, i64** %lastKnodeD.addr, align 8
+  %55 = load i32, i32* %bid, align 4
+  %idxprom56 = sext i32 %55 to i64
+  %arrayidx57 = getelementptr inbounds i64, i64* %54, i64 %idxprom56
+  %56 = load i64, i64* %arrayidx57, align 8
+  %arrayidx58 = getelementptr inbounds %struct.knode, %struct.knode* %53, i64 %56
+  %indices59 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx58, i32 0, i32 1
+  %57 = load i32, i32* %thid, align 4
+  %idxprom60 = sext i32 %57 to i64
+  %arrayidx61 = getelementptr inbounds [257 x i32], [257 x i32]* %indices59, i64 0, i64 %idxprom60
+  %58 = load i32, i32* %arrayidx61, align 4
+  %conv62 = sext i32 %58 to i64
+  %59 = load i64, i64* %knodes_elem.addr, align 8
+  %cmp63 = icmp slt i64 %conv62, %59
+  br i1 %cmp63, label %if.then64, label %if.end74
+
+if.then64:                                        ; preds = %if.then55
+  %60 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
+  %61 = load i64*, i64** %lastKnodeD.addr, align 8
+  %62 = load i32, i32* %bid, align 4
+  %idxprom65 = sext i32 %62 to i64
+  %arrayidx66 = getelementptr inbounds i64, i64* %61, i64 %idxprom65
+  %63 = load i64, i64* %arrayidx66, align 8
+  %arrayidx67 = getelementptr inbounds %struct.knode, %struct.knode* %60, i64 %63
+  %indices68 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx67, i32 0, i32 1
+  %64 = load i32, i32* %thid, align 4
+  %idxprom69 = sext i32 %64 to i64
+  %arrayidx70 = getelementptr inbounds [257 x i32], [257 x i32]* %indices68, i64 0, i64 %idxprom69
+  %65 = load i32, i32* %arrayidx70, align 4
+  %conv71 = sext i32 %65 to i64
+  %66 = load i64*, i64** %offset_2D.addr, align 8
+  %67 = load i32, i32* %bid, align 4
+  %idxprom72 = sext i32 %67 to i64
+  %arrayidx73 = getelementptr inbounds i64, i64* %66, i64 %idxprom72
+  store i64 %conv71, i64* %arrayidx73, align 8
+  br label %if.end74
+
+if.end74:                                         ; preds = %if.then64, %if.then55
+  br label %if.end75
+
+if.end75:                                         ; preds = %if.end74, %land.lhs.true44, %if.end34
+  call void @llvm.nvvm.barrier0()
+  %68 = load i32, i32* %thid, align 4
+  %cmp76 = icmp eq i32 %68, 0
+  br i1 %cmp76, label %if.then77, label %if.end86
+
+if.then77:                                        ; preds = %if.end75
+  %69 = load i64*, i64** %offsetD.addr, align 8
+  %70 = load i32, i32* %bid, align 4
+  %idxprom78 = sext i32 %70 to i64
+  %arrayidx79 = getelementptr inbounds i64, i64* %69, i64 %idxprom78
+  %71 = load i64, i64* %arrayidx79, align 8
+  %72 = load i64*, i64** %currKnodeD.addr, align 8
+  %73 = load i32, i32* %bid, align 4
+  %idxprom80 = sext i32 %73 to i64
+  %arrayidx81 = getelementptr inbounds i64, i64* %72, i64 %idxprom80
+  store i64 %71, i64* %arrayidx81, align 8
+  %74 = load i64*, i64** %offset_2D.addr, align 8
+  %75 = load i32, i32* %bid, align 4
+  %idxprom82 = sext i32 %75 to i64
+  %arrayidx83 = getelementptr inbounds i64, i64* %74, i64 %idxprom82
+  %76 = load i64, i64* %arrayidx83, align 8
+  %77 = load i64*, i64** %lastKnodeD.addr, align 8
+  %78 = load i32, i32* %bid, align 4
+  %idxprom84 = sext i32 %78 to i64
+  %arrayidx85 = getelementptr inbounds i64, i64* %77, i64 %idxprom84
+  store i64 %76, i64* %arrayidx85, align 8
+  br label %if.end86
+
+if.end86:                                         ; preds = %if.then77, %if.end75
+  call void @llvm.nvvm.barrier0()
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end86
+  %79 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %79, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %80 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
+  %81 = load i64*, i64** %currKnodeD.addr, align 8
+  %82 = load i32, i32* %bid, align 4
+  %idxprom87 = sext i32 %82 to i64
+  %arrayidx88 = getelementptr inbounds i64, i64* %81, i64 %idxprom87
+  %83 = load i64, i64* %arrayidx88, align 8
+  %arrayidx89 = getelementptr inbounds %struct.knode, %struct.knode* %80, i64 %83
+  %keys90 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx89, i32 0, i32 2
+  %84 = load i32, i32* %thid, align 4
+  %idxprom91 = sext i32 %84 to i64
+  %arrayidx92 = getelementptr inbounds [257 x i32], [257 x i32]* %keys90, i64 0, i64 %idxprom91
+  %85 = load i32, i32* %arrayidx92, align 4
+  %86 = load i32*, i32** %startD.addr, align 8
+  %87 = load i32, i32* %bid, align 4
+  %idxprom93 = sext i32 %87 to i64
+  %arrayidx94 = getelementptr inbounds i32, i32* %86, i64 %idxprom93
+  %88 = load i32, i32* %arrayidx94, align 4
+  %cmp95 = icmp eq i32 %85, %88
+  br i1 %cmp95, label %if.then96, label %if.end105
+
+if.then96:                                        ; preds = %for.end
+  %89 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
+  %90 = load i64*, i64** %currKnodeD.addr, align 8
+  %91 = load i32, i32* %bid, align 4
+  %idxprom97 = sext i32 %91 to i64
+  %arrayidx98 = getelementptr inbounds i64, i64* %90, i64 %idxprom97
+  %92 = load i64, i64* %arrayidx98, align 8
+  %arrayidx99 = getelementptr inbounds %struct.knode, %struct.knode* %89, i64 %92
+  %indices100 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx99, i32 0, i32 1
+  %93 = load i32, i32* %thid, align 4
+  %idxprom101 = sext i32 %93 to i64
+  %arrayidx102 = getelementptr inbounds [257 x i32], [257 x i32]* %indices100, i64 0, i64 %idxprom101
+  %94 = load i32, i32* %arrayidx102, align 4
+  %95 = load i32*, i32** %RecstartD.addr, align 8
+  %96 = load i32, i32* %bid, align 4
+  %idxprom103 = sext i32 %96 to i64
+  %arrayidx104 = getelementptr inbounds i32, i32* %95, i64 %idxprom103
+  store i32 %94, i32* %arrayidx104, align 4
+  br label %if.end105
+
+if.end105:                                        ; preds = %if.then96, %for.end
+  call void @llvm.nvvm.barrier0()
+  %97 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
+  %98 = load i64*, i64** %lastKnodeD.addr, align 8
+  %99 = load i32, i32* %bid, align 4
+  %idxprom106 = sext i32 %99 to i64
+  %arrayidx107 = getelementptr inbounds i64, i64* %98, i64 %idxprom106
+  %100 = load i64, i64* %arrayidx107, align 8
+  %arrayidx108 = getelementptr inbounds %struct.knode, %struct.knode* %97, i64 %100
+  %keys109 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx108, i32 0, i32 2
+  %101 = load i32, i32* %thid, align 4
+  %idxprom110 = sext i32 %101 to i64
+  %arrayidx111 = getelementptr inbounds [257 x i32], [257 x i32]* %keys109, i64 0, i64 %idxprom110
+  %102 = load i32, i32* %arrayidx111, align 4
+  %103 = load i32*, i32** %endD.addr, align 8
+  %104 = load i32, i32* %bid, align 4
+  %idxprom112 = sext i32 %104 to i64
+  %arrayidx113 = getelementptr inbounds i32, i32* %103, i64 %idxprom112
+  %105 = load i32, i32* %arrayidx113, align 4
+  %cmp114 = icmp eq i32 %102, %105
+  br i1 %cmp114, label %if.then115, label %if.end127
+
+if.then115:                                       ; preds = %if.end105
+  %106 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
+  %107 = load i64*, i64** %lastKnodeD.addr, align 8
+  %108 = load i32, i32* %bid, align 4
+  %idxprom116 = sext i32 %108 to i64
+  %arrayidx117 = getelementptr inbounds i64, i64* %107, i64 %idxprom116
+  %109 = load i64, i64* %arrayidx117, align 8
+  %arrayidx118 = getelementptr inbounds %struct.knode, %struct.knode* %106, i64 %109
+  %indices119 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx118, i32 0, i32 1
+  %110 = load i32, i32* %thid, align 4
+  %idxprom120 = sext i32 %110 to i64
+  %arrayidx121 = getelementptr inbounds [257 x i32], [257 x i32]* %indices119, i64 0, i64 %idxprom120
+  %111 = load i32, i32* %arrayidx121, align 4
+  %112 = load i32*, i32** %RecstartD.addr, align 8
+  %113 = load i32, i32* %bid, align 4
+  %idxprom122 = sext i32 %113 to i64
+  %arrayidx123 = getelementptr inbounds i32, i32* %112, i64 %idxprom122
+  %114 = load i32, i32* %arrayidx123, align 4
+  %sub = sub nsw i32 %111, %114
+  %add124 = add nsw i32 %sub, 1
+  %115 = load i32*, i32** %ReclenD.addr, align 8
+  %116 = load i32, i32* %bid, align 4
+  %idxprom125 = sext i32 %116 to i64
+  %arrayidx126 = getelementptr inbounds i32, i32* %115, i64 %idxprom125
+  store i32 %add124, i32* %arrayidx126, align 4
+  br label %if.end127
+
+if.end127:                                        ; preds = %if.then115, %if.end105
+  ret void
+}
+
+; Accessor for __cuda_builtin_threadIdx_t::__fetch_builtin_x(): takes no
+; arguments and returns the i32 produced by the
+; llvm.nvvm.read.ptx.sreg.tid.x intrinsic, unchanged.
+; Function Attrs: alwaysinline convergent nounwind
+define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
+entry:
+  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  ret i32 %0
+}
+
+; Accessor for __cuda_builtin_blockIdx_t::__fetch_builtin_x(): takes no
+; arguments and returns the i32 produced by the
+; llvm.nvvm.read.ptx.sreg.ctaid.x intrinsic, unchanged.
+; Function Attrs: alwaysinline convergent nounwind
+define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
+entry:
+  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
+  ret i32 %0
+}
+
+; Function Attrs: convergent nounwind
+declare void @llvm.nvvm.barrier0() #2
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #3
+
+attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { convergent nounwind }
+attributes #3 = { nounwind readnone }
+
+!llvm.module.flags = !{!0, !1, !2}
+!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
+!llvm.ident = !{!8}
+!nvvmir.version = !{!9}
+
+!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
+!1 = !{i32 1, !"wchar_size", i32 4}
+!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
+!3 = !{void (i64, %struct.knode*, i64, i64*, i64*, i64*, i64*, i32*, i32*, i32*, i32*)* @findRangeK, !"kernel", i32 1}
+!4 = !{null, !"align", i32 8}
+!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
+!6 = !{null, !"align", i32 16}
+!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
+!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
+!9 = !{i32 1, i32 4}
--- a/examples/btree/kernel_gpu_cuda_wrapper_2-host-x86_64-unknown-linux-gnu.ll
+++ b/examples/btree/kernel_gpu_cuda_wrapper_2-host-x86_64-unknown-linux-gnu.ll
--- a/examples/btree/main.c
+++ b/examples/btree/main.c
--- a/examples/btree/run.sh
+++ b/examples/btree/run.sh
@ -0,0 +1,40 @@
+#!/bin/bash
+# Builds the b+tree example from the checked-in .ll files plus the C sources,
+# links it against the x86Runtime and threadPool libraries under ../../build,
+# runs it, and checks output.txt for one expected result line.
+# Any failing command aborts the script (set -e).
+set -e
+# Compile the plain C sources to LLVM bitcode (.bc) in the current directory.
+clang -c -emit-llvm util/timer/timer.c
+clang -c -emit-llvm util/num/num.c
+# The CUDA compile commands below are disabled; the script instead assembles
+# the .ll files named in the llvm-as lines further down.
+#clang -c -emit-llvm util/cuda/cuda.cu --cuda-gpu-arch=sm_61
+#clang -c -emit-llvm kernel/kernel_gpu_cuda_wrapper.cu --cuda-gpu-arch=sm_61
+#clang++ kernel/kernel_gpu_cuda_wrapper.cu  --cuda-path=/usr/local/cuda-10.1 --cuda-gpu-arch=sm_61 -L/usr/local/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v
+#clang++ kernel/kernel_gpu_cuda_wrapper_2.cu  --cuda-path=/usr/local/cuda-10.1 --cuda-gpu-arch=sm_61 -L/usr/local/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v
+clang -c -emit-llvm main.c
+
+# Assemble the textual IR (device and host halves of both wrappers) to bitcode.
+llvm-as kernel_gpu_cuda_wrapper-cuda-nvptx64-nvidia-cuda-sm_61.ll
+llvm-as kernel_gpu_cuda_wrapper_2-cuda-nvptx64-nvidia-cuda-sm_61.ll
+llvm-as kernel_gpu_cuda_wrapper-host-x86_64-unknown-linux-gnu.ll
+llvm-as kernel_gpu_cuda_wrapper_2-host-x86_64-unknown-linux-gnu.ll
+# Run the project's translators: device bitcode -> kernelN.bc, host bitcode -> hostN.bc.
+../../build/compilation/kernelTranslator kernel_gpu_cuda_wrapper-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel1.bc
+../../build/compilation/kernelTranslator kernel_gpu_cuda_wrapper_2-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel2.bc
+../../build/compilation/hostTranslator kernel_gpu_cuda_wrapper-host-x86_64-unknown-linux-gnu.bc host1.bc
+../../build/compilation/hostTranslator kernel_gpu_cuda_wrapper_2-host-x86_64-unknown-linux-gnu.bc host2.bc
+
+# Lower every bitcode file to a position-independent object file.
+llc --relocation-model=pic --filetype=obj  main.bc
+# NOTE(review): no active command in this script produces cuda.bc (the
+# cuda.cu compile above is commented out) -- confirm it is provided elsewhere,
+# otherwise this step fails under set -e.
+llc --relocation-model=pic --filetype=obj  cuda.bc
+llc --relocation-model=pic --filetype=obj  num.bc
+llc --relocation-model=pic --filetype=obj  timer.bc
+llc --relocation-model=pic --filetype=obj  kernel1.bc
+llc --relocation-model=pic --filetype=obj  kernel2.bc
+llc --relocation-model=pic --filetype=obj  host1.bc
+llc --relocation-model=pic --filetype=obj  host2.bc
+# Make the runtime shared libraries findable both at link time (-L) and at run time.
+export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH
+g++ -Wall -L../../build/runtime  -L../../build/runtime/threadPool -o b+tree.out \
+    -fPIC -no-pie main.o host1.o host2.o kernel1.o kernel2.o cuda.o num.o timer.o \
+    -lc -lx86Runtime -lthreadPool -lpthread
+
+# Run the benchmark on the Rodinia data set, then look for one known result
+# line in output.txt; exit non-zero if it is missing.
+./b+tree.out file ../../rodinia-data/b+tree/mil.txt \
+    command ../../rodinia-data/b+tree/command.txt
+if grep -q "0    840187    6001" output.txt; then
+    echo "Pass"
+else
+    echo "Error result"
+    exit 1
+fi
--- a/examples/btree/util/cuda/cuda.cu
+++ b/examples/btree/util/cuda/cuda.cu
@ -0,0 +1,75 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//===============================================================================================================================================================================================================200
+//	SET_DEVICE CODE
+//===============================================================================================================================================================================================================200
+
+//======================================================================================================================================================150
+//	INCLUDE/DEFINE
+//======================================================================================================================================================150
+
+#include "cuda.h"					// (in library path specified to compiler)
+
+//======================================================================================================================================================150
+//	FUNCTIONS
+//======================================================================================================================================================150
+
+//====================================================================================================100
+//	SET DEVICE
+//====================================================================================================100
+
+// Selects the CUDA device with the largest multiProcessorCount when more
+// than one device is reported. With zero or one device, or when the device
+// count cannot be queried, no device is selected explicitly.
+void setdevice(void){
+
+	// variables
+	int num_devices = 0;	// stays 0 if the count query fails
+	int device;
+
+	// work
+	if (cudaGetDeviceCount(&num_devices) != cudaSuccess) {
+		// count is not trustworthy; leave the current device untouched
+		return;
+	}
+	if (num_devices > 1) {
+
+		// variables
+		int max_multiprocessors;
+		int max_device;
+		cudaDeviceProp properties;
+
+		// initialize variables
+		max_multiprocessors = 0;
+		max_device = 0;
+
+		for (device = 0; device < num_devices; device++) {
+			// skip devices whose properties could not be read, so that
+			// `properties` is never inspected without having been filled in
+			if (cudaGetDeviceProperties(&properties, device) != cudaSuccess) {
+				continue;
+			}
+			if (max_multiprocessors < properties.multiProcessorCount) {
+				max_multiprocessors = properties.multiProcessorCount;
+				max_device = device;
+			}
+		}
+		cudaSetDevice(max_device);
+	}
+
+}
+
+//====================================================================================================100
+//	GET LAST ERROR
+//====================================================================================================100
+
+// Queries cudaGetLastError(); when it is anything other than cudaSuccess,
+// prints "Cuda error: <msg>: <error string>." to stdout (not stderr), flushes
+// all open output streams and exits with EXIT_FAILURE. Returns normally
+// otherwise.
+void checkCUDAError(const char *msg)
+{
+	const cudaError_t status = cudaGetLastError();
+	if (status == cudaSuccess) {
+		return;
+	}
+
+	printf("Cuda error: %s: %s.\n", msg, cudaGetErrorString(status));
+	fflush(NULL);
+	exit(EXIT_FAILURE);
+}
+
+//===============================================================================================================================================================================================================200
+//	END
+//===============================================================================================================================================================================================================200
+
+#ifdef __cplusplus
+}
+#endif
--- a/examples/btree/util/cuda/cuda.h
+++ b/examples/btree/util/cuda/cuda.h
@ -0,0 +1,37 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//===============================================================================================================================================================================================================200
+//	SET_DEVICE HEADER
+//===============================================================================================================================================================================================================200
+
+//======================================================================================================================================================150
+//	INCLUDE/DEFINE
+//======================================================================================================================================================150
+
+#include <stdio.h> // (in library path known to compiler)		needed by printf
+
+//======================================================================================================================================================150
+//	FUNCTION PROTOTYPES
+//======================================================================================================================================================150
+
+//====================================================================================================100
+//	SET DEVICE
+//====================================================================================================100
+
+void setdevice(void);
+
+//====================================================================================================100
+//	GET LAST ERROR
+//====================================================================================================100
+
+void checkCUDAError(const char *msg);
+
+//===============================================================================================================================================================================================================200
+//	END SET_DEVICE HEADER
+//===============================================================================================================================================================================================================200
+
+#ifdef __cplusplus
+}
+#endif
--- a/examples/btree/util/num/num.c
+++ b/examples/btree/util/num/num.c
@ -0,0 +1,55 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//===============================================================================================================================================================================================================200
+//	DESCRIPTION
+//===============================================================================================================================================================================================================200
+
+// Returns:	0 if string does not represent integer
+//			1 if string represents integer
+
+//===============================================================================================================================================================================================================200
+//	NUM CODE
+//===============================================================================================================================================================================================================200
+
+//======================================================================================================================================================150
+//	ISINTEGER FUNCTION
+//======================================================================================================================================================150
+
+// Checks whether str consists solely of decimal digit characters.
+//
+// Returns 1 when str is non-empty and every character is in '0'..'9'.
+// Returns 0 otherwise: for a null pointer, an empty string, or any other
+// character (so a leading sign or a decimal point is rejected).
+int isInteger(char *str) {
+
+  //====================================================================================================100
+  //	make sure it's not null or empty
+  //====================================================================================================100
+
+  if (!str || *str == '\0') {
+    return 0;
+  }
+
+  //====================================================================================================100
+  //	if any character is not a digit, return false
+  //====================================================================================================100
+
+  for (; *str != '\0'; str++) {
+    // digit characters only (would need to allow '.' if checking for float)
+    if (*str < '0' || *str > '9') {
+      return 0;
+    }
+  }
+
+  //====================================================================================================100
+  //	every character passed the digit check
+  //====================================================================================================100
+
+  return 1;
+}
+
+//===============================================================================================================================================================================================================200
+//	END NUM CODE
+//===============================================================================================================================================================================================================200
+
+#ifdef __cplusplus
+}
+#endif
--- a/examples/btree/util/num/num.h
+++ b/examples/btree/util/num/num.h
@ -0,0 +1,21 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//===============================================================================================================================================================================================================200
+//	FILE HEADER
+//===============================================================================================================================================================================================================200
+
+//======================================================================================================================================================150
+//	ISINTEGER FUNCTION PROTOTYPE
+//======================================================================================================================================================150
+
+int isInteger(char *str);
+
+//===============================================================================================================================================================================================================200
+//	END FILE HEADER
+//===============================================================================================================================================================================================================200
+
+#ifdef __cplusplus
+}
+#endif
--- a/examples/btree/util/timer/timer.c
+++ b/examples/btree/util/timer/timer.c
@ -0,0 +1,36 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//===============================================================================================================================================================================================================200
+//	TIMER CODE
+//===============================================================================================================================================================================================================200
+
+//======================================================================================================================================================150
+//	INCLUDE/DEFINE
+//======================================================================================================================================================150
+
+#include <stdlib.h>
+#include <sys/time.h>
+
+//======================================================================================================================================================150
+//	FUNCTIONS
+//======================================================================================================================================================150
+
+//====================================================================================================100
+//	DISPLAY TIME
+//====================================================================================================100
+
+// Returns the current system time in microseconds, computed from the
+// seconds and microseconds fields filled in by gettimeofday().
+long long get_time() {
+  struct timeval tv;
+  gettimeofday(&tv, NULL);
+  // widen to long long before multiplying so the product is not computed in
+  // the (possibly narrower) type of tv_sec
+  return ((long long)tv.tv_sec * 1000000LL) + (long long)tv.tv_usec;
+}
+
+//===============================================================================================================================================================================================================200
+//	END TIMER CODE
+//===============================================================================================================================================================================================================200
+
+#ifdef __cplusplus
+}
+#endif
--- a/examples/btree/util/timer/timer.h
+++ b/examples/btree/util/timer/timer.h
@ -0,0 +1,21 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//===============================================================================================================================================================================================================200
+//	TIMER HEADER
+//===============================================================================================================================================================================================================200
+
+//======================================================================================================================================================150
+//	FUNCTION PROTOTYPES
+//======================================================================================================================================================150
+
+long long get_time();
+
+//===============================================================================================================================================================================================================200
+//	END TIMER HEADER
+//===============================================================================================================================================================================================================200
+
+#ifdef __cplusplus
+}
+#endif
--- a/examples/cfd/euler3d.cu
+++ b/examples/cfd/euler3d.cu
@ -0,0 +1,662 @@
+#include <fstream>
+#include <helper_cuda.h>
+#include <helper_timer.h>
+#include <iostream>
+
+/*
+ * Options
+ *
+ */
+#define GAMMA 1.4f
+#define iterations 2
+// #ifndef block_length
+// 	#define block_length 192
+// #endif
+
+#define NDIM 3
+#define NNB 4
+
+#define RK 3 // 3rd order RK
+#define ff_mach 1.2f
+#define deg_angle_of_attack 0.0f
+
+/*
+ * not options
+ */
+
+#ifdef RD_WG_SIZE_0_0
+#define BLOCK_SIZE_0 RD_WG_SIZE_0_0
+#elif defined(RD_WG_SIZE_0)
+#define BLOCK_SIZE_0 RD_WG_SIZE_0
+#elif defined(RD_WG_SIZE)
+#define BLOCK_SIZE_0 RD_WG_SIZE
+#else
+#define BLOCK_SIZE_0 192
+#endif
+
+#ifdef RD_WG_SIZE_1_0
+#define BLOCK_SIZE_1 RD_WG_SIZE_1_0
+#elif defined(RD_WG_SIZE_1)
+#define BLOCK_SIZE_1 RD_WG_SIZE_1
+#elif defined(RD_WG_SIZE)
+#define BLOCK_SIZE_1 RD_WG_SIZE
+#else
+#define BLOCK_SIZE_1 192
+#endif
+
+// BLOCK_SIZE_2 (used for the compute_step_factor launch): taken from
+// RD_WG_SIZE_2_0, else RD_WG_SIZE_2, else RD_WG_SIZE, else 192.
+// The second branch tests RD_WG_SIZE_2, the macro it actually expands.
+#ifdef RD_WG_SIZE_2_0
+#define BLOCK_SIZE_2 RD_WG_SIZE_2_0
+#elif defined(RD_WG_SIZE_2)
+#define BLOCK_SIZE_2 RD_WG_SIZE_2
+#elif defined(RD_WG_SIZE)
+#define BLOCK_SIZE_2 RD_WG_SIZE
+#else
+#define BLOCK_SIZE_2 192
+#endif
+
+#ifdef RD_WG_SIZE_3_0
+#define BLOCK_SIZE_3 RD_WG_SIZE_3_0
+#elif defined(RD_WG_SIZE_3)
+#define BLOCK_SIZE_3 RD_WG_SIZE_3
+#elif defined(RD_WG_SIZE)
+#define BLOCK_SIZE_3 RD_WG_SIZE
+#else
+#define BLOCK_SIZE_3 192
+#endif
+
+#ifdef RD_WG_SIZE_4_0
+#define BLOCK_SIZE_4 RD_WG_SIZE_4_0
+#elif defined(RD_WG_SIZE_4)
+#define BLOCK_SIZE_4 RD_WG_SIZE_4
+#elif defined(RD_WG_SIZE)
+#define BLOCK_SIZE_4 RD_WG_SIZE
+#else
+#define BLOCK_SIZE_4 192
+#endif
+
+// #if block_length > 128
+// #warning "the kernels may fail to launch on some systems if the block length
+// is too large" #endif
+
+#define VAR_DENSITY 0
+#define VAR_MOMENTUM 1
+#define VAR_DENSITY_ENERGY (VAR_MOMENTUM + NDIM)
+#define NVAR (VAR_DENSITY_ENERGY + 1)
+
+/*
+ * Generic functions
+ */
+// Requests sizeof(T) * N bytes from cudaMalloc and returns the pointer it
+// stores. The cudaMalloc result is handed to checkCudaErrors (defined outside
+// this file; presumably it reports/aborts on failure -- confirm in helper_cuda.h).
+template <typename T> T *alloc(int N) {
+  T *t;
+  checkCudaErrors(cudaMalloc((void **)&t, sizeof(T) * N));
+  return t;
+}
+
+// Releases `array` with cudaFree; the result is passed to checkCudaErrors.
+template <typename T> void dealloc(T *array) {
+  checkCudaErrors(cudaFree((void *)array));
+}
+
+// Copies N elements of type T from src to dst with cudaMemcpy using
+// cudaMemcpyDeviceToDevice; the result is passed to checkCudaErrors.
+template <typename T> void copy(T *dst, T *src, int N) {
+  checkCudaErrors(cudaMemcpy((void *)dst, (void *)src, N * sizeof(T),
+                             cudaMemcpyDeviceToDevice));
+}
+
+// Copies N elements of type T from src to dst with cudaMemcpy using
+// cudaMemcpyHostToDevice; the result is passed to checkCudaErrors.
+template <typename T> void upload(T *dst, T *src, int N) {
+  checkCudaErrors(cudaMemcpy((void *)dst, (void *)src, N * sizeof(T),
+                             cudaMemcpyHostToDevice));
+}
+
+// Copies N elements of type T from src to dst with cudaMemcpy using
+// cudaMemcpyDeviceToHost; the result is passed to checkCudaErrors.
+template <typename T> void download(T *dst, T *src, int N) {
+  checkCudaErrors(cudaMemcpy((void *)dst, (void *)src, N * sizeof(T),
+                             cudaMemcpyDeviceToHost));
+}
+
+// Downloads nelr * NVAR floats from `variables` (via download(), i.e. a
+// device-to-host copy) and writes the first nel elements to three text files:
+//   "density"        - one value per line
+//   "momentum"       - NDIM space-separated values per line
+//   "density_energy" - one value per line
+// Each file begins with a "<nel> <nelr>" header line. Component v of
+// element i is read from index i + v * nelr of the downloaded buffer.
+void dump(float *variables, int nel, int nelr) {
+  float *h_variables = new float[nelr * NVAR];
+  download(h_variables, variables, nelr * NVAR);
+
+  // Lines end with '\n' rather than std::endl so the stream is not flushed
+  // after every element; each ofstream is flushed when it is closed at the end
+  // of its scope, and the bytes written are the same.
+  {
+    std::ofstream file("density");
+    file << nel << " " << nelr << '\n';
+    for (int i = 0; i < nel; i++)
+      file << h_variables[i + VAR_DENSITY * nelr] << '\n';
+  }
+
+  {
+    std::ofstream file("momentum");
+    file << nel << " " << nelr << '\n';
+    for (int i = 0; i < nel; i++) {
+      for (int j = 0; j != NDIM; j++)
+        file << h_variables[i + (VAR_MOMENTUM + j) * nelr] << " ";
+      file << '\n';
+    }
+  }
+
+  {
+    std::ofstream file("density_energy");
+    file << nel << " " << nelr << '\n';
+    for (int i = 0; i < nel; i++)
+      file << h_variables[i + VAR_DENSITY_ENERGY * nelr] << '\n';
+  }
+  delete[] h_variables;
+}
+
+/*
+ * Element-based Cell-centered FVM solver functions
+ */
+__constant__ float ff_variable[NVAR];
+__constant__ float3 ff_flux_contribution_momentum_x[1];
+__constant__ float3 ff_flux_contribution_momentum_y[1];
+__constant__ float3 ff_flux_contribution_momentum_z[1];
+__constant__ float3 ff_flux_contribution_density_energy[1];
+
+// Kernel: the thread with global index e = blockDim.x * blockIdx.x +
+// threadIdx.x copies ff_variable[v] into variables[e + v * nelr] for every
+// v in [0, NVAR). The index is not checked against nelr.
+__global__ void cuda_initialize_variables(int nelr, float *variables) {
+  const int element = (blockDim.x * blockIdx.x + threadIdx.x);
+  int var = 0;
+  while (var < NVAR) {
+    variables[element + var * nelr] = ff_variable[var];
+    ++var;
+  }
+}
+// Launches cuda_initialize_variables with nelr / BLOCK_SIZE_1 blocks of
+// BLOCK_SIZE_1 threads, then checks for a launch error.
+void initialize_variables(int nelr, float *variables) {
+  const dim3 threads_per_block(BLOCK_SIZE_1);
+  const dim3 block_count(nelr / BLOCK_SIZE_1);
+  cuda_initialize_variables<<<block_count, threads_per_block>>>(nelr,
+                                                                variables);
+  getLastCudaError("initialize_variables failed");
+}
+
+// Fills the four flux-contribution outputs from velocity, momentum, pressure
+// and density_energy:
+//   fc_momentum_{x,y,z}: velocity component times momentum, with pressure
+//     added on the diagonal entries; the off-diagonal entries of the y and z
+//     rows are copied from the rows already computed.
+//   fc_density_energy:   velocity * (density_energy + pressure).
+// `density` is accepted but not read by this function.
+__device__ __host__ inline void compute_flux_contribution(
+    float &density, float3 &momentum, float &density_energy, float &pressure,
+    float3 &velocity, float3 &fc_momentum_x, float3 &fc_momentum_y,
+    float3 &fc_momentum_z, float3 &fc_density_energy) {
+  fc_momentum_x.x = velocity.x * momentum.x + pressure;
+  fc_momentum_x.y = velocity.x * momentum.y;
+  fc_momentum_x.z = velocity.x * momentum.z;
+
+  // reuse the value computed above instead of recomputing it
+  fc_momentum_y.x = fc_momentum_x.y;
+  fc_momentum_y.y = velocity.y * momentum.y + pressure;
+  fc_momentum_y.z = velocity.y * momentum.z;
+
+  // both off-diagonal entries come from the x and y rows
+  fc_momentum_z.x = fc_momentum_x.z;
+  fc_momentum_z.y = fc_momentum_y.z;
+  fc_momentum_z.z = velocity.z * momentum.z + pressure;
+
+  float de_p = density_energy + pressure;
+  fc_density_energy.x = velocity.x * de_p;
+  fc_density_energy.y = velocity.y * de_p;
+  fc_density_energy.z = velocity.z * de_p;
+}
+
+// Writes momentum / density, component by component, into velocity.
+// density is not checked for zero.
+__device__ inline void compute_velocity(float &density, float3 &momentum,
+                                        float3 &velocity) {
+  velocity.x = momentum.x / density;
+  velocity.y = momentum.y / density;
+  velocity.z = momentum.z / density;
+}
+
+// Returns the sum of the squares of the three velocity components.
+__device__ inline float compute_speed_sqd(float3 &velocity) {
+  return velocity.x * velocity.x + velocity.y * velocity.y +
+         velocity.z * velocity.z;
+}
+
+// Returns (GAMMA - 1) * (density_energy - 0.5 * density * speed_sqd).
+__device__ inline float compute_pressure(float &density, float &density_energy,
+                                         float &speed_sqd) {
+  return (float(GAMMA) - float(1.0f)) *
+         (density_energy - float(0.5f) * density * speed_sqd);
+}
+
+// Returns sqrtf(GAMMA * pressure / density).
+__device__ inline float compute_speed_of_sound(float &density,
+                                               float &pressure) {
+  return sqrtf(float(GAMMA) * pressure / density);
+}
+
+// Kernel: for the element with global index i = blockDim.x * blockIdx.x +
+// threadIdx.x, reads density, momentum and density_energy from
+// variables[i + k * nelr], derives velocity, squared speed, pressure and
+// speed of sound from them, and stores the step factor in step_factors[i].
+// The index i is not checked against nelr.
+__global__ void cuda_compute_step_factor(int nelr, float *variables,
+                                         float *areas, float *step_factors) {
+  const int i = (blockDim.x * blockIdx.x + threadIdx.x);
+
+  // load this element's state; component k lives at offset k * nelr
+  float density = variables[i + VAR_DENSITY * nelr];
+  float3 momentum;
+  momentum.x = variables[i + (VAR_MOMENTUM + 0) * nelr];
+  momentum.y = variables[i + (VAR_MOMENTUM + 1) * nelr];
+  momentum.z = variables[i + (VAR_MOMENTUM + 2) * nelr];
+
+  float density_energy = variables[i + VAR_DENSITY_ENERGY * nelr];
+
+  float3 velocity;
+  compute_velocity(density, momentum, velocity);
+  float speed_sqd = compute_speed_sqd(velocity);
+  float pressure = compute_pressure(density, density_energy, speed_sqd);
+  float speed_of_sound = compute_speed_of_sound(density, pressure);
+
+  // dt = float(0.5f) * sqrtf(areas[i]) /  (||v|| + c).... but when we do time
+  // stepping, this later would need to be divided by the area, so we just do it
+  // all at once
+  step_factors[i] =
+      float(0.5f) / (sqrtf(areas[i]) * (sqrtf(speed_sqd) + speed_of_sound));
+}
+// Launches cuda_compute_step_factor with nelr / BLOCK_SIZE_2 blocks of
+// BLOCK_SIZE_2 threads, then checks for a launch error.
+void compute_step_factor(int nelr, float *variables, float *areas,
+                         float *step_factors) {
+  const dim3 threads_per_block(BLOCK_SIZE_2);
+  const dim3 block_count(nelr / BLOCK_SIZE_2);
+  cuda_compute_step_factor<<<block_count, threads_per_block>>>(
+      nelr, variables, areas, step_factors);
+  getLastCudaError("compute_step_factor failed");
+}
+
+/*
+ * Flux kernel: thread i = blockDim.x * blockIdx.x + threadIdx.x accumulates
+ * the density, momentum (x/y/z) and density-energy fluxes of element i over
+ * its NNB faces and writes them to `fluxes`.
+ *
+ * Indexing used below:
+ *   variables / fluxes : value of variable v for element e at [e + v * nelr]
+ *   normals            : component k of face j of element e at
+ *                        [e + (j + k * NNB) * nelr]
+ *   elements_surrounding_elements : neighbour of face j of element e at
+ *                        [e + j * nelr]
+ *
+ * Neighbour codes: nb >= 0 is a real neighbour element, -1 a wing boundary,
+ * -2 a far field boundary; any other negative value contributes nothing.
+ * The far field terms read the ff_* symbols that main() fills with
+ * cudaMemcpyToSymbol.
+ *
+ * NOTE(review): there is no `i < nelr` guard, so the launch presumably has to
+ * cover exactly nelr threads -- confirm BLOCK_SIZE_3 divides nelr.
+ */
+__global__ void cuda_compute_flux(int nelr, int *elements_surrounding_elements,
+                                  float *normals, float *variables,
+                                  float *fluxes) {
+  const float smoothing_coefficient = float(0.2f);
+  const int i = (blockDim.x * blockIdx.x + threadIdx.x);
+
+  int j, nb;
+  float3 normal;
+  float normal_len;
+  float factor;
+
+  // Load the state of element i and derive velocity, pressure and sound speed.
+  float density_i = variables[i + VAR_DENSITY * nelr];
+  float3 momentum_i;
+  momentum_i.x = variables[i + (VAR_MOMENTUM + 0) * nelr];
+  momentum_i.y = variables[i + (VAR_MOMENTUM + 1) * nelr];
+  momentum_i.z = variables[i + (VAR_MOMENTUM + 2) * nelr];
+
+  float density_energy_i = variables[i + VAR_DENSITY_ENERGY * nelr];
+
+  float3 velocity_i;
+  compute_velocity(density_i, momentum_i, velocity_i);
+  float speed_sqd_i = compute_speed_sqd(velocity_i);
+  float speed_i = sqrtf(speed_sqd_i);
+  float pressure_i = compute_pressure(density_i, density_energy_i, speed_sqd_i);
+  float speed_of_sound_i = compute_speed_of_sound(density_i, pressure_i);
+  float3 flux_contribution_i_momentum_x, flux_contribution_i_momentum_y,
+      flux_contribution_i_momentum_z;
+  float3 flux_contribution_i_density_energy;
+  compute_flux_contribution(
+      density_i, momentum_i, density_energy_i, pressure_i, velocity_i,
+      flux_contribution_i_momentum_x, flux_contribution_i_momentum_y,
+      flux_contribution_i_momentum_z, flux_contribution_i_density_energy);
+
+  // Accumulators for the fluxes of element i, summed over all faces.
+  float flux_i_density = float(0.0f);
+  float3 flux_i_momentum;
+  flux_i_momentum.x = float(0.0f);
+  flux_i_momentum.y = float(0.0f);
+  flux_i_momentum.z = float(0.0f);
+  float flux_i_density_energy = float(0.0f);
+
+  // Scratch state for the neighbour across the current face.
+  float3 velocity_nb;
+  float density_nb, density_energy_nb;
+  float3 momentum_nb;
+  float3 flux_contribution_nb_momentum_x, flux_contribution_nb_momentum_y,
+      flux_contribution_nb_momentum_z;
+  float3 flux_contribution_nb_density_energy;
+  float speed_sqd_nb, speed_of_sound_nb, pressure_nb;
+
+#pragma unroll
+  for (j = 0; j < NNB; j++) {
+    // Neighbour id and face normal for face j of element i.
+    nb = elements_surrounding_elements[i + j * nelr];
+    normal.x = normals[i + (j + 0 * NNB) * nelr];
+    normal.y = normals[i + (j + 1 * NNB) * nelr];
+    normal.z = normals[i + (j + 2 * NNB) * nelr];
+    normal_len =
+        sqrtf(normal.x * normal.x + normal.y * normal.y + normal.z * normal.z);
+
+    if (nb >= 0) // a legitimate neighbor
+    {
+      // Load the neighbour's state and derive the same quantities as for i.
+      density_nb = variables[nb + VAR_DENSITY * nelr];
+      momentum_nb.x = variables[nb + (VAR_MOMENTUM + 0) * nelr];
+      momentum_nb.y = variables[nb + (VAR_MOMENTUM + 1) * nelr];
+      momentum_nb.z = variables[nb + (VAR_MOMENTUM + 2) * nelr];
+      density_energy_nb = variables[nb + VAR_DENSITY_ENERGY * nelr];
+      compute_velocity(density_nb, momentum_nb, velocity_nb);
+      speed_sqd_nb = compute_speed_sqd(velocity_nb);
+      pressure_nb =
+          compute_pressure(density_nb, density_energy_nb, speed_sqd_nb);
+      speed_of_sound_nb = compute_speed_of_sound(density_nb, pressure_nb);
+      compute_flux_contribution(
+          density_nb, momentum_nb, density_energy_nb, pressure_nb, velocity_nb,
+          flux_contribution_nb_momentum_x, flux_contribution_nb_momentum_y,
+          flux_contribution_nb_momentum_z, flux_contribution_nb_density_energy);
+
+      // artificial viscosity
+      factor = -normal_len * smoothing_coefficient * float(0.5f) *
+               (speed_i + sqrtf(speed_sqd_nb) + speed_of_sound_i +
+                speed_of_sound_nb);
+      flux_i_density += factor * (density_i - density_nb);
+      flux_i_density_energy += factor * (density_energy_i - density_energy_nb);
+      flux_i_momentum.x += factor * (momentum_i.x - momentum_nb.x);
+      flux_i_momentum.y += factor * (momentum_i.y - momentum_nb.y);
+      flux_i_momentum.z += factor * (momentum_i.z - momentum_nb.z);
+
+      // accumulate cell-centered fluxes
+      // (one pass per normal component: x, then y, then z)
+      factor = float(0.5f) * normal.x;
+      flux_i_density += factor * (momentum_nb.x + momentum_i.x);
+      flux_i_density_energy += factor * (flux_contribution_nb_density_energy.x +
+                                         flux_contribution_i_density_energy.x);
+      flux_i_momentum.x += factor * (flux_contribution_nb_momentum_x.x +
+                                     flux_contribution_i_momentum_x.x);
+      flux_i_momentum.y += factor * (flux_contribution_nb_momentum_y.x +
+                                     flux_contribution_i_momentum_y.x);
+      flux_i_momentum.z += factor * (flux_contribution_nb_momentum_z.x +
+                                     flux_contribution_i_momentum_z.x);
+
+      factor = float(0.5f) * normal.y;
+      flux_i_density += factor * (momentum_nb.y + momentum_i.y);
+      flux_i_density_energy += factor * (flux_contribution_nb_density_energy.y +
+                                         flux_contribution_i_density_energy.y);
+      flux_i_momentum.x += factor * (flux_contribution_nb_momentum_x.y +
+                                     flux_contribution_i_momentum_x.y);
+      flux_i_momentum.y += factor * (flux_contribution_nb_momentum_y.y +
+                                     flux_contribution_i_momentum_y.y);
+      flux_i_momentum.z += factor * (flux_contribution_nb_momentum_z.y +
+                                     flux_contribution_i_momentum_z.y);
+
+      factor = float(0.5f) * normal.z;
+      flux_i_density += factor * (momentum_nb.z + momentum_i.z);
+      flux_i_density_energy += factor * (flux_contribution_nb_density_energy.z +
+                                         flux_contribution_i_density_energy.z);
+      flux_i_momentum.x += factor * (flux_contribution_nb_momentum_x.z +
+                                     flux_contribution_i_momentum_x.z);
+      flux_i_momentum.y += factor * (flux_contribution_nb_momentum_y.z +
+                                     flux_contribution_i_momentum_y.z);
+      flux_i_momentum.z += factor * (flux_contribution_nb_momentum_z.z +
+                                     flux_contribution_i_momentum_z.z);
+    } else if (nb == -1) // a wing boundary
+    {
+      // Only the pressure of element i acts on the momentum flux here.
+      flux_i_momentum.x += normal.x * pressure_i;
+      flux_i_momentum.y += normal.y * pressure_i;
+      flux_i_momentum.z += normal.z * pressure_i;
+    } else if (nb == -2) // a far field boundary
+    {
+      // Same cell-centered accumulation as for a real neighbour, but with the
+      // far field values standing in for the neighbour; no viscosity term.
+      factor = float(0.5f) * normal.x;
+      flux_i_density += factor * (ff_variable[VAR_MOMENTUM + 0] + momentum_i.x);
+      flux_i_density_energy +=
+          factor * (ff_flux_contribution_density_energy[0].x +
+                    flux_contribution_i_density_energy.x);
+      flux_i_momentum.x += factor * (ff_flux_contribution_momentum_x[0].x +
+                                     flux_contribution_i_momentum_x.x);
+      flux_i_momentum.y += factor * (ff_flux_contribution_momentum_y[0].x +
+                                     flux_contribution_i_momentum_y.x);
+      flux_i_momentum.z += factor * (ff_flux_contribution_momentum_z[0].x +
+                                     flux_contribution_i_momentum_z.x);
+
+      factor = float(0.5f) * normal.y;
+      flux_i_density += factor * (ff_variable[VAR_MOMENTUM + 1] + momentum_i.y);
+      flux_i_density_energy +=
+          factor * (ff_flux_contribution_density_energy[0].y +
+                    flux_contribution_i_density_energy.y);
+      flux_i_momentum.x += factor * (ff_flux_contribution_momentum_x[0].y +
+                                     flux_contribution_i_momentum_x.y);
+      flux_i_momentum.y += factor * (ff_flux_contribution_momentum_y[0].y +
+                                     flux_contribution_i_momentum_y.y);
+      flux_i_momentum.z += factor * (ff_flux_contribution_momentum_z[0].y +
+                                     flux_contribution_i_momentum_z.y);
+
+      factor = float(0.5f) * normal.z;
+      flux_i_density += factor * (ff_variable[VAR_MOMENTUM + 2] + momentum_i.z);
+      flux_i_density_energy +=
+          factor * (ff_flux_contribution_density_energy[0].z +
+                    flux_contribution_i_density_energy.z);
+      flux_i_momentum.x += factor * (ff_flux_contribution_momentum_x[0].z +
+                                     flux_contribution_i_momentum_x.z);
+      flux_i_momentum.y += factor * (ff_flux_contribution_momentum_y[0].z +
+                                     flux_contribution_i_momentum_y.z);
+      flux_i_momentum.z += factor * (ff_flux_contribution_momentum_z[0].z +
+                                     flux_contribution_i_momentum_z.z);
+    }
+  }
+
+  // Write the accumulated fluxes back, one entry per conserved variable.
+  fluxes[i + VAR_DENSITY * nelr] = flux_i_density;
+  fluxes[i + (VAR_MOMENTUM + 0) * nelr] = flux_i_momentum.x;
+  fluxes[i + (VAR_MOMENTUM + 1) * nelr] = flux_i_momentum.y;
+  fluxes[i + (VAR_MOMENTUM + 2) * nelr] = flux_i_momentum.z;
+  fluxes[i + VAR_DENSITY_ENERGY * nelr] = flux_i_density_energy;
+}
+// Host-side launcher for cuda_compute_flux. The grid is nelr / BLOCK_SIZE_3
+// blocks (integer division) of BLOCK_SIZE_3 threads, and any launch error is
+// reported through getLastCudaError.
+void compute_flux(int nelr, int *elements_surrounding_elements, float *normals,
+                  float *variables, float *fluxes) {
+  const dim3 grid_dim(nelr / BLOCK_SIZE_3);
+  const dim3 block_dim(BLOCK_SIZE_3);
+  cuda_compute_flux<<<grid_dim, block_dim>>>(
+      nelr, elements_surrounding_elements, normals, variables, fluxes);
+  getLastCudaError("compute_flux failed");
+}
+
+// Stage update for element `elem` (one thread per element): for each of the
+// five stored variables,
+//   variables = old_variables + scale * fluxes,
+// with scale = step_factors[elem] / (RK + 1 - j).
+__global__ void cuda_time_step(int j, int nelr, float *old_variables,
+                               float *variables, float *step_factors,
+                               float *fluxes) {
+  const int elem = blockDim.x * blockIdx.x + threadIdx.x;
+  const float scale = step_factors[elem] / float(RK + 1 - j);
+
+  // Flat offsets of this element's entries; variable v lives at elem + v * nelr.
+  const int density_at = elem + VAR_DENSITY * nelr;
+  const int energy_at = elem + VAR_DENSITY_ENERGY * nelr;
+  const int momentum_x_at = elem + (VAR_MOMENTUM + 0) * nelr;
+  const int momentum_y_at = elem + (VAR_MOMENTUM + 1) * nelr;
+  const int momentum_z_at = elem + (VAR_MOMENTUM + 2) * nelr;
+
+  variables[density_at] =
+      old_variables[density_at] + scale * fluxes[density_at];
+  variables[energy_at] = old_variables[energy_at] + scale * fluxes[energy_at];
+  variables[momentum_x_at] =
+      old_variables[momentum_x_at] + scale * fluxes[momentum_x_at];
+  variables[momentum_y_at] =
+      old_variables[momentum_y_at] + scale * fluxes[momentum_y_at];
+  variables[momentum_z_at] =
+      old_variables[momentum_z_at] + scale * fluxes[momentum_z_at];
+}
+// Host-side launcher for cuda_time_step (stage j). The grid is
+// nelr / BLOCK_SIZE_4 blocks (integer division) of BLOCK_SIZE_4 threads, and
+// any launch error is reported through getLastCudaError.
+void time_step(int j, int nelr, float *old_variables, float *variables,
+               float *step_factors, float *fluxes) {
+  const dim3 grid_dim(nelr / BLOCK_SIZE_4);
+  const dim3 block_dim(BLOCK_SIZE_4);
+  cuda_time_step<<<grid_dim, block_dim>>>(j, nelr, old_variables, variables,
+                                          step_factors, fluxes);
+  getLastCudaError("update failed");
+}
+
+/*
+ * Main function
+ *
+ * Usage: <binary> <domain file>
+ *
+ * Steps:
+ *   1. compute the far field state on the host and upload it to the ff_*
+ *      device symbols;
+ *   2. read the domain file (element count, then per element its area and,
+ *      for each of the NNB faces, the neighbour id and NDIM normal
+ *      components), pad it to nelr elements and upload it;
+ *   3. run `iterations` outer iterations of RK flux/time-step stages;
+ *   4. dump the solution and release device memory.
+ *
+ * Returns 0 on success or when no file name is given, 1 when the domain file
+ * cannot be opened or does not start with a positive element count.
+ */
+int main(int argc, char **argv) {
+  printf("WG size of kernel:initialize = %d, WG size of "
+         "kernel:compute_step_factor = %d, WG size of kernel:compute_flux = "
+         "%d, WG size of kernel:time_step = %d\n",
+         BLOCK_SIZE_1, BLOCK_SIZE_2, BLOCK_SIZE_3, BLOCK_SIZE_4);
+
+  if (argc < 2) {
+    std::cout << "specify data file name" << std::endl;
+    return 0;
+  }
+  const char *data_file_name = argv[1];
+
+  checkCudaErrors(cudaSetDevice(0));
+
+  // set far field conditions and load them into constant memory on the gpu
+  {
+    float h_ff_variable[NVAR];
+    const float angle_of_attack =
+        float(3.1415926535897931 / 180.0f) * float(deg_angle_of_attack);
+
+    h_ff_variable[VAR_DENSITY] = float(1.4);
+
+    float ff_pressure = float(1.0f);
+    float ff_speed_of_sound =
+        sqrt(GAMMA * ff_pressure / h_ff_variable[VAR_DENSITY]);
+    float ff_speed = float(ff_mach) * ff_speed_of_sound;
+
+    float3 ff_velocity;
+    ff_velocity.x = ff_speed * float(cos((float)angle_of_attack));
+    ff_velocity.y = ff_speed * float(sin((float)angle_of_attack));
+    ff_velocity.z = 0.0f;
+
+    h_ff_variable[VAR_MOMENTUM + 0] =
+        h_ff_variable[VAR_DENSITY] * ff_velocity.x;
+    h_ff_variable[VAR_MOMENTUM + 1] =
+        h_ff_variable[VAR_DENSITY] * ff_velocity.y;
+    h_ff_variable[VAR_MOMENTUM + 2] =
+        h_ff_variable[VAR_DENSITY] * ff_velocity.z;
+
+    h_ff_variable[VAR_DENSITY_ENERGY] =
+        h_ff_variable[VAR_DENSITY] * (float(0.5f) * (ff_speed * ff_speed)) +
+        (ff_pressure / float(GAMMA - 1.0f));
+
+    float3 h_ff_momentum;
+    h_ff_momentum.x = *(h_ff_variable + VAR_MOMENTUM + 0);
+    h_ff_momentum.y = *(h_ff_variable + VAR_MOMENTUM + 1);
+    h_ff_momentum.z = *(h_ff_variable + VAR_MOMENTUM + 2);
+    float3 h_ff_flux_contribution_momentum_x;
+    float3 h_ff_flux_contribution_momentum_y;
+    float3 h_ff_flux_contribution_momentum_z;
+    float3 h_ff_flux_contribution_density_energy;
+    compute_flux_contribution(h_ff_variable[VAR_DENSITY], h_ff_momentum,
+                              h_ff_variable[VAR_DENSITY_ENERGY], ff_pressure,
+                              ff_velocity, h_ff_flux_contribution_momentum_x,
+                              h_ff_flux_contribution_momentum_y,
+                              h_ff_flux_contribution_momentum_z,
+                              h_ff_flux_contribution_density_energy);
+
+    // copy far field conditions to the gpu
+    checkCudaErrors(
+        cudaMemcpyToSymbol(ff_variable, h_ff_variable, NVAR * sizeof(float)));
+    checkCudaErrors(cudaMemcpyToSymbol(ff_flux_contribution_momentum_x,
+                                       &h_ff_flux_contribution_momentum_x,
+                                       sizeof(float3)));
+    checkCudaErrors(cudaMemcpyToSymbol(ff_flux_contribution_momentum_y,
+                                       &h_ff_flux_contribution_momentum_y,
+                                       sizeof(float3)));
+    checkCudaErrors(cudaMemcpyToSymbol(ff_flux_contribution_momentum_z,
+                                       &h_ff_flux_contribution_momentum_z,
+                                       sizeof(float3)));
+
+    checkCudaErrors(cudaMemcpyToSymbol(ff_flux_contribution_density_energy,
+                                       &h_ff_flux_contribution_density_energy,
+                                       sizeof(float3)));
+  }
+  int nel;
+  int nelr;
+
+  // read in domain geometry
+  float *areas;
+  int *elements_surrounding_elements;
+  float *normals;
+  {
+    std::ifstream file(data_file_name);
+    if (!file.is_open()) {
+      std::cerr << "could not open data file " << data_file_name << std::endl;
+      return 1;
+    }
+
+    // Without a positive element count the padding below would index
+    // element -1 and the kernels would be launched with an empty grid.
+    if (!(file >> nel) || nel <= 0) {
+      std::cerr << "invalid element count in " << data_file_name << std::endl;
+      return 1;
+    }
+
+    // round nel up to the next multiple of BLOCK_SIZE_0
+    nelr =
+        BLOCK_SIZE_0 * ((nel / BLOCK_SIZE_0) + std::min(1, nel % BLOCK_SIZE_0));
+
+    float *h_areas = new float[nelr];
+    int *h_elements_surrounding_elements = new int[nelr * NNB];
+    float *h_normals = new float[nelr * NDIM * NNB];
+
+    // read in data
+    for (int i = 0; i < nel; i++) {
+      file >> h_areas[i];
+      for (int j = 0; j < NNB; j++) {
+        file >> h_elements_surrounding_elements[i + j * nelr];
+        // clamp every negative id to -1 before the shift below
+        if (h_elements_surrounding_elements[i + j * nelr] < 0)
+          h_elements_surrounding_elements[i + j * nelr] = -1;
+        h_elements_surrounding_elements[i + j * nelr]--; // it's coming in with
+                                                         // Fortran numbering
+
+        // normals are stored with their sign flipped
+        for (int k = 0; k < NDIM; k++) {
+          file >> h_normals[i + (j + k * NNB) * nelr];
+          h_normals[i + (j + k * NNB) * nelr] =
+              -h_normals[i + (j + k * NNB) * nelr];
+        }
+      }
+    }
+
+    // fill in remaining data
+    int last = nel - 1;
+    for (int i = nel; i < nelr; i++) {
+      h_areas[i] = h_areas[last];
+      for (int j = 0; j < NNB; j++) {
+        // duplicate the last element
+        h_elements_surrounding_elements[i + j * nelr] =
+            h_elements_surrounding_elements[last + j * nelr];
+        // The destination must be element i: assigning `last` onto itself
+        // would leave the padded normals uninitialised.
+        for (int k = 0; k < NDIM; k++)
+          h_normals[i + (j + k * NNB) * nelr] =
+              h_normals[last + (j + k * NNB) * nelr];
+      }
+    }
+
+    areas = alloc<float>(nelr);
+    upload<float>(areas, h_areas, nelr);
+
+    elements_surrounding_elements = alloc<int>(nelr * NNB);
+    upload<int>(elements_surrounding_elements, h_elements_surrounding_elements,
+                nelr * NNB);
+
+    normals = alloc<float>(nelr * NDIM * NNB);
+    upload<float>(normals, h_normals, nelr * NDIM * NNB);
+
+    delete[] h_areas;
+    delete[] h_elements_surrounding_elements;
+    delete[] h_normals;
+  }
+
+  // Create arrays and set initial conditions
+  float *variables = alloc<float>(nelr * NVAR);
+  initialize_variables(nelr, variables);
+
+  float *old_variables = alloc<float>(nelr * NVAR);
+  float *fluxes = alloc<float>(nelr * NVAR);
+  float *step_factors = alloc<float>(nelr);
+
+  // make sure all memory is fully allocated before we start timing
+  initialize_variables(nelr, old_variables);
+  initialize_variables(nelr, fluxes);
+  cudaMemset((void *)step_factors, 0, sizeof(float) * nelr);
+  // make sure CUDA isn't still doing something before we start timing
+  // NOTE(review): cudaThreadSynchronize is deprecated in the CUDA runtime in
+  // favour of cudaDeviceSynchronize; kept as-is pending confirmation that the
+  // target runtime provides the replacement.
+  cudaThreadSynchronize();
+
+  // these need to be computed the first time in order to compute time step
+  std::cout << "Starting..." << std::endl;
+
+  StopWatchInterface *timer = 0;
+  sdkCreateTimer(&timer);
+  sdkStartTimer(&timer);
+  // Begin iterations
+  for (int i = 0; i < iterations; i++) {
+    copy<float>(old_variables, variables, nelr * NVAR);
+
+    // the step factors are recomputed at the start of every iteration
+    compute_step_factor(nelr, variables, areas, step_factors);
+    getLastCudaError("compute_step_factor failed");
+
+    for (int j = 0; j < RK; j++) {
+      compute_flux(nelr, elements_surrounding_elements, normals, variables,
+                   fluxes);
+      getLastCudaError("compute_flux failed");
+      time_step(j, nelr, old_variables, variables, step_factors, fluxes);
+      getLastCudaError("time_step failed");
+    }
+  }
+
+  cudaThreadSynchronize();
+  sdkStopTimer(&timer);
+
+  std::cout << (sdkGetAverageTimerValue(&timer) / 1000.0) / iterations
+            << " seconds per iteration" << std::endl;
+
+  std::cout << "Saving solution..." << std::endl;
+  dump(variables, nel, nelr);
+  std::cout << "Saved solution..." << std::endl;
+
+  std::cout << "Cleaning up..." << std::endl;
+  dealloc<float>(areas);
+  dealloc<int>(elements_surrounding_elements);
+  dealloc<float>(normals);
+
+  dealloc<float>(variables);
+  dealloc<float>(old_variables);
+  dealloc<float>(fluxes);
+  dealloc<float>(step_factors);
+
+  std::cout << "Done..." << std::endl;
+
+  return 0;
+}
--- a/examples/cfd/run.sh
+++ b/examples/cfd/run.sh
@ -0,0 +1,15 @@
+# # #!/bin/bash
+clang++ euler3d.cu -I/usr/local/cuda-10.1/samples/common/inc --cuda-path=/usr/local/cuda-10.1 --cuda-gpu-arch=sm_50 -L/usr/local/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v
+
+/home/robinhan/repo/open_source_template/build/compilation/kernelTranslator euler3d-cuda-nvptx64-nvidia-cuda-sm_50.bc kernel.bc
+/home/robinhan/repo/open_source_template/build/compilation/hostTranslator euler3d-host-x86_64-unknown-linux-gnu.bc  host.bc
+
+llc --relocation-model=pic --filetype=obj  kernel.bc
+llc --relocation-model=pic --filetype=obj  host.bc
+
+g++ -Wall -L/home/robinhan/repo/open_source_template/build/runtime  -L/home/robinhan/repo/open_source_template/build/runtime/threadPool -o a.out -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread
+
+./a.out ../rodinia-data/cfd/fvcorr.domn.097K
+# ./demo 1024
+# # # ./demo -f ../../data/matrix3.txt
+# # # run -f ../../data/gaussian/matrix3.txt
--- a/examples/gauss/gaussian-cuda-nvptx64-nvidia-cuda-sm_61.ll
+++ b/examples/gauss/gaussian-cuda-nvptx64-nvidia-cuda-sm_61.ll
@ -0,0 +1,396 @@
+; ModuleID = 'gaussian-cuda-nvptx64-nvidia-cuda-sm_61.bc'
+source_filename = "gaussian.cu"
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+%struct.__cuda_builtin_threadIdx_t = type { i8 }
+%struct.__cuda_builtin_blockIdx_t = type { i8 }
+%struct.__cuda_builtin_blockDim_t = type { i8 }
+%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
+
+$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
+
+$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
+
+$_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv = comdat any
+
+$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv = comdat any
+
+$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv = comdat any
+
+$_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv = comdat any
+
+@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
+@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
+@blockDim = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockDim_t, align 1
+
+; Weak definition of cudaMalloc in the device module: it only spills %p and %s
+; to stack slots and returns the constant 999; nothing is allocated.
+; NOTE(review): 999 is presumably cudaErrorUnknown -- confirm against the CUDA
+; headers this file was generated with.
+; Function Attrs: convergent noinline nounwind optnone
+define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
+entry:
+  %p.addr = alloca i8**, align 8
+  %s.addr = alloca i64, align 8
+  store i8** %p, i8*** %p.addr, align 8
+  store i64 %s, i64* %s.addr, align 8
+  ret i32 999
+}
+
+; Weak definition of cudaFuncGetAttributes in the device module: the arguments
+; are spilled to stack slots, *%p is never written, and 999 is returned.
+; Function Attrs: convergent noinline nounwind optnone
+define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
+entry:
+  %p.addr = alloca %struct.cudaFuncAttributes*, align 8
+  %c.addr = alloca i8*, align 8
+  store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
+  store i8* %c, i8** %c.addr, align 8
+  ret i32 999
+}
+
+; Weak definition of cudaDeviceGetAttribute in the device module: the
+; arguments are spilled to stack slots, *%value is never written, and 999 is
+; returned.
+; Function Attrs: convergent noinline nounwind optnone
+define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
+entry:
+  %value.addr = alloca i32*, align 8
+  %attr.addr = alloca i32, align 4
+  %device.addr = alloca i32, align 4
+  store i32* %value, i32** %value.addr, align 8
+  store i32 %attr, i32* %attr.addr, align 4
+  store i32 %device, i32* %device.addr, align 4
+  ret i32 999
+}
+
+; Weak definition of cudaGetDevice in the device module: the pointer argument
+; is spilled to a stack slot, *%device is never written, and 999 is returned.
+; Function Attrs: convergent noinline nounwind optnone
+define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
+entry:
+  %device.addr = alloca i32*, align 8
+  store i32* %device, i32** %device.addr, align 8
+  ret i32 999
+}
+
+; Weak definition of cudaOccupancyMaxActiveBlocksPerMultiprocessor in the
+; device module: the arguments are spilled to stack slots, *%numBlocks is
+; never written, and 999 is returned.
+; Function Attrs: convergent noinline nounwind optnone
+define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
+entry:
+  %numBlocks.addr = alloca i32*, align 8
+  %func.addr = alloca i8*, align 8
+  %blockSize.addr = alloca i32, align 4
+  %dynamicSmemSize.addr = alloca i64, align 8
+  store i32* %numBlocks, i32** %numBlocks.addr, align 8
+  store i8* %func, i8** %func.addr, align 8
+  store i32 %blockSize, i32* %blockSize.addr, align 4
+  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
+  ret i32 999
+}
+
+; Weak definition of cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags in
+; the device module: the arguments are spilled to stack slots, *%numBlocks is
+; never written, and 999 is returned.
+; Function Attrs: convergent noinline nounwind optnone
+define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
+entry:
+  %numBlocks.addr = alloca i32*, align 8
+  %func.addr = alloca i8*, align 8
+  %blockSize.addr = alloca i32, align 4
+  %dynamicSmemSize.addr = alloca i64, align 8
+  %flags.addr = alloca i32, align 4
+  store i32* %numBlocks, i32** %numBlocks.addr, align 8
+  store i8* %func, i8** %func.addr, align 8
+  store i32 %blockSize, i32* %blockSize.addr, align 4
+  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
+  store i32 %flags, i32* %flags.addr, align 4
+  ret i32 999
+}
+
+; Kernel Fan1(float* m_cuda, float* a_cuda, int Size, int t) (Itanium-mangled
+; name _Z4Fan1PfS_ii), unoptimised.
+; With idx = tid.x + ctaid.x * ntid.x, a thread returns immediately when
+; idx >= Size - 1 - t (unsigned compare); otherwise it stores
+;   m_cuda[Size * (idx + t + 1) + t] =
+;       a_cuda[Size * (idx + t + 1) + t] / a_cuda[Size * t + t]
+; Function Attrs: convergent noinline nounwind optnone
+define dso_local void @_Z4Fan1PfS_ii(float* %m_cuda, float* %a_cuda, i32 %Size, i32 %t) #0 {
+entry:
+  %m_cuda.addr = alloca float*, align 8
+  %a_cuda.addr = alloca float*, align 8
+  %Size.addr = alloca i32, align 4
+  %t.addr = alloca i32, align 4
+  store float* %m_cuda, float** %m_cuda.addr, align 8
+  store float* %a_cuda, float** %a_cuda.addr, align 8
+  store i32 %Size, i32* %Size.addr, align 4
+  store i32 %t, i32* %t.addr, align 4
+  ; %add = tid.x + ctaid.x * ntid.x; branch to the early return when
+  ; %add >= Size - 1 - t
+  %call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
+  %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
+  %call2 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
+  %mul = mul i32 %call1, %call2
+  %add = add i32 %call, %mul
+  %0 = load i32, i32* %Size.addr, align 4
+  %sub = sub nsw i32 %0, 1
+  %1 = load i32, i32* %t.addr, align 4
+  %sub3 = sub nsw i32 %sub, %1
+  %cmp = icmp uge i32 %add, %sub3
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  br label %return
+
+if.end:                                           ; preds = %entry
+  ; %6 = a_cuda[Size * (ntid.x * ctaid.x + tid.x + t + 1) + t]
+  %2 = load float*, float** %a_cuda.addr, align 8
+  %3 = load i32, i32* %Size.addr, align 4
+  %call4 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
+  %call5 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
+  %mul6 = mul i32 %call4, %call5
+  %call7 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
+  %add8 = add i32 %mul6, %call7
+  %4 = load i32, i32* %t.addr, align 4
+  %add9 = add i32 %add8, %4
+  %add10 = add i32 %add9, 1
+  %mul11 = mul i32 %3, %add10
+  %idx.ext = zext i32 %mul11 to i64
+  %add.ptr = getelementptr inbounds float, float* %2, i64 %idx.ext
+  %5 = load i32, i32* %t.addr, align 4
+  %idx.ext12 = sext i32 %5 to i64
+  %add.ptr13 = getelementptr inbounds float, float* %add.ptr, i64 %idx.ext12
+  %6 = load float, float* %add.ptr13, align 4
+  ; %11 = a_cuda[Size * t + t]
+  %7 = load float*, float** %a_cuda.addr, align 8
+  %8 = load i32, i32* %Size.addr, align 4
+  %9 = load i32, i32* %t.addr, align 4
+  %mul14 = mul nsw i32 %8, %9
+  %idx.ext15 = sext i32 %mul14 to i64
+  %add.ptr16 = getelementptr inbounds float, float* %7, i64 %idx.ext15
+  %10 = load i32, i32* %t.addr, align 4
+  %idx.ext17 = sext i32 %10 to i64
+  %add.ptr18 = getelementptr inbounds float, float* %add.ptr16, i64 %idx.ext17
+  %11 = load float, float* %add.ptr18, align 4
+  %div = fdiv float %6, %11
+  ; store the quotient to m_cuda at the same offset %6 was read from a_cuda
+  %12 = load float*, float** %m_cuda.addr, align 8
+  %13 = load i32, i32* %Size.addr, align 4
+  %call19 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
+  %call20 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
+  %mul21 = mul i32 %call19, %call20
+  %call22 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
+  %add23 = add i32 %mul21, %call22
+  %14 = load i32, i32* %t.addr, align 4
+  %add24 = add i32 %add23, %14
+  %add25 = add i32 %add24, 1
+  %mul26 = mul i32 %13, %add25
+  %idx.ext27 = zext i32 %mul26 to i64
+  %add.ptr28 = getelementptr inbounds float, float* %12, i64 %idx.ext27
+  %15 = load i32, i32* %t.addr, align 4
+  %idx.ext29 = sext i32 %15 to i64
+  %add.ptr30 = getelementptr inbounds float, float* %add.ptr28, i64 %idx.ext29
+  store float %div, float* %add.ptr30, align 4
+  br label %return
+
+return:                                           ; preds = %if.end, %if.then
+  ret void
+}
+
+; Accessor behind threadIdx.x: returns the value of the
+; llvm.nvvm.read.ptx.sreg.tid.x intrinsic.
+; Function Attrs: alwaysinline convergent nounwind
+define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
+entry:
+  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  ret i32 %0
+}
+
+; Accessor behind blockIdx.x: returns the value of the
+; llvm.nvvm.read.ptx.sreg.ctaid.x intrinsic.
+; Function Attrs: alwaysinline convergent nounwind
+define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
+entry:
+  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
+  ret i32 %0
+}
+
+; Accessor behind blockDim.x: returns the value of the
+; llvm.nvvm.read.ptx.sreg.ntid.x intrinsic.
+; Function Attrs: alwaysinline convergent nounwind
+define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #1 comdat align 2 {
+entry:
+  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  ret i32 %0
+}
+
+; Kernel Fan2(float* m_cuda, float* a_cuda, float* b_cuda, int Size, int j1,
+; int t) (Itanium-mangled name _Z4Fan2PfS_S_iii), unoptimised.
+; With xidx = ctaid.x * ntid.x + tid.x and yidx = ctaid.y * ntid.y + tid.y, a
+; thread returns immediately when xidx >= Size - 1 - t or yidx >= Size - t
+; (both unsigned compares). Otherwise it performs
+;   a_cuda[Size * (xidx + 1 + t) + (yidx + t)] -=
+;       m_cuda[Size * (xidx + 1 + t) + t] * a_cuda[Size * t + (yidx + t)]
+; and, only when yidx == 0,
+;   b_cuda[xidx + 1 + t] -=
+;       m_cuda[Size * (xidx + 1 + t) + (yidx + t)] * b_cuda[t]
+; %j1 is stored to its stack slot but never loaded.
+; Function Attrs: convergent noinline nounwind optnone
+define dso_local void @_Z4Fan2PfS_S_iii(float* %m_cuda, float* %a_cuda, float* %b_cuda, i32 %Size, i32 %j1, i32 %t) #0 {
+entry:
+  %m_cuda.addr = alloca float*, align 8
+  %a_cuda.addr = alloca float*, align 8
+  %b_cuda.addr = alloca float*, align 8
+  %Size.addr = alloca i32, align 4
+  %j1.addr = alloca i32, align 4
+  %t.addr = alloca i32, align 4
+  %xidx = alloca i32, align 4
+  %yidx = alloca i32, align 4
+  store float* %m_cuda, float** %m_cuda.addr, align 8
+  store float* %a_cuda, float** %a_cuda.addr, align 8
+  store float* %b_cuda, float** %b_cuda.addr, align 8
+  store i32 %Size, i32* %Size.addr, align 4
+  store i32 %j1, i32* %j1.addr, align 4
+  store i32 %t, i32* %t.addr, align 4
+  ; x-dimension guard: leave when tid.x + ctaid.x * ntid.x >= Size - 1 - t
+  %call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
+  %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
+  %call2 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
+  %mul = mul i32 %call1, %call2
+  %add = add i32 %call, %mul
+  %0 = load i32, i32* %Size.addr, align 4
+  %sub = sub nsw i32 %0, 1
+  %1 = load i32, i32* %t.addr, align 4
+  %sub3 = sub nsw i32 %sub, %1
+  %cmp = icmp uge i32 %add, %sub3
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  br label %if.end58
+
+if.end:                                           ; preds = %entry
+  ; y-dimension guard: leave when tid.y + ctaid.y * ntid.y >= Size - t
+  %call4 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #3
+  %call5 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #3
+  %call6 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #3
+  %mul7 = mul i32 %call5, %call6
+  %add8 = add i32 %call4, %mul7
+  %2 = load i32, i32* %Size.addr, align 4
+  %3 = load i32, i32* %t.addr, align 4
+  %sub9 = sub nsw i32 %2, %3
+  %cmp10 = icmp uge i32 %add8, %sub9
+  br i1 %cmp10, label %if.then11, label %if.end12
+
+if.then11:                                        ; preds = %if.end
+  br label %if.end58
+
+if.end12:                                         ; preds = %if.end
+  ; materialise xidx and yidx in their stack slots
+  %call13 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
+  %call14 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
+  %mul15 = mul i32 %call13, %call14
+  %call16 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
+  %add17 = add i32 %mul15, %call16
+  store i32 %add17, i32* %xidx, align 4
+  %call18 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #3
+  %call19 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #3
+  %mul20 = mul i32 %call18, %call19
+  %call21 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #3
+  %add22 = add i32 %mul20, %call21
+  store i32 %add22, i32* %yidx, align 4
+  ; %9 = m_cuda[Size * (xidx + 1 + t) + t]
+  %4 = load float*, float** %m_cuda.addr, align 8
+  %5 = load i32, i32* %Size.addr, align 4
+  %6 = load i32, i32* %xidx, align 4
+  %add23 = add nsw i32 %6, 1
+  %7 = load i32, i32* %t.addr, align 4
+  %add24 = add nsw i32 %add23, %7
+  %mul25 = mul nsw i32 %5, %add24
+  %8 = load i32, i32* %t.addr, align 4
+  %add26 = add nsw i32 %mul25, %8
+  %idxprom = sext i32 %add26 to i64
+  %arrayidx = getelementptr inbounds float, float* %4, i64 %idxprom
+  %9 = load float, float* %arrayidx, align 4
+  ; %15 = a_cuda[Size * t + (yidx + t)]
+  %10 = load float*, float** %a_cuda.addr, align 8
+  %11 = load i32, i32* %Size.addr, align 4
+  %12 = load i32, i32* %t.addr, align 4
+  %mul27 = mul nsw i32 %11, %12
+  %13 = load i32, i32* %yidx, align 4
+  %14 = load i32, i32* %t.addr, align 4
+  %add28 = add nsw i32 %13, %14
+  %add29 = add nsw i32 %mul27, %add28
+  %idxprom30 = sext i32 %add29 to i64
+  %arrayidx31 = getelementptr inbounds float, float* %10, i64 %idxprom30
+  %15 = load float, float* %arrayidx31, align 4
+  %mul32 = fmul contract float %9, %15
+  ; subtract the product from a_cuda[Size * (xidx + 1 + t) + (yidx + t)]
+  %16 = load float*, float** %a_cuda.addr, align 8
+  %17 = load i32, i32* %Size.addr, align 4
+  %18 = load i32, i32* %xidx, align 4
+  %add33 = add nsw i32 %18, 1
+  %19 = load i32, i32* %t.addr, align 4
+  %add34 = add nsw i32 %add33, %19
+  %mul35 = mul nsw i32 %17, %add34
+  %20 = load i32, i32* %yidx, align 4
+  %21 = load i32, i32* %t.addr, align 4
+  %add36 = add nsw i32 %20, %21
+  %add37 = add nsw i32 %mul35, %add36
+  %idxprom38 = sext i32 %add37 to i64
+  %arrayidx39 = getelementptr inbounds float, float* %16, i64 %idxprom38
+  %22 = load float, float* %arrayidx39, align 4
+  %sub40 = fsub contract float %22, %mul32
+  store float %sub40, float* %arrayidx39, align 4
+  ; only threads with yidx == 0 go on to update b_cuda
+  %23 = load i32, i32* %yidx, align 4
+  %cmp41 = icmp eq i32 %23, 0
+  br i1 %cmp41, label %if.then42, label %if.end58
+
+if.then42:                                        ; preds = %if.end12
+  ; %30 = m_cuda[Size * (xidx + 1 + t) + (yidx + t)]
+  %24 = load float*, float** %m_cuda.addr, align 8
+  %25 = load i32, i32* %Size.addr, align 4
+  %26 = load i32, i32* %xidx, align 4
+  %add43 = add nsw i32 %26, 1
+  %27 = load i32, i32* %t.addr, align 4
+  %add44 = add nsw i32 %add43, %27
+  %mul45 = mul nsw i32 %25, %add44
+  %28 = load i32, i32* %yidx, align 4
+  %29 = load i32, i32* %t.addr, align 4
+  %add46 = add nsw i32 %28, %29
+  %add47 = add nsw i32 %mul45, %add46
+  %idxprom48 = sext i32 %add47 to i64
+  %arrayidx49 = getelementptr inbounds float, float* %24, i64 %idxprom48
+  %30 = load float, float* %arrayidx49, align 4
+  ; %33 = b_cuda[t]
+  %31 = load float*, float** %b_cuda.addr, align 8
+  %32 = load i32, i32* %t.addr, align 4
+  %idxprom50 = sext i32 %32 to i64
+  %arrayidx51 = getelementptr inbounds float, float* %31, i64 %idxprom50
+  %33 = load float, float* %arrayidx51, align 4
+  %mul52 = fmul contract float %30, %33
+  ; subtract the product from b_cuda[xidx + 1 + t]
+  %34 = load float*, float** %b_cuda.addr, align 8
+  %35 = load i32, i32* %xidx, align 4
+  %add53 = add nsw i32 %35, 1
+  %36 = load i32, i32* %t.addr, align 4
+  %add54 = add nsw i32 %add53, %36
+  %idxprom55 = sext i32 %add54 to i64
+  %arrayidx56 = getelementptr inbounds float, float* %34, i64 %idxprom55
+  %37 = load float, float* %arrayidx56, align 4
+  %sub57 = fsub contract float %37, %mul52
+  store float %sub57, float* %arrayidx56, align 4
+  br label %if.end58
+
+if.end58:                                         ; preds = %if.then, %if.then11, %if.then42, %if.end12
+  ret void
+}
+
+; Function Attrs: alwaysinline convergent nounwind
+; __cuda_builtin_threadIdx_t::__fetch_builtin_y(): returns the result of the
+; llvm.nvvm.read.ptx.sreg.tid.y intrinsic unchanged.
+define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
+entry:
+  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
+  ret i32 %0
+}
+
+; Function Attrs: alwaysinline convergent nounwind
+; __cuda_builtin_blockIdx_t::__fetch_builtin_y(): returns the result of the
+; llvm.nvvm.read.ptx.sreg.ctaid.y intrinsic unchanged.
+define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
+entry:
+  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
+  ret i32 %0
+}
+
+; Function Attrs: alwaysinline convergent nounwind
+; __cuda_builtin_blockDim_t::__fetch_builtin_y(): returns the result of the
+; llvm.nvvm.read.ptx.sreg.ntid.y intrinsic unchanged.
+define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #1 comdat align 2 {
+entry:
+  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
+  ret i32 %0
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #2
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #2
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y() #2
+
+attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone }
+attributes #3 = { convergent nounwind }
+
+!llvm.module.flags = !{!0, !1, !2}
+!nvvm.annotations = !{!3, !4, !5, !6, !5, !7, !7, !7, !7, !8, !8, !7}
+!llvm.ident = !{!9}
+!nvvmir.version = !{!10}
+
+!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
+!1 = !{i32 1, !"wchar_size", i32 4}
+!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
+!3 = !{void (float*, float*, i32, i32)* @_Z4Fan1PfS_ii, !"kernel", i32 1}
+!4 = !{void (float*, float*, float*, i32, i32, i32)* @_Z4Fan2PfS_S_iii, !"kernel", i32 1}
+!5 = !{null, !"align", i32 8}
+!6 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
+!7 = !{null, !"align", i32 16}
+!8 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
+!9 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
+!10 = !{i32 1, i32 4}
--- a/examples/gauss/gaussian-host-x86_64-unknown-linux-gnu.ll
+++ b/examples/gauss/gaussian-host-x86_64-unknown-linux-gnu.ll
--- a/examples/gauss/gaussian.cu
+++ b/examples/gauss/gaussian.cu
@ -0,0 +1,522 @@
+/*-----------------------------------------------------------
+ ** gaussian.cu -- The program is to solve a linear system Ax = b
+ **   by using Gaussian Elimination. The algorithm on page 101
+ **   ("Foundations of Parallel Programming") is used.
+ **   The sequential version is gaussian.c.  This parallel
+ **   implementation converts three independent for() loops
+ **   into three Fans.  Use the data file ge_3.dat to verify
+ **   the correctness of the output.
+ **
+ ** Written by Andreas Kura, 02/15/95
+ ** Modified by Chong-wei Xu, 04/20/95
+ ** Modified by Chris Gregg for CUDA, 07/20/2009
+ **-----------------------------------------------------------
+ */
+#include "cuda_runtime.h"
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+
+#ifdef TIMING
+#include "timing.h"
+#endif
+
+#ifdef RD_WG_SIZE_0_0
+#define MAXBLOCKSIZE RD_WG_SIZE_0_0
+#elif defined(RD_WG_SIZE_0)
+#define MAXBLOCKSIZE RD_WG_SIZE_0
+#elif defined(RD_WG_SIZE)
+#define MAXBLOCKSIZE RD_WG_SIZE
+#else
+#define MAXBLOCKSIZE 512
+#endif
+
+// 2D defines. Go from specific to general
+#ifdef RD_WG_SIZE_1_0
+#define BLOCK_SIZE_XY RD_WG_SIZE_1_0
+#elif defined(RD_WG_SIZE_1)
+#define BLOCK_SIZE_XY RD_WG_SIZE_1
+#elif defined(RD_WG_SIZE)
+#define BLOCK_SIZE_XY RD_WG_SIZE
+#else
+#define BLOCK_SIZE_XY 1
+#endif
+
+#ifdef TIMING
+struct timeval tv;
+struct timeval tv_total_start, tv_total_end;
+struct timeval tv_h2d_start, tv_h2d_end;
+struct timeval tv_d2h_start, tv_d2h_end;
+struct timeval tv_kernel_start, tv_kernel_end;
+struct timeval tv_mem_alloc_start, tv_mem_alloc_end;
+struct timeval tv_close_start, tv_close_end;
+float init_time = 0, mem_alloc_time = 0, h2d_time = 0, kernel_time = 0,
+      d2h_time = 0, close_time = 0, total_time = 0;
+#endif
+
+int Size;
+float *a, *b, *finalVec;
+float *m;
+
+FILE *fp;
+
+void InitProblemOnce(char *filename);
+void InitPerRun();
+void ForwardSub();
+void BackSub();
+__global__ void Fan1(float *m, float *a, int Size, int t);
+__global__ void Fan2(float *m, float *a, float *b, int Size, int j1, int t);
+void InitMat(float *ary, int nrow, int ncol);
+void InitAry(float *ary, int ary_size);
+void PrintMat(float *ary, int nrow, int ncolumn);
+void PrintAry(float *ary, int ary_size);
+void PrintDeviceProperties();
+void checkCUDAError(const char *msg);
+
+unsigned int totalKernelTime = 0;
+
+// create both matrix and right hand side, Ke Wang 2013/08/12 11:51:06
+/**
+ * Fill the size x size row-major matrix `m` with a symmetric Toeplitz
+ * pattern: m[i][j] = 10 * exp(-0.01 * |i - j|).
+ *
+ * @param m    Output buffer holding at least size * size floats.
+ * @param size Matrix dimension; nothing is written when size <= 0.
+ *
+ * The coefficient table is heap-allocated: a variable-length array is not
+ * standard C++ and would live on the stack, which is unsafe for large sizes.
+ * The process exits with EXIT_FAILURE if that allocation fails.
+ */
+void create_matrix(float *m, int size) {
+  if (size <= 0)
+    return;
+
+  const int coe_len = 2 * size - 1;
+  float *coe = (float *)malloc(coe_len * sizeof(float));
+  if (coe == NULL) {
+    fprintf(stderr, "create_matrix: cannot allocate %d coefficients\n",
+            coe_len);
+    exit(EXIT_FAILURE);
+  }
+
+  float lamda = -0.01;
+
+  // coe[size - 1 + d] == coe[size - 1 - d] == 10 * exp(lamda * d)
+  for (int i = 0; i < size; i++) {
+    float coe_i = 10 * exp(lamda * i);
+    coe[size - 1 + i] = coe_i;
+    coe[size - 1 - i] = coe_i;
+  }
+
+  // Element (i, j) picks the coefficient for the signed distance j - i.
+  for (int i = 0; i < size; i++) {
+    for (int j = 0; j < size; j++) {
+      m[i * size + j] = coe[size - 1 - i + j];
+    }
+  }
+
+  free(coe);
+}
+
+/**
+ * Program entry point.
+ *
+ * Parses the command line (-s size / -f filename / -q), builds or loads the
+ * linear system into the globals a, b and m, runs ForwardSub() and BackSub(),
+ * prints the results (unless -q was given) and the measured times.
+ *
+ * With no arguments the usage text is printed and the program exits with 0.
+ * An -s or -f option without its value terminates with EXIT_FAILURE.
+ */
+int main(int argc, char *argv[]) {
+  printf("WG size of kernel 1 = %d, WG size of kernel 2= %d X %d\n",
+         MAXBLOCKSIZE, BLOCK_SIZE_XY, BLOCK_SIZE_XY);
+  int verbose = 1;
+  int i, j;
+  char flag;
+  if (argc < 2) {
+    printf("Usage: gaussian -f filename / -s size [-q]\n\n");
+    printf("-q (quiet) suppresses printing the matrix and result values.\n");
+    printf("-f (filename) path of input file\n");
+    printf(
+        "-s (size) size of matrix. Create matrix and rhs in this program \n");
+    printf(
+        "The first line of the file contains the dimension of the matrix, n.");
+    printf("The second line of the file is a newline.\n");
+    printf("The next n lines contain n tab separated values for the matrix.");
+    printf("The next line of the file is a newline.\n");
+    printf("The next line of the file is a 1xn vector with tab separated "
+           "values.\n");
+    printf("The next line of the file is a newline. (optional)\n");
+    printf("The final line of the file is the pre-computed solution. "
+           "(optional)\n");
+    printf("Example: matrix4.txt:\n");
+    printf("4\n");
+    printf("\n");
+    printf("-0.6	-0.5	0.7	0.3\n");
+    printf("-0.3	-0.9	0.3	0.7\n");
+    printf("-0.4	-0.5	-0.3	-0.8\n");
+    printf("0.0	-0.1	0.2	0.9\n");
+    printf("\n");
+    printf("-0.85	-0.68	0.24	-0.53\n");
+    printf("\n");
+    printf("0.7	0.0	-0.4	-0.5\n");
+    exit(0);
+  }
+
+  cudaSetDevice(0);
+
+  PrintDeviceProperties();
+
+  for (i = 1; i < argc; i++) {
+    if (argv[i][0] == '-') { // flag
+      flag = argv[i][1];
+      switch (flag) {
+      case 's': // size: generate the matrix and right-hand side internally
+        // The value is the next argument; refuse to read past argv.
+        if (i + 1 >= argc) {
+          fprintf(stderr, "Option -s requires a matrix size\n");
+          exit(EXIT_FAILURE);
+        }
+        i++;
+        Size = atoi(argv[i]);
+        printf("Create matrix internally in parse, size = %d \n", Size);
+
+        a = (float *)malloc(Size * Size * sizeof(float));
+        create_matrix(a, Size);
+
+        b = (float *)malloc(Size * sizeof(float));
+        for (j = 0; j < Size; j++)
+          b[j] = 1.0;
+
+        m = (float *)malloc(Size * Size * sizeof(float));
+        break;
+      case 'f': // filename: load the system from an input file
+        if (i + 1 >= argc) {
+          fprintf(stderr, "Option -f requires a file name\n");
+          exit(EXIT_FAILURE);
+        }
+        i++;
+        printf("Read file from %s \n", argv[i]);
+        InitProblemOnce(argv[i]);
+        break;
+      case 'q': // quiet: suppress matrix/vector printing, as the usage says
+        verbose = 0;
+        break;
+      }
+    }
+  }
+
+  InitPerRun();
+  // begin timing
+  struct timeval time_start;
+  gettimeofday(&time_start, NULL);
+
+  // run kernels
+  ForwardSub();
+
+  // end timing
+  struct timeval time_end;
+  gettimeofday(&time_end, NULL);
+  unsigned int time_total = (time_end.tv_sec * 1000000 + time_end.tv_usec) -
+                            (time_start.tv_sec * 1000000 + time_start.tv_usec);
+
+  if (verbose) {
+    printf("Matrix m is: \n");
+    PrintMat(m, Size, Size);
+
+    printf("Matrix a is: \n");
+    PrintMat(a, Size, Size);
+
+    printf("Array b is: \n");
+    PrintAry(b, Size);
+  }
+  BackSub();
+  if (verbose) {
+    printf("The final solution is: \n");
+    PrintAry(finalVec, Size);
+  }
+  printf("\nTime total (including memory transfers)\t%f sec\n",
+         time_total * 1e-6);
+  printf("Time for CUDA kernels:\t%f sec\n", totalKernelTime * 1e-6);
+
+  free(m);
+  free(a);
+  free(b);
+  free(finalVec); // allocated by BackSub()
+
+#ifdef TIMING
+  printf("Exec: %f\n", kernel_time);
+#endif
+  return 0;
+}
+/*------------------------------------------------------
+ ** PrintDeviceProperties
+ **-----------------------------------------------------
+ */
+/**
+ * Print the number of devices reported by cudaGetDeviceCount() followed by a
+ * property sheet for each one. When the properties of a device cannot be
+ * queried, the runtime's last error string is printed for it instead.
+ */
+void PrintDeviceProperties() {
+  int nDevCount = 0;
+  cudaGetDeviceCount(&nDevCount);
+  printf("Total Device found: %d", nDevCount);
+
+  for (int dev = 0; dev < nDevCount; ++dev) {
+    cudaDeviceProp props;
+    memset(&props, 0, sizeof(props));
+
+    // Query failed: report the error and move on to the next device.
+    if (cudaGetDeviceProperties(&props, dev) != cudaSuccess) {
+      printf("\n%s", cudaGetErrorString(cudaGetLastError()));
+      continue;
+    }
+
+    printf("\nDevice Name \t\t - %s ", props.name);
+    printf("\n**************************************");
+    printf("\nTotal Global Memory\t\t\t - %lu KB", props.totalGlobalMem / 1024);
+    printf("\nShared memory available per block \t - %lu KB",
+           props.sharedMemPerBlock / 1024);
+    printf("\nNumber of registers per thread block \t - %d",
+           props.regsPerBlock);
+    printf("\nWarp size in threads \t\t\t - %d", props.warpSize);
+    printf("\nMemory Pitch \t\t\t\t - %zu bytes", props.memPitch);
+    printf("\nMaximum threads per block \t\t - %d", props.maxThreadsPerBlock);
+    printf("\nMaximum Thread Dimension (block) \t - %d %d %d",
+           props.maxThreadsDim[0], props.maxThreadsDim[1],
+           props.maxThreadsDim[2]);
+    printf("\nMaximum Thread Dimension (grid) \t - %d %d %d",
+           props.maxGridSize[0], props.maxGridSize[1], props.maxGridSize[2]);
+    printf("\nTotal constant memory \t\t\t - %zu bytes", props.totalConstMem);
+    printf("\nCUDA ver \t\t\t\t - %d.%d", props.major, props.minor);
+    printf("\nClock rate \t\t\t\t - %d KHz", props.clockRate);
+    printf("\nTexture Alignment \t\t\t - %zu bytes", props.textureAlignment);
+    printf("\nDevice Overlap \t\t\t\t - %s",
+           props.deviceOverlap ? "Allowed" : "Not Allowed");
+    printf("\nNumber of Multi processors \t\t - %d\n\n",
+           props.multiProcessorCount);
+  }
+}
+
+/*------------------------------------------------------
+ ** InitProblemOnce -- Initialize all of matrices and
+ ** vectors by opening a data file specified by the user.
+ **
+ ** We used dynamic array *a, *b, and *m to allocate
+ ** the memory storages.
+ **------------------------------------------------------
+ */
+/**
+ * Load the linear system from `filename` into the globals Size, a and b, and
+ * allocate the (uninitialised) multiplier matrix m.
+ *
+ * The file starts with the dimension, followed by Size*Size matrix values
+ * and Size right-hand-side values. The process exits with EXIT_FAILURE when
+ * the file cannot be opened or the dimension is missing or not positive.
+ *
+ * @param filename Path of the input file.
+ */
+void InitProblemOnce(char *filename) {
+  printf("The file name is: %s\n", filename);
+
+  fp = fopen(filename, "r");
+  if (fp == NULL) {
+    fprintf(stderr, "Cannot open input file %s\n", filename);
+    exit(EXIT_FAILURE);
+  }
+
+  // A missing or non-positive dimension would make every allocation below
+  // meaningless, so stop early.
+  if (fscanf(fp, "%d", &Size) != 1 || Size <= 0) {
+    fprintf(stderr, "Invalid matrix dimension in %s\n", filename);
+    fclose(fp);
+    exit(EXIT_FAILURE);
+  }
+
+  a = (float *)malloc(Size * Size * sizeof(float));
+
+  InitMat(a, Size, Size);
+  printf("The input matrix a is:\n");
+  PrintMat(a, Size, Size);
+  b = (float *)malloc(Size * sizeof(float));
+
+  InitAry(b, Size);
+  printf("The input array b is:\n");
+  PrintAry(b, Size);
+
+  m = (float *)malloc(Size * Size * sizeof(float));
+
+  // InitMat() and InitAry() are the only readers of fp; release the handle.
+  fclose(fp);
+  fp = NULL;
+}
+
+/*------------------------------------------------------
+ ** InitPerRun() -- Initialize the contents of the
+ ** multiplier matrix m
+ **------------------------------------------------------
+ */
+// Zero every element of the global Size x Size multiplier matrix m.
+void InitPerRun() {
+  const int cells = Size * Size;
+  for (int idx = 0; idx < cells; ++idx)
+    m[idx] = 0.0;
+}
+
+/*-------------------------------------------------------
+ ** Fan1() -- Calculate multiplier matrix
+ ** Pay attention to the index.  The thread index i ranges
+ ** from 0 to range-1.  The actual row index must be
+ ** adjusted by the value of t, which is defined in
+ ** ForwardSub().
+ **-------------------------------------------------------
+ */
+// Kernel: for elimination step t, thread g (flat x index) handles row
+// g + t + 1 and stores a[row][t] / a[t][t] into m[row][t]. Threads whose
+// flat index is >= Size - 1 - t do nothing.
+__global__ void Fan1(float *m_cuda, float *a_cuda, int Size, int t) {
+  const unsigned int gid = threadIdx.x + blockIdx.x * blockDim.x;
+  if (gid >= Size - 1 - t)
+    return;
+
+  // Row below the pivot row t that this thread is responsible for.
+  const unsigned int row = gid + t + 1;
+  m_cuda[Size * row + t] = a_cuda[Size * row + t] / a_cuda[Size * t + t];
+}
+
+/*-------------------------------------------------------
+ ** Fan2() -- Modify the matrix A into LUD
+ **-------------------------------------------------------
+ */
+
+// Kernel: for elimination step t, thread (x, y) updates
+//   a[x+1+t][y+t] -= m[x+1+t][t] * a[t][y+t]
+// and the threads with y == 0 additionally update
+//   b[x+1+t] -= m[x+1+t][t] * b[t].
+// Threads with x >= Size-1-t or y >= Size-t do nothing. j1 is not used.
+__global__ void Fan2(float *m_cuda, float *a_cuda, float *b_cuda, int Size,
+                     int j1, int t) {
+  const bool x_out_of_range =
+      threadIdx.x + blockIdx.x * blockDim.x >= Size - 1 - t;
+  if (x_out_of_range)
+    return;
+  const bool y_out_of_range = threadIdx.y + blockIdx.y * blockDim.y >= Size - t;
+  if (y_out_of_range)
+    return;
+
+  int xidx = blockIdx.x * blockDim.x + threadIdx.x;
+  int yidx = blockIdx.y * blockDim.y + threadIdx.y;
+
+  const int row = xidx + 1 + t; // row being eliminated
+  const int col = yidx + t;     // column being updated
+
+  a_cuda[Size * row + col] =
+      a_cuda[Size * row + col] - m_cuda[Size * row + t] * a_cuda[Size * t + col];
+
+  // One thread per row (the y == 0 one, where col == t) updates b.
+  if (yidx == 0) {
+    b_cuda[row] = b_cuda[row] - m_cuda[Size * row + col] * b_cuda[t];
+  }
+}
+
+/*------------------------------------------------------
+ ** ForwardSub() -- Forward substitution of Gaussian
+ ** elimination.
+ **------------------------------------------------------
+ */
+/**
+ * Run the forward-elimination phase.
+ *
+ * Copies the globals m, a and b to device buffers, launches Fan1 and Fan2
+ * once for every step t in [0, Size-1), then copies the three buffers back
+ * and frees the device memory. The wall-clock time of the kernel loop is
+ * stored in totalKernelTime (microseconds); with TIMING defined it is also
+ * accumulated into kernel_time.
+ */
+void ForwardSub() {
+  int t;
+  float *m_cuda, *a_cuda, *b_cuda;
+
+  // allocate memory on GPU
+  cudaMalloc((void **)&m_cuda, Size * Size * sizeof(float));
+
+  cudaMalloc((void **)&a_cuda, Size * Size * sizeof(float));
+
+  cudaMalloc((void **)&b_cuda, Size * sizeof(float));
+
+  // copy memory to GPU
+  cudaMemcpy(m_cuda, m, Size * Size * sizeof(float), cudaMemcpyHostToDevice);
+  cudaMemcpy(a_cuda, a, Size * Size * sizeof(float), cudaMemcpyHostToDevice);
+  cudaMemcpy(b_cuda, b, Size * sizeof(float), cudaMemcpyHostToDevice);
+
+  int block_size, grid_size;
+
+  // 1-D launch for Fan1: enough blocks to cover Size (rounded up).
+  block_size = MAXBLOCKSIZE;
+  grid_size = (Size / block_size) + ((Size % block_size) != 0 ? 1 : 0);
+  printf("1d grid size: %d\n", grid_size);
+
+  dim3 dimBlock(block_size);
+  dim3 dimGrid(grid_size);
+
+  // 2-D launch for Fan2: the same round-up in each dimension.
+  int blockSize2d, gridSize2d;
+  blockSize2d = BLOCK_SIZE_XY;
+  gridSize2d = (Size / blockSize2d) + ((Size % blockSize2d) != 0 ? 1 : 0);
+
+  dim3 dimBlockXY(blockSize2d, blockSize2d);
+
+  printf("BlockXY: %d \n", blockSize2d);
+  dim3 dimGridXY(gridSize2d, gridSize2d);
+
+#ifdef TIMING
+  gettimeofday(&tv_kernel_start, NULL);
+#endif
+  printf("first grid size: %d second: %d\n", grid_size, gridSize2d);
+  // begin timing kernels
+  struct timeval time_start;
+  gettimeofday(&time_start, NULL);
+  for (t = 0; t < (Size - 1); t++) {
+    // Fan1 must complete before Fan2 reads the multipliers it wrote.
+    Fan1<<<dimGrid, dimBlock>>>(m_cuda, a_cuda, Size, t);
+    cudaDeviceSynchronize();
+    Fan2<<<dimGridXY, dimBlockXY>>>(m_cuda, a_cuda, b_cuda, Size, Size - t, t);
+    cudaDeviceSynchronize();
+    checkCUDAError("Fan2");
+  }
+  // end timing kernels
+  struct timeval time_end;
+  gettimeofday(&time_end, NULL);
+  totalKernelTime = (time_end.tv_sec * 1000000 + time_end.tv_usec) -
+                    (time_start.tv_sec * 1000000 + time_start.tv_usec);
+
+#ifdef TIMING
+  tvsub(&time_end, &tv_kernel_start, &tv);
+  kernel_time += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
+#endif
+
+  // copy memory back to CPU
+  cudaMemcpy(m, m_cuda, Size * Size * sizeof(float), cudaMemcpyDeviceToHost);
+  cudaMemcpy(a, a_cuda, Size * Size * sizeof(float), cudaMemcpyDeviceToHost);
+  cudaMemcpy(b, b_cuda, Size * sizeof(float), cudaMemcpyDeviceToHost);
+  cudaFree(m_cuda);
+  cudaFree(a_cuda);
+  cudaFree(b_cuda);
+}
+
+/*------------------------------------------------------
+ ** BackSub() -- Backward substitution
+ **------------------------------------------------------
+ */
+
+// Back substitution: allocates the global finalVec (Size floats) and fills
+// it from the last row upwards using the globals a and b.
+void BackSub() {
+  finalVec = (float *)malloc(Size * sizeof(float));
+
+  for (int row = Size - 1; row >= 0; row--) {
+    float acc = b[row];
+    // Subtract the contribution of the already-solved unknowns, walking
+    // from the last column back towards the diagonal.
+    for (int col = Size - 1; col > row; col--) {
+      acc -= a[Size * row + col] * finalVec[col];
+    }
+    finalVec[row] = acc / a[Size * row + row];
+  }
+}
+
+// Read nrow * ncol floats from the global file handle fp into ary.
+// Rows are stored with a stride of the global Size (not ncol).
+void InitMat(float *ary, int nrow, int ncol) {
+  for (int r = 0; r < nrow; ++r) {
+    float *row_start = ary + Size * r;
+    for (int c = 0; c < ncol; ++c) {
+      fscanf(fp, "%f", row_start + c);
+    }
+  }
+}
+
+/*------------------------------------------------------
+ ** PrintMat() -- Print the contents of the matrix
+ **------------------------------------------------------
+ */
+// Matrix printing is currently disabled: the function returns immediately,
+// so the loop below never runs. It is kept so the output can be re-enabled
+// by removing the early return.
+void PrintMat(float *ary, int nrow, int ncol) {
+  return;
+
+  for (int r = 0; r < nrow; ++r) {
+    const float *row_start = ary + Size * r;
+    for (int c = 0; c < ncol; ++c) {
+      printf("%8.2f ", row_start[c]);
+    }
+    printf("\n");
+  }
+  printf("\n");
+}
+
+/*------------------------------------------------------
+ ** InitAry() -- Initialize the array (vector) by reading
+ ** data from the data file
+ **------------------------------------------------------
+ */
+// Read ary_size floats from the global file handle fp into ary.
+void InitAry(float *ary, int ary_size) {
+  float *const end = ary + ary_size;
+  for (float *slot = ary; slot < end; ++slot) {
+    fscanf(fp, "%f", slot);
+  }
+}
+
+/*------------------------------------------------------
+ ** PrintAry() -- Print the contents of the array (vector)
+ **------------------------------------------------------
+ */
+// Print ary_size values with two decimals, each followed by a space, then
+// two newlines.
+void PrintAry(float *ary, int ary_size) {
+  int idx = 0;
+  while (idx < ary_size) {
+    printf("%.2f ", ary[idx]);
+    ++idx;
+  }
+  printf("\n\n");
+}
+// If cudaGetLastError() reports anything other than cudaSuccess, print the
+// error (prefixed with msg) to stderr and terminate with EXIT_FAILURE.
+void checkCUDAError(const char *msg) {
+  const cudaError_t status = cudaGetLastError();
+  if (status == cudaSuccess)
+    return;
+
+  fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(status));
+  exit(EXIT_FAILURE);
+}
--- a/examples/gauss/run.sh
+++ b/examples/gauss/run.sh
@ -0,0 +1,23 @@
+#!/bin/bash
+# Build the gaussian example from the pre-generated LLVM IR, run it on the
+# 4x4 Rodinia input and check the printed solution vector.
+set -e
+
+# Assemble the device and host IR, then translate both with the COX tools.
+llvm-as gaussian-cuda-nvptx64-nvidia-cuda-sm_61.ll
+llvm-as gaussian-host-x86_64-unknown-linux-gnu.ll
+../../build/compilation/kernelTranslator gaussian-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
+../../build/compilation/hostTranslator gaussian-host-x86_64-unknown-linux-gnu.bc host.bc
+
+llc --relocation-model=pic --filetype=obj  kernel.bc
+llc --relocation-model=pic --filetype=obj  host.bc
+
+g++ -Wall -L../../build/runtime \
+     -L../../build/runtime/threadPool \
+     -o gaussian -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread
+
+export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH
+# Truncate the log instead of appending: a stale passing line left by an
+# earlier run must not be able to satisfy the check below.
+./gaussian -f ../../rodinia-data/gaussian/matrix4.txt > res.log
+
+if grep -q "0.70 0.00 -0.40 -0.50" res.log; then
+    echo "Pass"
+else
+    echo "Error result"
+    exit 1
+fi
--- a/examples/heartwall/AVI/avilib.c
+++ b/examples/heartwall/AVI/avilib.c
--- a/examples/heartwall/AVI/avilib.h
+++ b/examples/heartwall/AVI/avilib.h
@ -0,0 +1,317 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ *  avilib.h
+ *
+ *  Copyright (C) Thomas Östreich - June 2001
+ *  multiple audio track support Copyright (C) 2002 Thomas Östreich
+ *
+ *  Original code:
+ *  Copyright (C) 1999 Rainer Johanni <Rainer@Johanni.de>
+ *
+ *  This file is part of transcode, a linux video stream processing tool
+ *
+ *  transcode is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  transcode is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+// #include <windows.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifndef AVILIB_H
+#define AVILIB_H
+
+#define AVI_MAX_TRACKS 8
+
+/* Element type of the video index array pointed to by avi_t::video_index. */
+typedef struct {
+  unsigned long key; /* presumably a keyframe flag -- TODO confirm in avilib.c */
+  unsigned long pos; /* presumably the chunk position in the file -- TODO confirm */
+  unsigned long len; /* presumably the chunk length in bytes -- TODO confirm */
+} video_index_entry;
+
+/* Element type of the audio index array pointed to by track_t::audio_index. */
+typedef struct {
+  unsigned long pos; /* presumably the chunk position in the file -- TODO confirm */
+  unsigned long len; /* presumably the chunk length in bytes -- TODO confirm */
+  unsigned long tot; /* presumably a running byte total -- TODO confirm */
+} audio_index_entry;
+
+/* Per-audio-track state; avi_t holds AVI_MAX_TRACKS of these. */
+typedef struct track_s {
+
+  long a_fmt;   /* Audio format, see the WAVE_FORMAT_* #defines below */
+  long a_chans; /* Audio channels, 0 for no audio */
+  long a_rate;  /* Rate in Hz */
+  long a_bits;  /* bits per audio sample */
+  long mp3rate; /* mp3 bitrate kbs*/
+
+  long audio_strn;   /* Audio stream number */
+  long audio_bytes;  /* Total number of bytes of audio data */
+  long audio_chunks; /* Chunks of audio data in the file */
+
+  char audio_tag[4]; /* Tag of audio data */
+  long audio_posc;   /* Audio position: chunk */
+  long audio_posb;   /* Audio position: byte within chunk */
+
+  /* NOTE(review): by analogy with avi_t::v_codech_off / v_codecf_off these
+     presumably refer to the strh and strf records respectively -- confirm. */
+  long a_codech_off; /* absolute offset of audio codec information */
+  long a_codecf_off; /* absolute offset of audio codec information */
+
+  audio_index_entry *audio_index; /* index entries for this track */
+
+} track_t;
+
+/* State of one open AVI file: video parameters, audio tracks and indexes. */
+typedef struct {
+
+  long fdes; /* File descriptor of AVI file */
+  /* NOTE(review): presumably one of AVI_MODE_WRITE (0) / AVI_MODE_READ (1)
+     defined below; an older note here said "0 for reading, 1 for writing",
+     which contradicts those macros -- confirm against avilib.c. */
+  long mode;
+
+  long width;          /* Width  of a video frame */
+  long height;         /* Height of a video frame */
+  double fps;          /* Frames per second */
+  char compressor[8];  /* Type of compressor, 4 bytes + padding for 0 byte */
+  char compressor2[8]; /* Type of compressor, 4 bytes + padding for 0 byte */
+  long video_strn;     /* Video stream number */
+  long video_frames;   /* Number of video frames */
+  char video_tag[4];   /* Tag of video data */
+  long video_pos;      /* Number of next frame to be read
+                              (if index present) */
+
+  unsigned long max_len; /* maximum video chunk present */
+
+  track_t track[AVI_MAX_TRACKS]; // up to AVI_MAX_TRACKS audio tracks supported
+
+  unsigned long pos; /* position in file */
+  long n_idx;        /* number of index entries actually filled */
+  long max_idx;      /* number of index entries actually allocated */
+
+  long v_codech_off; /* absolute offset of video codec (strh) info */
+  long v_codecf_off; /* absolute offset of video codec (strf) info */
+
+  unsigned char (*idx)[16]; /* index entries (AVI idx1 tag) */
+  video_index_entry *video_index;
+
+  unsigned long last_pos; /* Position of last frame written */
+  unsigned long last_len; /* Length of last frame written */
+  int must_use_index;     /* Flag if frames are duplicated */
+  unsigned long movi_start;
+
+  int anum; // total number of audio tracks
+  int aptr; // current audio working track
+
+} avi_t;
+
+#define AVI_MODE_WRITE 0
+#define AVI_MODE_READ 1
+
+/* The error codes delivered by avi_open_input_file */
+
+#define AVI_ERR_SIZELIM                                                        \
+  1 /* The write of the data would exceed                                      \
+                                           the maximum size of the AVI file.   \
+                                           This is more a warning than an      \
+       error since the file may be closed safely */
+
+#define AVI_ERR_OPEN                                                           \
+  2 /* Error opening the AVI file - wrong path                                 \
+                                           name or file not readable/writable  \
+     */
+
+#define AVI_ERR_READ 3 /* Error reading from AVI File */
+
+#define AVI_ERR_WRITE                                                          \
+  4 /* Error writing to AVI File,                                              \
+                                           disk full ??? */
+
+#define AVI_ERR_WRITE_INDEX                                                    \
+  5 /* Could not write index to AVI file                                       \
+                                           during close, file may still be     \
+                                           usable */
+
+#define AVI_ERR_CLOSE                                                          \
+  6 /* Could not write header to AVI file                                      \
+                                           or not truncate the file during     \
+       close, file is most probably corrupted */
+
+#define AVI_ERR_NOT_PERM                                                       \
+  7 /* Operation not permitted:                                                \
+                                           trying to read from a file open     \
+                                           for writing or vice versa */
+
+#define AVI_ERR_NO_MEM 8 /* malloc failed */
+
+#define AVI_ERR_NO_AVI 9 /* Not an AVI file */
+
+#define AVI_ERR_NO_HDRL                                                        \
+  10 /* AVI file has no header list,                                           \
+                                            corrupted ??? */
+
+#define AVI_ERR_NO_MOVI                                                        \
+  11 /* AVI file has no MOVI list,                                             \
+                                            corrupted ??? */
+
+#define AVI_ERR_NO_VIDS 12 /* AVI file contains no video data */
+
+#define AVI_ERR_NO_IDX                                                         \
+  13 /* The file has been opened with                                          \
+                                            getIndex==0, but an operation has  \
+        been performed that needs an index */
+
+/* Possible Audio formats */
+
+#ifndef WAVE_FORMAT_PCM
+#define WAVE_FORMAT_UNKNOWN (0x0000)
+#define WAVE_FORMAT_PCM (0x0001)
+#define WAVE_FORMAT_ADPCM (0x0002)
+#define WAVE_FORMAT_IBM_CVSD (0x0005)
+#define WAVE_FORMAT_ALAW (0x0006)
+#define WAVE_FORMAT_MULAW (0x0007)
+#define WAVE_FORMAT_OKI_ADPCM (0x0010)
+#define WAVE_FORMAT_DVI_ADPCM (0x0011)
+#define WAVE_FORMAT_DIGISTD (0x0015)
+#define WAVE_FORMAT_DIGIFIX (0x0016)
+#define WAVE_FORMAT_YAMAHA_ADPCM (0x0020)
+#define WAVE_FORMAT_DSP_TRUESPEECH (0x0022)
+#define WAVE_FORMAT_GSM610 (0x0031)
+#define IBM_FORMAT_MULAW (0x0101)
+#define IBM_FORMAT_ALAW (0x0102)
+#define IBM_FORMAT_ADPCM (0x0103)
+#endif
+
+avi_t *AVI_open_output_file(char *filename);
+void AVI_set_video(avi_t *AVI, int width, int height, double fps,
+                   char *compressor);
+void AVI_set_audio(avi_t *AVI, int channels, long rate, int bits, int format,
+                   long mp3rate);
+int AVI_write_frame(avi_t *AVI, char *data, long bytes, int keyframe);
+int AVI_dup_frame(avi_t *AVI);
+int AVI_write_audio(avi_t *AVI, char *data, long bytes);
+int AVI_append_audio(avi_t *AVI, char *data, long bytes);
+long AVI_bytes_remain(avi_t *AVI);
+int AVI_close(avi_t *AVI);
+long AVI_bytes_written(avi_t *AVI);
+
+avi_t *AVI_open_input_file(char *filename, int getIndex);
+avi_t *AVI_open_fd(int fd, int getIndex);
+int avi_parse_input_file(avi_t *AVI, int getIndex);
+long AVI_audio_mp3rate(avi_t *AVI);
+long AVI_video_frames(avi_t *AVI);
+int AVI_video_width(avi_t *AVI);
+int AVI_video_height(avi_t *AVI);
+double AVI_frame_rate(avi_t *AVI);
+char *AVI_video_compressor(avi_t *AVI);
+
+int AVI_audio_channels(avi_t *AVI);
+int AVI_audio_bits(avi_t *AVI);
+int AVI_audio_format(avi_t *AVI);
+long AVI_audio_rate(avi_t *AVI);
+long AVI_audio_bytes(avi_t *AVI);
+long AVI_audio_chunks(avi_t *AVI);
+
+long AVI_max_video_chunk(avi_t *AVI);
+
+long AVI_frame_size(avi_t *AVI, long frame);
+long AVI_audio_size(avi_t *AVI, long frame);
+int AVI_seek_start(avi_t *AVI);
+int AVI_set_video_position(avi_t *AVI, long frame);
+long AVI_get_video_position(avi_t *AVI, long frame);
+long AVI_read_frame(avi_t *AVI, char *vidbuf, int *keyframe);
+
+int AVI_set_audio_position(avi_t *AVI, long byte);
+int AVI_set_audio_bitrate(avi_t *AVI, long bitrate);
+
+long AVI_read_audio(avi_t *AVI, char *audbuf, long bytes);
+
+long AVI_audio_codech_offset(avi_t *AVI);
+long AVI_audio_codecf_offset(avi_t *AVI);
+long AVI_video_codech_offset(avi_t *AVI);
+long AVI_video_codecf_offset(avi_t *AVI);
+
+int AVI_read_data(avi_t *AVI, char *vidbuf, long max_vidbuf, char *audbuf,
+                  long max_audbuf, long *len);
+
+void AVI_print_error(char *str);
+char *AVI_strerror();
+char *AVI_syserror();
+
+int AVI_scan(char *name);
+int AVI_dump(char *name, int mode);
+
+char *AVI_codec2str(short cc);
+int AVI_file_check(char *import_file);
+
+void AVI_info(avi_t *avifile);
+uint64_t AVI_max_size();
+int avi_update_header(avi_t *AVI);
+
+int AVI_set_audio_track(avi_t *AVI, int track);
+int AVI_get_audio_track(avi_t *AVI);
+int AVI_audio_tracks(avi_t *AVI);
+
+/* RIFF file header: chunk id, length and form type.
+   NOTE(review): `unsigned long` is 8 bytes on LP64 targets while RIFF length
+   fields are 32-bit; if this struct is read or written as raw file bytes the
+   layout will not match -- confirm how avilib.c uses it. */
+struct riff_struct {
+  unsigned char id[4]; /* RIFF */
+  unsigned long len;
+  unsigned char wave_id[4]; /* WAVE */
+};
+
+/* Generic chunk header: a four-character id followed by a length.
+   NOTE(review): `unsigned long` is 8 bytes on LP64 targets; verify before
+   using this struct as a raw on-disk layout. */
+struct chunk_struct {
+  unsigned char id[4];
+  unsigned long len;
+};
+
+/* Common wave-format fields (format tag, channels, rates, alignment).
+   NOTE(review): the dw* members are `unsigned long`, which is 8 bytes on
+   LP64 targets -- verify before using this as a raw on-disk layout. */
+struct common_struct {
+  unsigned short wFormatTag;
+  unsigned short wChannels;
+  unsigned long dwSamplesPerSec;
+  unsigned long dwAvgBytesPerSec;
+  unsigned short wBlockAlign;
+  unsigned short wBitsPerSample; /* Only for PCM */
+};
+
+/* Complete wave header: RIFF header, format chunk header, the common format
+   fields, and the data chunk header, in that order. */
+struct wave_header {
+  struct riff_struct riff;
+  struct chunk_struct format;
+  struct common_struct common;
+  struct chunk_struct data;
+};
+
+/* Fields of an AVI stream header.
+   NOTE(review): every member is `long` (8 bytes on LP64 targets); verify
+   before using this struct as a raw on-disk layout. */
+struct AVIStreamHeader {
+  long fccType;
+  long fccHandler;
+  long dwFlags;
+  long dwPriority;
+  long dwInitialFrames;
+  long dwScale;
+  long dwRate;
+  long dwStart;
+  long dwLength;
+  long dwSuggestedBufferSize;
+  long dwQuality;
+  long dwSampleSize;
+};
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
--- a/examples/heartwall/AVI/avimod.c
+++ b/examples/heartwall/AVI/avimod.c
@ -0,0 +1,130 @@
+// #ifdef __cplusplus
+// extern "C" {
+// #endif
+
+//===============================================================================================================================================================================================================
+//	DEFINE / INCLUDE
+//===============================================================================================================================================================================================================
+#include "avimod.h"
+
+//===============================================================================================================================================================================================================
+//	FUNCTIONS
+//===============================================================================================================================================================================================================
+
+// Flips the specified image vertically and crops it to the specified
+// dimensions.
+//   image     - height x width source pixels, one byte per pixel, row-major;
+//               every byte is interpreted as an unsigned intensity (0..255)
+//   cropped   - 1 selects the hard-coded crop rectangle (currently only the
+//               top-left pixel of the flipped image); any other value keeps
+//               the whole frame
+//   scaled    - non-zero scales every value to the range [0.0, 1.0]
+//   converted - 1 returns the pixels in column-major order, any other value
+//               returns them in row-major order
+// Returns a malloc'ed array of height_new * width_new values that the caller
+// must free(), or NULL if the input is empty or an allocation fails.
+fp *chop_flip_image(char *image, int height, int width, int cropped, int scaled,
+                    int converted) {
+
+  // nothing to read from: avoid indexing in front of / behind the buffer
+  if (image == NULL || height <= 0 || width <= 0) {
+    return NULL;
+  }
+
+  // fixed dimensions for cropping or not cropping, square vertices starting
+  // from initial point in top left corner going down and right
+  int top = 0;
+  int left = 0;
+  int bottom;
+  int right;
+  if (cropped == 1) {
+    bottom = 0;
+    right = 0;
+  } else {
+    bottom = height - 1;
+    right = width - 1;
+  }
+
+  // dimensions of new cropped image
+  int height_new = bottom - top + 1;
+  int width_new = right - left + 1;
+  size_t count = (size_t)height_new * (size_t)width_new;
+
+  // counters
+  int i, j;
+
+  // allocate memory for cropped/flipped frame
+  fp *result = (fp *)malloc(count * sizeof(fp));
+  if (result == NULL) {
+    return NULL;
+  }
+
+  // crop/flip and scale frame; the byte is converted to unsigned BEFORE it is
+  // scaled, so pixels stored as negative chars (128..255) still end up in
+  // [0.0, 1.0] instead of being shifted by 256 after the scaling
+  fp scale = 1.0 / 255.0;
+  for (i = 0; i < height_new; i++) {  // rows
+    for (j = 0; j < width_new; j++) { // columns
+      fp value = (fp)(unsigned char)
+          image[((height - 1 - (i + top)) * width) + (j + left)];
+      if (scaled) {
+        value = value * scale;
+      }
+      result[(size_t)i * width_new + j] = value;
+    }
+  }
+
+  // row-major requested: hand the buffer over as it is (it must NOT be freed
+  // here, the caller owns it from now on)
+  if (converted != 1) {
+    return result;
+  }
+
+  // convert storage method (from row-major to column-major)
+  fp *result_converted = (fp *)malloc(count * sizeof(fp));
+  if (result_converted == NULL) {
+    free(result);
+    return NULL;
+  }
+  for (i = 0; i < width_new; i++) {    // columns of the flipped image
+    for (j = 0; j < height_new; j++) { // rows of the flipped image
+      result_converted[(size_t)i * height_new + j] =
+          result[(size_t)j * width_new + i];
+    }
+  }
+  free(result);
+
+  // return
+  return result_converted;
+}
+
+// Fetches frame number frame_num from the given video file and returns it
+// after it has been run through chop_flip_image().
+//   cropped, scaled, converted - forwarded unchanged to chop_flip_image()
+// The process is terminated with exit(-1) when AVI_read_frame reports -1.
+fp *get_frame(avi_t *cell_file, int frame_num, int cropped, int scaled,
+              int converted) {
+
+  // frame geometry as reported by the AVI library
+  const int frame_width = AVI_video_width(cell_file);
+  const int frame_height = AVI_video_height(cell_file);
+
+  // seek to the requested frame; there are 600 frames in the test file, so
+  // frame_num = 600 causes an error
+  AVI_set_video_position(cell_file, frame_num);
+
+  // scratch buffer for the raw frame, one byte per pixel
+  char *raw_frame = (char *)malloc(frame_width * frame_height * sizeof(char));
+
+  // pull the frame out of the AVI; the keyframe flag is required by the API
+  // but not used here
+  int keyframe_flag;
+  const int read_status = AVI_read_frame(cell_file, raw_frame, &keyframe_flag);
+  if (read_status == -1) {
+    AVI_print_error((char *)"Error with AVI_read_frame");
+    exit(-1);
+  }
+
+  // the image is read in upside-down, so flip it (and crop / scale / convert
+  // it as requested), then release the raw bytes
+  fp *const processed = chop_flip_image(raw_frame, frame_height, frame_width,
+                                        cropped, scaled, converted);
+  free(raw_frame);
+
+  return processed;
+}
+
+// #ifdef __cplusplus
+// }
+// #endif
--- a/examples/heartwall/AVI/avimod.h
+++ b/examples/heartwall/AVI/avimod.h
@ -0,0 +1,24 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//===============================================================================================================================================================================================================
+//	DEFINE / INCLUDE
+//===============================================================================================================================================================================================================
+#define fp float
+
+#include "avilib.h"
+
+//===============================================================================================================================================================================================================
+//	FUNCTION PROTOTYPES
+//===============================================================================================================================================================================================================
+
+// Flips a height x width one-byte-per-pixel image vertically and optionally
+// crops it (cropped == 1), scales it (scaled != 0) and transposes it to
+// column-major storage (converted == 1). With converted == 1 the result is a
+// freshly malloc'ed buffer that the caller has to free(). Defined in avimod.c.
+fp *chop_flip_image(char *image, int height, int width, int cropped, int scaled,
+                    int converted);
+
+// Reads frame frame_num from cell_file and returns it after passing it through
+// chop_flip_image() with the same cropped / scaled / converted arguments.
+// Calls exit(-1) if AVI_read_frame reports -1. Defined in avimod.c.
+fp *get_frame(avi_t *cell_file, int frame_num, int cropped, int scaled,
+              int converted);
+
+#ifdef __cplusplus
+}
+#endif
--- a/examples/heartwall/define.c
+++ b/examples/heartwall/define.c
@ -0,0 +1,396 @@
+//===============================================================================================================================================================================================================
+//===============================================================================================================================================================================================================
+//	DEFINE / INCLUDE
+//===============================================================================================================================================================================================================
+//===============================================================================================================================================================================================================
+
+// Floating-point type used by the benchmark; AVI/avimod.h carries the same
+// definition.
+#define fp float
+
+/* #define NUMBER_THREADS 512 */
+// NUMBER_THREADS can be overridden at compile time through RD_WG_SIZE_0_0,
+// RD_WG_SIZE_0 or RD_WG_SIZE (checked in that order); otherwise it is 256.
+#ifdef RD_WG_SIZE_0_0
+#define NUMBER_THREADS RD_WG_SIZE_0_0
+#elif defined(RD_WG_SIZE_0)
+#define NUMBER_THREADS RD_WG_SIZE_0
+#elif defined(RD_WG_SIZE)
+#define NUMBER_THREADS RD_WG_SIZE
+#else
+#define NUMBER_THREADS 256
+#endif
+
+// Point counts. ALL_POINTS equals ENDO_POINTS + EPI_POINTS (20 + 31) and is
+// the length of the global `unique` array declared in main.cu; keep the three
+// values in sync when changing one of them.
+#define ENDO_POINTS 20
+#define EPI_POINTS 31
+#define ALL_POINTS 51
+
+//===============================================================================================================================================================================================================
+//===============================================================================================================================================================================================================
+//	PARAMS_COMMON_CHANGE STRUCT
+//===============================================================================================================================================================================================================
+//===============================================================================================================================================================================================================
+
+// Frame-related state: a pointer to frame data plus a frame number. main.cu
+// declares one ordinary global of this type (common_change) and one
+// __constant__ instance (d_common_change).
+// NOTE(review): the d_ prefix suggests d_frame points to device memory --
+// confirm at the allocation site.
+typedef struct params_common_change {
+
+  //======================================================================================================================================================
+  //	FRAME
+  //======================================================================================================================================================
+
+  fp *d_frame;
+  int frame_no;
+
+} params_common_change;
+
+//===============================================================================================================================================================================================================
+//===============================================================================================================================================================================================================
+//	PARAMS_COMMON STRUCTURE
+//===============================================================================================================================================================================================================
+//===============================================================================================================================================================================================================
+
+// Parameter record of which main.cu declares one ordinary global (common) and
+// one __constant__ instance (d_common). It holds scalar settings, the
+// endo / epi point arrays and the dimensions of every intermediate array.
+// Most intermediate arrays are described by a <name>_rows / _cols / _elem /
+// _mem group of ints.
+// NOTE(review): _elem and _mem presumably hold the element count and the size
+// in bytes, and d_-prefixed pointers presumably refer to device memory; the
+// values are assigned outside this file -- confirm there.
+typedef struct params_common {
+
+  //======================================================================================================================================================
+  //	HARDCODED INPUTS FROM MATLAB
+  //======================================================================================================================================================
+
+  //====================================================================================================
+  //	CONSTANTS
+  //====================================================================================================
+
+  int sSize;
+  int tSize;
+  int maxMove;
+  fp alpha;
+
+  //====================================================================================================
+  //	FRAME
+  //====================================================================================================
+
+  int no_frames;
+  int frame_rows;
+  int frame_cols;
+  int frame_elem;
+  int frame_mem;
+
+  //====================================================================================================
+  //	ENDO POINTS
+  //====================================================================================================
+
+  int endoPoints;
+  int endo_mem;
+
+  int *endoRow;
+  int *endoCol;
+  int *tEndoRowLoc;
+  int *tEndoColLoc;
+
+  int *d_endoRow;
+  int *d_endoCol;
+  int *d_tEndoRowLoc;
+  int *d_tEndoColLoc;
+
+  fp *d_endoT;
+
+  //====================================================================================================
+  //	EPI POINTS
+  //====================================================================================================
+  int epiPoints;
+  int epi_mem;
+
+  int *epiRow;
+  int *epiCol;
+  int *tEpiRowLoc;
+  int *tEpiColLoc;
+
+  int *d_epiRow;
+  int *d_epiCol;
+  int *d_tEpiRowLoc;
+  int *d_tEpiColLoc;
+
+  fp *d_epiT;
+
+  //====================================================================================================
+  //	ALL POINTS
+  //====================================================================================================
+
+  int allPoints;
+
+  //======================================================================================================================================================
+  //	RIGHT TEMPLATE 	FROM 	TEMPLATE ARRAY
+  //======================================================================================================================================================
+
+  int in_rows;
+  int in_cols;
+  int in_elem;
+  int in_mem;
+
+  //======================================================================================================================================================
+  // 	AREA AROUND POINT		FROM	FRAME
+  //======================================================================================================================================================
+
+  int in2_rows;
+  int in2_cols;
+  int in2_elem;
+  int in2_mem;
+
+  //======================================================================================================================================================
+  //	CONVOLUTION
+  //======================================================================================================================================================
+
+  int conv_rows;
+  int conv_cols;
+  int conv_elem;
+  int conv_mem;
+  int ioffset;
+  int joffset;
+
+  //======================================================================================================================================================
+  //	CUMULATIVE SUM 1
+  //======================================================================================================================================================
+
+  //====================================================================================================
+  //	PAD ARRAY, VERTICAL CUMULATIVE SUM
+  //====================================================================================================
+
+  int in2_pad_add_rows;
+  int in2_pad_add_cols;
+  int in2_pad_cumv_rows;
+  int in2_pad_cumv_cols;
+  int in2_pad_cumv_elem;
+  int in2_pad_cumv_mem;
+
+  //====================================================================================================
+  //	SELECTION
+  //====================================================================================================
+
+  int in2_pad_cumv_sel_rows;
+  int in2_pad_cumv_sel_cols;
+  int in2_pad_cumv_sel_elem;
+  int in2_pad_cumv_sel_mem;
+  int in2_pad_cumv_sel_rowlow;
+  int in2_pad_cumv_sel_rowhig;
+  int in2_pad_cumv_sel_collow;
+  int in2_pad_cumv_sel_colhig;
+
+  //====================================================================================================
+  //	SELECTION 2, SUBTRACTION, HORIZONTAL CUMULATIVE SUM
+  //====================================================================================================
+
+  int in2_pad_cumv_sel2_rowlow;
+  int in2_pad_cumv_sel2_rowhig;
+  int in2_pad_cumv_sel2_collow;
+  int in2_pad_cumv_sel2_colhig;
+  int in2_sub_cumh_rows;
+  int in2_sub_cumh_cols;
+  int in2_sub_cumh_elem;
+  int in2_sub_cumh_mem;
+
+  //====================================================================================================
+  //	SELECTION
+  //====================================================================================================
+
+  int in2_sub_cumh_sel_rows;
+  int in2_sub_cumh_sel_cols;
+  int in2_sub_cumh_sel_elem;
+  int in2_sub_cumh_sel_mem;
+  int in2_sub_cumh_sel_rowlow;
+  int in2_sub_cumh_sel_rowhig;
+  int in2_sub_cumh_sel_collow;
+  int in2_sub_cumh_sel_colhig;
+
+  //====================================================================================================
+  //	SELECTION 2, SUBTRACTION
+  //====================================================================================================
+
+  int in2_sub_cumh_sel2_rowlow;
+  int in2_sub_cumh_sel2_rowhig;
+  int in2_sub_cumh_sel2_collow;
+  int in2_sub_cumh_sel2_colhig;
+  int in2_sub2_rows;
+  int in2_sub2_cols;
+  int in2_sub2_elem;
+  int in2_sub2_mem;
+
+  //======================================================================================================================================================
+  //	CUMULATIVE SUM 2
+  //======================================================================================================================================================
+
+  //====================================================================================================
+  //	MULTIPLICATION
+  //====================================================================================================
+
+  int in2_sqr_rows;
+  int in2_sqr_cols;
+  int in2_sqr_elem;
+  int in2_sqr_mem;
+
+  //====================================================================================================
+  //	SELECTION 2, SUBTRACTION
+  //====================================================================================================
+
+  int in2_sqr_sub2_rows;
+  int in2_sqr_sub2_cols;
+  int in2_sqr_sub2_elem;
+  int in2_sqr_sub2_mem;
+
+  //======================================================================================================================================================
+  //	FINAL
+  //======================================================================================================================================================
+
+  int in_sqr_rows;
+  int in_sqr_cols;
+  int in_sqr_elem;
+  int in_sqr_mem;
+
+  //======================================================================================================================================================
+  //	TEMPLATE MASK CREATE
+  //======================================================================================================================================================
+
+  int tMask_rows;
+  int tMask_cols;
+  int tMask_elem;
+  int tMask_mem;
+
+  //======================================================================================================================================================
+  //	POINT MASK INITIALIZE
+  //======================================================================================================================================================
+
+  int mask_rows;
+  int mask_cols;
+  int mask_elem;
+  int mask_mem;
+
+  //======================================================================================================================================================
+  //	MASK CONVOLUTION
+  //======================================================================================================================================================
+
+  int mask_conv_rows;
+  int mask_conv_cols;
+  int mask_conv_elem;
+  int mask_conv_mem;
+  int mask_conv_ioffset;
+  int mask_conv_joffset;
+
+} params_common;
+
+//===============================================================================================================================================================================================================
+//===============================================================================================================================================================================================================
+//	PARAMS_UNIQUE STRUCTURE
+//===============================================================================================================================================================================================================
+//===============================================================================================================================================================================================================
+
+// Element type of the global `unique[ALL_POINTS]` array in main.cu. Besides a
+// point number and a template offset it consists only of pointers: four int
+// pointers for row / column data and one fp pointer per intermediate array
+// whose dimensions are kept in params_common.
+// NOTE(review): the d_ prefix suggests that all pointers refer to device
+// memory -- confirm at the allocation site.
+typedef struct params_unique {
+
+  //======================================================================================================================================================
+  //	POINT DATA POINTERS
+  //======================================================================================================================================================
+
+  int *d_Row;
+  int *d_Col;
+  int *d_tRowLoc;
+  int *d_tColLoc;
+  fp *d_T;
+
+  //======================================================================================================================================================
+  //	POINT NUMBER
+  //======================================================================================================================================================
+
+  int point_no;
+
+  //======================================================================================================================================================
+  // 	RIGHT TEMPLATE 	FROM 	TEMPLATE ARRAY
+  //======================================================================================================================================================
+
+  int in_pointer;
+
+  //======================================================================================================================================================
+  //	AREA AROUND POINT		FROM	FRAME
+  //======================================================================================================================================================
+
+  fp *d_in2;
+
+  //======================================================================================================================================================
+  //	CONVOLUTION
+  //======================================================================================================================================================
+
+  fp *d_conv;
+  fp *d_in_mod;
+
+  //======================================================================================================================================================
+  //	CUMULATIVE SUM
+  //======================================================================================================================================================
+
+  //====================================================================================================
+  //	PAD ARRAY, VERTICAL CUMULATIVE SUM
+  //====================================================================================================
+
+  fp *d_in2_pad_cumv;
+
+  //====================================================================================================
+  //	SELECTION
+  //====================================================================================================
+
+  fp *d_in2_pad_cumv_sel;
+
+  //====================================================================================================
+  //	SELECTION 2, SUBTRACTION, HORIZONTAL CUMULATIVE SUM
+  //====================================================================================================
+
+  fp *d_in2_sub_cumh;
+
+  //====================================================================================================
+  //	SELECTION
+  //====================================================================================================
+
+  fp *d_in2_sub_cumh_sel;
+
+  //====================================================================================================
+  //	SELECTION 2, SUBTRACTION
+  //====================================================================================================
+
+  fp *d_in2_sub2;
+
+  //======================================================================================================================================================
+  //	CUMULATIVE SUM 2
+  //======================================================================================================================================================
+
+  //====================================================================================================
+  //	MULTIPLICATION
+  //====================================================================================================
+
+  fp *d_in2_sqr;
+
+  //====================================================================================================
+  //	SELECTION 2, SUBTRACTION
+  //====================================================================================================
+
+  fp *d_in2_sqr_sub2;
+
+  //======================================================================================================================================================
+  //	FINAL
+  //======================================================================================================================================================
+
+  fp *d_in_sqr;
+
+  //======================================================================================================================================================
+  //	TEMPLATE MASK
+  //======================================================================================================================================================
+
+  fp *d_tMask;
+
+  //======================================================================================================================================================
+  //	POINT MASK INITIALIZE
+  //======================================================================================================================================================
+
+  fp *d_mask;
+
+  //======================================================================================================================================================
+  //	MASK CONVOLUTION
+  //======================================================================================================================================================
+
+  fp *d_mask_conv;
+
+} params_unique;
+
+//===============================================================================================================================================================================================================
+//===============================================================================================================================================================================================================
+//	END OF STRUCTURE
+//===============================================================================================================================================================================================================
+//===============================================================================================================================================================================================================
--- a/examples/heartwall/kernel.cu
+++ b/examples/heartwall/kernel.cu
--- a/examples/heartwall/main.cu
+++ b/examples/heartwall/main.cu
@ -0,0 +1,795 @@
+//===============================================================================================================================================================================================================
+//===============================================================================================================================================================================================================
+//	DEFINE / INCLUDE
+//===============================================================================================================================================================================================================
+//===============================================================================================================================================================================================================
+
+//======================================================================================================================================================
+//	LIBRARIES
+//======================================================================================================================================================
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <avilib.h>
+#include <avimod.h>
+#include <cuda.h>
+
+//======================================================================================================================================================
+//	STRUCTURES, GLOBAL STRUCTURE VARIABLES
+//======================================================================================================================================================
+
+#include "define.c"
+
+// Host-side copy of the parameters that change for every frame; main() updates
+// it inside the frame loop and uploads it to d_common_change before each launch.
+params_common_change common_change;
+// Constant-memory copy of common_change.
+__constant__ params_common_change d_common_change;
+
+// Host-side copy of the parameters shared by all points (sizes, offsets and
+// pointers); filled in by main() and uploaded once to d_common.
+params_common common;
+// Constant-memory copy of common.
+__constant__ params_common d_common;
+
+// Host-side per-point parameters, one entry per tracked point.
+params_unique unique[ALL_POINTS]; // cannot determine size dynamically so choose
+                                  // more than usually needed
+// Constant-memory copy of unique[].
+__constant__ params_unique d_unique[ALL_POINTS];
+
+//======================================================================================================================================================
+// KERNEL CODE
+//======================================================================================================================================================
+
+#include "kernel.cu"
+
+//	WRITE DATA FUNCTION
+//===============================================================================================================================================================================================================200
+
+// Dump the tracked point coordinates to a text file.
+//
+// A four-line header (total frames, processed frames, endo and epi point
+// counts) is followed, for every processed frame j, by four tab-separated
+// rows: input_a and input_b for the endo points, then input_2a and input_2b
+// for the epi points.
+//
+//   filename          path of the file to create or truncate
+//   frameNo           total number of frames; also the stride between
+//                     consecutive points in the input arrays
+//   frames_processed  number of frames to write
+//   endoPoints        number of endo points
+//   input_a, input_b  endo arrays, value for point i in frame j is read from
+//                     index j + i * frameNo
+//   epiPoints         number of epi points
+//   input_2a, input_2b  epi arrays, same layout as the endo arrays
+//
+// If the file cannot be opened a message is printed to stdout and nothing is
+// written.
+void write_data(char *filename, int frameNo, int frames_processed,
+                int endoPoints, int *input_a, int *input_b, int epiPoints,
+                int *input_2a, int *input_2b) {
+
+  //================================================================================80
+  //	VARIABLES
+  //================================================================================80
+
+  FILE *fid;
+  int i, j;
+
+  //================================================================================80
+  //	OPEN FILE FOR WRITING
+  //================================================================================80
+
+  fid = fopen(filename, "w+");
+  if (fid == NULL) {
+    printf("The file was not opened for writing\n");
+    return;
+  }
+
+  //================================================================================80
+  //	WRITE VALUES TO THE FILE
+  //================================================================================80
+  fprintf(fid, "Total AVI Frames: %d\n", frameNo);
+  fprintf(fid, "Frames Processed: %d\n", frames_processed);
+  fprintf(fid, "endoPoints: %d\n", endoPoints);
+  fprintf(fid, "epiPoints: %d", epiPoints);
+  for (j = 0; j < frames_processed; j++) {
+    fprintf(fid, "\n---Frame %d---", j);
+
+    // endo points: one row per input array
+    fprintf(fid, "\n--endo--\n");
+    for (i = 0; i < endoPoints; i++) {
+      fprintf(fid, "%d\t", input_a[j + i * frameNo]);
+    }
+    fprintf(fid, "\n");
+    for (i = 0; i < endoPoints; i++) {
+      fprintf(fid, "%d\t", input_b[j + i * frameNo]);
+    }
+
+    // epi points: one row per input array
+    fprintf(fid, "\n--epi--\n");
+    for (i = 0; i < epiPoints; i++) {
+      fprintf(fid, "%d\t", input_2a[j + i * frameNo]);
+    }
+    fprintf(fid, "\n");
+    for (i = 0; i < epiPoints; i++) {
+      fprintf(fid, "%d\t", input_2b[j + i * frameNo]);
+    }
+  }
+
+  //================================================================================80
+  //	CLOSE FILE
+  //================================================================================80
+
+  fclose(fid);
+}
+
+//===============================================================================================================================================================================================================
+//===============================================================================================================================================================================================================
+//	MAIN FUNCTION
+//===============================================================================================================================================================================================================
+//===============================================================================================================================================================================================================
+// Heartwall tracking driver.
+//
+// Usage: heartwall <inputfile> <num of frames>
+//
+// Opens the AVI file, fills the host-side `common` / `unique` parameter
+// structures with the hard-coded sample points and the derived buffer sizes,
+// allocates the per-point device work buffers, launches `kernel` once per
+// processed frame, copies the tracked point locations back to the host
+// (dumping them to result.txt when OUTPUT is defined) and finally releases the
+// host and device allocations.
+//
+// Exits with status 1 on wrong usage, returns -1 when the video cannot be
+// opened and 0 otherwise (including when the requested frame count is out of
+// range).
+int main(int argc, char *argv[]) {
+  cudaSetDevice(0);
+  printf("WG size of kernel = %d \n", NUMBER_THREADS);
+
+  //================================================================================
+  //	VARIABLES
+  //================================================================================
+
+  // CUDA kernel execution parameters
+  dim3 threads;
+  dim3 blocks;
+
+  // counter
+  int i;
+  int frames_processed;
+
+  // frames
+  char *video_file_name;
+  avi_t *frames;
+  fp *frame;
+
+  // hard-coded initial point coordinates (inputs from MATLAB)
+  static const int endoRowInit[] = {369, 400, 429, 452, 476, 486, 479,
+                                    458, 433, 404, 374, 346, 318, 294,
+                                    277, 269, 275, 287, 311, 339};
+  static const int endoColInit[] = {408, 406, 397, 383, 354, 322, 294,
+                                    270, 250, 237, 235, 241, 254, 273,
+                                    300, 328, 356, 383, 401, 411};
+  static const int epiRowInit[] = {390, 419, 448, 474, 501, 519, 535, 542,
+                                   543, 538, 528, 511, 491, 466, 438, 406,
+                                   376, 347, 318, 291, 275, 259, 256, 252,
+                                   252, 257, 266, 283, 305, 331, 360};
+  static const int epiColInit[] = {457, 454, 446, 431, 411, 388, 361, 331,
+                                   301, 273, 243, 218, 196, 178, 166, 157,
+                                   155, 165, 177, 197, 218, 248, 276, 304,
+                                   333, 361, 391, 415, 434, 448, 455};
+
+  //================================================================================
+  //	FRAME
+  //================================================================================
+
+  if (argc != 3) {
+    printf("ERROR: usage: heartwall <inputfile> <num of frames>\n");
+    exit(1);
+  }
+
+  // open movie file
+  video_file_name = argv[1];
+  frames = (avi_t *)AVI_open_input_file(video_file_name, 1); // added casting
+  if (frames == NULL) {
+    AVI_print_error((char *)"Error with AVI_open_input_file");
+    return -1;
+  }
+
+  // common
+  common.no_frames = AVI_video_frames(frames);
+  common.frame_rows = AVI_video_height(frames);
+  common.frame_cols = AVI_video_width(frames);
+  common.frame_elem = common.frame_rows * common.frame_cols;
+  common.frame_mem = sizeof(fp) * common.frame_elem;
+
+  // device buffer that receives each frame
+  cudaMalloc((void **)&common_change.d_frame, common.frame_mem);
+
+  //================================================================================
+  //	CHECK INPUT ARGUMENTS
+  //================================================================================
+
+  frames_processed = atoi(argv[2]);
+  if (frames_processed < 0 || frames_processed > common.no_frames) {
+    printf("ERROR: %d is an incorrect number of frames specified, select in "
+           "the range of 0-%d\n",
+           frames_processed, common.no_frames);
+    return 0;
+  }
+
+  //================================================================================
+  //	HARDCODED INPUTS FROM MATLAB
+  //================================================================================
+
+  // constants
+  common.sSize = 40;
+  common.tSize = 25;
+  common.maxMove = 10;
+  common.alpha = 0.87;
+
+  // endo points: initial coordinates on host and device
+  common.endoPoints = ENDO_POINTS;
+  common.endo_mem = sizeof(int) * common.endoPoints;
+
+  common.endoRow = (int *)malloc(common.endo_mem);
+  memcpy(common.endoRow, endoRowInit, sizeof(endoRowInit));
+  cudaMalloc((void **)&common.d_endoRow, common.endo_mem);
+  cudaMemcpy(common.d_endoRow, common.endoRow, common.endo_mem,
+             cudaMemcpyHostToDevice);
+
+  common.endoCol = (int *)malloc(common.endo_mem);
+  memcpy(common.endoCol, endoColInit, sizeof(endoColInit));
+  cudaMalloc((void **)&common.d_endoCol, common.endo_mem);
+  cudaMemcpy(common.d_endoCol, common.endoCol, common.endo_mem,
+             cudaMemcpyHostToDevice);
+
+  // endo points: per-frame tracked locations
+  common.tEndoRowLoc = (int *)malloc(common.endo_mem * common.no_frames);
+  cudaMalloc((void **)&common.d_tEndoRowLoc,
+             common.endo_mem * common.no_frames);
+
+  common.tEndoColLoc = (int *)malloc(common.endo_mem * common.no_frames);
+  cudaMalloc((void **)&common.d_tEndoColLoc,
+             common.endo_mem * common.no_frames);
+
+  // epi points: initial coordinates on host and device
+  common.epiPoints = EPI_POINTS;
+  common.epi_mem = sizeof(int) * common.epiPoints;
+
+  common.epiRow = (int *)malloc(common.epi_mem);
+  memcpy(common.epiRow, epiRowInit, sizeof(epiRowInit));
+  cudaMalloc((void **)&common.d_epiRow, common.epi_mem);
+  cudaMemcpy(common.d_epiRow, common.epiRow, common.epi_mem,
+             cudaMemcpyHostToDevice);
+
+  common.epiCol = (int *)malloc(common.epi_mem);
+  memcpy(common.epiCol, epiColInit, sizeof(epiColInit));
+  cudaMalloc((void **)&common.d_epiCol, common.epi_mem);
+  cudaMemcpy(common.d_epiCol, common.epiCol, common.epi_mem,
+             cudaMemcpyHostToDevice);
+
+  // epi points: per-frame tracked locations
+  common.tEpiRowLoc = (int *)malloc(common.epi_mem * common.no_frames);
+  cudaMalloc((void **)&common.d_tEpiRowLoc, common.epi_mem * common.no_frames);
+
+  common.tEpiColLoc = (int *)malloc(common.epi_mem * common.no_frames);
+  cudaMalloc((void **)&common.d_tEpiColLoc, common.epi_mem * common.no_frames);
+
+  // all points
+  common.allPoints = ALL_POINTS;
+
+  //================================================================================
+  //	TEMPLATE SIZES
+  //================================================================================
+
+  common.in_rows = common.tSize + 1 + common.tSize;
+  common.in_cols = common.in_rows;
+  common.in_elem = common.in_rows * common.in_cols;
+  common.in_mem = sizeof(fp) * common.in_elem;
+
+  //================================================================================
+  //	CREATE ARRAY OF TEMPLATES FOR ALL POINTS
+  //================================================================================
+
+  cudaMalloc((void **)&common.d_endoT, common.in_mem * common.endoPoints);
+  cudaMalloc((void **)&common.d_epiT, common.in_mem * common.epiPoints);
+
+  //================================================================================
+  //	SPECIFIC TO ENDO OR EPI TO BE SET HERE
+  //================================================================================
+
+  // the first endoPoints entries describe endo points ...
+  for (i = 0; i < common.endoPoints; i++) {
+    unique[i].point_no = i;
+    unique[i].d_Row = common.d_endoRow;
+    unique[i].d_Col = common.d_endoCol;
+    unique[i].d_tRowLoc = common.d_tEndoRowLoc;
+    unique[i].d_tColLoc = common.d_tEndoColLoc;
+    unique[i].d_T = common.d_endoT;
+  }
+  // ... the remaining entries describe epi points, numbered from 0 again
+  for (i = common.endoPoints; i < common.allPoints; i++) {
+    unique[i].point_no = i - common.endoPoints;
+    unique[i].d_Row = common.d_epiRow;
+    unique[i].d_Col = common.d_epiCol;
+    unique[i].d_tRowLoc = common.d_tEpiRowLoc;
+    unique[i].d_tColLoc = common.d_tEpiColLoc;
+    unique[i].d_T = common.d_epiT;
+  }
+
+  //================================================================================
+  //	RIGHT TEMPLATE FROM TEMPLATE ARRAY
+  //================================================================================
+
+  // element offset of each point's template inside d_endoT / d_epiT
+  for (i = 0; i < common.allPoints; i++) {
+    unique[i].in_pointer = unique[i].point_no * common.in_elem;
+  }
+
+  //================================================================================
+  //	AREA AROUND POINT FROM FRAME
+  //================================================================================
+
+  common.in2_rows = 2 * common.sSize + 1;
+  common.in2_cols = 2 * common.sSize + 1;
+  common.in2_elem = common.in2_rows * common.in2_cols;
+  common.in2_mem = sizeof(float) * common.in2_elem;
+
+  //================================================================================
+  //	CONVOLUTION
+  //================================================================================
+
+  common.conv_rows =
+      common.in_rows + common.in2_rows - 1; // number of rows in I
+  common.conv_cols =
+      common.in_cols + common.in2_cols - 1; // number of columns in I
+  common.conv_elem = common.conv_rows * common.conv_cols; // number of elements
+  common.conv_mem = sizeof(float) * common.conv_elem;
+  common.ioffset = 0;
+  common.joffset = 0;
+
+  //================================================================================
+  //	CUMULATIVE SUM
+  //================================================================================
+
+  // padding of array, vertical cumulative sum
+  common.in2_pad_add_rows = common.in_rows;
+  common.in2_pad_add_cols = common.in_cols;
+
+  common.in2_pad_cumv_rows = common.in2_rows + 2 * common.in2_pad_add_rows;
+  common.in2_pad_cumv_cols = common.in2_cols + 2 * common.in2_pad_add_cols;
+  common.in2_pad_cumv_elem =
+      common.in2_pad_cumv_rows * common.in2_pad_cumv_cols;
+  common.in2_pad_cumv_mem = sizeof(float) * common.in2_pad_cumv_elem;
+
+  // selection
+  common.in2_pad_cumv_sel_rowlow = 1 + common.in_rows; // (1 to n+1)
+  common.in2_pad_cumv_sel_rowhig = common.in2_pad_cumv_rows - 1;
+  common.in2_pad_cumv_sel_collow = 1;
+  common.in2_pad_cumv_sel_colhig = common.in2_pad_cumv_cols;
+  common.in2_pad_cumv_sel_rows =
+      common.in2_pad_cumv_sel_rowhig - common.in2_pad_cumv_sel_rowlow + 1;
+  common.in2_pad_cumv_sel_cols =
+      common.in2_pad_cumv_sel_colhig - common.in2_pad_cumv_sel_collow + 1;
+  common.in2_pad_cumv_sel_elem =
+      common.in2_pad_cumv_sel_rows * common.in2_pad_cumv_sel_cols;
+  common.in2_pad_cumv_sel_mem = sizeof(float) * common.in2_pad_cumv_sel_elem;
+
+  // selection 2, subtraction, horizontal cumulative sum
+  common.in2_pad_cumv_sel2_rowlow = 1;
+  common.in2_pad_cumv_sel2_rowhig =
+      common.in2_pad_cumv_rows - common.in_rows - 1;
+  common.in2_pad_cumv_sel2_collow = 1;
+  common.in2_pad_cumv_sel2_colhig = common.in2_pad_cumv_cols;
+  common.in2_sub_cumh_rows =
+      common.in2_pad_cumv_sel2_rowhig - common.in2_pad_cumv_sel2_rowlow + 1;
+  common.in2_sub_cumh_cols =
+      common.in2_pad_cumv_sel2_colhig - common.in2_pad_cumv_sel2_collow + 1;
+  common.in2_sub_cumh_elem =
+      common.in2_sub_cumh_rows * common.in2_sub_cumh_cols;
+  common.in2_sub_cumh_mem = sizeof(float) * common.in2_sub_cumh_elem;
+
+  // selection
+  common.in2_sub_cumh_sel_rowlow = 1;
+  common.in2_sub_cumh_sel_rowhig = common.in2_sub_cumh_rows;
+  common.in2_sub_cumh_sel_collow = 1 + common.in_cols;
+  common.in2_sub_cumh_sel_colhig = common.in2_sub_cumh_cols - 1;
+  common.in2_sub_cumh_sel_rows =
+      common.in2_sub_cumh_sel_rowhig - common.in2_sub_cumh_sel_rowlow + 1;
+  common.in2_sub_cumh_sel_cols =
+      common.in2_sub_cumh_sel_colhig - common.in2_sub_cumh_sel_collow + 1;
+  common.in2_sub_cumh_sel_elem =
+      common.in2_sub_cumh_sel_rows * common.in2_sub_cumh_sel_cols;
+  common.in2_sub_cumh_sel_mem = sizeof(float) * common.in2_sub_cumh_sel_elem;
+
+  // selection 2, subtraction
+  common.in2_sub_cumh_sel2_rowlow = 1;
+  common.in2_sub_cumh_sel2_rowhig = common.in2_sub_cumh_rows;
+  common.in2_sub_cumh_sel2_collow = 1;
+  common.in2_sub_cumh_sel2_colhig =
+      common.in2_sub_cumh_cols - common.in_cols - 1;
+  common.in2_sub2_rows =
+      common.in2_sub_cumh_sel2_rowhig - common.in2_sub_cumh_sel2_rowlow + 1;
+  common.in2_sub2_cols =
+      common.in2_sub_cumh_sel2_colhig - common.in2_sub_cumh_sel2_collow + 1;
+  common.in2_sub2_elem = common.in2_sub2_rows * common.in2_sub2_cols;
+  common.in2_sub2_mem = sizeof(float) * common.in2_sub2_elem;
+
+  //================================================================================
+  //	CUMULATIVE SUM 2
+  //================================================================================
+
+  // multiplication
+  common.in2_sqr_rows = common.in2_rows;
+  common.in2_sqr_cols = common.in2_cols;
+  common.in2_sqr_elem = common.in2_elem;
+  common.in2_sqr_mem = common.in2_mem;
+
+  // selection 2, subtraction
+  common.in2_sqr_sub2_rows = common.in2_sub2_rows;
+  common.in2_sqr_sub2_cols = common.in2_sub2_cols;
+  common.in2_sqr_sub2_elem = common.in2_sub2_elem;
+  common.in2_sqr_sub2_mem = common.in2_sub2_mem;
+
+  //================================================================================
+  //	FINAL
+  //================================================================================
+
+  common.in_sqr_rows = common.in_rows;
+  common.in_sqr_cols = common.in_cols;
+  common.in_sqr_elem = common.in_elem;
+  common.in_sqr_mem = common.in_mem;
+
+  //================================================================================
+  //	TEMPLATE MASK CREATE
+  //================================================================================
+
+  common.tMask_rows = common.in_rows + (common.sSize + 1 + common.sSize) - 1;
+  common.tMask_cols = common.tMask_rows;
+  common.tMask_elem = common.tMask_rows * common.tMask_cols;
+  common.tMask_mem = sizeof(float) * common.tMask_elem;
+
+  //================================================================================
+  //	POINT MASK INITIALIZE
+  //================================================================================
+
+  common.mask_rows = common.maxMove;
+  common.mask_cols = common.mask_rows;
+  common.mask_elem = common.mask_rows * common.mask_cols;
+  common.mask_mem = sizeof(float) * common.mask_elem;
+
+  //================================================================================
+  //	MASK CONVOLUTION
+  //================================================================================
+
+  common.mask_conv_rows = common.tMask_rows; // number of rows in I
+  common.mask_conv_cols = common.tMask_cols; // number of columns in I
+  common.mask_conv_elem =
+      common.mask_conv_rows * common.mask_conv_cols; // number of elements
+  common.mask_conv_mem = sizeof(float) * common.mask_conv_elem;
+  // offsets are (size - 1) / 2 rounded up
+  common.mask_conv_ioffset = (common.mask_rows - 1) / 2;
+  if ((common.mask_rows - 1) % 2 > 0.5) {
+    common.mask_conv_ioffset = common.mask_conv_ioffset + 1;
+  }
+  common.mask_conv_joffset = (common.mask_cols - 1) / 2;
+  if ((common.mask_cols - 1) % 2 > 0.5) {
+    common.mask_conv_joffset = common.mask_conv_joffset + 1;
+  }
+
+  //================================================================================
+  //	PER-POINT DEVICE WORK BUFFERS
+  //================================================================================
+
+  // every size is known at this point, so each point gets all of its buffers
+  // in a single pass
+  for (i = 0; i < common.allPoints; i++) {
+    cudaMalloc((void **)&unique[i].d_in2, common.in2_mem);
+    cudaMalloc((void **)&unique[i].d_conv, common.conv_mem);
+    cudaMalloc((void **)&unique[i].d_in2_pad_cumv, common.in2_pad_cumv_mem);
+    cudaMalloc((void **)&unique[i].d_in2_pad_cumv_sel,
+               common.in2_pad_cumv_sel_mem);
+    cudaMalloc((void **)&unique[i].d_in2_sub_cumh, common.in2_sub_cumh_mem);
+    cudaMalloc((void **)&unique[i].d_in2_sub_cumh_sel,
+               common.in2_sub_cumh_sel_mem);
+    cudaMalloc((void **)&unique[i].d_in2_sub2, common.in2_sub2_mem);
+    cudaMalloc((void **)&unique[i].d_in2_sqr, common.in2_sqr_mem);
+    cudaMalloc((void **)&unique[i].d_in2_sqr_sub2, common.in2_sqr_sub2_mem);
+    cudaMalloc((void **)&unique[i].d_in_sqr, common.in_sqr_mem);
+    cudaMalloc((void **)&unique[i].d_tMask, common.tMask_mem);
+    cudaMalloc((void **)&unique[i].d_mask_conv, common.mask_conv_mem);
+  }
+
+  //================================================================================
+  //	KERNEL
+  //================================================================================
+
+  // All kernels operations within kernel use same max size of threads. Size of
+  // block size is set to the size appropriate for max size operation (on padded
+  // matrix). Other use subsets of that.
+  threads.x = NUMBER_THREADS; // define the number of threads in the block
+  threads.y = 1;
+  blocks.x = common.allPoints; // define the number of blocks in the grid
+  blocks.y = 1;
+
+  // copy arguments
+  cudaMemcpyToSymbol(d_common, &common, sizeof(params_common));
+  cudaMemcpyToSymbol(d_unique, &unique, sizeof(params_unique) * ALL_POINTS);
+
+  // print frame progress start
+  printf("frame progress: ");
+  fflush(NULL);
+
+  // launch
+  for (common_change.frame_no = 0; common_change.frame_no < frames_processed;
+       common_change.frame_no++) {
+    printf("get frame\n");
+    // Extract a cropped version of the first frame from the video file
+    frame = get_frame(
+        frames,                 // pointer to video file
+        common_change.frame_no, // number of frame that needs to be returned
+        0,                      // cropped?
+        0,                      // scaled?
+        1);                     // converted
+    printf("memcpy\n");
+    // copy frame to GPU memory
+    cudaMemcpy(common_change.d_frame, frame, common.frame_mem,
+               cudaMemcpyHostToDevice);
+    printf("toSymbol\n");
+    cudaMemcpyToSymbol(d_common_change, &common_change,
+                       sizeof(params_common_change));
+
+    // launch GPU kernel with the execution configuration prepared above: one
+    // block per tracked point, NUMBER_THREADS threads per block
+    printf("launch\n");
+    kernel<<<blocks, threads>>>();
+    cudaDeviceSynchronize();
+    printf("return\n");
+    // free frame after each loop iteration, since AVI library allocates memory
+    // for every frame fetched
+    printf("free\n");
+    free(frame);
+
+    // print frame progress
+    printf("%d ", common_change.frame_no);
+    fflush(NULL);
+  }
+
+  // print frame progress end
+  printf("\n");
+  fflush(NULL);
+
+  //================================================================================
+  //	OUTPUT
+  //================================================================================
+
+  cudaMemcpy(common.tEndoRowLoc, common.d_tEndoRowLoc,
+             common.endo_mem * common.no_frames, cudaMemcpyDeviceToHost);
+  cudaMemcpy(common.tEndoColLoc, common.d_tEndoColLoc,
+             common.endo_mem * common.no_frames, cudaMemcpyDeviceToHost);
+
+  cudaMemcpy(common.tEpiRowLoc, common.d_tEpiRowLoc,
+             common.epi_mem * common.no_frames, cudaMemcpyDeviceToHost);
+  cudaMemcpy(common.tEpiColLoc, common.d_tEpiColLoc,
+             common.epi_mem * common.no_frames, cudaMemcpyDeviceToHost);
+
+#ifdef OUTPUT
+
+  // dump data to file (cast: write_data takes a non-const char *)
+  write_data((char *)"result.txt", common.no_frames, frames_processed,
+             common.endoPoints, common.tEndoRowLoc, common.tEndoColLoc,
+             common.epiPoints, common.tEpiRowLoc, common.tEpiColLoc);
+
+#endif
+
+  //================================================================================
+  //	DEALLOCATION
+  //================================================================================
+
+  // frame
+  cudaFree(common_change.d_frame);
+
+  // endo points
+  free(common.endoRow);
+  free(common.endoCol);
+  free(common.tEndoRowLoc);
+  free(common.tEndoColLoc);
+
+  cudaFree(common.d_endoRow);
+  cudaFree(common.d_endoCol);
+  cudaFree(common.d_tEndoRowLoc);
+  cudaFree(common.d_tEndoColLoc);
+
+  cudaFree(common.d_endoT);
+
+  // epi points
+  free(common.epiRow);
+  free(common.epiCol);
+  free(common.tEpiRowLoc);
+  free(common.tEpiColLoc);
+
+  cudaFree(common.d_epiRow);
+  cudaFree(common.d_epiCol);
+  cudaFree(common.d_tEpiRowLoc);
+  cudaFree(common.d_tEpiColLoc);
+
+  cudaFree(common.d_epiT);
+
+  // per-point work buffers
+  for (i = 0; i < common.allPoints; i++) {
+    cudaFree(unique[i].d_in2);
+
+    cudaFree(unique[i].d_conv);
+    cudaFree(unique[i].d_in2_pad_cumv);
+    cudaFree(unique[i].d_in2_pad_cumv_sel);
+    cudaFree(unique[i].d_in2_sub_cumh);
+    cudaFree(unique[i].d_in2_sub_cumh_sel);
+    cudaFree(unique[i].d_in2_sub2);
+    cudaFree(unique[i].d_in2_sqr);
+    cudaFree(unique[i].d_in2_sqr_sub2);
+    cudaFree(unique[i].d_in_sqr);
+
+    cudaFree(unique[i].d_tMask);
+    cudaFree(unique[i].d_mask_conv);
+  }
+
+  return 0;
+}
+
+//===============================================================================================================================================================================================================
+//===============================================================================================================================================================================================================
+//	END OF MAIN FUNCTION
+//===============================================================================================================================================================================================================
+//===============================================================================================================================================================================================================
--- a/examples/heartwall/run.sh
+++ b/examples/heartwall/run.sh
@ -0,0 +1,17 @@
+#!/bin/bash
+
+cd AVI; make; cd ..;
+
+clang++ -DOUTPUT main.cu -I./AVI  --cuda-path=/usr/local/cuda-10.1 --cuda-gpu-arch=sm_61 -L/usr/local/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v
+
+
+/home/robinhan/repo/open_source_template/build/compilation/kernelTranslator main-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
+/home/robinhan/repo/open_source_template/build/compilation/hostTranslator main-host-x86_64-unknown-linux-gnu.bc host.bc
+
+llc --relocation-model=pic --filetype=obj  kernel.bc
+llc --relocation-model=pic --filetype=obj  host.bc
+
+
+g++ -Wall -L/home/robinhan/repo/open_source_template/build/runtime  -L/home/robinhan/repo/open_source_template/build/runtime/threadPool -o heartwall -fPIC -no-pie host.o kernel.o  ./AVI/avilib.o ./AVI/avimod.o -lc -lx86Runtime -lthreadPool -lpthread
+
+./heartwall /home/robinhan/repo/open_source_template/runtime/examples/rodinia-data/heartwall/test.avi 20
--- a/examples/heartwall/setdevice.cu
+++ b/examples/heartwall/setdevice.cu
@ -0,0 +1,5 @@
+////////////////////////////////////////////////////////////////////////////////
+// Set Device
+////////////////////////////////////////////////////////////////////////////////
+
+// Calls cudaSetDevice(0); the status code it returns is discarded.
+void setdevice(void) {
+  (void)cudaSetDevice(0);
+}
--- a/examples/hotspot/hotspot-cuda-nvptx64-nvidia-cuda-sm_61.ll
+++ b/examples/hotspot/hotspot-cuda-nvptx64-nvidia-cuda-sm_61.ll
@ -0,0 +1,719 @@
+; ModuleID = 'hotspot-cuda-nvptx64-nvidia-cuda-sm_61.bc'
+source_filename = "hotspot.cu"
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+%struct.__cuda_builtin_blockIdx_t = type { i8 }
+%struct.__cuda_builtin_threadIdx_t = type { i8 }
+%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
+
+$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
+
+$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv = comdat any
+
+$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
+
+$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv = comdat any
+
+; Three internal 16 x 16 float arrays in addrspace(3), left undef-initialized.
+; Their mangled names mark them as locals of calculate_temp: temp_on_cuda,
+; power_on_cuda and temp_t.
+@_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda = internal addrspace(3) global [16 x [16 x float]] undef, align 4
+@_ZZ14calculate_tempiPfS_S_iiiiffffffE13power_on_cuda = internal addrspace(3) global [16 x [16 x float]] undef, align 4
+@_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t = internal addrspace(3) global [16 x [16 x float]] undef, align 4
+; extern_weak declarations of the blockIdx / threadIdx builtin objects
+; (addrspace(1)); no function visible in this module references them directly.
+@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
+@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
+
+; Weak definition of cudaMalloc inside this nvptx64 module: it spills %p and %s
+; to stack slots, never reads them back, and always returns the constant 999.
+; Function Attrs: convergent noinline nounwind optnone
+define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
+entry:
+  %p.addr = alloca i8**, align 8
+  %s.addr = alloca i64, align 8
+  store i8** %p, i8*** %p.addr, align 8
+  store i64 %s, i64* %s.addr, align 8
+  ret i32 999
+}
+
+; Weak definition of cudaFuncGetAttributes inside this nvptx64 module: both
+; arguments are spilled to stack slots and ignored; the result is always 999.
+; Function Attrs: convergent noinline nounwind optnone
+define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
+entry:
+  %p.addr = alloca %struct.cudaFuncAttributes*, align 8
+  %c.addr = alloca i8*, align 8
+  store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
+  store i8* %c, i8** %c.addr, align 8
+  ret i32 999
+}
+
+; Weak definition of cudaDeviceGetAttribute inside this nvptx64 module: the
+; three arguments are spilled to stack slots and ignored; nothing is written
+; through %value and the result is always 999.
+; Function Attrs: convergent noinline nounwind optnone
+define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
+entry:
+  %value.addr = alloca i32*, align 8
+  %attr.addr = alloca i32, align 4
+  %device.addr = alloca i32, align 4
+  store i32* %value, i32** %value.addr, align 8
+  store i32 %attr, i32* %attr.addr, align 4
+  store i32 %device, i32* %device.addr, align 4
+  ret i32 999
+}
+
+; Weak definition of cudaGetDevice inside this nvptx64 module: %device is
+; spilled to a stack slot, nothing is written through it, and the result is
+; always 999.
+; Function Attrs: convergent noinline nounwind optnone
+define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
+entry:
+  %device.addr = alloca i32*, align 8
+  store i32* %device, i32** %device.addr, align 8
+  ret i32 999
+}
+
+; Weak definition of cudaOccupancyMaxActiveBlocksPerMultiprocessor inside this
+; nvptx64 module: all four arguments are spilled to stack slots and ignored;
+; nothing is written through %numBlocks and the result is always 999.
+; Function Attrs: convergent noinline nounwind optnone
+define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
+entry:
+  %numBlocks.addr = alloca i32*, align 8
+  %func.addr = alloca i8*, align 8
+  %blockSize.addr = alloca i32, align 4
+  %dynamicSmemSize.addr = alloca i64, align 8
+  store i32* %numBlocks, i32** %numBlocks.addr, align 8
+  store i8* %func, i8** %func.addr, align 8
+  store i32 %blockSize, i32* %blockSize.addr, align 4
+  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
+  ret i32 999
+}
+
+; Weak definition of cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+; inside this nvptx64 module: all five arguments are spilled to stack slots and
+; ignored; nothing is written through %numBlocks and the result is always 999.
+; Function Attrs: convergent noinline nounwind optnone
+define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
+entry:
+  %numBlocks.addr = alloca i32*, align 8
+  %func.addr = alloca i8*, align 8
+  %blockSize.addr = alloca i32, align 4
+  %dynamicSmemSize.addr = alloca i64, align 8
+  %flags.addr = alloca i32, align 4
+  store i32* %numBlocks, i32** %numBlocks.addr, align 8
+  store i8* %func, i8** %func.addr, align 8
+  store i32 %blockSize, i32* %blockSize.addr, align 4
+  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
+  store i32 %flags, i32* %flags.addr, align 4
+  ret i32 999
+}
+
+; calculate_temp (mangled _Z14calculate_tempiPfS_S_iiiiffffff), unoptimized
+; (optnone): every argument and local lives in an alloca slot. Outline:
+;   1. if this thread's (loadYidx, loadXidx) lies inside the grid, copy
+;      temp_src[index] and power[index] into temp_on_cuda[ty][tx] and
+;      power_on_cuda[ty][tx];
+;   2. derive validYmin/validYmax/validXmin/validXmax and the clamped neighbour
+;      indices N, S, W, E;
+;   3. loop i = 0 .. iteration-1: threads passing the guard write a new value
+;      to temp_t[ty][tx]; on every step but the last, those threads copy it
+;      back to temp_on_cuda[ty][tx]; llvm.nvvm.barrier0 separates the phases;
+;   4. if %computed is set, store temp_t[ty][tx] to temp_dst[index].
+; NOTE(review): %computed is only stored inside the loop body, so when
+; %iteration <= 0 the load at for.end reads a slot that was never written.
+; Function Attrs: convergent noinline nounwind optnone
+define dso_local void @_Z14calculate_tempiPfS_S_iiiiffffff(i32 %iteration, float* %power, float* %temp_src, float* %temp_dst, i32 %grid_cols, i32 %grid_rows, i32 %border_cols, i32 %border_rows, float %Cap, float %Rx, float %Ry, float %Rz, float %step, float %time_elapsed) #0 {
+entry:
+  %iteration.addr = alloca i32, align 4
+  %power.addr = alloca float*, align 8
+  %temp_src.addr = alloca float*, align 8
+  %temp_dst.addr = alloca float*, align 8
+  %grid_cols.addr = alloca i32, align 4
+  %grid_rows.addr = alloca i32, align 4
+  %border_cols.addr = alloca i32, align 4
+  %border_rows.addr = alloca i32, align 4
+  %Cap.addr = alloca float, align 4
+  %Rx.addr = alloca float, align 4
+  %Ry.addr = alloca float, align 4
+  %Rz.addr = alloca float, align 4
+  %step.addr = alloca float, align 4
+  %time_elapsed.addr = alloca float, align 4
+  %amb_temp = alloca float, align 4
+  %step_div_Cap = alloca float, align 4
+  %Rx_1 = alloca float, align 4
+  %Ry_1 = alloca float, align 4
+  %Rz_1 = alloca float, align 4
+  %bx = alloca i32, align 4
+  %by = alloca i32, align 4
+  %tx = alloca i32, align 4
+  %ty = alloca i32, align 4
+  %small_block_rows = alloca i32, align 4
+  %small_block_cols = alloca i32, align 4
+  %blkY = alloca i32, align 4
+  %blkX = alloca i32, align 4
+  %blkYmax = alloca i32, align 4
+  %blkXmax = alloca i32, align 4
+  %yidx = alloca i32, align 4
+  %xidx = alloca i32, align 4
+  %loadYidx = alloca i32, align 4
+  %loadXidx = alloca i32, align 4
+  %index = alloca i32, align 4
+  %validYmin = alloca i32, align 4
+  %validYmax = alloca i32, align 4
+  %validXmin = alloca i32, align 4
+  %validXmax = alloca i32, align 4
+  %N = alloca i32, align 4
+  %S = alloca i32, align 4
+  %W = alloca i32, align 4
+  %E = alloca i32, align 4
+  %computed = alloca i8, align 1
+  %i = alloca i32, align 4
+  ; Spill all incoming arguments to their stack slots.
+  store i32 %iteration, i32* %iteration.addr, align 4
+  store float* %power, float** %power.addr, align 8
+  store float* %temp_src, float** %temp_src.addr, align 8
+  store float* %temp_dst, float** %temp_dst.addr, align 8
+  store i32 %grid_cols, i32* %grid_cols.addr, align 4
+  store i32 %grid_rows, i32* %grid_rows.addr, align 4
+  store i32 %border_cols, i32* %border_cols.addr, align 4
+  store i32 %border_rows, i32* %border_rows.addr, align 4
+  store float %Cap, float* %Cap.addr, align 4
+  store float %Rx, float* %Rx.addr, align 4
+  store float %Ry, float* %Ry.addr, align 4
+  store float %Rz, float* %Rz.addr, align 4
+  store float %step, float* %step.addr, align 4
+  store float %time_elapsed, float* %time_elapsed.addr, align 4
+  ; amb_temp = 80.0
+  store float 8.000000e+01, float* %amb_temp, align 4
+  ; bx/by and tx/ty come from the blockIdx / threadIdx __fetch_builtin helpers.
+  %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
+  store i32 %call, i32* %bx, align 4
+  %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #2
+  store i32 %call1, i32* %by, align 4
+  %call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
+  store i32 %call2, i32* %tx, align 4
+  %call3 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #2
+  store i32 %call3, i32* %ty, align 4
+  ; step_div_Cap = step / Cap; Rx_1 = 1 / Rx; Ry_1 = 1 / Ry; Rz_1 = 1 / Rz.
+  %0 = load float, float* %step.addr, align 4
+  %1 = load float, float* %Cap.addr, align 4
+  %div = fdiv float %0, %1
+  store float %div, float* %step_div_Cap, align 4
+  %2 = load float, float* %Rx.addr, align 4
+  %div4 = fdiv float 1.000000e+00, %2
+  store float %div4, float* %Rx_1, align 4
+  %3 = load float, float* %Ry.addr, align 4
+  %div5 = fdiv float 1.000000e+00, %3
+  store float %div5, float* %Ry_1, align 4
+  %4 = load float, float* %Rz.addr, align 4
+  %div6 = fdiv float 1.000000e+00, %4
+  store float %div6, float* %Rz_1, align 4
+  ; small_block_rows = small_block_cols = 16 - iteration * 2.
+  %5 = load i32, i32* %iteration.addr, align 4
+  %mul = mul nsw i32 %5, 2
+  %sub = sub nsw i32 16, %mul
+  store i32 %sub, i32* %small_block_rows, align 4
+  %6 = load i32, i32* %iteration.addr, align 4
+  %mul7 = mul nsw i32 %6, 2
+  %sub8 = sub nsw i32 16, %mul7
+  store i32 %sub8, i32* %small_block_cols, align 4
+  ; blkY = small_block_rows * by - border_rows;
+  ; blkX = small_block_cols * bx - border_cols.
+  %7 = load i32, i32* %small_block_rows, align 4
+  %8 = load i32, i32* %by, align 4
+  %mul9 = mul nsw i32 %7, %8
+  %9 = load i32, i32* %border_rows.addr, align 4
+  %sub10 = sub nsw i32 %mul9, %9
+  store i32 %sub10, i32* %blkY, align 4
+  %10 = load i32, i32* %small_block_cols, align 4
+  %11 = load i32, i32* %bx, align 4
+  %mul11 = mul nsw i32 %10, %11
+  %12 = load i32, i32* %border_cols.addr, align 4
+  %sub12 = sub nsw i32 %mul11, %12
+  store i32 %sub12, i32* %blkX, align 4
+  ; blkYmax = blkY + 16 - 1; blkXmax = blkX + 16 - 1.
+  %13 = load i32, i32* %blkY, align 4
+  %add = add nsw i32 %13, 16
+  %sub13 = sub nsw i32 %add, 1
+  store i32 %sub13, i32* %blkYmax, align 4
+  %14 = load i32, i32* %blkX, align 4
+  %add14 = add nsw i32 %14, 16
+  %sub15 = sub nsw i32 %add14, 1
+  store i32 %sub15, i32* %blkXmax, align 4
+  ; yidx = blkY + ty; xidx = blkX + tx; loadYidx / loadXidx are copies of them.
+  %15 = load i32, i32* %blkY, align 4
+  %16 = load i32, i32* %ty, align 4
+  %add16 = add nsw i32 %15, %16
+  store i32 %add16, i32* %yidx, align 4
+  %17 = load i32, i32* %blkX, align 4
+  %18 = load i32, i32* %tx, align 4
+  %add17 = add nsw i32 %17, %18
+  store i32 %add17, i32* %xidx, align 4
+  %19 = load i32, i32* %yidx, align 4
+  store i32 %19, i32* %loadYidx, align 4
+  %20 = load i32, i32* %xidx, align 4
+  store i32 %20, i32* %loadXidx, align 4
+  ; index = grid_cols * loadYidx + loadXidx.
+  %21 = load i32, i32* %grid_cols.addr, align 4
+  %22 = load i32, i32* %loadYidx, align 4
+  %mul18 = mul nsw i32 %21, %22
+  %23 = load i32, i32* %loadXidx, align 4
+  %add19 = add nsw i32 %mul18, %23
+  store i32 %add19, i32* %index, align 4
+  ; Short-circuit guard for the load below:
+  ; 0 <= loadYidx <= grid_rows - 1 and 0 <= loadXidx <= grid_cols - 1.
+  %24 = load i32, i32* %loadYidx, align 4
+  %cmp = icmp sge i32 %24, 0
+  br i1 %cmp, label %land.lhs.true, label %if.end
+
+land.lhs.true:                                    ; preds = %entry
+  %25 = load i32, i32* %loadYidx, align 4
+  %26 = load i32, i32* %grid_rows.addr, align 4
+  %sub20 = sub nsw i32 %26, 1
+  %cmp21 = icmp sle i32 %25, %sub20
+  br i1 %cmp21, label %land.lhs.true22, label %if.end
+
+land.lhs.true22:                                  ; preds = %land.lhs.true
+  %27 = load i32, i32* %loadXidx, align 4
+  %cmp23 = icmp sge i32 %27, 0
+  br i1 %cmp23, label %land.lhs.true24, label %if.end
+
+land.lhs.true24:                                  ; preds = %land.lhs.true22
+  %28 = load i32, i32* %loadXidx, align 4
+  %29 = load i32, i32* %grid_cols.addr, align 4
+  %sub25 = sub nsw i32 %29, 1
+  %cmp26 = icmp sle i32 %28, %sub25
+  br i1 %cmp26, label %if.then, label %if.end
+
+if.then:                                          ; preds = %land.lhs.true24
+  ; temp_on_cuda[ty][tx] = temp_src[index];
+  ; power_on_cuda[ty][tx] = power[index].
+  %30 = load float*, float** %temp_src.addr, align 8
+  %31 = load i32, i32* %index, align 4
+  %idxprom = sext i32 %31 to i64
+  %arrayidx = getelementptr inbounds float, float* %30, i64 %idxprom
+  %32 = load float, float* %arrayidx, align 4
+  %33 = load i32, i32* %ty, align 4
+  %idxprom27 = sext i32 %33 to i64
+  %arrayidx28 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom27
+  %34 = load i32, i32* %tx, align 4
+  %idxprom29 = sext i32 %34 to i64
+  %arrayidx30 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx28, i64 0, i64 %idxprom29
+  store float %32, float* %arrayidx30, align 4
+  %35 = load float*, float** %power.addr, align 8
+  %36 = load i32, i32* %index, align 4
+  %idxprom31 = sext i32 %36 to i64
+  %arrayidx32 = getelementptr inbounds float, float* %35, i64 %idxprom31
+  %37 = load float, float* %arrayidx32, align 4
+  %38 = load i32, i32* %ty, align 4
+  %idxprom33 = sext i32 %38 to i64
+  %arrayidx34 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE13power_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom33
+  %39 = load i32, i32* %tx, align 4
+  %idxprom35 = sext i32 %39 to i64
+  %arrayidx36 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx34, i64 0, i64 %idxprom35
+  store float %37, float* %arrayidx36, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %land.lhs.true24, %land.lhs.true22, %land.lhs.true, %entry
+  ; Barrier reached on every path, whether or not the load above was executed.
+  call void @llvm.nvvm.barrier0()
+  ; validYmin = (blkY < 0) ? -blkY : 0.
+  %40 = load i32, i32* %blkY, align 4
+  %cmp37 = icmp slt i32 %40, 0
+  br i1 %cmp37, label %cond.true, label %cond.false
+
+cond.true:                                        ; preds = %if.end
+  %41 = load i32, i32* %blkY, align 4
+  %sub38 = sub nsw i32 0, %41
+  br label %cond.end
+
+cond.false:                                       ; preds = %if.end
+  br label %cond.end
+
+cond.end:                                         ; preds = %cond.false, %cond.true
+  %cond = phi i32 [ %sub38, %cond.true ], [ 0, %cond.false ]
+  store i32 %cond, i32* %validYmin, align 4
+  ; validYmax = (blkYmax > grid_rows - 1) ? 15 - (blkYmax - grid_rows + 1) : 15.
+  %42 = load i32, i32* %blkYmax, align 4
+  %43 = load i32, i32* %grid_rows.addr, align 4
+  %sub39 = sub nsw i32 %43, 1
+  %cmp40 = icmp sgt i32 %42, %sub39
+  br i1 %cmp40, label %cond.true41, label %cond.false45
+
+cond.true41:                                      ; preds = %cond.end
+  %44 = load i32, i32* %blkYmax, align 4
+  %45 = load i32, i32* %grid_rows.addr, align 4
+  %sub42 = sub nsw i32 %44, %45
+  %add43 = add nsw i32 %sub42, 1
+  %sub44 = sub nsw i32 15, %add43
+  br label %cond.end46
+
+cond.false45:                                     ; preds = %cond.end
+  br label %cond.end46
+
+cond.end46:                                       ; preds = %cond.false45, %cond.true41
+  %cond47 = phi i32 [ %sub44, %cond.true41 ], [ 15, %cond.false45 ]
+  store i32 %cond47, i32* %validYmax, align 4
+  ; validXmin = (blkX < 0) ? -blkX : 0.
+  %46 = load i32, i32* %blkX, align 4
+  %cmp48 = icmp slt i32 %46, 0
+  br i1 %cmp48, label %cond.true49, label %cond.false51
+
+cond.true49:                                      ; preds = %cond.end46
+  %47 = load i32, i32* %blkX, align 4
+  %sub50 = sub nsw i32 0, %47
+  br label %cond.end52
+
+cond.false51:                                     ; preds = %cond.end46
+  br label %cond.end52
+
+cond.end52:                                       ; preds = %cond.false51, %cond.true49
+  %cond53 = phi i32 [ %sub50, %cond.true49 ], [ 0, %cond.false51 ]
+  store i32 %cond53, i32* %validXmin, align 4
+  ; validXmax = (blkXmax > grid_cols - 1) ? 15 - (blkXmax - grid_cols + 1) : 15.
+  %48 = load i32, i32* %blkXmax, align 4
+  %49 = load i32, i32* %grid_cols.addr, align 4
+  %sub54 = sub nsw i32 %49, 1
+  %cmp55 = icmp sgt i32 %48, %sub54
+  br i1 %cmp55, label %cond.true56, label %cond.false60
+
+cond.true56:                                      ; preds = %cond.end52
+  %50 = load i32, i32* %blkXmax, align 4
+  %51 = load i32, i32* %grid_cols.addr, align 4
+  %sub57 = sub nsw i32 %50, %51
+  %add58 = add nsw i32 %sub57, 1
+  %sub59 = sub nsw i32 15, %add58
+  br label %cond.end61
+
+cond.false60:                                     ; preds = %cond.end52
+  br label %cond.end61
+
+cond.end61:                                       ; preds = %cond.false60, %cond.true56
+  %cond62 = phi i32 [ %sub59, %cond.true56 ], [ 15, %cond.false60 ]
+  store i32 %cond62, i32* %validXmax, align 4
+  ; N = ty - 1; S = ty + 1; W = tx - 1; E = tx + 1.
+  %52 = load i32, i32* %ty, align 4
+  %sub63 = sub nsw i32 %52, 1
+  store i32 %sub63, i32* %N, align 4
+  %53 = load i32, i32* %ty, align 4
+  %add64 = add nsw i32 %53, 1
+  store i32 %add64, i32* %S, align 4
+  %54 = load i32, i32* %tx, align 4
+  %sub65 = sub nsw i32 %54, 1
+  store i32 %sub65, i32* %W, align 4
+  %55 = load i32, i32* %tx, align 4
+  %add66 = add nsw i32 %55, 1
+  store i32 %add66, i32* %E, align 4
+  ; N = (N < validYmin) ? validYmin : N.
+  %56 = load i32, i32* %N, align 4
+  %57 = load i32, i32* %validYmin, align 4
+  %cmp67 = icmp slt i32 %56, %57
+  br i1 %cmp67, label %cond.true68, label %cond.false69
+
+cond.true68:                                      ; preds = %cond.end61
+  %58 = load i32, i32* %validYmin, align 4
+  br label %cond.end70
+
+cond.false69:                                     ; preds = %cond.end61
+  %59 = load i32, i32* %N, align 4
+  br label %cond.end70
+
+cond.end70:                                       ; preds = %cond.false69, %cond.true68
+  %cond71 = phi i32 [ %58, %cond.true68 ], [ %59, %cond.false69 ]
+  store i32 %cond71, i32* %N, align 4
+  ; S = (S > validYmax) ? validYmax : S.
+  %60 = load i32, i32* %S, align 4
+  %61 = load i32, i32* %validYmax, align 4
+  %cmp72 = icmp sgt i32 %60, %61
+  br i1 %cmp72, label %cond.true73, label %cond.false74
+
+cond.true73:                                      ; preds = %cond.end70
+  %62 = load i32, i32* %validYmax, align 4
+  br label %cond.end75
+
+cond.false74:                                     ; preds = %cond.end70
+  %63 = load i32, i32* %S, align 4
+  br label %cond.end75
+
+cond.end75:                                       ; preds = %cond.false74, %cond.true73
+  %cond76 = phi i32 [ %62, %cond.true73 ], [ %63, %cond.false74 ]
+  store i32 %cond76, i32* %S, align 4
+  ; W = (W < validXmin) ? validXmin : W.
+  %64 = load i32, i32* %W, align 4
+  %65 = load i32, i32* %validXmin, align 4
+  %cmp77 = icmp slt i32 %64, %65
+  br i1 %cmp77, label %cond.true78, label %cond.false79
+
+cond.true78:                                      ; preds = %cond.end75
+  %66 = load i32, i32* %validXmin, align 4
+  br label %cond.end80
+
+cond.false79:                                     ; preds = %cond.end75
+  %67 = load i32, i32* %W, align 4
+  br label %cond.end80
+
+cond.end80:                                       ; preds = %cond.false79, %cond.true78
+  %cond81 = phi i32 [ %66, %cond.true78 ], [ %67, %cond.false79 ]
+  store i32 %cond81, i32* %W, align 4
+  ; E = (E > validXmax) ? validXmax : E.
+  %68 = load i32, i32* %E, align 4
+  %69 = load i32, i32* %validXmax, align 4
+  %cmp82 = icmp sgt i32 %68, %69
+  br i1 %cmp82, label %cond.true83, label %cond.false84
+
+cond.true83:                                      ; preds = %cond.end80
+  %70 = load i32, i32* %validXmax, align 4
+  br label %cond.end85
+
+cond.false84:                                     ; preds = %cond.end80
+  %71 = load i32, i32* %E, align 4
+  br label %cond.end85
+
+cond.end85:                                       ; preds = %cond.false84, %cond.true83
+  %cond86 = phi i32 [ %70, %cond.true83 ], [ %71, %cond.false84 ]
+  store i32 %cond86, i32* %E, align 4
+  ; for (i = 0; i < iteration; i++)
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %cond.end85
+  %72 = load i32, i32* %i, align 4
+  %73 = load i32, i32* %iteration.addr, align 4
+  %cmp87 = icmp slt i32 %72, %73
+  br i1 %cmp87, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  ; computed = false; then the short-circuit guard
+  ;   i + 1 <= tx <= 16 - i - 2  and  i + 1 <= ty <= 16 - i - 2  and
+  ;   validXmin <= tx <= validXmax  and  validYmin <= ty <= validYmax.
+  store i8 0, i8* %computed, align 1
+  %74 = load i32, i32* %tx, align 4
+  %75 = load i32, i32* %i, align 4
+  %add88 = add nsw i32 %75, 1
+  %cmp89 = icmp sge i32 %74, %add88
+  br i1 %cmp89, label %land.lhs.true90, label %if.end175
+
+land.lhs.true90:                                  ; preds = %for.body
+  %76 = load i32, i32* %tx, align 4
+  %77 = load i32, i32* %i, align 4
+  %sub91 = sub nsw i32 16, %77
+  %sub92 = sub nsw i32 %sub91, 2
+  %cmp93 = icmp sle i32 %76, %sub92
+  br i1 %cmp93, label %land.lhs.true94, label %if.end175
+
+land.lhs.true94:                                  ; preds = %land.lhs.true90
+  %78 = load i32, i32* %ty, align 4
+  %79 = load i32, i32* %i, align 4
+  %add95 = add nsw i32 %79, 1
+  %cmp96 = icmp sge i32 %78, %add95
+  br i1 %cmp96, label %land.lhs.true97, label %if.end175
+
+land.lhs.true97:                                  ; preds = %land.lhs.true94
+  %80 = load i32, i32* %ty, align 4
+  %81 = load i32, i32* %i, align 4
+  %sub98 = sub nsw i32 16, %81
+  %sub99 = sub nsw i32 %sub98, 2
+  %cmp100 = icmp sle i32 %80, %sub99
+  br i1 %cmp100, label %land.lhs.true101, label %if.end175
+
+land.lhs.true101:                                 ; preds = %land.lhs.true97
+  %82 = load i32, i32* %tx, align 4
+  %83 = load i32, i32* %validXmin, align 4
+  %cmp102 = icmp sge i32 %82, %83
+  br i1 %cmp102, label %land.lhs.true103, label %if.end175
+
+land.lhs.true103:                                 ; preds = %land.lhs.true101
+  %84 = load i32, i32* %tx, align 4
+  %85 = load i32, i32* %validXmax, align 4
+  %cmp104 = icmp sle i32 %84, %85
+  br i1 %cmp104, label %land.lhs.true105, label %if.end175
+
+land.lhs.true105:                                 ; preds = %land.lhs.true103
+  %86 = load i32, i32* %ty, align 4
+  %87 = load i32, i32* %validYmin, align 4
+  %cmp106 = icmp sge i32 %86, %87
+  br i1 %cmp106, label %land.lhs.true107, label %if.end175
+
+land.lhs.true107:                                 ; preds = %land.lhs.true105
+  %88 = load i32, i32* %ty, align 4
+  %89 = load i32, i32* %validYmax, align 4
+  %cmp108 = icmp sle i32 %88, %89
+  br i1 %cmp108, label %if.then109, label %if.end175
+
+if.then109:                                       ; preds = %land.lhs.true107
+  ; computed = true, then with T = temp_on_cuda and P = power_on_cuda:
+  ;   temp_t[ty][tx] = T[ty][tx] + step_div_Cap *
+  ;       (P[ty][tx]
+  ;        + (T[S][tx] + T[N][tx] - 2.0 * T[ty][tx]) * Ry_1
+  ;        + (T[ty][E] + T[ty][W] - 2.0 * T[ty][tx]) * Rx_1
+  ;        + (amb_temp - T[ty][tx]) * Rz_1)
+  ; The neighbour sums and the amb_temp term are formed in float; the rest is
+  ; evaluated in double and the result is truncated back to float.
+  store i8 1, i8* %computed, align 1
+  %90 = load i32, i32* %ty, align 4
+  %idxprom110 = sext i32 %90 to i64
+  %arrayidx111 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom110
+  %91 = load i32, i32* %tx, align 4
+  %idxprom112 = sext i32 %91 to i64
+  %arrayidx113 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx111, i64 0, i64 %idxprom112
+  %92 = load float, float* %arrayidx113, align 4
+  %conv = fpext float %92 to double
+  %93 = load float, float* %step_div_Cap, align 4
+  %conv114 = fpext float %93 to double
+  %94 = load i32, i32* %ty, align 4
+  %idxprom115 = sext i32 %94 to i64
+  %arrayidx116 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE13power_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom115
+  %95 = load i32, i32* %tx, align 4
+  %idxprom117 = sext i32 %95 to i64
+  %arrayidx118 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx116, i64 0, i64 %idxprom117
+  %96 = load float, float* %arrayidx118, align 4
+  %conv119 = fpext float %96 to double
+  %97 = load i32, i32* %S, align 4
+  %idxprom120 = sext i32 %97 to i64
+  %arrayidx121 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom120
+  %98 = load i32, i32* %tx, align 4
+  %idxprom122 = sext i32 %98 to i64
+  %arrayidx123 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx121, i64 0, i64 %idxprom122
+  %99 = load float, float* %arrayidx123, align 4
+  %100 = load i32, i32* %N, align 4
+  %idxprom124 = sext i32 %100 to i64
+  %arrayidx125 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom124
+  %101 = load i32, i32* %tx, align 4
+  %idxprom126 = sext i32 %101 to i64
+  %arrayidx127 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx125, i64 0, i64 %idxprom126
+  %102 = load float, float* %arrayidx127, align 4
+  %add128 = fadd contract float %99, %102
+  %conv129 = fpext float %add128 to double
+  %103 = load i32, i32* %ty, align 4
+  %idxprom130 = sext i32 %103 to i64
+  %arrayidx131 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom130
+  %104 = load i32, i32* %tx, align 4
+  %idxprom132 = sext i32 %104 to i64
+  %arrayidx133 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx131, i64 0, i64 %idxprom132
+  %105 = load float, float* %arrayidx133, align 4
+  %conv134 = fpext float %105 to double
+  %mul135 = fmul contract double 2.000000e+00, %conv134
+  %sub136 = fsub contract double %conv129, %mul135
+  %106 = load float, float* %Ry_1, align 4
+  %conv137 = fpext float %106 to double
+  %mul138 = fmul contract double %sub136, %conv137
+  %add139 = fadd contract double %conv119, %mul138
+  %107 = load i32, i32* %ty, align 4
+  %idxprom140 = sext i32 %107 to i64
+  %arrayidx141 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom140
+  %108 = load i32, i32* %E, align 4
+  %idxprom142 = sext i32 %108 to i64
+  %arrayidx143 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx141, i64 0, i64 %idxprom142
+  %109 = load float, float* %arrayidx143, align 4
+  %110 = load i32, i32* %ty, align 4
+  %idxprom144 = sext i32 %110 to i64
+  %arrayidx145 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom144
+  %111 = load i32, i32* %W, align 4
+  %idxprom146 = sext i32 %111 to i64
+  %arrayidx147 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx145, i64 0, i64 %idxprom146
+  %112 = load float, float* %arrayidx147, align 4
+  %add148 = fadd contract float %109, %112
+  %conv149 = fpext float %add148 to double
+  %113 = load i32, i32* %ty, align 4
+  %idxprom150 = sext i32 %113 to i64
+  %arrayidx151 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom150
+  %114 = load i32, i32* %tx, align 4
+  %idxprom152 = sext i32 %114 to i64
+  %arrayidx153 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx151, i64 0, i64 %idxprom152
+  %115 = load float, float* %arrayidx153, align 4
+  %conv154 = fpext float %115 to double
+  %mul155 = fmul contract double 2.000000e+00, %conv154
+  %sub156 = fsub contract double %conv149, %mul155
+  %116 = load float, float* %Rx_1, align 4
+  %conv157 = fpext float %116 to double
+  %mul158 = fmul contract double %sub156, %conv157
+  %add159 = fadd contract double %add139, %mul158
+  %117 = load float, float* %amb_temp, align 4
+  %118 = load i32, i32* %ty, align 4
+  %idxprom160 = sext i32 %118 to i64
+  %arrayidx161 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom160
+  %119 = load i32, i32* %tx, align 4
+  %idxprom162 = sext i32 %119 to i64
+  %arrayidx163 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx161, i64 0, i64 %idxprom162
+  %120 = load float, float* %arrayidx163, align 4
+  %sub164 = fsub contract float %117, %120
+  %121 = load float, float* %Rz_1, align 4
+  %mul165 = fmul contract float %sub164, %121
+  %conv166 = fpext float %mul165 to double
+  %add167 = fadd contract double %add159, %conv166
+  %mul168 = fmul contract double %conv114, %add167
+  %add169 = fadd contract double %conv, %mul168
+  %conv170 = fptrunc double %add169 to float
+  %122 = load i32, i32* %ty, align 4
+  %idxprom171 = sext i32 %122 to i64
+  %arrayidx172 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t to [16 x [16 x float]]*), i64 0, i64 %idxprom171
+  %123 = load i32, i32* %tx, align 4
+  %idxprom173 = sext i32 %123 to i64
+  %arrayidx174 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx172, i64 0, i64 %idxprom173
+  store float %conv170, float* %arrayidx174, align 4
+  br label %if.end175
+
+if.end175:                                        ; preds = %if.then109, %land.lhs.true107, %land.lhs.true105, %land.lhs.true103, %land.lhs.true101, %land.lhs.true97, %land.lhs.true94, %land.lhs.true90, %for.body
+  ; Barrier after the compute phase; on the last step (i == iteration - 1) the
+  ; loop is left here, skipping the copy-back and the third barrier.
+  call void @llvm.nvvm.barrier0()
+  %124 = load i32, i32* %i, align 4
+  %125 = load i32, i32* %iteration.addr, align 4
+  %sub176 = sub nsw i32 %125, 1
+  %cmp177 = icmp eq i32 %124, %sub176
+  br i1 %cmp177, label %if.then178, label %if.end179
+
+if.then178:                                       ; preds = %if.end175
+  br label %for.end
+
+if.end179:                                        ; preds = %if.end175
+  %126 = load i8, i8* %computed, align 1
+  %tobool = trunc i8 %126 to i1
+  br i1 %tobool, label %if.then180, label %if.end189
+
+if.then180:                                       ; preds = %if.end179
+  ; temp_on_cuda[ty][tx] = temp_t[ty][tx] for threads that computed this step.
+  %127 = load i32, i32* %ty, align 4
+  %idxprom181 = sext i32 %127 to i64
+  %arrayidx182 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t to [16 x [16 x float]]*), i64 0, i64 %idxprom181
+  %128 = load i32, i32* %tx, align 4
+  %idxprom183 = sext i32 %128 to i64
+  %arrayidx184 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx182, i64 0, i64 %idxprom183
+  %129 = load float, float* %arrayidx184, align 4
+  %130 = load i32, i32* %ty, align 4
+  %idxprom185 = sext i32 %130 to i64
+  %arrayidx186 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom185
+  %131 = load i32, i32* %tx, align 4
+  %idxprom187 = sext i32 %131 to i64
+  %arrayidx188 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx186, i64 0, i64 %idxprom187
+  store float %129, float* %arrayidx188, align 4
+  br label %if.end189
+
+if.end189:                                        ; preds = %if.then180, %if.end179
+  ; Barrier after the copy-back, before the next step reads temp_on_cuda.
+  call void @llvm.nvvm.barrier0()
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end189
+  %132 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %132, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %if.then178, %for.cond
+  ; if (computed) temp_dst[index] = temp_t[ty][tx].
+  ; NOTE(review): reached straight from for.cond when the loop body never ran,
+  ; in which case %computed has not been stored.
+  %133 = load i8, i8* %computed, align 1
+  %tobool190 = trunc i8 %133 to i1
+  br i1 %tobool190, label %if.then191, label %if.end198
+
+if.then191:                                       ; preds = %for.end
+  %134 = load i32, i32* %ty, align 4
+  %idxprom192 = sext i32 %134 to i64
+  %arrayidx193 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t to [16 x [16 x float]]*), i64 0, i64 %idxprom192
+  %135 = load i32, i32* %tx, align 4
+  %idxprom194 = sext i32 %135 to i64
+  %arrayidx195 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx193, i64 0, i64 %idxprom194
+  %136 = load float, float* %arrayidx195, align 4
+  %137 = load float*, float** %temp_dst.addr, align 8
+  %138 = load i32, i32* %index, align 4
+  %idxprom196 = sext i32 %138 to i64
+  %arrayidx197 = getelementptr inbounds float, float* %137, i64 %idxprom196
+  store float %136, float* %arrayidx197, align 4
+  br label %if.end198
+
+if.end198:                                        ; preds = %if.then191, %for.end
+  ret void
+}
+
+; __cuda_builtin_blockIdx_t::__fetch_builtin_x(): alwaysinline wrapper that
+; returns the result of llvm.nvvm.read.ptx.sreg.ctaid.x() unchanged.
+; Function Attrs: alwaysinline convergent nounwind
+define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
+entry:
+  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
+  ret i32 %0
+}
+
+; __cuda_builtin_blockIdx_t::__fetch_builtin_y(): alwaysinline wrapper that
+; returns the result of llvm.nvvm.read.ptx.sreg.ctaid.y() unchanged.
+; Function Attrs: alwaysinline convergent nounwind
+define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
+entry:
+  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
+  ret i32 %0
+}
+
+; __cuda_builtin_threadIdx_t::__fetch_builtin_x(): alwaysinline wrapper that
+; returns the result of llvm.nvvm.read.ptx.sreg.tid.x() unchanged.
+; Function Attrs: alwaysinline convergent nounwind
+define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
+entry:
+  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  ret i32 %0
+}
+
+; __cuda_builtin_threadIdx_t::__fetch_builtin_y(): alwaysinline wrapper that
+; returns the result of llvm.nvvm.read.ptx.sreg.tid.y() unchanged.
+; Function Attrs: alwaysinline convergent nounwind
+define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
+entry:
+  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
+  ret i32 %0
+}
+
+; Function Attrs: convergent nounwind
+declare void @llvm.nvvm.barrier0() #2
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #3
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #3
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #3
+
+attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { convergent nounwind }
+attributes #3 = { nounwind readnone }
+
+!llvm.module.flags = !{!0, !1, !2}
+!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
+!llvm.ident = !{!8}
+!nvvmir.version = !{!9}
+
+!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
+!1 = !{i32 1, !"wchar_size", i32 4}
+!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
+!3 = !{void (i32, float*, float*, float*, i32, i32, i32, i32, float, float, float, float, float, float)* @_Z14calculate_tempiPfS_S_iiiiffffff, !"kernel", i32 1}
+!4 = !{null, !"align", i32 8}
+!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
+!6 = !{null, !"align", i32 16}
+!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
+!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
+!9 = !{i32 1, i32 4}
--- a/examples/hotspot/hotspot-host-x86_64-unknown-linux-gnu.ll
+++ b/examples/hotspot/hotspot-host-x86_64-unknown-linux-gnu.ll
--- a/examples/hotspot/hotspot.cu
+++ b/examples/hotspot/hotspot.cu
@ -0,0 +1,353 @@
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+// Edge length of the square thread block / shared-memory tile. A build can
+// override it through one of the RD_WG_SIZE* macros (checked in the order
+// below); when none is defined it defaults to 16.
+#ifdef RD_WG_SIZE_0_0
+#define BLOCK_SIZE RD_WG_SIZE_0_0
+#elif defined(RD_WG_SIZE_0)
+#define BLOCK_SIZE RD_WG_SIZE_0
+#elif defined(RD_WG_SIZE)
+#define BLOCK_SIZE RD_WG_SIZE
+#else
+#define BLOCK_SIZE 16
+#endif
+
+// Size of the per-line text buffer used by readinput().
+#define STR_SIZE 256
+
+/* maximum power density possible (say 300W for a 10mm x 10mm chip) */
+#define MAX_PD (3.0e6)
+/* required precision in degrees */
+#define PRECISION 0.001
+#define SPEC_HEAT_SI 1.75e6
+#define K_SI 100
+/* capacitance fitting factor */
+#define FACTOR_CHIP 0.5
+
+/* chip parameters (read by compute_tran_temp to derive Cap, Rx, Ry and Rz) */
+float t_chip = 0.0005;
+float chip_height = 0.016;
+float chip_width = 0.016;
+/* ambient temperature, assuming no package at all */
+// NOTE(review): the kernel declares its own local amb_temp with the same
+// value; this file-scope variable is not read anywhere else in this file.
+float amb_temp = 80.0;
+
+// Forward declaration; run() is defined after main().
+void run(int argc, char **argv);
+
+/* define timer macros */
+// NOTE(review): startCycle()/stopCycle() are not defined in this file and the
+// three macros below are not used in it.
+#define pin_stats_reset() startCycle()
+#define pin_stats_pause(cycles) stopCycle(cycles)
+#define pin_stats_dump(cycles) printf("timer: %Lu\n", cycles)
+
+/*
+ * Report an error message on stderr, prefixed with "error: ".
+ *
+ * s: NUL-terminated message text. Taken as const char* because every caller
+ *    passes a string literal, which must not be bound to a mutable char*.
+ *
+ * Despite its name this function only prints; it does NOT terminate the
+ * program. Callers that cannot continue must stop execution themselves.
+ */
+void fatal(const char *s) { fprintf(stderr, "error: %s\n", s); }
+
+/*
+ * Write a grid to a text file, one cell per line, as "<index>\t<value>\n".
+ *
+ * vect:      grid values, row-major, grid_rows * grid_cols elements.
+ * grid_rows: number of rows to write.
+ * grid_cols: number of columns to write (also the row stride of vect).
+ * file:      path of the file to create or truncate.
+ *
+ * If the file cannot be opened a message is printed and nothing is written;
+ * the function then returns instead of using the null FILE pointer.
+ */
+void writeoutput(float *vect, int grid_rows, int grid_cols, char *file) {
+
+  FILE *fp = fopen(file, "w");
+  if (fp == NULL) {
+    printf("The file was not opened\n");
+    return;
+  }
+
+  // index is the running cell number in row-major order.
+  int index = 0;
+  for (int i = 0; i < grid_rows; i++)
+    for (int j = 0; j < grid_cols; j++) {
+      fprintf(fp, "%d\t%g\n", index, vect[i * grid_cols + j]);
+      index++;
+    }
+
+  fclose(fp);
+}
+
+/*
+ * Fill a grid from a text file that holds one floating-point value per line.
+ *
+ * vect:      destination, row-major, must hold grid_rows * grid_cols floats.
+ * grid_rows: number of rows to read.
+ * grid_cols: number of columns to read (also the row stride of vect).
+ * file:      path of the input file.
+ *
+ * The input is required for the simulation, so any failure (file cannot be
+ * opened, too few lines, a line that does not parse as a float) is reported
+ * and the process exits with EXIT_FAILURE rather than continuing with
+ * unset or stale values.
+ */
+void readinput(float *vect, int grid_rows, int grid_cols, char *file) {
+
+  FILE *fp = fopen(file, "r");
+  if (fp == NULL) {
+    printf("The file was not opened\n");
+    exit(EXIT_FAILURE);
+  }
+
+  char str[STR_SIZE];
+  for (int i = 0; i < grid_rows; i++)
+    for (int j = 0; j < grid_cols; j++) {
+      float val;
+      // Test the fgets() result itself: checking feof() afterwards would
+      // wrongly flag a final line that has no trailing newline.
+      if (fgets(str, STR_SIZE, fp) == NULL) {
+        fatal("not enough lines in file");
+        fclose(fp);
+        exit(EXIT_FAILURE);
+      }
+      if (sscanf(str, "%f", &val) != 1) {
+        fatal("invalid file format");
+        fclose(fp);
+        exit(EXIT_FAILURE);
+      }
+      vect[i * grid_cols + j] = val;
+    }
+
+  fclose(fp);
+}
+
+// True when min <= x <= max (both bounds inclusive).
+#define IN_RANGE(x, min, max) ((x) >= (min) && (x) <= (max))
+// Clamp the lvalue x into [min, max] in place. Every argument and the whole
+// replacement are parenthesized so the macro is safe inside larger
+// expressions. Arguments are evaluated more than once.
+#define CLAMP_RANGE(x, min, max)                                               \
+  ((x) = ((x) < (min)) ? (min) : (((x) > (max)) ? (max) : (x)))
+// Smaller of a and b; arguments are evaluated more than once.
+#define MIN(a, b) ((a) <= (b) ? (a) : (b))
+
+/*
+ * Advance the temperature grid by `iteration` time steps for one tile.
+ *
+ * Each thread block loads a BLOCK_SIZE x BLOCK_SIZE tile of temp_src and power
+ * into shared memory, offset by (border_rows, border_cols) so that neighbouring
+ * tiles overlap. In step i only the threads whose tile coordinates lie in
+ * [i + 1, BLOCK_SIZE - i - 2] (and inside the grid) compute a new value, so the
+ * computed region shrinks by one cell per side per step. After the last step
+ * the threads that computed in that step write their result to temp_dst.
+ *
+ * iteration:    number of time steps to perform in this launch.
+ * power:        per-cell power input, grid_rows * grid_cols floats.
+ * temp_src:     temperatures read by this launch.
+ * temp_dst:     temperatures written by this launch.
+ * grid_cols:    grid width (row stride of the arrays).
+ * grid_rows:    grid height.
+ * border_cols:  horizontal offset subtracted from the tile origin.
+ * border_rows:  vertical offset subtracted from the tile origin.
+ * Cap:          capacitance; only step / Cap is used.
+ * Rx, Ry, Rz:   resistances; only their reciprocals are used.
+ * step:         time-step length.
+ * time_elapsed: not referenced by the kernel body.
+ *
+ * The kernel indexes shared memory with threadIdx.{x,y} directly, so it is
+ * written for a launch with BLOCK_SIZE x BLOCK_SIZE threads per block.
+ */
+__global__ void calculate_temp(int iteration,   // number of iteration
+                               float *power,    // power input
+                               float *temp_src, // temperature input/output
+                               float *temp_dst, // temperature input/output
+                               int grid_cols,   // Col of grid
+                               int grid_rows,   // Row of grid
+                               int border_cols, // border offset
+                               int border_rows, // border offset
+                               float Cap,       // Capacitance
+                               float Rx, float Ry, float Rz, float step,
+                               float time_elapsed) {
+
+  __shared__ float temp_on_cuda[BLOCK_SIZE][BLOCK_SIZE];
+  __shared__ float power_on_cuda[BLOCK_SIZE][BLOCK_SIZE];
+  __shared__ float temp_t[BLOCK_SIZE]
+                         [BLOCK_SIZE]; // saving temporary temperature result
+
+  // Local ambient temperature; it has the same name and value as the
+  // file-scope host variable.
+  float amb_temp = 80.0;
+  float step_div_Cap;
+  float Rx_1, Ry_1, Rz_1;
+
+  int bx = blockIdx.x;
+  int by = blockIdx.y;
+
+  int tx = threadIdx.x;
+  int ty = threadIdx.y;
+
+  step_div_Cap = step / Cap;
+
+  Rx_1 = 1 / Rx;
+  Ry_1 = 1 / Ry;
+  Rz_1 = 1 / Rz;
+
+  // each block finally computes result for a small block
+  // after N iterations.
+  // it is the non-overlapping small blocks that cover
+  // all the input data
+
+  // calculate the small block size
+  int small_block_rows = BLOCK_SIZE - iteration * 2; // EXPAND_RATE
+  int small_block_cols = BLOCK_SIZE - iteration * 2; // EXPAND_RATE
+
+  // calculate the boundary for the block according to
+  // the boundary of its small block
+  int blkY = small_block_rows * by - border_rows;
+  int blkX = small_block_cols * bx - border_cols;
+  int blkYmax = blkY + BLOCK_SIZE - 1;
+  int blkXmax = blkX + BLOCK_SIZE - 1;
+
+  // calculate the global thread coordination
+  int yidx = blkY + ty;
+  int xidx = blkX + tx;
+
+  // load data if it is within the valid input range
+  int loadYidx = yidx, loadXidx = xidx;
+  int index = grid_cols * loadYidx + loadXidx;
+
+  if (IN_RANGE(loadYidx, 0, grid_rows - 1) &&
+      IN_RANGE(loadXidx, 0, grid_cols - 1)) {
+    temp_on_cuda[ty][tx] = temp_src[index]; // Load the temperature data from
+                                            // global memory to shared memory
+    power_on_cuda[ty][tx] =
+        power[index]; // Load the power data from global memory to shared memory
+  }
+  // Make the loaded tile visible to every thread of the block.
+  __syncthreads();
+
+  // effective range within this block that falls within
+  // the valid range of the input data
+  // used to rule out computation outside the boundary.
+  int validYmin = (blkY < 0) ? -blkY : 0;
+  int validYmax = (blkYmax > grid_rows - 1)
+                      ? BLOCK_SIZE - 1 - (blkYmax - grid_rows + 1)
+                      : BLOCK_SIZE - 1;
+  int validXmin = (blkX < 0) ? -blkX : 0;
+  int validXmax = (blkXmax > grid_cols - 1)
+                      ? BLOCK_SIZE - 1 - (blkXmax - grid_cols + 1)
+                      : BLOCK_SIZE - 1;
+
+  // Neighbour coordinates inside the tile, clamped to the valid range so a
+  // cell on the grid edge uses its own row/column in place of the missing
+  // neighbour.
+  int N = ty - 1;
+  int S = ty + 1;
+  int W = tx - 1;
+  int E = tx + 1;
+
+  N = (N < validYmin) ? validYmin : N;
+  S = (S > validYmax) ? validYmax : S;
+  W = (W < validXmin) ? validXmin : W;
+  E = (E > validXmax) ? validXmax : E;
+
+  // Initialised so the final test below is well defined even when the loop
+  // body never runs (iteration <= 0); in that case nothing is written.
+  bool computed = false;
+  for (int i = 0; i < iteration; i++) {
+    computed = false;
+    if (IN_RANGE(tx, i + 1, BLOCK_SIZE - i - 2) &&
+        IN_RANGE(ty, i + 1, BLOCK_SIZE - i - 2) &&
+        IN_RANGE(tx, validXmin, validXmax) &&
+        IN_RANGE(ty, validYmin, validYmax)) {
+      computed = true;
+      // The 2.0 literals are doubles, so those products are evaluated in
+      // double precision; left unchanged to keep the numerical results.
+      temp_t[ty][tx] =
+          temp_on_cuda[ty][tx] +
+          step_div_Cap * (power_on_cuda[ty][tx] +
+                          (temp_on_cuda[S][tx] + temp_on_cuda[N][tx] -
+                           2.0 * temp_on_cuda[ty][tx]) *
+                              Ry_1 +
+                          (temp_on_cuda[ty][E] + temp_on_cuda[ty][W] -
+                           2.0 * temp_on_cuda[ty][tx]) *
+                              Rx_1 +
+                          (amb_temp - temp_on_cuda[ty][tx]) * Rz_1);
+    }
+    // All reads of temp_on_cuda for this step are done after this barrier.
+    __syncthreads();
+    if (i == iteration - 1)
+      break;
+    if (computed) // Assign the computation range
+      temp_on_cuda[ty][tx] = temp_t[ty][tx];
+    // The updated tile must be complete before the next step reads it.
+    __syncthreads();
+  }
+
+  // update the global memory
+  // after the last iteration, only threads coordinated within the
+  // small block perform the calculation and switch on ``computed''
+  if (computed) {
+    temp_dst[index] = temp_t[ty][tx];
+  }
+}
+
+/*
+   compute N time steps
+*/
+
+/*
+ * Run the transient simulation by launching calculate_temp repeatedly.
+ *
+ * MatrixPower:      device buffer with the per-cell power values.
+ * MatrixTemp:       two device temperature buffers used alternately as
+ *                   source and destination; the first launch reads
+ *                   MatrixTemp[0] and writes MatrixTemp[1].
+ * col, row:         grid width and height.
+ * total_iterations: total number of time steps to simulate.
+ * num_iterations:   time steps performed per kernel launch; the last launch
+ *                   performs only the remaining steps.
+ * blockCols/Rows:   dimensions of the launch grid.
+ * borderCols/Rows:  tile border offsets forwarded to the kernel.
+ *
+ * Returns the index (0 or 1) of the MatrixTemp buffer written by the last
+ * launch, or 0 if no launch was made.
+ */
+int compute_tran_temp(float *MatrixPower, float *MatrixTemp[2], int col,
+                      int row, int total_iterations, int num_iterations,
+                      int blockCols, int blockRows, int borderCols,
+                      int borderRows) {
+  dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
+  dim3 dimGrid(blockCols, blockRows);
+
+  // Physical size of one grid cell.
+  float grid_height = chip_height / row;
+  float grid_width = chip_width / col;
+
+  float Cap = FACTOR_CHIP * SPEC_HEAT_SI * t_chip * grid_width * grid_height;
+  float Rx = grid_width / (2.0 * K_SI * t_chip * grid_height);
+  float Ry = grid_height / (2.0 * K_SI * t_chip * grid_width);
+  float Rz = t_chip / (K_SI * grid_height * grid_width);
+
+  float max_slope = MAX_PD / (FACTOR_CHIP * t_chip * SPEC_HEAT_SI);
+  float step = PRECISION / max_slope;
+  float time_elapsed = 0.001;
+
+  int src = 1, dst = 0;
+
+  // Integer step counter: a float counter would stop advancing exactly once
+  // it exceeds 2^24, and would turn the MIN() result into a float that is
+  // then narrowed to the kernel's int parameter.
+  for (int t = 0; t < total_iterations; t += num_iterations) {
+    // Swap roles so the previous destination becomes the new source.
+    int temp = src;
+    src = dst;
+    dst = temp;
+    calculate_temp<<<dimGrid, dimBlock>>>(
+        MIN(num_iterations, total_iterations - t), MatrixPower, MatrixTemp[src],
+        MatrixTemp[dst], col, row, borderCols, borderRows, Cap, Rx, Ry, Rz,
+        step, time_elapsed);
+    cudaDeviceSynchronize();
+  }
+  return dst;
+}
+
+/*
+ * Print the command-line synopsis to stderr and terminate with exit status 1.
+ *
+ * argc: unused; kept so existing callers need no change.
+ * argv: argv[0] is printed as the program name.
+ *
+ * This function does not return.
+ */
+void usage(int argc, char **argv) {
+  (void)argc;
+  fprintf(stderr,
+          "Usage: %s <grid_rows/grid_cols> <pyramid_height> <sim_time> "
+          "<temp_file> <power_file> <output_file>\n",
+          argv[0]);
+  fprintf(stderr, "\t<grid_rows/grid_cols>  - number of rows/cols in the grid "
+                  "(positive integer)\n");
+  fprintf(stderr, "\t<pyramid_height> - pyramid height (positive integer)\n");
+  fprintf(stderr, "\t<sim_time>   - number of iterations\n");
+  fprintf(stderr, "\t<temp_file>  - name of the file containing the initial "
+                  "temperature values of each cell\n");
+  fprintf(stderr, "\t<power_file> - name of the file containing the dissipated "
+                  "power values of each cell\n");
+  fprintf(stderr, "\t<output_file> - name of the output file\n");
+  exit(1);
+}
+
+/*
+ * Program entry point: selects device 0, reports the work-group size the
+ * binary was built with, then hands the command line to run().
+ * Always returns EXIT_SUCCESS when run() returns.
+ */
+int main(int argc, char **argv) {
+  const int wg_edge = BLOCK_SIZE;
+
+  cudaSetDevice(0);
+  printf("WG size of kernel = %d X %d\n", wg_edge, wg_edge);
+
+  run(argc, argv);
+  return EXIT_SUCCESS;
+}
+
+/*
+ * Parse the command line, load the input grids, run the simulation on the
+ * device and write the resulting temperatures to the output file.
+ *
+ * Expected arguments (argc must be 7):
+ *   argv[1] grid size (used for both rows and columns), argv[2] pyramid
+ *   height, argv[3] total iterations, argv[4] temperature file,
+ *   argv[5] power file, argv[6] output file.
+ *
+ * Invalid arguments end in usage(), which exits. A pyramid height that leaves
+ * no interior cells per block, or a failed host allocation, is reported and
+ * the process exits with EXIT_FAILURE.
+ */
+void run(int argc, char **argv) {
+  int size;
+  int grid_rows, grid_cols;
+  float *FilesavingTemp, *FilesavingPower, *MatrixOut;
+  char *tfile, *pfile, *ofile;
+
+  int total_iterations = 60;
+  int pyramid_height = 1; // number of iterations
+
+  if (argc != 7)
+    usage(argc, argv);
+  // Rows and columns are both taken from argv[1]: the grid is square.
+  if ((grid_rows = atoi(argv[1])) <= 0 || (grid_cols = atoi(argv[1])) <= 0 ||
+      (pyramid_height = atoi(argv[2])) <= 0 ||
+      (total_iterations = atoi(argv[3])) <= 0)
+    usage(argc, argv);
+
+  tfile = argv[4];
+  pfile = argv[5];
+  ofile = argv[6];
+
+  size = grid_rows * grid_cols;
+
+/* --------------- pyramid parameters --------------- */
+#define EXPAND_RATE                                                            \
+  2 // add one iteration will extend the pyramid base by 2 per each borderline
+  int borderCols = (pyramid_height)*EXPAND_RATE / 2;
+  int borderRows = (pyramid_height)*EXPAND_RATE / 2;
+  int smallBlockCol = BLOCK_SIZE - (pyramid_height)*EXPAND_RATE;
+  int smallBlockRow = BLOCK_SIZE - (pyramid_height)*EXPAND_RATE;
+
+  // A non-positive small-block size would divide by zero (or yield a negative
+  // block count) below, so reject such a pyramid height up front.
+  if (smallBlockCol <= 0 || smallBlockRow <= 0) {
+    fatal("pyramid_height is too large for BLOCK_SIZE");
+    exit(EXIT_FAILURE);
+  }
+
+  // Number of blocks needed to cover the grid, rounding up.
+  int blockCols =
+      grid_cols / smallBlockCol + ((grid_cols % smallBlockCol == 0) ? 0 : 1);
+  int blockRows =
+      grid_rows / smallBlockRow + ((grid_rows % smallBlockRow == 0) ? 0 : 1);
+
+  FilesavingTemp = (float *)malloc(size * sizeof(float));
+  FilesavingPower = (float *)malloc(size * sizeof(float));
+  MatrixOut = (float *)calloc(size, sizeof(float));
+
+  if (!FilesavingPower || !FilesavingTemp || !MatrixOut) {
+    // fatal() only prints, so stop here instead of using null buffers.
+    fatal("unable to allocate memory");
+    free(FilesavingTemp);
+    free(FilesavingPower);
+    free(MatrixOut);
+    exit(EXIT_FAILURE);
+  }
+
+  printf("pyramidHeight: %d\ngridSize: [%d, %d]\nborder:[%d, "
+         "%d]\nblockGrid:[%d, %d]\ntargetBlock:[%d, %d]\n",
+         pyramid_height, grid_cols, grid_rows, borderCols, borderRows,
+         blockCols, blockRows, smallBlockCol, smallBlockRow);
+
+  readinput(FilesavingTemp, grid_rows, grid_cols, tfile);
+  readinput(FilesavingPower, grid_rows, grid_cols, pfile);
+
+  // Two device temperature buffers (used alternately by compute_tran_temp)
+  // plus the power buffer; only MatrixTemp[0] is seeded with the input.
+  float *MatrixTemp[2], *MatrixPower;
+  cudaMalloc((void **)&MatrixTemp[0], sizeof(float) * size);
+  cudaMalloc((void **)&MatrixTemp[1], sizeof(float) * size);
+  cudaMemcpy(MatrixTemp[0], FilesavingTemp, sizeof(float) * size,
+             cudaMemcpyHostToDevice);
+
+  cudaMalloc((void **)&MatrixPower, sizeof(float) * size);
+  cudaMemcpy(MatrixPower, FilesavingPower, sizeof(float) * size,
+             cudaMemcpyHostToDevice);
+  printf("Start computing the transient temperature\n");
+  // ret is the index of the MatrixTemp buffer holding the final result.
+  int ret = compute_tran_temp(MatrixPower, MatrixTemp, grid_cols, grid_rows,
+                              total_iterations, pyramid_height, blockCols,
+                              blockRows, borderCols, borderRows);
+  printf("Ending simulation\n");
+  cudaMemcpy(MatrixOut, MatrixTemp[ret], sizeof(float) * size,
+             cudaMemcpyDeviceToHost);
+
+  writeoutput(MatrixOut, grid_rows, grid_cols, ofile);
+
+  cudaFree(MatrixPower);
+  cudaFree(MatrixTemp[0]);
+  cudaFree(MatrixTemp[1]);
+  // Release every host buffer, including the two input staging buffers.
+  free(FilesavingTemp);
+  free(FilesavingPower);
+  free(MatrixOut);
+}
--- a/examples/hotspot/run.sh
+++ b/examples/hotspot/run.sh
@ -0,0 +1,21 @@
+#!/bin/bash
+set -e
+llvm-as hotspot-cuda-nvptx64-nvidia-cuda-sm_61.ll
+llvm-as hotspot-host-x86_64-unknown-linux-gnu.ll
+../../build/compilation/kernelTranslator hotspot-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
+../../build/compilation/hostTranslator hotspot-host-x86_64-unknown-linux-gnu.bc host.bc
+
+llc --relocation-model=pic --filetype=obj  kernel.bc
+llc --relocation-model=pic --filetype=obj  host.bc
+
+g++ -Wall -L../../build/runtime  -L../../build/runtime/threadPool \
+    -o hotspot -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread
+
+export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH
+./hotspot 512 2 2 ../../rodinia-data/hotspot/temp_512 ../../rodinia-data/hotspot/power_512 output.out
+if head output.out | grep -q "323.829"; then
+    echo "Pass"
+else
+    echo "Error result"
+    exit 1
+fi
--- a/examples/hotspot3D/3D-cuda-nvptx64-nvidia-cuda-sm_61.ll
+++ b/examples/hotspot3D/3D-cuda-nvptx64-nvidia-cuda-sm_61.ll
@ -0,0 +1,587 @@
+; ModuleID = '3D-cuda-nvptx64-nvidia-cuda-sm_61.bc'
+; Device-side module generated from 3D.cu for the nvptx64-nvidia-cuda target.
+source_filename = "3D.cu"
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; Single-byte placeholder struct types for the CUDA built-in index variables,
+; plus the layout of cudaFuncAttributes used by the stub below.
+%struct.__cuda_builtin_blockDim_t = type { i8 }
+%struct.__cuda_builtin_blockIdx_t = type { i8 }
+%struct.__cuda_builtin_threadIdx_t = type { i8 }
+%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
+
+; COMDAT groups for the x/y accessor functions of blockDim, blockIdx and
+; threadIdx.
+$_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv = comdat any
+
+$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
+
+$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
+
+$_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv = comdat any
+
+$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv = comdat any
+
+$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv = comdat any
+
+; The built-in variables themselves: extern_weak globals in address space 1.
+@blockDim = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockDim_t, align 1
+@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
+@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
+
+; Weak stub of cudaMalloc in the device module: it spills both arguments to
+; stack slots, performs no allocation, and always returns 999.
+; NOTE(review): 999 presumably corresponds to cudaErrorUnknown - confirm
+; against the CUDA headers.
+; Function Attrs: convergent noinline nounwind optnone
+define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
+entry:
+  %p.addr = alloca i8**, align 8
+  %s.addr = alloca i64, align 8
+  store i8** %p, i8*** %p.addr, align 8
+  store i64 %s, i64* %s.addr, align 8
+  ret i32 999
+}
+
+; Weak stub of cudaFuncGetAttributes: it spills both arguments to stack slots,
+; never writes through %p, and always returns 999.
+; NOTE(review): 999 presumably corresponds to cudaErrorUnknown - confirm.
+; Function Attrs: convergent noinline nounwind optnone
+define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
+entry:
+  %p.addr = alloca %struct.cudaFuncAttributes*, align 8
+  %c.addr = alloca i8*, align 8
+  store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
+  store i8* %c, i8** %c.addr, align 8
+  ret i32 999
+}
+
+; Weak stub of cudaDeviceGetAttribute: it spills its three arguments to stack
+; slots, never writes through %value, and always returns 999.
+; NOTE(review): 999 presumably corresponds to cudaErrorUnknown - confirm.
+; Function Attrs: convergent noinline nounwind optnone
+define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
+entry:
+  %value.addr = alloca i32*, align 8
+  %attr.addr = alloca i32, align 4
+  %device.addr = alloca i32, align 4
+  store i32* %value, i32** %value.addr, align 8
+  store i32 %attr, i32* %attr.addr, align 4
+  store i32 %device, i32* %device.addr, align 4
+  ret i32 999
+}
+
+; Weak stub of cudaGetDevice: it spills %device to a stack slot, never writes
+; through it, and always returns 999.
+; NOTE(review): 999 presumably corresponds to cudaErrorUnknown - confirm.
+; Function Attrs: convergent noinline nounwind optnone
+define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
+entry:
+  %device.addr = alloca i32*, align 8
+  store i32* %device, i32** %device.addr, align 8
+  ret i32 999
+}
+
+; Weak stub of cudaOccupancyMaxActiveBlocksPerMultiprocessor: it spills its
+; four arguments to stack slots, never writes through %numBlocks, and always
+; returns 999.
+; NOTE(review): 999 presumably corresponds to cudaErrorUnknown - confirm.
+; Function Attrs: convergent noinline nounwind optnone
+define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
+entry:
+  %numBlocks.addr = alloca i32*, align 8
+  %func.addr = alloca i8*, align 8
+  %blockSize.addr = alloca i32, align 4
+  %dynamicSmemSize.addr = alloca i64, align 8
+  store i32* %numBlocks, i32** %numBlocks.addr, align 8
+  store i8* %func, i8** %func.addr, align 8
+  store i32 %blockSize, i32* %blockSize.addr, align 4
+  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
+  ret i32 999
+}
+
+; Weak stub of cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags: it
+; spills its five arguments to stack slots, never writes through %numBlocks,
+; and always returns 999.
+; NOTE(review): 999 presumably corresponds to cudaErrorUnknown - confirm.
+; Function Attrs: convergent noinline nounwind optnone
+define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
+entry:
+  %numBlocks.addr = alloca i32*, align 8
+  %func.addr = alloca i8*, align 8
+  %blockSize.addr = alloca i32, align 4
+  %dynamicSmemSize.addr = alloca i64, align 8
+  %flags.addr = alloca i32, align 4
+  store i32* %numBlocks, i32** %numBlocks.addr, align 8
+  store i8* %func, i8** %func.addr, align 8
+  store i32 %blockSize, i32* %blockSize.addr, align 4
+  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
+  store i32 %flags, i32* %flags.addr, align 4
+  ret i32 999
+}
+
+; Function Attrs: convergent noinline nounwind optnone
+define dso_local void @_Z11hotspotOpt1PfS_S_fiiifffffff(float* %p, float* %tIn, float* %tOut, float %sdc, i32 %nx, i32 %ny, i32 %nz, float %ce, float %cw, float %cn, float %cs, float %ct, float %cb, float %cc) #0 {
+entry:
+  %p.addr = alloca float*, align 8
+  %tIn.addr = alloca float*, align 8
+  %tOut.addr = alloca float*, align 8
+  %sdc.addr = alloca float, align 4
+  %nx.addr = alloca i32, align 4
+  %ny.addr = alloca i32, align 4
+  %nz.addr = alloca i32, align 4
+  %ce.addr = alloca float, align 4
+  %cw.addr = alloca float, align 4
+  %cn.addr = alloca float, align 4
+  %cs.addr = alloca float, align 4
+  %ct.addr = alloca float, align 4
+  %cb.addr = alloca float, align 4
+  %cc.addr = alloca float, align 4
+  %amb_temp = alloca float, align 4
+  %i = alloca i32, align 4
+  %j = alloca i32, align 4
+  %c = alloca i32, align 4
+  %xy = alloca i32, align 4
+  %W = alloca i32, align 4
+  %E = alloca i32, align 4
+  %N = alloca i32, align 4
+  %S = alloca i32, align 4
+  %temp1 = alloca float, align 4
+  %temp2 = alloca float, align 4
+  %temp3 = alloca float, align 4
+  %k = alloca i32, align 4
+  store float* %p, float** %p.addr, align 8
+  store float* %tIn, float** %tIn.addr, align 8
+  store float* %tOut, float** %tOut.addr, align 8
+  store float %sdc, float* %sdc.addr, align 4
+  store i32 %nx, i32* %nx.addr, align 4
+  store i32 %ny, i32* %ny.addr, align 4
+  store i32 %nz, i32* %nz.addr, align 4
+  store float %ce, float* %ce.addr, align 4
+  store float %cw, float* %cw.addr, align 4
+  store float %cn, float* %cn.addr, align 4
+  store float %cs, float* %cs.addr, align 4
+  store float %ct, float* %ct.addr, align 4
+  store float %cb, float* %cb.addr, align 4
+  store float %cc, float* %cc.addr, align 4
+  store float 8.000000e+01, float* %amb_temp, align 4
+  %call = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
+  %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
+  %mul = mul i32 %call, %call1
+  %call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
+  %add = add i32 %mul, %call2
+  store i32 %add, i32* %i, align 4
+  %call3 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #3
+  %call4 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #3
+  %mul5 = mul i32 %call3, %call4
+  %call6 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #3
+  %add7 = add i32 %mul5, %call6
+  store i32 %add7, i32* %j, align 4
+  %0 = load i32, i32* %i, align 4
+  %1 = load i32, i32* %j, align 4
+  %2 = load i32, i32* %nx.addr, align 4
+  %mul8 = mul nsw i32 %1, %2
+  %add9 = add nsw i32 %0, %mul8
+  store i32 %add9, i32* %c, align 4
+  %3 = load i32, i32* %nx.addr, align 4
+  %4 = load i32, i32* %ny.addr, align 4
+  %mul10 = mul nsw i32 %3, %4
+  store i32 %mul10, i32* %xy, align 4
+  %5 = load i32, i32* %i, align 4
+  %cmp = icmp eq i32 %5, 0
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:                                        ; preds = %entry
+  %6 = load i32, i32* %c, align 4
+  br label %cond.end
+
+cond.false:                                       ; preds = %entry
+  %7 = load i32, i32* %c, align 4
+  %sub = sub nsw i32 %7, 1
+  br label %cond.end
+
+cond.end:                                         ; preds = %cond.false, %cond.true
+  %cond = phi i32 [ %6, %cond.true ], [ %sub, %cond.false ]
+  store i32 %cond, i32* %W, align 4
+  %8 = load i32, i32* %i, align 4
+  %9 = load i32, i32* %nx.addr, align 4
+  %sub11 = sub nsw i32 %9, 1
+  %cmp12 = icmp eq i32 %8, %sub11
+  br i1 %cmp12, label %cond.true13, label %cond.false14
+
+cond.true13:                                      ; preds = %cond.end
+  %10 = load i32, i32* %c, align 4
+  br label %cond.end16
+
+cond.false14:                                     ; preds = %cond.end
+  %11 = load i32, i32* %c, align 4
+  %add15 = add nsw i32 %11, 1
+  br label %cond.end16
+
+cond.end16:                                       ; preds = %cond.false14, %cond.true13
+  %cond17 = phi i32 [ %10, %cond.true13 ], [ %add15, %cond.false14 ]
+  store i32 %cond17, i32* %E, align 4
+  %12 = load i32, i32* %j, align 4
+  %cmp18 = icmp eq i32 %12, 0
+  br i1 %cmp18, label %cond.true19, label %cond.false20
+
+cond.true19:                                      ; preds = %cond.end16
+  %13 = load i32, i32* %c, align 4
+  br label %cond.end22
+
+cond.false20:                                     ; preds = %cond.end16
+  %14 = load i32, i32* %c, align 4
+  %15 = load i32, i32* %nx.addr, align 4
+  %sub21 = sub nsw i32 %14, %15
+  br label %cond.end22
+
+cond.end22:                                       ; preds = %cond.false20, %cond.true19
+  %cond23 = phi i32 [ %13, %cond.true19 ], [ %sub21, %cond.false20 ]
+  store i32 %cond23, i32* %N, align 4
+  %16 = load i32, i32* %j, align 4
+  %17 = load i32, i32* %ny.addr, align 4
+  %sub24 = sub nsw i32 %17, 1
+  %cmp25 = icmp eq i32 %16, %sub24
+  br i1 %cmp25, label %cond.true26, label %cond.false27
+
+cond.true26:                                      ; preds = %cond.end22
+  %18 = load i32, i32* %c, align 4
+  br label %cond.end29
+
+cond.false27:                                     ; preds = %cond.end22
+  %19 = load i32, i32* %c, align 4
+  %20 = load i32, i32* %nx.addr, align 4
+  %add28 = add nsw i32 %19, %20
+  br label %cond.end29
+
+cond.end29:                                       ; preds = %cond.false27, %cond.true26
+  %cond30 = phi i32 [ %18, %cond.true26 ], [ %add28, %cond.false27 ]
+  store i32 %cond30, i32* %S, align 4
+  %21 = load float*, float** %tIn.addr, align 8
+  %22 = load i32, i32* %c, align 4
+  %idxprom = sext i32 %22 to i64
+  %arrayidx = getelementptr inbounds float, float* %21, i64 %idxprom
+  %23 = load float, float* %arrayidx, align 4
+  store float %23, float* %temp2, align 4
+  store float %23, float* %temp1, align 4
+  %24 = load float*, float** %tIn.addr, align 8
+  %25 = load i32, i32* %c, align 4
+  %26 = load i32, i32* %xy, align 4
+  %add31 = add nsw i32 %25, %26
+  %idxprom32 = sext i32 %add31 to i64
+  %arrayidx33 = getelementptr inbounds float, float* %24, i64 %idxprom32
+  %27 = load float, float* %arrayidx33, align 4
+  store float %27, float* %temp3, align 4
+  %28 = load float, float* %cc.addr, align 4
+  %29 = load float, float* %temp2, align 4
+  %mul34 = fmul contract float %28, %29
+  %30 = load float, float* %cw.addr, align 4
+  %31 = load float*, float** %tIn.addr, align 8
+  %32 = load i32, i32* %W, align 4
+  %idxprom35 = sext i32 %32 to i64
+  %arrayidx36 = getelementptr inbounds float, float* %31, i64 %idxprom35
+  %33 = load float, float* %arrayidx36, align 4
+  %mul37 = fmul contract float %30, %33
+  %add38 = fadd contract float %mul34, %mul37
+  %34 = load float, float* %ce.addr, align 4
+  %35 = load float*, float** %tIn.addr, align 8
+  %36 = load i32, i32* %E, align 4
+  %idxprom39 = sext i32 %36 to i64
+  %arrayidx40 = getelementptr inbounds float, float* %35, i64 %idxprom39
+  %37 = load float, float* %arrayidx40, align 4
+  %mul41 = fmul contract float %34, %37
+  %add42 = fadd contract float %add38, %mul41
+  %38 = load float, float* %cs.addr, align 4
+  %39 = load float*, float** %tIn.addr, align 8
+  %40 = load i32, i32* %S, align 4
+  %idxprom43 = sext i32 %40 to i64
+  %arrayidx44 = getelementptr inbounds float, float* %39, i64 %idxprom43
+  %41 = load float, float* %arrayidx44, align 4
+  %mul45 = fmul contract float %38, %41
+  %add46 = fadd contract float %add42, %mul45
+  %42 = load float, float* %cn.addr, align 4
+  %43 = load float*, float** %tIn.addr, align 8
+  %44 = load i32, i32* %N, align 4
+  %idxprom47 = sext i32 %44 to i64
+  %arrayidx48 = getelementptr inbounds float, float* %43, i64 %idxprom47
+  %45 = load float, float* %arrayidx48, align 4
+  %mul49 = fmul contract float %42, %45
+  %add50 = fadd contract float %add46, %mul49
+  %46 = load float, float* %cb.addr, align 4
+  %47 = load float, float* %temp1, align 4
+  %mul51 = fmul contract float %46, %47
+  %add52 = fadd contract float %add50, %mul51
+  %48 = load float, float* %ct.addr, align 4
+  %49 = load float, float* %temp3, align 4
+  %mul53 = fmul contract float %48, %49
+  %add54 = fadd contract float %add52, %mul53
+  %50 = load float, float* %sdc.addr, align 4
+  %51 = load float*, float** %p.addr, align 8
+  %52 = load i32, i32* %c, align 4
+  %idxprom55 = sext i32 %52 to i64
+  %arrayidx56 = getelementptr inbounds float, float* %51, i64 %idxprom55
+  %53 = load float, float* %arrayidx56, align 4
+  %mul57 = fmul contract float %50, %53
+  %add58 = fadd contract float %add54, %mul57
+  %54 = load float, float* %ct.addr, align 4
+  %55 = load float, float* %amb_temp, align 4
+  %mul59 = fmul contract float %54, %55
+  %add60 = fadd contract float %add58, %mul59
+  %56 = load float*, float** %tOut.addr, align 8
+  %57 = load i32, i32* %c, align 4
+  %idxprom61 = sext i32 %57 to i64
+  %arrayidx62 = getelementptr inbounds float, float* %56, i64 %idxprom61
+  store float %add60, float* %arrayidx62, align 4
+  %58 = load i32, i32* %xy, align 4
+  %59 = load i32, i32* %c, align 4
+  %add63 = add nsw i32 %59, %58
+  store i32 %add63, i32* %c, align 4
+  %60 = load i32, i32* %xy, align 4
+  %61 = load i32, i32* %W, align 4
+  %add64 = add nsw i32 %61, %60
+  store i32 %add64, i32* %W, align 4
+  %62 = load i32, i32* %xy, align 4
+  %63 = load i32, i32* %E, align 4
+  %add65 = add nsw i32 %63, %62
+  store i32 %add65, i32* %E, align 4
+  %64 = load i32, i32* %xy, align 4
+  %65 = load i32, i32* %N, align 4
+  %add66 = add nsw i32 %65, %64
+  store i32 %add66, i32* %N, align 4
+  %66 = load i32, i32* %xy, align 4
+  %67 = load i32, i32* %S, align 4
+  %add67 = add nsw i32 %67, %66
+  store i32 %add67, i32* %S, align 4
+  store i32 1, i32* %k, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %cond.end29
+  %68 = load i32, i32* %k, align 4
+  %69 = load i32, i32* %nz.addr, align 4
+  %sub68 = sub nsw i32 %69, 1
+  %cmp69 = icmp slt i32 %68, %sub68
+  br i1 %cmp69, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %70 = load float, float* %temp2, align 4
+  store float %70, float* %temp1, align 4
+  %71 = load float, float* %temp3, align 4
+  store float %71, float* %temp2, align 4
+  %72 = load float*, float** %tIn.addr, align 8
+  %73 = load i32, i32* %c, align 4
+  %74 = load i32, i32* %xy, align 4
+  %add70 = add nsw i32 %73, %74
+  %idxprom71 = sext i32 %add70 to i64
+  %arrayidx72 = getelementptr inbounds float, float* %72, i64 %idxprom71
+  %75 = load float, float* %arrayidx72, align 4
+  store float %75, float* %temp3, align 4
+  %76 = load float, float* %cc.addr, align 4
+  %77 = load float, float* %temp2, align 4
+  %mul73 = fmul contract float %76, %77
+  %78 = load float, float* %cw.addr, align 4
+  %79 = load float*, float** %tIn.addr, align 8
+  %80 = load i32, i32* %W, align 4
+  %idxprom74 = sext i32 %80 to i64
+  %arrayidx75 = getelementptr inbounds float, float* %79, i64 %idxprom74
+  %81 = load float, float* %arrayidx75, align 4
+  %mul76 = fmul contract float %78, %81
+  %add77 = fadd contract float %mul73, %mul76
+  %82 = load float, float* %ce.addr, align 4
+  %83 = load float*, float** %tIn.addr, align 8
+  %84 = load i32, i32* %E, align 4
+  %idxprom78 = sext i32 %84 to i64
+  %arrayidx79 = getelementptr inbounds float, float* %83, i64 %idxprom78
+  %85 = load float, float* %arrayidx79, align 4
+  %mul80 = fmul contract float %82, %85
+  %add81 = fadd contract float %add77, %mul80
+  %86 = load float, float* %cs.addr, align 4
+  %87 = load float*, float** %tIn.addr, align 8
+  %88 = load i32, i32* %S, align 4
+  %idxprom82 = sext i32 %88 to i64
+  %arrayidx83 = getelementptr inbounds float, float* %87, i64 %idxprom82
+  %89 = load float, float* %arrayidx83, align 4
+  %mul84 = fmul contract float %86, %89
+  %add85 = fadd contract float %add81, %mul84
+  %90 = load float, float* %cn.addr, align 4
+  %91 = load float*, float** %tIn.addr, align 8
+  %92 = load i32, i32* %N, align 4
+  %idxprom86 = sext i32 %92 to i64
+  %arrayidx87 = getelementptr inbounds float, float* %91, i64 %idxprom86
+  %93 = load float, float* %arrayidx87, align 4
+  %mul88 = fmul contract float %90, %93
+  %add89 = fadd contract float %add85, %mul88
+  %94 = load float, float* %cb.addr, align 4
+  %95 = load float, float* %temp1, align 4
+  %mul90 = fmul contract float %94, %95
+  %add91 = fadd contract float %add89, %mul90
+  %96 = load float, float* %ct.addr, align 4
+  %97 = load float, float* %temp3, align 4
+  %mul92 = fmul contract float %96, %97
+  %add93 = fadd contract float %add91, %mul92
+  %98 = load float, float* %sdc.addr, align 4
+  %99 = load float*, float** %p.addr, align 8
+  %100 = load i32, i32* %c, align 4
+  %idxprom94 = sext i32 %100 to i64
+  %arrayidx95 = getelementptr inbounds float, float* %99, i64 %idxprom94
+  %101 = load float, float* %arrayidx95, align 4
+  %mul96 = fmul contract float %98, %101
+  %add97 = fadd contract float %add93, %mul96
+  %102 = load float, float* %ct.addr, align 4
+  %103 = load float, float* %amb_temp, align 4
+  %mul98 = fmul contract float %102, %103
+  %add99 = fadd contract float %add97, %mul98
+  %104 = load float*, float** %tOut.addr, align 8
+  %105 = load i32, i32* %c, align 4
+  %idxprom100 = sext i32 %105 to i64
+  %arrayidx101 = getelementptr inbounds float, float* %104, i64 %idxprom100
+  store float %add99, float* %arrayidx101, align 4
+  %106 = load i32, i32* %xy, align 4
+  %107 = load i32, i32* %c, align 4
+  %add102 = add nsw i32 %107, %106
+  store i32 %add102, i32* %c, align 4
+  %108 = load i32, i32* %xy, align 4
+  %109 = load i32, i32* %W, align 4
+  %add103 = add nsw i32 %109, %108
+  store i32 %add103, i32* %W, align 4
+  %110 = load i32, i32* %xy, align 4
+  %111 = load i32, i32* %E, align 4
+  %add104 = add nsw i32 %111, %110
+  store i32 %add104, i32* %E, align 4
+  %112 = load i32, i32* %xy, align 4
+  %113 = load i32, i32* %N, align 4
+  %add105 = add nsw i32 %113, %112
+  store i32 %add105, i32* %N, align 4
+  %114 = load i32, i32* %xy, align 4
+  %115 = load i32, i32* %S, align 4
+  %add106 = add nsw i32 %115, %114
+  store i32 %add106, i32* %S, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %116 = load i32, i32* %k, align 4
+  %inc = add nsw i32 %116, 1
+  store i32 %inc, i32* %k, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %117 = load float, float* %temp2, align 4
+  store float %117, float* %temp1, align 4
+  %118 = load float, float* %temp3, align 4
+  store float %118, float* %temp2, align 4
+  %119 = load float, float* %cc.addr, align 4
+  %120 = load float, float* %temp2, align 4
+  %mul107 = fmul contract float %119, %120
+  %121 = load float, float* %cw.addr, align 4
+  %122 = load float*, float** %tIn.addr, align 8
+  %123 = load i32, i32* %W, align 4
+  %idxprom108 = sext i32 %123 to i64
+  %arrayidx109 = getelementptr inbounds float, float* %122, i64 %idxprom108
+  %124 = load float, float* %arrayidx109, align 4
+  %mul110 = fmul contract float %121, %124
+  %add111 = fadd contract float %mul107, %mul110
+  %125 = load float, float* %ce.addr, align 4
+  %126 = load float*, float** %tIn.addr, align 8
+  %127 = load i32, i32* %E, align 4
+  %idxprom112 = sext i32 %127 to i64
+  %arrayidx113 = getelementptr inbounds float, float* %126, i64 %idxprom112
+  %128 = load float, float* %arrayidx113, align 4
+  %mul114 = fmul contract float %125, %128
+  %add115 = fadd contract float %add111, %mul114
+  %129 = load float, float* %cs.addr, align 4
+  %130 = load float*, float** %tIn.addr, align 8
+  %131 = load i32, i32* %S, align 4
+  %idxprom116 = sext i32 %131 to i64
+  %arrayidx117 = getelementptr inbounds float, float* %130, i64 %idxprom116
+  %132 = load float, float* %arrayidx117, align 4
+  %mul118 = fmul contract float %129, %132
+  %add119 = fadd contract float %add115, %mul118
+  %133 = load float, float* %cn.addr, align 4
+  %134 = load float*, float** %tIn.addr, align 8
+  %135 = load i32, i32* %N, align 4
+  %idxprom120 = sext i32 %135 to i64
+  %arrayidx121 = getelementptr inbounds float, float* %134, i64 %idxprom120
+  %136 = load float, float* %arrayidx121, align 4
+  %mul122 = fmul contract float %133, %136
+  %add123 = fadd contract float %add119, %mul122
+  %137 = load float, float* %cb.addr, align 4
+  %138 = load float, float* %temp1, align 4
+  %mul124 = fmul contract float %137, %138
+  %add125 = fadd contract float %add123, %mul124
+  %139 = load float, float* %ct.addr, align 4
+  %140 = load float, float* %temp3, align 4
+  %mul126 = fmul contract float %139, %140
+  %add127 = fadd contract float %add125, %mul126
+  %141 = load float, float* %sdc.addr, align 4
+  %142 = load float*, float** %p.addr, align 8
+  %143 = load i32, i32* %c, align 4
+  %idxprom128 = sext i32 %143 to i64
+  %arrayidx129 = getelementptr inbounds float, float* %142, i64 %idxprom128
+  %144 = load float, float* %arrayidx129, align 4
+  %mul130 = fmul contract float %141, %144
+  %add131 = fadd contract float %add127, %mul130
+  %145 = load float, float* %ct.addr, align 4
+  %146 = load float, float* %amb_temp, align 4
+  %mul132 = fmul contract float %145, %146
+  %add133 = fadd contract float %add131, %mul132
+  %147 = load float*, float** %tOut.addr, align 8
+  %148 = load i32, i32* %c, align 4
+  %idxprom134 = sext i32 %148 to i64
+  %arrayidx135 = getelementptr inbounds float, float* %147, i64 %idxprom134
+  store float %add133, float* %arrayidx135, align 4
+  ret void
+}
+
+; Function Attrs: alwaysinline convergent nounwind
+; Itanium-mangled __cuda_builtin_blockDim_t::__fetch_builtin_x().
+; Returns the i32 produced by the llvm.nvvm.read.ptx.sreg.ntid.x intrinsic.
+define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #1 comdat align 2 {
+entry:
+  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  ret i32 %0
+}
+
+; Function Attrs: alwaysinline convergent nounwind
+; Itanium-mangled __cuda_builtin_blockIdx_t::__fetch_builtin_x().
+; Returns the i32 produced by the llvm.nvvm.read.ptx.sreg.ctaid.x intrinsic.
+define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
+entry:
+  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
+  ret i32 %0
+}
+
+; Function Attrs: alwaysinline convergent nounwind
+; Itanium-mangled __cuda_builtin_threadIdx_t::__fetch_builtin_x().
+; Returns the i32 produced by the llvm.nvvm.read.ptx.sreg.tid.x intrinsic.
+define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
+entry:
+  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  ret i32 %0
+}
+
+; Function Attrs: alwaysinline convergent nounwind
+; Itanium-mangled __cuda_builtin_blockDim_t::__fetch_builtin_y().
+; Returns the i32 produced by the llvm.nvvm.read.ptx.sreg.ntid.y intrinsic.
+define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #1 comdat align 2 {
+entry:
+  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
+  ret i32 %0
+}
+
+; Function Attrs: alwaysinline convergent nounwind
+; Itanium-mangled __cuda_builtin_blockIdx_t::__fetch_builtin_y().
+; Returns the i32 produced by the llvm.nvvm.read.ptx.sreg.ctaid.y intrinsic.
+define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
+entry:
+  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
+  ret i32 %0
+}
+
+; Function Attrs: alwaysinline convergent nounwind
+; Itanium-mangled __cuda_builtin_threadIdx_t::__fetch_builtin_y().
+; Returns the i32 produced by the llvm.nvvm.read.ptx.sreg.tid.y intrinsic.
+define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
+entry:
+  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
+  ret i32 %0
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y() #2
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #2
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #2
+
+attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone }
+attributes #3 = { convergent nounwind }
+
+!llvm.module.flags = !{!0, !1, !2}
+!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
+!llvm.ident = !{!8}
+!nvvmir.version = !{!9}
+
+!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
+!1 = !{i32 1, !"wchar_size", i32 4}
+!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
+!3 = !{void (float*, float*, float*, float, i32, i32, i32, float, float, float, float, float, float, float)* @_Z11hotspotOpt1PfS_S_fiiifffffff, !"kernel", i32 1}
+!4 = !{null, !"align", i32 8}
+!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
+!6 = !{null, !"align", i32 16}
+!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
+!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
+!9 = !{i32 1, i32 4}
--- a/examples/hotspot3D/3D-host-x86_64-unknown-linux-gnu.ll
+++ b/examples/hotspot3D/3D-host-x86_64-unknown-linux-gnu.ll
--- a/examples/hotspot3D/3D.cu
+++ b/examples/hotspot3D/3D.cu
@ -0,0 +1,205 @@
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <time.h>
+
+#define BLOCK_SIZE 16
+#define STR_SIZE 256
+
+#define block_x_ 128
+#define block_y_ 2
+#define block_z_ 1
+#define MAX_PD (3.0e6)
+/* required precision in degrees	*/
+#define PRECISION 0.001
+#define SPEC_HEAT_SI 1.75e6
+#define K_SI 100
+/* capacitance fitting factor	*/
+#define FACTOR_CHIP 0.5
+
+#include "opt1.cu"
+
+/* chip parameters	*/
+float t_chip = 0.0005;
+float chip_height = 0.016;
+float chip_width = 0.016;
+/* ambient temperature, assuming no package at all */
+float amb_temp = 80.0;
+
+/*
+ * Report an error on stderr as "Error: <s>" followed by a newline.
+ * Despite the name this function returns normally; the caller decides
+ * whether to carry on or bail out.
+ */
+void fatal(const char *s) {
+  fprintf(stderr, "Error: %s\n", s);
+}
+
+/*
+ * Read grid_rows * grid_cols * layers float values from a text file, one
+ * value per line, into vect.
+ *
+ * Lines are consumed with the layer index varying fastest, then the column,
+ * then the row; each value is stored at
+ *   vect[row * grid_cols + col + layer * grid_rows * grid_cols].
+ *
+ * vect   destination buffer holding at least grid_rows*grid_cols*layers floats
+ * file   path of the input file
+ *
+ * On any failure (file cannot be opened, too few lines, read error, or a
+ * line that does not start with a number) the problem is reported through
+ * fatal() and reading stops; entries of vect not yet filled are left
+ * untouched.
+ */
+void readinput(float *vect, int grid_rows, int grid_cols, int layers,
+               char *file) {
+  FILE *fp = fopen(file, "r");
+  if (fp == NULL) {
+    fatal("The file was not opened");
+    return; /* fatal() returns, so never hand a NULL stream to fgets/fclose */
+  }
+
+  char str[STR_SIZE];
+  for (int i = 0; i < grid_rows; i++)
+    for (int j = 0; j < grid_cols; j++)
+      for (int k = 0; k < layers; k++) {
+        float val;
+
+        /* Only a NULL return means no line was read. feof() alone is not an
+         * error: it is also set after successfully reading a final line that
+         * has no trailing newline. */
+        if (fgets(str, STR_SIZE, fp) == NULL) {
+          fatal(feof(fp) ? "not enough lines in file" : "Error reading file\n");
+          fclose(fp);
+          return;
+        }
+        if (sscanf(str, "%f", &val) != 1) {
+          fatal("invalid file format");
+          fclose(fp);
+          return; /* do not store an uninitialised value */
+        }
+        vect[i * grid_cols + j + k * grid_rows * grid_cols] = val;
+      }
+
+  fclose(fp);
+}
+
+/*
+ * Write the grid to a text file, one "<index>\t<value>\n" line per cell.
+ *
+ * Cells are emitted with the layer index varying fastest, then the column,
+ * then the row; <index> is a running counter starting at 0 and <value> is
+ * vect[row * grid_cols + col + layer * grid_rows * grid_cols] printed
+ * with %g.
+ *
+ * vect   source buffer holding at least grid_rows*grid_cols*layers floats
+ * file   path of the output file (created or truncated)
+ *
+ * If the file cannot be opened a message is printed on stdout and nothing
+ * is written.
+ */
+void writeoutput(float *vect, int grid_rows, int grid_cols, int layers,
+                 char *file) {
+  FILE *fp = fopen(file, "w");
+  if (fp == NULL) {
+    printf("The file was not opened\n");
+    return; /* writing to or closing a NULL stream is undefined behaviour */
+  }
+
+  int index = 0;
+  for (int i = 0; i < grid_rows; i++)
+    for (int j = 0; j < grid_cols; j++)
+      for (int k = 0; k < layers; k++) {
+        fprintf(fp, "%d\t%g\n", index,
+                vect[i * grid_cols + j + k * grid_rows * grid_cols]);
+        index++;
+      }
+
+  fclose(fp);
+}
+
+/*
+ * CPU reference implementation of the 7-point stencil update.
+ *
+ * For every cell the new temperature is a weighted sum of the cell, its six
+ * axis neighbours (a cell on a face of the grid uses itself in place of the
+ * missing neighbour), the cell's power input and the global amb_temp.
+ * After each sweep the tIn and tOut pointers are exchanged, so successive
+ * sweeps ping-pong between the two caller buffers. At least one sweep is
+ * always performed, even when numiter <= 0.
+ *
+ * pIn         per-cell power input, nx*ny*nz floats
+ * tIn, tOut   the two temperature buffers, nx*ny*nz floats each
+ * nx, ny, nz  grid extents; x is the fastest-varying index
+ * Cap, Rx, Ry, Rz, dt  coefficients from which the stencil weights derive
+ * numiter     number of sweeps
+ */
+void computeTempCPU(float *pIn, float *tIn, float *tOut, int nx, int ny, int nz,
+                    float Cap, float Rx, float Ry, float Rz, float dt,
+                    int numiter) {
+  const float stepDivCap = dt / Cap;
+  const float ce = stepDivCap / Rx;
+  const float cw = ce;
+  const float cn = stepDivCap / Ry;
+  const float cs = cn;
+  const float ct = stepDivCap / Rz;
+  const float cb = ct;
+  /* Evaluated in double and narrowed to float, exactly as the weights are
+   * combined here. */
+  const float cc = 1.0 - (2.0 * ce + 2.0 * cn + 3.0 * ct);
+
+  int sweep = 0;
+  do {
+    for (int z = 0; z < nz; z++) {
+      for (int y = 0; y < ny; y++) {
+        for (int x = 0; x < nx; x++) {
+          const int c = x + y * nx + z * nx * ny;
+
+          /* Neighbour indices, clamped to the cell itself on the border. */
+          const int w = (x != 0) ? c - 1 : c;
+          const int e = (x != nx - 1) ? c + 1 : c;
+          const int n = (y != 0) ? c - nx : c;
+          const int s = (y != ny - 1) ? c + nx : c;
+          const int b = (z != 0) ? c - nx * ny : c;
+          const int t = (z != nz - 1) ? c + nx * ny : c;
+
+          tOut[c] = tIn[c] * cc + tIn[n] * cn + tIn[s] * cs + tIn[e] * ce +
+                    tIn[w] * cw + tIn[t] * ct + tIn[b] * cb +
+                    (dt / Cap) * pIn[c] + ct * amb_temp;
+        }
+      }
+    }
+
+    /* The buffer just written becomes the input of the next sweep. */
+    float *previousIn = tIn;
+    tIn = tOut;
+    tOut = previousIn;
+  } while (++sweep < numiter);
+}
+
+/*
+ * Root-mean-square difference between the first len elements of two arrays:
+ * sqrt(sum((arr1[i] - arr2[i])^2) / len), accumulated in single precision.
+ */
+float accuracy(float *arr1, float *arr2, int len) {
+  float sumSquares = 0.0;
+  int idx = 0;
+  while (idx < len) {
+    const float diff = arr1[idx] - arr2[idx];
+    sumSquares += diff * diff;
+    ++idx;
+  }
+
+  return (float)sqrt(sumSquares / len);
+}
+
+/*
+ * Print the command-line synopsis and a description of every argument on
+ * stderr, then terminate the process with exit status 1.
+ *
+ * argc   unused
+ * argv   argv[0] is printed as the program name
+ */
+void usage(int argc, char **argv) {
+  fprintf(stderr,
+          "Usage: %s <rows/cols> <layers> <iterations> <powerFile> <tempFile> "
+          "<outputFile>\n",
+          argv[0]);
+  fprintf(
+      stderr,
+      "\t<rows/cols>  - number of rows/cols in the grid (positive integer)\n");
+  fprintf(stderr,
+          "\t<layers>  - number of layers in the grid (positive integer)\n");
+
+  /* Placeholder names below match the synopsis line above. */
+  fprintf(stderr, "\t<iterations> - number of iterations\n");
+  fprintf(stderr, "\t<powerFile>  - name of the file containing the initial "
+                  "power values of each cell\n");
+  fprintf(stderr, "\t<tempFile>  - name of the file containing the initial "
+                  "temperature values of each cell\n");
+  fprintf(stderr, "\t<outputFile> - output file\n");
+  exit(1);
+}
+
+/*
+ * Driver: parses <rows/cols> <layers> <iterations> <powerFile> <tempFile>
+ * <outputFile>, loads the power and temperature grids, runs hotspot_opt1()
+ * and the CPU reference computeTempCPU() on the same inputs, prints the RMS
+ * difference between their output buffers and writes the hotspot_opt1()
+ * output buffer to <outputFile>.
+ *
+ * Returns 0 on success, 1 if a host buffer cannot be allocated; bad
+ * arguments end the process through usage().
+ */
+int main(int argc, char **argv) {
+  cudaSetDevice(0);
+  if (argc != 7) {
+    usage(argc, argv);
+  }
+
+  char *pfile, *tfile, *ofile;
+  int iterations = atoi(argv[3]);
+
+  pfile = argv[4];
+  tfile = argv[5];
+  ofile = argv[6];
+  int numCols = atoi(argv[1]);
+  int numRows = atoi(argv[1]);
+  int layers = atoi(argv[2]);
+
+  /* The grid extents are documented as positive integers; zero or negative
+   * values would lead to divisions by zero and bogus allocation sizes. */
+  if (numRows <= 0 || layers <= 0) {
+    usage(argc, argv);
+  }
+
+  /* calculating parameters*/
+
+  float dx = chip_height / numRows;
+  float dy = chip_width / numCols;
+  float dz = t_chip / layers;
+
+  float Cap = FACTOR_CHIP * SPEC_HEAT_SI * t_chip * dx * dy;
+  float Rx = dy / (2.0 * K_SI * t_chip * dx);
+  float Ry = dx / (2.0 * K_SI * t_chip * dy);
+  float Rz = dz / (K_SI * dx * dy);
+
+  float max_slope = MAX_PD / (FACTOR_CHIP * t_chip * SPEC_HEAT_SI);
+  float dt = PRECISION / max_slope;
+
+  float *powerIn, *tempOut, *tempIn, *tempCopy;
+  int size = numCols * numRows * layers;
+
+  powerIn = (float *)calloc(size, sizeof(float));
+  tempCopy = (float *)malloc(size * sizeof(float));
+  tempIn = (float *)calloc(size, sizeof(float));
+  tempOut = (float *)calloc(size, sizeof(float));
+  float *answer = (float *)calloc(size, sizeof(float));
+
+  if (powerIn == NULL || tempCopy == NULL || tempIn == NULL ||
+      tempOut == NULL || answer == NULL) {
+    fprintf(stderr, "Error: %s\n", "out of memory");
+    free(powerIn); /* free(NULL) is a no-op */
+    free(tempCopy);
+    free(tempIn);
+    free(tempOut);
+    free(answer);
+    return 1;
+  }
+
+  readinput(powerIn, numRows, numCols, layers, pfile);
+  readinput(tempIn, numRows, numCols, layers, tfile);
+
+  /* The CPU reference gets its own copy of the initial temperatures. */
+  memcpy(tempCopy, tempIn, size * sizeof(float));
+
+  hotspot_opt1(powerIn, tempIn, tempOut, numCols, numRows, layers, Cap, Rx, Ry,
+               Rz, dt, iterations);
+
+  computeTempCPU(powerIn, tempCopy, answer, numCols, numRows, layers, Cap, Rx,
+                 Ry, Rz, dt, iterations);
+
+  float acc = accuracy(tempOut, answer, numRows * numCols * layers);
+  printf("Accuracy: %e\n", acc);
+  writeoutput(tempOut, numRows, numCols, layers, ofile);
+
+  /* Release every host buffer, including the two used by the CPU check. */
+  free(tempIn);
+  free(tempOut);
+  free(powerIn);
+  free(tempCopy);
+  free(answer);
+  return 0;
+}
--- a/examples/hotspot3D/run.sh
+++ b/examples/hotspot3D/run.sh
@ -0,0 +1,22 @@
+#!/bin/bash
+# Build and run the hotspot3D example, then check the first lines of its
+# output. Run from this example's directory: all paths are relative to it.
+set -e
+
+# Assemble the textual LLVM IR for the kernel and the host program.
+llvm-as 3D-cuda-nvptx64-nvidia-cuda-sm_61.ll
+llvm-as 3D-host-x86_64-unknown-linux-gnu.ll
+
+# Run the project's translators over the kernel and host bitcode.
+../../build/compilation/kernelTranslator 3D-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
+../../build/compilation/hostTranslator 3D-host-x86_64-unknown-linux-gnu.bc host.bc
+
+# Compile both modules to position-independent object files.
+llc --relocation-model=pic --filetype=obj  kernel.bc
+llc --relocation-model=pic --filetype=obj  host.bc
+
+# Link against the runtime and thread-pool libraries built under ../../build.
+g++ -g -Wall -L../../build/runtime  -L../../build/runtime/threadPool -o 3D \
+    -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread
+
+export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH
+./3D 512 8 100 ../../rodinia-data/hotspot3D/power_512x8 ../../rodinia-data/hotspot3D/temp_512x8 output.out
+
+# The run passes when the expected value shows up in the head of the output.
+if head output.out | grep -q "334.017"; then
+    echo "Pass"
+else
+    echo "Error result"
+    exit 1
+fi
--- a/examples/huffman/comparison_helpers.h
+++ b/examples/huffman/comparison_helpers.h
@ -0,0 +1,24 @@
+#ifndef _COMPARISON_HELPERS_H_
+#define _COMPARISON_HELPERS_H_
+#include <stdio.h>
+/**
+ * Compare two arrays element by element and report every mismatch on stdout.
+ *
+ * @param data1 first array, at least @p size elements
+ * @param data2 second array, at least @p size elements
+ * @param size  number of elements to compare
+ * @return 0 when all elements are equal. On any mismatch the process is
+ *         terminated with exit status 1, so -1 is never actually returned.
+ *
+ * Elements are printed with %d, so T must be an int-sized integer type.
+ */
+template <typename T>
+__inline int compare_vectors(T *data1, T *data2, unsigned int size) {
+  printf("Comparing vectors: \n");
+  bool match = true;
+  for (unsigned int i = 0; i < size; i++)
+    if (data1[i] != data2[i]) {
+      match = false;
+      // The index is unsigned (%u); the second value comes from data2.
+      printf("Diff: data1[%u]=%d,  data2[%u]=%d.\n", i, data1[i], i, data2[i]);
+    }
+
+  if (match) {
+    printf("PASS! vectors are matching!\n");
+    return 0;
+  } else {
+    printf("FAIL! vectors are NOT matching!\n");
+    exit(1);
+    return -1;
+  }
+}
+
+#endif
--- a/examples/huffman/cpuencode.cpp
+++ b/examples/huffman/cpuencode.cpp
@ -0,0 +1,116 @@
+#include "stdafx.h"
+
+#include "cpuencode.h"
+#include "print_helpers.h"
+
+using namespace std;
+
+#if 1
+
+// The max. codeword length for each byte symbol is 32-bits
+
+/**
+ * Serial variable-length encoder.
+ *
+ * Every 32-bit input word is split into four byte symbols, most significant
+ * byte first. Each symbol is replaced by the low codewordlens[symbol] bits of
+ * codewords[symbol], and the codewords are packed back to back into 32-bit
+ * output words starting from the most significant bit of each word.
+ *
+ * @param indata        input words
+ * @param num_elements  number of words in @p indata
+ * @param outdata       output bitstream. Each time a word is completed the
+ *                      following word is zeroed, so the buffer needs one word
+ *                      beyond the last completely filled one.
+ * @param outsize       receives the stream length in bytes, rounded up to a
+ *                      whole byte
+ * @param codewords     256-entry table of codeword bits, indexed by symbol
+ * @param codewordlens  256-entry table of codeword lengths in bits; each
+ *                      length must not exceed 32
+ */
+extern "C" void cpu_vlc_encode(unsigned int *indata, unsigned int num_elements,
+                               unsigned int *outdata, unsigned int *outsize,
+                               unsigned int *codewords,
+                               unsigned int *codewordlens) {
+  unsigned int *bitstreamPt = outdata; /* word currently being filled */
+  *bitstreamPt = 0x00000000U;
+  unsigned int startbit = 0;   /* bits already used in *bitstreamPt */
+  unsigned int totalBytes = 0; /* bytes held by completed words */
+
+  for (unsigned int k = 0; k < num_elements; k++) {
+    const unsigned int val32 = indata[k];
+
+    for (unsigned int i = 0; i < 4; i++) {
+      const unsigned char symbol = (unsigned char)(val32 >> (8 * (3 - i)));
+      const unsigned int cw32 = codewords[symbol];
+      unsigned int numbits = codewordlens[symbol];
+
+      while (numbits > 0) {
+        const unsigned int room = 32 - startbit; /* free bits, 1..32 */
+        const unsigned int writebits = (numbits < room) ? numbits : room;
+        unsigned int mask32;
+        if (numbits == writebits) {
+          // The rest of the codeword fits: keep its low numbits bits and
+          // left-align them behind the bits already written. A shift by 32
+          // is undefined, so a full-width codeword gets an explicit mask.
+          const unsigned int keep =
+              (numbits >= 32) ? 0xFFFFFFFFU : ((1U << numbits) - 1U);
+          mask32 = (cw32 & keep) << (room - numbits);
+        } else {
+          // Shift out the bits that can not fit; they are written on the
+          // next pass of this loop.
+          mask32 = cw32 >> (numbits - writebits);
+        }
+        *bitstreamPt = (*bitstreamPt) | mask32;
+        numbits = numbits - writebits;
+        startbit = (startbit + writebits) % 32;
+        if (startbit == 0) {
+          // Word complete: advance and start the next one from zero.
+          bitstreamPt++;
+          *bitstreamPt = 0x00000000;
+          totalBytes += 4;
+        }
+      }
+    }
+  }
+  totalBytes += (startbit / 8) +
+                ((startbit % 8 == 0) ? 0 : 1); // return aligned to 8-bits
+  *outsize = totalBytes;
+}
+
+//////////////////////////////////////////////////////////////////////
+/// ALTERNATIVE CODER
+/// ASSUMPTION: The max. length of 4 combined codewords can be 2x original data,
+/// i.e. g 64 bits
+///////////////////////////////////////////////////////////////////////
+
+#else
+
+/**
+ * Alternative serial variable-length encoder (compiled only when the #if
+ * above is switched off).
+ *
+ * The four codewords of one 32-bit input word (byte symbols taken most
+ * significant byte first) are first concatenated into a 64-bit value, which
+ * is then written to the output in at most word-sized pieces, starting from
+ * the most significant bit of each 32-bit output word.
+ *
+ * @param indata        input words
+ * @param num_elements  number of words in @p indata
+ * @param outdata       output bitstream. Each time a word is completed the
+ *                      following word is zeroed, so the buffer needs one word
+ *                      beyond the last completely filled one.
+ * @param outsize       receives the stream length in bytes, rounded up to a
+ *                      whole byte
+ * @param codewords     256-entry table of codeword bits, indexed by symbol
+ * @param codewordlens  256-entry table of codeword lengths in bits; the four
+ *                      lengths of one input word must not sum to more than 64
+ */
+extern "C" void cpu_vlc_encode(unsigned int *indata, unsigned int num_elements,
+                               unsigned int *outdata, unsigned int *outsize,
+                               unsigned int *codewords,
+                               unsigned int *codewordlens) {
+  unsigned int *bitstreamPt = outdata; /* word currently being filled */
+  // assume memset is done.
+  *bitstreamPt = 0x00000000U;
+  unsigned int startbit = 0;   /* bits already used in *bitstreamPt */
+  unsigned int totalBytes = 0; /* bytes held by completed words */
+
+  for (unsigned int k = 0; k < num_elements; k++) {
+    unsigned long long cw64 = 0;
+    const unsigned int val32 = indata[k];
+    unsigned int numbits = 0;
+
+    // Concatenate the four codewords of this input word.
+    for (unsigned int i = 0; i < 4; i++) {
+      const unsigned char symbol = (unsigned char)(val32 >> (8 * (3 - i)));
+      cw64 = (cw64 << codewordlens[symbol]) | codewords[symbol];
+      numbits += codewordlens[symbol];
+    }
+
+    while (numbits > 0) {
+      const unsigned int room = 32 - startbit; /* free bits, 1..32 */
+      const unsigned int writebits = (numbits < room) ? numbits : room;
+      unsigned int mask32;
+      if (numbits == writebits) {
+        // Everything left fits: left-align it behind the bits already written.
+        const unsigned int temp32 = (unsigned int)cw64;
+        mask32 = temp32 << (room - numbits);
+      } else {
+        // Emit the top writebits bits now and keep the rest for the next
+        // pass. The remainder can be 32 bits or longer, so the mask has to
+        // be built in 64-bit arithmetic.
+        const unsigned int remaining = numbits - writebits;
+        mask32 = (unsigned int)(cw64 >> remaining);
+        cw64 = cw64 & ((1ULL << remaining) - 1ULL);
+      }
+      *bitstreamPt = (*bitstreamPt) | mask32;
+      numbits = numbits - writebits;
+      startbit = (startbit + writebits) % 32;
+      if (startbit == 0) {
+        // Word complete: advance and start the next one from zero.
+        bitstreamPt++;
+        *bitstreamPt = 0x00000000;
+        totalBytes += 4;
+      }
+    }
+  }
+  totalBytes += (startbit / 8) +
+                ((startbit % 8 == 0) ? 0 : 1); // return aligned to 8-bits
+  *outsize = totalBytes;
+}
+#endif
--- a/examples/huffman/cpuencode.h
+++ b/examples/huffman/cpuencode.h
@ -0,0 +1,8 @@
+#ifndef _CE_H_
+#define _CE_H_
+
+/* CPU variable-length encoder with C linkage. */
+extern "C" void cpu_vlc_encode(unsigned int *indata, unsigned int num_elements,
+                               unsigned int *outdata, unsigned int *outsize,
+                               unsigned int *codewords,
+                               unsigned int *codewordlens);
+#endif
--- a/Show More
+++ b/Show More