add codebase for TACO submission

2022-05-04 08:59:38 -04:00 · 2022-05-04 08:59:38 -04:00 · f8e72916c1
parent 897af29748
commit f8e72916c1
164 changed files with 65421 additions and 1082 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -39,3 +39,4 @@ set(GCC_COVERAGE_LINK_FLAGS
    "-L${LLVM_LIB_PATH} ${LLVM_LINK_FLAG} -lz -lrt -ldl -ltinfo -lpthread -lm")
 add_subdirectory(compilation)
 add_subdirectory(runtime)
--- a/docs/CONTRIBUTING.md
+++ b/docs/CONTRIBUTING.md
@ -1,6 +1,6 @@
-# Contributing to CuPBoP
+# Contributing to COX
-Thank you for your interest in contributing to CuPBoP!
+Thank you for your interest in contributing to COX!
 We appreciate all contributions, including but not limited to:
 - Add documentation
@ -10,9 +10,9 @@ We appreciate all contributions, including but not limited to:
 ## How to contribute?
 0. (Optional) Open an issue and discuss your idea before you start
-1. Fork the latest version CuPBoP
+1. Fork the latest version of COX
 2. Commit to the forked repo
-3. Create a Pull Request to CuPBoP main branch
+3. Create a Pull Request to COX main branch
 ## Code style
@ -21,15 +21,14 @@ To make sure your contribution is following the correct style,
 we highly recommend you to install [pre-commit](https://pre-commit.com/) before development.
 ```bash
-# Python3 environment is required
+# Python environment is required
 pip install pre-commit
 ```
 Then, from the repository folder, execute the following instruction:
 ```bash
-# execute in CuPBoP's root folder
+ pre-commit install
 pre-commit install
 ```
 With pre-commit plugin, each local commit will be automatically checked.
--- a/README.md
+++ b/README.md
@ -1,10 +1,10 @@
-# CuPBoP: Cuda for Parallelized and Broad-range Processors
+# COX: CUDA on X86
 ## Introduction
-CuPBoP (Cuda for parallelized and broad-range processors) is a framework
+This project consists of two parts: a series of LLVM passes that
-aims to execute CUDA source code on non-NVIDIA devices,
+achieve a SPMD NVVM IR as input, and output the corresponding
-including CPU, GPU and other architectures.
+MPMD+SIMD version of LLVM IR which can be executed on CPU devices.
 ## Install
@ -22,8 +22,8 @@ including CPU, GPU and other architectures.
 1. Clone from github
    ```bash
-    git clone https://github.com/cupbop/CuPBoP
+    git clone https://github.com/drcut/open_source_template
-    cd CuPBoP
+    cd open_source_template
    ```
 2. Build the transformer for NVVM IR to LLVM IR for X86
@ -55,12 +55,8 @@ g++ ../compilation/examples/vecadd/host.cpp \
 ./vecadd_example
 ```
 ## Contribution
 We sincerely appreciate all kinds of contributions.
 Please refer to [CONTRIBUTING](docs/CONTRIBUTING.md) for the contributing guideline.
 ## Author
-* [Ruobing Han](https://drcut.github.io/)
+[Ruobing Han](https://drcut.github.io/) is a CS PhD student at
-* [Hyesoon Kim](https://www.cc.gatech.edu/~hyesoon/)
+Georgia Institute of Technology, under the supervision
 of Prof. [Hyesoon Kim](https://www.cc.gatech.edu/~hyesoon/).
--- a/compilation/HostTranslation.cpp
+++ b/compilation/HostTranslation.cpp
@ -1,25 +1,43 @@
-#include "ReplaceKernelLaunch.h"
+#include "RemoveCudaBuiltin.h"
 #include "ReplaceConstantMemory.h"
 #include "ReplaceCudaBuiltin.h"
 #include "ReplaceKernelArgs.h"
 #include "tool.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Verifier.h"
 #include <assert.h>
 #include <fstream>
 #include <iostream>
 #include <stdlib.h>
 using namespace llvm;
 std::string PATH = "kernel_meta.log";
 // Entry point of the host-side translator.
 // Usage: <program> <input host module> <output host module>
 // The metadata file named by PATH ("kernel_meta.log") is opened for reading
 // and handed to ReplaceConstantMemory.
 int main(int argc, char **argv) {
  // NOTE: assert is compiled out under NDEBUG, in which case a wrong argument
  // count is not detected here.
  assert(argc == 3 && "incorrect number of arguments\n");
  char *input_host_path = argv[1];
  char *output_host_path = argv[2];
  // metadata stream; it is not checked here whether the open succeeded
  std::ifstream fin;
  fin.open(PATH);
  // load LLVM module(s)
  llvm::Module *hostModule = LoadModuleFromFilr(input_host_path);
  VerifyModule(hostModule);
  // replace const memory
  ReplaceConstantMemory(hostModule, fin);
  // process host module
-  ReplaceKernelLaunch(hostModule);
+  ReplaceCudaBuiltin(hostModule);
  // remove unused builtin functions and variables
  RemoveCudaBuiltin(hostModule);
  // replace arguments in kernel_arg, from alloc to malloc
  ReplaceKernelArg(hostModule);
  // verify again after all rewrites, then write the result to argv[2]
  VerifyModule(hostModule);
  DumpModule(hostModule, output_host_path);
  fin.close();
  return 0;
 }
--- a/compilation/HostTranslation/include/ReplaceKernelLaunch.h
+++ b/compilation/HostTranslation/include/ReplaceKernelLaunch.h
@ -1,11 +1,11 @@
-#ifndef __NVVM2x86_REPLACE_KERNEL_LAUNCH__
+#ifndef __NVVM2x86_REMOVE_CUDABUILTIN__
-#define __NVVM2x86_REPLACE_KERNEL_LAUNCH__
+#define __NVVM2x86_REMOVE_CUDABUILTIN__
 #include "llvm/IR/Module.h"
 /*
 * Remove the Clang CUDA builtin functions and variables from the host module,
 * e.g. @llvm.global_ctors, @__cuda_module_ctor, @__cuda_module_dtor,
 * @__cuda_register_globals and the __cudaRegister* declarations.
 */
-void ReplaceKernelLaunch(llvm::Module *M);
+void RemoveCudaBuiltin(llvm::Module *M);
 #endif
--- a/compilation/HostTranslation/include/ReplaceConstantMemory.h
+++ b/compilation/HostTranslation/include/ReplaceConstantMemory.h
@ -0,0 +1,12 @@
 #ifndef __NVVM2x86_REPLACE_CONSTANT_MEMORY__
 #define __NVVM2x86_REPLACE_CONSTANT_MEMORY__
 #include "llvm/IR/Module.h"
 #include <fstream>
 /*
 * From: @ff_variable = internal global [5 x float] undef, align 16
 * To: @wrapper_global_ff_variable = common global [5 x float] zeroinitializer
 */
 void ReplaceConstantMemory(llvm::Module *M, std::ifstream &fin);
 #endif
--- a/compilation/HostTranslation/include/ReplaceCudaBuiltin.h
+++ b/compilation/HostTranslation/include/ReplaceCudaBuiltin.h
@ -0,0 +1,11 @@
 #ifndef __NVVM2x86_REPLACE_CUDA_BUILTIN__
 #define __NVVM2x86_REPLACE_CUDA_BUILTIN__
 #include "llvm/IR/Module.h"
 /*
 * Change to i8* bitcast (i8* (i8*)* @_Z9vecPKiS0_Pii_wrapper to i8*)
 * Original: i8* bitcast (void (i32*, i32*, i32*, i32)* @_Z9vecPKiS0_Pii to i8*)
 */
 void ReplaceCudaBuiltin(llvm::Module *M);
 #endif
--- a/compilation/HostTranslation/include/ReplaceKernelArgs.h
+++ b/compilation/HostTranslation/include/ReplaceKernelArgs.h
@ -0,0 +1,14 @@
 #ifndef __NVVM2x86_REPLACE_KERNEL_ARGS__
 #define __NVVM2x86_REPLACE_KERNEL_ARGS__
 #include "llvm/IR/Module.h"
 /*
 * before:
 * %m_cuda.addr = alloca float*, align 8
 * after:
 * %m_cuda.addr_tmp = call i8* @malloc(i64 8)
 * %m_cuda.addr = bitcast i8* %m_cuda.addr_tmp to float**
 */
 void ReplaceKernelArg(llvm::Module *M);
 #endif
--- a/compilation/HostTranslation/lib/GenerateHostStub.cpp
+++ b/compilation/HostTranslation/lib/GenerateHostStub.cpp
@ -0,0 +1,7 @@
 /**
 *  Generate a file for Cuda Kernel Function Attributes
 *
 *
 *
 *
 */
--- a/compilation/HostTranslation/lib/InitializeDevice.cpp
+++ b/compilation/HostTranslation/lib/InitializeDevice.cpp
@ -0,0 +1,6 @@
 /*
  Initialize the cudaDevice as first statements if not set by the User
  (cudaSetDevice)
 */
--- a/compilation/HostTranslation/lib/RemoveCudaBuiltin.cpp
+++ b/compilation/HostTranslation/lib/RemoveCudaBuiltin.cpp
@ -0,0 +1,59 @@
 /**
 * Remove Clang cuda builtin functions and variables
 */
 #include "RemoveCudaBuiltin.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/ToolOutputFile.h"
 #include <iostream>
 #include <map>
 #include <set>
 using namespace llvm;
 /*
  * Erase the CUDA registration scaffolding from the host module M:
  *  1. the @llvm.global_ctors variable,
  *  2. the module ctor/dtor and __cuda_register_globals, looked up by name,
  *  3. every remaining function whose name is one of the registration
  *     builtins listed below.
  * Each value has its references dropped before it is erased.
  */
 void RemoveCudaBuiltin(llvm::Module *M) {
  if (GlobalVariable *ctor_table = M->getGlobalVariable("llvm.global_ctors")) {
    ctor_table->dropAllReferences();
    ctor_table->eraseFromParent();
  }

  // Looked up and erased one by one, in this order.
  const char *const eager_names[] = {"__cuda_module_ctor",
                                     "__cuda_module_dtor",
                                     "__cuda_register_globals"};
  for (const char *eager_name : eager_names) {
    Function *helper = M->getFunction(eager_name);
    if (helper == NULL) {
      continue;
    }
    helper->dropAllReferences();
    helper->eraseFromParent();
  }

  // Names matched against every function still present in the module.
  const std::set<std::string> builtin_names = {
      "__cuda_module_dtor",         "__cuda_register_globals",
      "__cudaRegisterFunction",     "__cudaRegisterVar",
      "__cudaRegisterFatBinary",    "__cuda_module_ctor",
      "__cudaRegisterFatBinaryEnd", "__cudaUnregisterFatBinary"};

  // Collect first, erase afterwards, so the function list is not modified
  // while it is being walked.
  std::set<llvm::Function *> doomed;
  for (Function &candidate : *M) {
    if (builtin_names.count(candidate.getName().str()) != 0) {
      doomed.insert(&candidate);
    }
  }
  for (llvm::Function *victim : doomed) {
    victim->dropAllReferences();
    victim->eraseFromParent();
  }
 }
--- a/compilation/HostTranslation/lib/ReplaceConstantMemory.cpp
+++ b/compilation/HostTranslation/lib/ReplaceConstantMemory.cpp
@ -0,0 +1,93 @@
 #include "ReplaceConstantMemory.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include <assert.h>
 #include <fstream>
 #include <iostream>
 #include <map>
 #include <set>
 using namespace llvm;
 /*
  * Replace constant-memory globals of the host module with zero-initialised
  * globals that have common linkage.
  *
  * M   : host module, modified in place.
  * fin : metadata stream. Everything up to and including the first line that
  *       contains "ConstMemory2GlobalMemory" is skipped; each following line,
  *       until a line containing "END", is parsed as
  *           <constant_name><space><3 characters><global_name>
  *       (the 3 skipped characters are presumably a "to " separator -- confirm
  *       against the code that writes the metadata file).
  *
  * For each global variable of M whose name appears as <constant_name>, a new
  * global named <global_name> with the same value type is created, all uses
  * of the old variable are redirected to it through a pointer cast, and the
  * old variable is erased. Only array and struct value types are supported.
  *
  * Blank or malformed mapping lines are skipped (non-blank ones with a
  * warning on stderr); previously they raised std::out_of_range from substr
  * or produced a mapping to a garbage name.
  *
  * NOTE(review): any line containing the substring "END" terminates the
  * section, so a symbol name containing "END" would cut the table short.
  */
 void ReplaceConstantMemory(llvm::Module *M, std::ifstream &fin) {
  std::string s;
  bool find_constant_memory = false;
  while (getline(fin, s)) {
    if (s.find("ConstMemory2GlobalMemory") != std::string::npos) {
      find_constant_memory = true;
      break;
    }
  }
  if (!find_constant_memory) {
    assert(0 && "Do not find constant to global mapping\n");
  }

  // number of characters between the space and the global name
  const size_t separator_len = 3;
  std::map<std::string, std::string> corresponding_global_memory;
  while (getline(fin, s)) {
    if (s.find("END") != std::string::npos) {
      break;
    }
    const size_t pos = s.find(' ');
    // Need a non-empty constant name, the space, the separator and at least
    // one character of global name.
    if (pos == std::string::npos || pos == 0 ||
        s.length() <= pos + 1 + separator_len) {
      if (!s.empty()) {
        std::cerr << "ReplaceConstantMemory: skip malformed mapping line: "
                  << s << std::endl;
      }
      continue;
    }
    corresponding_global_memory.insert(std::pair<std::string, std::string>(
        s.substr(0, pos), s.substr(pos + 1 + separator_len)));
  }

  std::set<llvm::GlobalVariable *> need_remove_constant_memory;
  // find all constant memory and generate corresponding global memory
  for (llvm::GlobalVariable &constant_memory : M->globals()) {
    auto mapping =
        corresponding_global_memory.find(constant_memory.getName().str());
    if (mapping == corresponding_global_memory.end()) {
      continue;
    }
    // Type of the stored value (the global itself is a pointer to it).
    llvm::Type *value_type = constant_memory.getValueType();
    if (!value_type->isArrayTy() && !value_type->isStructTy()) {
      assert(0 && "The required Constant Memory Type is not supported\n");
      // Without asserts: leave the variable alone rather than erasing a
      // value whose uses were never redirected.
      continue;
    }
    // create the corresponding zero-initialised global memory variable
    llvm::GlobalVariable *global_memory = new llvm::GlobalVariable(
        *M, value_type, false, llvm::GlobalValue::CommonLinkage,
        llvm::ConstantAggregateZero::get(value_type), mapping->second, NULL,
        llvm::GlobalValue::NotThreadLocal, 0);
    constant_memory.replaceAllUsesWith(llvm::ConstantExpr::getPointerCast(
        global_memory, constant_memory.getType()));
    need_remove_constant_memory.insert(&constant_memory);
  }

  // erase the originals only after the walk over the global list is done
  for (auto i : need_remove_constant_memory) {
    i->dropAllReferences();
    i->eraseFromParent();
  }
  return;
 }
--- a/compilation/HostTranslation/lib/ReplaceCudaBuiltin.cpp
+++ b/compilation/HostTranslation/lib/ReplaceCudaBuiltin.cpp
@ -0,0 +1,292 @@
 #include "ReplaceCudaBuiltin.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/ToolOutputFile.h"
 #include <iostream>
 #include <map>
 #include <set>
 using namespace llvm;
 /*
 Make kernel launches synchronous: after every call to a function that itself
 contains a call to cudaLaunchKernel*, insert a call to cudaDeviceSynchronize.
  call void @_Z13staticReversePii(i32* %55, i32 64)
  %57 = call i32 @cudaDeviceSynchronize()
 */
 void InsertSyncAfterKernelLaunch(llvm::Module *M) {
  // declare (or reuse) i32 @cudaDeviceSynchronize()
  llvm::FunctionType *sync_type =
      FunctionType::get(Type::getInt32Ty(M->getContext()), false);
  llvm::FunctionCallee sync_callee =
      M->getOrInsertFunction("cudaDeviceSynchronize", sync_type);
  llvm::Function *sync_func =
      llvm::cast<llvm::Function>(sync_callee.getCallee());

  // Pass 1: names of all functions whose body calls cudaLaunchKernel*.
  std::set<std::string> launcher_names;
  for (Function &host_func : *M) {
    const std::string host_name = host_func.getName().str();
    for (BasicBlock &block : host_func) {
      for (Instruction &instr : block) {
        llvm::CallBase *call = llvm::dyn_cast<llvm::CallBase>(&instr);
        if (call == NULL) {
          continue;
        }
        Function *callee = call->getCalledFunction();
        if (callee != NULL &&
            callee->getName().startswith("cudaLaunchKernel")) {
          launcher_names.insert(host_name);
        }
      }
    }
  }

  // Pass 2: put a synchronize call behind every direct call to a launcher.
  for (Function &host_func : *M) {
    for (BasicBlock &block : host_func) {
      for (Instruction &instr : block) {
        llvm::CallBase *call = llvm::dyn_cast<llvm::CallBase>(&instr);
        if (call == NULL) {
          continue;
        }
        Function *callee = call->getCalledFunction();
        if (callee == NULL ||
            launcher_names.count(callee->getName().str()) == 0) {
          continue;
        }
        // No following instruction (e.g. the call ends its block): nothing
        // is inserted.
        if (Instruction *following = call->getNextNonDebugInstruction()) {
          llvm::CallInst::Create(sync_func, "", following);
        }
      }
    }
  }
 }
 // Rewrite the kernel operand of every cudaLaunchKernel* call so that it
 // refers to a "<kernel>_wrapper" declaration instead of the kernel itself.
 // Change to i8* bitcast (i8* (i8*)* @_Z9vecPKiS0_Pii_wrapper to i8*)
 // Original: i8* bitcast (void (i32*, i32*, i32*, i32)* @_Z9vecPKiS0_Pii to i8*)
 // NOTE(review): the wrapper declared below has type void (i8*), not
 // i8* (i8*) as the example above suggests.
 //
 // Steps:
 //  1. Collect the names of the functions passed as argument 1 to
 //     __cudaRegisterFunction inside __cuda_register_globals.
 //  2. Walk every call/invoke in the module:
 //     - cudaLaunchKernel*: replace operand 0 by a bitcast of the wrapper
 //       declaration (created once per kernel name, cached in `kernels`);
 //     - a direct call to a function whose name was collected in step 1:
 //       rename that function by appending "_host".
 void ReplaceKernelLaunch(llvm::Module *M) {
  LLVMContext &context = M->getContext();
  auto VoidTy = llvm::Type::getVoidTy(context);
  auto I8 = llvm::Type::getInt8PtrTy(context);
  // kernel name -> wrapper declaration already created for it
  std::map<std::string, Function *> kernels;
  // NOTE(review): need_remove, Int8T, LauncherFuncT, LaunchFun2, done, str and
  // ss are not used anywhere else in this function.
  std::set<llvm::Function *> need_remove;
  LLVMContext *C = &M->getContext();
  llvm::Type *Int32T = Type::getInt32Ty(*C);
  llvm::Type *Int8T = Type::getInt8Ty(*C);
  llvm::FunctionType *LauncherFuncT =
      FunctionType::get(Type::getVoidTy(*C), NULL);
  llvm::FunctionType *LaunchFun2 =
      FunctionType::get(PointerType::get(PointerType::get(Int32T, 0), 0), NULL);
  bool done = false;
  // names of the functions registered through __cudaRegisterFunction
  std::set<std::string> cuda_register_kernel_names;
  std::string str;
  llvm::raw_string_ostream ss(str);
  /*
  When using << >>, clang generates cudaPushCallConfiguration with the same
  function definition as the kernel definition in the kernel bitcode
    define internal void @__cuda_register_globals(i8** %0) {
    entry:
      %1 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void (float*,
  float*, i32, i32)* @_Z4Fan1PfS_ii to i8*), i8* getelementptr inbounds ([14 x
  i8], [14 x i8]* @0, i64 0, i64 0), i8* getelementptr inbounds ([14 x i8], [14
  x i8]* @0, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32*
  null) %2 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void
  (float*, float*, float*, i32, i32, i32)* @_Z4Fan2PfS_S_iii to i8*), i8*
  getelementptr inbounds ([17 x i8], [17 x i8]* @1, i64 0, i64 0), i8*
  getelementptr inbounds ([17 x i8], [17 x i8]* @1, i64 0, i64 0), i32 -1, i8*
  null, i8* null, i8* null, i8* null, i32* null) ret void
    }
  */
  // Step 1: harvest the registered kernel names.
  Function *f_register_global = M->getFunction("__cuda_register_globals");
  if (f_register_global) {
    for (Function::iterator b = f_register_global->begin();
         b != f_register_global->end(); ++b) {
      BasicBlock *B = &(*b);
      for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) {
        Instruction *inst = &(*i);
        if (llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(inst)) {
          if (Function *calledFunction = callInst->getCalledFunction()) {
            if (calledFunction->getName().str() == "__cudaRegisterFunction") {
              // argument 1 is the kernel being registered
              Value *callOperand = callInst->getArgOperand(1);
              Function *functionOperand =
                  dyn_cast<Function>(callInst->getArgOperand(1));
              // call function is wrapped in a bitcast
              // (a kernel passed as a bare Function is not recorded)
              if (functionOperand == NULL) {
                std::vector<size_t> arg_sizes;
                // NOTE(review): the dyn_cast result is dereferenced without
                // a null check.
                functionOperand =
                    dyn_cast<Function>(callOperand->stripPointerCasts());
                cuda_register_kernel_names.insert(
                    functionOperand->getName().str());
                std::cout << "Cuda Register Global Kernel: "
                          << functionOperand->getName().str() << std::endl;
              }
            }
          }
        }
      }
    }
  }
  // Set once any function has been renamed with the "_host" suffix.
  bool host_changed = false;
  // Step 2: rewrite launch sites and rename colliding host functions.
  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
    Function *F = &(*i);
    // name of the enclosing function, captured before this function's body
    // is scanned
    auto func_name = F->getName().str();
    for (Function::iterator b = F->begin(); b != F->end(); ++b) {
      BasicBlock *B = &(*b);
      for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) {
        Instruction *inst = &(*i);
        if (llvm::CallBase *callInst = llvm::dyn_cast<llvm::CallBase>(inst)) {
          if (Function *calledFunction = callInst->getCalledFunction()) {
            if (calledFunction->getName().startswith("cudaLaunchKernel")) {
              // operand 0 is the kernel to launch
              Value *callOperand = callInst->getArgOperand(0);
              Function *functionOperand =
                  dyn_cast<Function>(callInst->getArgOperand(0));
              // call function is wrapped in a bitcast
              // (an operand that is a bare Function is left untouched)
              if (functionOperand == NULL) {
                std::vector<size_t> arg_sizes;
                // NOTE(review): dereferenced below without a null check.
                functionOperand =
                    dyn_cast<Function>(callOperand->stripPointerCasts());
                FunctionType *ft = calledFunction->getFunctionType();
                std::cout << " Parent (Caller) Function Name: " << func_name
                          << ", cudaLaunchKernel Function: "
                          << functionOperand->getName().str() << ", args "
                          << functionOperand->arg_size() << std::endl;
                // Wrapper already declared for this kernel: emit a fresh
                // bitcast at this call site and move on to the next
                // instruction.
                auto rep = kernels.find(functionOperand->getName().str());
                if (rep != kernels.end()) {
                  Function *FC = rep->second;
                  BitCastInst *B = new BitCastInst(FC, I8, "", callInst);
                  callInst->setArgOperand(0, B);
                  continue;
                }
                // wrapper signature: void (i8*)
                std::vector<Type *> Params;
                Params.push_back(I8);
                FunctionType *FT = FunctionType::get(VoidTy, Params, false);
                /*
                  Because of the TODO in the 2nd if statement, need to get the
                  prior name before _host is add
                */
                std::string oldName = functionOperand->getName().str();
                // if parent function is __host and same as the cudaKernelLaunch
                std::string newName = oldName + "_wrapper";
                if (func_name == oldName && host_changed &&
                    oldName.find("_host") != std::string::npos) {
                  // drop the trailing 5 characters (the length of "_host")
                  // before appending "_wrapper"
                  newName =
                      oldName.substr(0, oldName.length() - 5) + "_wrapper";
                }
                std::cout << "Change Kernel Name to: " << newName << std::endl;
                // declaration only; no body is created here
                Function *F =
                    Function::Create(FT, Function::ExternalLinkage, newName, M);
                F->setDSOLocal(true);
                F->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
                BitCastInst *BC = new BitCastInst(F, I8, "", callInst);
                callInst->setArgOperand(0, BC);
                kernels.insert({functionOperand->getName().str(), F});
              }
            } else if (cuda_register_kernel_names.find(
                           calledFunction->getName()) !=
                       cuda_register_kernel_names.end()) {
              // if the called function collides with kernel definiton
              // TODO: some reason changes all occurences of the function name
              // for both cudaKernelLaunch calls and regular function call
              // errs() << *inst;
              // setName renames the Function object itself, so every
              // reference to it sees the new name.
              host_changed = true;
              calledFunction->setName(calledFunction->getName() + "_host");
              std::cout << std::endl;
              std::cout << "Change Host Function Name To: "
                        << calledFunction->getName().str() << std::endl;
            }
          }
        }
      }
    }
  }
 }
 /*
 Redirect every direct call to cudaMemcpyToSymbol to cudaMemcpyToSymbol_host,
 forwarding the five original operands unchanged:
   i32 @cudaMemcpyToSymbol_host(i8*, i8*, i64, i64, i32)
 The replacement is declared only if at least one such call exists.
 */
 void ReplaceMemcpyToSymbol(llvm::Module *M) {
  // Gather the call sites first so no block is edited while it is walked.
  std::vector<llvm::Instruction *> symbol_copies;
  for (Function &func : *M) {
    for (BasicBlock &block : func) {
      for (Instruction &instr : block) {
        CallInst *call = dyn_cast<CallInst>(&instr);
        if (call == NULL || call->getCalledFunction() == NULL) {
          continue;
        }
        if (call->getCalledFunction()->getName().str() ==
            "cudaMemcpyToSymbol") {
          symbol_copies.push_back(call);
        }
      }
    }
  }
  if (symbol_copies.empty()) {
    return;
  }

  LLVMContext &context = M->getContext();
  llvm::Type *BytePtrTy = llvm::Type::getInt8PtrTy(context);
  llvm::Type *I64 = llvm::Type::getInt64Ty(context);
  llvm::Type *I32 = llvm::Type::getInt32Ty(context);
  // i32 @cudaMemcpyToSymbol(i8* %1, i8* %2, i64 %3, i64 %4, i32 %5)
  std::vector<llvm::Type *> param_types = {BytePtrTy, BytePtrTy, I64, I64,
                                           I32};
  llvm::FunctionType *host_type = FunctionType::get(I32, param_types, false);
  llvm::FunctionCallee host_callee =
      M->getOrInsertFunction("cudaMemcpyToSymbol_host", host_type);
  llvm::Function *host_func =
      llvm::cast<llvm::Function>(host_callee.getCallee());

  for (llvm::Instruction *old_inst : symbol_copies) {
    CallInst *old_call = cast<CallInst>(old_inst);
    // forward operands 0..4 as they are
    std::vector<Value *> forwarded;
    for (unsigned idx = 0; idx < 5; ++idx) {
      forwarded.push_back(old_call->getArgOperand(idx));
    }
    CallInst *new_call =
        llvm::CallInst::Create(host_func, forwarded, "", old_call);
    old_call->replaceAllUsesWith(new_call);
    old_call->eraseFromParent();
  }
 }
 // Host-side rewrite of CUDA runtime usage in module M. Runs three sub-passes
 // in this order:
 //  1. InsertSyncAfterKernelLaunch - add cudaDeviceSynchronize after calls to
 //     functions that launch kernels,
 //  2. ReplaceKernelLaunch         - point cudaLaunchKernel* at the
 //     "<kernel>_wrapper" declarations,
 //  3. ReplaceMemcpyToSymbol       - redirect cudaMemcpyToSymbol calls to
 //     cudaMemcpyToSymbol_host.
 // NOTE(review): step 2 may rename functions (adds "_host"); presumably step 1
 // is meant to run before that happens -- confirm before reordering.
 void ReplaceCudaBuiltin(llvm::Module *M) {
  InsertSyncAfterKernelLaunch(M);
  ReplaceKernelLaunch(M);
  ReplaceMemcpyToSymbol(M);
 }
--- a/compilation/HostTranslation/lib/ReplaceKernelArgs.cpp
+++ b/compilation/HostTranslation/lib/ReplaceKernelArgs.cpp
@ -0,0 +1,90 @@
 #include "ReplaceKernelArgs.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/ToolOutputFile.h"
 #include <iostream>
 #include <map>
 #include <set>
 using namespace llvm;
 /*
 * Move the stack slots of every kernel-launching host function to the heap.
 *
 * before:
 * %m_cuda.addr = alloca float*, align 8
 * after (value names are illustrative):
 * %m_cuda.addr_tmp = call i8* @malloc(i64 256)
 * %m_cuda.addr = bitcast i8* %m_cuda.addr_tmp to float**
 *
 * A function is selected when its body contains a direct call to a function
 * whose name starts with "cudaLaunchKernel". In such a function EVERY alloca
 * is replaced, not only the ones that hold kernel arguments.
 *
 * The requested size is max(256, <alloc size of the allocated type> *
 * <array size of the alloca>). The previous fixed 256 bytes overflowed for
 * any larger stack object (e.g. a local array); 256 is kept as a floor so
 * that small slots get the same size as before.
 *
 * No matching free is emitted by this pass, and `malloc` is declared in the
 * module even when no function needs rewriting.
 */
 // TODO: we use hard-code to implement this replacement,
 // to use use-analysis to find the arguments in the future
 void ReplaceKernelArg(llvm::Module *M) {
  LLVMContext &context = M->getContext();
  llvm::Type *I64 = llvm::Type::getInt64Ty(context);
  const DataLayout &layout = M->getDataLayout();

  // functions that contain a direct cudaLaunchKernel* call
  std::set<llvm::Function *> need_replace;
  for (Function &F : *M) {
    for (BasicBlock &B : F) {
      for (Instruction &inst : B) {
        llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&inst);
        if (!callInst) {
          continue;
        }
        Function *calledFunction = callInst->getCalledFunction();
        if (calledFunction &&
            calledFunction->getName().startswith("cudaLaunchKernel")) {
          need_replace.insert(&F);
        }
      }
    }
  }

  // find/create C's malloc function: i8* @malloc(i64)
  llvm::FunctionType *mallocFuncType = FunctionType::get(
      llvm::Type::getInt8PtrTy(context), {I64}, false);
  llvm::FunctionCallee _f = M->getOrInsertFunction("malloc", mallocFuncType);
  llvm::Function *func_malloc = llvm::cast<llvm::Function>(_f.getCallee());

  // lower bound of every allocation, kept from the original implementation
  llvm::Constant *min_bytes = ConstantInt::get(I64, 256);

  for (auto F : need_replace) {
    // Collect first so no block is modified while it is being walked.
    std::vector<llvm::AllocaInst *> allocas;
    for (BasicBlock &B : *F) {
      for (Instruction &inst : B) {
        if (llvm::AllocaInst *alloc = llvm::dyn_cast<llvm::AllocaInst>(&inst)) {
          allocas.push_back(alloc);
        }
      }
    }

    for (llvm::AllocaInst *alloc : allocas) {
      // bytes = max(256, sizeof(element) * element count); the builder
      // folds this to a constant when the element count is constant.
      IRBuilder<> builder(alloc);
      uint64_t elem_bytes = layout.getTypeAllocSize(alloc->getAllocatedType());
      Value *count = builder.CreateZExtOrTrunc(alloc->getArraySize(), I64);
      Value *wanted =
          builder.CreateMul(count, ConstantInt::get(I64, elem_bytes));
      Value *bytes = builder.CreateSelect(
          builder.CreateICmpULT(wanted, min_bytes), min_bytes, wanted);

      auto c_malloc_inst =
          llvm::CallInst::Create(func_malloc, bytes, "", alloc);
      auto bit_cast = new BitCastInst(c_malloc_inst, alloc->getType(),
                                      alloc->getName().str(), alloc);
      alloc->replaceAllUsesWith(bit_cast);
      alloc->eraseFromParent();
    }
  }
 }
--- a/compilation/HostTranslation/lib/ReplaceKernelLaunch.cpp
+++ b/compilation/HostTranslation/lib/ReplaceKernelLaunch.cpp
@ -1,94 +0,0 @@
 #include "ReplaceKernelLaunch.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include <iostream>
 #include <map>
 #include <set>
 using namespace llvm;
 // Earlier version of the kernel-launch rewrite (this file is removed by the
 // commit). For every cudaLaunchKernel* call whose operand 0 is a function
 // wrapped in a cast, operand 0 is replaced by a bitcast of a
 // "<kernel>_wrapper" declaration.
 // Change to i8* bitcast (i8* (i8*)* @_Z9vecPKiS0_Pii_wrapper to i8*)
 // Original: i8* bitcast (void (i32*, i32*, i32*, i32)* @_Z9vecPKiS0_Pii to i8*)
 void ReplaceKernelLaunch(llvm::Module *M) {
  LLVMContext &context = M->getContext();
  auto VoidTy = llvm::Type::getVoidTy(context);
  auto I8 = llvm::Type::getInt8PtrTy(context);
  // kernel name -> bitcast instruction created at the first launch site
  std::map<std::string, BitCastInst *> kernels;
  // NOTE(review): Int8T, LauncherFuncT, LaunchFun2 and done are not used
  // anywhere else in this function.
  LLVMContext *C = &M->getContext();
  llvm::Type *Int32T = Type::getInt32Ty(*C);
  llvm::Type *Int8T = Type::getInt8Ty(*C);
  llvm::FunctionType *LauncherFuncT =
      FunctionType::get(Type::getVoidTy(*C), NULL);
  llvm::FunctionType *LaunchFun2 =
      FunctionType::get(PointerType::get(PointerType::get(Int32T, 0), 0), NULL);
  bool done = false;
  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
    Function *F = &(*i);
    auto func_name = F->getName().str();
    for (Function::iterator b = F->begin(); b != F->end(); ++b) {
      BasicBlock *B = &(*b);
      for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) {
        Instruction *inst = &(*i);
        if (llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(inst)) {
          if (Function *calledFunction = callInst->getCalledFunction()) {
            if (calledFunction->getName().startswith("cudaLaunchKernel")) {
              // operand 0 is the kernel to launch
              Value *callOperand = callInst->getArgOperand(0);
              Function *functionOperand =
                  dyn_cast<Function>(callInst->getArgOperand(0));
              // call function is wrapped in a bitcast
              if (functionOperand == NULL) {
                std::vector<size_t> arg_sizes;
                // NOTE(review): the dyn_cast result is dereferenced below
                // without a null check.
                functionOperand =
                    dyn_cast<Function>(callOperand->stripPointerCasts());
                FunctionType *ft = calledFunction->getFunctionType();
                std::cout << " Parent (Caller) Function Name: " << func_name
                          << ", cudaLaunchKernel Function: "
                          << functionOperand->getName().str() << ", args "
                          << functionOperand->arg_size() << std::endl;
                auto rep = kernels.find(functionOperand->getName().str());
                if (rep != kernels.end()) {
                  // NOTE(review): reuses the BitCastInst that was inserted
                  // before the first launch site of this kernel; if this
                  // call site is in a different function, that instruction
                  // is not available here -- looks like a bug, confirm.
                  callInst->setArgOperand(0, rep->second);
                  continue;
                }
                // wrapper signature: void (i8*)
                std::vector<Type *> Params;
                Params.push_back(I8);
                FunctionType *FT = FunctionType::get(VoidTy, Params, false);
                std::string newName =
                    functionOperand->getName().str() + "_wrapper";
                // declaration only; no body is created here
                Function *F =
                    Function::Create(FT, Function::ExternalLinkage, newName, M);
                F->setDSOLocal(true);
                BitCastInst *BC = new BitCastInst(F, I8, "", callInst);
                callInst->setArgOperand(0, BC);
                kernels.insert({functionOperand->getName().str(), BC});
              }
            }
          }
        }
      }
    }
  }
 }
--- a/compilation/KernelTranslation.cpp
+++ b/compilation/KernelTranslation.cpp
@ -8,46 +8,66 @@
 #include "warp_func.h"
 #include "llvm/IR/Module.h"
 #include <assert.h>
 #include <fstream>
 #include <iostream>
 #include <llvm/Support/raw_ostream.h>
 #include <map>
 #include <set>
 #include <stdlib.h>
 using namespace llvm;
 std::string PATH = "kernel_meta.log";
 // Entry point of the kernel-side translator: loads the module named by
 // argv[1], runs the transformation steps below in order, verifies the result
 // and dumps it to argv[2]. On the '+' side a metadata file (PATH,
 // "kernel_meta.log") is opened for writing and passed to init_block.
 // NOTE(review): this block is shown as a diff; the grid_dim/block_dim
 // assignments and the two-argument replace_built_in_function call refer to
 // variables declared only on removed ('-') lines, so they are presumably
 // removed by the commit as well -- confirm against the real file.
 int main(int argc, char **argv) {
-  assert(argc == 9 && "incorrect number of arguments\n");
+  assert(argc == 3 && "incorrect number of arguments\n");
  llvm::Module *program = LoadModuleFromFilr(argv[1]);
-  // get size of grid and dim from input arguments
+
-  int *grid_dim = new int[3];
+  std::ofstream fout;
-  int *block_dim = new int[3];
+  fout.open(PATH);
  grid_dim[0] = atoi(argv[3]);
  grid_dim[1] = atoi(argv[4]);
  grid_dim[2] = atoi(argv[5]);
  block_dim[0] = atoi(argv[6]);
  block_dim[1] = atoi(argv[7]);
  block_dim[2] = atoi(argv[8]);
  // inline, and create auxiliary global variables
-  init_block(program);
+  init_block(program, fout);
  // insert sync before each vote, and replace the
  // original vote function to warp vote
  handle_warp_vote(program);
  // replace warp shuffle
  // VerifyModule(program);
  handle_warp_shfl(program);
  // insert sync
  // VerifyModule(program);
  insert_sync(program);
  // split block by sync
  // VerifyModule(program);
  std::cout << "split\n" << std::flush;
  split_block_by_sync(program);
  // add loop for intra&intera thread
-  insert_warp_loop(program);
+
  // (TODO): replace this patch
  replace_built_in_function(program, grid_dim, block_dim);
  // VerifyModule(program);
  std::cout << "insert\n" << std::flush;
  insert_warp_loop(program);
  // VerifyModule(program);
  // (TODO): replace this patch
  std::cout << "replace\n" << std::flush;
  replace_built_in_function(program);
  // VerifyModule(program);
  std::cout << "generate\n" << std::flush;
  generate_x86_format(program);
  // VerifyModule(program);
  // performance optimization
  performance_optimization(program);
  // final verification, then write the translated module to argv[2]
  VerifyModule(program);
  DumpModule(program, argv[2]);
  fout.close();
  return 0;
 }
--- a/compilation/KernelTranslation/include/generate_x86_format.h
+++ b/compilation/KernelTranslation/include/generate_x86_format.h
@ -5,4 +5,6 @@
 void generate_x86_format(llvm::Module *M);
 void set_meta_data(llvm::Module *M);
 #endif
--- a/compilation/KernelTranslation/include/init.h
+++ b/compilation/KernelTranslation/include/init.h
@ -2,6 +2,6 @@
 #define __NVVM2x86_INIT__
 #include "llvm/IR/Module.h"
-
+#include <fstream>
-void init_block(llvm::Module *M);
+void init_block(llvm::Module *M, std::ofstream &fout);
 #endif
--- a/compilation/KernelTranslation/include/memory_hierarchy.h
+++ b/compilation/KernelTranslation/include/memory_hierarchy.h
@ -1,9 +1,10 @@
 #ifndef __NVVM2x86_MEMORY_HIERARCHY__
 #define __NVVM2x86_MEMORY_HIERARCHY__
 #include "llvm/IR/Module.h"
-
+#include <fstream>
 using namespace llvm;
 void mem_share2global(llvm::Module *M);
 void mem_constant2global(llvm::Module *M, std::ofstream &fout);
 #endif
--- a/compilation/KernelTranslation/include/tool.h
+++ b/compilation/KernelTranslation/include/tool.h
@ -12,7 +12,7 @@ llvm::CallInst *CreateIntraWarpBarrier(llvm::Instruction *InsertBefore);
 void VerifyModule(llvm::Module *);
 void phi2alloc(llvm::Module *M);
 void remove_cuda_built_in(llvm::Module *M);
-void replace_built_in_function(llvm::Module *M, int *grid_dim, int *block_dim);
+void replace_built_in_function(llvm::Module *M);
 void replace_asm_call(llvm::Module *M);
 bool find_block_barrier_in_region(llvm::BasicBlock *start,
                                  llvm::BasicBlock *end);
@ -21,4 +21,5 @@ bool has_warp_barrier(llvm::BasicBlock *B);
 bool has_barrier(llvm::BasicBlock *B);
 bool has_block_barrier(llvm::BasicBlock *B);
 bool has_barrier(llvm::Function *F);
 void replace_dynamic_shared_memory(llvm::Module *M);
 #endif
--- a/compilation/KernelTranslation/lib/generate_x86_format.cpp
+++ b/compilation/KernelTranslation/lib/generate_x86_format.cpp
@ -18,6 +18,7 @@
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
 #include <iostream>
 using namespace llvm;
@ -40,6 +41,10 @@ void decode_input(llvm::Module *M) {
  llvm::FunctionType *LauncherFuncT = FunctionType::get(
      Type::getVoidTy(*C), {PointerType::get(Int8T, 0)}, false);
  std::set<GlobalVariable *> dynmaic_memory;
  std::map<GlobalVariable *, Value *> corres_dynamic_memory_load_address;
  // generate Wrapper Function type
  // now we only support a single int32*
  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
@ -64,6 +69,51 @@ void decode_input(llvm::Module *M) {
    // convert to int**
    input_arg = Builder.CreateBitOrPointerCast(
        input_arg, PointerType::get(PointerType::get(Int32T, 0), 0));
    // dynamic memory load in the wrapper function
    GlobalVariable *share_memory = M->getGlobalVariable("wrapper_global_data");
    if (share_memory != NULL) {
      dynmaic_memory.insert(share_memory);
      llvm::GlobalVariable *global_mem = new llvm::GlobalVariable(
          *M, Int32T, false, llvm::GlobalValue::ExternalLinkage, NULL,
          "thread_memory_size", NULL, llvm::GlobalValue::GeneralDynamicTLSModel,
          0, false);
      Value *loadedValue = Builder.CreateLoad(global_mem);
      llvm::FunctionType *LaunchFun2 = FunctionType::get(
          PointerType::get(PointerType::get(Int32T, 0), 0), NULL);
      FunctionCallee fc2 =
          M->getOrInsertFunction("_wrapper_global_data", LaunchFun2);
      Function *WorkGroup2 = dyn_cast<Function>(fc2.getCallee());
      WorkGroup2->setLinkage(GlobalValue::WeakODRLinkage);
      WorkGroup2->setVisibility(GlobalValue::HiddenVisibility);
      Comdat *co = M->getOrInsertComdat("_wrapper_global_data");
      co->setSelectionKind(Comdat::SelectionKind::Any);
      WorkGroup2->setComdat(co);
      BasicBlock *Block2 = BasicBlock::Create(M->getContext(), "", WorkGroup2);
      llvm::IRBuilder<> Builder2(M->getContext());
      Builder2.SetInsertPoint(Block2);
      Builder2.CreateRet(share_memory);
      auto PT = dyn_cast<PointerType>(share_memory->getType());
      auto element_type = PT->getElementType();
      // std::cout << element_type->getTypeID()  << " Got global memor $$$$$$"
      // << share_memory->getName().str() << std::endl;
      AllocaInst *new_arr = Builder.CreateAlloca(Int8T, loadedValue, "new_arr");
      // new_arr->setAlignment(llvm::MaybeAlign(16));
      Value *new_ar = new_arr;
      Value *gptr = Builder.CreateBitOrPointerCast(
          share_memory, PointerType::get(PointerType::get(Int8T, 0), 0));
      Builder.CreateStore(new_ar, gptr);
    }
    size_t idx = 0;
    for (Function::const_arg_iterator ii = F->arg_begin(), ee = F->arg_end();
         ii != ee; ++ii) {
@ -95,6 +145,8 @@ void remove_barrier(llvm::Module *M) {
    for (auto BB = F->begin(); BB != F->end(); ++BB) {
      for (auto BI = BB->begin(); BI != BB->end(); BI++) {
        if (auto Call = dyn_cast<CallInst>(BI)) {
          if (Call->isInlineAsm())
            continue;
          auto func_name = Call->getCalledFunction()->getName().str();
          if (func_name == "llvm.nvvm.bar.warp.sync" ||
              func_name == "llvm.nvvm.barrier0" ||
@ -109,6 +161,11 @@ void remove_barrier(llvm::Module *M) {
  }
 }
 // Erases the globals "intra_warp_index" and "inter_warp_index" from M.
 // getGlobalVariable returns null when the module has no such global, so each
 // lookup is checked before erasing instead of dereferencing it blindly.
 void remove_useless_var(llvm::Module *M) {
  if (auto *intra_index = M->getGlobalVariable("intra_warp_index"))
    intra_index->eraseFromParent();
  if (auto *inter_index = M->getGlobalVariable("inter_warp_index"))
    inter_index->eraseFromParent();
 }
 void generate_x86_format(llvm::Module *M) {
  // change metadata
  set_meta_data(M);
@ -116,4 +173,6 @@ void generate_x86_format(llvm::Module *M) {
  decode_input(M);
  // remove barrier
  remove_barrier(M);
  // remove useless func/variable
  remove_useless_var(M);
 }
--- a/compilation/KernelTranslation/lib/handle_sync.cpp
+++ b/compilation/KernelTranslation/lib/handle_sync.cpp
@ -27,6 +27,8 @@ void split_block_by_sync(llvm::Function *F) {
      }
      llvm::CallInst *Call = llvm::dyn_cast<llvm::CallInst>(inst);
      if (Call) {
        if (Call->isInlineAsm())
          continue;
        auto func_name = Call->getCalledFunction()->getName().str();
        if (func_name == "llvm.nvvm.barrier0" ||
            func_name == "llvm.nvvm.bar.warp.sync" ||
--- a/compilation/KernelTranslation/lib/init.cpp
+++ b/compilation/KernelTranslation/lib/init.cpp
@ -1,6 +1,7 @@
 #include "init.h"
 #include "memory_hierarchy.h"
 #include "tool.h"
 #include <fstream>
 #include <iostream>
 #include <set>
@ -23,7 +24,8 @@
 using namespace llvm;
-void inline_func_vote(llvm::Module *M) {
+bool inline_warp_level_func(llvm::Module *M) {
  bool changed = false;
  std::set<llvm::Function *> need_remove;
  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
@ -36,10 +38,13 @@ void inline_func_vote(llvm::Module *M) {
      for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
        if (CallInst *c = dyn_cast<CallInst>(BI++)) {
          if (c->getCalledFunction()) {
-            if (c->getCalledFunction()->getName().str() == "_Z10__any_syncji") {
+            auto func_name = c->getCalledFunction()->getName().str();
            if (func_name == "_Z10__any_syncji" ||
                func_name.find("shfl_down_sync") != std::string::npos) {
              InlineFunctionInfo IFI;
              InlineFunction(c, IFI);
              need_remove.insert(c->getCalledFunction());
              changed = true;
            }
          }
        }
@ -50,6 +55,56 @@ void inline_func_vote(llvm::Module *M) {
    f->dropAllReferences();
    f->eraseFromParent();
  }
  return changed;
 }
 // Returns true when F contains at least one direct call to a function whose
 // name contains "llvm.nvvm.read.ptx.sreg."; calls without a statically known
 // callee are ignored. A function with no blocks yields false.
 bool find_sreg_inst(llvm::Function *F) {
  const std::string sreg_marker = "llvm.nvvm.read.ptx.sreg.";
  for (BasicBlock &block : *F) {
    for (Instruction &inst : block) {
      auto *call = dyn_cast<CallInst>(&inst);
      if (!call)
        continue;
      Function *callee = call->getCalledFunction();
      if (!callee)
        continue;
      if (callee->getName().str().find(sreg_marker) != std::string::npos)
        return true;
    }
  }
  return false;
 }
 // Inlines every direct call whose callee contains a call to an
 // "llvm.nvvm.read.ptx.sreg.*" function (as reported by find_sreg_inst).
 //
 // Call sites are collected first and inlined afterwards, so no instruction
 // list is modified while it is being iterated.
 //
 // Returns true only if at least one call site was actually inlined.
 // init_block calls this in a loop until it returns false; reporting a change
 // for a call site that InlineFunction refused to inline would keep that loop
 // running forever.
 bool inline_func_with_tid(llvm::Module *M) {
  std::set<CallInst *> need_inline;
  for (Function &F : *M) {
    for (BasicBlock &BB : F) {
      for (Instruction &I : BB) {
        auto *c = dyn_cast<CallInst>(&I);
        if (!c)
          continue;
        Function *callee = c->getCalledFunction();
        // indirect calls have no known callee and cannot be inspected
        if (callee && find_sreg_inst(callee)) {
          printf("inline: %s\n", callee->getName().str().c_str());
          need_inline.insert(c);
        }
      }
    }
  }
  bool changed = false;
  for (auto c : need_inline) {
    InlineFunctionInfo IFI;
    // only a successful inline counts as a modification of the module
    if (InlineFunction(c, IFI))
      changed = true;
  }
  return changed;
 }
 void create_global_variable(llvm::Module *M) {
@ -70,21 +125,33 @@ void create_global_variable(llvm::Module *M) {
                           llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
  new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
                           NULL, "block_size", NULL,
-                           llvm::GlobalValue::NotThreadLocal, 0, false);
+                           llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
  new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
                           NULL, "block_size_x", NULL,
-                           llvm::GlobalValue::NotThreadLocal, 0, false);
+                           llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
  new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
                           NULL, "block_size_y", NULL,
-                           llvm::GlobalValue::NotThreadLocal, 0, false);
+                           llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
  new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
                           NULL, "block_size_z", NULL,
-                           llvm::GlobalValue::NotThreadLocal, 0, false);
+                           llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
  new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
-                           NULL, "grid_size", NULL,
+                           NULL, "grid_size_x", NULL,
-                           llvm::GlobalValue::NotThreadLocal, 0, false);
+                           llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
  new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
-                           NULL, "block_index", NULL,
+                           NULL, "grid_size_y", NULL,
                           llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
  new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
                           NULL, "grid_size_z", NULL,
                           llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
  new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
                           NULL, "block_index_x", NULL,
                           llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
  new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
                           NULL, "block_index_y", NULL,
                           llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
  new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
                           NULL, "block_index_z", NULL,
                           llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
  // TLS variable used for warp-level collective operators
  new llvm::GlobalVariable(
@ -224,24 +291,23 @@ bool lower_constant_expr(llvm::Module *M) {
          auto load_from = load_inst->getOperand(0);
          if (auto get_element_ptr = dyn_cast<llvm::ConstantExpr>(load_from)) {
            modified = true;
            auto ReplInst = get_element_ptr->getAsInstruction();
            ReplInst->insertBefore(load_inst);
            std::vector<Instruction *> Users;
            // Do not replace use during iteration of use. Do it in another loop
            for (auto U : get_element_ptr->users()) {
              if (auto InstUser = dyn_cast<Instruction>(U)) {
                Users.push_back(InstUser);
              }
            }
-            for (auto &User : Users)
+            for (auto &User : Users) {
              auto ReplInst = get_element_ptr->getAsInstruction();
              ReplInst->insertBefore(User);
              User->replaceUsesOfWith(get_element_ptr, ReplInst);
            }
          }
        } else if (auto store_inst = dyn_cast<llvm::StoreInst>(BI)) {
          auto store_to = store_inst->getOperand(1);
          if (auto addr_cast = dyn_cast<llvm::ConstantExpr>(store_to)) {
            modified = true;
-            auto ReplInst = addr_cast->getAsInstruction();
+
            ReplInst->insertBefore(store_inst);
            std::vector<Instruction *> Users;
            // Do not replace use during iteration of use. Do it in another loop
            for (auto U : addr_cast->users()) {
@ -249,16 +315,19 @@ bool lower_constant_expr(llvm::Module *M) {
                Users.push_back(InstUser);
              }
            }
-            for (auto &User : Users)
+            for (auto &User : Users) {
              auto ReplInst = addr_cast->getAsInstruction();
              ReplInst->insertBefore(User);
              User->replaceUsesOfWith(addr_cast, ReplInst);
            }
          }
        } else if (auto get_element_ptr =
                       dyn_cast<llvm::GetElementPtrInst>(BI)) {
          auto get_from = get_element_ptr->getOperand(0);
          if (auto addr_cast = dyn_cast<llvm::ConstantExpr>(get_from)) {
            modified = true;
-            auto ReplInst = addr_cast->getAsInstruction();
+            // auto ReplInst = addr_cast->getAsInstruction();
-            ReplInst->insertBefore(get_element_ptr);
+            // ReplInst->insertBefore(get_element_ptr);
            std::vector<Instruction *> Users;
            // Do not replace use during iteration of use. Do it in another loop
            for (auto U : addr_cast->users()) {
@ -266,8 +335,11 @@ bool lower_constant_expr(llvm::Module *M) {
                Users.push_back(InstUser);
              }
            }
-            for (auto &User : Users)
+            for (auto &User : Users) {
              auto ReplInst = addr_cast->getAsInstruction();
              ReplInst->insertBefore(User);
              User->replaceUsesOfWith(addr_cast, ReplInst);
            }
          }
        }
      }
@ -276,11 +348,24 @@ bool lower_constant_expr(llvm::Module *M) {
  return modified;
 }
-void init_block(llvm::Module *M) {
+void replace_cuda_math_built_in(llvm::Module *M) {
  // For every function in M whose name contains "_ZL3expd", delete its body.
  // The function itself is not erased from the module, and functions with
  // other names are left untouched.
  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
    Function *F = &(*i);
    auto func_name = F->getName().str();
    if (func_name.find("_ZL3expd") != std::string::npos) {
      F->deleteBody();
    }
  }
 }
 void init_block(llvm::Module *M, std::ofstream &fout) {
  // using official llvm preprocess
  llvm_preprocess(M);
  // remove useles Cuda function
  remove_cuda_built_in(M);
  // replace CUDA math function, like expf
  replace_cuda_math_built_in(M);
  // lower ConstantExpression
  bool modified;
@ -289,14 +374,26 @@ void init_block(llvm::Module *M) {
  } while (modified);
  // remove useless metadata
  remove_metadata(M);
-  // inline vote function
+  // inline warp-level function
-  inline_func_vote(M);
+  while (1) {
    if (!inline_warp_level_func(M))
      break;
  }
  // TODO: remove the hardcode
  while (1) {
    if (!inline_func_with_tid(M))
      break;
  }
  // create global variable for warp and vote
  create_global_variable(M);
  // replace phi with data load
  phi2alloc(M);
  // replace share memory
  mem_share2global(M);
  // replace constant memory
  mem_constant2global(M, fout);
  // replace asm Inline
  replace_asm_call(M);
  // replace dynamic shared memory
  replace_dynamic_shared_memory(M);
 }
--- a/compilation/KernelTranslation/lib/insert_sync.cpp
+++ b/compilation/KernelTranslation/lib/insert_sync.cpp
@ -212,11 +212,22 @@ public:
      changed = true;
      // we may create a new conditional barrier after insert
-      if (!PDT->getPostDomTree().dominates(pred, &F.getEntryBlock()))
+      if (!PDT->getPostDomTree().dominates(pred, &F.getEntryBlock())) {
-        conditionalBarriers.push_back(pred);
+        // if the block postdominates all its predecessor
        // then it is not a conditional barriers
        bool post_dominate_all = true;
        for (auto I = pred_begin(pred); I != pred_end(pred); I++) {
          if (!PDT->getPostDomTree().dominates(pred, *I)) {
            post_dominate_all = false;
            break;
          }
        }
        if (!post_dominate_all)
          conditionalBarriers.push_back(pred);
      }
      // find any block which are not dominated by header
-      // but be posdiminated by merge point
+      // but be postdominated by merge point
      std::queue<llvm::BasicBlock *> if_body;
      std::set<llvm::BasicBlock *> visited_block;
      for (int i = 0; i < pred->getTerminator()->getNumSuccessors(); i++) {
@ -234,19 +245,26 @@ public:
            PDT->getPostDomTree().dominates(merge_point, curr)) {
          // we should insert barrier at the beginning and
          // end of its predecessor
          printf("insert [255]: %s\n", curr->getName().str().c_str());
          if (has_warp_barrier(b)) {
            CreateIntraWarpBarrier(&(*curr->begin()));
            for (BasicBlock *Pred : predecessors(curr)) {
              printf("insert [262]: %s\n", Pred->getName().str().c_str());
              CreateIntraWarpBarrier(&(*Pred->getTerminator()));
            }
          } else {
            CreateInterWarpBarrier(&(*curr->begin()));
            for (BasicBlock *Pred : predecessors(curr)) {
              printf("insert [268]: %s\n", Pred->getName().str().c_str());
              CreateInterWarpBarrier(&(*Pred->getTerminator()));
            }
          }
        }
        for (int i = 0; i < curr->getTerminator()->getNumSuccessors(); i++) {
          // avoid backedge
          if (DT->dominates(curr->getTerminator()->getSuccessor(i), pred)) {
            continue;
          }
          if_body.push(curr->getTerminator()->getSuccessor(i));
        }
      }
@ -266,6 +284,32 @@ public:
    AU.addRequired<DominatorTreeWrapperPass>();
  }
  // Breadth-first search, starting from the successors of `start`, for the
  // first visited block that post-dominates `start` according to PDT; that
  // block is returned as the merge point. Asserts that `start` has exactly two
  // successors, and asserts (then returns NULL) when the search is exhausted
  // without finding such a block.
  BasicBlock *find_merge_point(BasicBlock *start, PostDominatorTree &PDT) {
    assert(start->getTerminator()->getNumSuccessors() == 2);
    std::queue<llvm::BasicBlock *> worklist;
    std::set<llvm::BasicBlock *> seen;
    auto *start_term = start->getTerminator();
    for (unsigned idx = 0; idx < start_term->getNumSuccessors(); ++idx)
      worklist.push(start_term->getSuccessor(idx));
    while (!worklist.empty()) {
      BasicBlock *candidate = worklist.front();
      worklist.pop();
      // insert().second is false when the block was already processed
      if (!seen.insert(candidate).second)
        continue;
      if (PDT.dominates(candidate, start))
        return candidate;
      auto *term = candidate->getTerminator();
      for (unsigned idx = 0; idx < term->getNumSuccessors(); ++idx) {
        BasicBlock *next = term->getSuccessor(idx);
        if (seen.count(next) == 0)
          worklist.push(next);
      }
    }
    assert(0 && "Do not find merge point\n");
    return NULL;
  }
  virtual bool runOnFunction(Function &F) {
    if (!isKernelFunction(F.getParent(), &F))
      return 0;
@ -280,18 +324,8 @@ public:
    for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i) {
      BasicBlock *b = &*i;
      BasicBlock *merge_point = NULL;
      if (b->getTerminator()->getNumSuccessors() == 2) {
-        auto b1 = b->getTerminator()->getSuccessor(0);
+        auto merge_point = find_merge_point(b, PDT->getPostDomTree());
        auto b2 = b->getTerminator()->getSuccessor(1);
        if (PDT->getPostDomTree().dominates(b1, b2)) {
          merge_point = b1;
        } else if (PDT->getPostDomTree().dominates(b2, b2)) {
          merge_point = b2;
        } else {
          assert(0 && "find complex if-else branch\n");
        }
        std::cout << std::flush;
        for (BasicBlock *Pred : predecessors(merge_point)) {
          if (!DT->dominates(b, Pred)) {
            // we need to insert an extra block to be the merge point
@ -305,14 +339,8 @@ public:
    auto M = F.getParent();
    for (auto head : if_head) {
      assert(head->getTerminator()->getNumSuccessors() == 2);
-      BasicBlock *merge_point = NULL;
+      BasicBlock *merge_point = find_merge_point(head, PDT->getPostDomTree());
-      auto s1 = head->getTerminator()->getSuccessor(0);
+      assert(PDT->getPostDomTree().dominates(merge_point, head));
      auto s2 = head->getTerminator()->getSuccessor(1);
      if (PDT->getPostDomTree().dominates(s1, s2)) {
        merge_point = s1;
      } else {
        merge_point = s2;
      }
      if (!find_barrier_in_region(head, merge_point)) {
        printf("do not need to handle tri-income if: %s\n",
               merge_point->getName().str().c_str());
@ -368,6 +396,8 @@ public:
      for (BasicBlock::iterator j = (*i)->begin(), e = (*i)->end(); j != e;
           ++j) {
        if (auto Call = dyn_cast<CallInst>(j)) {
          if (Call->isInlineAsm())
            continue;
          auto func_name = Call->getCalledFunction()->getName().str();
          if (func_name == "llvm.nvvm.barrier0" ||
              func_name == "llvm.nvvm.bar.warp.sync" ||
@ -383,7 +413,7 @@ public:
    }
    if (!is_conditional_loop)
      return 0;
-    // insert barrier at the beginning of header
+    // insert barrier at the beginning of header (for_cond)
    // and the end of pre header, so that we can get a
    // single block connected with latch
    if (!is_warp) {
@ -399,17 +429,40 @@ public:
    }
    // as we assume all loops are rotated, we have to insert
-    // barrier before the condition jump of the loop exit
+    // barrier before the condition jump of the for_cond
-
+    if (auto for_cond = L->getExitingBlock()) {
-    if (auto exit_block = L->getExitingBlock()) {
+      assert(for_cond->getTerminator()->getNumSuccessors() == 2 &&
             "has more than 2 successors of the for-cond\n");
      auto conditional_br =
-          dyn_cast<llvm::BranchInst>(exit_block->getTerminator());
+          dyn_cast<llvm::BranchInst>(for_cond->getTerminator());
      assert(conditional_br && conditional_br->isConditional());
-      // insert barrier at the beginning of successor of exit
+      // insert barrier before the condition jump of the loop cond
      if (!is_warp)
        CreateInterWarpBarrier(conditional_br);
      else
        CreateIntraWarpBarrier(conditional_br);
      // insert barrier before the for_body
      auto for_body = for_cond->getTerminator()->getSuccessor(0);
      if (for_body == L->getExitBlock()) {
        for_body = for_cond->getTerminator()->getSuccessor(1);
      }
      // insert at the beginning of for_body
      if (!is_warp)
        CreateInterWarpBarrier(&(*for_body->begin()));
      else
        CreateIntraWarpBarrier(&(*for_body->begin()));
      // insert at the beginning and end in for_inc block
      if (auto for_inc = L->getLoopLatch()) {
        if (!is_warp) {
          CreateInterWarpBarrier(&(*for_inc->begin()));
          CreateInterWarpBarrier(for_inc->getTerminator());
        } else {
          CreateIntraWarpBarrier(&(*for_inc->begin()));
          CreateIntraWarpBarrier(for_inc->getTerminator());
        }
      } else {
        assert(0 && "has continue in a barrier loop\n");
      }
    } else {
      // handle break in for-loop
      printf("loop has multiply exists\n");
--- a/compilation/KernelTranslation/lib/insert_warp_loop.cpp
+++ b/compilation/KernelTranslation/lib/insert_warp_loop.cpp
@ -67,9 +67,15 @@ std::map<std::string, llvm::Instruction *> contextArrays;
 int tempInstructionIndex = 0;
 int need_nested_loop;
 // add support for multiple kernels in a single file
 bool ShouldNotBeContextSaved(llvm::Instruction *instr) {
  if (isa<BranchInst>(instr))
    return true;
  // if (isa<AddrSpaceCastInst>(instr))
  //   return true;
  // if (isa<CastInst>(instr))
  //   return true;
  llvm::Module *M = instr->getParent()->getParent()->getParent();
  llvm::LoadInst *load = dyn_cast<llvm::LoadInst>(instr);
@ -111,6 +117,7 @@ llvm::Instruction *GetContextArray(llvm::Instruction *instruction,
    return contextArrays[varName];
  BasicBlock &bb = instruction->getParent()->getParent()->getEntryBlock();
  IRBuilder<> builder(&*(bb.getFirstInsertionPt()));
  Function *FF = instruction->getParent()->getParent();
  Module *M = instruction->getParent()->getParent()->getParent();
@ -127,6 +134,7 @@ llvm::Instruction *GetContextArray(llvm::Instruction *instruction,
  Type *AllocType = elementType;
  AllocaInst *InstCast = dyn_cast<AllocaInst>(instruction);
  /*
  if (InstCast) {
    unsigned Alignment = InstCast->getAlignment();
@ -166,7 +174,7 @@ llvm::Instruction *GetContextArray(llvm::Instruction *instruction,
      }
    }
  }
-
+  */
  llvm::Value *ItemSize = nullptr;
  llvm::AllocaInst *Alloca = nullptr;
@ -354,13 +362,36 @@ void handle_local_variable_intra_warp(std::vector<ParallelRegion> PRs) {
    auto F = PRs[0].start_block->getParent();
    for (auto bb = F->begin(); bb != F->end(); bb++) {
      for (auto ii = bb->begin(); ii != bb->end(); ii++) {
-        if (isa<AllocaInst>(&(*ii)))
+        if (isa<AllocaInst>(&(*ii))) {
-          instruction_to_fix.push_back(&(*ii));
+          auto alloc = dyn_cast<AllocaInst>(&(*ii));
-      }
+          // Do not duplicate var used outside PRs
-      for (auto inst : instruction_to_fix) {
+          bool used_in_non_PR = false;
-        AddContextSaveRestore(inst, intra_warp_loop);
+          for (Instruction::use_iterator ui = alloc->use_begin(),
                                         ue = alloc->use_end();
               ui != ue; ++ui) {
            llvm::Instruction *user = dyn_cast<Instruction>(ui->getUser());
            auto user_block = user->getParent();
            bool find_in_PR = false;
            for (auto PR : PRs) {
              if (PR.wrapped_block.find(user_block) != PR.wrapped_block.end()) {
                find_in_PR = true;
                break;
              }
            }
            if (find_in_PR == false) {
              used_in_non_PR = true;
              break;
            }
          }
          if (!used_in_non_PR) {
            instruction_to_fix.push_back(alloc);
          }
        }
      }
    }
    for (auto inst : instruction_to_fix) {
      AddContextSaveRestore(inst, intra_warp_loop);
    }
  }
  for (auto parallel_regions : PRs) {
@ -380,10 +411,8 @@ void handle_local_variable_intra_warp(std::vector<ParallelRegion> PRs) {
      for (llvm::BasicBlock::iterator instr = bb->begin(); instr != bb->end();
           ++instr) {
        llvm::Instruction *instruction = &*instr;
        if (ShouldNotBeContextSaved(instruction))
          continue;
        for (Instruction::use_iterator ui = instruction->use_begin(),
                                       ue = instruction->use_end();
             ui != ue; ++ui) {
@ -582,6 +611,8 @@ void remove_barrier(llvm::Function *F, bool intra_warp_loop) {
  for (auto BB = F->begin(); BB != F->end(); ++BB) {
    for (auto BI = BB->begin(); BI != BB->end(); BI++) {
      if (auto Call = dyn_cast<CallInst>(BI)) {
        if (Call->isInlineAsm())
          continue;
        auto func_name = Call->getCalledFunction()->getName().str();
        if (func_name == "llvm.nvvm.bar.warp.sync") {
          need_remove.push_back(Call);
@ -648,6 +679,8 @@ public:
      bool has_barrier = 0;
      for (auto i = current->begin(), e = current->end(); i != e; ++i) {
        if (llvm::CallInst *call_inst = llvm::dyn_cast<llvm::CallInst>(&(*i))) {
          if (call_inst->isInlineAsm())
            continue;
          auto func_name = call_inst->getCalledFunction()->getName().str();
          if (func_name == "llvm.nvvm.barrier0" ||
              func_name == "llvm.nvvm.barrier.sync")
@ -761,6 +794,8 @@ public:
    for (Function::iterator s = F->begin(); s != F->end(); s++) {
      if (llvm::CallInst *call_inst =
              llvm::dyn_cast<llvm::CallInst>(s->begin())) {
        if (call_inst->isInlineAsm())
          continue;
        auto func_name = call_inst->getCalledFunction()->getName().str();
        if (func_name == "llvm.nvvm.barrier0" ||
            func_name == "llvm.nvvm.barrier.sync") {
@ -787,6 +822,12 @@ public:
    if (!isKernelFunction(F.getParent(), &F))
      return 0;
    auto func_name = (&F)->getName().str();
    // clear context array, temp variables for new kernel function
    contextArrays.clear();
    tempInstructionIds.clear();
    tempInstructionIndex = 0;
    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    PDT = &getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
@ -794,11 +835,11 @@ public:
    auto parallel_regions = getParallelRegions(&F, intra_warp_loop);
    assert(!parallel_regions.empty() && "can not find any parallel regions\n");
    // print_parallel_region(parallel_regions);
    add_warp_loop(parallel_regions, intra_warp_loop);
    if (intra_warp_loop) {
      handle_local_variable_intra_warp(parallel_regions);
    }
    add_warp_loop(parallel_regions, intra_warp_loop);
    remove_barrier(&F, intra_warp_loop);
    return 1;
  }
@ -816,6 +857,8 @@ bool has_warp_barrier(llvm::Module *M) {
    for (auto BB = F->begin(); BB != F->end(); ++BB) {
      for (auto BI = BB->begin(); BI != BB->end(); BI++) {
        if (auto Call = dyn_cast<CallInst>(BI)) {
          if (Call->isInlineAsm())
            continue;
          auto func_name = Call->getCalledFunction()->getName().str();
          if (func_name == "llvm.nvvm.bar.warp.sync") {
            return true;
@ -841,8 +884,8 @@ void insert_warp_loop(llvm::Module *M) {
    // only need a single loop, with size=block_size
    Passes.add(new InsertWarpLoopPass(intra_warp));
    Passes.run(*M);
    // remove all barriers
    for (auto F = M->begin(); F != M->end(); ++F)
      remove_barrier(dyn_cast<llvm::Function>(F), false);
  }
  // remove all barriers
  for (auto F = M->begin(); F != M->end(); ++F)
    remove_barrier(dyn_cast<llvm::Function>(F), false);
 }
--- a/compilation/KernelTranslation/lib/memory_hierarchy.cpp
+++ b/compilation/KernelTranslation/lib/memory_hierarchy.cpp
@ -9,6 +9,8 @@
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
 #include <assert.h>
 #include <fstream>
 #include <iostream>
 #include <map>
 #include <set>
 #include <sstream>
@ -36,15 +38,35 @@ void mem_share2global(llvm::Module *M) {
          auto new_name = "wrapper_global_" + share_memory->getName().str();
          auto element_type = PT->getElementType();
          if (auto array_type = dyn_cast<ArrayType>(element_type)) {
-            llvm::GlobalVariable *global_memory = new llvm::GlobalVariable(
+            if (share_memory->hasExternalLinkage() &&
-                *M, array_type, false, llvm::GlobalValue::ExternalLinkage, NULL,
+                array_type->getArrayNumElements() == 0) {
-                new_name, NULL, llvm::GlobalValue::GeneralDynamicTLSModel, 1);
+              // external shared memory of []
-            ConstantAggregateZero *const_array =
+              // generate global type pointer
-                ConstantAggregateZero::get(array_type);
+              PointerType *PointerTy =
-            global_memory->setInitializer(const_array);
+                  PointerType::get(array_type->getElementType(), 0);
-            corresponding_global_memory.insert(
+              llvm::Constant *x1 = ConstantPointerNull::get(PointerTy);
-                std::pair<GlobalVariable *, GlobalVariable *>(share_memory,
+              llvm::GlobalVariable *global_ptr = new llvm::GlobalVariable(
-                                                              global_memory));
+                  *M, PointerTy, false, llvm::GlobalValue::CommonLinkage, x1,
                  "wrapper_global_data", NULL,
                  llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
              global_ptr->setDSOLocal(true);
              corresponding_global_memory.insert(
                  std::pair<GlobalVariable *, GlobalVariable *>(share_memory,
                                                                global_ptr));
            } else {
              llvm::GlobalVariable *global_memory = new llvm::GlobalVariable(
                  *M, array_type, false, llvm::GlobalValue::ExternalLinkage,
                  NULL, new_name, NULL,
                  llvm::GlobalValue::GeneralDynamicTLSModel, 1);
              ConstantAggregateZero *const_array =
                  ConstantAggregateZero::get(array_type);
              global_memory->setInitializer(const_array);
              corresponding_global_memory.insert(
                  std::pair<GlobalVariable *, GlobalVariable *>(share_memory,
                                                                global_memory));
            }
          } else if (auto int_type = dyn_cast<IntegerType>(element_type)) {
            auto zero = llvm::ConstantInt::get(int_type, 0, true);
            llvm::GlobalVariable *global_memory = new llvm::GlobalVariable(
@ -54,6 +76,16 @@ void mem_share2global(llvm::Module *M) {
            corresponding_global_memory.insert(
                std::pair<GlobalVariable *, GlobalVariable *>(share_memory,
                                                              global_memory));
          } else if (element_type->isFloatTy()) {
            auto FP_type = llvm::Type::getFloatTy(*C);
            auto zero = llvm::ConstantFP::get(FP_type, 0);
            llvm::GlobalVariable *global_memory = new llvm::GlobalVariable(
                *M, FP_type, false, llvm::GlobalValue::ExternalLinkage, zero,
                new_name, NULL, llvm::GlobalValue::GeneralDynamicTLSModel, 0,
                false);
            corresponding_global_memory.insert(
                std::pair<GlobalVariable *, GlobalVariable *>(share_memory,
                                                              global_memory));
          } else {
            assert(0 && "The required Share Memory Type is not supported\n");
          }
@ -62,57 +94,11 @@ void mem_share2global(llvm::Module *M) {
    }
  }
-  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
+  for (auto k : corresponding_global_memory) {
-    Function *F = &(*i);
+    auto share_addr = k.first;
-    for (Function::iterator i = F->begin(), e = F->end(); i != e; ++i) {
+    auto global_addr = k.second;
-      BasicBlock *b = &*i;
+    share_addr->replaceAllUsesWith(ConstantExpr::getPointerCast(
-      for (BasicBlock::iterator i = b->begin(), e = b->end(); i != e; ++i) {
+        global_addr, cast<PointerType>(share_addr->getType())));
        if (auto get_element_ptr = dyn_cast<llvm::GetElementPtrInst>(i)) {
          auto read_array = get_element_ptr->getPointerOperand();
          if (GlobalVariable *read_share_memory =
                  dyn_cast<llvm::GlobalVariable>(read_array)) {
            // find a GetElementPtr which read share memory
            if (corresponding_global_memory.find(read_share_memory) !=
                corresponding_global_memory.end()) {
              std::vector<Value *> Indices;
              for (int i = 0; i < get_element_ptr->getNumIndices(); i++)
                Indices.push_back(get_element_ptr->getOperand(i + 1));
              auto new_GEP = GetElementPtrInst::Create(
                  NULL, // Pointee type
                  corresponding_global_memory.find(read_share_memory)
                      ->second, // Alloca
                  Indices,      // Indices
                  "", get_element_ptr);
              // replace all get_element_ptr with new_GEP:
              // we can not directly use:
              // get_element_ptr->replaceAllUsesWith(new_GEP);
              // as get_element_ptr and new_GEP have different return type
              llvm::Type *original_type = get_element_ptr->getType();
              auto FormatASC = CastInst::CreatePointerBitCastOrAddrSpaceCast(
                  new_GEP, original_type, "", get_element_ptr);
              get_element_ptr->replaceAllUsesWith(FormatASC);
              need_remove.insert(get_element_ptr);
            }
          }
        } else if (auto addr_cast = dyn_cast<llvm::CastInst>(i)) {
          auto read_array = addr_cast->getOperand(0);
          if (GlobalVariable *read_share_memory =
                  dyn_cast<llvm::GlobalVariable>(read_array)) {
            // find a GetElementPtr which read share memory
            if (corresponding_global_memory.find(read_share_memory) !=
                corresponding_global_memory.end()) {
              llvm::Type *original_type = addr_cast->getType();
              auto FormatASC = CastInst::CreatePointerBitCastOrAddrSpaceCast(
                  corresponding_global_memory.find(read_share_memory)->second,
                  original_type, "", addr_cast);
              addr_cast->replaceAllUsesWith(FormatASC);
              need_remove.insert(addr_cast);
            }
          }
        }
      }
    }
  }
  for (auto i : need_remove) {
@ -124,3 +110,83 @@ void mem_share2global(llvm::Module *M) {
    i->eraseFromParent();
  }
 }
 /*
  * Replace every global variable living in address space 4 (constant memory)
  * with a newly created global in address space 0, and record the name mapping.
  *
  * M:    module to rewrite in place.
  * fout: receives a "ConstMemory2GlobalMemory" header, one
  *       "<old name> to <new name>" line per replaced variable, then "END".
  *
  * Supported pointee types are arrays and structs. An externally linked
  * zero-length array is replaced by a null-initialized pointer global named
  * "wrapper_global_data"; everything else becomes an external global named
  * "wrapper_global_<old name>" without an initializer. Any other pointee type
  * trips the assert; when asserts are compiled out the variable is left
  * untouched instead of being erased while it still has users.
  */
 void mem_constant2global(llvm::Module *M, std::ofstream &fout) {
  // original constant-memory variable -> its replacement
  std::map<GlobalVariable *, GlobalVariable *> corresponding_global_memory;
  // find all constant memory and generate corresponding global memory
  for (auto I = M->global_begin(), E = M->global_end(); I != E; ++I) {
    GlobalVariable *constant_memory = &*I;
    auto PT = dyn_cast<PointerType>(constant_memory->getType());
    if (!PT || PT->getAddressSpace() != 4) // not a constant memory variable
      continue;
    // generate the corresponding global memory variable
    auto new_name = "wrapper_global_" + constant_memory->getName().str();
    auto element_type = PT->getElementType();
    llvm::GlobalVariable *replacement = nullptr;
    if (auto array_type = dyn_cast<ArrayType>(element_type)) {
      if (constant_memory->hasExternalLinkage() &&
          array_type->getArrayNumElements() == 0) {
        // external constant memory of []:
        // generate a null-initialized pointer to the element type
        PointerType *PointerTy =
            PointerType::get(array_type->getElementType(), 0);
        llvm::Constant *null_ptr = ConstantPointerNull::get(PointerTy);
        replacement = new llvm::GlobalVariable(
            *M, PointerTy, false, llvm::GlobalValue::ExternalLinkage, null_ptr,
            "wrapper_global_data", nullptr, llvm::GlobalValue::NotThreadLocal,
            0, true);
      } else {
        replacement = new llvm::GlobalVariable(
            *M, array_type, false, llvm::GlobalValue::ExternalLinkage, nullptr,
            new_name, nullptr, llvm::GlobalValue::NotThreadLocal, 0);
      }
    } else if (element_type->isStructTy()) {
      replacement = new llvm::GlobalVariable(
          *M, element_type, false, llvm::GlobalValue::ExternalLinkage, nullptr,
          new_name, nullptr, llvm::GlobalValue::NotThreadLocal, 0);
    }
    if (!replacement) {
      assert(0 && "The required Constant Memory Type is not supported\n");
      // without asserts: keep the variable, it has no replacement to use
      continue;
    }
    corresponding_global_memory.emplace(constant_memory, replacement);
  }
  fout << "ConstMemory2GlobalMemory\n";
  for (const auto &k : corresponding_global_memory) {
    GlobalVariable *const_addr = k.first;
    GlobalVariable *global_addr = k.second;
    // the cast keeps the users' expected pointer type (address space 4)
    const_addr->replaceAllUsesWith(ConstantExpr::getPointerCast(
        global_addr, cast<PointerType>(const_addr->getType())));
    // this file will be used by host translator
    fout << const_addr->getName().str() << " to "
         << global_addr->getName().str() << std::endl;
  }
  fout << "END\n";
  // only variables that were actually replaced are removed from the module;
  // the map is used purely as a list here, its keys are never compared again
  for (const auto &k : corresponding_global_memory) {
    k.first->dropAllReferences();
    k.first->eraseFromParent();
  }
 }
--- a/compilation/KernelTranslation/lib/tool.cpp
+++ b/compilation/KernelTranslation/lib/tool.cpp
@ -1,5 +1,6 @@
 #include "tool.h"
 #include "llvm/Bitcode/BitcodeWriter.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/IRBuilder.h"
@ -187,7 +188,52 @@ void remove_cuda_built_in(llvm::Module *M) {
  }
 }
-void replace_built_in_function(llvm::Module *M, int *grid_dim, int *block_dim) {
+// copied from POCL
 // Converts every ConstantExpr that uses Val, directly or through other
 // constant expressions, into an instruction inserted in front of the first
 // instruction of Func's first basic block, and destroys the constant.
 static void breakConstantExpressions(llvm::Value *Val, llvm::Function *Func) {
  // Work on a snapshot: the rewriting below changes Val's user list.
  std::vector<llvm::Value *> Pending(Val->user_begin(), Val->user_end());
  for (llvm::Value *Candidate : Pending) {
    auto *Expr = llvm::dyn_cast<llvm::ConstantExpr>(Candidate);
    if (!Expr)
      continue;
    // Constant expressions built on top of this one have to go first, so that
    // no constant is left using Expr when it gets replaced.
    breakConstantExpressions(Expr, Func);
    // Materialize the expression as an instruction at the top of Func.
    llvm::Instruction *Lowered = Expr->getAsInstruction();
    Lowered->insertBefore(&*Func->begin()->begin());
    Expr->replaceAllUsesWith(Lowered);
    Expr->destroyConstant();
  }
 }
 // For each kernel function in M: turn constant expressions over the module's
 // globals into instructions, then load the pointer stored in the global
 // "dynamic_shared_memory" at the top of the kernel and redirect the global's
 // users to a bitcast of that loaded pointer. Stops at the first kernel
 // visited when the module has no "dynamic_shared_memory" global.
 void replace_dynamic_shared_memory(llvm::Module *M) {
  for (auto &Fn : *M) {
    Function *Kernel = &Fn;
    if (!isKernelFunction(M, Kernel))
      continue;
    // constant expressions cannot refer to the instructions created below
    for (auto &GV : M->globals())
      breakConstantExpressions(&GV, Kernel);
    auto smem_global = M->getGlobalVariable("dynamic_shared_memory");
    if (!smem_global)
      return;
    // resulting order at the top of the kernel: load, bitcast, old first inst
    auto *first_inst = &*Kernel->begin()->begin();
    auto smem_load = new LoadInst(smem_global, "new_load");
    auto smem_cast =
        new BitCastInst(smem_load, smem_global->getType(), "new_bit_cast");
    smem_load->insertBefore(first_inst);
    smem_cast->insertBefore(first_inst);
    // NOTE(review): only the two new instructions are excluded here; uses in
    // other functions are presumably redirected to this kernel's bitcast as
    // well - confirm this is intended when a module has several kernels.
    smem_global->replaceUsesWithIf(smem_cast, [&](Use &U) {
      auto *user_inst = dyn_cast<Instruction>(U.getUser());
      return !(user_inst == smem_cast || user_inst == smem_load);
    });
  }
 }
 void replace_built_in_function(llvm::Module *M) {
  LLVMContext &context = M->getContext();
  auto I32 = llvm::Type::getInt32Ty(context);
  std::vector<llvm::Instruction *> need_remove;
@ -203,28 +249,60 @@ void replace_built_in_function(llvm::Module *M, int *grid_dim, int *block_dim) {
    auto local_intra_warp_idx =
        builder.CreateAlloca(global_intra_warp_idx->getType()->getElementType(),
                             0, "local_intra_warp_idx");
-    global_intra_warp_idx->replaceAllUsesWith(local_intra_warp_idx);
+    global_intra_warp_idx->replaceUsesWithIf(local_intra_warp_idx, [&](Use &U) {
      auto *Instr = dyn_cast<Instruction>(U.getUser());
      return Instr->getParent()->getParent()->getName().str() == func_name;
    });
    auto global_inter_warp_idx =
        F->getParent()->getGlobalVariable("inter_warp_index");
    auto local_inter_warp_idx =
        builder.CreateAlloca(global_inter_warp_idx->getType()->getElementType(),
                             0, "local_inter_warp_idx");
-    global_inter_warp_idx->replaceAllUsesWith(local_inter_warp_idx);
+
    builder.CreateStore(ConstantInt::get(I32, 0), local_inter_warp_idx);
    global_inter_warp_idx->replaceUsesWithIf(local_inter_warp_idx, [&](Use &U) {
      auto *Instr = dyn_cast<Instruction>(U.getUser());
      return Instr->getParent()->getParent()->getName().str() == func_name;
    });
    for (auto BB = F->begin(); BB != F->end(); ++BB) {
      for (auto BI = BB->begin(); BI != BB->end(); BI++) {
        if (auto Load = dyn_cast<LoadInst>(BI)) {
          auto load_from = Load->getOperand(0);
          if (load_from == F->getParent()->getGlobalVariable("block_size")) {
            Load->replaceAllUsesWith(ConstantInt::get(
                I32, block_dim[0] * block_dim[1] * block_dim[2]));
            need_remove.push_back(Load);
          }
        } else if (auto Call = dyn_cast<CallInst>(BI)) {
          if (Call->getCalledFunction()) {
            auto func_name = Call->getCalledFunction()->getName().str();
-            if (func_name == "llvm.nvvm.read.ptx.sreg.tid.x") {
+            if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.x" ||
                func_name ==
                    "_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv") {
              auto block_size_addr = M->getGlobalVariable("block_size_x");
              IRBuilder<> builder(context);
              builder.SetInsertPoint(Call);
              auto val = builder.CreateLoad(block_size_addr);
              Call->replaceAllUsesWith(val);
              need_remove.push_back(Call);
            } else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.y") {
              auto block_size_addr = M->getGlobalVariable("block_size_y");
              IRBuilder<> builder(context);
              builder.SetInsertPoint(Call);
              auto val = builder.CreateLoad(block_size_addr);
              Call->replaceAllUsesWith(val);
              need_remove.push_back(Call);
            } else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.z") {
              auto block_size_addr = M->getGlobalVariable("block_size_z");
              IRBuilder<> builder(context);
              builder.SetInsertPoint(Call);
              auto val = builder.CreateLoad(block_size_addr);
              Call->replaceAllUsesWith(val);
              need_remove.push_back(Call);
            } else if (func_name == "llvm.nvvm.read.ptx.sreg.tid.x" ||
                       func_name == "_ZN26__cuda_builtin_threadIdx_t17__fetch_"
                                    "builtin_xEv") {
              // replace it by warp_id
              IRBuilder<> builder(context);
              builder.SetInsertPoint(Call);
@ -234,12 +312,11 @@ void replace_built_in_function(llvm::Module *M, int *grid_dim, int *block_dim) {
              thread_idx = builder.CreateBinOp(
                  Instruction::Add, builder.CreateLoad(local_intra_warp_idx),
                  thread_idx, "thread_idx");
-              if (block_dim[1] != 1 || block_dim[2] != 1) {
+
-                printf("block y: %d block z: %d\n", block_dim[1], block_dim[2]);
+              thread_idx = builder.CreateBinOp(
-                thread_idx = builder.CreateBinOp(
+                  Instruction::SRem, thread_idx,
-                    Instruction::SRem, thread_idx,
+                  builder.CreateLoad(M->getGlobalVariable("block_size_x")),
-                    ConstantInt::get(I32, block_dim[0]), "thread_id_x");
+                  "thread_id_x");
              }
              Call->replaceAllUsesWith(thread_idx);
              need_remove.push_back(Call);
@ -257,63 +334,61 @@ void replace_built_in_function(llvm::Module *M, int *grid_dim, int *block_dim) {
              // tidy = tid / block_dim.x
              thread_idx = builder.CreateBinOp(
                  Instruction::SDiv, thread_idx,
-                  ConstantInt::get(I32, block_dim[0]),
+                  builder.CreateLoad(M->getGlobalVariable("block_size_x")),
                  // builder.CreateLoad(M->getGlobalVariable("block_size_x")),
                  "thread_id_y");
              Call->replaceAllUsesWith(thread_idx);
              need_remove.push_back(Call);
            } else if (func_name == "llvm.nvvm.read.ptx.sreg.tid.z") {
-              printf("[WARNING] We DO NOT support multi-dim block\n");
+              printf("[WARNING] We DO NOT support triple-dim block\n");
              exit(1);
              auto zero = ConstantInt::get(I32, 0);
              Call->replaceAllUsesWith(zero);
              need_remove.push_back(Call);
-            } else if (func_name == "llvm.nvvm.read.ptx.sreg.ctaid.x") {
+            } else if (func_name == "llvm.nvvm.read.ptx.sreg.ctaid.x" ||
-              auto block_index_addr = M->getGlobalVariable("block_index");
+                       func_name == "_ZN25__cuda_builtin_blockIdx_t17__fetch_"
                                    "builtin_xEv") {
              auto block_index_addr = M->getGlobalVariable("block_index_x");
              IRBuilder<> builder(context);
              builder.SetInsertPoint(Call);
              auto block_idx = builder.CreateLoad(block_index_addr);
              Call->replaceAllUsesWith(block_idx);
              need_remove.push_back(Call);
-            } else if (func_name == "llvm.nvvm.read.ptx.sreg.ctaid.y" ||
+            } else if (func_name == "llvm.nvvm.read.ptx.sreg.ctaid.y") {
-                       func_name == "llvm.nvvm.read.ptx.sreg.ctaid.z") {
+              auto block_index_addr = M->getGlobalVariable("block_index_y");
              printf("[WARNING We DO NOT support multi-dim grid\n");
              auto zero = ConstantInt::get(I32, 0);
              Call->replaceAllUsesWith(zero);
              need_remove.push_back(Call);
            } else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.x") {
              auto block_size_addr = M->getGlobalVariable("block_size_x");
              IRBuilder<> builder(context);
              builder.SetInsertPoint(Call);
-              auto block_size = ConstantInt::get(I32, block_dim[0]);
+              auto block_idx = builder.CreateLoad(block_index_addr);
-              Call->replaceAllUsesWith(block_size);
+              Call->replaceAllUsesWith(block_idx);
              need_remove.push_back(Call);
-            } else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.y") {
+            } else if (func_name == "llvm.nvvm.read.ptx.sreg.ctaid.z") {
-              auto block_size_addr = M->getGlobalVariable("block_size_y");
+              auto block_index_addr = M->getGlobalVariable("block_index_z");
              IRBuilder<> builder(context);
              builder.SetInsertPoint(Call);
-              auto block_size = ConstantInt::get(I32, block_dim[1]);
+              auto block_idx = builder.CreateLoad(block_index_addr);
-              Call->replaceAllUsesWith(block_size);
+              Call->replaceAllUsesWith(block_idx);
              need_remove.push_back(Call);
-            } else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.z") {
+            } else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.x" ||
-              auto block_size_addr = M->getGlobalVariable("block_size_z");
+                       func_name == "_ZN24__cuda_builtin_gridDim_t17__fetch_"
                                    "builtin_xEv") {
              auto grid_size_addr = M->getGlobalVariable("grid_size_x");
              IRBuilder<> builder(context);
              builder.SetInsertPoint(Call);
-              auto block_size = ConstantInt::get(I32, block_dim[2]);
+              auto grid_size = builder.CreateLoad(grid_size_addr);
              Call->replaceAllUsesWith(block_size);
              need_remove.push_back(Call);
            } else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.x") {
              auto grid_size_addr = M->getGlobalVariable("grid_size");
              IRBuilder<> builder(context);
              builder.SetInsertPoint(Call);
              auto grid_size = ConstantInt::get(I32, grid_dim[0]);
              Call->replaceAllUsesWith(grid_size);
              need_remove.push_back(Call);
-            } else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.y" ||
+            } else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.y") {
-                       func_name == "llvm.nvvm.read.ptx.sreg.nctaid.z") {
+              auto grid_size_addr = M->getGlobalVariable("grid_size_y");
-              printf("[WARNING We DO NOT support multi-dim grid\n");
+              IRBuilder<> builder(context);
-              auto one = ConstantInt::get(I32, 1);
+              builder.SetInsertPoint(Call);
-              Call->replaceAllUsesWith(one);
+              auto grid_size = builder.CreateLoad(grid_size_addr);
              Call->replaceAllUsesWith(grid_size);
              need_remove.push_back(Call);
            } else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.z") {
              auto grid_size_addr = M->getGlobalVariable("grid_size_z");
              IRBuilder<> builder(context);
              builder.SetInsertPoint(Call);
              auto grid_size = builder.CreateLoad(grid_size_addr);
              Call->replaceAllUsesWith(grid_size);
              need_remove.push_back(Call);
            }
          }
@ -334,6 +409,98 @@ void replace_built_in_function(llvm::Module *M, int *grid_dim, int *block_dim) {
      }
    }
  }
  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
    Function *F = &(*i);
    for (auto BB = F->begin(); BB != F->end(); ++BB) {
      for (auto BI = BB->begin(); BI != BB->end(); BI++) {
        if (auto Call = dyn_cast<CallInst>(BI)) {
          if (Call->getCalledFunction()) {
            auto func_name = Call->getCalledFunction()->getName().str();
            auto callFn = Call->getCalledFunction();
            if (func_name == "vprintf") {
              /*
               * replace CUDA's printf to C's printf
               * CUDA:
               * %0 = tail call i32 @vprintf(i8* getelementptr inbounds ([19 x
               * i8], [19 x i8]* @.str, i64 0, i64 0), i8* null)
               * C: %call1 = call i32 (i8*, ...) @printf(i8* getelementptr
               * inbounds ([45 x i8], [45 x i8]* @.str.1, i64 0, i64 0))
               */
              // find/create C's printf function
              std::vector<llvm::Type *> args;
              args.push_back(llvm::Type::getInt8PtrTy(context));
              llvm::FunctionType *printfType =
                  FunctionType::get(I32, args, true);
              llvm::FunctionCallee _f =
                  M->getOrInsertFunction("printf", printfType);
              llvm::Function *func_printf =
                  llvm::cast<llvm::Function>(_f.getCallee());
              // construct argument(s)
              std::vector<Value *> printf_args;
              // first argument is same between CUDA and C
              auto placeholder = Call->getArgOperand(0);
              printf_args.push_back(placeholder);
              // insert arguments
              auto compressed_args = Call->getArgOperand(1);
              if (auto BC = dyn_cast<BitCastInst>(compressed_args)) {
                auto src_alloc = BC->getOperand(0);
                auto SrcPointTy =
                    dyn_cast<PointerType>(BC->getOperand(0)->getType());
                auto SrcTy = SrcPointTy->getElementType();
                // reverse the bitcast
                auto reverse_BC = new BitCastInst(BC, SrcPointTy, "", Call);
                assert(SrcTy->isStructTy() == 1);
                auto StructTy = dyn_cast<StructType>(SrcTy);
                for (int i = 0; i < StructTy->getNumElements(); i++) {
                  std::vector<Value *> Indices;
                  Indices.push_back(ConstantInt::get(I32, 0));
                  Indices.push_back(ConstantInt::get(I32, i));
                  auto new_GEP = GetElementPtrInst::Create(NULL, // Pointee type
                                                           src_alloc, // Alloca
                                                           Indices,   // Indices
                                                           "", Call);
                  auto new_load = new LoadInst(new_GEP, "", Call);
                  printf_args.push_back(new_load);
                }
              }
              auto c_printf_inst =
                  llvm::CallInst::Create(func_printf, printf_args, "", Call);
              // insert
              Call->replaceAllUsesWith(c_printf_inst);
              need_remove.push_back(Call);
            } else if (func_name == "__nv_fast_log2f" ||
                       func_name == "__nv_log2f" ||
                       func_name == "__nv_fast_powf" ||
                       func_name == "__nv_powf" || func_name == "__nv_logf" ||
                       func_name == "__nv_expf" || func_name == "__nv_fabsf" ||
                       func_name == "__nv_log10f" ||
                       func_name == "__nv_fmodf" || func_name == "__nv_sqrt" ||
                       func_name == "__nv_sqrtf" || func_name == "__nv_exp" ||
                       func_name == "__nv_isnanf" ||
                       func_name == "__nv_isinff" || func_name == "__nv_powi" ||
                       func_name == "__nv_powif") {
              Call->getCalledFunction()->deleteBody();
            } else if (func_name == "llvm.nvvm.fma.rn.d") {
              Call->getCalledFunction()->setName("__nvvm_fma_rn_d");
            } else if (func_name == "llvm.nvvm.d2i.lo") {
              Call->getCalledFunction()->setName("__nvvm_d2i_lo");
            } else if (func_name == "llvm.nvvm.d2i.hi") {
              Call->getCalledFunction()->setName("__nvvm_d2i_hi");
            } else if (func_name == "llvm.nvvm.add.rn.d") {
              Call->getCalledFunction()->setName("__nvvm_add_rn_d");
            } else if (func_name == "llvm.nvvm.lohi.i2d") {
              Call->getCalledFunction()->setName("__nvvm_lohi_i2d");
            } else if (func_name == "llvm.nvvm.fabs.f") {
              Call->getCalledFunction()->setName("__nvvm_fabs_f");
            } else if (func_name == "llvm.nvvm.mul24.i") {
              Call->getCalledFunction()->setName("__nvvm_mul24_i");
            }
          }
        }
      }
    }
  }
  for (auto inst : need_remove) {
    inst->eraseFromParent();
@ -382,6 +549,8 @@ bool has_warp_barrier(llvm::BasicBlock *B) {
    Instruction *inst = &(*i);
    llvm::CallInst *Call = llvm::dyn_cast<llvm::CallInst>(inst);
    if (Call) {
      if (Call->isInlineAsm())
        continue;
      auto func_name = Call->getCalledFunction()->getName().str();
      if (func_name == "llvm.nvvm.bar.warp.sync") {
        return true;
@ -396,6 +565,8 @@ bool has_barrier(llvm::BasicBlock *B) {
    Instruction *inst = &(*i);
    llvm::CallInst *Call = llvm::dyn_cast<llvm::CallInst>(inst);
    if (Call) {
      if (Call->isInlineAsm())
        continue;
      auto func_name = Call->getCalledFunction()->getName().str();
      if (func_name == "llvm.nvvm.barrier0" ||
          func_name == "llvm.nvvm.bar.warp.sync" ||
@ -412,6 +583,8 @@ bool has_block_barrier(llvm::BasicBlock *B) {
    Instruction *inst = &(*i);
    llvm::CallInst *Call = llvm::dyn_cast<llvm::CallInst>(inst);
    if (Call) {
      if (Call->isInlineAsm())
        continue;
      auto func_name = Call->getCalledFunction()->getName().str();
      if (func_name == "llvm.nvvm.barrier0" ||
          func_name == "llvm.nvvm.barrier.sync") {
@ -478,3 +651,21 @@ bool find_barrier_in_region(llvm::BasicBlock *start, llvm::BasicBlock *end) {
  }
  return 0;
 }
 /*
  Print IR to String Output for Debugging Purposes
 */
 // void printModule(llvm::Module *M) {
 //   std::string str;
 //   llvm::raw_string_ostream ss(str);
 //   std::cout << "### Printing Module ###" << std::endl;
 //   for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
 //     Function *F = &(*i);
 //     auto func_name = F->getName().str();
 //     std::cout << func_name << std::endl;
 //     for (Function::iterator b = F->begin(); b != F->end(); ++b) {
 //       BasicBlock *B = &(*b);
 //       errs() << *B;
 //     }
 //   }
 // }
--- a/compilation/KernelTranslation/lib/warp_func.cpp
+++ b/compilation/KernelTranslation/lib/warp_func.cpp
@ -44,6 +44,8 @@ void handle_warp_vote(llvm::Module *M) {
    for (Function::iterator E = F->end(); I != E; ++I) {
      for (BasicBlock::iterator BI = I->begin(); BI != I->end(); BI++) {
        if (CallInst *vote_any_sync = dyn_cast<CallInst>(BI)) {
          if (vote_any_sync->isInlineAsm())
            continue;
          auto func_name = vote_any_sync->getCalledFunction()->getName();
          if (func_name == "llvm.nvvm.vote.any.sync" ||
              func_name == "llvm.nvvm.vote.all.sync") {
--- a/compilation/examples/reduce/host.cpp
+++ b/compilation/examples/reduce/host.cpp
@ -1,82 +0,0 @@
 #include <assert.h>
 #include <pthread.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <sys/time.h>
 // Launch configuration of this example: warps per block and number of blocks.
 #define NUM_WARP 2
 #define NUM_BLOCK 1
 // Threads per block: NUM_WARP warps of 32 threads each.
 int block_size = 32 * NUM_WARP;
 // The block is one-dimensional: all threads are laid out along x.
 int block_size_x = block_size;
 int block_size_y = 1;
 int block_size_z = 1;
 // Per-thread block id; wrap() sets it before calling the kernel wrapper.
 __thread int block_index = 0;
 int grid_size = NUM_BLOCK;
 extern "C" {
 // Kernel wrapper defined outside this file; receives the packed argument
 // table built by gen_input().
 void *_Z7reduce0PiS_j_wrapper(void *);
 // Not referenced in this file; presumably scratch space for the translated
 // kernel's warp shuffle emulation - TODO confirm.
 __thread int warp_shfl[32];
 }
 // pthread entry point: publishes the block id found in slot 3 of the packed
 // argument table through the thread-local block_index, then runs the kernel
 // wrapper on the same table. Always returns NULL.
 void *wrap(void *p) {
  int **slots = static_cast<int **>(p);
  int *bid_slot = slots[3];
  block_index = *bid_slot;
  _Z7reduce0PiS_j_wrapper(p);
  return NULL;
 }
 // Builds the packed argument table handed to wrap(): a heap array of four
 // slots, each pointing at a heap copy of one argument, in the order
 // g_idata, g_odata, n, bid. Nothing is freed here; the caller owns the table.
 void *gen_input(int bid, int *g_idata, int *g_odata, unsigned int n) {
  // one heap cell per argument
  int **in_cell = new int *(g_idata);
  int **out_cell = new int *(g_odata);
  unsigned int *len_cell = new unsigned int(n);
  int *bid_cell = new int(bid);

  // the table stores every cell address as int*, whatever the cell holds
  int **table = new int *[4];
  table[0] = reinterpret_cast<int *>(in_cell);
  table[1] = reinterpret_cast<int *>(out_cell);
  table[2] = reinterpret_cast<int *>(len_cell);
  table[3] = bid_cell;
  return static_cast<void *>(table);
 }
 // Host driver of the reduce example: fills the input with 0..size-1, runs
 // one pthread per block on the kernel wrapper, and compares the first output
 // element against the sum computed on the host.
 int main(int argc, char *argv[]) {
  int size = block_size * NUM_BLOCK;
  // the input buffer holds 2 * size ints; only the first size are set here
  int *g_idata = new int[size * 2];
  int *res = new int[size];
  int fill = 0;
  while (fill < size) {
    g_idata[fill] = fill;
    ++fill;
  }

  pthread_t threads[NUM_BLOCK];
  void *inp[NUM_BLOCK];
  // pack the arguments of every block before any thread is started
  for (long b = 0; b < NUM_BLOCK; ++b)
    inp[b] = gen_input(b, g_idata, res, size);
  for (long b = 0; b < NUM_BLOCK; ++b)
    pthread_create(&threads[b], NULL, wrap, inp[b]);
  for (long b = 0; b < NUM_BLOCK; ++b)
    pthread_join(threads[b], NULL);

  // host-side reference result
  int gold = 0;
  for (int k = 0; k < size; ++k)
    gold += g_idata[k];

  assert(*res == gold && "Incorrect res\n");
  printf("PASS\n");
  pthread_exit(NULL);
 }
--- a/compilation/examples/reduce/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
+++ b/compilation/examples/reduce/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
@ -1,150 +0,0 @@
 ; ModuleID = 'kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc'
 source_filename = "kernel.cu"
 target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"
 %struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
@_ZZ7reduce0PiS_jE5sdata = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
 ; Function Attrs: nounwind
 define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) local_unnamed_addr #0 {
 entry:
  ret i32 999
 }
 ; Function Attrs: nounwind
 define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) local_unnamed_addr #0 {
 entry:
  ret i32 999
 }
 ; Function Attrs: nounwind
 define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) local_unnamed_addr #0 {
 entry:
  ret i32 999
 }
 ; Function Attrs: nounwind
 define weak dso_local i32 @cudaGetDevice(i32* %device) local_unnamed_addr #0 {
 entry:
  ret i32 999
 }
 ; Function Attrs: nounwind
 define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) local_unnamed_addr #0 {
 entry:
  ret i32 999
 }
 ; Function Attrs: nounwind
 define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) local_unnamed_addr #0 {
 entry:
  ret i32 999
 }
 ; Kernel reduce0(int *g_idata, int *g_odata, unsigned n).
 ; Each thread loads one element into the 64-entry shared array sdata, the
 ; block then combines entries with a doubling stride, and the thread with
 ; tid.x == 0 stores sdata[0] to g_odata[ctaid.x].
 ; Function Attrs: convergent nounwind
 define dso_local void @_Z7reduce0PiS_j(i32* nocapture readonly %g_idata, i32* nocapture %g_odata, i32 %n) local_unnamed_addr #1 {
 entry:
  ; %add = ntid.x * ctaid.x + tid.x (global element index)
  %0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #4, !range !10
  %1 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #4, !range !11
  %2 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #4, !range !12
  %mul = mul i32 %2, %1
  %add = add i32 %mul, %0
  %cmp = icmp ult i32 %add, %n
  br i1 %cmp, label %cond.true, label %cond.end
 cond.true:                                        ; preds = %entry
  %idxprom = zext i32 %add to i64
  %arrayidx = getelementptr inbounds i32, i32* %g_idata, i64 %idxprom
  %3 = load i32, i32* %arrayidx, align 4, !tbaa !13
  br label %cond.end
 cond.end:                                         ; preds = %entry, %cond.true
  ; sdata[tid.x] = (index < n) ? g_idata[index] : 0, then barrier
  %cond = phi i32 [ %3, %cond.true ], [ 0, %entry ]
  %idxprom5 = zext i32 %0 to i64
  %arrayidx635 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @_ZZ7reduce0PiS_jE5sdata, i64 0, i64 %idxprom5
  %arrayidx6 = addrspacecast i32 addrspace(3)* %arrayidx635 to i32*
  store i32 %cond, i32* %arrayidx6, align 4, !tbaa !13
  tail call void @llvm.nvvm.barrier.sync(i32 0) #4
  %cmp839 = icmp ugt i32 %2, 1
  br i1 %cmp839, label %for.body, label %for.cond.cleanup
 for.cond.cleanup:                                 ; preds = %if.end, %cond.end
  %cmp18 = icmp eq i32 %0, 0
  br i1 %cmp18, label %if.then19, label %if.end23
 for.body:                                         ; preds = %cond.end, %if.end
  ; stride %s.040 starts at 1 and doubles each iteration while 2*s < ntid.x
  %s.040 = phi i32 [ %mul9, %if.end ], [ 1, %cond.end ]
  %mul9 = shl nuw nsw i32 %s.040, 1
  %rem = urem i32 %0, %mul9
  %cmp10 = icmp eq i32 %rem, 0
  br i1 %cmp10, label %if.then, label %if.end
 if.then:                                          ; preds = %for.body
  ; threads with tid.x % (2*s) == 0 do sdata[tid.x] += sdata[tid.x + s]
  %add11 = add i32 %s.040, %0
  %idxprom12 = zext i32 %add11 to i64
  %arrayidx1336 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @_ZZ7reduce0PiS_jE5sdata, i64 0, i64 %idxprom12
  %arrayidx13 = addrspacecast i32 addrspace(3)* %arrayidx1336 to i32*
  %4 = load i32, i32* %arrayidx13, align 4, !tbaa !13
  %5 = load i32, i32* %arrayidx6, align 4, !tbaa !13
  %add16 = add nsw i32 %5, %4
  store i32 %add16, i32* %arrayidx6, align 4, !tbaa !13
  br label %if.end
 if.end:                                           ; preds = %if.then, %for.body
  tail call void @llvm.nvvm.barrier.sync(i32 0) #4
  %cmp8 = icmp ult i32 %mul9, %2
  br i1 %cmp8, label %for.body, label %for.cond.cleanup
 if.then19:                                        ; preds = %for.cond.cleanup
  ; tid.x == 0: g_odata[ctaid.x] = sdata[0]
  %idxprom21 = zext i32 %1 to i64
  %arrayidx22 = getelementptr inbounds i32, i32* %g_odata, i64 %idxprom21
  %6 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* addrspacecast ([64 x i32] addrspace(3)* @_ZZ7reduce0PiS_jE5sdata to [64 x i32]*), i64 0, i64 0), align 4, !tbaa !13
  store i32 %6, i32* %arrayidx22, align 4, !tbaa !13
  br label %if.end23
 if.end23:                                         ; preds = %if.then19, %for.cond.cleanup
  ret void
 }
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2
 ; Function Attrs: convergent nounwind
 declare void @llvm.nvvm.barrier.sync(i32) #3
 attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #2 = { nounwind readnone }
 attributes #3 = { convergent nounwind }
 attributes #4 = { nounwind }
 !llvm.module.flags = !{!0, !1, !2}
 !nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
 !llvm.ident = !{!8}
 !nvvmir.version = !{!9}
 !0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
 !1 = !{i32 1, !"wchar_size", i32 4}
 !2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
 !3 = !{void (i32*, i32*, i32)* @_Z7reduce0PiS_j, !"kernel", i32 1}
 !4 = !{null, !"align", i32 8}
 !5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
 !6 = !{null, !"align", i32 16}
 !7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
 !8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
 !9 = !{i32 1, i32 4}
 !10 = !{i32 0, i32 1024}
 !11 = !{i32 0, i32 2147483647}
 !12 = !{i32 1, i32 1025}
 !13 = !{!14, !14, i64 0}
 !14 = !{!"int", !15, i64 0}
 !15 = !{!"omnipotent char", !16, i64 0}
 !16 = !{!"Simple C++ TBAA"}
--- a/compilation/examples/reduce/run.sh
+++ b/compilation/examples/reduce/run.sh
@ -1,6 +0,0 @@
 #!/bin/bash
 # Build and run the reduce example:
 #   1. assemble the textual NVVM IR into bitcode,
 #   2. run kernelTranslator on it (the six trailing numbers are passed through
 #      unchanged; presumably grid and block dimensions — confirm),
 #   3. compile the translated bitcode to an object file,
 #   4. link it with the pthread host driver and run the result.
 # Abort on the first failing step so a stale ./test binary is never executed.
 set -e
 llvm-as kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
 ../../../build/compilation/kernelTranslator kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc 1 1 1 64 1 1
 llc --filetype=obj kernel.bc
 g++ host.cpp kernel.o -lpthread -o test
 ./test
--- a/compilation/examples/reduce_shuffle/host.cpp
+++ b/compilation/examples/reduce_shuffle/host.cpp
@ -1,82 +0,0 @@
 #include <assert.h>
 #include <pthread.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <sys/time.h>
 // Launch configuration: NUM_WARP groups of 32 threads per block and
 // NUM_BLOCK blocks; main() starts one host pthread per block.
 #define NUM_WARP 2
 #define NUM_BLOCK 1
 // Threads per block and its x/y/z split.
 // NOTE(review): presumably consumed by the translated kernel — confirm.
 int block_size = 32 * NUM_WARP;
 int block_size_x = block_size;
 int block_size_y = 1;
 int block_size_z = 1;
 // Thread-local block id, assigned in wrap() before the kernel wrapper runs.
 __thread int block_index = 0;
 int grid_size = NUM_BLOCK;
 extern "C" {
 // Kernel entry point defined in the separately built kernel.o.
 void *_Z7reduce5PiS_j_wrapper(void *);
 // Thread-local 32-int buffer exported with C linkage; unused in this file.
 // NOTE(review): presumably backs the emulated warp shuffle — confirm.
 __thread int warp_shfl[32];
 }
 // Thread start routine: reads the block id from slot 3 of the argument pack
 // produced by gen_input(), stores it in block_index, then runs the kernel.
 void *wrap(void *p) {
  int **pack = reinterpret_cast<int **>(p);
  int bid = *pack[3];
  block_index = bid;
  _Z7reduce5PiS_j_wrapper(p);
  return NULL;
 }
 // Builds the argument pack for one block: an array of four pointers whose
 // targets are freshly allocated cells holding, in order, the input pointer,
 // the output pointer, the element count and the block id. The caller owns
 // the allocations (this file never releases them).
 void *gen_input(int bid, int *g_idata, int *g_odata, unsigned int n) {
  int **in_cell = new int *(g_idata);
  int **out_cell = new int *(g_odata);
  unsigned int *count_cell = new unsigned int(n);
  int *bid_cell = new int(bid);

  int **pack = new int *[4];
  pack[0] = (int *)in_cell;
  pack[1] = (int *)out_cell;
  pack[2] = (int *)count_cell;
  pack[3] = bid_cell;
  return pack;
 }
 // Host driver for the reduce_shuffle example: initialises the input to
 // 0..size-1, launches one pthread per block via wrap(), and verifies that
 // res[0] holds the sum of the inputs.
 int main(int argc, char *argv[]) {
  (void)argc;
  (void)argv;
  int *g_idata;
  int size = block_size * NUM_BLOCK;
  // Zero both buffers: only the first `size` inputs are filled below, and
  // res[0] is read by the final check.
  g_idata = new int[size * 2]();
  int *res = new int[size]();
  for (int i = 0; i < size; i++) {
    g_idata[i] = i;
  }
  pthread_t threads[NUM_BLOCK];
  void *inp[NUM_BLOCK];
  for (long t = 0; t < NUM_BLOCK; t++) {
    inp[t] = gen_input(t, g_idata, res, size);
  }
  for (long t = 0; t < NUM_BLOCK; t++) {
    // Do not fall through to pthread_join on a handle that was never set.
    int rc = pthread_create(&threads[t], NULL, wrap, inp[t]);
    if (rc != 0) {
      fprintf(stderr, "pthread_create failed for block %ld (error %d)\n", t,
              rc);
      return 1;
    }
  }
  for (long t = 0; t < NUM_BLOCK; t++)
    pthread_join(threads[t], NULL);
  int gold = 0;
  for (int i = 0; i < size; i++) {
    gold += g_idata[i];
  }
  assert(*res == gold && "Incorrect res\n");
  printf("PASS\n");
  pthread_exit(NULL);
 }
--- a/compilation/examples/reduce_shuffle/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
+++ b/compilation/examples/reduce_shuffle/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
@ -1,179 +0,0 @@
 ; ModuleID = 'kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc'
 source_filename = "kernel.cu"
 target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"
 %struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
@_ZZ7reduce5PiS_jE5sdata = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
 ; Function Attrs: nounwind
 define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) local_unnamed_addr #0 {
 entry:
  ret i32 999
 }
 ; Function Attrs: nounwind
 define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) local_unnamed_addr #0 {
 entry:
  ret i32 999
 }
 ; Function Attrs: nounwind
 define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) local_unnamed_addr #0 {
 entry:
  ret i32 999
 }
 ; Function Attrs: nounwind
 define weak dso_local i32 @cudaGetDevice(i32* %device) local_unnamed_addr #0 {
 entry:
  ret i32 999
 }
 ; Function Attrs: nounwind
 define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) local_unnamed_addr #0 {
 entry:
  ret i32 999
 }
 ; Function Attrs: nounwind
 define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) local_unnamed_addr #0 {
 entry:
  ret i32 999
 }
 ; Kernel reduce5(int *g_idata, int *g_odata, unsigned n).
 ; Each thread sums up to two input elements into shared sdata; threads whose
 ; linear id is below 32 then fold in sdata[tid.x + 32] and combine values
 ; with shfl.sync.down; linear thread 0 stores the result to g_odata[ctaid.x].
 ; Function Attrs: convergent nounwind
 define dso_local void @_Z7reduce5PiS_j(i32* nocapture readonly %g_idata, i32* nocapture %g_odata, i32 %n) local_unnamed_addr #1 {
 entry:
  ; %add = ctaid.x * 128 + tid.x
  %0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #5, !range !10
  %1 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #5, !range !11
  %mul = shl i32 %1, 7
  %add = add i32 %mul, %0
  %cmp = icmp ult i32 %add, %n
  br i1 %cmp, label %cond.true, label %cond.end
 cond.true:                                        ; preds = %entry
  %idxprom = zext i32 %add to i64
  %arrayidx = getelementptr inbounds i32, i32* %g_idata, i64 %idxprom
  %2 = load i32, i32* %arrayidx, align 4, !tbaa !12
  br label %cond.end
 cond.end:                                         ; preds = %entry, %cond.true
  ; first element (or 0 when out of range); then test index + 64
  %cond = phi i32 [ %2, %cond.true ], [ 0, %entry ]
  %add4 = add i32 %add, 64
  %cmp5 = icmp ult i32 %add4, %n
  br i1 %cmp5, label %if.then, label %if.end
 if.then:                                          ; preds = %cond.end
  %idxprom7 = zext i32 %add4 to i64
  %arrayidx8 = getelementptr inbounds i32, i32* %g_idata, i64 %idxprom7
  %3 = load i32, i32* %arrayidx8, align 4, !tbaa !12
  %add9 = add nsw i32 %3, %cond
  br label %if.end
 if.end:                                           ; preds = %if.then, %cond.end
  ; sdata[tid.x] = per-thread sum, followed by four consecutive barriers
  %mySum.0 = phi i32 [ %add9, %if.then ], [ %cond, %cond.end ]
  %idxprom10 = zext i32 %0 to i64
  %arrayidx1150 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @_ZZ7reduce5PiS_jE5sdata, i64 0, i64 %idxprom10
  %arrayidx11 = addrspacecast i32 addrspace(3)* %arrayidx1150 to i32*
  store i32 %mySum.0, i32* %arrayidx11, align 4, !tbaa !12
  tail call void @llvm.nvvm.barrier.sync(i32 0) #5
  tail call void @llvm.nvvm.barrier.sync(i32 0) #5
  tail call void @llvm.nvvm.barrier.sync(i32 0) #5
  tail call void @llvm.nvvm.barrier.sync(i32 0) #5
  ; linear thread id = (tid.z * ntid.y + tid.y) * ntid.x + tid.x
  %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.z() #5, !range !16
  %5 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.y() #5, !range !17
  %mul.i.i52 = mul nuw nsw i32 %5, %4
  %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #5, !range !17
  %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.y() #5, !range !10
  %mul39.i.i53 = add nuw nsw i32 %7, %mul.i.i52
  %add.i.i54 = mul nuw nsw i32 %mul39.i.i53, %6
  %add8.i.i55 = add nuw nsw i32 %add.i.i54, %0
  %cmp14 = icmp ult i32 %add8.i.i55, 32
  br i1 %cmp14, label %if.then15, label %if.end32
 if.then15:                                        ; preds = %if.end
  ; add sdata[tid.x + 32], then shuffle-down steps with offsets 16, 8, 4, 2, 1
  %add16 = add nuw nsw i32 %0, 32
  %idxprom17 = zext i32 %add16 to i64
  %arrayidx1851 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @_ZZ7reduce5PiS_jE5sdata, i64 0, i64 %idxprom17
  %arrayidx18 = addrspacecast i32 addrspace(3)* %arrayidx1851 to i32*
  %8 = load i32, i32* %arrayidx18, align 4, !tbaa !12
  %add19 = add nsw i32 %8, %mySum.0
  %9 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add19, i32 16, i32 31) #5
  %add23 = add nsw i32 %9, %add19
  %10 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add23, i32 8, i32 31) #5
  %add23.1 = add nsw i32 %10, %add23
  %11 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add23.1, i32 4, i32 31) #5
  %add23.2 = add nsw i32 %11, %add23.1
  %12 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add23.2, i32 2, i32 31) #5
  %add23.3 = add nsw i32 %12, %add23.2
  %13 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add23.3, i32 1, i32 31) #5
  %cmp27 = icmp eq i32 %add8.i.i55, 0
  br i1 %cmp27, label %if.then28, label %if.end32
 if.then28:                                        ; preds = %if.then15
  ; linear thread 0: g_odata[ctaid.x] = final sum
  %add23.4 = add nsw i32 %13, %add23.3
  %idxprom30 = zext i32 %1 to i64
  %arrayidx31 = getelementptr inbounds i32, i32* %g_odata, i64 %idxprom30
  store i32 %add23.4, i32* %arrayidx31, align 4, !tbaa !12
  br label %if.end32
 if.end32:                                         ; preds = %if.end, %if.then28, %if.then15
  ret void
 }
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.z() #2
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y() #2
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #2
 ; Function Attrs: convergent nounwind
 declare void @llvm.nvvm.barrier.sync(i32) #3
 ; Function Attrs: convergent inaccessiblememonly nounwind
 declare i32 @llvm.nvvm.shfl.sync.down.i32(i32, i32, i32, i32) #4
 attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #2 = { nounwind readnone }
 attributes #3 = { convergent nounwind }
 attributes #4 = { convergent inaccessiblememonly nounwind }
 attributes #5 = { nounwind }
 !llvm.module.flags = !{!0, !1, !2}
 !nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
 !llvm.ident = !{!8}
 !nvvmir.version = !{!9}
 !0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
 !1 = !{i32 1, !"wchar_size", i32 4}
 !2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
 !3 = !{void (i32*, i32*, i32)* @_Z7reduce5PiS_j, !"kernel", i32 1}
 !4 = !{null, !"align", i32 8}
 !5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
 !6 = !{null, !"align", i32 16}
 !7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
 !8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
 !9 = !{i32 1, i32 4}
 !10 = !{i32 0, i32 1024}
 !11 = !{i32 0, i32 2147483647}
 !12 = !{!13, !13, i64 0}
 !13 = !{!"int", !14, i64 0}
 !14 = !{!"omnipotent char", !15, i64 0}
 !15 = !{!"Simple C++ TBAA"}
 !16 = !{i32 0, i32 64}
 !17 = !{i32 1, i32 1025}
--- a/compilation/examples/reduce_shuffle/run.sh
+++ b/compilation/examples/reduce_shuffle/run.sh
@ -1,6 +0,0 @@
 #!/bin/bash
 # Build and run the reduce_shuffle example: assemble the NVVM IR, translate
 # it with kernelTranslator (trailing numbers are forwarded as-is; presumably
 # grid/block dimensions — confirm), compile to an object, link the host
 # driver and run it.
 # Stop at the first failing step so an outdated ./test is never run.
 set -e
 llvm-as kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
 ../../../build/compilation/kernelTranslator kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc 1 1 1 64 1 1
 llc --filetype=obj kernel.bc
 g++ host.cpp kernel.o -lpthread -o test
 ./test
--- a/compilation/examples/run_example.sh
+++ b/compilation/examples/run_example.sh
@ -1,11 +0,0 @@
 #!/bin/sh
 # Runs every example: each sub-directory of the current directory is entered
 # and its run.sh is executed with bash.
 for file in ./*
 do
    if test -d "$file"
    then
        echo executing "$file"
        # Subshell keeps the working directory unchanged for the next
        # iteration, even when the cd itself fails.
        (cd "$file" && bash run.sh)
    fi
 done
--- a/compilation/examples/vecadd/host.cpp
+++ b/compilation/examples/vecadd/host.cpp
@ -1,84 +0,0 @@
 #include <assert.h>
 #include <pthread.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <time.h>
 // Number of blocks; main() creates one host pthread per block.
 #define NUM_BLOCK 1
 // Vector length used by main().
 int N = 32;
 // Threads per block and its x/y/z split.
 // NOTE(review): presumably read by the translated kernel — confirm.
 int block_size = 32;
 int block_size_x = block_size;
 int block_size_y = 1;
 int block_size_z = 1;
 // Thread-local block id, written by wrap() before the kernel wrapper runs.
 __thread int block_index = 0;
 int grid_size = NUM_BLOCK;
 extern "C" {
 // Kernel entry point supplied by the separately compiled kernel.o.
 void *_Z9vectorAddPKfS0_Pfi_wrapper(void *);
 }
 // pthread start routine for one block: slot 4 of the argument pack built by
 // gen_input() points at the block id, which is copied into block_index
 // before the kernel wrapper is called with the same pack.
 void *wrap(void *p) {
  int *const *pack = static_cast<int *const *>(p);
  block_index = *pack[4];
  _Z9vectorAddPKfS0_Pfi_wrapper(p);
  return NULL;
 }
 // Packs the vectorAdd arguments for one block into a heap array of five
 // pointers; each slot points at a separately allocated cell:
 //   [0] -> float* (A)   [1] -> float* (B)   [2] -> float* (C)
 //   [3] -> int (N)      [4] -> int (bid)
 // The allocations are not released anywhere in this file.
 void *gen_input(int bid, float *A, float *B, float *C, int N) {
  int **pack = new int *[5];
  pack[0] = reinterpret_cast<int *>(new float *(A));
  pack[1] = reinterpret_cast<int *>(new float *(B));
  pack[2] = reinterpret_cast<int *>(new float *(C));
  pack[3] = new int(N);
  pack[4] = new int(bid);
  return static_cast<void *>(pack);
 }
 // Host driver for the vecadd example: A[i] = i, B[i] = 1, C[i] = 0; runs one
 // pthread per block through wrap() and checks C[i] == A[i] + B[i].
 int main() {
  float *A, *B, *C;
  A = new float[N];
  B = new float[N];
  C = new float[N];
  for (int i = 0; i < N; i++) {
    A[i] = i;
    B[i] = 1;
    C[i] = 0;
  }
  pthread_t threads[NUM_BLOCK];
  for (long t = 0; t < NUM_BLOCK; t++) {
    void *inp = gen_input(t, A, B, C, N);
    // The result used to be stored and never inspected; joining a thread
    // that was never created is undefined, so bail out on failure.
    int rc = pthread_create(&threads[t], NULL, wrap, inp);
    if (rc != 0) {
      fprintf(stderr, "pthread_create failed for block %ld (error %d)\n", t,
              rc);
      return 1;
    }
  }
  for (long t = 0; t < NUM_BLOCK; t++)
    pthread_join(threads[t], NULL);
  for (int i = 0; i < N; i++) {
    assert(C[i] == (A[i] + B[i]));
  }
  printf("PASS\n");
  /* Last thing that main() should do */
  pthread_exit(NULL);
 }
--- a/compilation/examples/vecadd/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
+++ b/compilation/examples/vecadd/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
@ -1,86 +0,0 @@
 ; ModuleID = 'kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc'
 source_filename = "kernel.cu"
 target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"
 %struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
 ; Function Attrs: nounwind
 define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) local_unnamed_addr #0 {
 entry:
  ret i32 999
 }
 ; Function Attrs: nounwind
 define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) local_unnamed_addr #0 {
 entry:
  ret i32 999
 }
 ; Function Attrs: nounwind
 define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) local_unnamed_addr #0 {
 entry:
  ret i32 999
 }
 ; Function Attrs: nounwind
 define weak dso_local i32 @cudaGetDevice(i32* %device) local_unnamed_addr #0 {
 entry:
  ret i32 999
 }
 ; Function Attrs: nounwind
 define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) local_unnamed_addr #0 {
 entry:
  ret i32 999
 }
 ; Function Attrs: nounwind
 define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) local_unnamed_addr #0 {
 entry:
  ret i32 999
 }
 ; Kernel vectorAdd(const float *A, const float *B, float *C, int numElements):
 ; C[tid.x] = A[tid.x] + B[tid.x].
 ; As compiled here the index is tid.x only (no block offset) and
 ; %numElements is never read, so there is no bounds check.
 ; Function Attrs: nofree nounwind
 define dso_local void @_Z9vectorAddPKfS0_Pfi(float* nocapture readonly %A, float* nocapture readonly %B, float* nocapture %C, i32 %numElements) local_unnamed_addr #1 {
 entry:
  %0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3, !range !10
  %idxprom8 = zext i32 %0 to i64
  %arrayidx = getelementptr inbounds float, float* %A, i64 %idxprom8
  %1 = load float, float* %arrayidx, align 4, !tbaa !11
  %arrayidx2 = getelementptr inbounds float, float* %B, i64 %idxprom8
  %2 = load float, float* %arrayidx2, align 4, !tbaa !11
  %add = fadd contract float %1, %2
  %arrayidx4 = getelementptr inbounds float, float* %C, i64 %idxprom8
  store float %add, float* %arrayidx4, align 4, !tbaa !11
  ret void
 }
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
 attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nofree nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #2 = { nounwind readnone }
 attributes #3 = { nounwind }
 !llvm.module.flags = !{!0, !1, !2}
 !nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
 !llvm.ident = !{!8}
 !nvvmir.version = !{!9}
 !0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
 !1 = !{i32 1, !"wchar_size", i32 4}
 !2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
 !3 = !{void (float*, float*, float*, i32)* @_Z9vectorAddPKfS0_Pfi, !"kernel", i32 1}
 !4 = !{null, !"align", i32 8}
 !5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
 !6 = !{null, !"align", i32 16}
 !7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
 !8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
 !9 = !{i32 1, i32 4}
 !10 = !{i32 0, i32 1024}
 !11 = !{!12, !12, i64 0}
 !12 = !{!"float", !13, i64 0}
 !13 = !{!"omnipotent char", !14, i64 0}
 !14 = !{!"Simple C++ TBAA"}
--- a/compilation/examples/vecadd/run.sh
+++ b/compilation/examples/vecadd/run.sh
@ -1,6 +0,0 @@
 #!/bin/bash
 # Build and run the vecadd example: assemble the NVVM IR, translate it with
 # kernelTranslator (trailing numbers are forwarded as-is; presumably
 # grid/block dimensions — confirm), compile to an object, link the host
 # driver and run it.
 # Exit on the first error so a failed build cannot run an old ./test.
 set -e
 llvm-as kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
 ../../../build/compilation/kernelTranslator kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc 1 1 1 32 1 1
 llc --filetype=obj kernel.bc
 g++ host.cpp kernel.o -lpthread -o test
 ./test
--- a/docs/figures/workflow.png
+++ b/docs/figures/workflow.png
--- a/docs/workflow.md
+++ b/docs/workflow.md
@ -1,11 +0,0 @@
 # The workflow of CuPBoP
 The workflow of CuPBoP is as follows:
 ![The workflow of executing CUDA applications on CuPBoP.](figures/workflow.png)
 First, CuPBoP uses Clang to compile the CUDA source code into NVVM IR,
 which consists of two parts: the host part and the kernel part.
 In the next step, CuPBoP-compilation parses and transforms these NVVM IRs
 to make them suitable for execution on specific architectures.
 The CuPBoP-runtime compiles the transformed Host IR and executes the generated programs,
 which will compile the transformed Kernel IR and
 upload the compiled kernel programs to specific architectures.
--- a/examples/backprop/backprop.c
+++ b/examples/backprop/backprop.c
@ -0,0 +1,454 @@
 #include "backprop.h"
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 // Uncomment to compile the OpenMP code paths guarded by `#ifdef OPEN`.
 //#define OPEN
 // Absolute value. The argument is expanded more than once, so it must not
 // have side effects.
 #define ABS(x) (((x) > 0.0) ? (x) : (-(x)))
 // Copies `len` bytes from `from` to `to`, one char at a time. Expands to a
 // braced block that declares its own temporaries (_to, _from, _i, _l).
 #define fastcopy(to, from, len)                                                \
  {                                                                            \
    register char *_to, *_from;                                                \
    register int _i, _l;                                                       \
    _to = (char *)(to);                                                        \
    _from = (char *)(from);                                                    \
    _l = (len);                                                                \
    for (_i = 0; _i < _l; _i++)                                                \
      *_to++ = *_from++;                                                       \
  }
 /*** Pseudo-random value: rand() divided by BIGRND, both taken as float.
      (Lies in [0.0, 1.0] provided BIGRND >= RAND_MAX -- BIGRND is defined
      outside this file.) ***/
 float drnd() {
  float numerator = (float)rand();
  float denominator = (float)BIGRND;
  return (numerator / denominator);
 }
 /*** Maps drnd() linearly through 2*x - 1, i.e. onto [-1.0, 1.0] when
      drnd() is in [0.0, 1.0]. ***/
 float dpn1() {
  double doubled = drnd() * 2.0;
  return (doubled - 1.0);
 }
 /*** The squashing function.  Currently, it's a sigmoid. ***/
 float squash(x)
 float x;
 {
  float m;
  // x = -x;
  // m = 1 + x + x*x/2 + x*x*x/6 + x*x*x*x/24 + x*x*x*x*x/120;
  // return(1.0 / (1.0 + m));
  return (1.0 / (1.0 + exp(-x)));
 }
 /*** Allocate 1d array of floats.
      n: number of elements.
      Returns the uninitialised array, or NULL (after printing a message)
      when malloc fails. The caller releases it with free(). ***/
 float *alloc_1d_dbl(n)
 int n;
 {
  float *arr;
  /* Compute the byte count in size_t: the former (unsigned) cast could
     truncate it where size_t is wider than unsigned. */
  arr = (float *)malloc((size_t)n * sizeof(float));
  if (arr == NULL) {
    printf("ALLOC_1D_DBL: Couldn't allocate array of floats\n");
    return (NULL);
  }
  return (arr);
 }
 /*** Allocate 2d array of floats as m separately allocated rows of n floats.
      Returns the array of row pointers, or NULL when the pointer array or
      any row cannot be allocated (rows obtained so far are released). ***/
 float **alloc_2d_dbl(m, n)
 int m, n;
 {
  int i;
  float **rows;
  /* size_t arithmetic: the former (unsigned) cast could truncate the size. */
  rows = (float **)malloc((size_t)m * sizeof(float *));
  if (rows == NULL) {
    printf("ALLOC_2D_DBL: Couldn't allocate array of dbl ptrs\n");
    return (NULL);
  }
  for (i = 0; i < m; i++) {
    rows[i] = alloc_1d_dbl(n);
    if (rows[i] == NULL) {
      /* alloc_1d_dbl already reported the failure; undo the partial
         allocation instead of handing back an array with NULL rows. */
      while (i-- > 0) {
        free(rows[i]);
      }
      free(rows);
      return (NULL);
    }
  }
  return (rows);
 }
 /*** Fill w[0..m][0..n] (bounds inclusive) with rand() / RAND_MAX, drawing
      the values row by row. ***/
 bpnn_randomize_weights(w, m, n) float **w;
 int m, n;
 {
  int row, col;
  float *cells;
  for (row = 0; row <= m; row++) {
    cells = w[row];
    for (col = 0; col <= n; col++) {
      cells[col] = (float)rand() / RAND_MAX;
    }
  }
 }
 /*** Set w[0..m] (inclusive) to the constant 0.1. Despite the name, no
      random numbers are drawn. ***/
 bpnn_randomize_row(w, m) float *w;
 int m;
 {
  int idx = 0;
  while (idx <= m) {
    w[idx] = 0.1;
    idx++;
  }
 }
 /*** Set every entry of w[0..m][0..n] (bounds inclusive) to zero. ***/
 bpnn_zero_weights(w, m, n) float **w;
 int m, n;
 {
  int row, col;
  float *cells;
  for (row = 0; row <= m; row++) {
    cells = w[row];
    for (col = 0; col <= n; col++) {
      cells[col] = 0.0;
    }
  }
 }
 /*** Print the seed and use it to seed the C library random number
      generator. ***/
 void bpnn_initialize(seed)
 int seed; /* declared explicitly: implicit int was removed in C99 */
 {
  printf("Random number generator seed: %d\n", seed);
  srand(seed);
 }
 /*** Allocate a BPNN and all of its arrays without initialising any
      weights. Every layer gets one extra element (index 0) beyond the
      requested unit count. Returns NULL if the BPNN struct itself cannot be
      allocated; the array allocations are not checked here. ***/
 BPNN *bpnn_internal_create(n_in, n_hidden, n_out)
 int n_in, n_hidden, n_out;
 {
  BPNN *net;
  int in_len, hid_len, out_len;

  net = (BPNN *)malloc(sizeof(BPNN));
  if (net == NULL) {
    printf("BPNN_CREATE: Couldn't allocate neural network\n");
    return (NULL);
  }

  in_len = n_in + 1;
  hid_len = n_hidden + 1;
  out_len = n_out + 1;

  net->input_n = n_in;
  net->hidden_n = n_hidden;
  net->output_n = n_out;

  /* unit activations */
  net->input_units = alloc_1d_dbl(in_len);
  net->hidden_units = alloc_1d_dbl(hid_len);
  net->output_units = alloc_1d_dbl(out_len);

  /* error terms and training target */
  net->hidden_delta = alloc_1d_dbl(hid_len);
  net->output_delta = alloc_1d_dbl(out_len);
  net->target = alloc_1d_dbl(out_len);

  /* current weights, then the previous-update buffers */
  net->input_weights = alloc_2d_dbl(in_len, hid_len);
  net->hidden_weights = alloc_2d_dbl(hid_len, out_len);
  net->input_prev_weights = alloc_2d_dbl(in_len, hid_len);
  net->hidden_prev_weights = alloc_2d_dbl(hid_len, out_len);

  return (net);
 }
 /*** Release a network: the six 1d arrays, every row of the four weight
      matrices (rows 0..input_n and 0..hidden_n), the row-pointer arrays and
      finally the BPNN struct itself. ***/
 void bpnn_free(net) BPNN *net;
 {
  int row;
  int last_input_row = net->input_n;
  int last_hidden_row = net->hidden_n;

  free(net->input_units);
  free(net->hidden_units);
  free(net->output_units);
  free(net->hidden_delta);
  free(net->output_delta);
  free(net->target);

  row = 0;
  while (row <= last_input_row) {
    free(net->input_weights[row]);
    free(net->input_prev_weights[row]);
    row++;
  }
  free(net->input_weights);
  free(net->input_prev_weights);

  row = 0;
  while (row <= last_hidden_row) {
    free(net->hidden_weights[row]);
    free(net->hidden_prev_weights[row]);
    row++;
  }
  free(net->hidden_weights);
  free(net->hidden_prev_weights);

  free(net);
 }
 /*** Build a network with the given numbers of input, hidden and output
      units via bpnn_internal_create(), then initialise it:
        - input weights: zeroed when INITZERO is defined, otherwise random;
        - hidden weights: random;
        - both previous-update matrices: zeroed;
        - target vector: filled by bpnn_randomize_row().
  ***/
 BPNN *bpnn_create(n_in, n_hidden, n_out)
 int n_in, n_hidden, n_out;
 {
  BPNN *net = bpnn_internal_create(n_in, n_hidden, n_out);

  /* The two randomising calls stay in this order so the rand() sequence
     is consumed exactly as before. */
 #ifdef INITZERO
  bpnn_zero_weights(net->input_weights, n_in, n_hidden);
 #else
  bpnn_randomize_weights(net->input_weights, n_in, n_hidden);
 #endif
  bpnn_randomize_weights(net->hidden_weights, n_hidden, n_out);

  bpnn_zero_weights(net->input_prev_weights, n_in, n_hidden);
  bpnn_zero_weights(net->hidden_prev_weights, n_hidden, n_out);

  bpnn_randomize_row(net->target, n_out);
  return (net);
 }
 /*** Propagate activations from layer l1 to layer l2.
      l1:   source activations, indices 0..n1; l1[0] is overwritten with 1.0.
      l2:   destination activations; entries 1..n2 are written.
      conn: weights, read as conn[k][j] for source k and destination j.
      For each j in 1..n2: l2[j] = squash(sum over k = 0..n1 of
      conn[k][j] * l1[k]). ***/
 void bpnn_layerforward(l1, l2, conn, n1, n2) float *l1, *l2, **conn;
 int n1, n2;
 {
  float sum;
  int j, k;
  /*** Set up thresholding unit ***/
  l1[0] = 1.0;
  /* The OpenMP block below is compiled only when OPEN is defined; the pragma
     names the locals j, k and sum and must directly precede the j-loop. */
 #ifdef OPEN
  omp_set_num_threads(NUM_THREAD);
 #pragma omp parallel for shared(conn, n1, n2, l1) private(k, j) reduction(+: sum) schedule(static)
 #endif
  /*** For each unit in second layer ***/
  for (j = 1; j <= n2; j++) {
    /*** Compute weighted sum of its inputs ***/
    sum = 0.0;
    for (k = 0; k <= n1; k++) {
      sum += conn[k][j] * l1[k];
    }
    l2[j] = squash(sum);
  }
 }
 // extern "C"
 /*** Output-layer error terms.
      For j = 1..nj: delta[j] = o * (1 - o) * (t - o) with o = output[j] and
      t = target[j]. *err receives the sum of |delta[j]|. ***/
 void bpnn_output_error(delta, target, output, nj, err)
 float *delta, *target, *output, *err;
 int nj;
 {
  int unit;
  float actual, wanted;
  float total = 0.0;

  unit = 1;
  while (unit <= nj) {
    actual = output[unit];
    wanted = target[unit];
    delta[unit] = actual * (1.0 - actual) * (wanted - actual);
    total += ABS(delta[unit]);
    unit++;
  }
  *err = total;
 }
 /*** Hidden-layer error terms.
      For j = 1..nh: delta_h[j] = h * (1 - h) * sum over k = 1..no of
      delta_o[k] * who[j][k], with h = hidden[j]. *err receives the sum of
      |delta_h[j]|. ***/
 void bpnn_hidden_error(delta_h, nh, delta_o, no, who, hidden, err)
 float *delta_h, *delta_o, *hidden, **who, *err;
 int nh, no;
 {
  int hid, out;
  float activation, weighted;
  float total = 0.0;

  for (hid = 1; hid <= nh; hid++) {
    float *out_weights = who[hid];
    activation = hidden[hid];
    weighted = 0.0;
    for (out = 1; out <= no; out++) {
      weighted += delta_o[out] * out_weights[out];
    }
    delta_h[hid] = activation * (1.0 - activation) * weighted;
    total += ABS(delta_h[hid]);
  }
  *err = total;
 }
 /*** Apply one weight update with momentum.
      delta: error terms of the destination layer, indices 1..ndelta.
      ly:    activations of the source layer, indices 0..nly; ly[0] is
             overwritten with 1.0.
      w:     weights, updated in place as w[k][j] += new_dw.
      oldw:  previous update per weight; replaced by new_dw.
      new_dw = ETA * delta[j] * ly[k] + MOMENTUM * oldw[k][j]. ***/
 void bpnn_adjust_weights(delta, ndelta, ly, nly, w, oldw) float *delta, *ly,
    **w, **oldw;
 int ndelta, nly; /* declared explicitly: implicit int was removed in C99 */
 {
  float new_dw;
  int k, j;
  ly[0] = 1.0;
  // eta = 0.3;
  // momentum = 0.3;
  /* NOTE(review): the OPEN-only pragma below lists `momentum` in
     firstprivate, but no variable of that name is declared in this
     function -- confirm before enabling OPEN. */
 #ifdef OPEN
  omp_set_num_threads(NUM_THREAD);
 #pragma omp parallel for shared(oldw, w, delta) private(j, k, new_dw)          \
    firstprivate(ndelta, nly, momentum)
 #endif
  for (j = 1; j <= ndelta; j++) {
    for (k = 0; k <= nly; k++) {
      new_dw = ((ETA * delta[j] * ly[k]) + (MOMENTUM * oldw[k][j]));
      w[k][j] += new_dw;
      oldw[k][j] = new_dw;
    }
  }
 }
 /*** Run one forward pass: input layer -> hidden layer, then hidden layer
      -> output layer, each through bpnn_layerforward(). ***/
 void bpnn_feedforward(net) BPNN *net;
 {
  int n_input = net->input_n;
  int n_hidden = net->hidden_n;
  int n_output = net->output_n;

  /* input -> hidden */
  bpnn_layerforward(net->input_units, net->hidden_units, net->input_weights,
                    n_input, n_hidden);
  /* hidden -> output */
  bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights,
                    n_hidden, n_output);
 }
 /*
  * bpnn_train: perform one training step on net.
  *
  * Runs the forward pass, computes the output and hidden error terms, reports
  * the summed absolute errors through *eo (output layer) and *eh (hidden
  * layer), and finally adjusts the hidden->output and input->hidden weights.
  */
 void bpnn_train(net, eo, eh) BPNN *net;
 float *eo, *eh;
 {
  int n_in = net->input_n;
  int n_hid = net->hidden_n;
  int n_out = net->output_n;
  float output_error, hidden_error;
  /* Forward pass: input -> hidden -> output. */
  bpnn_layerforward(net->input_units, net->hidden_units, net->input_weights,
                    n_in, n_hid);
  bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights,
                    n_hid, n_out);
  /* Error terms for the output layer, then for the hidden layer. */
  bpnn_output_error(net->output_delta, net->target, net->output_units, n_out,
                    &output_error);
  bpnn_hidden_error(net->hidden_delta, n_hid, net->output_delta, n_out,
                    net->hidden_weights, net->hidden_units, &hidden_error);
  *eo = output_error;
  *eh = hidden_error;
  /* Weight updates: hidden->output first, then input->hidden. */
  bpnn_adjust_weights(net->output_delta, n_out, net->hidden_units, n_hid,
                      net->hidden_weights, net->hidden_prev_weights);
  bpnn_adjust_weights(net->hidden_delta, n_hid, net->input_units, n_in,
                      net->input_weights, net->input_prev_weights);
 }
 /*
  * bpnn_save: write net to filename in the binary layout bpnn_read() expects:
  *
  *   int   input_n, hidden_n, output_n
  *   float input_weights[0..input_n][0..hidden_n]    (row by row)
  *   float hidden_weights[0..hidden_n][0..output_n]  (row by row)
  *
  * The three dimensions are written as whole ints (the reader consumes
  * sizeof(int) bytes for each), and every weight row is written with its
  * element count, so exactly (rows * cols) floats reach the file and nothing
  * past the end of a row is read. A message is printed if the file cannot be
  * created or a write fails.
  */
 void bpnn_save(net, filename) BPNN *net;
 char *filename;
 {
  int n1, n2, n3, i, ok;
  float **w;
  size_t row_len;
  FILE *pFile;
  /* "wb": the file is binary and is only written, never read back here. */
  pFile = fopen(filename, "wb");
  if (pFile == NULL) {
    printf("BPNN_SAVE: Cannot create '%s'\n", filename);
    return;
  }
  n1 = net->input_n;
  n2 = net->hidden_n;
  n3 = net->output_n;
  printf("Saving %dx%dx%d network to '%s'\n", n1, n2, n3, filename);
  // fflush(stdout);
  /* Header: the three layer sizes. */
  ok = fwrite((char *)&n1, sizeof(int), 1, pFile) == 1 &&
       fwrite((char *)&n2, sizeof(int), 1, pFile) == 1 &&
       fwrite((char *)&n3, sizeof(int), 1, pFile) == 1;
  /* Input->hidden weights: n1 + 1 rows of n2 + 1 floats. */
  w = net->input_weights;
  row_len = (size_t)(n2 + 1);
  for (i = 0; ok && i <= n1; i++) {
    ok = fwrite((char *)w[i], sizeof(float), row_len, pFile) == row_len;
  }
  /* Hidden->output weights: n2 + 1 rows of n3 + 1 floats. */
  w = net->hidden_weights;
  row_len = (size_t)(n3 + 1);
  for (i = 0; ok && i <= n2; i++) {
    ok = fwrite((char *)w[i], sizeof(float), row_len, pFile) == row_len;
  }
  /* fclose flushes buffered data, so its result is part of the write status. */
  if (fclose(pFile) != 0) {
    ok = 0;
  }
  if (!ok) {
    printf("BPNN_SAVE: Error writing '%s'\n", filename);
  }
  return;
 }
 /*
  * bpnn_read_exact: read exactly nbytes bytes from fd into buf, retrying after
  * short reads. Returns 1 on success and 0 on end-of-file or read error.
  */
 static int bpnn_read_exact(fd, buf, nbytes) int fd;
 char *buf;
 unsigned nbytes;
 {
  unsigned done = 0;
  long got;
  while (done < nbytes) {
    got = (long)read(fd, buf + done, nbytes - done);
    if (got <= 0) {
      return 0;
    }
    done += (unsigned)got;
  }
  return 1;
 }
 /*
  * bpnn_read: load a network from filename.
  *
  * Expected layout: three ints (input, hidden and output unit counts),
  * followed by the input->hidden weights ((n1 + 1) rows of (n2 + 1) floats)
  * and the hidden->output weights ((n2 + 1) rows of (n3 + 1) floats).
  *
  * Returns the new network with both previous-weight matrices zeroed, or NULL
  * if the file cannot be opened, its header is incomplete or holds negative
  * sizes, the network cannot be created, or the weight data is truncated.
  * In the last case the partially filled network is released with
  * bpnn_free().
  */
 BPNN *bpnn_read(filename)
 char *filename;
 {
  BPNN *net; /* not named "new", which is a keyword for C++ compilers */
  int fd, n1, n2, n3, i, ok;
  unsigned row_bytes;
  if ((fd = open(filename, 0, 0644)) == -1) {
    return (NULL);
  }
  printf("Reading '%s'\n", filename); // fflush(stdout);
  /* The sizes come from the file, so check them before allocating. */
  ok = bpnn_read_exact(fd, (char *)&n1, (unsigned)sizeof(int)) &&
       bpnn_read_exact(fd, (char *)&n2, (unsigned)sizeof(int)) &&
       bpnn_read_exact(fd, (char *)&n3, (unsigned)sizeof(int));
  if (!ok || n1 < 0 || n2 < 0 || n3 < 0) {
    printf("BPNN_READ: '%s' has an invalid header\n", filename);
    close(fd);
    return (NULL);
  }
  net = bpnn_internal_create(n1, n2, n3);
  if (net == NULL) { /* defensive: creation presumably fails on out-of-memory */
    close(fd);
    return (NULL);
  }
  printf("'%s' contains a %dx%dx%d network\n", filename, n1, n2, n3);
  printf("Reading input weights..."); // fflush(stdout);
  /* Each row is read straight into the matrix; no staging buffer is needed. */
  row_bytes = (unsigned)((n2 + 1) * sizeof(float));
  for (i = 0; ok && i <= n1; i++) {
    ok = bpnn_read_exact(fd, (char *)net->input_weights[i], row_bytes);
  }
  if (ok) {
    printf("Done\nReading hidden weights..."); // fflush(stdout);
    row_bytes = (unsigned)((n3 + 1) * sizeof(float));
    for (i = 0; ok && i <= n2; i++) {
      ok = bpnn_read_exact(fd, (char *)net->hidden_weights[i], row_bytes);
    }
  }
  close(fd);
  if (!ok) {
    printf("\nBPNN_READ: '%s' is truncated or unreadable\n", filename);
    bpnn_free(net);
    return (NULL);
  }
  printf("Done\n"); // fflush(stdout);
  bpnn_zero_weights(net->input_prev_weights, n1, n2);
  bpnn_zero_weights(net->hidden_prev_weights, n2, n3);
  return (net);
 }
--- a/examples/backprop/backprop.h
+++ b/examples/backprop/backprop.h
@ -0,0 +1,50 @@
 #ifndef _BACKPROP_H_
 #define _BACKPROP_H_
 #define BIGRND 0x7fffffff
 #define GPU
 #define THREADS 256
 #define WIDTH 16  // shared memory width
 #define HEIGHT 16 // shared memory height
 #define ETA 0.3      // eta value
 #define MOMENTUM 0.3 // momentum value
 #define NUM_THREAD 4 // OpenMP threads
 typedef struct {
  int input_n;  /* number of input units */
  int hidden_n; /* number of hidden units */
  int output_n; /* number of output units */
  float *input_units;  /* the input units */
  float *hidden_units; /* the hidden units */
  float *output_units; /* the output units */
  float *hidden_delta; /* storage for hidden unit error */
  float *output_delta; /* storage for output unit error */
  float *target; /* storage for target vector */
  float **input_weights;  /* weights from input to hidden layer */
  float **hidden_weights; /* weights from hidden to output layer */
  /*** The next two are for momentum ***/
  float **input_prev_weights;  /* previous change on input to hidden wgt */
  float **hidden_prev_weights; /* previous change on hidden to output wgt */
 } BPNN;
 /*** User-level functions ***/
 void bpnn_initialize();
 BPNN *bpnn_create();
 void bpnn_free();
 void bpnn_train();
 void bpnn_feedforward();
 void bpnn_save();
 BPNN *bpnn_read();
 #endif
--- a/examples/backprop/backprop_cuda-cuda-nvptx64-nvidia-cuda-sm_61.ll
+++ b/examples/backprop/backprop_cuda-cuda-nvptx64-nvidia-cuda-sm_61.ll
@ -0,0 +1,615 @@
 ; ModuleID = 'backprop_cuda-cuda-nvptx64-nvidia-cuda-sm_61.bc'
 source_filename = "backprop_cuda.cu"
 target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"
 %struct.__cuda_builtin_blockIdx_t = type { i8 }
 %struct.__cuda_builtin_threadIdx_t = type { i8 }
 %struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
 $_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv = comdat any
 $_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
 $_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv = comdat any
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
@_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE10input_node = internal addrspace(3) global [16 x float] undef, align 4
@_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix = internal addrspace(3) global [16 x [16 x float]] undef, align 4
@"$str" = private addrspace(1) constant [11 x i8] c"__CUDA_FTZ\00"
 ; Weak device-side definition of cudaMalloc(p, s): it only spills both
 ; arguments to stack slots and returns the constant 999 (presumably an
 ; "unknown error" code - confirm against the CUDA headers).
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
 entry:
  %p.addr = alloca i8**, align 8
  %s.addr = alloca i64, align 8
  store i8** %p, i8*** %p.addr, align 8
  store i64 %s, i64* %s.addr, align 8
  ret i32 999
 }
 ; Weak device-side definition of cudaFuncGetAttributes(p, c): the arguments
 ; are stored to stack slots, nothing is written through %p, and the constant
 ; 999 is returned.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
 entry:
  %p.addr = alloca %struct.cudaFuncAttributes*, align 8
  %c.addr = alloca i8*, align 8
  store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
  store i8* %c, i8** %c.addr, align 8
  ret i32 999
 }
 ; Weak device-side definition of cudaDeviceGetAttribute(value, attr, device):
 ; the arguments are stored to stack slots, *value is left untouched, and the
 ; constant 999 is returned.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
 entry:
  %value.addr = alloca i32*, align 8
  %attr.addr = alloca i32, align 4
  %device.addr = alloca i32, align 4
  store i32* %value, i32** %value.addr, align 8
  store i32 %attr, i32* %attr.addr, align 4
  store i32 %device, i32* %device.addr, align 4
  ret i32 999
 }
 ; Weak device-side definition of cudaGetDevice(device): the pointer is stored
 ; to a stack slot, *device is left untouched, and the constant 999 is
 ; returned.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
 entry:
  %device.addr = alloca i32*, align 8
  store i32* %device, i32** %device.addr, align 8
  ret i32 999
 }
 ; Weak device-side definition of
 ; cudaOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, func, blockSize,
 ; dynamicSmemSize): the arguments are stored to stack slots, *numBlocks is
 ; left untouched, and the constant 999 is returned.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
 entry:
  %numBlocks.addr = alloca i32*, align 8
  %func.addr = alloca i8*, align 8
  %blockSize.addr = alloca i32, align 4
  %dynamicSmemSize.addr = alloca i64, align 8
  store i32* %numBlocks, i32** %numBlocks.addr, align 8
  store i8* %func, i8** %func.addr, align 8
  store i32 %blockSize, i32* %blockSize.addr, align 4
  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
  ret i32 999
 }
 ; Weak device-side definition of
 ; cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, func,
 ; blockSize, dynamicSmemSize, flags): the arguments are stored to stack
 ; slots, *numBlocks is left untouched, and the constant 999 is returned.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
 entry:
  %numBlocks.addr = alloca i32*, align 8
  %func.addr = alloca i8*, align 8
  %blockSize.addr = alloca i32, align 4
  %dynamicSmemSize.addr = alloca i64, align 8
  %flags.addr = alloca i32, align 4
  store i32* %numBlocks, i32** %numBlocks.addr, align 8
  store i8* %func, i8** %func.addr, align 8
  store i32 %blockSize, i32* %blockSize.addr, align 4
  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
  store i32 %flags, i32* %flags.addr, align 4
  ret i32 999
 }
 ; Kernel bpnn_layerforward_CUDA(input_cuda, output_hidden_cuda,
 ; input_hidden_cuda, hidden_partial_sum, in, hid); it is marked as a kernel by
 ; the !nvvm.annotations entry that names this function.
 ; The code is unoptimized (optnone): every argument and local lives in an
 ; alloca and is reloaded before each use. %output_hidden_cuda and %in are
 ; stored to their slots but never read again in this function.
 ; Two addrspace(3) arrays are used as scratch: input_node[16] and
 ; weight_matrix[16][16].
 ; Function Attrs: convergent noinline nounwind optnone
 define dso_local void @_Z22bpnn_layerforward_CUDAPfS_S_S_ii(float* %input_cuda, float* %output_hidden_cuda, float* %input_hidden_cuda, float* %hidden_partial_sum, i32 %in, i32 %hid) #0 {
 entry:
  %input_cuda.addr = alloca float*, align 8
  %output_hidden_cuda.addr = alloca float*, align 8
  %input_hidden_cuda.addr = alloca float*, align 8
  %hidden_partial_sum.addr = alloca float*, align 8
  %in.addr = alloca i32, align 4
  %hid.addr = alloca i32, align 4
  %by = alloca i32, align 4
  %tx = alloca i32, align 4
  %ty = alloca i32, align 4
  %index = alloca i32, align 4
  %index_in = alloca i32, align 4
  %i = alloca i32, align 4
  %power_two = alloca i32, align 4
  store float* %input_cuda, float** %input_cuda.addr, align 8
  store float* %output_hidden_cuda, float** %output_hidden_cuda.addr, align 8
  store float* %input_hidden_cuda, float** %input_hidden_cuda.addr, align 8
  store float* %hidden_partial_sum, float** %hidden_partial_sum.addr, align 8
  store i32 %in, i32* %in.addr, align 4
  store i32 %hid, i32* %hid.addr, align 4
  ; by = ctaid.y, tx = tid.x, ty = tid.y
  %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #2
  store i32 %call, i32* %by, align 4
  %call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
  store i32 %call1, i32* %tx, align 4
  %call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #2
  store i32 %call2, i32* %ty, align 4
  ; index = (hid + 1) * 16 * by + (hid + 1) * ty + tx + 1 + (hid + 1)
  %0 = load i32, i32* %hid.addr, align 4
  %add = add nsw i32 %0, 1
  %mul = mul nsw i32 %add, 16
  %1 = load i32, i32* %by, align 4
  %mul3 = mul nsw i32 %mul, %1
  %2 = load i32, i32* %hid.addr, align 4
  %add4 = add nsw i32 %2, 1
  %3 = load i32, i32* %ty, align 4
  %mul5 = mul nsw i32 %add4, %3
  %add6 = add nsw i32 %mul3, %mul5
  %4 = load i32, i32* %tx, align 4
  %add7 = add nsw i32 %add6, %4
  %add8 = add nsw i32 %add7, 1
  %5 = load i32, i32* %hid.addr, align 4
  %add9 = add nsw i32 %5, 1
  %add10 = add nsw i32 %add8, %add9
  store i32 %add10, i32* %index, align 4
  ; index_in = 16 * by + ty + 1
  %6 = load i32, i32* %by, align 4
  %mul11 = mul nsw i32 16, %6
  %7 = load i32, i32* %ty, align 4
  %add12 = add nsw i32 %mul11, %7
  %add13 = add nsw i32 %add12, 1
  store i32 %add13, i32* %index_in, align 4
  ; only threads with tx == 0 load the input value
  %8 = load i32, i32* %tx, align 4
  %cmp = icmp eq i32 %8, 0
  br i1 %cmp, label %if.then, label %if.end
 if.then:                                          ; preds = %entry
  ; input_node[ty] = input_cuda[index_in]
  %9 = load float*, float** %input_cuda.addr, align 8
  %10 = load i32, i32* %index_in, align 4
  %idxprom = sext i32 %10 to i64
  %arrayidx = getelementptr inbounds float, float* %9, i64 %idxprom
  %11 = load float, float* %arrayidx, align 4
  %12 = load i32, i32* %ty, align 4
  %idxprom14 = sext i32 %12 to i64
  %arrayidx15 = getelementptr inbounds [16 x float], [16 x float]* addrspacecast ([16 x float] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE10input_node to [16 x float]*), i64 0, i64 %idxprom14
  store float %11, float* %arrayidx15, align 4
  br label %if.end
 if.end:                                           ; preds = %if.then, %entry
  call void @llvm.nvvm.barrier0()
  ; weight_matrix[ty][tx] = input_hidden_cuda[index]
  %13 = load float*, float** %input_hidden_cuda.addr, align 8
  %14 = load i32, i32* %index, align 4
  %idxprom16 = sext i32 %14 to i64
  %arrayidx17 = getelementptr inbounds float, float* %13, i64 %idxprom16
  %15 = load float, float* %arrayidx17, align 4
  %16 = load i32, i32* %ty, align 4
  %idxprom18 = sext i32 %16 to i64
  %arrayidx19 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom18
  %17 = load i32, i32* %tx, align 4
  %idxprom20 = sext i32 %17 to i64
  %arrayidx21 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx19, i64 0, i64 %idxprom20
  store float %15, float* %arrayidx21, align 4
  call void @llvm.nvvm.barrier0()
  ; weight_matrix[ty][tx] = weight_matrix[ty][tx] * input_node[ty]
  %18 = load i32, i32* %ty, align 4
  %idxprom22 = sext i32 %18 to i64
  %arrayidx23 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom22
  %19 = load i32, i32* %tx, align 4
  %idxprom24 = sext i32 %19 to i64
  %arrayidx25 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx23, i64 0, i64 %idxprom24
  %20 = load float, float* %arrayidx25, align 4
  %21 = load i32, i32* %ty, align 4
  %idxprom26 = sext i32 %21 to i64
  %arrayidx27 = getelementptr inbounds [16 x float], [16 x float]* addrspacecast ([16 x float] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE10input_node to [16 x float]*), i64 0, i64 %idxprom26
  %22 = load float, float* %arrayidx27, align 4
  %mul28 = fmul contract float %20, %22
  %23 = load i32, i32* %ty, align 4
  %idxprom29 = sext i32 %23 to i64
  %arrayidx30 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom29
  %24 = load i32, i32* %tx, align 4
  %idxprom31 = sext i32 %24 to i64
  %arrayidx32 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx30, i64 0, i64 %idxprom31
  store float %mul28, float* %arrayidx32, align 4
  call void @llvm.nvvm.barrier0()
  ; loop counter i starts at 1
  store i32 1, i32* %i, align 4
  br label %for.cond
 for.cond:                                         ; preds = %for.inc, %if.end
  ; continue while (float)i <= __log2f(16.0); the bound is recomputed on
  ; every trip through the loop header
  %25 = load i32, i32* %i, align 4
  %conv = sitofp i32 %25 to float
  %call33 = call float @_ZL7__log2ff(float 1.600000e+01) #2
  %cmp34 = fcmp ole float %conv, %call33
  br i1 %cmp34, label %for.body, label %for.end
 for.body:                                         ; preds = %for.cond
  ; power_two = (int)__powf(2.0, (float)i); rows with ty % power_two == 0
  ; take part in this step
  %26 = load i32, i32* %i, align 4
  %conv35 = sitofp i32 %26 to float
  %call36 = call float @_ZL6__powfff(float 2.000000e+00, float %conv35) #2
  %conv37 = fptosi float %call36 to i32
  store i32 %conv37, i32* %power_two, align 4
  %27 = load i32, i32* %ty, align 4
  %28 = load i32, i32* %power_two, align 4
  %rem = srem i32 %27, %28
  %cmp38 = icmp eq i32 %rem, 0
  br i1 %cmp38, label %if.then39, label %if.end54
 if.then39:                                        ; preds = %for.body
  ; weight_matrix[ty][tx] += weight_matrix[ty + power_two / 2][tx]
  %29 = load i32, i32* %ty, align 4
  %idxprom40 = sext i32 %29 to i64
  %arrayidx41 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom40
  %30 = load i32, i32* %tx, align 4
  %idxprom42 = sext i32 %30 to i64
  %arrayidx43 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx41, i64 0, i64 %idxprom42
  %31 = load float, float* %arrayidx43, align 4
  %32 = load i32, i32* %ty, align 4
  %33 = load i32, i32* %power_two, align 4
  %div = sdiv i32 %33, 2
  %add44 = add nsw i32 %32, %div
  %idxprom45 = sext i32 %add44 to i64
  %arrayidx46 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom45
  %34 = load i32, i32* %tx, align 4
  %idxprom47 = sext i32 %34 to i64
  %arrayidx48 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx46, i64 0, i64 %idxprom47
  %35 = load float, float* %arrayidx48, align 4
  %add49 = fadd contract float %31, %35
  %36 = load i32, i32* %ty, align 4
  %idxprom50 = sext i32 %36 to i64
  %arrayidx51 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom50
  %37 = load i32, i32* %tx, align 4
  %idxprom52 = sext i32 %37 to i64
  %arrayidx53 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx51, i64 0, i64 %idxprom52
  store float %add49, float* %arrayidx53, align 4
  br label %if.end54
 if.end54:                                         ; preds = %if.then39, %for.body
  ; the barrier is reached by every thread, whether or not it added
  call void @llvm.nvvm.barrier0()
  br label %for.inc
 for.inc:                                          ; preds = %if.end54
  %38 = load i32, i32* %i, align 4
  %inc = add nsw i32 %38, 1
  store i32 %inc, i32* %i, align 4
  br label %for.cond
 for.end:                                          ; preds = %for.cond
  ; input_hidden_cuda[index] = weight_matrix[ty][tx]
  %39 = load i32, i32* %ty, align 4
  %idxprom55 = sext i32 %39 to i64
  %arrayidx56 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom55
  %40 = load i32, i32* %tx, align 4
  %idxprom57 = sext i32 %40 to i64
  %arrayidx58 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx56, i64 0, i64 %idxprom57
  %41 = load float, float* %arrayidx58, align 4
  %42 = load float*, float** %input_hidden_cuda.addr, align 8
  %43 = load i32, i32* %index, align 4
  %idxprom59 = sext i32 %43 to i64
  %arrayidx60 = getelementptr inbounds float, float* %42, i64 %idxprom59
  store float %41, float* %arrayidx60, align 4
  call void @llvm.nvvm.barrier0()
  ; only threads with tx == 0 publish a partial sum
  %44 = load i32, i32* %tx, align 4
  %cmp61 = icmp eq i32 %44, 0
  br i1 %cmp61, label %if.then62, label %if.end71
 if.then62:                                        ; preds = %for.end
  ; hidden_partial_sum[by * hid + ty] = weight_matrix[tx][ty]
  ; (note the index order here: tx selects the row, ty the column)
  %45 = load i32, i32* %tx, align 4
  %idxprom63 = sext i32 %45 to i64
  %arrayidx64 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom63
  %46 = load i32, i32* %ty, align 4
  %idxprom65 = sext i32 %46 to i64
  %arrayidx66 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx64, i64 0, i64 %idxprom65
  %47 = load float, float* %arrayidx66, align 4
  %48 = load float*, float** %hidden_partial_sum.addr, align 8
  %49 = load i32, i32* %by, align 4
  %50 = load i32, i32* %hid.addr, align 4
  %mul67 = mul nsw i32 %49, %50
  %51 = load i32, i32* %ty, align 4
  %add68 = add nsw i32 %mul67, %51
  %idxprom69 = sext i32 %add68 to i64
  %arrayidx70 = getelementptr inbounds float, float* %48, i64 %idxprom69
  store float %47, float* %arrayidx70, align 4
  br label %if.end71
 if.end71:                                         ; preds = %if.then62, %for.end
  ret void
 }
 ; Accessor for blockIdx.y: returns the value of the ctaid.y special register.
 ; Function Attrs: alwaysinline convergent nounwind
 define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
 entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
  ret i32 %0
 }
 ; Accessor for threadIdx.x: returns the value of the tid.x special register.
 ; Function Attrs: alwaysinline convergent nounwind
 define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
 entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  ret i32 %0
 }
 ; Accessor for threadIdx.y: returns the value of the tid.y special register.
 ; Function Attrs: alwaysinline convergent nounwind
 define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
 entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
  ret i32 %0
 }
 ; Function Attrs: convergent nounwind
 declare void @llvm.nvvm.barrier0() #2
 ; __log2f(a): thin wrapper that spills its argument to a stack slot and
 ; forwards it to __nv_fast_log2f.
 ; Function Attrs: alwaysinline convergent nounwind
 define internal float @_ZL7__log2ff(float %__a) #1 {
 entry:
  %__a.addr = alloca float, align 4
  store float %__a, float* %__a.addr, align 4
  %0 = load float, float* %__a.addr, align 4
  %call = call float @__nv_fast_log2f(float %0) #2
  ret float %call
 }
 ; __powf(a, b): thin wrapper that spills both arguments to stack slots and
 ; forwards them, in the same order, to __nv_fast_powf.
 ; Function Attrs: alwaysinline convergent nounwind
 define internal float @_ZL6__powfff(float %__a, float %__b) #1 {
 entry:
  %__a.addr = alloca float, align 4
  %__b.addr = alloca float, align 4
  store float %__a, float* %__a.addr, align 4
  store float %__b, float* %__b.addr, align 4
  %0 = load float, float* %__a.addr, align 4
  %1 = load float, float* %__b.addr, align 4
  %call = call float @__nv_fast_powf(float %0, float %1) #2
  ret float %call
 }
 ; Kernel bpnn_adjust_weights_cuda(delta, hid, ly, in, w, oldw); it is marked
 ; as a kernel by the !nvvm.annotations entry that names this function.
 ; The code is unoptimized (optnone): every argument and local lives in an
 ; alloca and is reloaded before each use. %in is stored to its slot but never
 ; read again in this function.
 ; All arithmetic on the 0.3 constants is done in double (the float operands
 ; are extended with fpext) and the results are truncated back to float.
 ; Function Attrs: convergent noinline nounwind optnone
 define dso_local void @_Z24bpnn_adjust_weights_cudaPfiS_iS_S_(float* %delta, i32 %hid, float* %ly, i32 %in, float* %w, float* %oldw) #0 {
 entry:
  %delta.addr = alloca float*, align 8
  %hid.addr = alloca i32, align 4
  %ly.addr = alloca float*, align 8
  %in.addr = alloca i32, align 4
  %w.addr = alloca float*, align 8
  %oldw.addr = alloca float*, align 8
  %by = alloca i32, align 4
  %tx = alloca i32, align 4
  %ty = alloca i32, align 4
  %index = alloca i32, align 4
  %index_y = alloca i32, align 4
  %index_x = alloca i32, align 4
  store float* %delta, float** %delta.addr, align 8
  store i32 %hid, i32* %hid.addr, align 4
  store float* %ly, float** %ly.addr, align 8
  store i32 %in, i32* %in.addr, align 4
  store float* %w, float** %w.addr, align 8
  store float* %oldw, float** %oldw.addr, align 8
  ; by = ctaid.y, tx = tid.x, ty = tid.y
  %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #2
  store i32 %call, i32* %by, align 4
  %call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
  store i32 %call1, i32* %tx, align 4
  %call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #2
  store i32 %call2, i32* %ty, align 4
  ; index = (hid + 1) * 16 * by + (hid + 1) * ty + tx + 1 + (hid + 1)
  %0 = load i32, i32* %hid.addr, align 4
  %add = add nsw i32 %0, 1
  %mul = mul nsw i32 %add, 16
  %1 = load i32, i32* %by, align 4
  %mul3 = mul nsw i32 %mul, %1
  %2 = load i32, i32* %hid.addr, align 4
  %add4 = add nsw i32 %2, 1
  %3 = load i32, i32* %ty, align 4
  %mul5 = mul nsw i32 %add4, %3
  %add6 = add nsw i32 %mul3, %mul5
  %4 = load i32, i32* %tx, align 4
  %add7 = add nsw i32 %add6, %4
  %add8 = add nsw i32 %add7, 1
  %5 = load i32, i32* %hid.addr, align 4
  %add9 = add nsw i32 %5, 1
  %add10 = add nsw i32 %add8, %add9
  store i32 %add10, i32* %index, align 4
  ; index_y = 16 * by + ty + 1
  %6 = load i32, i32* %by, align 4
  %mul11 = mul nsw i32 16, %6
  %7 = load i32, i32* %ty, align 4
  %add12 = add nsw i32 %mul11, %7
  %add13 = add nsw i32 %add12, 1
  store i32 %add13, i32* %index_y, align 4
  ; index_x = tx + 1
  %8 = load i32, i32* %tx, align 4
  %add14 = add nsw i32 %8, 1
  store i32 %add14, i32* %index_x, align 4
  ; w[index] += 0.3 * delta[index_x] * ly[index_y] + 0.3 * oldw[index]
  %9 = load float*, float** %delta.addr, align 8
  %10 = load i32, i32* %index_x, align 4
  %idxprom = sext i32 %10 to i64
  %arrayidx = getelementptr inbounds float, float* %9, i64 %idxprom
  %11 = load float, float* %arrayidx, align 4
  %conv = fpext float %11 to double
  %mul15 = fmul contract double 3.000000e-01, %conv
  %12 = load float*, float** %ly.addr, align 8
  %13 = load i32, i32* %index_y, align 4
  %idxprom16 = sext i32 %13 to i64
  %arrayidx17 = getelementptr inbounds float, float* %12, i64 %idxprom16
  %14 = load float, float* %arrayidx17, align 4
  %conv18 = fpext float %14 to double
  %mul19 = fmul contract double %mul15, %conv18
  %15 = load float*, float** %oldw.addr, align 8
  %16 = load i32, i32* %index, align 4
  %idxprom20 = sext i32 %16 to i64
  %arrayidx21 = getelementptr inbounds float, float* %15, i64 %idxprom20
  %17 = load float, float* %arrayidx21, align 4
  %conv22 = fpext float %17 to double
  %mul23 = fmul contract double 3.000000e-01, %conv22
  %add24 = fadd contract double %mul19, %mul23
  %18 = load float*, float** %w.addr, align 8
  %19 = load i32, i32* %index, align 4
  %idxprom25 = sext i32 %19 to i64
  %arrayidx26 = getelementptr inbounds float, float* %18, i64 %idxprom25
  %20 = load float, float* %arrayidx26, align 4
  %conv27 = fpext float %20 to double
  %add28 = fadd contract double %conv27, %add24
  %conv29 = fptrunc double %add28 to float
  store float %conv29, float* %arrayidx26, align 4
  ; oldw[index] = 0.3 * delta[index_x] * ly[index_y] + 0.3 * oldw[index]
  ; (the same expression is recomputed from freshly loaded operands)
  %21 = load float*, float** %delta.addr, align 8
  %22 = load i32, i32* %index_x, align 4
  %idxprom30 = sext i32 %22 to i64
  %arrayidx31 = getelementptr inbounds float, float* %21, i64 %idxprom30
  %23 = load float, float* %arrayidx31, align 4
  %conv32 = fpext float %23 to double
  %mul33 = fmul contract double 3.000000e-01, %conv32
  %24 = load float*, float** %ly.addr, align 8
  %25 = load i32, i32* %index_y, align 4
  %idxprom34 = sext i32 %25 to i64
  %arrayidx35 = getelementptr inbounds float, float* %24, i64 %idxprom34
  %26 = load float, float* %arrayidx35, align 4
  %conv36 = fpext float %26 to double
  %mul37 = fmul contract double %mul33, %conv36
  %27 = load float*, float** %oldw.addr, align 8
  %28 = load i32, i32* %index, align 4
  %idxprom38 = sext i32 %28 to i64
  %arrayidx39 = getelementptr inbounds float, float* %27, i64 %idxprom38
  %29 = load float, float* %arrayidx39, align 4
  %conv40 = fpext float %29 to double
  %mul41 = fmul contract double 3.000000e-01, %conv40
  %add42 = fadd contract double %mul37, %mul41
  %conv43 = fptrunc double %add42 to float
  %30 = load float*, float** %oldw.addr, align 8
  %31 = load i32, i32* %index, align 4
  %idxprom44 = sext i32 %31 to i64
  %arrayidx45 = getelementptr inbounds float, float* %30, i64 %idxprom44
  store float %conv43, float* %arrayidx45, align 4
  call void @llvm.nvvm.barrier0()
  ; the extra update below runs only when ty == 0 and by == 0
  %32 = load i32, i32* %ty, align 4
  %cmp = icmp eq i32 %32, 0
  br i1 %cmp, label %land.lhs.true, label %if.end
 land.lhs.true:                                    ; preds = %entry
  %33 = load i32, i32* %by, align 4
  %cmp46 = icmp eq i32 %33, 0
  br i1 %cmp46, label %if.then, label %if.end
 if.then:                                          ; preds = %land.lhs.true
  ; w[index_x] += 0.3 * delta[index_x] + 0.3 * oldw[index_x]
  %34 = load float*, float** %delta.addr, align 8
  %35 = load i32, i32* %index_x, align 4
  %idxprom47 = sext i32 %35 to i64
  %arrayidx48 = getelementptr inbounds float, float* %34, i64 %idxprom47
  %36 = load float, float* %arrayidx48, align 4
  %conv49 = fpext float %36 to double
  %mul50 = fmul contract double 3.000000e-01, %conv49
  %37 = load float*, float** %oldw.addr, align 8
  %38 = load i32, i32* %index_x, align 4
  %idxprom51 = sext i32 %38 to i64
  %arrayidx52 = getelementptr inbounds float, float* %37, i64 %idxprom51
  %39 = load float, float* %arrayidx52, align 4
  %conv53 = fpext float %39 to double
  %mul54 = fmul contract double 3.000000e-01, %conv53
  %add55 = fadd contract double %mul50, %mul54
  %40 = load float*, float** %w.addr, align 8
  %41 = load i32, i32* %index_x, align 4
  %idxprom56 = sext i32 %41 to i64
  %arrayidx57 = getelementptr inbounds float, float* %40, i64 %idxprom56
  %42 = load float, float* %arrayidx57, align 4
  %conv58 = fpext float %42 to double
  %add59 = fadd contract double %conv58, %add55
  %conv60 = fptrunc double %add59 to float
  store float %conv60, float* %arrayidx57, align 4
  ; oldw[index_x] = 0.3 * delta[index_x] + 0.3 * oldw[index_x]
  %43 = load float*, float** %delta.addr, align 8
  %44 = load i32, i32* %index_x, align 4
  %idxprom61 = sext i32 %44 to i64
  %arrayidx62 = getelementptr inbounds float, float* %43, i64 %idxprom61
  %45 = load float, float* %arrayidx62, align 4
  %conv63 = fpext float %45 to double
  %mul64 = fmul contract double 3.000000e-01, %conv63
  %46 = load float*, float** %oldw.addr, align 8
  %47 = load i32, i32* %index_x, align 4
  %idxprom65 = sext i32 %47 to i64
  %arrayidx66 = getelementptr inbounds float, float* %46, i64 %idxprom65
  %48 = load float, float* %arrayidx66, align 4
  %conv67 = fpext float %48 to double
  %mul68 = fmul contract double 3.000000e-01, %conv67
  %add69 = fadd contract double %mul64, %mul68
  %conv70 = fptrunc double %add69 to float
  %49 = load float*, float** %oldw.addr, align 8
  %50 = load i32, i32* %index_x, align 4
  %idxprom71 = sext i32 %50 to i64
  %arrayidx72 = getelementptr inbounds float, float* %49, i64 %idxprom71
  store float %conv70, float* %arrayidx72, align 4
  br label %if.end
 if.end:                                           ; preds = %if.then, %land.lhs.true, %entry
  ret void
 }
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #3
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #3
 ; __nv_fast_log2f(a): approximate base-2 logarithm. __nvvm_reflect is queried
 ; with the "__CUDA_FTZ" string; a non-zero answer selects the .ftz variant of
 ; the lg2.approx intrinsic, zero selects the plain variant.
 ; Function Attrs: alwaysinline convergent inlinehint nounwind
 define internal float @__nv_fast_log2f(float %a) #4 {
  %call.i = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*))
  %1 = icmp ne i32 %call.i, 0
  br i1 %1, label %2, label %4
 2:                                                ; preds = %0
  %3 = call float @llvm.nvvm.lg2.approx.ftz.f(float %a)
  br label %__nvvm_builtin_log2f.exit
 4:                                                ; preds = %0
  %5 = call float @llvm.nvvm.lg2.approx.f(float %a)
  br label %__nvvm_builtin_log2f.exit
 __nvvm_builtin_log2f.exit:                        ; preds = %4, %2
  %retval.0.i = phi float [ %3, %2 ], [ %5, %4 ]
  ret float %retval.0.i
 }
 ; Function Attrs: convergent nounwind
 declare i32 @__nvvm_reflect(i8*) #5
 ; Function Attrs: nounwind readnone
 declare float @llvm.nvvm.lg2.approx.ftz.f(float) #3
 ; Function Attrs: nounwind readnone
 declare float @llvm.nvvm.lg2.approx.f(float) #3
 ; __nv_fast_powf(a, b): approximate power computed as ex2(b * lg2(a)).
 ; Both steps query __nvvm_reflect with the "__CUDA_FTZ" string and pick the
 ; .ftz variant of the approx intrinsic when the answer is non-zero.
 ; Function Attrs: alwaysinline convergent inlinehint nounwind
 define internal float @__nv_fast_powf(float %a, float %b) #4 {
  ; step 1: lg2(a)
  %call.i.i = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*))
  %1 = icmp ne i32 %call.i.i, 0
  br i1 %1, label %2, label %4
 2:                                                ; preds = %0
  %3 = call float @llvm.nvvm.lg2.approx.ftz.f(float %a)
  br label %__nv_fast_log2f.exit
 4:                                                ; preds = %0
  %5 = call float @llvm.nvvm.lg2.approx.f(float %a)
  br label %__nv_fast_log2f.exit
 __nv_fast_log2f.exit:                             ; preds = %4, %2
  %retval.0.i.i = phi float [ %3, %2 ], [ %5, %4 ]
  ; step 2: ex2(b * lg2(a))
  %6 = fmul float %b, %retval.0.i.i
  %call.i.i1 = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*))
  %7 = icmp ne i32 %call.i.i1, 0
  br i1 %7, label %8, label %10
 8:                                                ; preds = %__nv_fast_log2f.exit
  %9 = call float @llvm.nvvm.ex2.approx.ftz.f(float %6)
  br label %__nv_exp2f.exit
 10:                                               ; preds = %__nv_fast_log2f.exit
  %11 = call float @llvm.nvvm.ex2.approx.f(float %6)
  br label %__nv_exp2f.exit
 __nv_exp2f.exit:                                  ; preds = %10, %8
  %retval.0.i.i2 = phi float [ %9, %8 ], [ %11, %10 ]
  ret float %retval.0.i.i2
 }
 ; Function Attrs: nounwind readnone
 declare float @llvm.nvvm.ex2.approx.ftz.f(float) #3
 ; Function Attrs: nounwind readnone
 declare float @llvm.nvvm.ex2.approx.f(float) #3
 attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #2 = { convergent nounwind }
 attributes #3 = { nounwind readnone }
 attributes #4 = { alwaysinline convergent inlinehint nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #5 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 !llvm.module.flags = !{!0, !1, !2}
 !nvvm.annotations = !{!3, !4, !5, !6, !5, !7, !7, !7, !7, !8, !8, !7}
 !llvm.ident = !{!9}
 !nvvmir.version = !{!10}
 !0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
 !1 = !{i32 1, !"wchar_size", i32 4}
 !2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
 !3 = !{void (float*, float*, float*, float*, i32, i32)* @_Z22bpnn_layerforward_CUDAPfS_S_S_ii, !"kernel", i32 1}
 !4 = !{void (float*, i32, float*, i32, float*, float*)* @_Z24bpnn_adjust_weights_cudaPfiS_iS_S_, !"kernel", i32 1}
 !5 = !{null, !"align", i32 8}
 !6 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
 !7 = !{null, !"align", i32 16}
 !8 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
 !9 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
 !10 = !{i32 1, i32 4}
--- a/examples/backprop/backprop_cuda-host-x86_64-unknown-linux-gnu.ll
+++ b/examples/backprop/backprop_cuda-host-x86_64-unknown-linux-gnu.ll
--- a/examples/backprop/backprop_cuda.cu
+++ b/examples/backprop/backprop_cuda.cu
@ -0,0 +1,195 @@
 #include <cuda.h>
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/time.h>
 // includes, kernels
 #include "backprop.h"
 #include "backprop_cuda_kernel.cu"
 ////////////////////////////////////////////////////////////////////////////////
 extern "C" void bpnn_layerforward(float *l1, float *l2, float **conn, int n1,
                                  int n2);
 extern "C" void bpnn_output_error(float *delta, float *target, float *output,
                                  int nj, float *err);
 extern "C" void bpnn_hidden_error(float *delta_h, int nh, float *delta_o,
                                  int no, float **who, float *hidden,
                                  float *err);
 extern "C" void bpnn_adjust_weights(float *delta, int ndelta, float *ly,
                                    int nly, float **w, float **oldw);
 extern "C" int setup(int argc, char **argv);
 extern "C" float **alloc_2d_dbl(int m, int n);
 extern "C" float squash(float x);
 // Returns the current wall-clock time in seconds, with the microsecond part
 // reported by gettimeofday() folded in as a fraction.
 double gettime() {
  struct timeval now;
  gettimeofday(&now, NULL);
  const double whole_seconds = now.tv_sec;
  const double micro_seconds = now.tv_usec;
  return whole_seconds + micro_seconds * 1e-6;
 }
 // Launch-configuration globals. num_blocks is assigned in bpnn_train_cuda()
 // (in / 16); num_threads is only initialised here and is not referenced by
 // the code visible in this file.
 unsigned int num_threads = 0;
 unsigned int num_blocks = 0;
 ////////////////////////////////////////////////////////////////////////////////
 // Program main
 ////////////////////////////////////////////////////////////////////////////////
 // Entry point: selects CUDA device 0, then forwards the command line to
 // setup().
 int main(int argc, char **argv) {
  cudaSetDevice(0);
  setup(argc, argv);
  return 0;
 }
 // Runs a single training iteration on `net`.
 //
 // With GPU defined, the input->hidden forward pass and the input-weight
 // adjustment are executed by the CUDA kernels bpnn_layerforward_CUDA and
 // bpnn_adjust_weights_cuda; with CPU defined, the host routines
 // bpnn_layerforward()/bpnn_adjust_weights() perform those two steps instead.
 // The hidden->output pass, both error computations and the hidden-weight
 // adjustment always run on the host.
 //
 // In the GPU path the adjusted input weights are copied back into a flat
 // temporary buffer and printed to stdout; they are not written back into
 // net->input_weights.
 //
 // net: network to train.
 // eo:  if non-NULL, receives the error computed by bpnn_output_error().
 // eh:  if non-NULL, receives the error computed by bpnn_hidden_error().
 extern "C" void bpnn_train_cuda(BPNN *net, float *eo, float *eh) {
  int in, hid, out;
  float out_err, hid_err;
  in = net->input_n;
  hid = net->hidden_n;
  out = net->output_n;
 #ifdef GPU
  int m = 0;
  float *input_hidden_cuda;
  float *input_cuda;
  float *output_hidden_cuda;
  float *partial_sum;
  float *hidden_partial_sum;
  float *hidden_delta_cuda;
  float *input_prev_weights_cuda;
  float sum;
  float *input_weights_one_dim;
  float *input_weights_prev_one_dim;
  // One block per 16 input units, stacked along the grid's y dimension.
  num_blocks = in / 16;
  dim3 grid(1, num_blocks);
  dim3 threads(16, 16);
  input_weights_one_dim = (float *)malloc((in + 1) * (hid + 1) * sizeof(float));
  input_weights_prev_one_dim =
      (float *)malloc((in + 1) * (hid + 1) * sizeof(float));
  partial_sum = (float *)malloc(num_blocks * WIDTH * sizeof(float));
  // Fail loudly instead of dereferencing a null buffer below.
  if (input_weights_one_dim == NULL || input_weights_prev_one_dim == NULL ||
      partial_sum == NULL) {
    fprintf(stderr, "bpnn_train_cuda: host allocation failed\n");
    exit(EXIT_FAILURE);
  }
  // this preprocessing stage is added to correct the bugs of wrong memcopy
  // using two-dimensional net->inputweights
  for (int k = 0; k <= in; k++) {
    for (int j = 0; j <= hid; j++) {
      input_weights_one_dim[m] = net->input_weights[k][j];
      input_weights_prev_one_dim[m] = net->input_prev_weights[k][j];
      m++;
    }
  }
  cudaMalloc((void **)&input_cuda, (in + 1) * sizeof(float));
  cudaMalloc((void **)&output_hidden_cuda, (hid + 1) * sizeof(float));
  cudaMalloc((void **)&input_hidden_cuda, (in + 1) * (hid + 1) * sizeof(float));
  cudaMalloc((void **)&hidden_partial_sum, num_blocks * WIDTH * sizeof(float));
 #endif
 #ifdef CPU
  printf("Performing CPU computation\n");
  bpnn_layerforward(net->input_units, net->hidden_units, net->input_weights, in,
                    hid);
 #endif
 #ifdef GPU
  printf("Performing GPU computation\n");
  // printf("in= %d, hid = %d, numblocks = %d\n", in, hid, num_blocks);
  cudaMemcpy(input_cuda, net->input_units, (in + 1) * sizeof(float),
             cudaMemcpyHostToDevice);
  cudaMemcpy(input_hidden_cuda, input_weights_one_dim,
             (in + 1) * (hid + 1) * sizeof(float), cudaMemcpyHostToDevice);
  bpnn_layerforward_CUDA<<<grid, threads>>>(input_cuda, output_hidden_cuda,
                                            input_hidden_cuda,
                                            hidden_partial_sum, in, hid);
  // cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is its
  // replacement.
  cudaDeviceSynchronize();
  cudaError_t error = cudaGetLastError();
  if (error != cudaSuccess) {
    printf("bpnn kernel error: %s\n", cudaGetErrorString(error));
    exit(EXIT_FAILURE);
  }
  cudaMemcpy(partial_sum, hidden_partial_sum,
             num_blocks * WIDTH * sizeof(float), cudaMemcpyDeviceToHost);
  // Combine the per-block partial sums, add the weight stored in row 0 of
  // input_weights, and apply the logistic function.
  for (int j = 1; j <= hid; j++) {
    sum = 0.0;
    for (int k = 0; k < num_blocks; k++) {
      sum += partial_sum[k * hid + j - 1];
    }
    sum += net->input_weights[0][j];
    net->hidden_units[j] = float(1.0 / (1.0 + exp(-sum)));
  }
 #endif
  bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights,
                    hid, out);
  bpnn_output_error(net->output_delta, net->target, net->output_units, out,
                    &out_err);
  bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out,
                    net->hidden_weights, net->hidden_units, &hid_err);
  bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid,
                      net->hidden_weights, net->hidden_prev_weights);
 #ifdef CPU
  bpnn_adjust_weights(net->hidden_delta, hid, net->input_units, in,
                      net->input_weights, net->input_prev_weights);
 #endif
 #ifdef GPU
  cudaMalloc((void **)&hidden_delta_cuda, (hid + 1) * sizeof(float));
  cudaMalloc((void **)&input_prev_weights_cuda,
             (in + 1) * (hid + 1) * sizeof(float));
  cudaMemcpy(hidden_delta_cuda, net->hidden_delta, (hid + 1) * sizeof(float),
             cudaMemcpyHostToDevice);
  cudaMemcpy(input_prev_weights_cuda, input_weights_prev_one_dim,
             (in + 1) * (hid + 1) * sizeof(float), cudaMemcpyHostToDevice);
  // The forward kernel overwrote the device weight buffer, so upload the
  // original weights again before adjusting them.
  cudaMemcpy(input_hidden_cuda, input_weights_one_dim,
             (in + 1) * (hid + 1) * sizeof(float), cudaMemcpyHostToDevice);
  bpnn_adjust_weights_cuda<<<grid, threads>>>(hidden_delta_cuda, hid,
                                              input_cuda, in, input_hidden_cuda,
                                              input_prev_weights_cuda);
  cudaMemcpy(net->input_units, input_cuda, (in + 1) * sizeof(float),
             cudaMemcpyDeviceToHost);
  cudaMemcpy(input_weights_one_dim, input_hidden_cuda,
             (in + 1) * (hid + 1) * sizeof(float), cudaMemcpyDeviceToHost);
  // Dump every adjusted weight on one line of stdout.
  for (int i = 0; i < (in + 1) * (hid + 1); i++) {
    printf("%f ", input_weights_one_dim[i]);
  }
  printf("\n");
  cudaFree(input_cuda);
  cudaFree(output_hidden_cuda);
  cudaFree(input_hidden_cuda);
  cudaFree(hidden_partial_sum);
  cudaFree(input_prev_weights_cuda);
  cudaFree(hidden_delta_cuda);
  free(partial_sum);
  free(input_weights_one_dim);
  free(input_weights_prev_one_dim);
 #endif
  // Hand the computed errors back through the out-parameters; previously they
  // were computed and then dropped.
  if (eo != NULL)
    *eo = out_err;
  if (eh != NULL)
    *eh = hid_err;
 }
--- a/examples/backprop/backprop_cuda_kernel.cu
+++ b/examples/backprop/backprop_cuda_kernel.cu
@ -0,0 +1,96 @@
 #ifndef _BACKPROP_CUDA_KERNEL_H_
 #define _BACKPROP_CUDA_KERNEL_H_
 #include "backprop.h"
 #include "cuda.h"
 #include "math.h"
 #include <stdio.h>
 // Forward-pass kernel for one tile of the input->hidden weight matrix.
 //
 // Thread (tx, ty) of block `by` handles flat element
 //   (hid + 1) * (HEIGHT * by + ty + 1) + (tx + 1)
 // of input_hidden_cuda, i.e. row HEIGHT*by+ty+1, column tx+1 of a row-major
 // matrix whose rows are hid + 1 elements long. Each element is multiplied by
 // input_cuda[HEIGHT * by + ty + 1], and the products are summed over ty in
 // shared memory.
 //
 // input_cuda:         input values, read at index HEIGHT * by + ty + 1.
 // output_hidden_cuda: not used by this kernel.
 // input_hidden_cuda:  flat weights; the per-thread element is overwritten
 //                     with the thread's value after the reduction.
 // hidden_partial_sum: receives the block's column sums at by * hid + ty.
 // in:                 not used by this kernel.
 // hid:                used for the row length (hid + 1) and output stride.
 __global__ void bpnn_layerforward_CUDA(float *input_cuda,
                                        float *output_hidden_cuda,
                                        float *input_hidden_cuda,
                                        float *hidden_partial_sum, int in,
                                        int hid) {
  int by = blockIdx.y;
  int tx = threadIdx.x;
  int ty = threadIdx.y;
  int index = (hid + 1) * HEIGHT * by + (hid + 1) * ty + tx + 1 + (hid + 1);
  int index_in = HEIGHT * by + ty + 1;
  __shared__ float input_node[HEIGHT];
  __shared__ float weight_matrix[HEIGHT][WIDTH];
  // One thread per row stages that row's input value.
  if (tx == 0)
    input_node[ty] = input_cuda[index_in];
  __syncthreads();
  weight_matrix[ty][tx] = input_hidden_cuda[index];
  __syncthreads();
  weight_matrix[ty][tx] = weight_matrix[ty][tx] * input_node[ty];
  __syncthreads();
  // Pairwise reduction over ty with strides 2, 4, ..., HEIGHT. The stride is
  // kept as an integer: deriving it from the approximate __log2f/__powf
  // intrinsics and truncating to int could yield a wrong bound or stride.
  for (int power_two = 2; power_two <= HEIGHT; power_two *= 2) {
    if (ty % power_two == 0)
      weight_matrix[ty][tx] =
          weight_matrix[ty][tx] + weight_matrix[ty + power_two / 2][tx];
    __syncthreads();
  }
  input_hidden_cuda[index] = weight_matrix[ty][tx];
  __syncthreads();
  // With tx == 0 this reads weight_matrix[0][ty]: row 0 holds the reduced sum
  // of each column, so thread (0, ty) publishes the sum for column ty.
  if (tx == 0) {
    hidden_partial_sum[by * hid + ty] = weight_matrix[tx][ty];
  }
 }
 // Weight-adjustment kernel. Each thread updates one element of the flat
 // weight array `w` and stores the applied change in `oldw`:
 //   change = ETA * delta[tx + 1] * ly[HEIGHT * by + ty + 1]
 //            + MOMENTUM * oldw[element]
 // Threads with ty == 0 in block row 0 additionally update element tx + 1,
 // using ETA * delta[tx + 1] without the `ly` factor.
 // `in` is not used by this kernel.
 __global__ void bpnn_adjust_weights_cuda(float *delta, int hid, float *ly,
                                          int in, float *w, float *oldw) {
  const int block_row = blockIdx.y;
  const int col = threadIdx.x;
  const int row = threadIdx.y;
  // Column index into delta (and into the first hid + 1 elements of w/oldw).
  const int unit = col + 1;
  // Index into ly for this thread's row.
  const int source = HEIGHT * block_row + row + 1;
  // Flat position of this thread's weight.
  const int cell =
      (hid + 1) * HEIGHT * block_row + (hid + 1) * row + col + 1 + (hid + 1);
  // eta = 0.3;
  // momentum = 0.3;
  w[cell] += ((ETA * delta[unit] * ly[source]) + (MOMENTUM * oldw[cell]));
  oldw[cell] = ((ETA * delta[unit] * ly[source]) + (MOMENTUM * oldw[cell]));
  __syncthreads();
  if (block_row == 0 && row == 0) {
    w[unit] += ((ETA * delta[unit]) + (MOMENTUM * oldw[unit]));
    oldw[unit] = ((ETA * delta[unit]) + (MOMENTUM * oldw[unit]));
  }
 }
 #endif
--- a/examples/backprop/facetrain.c
+++ b/examples/backprop/facetrain.c
@ -0,0 +1,48 @@
 #include "backprop.h"
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 // Old-style (unprototyped) declarations of C library functions. <stdlib.h> is
 // already included above; strcpy is not called by the code visible in this
 // file.
 extern char *strcpy();
 extern void exit();
 // Size of the network's input layer; assigned from argv[1] in setup() and
 // read by backprop_face().
 int layer_size = 0;
 // Creates a network with layer_size inputs, 16 hidden units and 1 output,
 // fills its inputs via load(), runs one training iteration through
 // bpnn_train_cuda(), and frees the network again.
 //
 // Returns 0. The return type was previously implicit int with no value
 // returned; it is now explicit.
 int backprop_face(void) {
  BPNN *net;
  float out_err, hid_err;
  net = bpnn_create(layer_size, 16, 1); // (16, 1 can not be changed)
  printf("Input layer size : %d\n", layer_size);
  load(net);
  // entering the training kernel, only one iteration
  printf("Starting training kernel\n");
  bpnn_train_cuda(net, &out_err, &hid_err);
  bpnn_free(net);
  printf("Training done\n");
  return 0;
 }
 // Parses the command line, seeds the network library and runs the example.
 //
 // argc/argv: the program's command line; argv[1] must be the number of input
 //            elements, a positive multiple of 16.
 //
 // Does not return: exits with status 0 after a successful run and with
 // EXIT_FAILURE when the arguments are invalid.
 int setup(int argc, char *argv[]) {
  int seed;
  if (argc != 2) {
    fprintf(stderr, "usage: backprop <num of input elements>\n");
    exit(EXIT_FAILURE);
  }
  layer_size = atoi(argv[1]);
  // atoi() yields 0 for non-numeric text, and 0 or a negative multiple of 16
  // would slip through the divisibility test below, so reject them first.
  if (layer_size <= 0) {
    fprintf(stderr, "The number of input points must be a positive integer\n");
    exit(EXIT_FAILURE);
  }
  if (layer_size % 16 != 0) {
    fprintf(stderr, "The number of input points must be divided by 16\n");
    exit(EXIT_FAILURE);
  }
  seed = 7;
  bpnn_initialize(seed);
  backprop_face();
  exit(0);
 }
--- a/examples/backprop/imagenet.c
+++ b/examples/backprop/imagenet.c
@ -0,0 +1,22 @@
 #include "backprop.h"
 #include <stdio.h>
 #include <stdlib.h>
 // Input-layer size, defined in facetrain.c.
 extern int layer_size;
 // Fills net->input_units[1] .. net->input_units[layer_size] with
 // pseudo-random values rand() / RAND_MAX; element 0 is left untouched.
 //
 // net: network whose input units are written.
 //
 // Returns 0. The previous version had an implicit int return type with no
 // value returned, and read an uninitialised local (nc) to compute a value
 // that was never used.
 int load(BPNN *net) {
  float *units = net->input_units;
  int i;
  for (i = 0; i < layer_size; i++) {
    units[i + 1] = (float)rand() / RAND_MAX;
  }
  return 0;
 }
--- a/examples/backprop/run.sh
+++ b/examples/backprop/run.sh
@ -0,0 +1,28 @@
 #!/bin/bash
 # Builds the backprop example from its C sources and pre-generated LLVM IR,
 # links it against the runtime libraries, runs it with 1024 inputs and checks
 # the output for three expected values.
 set -e
 BUILD=../../build
 KERNEL_IR=backprop_cuda-cuda-nvptx64-nvidia-cuda-sm_61
 HOST_IR=backprop_cuda-host-x86_64-unknown-linux-gnu
 # Host-side C sources -> bitcode.
 for src in backprop facetrain imagenet; do
    clang -c -emit-llvm "$src.c"
 done
 # Textual IR -> bitcode.
 for ir in "$KERNEL_IR" "$HOST_IR"; do
    llvm-as "$ir.ll"
 done
 # Translate the device and host modules.
 "$BUILD/compilation/kernelTranslator" "$KERNEL_IR.bc" kernel.bc
 "$BUILD/compilation/hostTranslator" "$HOST_IR.bc" host.bc
 # Bitcode -> position-independent object files.
 for bc in kernel host backprop facetrain imagenet; do
    llc --relocation-model=pic --filetype=obj  "$bc.bc"
 done
 export LD_LIBRARY_PATH="$BUILD/runtime:$BUILD/runtime/threadPool:$LD_LIBRARY_PATH"
 g++ -Wall -L"$BUILD/runtime"  -L"$BUILD/runtime/threadPool" -o demo \
    -fPIC -no-pie host.o kernel.o backprop.o facetrain.o imagenet.o \
    -lc -lx86Runtime -lthreadPool -lpthread
 ./demo 1024 > res.log
 if ! grep -q -e "0.173289 0.259645 0.350836" res.log; then
    echo "Error result"
    exit 1
 fi
 echo "Pass"
--- a/examples/bfs/bfs-cuda-nvptx64-nvidia-cuda-sm_61.ll
+++ b/examples/bfs/bfs-cuda-nvptx64-nvidia-cuda-sm_61.ll
@ -0,0 +1,307 @@
 ; ModuleID = 'bfs-cuda-nvptx64-nvidia-cuda-sm_61.bc'
 source_filename = "bfs.cu"
 target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"
 %struct.__cuda_builtin_blockIdx_t = type { i8 }
 %struct.__cuda_builtin_threadIdx_t = type { i8 }
 %struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
 %struct.Node = type { i32, i32 }
 $_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
 $_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
 ; Weak device-side definition of cudaMalloc: it spills both arguments to
 ; stack slots and unconditionally returns 999 without calling anything.
 ; NOTE(review): presumably a placeholder emitted from the CUDA headers clang
 ; used for the device compilation — confirm.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
 entry:
  %p.addr = alloca i8**, align 8
  %s.addr = alloca i64, align 8
  store i8** %p, i8*** %p.addr, align 8
  store i64 %s, i64* %s.addr, align 8
  ret i32 999
 }
 ; Weak device-side definition of cudaFuncGetAttributes: stores its two
 ; arguments to stack slots and returns the constant 999; %p is never written
 ; through.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
 entry:
  %p.addr = alloca %struct.cudaFuncAttributes*, align 8
  %c.addr = alloca i8*, align 8
  store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
  store i8* %c, i8** %c.addr, align 8
  ret i32 999
 }
 ; Weak device-side definition of cudaDeviceGetAttribute: stores its three
 ; arguments to stack slots and returns the constant 999; %value is never
 ; written through.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
 entry:
  %value.addr = alloca i32*, align 8
  %attr.addr = alloca i32, align 4
  %device.addr = alloca i32, align 4
  store i32* %value, i32** %value.addr, align 8
  store i32 %attr, i32* %attr.addr, align 4
  store i32 %device, i32* %device.addr, align 4
  ret i32 999
 }
 ; Weak device-side definition of cudaGetDevice: stores its argument to a
 ; stack slot and returns the constant 999; %device is never written through.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
 entry:
  %device.addr = alloca i32*, align 8
  store i32* %device, i32** %device.addr, align 8
  ret i32 999
 }
 ; Weak device-side definition of
 ; cudaOccupancyMaxActiveBlocksPerMultiprocessor: stores its four arguments
 ; to stack slots and returns the constant 999; %numBlocks is never written
 ; through.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
 entry:
  %numBlocks.addr = alloca i32*, align 8
  %func.addr = alloca i8*, align 8
  %blockSize.addr = alloca i32, align 4
  %dynamicSmemSize.addr = alloca i64, align 8
  store i32* %numBlocks, i32** %numBlocks.addr, align 8
  store i8* %func, i8** %func.addr, align 8
  store i32 %blockSize, i32* %blockSize.addr, align 4
  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
  ret i32 999
 }
 ; Weak device-side definition of
 ; cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags: stores its five
 ; arguments to stack slots and returns the constant 999; %numBlocks is never
 ; written through.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
 entry:
  %numBlocks.addr = alloca i32*, align 8
  %func.addr = alloca i8*, align 8
  %blockSize.addr = alloca i32, align 4
  %dynamicSmemSize.addr = alloca i64, align 8
  %flags.addr = alloca i32, align 4
  store i32* %numBlocks, i32** %numBlocks.addr, align 8
  store i8* %func, i8** %func.addr, align 8
  store i32 %blockSize, i32* %blockSize.addr, align 4
  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
  store i32 %flags, i32* %flags.addr, align 4
  ret i32 999
 }
 ; Unoptimized (optnone) IR for the BFS kernel registered as a "kernel" in
 ; !nvvm.annotations. Each thread computes tid = ctaid.x * 512 + tid.x; if
 ; tid < no_of_nodes and g_graph_mask[tid] is set, it clears that mask entry
 ; and walks the node's edges, and for every edge target `id` whose
 ; g_graph_visited[id] is clear it stores g_cost[tid] + 1 into g_cost[id] and
 ; sets g_updating_graph_mask[id].
 ; Function Attrs: convergent noinline nounwind optnone
 define dso_local void @_Z6KernelP4NodePiPbS2_S2_S1_i(%struct.Node* %g_graph_nodes, i32* %g_graph_edges, i8* %g_graph_mask, i8* %g_updating_graph_mask, i8* %g_graph_visited, i32* %g_cost, i32 %no_of_nodes) #0 {
 entry:
  ; Spill every argument to its own stack slot, then compute tid.
  %g_graph_nodes.addr = alloca %struct.Node*, align 8
  %g_graph_edges.addr = alloca i32*, align 8
  %g_graph_mask.addr = alloca i8*, align 8
  %g_updating_graph_mask.addr = alloca i8*, align 8
  %g_graph_visited.addr = alloca i8*, align 8
  %g_cost.addr = alloca i32*, align 8
  %no_of_nodes.addr = alloca i32, align 4
  %tid = alloca i32, align 4
  %i = alloca i32, align 4
  %id = alloca i32, align 4
  store %struct.Node* %g_graph_nodes, %struct.Node** %g_graph_nodes.addr, align 8
  store i32* %g_graph_edges, i32** %g_graph_edges.addr, align 8
  store i8* %g_graph_mask, i8** %g_graph_mask.addr, align 8
  store i8* %g_updating_graph_mask, i8** %g_updating_graph_mask.addr, align 8
  store i8* %g_graph_visited, i8** %g_graph_visited.addr, align 8
  store i32* %g_cost, i32** %g_cost.addr, align 8
  store i32 %no_of_nodes, i32* %no_of_nodes.addr, align 4
  %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
  %mul = mul i32 %call, 512
  %call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
  %add = add i32 %mul, %call1
  store i32 %add, i32* %tid, align 4
  %0 = load i32, i32* %tid, align 4
  %1 = load i32, i32* %no_of_nodes.addr, align 4
  ; First half of the guard: tid < no_of_nodes (signed compare).
  %cmp = icmp slt i32 %0, %1
  br i1 %cmp, label %land.lhs.true, label %if.end26
 land.lhs.true:                                    ; preds = %entry
  ; Second half of the guard: g_graph_mask[tid] != 0.
  %2 = load i8*, i8** %g_graph_mask.addr, align 8
  %3 = load i32, i32* %tid, align 4
  %idxprom = sext i32 %3 to i64
  %arrayidx = getelementptr inbounds i8, i8* %2, i64 %idxprom
  %4 = load i8, i8* %arrayidx, align 1
  %tobool = trunc i8 %4 to i1
  br i1 %tobool, label %if.then, label %if.end26
 if.then:                                          ; preds = %land.lhs.true
  ; g_graph_mask[tid] = 0; i = g_graph_nodes[tid].starting.
  %5 = load i8*, i8** %g_graph_mask.addr, align 8
  %6 = load i32, i32* %tid, align 4
  %idxprom2 = sext i32 %6 to i64
  %arrayidx3 = getelementptr inbounds i8, i8* %5, i64 %idxprom2
  store i8 0, i8* %arrayidx3, align 1
  %7 = load %struct.Node*, %struct.Node** %g_graph_nodes.addr, align 8
  %8 = load i32, i32* %tid, align 4
  %idxprom4 = sext i32 %8 to i64
  %arrayidx5 = getelementptr inbounds %struct.Node, %struct.Node* %7, i64 %idxprom4
  %starting = getelementptr inbounds %struct.Node, %struct.Node* %arrayidx5, i32 0, i32 0
  %9 = load i32, i32* %starting, align 4
  store i32 %9, i32* %i, align 4
  br label %for.cond
 for.cond:                                         ; preds = %for.inc, %if.then
  ; Loop test: i < no_of_edges + starting, both reloaded from the node record.
  %10 = load i32, i32* %i, align 4
  %11 = load %struct.Node*, %struct.Node** %g_graph_nodes.addr, align 8
  %12 = load i32, i32* %tid, align 4
  %idxprom6 = sext i32 %12 to i64
  %arrayidx7 = getelementptr inbounds %struct.Node, %struct.Node* %11, i64 %idxprom6
  %no_of_edges = getelementptr inbounds %struct.Node, %struct.Node* %arrayidx7, i32 0, i32 1
  %13 = load i32, i32* %no_of_edges, align 4
  %14 = load %struct.Node*, %struct.Node** %g_graph_nodes.addr, align 8
  %15 = load i32, i32* %tid, align 4
  %idxprom8 = sext i32 %15 to i64
  %arrayidx9 = getelementptr inbounds %struct.Node, %struct.Node* %14, i64 %idxprom8
  %starting10 = getelementptr inbounds %struct.Node, %struct.Node* %arrayidx9, i32 0, i32 0
  %16 = load i32, i32* %starting10, align 4
  %add11 = add nsw i32 %13, %16
  %cmp12 = icmp slt i32 %10, %add11
  br i1 %cmp12, label %for.body, label %for.end
 for.body:                                         ; preds = %for.cond
  ; id = g_graph_edges[i]; skip the update when g_graph_visited[id] is set.
  %17 = load i32*, i32** %g_graph_edges.addr, align 8
  %18 = load i32, i32* %i, align 4
  %idxprom13 = sext i32 %18 to i64
  %arrayidx14 = getelementptr inbounds i32, i32* %17, i64 %idxprom13
  %19 = load i32, i32* %arrayidx14, align 4
  store i32 %19, i32* %id, align 4
  %20 = load i8*, i8** %g_graph_visited.addr, align 8
  %21 = load i32, i32* %id, align 4
  %idxprom15 = sext i32 %21 to i64
  %arrayidx16 = getelementptr inbounds i8, i8* %20, i64 %idxprom15
  %22 = load i8, i8* %arrayidx16, align 1
  %tobool17 = trunc i8 %22 to i1
  br i1 %tobool17, label %if.end, label %if.then18
 if.then18:                                        ; preds = %for.body
  ; g_cost[id] = g_cost[tid] + 1; g_updating_graph_mask[id] = 1.
  %23 = load i32*, i32** %g_cost.addr, align 8
  %24 = load i32, i32* %tid, align 4
  %idxprom19 = sext i32 %24 to i64
  %arrayidx20 = getelementptr inbounds i32, i32* %23, i64 %idxprom19
  %25 = load i32, i32* %arrayidx20, align 4
  %add21 = add nsw i32 %25, 1
  %26 = load i32*, i32** %g_cost.addr, align 8
  %27 = load i32, i32* %id, align 4
  %idxprom22 = sext i32 %27 to i64
  %arrayidx23 = getelementptr inbounds i32, i32* %26, i64 %idxprom22
  store i32 %add21, i32* %arrayidx23, align 4
  %28 = load i8*, i8** %g_updating_graph_mask.addr, align 8
  %29 = load i32, i32* %id, align 4
  %idxprom24 = sext i32 %29 to i64
  %arrayidx25 = getelementptr inbounds i8, i8* %28, i64 %idxprom24
  store i8 1, i8* %arrayidx25, align 1
  br label %if.end
 if.end:                                           ; preds = %if.then18, %for.body
  br label %for.inc
 for.inc:                                          ; preds = %if.end
  ; i++
  %30 = load i32, i32* %i, align 4
  %inc = add nsw i32 %30, 1
  store i32 %inc, i32* %i, align 4
  br label %for.cond
 for.end:                                          ; preds = %for.cond
  br label %if.end26
 if.end26:                                         ; preds = %for.end, %land.lhs.true, %entry
  ret void
 }
 ; Accessor for blockIdx.x: returns the value of the
 ; llvm.nvvm.read.ptx.sreg.ctaid.x intrinsic.
 ; Function Attrs: alwaysinline convergent nounwind
 define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
 entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
  ret i32 %0
 }
 ; Accessor for threadIdx.x: returns the value of the
 ; llvm.nvvm.read.ptx.sreg.tid.x intrinsic.
 ; Function Attrs: alwaysinline convergent nounwind
 define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
 entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  ret i32 %0
 }
 ; Unoptimized (optnone) IR for the second BFS kernel registered as a "kernel"
 ; in !nvvm.annotations. Each thread computes tid = ctaid.x * 512 + tid.x; if
 ; tid < no_of_nodes and g_updating_graph_mask[tid] is set, it sets
 ; g_graph_mask[tid], g_graph_visited[tid] and *g_over to 1 and clears
 ; g_updating_graph_mask[tid].
 ; Function Attrs: convergent noinline nounwind optnone
 define dso_local void @_Z7Kernel2PbS_S_S_i(i8* %g_graph_mask, i8* %g_updating_graph_mask, i8* %g_graph_visited, i8* %g_over, i32 %no_of_nodes) #0 {
 entry:
  ; Spill every argument to its own stack slot, then compute tid.
  %g_graph_mask.addr = alloca i8*, align 8
  %g_updating_graph_mask.addr = alloca i8*, align 8
  %g_graph_visited.addr = alloca i8*, align 8
  %g_over.addr = alloca i8*, align 8
  %no_of_nodes.addr = alloca i32, align 4
  %tid = alloca i32, align 4
  store i8* %g_graph_mask, i8** %g_graph_mask.addr, align 8
  store i8* %g_updating_graph_mask, i8** %g_updating_graph_mask.addr, align 8
  store i8* %g_graph_visited, i8** %g_graph_visited.addr, align 8
  store i8* %g_over, i8** %g_over.addr, align 8
  store i32 %no_of_nodes, i32* %no_of_nodes.addr, align 4
  %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
  %mul = mul i32 %call, 512
  %call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
  %add = add i32 %mul, %call1
  store i32 %add, i32* %tid, align 4
  %0 = load i32, i32* %tid, align 4
  %1 = load i32, i32* %no_of_nodes.addr, align 4
  ; First half of the guard: tid < no_of_nodes (signed compare).
  %cmp = icmp slt i32 %0, %1
  br i1 %cmp, label %land.lhs.true, label %if.end
 land.lhs.true:                                    ; preds = %entry
  ; Second half of the guard: g_updating_graph_mask[tid] != 0.
  %2 = load i8*, i8** %g_updating_graph_mask.addr, align 8
  %3 = load i32, i32* %tid, align 4
  %idxprom = sext i32 %3 to i64
  %arrayidx = getelementptr inbounds i8, i8* %2, i64 %idxprom
  %4 = load i8, i8* %arrayidx, align 1
  %tobool = trunc i8 %4 to i1
  br i1 %tobool, label %if.then, label %if.end
 if.then:                                          ; preds = %land.lhs.true
  ; g_graph_mask[tid] = 1
  %5 = load i8*, i8** %g_graph_mask.addr, align 8
  %6 = load i32, i32* %tid, align 4
  %idxprom2 = sext i32 %6 to i64
  %arrayidx3 = getelementptr inbounds i8, i8* %5, i64 %idxprom2
  store i8 1, i8* %arrayidx3, align 1
  ; g_graph_visited[tid] = 1
  %7 = load i8*, i8** %g_graph_visited.addr, align 8
  %8 = load i32, i32* %tid, align 4
  %idxprom4 = sext i32 %8 to i64
  %arrayidx5 = getelementptr inbounds i8, i8* %7, i64 %idxprom4
  store i8 1, i8* %arrayidx5, align 1
  ; *g_over = 1
  %9 = load i8*, i8** %g_over.addr, align 8
  store i8 1, i8* %9, align 1
  ; g_updating_graph_mask[tid] = 0
  %10 = load i8*, i8** %g_updating_graph_mask.addr, align 8
  %11 = load i32, i32* %tid, align 4
  %idxprom6 = sext i32 %11 to i64
  %arrayidx7 = getelementptr inbounds i8, i8* %10, i64 %idxprom6
  store i8 0, i8* %arrayidx7, align 1
  br label %if.end
 if.end:                                           ; preds = %if.then, %land.lhs.true, %entry
  ret void
 }
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
 attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #2 = { nounwind readnone }
 attributes #3 = { convergent nounwind }
 !llvm.module.flags = !{!0, !1, !2}
 !nvvm.annotations = !{!3, !4, !5, !6, !5, !7, !7, !7, !7, !8, !8, !7}
 !llvm.ident = !{!9}
 !nvvmir.version = !{!10}
 !0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
 !1 = !{i32 1, !"wchar_size", i32 4}
 !2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
 !3 = !{void (%struct.Node*, i32*, i8*, i8*, i8*, i32*, i32)* @_Z6KernelP4NodePiPbS2_S2_S1_i, !"kernel", i32 1}
 !4 = !{void (i8*, i8*, i8*, i8*, i32)* @_Z7Kernel2PbS_S_S_i, !"kernel", i32 1}
 !5 = !{null, !"align", i32 8}
 !6 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
 !7 = !{null, !"align", i32 16}
 !8 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
 !9 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
 !10 = !{i32 1, i32 4}
--- a/examples/bfs/bfs-host-x86_64-unknown-linux-gnu.ll
+++ b/examples/bfs/bfs-host-x86_64-unknown-linux-gnu.ll
--- a/examples/bfs/bfs.cu
+++ b/examples/bfs/bfs.cu
@ -0,0 +1,213 @@
 #include <cuda.h>
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #define MAX_THREADS_PER_BLOCK 512
 int no_of_nodes;
 int edge_list_size;
 FILE *fp;
 // Per-node record of the graph. `starting` is the index of the node's first
 // entry in the edge array and `no_of_edges` is the number of consecutive
 // entries that belong to it; Kernel() iterates over
 // [starting, starting + no_of_edges).
 struct Node {
  int starting;
  int no_of_edges;
 };
 #include "kernel.cu"
 #include "kernel2.cu"
 void BFSGraph(int argc, char **argv);
 ////////////////////////////////////////////////////////////////////////////////
 // Main Program
 ////////////////////////////////////////////////////////////////////////////////
 // Entry point: selects CUDA device 0, resets the global graph counters and
 // runs the breadth-first search driver.
 int main(int argc, char **argv) {
  cudaSetDevice(0);
  edge_list_size = 0;
  no_of_nodes = 0;
  BFSGraph(argc, argv);
  return 0;
 }
 // Prints a one-line usage message, naming the program as argv[0], to stderr.
 // argc is not used.
 void Usage(int argc, char **argv) {
  const char *program_name = argv[0];
  fprintf(stderr, "Usage: %s <input_file>\n", program_name);
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Apply BFS on a Graph using CUDA
 ////////////////////////////////////////////////////////////////////////////////
 // Reads a graph from the file named by argv[1], runs a breadth-first search
 // from node 0 using the Kernel/Kernel2 CUDA kernels, and writes each node's
 // cost to result.txt as "<node>) cost:<cost>" lines.
 //
 // The input is read in this order: node count; one "<start> <edge count>"
 // pair per node; a source node (read but then overridden with 0); edge
 // count; one "<target> <cost>" pair per edge (the cost is read and
 // discarded).
 //
 // argc/argv: the program's command line; exactly one argument is expected.
 //
 // Exits with status 0 after printing usage when argc != 2. Returns early
 // (after printing a message) when the input cannot be opened or does not
 // start with a positive node count.
 void BFSGraph(int argc, char **argv) {
  char *input_f;
  if (argc != 2) {
    Usage(argc, argv);
    exit(0);
  }
  input_f = argv[1];
  printf("Reading File\n");
  // Read in Graph from a file
  fp = fopen(input_f, "r");
  if (!fp) {
    printf("Error Reading graph file\n");
    return;
  }
  int source = 0;
  // Everything below indexes arrays of no_of_nodes elements (including
  // element `source`), so a missing or non-positive count cannot be used.
  if (fscanf(fp, "%d", &no_of_nodes) != 1 || no_of_nodes <= 0) {
    printf("Error Reading node count from graph file\n");
    fclose(fp);
    return;
  }
  int num_of_blocks = 1;
  int num_of_threads_per_block = no_of_nodes;
  // Make execution Parameters according to the number of nodes
  // Distribute threads across multiple Blocks if necessary
  if (no_of_nodes > MAX_THREADS_PER_BLOCK) {
    num_of_blocks = (int)ceil(no_of_nodes / (double)MAX_THREADS_PER_BLOCK);
    num_of_threads_per_block = MAX_THREADS_PER_BLOCK;
  }
  // allocate host memory
  Node *h_graph_nodes = (Node *)malloc(sizeof(Node) * no_of_nodes);
  bool *h_graph_mask = (bool *)malloc(sizeof(bool) * no_of_nodes);
  bool *h_updating_graph_mask = (bool *)malloc(sizeof(bool) * no_of_nodes);
  bool *h_graph_visited = (bool *)malloc(sizeof(bool) * no_of_nodes);
  int start, edgeno;
  // initalize the memory
  for (int i = 0; i < no_of_nodes; i++) {
    fscanf(fp, "%d %d", &start, &edgeno);
    h_graph_nodes[i].starting = start;
    h_graph_nodes[i].no_of_edges = edgeno;
    h_graph_mask[i] = false;
    h_updating_graph_mask[i] = false;
    h_graph_visited[i] = false;
  }
  // read the source node from the file
  fscanf(fp, "%d", &source);
  // The value from the file is consumed but the search always starts at 0.
  source = 0;
  // set the source node as true in the mask
  h_graph_mask[source] = true;
  h_graph_visited[source] = true;
  fscanf(fp, "%d", &edge_list_size);
  int id, cost;
  int *h_graph_edges = (int *)malloc(sizeof(int) * edge_list_size);
  for (int i = 0; i < edge_list_size; i++) {
    fscanf(fp, "%d", &id);
    fscanf(fp, "%d", &cost);
    h_graph_edges[i] = id;
  }
  fclose(fp);
  printf("Read File\n");
  // Copy the Node list to device memory
  Node *d_graph_nodes;
  cudaMalloc((void **)&d_graph_nodes, sizeof(Node) * no_of_nodes);
  cudaMemcpy(d_graph_nodes, h_graph_nodes, sizeof(Node) * no_of_nodes,
             cudaMemcpyHostToDevice);
  // Copy the Edge List to device Memory
  int *d_graph_edges;
  cudaMalloc((void **)&d_graph_edges, sizeof(int) * edge_list_size);
  cudaMemcpy(d_graph_edges, h_graph_edges, sizeof(int) * edge_list_size,
             cudaMemcpyHostToDevice);
  // Copy the Mask to device memory
  bool *d_graph_mask;
  cudaMalloc((void **)&d_graph_mask, sizeof(bool) * no_of_nodes);
  cudaMemcpy(d_graph_mask, h_graph_mask, sizeof(bool) * no_of_nodes,
             cudaMemcpyHostToDevice);
  bool *d_updating_graph_mask;
  cudaMalloc((void **)&d_updating_graph_mask, sizeof(bool) * no_of_nodes);
  cudaMemcpy(d_updating_graph_mask, h_updating_graph_mask,
             sizeof(bool) * no_of_nodes, cudaMemcpyHostToDevice);
  // Copy the Visited nodes array to device memory
  bool *d_graph_visited;
  cudaMalloc((void **)&d_graph_visited, sizeof(bool) * no_of_nodes);
  cudaMemcpy(d_graph_visited, h_graph_visited, sizeof(bool) * no_of_nodes,
             cudaMemcpyHostToDevice);
  // allocate mem for the result on host side
  int *h_cost = (int *)malloc(sizeof(int) * no_of_nodes);
  for (int i = 0; i < no_of_nodes; i++)
    h_cost[i] = -1;
  h_cost[source] = 0;
  // allocate device memory for result
  int *d_cost;
  cudaMalloc((void **)&d_cost, sizeof(int) * no_of_nodes);
  cudaMemcpy(d_cost, h_cost, sizeof(int) * no_of_nodes, cudaMemcpyHostToDevice);
  // make a bool to check if the execution is over
  bool *d_over;
  cudaMalloc((void **)&d_over, sizeof(bool));
  printf("Copied Everything to GPU memory\n");
  // setup execution parameters
  dim3 grid(num_of_blocks, 1, 1);
  dim3 threads(num_of_threads_per_block, 1, 1);
  int k = 0;
  printf("Start traversing the tree\n");
  bool stop;
  // Call the Kernel untill all the elements of Frontier are not false
  do {
    // if no thread changes this value then the loop stops
    stop = false;
    cudaMemcpy(d_over, &stop, sizeof(bool), cudaMemcpyHostToDevice);
    Kernel<<<grid, threads, 0>>>(d_graph_nodes, d_graph_edges, d_graph_mask,
                                 d_updating_graph_mask, d_graph_visited, d_cost,
                                 no_of_nodes);
    cudaDeviceSynchronize();
    Kernel2<<<grid, threads, 0>>>(d_graph_mask, d_updating_graph_mask,
                                  d_graph_visited, d_over, no_of_nodes);
    cudaDeviceSynchronize();
    // Kernel2 sets *d_over whenever it promoted at least one node.
    cudaMemcpy(&stop, d_over, sizeof(bool), cudaMemcpyDeviceToHost);
    k++;
  } while (stop);
  printf("Kernel Executed %d times\n", k);
  // copy result from device to host
  cudaMemcpy(h_cost, d_cost, sizeof(int) * no_of_nodes, cudaMemcpyDeviceToHost);
  // Store the result into a file
  FILE *fpo = fopen("result.txt", "w");
  if (fpo) {
    for (int i = 0; i < no_of_nodes; i++)
      fprintf(fpo, "%d) cost:%d\n", i, h_cost[i]);
    fclose(fpo);
    printf("Result stored in result.txt\n");
  } else {
    // Do not write through a null FILE*; still fall through to the cleanup.
    printf("Error opening result.txt for writing\n");
  }
  // cleanup memory
  free(h_graph_nodes);
  free(h_graph_edges);
  free(h_graph_mask);
  free(h_updating_graph_mask);
  free(h_graph_visited);
  free(h_cost);
  cudaFree(d_graph_nodes);
  cudaFree(d_graph_edges);
  cudaFree(d_graph_mask);
  cudaFree(d_updating_graph_mask);
  cudaFree(d_graph_visited);
  cudaFree(d_cost);
  // d_over was allocated above but previously never released.
  cudaFree(d_over);
 }
--- a/examples/bfs/kernel.cu
+++ b/examples/bfs/kernel.cu
@ -0,0 +1,23 @@
 #ifndef _KERNEL_H_
 #define _KERNEL_H_
 // Frontier-expansion step of the BFS. Each thread owns the node whose index
 // equals its global thread id. If that node is flagged in g_graph_mask, the
 // flag is cleared and every not-yet-visited neighbour gets
 // cost = cost(this node) + 1 and is flagged in g_updating_graph_mask.
 __global__ void
 Kernel( Node* g_graph_nodes, int* g_graph_edges, bool* g_graph_mask, bool* g_updating_graph_mask, bool *g_graph_visited, int* g_cost, int no_of_nodes)
 {
 	// The global id is derived from MAX_THREADS_PER_BLOCK rather than blockDim.x,
 	// so the launch is expected to use exactly that many threads per block.
 	const int tid = blockIdx.x*MAX_THREADS_PER_BLOCK + threadIdx.x;
 	// Threads beyond the last node, and nodes that are not flagged, do nothing.
 	if( tid>=no_of_nodes || !g_graph_mask[tid])
 		return;
 	// Consume the flag so the node is not expanded again by a later launch.
 	g_graph_mask[tid]=false;
 	// Half-open range [first_edge, edge_end) of this node's entries in g_graph_edges.
 	const int first_edge = g_graph_nodes[tid].starting;
 	const int edge_end = g_graph_nodes[tid].no_of_edges + first_edge;
 	for(int e=first_edge; e<edge_end; e++)
 	{
 		const int neighbour = g_graph_edges[e];
 		if(g_graph_visited[neighbour])
 			continue;
 		// Unvisited neighbour: it is one hop further away than this node.
 		g_cost[neighbour]=g_cost[tid]+1;
 		g_updating_graph_mask[neighbour]=true;
 	}
 }
 #endif
--- a/examples/bfs/kernel2.cu
+++ b/examples/bfs/kernel2.cu
@ -0,0 +1,18 @@
 #ifndef _KERNEL2_H_
 #define _KERNEL2_H_
 // Commit step that follows Kernel. Every node flagged in
 // g_updating_graph_mask is moved into g_graph_mask, marked visited and
 // un-flagged. *g_over is set to true whenever at least one node was committed;
 // the host loop reads it back to decide whether to launch another round.
 __global__ void
 Kernel2( bool* g_graph_mask, bool *g_updating_graph_mask, bool* g_graph_visited, bool *g_over, int no_of_nodes)
 {
 	// Same id scheme as Kernel: one thread per node, MAX_THREADS_PER_BLOCK threads per block.
 	const int tid = blockIdx.x*MAX_THREADS_PER_BLOCK + threadIdx.x;
 	if( tid>=no_of_nodes )
 		return;
 	if( !g_updating_graph_mask[tid] )
 		return;
 	g_graph_mask[tid]=true;
 	g_graph_visited[tid]=true;
 	// Many threads may store here concurrently; they all store the same value.
 	*g_over=true;
 	g_updating_graph_mask[tid]=false;
 }
 #endif
--- a/examples/bfs/run.sh
+++ b/examples/bfs/run.sh
@ -0,0 +1,21 @@
 #!/bin/bash
 # Build and run the BFS example end to end:
 #   1. assemble the device/host LLVM IR to bitcode,
 #   2. run the kernel and host translators,
 #   3. compile both bitcode files to objects and link against the runtime,
 #   4. run on the 65536-node graph and check the cost of node 0.
 set -e
 llvm-as bfs-cuda-nvptx64-nvidia-cuda-sm_61.ll
 llvm-as bfs-host-x86_64-unknown-linux-gnu.ll
 ../../build/compilation/kernelTranslator bfs-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
 ../../build/compilation/hostTranslator bfs-host-x86_64-unknown-linux-gnu.bc host.bc
 llc --relocation-model=pic --filetype=obj  kernel.bc
 llc --relocation-model=pic --filetype=obj  host.bc
 g++ -Wall -L../../build/runtime  -L../../build/runtime/threadPool \
    -o bfs.out -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread
 # Append the previous value only when it is non-empty: a trailing ':' would add
 # an empty entry, which the dynamic loader treats as the current directory.
 export LD_LIBRARY_PATH="../../build/runtime:../../build/runtime/threadPool${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
 ./bfs.out ../../rodinia-data/bfs/graph65536.txt
 # -x matches the whole line, so "10) cost:0" or "0) cost:05" cannot satisfy
 # the check; only the exact line written for node 0 with cost 0 does.
 if grep -qx "0) cost:0" result.txt; then
    echo "Pass"
 else
    echo "Error result"
    exit 1
 fi
--- a/examples/btree/common.h
+++ b/examples/btree/common.h
@ -0,0 +1,343 @@
 // # ifdef __cplusplus
 // extern "C" {
 // # endif
 // #ifndef LIST_H
 // # define LIST_H
 //===============================================================================================================================================================================================================200
 //	DEFINE/INCLUDE
 //===============================================================================================================================================================================================================200
 //======================================================================================================================================================150
 //	INCLUDE (for some reason these are not recognized when defined in main
 // file before this one is included)
 //======================================================================================================================================================150
 #include <stdbool.h> // (in path known to compiler)			needed by true/false, bool
 #include <stdint.h>  // (in path known to compiler)			needed by uint32_t
 #include <stdlib.h>  // (in path known to compiler)			needed by malloc
 //======================================================================================================================================================150
 //	DEFINE
 //======================================================================================================================================================150
 #define fp float
 #define Version "1.5"
 #ifdef WINDOWS
 #define bool char
 #define false 0
 #define true 1
 #endif
 /* #define DEFAULT_ORDER 256 */
 #ifdef RD_WG_SIZE_0_0
 #define DEFAULT_ORDER RD_WG_SIZE_0_0
 #elif defined(RD_WG_SIZE_0)
 #define DEFAULT_ORDER RD_WG_SIZE_0
 #elif defined(RD_WG_SIZE)
 #define DEFAULT_ORDER RD_WG_SIZE
 #else
 #define DEFAULT_ORDER 256
 #endif
 /* #ifdef RD_WG_SIZE_1_0 */
 /*         #define  DEFAULT_ORDER_2 RD_WG_SIZE_1_0 */
 /* #elif defined(RD_WG_SIZE_1) */
 /*         #define  DEFAULT_ORDER_2 RD_WG_SIZE_1 */
 /* #elif defined(RD_WG_SIZE) */
 /*         #define  DEFAULT_ORDER_2 RD_WG_SIZE */
 /* #else */
 /*         #define  DEFAULT_ORDER_2 256 */
 /* #endif */
 /* #define DEFAULT_ORDER 508 */
 // Checked allocation: every malloc(size) in files that include this header
 // expands to a GNU statement expression that calls the real malloc, prints the
 // call site to stderr and exits with status -1 when the allocation fails, and
 // otherwise yields the pointer. The inner `malloc` is not expanded again
 // because a macro is never re-expanded inside its own replacement list.
 // fprintf/stderr come from <stdio.h>, which is not among the includes at the
 // top of this header, so the including file has to provide it.
 #define malloc(size)                                                           \
   ({                                                                           \
     void *_tmp = malloc(size);                                                 \
                                                                                \
     if (_tmp == NULL) {                                                        \
       fprintf(stderr, "Allocation failed at %s:%d!\n", __FILE__, __LINE__);    \
       exit(-1);                                                                \
     }                                                                          \
                                                                                \
     _tmp;                                                                      \
   })
 //======================================================================================================================================================150
 //	STRUCTURES
 //======================================================================================================================================================150
 // struct list_item;
 typedef struct list_item list_item_t;
 // List header used by the list_* functions declared below.
 typedef struct list_t {
  // First and last item of the list (items link to each other via pred/next).
  list_item_t *head, *tail;
  // Number of items; presumably what list_get_length() reports -- TODO confirm.
  uint32_t length;
  // Caller-supplied comparison callback (set through list_init); presumably
  // drives list_find*/list_insert_sorted -- confirm against the implementation.
  int32_t (*compare)(const void *key, const void *with);
  // Caller-supplied callback for disposing of an item's datum (set through list_init).
  void (*datum_delete)(void *);
 } list_t;
 typedef list_item_t *list_iterator_t;
 typedef list_item_t *list_reverse_iterator_t;
 /* Type representing the record
 * to which a given key refers.
 * In a real B+ tree system, the
 * record would hold data (in a database)
 * or a file (in an operating system)
 * or some other information.
 * Users can rewrite this part of the code
 * to change the type and content
 * of the value field.
 */
 typedef struct record {
  // Payload of the record; the findK kernel copies this field into its
  // per-query answer when it finds a matching key.
  int value;
 } record;
 /* Type representing a node in the B+ tree.
 * This type is general enough to serve for both
 * the leaf and the internal node.
 * The heart of the node is the array
 * of keys and the array of corresponding
 * pointers.  The relation between keys
 * and pointers differs between leaves and
 * internal nodes.  In a leaf, the index
 * of each key equals the index of its corresponding
 * pointer, with a maximum of order - 1 key-pointer
 * pairs.  The last pointer points to the
 * leaf to the right (or NULL in the case
 * of the rightmost leaf).
 * In an internal node, the first pointer
 * refers to lower nodes with keys less than
 * the smallest key in the keys array.  Then,
 * with indices i starting at 0, the pointer
 * at i + 1 points to the subtree with keys
 * greater than or equal to the key in this
 * node at index i.
 * The num_keys field is used to keep
 * track of the number of valid keys.
 * In an internal node, the number of valid
 * pointers is always num_keys + 1.
 * In a leaf, the number of valid pointers
 * to data is always num_keys.  The
 * last leaf pointer points to the next leaf.
 */
 typedef struct node {
  // Child nodes (internal node) or record pointers plus the right-sibling
  // link (leaf), as described in the comment above.
  void **pointers;
  // Key array paired with `pointers`.
  int *keys;
  // Parent node; presumably NULL for the root -- TODO confirm.
  struct node *parent;
  // Selects which of the two pointer/key relations described above applies.
  bool is_leaf;
  // Number of valid entries in `keys`.
  int num_keys;
  struct node *next; // Used for queue.
 } node;
 //
 // Pointer-free, fixed-size node layout used by the GPU kernels: the host
 // copies an array of these to the device byte-for-byte, and the kernels refer
 // to other nodes and to records by integer index instead of by pointer.
 typedef struct knode {
  // Presumably this node's own position in the knode array -- TODO confirm
  // (not read by the kernels in this directory's findK/findRangeK).
  int location;
  // indices[t]: index of a child in the knode array while descending the
  // tree, or an index into the record array at the final (leaf) level.
  int indices[DEFAULT_ORDER + 1];
  // keys[t] <= key < keys[t+1] selects slot t; the kernels read keys[thid+1],
  // which is why the array holds DEFAULT_ORDER + 1 entries.
  int keys[DEFAULT_ORDER + 1];
  // Leaf flag and valid-key count; presumably mirrored from `node` -- TODO confirm.
  bool is_leaf;
  int num_keys;
 } knode;
 // One element of a list_t: links to its predecessor and successor plus an
 // opaque, caller-owned payload pointer.
 struct list_item {
  struct list_item *pred, *next;
  void *datum;
 };
 //===============================================================================================================================================================================================================200
 //	PROTOTYPES
 //===============================================================================================================================================================================================================200
 //======================================================================================================================================================150
 // Other
 //======================================================================================================================================================150
 void list_item_init(list_item_t *li, void *datum);
 void list_item_delete(list_item_t *li, void (*datum_delete)(void *datum));
 void list_insert_item_tail(list_t *l, list_item_t *i);
 void list_insert_item_before(list_t *l, list_item_t *next, list_item_t *i);
 void list_insert_item_after(list_t *l, list_item_t *pred, list_item_t *i);
 void list_insert_item_sorted(list_t *l, list_item_t *i);
 //======================================================================================================================================================150
 // List construction, insertion, removal, lookup and iteration
 //======================================================================================================================================================150
 void list_init(list_t *l, int32_t (*compare)(const void *key, const void *with),
               void (*datum_delete)(void *datum));
 void list_delete(list_t *l);
 void list_reset(list_t *l);
 void list_insert_head(list_t *l, void *v);
 void list_insert_tail(list_t *l, void *v);
 void list_insert_before(list_t *l, list_item_t *next, void *v);
 void list_insert_after(list_t *l, list_item_t *pred, void *v);
 void list_insert_sorted(list_t *l, void *v);
 void list_insert_item_head(list_t *l, list_item_t *i);
 void list_remove_item(list_t *l, list_item_t *i);
 void list_remove_head(list_t *l);
 void list_remove_tail(list_t *l);
 list_item_t *list_find_item(list_t *l, void *datum);
 list_item_t *list_get_head_item(list_t *l);
 list_item_t *list_get_tail_item(list_t *l);
 void *list_find(list_t *l, void *datum);
 void *list_get_head(list_t *l);
 void *list_get_tail(list_t *l);
 uint32_t list_get_length(list_t *l);
 bool list_is_empty(list_t *l);
 bool list_not_empty(list_t *l);
 void list_visit_items(list_t *l, void (*visitor)(void *v));
 void *list_item_get_datum(list_item_t *li);
 void list_iterator_init(list_t *l, list_iterator_t *li);
 void list_iterator_delete(list_iterator_t *li);
 void list_iterator_next(list_iterator_t *li);
 void list_iterator_prev(list_iterator_t *li);
 void *list_iterator_get_datum(list_iterator_t *li);
 bool list_iterator_is_valid(list_iterator_t *li);
 void list_reverse_iterator_init(list_t *l, list_iterator_t *li);
 void list_reverse_iterator_delete(list_iterator_t *li);
 void list_reverse_iterator_next(list_iterator_t *li);
 void list_reverse_iterator_prev(list_iterator_t *li);
 void *list_reverse_iterator_get_datum(list_iterator_t *li);
 bool list_reverse_iterator_is_valid(list_reverse_iterator_t *li);
 //======================================================================================================================================================150
 // Output and utility
 //======================================================================================================================================================150
 void *kmalloc(int size);
 long transform_to_cuda(node *n,
                       bool verbose); // returns actual mem used in a long
 void usage_1(void);
 void usage_2(void);
 void enqueue(node *new_node);
 node *dequeue(void);
 int height(node *root);
 int path_to_root(node *root, node *child);
 void print_leaves(node *root);
 void print_tree(node *root);
 node *find_leaf(node *root, int key, bool verbose);
 record *find(node *root, int key, bool verbose);
 int cut(int length);
 //======================================================================================================================================================150
 // Insertion
 //======================================================================================================================================================150
 record *make_record(int value);
 node *make_node(void);
 node *make_leaf(void);
 int get_left_index(node *parent, node *left);
 node *insert_into_leaf(node *leaf, int key, record *pointer);
 node *insert_into_leaf_after_splitting(node *root, node *leaf, int key,
                                       record *pointer);
 node *insert_into_node(node *root, node *parent, int left_index, int key,
                       node *right);
 node *insert_into_node_after_splitting(node *root, node *parent, int left_index,
                                       int key, node *right);
 node *insert_into_parent(node *root, node *left, int key, node *right);
 node *insert_into_new_root(node *left, int key, node *right);
 node *start_new_tree(int key, record *pointer);
 node *insert(node *root, int key, int value);
 //======================================================================================================================================================150
 // Deletion
 //======================================================================================================================================================150
 int get_neighbor_index(node *n);
 node *adjust_root(node *root);
 node *coalesce_nodes(node *root, node *n, node *neighbor, int neighbor_index,
                     int k_prime);
 node *redistribute_nodes(node *root, node *n, node *neighbor,
                         int neighbor_index, int k_prime_index, int k_prime);
 node *delete_entry(node *root, node *n, int key, void *pointer);
 node *deleteVal(node *root, int key);
 //===============================================================================================================================================================================================================200
 //	HEADER
 //===============================================================================================================================================================================================================200
 // int main(	int argc,
 // char *argv []);
 //===============================================================================================================================================================================================================200
 //	END
 //===============================================================================================================================================================================================================200
 // #endif
 // # ifdef __cplusplus
 // }
 // # endif
--- a/examples/btree/kernel/kernel_gpu_cuda.cu
+++ b/examples/btree/kernel/kernel_gpu_cuda.cu
@ -0,0 +1,54 @@
 //========================================================================================================================================================================================================200
 //	findK function
 //========================================================================================================================================================================================================200
 // findK: point lookup in the flattened B+ tree.
 // Launch layout: one block per query (bid selects the query), one thread per
 // key slot of a node (thid selects the slot).
 //   height      number of tree levels to descend
 //   knodesD     flattened node array
 //   knodes_elem number of entries in knodesD (upper bound for child indices)
 //   recordsD    record array addressed by leaf indices
 //   currKnodeD  per-query index of the node being examined (in/out)
 //   offsetD     per-query index of the node chosen for the next level (in/out)
 //   keysD       per-query search key
 //   ansD        per-query result; written only when the key is found
 __global__ void
 findK(	long height,
 		knode *knodesD,
 		long knodes_elem,
 		record *recordsD,
 		long *currKnodeD,
 		long *offsetD,
 		int *keysD,
 		record *ansD)
 {
 	// private thread IDs
 	int thid = threadIdx.x;
 	int bid = blockIdx.x;
 	// process tree levels
 	int i;
 	for(i = 0; i < height; i++){
 		// if value is between the two keys
 		if((knodesD[currKnodeD[bid]].keys[thid]) <= keysD[bid] && (knodesD[currKnodeD[bid]].keys[thid+1] > keysD[bid])){
 			// This guard avoids a crash: child indices stored in knodes->indices by the
 			// host may lie outside the knodes array, and following such an index in the
 			// next iteration would fault.
 			// The child index is read from the node that was just tested (currKnodeD),
 			// not through offsetD: offsetD[bid] may be written by another thread of this
 			// block before the barrier below, whereas currKnodeD[bid] is stable for the
 			// whole phase. This matches findRangeK.
 			if(knodesD[currKnodeD[bid]].indices[thid] < knodes_elem){
 				offsetD[bid] = knodesD[currKnodeD[bid]].indices[thid];
 			}
 		}
 		// make the chosen child visible to thread 0 before it is published
 		__syncthreads();
 		// set for next tree level
 		if(thid==0){
 			currKnodeD[bid] = offsetD[bid];
 		}
 		// every thread must see the new current node before the next level
 		__syncthreads();
 	}
 	//At this point, we have a candidate leaf node which may contain
 	//the target record.  Check each key to hopefully find the record
 	if(knodesD[currKnodeD[bid]].keys[thid] == keysD[bid]){
 		ansD[bid].value = recordsD[knodesD[currKnodeD[bid]].indices[thid]].value;
 	}
 }
 //========================================================================================================================================================================================================200
 //	End
 //========================================================================================================================================================================================================200
--- a/examples/btree/kernel/kernel_gpu_cuda_2.cu
+++ b/examples/btree/kernel/kernel_gpu_cuda_2.cu
@ -0,0 +1,70 @@
 //========================================================================================================================================================================================================200
 //	findRangeK function
 //========================================================================================================================================================================================================200
 // findRangeK: range lookup in the flattened B+ tree.
 // Launch layout: one block per query (bid), one thread per key slot (thid).
 // Two descents run side by side: one towards startD[bid] (tracked in
 // currKnodeD/offsetD) and one towards endD[bid] (tracked in
 // lastKnodeD/offset_2D). Afterwards RecstartD[bid] receives the index stored
 // with the start key and ReclenD[bid] the inclusive distance from it to the
 // index stored with the end key; each is written only if its key is found.
 __global__ void
 findRangeK(	long height,
 			knode *knodesD,
 			long knodes_elem,
 			long *currKnodeD,
 			long *offsetD,
 			long *lastKnodeD,
 			long *offset_2D,
 			int *startD,
 			int *endD,
 			int *RecstartD,
 			int *ReclenD)
 {
 	// private thread IDs
 	const int thid = threadIdx.x;
 	const int bid = blockIdx.x;
 	// descend one tree level per iteration
 	for(int level = 0; level < height; level++){
 		// Nodes currently examined by the two descents. currKnodeD/lastKnodeD are
 		// only rewritten between the two barriers below, so reading them once here
 		// is safe.
 		const knode *lowNode = &knodesD[currKnodeD[bid]];
 		const knode *highNode = &knodesD[lastKnodeD[bid]];
 		if((lowNode->keys[thid] <= startD[bid]) && (startD[bid] < lowNode->keys[thid+1])){
 			// This guard avoids a crash: child indices stored in knodes->indices by the
 			// host may lie outside the knodes array, and following them would fault.
 			const int child = lowNode->indices[thid];
 			if(child < knodes_elem){
 				offsetD[bid] = child;
 			}
 		}
 		if((highNode->keys[thid] <= endD[bid]) && (endD[bid] < highNode->keys[thid+1])){
 			// Same out-of-bounds guard for the descent towards the end key.
 			const int child = highNode->indices[thid];
 			if(child < knodes_elem){
 				offset_2D[bid] = child;
 			}
 		}
 		__syncthreads();
 		// set for next tree level
 		if(thid==0){
 			currKnodeD[bid] = offsetD[bid];
 			lastKnodeD[bid] = offset_2D[bid];
 		}
 		__syncthreads();
 	}
 	// Find the index of the starting record
 	const knode *startLeaf = &knodesD[currKnodeD[bid]];
 	if(startLeaf->keys[thid] == startD[bid]){
 		RecstartD[bid] = startLeaf->indices[thid];
 	}
 	// RecstartD[bid] is read below by a possibly different thread
 	__syncthreads();
 	// Find the index of the ending record
 	const knode *endLeaf = &knodesD[lastKnodeD[bid]];
 	if(endLeaf->keys[thid] == endD[bid]){
 		ReclenD[bid] = endLeaf->indices[thid] - RecstartD[bid]+1;
 	}
 }
 //========================================================================================================================================================================================================200
 //	End
 //========================================================================================================================================================================================================200
--- a/examples/btree/kernel/kernel_gpu_cuda_wrapper.cu
+++ b/examples/btree/kernel/kernel_gpu_cuda_wrapper.cu
@ -0,0 +1,292 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
 //========================================================================================================================================================================================================200
 //	DEFINE/INCLUDE
 //========================================================================================================================================================================================================200
 //======================================================================================================================================================150
 //	COMMON
 //======================================================================================================================================================150
 #include "../common.h"								// (in main program directory)			needed to recognized input variables
 //======================================================================================================================================================150
 //	UTILITIES
 //======================================================================================================================================================150
 #include "../util/cuda/cuda.h"					// (in path specified to compiler)	needed by for device functions
 #include "../util/timer/timer.h"					// (in path specified to compiler)	needed by timer
 //======================================================================================================================================================150
 //	KERNEL
 //======================================================================================================================================================150
 #include "./kernel_gpu_cuda.cu"						// (in current directory)	GPU kernel, cannot include with header file because of complications with passing of constant memory variables
 //======================================================================================================================================================150
 //	HEADER
 //======================================================================================================================================================150
 #include "./kernel_gpu_cuda_wrapper.h"				// (in current directory)
 //========================================================================================================================================================================================================200
 //	KERNEL_GPU_CUDA_WRAPPER FUNCTION
 //========================================================================================================================================================================================================200
 //========================================================================================================================================================================================================200
 //	KERNEL_GPU_CUDA_WRAPPER FUNCTION
 //
 //	Host driver for the findK kernel: allocates device buffers, copies the
 //	inputs in, launches one block per query, copies the answers back, frees
 //	the buffers and prints a per-stage timing breakdown.
 //
 //	records / records_mem   record array and its size in bytes
 //	knodes / knodes_mem     flattened node array and its size in bytes
 //	knodes_elem             number of entries in knodes (forwarded to the kernel)
 //	order                   threads per block, capped at 1024
 //	maxheight               number of tree levels the kernel descends
 //	count                   number of queries = number of blocks = length of
 //	                        currKnode, offset, keys and ans
 //	currKnode, offset       per-query descent state, copied to the device
 //	keys                    per-query search key
 //	ans                     per-query result; copied in and copied back out
 //========================================================================================================================================================================================================200
 void
 kernel_gpu_cuda_wrapper(record *records,
 						long records_mem,
 						knode *knodes,
 						long knodes_elem,
 						long knodes_mem,
 						int order,
 						long maxheight,
 						int count,
 						long *currKnode,
 						long *offset,
 						int *keys,
 						record *ans)
 {
 	//======================================================================================================================================================150
 	//	CPU VARIABLES
 	//======================================================================================================================================================150
 	// stage timestamps; differences are divided by 1000000 below and printed as
 	// seconds, so get_time() presumably returns microseconds -- TODO confirm
 	long long time0;
 	long long time1;
 	long long time2;
 	long long time3;
 	long long time4;
 	long long time5;
 	long long time6;
 	time0 = get_time();
 	//======================================================================================================================================================150
 	//	GPU SETUP
 	//======================================================================================================================================================150
 	// initial driver overhead
 	cudaThreadSynchronize();
 	// execution parameters
 	int numBlocks;
 	numBlocks = count;									// max # of blocks can be 65,535
 	int threadsPerBlock;
 	threadsPerBlock = order < 1024 ? order : 1024;
 	printf("# of blocks = %d, # of threads/block = %d (ensure that device can handle)\n", numBlocks, threadsPerBlock);
 	time1 = get_time();
 	//======================================================================================================================================================150
 	//	GPU MEMORY				(MALLOC)
 	//======================================================================================================================================================150
 	// device in: recordsD
 	record *recordsD;
 	cudaMalloc((void**)&recordsD, records_mem);
 	checkCUDAError("cudaMalloc  recordsD");
 	// device in: knodesD (label previously said recordsD, which misattributed failures)
 	knode *knodesD;
 	cudaMalloc((void**)&knodesD, knodes_mem);
 	checkCUDAError("cudaMalloc  knodesD");
 	// device in: currKnodeD
 	long *currKnodeD;
 	cudaMalloc((void**)&currKnodeD, count*sizeof(long));
 	checkCUDAError("cudaMalloc  currKnodeD");
 	// device in: offsetD
 	long *offsetD;
 	cudaMalloc((void**)&offsetD, count*sizeof(long));
 	checkCUDAError("cudaMalloc  offsetD");
 	// device in: keysD
 	int *keysD;
 	cudaMalloc((void**)&keysD, count*sizeof(int));
 	checkCUDAError("cudaMalloc  keysD");
 	// device in/out: ansD
 	record *ansD;
 	cudaMalloc((void**)&ansD, count*sizeof(record));
 	checkCUDAError("cudaMalloc ansD");
 	time2 = get_time();
 	//======================================================================================================================================================150
 	//	GPU MEMORY			COPY IN
 	//======================================================================================================================================================150
 	// recordsD / knodesD (labels now name the buffer instead of the shared "memD")
 	cudaMemcpy(recordsD, records, records_mem, cudaMemcpyHostToDevice);
 	checkCUDAError("cudaMalloc cudaMemcpy recordsD");
 	cudaMemcpy(knodesD, knodes, knodes_mem, cudaMemcpyHostToDevice);
 	checkCUDAError("cudaMalloc cudaMemcpy knodesD");
 	// currKnodeD
 	cudaMemcpy(currKnodeD, currKnode, count*sizeof(long), cudaMemcpyHostToDevice);
 	checkCUDAError("cudaMalloc cudaMemcpy currKnodeD");
 	// offsetD
 	cudaMemcpy(offsetD, offset, count*sizeof(long), cudaMemcpyHostToDevice);
 	checkCUDAError("cudaMalloc cudaMemcpy offsetD");
 	// keysD
 	cudaMemcpy(keysD, keys, count*sizeof(int), cudaMemcpyHostToDevice);
 	checkCUDAError("cudaMalloc cudaMemcpy keysD");
 	// ansD is copied in as well: the kernel only writes entries whose key is
 	// found, so the remaining entries keep the caller's initial values
 	cudaMemcpy(ansD, ans, count*sizeof(record), cudaMemcpyHostToDevice);
 	checkCUDAError("cudaMalloc cudaMemcpy ansD");
 	time3 = get_time();
 	//======================================================================================================================================================150
 	// findK kernel
 	//======================================================================================================================================================150
 	findK<<<numBlocks, threadsPerBlock>>>(	maxheight,
 											knodesD,
 											knodes_elem,
 											recordsD,
 											currKnodeD,
 											offsetD,
 											keysD,
 											ansD);
 	// wait for the kernel so that time4 measures its execution, not just the launch
 	cudaThreadSynchronize();
 	checkCUDAError("findK");
 	time4 = get_time();
 	//======================================================================================================================================================150
 	//	GPU MEMORY			COPY OUT
 	//======================================================================================================================================================150
 	cudaMemcpy(ans, ansD, count*sizeof(record), cudaMemcpyDeviceToHost);
 	checkCUDAError("cudaMemcpy ansD");
 	time5 = get_time();
 	//======================================================================================================================================================150
 	//	GPU MEMORY DEALLOCATION
 	//======================================================================================================================================================150
 	cudaFree(recordsD);
 	cudaFree(knodesD);
 	cudaFree(currKnodeD);
 	cudaFree(offsetD);
 	cudaFree(keysD);
 	cudaFree(ansD);
 	time6 = get_time();
 	//======================================================================================================================================================150
 	//	DISPLAY TIMING
 	//======================================================================================================================================================150
 	// A literal percent sign must be written "%%"; the previous "% :" was an
 	// invalid conversion specification.
 	printf("Time spent in different stages of GPU_CUDA KERNEL:\n");
 	printf("%15.12f s, %15.12f %% : GPU: SET DEVICE / DRIVER INIT\n",	(float) (time1-time0) / 1000000, (float) (time1-time0) / (float) (time6-time0) * 100);
 	printf("%15.12f s, %15.12f %% : GPU MEM: ALO\n", 					(float) (time2-time1) / 1000000, (float) (time2-time1) / (float) (time6-time0) * 100);
 	printf("%15.12f s, %15.12f %% : GPU MEM: COPY IN\n",					(float) (time3-time2) / 1000000, (float) (time3-time2) / (float) (time6-time0) * 100);
 	printf("%15.12f s, %15.12f %% : GPU: KERNEL\n",						(float) (time4-time3) / 1000000, (float) (time4-time3) / (float) (time6-time0) * 100);
 	printf("%15.12f s, %15.12f %% : GPU MEM: COPY OUT\n",				(float) (time5-time4) / 1000000, (float) (time5-time4) / (float) (time6-time0) * 100);
 	printf("%15.12f s, %15.12f %% : GPU MEM: FRE\n", 					(float) (time6-time5) / 1000000, (float) (time6-time5) / (float) (time6-time0) * 100);
 	printf("Total time:\n");
 	printf("%.12f s\n", 												(float) (time6-time0) / 1000000);
 }
 //========================================================================================================================================================================================================200
 //	END
 //========================================================================================================================================================================================================200
 #ifdef __cplusplus
 }
 #endif
--- a/examples/btree/kernel/kernel_gpu_cuda_wrapper.h
+++ b/examples/btree/kernel/kernel_gpu_cuda_wrapper.h
@ -0,0 +1,23 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
 //========================================================================================================================================================================================================200
 //	KERNEL_GPU_CUDA_WRAPPER HEADER
 //========================================================================================================================================================================================================200
 // Declaration of the C-linkage host entry point kernel_gpu_cuda_wrapper().
 // NOTE(review): `record` and `knode` are not declared in this header; it
 // presumably has to be included after the header that defines them
 // (../common.h?) -- confirm.
 // NOTE(review): the definition is presumably in kernel_gpu_cuda_wrapper.cu and
 // presumably treats records_mem / knodes_mem as sizes in bytes and `count` as
 // the length of currKnode / offset / keys / ans -- verify against it.
 void kernel_gpu_cuda_wrapper(record *records, long records_mem, knode *knodes,
                             long knodes_elem, long knodes_mem,
                             int order, long maxheight, int count,
                             long *currKnode, long *offset, int *keys,
                             record *ans);
 //========================================================================================================================================================================================================200
 //	End
 //========================================================================================================================================================================================================200
 #ifdef __cplusplus
 }
 #endif
--- a/examples/btree/kernel/kernel_gpu_cuda_wrapper_2.cu
+++ b/examples/btree/kernel/kernel_gpu_cuda_wrapper_2.cu
@ -0,0 +1,347 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
 //========================================================================================================================================================================================================200
 //	INCLUDE
 //========================================================================================================================================================================================================200
 //======================================================================================================================================================150
 //	COMMON
 //======================================================================================================================================================150
 #include "../common.h"									// (in the main program folder)	needed to recognized input parameters
 //======================================================================================================================================================150
 //	UTILITIES
 //======================================================================================================================================================150
 #include "../util/cuda/cuda.h"							// (in library path specified to compiler)	needed by for device functions
 #include "../util/timer/timer.h"						// (in library path specified to compiler)	needed by timer
 //======================================================================================================================================================150
 //	KERNEL
 //======================================================================================================================================================150
 #include "./kernel_gpu_cuda_2.cu"						// (in the current directory)	GPU kernel, cannot include with header file because of complications with passing of constant memory variables
 //======================================================================================================================================================150
 //	HEADER
 //======================================================================================================================================================150
 #include "./kernel_gpu_cuda_wrapper_2.h"				// (in the current directory)
 //========================================================================================================================================================================================================200
 //	FUNCTION
 //========================================================================================================================================================================================================200
 // Host-side driver for the findRangeK kernel.
 //
 // Allocates the device buffers, uploads every input array, launches findRangeK
 // on `count` blocks of min(order, 1024) threads, downloads the two result
 // arrays, releases the device buffers and prints a per-stage timing report to
 // stdout.
 //
 //	knodes			host array uploaded to the device (knodes_mem bytes)
 //	knodes_elem		forwarded unchanged to the kernel
 //	knodes_mem		byte count used for the knodes allocation and copy
 //	order			requested threads per block, capped at 1024
 //	maxheight		forwarded to the kernel as its first argument
 //	count			number of blocks; also the element count of every array below
 //	currKnode, offset,
 //	lastKnode, offset_2	long[count] inputs, uploaded only
 //	start, end		int[count] inputs, uploaded only
 //	recstart, reclength	int[count]; uploaded before the launch and overwritten
 //				with the device contents after the kernel finishes
 //
 // NOTE(review): get_time() is presumably in microseconds (the report divides
 // by 1000000 and labels the value "s") -- confirm in util/timer.
 // Every CUDA call is followed by checkCUDAError(); what happens on failure is
 // decided by that helper, which is defined elsewhere.
 void
 kernel_gpu_cuda_wrapper_2(	knode *knodes,
 							long knodes_elem,
 							long knodes_mem,
 							int order,
 							long maxheight,
 							int count,
 							long *currKnode,
 							long *offset,
 							long *lastKnode,
 							long *offset_2,
 							int *start,
 							int *end,
 							int *recstart,
 							int *reclength)
 {
 	//======================================================================================================================================================150
 	//	CPU VARIABLES
 	//======================================================================================================================================================150
 	// timestamps taken at the boundary of each stage
 	long long time0;
 	long long time1;
 	long long time2;
 	long long time3;
 	long long time4;
 	long long time5;
 	long long time6;
 	time0 = get_time();
 	//======================================================================================================================================================150
 	//	GPU SETUP
 	//======================================================================================================================================================150
 	// first runtime call, so that driver start-up cost is accounted to the setup stage
 	cudaThreadSynchronize();
 	// one block per query, at most 1024 threads per block
 	int numBlocks = count;
 	int threadsPerBlock = order < 1024 ? order : 1024;
 	printf("# of blocks = %d, # of threads/block = %d (ensure that device can handle)\n", numBlocks, threadsPerBlock);
 	time1 = get_time();
 	//======================================================================================================================================================150
 	//	GPU MEMORY				MALLOC
 	//======================================================================================================================================================150
 	// byte sizes shared by the allocations and the copies below
 	size_t longBytes = count*sizeof(long);
 	size_t intBytes = count*sizeof(int);
 	// device inputs
 	knode *knodesD;
 	cudaMalloc((void**)&knodesD, knodes_mem);
 	checkCUDAError("cudaMalloc  knodesD");		// label used to name recordsD, which this function never allocates
 	long *currKnodeD;
 	cudaMalloc((void**)&currKnodeD, longBytes);
 	checkCUDAError("cudaMalloc  currKnodeD");
 	long *offsetD;
 	cudaMalloc((void**)&offsetD, longBytes);
 	checkCUDAError("cudaMalloc  offsetD");
 	long *lastKnodeD;
 	cudaMalloc((void**)&lastKnodeD, longBytes);
 	checkCUDAError("cudaMalloc  lastKnodeD");
 	long *offset_2D;
 	cudaMalloc((void**)&offset_2D, longBytes);
 	checkCUDAError("cudaMalloc  offset_2D");
 	int *startD;
 	cudaMalloc((void**)&startD, intBytes);
 	checkCUDAError("cudaMalloc startD");
 	int *endD;
 	cudaMalloc((void**)&endD, intBytes);
 	checkCUDAError("cudaMalloc endD");
 	// device in/out
 	int *ansDStart;
 	cudaMalloc((void**)&ansDStart, intBytes);
 	checkCUDAError("cudaMalloc ansDStart");
 	int *ansDLength;
 	cudaMalloc((void**)&ansDLength, intBytes);
 	checkCUDAError("cudaMalloc ansDLength");
 	time2 = get_time();
 	//======================================================================================================================================================150
 	//	GPU MEMORY			COPY
 	//======================================================================================================================================================150
 	// device inputs
 	cudaMemcpy(knodesD, knodes, knodes_mem, cudaMemcpyHostToDevice);
 	checkCUDAError("cudaMemcpy knodesD");		// label used to read "cudaMalloc cudaMemcpy memD"
 	cudaMemcpy(currKnodeD, currKnode, longBytes, cudaMemcpyHostToDevice);
 	checkCUDAError("cudaMalloc cudaMemcpy currKnodeD");
 	cudaMemcpy(offsetD, offset, longBytes, cudaMemcpyHostToDevice);
 	checkCUDAError("cudaMalloc cudaMemcpy offsetD");
 	cudaMemcpy(lastKnodeD, lastKnode, longBytes, cudaMemcpyHostToDevice);
 	checkCUDAError("cudaMalloc cudaMemcpy lastKnodeD");
 	cudaMemcpy(offset_2D, offset_2, longBytes, cudaMemcpyHostToDevice);
 	checkCUDAError("cudaMalloc cudaMemcpy offset_2D");
 	cudaMemcpy(startD, start, intBytes, cudaMemcpyHostToDevice);
 	checkCUDAError("cudaMemcpy startD");
 	cudaMemcpy(endD, end, intBytes, cudaMemcpyHostToDevice);
 	checkCUDAError("cudaMemcpy endD");
 	// device in/out: the current host contents are uploaded as the initial values
 	cudaMemcpy(ansDStart, recstart, intBytes, cudaMemcpyHostToDevice);
 	checkCUDAError("cudaMemcpy ansDStart");
 	cudaMemcpy(ansDLength, reclength, intBytes, cudaMemcpyHostToDevice);
 	checkCUDAError("cudaMemcpy ansDLength");
 	time3 = get_time();
 	//======================================================================================================================================================150
 	//	KERNEL
 	//======================================================================================================================================================150
 	// [GPU] findRangeK kernel
 	findRangeK<<<numBlocks, threadsPerBlock>>>(	maxheight,
 												knodesD,
 												knodes_elem,
 												currKnodeD,
 												offsetD,
 												lastKnodeD,
 												offset_2D,
 												startD,
 												endD,
 												ansDStart,
 												ansDLength);
 	// wait for the kernel so that time4 covers its execution and errors surface here
 	cudaThreadSynchronize();
 	checkCUDAError("findRangeK");
 	time4 = get_time();
 	//======================================================================================================================================================150
 	//	GPU MEMORY			COPY (CONTD.)
 	//======================================================================================================================================================150
 	cudaMemcpy(recstart, ansDStart, intBytes, cudaMemcpyDeviceToHost);
 	checkCUDAError("cudaMemcpy ansDStart");
 	cudaMemcpy(reclength, ansDLength, intBytes, cudaMemcpyDeviceToHost);
 	checkCUDAError("cudaMemcpy ansDLength");
 	time5 = get_time();
 	//======================================================================================================================================================150
 	//	GPU MEMORY DEALLOCATION
 	//======================================================================================================================================================150
 	cudaFree(knodesD);
 	cudaFree(currKnodeD);
 	cudaFree(offsetD);
 	cudaFree(lastKnodeD);
 	cudaFree(offset_2D);
 	cudaFree(startD);
 	cudaFree(endD);
 	cudaFree(ansDStart);
 	cudaFree(ansDLength);
 	time6 = get_time();
 	//======================================================================================================================================================150
 	//	DISPLAY TIMING
 	//======================================================================================================================================================150
 	// Intervals are converted to double (not float) so the long long differences keep their precision.
 	// A literal percent sign must be written "%%": the former "% :" was an invalid conversion specification.
 	double total = (double) (time6-time0);
 	printf("Time spent in different stages of GPU_CUDA KERNEL:\n");
 	printf("%15.12f s, %15.12f %% : GPU: SET DEVICE / DRIVER INIT\n",	(double) (time1-time0) / 1000000, (double) (time1-time0) / total * 100);
 	printf("%15.12f s, %15.12f %% : GPU MEM: ALO\n", 					(double) (time2-time1) / 1000000, (double) (time2-time1) / total * 100);
 	printf("%15.12f s, %15.12f %% : GPU MEM: COPY IN\n",					(double) (time3-time2) / 1000000, (double) (time3-time2) / total * 100);
 	printf("%15.12f s, %15.12f %% : GPU: KERNEL\n",						(double) (time4-time3) / 1000000, (double) (time4-time3) / total * 100);
 	printf("%15.12f s, %15.12f %% : GPU MEM: COPY OUT\n",				(double) (time5-time4) / 1000000, (double) (time5-time4) / total * 100);
 	printf("%15.12f s, %15.12f %% : GPU MEM: FRE\n", 					(double) (time6-time5) / 1000000, (double) (time6-time5) / total * 100);
 	printf("Total time:\n");
 	printf("%.12f s\n", 												total / 1000000);
 }
 //========================================================================================================================================================================================================200
 //	END
 //========================================================================================================================================================================================================200
 #ifdef __cplusplus
 }
 #endif
--- a/examples/btree/kernel/kernel_gpu_cuda_wrapper_2.h
+++ b/examples/btree/kernel/kernel_gpu_cuda_wrapper_2.h
@ -0,0 +1,23 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
 //========================================================================================================================================================================================================200
 //	KERNEL_GPU_CUDA_WRAPPER_2 HEADER
 //========================================================================================================================================================================================================200
 // Declaration of the C-linkage host driver defined in
 // kernel_gpu_cuda_wrapper_2.cu. That definition uploads the arrays below,
 // launches findRangeK on `count` blocks of min(order, 1024) threads, copies the
 // device results back into recstart / reclength and prints stage timings.
 //   knodes_mem   size in bytes of the knodes array
 //   count        element count of every long* / int* array argument
 //   recstart, reclength   uploaded before the launch and overwritten afterwards
 // NOTE(review): `knode` is not declared in this header; it presumably has to be
 // included after the header that defines it (../common.h?) -- confirm.
 void kernel_gpu_cuda_wrapper_2(knode *knodes, long knodes_elem, long knodes_mem,
                               int order, long maxheight, int count,
                               long *currKnode, long *offset, long *lastKnode,
                               long *offset_2, int *start, int *end,
                               int *recstart, int *reclength);
 //========================================================================================================================================================================================================200
 //	End
 //========================================================================================================================================================================================================200
 #ifdef __cplusplus
 }
 #endif
--- a/examples/btree/kernel_gpu_cuda_wrapper-cuda-nvptx64-nvidia-cuda-sm_61.ll
+++ b/examples/btree/kernel_gpu_cuda_wrapper-cuda-nvptx64-nvidia-cuda-sm_61.ll
@ -0,0 +1,332 @@
 ; ModuleID = 'kernel_gpu_cuda_wrapper-cuda-nvptx64-nvidia-cuda-sm_61.bc'
 source_filename = "kernel/kernel_gpu_cuda_wrapper.cu"
 target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"
 %struct.__cuda_builtin_threadIdx_t = type { i8 }
 %struct.__cuda_builtin_blockIdx_t = type { i8 }
 %struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
 %struct.knode = type { i32, [257 x i32], [257 x i32], i8, i32 }
 %struct.record = type { i32 }
 $_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
 $_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
 ; Function Attrs: convergent noinline nounwind optnone
 ; Weak device-side definition of cudaMalloc(i8** %p, i64 %s): it only spills
 ; both arguments to stack slots and returns the constant 999; nothing is
 ; allocated.
 ; NOTE(review): 999 is presumably cudaErrorUnknown -- confirm against the CUDA
 ; headers this module was built with.
 define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
 entry:
  %p.addr = alloca i8**, align 8
  %s.addr = alloca i64, align 8
  store i8** %p, i8*** %p.addr, align 8
  store i64 %s, i64* %s.addr, align 8
  ret i32 999
 }
 ; Function Attrs: convergent noinline nounwind optnone
 ; Weak device-side definition of cudaFuncGetAttributes: spills %p and %c to
 ; stack slots, never writes through %p, and returns the constant 999.
 ; NOTE(review): 999 is presumably cudaErrorUnknown -- confirm.
 define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
 entry:
  %p.addr = alloca %struct.cudaFuncAttributes*, align 8
  %c.addr = alloca i8*, align 8
  store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
  store i8* %c, i8** %c.addr, align 8
  ret i32 999
 }
 ; Function Attrs: convergent noinline nounwind optnone
 ; Weak device-side definition of cudaDeviceGetAttribute: spills its three
 ; arguments to stack slots, never writes through %value, and returns the
 ; constant 999.
 ; NOTE(review): 999 is presumably cudaErrorUnknown -- confirm.
 define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
 entry:
  %value.addr = alloca i32*, align 8
  %attr.addr = alloca i32, align 4
  %device.addr = alloca i32, align 4
  store i32* %value, i32** %value.addr, align 8
  store i32 %attr, i32* %attr.addr, align 4
  store i32 %device, i32* %device.addr, align 4
  ret i32 999
 }
 ; Function Attrs: convergent noinline nounwind optnone
 ; Weak device-side definition of cudaGetDevice: spills %device to a stack
 ; slot, never writes through it, and returns the constant 999.
 ; NOTE(review): 999 is presumably cudaErrorUnknown -- confirm.
 define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
 entry:
  %device.addr = alloca i32*, align 8
  store i32* %device, i32** %device.addr, align 8
  ret i32 999
 }
 ; Function Attrs: convergent noinline nounwind optnone
 ; Weak device-side definition of cudaOccupancyMaxActiveBlocksPerMultiprocessor:
 ; spills its four arguments to stack slots, never writes through %numBlocks,
 ; and returns the constant 999.
 ; NOTE(review): 999 is presumably cudaErrorUnknown -- confirm.
 define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
 entry:
  %numBlocks.addr = alloca i32*, align 8
  %func.addr = alloca i8*, align 8
  %blockSize.addr = alloca i32, align 4
  %dynamicSmemSize.addr = alloca i64, align 8
  store i32* %numBlocks, i32** %numBlocks.addr, align 8
  store i8* %func, i8** %func.addr, align 8
  store i32 %blockSize, i32* %blockSize.addr, align 4
  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
  ret i32 999
 }
 ; Function Attrs: convergent noinline nounwind optnone
 ; Weak device-side definition of
 ; cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags: spills its five
 ; arguments to stack slots, never writes through %numBlocks, and returns the
 ; constant 999.
 ; NOTE(review): 999 is presumably cudaErrorUnknown -- confirm.
 define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
 entry:
  %numBlocks.addr = alloca i32*, align 8
  %func.addr = alloca i8*, align 8
  %blockSize.addr = alloca i32, align 4
  %dynamicSmemSize.addr = alloca i64, align 8
  %flags.addr = alloca i32, align 4
  store i32* %numBlocks, i32** %numBlocks.addr, align 8
  store i8* %func, i8** %func.addr, align 8
  store i32 %blockSize, i32* %blockSize.addr, align 4
  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
  store i32 %flags, i32* %flags.addr, align 4
  ret i32 999
 }
 ; Function Attrs: convergent noinline nounwind optnone
 ; findK kernel (registered as a kernel by !3 in !nvvm.annotations), unoptimized
 ; (optnone) IR: every argument and local lives in an alloca.
 ; With thid = threadIdx.x and bid = blockIdx.x, the control flow below is:
 ;   for (i = 0; i < height; i++) {
 ;     if (knodesD[currKnodeD[bid]].keys[thid]   <= keysD[bid] &&
 ;         knodesD[currKnodeD[bid]].keys[thid+1] >  keysD[bid]) {
 ;       if (knodesD[offsetD[bid]].indices[thid] < knodes_elem)
 ;         offsetD[bid] = knodesD[offsetD[bid]].indices[thid];
 ;     }
 ;     barrier0();
 ;     if (thid == 0) currKnodeD[bid] = offsetD[bid];
 ;     barrier0();
 ;   }
 ;   if (knodesD[currKnodeD[bid]].keys[thid] == keysD[bid])
 ;     ansD[bid].value = recordsD[knodesD[currKnodeD[bid]].indices[thid]].value;
 ; "indices" and "keys" are fields 1 and 2 of %struct.knode, both [257 x i32].
 define dso_local void @findK(i64 %height, %struct.knode* %knodesD, i64 %knodes_elem, %struct.record* %recordsD, i64* %currKnodeD, i64* %offsetD, i32* %keysD, %struct.record* %ansD) #0 {
 ; entry: spill the arguments, read the thread and block index, set i = 0.
 entry:
  %height.addr = alloca i64, align 8
  %knodesD.addr = alloca %struct.knode*, align 8
  %knodes_elem.addr = alloca i64, align 8
  %recordsD.addr = alloca %struct.record*, align 8
  %currKnodeD.addr = alloca i64*, align 8
  %offsetD.addr = alloca i64*, align 8
  %keysD.addr = alloca i32*, align 8
  %ansD.addr = alloca %struct.record*, align 8
  %thid = alloca i32, align 4
  %bid = alloca i32, align 4
  %i = alloca i32, align 4
  store i64 %height, i64* %height.addr, align 8
  store %struct.knode* %knodesD, %struct.knode** %knodesD.addr, align 8
  store i64 %knodes_elem, i64* %knodes_elem.addr, align 8
  store %struct.record* %recordsD, %struct.record** %recordsD.addr, align 8
  store i64* %currKnodeD, i64** %currKnodeD.addr, align 8
  store i64* %offsetD, i64** %offsetD.addr, align 8
  store i32* %keysD, i32** %keysD.addr, align 8
  store %struct.record* %ansD, %struct.record** %ansD.addr, align 8
  %call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
  store i32 %call, i32* %thid, align 4
  %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
  store i32 %call1, i32* %bid, align 4
  store i32 0, i32* %i, align 4
  br label %for.cond
 ; for.cond: continue while (i64)i < height (signed compare).
 for.cond:                                         ; preds = %for.inc, %entry
  %0 = load i32, i32* %i, align 4
  %conv = sext i32 %0 to i64
  %1 = load i64, i64* %height.addr, align 8
  %cmp = icmp slt i64 %conv, %1
  br i1 %cmp, label %for.body, label %for.end
 ; for.body: first half of the test, knodesD[currKnodeD[bid]].keys[thid] <= keysD[bid].
 for.body:                                         ; preds = %for.cond
  %2 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
  %3 = load i64*, i64** %currKnodeD.addr, align 8
  %4 = load i32, i32* %bid, align 4
  %idxprom = sext i32 %4 to i64
  %arrayidx = getelementptr inbounds i64, i64* %3, i64 %idxprom
  %5 = load i64, i64* %arrayidx, align 8
  %arrayidx2 = getelementptr inbounds %struct.knode, %struct.knode* %2, i64 %5
  %keys = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx2, i32 0, i32 2
  %6 = load i32, i32* %thid, align 4
  %idxprom3 = sext i32 %6 to i64
  %arrayidx4 = getelementptr inbounds [257 x i32], [257 x i32]* %keys, i64 0, i64 %idxprom3
  %7 = load i32, i32* %arrayidx4, align 4
  %8 = load i32*, i32** %keysD.addr, align 8
  %9 = load i32, i32* %bid, align 4
  %idxprom5 = sext i32 %9 to i64
  %arrayidx6 = getelementptr inbounds i32, i32* %8, i64 %idxprom5
  %10 = load i32, i32* %arrayidx6, align 4
  %cmp7 = icmp sle i32 %7, %10
  br i1 %cmp7, label %land.lhs.true, label %if.end34
 ; land.lhs.true: second half of the test, knodesD[currKnodeD[bid]].keys[thid+1] > keysD[bid].
 land.lhs.true:                                    ; preds = %for.body
  %11 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
  %12 = load i64*, i64** %currKnodeD.addr, align 8
  %13 = load i32, i32* %bid, align 4
  %idxprom8 = sext i32 %13 to i64
  %arrayidx9 = getelementptr inbounds i64, i64* %12, i64 %idxprom8
  %14 = load i64, i64* %arrayidx9, align 8
  %arrayidx10 = getelementptr inbounds %struct.knode, %struct.knode* %11, i64 %14
  %keys11 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx10, i32 0, i32 2
  %15 = load i32, i32* %thid, align 4
  %add = add nsw i32 %15, 1
  %idxprom12 = sext i32 %add to i64
  %arrayidx13 = getelementptr inbounds [257 x i32], [257 x i32]* %keys11, i64 0, i64 %idxprom12
  %16 = load i32, i32* %arrayidx13, align 4
  %17 = load i32*, i32** %keysD.addr, align 8
  %18 = load i32, i32* %bid, align 4
  %idxprom14 = sext i32 %18 to i64
  %arrayidx15 = getelementptr inbounds i32, i32* %17, i64 %idxprom14
  %19 = load i32, i32* %arrayidx15, align 4
  %cmp16 = icmp sgt i32 %16, %19
  br i1 %cmp16, label %if.then, label %if.end34
 ; if.then: bounds check, knodesD[offsetD[bid]].indices[thid] < knodes_elem.
 if.then:                                          ; preds = %land.lhs.true
  %20 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
  %21 = load i64*, i64** %offsetD.addr, align 8
  %22 = load i32, i32* %bid, align 4
  %idxprom17 = sext i32 %22 to i64
  %arrayidx18 = getelementptr inbounds i64, i64* %21, i64 %idxprom17
  %23 = load i64, i64* %arrayidx18, align 8
  %arrayidx19 = getelementptr inbounds %struct.knode, %struct.knode* %20, i64 %23
  %indices = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx19, i32 0, i32 1
  %24 = load i32, i32* %thid, align 4
  %idxprom20 = sext i32 %24 to i64
  %arrayidx21 = getelementptr inbounds [257 x i32], [257 x i32]* %indices, i64 0, i64 %idxprom20
  %25 = load i32, i32* %arrayidx21, align 4
  %conv22 = sext i32 %25 to i64
  %26 = load i64, i64* %knodes_elem.addr, align 8
  %cmp23 = icmp slt i64 %conv22, %26
  br i1 %cmp23, label %if.then24, label %if.end
 ; if.then24: offsetD[bid] = knodesD[offsetD[bid]].indices[thid] (sign-extended to i64).
 if.then24:                                        ; preds = %if.then
  %27 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
  %28 = load i64*, i64** %offsetD.addr, align 8
  %29 = load i32, i32* %bid, align 4
  %idxprom25 = sext i32 %29 to i64
  %arrayidx26 = getelementptr inbounds i64, i64* %28, i64 %idxprom25
  %30 = load i64, i64* %arrayidx26, align 8
  %arrayidx27 = getelementptr inbounds %struct.knode, %struct.knode* %27, i64 %30
  %indices28 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx27, i32 0, i32 1
  %31 = load i32, i32* %thid, align 4
  %idxprom29 = sext i32 %31 to i64
  %arrayidx30 = getelementptr inbounds [257 x i32], [257 x i32]* %indices28, i64 0, i64 %idxprom29
  %32 = load i32, i32* %arrayidx30, align 4
  %conv31 = sext i32 %32 to i64
  %33 = load i64*, i64** %offsetD.addr, align 8
  %34 = load i32, i32* %bid, align 4
  %idxprom32 = sext i32 %34 to i64
  %arrayidx33 = getelementptr inbounds i64, i64* %33, i64 %idxprom32
  store i64 %conv31, i64* %arrayidx33, align 8
  br label %if.end
 if.end:                                           ; preds = %if.then24, %if.then
  br label %if.end34
 ; if.end34: first barrier of the iteration, reached on every path; then test thid == 0.
 if.end34:                                         ; preds = %if.end, %land.lhs.true, %for.body
  call void @llvm.nvvm.barrier0()
  %35 = load i32, i32* %thid, align 4
  %cmp35 = icmp eq i32 %35, 0
  br i1 %cmp35, label %if.then36, label %if.end41
 ; if.then36: thread 0 only, currKnodeD[bid] = offsetD[bid].
 if.then36:                                        ; preds = %if.end34
  %36 = load i64*, i64** %offsetD.addr, align 8
  %37 = load i32, i32* %bid, align 4
  %idxprom37 = sext i32 %37 to i64
  %arrayidx38 = getelementptr inbounds i64, i64* %36, i64 %idxprom37
  %38 = load i64, i64* %arrayidx38, align 8
  %39 = load i64*, i64** %currKnodeD.addr, align 8
  %40 = load i32, i32* %bid, align 4
  %idxprom39 = sext i32 %40 to i64
  %arrayidx40 = getelementptr inbounds i64, i64* %39, i64 %idxprom39
  store i64 %38, i64* %arrayidx40, align 8
  br label %if.end41
 ; if.end41: second barrier of the iteration, before the next level is read.
 if.end41:                                         ; preds = %if.then36, %if.end34
  call void @llvm.nvvm.barrier0()
  br label %for.inc
 ; for.inc: i = i + 1.
 for.inc:                                          ; preds = %if.end41
  %41 = load i32, i32* %i, align 4
  %inc = add nsw i32 %41, 1
  store i32 %inc, i32* %i, align 4
  br label %for.cond
 ; for.end: after the loop, test knodesD[currKnodeD[bid]].keys[thid] == keysD[bid].
 for.end:                                          ; preds = %for.cond
  %42 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
  %43 = load i64*, i64** %currKnodeD.addr, align 8
  %44 = load i32, i32* %bid, align 4
  %idxprom42 = sext i32 %44 to i64
  %arrayidx43 = getelementptr inbounds i64, i64* %43, i64 %idxprom42
  %45 = load i64, i64* %arrayidx43, align 8
  %arrayidx44 = getelementptr inbounds %struct.knode, %struct.knode* %42, i64 %45
  %keys45 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx44, i32 0, i32 2
  %46 = load i32, i32* %thid, align 4
  %idxprom46 = sext i32 %46 to i64
  %arrayidx47 = getelementptr inbounds [257 x i32], [257 x i32]* %keys45, i64 0, i64 %idxprom46
  %47 = load i32, i32* %arrayidx47, align 4
  %48 = load i32*, i32** %keysD.addr, align 8
  %49 = load i32, i32* %bid, align 4
  %idxprom48 = sext i32 %49 to i64
  %arrayidx49 = getelementptr inbounds i32, i32* %48, i64 %idxprom48
  %50 = load i32, i32* %arrayidx49, align 4
  %cmp50 = icmp eq i32 %47, %50
  br i1 %cmp50, label %if.then51, label %if.end63
 ; if.then51: ansD[bid].value = recordsD[knodesD[currKnodeD[bid]].indices[thid]].value.
 if.then51:                                        ; preds = %for.end
  %51 = load %struct.record*, %struct.record** %recordsD.addr, align 8
  %52 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
  %53 = load i64*, i64** %currKnodeD.addr, align 8
  %54 = load i32, i32* %bid, align 4
  %idxprom52 = sext i32 %54 to i64
  %arrayidx53 = getelementptr inbounds i64, i64* %53, i64 %idxprom52
  %55 = load i64, i64* %arrayidx53, align 8
  %arrayidx54 = getelementptr inbounds %struct.knode, %struct.knode* %52, i64 %55
  %indices55 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx54, i32 0, i32 1
  %56 = load i32, i32* %thid, align 4
  %idxprom56 = sext i32 %56 to i64
  %arrayidx57 = getelementptr inbounds [257 x i32], [257 x i32]* %indices55, i64 0, i64 %idxprom56
  %57 = load i32, i32* %arrayidx57, align 4
  %idxprom58 = sext i32 %57 to i64
  %arrayidx59 = getelementptr inbounds %struct.record, %struct.record* %51, i64 %idxprom58
  %value = getelementptr inbounds %struct.record, %struct.record* %arrayidx59, i32 0, i32 0
  %58 = load i32, i32* %value, align 4
  %59 = load %struct.record*, %struct.record** %ansD.addr, align 8
  %60 = load i32, i32* %bid, align 4
  %idxprom60 = sext i32 %60 to i64
  %arrayidx61 = getelementptr inbounds %struct.record, %struct.record* %59, i64 %idxprom60
  %value62 = getelementptr inbounds %struct.record, %struct.record* %arrayidx61, i32 0, i32 0
  store i32 %58, i32* %value62, align 4
  br label %if.end63
 if.end63:                                         ; preds = %if.then51, %for.end
  ret void
 }
 ; Function Attrs: alwaysinline convergent nounwind
 ; Accessor behind threadIdx.x: returns the value of the
 ; llvm.nvvm.read.ptx.sreg.tid.x intrinsic (PTX special register tid.x).
 define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
 entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  ret i32 %0
 }
 ; Function Attrs: alwaysinline convergent nounwind
 ; Accessor behind blockIdx.x: returns the value of the
 ; llvm.nvvm.read.ptx.sreg.ctaid.x intrinsic (PTX special register ctaid.x).
 define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
 entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
  ret i32 %0
 }
 ; Function Attrs: convergent nounwind
 declare void @llvm.nvvm.barrier0() #2
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #3
 attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #2 = { convergent nounwind }
 attributes #3 = { nounwind readnone }
 !llvm.module.flags = !{!0, !1, !2}
 !nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
 !llvm.ident = !{!8}
 !nvvmir.version = !{!9}
 !0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
 !1 = !{i32 1, !"wchar_size", i32 4}
 !2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
 !3 = !{void (i64, %struct.knode*, i64, %struct.record*, i64*, i64*, i32*, %struct.record*)* @findK, !"kernel", i32 1}
 !4 = !{null, !"align", i32 8}
 !5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
 !6 = !{null, !"align", i32 16}
 !7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
 !8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
 !9 = !{i32 1, i32 4}
--- a/examples/btree/kernel_gpu_cuda_wrapper-host-x86_64-unknown-linux-gnu.ll
+++ b/examples/btree/kernel_gpu_cuda_wrapper-host-x86_64-unknown-linux-gnu.ll
--- a/examples/btree/kernel_gpu_cuda_wrapper_2-cuda-nvptx64-nvidia-cuda-sm_61.ll
+++ b/examples/btree/kernel_gpu_cuda_wrapper_2-cuda-nvptx64-nvidia-cuda-sm_61.ll
@ -0,0 +1,475 @@
 ; Device-side module (target triple nvptx64-nvidia-cuda) produced from kernel/kernel_gpu_cuda_wrapper_2.cu.
 ; ModuleID = 'kernel_gpu_cuda_wrapper_2-cuda-nvptx64-nvidia-cuda-sm_61.bc'
 source_filename = "kernel/kernel_gpu_cuda_wrapper_2.cu"
 target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"
 ; One-byte placeholder types backing the extern_weak @threadIdx / @blockIdx globals declared below.
 %struct.__cuda_builtin_threadIdx_t = type { i8 }
 %struct.__cuda_builtin_blockIdx_t = type { i8 }
 ; Pointee type of the first parameter of the @cudaFuncGetAttributes stub.
 %struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
 ; Node record read by @findRangeK: its GEPs name field 1 "indices" and field 2 "keys"
 ; (both [257 x i32]); fields 0, 3 and 4 are not accessed anywhere in this module.
 %struct.knode = type { i32, [257 x i32], [257 x i32], i8, i32 }
 ; comdat groups for the two linkonce_odr __fetch_builtin_x helpers defined further down.
 $_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
 $_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
 ; Declarations only (extern_weak); the function bodies in this module obtain the ids through
 ; the llvm.nvvm.read.ptx.sreg.* intrinsics rather than by loading from these globals.
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
 ; Stub body for cudaMalloc in the device module: both arguments are spilled to stack slots
 ; and the function unconditionally returns 999; nothing is allocated and %p is never written through.
 ; `weak` linkage, so another definition of the symbol may replace this one at link time.
 ; NOTE(review): presumably emitted from the toolchain's CUDA wrapper headers, not from the benchmark source - confirm.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
 entry:
  %p.addr = alloca i8**, align 8
  %s.addr = alloca i64, align 8
  store i8** %p, i8*** %p.addr, align 8
  store i64 %s, i64* %s.addr, align 8
  ret i32 999
 }
 ; Stub body for cudaFuncGetAttributes: spills %p and %c to stack slots and returns the
 ; constant 999 without writing anything into the %struct.cudaFuncAttributes pointed to by %p.
 ; `weak` linkage, so another definition of the symbol may replace this one at link time.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
 entry:
  %p.addr = alloca %struct.cudaFuncAttributes*, align 8
  %c.addr = alloca i8*, align 8
  store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
  store i8* %c, i8** %c.addr, align 8
  ret i32 999
 }
 ; Stub body for cudaDeviceGetAttribute: spills its three arguments to stack slots and
 ; returns the constant 999; the i32 pointed to by %value is never written.
 ; `weak` linkage, so another definition of the symbol may replace this one at link time.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
 entry:
  %value.addr = alloca i32*, align 8
  %attr.addr = alloca i32, align 4
  %device.addr = alloca i32, align 4
  store i32* %value, i32** %value.addr, align 8
  store i32 %attr, i32* %attr.addr, align 4
  store i32 %device, i32* %device.addr, align 4
  ret i32 999
 }
 ; Stub body for cudaGetDevice: spills %device to a stack slot and returns the constant 999;
 ; the i32 pointed to by %device is never written.
 ; `weak` linkage, so another definition of the symbol may replace this one at link time.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
 entry:
  %device.addr = alloca i32*, align 8
  store i32* %device, i32** %device.addr, align 8
  ret i32 999
 }
 ; Stub body for cudaOccupancyMaxActiveBlocksPerMultiprocessor: spills its four arguments to
 ; stack slots and returns the constant 999; the i32 pointed to by %numBlocks is never written.
 ; `weak` linkage, so another definition of the symbol may replace this one at link time.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
 entry:
  %numBlocks.addr = alloca i32*, align 8
  %func.addr = alloca i8*, align 8
  %blockSize.addr = alloca i32, align 4
  %dynamicSmemSize.addr = alloca i64, align 8
  store i32* %numBlocks, i32** %numBlocks.addr, align 8
  store i8* %func, i8** %func.addr, align 8
  store i32 %blockSize, i32* %blockSize.addr, align 4
  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
  ret i32 999
 }
 ; Stub body for cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags: same shape as the
 ; variant without flags plus one extra i32 %flags argument. All five arguments are spilled to
 ; stack slots and the constant 999 is returned; the i32 pointed to by %numBlocks is never written.
 ; `weak` linkage, so another definition of the symbol may replace this one at link time.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
 entry:
  %numBlocks.addr = alloca i32*, align 8
  %func.addr = alloca i8*, align 8
  %blockSize.addr = alloca i32, align 4
  %dynamicSmemSize.addr = alloca i64, align 8
  %flags.addr = alloca i32, align 4
  store i32* %numBlocks, i32** %numBlocks.addr, align 8
  store i8* %func, i8** %func.addr, align 8
  store i32 %blockSize, i32* %blockSize.addr, align 4
  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
  store i32 %flags, i32* %flags.addr, align 4
  ret i32 999
 }
 ; findRangeK - the function this module marks as a kernel (see !3 in !nvvm.annotations).
 ;
 ; Pseudo-code of the body, using thid = tid.x, bid = ctaid.x, and the field names the GEPs
 ; below use for %struct.knode (field 1 = indices, field 2 = keys):
 ;
 ;   for (i = 0; (i64)i < height; ++i) {
 ;     cur = knodesD[currKnodeD[bid]];
 ;     if (cur.keys[thid] <= startD[bid] && cur.keys[thid + 1] > startD[bid])
 ;       if ((i64)cur.indices[thid] < knodes_elem) offsetD[bid] = cur.indices[thid];
 ;     last = knodesD[lastKnodeD[bid]];
 ;     if (last.keys[thid] <= endD[bid] && last.keys[thid + 1] > endD[bid])
 ;       if ((i64)last.indices[thid] < knodes_elem) offset_2D[bid] = last.indices[thid];
 ;     barrier0();
 ;     if (thid == 0) { currKnodeD[bid] = offsetD[bid]; lastKnodeD[bid] = offset_2D[bid]; }
 ;     barrier0();
 ;   }
 ;   if (cur.keys[thid] == startD[bid]) RecstartD[bid] = cur.indices[thid];
 ;   barrier0();
 ;   if (last.keys[thid] == endD[bid]) ReclenD[bid] = last.indices[thid] - RecstartD[bid] + 1;
 ;
 ; The code is unoptimized (optnone): every argument and local lives in an alloca and each
 ; sub-expression above is re-loaded and re-indexed from scratch wherever it is used.
 ; Function Attrs: convergent noinline nounwind optnone
 define dso_local void @findRangeK(i64 %height, %struct.knode* %knodesD, i64 %knodes_elem, i64* %currKnodeD, i64* %offsetD, i64* %lastKnodeD, i64* %offset_2D, i32* %startD, i32* %endD, i32* %RecstartD, i32* %ReclenD) #0 {
 entry:
  ; Stack slots for the eleven arguments and the three locals (thid, bid, i).
  %height.addr = alloca i64, align 8
  %knodesD.addr = alloca %struct.knode*, align 8
  %knodes_elem.addr = alloca i64, align 8
  %currKnodeD.addr = alloca i64*, align 8
  %offsetD.addr = alloca i64*, align 8
  %lastKnodeD.addr = alloca i64*, align 8
  %offset_2D.addr = alloca i64*, align 8
  %startD.addr = alloca i32*, align 8
  %endD.addr = alloca i32*, align 8
  %RecstartD.addr = alloca i32*, align 8
  %ReclenD.addr = alloca i32*, align 8
  %thid = alloca i32, align 4
  %bid = alloca i32, align 4
  %i = alloca i32, align 4
  store i64 %height, i64* %height.addr, align 8
  store %struct.knode* %knodesD, %struct.knode** %knodesD.addr, align 8
  store i64 %knodes_elem, i64* %knodes_elem.addr, align 8
  store i64* %currKnodeD, i64** %currKnodeD.addr, align 8
  store i64* %offsetD, i64** %offsetD.addr, align 8
  store i64* %lastKnodeD, i64** %lastKnodeD.addr, align 8
  store i64* %offset_2D, i64** %offset_2D.addr, align 8
  store i32* %startD, i32** %startD.addr, align 8
  store i32* %endD, i32** %endD.addr, align 8
  store i32* %RecstartD, i32** %RecstartD.addr, align 8
  store i32* %ReclenD, i32** %ReclenD.addr, align 8
  ; thid = tid.x and bid = ctaid.x, read through the __fetch_builtin_x helpers of this module; i = 0.
  %call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
  store i32 %call, i32* %thid, align 4
  %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
  store i32 %call1, i32* %bid, align 4
  store i32 0, i32* %i, align 4
  br label %for.cond
 ; Loop test: stay in the loop while (i64)i < height (signed compare).
 for.cond:                                         ; preds = %for.inc, %entry
  %0 = load i32, i32* %i, align 4
  %conv = sext i32 %0 to i64
  %1 = load i64, i64* %height.addr, align 8
  %cmp = icmp slt i64 %conv, %1
  br i1 %cmp, label %for.body, label %for.end
 ; First half of the start-side test: knodesD[currKnodeD[bid]].keys[thid] <= startD[bid].
 for.body:                                         ; preds = %for.cond
  %2 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
  %3 = load i64*, i64** %currKnodeD.addr, align 8
  %4 = load i32, i32* %bid, align 4
  %idxprom = sext i32 %4 to i64
  %arrayidx = getelementptr inbounds i64, i64* %3, i64 %idxprom
  %5 = load i64, i64* %arrayidx, align 8
  %arrayidx2 = getelementptr inbounds %struct.knode, %struct.knode* %2, i64 %5
  %keys = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx2, i32 0, i32 2
  %6 = load i32, i32* %thid, align 4
  %idxprom3 = sext i32 %6 to i64
  %arrayidx4 = getelementptr inbounds [257 x i32], [257 x i32]* %keys, i64 0, i64 %idxprom3
  %7 = load i32, i32* %arrayidx4, align 4
  %8 = load i32*, i32** %startD.addr, align 8
  %9 = load i32, i32* %bid, align 4
  %idxprom5 = sext i32 %9 to i64
  %arrayidx6 = getelementptr inbounds i32, i32* %8, i64 %idxprom5
  %10 = load i32, i32* %arrayidx6, align 4
  %cmp7 = icmp sle i32 %7, %10
  br i1 %cmp7, label %land.lhs.true, label %if.end34
 ; Second half of the start-side test: knodesD[currKnodeD[bid]].keys[thid + 1] > startD[bid].
 land.lhs.true:                                    ; preds = %for.body
  %11 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
  %12 = load i64*, i64** %currKnodeD.addr, align 8
  %13 = load i32, i32* %bid, align 4
  %idxprom8 = sext i32 %13 to i64
  %arrayidx9 = getelementptr inbounds i64, i64* %12, i64 %idxprom8
  %14 = load i64, i64* %arrayidx9, align 8
  %arrayidx10 = getelementptr inbounds %struct.knode, %struct.knode* %11, i64 %14
  %keys11 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx10, i32 0, i32 2
  %15 = load i32, i32* %thid, align 4
  %add = add nsw i32 %15, 1
  %idxprom12 = sext i32 %add to i64
  %arrayidx13 = getelementptr inbounds [257 x i32], [257 x i32]* %keys11, i64 0, i64 %idxprom12
  %16 = load i32, i32* %arrayidx13, align 4
  %17 = load i32*, i32** %startD.addr, align 8
  %18 = load i32, i32* %bid, align 4
  %idxprom14 = sext i32 %18 to i64
  %arrayidx15 = getelementptr inbounds i32, i32* %17, i64 %idxprom14
  %19 = load i32, i32* %arrayidx15, align 4
  %cmp16 = icmp sgt i32 %16, %19
  br i1 %cmp16, label %if.then, label %if.end34
 ; Both halves held: check (i64)knodesD[currKnodeD[bid]].indices[thid] < knodes_elem.
 if.then:                                          ; preds = %land.lhs.true
  %20 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
  %21 = load i64*, i64** %currKnodeD.addr, align 8
  %22 = load i32, i32* %bid, align 4
  %idxprom17 = sext i32 %22 to i64
  %arrayidx18 = getelementptr inbounds i64, i64* %21, i64 %idxprom17
  %23 = load i64, i64* %arrayidx18, align 8
  %arrayidx19 = getelementptr inbounds %struct.knode, %struct.knode* %20, i64 %23
  %indices = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx19, i32 0, i32 1
  %24 = load i32, i32* %thid, align 4
  %idxprom20 = sext i32 %24 to i64
  %arrayidx21 = getelementptr inbounds [257 x i32], [257 x i32]* %indices, i64 0, i64 %idxprom20
  %25 = load i32, i32* %arrayidx21, align 4
  %conv22 = sext i32 %25 to i64
  %26 = load i64, i64* %knodes_elem.addr, align 8
  %cmp23 = icmp slt i64 %conv22, %26
  br i1 %cmp23, label %if.then24, label %if.end
 ; offsetD[bid] = (i64)knodesD[currKnodeD[bid]].indices[thid].
 if.then24:                                        ; preds = %if.then
  %27 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
  %28 = load i64*, i64** %currKnodeD.addr, align 8
  %29 = load i32, i32* %bid, align 4
  %idxprom25 = sext i32 %29 to i64
  %arrayidx26 = getelementptr inbounds i64, i64* %28, i64 %idxprom25
  %30 = load i64, i64* %arrayidx26, align 8
  %arrayidx27 = getelementptr inbounds %struct.knode, %struct.knode* %27, i64 %30
  %indices28 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx27, i32 0, i32 1
  %31 = load i32, i32* %thid, align 4
  %idxprom29 = sext i32 %31 to i64
  %arrayidx30 = getelementptr inbounds [257 x i32], [257 x i32]* %indices28, i64 0, i64 %idxprom29
  %32 = load i32, i32* %arrayidx30, align 4
  %conv31 = sext i32 %32 to i64
  %33 = load i64*, i64** %offsetD.addr, align 8
  %34 = load i32, i32* %bid, align 4
  %idxprom32 = sext i32 %34 to i64
  %arrayidx33 = getelementptr inbounds i64, i64* %33, i64 %idxprom32
  store i64 %conv31, i64* %arrayidx33, align 8
  br label %if.end
 if.end:                                           ; preds = %if.then24, %if.then
  br label %if.end34
 ; First half of the end-side test: knodesD[lastKnodeD[bid]].keys[thid] <= endD[bid].
 if.end34:                                         ; preds = %if.end, %land.lhs.true, %for.body
  %35 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
  %36 = load i64*, i64** %lastKnodeD.addr, align 8
  %37 = load i32, i32* %bid, align 4
  %idxprom35 = sext i32 %37 to i64
  %arrayidx36 = getelementptr inbounds i64, i64* %36, i64 %idxprom35
  %38 = load i64, i64* %arrayidx36, align 8
  %arrayidx37 = getelementptr inbounds %struct.knode, %struct.knode* %35, i64 %38
  %keys38 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx37, i32 0, i32 2
  %39 = load i32, i32* %thid, align 4
  %idxprom39 = sext i32 %39 to i64
  %arrayidx40 = getelementptr inbounds [257 x i32], [257 x i32]* %keys38, i64 0, i64 %idxprom39
  %40 = load i32, i32* %arrayidx40, align 4
  %41 = load i32*, i32** %endD.addr, align 8
  %42 = load i32, i32* %bid, align 4
  %idxprom41 = sext i32 %42 to i64
  %arrayidx42 = getelementptr inbounds i32, i32* %41, i64 %idxprom41
  %43 = load i32, i32* %arrayidx42, align 4
  %cmp43 = icmp sle i32 %40, %43
  br i1 %cmp43, label %land.lhs.true44, label %if.end75
 ; Second half of the end-side test: knodesD[lastKnodeD[bid]].keys[thid + 1] > endD[bid].
 land.lhs.true44:                                  ; preds = %if.end34
  %44 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
  %45 = load i64*, i64** %lastKnodeD.addr, align 8
  %46 = load i32, i32* %bid, align 4
  %idxprom45 = sext i32 %46 to i64
  %arrayidx46 = getelementptr inbounds i64, i64* %45, i64 %idxprom45
  %47 = load i64, i64* %arrayidx46, align 8
  %arrayidx47 = getelementptr inbounds %struct.knode, %struct.knode* %44, i64 %47
  %keys48 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx47, i32 0, i32 2
  %48 = load i32, i32* %thid, align 4
  %add49 = add nsw i32 %48, 1
  %idxprom50 = sext i32 %add49 to i64
  %arrayidx51 = getelementptr inbounds [257 x i32], [257 x i32]* %keys48, i64 0, i64 %idxprom50
  %49 = load i32, i32* %arrayidx51, align 4
  %50 = load i32*, i32** %endD.addr, align 8
  %51 = load i32, i32* %bid, align 4
  %idxprom52 = sext i32 %51 to i64
  %arrayidx53 = getelementptr inbounds i32, i32* %50, i64 %idxprom52
  %52 = load i32, i32* %arrayidx53, align 4
  %cmp54 = icmp sgt i32 %49, %52
  br i1 %cmp54, label %if.then55, label %if.end75
 ; Both halves held: check (i64)knodesD[lastKnodeD[bid]].indices[thid] < knodes_elem.
 if.then55:                                        ; preds = %land.lhs.true44
  %53 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
  %54 = load i64*, i64** %lastKnodeD.addr, align 8
  %55 = load i32, i32* %bid, align 4
  %idxprom56 = sext i32 %55 to i64
  %arrayidx57 = getelementptr inbounds i64, i64* %54, i64 %idxprom56
  %56 = load i64, i64* %arrayidx57, align 8
  %arrayidx58 = getelementptr inbounds %struct.knode, %struct.knode* %53, i64 %56
  %indices59 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx58, i32 0, i32 1
  %57 = load i32, i32* %thid, align 4
  %idxprom60 = sext i32 %57 to i64
  %arrayidx61 = getelementptr inbounds [257 x i32], [257 x i32]* %indices59, i64 0, i64 %idxprom60
  %58 = load i32, i32* %arrayidx61, align 4
  %conv62 = sext i32 %58 to i64
  %59 = load i64, i64* %knodes_elem.addr, align 8
  %cmp63 = icmp slt i64 %conv62, %59
  br i1 %cmp63, label %if.then64, label %if.end74
 ; offset_2D[bid] = (i64)knodesD[lastKnodeD[bid]].indices[thid].
 if.then64:                                        ; preds = %if.then55
  %60 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
  %61 = load i64*, i64** %lastKnodeD.addr, align 8
  %62 = load i32, i32* %bid, align 4
  %idxprom65 = sext i32 %62 to i64
  %arrayidx66 = getelementptr inbounds i64, i64* %61, i64 %idxprom65
  %63 = load i64, i64* %arrayidx66, align 8
  %arrayidx67 = getelementptr inbounds %struct.knode, %struct.knode* %60, i64 %63
  %indices68 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx67, i32 0, i32 1
  %64 = load i32, i32* %thid, align 4
  %idxprom69 = sext i32 %64 to i64
  %arrayidx70 = getelementptr inbounds [257 x i32], [257 x i32]* %indices68, i64 0, i64 %idxprom69
  %65 = load i32, i32* %arrayidx70, align 4
  %conv71 = sext i32 %65 to i64
  %66 = load i64*, i64** %offset_2D.addr, align 8
  %67 = load i32, i32* %bid, align 4
  %idxprom72 = sext i32 %67 to i64
  %arrayidx73 = getelementptr inbounds i64, i64* %66, i64 %idxprom72
  store i64 %conv71, i64* %arrayidx73, align 8
  br label %if.end74
 if.end74:                                         ; preds = %if.then64, %if.then55
  br label %if.end75
 ; Barrier (llvm.nvvm.barrier0) after the offsetD/offset_2D stores, then branch on thid == 0.
 if.end75:                                         ; preds = %if.end74, %land.lhs.true44, %if.end34
  call void @llvm.nvvm.barrier0()
  %68 = load i32, i32* %thid, align 4
  %cmp76 = icmp eq i32 %68, 0
  br i1 %cmp76, label %if.then77, label %if.end86
 ; Thread 0 only: currKnodeD[bid] = offsetD[bid]; lastKnodeD[bid] = offset_2D[bid].
 if.then77:                                        ; preds = %if.end75
  %69 = load i64*, i64** %offsetD.addr, align 8
  %70 = load i32, i32* %bid, align 4
  %idxprom78 = sext i32 %70 to i64
  %arrayidx79 = getelementptr inbounds i64, i64* %69, i64 %idxprom78
  %71 = load i64, i64* %arrayidx79, align 8
  %72 = load i64*, i64** %currKnodeD.addr, align 8
  %73 = load i32, i32* %bid, align 4
  %idxprom80 = sext i32 %73 to i64
  %arrayidx81 = getelementptr inbounds i64, i64* %72, i64 %idxprom80
  store i64 %71, i64* %arrayidx81, align 8
  %74 = load i64*, i64** %offset_2D.addr, align 8
  %75 = load i32, i32* %bid, align 4
  %idxprom82 = sext i32 %75 to i64
  %arrayidx83 = getelementptr inbounds i64, i64* %74, i64 %idxprom82
  %76 = load i64, i64* %arrayidx83, align 8
  %77 = load i64*, i64** %lastKnodeD.addr, align 8
  %78 = load i32, i32* %bid, align 4
  %idxprom84 = sext i32 %78 to i64
  %arrayidx85 = getelementptr inbounds i64, i64* %77, i64 %idxprom84
  store i64 %76, i64* %arrayidx85, align 8
  br label %if.end86
 ; Second barrier of the iteration, placed after thread 0's stores and before the next loop test.
 if.end86:                                         ; preds = %if.then77, %if.end75
  call void @llvm.nvvm.barrier0()
  br label %for.inc
 ; ++i
 for.inc:                                          ; preds = %if.end86
  %79 = load i32, i32* %i, align 4
  %inc = add nsw i32 %79, 1
  store i32 %inc, i32* %i, align 4
  br label %for.cond
 ; After the loop: test knodesD[currKnodeD[bid]].keys[thid] == startD[bid].
 for.end:                                          ; preds = %for.cond
  %80 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
  %81 = load i64*, i64** %currKnodeD.addr, align 8
  %82 = load i32, i32* %bid, align 4
  %idxprom87 = sext i32 %82 to i64
  %arrayidx88 = getelementptr inbounds i64, i64* %81, i64 %idxprom87
  %83 = load i64, i64* %arrayidx88, align 8
  %arrayidx89 = getelementptr inbounds %struct.knode, %struct.knode* %80, i64 %83
  %keys90 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx89, i32 0, i32 2
  %84 = load i32, i32* %thid, align 4
  %idxprom91 = sext i32 %84 to i64
  %arrayidx92 = getelementptr inbounds [257 x i32], [257 x i32]* %keys90, i64 0, i64 %idxprom91
  %85 = load i32, i32* %arrayidx92, align 4
  %86 = load i32*, i32** %startD.addr, align 8
  %87 = load i32, i32* %bid, align 4
  %idxprom93 = sext i32 %87 to i64
  %arrayidx94 = getelementptr inbounds i32, i32* %86, i64 %idxprom93
  %88 = load i32, i32* %arrayidx94, align 4
  %cmp95 = icmp eq i32 %85, %88
  br i1 %cmp95, label %if.then96, label %if.end105
 ; RecstartD[bid] = knodesD[currKnodeD[bid]].indices[thid].
 if.then96:                                        ; preds = %for.end
  %89 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
  %90 = load i64*, i64** %currKnodeD.addr, align 8
  %91 = load i32, i32* %bid, align 4
  %idxprom97 = sext i32 %91 to i64
  %arrayidx98 = getelementptr inbounds i64, i64* %90, i64 %idxprom97
  %92 = load i64, i64* %arrayidx98, align 8
  %arrayidx99 = getelementptr inbounds %struct.knode, %struct.knode* %89, i64 %92
  %indices100 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx99, i32 0, i32 1
  %93 = load i32, i32* %thid, align 4
  %idxprom101 = sext i32 %93 to i64
  %arrayidx102 = getelementptr inbounds [257 x i32], [257 x i32]* %indices100, i64 0, i64 %idxprom101
  %94 = load i32, i32* %arrayidx102, align 4
  %95 = load i32*, i32** %RecstartD.addr, align 8
  %96 = load i32, i32* %bid, align 4
  %idxprom103 = sext i32 %96 to i64
  %arrayidx104 = getelementptr inbounds i32, i32* %95, i64 %idxprom103
  store i32 %94, i32* %arrayidx104, align 4
  br label %if.end105
 ; Barrier between the RecstartD store above and its read in if.then115,
 ; then test knodesD[lastKnodeD[bid]].keys[thid] == endD[bid].
 if.end105:                                        ; preds = %if.then96, %for.end
  call void @llvm.nvvm.barrier0()
  %97 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
  %98 = load i64*, i64** %lastKnodeD.addr, align 8
  %99 = load i32, i32* %bid, align 4
  %idxprom106 = sext i32 %99 to i64
  %arrayidx107 = getelementptr inbounds i64, i64* %98, i64 %idxprom106
  %100 = load i64, i64* %arrayidx107, align 8
  %arrayidx108 = getelementptr inbounds %struct.knode, %struct.knode* %97, i64 %100
  %keys109 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx108, i32 0, i32 2
  %101 = load i32, i32* %thid, align 4
  %idxprom110 = sext i32 %101 to i64
  %arrayidx111 = getelementptr inbounds [257 x i32], [257 x i32]* %keys109, i64 0, i64 %idxprom110
  %102 = load i32, i32* %arrayidx111, align 4
  %103 = load i32*, i32** %endD.addr, align 8
  %104 = load i32, i32* %bid, align 4
  %idxprom112 = sext i32 %104 to i64
  %arrayidx113 = getelementptr inbounds i32, i32* %103, i64 %idxprom112
  %105 = load i32, i32* %arrayidx113, align 4
  %cmp114 = icmp eq i32 %102, %105
  br i1 %cmp114, label %if.then115, label %if.end127
 ; ReclenD[bid] = knodesD[lastKnodeD[bid]].indices[thid] - RecstartD[bid] + 1.
 if.then115:                                       ; preds = %if.end105
  %106 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
  %107 = load i64*, i64** %lastKnodeD.addr, align 8
  %108 = load i32, i32* %bid, align 4
  %idxprom116 = sext i32 %108 to i64
  %arrayidx117 = getelementptr inbounds i64, i64* %107, i64 %idxprom116
  %109 = load i64, i64* %arrayidx117, align 8
  %arrayidx118 = getelementptr inbounds %struct.knode, %struct.knode* %106, i64 %109
  %indices119 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx118, i32 0, i32 1
  %110 = load i32, i32* %thid, align 4
  %idxprom120 = sext i32 %110 to i64
  %arrayidx121 = getelementptr inbounds [257 x i32], [257 x i32]* %indices119, i64 0, i64 %idxprom120
  %111 = load i32, i32* %arrayidx121, align 4
  %112 = load i32*, i32** %RecstartD.addr, align 8
  %113 = load i32, i32* %bid, align 4
  %idxprom122 = sext i32 %113 to i64
  %arrayidx123 = getelementptr inbounds i32, i32* %112, i64 %idxprom122
  %114 = load i32, i32* %arrayidx123, align 4
  %sub = sub nsw i32 %111, %114
  %add124 = add nsw i32 %sub, 1
  %115 = load i32*, i32** %ReclenD.addr, align 8
  %116 = load i32, i32* %bid, align 4
  %idxprom125 = sext i32 %116 to i64
  %arrayidx126 = getelementptr inbounds i32, i32* %115, i64 %idxprom125
  store i32 %add124, i32* %arrayidx126, align 4
  br label %if.end127
 if.end127:                                        ; preds = %if.then115, %if.end105
  ret void
 }
 ; Helper (mangled __cuda_builtin_threadIdx_t::__fetch_builtin_x) that returns the value of the
 ; llvm.nvvm.read.ptx.sreg.tid.x intrinsic; @findRangeK calls it to initialise its %thid slot.
 ; Function Attrs: alwaysinline convergent nounwind
 define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
 entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  ret i32 %0
 }
 ; Helper (mangled __cuda_builtin_blockIdx_t::__fetch_builtin_x) that returns the value of the
 ; llvm.nvvm.read.ptx.sreg.ctaid.x intrinsic; @findRangeK calls it to initialise its %bid slot.
 ; Function Attrs: alwaysinline convergent nounwind
 define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
 entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
  ret i32 %0
 }
 ; NVVM intrinsics referenced by the definitions above.
 ; barrier0 is called four times in @findRangeK (twice per loop iteration, twice after the loop).
 ; Function Attrs: convergent nounwind
 declare void @llvm.nvvm.barrier0() #2
 ; Special-register read used by the threadIdx __fetch_builtin_x helper.
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
 ; Special-register read used by the blockIdx __fetch_builtin_x helper.
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #3
 attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #2 = { convergent nounwind }
 attributes #3 = { nounwind readnone }
 ; Module-level metadata. The only entry that names a function is !3, which tags @findRangeK
 ; as "kernel"; the remaining !nvvm.annotations operands (!4-!7) are "align" records whose
 ; first operand is null in this dump.
 !llvm.module.flags = !{!0, !1, !2}
 !nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
 !llvm.ident = !{!8}
 !nvvmir.version = !{!9}
 !0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
 !1 = !{i32 1, !"wchar_size", i32 4}
 !2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
 ; Kernel annotation: the function type here must match the signature of @findRangeK.
 !3 = !{void (i64, %struct.knode*, i64, i64*, i64*, i64*, i64*, i32*, i32*, i32*, i32*)* @findRangeK, !"kernel", i32 1}
 !4 = !{null, !"align", i32 8}
 !5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
 !6 = !{null, !"align", i32 16}
 !7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
 ; Producer string recorded by the compiler that emitted this module.
 !8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
 !9 = !{i32 1, i32 4}
--- a/examples/btree/kernel_gpu_cuda_wrapper_2-host-x86_64-unknown-linux-gnu.ll
+++ b/examples/btree/kernel_gpu_cuda_wrapper_2-host-x86_64-unknown-linux-gnu.ll
--- a/examples/btree/main.c
+++ b/examples/btree/main.c
--- a/examples/btree/run.sh
+++ b/examples/btree/run.sh
@ -0,0 +1,40 @@
 #!/bin/bash
 # Builds the b+tree example from its C sources and the .ll files in this directory,
 # links it against the x86 runtime, runs it, and checks output.txt for one expected line.
 # Any failing command aborts the script (set -e).
 set -e
 build_dir=../../build
 runtime_dir=$build_dir/runtime
 data_dir=../../rodinia-data/b+tree
 device_tag=cuda-nvptx64-nvidia-cuda-sm_61
 host_tag=host-x86_64-unknown-linux-gnu
 # The CUDA sources are not compiled by this script; these disabled commands are kept for reference.
 #clang -c -emit-llvm util/cuda/cuda.cu --cuda-gpu-arch=sm_61
 #clang -c -emit-llvm kernel/kernel_gpu_cuda_wrapper.cu --cuda-gpu-arch=sm_61
 #clang++ kernel/kernel_gpu_cuda_wrapper.cu  --cuda-path=/usr/local/cuda-10.1 --cuda-gpu-arch=sm_61 -L/usr/local/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v
 #clang++ kernel/kernel_gpu_cuda_wrapper_2.cu  --cuda-path=/usr/local/cuda-10.1 --cuda-gpu-arch=sm_61 -L/usr/local/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v
 # Step 1: plain C sources -> LLVM bitcode (<name>.bc in the current directory).
 for c_source in util/timer/timer.c util/num/num.c main.c; do
    clang -c -emit-llvm "$c_source"
 done
 # Step 2: assemble the textual IR of both wrappers, device modules first, then host modules.
 for tag in "$device_tag" "$host_tag"; do
    for wrapper in kernel_gpu_cuda_wrapper kernel_gpu_cuda_wrapper_2; do
       llvm-as "${wrapper}-${tag}.ll"
    done
 done
 # Step 3: run the translators over the device and host bitcode.
 "$build_dir/compilation/kernelTranslator" "kernel_gpu_cuda_wrapper-${device_tag}.bc" kernel1.bc
 "$build_dir/compilation/kernelTranslator" "kernel_gpu_cuda_wrapper_2-${device_tag}.bc" kernel2.bc
 "$build_dir/compilation/hostTranslator" "kernel_gpu_cuda_wrapper-${host_tag}.bc" host1.bc
 "$build_dir/compilation/hostTranslator" "kernel_gpu_cuda_wrapper_2-${host_tag}.bc" host2.bc
 # Step 4: bitcode -> position-independent object files.
 # NOTE(review): cuda.bc is consumed here, but no active command in this script produces it
 # (the cuda.cu compile above is commented out) - confirm where it is expected to come from.
 for module in main cuda num timer kernel1 kernel2 host1 host2; do
    llc --relocation-model=pic --filetype=obj  "${module}.bc"
 done
 # Step 5: link against the runtime and thread-pool libraries and make them loadable at run time.
 export LD_LIBRARY_PATH=$runtime_dir:$runtime_dir/threadPool:$LD_LIBRARY_PATH
 g++ -Wall -L"$runtime_dir"  -L"$runtime_dir/threadPool" -o b+tree.out \
    -fPIC -no-pie main.o host1.o host2.o kernel1.o kernel2.o cuda.o num.o timer.o \
    -lc -lx86Runtime -lthreadPool -lpthread
 # Step 6: run the benchmark on the Rodinia input and command files.
 ./b+tree.out file "$data_dir/mil.txt" \
    command "$data_dir/command.txt"
 # Step 7: the run passes when output.txt contains the expected line.
 if ! grep -q "0    840187    6001" output.txt; then
    echo "Error result"
    exit 1
 fi
 echo "Pass"
--- a/examples/btree/util/cuda/cuda.cu
+++ b/examples/btree/util/cuda/cuda.cu
@ -0,0 +1,75 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
 //===============================================================================================================================================================================================================200
 //	SET_DEVICE CODE
 //===============================================================================================================================================================================================================200
 //======================================================================================================================================================150
 //	INCLUDE/DEFINE
 //======================================================================================================================================================150
 #include "cuda.h"					// (in library path specified to compiler)
 //======================================================================================================================================================150
 //	FUNCTIONS
 //======================================================================================================================================================150
 //====================================================================================================100
 //	SET DEVICE
 //====================================================================================================100
 // Makes the device with the largest multiProcessorCount the current CUDA device.
 // Only acts when more than one device is reported; with zero or one device, or when the
 // device count could not be obtained, no device is selected and the function just returns.
 // Ties are resolved in favour of the lowest device index (strict '<' comparison below).
 void setdevice(void){
 	// variables
 	// Start at 0: the return code of cudaGetDeviceCount() is not inspected, so if the call
 	// fails without writing the count, the test below must still read a defined value.
 	int num_devices = 0;
 	int device;
 	// work
 	cudaGetDeviceCount(&num_devices);
 	if (num_devices > 1) {
 		// variables
 		int max_multiprocessors;
 		int max_device;
 		cudaDeviceProp properties;
 		// initialize variables
 		max_multiprocessors = 0;
 		max_device = 0;
 		for (device = 0; device < num_devices; device++) {
 			// Reset before every query so that a failed cudaGetDeviceProperties() cannot
 			// leave an uninitialised or stale count to be compared against the maximum.
 			properties.multiProcessorCount = 0;
 			cudaGetDeviceProperties(&properties, device);
 			if (max_multiprocessors < properties.multiProcessorCount) {
 				max_multiprocessors = properties.multiProcessorCount;
 				max_device = device;
 			}
 		}
 		cudaSetDevice(max_device);
 	}
 }
 //====================================================================================================100
 //	GET LAST ERROR
 //====================================================================================================100
 void checkCUDAError(const char *msg)
 {
 	cudaError_t err = cudaGetLastError();
 	if( cudaSuccess != err) {
 		// fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
 		printf("Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
 		fflush(NULL);
 		exit(EXIT_FAILURE);
 	}
 }
 //===============================================================================================================================================================================================================200
 //	END
 //===============================================================================================================================================================================================================200
 #ifdef __cplusplus
 }
 #endif
--- a/examples/btree/util/cuda/cuda.h
+++ b/examples/btree/util/cuda/cuda.h
@ -0,0 +1,37 @@
 // Include guard: makes repeated inclusion of this header a no-op.
 #ifndef BTREE_UTIL_CUDA_H
 #define BTREE_UTIL_CUDA_H
 //===============================================================================================================================================================================================================200
 //	SET_DEVICE HEADER
 //===============================================================================================================================================================================================================200
 //======================================================================================================================================================150
 //	INCLUDE/DEFINE
 //======================================================================================================================================================150
 // System header is included before the extern "C" block opened by this header so that
 // this file does not itself impose C linkage on the standard library's declarations.
 #include <stdio.h> // (in library path known to compiler)		needed by printf
 #ifdef __cplusplus
 extern "C" {
 #endif
 //======================================================================================================================================================150
 //	FUNCTION PROTOTYPES
 //======================================================================================================================================================150
 //====================================================================================================100
 //	SET DEVICE
 //====================================================================================================100
 // Selects the CUDA device to use; takes no arguments and returns nothing.
 void setdevice(void);
 //====================================================================================================100
 //	GET LAST ERROR
 //====================================================================================================100
 // Checks the CUDA runtime's last error; msg is a caller-supplied context string for the report.
 void checkCUDAError(const char *msg);
 //===============================================================================================================================================================================================================200
 //	END SET_DEVICE HEADER
 //===============================================================================================================================================================================================================200
 #ifdef __cplusplus
 }
 #endif
 #endif // BTREE_UTIL_CUDA_H
--- a/examples/btree/util/num/num.c
+++ b/examples/btree/util/num/num.c
@ -0,0 +1,55 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
 //===============================================================================================================================================================================================================200
 //	DESCRIPTION
 //===============================================================================================================================================================================================================200
 // Returns:	0 if string does not represent integer
 //			1 if string represents integer
 //===============================================================================================================================================================================================================200
 //	NUM CODE
 //===============================================================================================================================================================================================================200
 //======================================================================================================================================================150
 //	ISINTEGER FUNCTION
 //======================================================================================================================================================150
 int isInteger(char *str) {
  //====================================================================================================100
  //	make sure it's not empty
  //====================================================================================================100
  if (*str == '\0') {
    return 0;
  }
  //====================================================================================================100
  //	if any digit is not a number, return false
  //====================================================================================================100
  for (; *str != '\0'; str++) {
    if (*str < 48 ||
        *str >
            57) { // digit characters (need to include . if checking for float)
      return 0;
    }
  }
  //====================================================================================================100
  //	it got past all my checks so I think it's a number
  //====================================================================================================100
  return 1;
 }
 //===============================================================================================================================================================================================================200
 //	END NUM CODE
 //===============================================================================================================================================================================================================200
 #ifdef __cplusplus
 }
 #endif
--- a/examples/btree/util/num/num.h
+++ b/examples/btree/util/num/num.h
@ -0,0 +1,21 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
 //===============================================================================================================================================================================================================200
 //	FILE HEADER
 //===============================================================================================================================================================================================================200
 //======================================================================================================================================================150
 //	ISINTEGER FUNCTION PROTOTYPE
 //======================================================================================================================================================150
 // Returns 1 if the NUL-terminated string `str` is non-empty and made up only
 // of decimal digit characters, 0 otherwise.
 int isInteger(char *str);
 //===============================================================================================================================================================================================================200
 //	END FILE HEADER
 //===============================================================================================================================================================================================================200
 #ifdef __cplusplus
 }
 #endif
--- a/examples/btree/util/timer/timer.c
+++ b/examples/btree/util/timer/timer.c
@ -0,0 +1,36 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
 //===============================================================================================================================================================================================================200
 //	TIMER CODE
 //===============================================================================================================================================================================================================200
 //======================================================================================================================================================150
 //	INCLUDE/DEFINE
 //======================================================================================================================================================150
 #include <stdlib.h>
 #include <sys/time.h> // struct timeval, gettimeofday
 //======================================================================================================================================================150
 //	FUNCTIONS
 //======================================================================================================================================================150
 //====================================================================================================100
 //	DISPLAY TIME
 //====================================================================================================100
 // Returns the current system time in microseconds, built from the seconds and
 // microseconds fields filled in by gettimeofday().
 long long get_time() {
  struct timeval tv;
  gettimeofday(&tv, NULL);
  // widen to long long before scaling: tv_sec * 1000000 would otherwise be
  // evaluated in the width of time_t and overflow where that is 32 bits
  return ((long long)tv.tv_sec * 1000000LL) + (long long)tv.tv_usec;
 }
 //===============================================================================================================================================================================================================200
 //	END TIMER CODE
 //===============================================================================================================================================================================================================200
 #ifdef __cplusplus
 }
 #endif
--- a/examples/btree/util/timer/timer.h
+++ b/examples/btree/util/timer/timer.h
@ -0,0 +1,21 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
 //===============================================================================================================================================================================================================200
 //	TIMER HEADER
 //===============================================================================================================================================================================================================200
 //======================================================================================================================================================150
 //	FUNCTION PROTOTYPES
 //======================================================================================================================================================150
 // Returns the current system time in microseconds.
 long long get_time();
 //===============================================================================================================================================================================================================200
 //	END TIMER HEADER
 //===============================================================================================================================================================================================================200
 #ifdef __cplusplus
 }
 #endif
--- a/examples/cfd/euler3d.cu
+++ b/examples/cfd/euler3d.cu
@ -0,0 +1,662 @@
 #include <fstream>
 #include <helper_cuda.h>
 #include <helper_timer.h>
 #include <iostream>
 /*
 * Options
 *
 * Compile-time parameters of the solver.
 */
 // Constant used by compute_pressure and compute_speed_of_sound
 // (presumably the ratio of specific heats of the gas -- confirm).
 #define GAMMA 1.4f
 // Number of outer iterations executed by main.
 #define iterations 2
 // #ifndef block_length
 // 	#define block_length 192
 // #endif
 // Number of momentum/velocity components stored per element.
 #define NDIM 3
 // Number of neighbour slots stored per element.
 #define NNB 4
 // Number of flux/time-step stages run inside every outer iteration.
 #define RK 3 // 3rd order RK
 // Far-field speed as a multiple of the far-field speed of sound (see main).
 #define ff_mach 1.2f
 // Far-field angle of attack in degrees; main converts it to radians.
 #define deg_angle_of_attack 0.0f
 /*
 * not options
 */
 // Threads-per-block selection. For every BLOCK_SIZE_k the first macro that is
 // defined out of RD_WG_SIZE_k_0, RD_WG_SIZE_k and RD_WG_SIZE is used; when
 // none is defined the value falls back to 192.
 //   BLOCK_SIZE_0: granularity to which main pads the element count
 //   BLOCK_SIZE_1: initialize_variables launch
 //   BLOCK_SIZE_2: compute_step_factor launch
 //   BLOCK_SIZE_3: compute_flux launch
 //   BLOCK_SIZE_4: time_step launch
 #ifdef RD_WG_SIZE_0_0
 #define BLOCK_SIZE_0 RD_WG_SIZE_0_0
 #elif defined(RD_WG_SIZE_0)
 #define BLOCK_SIZE_0 RD_WG_SIZE_0
 #elif defined(RD_WG_SIZE)
 #define BLOCK_SIZE_0 RD_WG_SIZE
 #else
 #define BLOCK_SIZE_0 192
 #endif
 #ifdef RD_WG_SIZE_1_0
 #define BLOCK_SIZE_1 RD_WG_SIZE_1_0
 #elif defined(RD_WG_SIZE_1)
 #define BLOCK_SIZE_1 RD_WG_SIZE_1
 #elif defined(RD_WG_SIZE)
 #define BLOCK_SIZE_1 RD_WG_SIZE
 #else
 #define BLOCK_SIZE_1 192
 #endif
 #ifdef RD_WG_SIZE_2_0
 #define BLOCK_SIZE_2 RD_WG_SIZE_2_0
 // test the macro that is actually expanded below (this used to test
 // RD_WG_SIZE_1, which left BLOCK_SIZE_2 naming an undefined macro)
 #elif defined(RD_WG_SIZE_2)
 #define BLOCK_SIZE_2 RD_WG_SIZE_2
 #elif defined(RD_WG_SIZE)
 #define BLOCK_SIZE_2 RD_WG_SIZE
 #else
 #define BLOCK_SIZE_2 192
 #endif
 #ifdef RD_WG_SIZE_3_0
 #define BLOCK_SIZE_3 RD_WG_SIZE_3_0
 #elif defined(RD_WG_SIZE_3)
 #define BLOCK_SIZE_3 RD_WG_SIZE_3
 #elif defined(RD_WG_SIZE)
 #define BLOCK_SIZE_3 RD_WG_SIZE
 #else
 #define BLOCK_SIZE_3 192
 #endif
 #ifdef RD_WG_SIZE_4_0
 #define BLOCK_SIZE_4 RD_WG_SIZE_4_0
 #elif defined(RD_WG_SIZE_4)
 #define BLOCK_SIZE_4 RD_WG_SIZE_4
 #elif defined(RD_WG_SIZE)
 #define BLOCK_SIZE_4 RD_WG_SIZE
 #else
 #define BLOCK_SIZE_4 192
 #endif
 // #if block_length > 128
 // #warning "the kernels may fail to launch on some systems if the block length is too large"
 // #endif
 // Field indices of the per-element state. The arrays are stored field-major:
 // field f of element i is at index i + f * nelr.
 // Field 0: density.
 #define VAR_DENSITY 0
 // Fields 1 .. NDIM: the momentum components.
 #define VAR_MOMENTUM 1
 // Field following the momentum components: density energy.
 #define VAR_DENSITY_ENERGY (VAR_MOMENTUM + NDIM)
 // Total number of fields stored per element.
 #define NVAR (VAR_DENSITY_ENERGY + 1)
 /*
 * Generic functions
 */
 // Reserves room for N objects of type T with cudaMalloc and returns the
 // resulting pointer. The cudaMalloc status is passed to checkCudaErrors.
 template <typename T> T *alloc(int N) {
  void *raw;
  checkCudaErrors(cudaMalloc(&raw, sizeof(T) * N));
  return (T *)raw;
 }
 // Releases a buffer with cudaFree; the status goes through checkCudaErrors.
 template <typename T> void dealloc(T *array) {
  void *raw = (void *)array;
  checkCudaErrors(cudaFree(raw));
 }
 // Device-to-device transfer of N objects of type T from src to dst.
 template <typename T> void copy(T *dst, T *src, int N) {
  void *to = (void *)dst;
  void *from = (void *)src;
  checkCudaErrors(
      cudaMemcpy(to, from, N * sizeof(T), cudaMemcpyDeviceToDevice));
 }
 // Host-to-device transfer of N objects of type T from src (host) to dst
 // (device).
 template <typename T> void upload(T *dst, T *src, int N) {
  void *to = (void *)dst;
  void *from = (void *)src;
  checkCudaErrors(cudaMemcpy(to, from, N * sizeof(T), cudaMemcpyHostToDevice));
 }
 // Device-to-host transfer of N objects of type T from src (device) to dst
 // (host).
 template <typename T> void download(T *dst, T *src, int N) {
  void *to = (void *)dst;
  void *from = (void *)src;
  checkCudaErrors(cudaMemcpy(to, from, N * sizeof(T), cudaMemcpyDeviceToHost));
 }
 // Downloads the nelr * NVAR state values from the device array `variables`
 // and writes three text files into the current directory: "density",
 // "momentum" and "density_energy". Every file starts with a "nel nelr" line
 // and then holds one line per element for the first nel elements; the
 // padding entries between nel and nelr are not written.
 void dump(float *variables, int nel, int nelr) {
  float *host_copy = new float[nelr * NVAR];
  download(host_copy, variables, nelr * NVAR);
  // field f of element e sits at host_copy[e + f * nelr]
  const float *density = host_copy + VAR_DENSITY * nelr;
  const float *momentum = host_copy + VAR_MOMENTUM * nelr;
  const float *density_energy = host_copy + VAR_DENSITY_ENERGY * nelr;
  {
    std::ofstream out("density");
    out << nel << " " << nelr << std::endl;
    for (int e = 0; e < nel; ++e)
      out << density[e] << std::endl;
  }
  {
    std::ofstream out("momentum");
    out << nel << " " << nelr << std::endl;
    for (int e = 0; e < nel; ++e) {
      // all NDIM components of one element share a line
      for (int d = 0; d < NDIM; ++d)
        out << momentum[e + d * nelr] << " ";
      out << std::endl;
    }
  }
  {
    std::ofstream out("density_energy");
    out << nel << " " << nelr << std::endl;
    for (int e = 0; e < nel; ++e)
      out << density_energy[e] << std::endl;
  }
  delete[] host_copy;
 }
 /*
 * Element-based Cell-centered FVM solver functions
 */
 // Far-field state held in constant memory. main fills all of these once with
 // cudaMemcpyToSymbol before any kernel is launched.
 // ff_variable: the NVAR far-field fields; cuda_initialize_variables copies
 // them into every element and cuda_compute_flux reads the momentum entries
 // on far-field boundaries.
 __constant__ float ff_variable[NVAR];
 // Far-field flux contributions, read by cuda_compute_flux on far-field
 // boundaries. Each is a one-element array accessed as [0].
 __constant__ float3 ff_flux_contribution_momentum_x[1];
 __constant__ float3 ff_flux_contribution_momentum_y[1];
 __constant__ float3 ff_flux_contribution_momentum_z[1];
 __constant__ float3 ff_flux_contribution_density_energy[1];
 // Kernel: one thread per element. Copies the NVAR far-field values from the
 // constant array ff_variable into element i of `variables` (field j is at
 // variables[i + j * nelr]). Threads with i >= nelr do nothing.
 __global__ void cuda_initialize_variables(int nelr, float *variables) {
  const int i = (blockDim.x * blockIdx.x + threadIdx.x);
  // the launcher rounds the grid up, so the last block may overhang nelr
  if (i >= nelr)
    return;
  for (int j = 0; j < NVAR; j++)
    variables[i + j * nelr] = ff_variable[j];
 }
 // Launches cuda_initialize_variables over all nelr elements with
 // BLOCK_SIZE_1 threads per block and reports a launch error through
 // getLastCudaError.
 //   nelr:      number of elements (including padding) in `variables`
 //   variables: device array of nelr * NVAR floats to fill
 void initialize_variables(int nelr, float *variables) {
  // Round the block count up: nelr is padded to a multiple of BLOCK_SIZE_0
  // only, so a truncating division would skip the tail elements whenever
  // BLOCK_SIZE_1 does not divide nelr.
  dim3 Dg((nelr + BLOCK_SIZE_1 - 1) / BLOCK_SIZE_1), Db(BLOCK_SIZE_1);
  cuda_initialize_variables<<<Dg, Db>>>(nelr, variables);
  getLastCudaError("initialize_variables failed");
 }
 // Fills the four flux-contribution vectors of one state.
 //   fc_momentum_{x,y,z}: velocity component times momentum, with `pressure`
 //                        added on the diagonal; the off-diagonal entries are
 //                        mirrored (y.x = x.y, z.x = x.z, z.y = y.z).
 //   fc_density_energy:   velocity times (density_energy + pressure).
 // `density` is accepted but not used. Callable from host and device code.
 __device__ __host__ inline void compute_flux_contribution(
    float &density, float3 &momentum, float &density_energy, float &pressure,
    float3 &velocity, float3 &fc_momentum_x, float3 &fc_momentum_y,
    float3 &fc_momentum_z, float3 &fc_density_energy) {
  // off-diagonal terms, each computed once and stored in both mirrored slots
  const float xy = velocity.x * momentum.y;
  const float xz = velocity.x * momentum.z;
  const float yz = velocity.y * momentum.z;

  fc_momentum_x.x = velocity.x * momentum.x + pressure;
  fc_momentum_x.y = xy;
  fc_momentum_x.z = xz;

  fc_momentum_y.x = xy;
  fc_momentum_y.y = velocity.y * momentum.y + pressure;
  fc_momentum_y.z = yz;

  fc_momentum_z.x = xz;
  fc_momentum_z.y = yz;
  fc_momentum_z.z = velocity.z * momentum.z + pressure;

  const float energy_plus_pressure = density_energy + pressure;
  fc_density_energy.x = velocity.x * energy_plus_pressure;
  fc_density_energy.y = velocity.y * energy_plus_pressure;
  fc_density_energy.z = velocity.z * energy_plus_pressure;
 }
 // Writes momentum / density, component by component, into `velocity`.
 __device__ inline void compute_velocity(float &density, float3 &momentum,
                                         float3 &velocity) {
  const float rho = density;
  velocity.x = momentum.x / rho;
  velocity.y = momentum.y / rho;
  velocity.z = momentum.z / rho;
 }
 // Returns the squared length of `velocity` (x*x + y*y + z*z).
 __device__ inline float compute_speed_sqd(float3 &velocity) {
  const float3 &v = velocity;
  return v.x * v.x + v.y * v.y + v.z * v.z;
 }
 // Returns (GAMMA - 1) * (density_energy - 0.5 * density * speed_sqd).
 __device__ inline float compute_pressure(float &density, float &density_energy,
                                          float &speed_sqd) {
  // GAMMA and the literals are already float, so no conversions are needed
  return (GAMMA - 1.0f) * (density_energy - 0.5f * density * speed_sqd);
 }
 // Returns sqrt(GAMMA * pressure / density).
 __device__ inline float compute_speed_of_sound(float &density,
                                                float &pressure) {
  const float squared = GAMMA * pressure / density;
  return sqrtf(squared);
 }
 // Kernel: one thread per element. Reads the state of element i from
 // `variables`, derives velocity, pressure and speed of sound, and stores
 //   0.5 / (sqrt(areas[i]) * (|velocity| + speed_of_sound))
 // in step_factors[i]. Threads with i >= nelr do nothing.
 __global__ void cuda_compute_step_factor(int nelr, float *variables,
                                          float *areas, float *step_factors) {
  const int i = (blockDim.x * blockIdx.x + threadIdx.x);
  // the launcher rounds the grid up, so the last block may overhang nelr
  if (i >= nelr)
    return;
  float density = variables[i + VAR_DENSITY * nelr];
  float3 momentum;
  momentum.x = variables[i + (VAR_MOMENTUM + 0) * nelr];
  momentum.y = variables[i + (VAR_MOMENTUM + 1) * nelr];
  momentum.z = variables[i + (VAR_MOMENTUM + 2) * nelr];
  float density_energy = variables[i + VAR_DENSITY_ENERGY * nelr];
  float3 velocity;
  compute_velocity(density, momentum, velocity);
  float speed_sqd = compute_speed_sqd(velocity);
  float pressure = compute_pressure(density, density_energy, speed_sqd);
  float speed_of_sound = compute_speed_of_sound(density, pressure);
  // dt = float(0.5f) * sqrtf(areas[i]) /  (||v|| + c).... but when we do time
  // stepping, this later would need to be divided by the area, so we just do it
  // all at once
  step_factors[i] =
      float(0.5f) / (sqrtf(areas[i]) * (sqrtf(speed_sqd) + speed_of_sound));
 }
 // Launches cuda_compute_step_factor over all nelr elements with
 // BLOCK_SIZE_2 threads per block and reports a launch error through
 // getLastCudaError.
 //   variables:    device array of nelr * NVAR floats (input)
 //   areas:        device array of nelr floats (input)
 //   step_factors: device array of nelr floats (output)
 void compute_step_factor(int nelr, float *variables, float *areas,
                          float *step_factors) {
  // Round the block count up: nelr is padded to a multiple of BLOCK_SIZE_0
  // only, so a truncating division would skip the tail elements whenever
  // BLOCK_SIZE_2 does not divide nelr.
  dim3 Dg((nelr + BLOCK_SIZE_2 - 1) / BLOCK_SIZE_2), Db(BLOCK_SIZE_2);
  cuda_compute_step_factor<<<Dg, Db>>>(nelr, variables, areas, step_factors);
  getLastCudaError("compute_step_factor failed");
 }
 /*
 *
 *
 */
 // Kernel: one thread per element. Accumulates the flux of element i over its
 // NNB neighbour slots and stores the NVAR results in `fluxes`.
 //   elements_surrounding_elements: neighbour index of slot j of element i at
 //       [i + j * nelr]; >= 0 is a real neighbour, -1 a wing boundary and
 //       -2 a far-field boundary (any other value contributes nothing)
 //   normals:   component k of the normal of slot j at [i + (j + k*NNB) * nelr]
 //   variables: current state, field f of element e at [e + f * nelr]
 //   fluxes:    output, same layout as `variables`
 // Threads with i >= nelr do nothing.
 __global__ void cuda_compute_flux(int nelr, int *elements_surrounding_elements,
                                   float *normals, float *variables,
                                   float *fluxes) {
  const float smoothing_coefficient = float(0.2f);
  const int i = (blockDim.x * blockIdx.x + threadIdx.x);
  // the launcher rounds the grid up, so the last block may overhang nelr
  if (i >= nelr)
    return;
  int j, nb;
  float3 normal;
  float normal_len;
  float factor;
  // state of this element
  float density_i = variables[i + VAR_DENSITY * nelr];
  float3 momentum_i;
  momentum_i.x = variables[i + (VAR_MOMENTUM + 0) * nelr];
  momentum_i.y = variables[i + (VAR_MOMENTUM + 1) * nelr];
  momentum_i.z = variables[i + (VAR_MOMENTUM + 2) * nelr];
  float density_energy_i = variables[i + VAR_DENSITY_ENERGY * nelr];
  float3 velocity_i;
  compute_velocity(density_i, momentum_i, velocity_i);
  float speed_sqd_i = compute_speed_sqd(velocity_i);
  float speed_i = sqrtf(speed_sqd_i);
  float pressure_i = compute_pressure(density_i, density_energy_i, speed_sqd_i);
  float speed_of_sound_i = compute_speed_of_sound(density_i, pressure_i);
  float3 flux_contribution_i_momentum_x, flux_contribution_i_momentum_y,
      flux_contribution_i_momentum_z;
  float3 flux_contribution_i_density_energy;
  compute_flux_contribution(
      density_i, momentum_i, density_energy_i, pressure_i, velocity_i,
      flux_contribution_i_momentum_x, flux_contribution_i_momentum_y,
      flux_contribution_i_momentum_z, flux_contribution_i_density_energy);
  // accumulators for the flux of this element
  float flux_i_density = float(0.0f);
  float3 flux_i_momentum;
  flux_i_momentum.x = float(0.0f);
  flux_i_momentum.y = float(0.0f);
  flux_i_momentum.z = float(0.0f);
  float flux_i_density_energy = float(0.0f);
  // scratch state of the neighbour currently being visited
  float3 velocity_nb;
  float density_nb, density_energy_nb;
  float3 momentum_nb;
  float3 flux_contribution_nb_momentum_x, flux_contribution_nb_momentum_y,
      flux_contribution_nb_momentum_z;
  float3 flux_contribution_nb_density_energy;
  float speed_sqd_nb, speed_of_sound_nb, pressure_nb;
 #pragma unroll
  for (j = 0; j < NNB; j++) {
    nb = elements_surrounding_elements[i + j * nelr];
    normal.x = normals[i + (j + 0 * NNB) * nelr];
    normal.y = normals[i + (j + 1 * NNB) * nelr];
    normal.z = normals[i + (j + 2 * NNB) * nelr];
    normal_len =
        sqrtf(normal.x * normal.x + normal.y * normal.y + normal.z * normal.z);
    if (nb >= 0) // a legitimate neighbor
    {
      density_nb = variables[nb + VAR_DENSITY * nelr];
      momentum_nb.x = variables[nb + (VAR_MOMENTUM + 0) * nelr];
      momentum_nb.y = variables[nb + (VAR_MOMENTUM + 1) * nelr];
      momentum_nb.z = variables[nb + (VAR_MOMENTUM + 2) * nelr];
      density_energy_nb = variables[nb + VAR_DENSITY_ENERGY * nelr];
      compute_velocity(density_nb, momentum_nb, velocity_nb);
      speed_sqd_nb = compute_speed_sqd(velocity_nb);
      pressure_nb =
          compute_pressure(density_nb, density_energy_nb, speed_sqd_nb);
      speed_of_sound_nb = compute_speed_of_sound(density_nb, pressure_nb);
      compute_flux_contribution(
          density_nb, momentum_nb, density_energy_nb, pressure_nb, velocity_nb,
          flux_contribution_nb_momentum_x, flux_contribution_nb_momentum_y,
          flux_contribution_nb_momentum_z, flux_contribution_nb_density_energy);
      // artificial viscosity
      factor = -normal_len * smoothing_coefficient * float(0.5f) *
               (speed_i + sqrtf(speed_sqd_nb) + speed_of_sound_i +
                speed_of_sound_nb);
      flux_i_density += factor * (density_i - density_nb);
      flux_i_density_energy += factor * (density_energy_i - density_energy_nb);
      flux_i_momentum.x += factor * (momentum_i.x - momentum_nb.x);
      flux_i_momentum.y += factor * (momentum_i.y - momentum_nb.y);
      flux_i_momentum.z += factor * (momentum_i.z - momentum_nb.z);
      // accumulate cell-centered fluxes
      factor = float(0.5f) * normal.x;
      flux_i_density += factor * (momentum_nb.x + momentum_i.x);
      flux_i_density_energy += factor * (flux_contribution_nb_density_energy.x +
                                         flux_contribution_i_density_energy.x);
      flux_i_momentum.x += factor * (flux_contribution_nb_momentum_x.x +
                                     flux_contribution_i_momentum_x.x);
      flux_i_momentum.y += factor * (flux_contribution_nb_momentum_y.x +
                                     flux_contribution_i_momentum_y.x);
      flux_i_momentum.z += factor * (flux_contribution_nb_momentum_z.x +
                                     flux_contribution_i_momentum_z.x);
      factor = float(0.5f) * normal.y;
      flux_i_density += factor * (momentum_nb.y + momentum_i.y);
      flux_i_density_energy += factor * (flux_contribution_nb_density_energy.y +
                                         flux_contribution_i_density_energy.y);
      flux_i_momentum.x += factor * (flux_contribution_nb_momentum_x.y +
                                     flux_contribution_i_momentum_x.y);
      flux_i_momentum.y += factor * (flux_contribution_nb_momentum_y.y +
                                     flux_contribution_i_momentum_y.y);
      flux_i_momentum.z += factor * (flux_contribution_nb_momentum_z.y +
                                     flux_contribution_i_momentum_z.y);
      factor = float(0.5f) * normal.z;
      flux_i_density += factor * (momentum_nb.z + momentum_i.z);
      flux_i_density_energy += factor * (flux_contribution_nb_density_energy.z +
                                         flux_contribution_i_density_energy.z);
      flux_i_momentum.x += factor * (flux_contribution_nb_momentum_x.z +
                                     flux_contribution_i_momentum_x.z);
      flux_i_momentum.y += factor * (flux_contribution_nb_momentum_y.z +
                                     flux_contribution_i_momentum_y.z);
      flux_i_momentum.z += factor * (flux_contribution_nb_momentum_z.z +
                                     flux_contribution_i_momentum_z.z);
    } else if (nb == -1) // a wing boundary
    {
      // only the pressure of this element acts on the momentum flux
      flux_i_momentum.x += normal.x * pressure_i;
      flux_i_momentum.y += normal.y * pressure_i;
      flux_i_momentum.z += normal.z * pressure_i;
    } else if (nb == -2) // a far field boundary
    {
      // same averaging as for a real neighbour, with the constant far-field
      // state standing in for the neighbour (no viscosity term)
      factor = float(0.5f) * normal.x;
      flux_i_density += factor * (ff_variable[VAR_MOMENTUM + 0] + momentum_i.x);
      flux_i_density_energy +=
          factor * (ff_flux_contribution_density_energy[0].x +
                    flux_contribution_i_density_energy.x);
      flux_i_momentum.x += factor * (ff_flux_contribution_momentum_x[0].x +
                                     flux_contribution_i_momentum_x.x);
      flux_i_momentum.y += factor * (ff_flux_contribution_momentum_y[0].x +
                                     flux_contribution_i_momentum_y.x);
      flux_i_momentum.z += factor * (ff_flux_contribution_momentum_z[0].x +
                                     flux_contribution_i_momentum_z.x);
      factor = float(0.5f) * normal.y;
      flux_i_density += factor * (ff_variable[VAR_MOMENTUM + 1] + momentum_i.y);
      flux_i_density_energy +=
          factor * (ff_flux_contribution_density_energy[0].y +
                    flux_contribution_i_density_energy.y);
      flux_i_momentum.x += factor * (ff_flux_contribution_momentum_x[0].y +
                                     flux_contribution_i_momentum_x.y);
      flux_i_momentum.y += factor * (ff_flux_contribution_momentum_y[0].y +
                                     flux_contribution_i_momentum_y.y);
      flux_i_momentum.z += factor * (ff_flux_contribution_momentum_z[0].y +
                                     flux_contribution_i_momentum_z.y);
      factor = float(0.5f) * normal.z;
      flux_i_density += factor * (ff_variable[VAR_MOMENTUM + 2] + momentum_i.z);
      flux_i_density_energy +=
          factor * (ff_flux_contribution_density_energy[0].z +
                    flux_contribution_i_density_energy.z);
      flux_i_momentum.x += factor * (ff_flux_contribution_momentum_x[0].z +
                                     flux_contribution_i_momentum_x.z);
      flux_i_momentum.y += factor * (ff_flux_contribution_momentum_y[0].z +
                                     flux_contribution_i_momentum_y.z);
      flux_i_momentum.z += factor * (ff_flux_contribution_momentum_z[0].z +
                                     flux_contribution_i_momentum_z.z);
    }
  }
  fluxes[i + VAR_DENSITY * nelr] = flux_i_density;
  fluxes[i + (VAR_MOMENTUM + 0) * nelr] = flux_i_momentum.x;
  fluxes[i + (VAR_MOMENTUM + 1) * nelr] = flux_i_momentum.y;
  fluxes[i + (VAR_MOMENTUM + 2) * nelr] = flux_i_momentum.z;
  fluxes[i + VAR_DENSITY_ENERGY * nelr] = flux_i_density_energy;
 }
 // Launches cuda_compute_flux over all nelr elements with BLOCK_SIZE_3
 // threads per block and reports a launch error through getLastCudaError.
 // All pointer arguments are device arrays; `fluxes` is the output.
 void compute_flux(int nelr, int *elements_surrounding_elements, float *normals,
                   float *variables, float *fluxes) {
  // Round the block count up: nelr is padded to a multiple of BLOCK_SIZE_0
  // only, so a truncating division would skip the tail elements whenever
  // BLOCK_SIZE_3 does not divide nelr.
  dim3 Dg((nelr + BLOCK_SIZE_3 - 1) / BLOCK_SIZE_3), Db(BLOCK_SIZE_3);
  cuda_compute_flux<<<Dg, Db>>>(nelr, elements_surrounding_elements, normals,
                                variables, fluxes);
  getLastCudaError("compute_flux failed");
 }
 // Kernel: one thread per element. For every field of element i computes
 //   variables = old_variables + factor * fluxes
 // with factor = step_factors[i] / (RK + 1 - j), where j is the index of the
 // current stage. Threads with i >= nelr do nothing.
 __global__ void cuda_time_step(int j, int nelr, float *old_variables,
                                float *variables, float *step_factors,
                                float *fluxes) {
  const int i = (blockDim.x * blockIdx.x + threadIdx.x);
  // the launcher rounds the grid up, so the last block may overhang nelr
  if (i >= nelr)
    return;
  float factor = step_factors[i] / float(RK + 1 - j);
  variables[i + VAR_DENSITY * nelr] = old_variables[i + VAR_DENSITY * nelr] +
                                      factor * fluxes[i + VAR_DENSITY * nelr];
  variables[i + VAR_DENSITY_ENERGY * nelr] =
      old_variables[i + VAR_DENSITY_ENERGY * nelr] +
      factor * fluxes[i + VAR_DENSITY_ENERGY * nelr];
  variables[i + (VAR_MOMENTUM + 0) * nelr] =
      old_variables[i + (VAR_MOMENTUM + 0) * nelr] +
      factor * fluxes[i + (VAR_MOMENTUM + 0) * nelr];
  variables[i + (VAR_MOMENTUM + 1) * nelr] =
      old_variables[i + (VAR_MOMENTUM + 1) * nelr] +
      factor * fluxes[i + (VAR_MOMENTUM + 1) * nelr];
  variables[i + (VAR_MOMENTUM + 2) * nelr] =
      old_variables[i + (VAR_MOMENTUM + 2) * nelr] +
      factor * fluxes[i + (VAR_MOMENTUM + 2) * nelr];
 }
 // Launches cuda_time_step for stage j over all nelr elements with
 // BLOCK_SIZE_4 threads per block and reports a launch error through
 // getLastCudaError. All pointer arguments are device arrays; `variables` is
 // the output.
 void time_step(int j, int nelr, float *old_variables, float *variables,
                float *step_factors, float *fluxes) {
  // Round the block count up: nelr is padded to a multiple of BLOCK_SIZE_0
  // only, so a truncating division would skip the tail elements whenever
  // BLOCK_SIZE_4 does not divide nelr.
  dim3 Dg((nelr + BLOCK_SIZE_4 - 1) / BLOCK_SIZE_4), Db(BLOCK_SIZE_4);
  cuda_time_step<<<Dg, Db>>>(j, nelr, old_variables, variables, step_factors,
                             fluxes);
  getLastCudaError("update failed");
 }
 /*
 * Main function
 */
 int main(int argc, char **argv) {
  printf("WG size of kernel:initialize = %d, WG size of "
         "kernel:compute_step_factor = %d, WG size of kernel:compute_flux = "
         "%d, WG size of kernel:time_step = %d\n",
         BLOCK_SIZE_1, BLOCK_SIZE_2, BLOCK_SIZE_3, BLOCK_SIZE_4);
  if (argc < 2) {
    std::cout << "specify data file name" << std::endl;
    return 0;
  }
  const char *data_file_name = argv[1];
  cudaDeviceProp prop;
  int dev;
  checkCudaErrors(cudaSetDevice(0));
  // set far field conditions and load them into constant memory on the gpu
  {
    float h_ff_variable[NVAR];
    const float angle_of_attack =
        float(3.1415926535897931 / 180.0f) * float(deg_angle_of_attack);
    h_ff_variable[VAR_DENSITY] = float(1.4);
    float ff_pressure = float(1.0f);
    float ff_speed_of_sound =
        sqrt(GAMMA * ff_pressure / h_ff_variable[VAR_DENSITY]);
    float ff_speed = float(ff_mach) * ff_speed_of_sound;
    float3 ff_velocity;
    ff_velocity.x = ff_speed * float(cos((float)angle_of_attack));
    ff_velocity.y = ff_speed * float(sin((float)angle_of_attack));
    ff_velocity.z = 0.0f;
    h_ff_variable[VAR_MOMENTUM + 0] =
        h_ff_variable[VAR_DENSITY] * ff_velocity.x;
    h_ff_variable[VAR_MOMENTUM + 1] =
        h_ff_variable[VAR_DENSITY] * ff_velocity.y;
    h_ff_variable[VAR_MOMENTUM + 2] =
        h_ff_variable[VAR_DENSITY] * ff_velocity.z;
    h_ff_variable[VAR_DENSITY_ENERGY] =
        h_ff_variable[VAR_DENSITY] * (float(0.5f) * (ff_speed * ff_speed)) +
        (ff_pressure / float(GAMMA - 1.0f));
    float3 h_ff_momentum;
    h_ff_momentum.x = *(h_ff_variable + VAR_MOMENTUM + 0);
    h_ff_momentum.y = *(h_ff_variable + VAR_MOMENTUM + 1);
    h_ff_momentum.z = *(h_ff_variable + VAR_MOMENTUM + 2);
    float3 h_ff_flux_contribution_momentum_x;
    float3 h_ff_flux_contribution_momentum_y;
    float3 h_ff_flux_contribution_momentum_z;
    float3 h_ff_flux_contribution_density_energy;
    compute_flux_contribution(h_ff_variable[VAR_DENSITY], h_ff_momentum,
                              h_ff_variable[VAR_DENSITY_ENERGY], ff_pressure,
                              ff_velocity, h_ff_flux_contribution_momentum_x,
                              h_ff_flux_contribution_momentum_y,
                              h_ff_flux_contribution_momentum_z,
                              h_ff_flux_contribution_density_energy);
    // copy far field conditions to the gpu
    checkCudaErrors(
        cudaMemcpyToSymbol(ff_variable, h_ff_variable, NVAR * sizeof(float)));
    checkCudaErrors(cudaMemcpyToSymbol(ff_flux_contribution_momentum_x,
                                       &h_ff_flux_contribution_momentum_x,
                                       sizeof(float3)));
    checkCudaErrors(cudaMemcpyToSymbol(ff_flux_contribution_momentum_y,
                                       &h_ff_flux_contribution_momentum_y,
                                       sizeof(float3)));
    checkCudaErrors(cudaMemcpyToSymbol(ff_flux_contribution_momentum_z,
                                       &h_ff_flux_contribution_momentum_z,
                                       sizeof(float3)));
    checkCudaErrors(cudaMemcpyToSymbol(ff_flux_contribution_density_energy,
                                       &h_ff_flux_contribution_density_energy,
                                       sizeof(float3)));
  }
  int nel;
  int nelr;
  // read in domain geometry
  float *areas;
  int *elements_surrounding_elements;
  float *normals;
  {
    std::ifstream file(data_file_name);
    file >> nel;
    nelr =
        BLOCK_SIZE_0 * ((nel / BLOCK_SIZE_0) + std::min(1, nel % BLOCK_SIZE_0));
    float *h_areas = new float[nelr];
    int *h_elements_surrounding_elements = new int[nelr * NNB];
    float *h_normals = new float[nelr * NDIM * NNB];
    // read in data
    for (int i = 0; i < nel; i++) {
      file >> h_areas[i];
      for (int j = 0; j < NNB; j++) {
        file >> h_elements_surrounding_elements[i + j * nelr];
        if (h_elements_surrounding_elements[i + j * nelr] < 0)
          h_elements_surrounding_elements[i + j * nelr] = -1;
        h_elements_surrounding_elements[i + j * nelr]--; // it's coming in with
                                                         // Fortran numbering
        for (int k = 0; k < NDIM; k++) {
          file >> h_normals[i + (j + k * NNB) * nelr];
          h_normals[i + (j + k * NNB) * nelr] =
              -h_normals[i + (j + k * NNB) * nelr];
        }
      }
    }
    // fill in remaining data
    int last = nel - 1;
    for (int i = nel; i < nelr; i++) {
      h_areas[i] = h_areas[last];
      for (int j = 0; j < NNB; j++) {
        // duplicate the last element
        h_elements_surrounding_elements[i + j * nelr] =
            h_elements_surrounding_elements[last + j * nelr];
        for (int k = 0; k < NDIM; k++)
          h_normals[last + (j + k * NNB) * nelr] =
              h_normals[last + (j + k * NNB) * nelr];
      }
    }
    areas = alloc<float>(nelr);
    upload<float>(areas, h_areas, nelr);
    elements_surrounding_elements = alloc<int>(nelr * NNB);
    upload<int>(elements_surrounding_elements, h_elements_surrounding_elements,
                nelr * NNB);
    normals = alloc<float>(nelr * NDIM * NNB);
    upload<float>(normals, h_normals, nelr * NDIM * NNB);
    delete[] h_areas;
    delete[] h_elements_surrounding_elements;
    delete[] h_normals;
  }
  // Create arrays and set initial conditions
  float *variables = alloc<float>(nelr * NVAR);
  initialize_variables(nelr, variables);
  float *old_variables = alloc<float>(nelr * NVAR);
  float *fluxes = alloc<float>(nelr * NVAR);
  float *step_factors = alloc<float>(nelr);
  // make sure all memory is floatly allocated before we start timing
  initialize_variables(nelr, old_variables);
  initialize_variables(nelr, fluxes);
  cudaMemset((void *)step_factors, 0, sizeof(float) * nelr);
  // make sure CUDA isn't still doing something before we start timing
  cudaThreadSynchronize();
  // these need to be computed the first time in order to compute time step
  std::cout << "Starting..." << std::endl;
  StopWatchInterface *timer = 0;
  //	unsigned int timer = 0;
  // CUT_SAFE_CALL( cutCreateTimer( &timer));
  // CUT_SAFE_CALL( cutStartTimer( timer));
  sdkCreateTimer(&timer);
  sdkStartTimer(&timer);
  // Begin iterations
  for (int i = 0; i < iterations; i++) {
    copy<float>(old_variables, variables, nelr * NVAR);
    // for the first iteration we compute the time step
    compute_step_factor(nelr, variables, areas, step_factors);
    getLastCudaError("compute_step_factor failed");
    for (int j = 0; j < RK; j++) {
      compute_flux(nelr, elements_surrounding_elements, normals, variables,
                   fluxes);
      getLastCudaError("compute_flux failed");
      time_step(j, nelr, old_variables, variables, step_factors, fluxes);
      getLastCudaError("time_step failed");
    }
  }
  cudaThreadSynchronize();
  //	CUT_SAFE_CALL( cutStopTimer(timer) );
  sdkStopTimer(&timer);
  std::cout << (sdkGetAverageTimerValue(&timer) / 1000.0) / iterations
            << " seconds per iteration" << std::endl;
  std::cout << "Saving solution..." << std::endl;
  dump(variables, nel, nelr);
  std::cout << "Saved solution..." << std::endl;
  std::cout << "Cleaning up..." << std::endl;
  dealloc<float>(areas);
  dealloc<int>(elements_surrounding_elements);
  dealloc<float>(normals);
  dealloc<float>(variables);
  dealloc<float>(old_variables);
  dealloc<float>(fluxes);
  dealloc<float>(step_factors);
  std::cout << "Done..." << std::endl;
  return 0;
 }
--- a/examples/cfd/run.sh
+++ b/examples/cfd/run.sh
@ -0,0 +1,15 @@
 #!/bin/bash
 # Build and run the CFD (euler3d) example:
 #   1. clang++ compiles euler3d.cu and keeps the device/host bitcode (-save-temps)
 #   2. kernelTranslator / hostTranslator rewrite the two bitcode files
 #   3. llc lowers both to position-independent object files
 #   4. g++ links them against the x86 runtime and thread pool, then runs the binary
 #
 # The build tree and CUDA install locations can be overridden from the
 # environment instead of being tied to one developer's home directory.
 set -e
 COX_BUILD="${COX_BUILD:-../../build}"
 CUDA_PATH="${CUDA_PATH:-/usr/local/cuda-10.1}"
 clang++ euler3d.cu -I"${CUDA_PATH}/samples/common/inc" --cuda-path="${CUDA_PATH}" --cuda-gpu-arch=sm_50 -L"${CUDA_PATH}/lib64" -lcudart_static -ldl -lrt -pthread -save-temps -v
 "${COX_BUILD}/compilation/kernelTranslator" euler3d-cuda-nvptx64-nvidia-cuda-sm_50.bc kernel.bc
 "${COX_BUILD}/compilation/hostTranslator" euler3d-host-x86_64-unknown-linux-gnu.bc host.bc
 llc --relocation-model=pic --filetype=obj kernel.bc
 llc --relocation-model=pic --filetype=obj host.bc
 g++ -Wall -L"${COX_BUILD}/runtime" -L"${COX_BUILD}/runtime/threadPool" -o a.out -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread
 # Let the dynamic loader find the runtime libraries that were just linked.
 export LD_LIBRARY_PATH="${COX_BUILD}/runtime:${COX_BUILD}/runtime/threadPool:${LD_LIBRARY_PATH}"
 ./a.out ../rodinia-data/cfd/fvcorr.domn.097K
--- a/examples/gauss/gaussian-cuda-nvptx64-nvidia-cuda-sm_61.ll
+++ b/examples/gauss/gaussian-cuda-nvptx64-nvidia-cuda-sm_61.ll
@ -0,0 +1,396 @@
 ; ModuleID = 'gaussian-cuda-nvptx64-nvidia-cuda-sm_61.bc'
 source_filename = "gaussian.cu"
 target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"
 %struct.__cuda_builtin_threadIdx_t = type { i8 }
 %struct.__cuda_builtin_blockIdx_t = type { i8 }
 %struct.__cuda_builtin_blockDim_t = type { i8 }
 %struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
 $_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
 $_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
 $_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv = comdat any
 $_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv = comdat any
 $_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv = comdat any
 $_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv = comdat any
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
@blockDim = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockDim_t, align 1
 ; Function Attrs: convergent noinline nounwind optnone
 ; Weak definition of cudaMalloc inside the device module. It only spills its
 ; two arguments (%p, %s) to stack slots and unconditionally returns 999.
 define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
 entry:
  %p.addr = alloca i8**, align 8
  %s.addr = alloca i64, align 8
  store i8** %p, i8*** %p.addr, align 8
  store i64 %s, i64* %s.addr, align 8
  ret i32 999
 }
 ; Function Attrs: convergent noinline nounwind optnone
 ; Weak definition of cudaFuncGetAttributes inside the device module. The
 ; arguments are stored to stack slots and never read; the result is always 999.
 define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
 entry:
  %p.addr = alloca %struct.cudaFuncAttributes*, align 8
  %c.addr = alloca i8*, align 8
  store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
  store i8* %c, i8** %c.addr, align 8
  ret i32 999
 }
 ; Function Attrs: convergent noinline nounwind optnone
 ; Weak definition of cudaDeviceGetAttribute inside the device module. It does
 ; not write through %value; it spills the arguments and always returns 999.
 define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
 entry:
  %value.addr = alloca i32*, align 8
  %attr.addr = alloca i32, align 4
  %device.addr = alloca i32, align 4
  store i32* %value, i32** %value.addr, align 8
  store i32 %attr, i32* %attr.addr, align 4
  store i32 %device, i32* %device.addr, align 4
  ret i32 999
 }
 ; Function Attrs: convergent noinline nounwind optnone
 ; Weak definition of cudaGetDevice inside the device module. It does not write
 ; through %device; it spills the pointer and always returns 999.
 define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
 entry:
  %device.addr = alloca i32*, align 8
  store i32* %device, i32** %device.addr, align 8
  ret i32 999
 }
 ; Function Attrs: convergent noinline nounwind optnone
 ; Weak definition of cudaOccupancyMaxActiveBlocksPerMultiprocessor inside the
 ; device module. All four arguments are spilled to stack slots and ignored;
 ; %numBlocks is never written and the result is always 999.
 define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
 entry:
  %numBlocks.addr = alloca i32*, align 8
  %func.addr = alloca i8*, align 8
  %blockSize.addr = alloca i32, align 4
  %dynamicSmemSize.addr = alloca i64, align 8
  store i32* %numBlocks, i32** %numBlocks.addr, align 8
  store i8* %func, i8** %func.addr, align 8
  store i32 %blockSize, i32* %blockSize.addr, align 4
  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
  ret i32 999
 }
 ; Function Attrs: convergent noinline nounwind optnone
 ; Weak definition of cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
 ; inside the device module. All five arguments are spilled to stack slots and
 ; ignored; %numBlocks is never written and the result is always 999.
 define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
 entry:
  %numBlocks.addr = alloca i32*, align 8
  %func.addr = alloca i8*, align 8
  %blockSize.addr = alloca i32, align 4
  %dynamicSmemSize.addr = alloca i64, align 8
  %flags.addr = alloca i32, align 4
  store i32* %numBlocks, i32** %numBlocks.addr, align 8
  store i8* %func, i8** %func.addr, align 8
  store i32 %blockSize, i32* %blockSize.addr, align 4
  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
  store i32 %flags, i32* %flags.addr, align 4
  ret i32 999
 }
 ; Function Attrs: convergent noinline nounwind optnone
 ; Kernel Fan1(float* m_cuda, float* a_cuda, int Size, int t), unoptimized.
 ; With gid = tid.x + ctaid.x * ntid.x, threads where gid >= Size - 1 - t
 ; (unsigned compare) return immediately; every other thread stores
 ;   m_cuda[Size * (gid + t + 1) + t] =
 ;       a_cuda[Size * (gid + t + 1) + t] / a_cuda[Size * t + t]
 define dso_local void @_Z4Fan1PfS_ii(float* %m_cuda, float* %a_cuda, i32 %Size, i32 %t) #0 {
 entry:
  ; Spill the four parameters to stack slots.
  %m_cuda.addr = alloca float*, align 8
  %a_cuda.addr = alloca float*, align 8
  %Size.addr = alloca i32, align 4
  %t.addr = alloca i32, align 4
  store float* %m_cuda, float** %m_cuda.addr, align 8
  store float* %a_cuda, float** %a_cuda.addr, align 8
  store i32 %Size, i32* %Size.addr, align 4
  store i32 %t, i32* %t.addr, align 4
  ; %add = tid.x + ctaid.x * ntid.x
  %call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
  %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
  %call2 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
  %mul = mul i32 %call1, %call2
  %add = add i32 %call, %mul
  ; %sub3 = Size - 1 - t; the bound is compared as an unsigned value.
  %0 = load i32, i32* %Size.addr, align 4
  %sub = sub nsw i32 %0, 1
  %1 = load i32, i32* %t.addr, align 4
  %sub3 = sub nsw i32 %sub, %1
  %cmp = icmp uge i32 %add, %sub3
  br i1 %cmp, label %if.then, label %if.end
 if.then:                                          ; preds = %entry
  br label %return
 if.end:                                           ; preds = %entry
  ; %6 = a_cuda[Size * (ntid.x * ctaid.x + tid.x + t + 1) + t]
  %2 = load float*, float** %a_cuda.addr, align 8
  %3 = load i32, i32* %Size.addr, align 4
  %call4 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
  %call5 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
  %mul6 = mul i32 %call4, %call5
  %call7 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
  %add8 = add i32 %mul6, %call7
  %4 = load i32, i32* %t.addr, align 4
  %add9 = add i32 %add8, %4
  %add10 = add i32 %add9, 1
  %mul11 = mul i32 %3, %add10
  %idx.ext = zext i32 %mul11 to i64
  %add.ptr = getelementptr inbounds float, float* %2, i64 %idx.ext
  %5 = load i32, i32* %t.addr, align 4
  %idx.ext12 = sext i32 %5 to i64
  %add.ptr13 = getelementptr inbounds float, float* %add.ptr, i64 %idx.ext12
  %6 = load float, float* %add.ptr13, align 4
  ; %11 = a_cuda[Size * t + t]
  %7 = load float*, float** %a_cuda.addr, align 8
  %8 = load i32, i32* %Size.addr, align 4
  %9 = load i32, i32* %t.addr, align 4
  %mul14 = mul nsw i32 %8, %9
  %idx.ext15 = sext i32 %mul14 to i64
  %add.ptr16 = getelementptr inbounds float, float* %7, i64 %idx.ext15
  %10 = load i32, i32* %t.addr, align 4
  %idx.ext17 = sext i32 %10 to i64
  %add.ptr18 = getelementptr inbounds float, float* %add.ptr16, i64 %idx.ext17
  %11 = load float, float* %add.ptr18, align 4
  %div = fdiv float %6, %11
  ; Store the quotient to m_cuda at the same offset the numerator was read from.
  %12 = load float*, float** %m_cuda.addr, align 8
  %13 = load i32, i32* %Size.addr, align 4
  %call19 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
  %call20 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
  %mul21 = mul i32 %call19, %call20
  %call22 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
  %add23 = add i32 %mul21, %call22
  %14 = load i32, i32* %t.addr, align 4
  %add24 = add i32 %add23, %14
  %add25 = add i32 %add24, 1
  %mul26 = mul i32 %13, %add25
  %idx.ext27 = zext i32 %mul26 to i64
  %add.ptr28 = getelementptr inbounds float, float* %12, i64 %idx.ext27
  %15 = load i32, i32* %t.addr, align 4
  %idx.ext29 = sext i32 %15 to i64
  %add.ptr30 = getelementptr inbounds float, float* %add.ptr28, i64 %idx.ext29
  store float %div, float* %add.ptr30, align 4
  br label %return
 return:                                           ; preds = %if.end, %if.then
  ret void
 }
 ; Function Attrs: alwaysinline convergent nounwind
 ; __cuda_builtin_threadIdx_t::__fetch_builtin_x(): returns the result of the
 ; llvm.nvvm.read.ptx.sreg.tid.x intrinsic.
 define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
 entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  ret i32 %0
 }
 ; Function Attrs: alwaysinline convergent nounwind
 ; __cuda_builtin_blockIdx_t::__fetch_builtin_x(): returns the result of the
 ; llvm.nvvm.read.ptx.sreg.ctaid.x intrinsic.
 define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
 entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
  ret i32 %0
 }
 ; Function Attrs: alwaysinline convergent nounwind
 ; __cuda_builtin_blockDim_t::__fetch_builtin_x(): returns the result of the
 ; llvm.nvvm.read.ptx.sreg.ntid.x intrinsic.
 define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #1 comdat align 2 {
 entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
  ret i32 %0
 }
 ; Function Attrs: convergent noinline nounwind optnone
 ; Kernel Fan2(float* m_cuda, float* a_cuda, float* b_cuda, int Size, int j1,
 ; int t), unoptimized. Threads whose x index is >= Size - 1 - t or whose y
 ; index is >= Size - t (both unsigned compares) return without doing work.
 ; The remaining threads, with row = xidx + 1 + t and col = yidx + t, perform
 ;   a_cuda[Size * row + col] -= m_cuda[Size * row + t] * a_cuda[Size * t + col]
 ; and, when yidx == 0, additionally
 ;   b_cuda[row] -= m_cuda[Size * row + col] * b_cuda[t]
 ; %j1 is stored to its stack slot but never loaded again.
 define dso_local void @_Z4Fan2PfS_S_iii(float* %m_cuda, float* %a_cuda, float* %b_cuda, i32 %Size, i32 %j1, i32 %t) #0 {
 entry:
  ; Stack slots for the six parameters and the two locals xidx / yidx.
  %m_cuda.addr = alloca float*, align 8
  %a_cuda.addr = alloca float*, align 8
  %b_cuda.addr = alloca float*, align 8
  %Size.addr = alloca i32, align 4
  %j1.addr = alloca i32, align 4
  %t.addr = alloca i32, align 4
  %xidx = alloca i32, align 4
  %yidx = alloca i32, align 4
  store float* %m_cuda, float** %m_cuda.addr, align 8
  store float* %a_cuda, float** %a_cuda.addr, align 8
  store float* %b_cuda, float** %b_cuda.addr, align 8
  store i32 %Size, i32* %Size.addr, align 4
  store i32 %j1, i32* %j1.addr, align 4
  store i32 %t, i32* %t.addr, align 4
  ; First guard: tid.x + ctaid.x * ntid.x >= Size - 1 - t
  %call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
  %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
  %call2 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
  %mul = mul i32 %call1, %call2
  %add = add i32 %call, %mul
  %0 = load i32, i32* %Size.addr, align 4
  %sub = sub nsw i32 %0, 1
  %1 = load i32, i32* %t.addr, align 4
  %sub3 = sub nsw i32 %sub, %1
  %cmp = icmp uge i32 %add, %sub3
  br i1 %cmp, label %if.then, label %if.end
 if.then:                                          ; preds = %entry
  br label %if.end58
 if.end:                                           ; preds = %entry
  ; Second guard: tid.y + ctaid.y * ntid.y >= Size - t
  %call4 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #3
  %call5 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #3
  %call6 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #3
  %mul7 = mul i32 %call5, %call6
  %add8 = add i32 %call4, %mul7
  %2 = load i32, i32* %Size.addr, align 4
  %3 = load i32, i32* %t.addr, align 4
  %sub9 = sub nsw i32 %2, %3
  %cmp10 = icmp uge i32 %add8, %sub9
  br i1 %cmp10, label %if.then11, label %if.end12
 if.then11:                                        ; preds = %if.end
  br label %if.end58
 if.end12:                                         ; preds = %if.end
  ; xidx = ctaid.x * ntid.x + tid.x
  %call13 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
  %call14 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
  %mul15 = mul i32 %call13, %call14
  %call16 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
  %add17 = add i32 %mul15, %call16
  store i32 %add17, i32* %xidx, align 4
  ; yidx = ctaid.y * ntid.y + tid.y
  %call18 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #3
  %call19 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #3
  %mul20 = mul i32 %call18, %call19
  %call21 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #3
  %add22 = add i32 %mul20, %call21
  store i32 %add22, i32* %yidx, align 4
  ; %9 = m_cuda[Size * (xidx + 1 + t) + t]
  %4 = load float*, float** %m_cuda.addr, align 8
  %5 = load i32, i32* %Size.addr, align 4
  %6 = load i32, i32* %xidx, align 4
  %add23 = add nsw i32 %6, 1
  %7 = load i32, i32* %t.addr, align 4
  %add24 = add nsw i32 %add23, %7
  %mul25 = mul nsw i32 %5, %add24
  %8 = load i32, i32* %t.addr, align 4
  %add26 = add nsw i32 %mul25, %8
  %idxprom = sext i32 %add26 to i64
  %arrayidx = getelementptr inbounds float, float* %4, i64 %idxprom
  %9 = load float, float* %arrayidx, align 4
  ; %15 = a_cuda[Size * t + (yidx + t)]
  %10 = load float*, float** %a_cuda.addr, align 8
  %11 = load i32, i32* %Size.addr, align 4
  %12 = load i32, i32* %t.addr, align 4
  %mul27 = mul nsw i32 %11, %12
  %13 = load i32, i32* %yidx, align 4
  %14 = load i32, i32* %t.addr, align 4
  %add28 = add nsw i32 %13, %14
  %add29 = add nsw i32 %mul27, %add28
  %idxprom30 = sext i32 %add29 to i64
  %arrayidx31 = getelementptr inbounds float, float* %10, i64 %idxprom30
  %15 = load float, float* %arrayidx31, align 4
  %mul32 = fmul contract float %9, %15
  ; a_cuda[Size * (xidx + 1 + t) + (yidx + t)] -= %mul32
  %16 = load float*, float** %a_cuda.addr, align 8
  %17 = load i32, i32* %Size.addr, align 4
  %18 = load i32, i32* %xidx, align 4
  %add33 = add nsw i32 %18, 1
  %19 = load i32, i32* %t.addr, align 4
  %add34 = add nsw i32 %add33, %19
  %mul35 = mul nsw i32 %17, %add34
  %20 = load i32, i32* %yidx, align 4
  %21 = load i32, i32* %t.addr, align 4
  %add36 = add nsw i32 %20, %21
  %add37 = add nsw i32 %mul35, %add36
  %idxprom38 = sext i32 %add37 to i64
  %arrayidx39 = getelementptr inbounds float, float* %16, i64 %idxprom38
  %22 = load float, float* %arrayidx39, align 4
  %sub40 = fsub contract float %22, %mul32
  store float %sub40, float* %arrayidx39, align 4
  ; Only threads with yidx == 0 go on to update b_cuda.
  %23 = load i32, i32* %yidx, align 4
  %cmp41 = icmp eq i32 %23, 0
  br i1 %cmp41, label %if.then42, label %if.end58
 if.then42:                                        ; preds = %if.end12
  ; %30 = m_cuda[Size * (xidx + 1 + t) + (yidx + t)]
  %24 = load float*, float** %m_cuda.addr, align 8
  %25 = load i32, i32* %Size.addr, align 4
  %26 = load i32, i32* %xidx, align 4
  %add43 = add nsw i32 %26, 1
  %27 = load i32, i32* %t.addr, align 4
  %add44 = add nsw i32 %add43, %27
  %mul45 = mul nsw i32 %25, %add44
  %28 = load i32, i32* %yidx, align 4
  %29 = load i32, i32* %t.addr, align 4
  %add46 = add nsw i32 %28, %29
  %add47 = add nsw i32 %mul45, %add46
  %idxprom48 = sext i32 %add47 to i64
  %arrayidx49 = getelementptr inbounds float, float* %24, i64 %idxprom48
  %30 = load float, float* %arrayidx49, align 4
  ; %33 = b_cuda[t]
  %31 = load float*, float** %b_cuda.addr, align 8
  %32 = load i32, i32* %t.addr, align 4
  %idxprom50 = sext i32 %32 to i64
  %arrayidx51 = getelementptr inbounds float, float* %31, i64 %idxprom50
  %33 = load float, float* %arrayidx51, align 4
  %mul52 = fmul contract float %30, %33
  ; b_cuda[xidx + 1 + t] -= %mul52
  %34 = load float*, float** %b_cuda.addr, align 8
  %35 = load i32, i32* %xidx, align 4
  %add53 = add nsw i32 %35, 1
  %36 = load i32, i32* %t.addr, align 4
  %add54 = add nsw i32 %add53, %36
  %idxprom55 = sext i32 %add54 to i64
  %arrayidx56 = getelementptr inbounds float, float* %34, i64 %idxprom55
  %37 = load float, float* %arrayidx56, align 4
  %sub57 = fsub contract float %37, %mul52
  store float %sub57, float* %arrayidx56, align 4
  br label %if.end58
 if.end58:                                         ; preds = %if.then, %if.then11, %if.then42, %if.end12
  ret void
 }
 ; Function Attrs: alwaysinline convergent nounwind
 ; __cuda_builtin_threadIdx_t::__fetch_builtin_y(): returns the result of the
 ; llvm.nvvm.read.ptx.sreg.tid.y intrinsic.
 define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
 entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
  ret i32 %0
 }
 ; Function Attrs: alwaysinline convergent nounwind
 ; __cuda_builtin_blockIdx_t::__fetch_builtin_y(): returns the result of the
 ; llvm.nvvm.read.ptx.sreg.ctaid.y intrinsic.
 define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
 entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
  ret i32 %0
 }
 ; Function Attrs: alwaysinline convergent nounwind
 ; __cuda_builtin_blockDim_t::__fetch_builtin_y(): returns the result of the
 ; llvm.nvvm.read.ptx.sreg.ntid.y intrinsic.
 define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #1 comdat align 2 {
 entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
  ret i32 %0
 }
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #2
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #2
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y() #2
 attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #2 = { nounwind readnone }
 attributes #3 = { convergent nounwind }
 !llvm.module.flags = !{!0, !1, !2}
 !nvvm.annotations = !{!3, !4, !5, !6, !5, !7, !7, !7, !7, !8, !8, !7}
 !llvm.ident = !{!9}
 !nvvmir.version = !{!10}
 !0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
 !1 = !{i32 1, !"wchar_size", i32 4}
 !2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
 !3 = !{void (float*, float*, i32, i32)* @_Z4Fan1PfS_ii, !"kernel", i32 1}
 !4 = !{void (float*, float*, float*, i32, i32, i32)* @_Z4Fan2PfS_S_iii, !"kernel", i32 1}
 !5 = !{null, !"align", i32 8}
 !6 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
 !7 = !{null, !"align", i32 16}
 !8 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
 !9 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
 !10 = !{i32 1, i32 4}
--- a/examples/gauss/gaussian-host-x86_64-unknown-linux-gnu.ll
+++ b/examples/gauss/gaussian-host-x86_64-unknown-linux-gnu.ll
--- a/examples/gauss/gaussian.cu
+++ b/examples/gauss/gaussian.cu
@ -0,0 +1,522 @@
 /*-----------------------------------------------------------
 ** gaussian.cu -- The program is to solve a linear system Ax = b
 **   by using Gaussian Elimination. The algorithm on page 101
 **   ("Foundations of Parallel Programming") is used.
 **   The sequential version is gaussian.c.  This parallel
 **   implementation converts three independent for() loops
 **   into three Fans.  Use the data file ge_3.dat to verify
 **   the correctness of the output.
 **
 ** Written by Andreas Kura, 02/15/95
 ** Modified by Chong-wei Xu, 04/20/95
 ** Modified by Chris Gregg for CUDA, 07/20/2009
 **-----------------------------------------------------------
 */
 #include "cuda_runtime.h"
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/time.h>
 #ifdef TIMING
 #include "timing.h"
 #endif
 #ifdef RD_WG_SIZE_0_0
 #define MAXBLOCKSIZE RD_WG_SIZE_0_0
 #elif defined(RD_WG_SIZE_0)
 #define MAXBLOCKSIZE RD_WG_SIZE_0
 #elif defined(RD_WG_SIZE)
 #define MAXBLOCKSIZE RD_WG_SIZE
 #else
 #define MAXBLOCKSIZE 512
 #endif
 // 2D defines. Go from specific to general
 #ifdef RD_WG_SIZE_1_0
 #define BLOCK_SIZE_XY RD_WG_SIZE_1_0
 #elif defined(RD_WG_SIZE_1)
 #define BLOCK_SIZE_XY RD_WG_SIZE_1
 #elif defined(RD_WG_SIZE)
 #define BLOCK_SIZE_XY RD_WG_SIZE
 #else
 #define BLOCK_SIZE_XY 1
 #endif
 #ifdef TIMING
 struct timeval tv;
 struct timeval tv_total_start, tv_total_end;
 struct timeval tv_h2d_start, tv_h2d_end;
 struct timeval tv_d2h_start, tv_d2h_end;
 struct timeval tv_kernel_start, tv_kernel_end;
 struct timeval tv_mem_alloc_start, tv_mem_alloc_end;
 struct timeval tv_close_start, tv_close_end;
 float init_time = 0, mem_alloc_time = 0, h2d_time = 0, kernel_time = 0,
      d2h_time = 0, close_time = 0, total_time = 0;
 #endif
 // Dimension of the square system; set from the -s argument in main() or read
 // from the first integer of the input file in InitProblemOnce().
 int Size;
 // Host buffers: a is the Size*Size coefficient matrix, b the right-hand side
 // (Size entries), finalVec the solution vector allocated by BackSub().
 float *a, *b, *finalVec;
 // Host copy of the Size*Size multiplier matrix; zeroed by InitPerRun().
 float *m;
 // Input file handle opened in InitProblemOnce() and read by InitMat/InitAry.
 FILE *fp;
 void InitProblemOnce(char *filename);
 void InitPerRun();
 void ForwardSub();
 void BackSub();
 __global__ void Fan1(float *m, float *a, int Size, int t);
 __global__ void Fan2(float *m, float *a, float *b, int Size, int j1, int t);
 void InitMat(float *ary, int nrow, int ncol);
 void InitAry(float *ary, int ary_size);
 void PrintMat(float *ary, int nrow, int ncolumn);
 void PrintAry(float *ary, int ary_size);
 void PrintDeviceProperties();
 void checkCUDAError(const char *msg);
 // Wall-clock duration of the kernel loop in ForwardSub(), in microseconds.
 unsigned int totalKernelTime = 0;
 // create both matrix and right hand side, Ke Wang 2013/08/12 11:51:06
 //
 // Fills the size x size row-major matrix m with m[i][j] = 10 * exp(-0.01 * |i - j|),
 // i.e. every diagonal holds one constant that decays away from the main
 // diagonal.
 //
 // m    : output buffer with room for size * size floats
 // size : matrix dimension; nothing is written when size <= 0 or m is NULL
 //
 // The 2 * size - 1 per-diagonal coefficients live on the heap: the former
 // variable-length array is not standard C++, could overflow the stack for
 // large sizes and had a negative length for size == 0.
 void create_matrix(float *m, int size) {
  if (m == NULL || size <= 0)
    return;
  const float lamda = -0.01;
  const size_t coe_len = 2 * (size_t)size - 1;
  float *coe = (float *)malloc(coe_len * sizeof(float));
  if (coe == NULL) {
    fprintf(stderr, "create_matrix: cannot allocate %zu coefficients\n",
            coe_len);
    exit(EXIT_FAILURE);
  }
  // coe[size - 1] belongs to the main diagonal; entries are mirrored around it
  for (int i = 0; i < size; i++) {
    const float coe_i = 10 * exp(lamda * i);
    coe[size - 1 + i] = coe_i;
    coe[size - 1 - i] = coe_i;
  }
  for (int i = 0; i < size; i++) {
    for (int j = 0; j < size; j++) {
      m[i * size + j] = coe[size - 1 - i + j];
    }
  }
  free(coe);
 }
 /*------------------------------------------------------
 ** main() -- Parse the command line, build or load the
 ** system, run forward elimination on the device and back
 ** substitution on the host, then report the timings.
 **
 **   -s size      create the matrix and a right-hand side
 **                of ones inside the program
 **   -f filename  read matrix and right-hand side from file
 **   -q           do not print the matrices and the result
 **------------------------------------------------------
 */
 int main(int argc, char *argv[]) {
  printf("WG size of kernel 1 = %d, WG size of kernel 2= %d X %d\n",
         MAXBLOCKSIZE, BLOCK_SIZE_XY, BLOCK_SIZE_XY);
  int verbose = 1;
  int i, j;
  char flag;
  if (argc < 2) {
    printf("Usage: gaussian -f filename / -s size [-q]\n\n");
    printf("-q (quiet) suppresses printing the matrix and result values.\n");
    printf("-f (filename) path of input file\n");
    printf(
        "-s (size) size of matrix. Create matrix and rhs in this program \n");
    printf(
        "The first line of the file contains the dimension of the matrix, n.");
    printf("The second line of the file is a newline.\n");
    printf("The next n lines contain n tab separated values for the matrix.");
    printf("The next line of the file is a newline.\n");
    printf("The next line of the file is a 1xn vector with tab separated "
           "values.\n");
    printf("The next line of the file is a newline. (optional)\n");
    printf("The final line of the file is the pre-computed solution. "
           "(optional)\n");
    printf("Example: matrix4.txt:\n");
    printf("4\n");
    printf("\n");
    printf("-0.6\t-0.5\t0.7\t0.3\n");
    printf("-0.3\t-0.9\t0.3\t0.7\n");
    printf("-0.4\t-0.5\t-0.3\t-0.8\n");
    printf("0.0\t-0.1\t0.2\t0.9\n");
    printf("\n");
    printf("-0.85\t-0.68\t0.24\t-0.53\n");
    printf("\n");
    printf("0.7\t0.0\t-0.4\t-0.5\n");
    exit(0);
  }
  cudaSetDevice(0);
  PrintDeviceProperties();
  for (i = 1; i < argc; i++) {
    if (argv[i][0] != '-')
      continue;
    flag = argv[i][1];
    switch (flag) {
    case 's': // generate the system internally
      i++;
      // the option needs a value; do not read past the end of argv
      if (i >= argc) {
        fprintf(stderr, "Option -s requires a matrix size\n");
        exit(EXIT_FAILURE);
      }
      Size = atoi(argv[i]);
      if (Size <= 0) {
        fprintf(stderr, "Invalid matrix size: %s\n", argv[i]);
        exit(EXIT_FAILURE);
      }
      printf("Create matrix internally in parse, size = %d \n", Size);
      a = (float *)malloc(Size * Size * sizeof(float));
      b = (float *)malloc(Size * sizeof(float));
      m = (float *)malloc(Size * Size * sizeof(float));
      if (a == NULL || b == NULL || m == NULL) {
        fprintf(stderr, "Out of memory for a %d x %d system\n", Size, Size);
        exit(EXIT_FAILURE);
      }
      create_matrix(a, Size);
      for (j = 0; j < Size; j++)
        b[j] = 1.0;
      break;
    case 'f': // load the system from a file
      i++;
      if (i >= argc) {
        fprintf(stderr, "Option -f requires a file name\n");
        exit(EXIT_FAILURE);
      }
      printf("Read file from %s \n", argv[i]);
      InitProblemOnce(argv[i]);
      break;
    case 'q': // quiet: this used to set verbose = 1, so -q had no effect
      verbose = 0;
      break;
    }
  }
  // without -s or -f (or after a failed allocation) there is nothing to solve
  if (a == NULL || b == NULL || m == NULL) {
    fprintf(stderr, "No input system: use -f filename or -s size\n");
    exit(EXIT_FAILURE);
  }
  InitPerRun();
  // begin timing
  struct timeval time_start;
  gettimeofday(&time_start, NULL);
  // run kernels
  ForwardSub();
  // end timing
  struct timeval time_end;
  gettimeofday(&time_end, NULL);
  unsigned int time_total = (time_end.tv_sec * 1000000 + time_end.tv_usec) -
                            (time_start.tv_sec * 1000000 + time_start.tv_usec);
  if (verbose) {
    printf("Matrix m is: \n");
    PrintMat(m, Size, Size);
    printf("Matrix a is: \n");
    PrintMat(a, Size, Size);
    printf("Array b is: \n");
    PrintAry(b, Size);
  }
  BackSub();
  if (verbose) {
    printf("The final solution is: \n");
    PrintAry(finalVec, Size);
  }
  printf("\nTime total (including memory transfers)\t%f sec\n",
         time_total * 1e-6);
  printf("Time for CUDA kernels:\t%f sec\n", totalKernelTime * 1e-6);
  free(m);
  free(a);
  free(b);
  // BackSub() allocated the solution vector; release it as well
  free(finalVec);
 #ifdef TIMING
  printf("Exec: %f\n", kernel_time);
 #endif
  return 0;
 }
 /*------------------------------------------------------
 ** PrintDeviceProperties() -- Print the number of devices
 ** reported by cudaGetDeviceCount() and, for each of them,
 ** the fields of its cudaDeviceProp record. If the
 ** properties of a device cannot be queried, the error
 ** string of the last CUDA error is printed instead.
 **-----------------------------------------------------
 */
 void PrintDeviceProperties() {
  cudaDeviceProp deviceProp;
  int nDevCount = 0;
  cudaGetDeviceCount(&nDevCount);
  printf("Total Device found: %d", nDevCount);
  for (int nDeviceIdx = 0; nDeviceIdx < nDevCount; ++nDeviceIdx) {
    memset(&deviceProp, 0, sizeof(deviceProp));
    if (cudaSuccess == cudaGetDeviceProperties(&deviceProp, nDeviceIdx)) {
      printf("\nDevice Name \t\t - %s ", deviceProp.name);
      printf("\n**************************************");
      // the memory sizes are size_t values, so they are printed with %zu
      // like the other size_t fields below (was %lu)
      printf("\nTotal Global Memory\t\t\t - %zu KB",
             deviceProp.totalGlobalMem / 1024);
      printf("\nShared memory available per block \t - %zu KB",
             deviceProp.sharedMemPerBlock / 1024);
      printf("\nNumber of registers per thread block \t - %d",
             deviceProp.regsPerBlock);
      printf("\nWarp size in threads \t\t\t - %d", deviceProp.warpSize);
      printf("\nMemory Pitch \t\t\t\t - %zu bytes", deviceProp.memPitch);
      printf("\nMaximum threads per block \t\t - %d",
             deviceProp.maxThreadsPerBlock);
      printf("\nMaximum Thread Dimension (block) \t - %d %d %d",
             deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1],
             deviceProp.maxThreadsDim[2]);
      printf("\nMaximum Thread Dimension (grid) \t - %d %d %d",
             deviceProp.maxGridSize[0], deviceProp.maxGridSize[1],
             deviceProp.maxGridSize[2]);
      printf("\nTotal constant memory \t\t\t - %zu bytes",
             deviceProp.totalConstMem);
      printf("\nCUDA ver \t\t\t\t - %d.%d", deviceProp.major, deviceProp.minor);
      printf("\nClock rate \t\t\t\t - %d KHz", deviceProp.clockRate);
      printf("\nTexture Alignment \t\t\t - %zu bytes",
             deviceProp.textureAlignment);
      printf("\nDevice Overlap \t\t\t\t - %s",
             deviceProp.deviceOverlap ? "Allowed" : "Not Allowed");
      printf("\nNumber of Multi processors \t\t - %d\n\n",
             deviceProp.multiProcessorCount);
    } else
      printf("\n%s", cudaGetErrorString(cudaGetLastError()));
  }
 }
 /*------------------------------------------------------
 ** InitProblemOnce -- Initialize all of matrices and
 ** vectors by opening a data file specified by the user.
 **
 ** The file starts with the dimension Size, followed by
 ** the Size*Size entries of a and the Size entries of b.
 ** The global arrays a, b and m are allocated here; m is
 ** left uninitialized (InitPerRun() clears it).
 **
 ** The program exits with an error message when the file
 ** cannot be opened, the dimension is missing or not
 ** positive, or memory runs out. The file is closed again
 ** before returning.
 **------------------------------------------------------
 */
 void InitProblemOnce(char *filename) {
  printf("The file name is: %s\n", filename);
  fp = fopen(filename, "r");
  if (fp == NULL) {
    fprintf(stderr, "Cannot open input file %s\n", filename);
    exit(EXIT_FAILURE);
  }
  if (fscanf(fp, "%d", &Size) != 1 || Size <= 0) {
    fprintf(stderr, "Invalid matrix dimension in %s\n", filename);
    fclose(fp);
    exit(EXIT_FAILURE);
  }
  a = (float *)malloc((size_t)Size * Size * sizeof(float));
  b = (float *)malloc((size_t)Size * sizeof(float));
  m = (float *)malloc((size_t)Size * Size * sizeof(float));
  if (a == NULL || b == NULL || m == NULL) {
    fprintf(stderr, "Out of memory for a %d x %d system\n", Size, Size);
    fclose(fp);
    exit(EXIT_FAILURE);
  }
  InitMat(a, Size, Size);
  printf("The input matrix a is:\n");
  PrintMat(a, Size, Size);
  InitAry(b, Size);
  printf("The input array b is:\n");
  PrintAry(b, Size);
  // everything has been read; do not leak the handle
  fclose(fp);
  fp = NULL;
 }
 /*------------------------------------------------------
 ** InitPerRun() -- Reset every entry of the multiplier
 ** matrix m (Size * Size floats) to zero.
 **------------------------------------------------------
 */
 void InitPerRun() {
  const int entries = Size * Size;
  for (int idx = 0; idx < entries; ++idx) {
    m[idx] = 0.0f;
  }
 }
 /*-------------------------------------------------------
 ** Fan1() -- Calculate one column of the multiplier matrix.
 ** For elimination step t, the thread with flat x index
 ** gid handles row gid + t + 1 and stores
 **   m[row][t] = a[row][t] / a[t][t]
 ** Threads with gid >= Size - 1 - t have no row to process
 ** and return immediately.
 **-------------------------------------------------------
 */
 __global__ void Fan1(float *m_cuda, float *a_cuda, int Size, int t) {
  // flat thread index along x; unsigned like the CUDA built-ins it comes from
  const unsigned int gid = threadIdx.x + blockIdx.x * blockDim.x;
  if (gid >= Size - 1 - t)
    return;
  // row below the pivot row that this thread is responsible for
  const unsigned int row = blockDim.x * blockIdx.x + threadIdx.x + t + 1;
  float *const multiplier = m_cuda + Size * row + t;
  const float *const below_pivot = a_cuda + Size * row + t;
  const float *const pivot = a_cuda + Size * t + t;
  *multiplier = *below_pivot / *pivot;
 }
 /*-------------------------------------------------------
 ** Fan2() -- Apply elimination step t to a and b.
 ** The thread at (gx, gy) updates
 **   a[gx+1+t][gy+t] -= m[gx+1+t][t] * a[t][gy+t]
 ** and the threads with gy == 0 additionally update
 **   b[gx+1+t] -= m[gx+1+t][t] * b[t]
 ** Threads outside the (Size-1-t) x (Size-t) range return
 ** without touching memory. The parameter j1 is not used.
 **-------------------------------------------------------
 */
 __global__ void Fan2(float *m_cuda, float *a_cuda, float *b_cuda, int Size,
                     int j1, int t) {
  // drop threads that fall outside the part of the matrix still to be reduced
  if (threadIdx.x + blockIdx.x * blockDim.x >= Size - 1 - t ||
      threadIdx.y + blockIdx.y * blockDim.y >= Size - t)
    return;
  const int gx = blockIdx.x * blockDim.x + threadIdx.x;
  const int gy = blockIdx.y * blockDim.y + threadIdx.y;
  const int row = gx + 1 + t; // row being reduced
  const int col = gy + t;     // column inside that row
  a_cuda[Size * row + col] -= m_cuda[Size * row + t] * a_cuda[Size * t + col];
  if (gy != 0)
    return;
  // one thread per row also updates the right-hand side (col == t here)
  b_cuda[row] -= m_cuda[Size * row + col] * b_cuda[t];
 }
 /*------------------------------------------------------
 ** ForwardSub() -- Forward substitution of Gaussian
 ** elimination.
 **
 ** Copies m, a and b to the device, runs the Fan1/Fan2
 ** kernel pair once per elimination step t = 0..Size-2,
 ** copies the three arrays back and frees the device
 ** buffers. The wall-clock time of the kernel loop is
 ** stored in totalKernelTime (microseconds).
 **------------------------------------------------------
 */
 void ForwardSub() {
  int t;
  float *m_cuda, *a_cuda, *b_cuda;
  // allocate memory on GPU
  cudaMalloc((void **)&m_cuda, Size * Size * sizeof(float));
  cudaMalloc((void **)&a_cuda, Size * Size * sizeof(float));
  cudaMalloc((void **)&b_cuda, Size * sizeof(float));
  // copy memory to GPU
  cudaMemcpy(m_cuda, m, Size * Size * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(a_cuda, a, Size * Size * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(b_cuda, b, Size * sizeof(float), cudaMemcpyHostToDevice);
  // stop before launching kernels on buffers that could not be set up
  checkCUDAError("device allocation / upload");
  // 1D launch for Fan1: ceil(Size / block_size) blocks
  const int block_size = MAXBLOCKSIZE;
  const int grid_size = (Size + block_size - 1) / block_size;
  printf("1d grid size: %d\n", grid_size);
  dim3 dimBlock(block_size);
  dim3 dimGrid(grid_size);
  // 2D launch for Fan2: ceil(Size / blockSize2d) blocks per dimension
  // (same value as the former, misleadingly parenthesized expression)
  const int blockSize2d = BLOCK_SIZE_XY;
  const int gridSize2d = (Size + blockSize2d - 1) / blockSize2d;
  dim3 dimBlockXY(blockSize2d, blockSize2d);
  printf("BlockXY: %d \n", blockSize2d);
  dim3 dimGridXY(gridSize2d, gridSize2d);
 #ifdef TIMING
  gettimeofday(&tv_kernel_start, NULL);
 #endif
  printf("first grid size: %d second: %d\n", grid_size, gridSize2d);
  // begin timing kernels
  struct timeval time_start;
  gettimeofday(&time_start, NULL);
  for (t = 0; t < (Size - 1); t++) {
    Fan1<<<dimGrid, dimBlock>>>(m_cuda, a_cuda, Size, t);
    cudaDeviceSynchronize();
    // Fan2 consumes the multipliers Fan1 just wrote, so catch failures here
    checkCUDAError("Fan1");
    Fan2<<<dimGridXY, dimBlockXY>>>(m_cuda, a_cuda, b_cuda, Size, Size - t, t);
    cudaDeviceSynchronize();
    checkCUDAError("Fan2");
  }
  // end timing kernels
  struct timeval time_end;
  gettimeofday(&time_end, NULL);
  totalKernelTime = (time_end.tv_sec * 1000000 + time_end.tv_usec) -
                    (time_start.tv_sec * 1000000 + time_start.tv_usec);
 #ifdef TIMING
  tvsub(&time_end, &tv_kernel_start, &tv);
  kernel_time += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
 #endif
  // copy memory back to CPU
  cudaMemcpy(m, m_cuda, Size * Size * sizeof(float), cudaMemcpyDeviceToHost);
  cudaMemcpy(a, a_cuda, Size * Size * sizeof(float), cudaMemcpyDeviceToHost);
  cudaMemcpy(b, b_cuda, Size * sizeof(float), cudaMemcpyDeviceToHost);
  cudaFree(m_cuda);
  cudaFree(a_cuda);
  cudaFree(b_cuda);
 }
 /*------------------------------------------------------
 ** BackSub() -- Backward substitution.
 ** Allocates finalVec (Size floats) and fills it from the
 ** last row up: each unknown starts from b[row], has the
 ** contributions of the unknowns to its right subtracted
 ** and is then divided by the diagonal entry of a.
 **------------------------------------------------------
 */
 void BackSub() {
  // storage for the solution vector
  finalVec = (float *)malloc(Size * sizeof(float));
  for (int row = Size - 1; row >= 0; row--) {
    const float *coeffs = a + Size * row;
    finalVec[row] = b[row];
    // unknowns right of the diagonal are already solved; visit them right to left
    for (int col = Size - 1; col > row; col--) {
      finalVec[row] -= coeffs[col] * finalVec[col];
    }
    finalVec[row] = finalVec[row] / coeffs[row];
  }
 }
 /*------------------------------------------------------
 ** InitMat() -- Read nrow * ncol floats from the data file
 ** fp into ary. Rows are laid out Size floats apart.
 **------------------------------------------------------
 */
 void InitMat(float *ary, int nrow, int ncol) {
  for (int r = 0; r < nrow; ++r) {
    float *row = ary + Size * r;
    for (int c = 0; c < ncol; ++c) {
      fscanf(fp, "%f", row + c);
    }
  }
 }
 /*------------------------------------------------------
 ** PrintMat() -- Print the contents of the matrix
 **
 ** Matrix output is switched off by default, exactly as
 ** before, when an unconditional return made the printing
 ** code unreachable. Compile with -DPRINT_MATRICES to get
 ** the nrow x ncol entries printed row by row (rows are
 ** Size floats apart).
 **------------------------------------------------------
 */
 void PrintMat(float *ary, int nrow, int ncol) {
 #ifdef PRINT_MATRICES
  int i, j;
  for (i = 0; i < nrow; i++) {
    for (j = 0; j < ncol; j++) {
      printf("%8.2f ", *(ary + Size * i + j));
    }
    printf("\n");
  }
  printf("\n");
 #else
  // printing disabled: keep the parameters formally used
  (void)ary;
  (void)nrow;
  (void)ncol;
 #endif
 }
 /*------------------------------------------------------
 ** InitAry() -- Fill the array (vector) with ary_size
 ** floats read one after another from the data file fp
 **------------------------------------------------------
 */
 void InitAry(float *ary, int ary_size) {
  float *slot = ary;
  int remaining = ary_size;
  while (remaining-- > 0) {
    fscanf(fp, "%f", slot++);
  }
 }
 /*------------------------------------------------------
 ** PrintAry() -- Print the ary_size entries of the array
 ** (vector) on one line with two decimals each, followed
 ** by an empty line
 **------------------------------------------------------
 */
 void PrintAry(float *ary, int ary_size) {
  int idx = 0;
  while (idx < ary_size) {
    printf("%.2f ", ary[idx]);
    ++idx;
  }
  fputs("\n\n", stdout);
 }
 /*------------------------------------------------------
 ** checkCUDAError() -- Query the last CUDA error; if there
 ** is one, print it to stderr prefixed with msg and exit
 ** with EXIT_FAILURE. Returns normally otherwise.
 **------------------------------------------------------
 */
 void checkCUDAError(const char *msg) {
  const cudaError_t status = cudaGetLastError();
  if (status == cudaSuccess)
    return;
  fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(status));
  exit(EXIT_FAILURE);
 }
--- a/examples/gauss/run.sh
+++ b/examples/gauss/run.sh
@ -0,0 +1,23 @@
 #!/bin/bash
 # Build and run the gaussian example, then check its output.
 # Steps: assemble the device/host LLVM IR, run the kernel and host
 # translators, lower both modules to object files, link against the
 # runtime libraries, execute, and grep the log for the expected result.
 set -e
 llvm-as gaussian-cuda-nvptx64-nvidia-cuda-sm_61.ll
 llvm-as gaussian-host-x86_64-unknown-linux-gnu.ll
 ../../build/compilation/kernelTranslator gaussian-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
 ../../build/compilation/hostTranslator gaussian-host-x86_64-unknown-linux-gnu.bc host.bc
 llc --relocation-model=pic --filetype=obj  kernel.bc
 llc --relocation-model=pic --filetype=obj  host.bc
 g++ -Wall -L../../build/runtime \
     -L../../build/runtime/threadPool \
     -o gaussian -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread
 # Only append the previous LD_LIBRARY_PATH when it is non-empty, so no empty
 # (i.e. current-directory) entry is added to the search path.
 export LD_LIBRARY_PATH="../../build/runtime:../../build/runtime/threadPool${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
 # Truncate the log (">" rather than ">>") so a stale result from an earlier
 # run cannot make the check below pass.
 ./gaussian -f ../../rodinia-data/gaussian/matrix4.txt > res.log
 if grep -q "0.70 0.00 -0.40 -0.50" res.log; then
    echo "Pass"
 else
    echo "Error result"
    exit 1
 fi
--- a/examples/heartwall/AVI/avilib.c
+++ b/examples/heartwall/AVI/avilib.c
--- a/examples/heartwall/AVI/avilib.h
+++ b/examples/heartwall/AVI/avilib.h
@ -0,0 +1,317 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
 /*
 *  avilib.h
 *
 *  Copyright (C) Thomas Östreich - June 2001
 *  multiple audio track support Copyright (C) 2002 Thomas Östreich
 *
 *  Original code:
 *  Copyright (C) 1999 Rainer Johanni <Rainer@Johanni.de>
 *
 *  This file is part of transcode, a linux video stream processing tool
 *
 *  transcode is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2, or (at your option)
 *  any later version.
 *
 *  transcode is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with GNU Make; see the file COPYING.  If not, write to
 *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 */
 #include <fcntl.h>
 #include <stdio.h>
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <unistd.h>
 // #include <windows.h>
 #include <errno.h>
 #include <inttypes.h>
 #include <limits.h>
 #include <stdlib.h>
 #include <string.h>
 #ifndef AVILIB_H
 #define AVILIB_H
 #define AVI_MAX_TRACKS 8
 /* One entry of the video index referenced by avi_t::video_index.
  * The field meanings are not documented in this header; presumably
  * key-frame flag, file position and chunk length -- TODO confirm against
  * the implementation. */
 typedef struct {
  unsigned long key;
  unsigned long pos;
  unsigned long len;
 } video_index_entry;
 /* One entry of the per-track audio index referenced by
  * track_t::audio_index.
  * The field meanings are not documented in this header; presumably file
  * position, chunk length and a running total -- TODO confirm against the
  * implementation. */
 typedef struct {
  unsigned long pos;
  unsigned long len;
  unsigned long tot;
 } audio_index_entry;
 /* Per-audio-track state; avi_t holds AVI_MAX_TRACKS of these. */
 typedef struct track_s {
  long a_fmt;   /* Audio format, see the WAVE_FORMAT_* #defines below */
  long a_chans; /* Audio channels, 0 for no audio */
  long a_rate;  /* Rate in Hz */
  long a_bits;  /* Bits per audio sample */
  long mp3rate; /* MP3 bitrate in kbit/s */
  long audio_strn;   /* Audio stream number */
  long audio_bytes;  /* Total number of bytes of audio data */
  long audio_chunks; /* Chunks of audio data in the file */
  char audio_tag[4]; /* Tag of audio data */
  long audio_posc;   /* Audio position: chunk */
  long audio_posb;   /* Audio position: byte within chunk */
  /* The two offsets below carried identical comments originally; by analogy
   * with v_codech_off / v_codecf_off in avi_t they presumably refer to the
   * strh and strf chunks respectively -- TODO confirm. */
  long a_codech_off; /* Absolute offset of audio codec information */
  long a_codecf_off; /* Absolute offset of audio codec information */
  audio_index_entry *audio_index; /* Audio index entries for this track */
 } track_t;
 /* Handle describing one open AVI file: video properties, audio tracks,
  * index tables and the current read/write position. */
 typedef struct {
  long fdes; /* File descriptor of AVI file */
  /* NOTE(review): this comment used to read "0 for reading, 1 for writing",
   * which contradicts AVI_MODE_WRITE (0) / AVI_MODE_READ (1) defined below;
   * presumably the macros are authoritative -- confirm against avilib.c. */
  long mode; /* AVI_MODE_WRITE or AVI_MODE_READ */
  long width;          /* Width  of a video frame */
  long height;         /* Height of a video frame */
  double fps;          /* Frames per second */
  char compressor[8];  /* Type of compressor, 4 bytes + padding for 0 byte */
  char compressor2[8]; /* Type of compressor, 4 bytes + padding for 0 byte */
  long video_strn;     /* Video stream number */
  long video_frames;   /* Number of video frames */
  char video_tag[4];   /* Tag of video data */
  long video_pos;      /* Number of next frame to be read
                              (if index present) */
  unsigned long max_len; /* Maximum video chunk present */
  track_t track[AVI_MAX_TRACKS]; // up to AVI_MAX_TRACKS audio tracks supported
  unsigned long pos; /* Position in file */
  long n_idx;        /* Number of index entries actually filled */
  long max_idx;      /* Number of index entries actually allocated */
  long v_codech_off; /* Absolute offset of video codec (strh) info */
  long v_codecf_off; /* Absolute offset of video codec (strf) info */
  unsigned char (*idx)[16]; /* Index entries (AVI idx1 tag), 16 bytes each */
  video_index_entry *video_index; /* Video index entries */
  unsigned long last_pos; /* Position of last frame written */
  unsigned long last_len; /* Length of last frame written */
  int must_use_index;     /* Flag if frames are duplicated */
  unsigned long movi_start;
  int anum; // total number of audio tracks
  int aptr; // current audio working track
 } avi_t;
 /* Values for avi_t::mode */
 #define AVI_MODE_WRITE 0
 #define AVI_MODE_READ 1
 /* The error codes delivered by avi_open_input_file */
 #define AVI_ERR_SIZELIM                                                        \
  1 /* The write of the data would exceed the maximum size of the AVI file.    \
       This is more a warning than an error since the file may be closed       \
       safely */
 #define AVI_ERR_OPEN                                                           \
  2 /* Error opening the AVI file - wrong path name or file not                \
       readable/writable */
 #define AVI_ERR_READ 3 /* Error reading from AVI File */
 #define AVI_ERR_WRITE                                                          \
  4 /* Error writing to AVI File, disk full ??? */
 #define AVI_ERR_WRITE_INDEX                                                    \
  5 /* Could not write index to AVI file during close, file may still be       \
       usable */
 #define AVI_ERR_CLOSE                                                          \
  6 /* Could not write header to AVI file or not truncate the file during      \
       close, file is most probably corrupted */
 #define AVI_ERR_NOT_PERM                                                       \
  7 /* Operation not permitted: trying to read from a file open for writing    \
       or vice versa */
 #define AVI_ERR_NO_MEM 8 /* malloc failed */
 #define AVI_ERR_NO_AVI 9 /* Not an AVI file */
 #define AVI_ERR_NO_HDRL                                                        \
  10 /* AVI file has no header list, corrupted ??? */
 #define AVI_ERR_NO_MOVI                                                        \
  11 /* AVI file has no MOVI list, corrupted ??? */
 #define AVI_ERR_NO_VIDS 12 /* AVI file contains no video data */
 #define AVI_ERR_NO_IDX                                                         \
  13 /* The file has been opened with getIndex==0, but an operation has been   \
        performed that needs an index */
 /* Possible audio formats (values for track_t::a_fmt).
  * The whole group is skipped when WAVE_FORMAT_PCM is already defined, so an
  * earlier definition of these constants takes precedence. */
 #ifndef WAVE_FORMAT_PCM
 #define WAVE_FORMAT_UNKNOWN (0x0000)
 #define WAVE_FORMAT_PCM (0x0001)
 #define WAVE_FORMAT_ADPCM (0x0002)
 #define WAVE_FORMAT_IBM_CVSD (0x0005)
 #define WAVE_FORMAT_ALAW (0x0006)
 #define WAVE_FORMAT_MULAW (0x0007)
 #define WAVE_FORMAT_OKI_ADPCM (0x0010)
 #define WAVE_FORMAT_DVI_ADPCM (0x0011)
 #define WAVE_FORMAT_DIGISTD (0x0015)
 #define WAVE_FORMAT_DIGIFIX (0x0016)
 #define WAVE_FORMAT_YAMAHA_ADPCM (0x0020)
 #define WAVE_FORMAT_DSP_TRUESPEECH (0x0022)
 #define WAVE_FORMAT_GSM610 (0x0031)
 #define IBM_FORMAT_MULAW (0x0101)
 #define IBM_FORMAT_ALAW (0x0102)
 #define IBM_FORMAT_ADPCM (0x0103)
 #endif
 /* Public avilib API. Only declarations are visible here; the groupings
  * below are inferred from the function names and should be confirmed
  * against the implementation. */

 /* Output side: open a file for writing, describe its streams, append data. */
 avi_t *AVI_open_output_file(char *filename);
 void AVI_set_video(avi_t *AVI, int width, int height, double fps,
                   char *compressor);
 void AVI_set_audio(avi_t *AVI, int channels, long rate, int bits, int format,
                   long mp3rate);
 int AVI_write_frame(avi_t *AVI, char *data, long bytes, int keyframe);
 int AVI_dup_frame(avi_t *AVI);
 int AVI_write_audio(avi_t *AVI, char *data, long bytes);
 int AVI_append_audio(avi_t *AVI, char *data, long bytes);
 long AVI_bytes_remain(avi_t *AVI);
 int AVI_close(avi_t *AVI);
 long AVI_bytes_written(avi_t *AVI);

 /* Input side: open an existing file (by name or descriptor) and parse it. */
 avi_t *AVI_open_input_file(char *filename, int getIndex);
 avi_t *AVI_open_fd(int fd, int getIndex);
 int avi_parse_input_file(avi_t *AVI, int getIndex);

 /* Stream property accessors. */
 long AVI_audio_mp3rate(avi_t *AVI);
 long AVI_video_frames(avi_t *AVI);
 int AVI_video_width(avi_t *AVI);
 int AVI_video_height(avi_t *AVI);
 double AVI_frame_rate(avi_t *AVI);
 char *AVI_video_compressor(avi_t *AVI);
 int AVI_audio_channels(avi_t *AVI);
 int AVI_audio_bits(avi_t *AVI);
 int AVI_audio_format(avi_t *AVI);
 long AVI_audio_rate(avi_t *AVI);
 long AVI_audio_bytes(avi_t *AVI);
 long AVI_audio_chunks(avi_t *AVI);
 long AVI_max_video_chunk(avi_t *AVI);
 long AVI_frame_size(avi_t *AVI, long frame);
 long AVI_audio_size(avi_t *AVI, long frame);

 /* Positioning and reading. */
 int AVI_seek_start(avi_t *AVI);
 int AVI_set_video_position(avi_t *AVI, long frame);
 long AVI_get_video_position(avi_t *AVI, long frame);
 long AVI_read_frame(avi_t *AVI, char *vidbuf, int *keyframe);
 int AVI_set_audio_position(avi_t *AVI, long byte);
 int AVI_set_audio_bitrate(avi_t *AVI, long bitrate);
 long AVI_read_audio(avi_t *AVI, char *audbuf, long bytes);

 /* Offsets of the codec information chunks (cf. the *_codech_off /
  * *_codecf_off fields of track_t and avi_t). */
 long AVI_audio_codech_offset(avi_t *AVI);
 long AVI_audio_codecf_offset(avi_t *AVI);
 long AVI_video_codech_offset(avi_t *AVI);
 long AVI_video_codecf_offset(avi_t *AVI);
 int AVI_read_data(avi_t *AVI, char *vidbuf, long max_vidbuf, char *audbuf,
                  long max_audbuf, long *len);

 /* Error reporting. */
 void AVI_print_error(char *str);
 char *AVI_strerror();
 char *AVI_syserror();

 /* Diagnostics and miscellaneous helpers. */
 int AVI_scan(char *name);
 int AVI_dump(char *name, int mode);
 char *AVI_codec2str(short cc);
 int AVI_file_check(char *import_file);
 void AVI_info(avi_t *avifile);
 uint64_t AVI_max_size();
 int avi_update_header(avi_t *AVI);

 /* Audio track selection (cf. avi_t::anum / avi_t::aptr). */
 int AVI_set_audio_track(avi_t *AVI, int track);
 int AVI_get_audio_track(avi_t *AVI);
 int AVI_audio_tracks(avi_t *AVI);
 /* RIFF container header of a WAVE file.
  * NOTE(review): `unsigned long` is 8 bytes on LP64 targets; if this struct
  * is meant to mirror the on-disk layout byte-for-byte, confirm it is never
  * read or written directly. */
 struct riff_struct {
  unsigned char id[4]; /* RIFF */
  unsigned long len;
  unsigned char wave_id[4]; /* WAVE */
 };
 /* Generic chunk header: a four-byte id followed by a length. */
 struct chunk_struct {
  unsigned char id[4];
  unsigned long len;
 };
 /* Audio format description stored in wave_header::common. */
 struct common_struct {
  unsigned short wFormatTag;
  unsigned short wChannels;
  unsigned long dwSamplesPerSec;
  unsigned long dwAvgBytesPerSec;
  unsigned short wBlockAlign;
  unsigned short wBitsPerSample; /* Only for PCM */
 };
 /* Complete WAVE header: RIFF header, format chunk header, the format
  * description itself, then the data chunk header. */
 struct wave_header {
  struct riff_struct riff;
  struct chunk_struct format;
  struct common_struct common;
  struct chunk_struct data;
 };
 /* Stream header record; the field names follow the AVI stream header
  * ("strh") naming.
  * NOTE(review): every field is a `long`, which is 8 bytes on LP64 targets;
  * confirm the struct is not used as a direct image of the on-disk header. */
 struct AVIStreamHeader {
  long fccType;
  long fccHandler;
  long dwFlags;
  long dwPriority;
  long dwInitialFrames;
  long dwScale;
  long dwRate;
  long dwStart;
  long dwLength;
  long dwSuggestedBufferSize;
  long dwQuality;
  long dwSampleSize;
 };
 #endif
 #ifdef __cplusplus
 }
 #endif
--- a/examples/heartwall/AVI/avimod.c
+++ b/examples/heartwall/AVI/avimod.c
@ -0,0 +1,130 @@
 // #ifdef __cplusplus
 // extern "C" {
 // #endif
 //===============================================================================================================================================================================================================
 //	DEFINE / INCLUDE
 //===============================================================================================================================================================================================================
 #include "avimod.h"
 //===============================================================================================================================================================================================================
 //	FUNCTIONS
 //===============================================================================================================================================================================================================
 // Flips the specified image and crops it to the specified dimensions
 // If scaled == true, all values are scaled to the range [0.0, 1.0
 fp *chop_flip_image(char *image, int height, int width, int cropped, int scaled,
                    int converted) {
  // fixed dimensions for cropping or not cropping, square vertices starting
  // from initial point in top left corner going down and right
  int top;
  int bottom;
  int left;
  int right;
  if (cropped == 1) {
    top = 0;
    bottom = 0;
    left = 0;
    right = 0;
  } else {
    top = 0;
    bottom = height - 1;
    left = 0;
    right = width - 1;
  }
  // dimensions of new cropped image
  int height_new = bottom - top + 1;
  int width_new = right - left + 1;
  // counters
  int i, j;
  // allocate memory for cropped/flipped frame
  fp *result = (fp *)malloc(height_new * width_new * sizeof(fp));
  // crop/flip and scale frame
  fp temp;
  if (scaled) {
    fp scale = 1.0 / 255.0;
    for (i = 0; i < height_new; i++) {  // rows
      for (j = 0; j < width_new; j++) { // colums
        temp =
            (fp)image[((height - 1 - (i + top)) * width) + (j + left)] * scale;
        if (temp < 0) {
          result[i * width_new + j] = temp + 256;
        } else {
          result[i * width_new + j] = temp;
        }
      }
    }
  } else {
    for (i = 0; i < height_new; i++) {  // rows
      for (j = 0; j < width_new; j++) { // colums
        temp = (fp)image[((height - 1 - (i + top)) * width) + (j + left)];
        if (temp < 0) {
          result[i * width_new + j] = temp + 256;
        } else {
          result[i * width_new + j] = temp;
        }
      }
    }
  }
  // convert storage method (from row-major to column-major)
  fp *result_converted = (fp *)malloc(height_new * width_new * sizeof(fp));
  if (converted == 1) {
    for (i = 0; i < width_new; i++) {    // rows
      for (j = 0; j < height_new; j++) { // colums
        result_converted[i * height_new + j] = result[j * width_new + i];
      }
    }
  } else {
    result_converted = result;
  }
  free(result);
  // return
  return result_converted;
 }
 // Returns the specified frame from the specified video file, processed by
 // chop_flip_image().
 //
 // Parameters:
 //   cell_file - open AVI handle to read from
 //   frame_num - index of the frame to read
 //   cropped, scaled, converted - forwarded unchanged to chop_flip_image()
 //
 // Returns whatever chop_flip_image() returns for the frame. Terminates the
 // program with exit(-1) if the frame dimensions are not positive, the read
 // buffer cannot be allocated, or AVI_read_frame() reports -1.
 fp *get_frame(avi_t *cell_file, int frame_num, int cropped, int scaled,
               int converted) {
  // variable
  int dummy;
  int width = AVI_video_width(cell_file);
  int height = AVI_video_height(cell_file);
  // AVI_read_frame is declared to return long; keep the full width
  long status;
  if (width <= 0 || height <= 0) {
    fprintf(stderr, "get_frame: invalid frame dimensions %d x %d\n", width,
            height);
    exit(-1);
  }
  // There are 600 frames in this file (i.e. frame_num = 600 causes an error)
  AVI_set_video_position(cell_file, frame_num);
  // Read in the frame from the AVI
  char *image_buf = (char *)malloc((size_t)width * (size_t)height * sizeof(char));
  if (image_buf == NULL) {
    fprintf(stderr, "get_frame: could not allocate frame buffer\n");
    exit(-1);
  }
  status = AVI_read_frame(cell_file, image_buf, &dummy);
  if (status == -1) {
    AVI_print_error((char *)"Error with AVI_read_frame");
    exit(-1);
  }
  // The image is read in upside-down, so we need to flip it
  fp *image_chopped =
      chop_flip_image(image_buf, height, width, cropped, scaled, converted);
  // free image buffer
  free(image_buf);
  // return
  return image_chopped;
 }
 // #ifdef __cplusplus
 // }
 // #endif
--- a/examples/heartwall/AVI/avimod.h
+++ b/examples/heartwall/AVI/avimod.h
@ -0,0 +1,24 @@
 // Include guard: the header previously had none, so every inclusion
 // re-processed its contents.
 #ifndef AVIMOD_H
 #define AVIMOD_H
 #ifdef __cplusplus
 extern "C" {
 #endif
 //===============================================================================================================================================================================================================
 //	DEFINE / INCLUDE
 //===============================================================================================================================================================================================================
 // Floating-point type used for frame data.
 #define fp float
 #include "avilib.h"
 //===============================================================================================================================================================================================================
 //	FUNCTION PROTOTYPES
 //===============================================================================================================================================================================================================
 // Flips `image` (height x width bytes) vertically; `cropped`, `scaled` and
 // `converted` select cropping, scaling to [0.0, 1.0] and column-major
 // output. See avimod.c for details.
 fp *chop_flip_image(char *image, int height, int width, int cropped, int scaled,
                     int converted);
 // Reads frame `frame_num` from `cell_file` and returns it after processing
 // by chop_flip_image() with the given flags.
 fp *get_frame(avi_t *cell_file, int frame_num, int cropped, int scaled,
               int converted);
 #ifdef __cplusplus
 }
 #endif
 #endif // AVIMOD_H
--- a/examples/heartwall/define.c
+++ b/examples/heartwall/define.c
@ -0,0 +1,396 @@
 //===============================================================================================================================================================================================================
 //===============================================================================================================================================================================================================
 //	DEFINE / INCLUDE
 //===============================================================================================================================================================================================================
 //===============================================================================================================================================================================================================
 // Floating-point type used throughout the heartwall example.
 #define fp float
 /* #define NUMBER_THREADS 512 */
 // NUMBER_THREADS: taken from the first of RD_WG_SIZE_0_0, RD_WG_SIZE_0 or
 // RD_WG_SIZE that is defined at compile time; defaults to 256.
 #ifdef RD_WG_SIZE_0_0
 #define NUMBER_THREADS RD_WG_SIZE_0_0
 #elif defined(RD_WG_SIZE_0)
 #define NUMBER_THREADS RD_WG_SIZE_0
 #elif defined(RD_WG_SIZE)
 #define NUMBER_THREADS RD_WG_SIZE
 #else
 #define NUMBER_THREADS 256
 #endif
 #define ENDO_POINTS 20
 #define EPI_POINTS 31
 // Derived rather than hard-coded (was 51) so it cannot drift from the two
 // counts above.
 #define ALL_POINTS (ENDO_POINTS + EPI_POINTS)
 //===============================================================================================================================================================================================================
 //===============================================================================================================================================================================================================
 //	PARAMS_COMMON_CHANGE STRUCT
 //===============================================================================================================================================================================================================
 //===============================================================================================================================================================================================================
 // Parameters grouped under the file's "PARAMS_COMMON_CHANGE" heading.
 // presumably the part of the common state that changes per frame -- confirm
 // against the code that fills it in.
 typedef struct params_common_change {
  //====================================================================================================
  //	FRAME
  //====================================================================================================
  fp *d_frame;  // frame data; the d_ prefix presumably marks a device pointer -- TODO confirm
  int frame_no; // frame number
 } params_common_change;
 //===============================================================================================================================================================================================================
 //===============================================================================================================================================================================================================
 //	PARAMS_COMMON STRUCTURE
 //===============================================================================================================================================================================================================
 //===============================================================================================================================================================================================================
 // Parameters grouped under the file's "PARAMS_COMMON" heading.
 // Naming pattern used below (inferred from the names, confirm against the
 // code that fills the struct in): *_rows / *_cols are dimensions, *_elem an
 // element count, *_mem a size in bytes, and a d_ prefix presumably marks a
 // device-side pointer.
 typedef struct params_common {
  //======================================================================================================================================================
  //	HARDCODED INPUTS FROM MATLAB
  //======================================================================================================================================================
  //====================================================================================================
  //	CONSTANTS
  //====================================================================================================
  int sSize;
  int tSize;
  int maxMove;
  fp alpha;
  //====================================================================================================
  //	FRAME
  //====================================================================================================
  int no_frames;
  int frame_rows;
  int frame_cols;
  int frame_elem;
  int frame_mem;
  //====================================================================================================
  //	ENDO POINTS
  //====================================================================================================
  int endoPoints;
  int endo_mem;
  int *endoRow;
  int *endoCol;
  int *tEndoRowLoc;
  int *tEndoColLoc;
  int *d_endoRow;
  int *d_endoCol;
  int *d_tEndoRowLoc;
  int *d_tEndoColLoc;
  fp *d_endoT;
  //====================================================================================================
  //	EPI POINTS (same layout as the ENDO POINTS group above)
  //====================================================================================================
  int epiPoints;
  int epi_mem;
  int *epiRow;
  int *epiCol;
  int *tEpiRowLoc;
  int *tEpiColLoc;
  int *d_epiRow;
  int *d_epiCol;
  int *d_tEpiRowLoc;
  int *d_tEpiColLoc;
  fp *d_epiT;
  //====================================================================================================
  //	ALL POINTS
  //====================================================================================================
  int allPoints;
  //======================================================================================================================================================
  //	RIGHT TEMPLATE 	FROM 	TEMPLATE ARRAY
  //======================================================================================================================================================
  int in_rows;
  int in_cols;
  int in_elem;
  int in_mem;
  //======================================================================================================================================================
  // 	AREA AROUND POINT		FROM	FRAME
  //======================================================================================================================================================
  int in2_rows;
  int in2_cols;
  int in2_elem;
  int in2_mem;
  //======================================================================================================================================================
  //	CONVOLUTION
  //======================================================================================================================================================
  int conv_rows;
  int conv_cols;
  int conv_elem;
  int conv_mem;
  int ioffset;
  int joffset;
  //======================================================================================================================================================
  //	CUMULATIVE SUM 1
  //======================================================================================================================================================
  //====================================================================================================
  //	PAD ARRAY, VERTICAL CUMULATIVE SUM
  //====================================================================================================
  int in2_pad_add_rows;
  int in2_pad_add_cols;
  int in2_pad_cumv_rows;
  int in2_pad_cumv_cols;
  int in2_pad_cumv_elem;
  int in2_pad_cumv_mem;
  //====================================================================================================
  //	SELECTION (*_rowlow/_rowhig/_collow/_colhig: bounds of the selected region)
  //====================================================================================================
  int in2_pad_cumv_sel_rows;
  int in2_pad_cumv_sel_cols;
  int in2_pad_cumv_sel_elem;
  int in2_pad_cumv_sel_mem;
  int in2_pad_cumv_sel_rowlow;
  int in2_pad_cumv_sel_rowhig;
  int in2_pad_cumv_sel_collow;
  int in2_pad_cumv_sel_colhig;
  //====================================================================================================
  //	SELECTION 2, SUBTRACTION, HORIZONTAL CUMULATIVE SUM
  //====================================================================================================
  int in2_pad_cumv_sel2_rowlow;
  int in2_pad_cumv_sel2_rowhig;
  int in2_pad_cumv_sel2_collow;
  int in2_pad_cumv_sel2_colhig;
  int in2_sub_cumh_rows;
  int in2_sub_cumh_cols;
  int in2_sub_cumh_elem;
  int in2_sub_cumh_mem;
  //====================================================================================================
  //	SELECTION
  //====================================================================================================
  int in2_sub_cumh_sel_rows;
  int in2_sub_cumh_sel_cols;
  int in2_sub_cumh_sel_elem;
  int in2_sub_cumh_sel_mem;
  int in2_sub_cumh_sel_rowlow;
  int in2_sub_cumh_sel_rowhig;
  int in2_sub_cumh_sel_collow;
  int in2_sub_cumh_sel_colhig;
  //====================================================================================================
  //	SELECTION 2, SUBTRACTION
  //====================================================================================================
  int in2_sub_cumh_sel2_rowlow;
  int in2_sub_cumh_sel2_rowhig;
  int in2_sub_cumh_sel2_collow;
  int in2_sub_cumh_sel2_colhig;
  int in2_sub2_rows;
  int in2_sub2_cols;
  int in2_sub2_elem;
  int in2_sub2_mem;
  //======================================================================================================================================================
  //	CUMULATIVE SUM 2
  //======================================================================================================================================================
  //====================================================================================================
  //	MULTIPLICATION
  //====================================================================================================
  int in2_sqr_rows;
  int in2_sqr_cols;
  int in2_sqr_elem;
  int in2_sqr_mem;
  //====================================================================================================
  //	SELECTION 2, SUBTRACTION
  //====================================================================================================
  int in2_sqr_sub2_rows;
  int in2_sqr_sub2_cols;
  int in2_sqr_sub2_elem;
  int in2_sqr_sub2_mem;
  //======================================================================================================================================================
  //	FINAL
  //======================================================================================================================================================
  int in_sqr_rows;
  int in_sqr_cols;
  int in_sqr_elem;
  int in_sqr_mem;
  //======================================================================================================================================================
  //	TEMPLATE MASK CREATE
  //======================================================================================================================================================
  int tMask_rows;
  int tMask_cols;
  int tMask_elem;
  int tMask_mem;
  //======================================================================================================================================================
  //	POINT MASK INITIALIZE
  //======================================================================================================================================================
  int mask_rows;
  int mask_cols;
  int mask_elem;
  int mask_mem;
  //======================================================================================================================================================
  //	MASK CONVOLUTION
  //======================================================================================================================================================
  int mask_conv_rows;
  int mask_conv_cols;
  int mask_conv_elem;
  int mask_conv_mem;
  int mask_conv_ioffset;
  int mask_conv_joffset;
 } params_common;
 //===============================================================================================================================================================================================================
 //===============================================================================================================================================================================================================
 //	PARAMS_UNIQUE STRUCTURE
 //===============================================================================================================================================================================================================
 //===============================================================================================================================================================================================================
 // Per-point parameter block: one instance per tracked point.
 // The host fills an array of these (unique[] in main.cu) and uploads it with
 // cudaMemcpyToSymbol into the __constant__ array d_unique, so the member
 // order and types below define the layout of that copy.
 // Buffer sizes quoted below are the byte counts passed to cudaMalloc by the
 // host setup code in main.cu; they are fields of the params_common struct.
 typedef struct params_unique {
  //======================================================================================================================================================
  //	POINT GROUP POINTERS (shared by all points of the same group)
  //======================================================================================================================================================
  // Hard-coded row coordinates of the group's points; main.cu points this at
  // common.d_endoRow or common.d_epiRow.
  int *d_Row;
  // Hard-coded column coordinates of the group's points; main.cu points this
  // at common.d_endoCol or common.d_epiCol.
  int *d_Col;
  // Group-wide buffer holding one int per point per frame
  // (endo_mem/epi_mem * no_frames bytes).
  // NOTE(review): presumably the tracked row location per frame - confirm
  // against kernel.cu.
  int *d_tRowLoc;
  // Column counterpart of d_tRowLoc (same size).
  int *d_tColLoc;
  // Group-wide template buffer (in_mem bytes per point of the group).
  fp *d_T;
  //======================================================================================================================================================
  //	POINT NUMBER
  //======================================================================================================================================================
  // Zero-based index of this point inside its own group; epi points restart
  // from 0 (main.cu stores i - endoPoints for them).
  int point_no;
  //======================================================================================================================================================
  // 	RIGHT TEMPLATE 	FROM 	TEMPLATE ARRAY
  //======================================================================================================================================================
  // Element offset computed as point_no * common.in_elem.
  // NOTE(review): presumably the start of this point's template inside d_T -
  // confirm against kernel.cu.
  int in_pointer;
  //======================================================================================================================================================
  //	AREA AROUND POINT		FROM	FRAME
  //======================================================================================================================================================
  // Per-point buffer of in2_mem bytes ((2 * sSize + 1)^2 floats).
  fp *d_in2;
  //======================================================================================================================================================
  //	CONVOLUTION
  //======================================================================================================================================================
  // Per-point buffer of conv_mem bytes.
  fp *d_conv;
  // NOTE(review): no allocation for this member appears in the host setup
  // code visible in main.cu - verify where (or whether) it is set.
  fp *d_in_mod;
  //======================================================================================================================================================
  //	CUMULATIVE SUM
  //======================================================================================================================================================
  //====================================================================================================
  //	PAD ARRAY, VERTICAL CUMULATIVE SUM
  //====================================================================================================
  // Per-point buffer of in2_pad_cumv_mem bytes.
  fp *d_in2_pad_cumv;
  //====================================================================================================
  //	SELECTION
  //====================================================================================================
  // Per-point buffer of in2_pad_cumv_sel_mem bytes.
  fp *d_in2_pad_cumv_sel;
  //====================================================================================================
  //	SELECTION 2, SUBTRACTION, HORIZONTAL CUMULATIVE SUM
  //====================================================================================================
  // Per-point buffer of in2_sub_cumh_mem bytes.
  fp *d_in2_sub_cumh;
  //====================================================================================================
  //	SELECTION
  //====================================================================================================
  // Per-point buffer of in2_sub_cumh_sel_mem bytes.
  fp *d_in2_sub_cumh_sel;
  //====================================================================================================
  //	SELECTION 2, SUBTRACTION
  //====================================================================================================
  // Per-point buffer of in2_sub2_mem bytes.
  fp *d_in2_sub2;
  //======================================================================================================================================================
  //	CUMULATIVE SUM 2
  //======================================================================================================================================================
  //====================================================================================================
  //	MULTIPLICATION
  //====================================================================================================
  // Per-point buffer of in2_sqr_mem bytes (same size as d_in2).
  fp *d_in2_sqr;
  //====================================================================================================
  //	SELECTION 2, SUBTRACTION
  //====================================================================================================
  // Per-point buffer of in2_sqr_sub2_mem bytes (same size as d_in2_sub2).
  fp *d_in2_sqr_sub2;
  //======================================================================================================================================================
  //	FINAL
  //======================================================================================================================================================
  // Per-point buffer of in_sqr_mem bytes (same size as one template).
  fp *d_in_sqr;
  //======================================================================================================================================================
  //	TEMPLATE MASK
  //======================================================================================================================================================
  // Per-point buffer of tMask_mem bytes.
  fp *d_tMask;
  //======================================================================================================================================================
  //	POINT MASK INITIALIZE
  //======================================================================================================================================================
  // NOTE(review): main.cu computes common.mask_mem but no allocation for this
  // member appears in the visible host setup code - verify before use.
  fp *d_mask;
  //======================================================================================================================================================
  //	MASK CONVOLUTION
  //======================================================================================================================================================
  // Per-point buffer of mask_conv_mem bytes.
  fp *d_mask_conv;
 } params_unique;
 //===============================================================================================================================================================================================================
 //===============================================================================================================================================================================================================
 //	END OF STRUCTURE
 //===============================================================================================================================================================================================================
 //===============================================================================================================================================================================================================
--- a/examples/heartwall/kernel.cu
+++ b/examples/heartwall/kernel.cu
--- a/examples/heartwall/main.cu
+++ b/examples/heartwall/main.cu
@ -0,0 +1,795 @@
 //===============================================================================================================================================================================================================
 //===============================================================================================================================================================================================================
 //	DEFINE / INCLUDE
 //===============================================================================================================================================================================================================
 //===============================================================================================================================================================================================================
 //======================================================================================================================================================
 //	LIBRARIES
 //======================================================================================================================================================
 #include <math.h>
 #include <stdlib.h>
 #include <string.h>
 #include <avilib.h>
 #include <avimod.h>
 #include <cuda.h>
 //======================================================================================================================================================
 //	STRUCTURES, GLOBAL STRUCTURE VARIABLES
 //======================================================================================================================================================
 #include "define.c"
 // Each parameter struct exists twice: a host-side instance that main() fills
 // in, and a __constant__ instance with a d_ prefix that receives a copy.
 // Host-side parameters that main() updates per frame (d_frame, frame_no).
 params_common_change common_change;
 // Constant-memory counterpart of common_change.
 // NOTE(review): the upload is not in the visible part of main() - presumably
 // done inside the frame loop; confirm.
 __constant__ params_common_change d_common_change;
 // Host-side parameters shared by all points (sizes, constants, group buffers).
 params_common common;
 // Constant-memory counterpart of common, filled by cudaMemcpyToSymbol.
 __constant__ params_common d_common;
 // Host-side per-point parameters. Entries [0, endoPoints) describe endo
 // points and [endoPoints, allPoints) describe epi points.
 params_unique unique[ALL_POINTS]; // cannot determine size dynamically so choose
                                   // more than usually needed
 // Constant-memory counterpart of unique, filled by cudaMemcpyToSymbol.
 __constant__ params_unique d_unique[ALL_POINTS];
 //======================================================================================================================================================
 // KERNEL CODE
 //======================================================================================================================================================
 #include "kernel.cu"
 //	WRITE DATA FUNCTION
 //===============================================================================================================================================================================================================200
 //================================================================================80
 //	Helper: prints one tab-separated row containing the value of every point
 //	for a single frame. `values` is indexed as frame + point * frameNo, i.e.
 //	each point owns a run of frameNo consecutive entries.
 //================================================================================80
 static void write_point_row(FILE *fid, const int *values, int points,
                             int frame, int frameNo) {
   int i;
   for (i = 0; i < points; i++) {
     fprintf(fid, "%d\t", values[frame + i * frameNo]);
   }
 }
 //================================================================================80
 //	Writes the per-frame values of all endo and epi points to a text file.
 //
 //	filename          path of the file to create (truncated if it exists)
 //	frameNo           total number of frames; also the stride between two
 //	                  consecutive points inside every input array
 //	frames_processed  number of frames to dump (frames 0..frames_processed-1)
 //	endoPoints        number of points in input_a / input_b
 //	input_a, input_b  endo arrays, written as two rows per frame
 //	epiPoints         number of points in input_2a / input_2b
 //	input_2a,input_2b epi arrays, written as two rows per frame
 //
 //	Every input array must hold at least points * frameNo entries.
 //	NOTE(review): the a/b arrays are presumably row and column locations -
 //	confirm against the caller.
 //	If the file cannot be opened a message is printed and nothing is written.
 //================================================================================80
 void write_data(char *filename, int frameNo, int frames_processed,
                 int endoPoints, int *input_a, int *input_b, int epiPoints,
                 int *input_2a, int *input_2b) {
   //================================================================================80
   //	VARIABLES
   //================================================================================80
   FILE *fid;
   int j;
   //================================================================================80
   //	OPEN FILE FOR WRITING
   //================================================================================80
   fid = fopen(filename, "w+");
   if (fid == NULL) {
     printf("The file was not opened for writing\n");
     return;
   }
   //================================================================================80
   //	WRITE HEADER
   //================================================================================80
   fprintf(fid, "Total AVI Frames: %d\n", frameNo);
   fprintf(fid, "Frames Processed: %d\n", frames_processed);
   fprintf(fid, "endoPoints: %d\n", endoPoints);
   fprintf(fid, "epiPoints: %d", epiPoints);
   //================================================================================80
   //	WRITE ONE SECTION PER FRAME
   //================================================================================80
   for (j = 0; j < frames_processed; j++) {
     fprintf(fid, "\n---Frame %d---", j);
     // The section titles take no arguments; the original passed a stray `j`.
     fprintf(fid, "\n--endo--\n");
     write_point_row(fid, input_a, endoPoints, j, frameNo);
     fprintf(fid, "\n");
     write_point_row(fid, input_b, endoPoints, j, frameNo);
     fprintf(fid, "\n--epi--\n");
     write_point_row(fid, input_2a, epiPoints, j, frameNo);
     fprintf(fid, "\n");
     write_point_row(fid, input_2b, epiPoints, j, frameNo);
   }
   //================================================================================80
   //	CLOSE FILE
   //================================================================================80
   // fclose flushes buffered output, so a failure here means data was lost.
   if (fclose(fid) != 0) {
     printf("The file was not closed properly\n");
   }
 }
 //===============================================================================================================================================================================================================
 //===============================================================================================================================================================================================================
 //	MAIN FUNCTION
 //===============================================================================================================================================================================================================
 //===============================================================================================================================================================================================================
 int main(int argc, char *argv[]) {
  cudaSetDevice(0);
  printf("WG size of kernel = %d \n", NUMBER_THREADS);
  //======================================================================================================================================================
  //	VARIABLES
  //======================================================================================================================================================
  // CUDA kernel execution parameters
  dim3 threads;
  dim3 blocks;
  // counter
  int i;
  int frames_processed;
  // frames
  char *video_file_name;
  avi_t *frames;
  fp *frame;
  //======================================================================================================================================================
  // 	FRAME
  //======================================================================================================================================================
  if (argc != 3) {
    printf("ERROR: usage: heartwall <inputfile> <num of frames>\n");
    exit(1);
  }
  // open movie file
  video_file_name = argv[1];
  frames = (avi_t *)AVI_open_input_file(video_file_name, 1); // added casting
  if (frames == NULL) {
    AVI_print_error((char *)"Error with AVI_open_input_file");
    return -1;
  }
  // common
  common.no_frames = AVI_video_frames(frames);
  common.frame_rows = AVI_video_height(frames);
  common.frame_cols = AVI_video_width(frames);
  common.frame_elem = common.frame_rows * common.frame_cols;
  common.frame_mem = sizeof(fp) * common.frame_elem;
  // pointers
  cudaMalloc((void **)&common_change.d_frame, common.frame_mem);
  //======================================================================================================================================================
  // 	CHECK INPUT ARGUMENTS
  //======================================================================================================================================================
  frames_processed = atoi(argv[2]);
  if (frames_processed < 0 || frames_processed > common.no_frames) {
    printf("ERROR: %d is an incorrect number of frames specified, select in "
           "the range of 0-%d\n",
           frames_processed, common.no_frames);
    return 0;
  }
  //======================================================================================================================================================
  //	HARDCODED INPUTS FROM MATLAB
  //======================================================================================================================================================
  //====================================================================================================
  //	CONSTANTS
  //====================================================================================================
  common.sSize = 40;
  common.tSize = 25;
  common.maxMove = 10;
  common.alpha = 0.87;
  //====================================================================================================
  //	ENDO POINTS
  //====================================================================================================
  common.endoPoints = ENDO_POINTS;
  common.endo_mem = sizeof(int) * common.endoPoints;
  common.endoRow = (int *)malloc(common.endo_mem);
  common.endoRow[0] = 369;
  common.endoRow[1] = 400;
  common.endoRow[2] = 429;
  common.endoRow[3] = 452;
  common.endoRow[4] = 476;
  common.endoRow[5] = 486;
  common.endoRow[6] = 479;
  common.endoRow[7] = 458;
  common.endoRow[8] = 433;
  common.endoRow[9] = 404;
  common.endoRow[10] = 374;
  common.endoRow[11] = 346;
  common.endoRow[12] = 318;
  common.endoRow[13] = 294;
  common.endoRow[14] = 277;
  common.endoRow[15] = 269;
  common.endoRow[16] = 275;
  common.endoRow[17] = 287;
  common.endoRow[18] = 311;
  common.endoRow[19] = 339;
  cudaMalloc((void **)&common.d_endoRow, common.endo_mem);
  cudaMemcpy(common.d_endoRow, common.endoRow, common.endo_mem,
             cudaMemcpyHostToDevice);
  common.endoCol = (int *)malloc(common.endo_mem);
  common.endoCol[0] = 408;
  common.endoCol[1] = 406;
  common.endoCol[2] = 397;
  common.endoCol[3] = 383;
  common.endoCol[4] = 354;
  common.endoCol[5] = 322;
  common.endoCol[6] = 294;
  common.endoCol[7] = 270;
  common.endoCol[8] = 250;
  common.endoCol[9] = 237;
  common.endoCol[10] = 235;
  common.endoCol[11] = 241;
  common.endoCol[12] = 254;
  common.endoCol[13] = 273;
  common.endoCol[14] = 300;
  common.endoCol[15] = 328;
  common.endoCol[16] = 356;
  common.endoCol[17] = 383;
  common.endoCol[18] = 401;
  common.endoCol[19] = 411;
  cudaMalloc((void **)&common.d_endoCol, common.endo_mem);
  cudaMemcpy(common.d_endoCol, common.endoCol, common.endo_mem,
             cudaMemcpyHostToDevice);
  common.tEndoRowLoc = (int *)malloc(common.endo_mem * common.no_frames);
  cudaMalloc((void **)&common.d_tEndoRowLoc,
             common.endo_mem * common.no_frames);
  common.tEndoColLoc = (int *)malloc(common.endo_mem * common.no_frames);
  cudaMalloc((void **)&common.d_tEndoColLoc,
             common.endo_mem * common.no_frames);
  //====================================================================================================
  //	EPI POINTS
  //====================================================================================================
  common.epiPoints = EPI_POINTS;
  common.epi_mem = sizeof(int) * common.epiPoints;
  common.epiRow = (int *)malloc(common.epi_mem);
  common.epiRow[0] = 390;
  common.epiRow[1] = 419;
  common.epiRow[2] = 448;
  common.epiRow[3] = 474;
  common.epiRow[4] = 501;
  common.epiRow[5] = 519;
  common.epiRow[6] = 535;
  common.epiRow[7] = 542;
  common.epiRow[8] = 543;
  common.epiRow[9] = 538;
  common.epiRow[10] = 528;
  common.epiRow[11] = 511;
  common.epiRow[12] = 491;
  common.epiRow[13] = 466;
  common.epiRow[14] = 438;
  common.epiRow[15] = 406;
  common.epiRow[16] = 376;
  common.epiRow[17] = 347;
  common.epiRow[18] = 318;
  common.epiRow[19] = 291;
  common.epiRow[20] = 275;
  common.epiRow[21] = 259;
  common.epiRow[22] = 256;
  common.epiRow[23] = 252;
  common.epiRow[24] = 252;
  common.epiRow[25] = 257;
  common.epiRow[26] = 266;
  common.epiRow[27] = 283;
  common.epiRow[28] = 305;
  common.epiRow[29] = 331;
  common.epiRow[30] = 360;
  cudaMalloc((void **)&common.d_epiRow, common.epi_mem);
  cudaMemcpy(common.d_epiRow, common.epiRow, common.epi_mem,
             cudaMemcpyHostToDevice);
  common.epiCol = (int *)malloc(common.epi_mem);
  common.epiCol[0] = 457;
  common.epiCol[1] = 454;
  common.epiCol[2] = 446;
  common.epiCol[3] = 431;
  common.epiCol[4] = 411;
  common.epiCol[5] = 388;
  common.epiCol[6] = 361;
  common.epiCol[7] = 331;
  common.epiCol[8] = 301;
  common.epiCol[9] = 273;
  common.epiCol[10] = 243;
  common.epiCol[11] = 218;
  common.epiCol[12] = 196;
  common.epiCol[13] = 178;
  common.epiCol[14] = 166;
  common.epiCol[15] = 157;
  common.epiCol[16] = 155;
  common.epiCol[17] = 165;
  common.epiCol[18] = 177;
  common.epiCol[19] = 197;
  common.epiCol[20] = 218;
  common.epiCol[21] = 248;
  common.epiCol[22] = 276;
  common.epiCol[23] = 304;
  common.epiCol[24] = 333;
  common.epiCol[25] = 361;
  common.epiCol[26] = 391;
  common.epiCol[27] = 415;
  common.epiCol[28] = 434;
  common.epiCol[29] = 448;
  common.epiCol[30] = 455;
  cudaMalloc((void **)&common.d_epiCol, common.epi_mem);
  cudaMemcpy(common.d_epiCol, common.epiCol, common.epi_mem,
             cudaMemcpyHostToDevice);
  common.tEpiRowLoc = (int *)malloc(common.epi_mem * common.no_frames);
  cudaMalloc((void **)&common.d_tEpiRowLoc, common.epi_mem * common.no_frames);
  common.tEpiColLoc = (int *)malloc(common.epi_mem * common.no_frames);
  cudaMalloc((void **)&common.d_tEpiColLoc, common.epi_mem * common.no_frames);
  //====================================================================================================
  //	ALL POINTS
  //====================================================================================================
  common.allPoints = ALL_POINTS;
  //======================================================================================================================================================
  // 	TEMPLATE SIZES
  //======================================================================================================================================================
  // common
  common.in_rows = common.tSize + 1 + common.tSize;
  common.in_cols = common.in_rows;
  common.in_elem = common.in_rows * common.in_cols;
  common.in_mem = sizeof(fp) * common.in_elem;
  //======================================================================================================================================================
  // 	CREATE ARRAY OF TEMPLATES FOR ALL POINTS
  //======================================================================================================================================================
  // common
  cudaMalloc((void **)&common.d_endoT, common.in_mem * common.endoPoints);
  cudaMalloc((void **)&common.d_epiT, common.in_mem * common.epiPoints);
  //======================================================================================================================================================
  //	SPECIFIC TO ENDO OR EPI TO BE SET HERE
  //======================================================================================================================================================
  for (i = 0; i < common.endoPoints; i++) {
    unique[i].point_no = i;
    unique[i].d_Row = common.d_endoRow;
    unique[i].d_Col = common.d_endoCol;
    unique[i].d_tRowLoc = common.d_tEndoRowLoc;
    unique[i].d_tColLoc = common.d_tEndoColLoc;
    unique[i].d_T = common.d_endoT;
  }
  for (i = common.endoPoints; i < common.allPoints; i++) {
    unique[i].point_no = i - common.endoPoints;
    unique[i].d_Row = common.d_epiRow;
    unique[i].d_Col = common.d_epiCol;
    unique[i].d_tRowLoc = common.d_tEpiRowLoc;
    unique[i].d_tColLoc = common.d_tEpiColLoc;
    unique[i].d_T = common.d_epiT;
  }
  //======================================================================================================================================================
  // 	RIGHT TEMPLATE 	FROM 	TEMPLATE ARRAY
  //======================================================================================================================================================
  // pointers
  for (i = 0; i < common.allPoints; i++) {
    unique[i].in_pointer = unique[i].point_no * common.in_elem;
  }
  //======================================================================================================================================================
  // 	AREA AROUND POINT		FROM	FRAME
  //======================================================================================================================================================
  // common
  common.in2_rows = 2 * common.sSize + 1;
  common.in2_cols = 2 * common.sSize + 1;
  common.in2_elem = common.in2_rows * common.in2_cols;
  common.in2_mem = sizeof(float) * common.in2_elem;
  // pointers
  for (i = 0; i < common.allPoints; i++) {
    cudaMalloc((void **)&unique[i].d_in2, common.in2_mem);
  }
  //======================================================================================================================================================
  // 	CONVOLUTION
  //======================================================================================================================================================
  // common
  common.conv_rows =
      common.in_rows + common.in2_rows - 1; // number of rows in I
  common.conv_cols =
      common.in_cols + common.in2_cols - 1; // number of columns in I
  common.conv_elem = common.conv_rows * common.conv_cols; // number of elements
  common.conv_mem = sizeof(float) * common.conv_elem;
  common.ioffset = 0;
  common.joffset = 0;
  // pointers
  for (i = 0; i < common.allPoints; i++) {
    cudaMalloc((void **)&unique[i].d_conv, common.conv_mem);
  }
  //======================================================================================================================================================
  // 	CUMULATIVE SUM
  //======================================================================================================================================================
  //====================================================================================================
  // 	PADDING OF ARRAY, VERTICAL CUMULATIVE SUM
  //====================================================================================================
  // common
  common.in2_pad_add_rows = common.in_rows;
  common.in2_pad_add_cols = common.in_cols;
  common.in2_pad_cumv_rows = common.in2_rows + 2 * common.in2_pad_add_rows;
  common.in2_pad_cumv_cols = common.in2_cols + 2 * common.in2_pad_add_cols;
  common.in2_pad_cumv_elem =
      common.in2_pad_cumv_rows * common.in2_pad_cumv_cols;
  common.in2_pad_cumv_mem = sizeof(float) * common.in2_pad_cumv_elem;
  // pointers
  for (i = 0; i < common.allPoints; i++) {
    cudaMalloc((void **)&unique[i].d_in2_pad_cumv, common.in2_pad_cumv_mem);
  }
  //====================================================================================================
  // 	SELECTION
  //====================================================================================================
  // common
  common.in2_pad_cumv_sel_rowlow = 1 + common.in_rows; // (1 to n+1)
  common.in2_pad_cumv_sel_rowhig = common.in2_pad_cumv_rows - 1;
  common.in2_pad_cumv_sel_collow = 1;
  common.in2_pad_cumv_sel_colhig = common.in2_pad_cumv_cols;
  common.in2_pad_cumv_sel_rows =
      common.in2_pad_cumv_sel_rowhig - common.in2_pad_cumv_sel_rowlow + 1;
  common.in2_pad_cumv_sel_cols =
      common.in2_pad_cumv_sel_colhig - common.in2_pad_cumv_sel_collow + 1;
  common.in2_pad_cumv_sel_elem =
      common.in2_pad_cumv_sel_rows * common.in2_pad_cumv_sel_cols;
  common.in2_pad_cumv_sel_mem = sizeof(float) * common.in2_pad_cumv_sel_elem;
  // pointers
  for (i = 0; i < common.allPoints; i++) {
    cudaMalloc((void **)&unique[i].d_in2_pad_cumv_sel,
               common.in2_pad_cumv_sel_mem);
  }
  //====================================================================================================
  // 	SELECTION	2, SUBTRACTION, HORIZONTAL CUMULATIVE SUM
  //====================================================================================================
  // common
  common.in2_pad_cumv_sel2_rowlow = 1;
  common.in2_pad_cumv_sel2_rowhig =
      common.in2_pad_cumv_rows - common.in_rows - 1;
  common.in2_pad_cumv_sel2_collow = 1;
  common.in2_pad_cumv_sel2_colhig = common.in2_pad_cumv_cols;
  common.in2_sub_cumh_rows =
      common.in2_pad_cumv_sel2_rowhig - common.in2_pad_cumv_sel2_rowlow + 1;
  common.in2_sub_cumh_cols =
      common.in2_pad_cumv_sel2_colhig - common.in2_pad_cumv_sel2_collow + 1;
  common.in2_sub_cumh_elem =
      common.in2_sub_cumh_rows * common.in2_sub_cumh_cols;
  common.in2_sub_cumh_mem = sizeof(float) * common.in2_sub_cumh_elem;
  // pointers
  for (i = 0; i < common.allPoints; i++) {
    cudaMalloc((void **)&unique[i].d_in2_sub_cumh, common.in2_sub_cumh_mem);
  }
  //====================================================================================================
  // 	SELECTION
  //====================================================================================================
  // common
  common.in2_sub_cumh_sel_rowlow = 1;
  common.in2_sub_cumh_sel_rowhig = common.in2_sub_cumh_rows;
  common.in2_sub_cumh_sel_collow = 1 + common.in_cols;
  common.in2_sub_cumh_sel_colhig = common.in2_sub_cumh_cols - 1;
  common.in2_sub_cumh_sel_rows =
      common.in2_sub_cumh_sel_rowhig - common.in2_sub_cumh_sel_rowlow + 1;
  common.in2_sub_cumh_sel_cols =
      common.in2_sub_cumh_sel_colhig - common.in2_sub_cumh_sel_collow + 1;
  common.in2_sub_cumh_sel_elem =
      common.in2_sub_cumh_sel_rows * common.in2_sub_cumh_sel_cols;
  common.in2_sub_cumh_sel_mem = sizeof(float) * common.in2_sub_cumh_sel_elem;
  // pointers
  for (i = 0; i < common.allPoints; i++) {
    cudaMalloc((void **)&unique[i].d_in2_sub_cumh_sel,
               common.in2_sub_cumh_sel_mem);
  }
  //====================================================================================================
  //	SELECTION 2, SUBTRACTION
  //====================================================================================================
  // common
  common.in2_sub_cumh_sel2_rowlow = 1;
  common.in2_sub_cumh_sel2_rowhig = common.in2_sub_cumh_rows;
  common.in2_sub_cumh_sel2_collow = 1;
  common.in2_sub_cumh_sel2_colhig =
      common.in2_sub_cumh_cols - common.in_cols - 1;
  common.in2_sub2_rows =
      common.in2_sub_cumh_sel2_rowhig - common.in2_sub_cumh_sel2_rowlow + 1;
  common.in2_sub2_cols =
      common.in2_sub_cumh_sel2_colhig - common.in2_sub_cumh_sel2_collow + 1;
  common.in2_sub2_elem = common.in2_sub2_rows * common.in2_sub2_cols;
  common.in2_sub2_mem = sizeof(float) * common.in2_sub2_elem;
  // pointers
  for (i = 0; i < common.allPoints; i++) {
    cudaMalloc((void **)&unique[i].d_in2_sub2, common.in2_sub2_mem);
  }
  //======================================================================================================================================================
  //	CUMULATIVE SUM 2
  //======================================================================================================================================================
  //====================================================================================================
  //	MULTIPLICATION
  //====================================================================================================
  // common
  common.in2_sqr_rows = common.in2_rows;
  common.in2_sqr_cols = common.in2_cols;
  common.in2_sqr_elem = common.in2_elem;
  common.in2_sqr_mem = common.in2_mem;
  // pointers
  for (i = 0; i < common.allPoints; i++) {
    cudaMalloc((void **)&unique[i].d_in2_sqr, common.in2_sqr_mem);
  }
  //====================================================================================================
  //	SELECTION 2, SUBTRACTION
  //====================================================================================================
  // common
  common.in2_sqr_sub2_rows = common.in2_sub2_rows;
  common.in2_sqr_sub2_cols = common.in2_sub2_cols;
  common.in2_sqr_sub2_elem = common.in2_sub2_elem;
  common.in2_sqr_sub2_mem = common.in2_sub2_mem;
  // pointers
  for (i = 0; i < common.allPoints; i++) {
    cudaMalloc((void **)&unique[i].d_in2_sqr_sub2, common.in2_sqr_sub2_mem);
  }
  //======================================================================================================================================================
  //	FINAL
  //======================================================================================================================================================
  // common
  common.in_sqr_rows = common.in_rows;
  common.in_sqr_cols = common.in_cols;
  common.in_sqr_elem = common.in_elem;
  common.in_sqr_mem = common.in_mem;
  // pointers
  for (i = 0; i < common.allPoints; i++) {
    cudaMalloc((void **)&unique[i].d_in_sqr, common.in_sqr_mem);
  }
  //======================================================================================================================================================
  //	TEMPLATE MASK CREATE
  //======================================================================================================================================================
  // common
  common.tMask_rows = common.in_rows + (common.sSize + 1 + common.sSize) - 1;
  common.tMask_cols = common.tMask_rows;
  common.tMask_elem = common.tMask_rows * common.tMask_cols;
  common.tMask_mem = sizeof(float) * common.tMask_elem;
  // pointers
  for (i = 0; i < common.allPoints; i++) {
    cudaMalloc((void **)&unique[i].d_tMask, common.tMask_mem);
  }
  //======================================================================================================================================================
  //	POINT MASK INITIALIZE
  //======================================================================================================================================================
  // common
  common.mask_rows = common.maxMove;
  common.mask_cols = common.mask_rows;
  common.mask_elem = common.mask_rows * common.mask_cols;
  common.mask_mem = sizeof(float) * common.mask_elem;
  //======================================================================================================================================================
  //	MASK CONVOLUTION
  //======================================================================================================================================================
  // common
  common.mask_conv_rows = common.tMask_rows; // number of rows in I
  common.mask_conv_cols = common.tMask_cols; // number of columns in I
  common.mask_conv_elem =
      common.mask_conv_rows * common.mask_conv_cols; // number of elements
  common.mask_conv_mem = sizeof(float) * common.mask_conv_elem;
  common.mask_conv_ioffset = (common.mask_rows - 1) / 2;
  if ((common.mask_rows - 1) % 2 > 0.5) {
    common.mask_conv_ioffset = common.mask_conv_ioffset + 1;
  }
  common.mask_conv_joffset = (common.mask_cols - 1) / 2;
  if ((common.mask_cols - 1) % 2 > 0.5) {
    common.mask_conv_joffset = common.mask_conv_joffset + 1;
  }
  // pointers
  for (i = 0; i < common.allPoints; i++) {
    cudaMalloc((void **)&unique[i].d_mask_conv, common.mask_conv_mem);
  }
  //======================================================================================================================================================
  //	KERNEL
  //======================================================================================================================================================
  //====================================================================================================
  //	THREAD BLOCK
  //====================================================================================================
  // All kernels operations within kernel use same max size of threads. Size of
  // block size is set to the size appropriate for max size operation (on padded
  // matrix). Other use subsets of that.
  threads.x = NUMBER_THREADS; // define the number of threads in the block
  threads.y = 1;
  blocks.x = common.allPoints; // define the number of blocks in the grid
  blocks.y = 1;
  //====================================================================================================
  //	COPY ARGUMENTS
  //====================================================================================================
  cudaMemcpyToSymbol(d_common, &common, sizeof(params_common));
  cudaMemcpyToSymbol(d_unique, &unique, sizeof(params_unique) * ALL_POINTS);
  //====================================================================================================
  //	PRINT FRAME PROGRESS START
  //====================================================================================================
  printf("frame progress: ");
  fflush(NULL);
  //====================================================================================================
  //	LAUNCH
  //====================================================================================================
  for (common_change.frame_no = 0; common_change.frame_no < frames_processed;
       common_change.frame_no++) {
    printf("get frame\n");
    // Extract a cropped version of the first frame from the video file
    frame = get_frame(
        frames,                 // pointer to video file
        common_change.frame_no, // number of frame that needs to be returned
        0,                      // cropped?
        0,                      // scaled?
        1);                     // converted
    printf("memcpy\n");
    // copy frame to GPU memory
    cudaMemcpy(common_change.d_frame, frame, common.frame_mem,
               cudaMemcpyHostToDevice);
    printf("toSymbol\n");
    cudaMemcpyToSymbol(d_common_change, &common_change,
                       sizeof(params_common_change));
    // launch GPU kernel
    printf("launch\n");
    kernel<<<1, 32>>>();
    cudaDeviceSynchronize();
    printf("return\n");
    // free frame after each loop iteration, since AVI library allocates memory
    // for every frame fetched
    printf("free\n");
    free(frame);
    // print frame progress
    printf("%d ", common_change.frame_no);
    fflush(NULL);
  }
  //====================================================================================================
  //	PRINT FRAME PROGRESS END
  //====================================================================================================
  printf("\n");
  fflush(NULL);
  //====================================================================================================
  //	OUTPUT
  //====================================================================================================
  cudaMemcpy(common.tEndoRowLoc, common.d_tEndoRowLoc,
             common.endo_mem * common.no_frames, cudaMemcpyDeviceToHost);
  cudaMemcpy(common.tEndoColLoc, common.d_tEndoColLoc,
             common.endo_mem * common.no_frames, cudaMemcpyDeviceToHost);
  cudaMemcpy(common.tEpiRowLoc, common.d_tEpiRowLoc,
             common.epi_mem * common.no_frames, cudaMemcpyDeviceToHost);
  cudaMemcpy(common.tEpiColLoc, common.d_tEpiColLoc,
             common.epi_mem * common.no_frames, cudaMemcpyDeviceToHost);
 #ifdef OUTPUT
  //==================================================50
  //	DUMP DATA TO FILE
  //==================================================50
  write_data("result.txt", common.no_frames, frames_processed,
             common.endoPoints, common.tEndoRowLoc, common.tEndoColLoc,
             common.epiPoints, common.tEpiRowLoc, common.tEpiColLoc);
  //==================================================50
  //	End
  //==================================================50
 #endif
  //======================================================================================================================================================
  //	DEALLOCATION
  //======================================================================================================================================================
  //====================================================================================================
  //	COMMON
  //====================================================================================================
  // frame
  cudaFree(common_change.d_frame);
  // endo points
  free(common.endoRow);
  free(common.endoCol);
  free(common.tEndoRowLoc);
  free(common.tEndoColLoc);
  cudaFree(common.d_endoRow);
  cudaFree(common.d_endoCol);
  cudaFree(common.d_tEndoRowLoc);
  cudaFree(common.d_tEndoColLoc);
  cudaFree(common.d_endoT);
  // epi points
  free(common.epiRow);
  free(common.epiCol);
  free(common.tEpiRowLoc);
  free(common.tEpiColLoc);
  cudaFree(common.d_epiRow);
  cudaFree(common.d_epiCol);
  cudaFree(common.d_tEpiRowLoc);
  cudaFree(common.d_tEpiColLoc);
  cudaFree(common.d_epiT);
  //====================================================================================================
  //	POINTERS
  //====================================================================================================
  for (i = 0; i < common.allPoints; i++) {
    cudaFree(unique[i].d_in2);
    cudaFree(unique[i].d_conv);
    cudaFree(unique[i].d_in2_pad_cumv);
    cudaFree(unique[i].d_in2_pad_cumv_sel);
    cudaFree(unique[i].d_in2_sub_cumh);
    cudaFree(unique[i].d_in2_sub_cumh_sel);
    cudaFree(unique[i].d_in2_sub2);
    cudaFree(unique[i].d_in2_sqr);
    cudaFree(unique[i].d_in2_sqr_sub2);
    cudaFree(unique[i].d_in_sqr);
    cudaFree(unique[i].d_tMask);
    cudaFree(unique[i].d_mask_conv);
  }
 }
 //===============================================================================================================================================================================================================
 //===============================================================================================================================================================================================================
 //	MAIN FUNCTION
 //===============================================================================================================================================================================================================
 //===============================================================================================================================================================================================================
--- a/examples/heartwall/run.sh
+++ b/examples/heartwall/run.sh
@ -0,0 +1,17 @@
 #!/bin/bash
 # Build the heartwall example and run it: compile main.cu with clang++,
 # translate the device and host bitcode with kernelTranslator/hostTranslator,
 # lower both modules to objects with llc, link them against the x86 runtime
 # and thread pool libraries, then execute the resulting binary.
 #
 # Usage: ./run.sh [input.avi] [frames]
 # Environment overrides (defaults are the values that used to be hard-coded):
 #   COX_ROOT   checkout whose build/ tree holds the translators and runtime
 #   CUDA_PATH  CUDA toolkit installation passed to clang++
 #   CUDA_ARCH  GPU architecture passed to clang++; also part of the device
 #              bitcode file name produced by -save-temps
 #
 # Stop at the first failing step so stale or missing artifacts are never
 # linked or executed.
 set -euo pipefail
 COX_ROOT="${COX_ROOT:-/home/robinhan/repo/open_source_template}"
 CUDA_PATH="${CUDA_PATH:-/usr/local/cuda-10.1}"
 CUDA_ARCH="${CUDA_ARCH:-sm_61}"
 INPUT_VIDEO="${1:-${COX_ROOT}/runtime/examples/rodinia-data/heartwall/test.avi}"
 FRAMES="${2:-20}"
 # Build the AVI helper objects without leaving the current directory.
 make -C AVI
 # Only the bitcode files written by -save-temps are used below, so a failure
 # of this command (e.g. in its final link) is tolerated as before; the
 # translator steps abort the script if the bitcode is actually missing.
 clang++ -DOUTPUT main.cu -I./AVI --cuda-path="${CUDA_PATH}" --cuda-gpu-arch="${CUDA_ARCH}" -L"${CUDA_PATH}/lib64" -lcudart_static -ldl -lrt -pthread -save-temps -v || true
 "${COX_ROOT}/build/compilation/kernelTranslator" "main-cuda-nvptx64-nvidia-cuda-${CUDA_ARCH}.bc" kernel.bc
 "${COX_ROOT}/build/compilation/hostTranslator" main-host-x86_64-unknown-linux-gnu.bc host.bc
 llc --relocation-model=pic --filetype=obj kernel.bc
 llc --relocation-model=pic --filetype=obj host.bc
 g++ -Wall -L"${COX_ROOT}/build/runtime" -L"${COX_ROOT}/build/runtime/threadPool" -o heartwall -fPIC -no-pie host.o kernel.o ./AVI/avilib.o ./AVI/avimod.o -lc -lx86Runtime -lthreadPool -lpthread
 ./heartwall "${INPUT_VIDEO}" "${FRAMES}"
--- a/examples/heartwall/setdevice.cu
+++ b/examples/heartwall/setdevice.cu
@ -0,0 +1,5 @@
 //------------------------------------------------------------------------------
 // setdevice: selects device 0 through cudaSetDevice.
 // The status code returned by cudaSetDevice is not inspected.
 //------------------------------------------------------------------------------
 void setdevice(void) {
  const int device_index = 0;
  cudaSetDevice(device_index);
 }
--- a/examples/hotspot/hotspot-cuda-nvptx64-nvidia-cuda-sm_61.ll
+++ b/examples/hotspot/hotspot-cuda-nvptx64-nvidia-cuda-sm_61.ll
@ -0,0 +1,719 @@
 ; Textual LLVM IR of the module compiled from hotspot.cu for the
 ; nvptx64-nvidia-cuda target (see source_filename and target triple below).
 ; ModuleID = 'hotspot-cuda-nvptx64-nvidia-cuda-sm_61.bc'
 source_filename = "hotspot.cu"
 target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"
 ; Single-i8 struct types used as the types of the @blockIdx / @threadIdx globals.
 %struct.__cuda_builtin_blockIdx_t = type { i8 }
 %struct.__cuda_builtin_threadIdx_t = type { i8 }
 ; Layout of cudaFuncAttributes as seen by this module: three i64 fields
 ; followed by seven i32 fields.
 %struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
 ; COMDAT groups (selection kind "any") for the x/y accessor functions of the
 ; blockIdx and threadIdx builtin types.
 $_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
 $_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv = comdat any
 $_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
 $_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv = comdat any
 ; Three 16x16 float arrays with internal linkage in address space 3 (the
 ; shared address space in the NVPTX backend), left undef-initialized.
 ; The mangled names mark them as locals of calculate_temp named
 ; temp_on_cuda, power_on_cuda and temp_t.
@_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda = internal addrspace(3) global [16 x [16 x float]] undef, align 4
@_ZZ14calculate_tempiPfS_S_iiiiffffffE13power_on_cuda = internal addrspace(3) global [16 x [16 x float]] undef, align 4
@_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t = internal addrspace(3) global [16 x [16 x float]] undef, align 4
 ; extern_weak declarations of the blockIdx / threadIdx objects in address
 ; space 1; no initializer is provided in this module.
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
 ; Stub definition of cudaMalloc(i8** %p, i64 %s) inside this module.
 ; It allocates nothing: both arguments are only spilled to stack slots and the
 ; function unconditionally returns 999. The weak linkage allows another
 ; definition of the same symbol to take precedence at link time.
 ; NOTE(review): 999 presumably is cudaErrorUnknown in the CUDA headers used
 ; for this build -- confirm.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
 entry:
  ; Unoptimized argument spills; the stored values are never loaded again.
  %p.addr = alloca i8**, align 8
  %s.addr = alloca i64, align 8
  store i8** %p, i8*** %p.addr, align 8
  store i64 %s, i64* %s.addr, align 8
  ; Constant status code; %p is never written through.
  ret i32 999
 }
 ; Stub definition of cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c).
 ; The attributes structure behind %p is never written: the two arguments are
 ; spilled to stack slots and the function unconditionally returns 999.
 ; Weak linkage allows another definition of the symbol to take precedence.
 ; NOTE(review): 999 presumably is cudaErrorUnknown in the CUDA headers used
 ; for this build -- confirm.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
 entry:
  ; Unoptimized argument spills; the stored values are never loaded again.
  %p.addr = alloca %struct.cudaFuncAttributes*, align 8
  %c.addr = alloca i8*, align 8
  store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
  store i8* %c, i8** %c.addr, align 8
  ; Constant status code.
  ret i32 999
 }
 ; Stub definition of cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device).
 ; Nothing is stored through %value: the three arguments are spilled to stack
 ; slots and the function unconditionally returns 999.
 ; Weak linkage allows another definition of the symbol to take precedence.
 ; NOTE(review): 999 presumably is cudaErrorUnknown in the CUDA headers used
 ; for this build -- confirm.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
 entry:
  ; Unoptimized argument spills; the stored values are never loaded again.
  %value.addr = alloca i32*, align 8
  %attr.addr = alloca i32, align 4
  %device.addr = alloca i32, align 4
  store i32* %value, i32** %value.addr, align 8
  store i32 %attr, i32* %attr.addr, align 4
  store i32 %device, i32* %device.addr, align 4
  ; Constant status code.
  ret i32 999
 }
 ; Stub definition of cudaGetDevice(i32* %device).
 ; No device index is written through %device: the pointer is spilled to a
 ; stack slot and the function unconditionally returns 999.
 ; Weak linkage allows another definition of the symbol to take precedence.
 ; NOTE(review): 999 presumably is cudaErrorUnknown in the CUDA headers used
 ; for this build -- confirm.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
 entry:
  ; Unoptimized argument spill; the stored value is never loaded again.
  %device.addr = alloca i32*, align 8
  store i32* %device, i32** %device.addr, align 8
  ; Constant status code.
  ret i32 999
 }
 ; Stub definition of cudaOccupancyMaxActiveBlocksPerMultiprocessor(
 ;   i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize).
 ; No occupancy value is written through %numBlocks: the four arguments are
 ; spilled to stack slots and the function unconditionally returns 999.
 ; Weak linkage allows another definition of the symbol to take precedence.
 ; NOTE(review): 999 presumably is cudaErrorUnknown in the CUDA headers used
 ; for this build -- confirm.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
 entry:
  ; Unoptimized argument spills; the stored values are never loaded again.
  %numBlocks.addr = alloca i32*, align 8
  %func.addr = alloca i8*, align 8
  %blockSize.addr = alloca i32, align 4
  %dynamicSmemSize.addr = alloca i64, align 8
  store i32* %numBlocks, i32** %numBlocks.addr, align 8
  store i8* %func, i8** %func.addr, align 8
  store i32 %blockSize, i32* %blockSize.addr, align 4
  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
  ; Constant status code.
  ret i32 999
 }
 ; Stub definition of cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
 ;   i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags).
 ; No occupancy value is written through %numBlocks: the five arguments are
 ; spilled to stack slots and the function unconditionally returns 999.
 ; Weak linkage allows another definition of the symbol to take precedence.
 ; NOTE(review): 999 presumably is cudaErrorUnknown in the CUDA headers used
 ; for this build -- confirm.
 ; Function Attrs: convergent noinline nounwind optnone
 define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
 entry:
  ; Unoptimized argument spills; the stored values are never loaded again.
  %numBlocks.addr = alloca i32*, align 8
  %func.addr = alloca i8*, align 8
  %blockSize.addr = alloca i32, align 4
  %dynamicSmemSize.addr = alloca i64, align 8
  %flags.addr = alloca i32, align 4
  store i32* %numBlocks, i32** %numBlocks.addr, align 8
  store i8* %func, i8** %func.addr, align 8
  store i32 %blockSize, i32* %blockSize.addr, align 4
  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
  store i32 %flags, i32* %flags.addr, align 4
  ; Constant status code.
  ret i32 999
 }
 ; Function Attrs: convergent noinline nounwind optnone
 define dso_local void @_Z14calculate_tempiPfS_S_iiiiffffff(i32 %iteration, float* %power, float* %temp_src, float* %temp_dst, i32 %grid_cols, i32 %grid_rows, i32 %border_cols, i32 %border_rows, float %Cap, float %Rx, float %Ry, float %Rz, float %step, float %time_elapsed) #0 {
 entry:
  %iteration.addr = alloca i32, align 4
  %power.addr = alloca float*, align 8
  %temp_src.addr = alloca float*, align 8
  %temp_dst.addr = alloca float*, align 8
  %grid_cols.addr = alloca i32, align 4
  %grid_rows.addr = alloca i32, align 4
  %border_cols.addr = alloca i32, align 4
  %border_rows.addr = alloca i32, align 4
  %Cap.addr = alloca float, align 4
  %Rx.addr = alloca float, align 4
  %Ry.addr = alloca float, align 4
  %Rz.addr = alloca float, align 4
  %step.addr = alloca float, align 4
  %time_elapsed.addr = alloca float, align 4
  %amb_temp = alloca float, align 4
  %step_div_Cap = alloca float, align 4
  %Rx_1 = alloca float, align 4
  %Ry_1 = alloca float, align 4
  %Rz_1 = alloca float, align 4
  %bx = alloca i32, align 4
  %by = alloca i32, align 4
  %tx = alloca i32, align 4
  %ty = alloca i32, align 4
  %small_block_rows = alloca i32, align 4
  %small_block_cols = alloca i32, align 4
  %blkY = alloca i32, align 4
  %blkX = alloca i32, align 4
  %blkYmax = alloca i32, align 4
  %blkXmax = alloca i32, align 4
  %yidx = alloca i32, align 4
  %xidx = alloca i32, align 4
  %loadYidx = alloca i32, align 4
  %loadXidx = alloca i32, align 4
  %index = alloca i32, align 4
  %validYmin = alloca i32, align 4
  %validYmax = alloca i32, align 4
  %validXmin = alloca i32, align 4
  %validXmax = alloca i32, align 4
  %N = alloca i32, align 4
  %S = alloca i32, align 4
  %W = alloca i32, align 4
  %E = alloca i32, align 4
  %computed = alloca i8, align 1
  %i = alloca i32, align 4
  store i32 %iteration, i32* %iteration.addr, align 4
  store float* %power, float** %power.addr, align 8
  store float* %temp_src, float** %temp_src.addr, align 8
  store float* %temp_dst, float** %temp_dst.addr, align 8
  store i32 %grid_cols, i32* %grid_cols.addr, align 4
  store i32 %grid_rows, i32* %grid_rows.addr, align 4
  store i32 %border_cols, i32* %border_cols.addr, align 4
  store i32 %border_rows, i32* %border_rows.addr, align 4
  store float %Cap, float* %Cap.addr, align 4
  store float %Rx, float* %Rx.addr, align 4
  store float %Ry, float* %Ry.addr, align 4
  store float %Rz, float* %Rz.addr, align 4
  store float %step, float* %step.addr, align 4
  store float %time_elapsed, float* %time_elapsed.addr, align 4
  store float 8.000000e+01, float* %amb_temp, align 4
  %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
  store i32 %call, i32* %bx, align 4
  %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #2
  store i32 %call1, i32* %by, align 4
  %call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
  store i32 %call2, i32* %tx, align 4
  %call3 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #2
  store i32 %call3, i32* %ty, align 4
  %0 = load float, float* %step.addr, align 4
  %1 = load float, float* %Cap.addr, align 4
  %div = fdiv float %0, %1
  store float %div, float* %step_div_Cap, align 4
  %2 = load float, float* %Rx.addr, align 4
  %div4 = fdiv float 1.000000e+00, %2
  store float %div4, float* %Rx_1, align 4
  %3 = load float, float* %Ry.addr, align 4
  %div5 = fdiv float 1.000000e+00, %3
  store float %div5, float* %Ry_1, align 4
  %4 = load float, float* %Rz.addr, align 4
  %div6 = fdiv float 1.000000e+00, %4
  store float %div6, float* %Rz_1, align 4
  %5 = load i32, i32* %iteration.addr, align 4
  %mul = mul nsw i32 %5, 2
  %sub = sub nsw i32 16, %mul
  store i32 %sub, i32* %small_block_rows, align 4
  %6 = load i32, i32* %iteration.addr, align 4
  %mul7 = mul nsw i32 %6, 2
  %sub8 = sub nsw i32 16, %mul7
  store i32 %sub8, i32* %small_block_cols, align 4
  %7 = load i32, i32* %small_block_rows, align 4
  %8 = load i32, i32* %by, align 4
  %mul9 = mul nsw i32 %7, %8
  %9 = load i32, i32* %border_rows.addr, align 4
  %sub10 = sub nsw i32 %mul9, %9
  store i32 %sub10, i32* %blkY, align 4
  %10 = load i32, i32* %small_block_cols, align 4
  %11 = load i32, i32* %bx, align 4
  %mul11 = mul nsw i32 %10, %11
  %12 = load i32, i32* %border_cols.addr, align 4
  %sub12 = sub nsw i32 %mul11, %12
  store i32 %sub12, i32* %blkX, align 4
  %13 = load i32, i32* %blkY, align 4
  %add = add nsw i32 %13, 16
  %sub13 = sub nsw i32 %add, 1
  store i32 %sub13, i32* %blkYmax, align 4
  %14 = load i32, i32* %blkX, align 4
  %add14 = add nsw i32 %14, 16
  %sub15 = sub nsw i32 %add14, 1
  store i32 %sub15, i32* %blkXmax, align 4
  %15 = load i32, i32* %blkY, align 4
  %16 = load i32, i32* %ty, align 4
  %add16 = add nsw i32 %15, %16
  store i32 %add16, i32* %yidx, align 4
  %17 = load i32, i32* %blkX, align 4
  %18 = load i32, i32* %tx, align 4
  %add17 = add nsw i32 %17, %18
  store i32 %add17, i32* %xidx, align 4
  %19 = load i32, i32* %yidx, align 4
  store i32 %19, i32* %loadYidx, align 4
  %20 = load i32, i32* %xidx, align 4
  store i32 %20, i32* %loadXidx, align 4
  %21 = load i32, i32* %grid_cols.addr, align 4
  %22 = load i32, i32* %loadYidx, align 4
  %mul18 = mul nsw i32 %21, %22
  %23 = load i32, i32* %loadXidx, align 4
  %add19 = add nsw i32 %mul18, %23
  store i32 %add19, i32* %index, align 4
  %24 = load i32, i32* %loadYidx, align 4
  %cmp = icmp sge i32 %24, 0
  br i1 %cmp, label %land.lhs.true, label %if.end
 land.lhs.true:                                    ; preds = %entry
  %25 = load i32, i32* %loadYidx, align 4
  %26 = load i32, i32* %grid_rows.addr, align 4
  %sub20 = sub nsw i32 %26, 1
  %cmp21 = icmp sle i32 %25, %sub20
  br i1 %cmp21, label %land.lhs.true22, label %if.end
 land.lhs.true22:                                  ; preds = %land.lhs.true
  %27 = load i32, i32* %loadXidx, align 4
  %cmp23 = icmp sge i32 %27, 0
  br i1 %cmp23, label %land.lhs.true24, label %if.end
 land.lhs.true24:                                  ; preds = %land.lhs.true22
  %28 = load i32, i32* %loadXidx, align 4
  %29 = load i32, i32* %grid_cols.addr, align 4
  %sub25 = sub nsw i32 %29, 1
  %cmp26 = icmp sle i32 %28, %sub25
  br i1 %cmp26, label %if.then, label %if.end
 if.then:                                          ; preds = %land.lhs.true24
  %30 = load float*, float** %temp_src.addr, align 8
  %31 = load i32, i32* %index, align 4
  %idxprom = sext i32 %31 to i64
  %arrayidx = getelementptr inbounds float, float* %30, i64 %idxprom
  %32 = load float, float* %arrayidx, align 4
  %33 = load i32, i32* %ty, align 4
  %idxprom27 = sext i32 %33 to i64
  %arrayidx28 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom27
  %34 = load i32, i32* %tx, align 4
  %idxprom29 = sext i32 %34 to i64
  %arrayidx30 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx28, i64 0, i64 %idxprom29
  store float %32, float* %arrayidx30, align 4
  %35 = load float*, float** %power.addr, align 8
  %36 = load i32, i32* %index, align 4
  %idxprom31 = sext i32 %36 to i64
  %arrayidx32 = getelementptr inbounds float, float* %35, i64 %idxprom31
  %37 = load float, float* %arrayidx32, align 4
  %38 = load i32, i32* %ty, align 4
  %idxprom33 = sext i32 %38 to i64
  %arrayidx34 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE13power_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom33
  %39 = load i32, i32* %tx, align 4
  %idxprom35 = sext i32 %39 to i64
  %arrayidx36 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx34, i64 0, i64 %idxprom35
  store float %37, float* %arrayidx36, align 4
  br label %if.end
 if.end:                                           ; preds = %if.then, %land.lhs.true24, %land.lhs.true22, %land.lhs.true, %entry
  call void @llvm.nvvm.barrier0()
  %40 = load i32, i32* %blkY, align 4
  %cmp37 = icmp slt i32 %40, 0
  br i1 %cmp37, label %cond.true, label %cond.false
 cond.true:                                        ; preds = %if.end
  %41 = load i32, i32* %blkY, align 4
  %sub38 = sub nsw i32 0, %41
  br label %cond.end
 cond.false:                                       ; preds = %if.end
  br label %cond.end
 cond.end:                                         ; preds = %cond.false, %cond.true
  %cond = phi i32 [ %sub38, %cond.true ], [ 0, %cond.false ]
  store i32 %cond, i32* %validYmin, align 4
  %42 = load i32, i32* %blkYmax, align 4
  %43 = load i32, i32* %grid_rows.addr, align 4
  %sub39 = sub nsw i32 %43, 1
  %cmp40 = icmp sgt i32 %42, %sub39
  br i1 %cmp40, label %cond.true41, label %cond.false45
 cond.true41:                                      ; preds = %cond.end
  %44 = load i32, i32* %blkYmax, align 4
  %45 = load i32, i32* %grid_rows.addr, align 4
  %sub42 = sub nsw i32 %44, %45
  %add43 = add nsw i32 %sub42, 1
  %sub44 = sub nsw i32 15, %add43
  br label %cond.end46
 cond.false45:                                     ; preds = %cond.end
  br label %cond.end46
 cond.end46:                                       ; preds = %cond.false45, %cond.true41
  %cond47 = phi i32 [ %sub44, %cond.true41 ], [ 15, %cond.false45 ]
  store i32 %cond47, i32* %validYmax, align 4
  %46 = load i32, i32* %blkX, align 4
  %cmp48 = icmp slt i32 %46, 0
  br i1 %cmp48, label %cond.true49, label %cond.false51
 cond.true49:                                      ; preds = %cond.end46
  %47 = load i32, i32* %blkX, align 4
  %sub50 = sub nsw i32 0, %47
  br label %cond.end52
 cond.false51:                                     ; preds = %cond.end46
  br label %cond.end52
 cond.end52:                                       ; preds = %cond.false51, %cond.true49
  %cond53 = phi i32 [ %sub50, %cond.true49 ], [ 0, %cond.false51 ]
  store i32 %cond53, i32* %validXmin, align 4
  %48 = load i32, i32* %blkXmax, align 4
  %49 = load i32, i32* %grid_cols.addr, align 4
  %sub54 = sub nsw i32 %49, 1
  %cmp55 = icmp sgt i32 %48, %sub54
  br i1 %cmp55, label %cond.true56, label %cond.false60
 cond.true56:                                      ; preds = %cond.end52
  %50 = load i32, i32* %blkXmax, align 4
  %51 = load i32, i32* %grid_cols.addr, align 4
  %sub57 = sub nsw i32 %50, %51
  %add58 = add nsw i32 %sub57, 1
  %sub59 = sub nsw i32 15, %add58
  br label %cond.end61
 cond.false60:                                     ; preds = %cond.end52
  br label %cond.end61
 cond.end61:                                       ; preds = %cond.false60, %cond.true56
  %cond62 = phi i32 [ %sub59, %cond.true56 ], [ 15, %cond.false60 ]
  store i32 %cond62, i32* %validXmax, align 4
  %52 = load i32, i32* %ty, align 4
  %sub63 = sub nsw i32 %52, 1
  store i32 %sub63, i32* %N, align 4
  %53 = load i32, i32* %ty, align 4
  %add64 = add nsw i32 %53, 1
  store i32 %add64, i32* %S, align 4
  %54 = load i32, i32* %tx, align 4
  %sub65 = sub nsw i32 %54, 1
  store i32 %sub65, i32* %W, align 4
  %55 = load i32, i32* %tx, align 4
  %add66 = add nsw i32 %55, 1
  store i32 %add66, i32* %E, align 4
  %56 = load i32, i32* %N, align 4
  %57 = load i32, i32* %validYmin, align 4
  %cmp67 = icmp slt i32 %56, %57
  br i1 %cmp67, label %cond.true68, label %cond.false69
 cond.true68:                                      ; preds = %cond.end61
  %58 = load i32, i32* %validYmin, align 4
  br label %cond.end70
 cond.false69:                                     ; preds = %cond.end61
  %59 = load i32, i32* %N, align 4
  br label %cond.end70
 cond.end70:                                       ; preds = %cond.false69, %cond.true68
  %cond71 = phi i32 [ %58, %cond.true68 ], [ %59, %cond.false69 ]
  store i32 %cond71, i32* %N, align 4
  %60 = load i32, i32* %S, align 4
  %61 = load i32, i32* %validYmax, align 4
  %cmp72 = icmp sgt i32 %60, %61
  br i1 %cmp72, label %cond.true73, label %cond.false74
 cond.true73:                                      ; preds = %cond.end70
  %62 = load i32, i32* %validYmax, align 4
  br label %cond.end75
 cond.false74:                                     ; preds = %cond.end70
  %63 = load i32, i32* %S, align 4
  br label %cond.end75
 cond.end75:                                       ; preds = %cond.false74, %cond.true73
  %cond76 = phi i32 [ %62, %cond.true73 ], [ %63, %cond.false74 ]
  store i32 %cond76, i32* %S, align 4
  %64 = load i32, i32* %W, align 4
  %65 = load i32, i32* %validXmin, align 4
  %cmp77 = icmp slt i32 %64, %65
  br i1 %cmp77, label %cond.true78, label %cond.false79
 cond.true78:                                      ; preds = %cond.end75
  %66 = load i32, i32* %validXmin, align 4
  br label %cond.end80
 cond.false79:                                     ; preds = %cond.end75
  %67 = load i32, i32* %W, align 4
  br label %cond.end80
 cond.end80:                                       ; preds = %cond.false79, %cond.true78
  %cond81 = phi i32 [ %66, %cond.true78 ], [ %67, %cond.false79 ]
  store i32 %cond81, i32* %W, align 4
  %68 = load i32, i32* %E, align 4
  %69 = load i32, i32* %validXmax, align 4
  %cmp82 = icmp sgt i32 %68, %69
  br i1 %cmp82, label %cond.true83, label %cond.false84
 cond.true83:                                      ; preds = %cond.end80
  %70 = load i32, i32* %validXmax, align 4
  br label %cond.end85
 cond.false84:                                     ; preds = %cond.end80
  %71 = load i32, i32* %E, align 4
  br label %cond.end85
 cond.end85:                                       ; preds = %cond.false84, %cond.true83
  %cond86 = phi i32 [ %70, %cond.true83 ], [ %71, %cond.false84 ]
  store i32 %cond86, i32* %E, align 4
  store i32 0, i32* %i, align 4
  br label %for.cond
 for.cond:                                         ; preds = %for.inc, %cond.end85
  %72 = load i32, i32* %i, align 4
  %73 = load i32, i32* %iteration.addr, align 4
  %cmp87 = icmp slt i32 %72, %73
  br i1 %cmp87, label %for.body, label %for.end
 for.body:                                         ; preds = %for.cond
  store i8 0, i8* %computed, align 1
  %74 = load i32, i32* %tx, align 4
  %75 = load i32, i32* %i, align 4
  %add88 = add nsw i32 %75, 1
  %cmp89 = icmp sge i32 %74, %add88
  br i1 %cmp89, label %land.lhs.true90, label %if.end175
 land.lhs.true90:                                  ; preds = %for.body
  %76 = load i32, i32* %tx, align 4
  %77 = load i32, i32* %i, align 4
  %sub91 = sub nsw i32 16, %77
  %sub92 = sub nsw i32 %sub91, 2
  %cmp93 = icmp sle i32 %76, %sub92
  br i1 %cmp93, label %land.lhs.true94, label %if.end175
 land.lhs.true94:                                  ; preds = %land.lhs.true90
  %78 = load i32, i32* %ty, align 4
  %79 = load i32, i32* %i, align 4
  %add95 = add nsw i32 %79, 1
  %cmp96 = icmp sge i32 %78, %add95
  br i1 %cmp96, label %land.lhs.true97, label %if.end175
 land.lhs.true97:                                  ; preds = %land.lhs.true94
  %80 = load i32, i32* %ty, align 4
  %81 = load i32, i32* %i, align 4
  %sub98 = sub nsw i32 16, %81
  %sub99 = sub nsw i32 %sub98, 2
  %cmp100 = icmp sle i32 %80, %sub99
  br i1 %cmp100, label %land.lhs.true101, label %if.end175
 land.lhs.true101:                                 ; preds = %land.lhs.true97
  %82 = load i32, i32* %tx, align 4
  %83 = load i32, i32* %validXmin, align 4
  %cmp102 = icmp sge i32 %82, %83
  br i1 %cmp102, label %land.lhs.true103, label %if.end175
 land.lhs.true103:                                 ; preds = %land.lhs.true101
  %84 = load i32, i32* %tx, align 4
  %85 = load i32, i32* %validXmax, align 4
  %cmp104 = icmp sle i32 %84, %85
  br i1 %cmp104, label %land.lhs.true105, label %if.end175
 land.lhs.true105:                                 ; preds = %land.lhs.true103
  %86 = load i32, i32* %ty, align 4
  %87 = load i32, i32* %validYmin, align 4
  %cmp106 = icmp sge i32 %86, %87
  br i1 %cmp106, label %land.lhs.true107, label %if.end175
 land.lhs.true107:                                 ; preds = %land.lhs.true105
  %88 = load i32, i32* %ty, align 4
  %89 = load i32, i32* %validYmax, align 4
  %cmp108 = icmp sle i32 %88, %89
  br i1 %cmp108, label %if.then109, label %if.end175
 if.then109:                                       ; preds = %land.lhs.true107
  store i8 1, i8* %computed, align 1
  %90 = load i32, i32* %ty, align 4
  %idxprom110 = sext i32 %90 to i64
  %arrayidx111 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom110
  %91 = load i32, i32* %tx, align 4
  %idxprom112 = sext i32 %91 to i64
  %arrayidx113 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx111, i64 0, i64 %idxprom112
  %92 = load float, float* %arrayidx113, align 4
  %conv = fpext float %92 to double
  %93 = load float, float* %step_div_Cap, align 4
  %conv114 = fpext float %93 to double
  %94 = load i32, i32* %ty, align 4
  %idxprom115 = sext i32 %94 to i64
  %arrayidx116 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE13power_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom115
  %95 = load i32, i32* %tx, align 4
  %idxprom117 = sext i32 %95 to i64
  %arrayidx118 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx116, i64 0, i64 %idxprom117
  %96 = load float, float* %arrayidx118, align 4
  %conv119 = fpext float %96 to double
  %97 = load i32, i32* %S, align 4
  %idxprom120 = sext i32 %97 to i64
  %arrayidx121 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom120
  %98 = load i32, i32* %tx, align 4
  %idxprom122 = sext i32 %98 to i64
  %arrayidx123 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx121, i64 0, i64 %idxprom122
  %99 = load float, float* %arrayidx123, align 4
  %100 = load i32, i32* %N, align 4
  %idxprom124 = sext i32 %100 to i64
  %arrayidx125 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom124
  %101 = load i32, i32* %tx, align 4
  %idxprom126 = sext i32 %101 to i64
  %arrayidx127 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx125, i64 0, i64 %idxprom126
  %102 = load float, float* %arrayidx127, align 4
  %add128 = fadd contract float %99, %102
  %conv129 = fpext float %add128 to double
  %103 = load i32, i32* %ty, align 4
  %idxprom130 = sext i32 %103 to i64
  %arrayidx131 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom130
  %104 = load i32, i32* %tx, align 4
  %idxprom132 = sext i32 %104 to i64
  %arrayidx133 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx131, i64 0, i64 %idxprom132
  %105 = load float, float* %arrayidx133, align 4
  %conv134 = fpext float %105 to double
  %mul135 = fmul contract double 2.000000e+00, %conv134
  %sub136 = fsub contract double %conv129, %mul135
  %106 = load float, float* %Ry_1, align 4
  %conv137 = fpext float %106 to double
  %mul138 = fmul contract double %sub136, %conv137
  %add139 = fadd contract double %conv119, %mul138
  %107 = load i32, i32* %ty, align 4
  %idxprom140 = sext i32 %107 to i64
  %arrayidx141 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom140
  %108 = load i32, i32* %E, align 4
  %idxprom142 = sext i32 %108 to i64
  %arrayidx143 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx141, i64 0, i64 %idxprom142
  %109 = load float, float* %arrayidx143, align 4
  %110 = load i32, i32* %ty, align 4
  %idxprom144 = sext i32 %110 to i64
  %arrayidx145 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom144
  %111 = load i32, i32* %W, align 4
  %idxprom146 = sext i32 %111 to i64
  %arrayidx147 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx145, i64 0, i64 %idxprom146
  %112 = load float, float* %arrayidx147, align 4
  %add148 = fadd contract float %109, %112
  %conv149 = fpext float %add148 to double
  %113 = load i32, i32* %ty, align 4
  %idxprom150 = sext i32 %113 to i64
  %arrayidx151 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom150
  %114 = load i32, i32* %tx, align 4
  %idxprom152 = sext i32 %114 to i64
  %arrayidx153 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx151, i64 0, i64 %idxprom152
  %115 = load float, float* %arrayidx153, align 4
  %conv154 = fpext float %115 to double
  %mul155 = fmul contract double 2.000000e+00, %conv154
  %sub156 = fsub contract double %conv149, %mul155
  %116 = load float, float* %Rx_1, align 4
  %conv157 = fpext float %116 to double
  %mul158 = fmul contract double %sub156, %conv157
  %add159 = fadd contract double %add139, %mul158
  %117 = load float, float* %amb_temp, align 4
  %118 = load i32, i32* %ty, align 4
  %idxprom160 = sext i32 %118 to i64
  %arrayidx161 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom160
  %119 = load i32, i32* %tx, align 4
  %idxprom162 = sext i32 %119 to i64
  %arrayidx163 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx161, i64 0, i64 %idxprom162
  %120 = load float, float* %arrayidx163, align 4
  %sub164 = fsub contract float %117, %120
  %121 = load float, float* %Rz_1, align 4
  %mul165 = fmul contract float %sub164, %121
  %conv166 = fpext float %mul165 to double
  %add167 = fadd contract double %add159, %conv166
  %mul168 = fmul contract double %conv114, %add167
  %add169 = fadd contract double %conv, %mul168
  %conv170 = fptrunc double %add169 to float
  %122 = load i32, i32* %ty, align 4
  %idxprom171 = sext i32 %122 to i64
  %arrayidx172 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t to [16 x [16 x float]]*), i64 0, i64 %idxprom171
  %123 = load i32, i32* %tx, align 4
  %idxprom173 = sext i32 %123 to i64
  %arrayidx174 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx172, i64 0, i64 %idxprom173
  store float %conv170, float* %arrayidx174, align 4
  br label %if.end175
 if.end175:                                        ; preds = %if.then109, %land.lhs.true107, %land.lhs.true105, %land.lhs.true103, %land.lhs.true101, %land.lhs.true97, %land.lhs.true94, %land.lhs.true90, %for.body
  call void @llvm.nvvm.barrier0()
  %124 = load i32, i32* %i, align 4
  %125 = load i32, i32* %iteration.addr, align 4
  %sub176 = sub nsw i32 %125, 1
  %cmp177 = icmp eq i32 %124, %sub176
  br i1 %cmp177, label %if.then178, label %if.end179
 if.then178:                                       ; preds = %if.end175
  br label %for.end
 if.end179:                                        ; preds = %if.end175
  %126 = load i8, i8* %computed, align 1
  %tobool = trunc i8 %126 to i1
  br i1 %tobool, label %if.then180, label %if.end189
 if.then180:                                       ; preds = %if.end179
  %127 = load i32, i32* %ty, align 4
  %idxprom181 = sext i32 %127 to i64
  %arrayidx182 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t to [16 x [16 x float]]*), i64 0, i64 %idxprom181
  %128 = load i32, i32* %tx, align 4
  %idxprom183 = sext i32 %128 to i64
  %arrayidx184 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx182, i64 0, i64 %idxprom183
  %129 = load float, float* %arrayidx184, align 4
  %130 = load i32, i32* %ty, align 4
  %idxprom185 = sext i32 %130 to i64
  %arrayidx186 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom185
  %131 = load i32, i32* %tx, align 4
  %idxprom187 = sext i32 %131 to i64
  %arrayidx188 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx186, i64 0, i64 %idxprom187
  store float %129, float* %arrayidx188, align 4
  br label %if.end189
 if.end189:                                        ; preds = %if.then180, %if.end179
  call void @llvm.nvvm.barrier0()
  br label %for.inc
 for.inc:                                          ; preds = %if.end189
  %132 = load i32, i32* %i, align 4
  %inc = add nsw i32 %132, 1
  store i32 %inc, i32* %i, align 4
  br label %for.cond
 for.end:                                          ; preds = %if.then178, %for.cond
  %133 = load i8, i8* %computed, align 1
  %tobool190 = trunc i8 %133 to i1
  br i1 %tobool190, label %if.then191, label %if.end198
 if.then191:                                       ; preds = %for.end
  %134 = load i32, i32* %ty, align 4
  %idxprom192 = sext i32 %134 to i64
  %arrayidx193 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t to [16 x [16 x float]]*), i64 0, i64 %idxprom192
  %135 = load i32, i32* %tx, align 4
  %idxprom194 = sext i32 %135 to i64
  %arrayidx195 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx193, i64 0, i64 %idxprom194
  %136 = load float, float* %arrayidx195, align 4
  %137 = load float*, float** %temp_dst.addr, align 8
  %138 = load i32, i32* %index, align 4
  %idxprom196 = sext i32 %138 to i64
  %arrayidx197 = getelementptr inbounds float, float* %137, i64 %idxprom196
  store float %136, float* %arrayidx197, align 4
  br label %if.end198
 if.end198:                                        ; preds = %if.then191, %for.end
  ret void
 }
 ; Function Attrs: alwaysinline convergent nounwind
 ; __cuda_builtin_blockIdx_t::__fetch_builtin_x(): returns the value read by
 ; the llvm.nvvm.read.ptx.sreg.ctaid.x intrinsic (used for blockIdx.x).
 define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
 entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
  ret i32 %0
 }
 ; Function Attrs: alwaysinline convergent nounwind
 ; __cuda_builtin_blockIdx_t::__fetch_builtin_y(): returns the value read by
 ; the llvm.nvvm.read.ptx.sreg.ctaid.y intrinsic (used for blockIdx.y).
 define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
 entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
  ret i32 %0
 }
 ; Function Attrs: alwaysinline convergent nounwind
 ; __cuda_builtin_threadIdx_t::__fetch_builtin_x(): returns the value read by
 ; the llvm.nvvm.read.ptx.sreg.tid.x intrinsic (used for threadIdx.x).
 define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
 entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  ret i32 %0
 }
 ; Function Attrs: alwaysinline convergent nounwind
 ; __cuda_builtin_threadIdx_t::__fetch_builtin_y(): returns the value read by
 ; the llvm.nvvm.read.ptx.sreg.tid.y intrinsic (used for threadIdx.y).
 define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
 entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
  ret i32 %0
 }
 ; Function Attrs: convergent nounwind
 declare void @llvm.nvvm.barrier0() #2
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #3
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #3
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #3
 attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #2 = { convergent nounwind }
 attributes #3 = { nounwind readnone }
 !llvm.module.flags = !{!0, !1, !2}
 !nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
 !llvm.ident = !{!8}
 !nvvmir.version = !{!9}
 !0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
 !1 = !{i32 1, !"wchar_size", i32 4}
 !2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
 !3 = !{void (i32, float*, float*, float*, i32, i32, i32, i32, float, float, float, float, float, float)* @_Z14calculate_tempiPfS_S_iiiiffffff, !"kernel", i32 1}
 !4 = !{null, !"align", i32 8}
 !5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
 !6 = !{null, !"align", i32 16}
 !7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
 !8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
 !9 = !{i32 1, i32 4}
--- a/examples/hotspot/hotspot-host-x86_64-unknown-linux-gnu.ll
+++ b/examples/hotspot/hotspot-host-x86_64-unknown-linux-gnu.ll
--- a/examples/hotspot/hotspot.cu
+++ b/examples/hotspot/hotspot.cu
@ -0,0 +1,353 @@
 #include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <time.h>
 /* Edge length of the square thread block (and of the shared-memory tiles in
  * the kernel). Taken from the first of RD_WG_SIZE_0_0, RD_WG_SIZE_0 or
  * RD_WG_SIZE that is defined at compile time; defaults to 16 otherwise. */
 #ifdef RD_WG_SIZE_0_0
 #define BLOCK_SIZE RD_WG_SIZE_0_0
 #elif defined(RD_WG_SIZE_0)
 #define BLOCK_SIZE RD_WG_SIZE_0
 #elif defined(RD_WG_SIZE)
 #define BLOCK_SIZE RD_WG_SIZE
 #else
 #define BLOCK_SIZE 16
 #endif
 /* size of the line buffer used when reading/writing the text data files */
 #define STR_SIZE 256
 /* maximum power density possible (say 300W for a 10mm x 10mm chip) */
 #define MAX_PD (3.0e6)
 /* required precision in degrees */
 #define PRECISION 0.001
 /* NOTE(review): presumably the specific heat and thermal conductivity of the
  * chip material in SI units -- confirm; both feed the Cap/Rx/Ry/Rz formulas
  * in compute_tran_temp(). */
 #define SPEC_HEAT_SI 1.75e6
 #define K_SI 100
 /* capacitance fitting factor */
 #define FACTOR_CHIP 0.5
 /* chip parameters (units are not stated in this file) */
 float t_chip = 0.0005;
 float chip_height = 0.016;
 float chip_width = 0.016;
 /* ambient temperature, assuming no package at all.
  * Note: the kernel declares its own local amb_temp and does not read this
  * global. */
 float amb_temp = 80.0;
 /* forward declaration: run() is defined after main() */
 void run(int argc, char **argv);
 /* define timer macros.
  * NOTE(review): startCycle()/stopCycle() are not defined in this file as
  * shown and none of these macros is used here; "%Lu" is also not a standard
  * printf conversion for integers. */
 #define pin_stats_reset() startCycle()
 #define pin_stats_pause(cycles) stopCycle(cycles)
 #define pin_stats_dump(cycles) printf("timer: %Lu\n", cycles)
 /* Print `s` to stderr as "error: <s>\n".
  * Despite the name, this function returns to its caller; it does not
  * terminate the program. The parameter is const-qualified so that string
  * literals (which is what every caller passes) convert legally in C++11+. */
 void fatal(const char *s) { fprintf(stderr, "error: %s\n", s); }
 /* Write a row-major grid_rows x grid_cols float matrix to the text file
  * `file`, one record per element in the form "<linear index>\t<value>\n"
  * (value formatted with %g).
  *
  * If the file cannot be opened, a message is printed to stdout and the
  * function returns without writing anything. (Previously execution fell
  * through and handed a null stream to fputs()/fclose().) */
 void writeoutput(float *vect, int grid_rows, int grid_cols, char *file) {
  FILE *fp = fopen(file, "w");
  if (fp == NULL) {
    printf("The file was not opened\n");
    return;
  }
  int index = 0; /* running linear index, equal to i * grid_cols + j */
  for (int i = 0; i < grid_rows; i++)
    for (int j = 0; j < grid_cols; j++) {
      /* format straight into the stream; no fixed-size staging buffer */
      fprintf(fp, "%d\t%g\n", index, vect[i * grid_cols + j]);
      index++;
    }
  fclose(fp);
 }
 /* Fill the row-major grid_rows x grid_cols matrix `vect` from the text file
  * `file`, which must hold one float per line (only the first number on each
  * line is parsed).
  *
  * Error handling follows the file's report-and-continue style: the problem
  * is reported and reading stops, leaving the remaining elements of `vect`
  * untouched.
  *   - file cannot be opened: message on stdout (previously a null stream was
  *     then passed to fgets()/fclose());
  *   - fewer lines than elements: detected from fgets() returning NULL
  *     (previously feof() was tested after the read, which ignored a failed
  *     read's stale buffer and misreported a valid final line that lacks a
  *     trailing newline);
  *   - a line that does not start with a number: reported via fatal()
  *     (previously an indeterminate `val` was still stored).
  * An earlier variant of the parser also expected a leading element index on
  * each line; that check is not performed. */
 void readinput(float *vect, int grid_rows, int grid_cols, char *file) {
  FILE *fp = fopen(file, "r");
  if (fp == NULL) {
    printf("The file was not opened\n");
    return;
  }
  char str[STR_SIZE];
  for (int i = 0; i < grid_rows; i++)
    for (int j = 0; j < grid_cols; j++) {
      float val;
      if (fgets(str, STR_SIZE, fp) == NULL) {
        fatal("not enough lines in file");
        fclose(fp);
        return;
      }
      if (sscanf(str, "%f", &val) != 1) {
        fatal("invalid file format");
        fclose(fp);
        return;
      }
      vect[i * grid_cols + j] = val;
    }
  fclose(fp);
 }
 /* True when min <= x <= max (both bounds inclusive). x is evaluated twice. */
 #define IN_RANGE(x, min, max) ((x) >= (min) && (x) <= (max))
 /* Clamp the lvalue x into [min, max] in place; the expression yields the
  * clamped value. Every argument and the whole expansion are parenthesised so
  * the macro composes safely inside larger expressions. Arguments may be
  * evaluated more than once. */
 #define CLAMP_RANGE(x, min, max)                                               \
  ((x) = ((x) < (min)) ? (min) : (((x) > (max)) ? (max) : (x)))
 /* Smaller of a and b. Arguments may be evaluated more than once. */
 #define MIN(a, b) ((a) <= (b) ? (a) : (b))
 /*
  * Stencil kernel: each thread block loads a BLOCK_SIZE x BLOCK_SIZE tile of
  * the temperature and power grids into shared memory, applies `iteration`
  * update steps to it in shared memory, and writes results to temp_dst.
  *
  * Each step only updates threads at least i + 1 cells away from the tile
  * edge, so the updated region shrinks by one cell per side per step. The
  * flag `computed` records whether this thread took part in the most recent
  * step; only threads with the flag set after the last step write to global
  * memory.
  *
  * The tile origin is (small_block * blockIdx - border), so neighbouring
  * blocks' tiles overlap and a tile may extend past the grid; validXmin/max
  * and validYmin/max restrict work to the part of the tile that lies inside
  * the grid.
  *
  * NOTE(review): `time_elapsed` is not referenced in the body, and the local
  * `amb_temp` is a separate variable from the file-scope one of the same name.
  */
 __global__ void calculate_temp(int iteration,   // number of update steps to run
                                float *power,    // power input (read only here)
                                float *temp_src, // temperature input (read only here)
                                float *temp_dst, // temperature output (written only here)
                                int grid_cols,   // Col of grid
                                int grid_rows,   // Row of grid
                                int border_cols, // border offset
                                int border_rows, // border offset
                                float Cap,       // Capacitance
                                float Rx, float Ry, float Rz, float step,
                                float time_elapsed) {
  // per-block staging tiles: current temperature, power, and next temperature
  __shared__ float temp_on_cuda[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ float power_on_cuda[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ float temp_t[BLOCK_SIZE]
                         [BLOCK_SIZE]; // saving temporary temperature result
  float amb_temp = 80.0;
  float step_div_Cap;
  float Rx_1, Ry_1, Rz_1;
  int bx = blockIdx.x;
  int by = blockIdx.y;
  int tx = threadIdx.x;
  int ty = threadIdx.y;
  // precompute step/Cap and the reciprocals of Rx, Ry, Rz once, so the update
  // expression below multiplies instead of dividing
  step_div_Cap = step / Cap;
  Rx_1 = 1 / Rx;
  Ry_1 = 1 / Ry;
  Rz_1 = 1 / Rz;
  // each block finally computes result for a small block
  // after N iterations.
  // it is the non-overlapping small blocks that cover
  // all the input data
  // calculate the small block size
  int small_block_rows = BLOCK_SIZE - iteration * 2; // EXPAND_RATE
  int small_block_cols = BLOCK_SIZE - iteration * 2; // EXPAND_RATE
  // calculate the boundary for the block according to
  // the boundary of its small block
  // (may be negative / exceed the grid for blocks on the grid edge)
  int blkY = small_block_rows * by - border_rows;
  int blkX = small_block_cols * bx - border_cols;
  int blkYmax = blkY + BLOCK_SIZE - 1;
  int blkXmax = blkX + BLOCK_SIZE - 1;
  // calculate the global coordinates of this thread's cell
  int yidx = blkY + ty;
  int xidx = blkX + tx;
  // load data if it is within the valid input range
  int loadYidx = yidx, loadXidx = xidx;
  int index = grid_cols * loadYidx + loadXidx; // row-major linear index
  if (IN_RANGE(loadYidx, 0, grid_rows - 1) &&
      IN_RANGE(loadXidx, 0, grid_cols - 1)) {
    temp_on_cuda[ty][tx] = temp_src[index]; // Load the temperature data from
                                            // global memory to shared memory
    power_on_cuda[ty][tx] =
        power[index]; // Load the power data from global memory to shared memory
  }
  // make the loaded tile visible to every thread of the block
  __syncthreads();
  // effective range within this block that falls within
  // the valid range of the input data
  // used to rule out computation outside the boundary.
  // (all four values are tile-local indices in [0, BLOCK_SIZE - 1])
  int validYmin = (blkY < 0) ? -blkY : 0;
  int validYmax = (blkYmax > grid_rows - 1)
                      ? BLOCK_SIZE - 1 - (blkYmax - grid_rows + 1)
                      : BLOCK_SIZE - 1;
  int validXmin = (blkX < 0) ? -blkX : 0;
  int validXmax = (blkXmax > grid_cols - 1)
                      ? BLOCK_SIZE - 1 - (blkXmax - grid_cols + 1)
                      : BLOCK_SIZE - 1;
  // tile-local indices of the four neighbours, clamped to the valid range so
  // that a cell on the grid edge uses itself as the missing neighbour
  int N = ty - 1;
  int S = ty + 1;
  int W = tx - 1;
  int E = tx + 1;
  N = (N < validYmin) ? validYmin : N;
  S = (S > validYmax) ? validYmax : S;
  W = (W < validXmin) ? validXmin : W;
  E = (E > validXmax) ? validXmax : E;
  // NOTE(review): `computed` is only assigned inside the loop; if the kernel
  // were launched with iteration <= 0 the final `if (computed)` would read an
  // indeterminate value. The host code in this file passes a value derived
  // from MIN(num_iterations, total_iterations - t) -- confirm it is always
  // >= 1.
  bool computed;
  for (int i = 0; i < iteration; i++) {
    computed = false;
    // step i updates only cells at least i + 1 away from the tile edge and
    // inside the valid (in-grid) part of the tile
    if (IN_RANGE(tx, i + 1, BLOCK_SIZE - i - 2) &&
        IN_RANGE(ty, i + 1, BLOCK_SIZE - i - 2) &&
        IN_RANGE(tx, validXmin, validXmax) &&
        IN_RANGE(ty, validYmin, validYmax)) {
      computed = true;
      // the literal 2.0 is a double, so the two neighbour-difference terms
      // (and the surrounding sum) are evaluated in double precision before
      // the result is narrowed back to float on assignment
      temp_t[ty][tx] =
          temp_on_cuda[ty][tx] +
          step_div_Cap * (power_on_cuda[ty][tx] +
                          (temp_on_cuda[S][tx] + temp_on_cuda[N][tx] -
                           2.0 * temp_on_cuda[ty][tx]) *
                              Ry_1 +
                          (temp_on_cuda[ty][E] + temp_on_cuda[ty][W] -
                           2.0 * temp_on_cuda[ty][tx]) *
                              Rx_1 +
                          (amb_temp - temp_on_cuda[ty][tx]) * Rz_1);
    }
    // all reads of temp_on_cuda for this step must finish before it is
    // overwritten below
    __syncthreads();
    // the last step's result stays in temp_t and is written out after the loop
    if (i == iteration - 1)
      break;
    if (computed) // Assign the computation range
      temp_on_cuda[ty][tx] = temp_t[ty][tx];
    // publish the updated tile before the next step reads neighbours
    __syncthreads();
  }
  // update the global memory
  // after the last iteration, only threads coordinated within the
  // small block perform the calculation and switch on ``computed''
  if (computed) {
    temp_dst[index] = temp_t[ty][tx];
  }
 }
 /*
  * Advance the simulation by total_iterations time steps.
  *
  * The kernel is launched repeatedly on a blockCols x blockRows grid of
  * BLOCK_SIZE x BLOCK_SIZE thread blocks. Each launch performs up to
  * num_iterations steps (fewer on the final launch if total_iterations is not
  * a multiple of num_iterations) and is followed by cudaDeviceSynchronize().
  * The two buffers in MatrixTemp alternate between source and destination on
  * every launch; the first launch reads MatrixTemp[0] and writes
  * MatrixTemp[1].
  *
  * Precondition: num_iterations > 0 (the loop would not advance otherwise;
  * run() validates this).
  *
  * Returns the index (0 or 1) into MatrixTemp of the buffer written by the
  * last launch, or 0 if no launch was made.
  */
 int compute_tran_temp(float *MatrixPower, float *MatrixTemp[2], int col,
                      int row, int total_iterations, int num_iterations,
                      int blockCols, int blockRows, int borderCols,
                      int borderRows) {
  dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
  dim3 dimGrid(blockCols, blockRows);
  /* per-cell physical parameters derived from the chip geometry */
  float grid_height = chip_height / row;
  float grid_width = chip_width / col;
  float Cap = FACTOR_CHIP * SPEC_HEAT_SI * t_chip * grid_width * grid_height;
  float Rx = grid_width / (2.0 * K_SI * t_chip * grid_height);
  float Ry = grid_height / (2.0 * K_SI * t_chip * grid_width);
  float Rz = t_chip / (K_SI * grid_height * grid_width);
  float max_slope = MAX_PD / (FACTOR_CHIP * t_chip * SPEC_HEAT_SI);
  float step = PRECISION / max_slope;
  float time_elapsed = 0.001;
  int src = 1, dst = 0;
  /* The step counter is an int: it counts whole iterations, and an integer
   * keeps the kernel's `iteration` argument free of an int->float->int round
   * trip (a float counter is also inexact above 2^24). */
  for (int t = 0; t < total_iterations; t += num_iterations) {
    /* swap the roles of the two temperature buffers */
    int temp = src;
    src = dst;
    dst = temp;
    calculate_temp<<<dimGrid, dimBlock>>>(
        MIN(num_iterations, total_iterations - t), MatrixPower, MatrixTemp[src],
        MatrixTemp[dst], col, row, borderCols, borderRows, Cap, Rx, Ry, Rz,
        step, time_elapsed);
    cudaDeviceSynchronize();
  }
  return dst;
 }
 /* Print the command-line synopsis and a description of every argument to
  * stderr, then terminate the process with exit status 1. `argc` is not used;
  * `argv[0]` supplies the program name shown in the synopsis. */
 void usage(int argc, char **argv) {
  fprintf(stderr,
          "Usage: %s <grid_rows/grid_cols> <pyramid_height> <sim_time> "
          "<temp_file> <power_file> <output_file>\n",
          argv[0]);
  fprintf(stderr, "\t<grid_rows/grid_cols>  - number of rows/cols in the grid "
                  "(positive integer)\n");
  /* help text previously read "pyramid heigh(positive integer)" */
  fprintf(stderr, "\t<pyramid_height> - pyramid height (positive integer)\n");
  fprintf(stderr, "\t<sim_time>   - number of iterations\n");
  fprintf(stderr, "\t<temp_file>  - name of the file containing the initial "
                  "temperature values of each cell\n");
  fprintf(stderr, "\t<power_file> - name of the file containing the dissipated "
                  "power values of each cell\n");
  fprintf(stderr, "\t<output_file> - name of the output file\n");
  exit(1);
 }
 /* Program entry point: selects device 0, reports the compiled thread-block
  * size, then hands the command line to run(). Always returns EXIT_SUCCESS
  * when run() returns. */
 int main(int argc, char **argv) {
  cudaSetDevice(0);
  /* BLOCK_SIZE is fixed at compile time; it is printed once per dimension */
  const int wg_edge = BLOCK_SIZE;
  printf("WG size of kernel = %d X %d\n", wg_edge, wg_edge);
  run(argc, argv);
  return EXIT_SUCCESS;
 }
 /*
  * Parse the command line, load the input grids, run the simulation on the
  * device and write the resulting temperature grid to the output file.
  *
  * Expected arguments (argc must be 7, otherwise usage() is called):
  *   argv[1]  grid size, used for both rows and columns (> 0)
  *   argv[2]  pyramid height = time steps per kernel launch (> 0)
  *   argv[3]  total number of time steps (> 0)
  *   argv[4]  temperature input file
  *   argv[5]  power input file
  *   argv[6]  output file
  *
  * The process exits with status 1 if the pyramid height leaves no room for a
  * result tile inside a BLOCK_SIZE block, or if a host allocation fails.
  */
 void run(int argc, char **argv) {
  int size;
  int grid_rows, grid_cols;
  float *FilesavingTemp, *FilesavingPower, *MatrixOut;
  char *tfile, *pfile, *ofile;
  int total_iterations = 60;
  int pyramid_height = 1; // number of iterations
  if (argc != 7)
    usage(argc, argv);
  if ((grid_rows = atoi(argv[1])) <= 0 || (grid_cols = atoi(argv[1])) <= 0 ||
      (pyramid_height = atoi(argv[2])) <= 0 ||
      (total_iterations = atoi(argv[3])) <= 0)
    usage(argc, argv);
  tfile = argv[4];
  pfile = argv[5];
  ofile = argv[6];
  size = grid_rows * grid_cols;
 /* --------------- pyramid parameters --------------- */
 /* adding one iteration extends the pyramid base by 2 per each borderline */
 #define EXPAND_RATE 2
  int borderCols = (pyramid_height)*EXPAND_RATE / 2;
  int borderRows = (pyramid_height)*EXPAND_RATE / 2;
  int smallBlockCol = BLOCK_SIZE - (pyramid_height)*EXPAND_RATE;
  int smallBlockRow = BLOCK_SIZE - (pyramid_height)*EXPAND_RATE;
  /* A non-positive result tile would make the divisions below divide by zero
   * (or yield a negative launch grid), so reject it up front. */
  if (smallBlockCol <= 0 || smallBlockRow <= 0) {
    fatal("pyramid_height is too large for the compiled BLOCK_SIZE");
    exit(1);
  }
  /* number of blocks per dimension, rounded up to cover the whole grid */
  int blockCols =
      grid_cols / smallBlockCol + ((grid_cols % smallBlockCol == 0) ? 0 : 1);
  int blockRows =
      grid_rows / smallBlockRow + ((grid_rows % smallBlockRow == 0) ? 0 : 1);
  FilesavingTemp = (float *)malloc(size * sizeof(float));
  FilesavingPower = (float *)malloc(size * sizeof(float));
  MatrixOut = (float *)calloc(size, sizeof(float));
  if (!FilesavingPower || !FilesavingTemp || !MatrixOut) {
    /* fatal() only reports; stop here rather than use a null buffer */
    fatal("unable to allocate memory");
    exit(1);
  }
  printf("pyramidHeight: %d\ngridSize: [%d, %d]\nborder:[%d, "
         "%d]\nblockGrid:[%d, %d]\ntargetBlock:[%d, %d]\n",
         pyramid_height, grid_cols, grid_rows, borderCols, borderRows,
         blockCols, blockRows, smallBlockCol, smallBlockRow);
  readinput(FilesavingTemp, grid_rows, grid_cols, tfile);
  readinput(FilesavingPower, grid_rows, grid_cols, pfile);
  /* device buffers: two temperature grids (used alternately) and the power */
  float *MatrixTemp[2], *MatrixPower;
  cudaMalloc((void **)&MatrixTemp[0], sizeof(float) * size);
  cudaMalloc((void **)&MatrixTemp[1], sizeof(float) * size);
  cudaMemcpy(MatrixTemp[0], FilesavingTemp, sizeof(float) * size,
             cudaMemcpyHostToDevice);
  cudaMalloc((void **)&MatrixPower, sizeof(float) * size);
  cudaMemcpy(MatrixPower, FilesavingPower, sizeof(float) * size,
             cudaMemcpyHostToDevice);
  printf("Start computing the transient temperature\n");
  /* ret is the index of the temperature buffer holding the final result */
  int ret = compute_tran_temp(MatrixPower, MatrixTemp, grid_cols, grid_rows,
                              total_iterations, pyramid_height, blockCols,
                              blockRows, borderCols, borderRows);
  printf("Ending simulation\n");
  cudaMemcpy(MatrixOut, MatrixTemp[ret], sizeof(float) * size,
             cudaMemcpyDeviceToHost);
  writeoutput(MatrixOut, grid_rows, grid_cols, ofile);
  cudaFree(MatrixPower);
  cudaFree(MatrixTemp[0]);
  cudaFree(MatrixTemp[1]);
  /* release all host buffers (the two input staging buffers were previously
   * leaked) */
  free(FilesavingTemp);
  free(FilesavingPower);
  free(MatrixOut);
 }
--- a/examples/hotspot/run.sh
+++ b/examples/hotspot/run.sh
@ -0,0 +1,21 @@
 #!/bin/bash
 # Build the hotspot example from its pre-generated LLVM IR and check the
 # result. Must be run from this directory: all paths are relative to it.
 set -e
 # Assemble the textual device and host IR into bitcode (.bc).
 llvm-as hotspot-cuda-nvptx64-nvidia-cuda-sm_61.ll
 llvm-as hotspot-host-x86_64-unknown-linux-gnu.ll
 # Run the project's kernel/host translators on the bitcode.
 ../../build/compilation/kernelTranslator hotspot-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
 ../../build/compilation/hostTranslator hotspot-host-x86_64-unknown-linux-gnu.bc host.bc
 # Compile the translated bitcode to position-independent object files.
 llc --relocation-model=pic --filetype=obj  kernel.bc
 llc --relocation-model=pic --filetype=obj  host.bc
 # Link against the runtime and thread-pool libraries from the build tree.
 g++ -Wall -L../../build/runtime  -L../../build/runtime/threadPool \
    -o hotspot -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread
 # Append the previous LD_LIBRARY_PATH only when it is non-empty: a trailing
 # ':' would add an empty entry, which the dynamic loader treats as the
 # current directory.
 export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
 ./hotspot 512 2 2 ../../rodinia-data/hotspot/temp_512 ../../rodinia-data/hotspot/power_512 output.out
 # The expected value must appear in the first lines of the output; -F matches
 # it literally so that '.' is not treated as a regex wildcard.
 if head output.out | grep -qF "323.829"; then
    echo "Pass"
 else
    echo "Error result"
    exit 1
 fi
--- a/examples/hotspot3D/3D-cuda-nvptx64-nvidia-cuda-sm_61.ll
+++ b/examples/hotspot3D/3D-cuda-nvptx64-nvidia-cuda-sm_61.ll
@ -0,0 +1,587 @@
 ; ModuleID = '3D-cuda-nvptx64-nvidia-cuda-sm_61.bc'
 source_filename = "3D.cu"
 target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"
 %struct.__cuda_builtin_blockDim_t = type { i8 }
 %struct.__cuda_builtin_blockIdx_t = type { i8 }
 %struct.__cuda_builtin_threadIdx_t = type { i8 }
 %struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
 $_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv = comdat any
 $_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
 $_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
 $_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv = comdat any
 $_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv = comdat any
 $_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv = comdat any
@blockDim = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockDim_t, align 1
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
 ; Function Attrs: convergent noinline nounwind optnone
 ; Weak device-side definition of cudaMalloc: stores both arguments to stack
 ; slots and unconditionally returns 999; it allocates nothing.
 ; NOTE(review): 999 is presumably cudaErrorUnknown -- confirm against the
 ; CUDA headers this module was generated from.
 define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
 entry:
  %p.addr = alloca i8**, align 8
  %s.addr = alloca i64, align 8
  store i8** %p, i8*** %p.addr, align 8
  store i64 %s, i64* %s.addr, align 8
  ret i32 999
 }
 ; Function Attrs: convergent noinline nounwind optnone
 ; Weak device-side definition of cudaFuncGetAttributes: stores both
 ; arguments to stack slots and unconditionally returns 999; the attributes
 ; struct pointed to by %p is never written.
 define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
 entry:
  %p.addr = alloca %struct.cudaFuncAttributes*, align 8
  %c.addr = alloca i8*, align 8
  store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
  store i8* %c, i8** %c.addr, align 8
  ret i32 999
 }
 ; Function Attrs: convergent noinline nounwind optnone
 ; Weak device-side definition of cudaDeviceGetAttribute: stores its three
 ; arguments to stack slots and unconditionally returns 999; nothing is
 ; written through %value.
 define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
 entry:
  %value.addr = alloca i32*, align 8
  %attr.addr = alloca i32, align 4
  %device.addr = alloca i32, align 4
  store i32* %value, i32** %value.addr, align 8
  store i32 %attr, i32* %attr.addr, align 4
  store i32 %device, i32* %device.addr, align 4
  ret i32 999
 }
 ; Function Attrs: convergent noinline nounwind optnone
 ; Weak device-side definition of cudaGetDevice: stores its argument to a
 ; stack slot and unconditionally returns 999; nothing is written through
 ; %device.
 define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
 entry:
  %device.addr = alloca i32*, align 8
  store i32* %device, i32** %device.addr, align 8
  ret i32 999
 }
 ; Function Attrs: convergent noinline nounwind optnone
 ; Weak device-side definition of
 ; cudaOccupancyMaxActiveBlocksPerMultiprocessor: stores its four arguments to
 ; stack slots and unconditionally returns 999; nothing is written through
 ; %numBlocks.
 define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
 entry:
  %numBlocks.addr = alloca i32*, align 8
  %func.addr = alloca i8*, align 8
  %blockSize.addr = alloca i32, align 4
  %dynamicSmemSize.addr = alloca i64, align 8
  store i32* %numBlocks, i32** %numBlocks.addr, align 8
  store i8* %func, i8** %func.addr, align 8
  store i32 %blockSize, i32* %blockSize.addr, align 4
  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
  ret i32 999
 }
 ; Function Attrs: convergent noinline nounwind optnone
 ; Weak device-side definition of
 ; cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags: stores its five
 ; arguments to stack slots and unconditionally returns 999; nothing is
 ; written through %numBlocks.
 define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
 entry:
  %numBlocks.addr = alloca i32*, align 8
  %func.addr = alloca i8*, align 8
  %blockSize.addr = alloca i32, align 4
  %dynamicSmemSize.addr = alloca i64, align 8
  %flags.addr = alloca i32, align 4
  store i32* %numBlocks, i32** %numBlocks.addr, align 8
  store i8* %func, i8** %func.addr, align 8
  store i32 %blockSize, i32* %blockSize.addr, align 4
  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
  store i32 %flags, i32* %flags.addr, align 4
  ret i32 999
 }
 ; Function Attrs: convergent noinline nounwind optnone
 define dso_local void @_Z11hotspotOpt1PfS_S_fiiifffffff(float* %p, float* %tIn, float* %tOut, float %sdc, i32 %nx, i32 %ny, i32 %nz, float %ce, float %cw, float %cn, float %cs, float %ct, float %cb, float %cc) #0 {
 entry:
  %p.addr = alloca float*, align 8
  %tIn.addr = alloca float*, align 8
  %tOut.addr = alloca float*, align 8
  %sdc.addr = alloca float, align 4
  %nx.addr = alloca i32, align 4
  %ny.addr = alloca i32, align 4
  %nz.addr = alloca i32, align 4
  %ce.addr = alloca float, align 4
  %cw.addr = alloca float, align 4
  %cn.addr = alloca float, align 4
  %cs.addr = alloca float, align 4
  %ct.addr = alloca float, align 4
  %cb.addr = alloca float, align 4
  %cc.addr = alloca float, align 4
  %amb_temp = alloca float, align 4
  %i = alloca i32, align 4
  %j = alloca i32, align 4
  %c = alloca i32, align 4
  %xy = alloca i32, align 4
  %W = alloca i32, align 4
  %E = alloca i32, align 4
  %N = alloca i32, align 4
  %S = alloca i32, align 4
  %temp1 = alloca float, align 4
  %temp2 = alloca float, align 4
  %temp3 = alloca float, align 4
  %k = alloca i32, align 4
  store float* %p, float** %p.addr, align 8
  store float* %tIn, float** %tIn.addr, align 8
  store float* %tOut, float** %tOut.addr, align 8
  store float %sdc, float* %sdc.addr, align 4
  store i32 %nx, i32* %nx.addr, align 4
  store i32 %ny, i32* %ny.addr, align 4
  store i32 %nz, i32* %nz.addr, align 4
  store float %ce, float* %ce.addr, align 4
  store float %cw, float* %cw.addr, align 4
  store float %cn, float* %cn.addr, align 4
  store float %cs, float* %cs.addr, align 4
  store float %ct, float* %ct.addr, align 4
  store float %cb, float* %cb.addr, align 4
  store float %cc, float* %cc.addr, align 4
  store float 8.000000e+01, float* %amb_temp, align 4
  %call = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
  %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
  %mul = mul i32 %call, %call1
  %call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
  %add = add i32 %mul, %call2
  store i32 %add, i32* %i, align 4
  %call3 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #3
  %call4 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #3
  %mul5 = mul i32 %call3, %call4
  %call6 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #3
  %add7 = add i32 %mul5, %call6
  store i32 %add7, i32* %j, align 4
  %0 = load i32, i32* %i, align 4
  %1 = load i32, i32* %j, align 4
  %2 = load i32, i32* %nx.addr, align 4
  %mul8 = mul nsw i32 %1, %2
  %add9 = add nsw i32 %0, %mul8
  store i32 %add9, i32* %c, align 4
  %3 = load i32, i32* %nx.addr, align 4
  %4 = load i32, i32* %ny.addr, align 4
  %mul10 = mul nsw i32 %3, %4
  store i32 %mul10, i32* %xy, align 4
  %5 = load i32, i32* %i, align 4
  %cmp = icmp eq i32 %5, 0
  br i1 %cmp, label %cond.true, label %cond.false
 cond.true:                                        ; preds = %entry
  %6 = load i32, i32* %c, align 4
  br label %cond.end
 cond.false:                                       ; preds = %entry
  %7 = load i32, i32* %c, align 4
  %sub = sub nsw i32 %7, 1
  br label %cond.end
 cond.end:                                         ; preds = %cond.false, %cond.true
  %cond = phi i32 [ %6, %cond.true ], [ %sub, %cond.false ]
  store i32 %cond, i32* %W, align 4
  %8 = load i32, i32* %i, align 4
  %9 = load i32, i32* %nx.addr, align 4
  %sub11 = sub nsw i32 %9, 1
  %cmp12 = icmp eq i32 %8, %sub11
  br i1 %cmp12, label %cond.true13, label %cond.false14
 cond.true13:                                      ; preds = %cond.end
  %10 = load i32, i32* %c, align 4
  br label %cond.end16
 cond.false14:                                     ; preds = %cond.end
  %11 = load i32, i32* %c, align 4
  %add15 = add nsw i32 %11, 1
  br label %cond.end16
 cond.end16:                                       ; preds = %cond.false14, %cond.true13
  %cond17 = phi i32 [ %10, %cond.true13 ], [ %add15, %cond.false14 ]
  store i32 %cond17, i32* %E, align 4
  %12 = load i32, i32* %j, align 4
  %cmp18 = icmp eq i32 %12, 0
  br i1 %cmp18, label %cond.true19, label %cond.false20
 cond.true19:                                      ; preds = %cond.end16
  %13 = load i32, i32* %c, align 4
  br label %cond.end22
 cond.false20:                                     ; preds = %cond.end16
  %14 = load i32, i32* %c, align 4
  %15 = load i32, i32* %nx.addr, align 4
  %sub21 = sub nsw i32 %14, %15
  br label %cond.end22
 cond.end22:                                       ; preds = %cond.false20, %cond.true19
  %cond23 = phi i32 [ %13, %cond.true19 ], [ %sub21, %cond.false20 ]
  store i32 %cond23, i32* %N, align 4
  %16 = load i32, i32* %j, align 4
  %17 = load i32, i32* %ny.addr, align 4
  %sub24 = sub nsw i32 %17, 1
  %cmp25 = icmp eq i32 %16, %sub24
  br i1 %cmp25, label %cond.true26, label %cond.false27
 cond.true26:                                      ; preds = %cond.end22
  %18 = load i32, i32* %c, align 4
  br label %cond.end29
 cond.false27:                                     ; preds = %cond.end22
  %19 = load i32, i32* %c, align 4
  %20 = load i32, i32* %nx.addr, align 4
  %add28 = add nsw i32 %19, %20
  br label %cond.end29
 cond.end29:                                       ; preds = %cond.false27, %cond.true26
  %cond30 = phi i32 [ %18, %cond.true26 ], [ %add28, %cond.false27 ]
  store i32 %cond30, i32* %S, align 4
  %21 = load float*, float** %tIn.addr, align 8
  %22 = load i32, i32* %c, align 4
  %idxprom = sext i32 %22 to i64
  %arrayidx = getelementptr inbounds float, float* %21, i64 %idxprom
  %23 = load float, float* %arrayidx, align 4
  store float %23, float* %temp2, align 4
  store float %23, float* %temp1, align 4
  %24 = load float*, float** %tIn.addr, align 8
  %25 = load i32, i32* %c, align 4
  %26 = load i32, i32* %xy, align 4
  %add31 = add nsw i32 %25, %26
  %idxprom32 = sext i32 %add31 to i64
  %arrayidx33 = getelementptr inbounds float, float* %24, i64 %idxprom32
  %27 = load float, float* %arrayidx33, align 4
  store float %27, float* %temp3, align 4
  %28 = load float, float* %cc.addr, align 4
  %29 = load float, float* %temp2, align 4
  %mul34 = fmul contract float %28, %29
  %30 = load float, float* %cw.addr, align 4
  %31 = load float*, float** %tIn.addr, align 8
  %32 = load i32, i32* %W, align 4
  %idxprom35 = sext i32 %32 to i64
  %arrayidx36 = getelementptr inbounds float, float* %31, i64 %idxprom35
  %33 = load float, float* %arrayidx36, align 4
  %mul37 = fmul contract float %30, %33
  %add38 = fadd contract float %mul34, %mul37
  %34 = load float, float* %ce.addr, align 4
  %35 = load float*, float** %tIn.addr, align 8
  %36 = load i32, i32* %E, align 4
  %idxprom39 = sext i32 %36 to i64
  %arrayidx40 = getelementptr inbounds float, float* %35, i64 %idxprom39
  %37 = load float, float* %arrayidx40, align 4
  %mul41 = fmul contract float %34, %37
  %add42 = fadd contract float %add38, %mul41
  %38 = load float, float* %cs.addr, align 4
  %39 = load float*, float** %tIn.addr, align 8
  %40 = load i32, i32* %S, align 4
  %idxprom43 = sext i32 %40 to i64
  %arrayidx44 = getelementptr inbounds float, float* %39, i64 %idxprom43
  %41 = load float, float* %arrayidx44, align 4
  %mul45 = fmul contract float %38, %41
  %add46 = fadd contract float %add42, %mul45
  %42 = load float, float* %cn.addr, align 4
  %43 = load float*, float** %tIn.addr, align 8
  %44 = load i32, i32* %N, align 4
  %idxprom47 = sext i32 %44 to i64
  %arrayidx48 = getelementptr inbounds float, float* %43, i64 %idxprom47
  %45 = load float, float* %arrayidx48, align 4
  %mul49 = fmul contract float %42, %45
  %add50 = fadd contract float %add46, %mul49
  %46 = load float, float* %cb.addr, align 4
  %47 = load float, float* %temp1, align 4
  %mul51 = fmul contract float %46, %47
  %add52 = fadd contract float %add50, %mul51
  %48 = load float, float* %ct.addr, align 4
  %49 = load float, float* %temp3, align 4
  %mul53 = fmul contract float %48, %49
  %add54 = fadd contract float %add52, %mul53
  %50 = load float, float* %sdc.addr, align 4
  %51 = load float*, float** %p.addr, align 8
  %52 = load i32, i32* %c, align 4
  %idxprom55 = sext i32 %52 to i64
  %arrayidx56 = getelementptr inbounds float, float* %51, i64 %idxprom55
  %53 = load float, float* %arrayidx56, align 4
  %mul57 = fmul contract float %50, %53
  %add58 = fadd contract float %add54, %mul57
  %54 = load float, float* %ct.addr, align 4
  %55 = load float, float* %amb_temp, align 4
  %mul59 = fmul contract float %54, %55
  %add60 = fadd contract float %add58, %mul59
  %56 = load float*, float** %tOut.addr, align 8
  %57 = load i32, i32* %c, align 4
  %idxprom61 = sext i32 %57 to i64
  %arrayidx62 = getelementptr inbounds float, float* %56, i64 %idxprom61
  store float %add60, float* %arrayidx62, align 4
  %58 = load i32, i32* %xy, align 4
  %59 = load i32, i32* %c, align 4
  %add63 = add nsw i32 %59, %58
  store i32 %add63, i32* %c, align 4
  %60 = load i32, i32* %xy, align 4
  %61 = load i32, i32* %W, align 4
  %add64 = add nsw i32 %61, %60
  store i32 %add64, i32* %W, align 4
  %62 = load i32, i32* %xy, align 4
  %63 = load i32, i32* %E, align 4
  %add65 = add nsw i32 %63, %62
  store i32 %add65, i32* %E, align 4
  %64 = load i32, i32* %xy, align 4
  %65 = load i32, i32* %N, align 4
  %add66 = add nsw i32 %65, %64
  store i32 %add66, i32* %N, align 4
  %66 = load i32, i32* %xy, align 4
  %67 = load i32, i32* %S, align 4
  %add67 = add nsw i32 %67, %66
  store i32 %add67, i32* %S, align 4
  store i32 1, i32* %k, align 4
  br label %for.cond
 for.cond:                                         ; preds = %for.inc, %cond.end29
  %68 = load i32, i32* %k, align 4
  %69 = load i32, i32* %nz.addr, align 4
  %sub68 = sub nsw i32 %69, 1
  %cmp69 = icmp slt i32 %68, %sub68
  br i1 %cmp69, label %for.body, label %for.end
 for.body:                                         ; preds = %for.cond
  %70 = load float, float* %temp2, align 4
  store float %70, float* %temp1, align 4
  %71 = load float, float* %temp3, align 4
  store float %71, float* %temp2, align 4
  %72 = load float*, float** %tIn.addr, align 8
  %73 = load i32, i32* %c, align 4
  %74 = load i32, i32* %xy, align 4
  %add70 = add nsw i32 %73, %74
  %idxprom71 = sext i32 %add70 to i64
  %arrayidx72 = getelementptr inbounds float, float* %72, i64 %idxprom71
  %75 = load float, float* %arrayidx72, align 4
  store float %75, float* %temp3, align 4
  %76 = load float, float* %cc.addr, align 4
  %77 = load float, float* %temp2, align 4
  %mul73 = fmul contract float %76, %77
  %78 = load float, float* %cw.addr, align 4
  %79 = load float*, float** %tIn.addr, align 8
  %80 = load i32, i32* %W, align 4
  %idxprom74 = sext i32 %80 to i64
  %arrayidx75 = getelementptr inbounds float, float* %79, i64 %idxprom74
  %81 = load float, float* %arrayidx75, align 4
  %mul76 = fmul contract float %78, %81
  %add77 = fadd contract float %mul73, %mul76
  %82 = load float, float* %ce.addr, align 4
  %83 = load float*, float** %tIn.addr, align 8
  %84 = load i32, i32* %E, align 4
  %idxprom78 = sext i32 %84 to i64
  %arrayidx79 = getelementptr inbounds float, float* %83, i64 %idxprom78
  %85 = load float, float* %arrayidx79, align 4
  %mul80 = fmul contract float %82, %85
  %add81 = fadd contract float %add77, %mul80
  %86 = load float, float* %cs.addr, align 4
  %87 = load float*, float** %tIn.addr, align 8
  %88 = load i32, i32* %S, align 4
  %idxprom82 = sext i32 %88 to i64
  %arrayidx83 = getelementptr inbounds float, float* %87, i64 %idxprom82
  %89 = load float, float* %arrayidx83, align 4
  %mul84 = fmul contract float %86, %89
  %add85 = fadd contract float %add81, %mul84
  %90 = load float, float* %cn.addr, align 4
  %91 = load float*, float** %tIn.addr, align 8
  %92 = load i32, i32* %N, align 4
  %idxprom86 = sext i32 %92 to i64
  %arrayidx87 = getelementptr inbounds float, float* %91, i64 %idxprom86
  %93 = load float, float* %arrayidx87, align 4
  %mul88 = fmul contract float %90, %93
  %add89 = fadd contract float %add85, %mul88
  %94 = load float, float* %cb.addr, align 4
  %95 = load float, float* %temp1, align 4
  %mul90 = fmul contract float %94, %95
  %add91 = fadd contract float %add89, %mul90
  %96 = load float, float* %ct.addr, align 4
  %97 = load float, float* %temp3, align 4
  %mul92 = fmul contract float %96, %97
  %add93 = fadd contract float %add91, %mul92
  %98 = load float, float* %sdc.addr, align 4
  %99 = load float*, float** %p.addr, align 8
  %100 = load i32, i32* %c, align 4
  %idxprom94 = sext i32 %100 to i64
  %arrayidx95 = getelementptr inbounds float, float* %99, i64 %idxprom94
  %101 = load float, float* %arrayidx95, align 4
  %mul96 = fmul contract float %98, %101
  %add97 = fadd contract float %add93, %mul96
  %102 = load float, float* %ct.addr, align 4
  %103 = load float, float* %amb_temp, align 4
  %mul98 = fmul contract float %102, %103
  %add99 = fadd contract float %add97, %mul98
  %104 = load float*, float** %tOut.addr, align 8
  %105 = load i32, i32* %c, align 4
  %idxprom100 = sext i32 %105 to i64
  %arrayidx101 = getelementptr inbounds float, float* %104, i64 %idxprom100
  store float %add99, float* %arrayidx101, align 4
  %106 = load i32, i32* %xy, align 4
  %107 = load i32, i32* %c, align 4
  %add102 = add nsw i32 %107, %106
  store i32 %add102, i32* %c, align 4
  %108 = load i32, i32* %xy, align 4
  %109 = load i32, i32* %W, align 4
  %add103 = add nsw i32 %109, %108
  store i32 %add103, i32* %W, align 4
  %110 = load i32, i32* %xy, align 4
  %111 = load i32, i32* %E, align 4
  %add104 = add nsw i32 %111, %110
  store i32 %add104, i32* %E, align 4
  %112 = load i32, i32* %xy, align 4
  %113 = load i32, i32* %N, align 4
  %add105 = add nsw i32 %113, %112
  store i32 %add105, i32* %N, align 4
  %114 = load i32, i32* %xy, align 4
  %115 = load i32, i32* %S, align 4
  %add106 = add nsw i32 %115, %114
  store i32 %add106, i32* %S, align 4
  br label %for.inc
 for.inc:                                          ; preds = %for.body
  %116 = load i32, i32* %k, align 4
  %inc = add nsw i32 %116, 1
  store i32 %inc, i32* %k, align 4
  br label %for.cond
 for.end:                                          ; preds = %for.cond
  %117 = load float, float* %temp2, align 4
  store float %117, float* %temp1, align 4
  %118 = load float, float* %temp3, align 4
  store float %118, float* %temp2, align 4
  %119 = load float, float* %cc.addr, align 4
  %120 = load float, float* %temp2, align 4
  %mul107 = fmul contract float %119, %120
  %121 = load float, float* %cw.addr, align 4
  %122 = load float*, float** %tIn.addr, align 8
  %123 = load i32, i32* %W, align 4
  %idxprom108 = sext i32 %123 to i64
  %arrayidx109 = getelementptr inbounds float, float* %122, i64 %idxprom108
  %124 = load float, float* %arrayidx109, align 4
  %mul110 = fmul contract float %121, %124
  %add111 = fadd contract float %mul107, %mul110
  %125 = load float, float* %ce.addr, align 4
  %126 = load float*, float** %tIn.addr, align 8
  %127 = load i32, i32* %E, align 4
  %idxprom112 = sext i32 %127 to i64
  %arrayidx113 = getelementptr inbounds float, float* %126, i64 %idxprom112
  %128 = load float, float* %arrayidx113, align 4
  %mul114 = fmul contract float %125, %128
  %add115 = fadd contract float %add111, %mul114
  %129 = load float, float* %cs.addr, align 4
  %130 = load float*, float** %tIn.addr, align 8
  %131 = load i32, i32* %S, align 4
  %idxprom116 = sext i32 %131 to i64
  %arrayidx117 = getelementptr inbounds float, float* %130, i64 %idxprom116
  %132 = load float, float* %arrayidx117, align 4
  %mul118 = fmul contract float %129, %132
  %add119 = fadd contract float %add115, %mul118
  %133 = load float, float* %cn.addr, align 4
  %134 = load float*, float** %tIn.addr, align 8
  %135 = load i32, i32* %N, align 4
  %idxprom120 = sext i32 %135 to i64
  %arrayidx121 = getelementptr inbounds float, float* %134, i64 %idxprom120
  %136 = load float, float* %arrayidx121, align 4
  %mul122 = fmul contract float %133, %136
  %add123 = fadd contract float %add119, %mul122
  %137 = load float, float* %cb.addr, align 4
  %138 = load float, float* %temp1, align 4
  %mul124 = fmul contract float %137, %138
  %add125 = fadd contract float %add123, %mul124
  %139 = load float, float* %ct.addr, align 4
  %140 = load float, float* %temp3, align 4
  %mul126 = fmul contract float %139, %140
  %add127 = fadd contract float %add125, %mul126
  %141 = load float, float* %sdc.addr, align 4
  %142 = load float*, float** %p.addr, align 8
  %143 = load i32, i32* %c, align 4
  %idxprom128 = sext i32 %143 to i64
  %arrayidx129 = getelementptr inbounds float, float* %142, i64 %idxprom128
  %144 = load float, float* %arrayidx129, align 4
  %mul130 = fmul contract float %141, %144
  %add131 = fadd contract float %add127, %mul130
  %145 = load float, float* %ct.addr, align 4
  %146 = load float, float* %amb_temp, align 4
  %mul132 = fmul contract float %145, %146
  %add133 = fadd contract float %add131, %mul132
  %147 = load float*, float** %tOut.addr, align 8
  %148 = load i32, i32* %c, align 4
  %idxprom134 = sext i32 %148 to i64
  %arrayidx135 = getelementptr inbounds float, float* %147, i64 %idxprom134
  store float %add133, float* %arrayidx135, align 4
  ret void
 }
 ; __cuda_builtin_blockDim_t::__fetch_builtin_x() (Itanium-mangled name below).
 ; Returns the result of the llvm.nvvm.read.ptx.sreg.ntid.x intrinsic unchanged.
 ; Function Attrs: alwaysinline convergent nounwind
 define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #1 comdat align 2 {
 entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
  ret i32 %0
 }
 ; __cuda_builtin_blockIdx_t::__fetch_builtin_x() (Itanium-mangled name below).
 ; Returns the result of the llvm.nvvm.read.ptx.sreg.ctaid.x intrinsic unchanged.
 ; Function Attrs: alwaysinline convergent nounwind
 define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
 entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
  ret i32 %0
 }
 ; __cuda_builtin_threadIdx_t::__fetch_builtin_x() (Itanium-mangled name below).
 ; Returns the result of the llvm.nvvm.read.ptx.sreg.tid.x intrinsic unchanged.
 ; Function Attrs: alwaysinline convergent nounwind
 define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
 entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  ret i32 %0
 }
 ; __cuda_builtin_blockDim_t::__fetch_builtin_y() (Itanium-mangled name below).
 ; Returns the result of the llvm.nvvm.read.ptx.sreg.ntid.y intrinsic unchanged.
 ; Function Attrs: alwaysinline convergent nounwind
 define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #1 comdat align 2 {
 entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
  ret i32 %0
 }
 ; __cuda_builtin_blockIdx_t::__fetch_builtin_y() (Itanium-mangled name below).
 ; Returns the result of the llvm.nvvm.read.ptx.sreg.ctaid.y intrinsic unchanged.
 ; Function Attrs: alwaysinline convergent nounwind
 define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
 entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
  ret i32 %0
 }
 ; __cuda_builtin_threadIdx_t::__fetch_builtin_y() (Itanium-mangled name below).
 ; Returns the result of the llvm.nvvm.read.ptx.sreg.tid.y intrinsic unchanged.
 ; Function Attrs: alwaysinline convergent nounwind
 define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
 entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
  ret i32 %0
 }
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y() #2
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #2
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #2
 attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #2 = { nounwind readnone }
 attributes #3 = { convergent nounwind }
 !llvm.module.flags = !{!0, !1, !2}
 !nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
 !llvm.ident = !{!8}
 !nvvmir.version = !{!9}
 !0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
 !1 = !{i32 1, !"wchar_size", i32 4}
 !2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
 !3 = !{void (float*, float*, float*, float, i32, i32, i32, float, float, float, float, float, float, float)* @_Z11hotspotOpt1PfS_S_fiiifffffff, !"kernel", i32 1}
 !4 = !{null, !"align", i32 8}
 !5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
 !6 = !{null, !"align", i32 16}
 !7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
 !8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
 !9 = !{i32 1, i32 4}
--- a/examples/hotspot3D/3D-host-x86_64-unknown-linux-gnu.ll
+++ b/examples/hotspot3D/3D-host-x86_64-unknown-linux-gnu.ll
--- a/examples/hotspot3D/3D.cu
+++ b/examples/hotspot3D/3D.cu
@ -0,0 +1,205 @@
 #include <assert.h>
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <sys/time.h>
 #include <time.h>
 #define BLOCK_SIZE 16
 #define STR_SIZE 256
 #define block_x_ 128
 #define block_y_ 2
 #define block_z_ 1
 #define MAX_PD (3.0e6)
 /* required precision in degrees	*/
 #define PRECISION 0.001
 #define SPEC_HEAT_SI 1.75e6
 #define K_SI 100
 /* capacitance fitting factor	*/
 #define FACTOR_CHIP 0.5
 #include "opt1.cu"
 /* chip parameters	*/
 float t_chip = 0.0005;
 float chip_height = 0.016;
 float chip_width = 0.016; /* ambient temperature, assuming no package at all
                           */
 float amb_temp = 80.0;
 /* Print `s` on stderr with an "Error: " prefix and a trailing newline.
  * Despite the name, this helper only reports: it returns to the caller
  * and does not terminate the program. */
 void fatal(const char *s) {
   FILE *sink = stderr;
   fprintf(sink, "Error: %s\n", s);
 }
 /* Load grid_rows * grid_cols * layers floats from the text file `file`,
  * one value per line.
  *
  * Lines are consumed in (row, column, layer) order, with the layer index
  * varying fastest; each value is stored at
  *   vect[row * grid_cols + col + layer * grid_rows * grid_cols].
  *
  * Any failure (file cannot be opened, too few lines, a line that does not
  * start with a float) is reported through fatal() and then terminates the
  * process with exit status 1. fatal() itself only prints, so carrying on
  * would hand a NULL stream to fgets() or parse a stale line buffer. */
 void readinput(float *vect, int grid_rows, int grid_cols, int layers,
                char *file) {
   FILE *fp = fopen(file, "r");
   if (fp == NULL) {
     fatal("The file was not opened");
     exit(1);
   }

   char str[STR_SIZE];
   for (int i = 0; i < grid_rows; i++) {
     for (int j = 0; j < grid_cols; j++) {
       for (int k = 0; k < layers; k++) {
         if (fgets(str, STR_SIZE, fp) == NULL) {
           /* Distinguish a short file from a genuine read error. EOF is only
            * an error when no line was returned, so a final line without a
            * trailing newline is accepted. */
           fatal(feof(fp) ? "not enough lines in file"
                          : "Error reading file\n");
           fclose(fp);
           exit(1);
         }
         float val;
         if (sscanf(str, "%f", &val) != 1) {
           fatal("invalid file format");
           fclose(fp);
           exit(1);
         }
         vect[i * grid_cols + j + k * grid_rows * grid_cols] = val;
       }
     }
   }
   fclose(fp);
 }
 /* Write the grid to the text file `file`, one "<index>\t<value>\n" line per
  * cell.
  *
  * Cells are emitted in (row, column, layer) order with the layer index
  * varying fastest; <index> is a running counter starting at 0 and the value
  * is read from vect[row * grid_cols + col + layer * grid_rows * grid_cols].
  *
  * If the file cannot be opened a message is printed on stdout and the
  * function returns without writing anything. */
 void writeoutput(float *vect, int grid_rows, int grid_cols, int layers,
                  char *file) {
   FILE *fp = fopen(file, "w");
   if (fp == NULL) {
     printf("The file was not opened\n");
     return; /* no stream to write to or close */
   }

   int index = 0;
   for (int i = 0; i < grid_rows; i++) {
     for (int j = 0; j < grid_cols; j++) {
       for (int k = 0; k < layers; k++) {
         /* Format straight into the stream; no intermediate line buffer. */
         fprintf(fp, "%d\t%g\n", index,
                 vect[i * grid_cols + j + k * grid_rows * grid_cols]);
         index++;
       }
     }
   }
   fclose(fp);
 }
 /* CPU reference implementation of the 3D thermal stencil.
  *
  * For every cell of an nx * ny * nz grid the new temperature is a weighted
  * sum of the cell itself, its six axis neighbours (a neighbour index is
  * replaced by the cell's own index on a grid face), the cell's power input
  * scaled by dt / Cap, and the ambient temperature scaled by the top
  * coefficient.
  *
  * The sweep is executed at least once and is repeated while the pass
  * counter is below numiter. After each sweep the local tIn/tOut pointers
  * are exchanged, so the most recent result is in the caller's tOut buffer
  * when an odd number of sweeps ran and in the caller's tIn buffer when an
  * even number ran. */
 void computeTempCPU(float *pIn, float *tIn, float *tOut, int nx, int ny, int nz,
                     float Cap, float Rx, float Ry, float Rz, float dt,
                     int numiter) {
   const float stepDivCap = dt / Cap;

   /* Opposite directions share the same coefficient. */
   const float ce = stepDivCap / Rx;
   const float cw = ce;
   const float cn = stepDivCap / Ry;
   const float cs = cn;
   const float ct = stepDivCap / Rz;
   const float cb = ct;
   const float cc = 1.0 - (2.0 * ce + 2.0 * cn + 3.0 * ct);

   int pass = 0;
   do {
     for (int z = 0; z < nz; z++) {
       for (int y = 0; y < ny; y++) {
         for (int x = 0; x < nx; x++) {
           const int c = x + y * nx + z * nx * ny;

           /* Start every neighbour at the centre cell, then step away from
            * it wherever the cell is not on the corresponding face. */
           int w = c, e = c, n = c, s = c, b = c, t = c;
           if (x != 0)
             w = c - 1;
           if (x != nx - 1)
             e = c + 1;
           if (y != 0)
             n = c - nx;
           if (y != ny - 1)
             s = c + nx;
           if (z != 0)
             b = c - nx * ny;
           if (z != nz - 1)
             t = c + nx * ny;

           tOut[c] = tIn[c] * cc + tIn[n] * cn + tIn[s] * cs + tIn[e] * ce +
                     tIn[w] * cw + tIn[t] * ct + tIn[b] * cb +
                     (dt / Cap) * pIn[c] + ct * amb_temp;
         }
       }
     }

     /* Exchange the buffers for the next sweep. */
     float *previous = tIn;
     tIn = tOut;
     tOut = previous;
   } while (++pass < numiter);
 }
 /* Root-mean-square difference between the first `len` elements of arr1 and
  * arr2: sqrt(sum((arr1[i] - arr2[i])^2) / len), accumulated in float. */
 float accuracy(float *arr1, float *arr2, int len) {
   float sumSquares = 0.0;
   for (int idx = 0; idx < len; ++idx) {
     const float delta = arr1[idx] - arr2[idx];
     sumSquares += delta * delta;
   }
   return (float)sqrt(sumSquares / len);
 }
 /* Print the command-line synopsis on stderr and terminate with exit
  * status 1. Only argv[0] is read (as the program name); argc is accepted
  * for signature compatibility. */
 void usage(int argc, char **argv) {
   (void)argc;
   fprintf(stderr,
           "Usage: %s <rows/cols> <layers> <iterations> <powerFile> <tempFile> "
           "<outputFile>\n",
           argv[0]);
   fprintf(
       stderr,
       "\t<rows/cols>  - number of rows/cols in the grid (positive integer)\n");
   fprintf(stderr,
           "\t<layers>  - number of layers in the grid (positive integer)\n");
   /* Placeholder names below match the synopsis line above. */
   fprintf(stderr, "\t<iterations> - number of iterations\n");
   fprintf(stderr, "\t<powerFile>  - name of the file containing the initial "
                   "power values of each cell\n");
   fprintf(stderr, "\t<tempFile>  - name of the file containing the initial "
                   "temperature values of each cell\n");
   fprintf(stderr, "\t<outputFile> - output file\n");
   exit(1);
 }
 int main(int argc, char **argv) {
  cudaSetDevice(0);
  if (argc != 7) {
    usage(argc, argv);
  }
  char *pfile, *tfile, *ofile;
  int iterations = atoi(argv[3]);
  pfile = argv[4];
  tfile = argv[5];
  ofile = argv[6];
  int numCols = atoi(argv[1]);
  int numRows = atoi(argv[1]);
  int layers = atoi(argv[2]);
  /* calculating parameters*/
  float dx = chip_height / numRows;
  float dy = chip_width / numCols;
  float dz = t_chip / layers;
  float Cap = FACTOR_CHIP * SPEC_HEAT_SI * t_chip * dx * dy;
  float Rx = dy / (2.0 * K_SI * t_chip * dx);
  float Ry = dx / (2.0 * K_SI * t_chip * dy);
  float Rz = dz / (K_SI * dx * dy);
  float max_slope = MAX_PD / (FACTOR_CHIP * t_chip * SPEC_HEAT_SI);
  float dt = PRECISION / max_slope;
  float *powerIn, *tempOut, *tempIn, *tempCopy;
  int size = numCols * numRows * layers;
  powerIn = (float *)calloc(size, sizeof(float));
  tempCopy = (float *)malloc(size * sizeof(float));
  tempIn = (float *)calloc(size, sizeof(float));
  tempOut = (float *)calloc(size, sizeof(float));
  float *answer = (float *)calloc(size, sizeof(float));
  readinput(powerIn, numRows, numCols, layers, pfile);
  readinput(tempIn, numRows, numCols, layers, tfile);
  memcpy(tempCopy, tempIn, size * sizeof(float));
  hotspot_opt1(powerIn, tempIn, tempOut, numCols, numRows, layers, Cap, Rx, Ry,
               Rz, dt, iterations);
  computeTempCPU(powerIn, tempCopy, answer, numCols, numRows, layers, Cap, Rx,
                 Ry, Rz, dt, iterations);
  float acc = accuracy(tempOut, answer, numRows * numCols * layers);
  printf("Accuracy: %e\n", acc);
  writeoutput(tempOut, numRows, numCols, layers, ofile);
  free(tempIn);
  free(tempOut);
  free(powerIn);
  return 0;
 }
--- a/examples/hotspot3D/run.sh
+++ b/examples/hotspot3D/run.sh
@ -0,0 +1,22 @@
 #!/bin/bash
 # Build and run the hotspot3D example, then check the output file.
 # Any failing command aborts the script.
 set -e

 # Assemble the textual IR for the kernel and the host into bitcode.
 llvm-as 3D-cuda-nvptx64-nvidia-cuda-sm_61.ll
 llvm-as 3D-host-x86_64-unknown-linux-gnu.ll

 # Run the project's translators over both bitcode files.
 ../../build/compilation/kernelTranslator 3D-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
 ../../build/compilation/hostTranslator 3D-host-x86_64-unknown-linux-gnu.bc host.bc

 # Compile the translated bitcode to position-independent objects and link
 # them against the runtime libraries.
 llc --relocation-model=pic --filetype=obj  kernel.bc
 llc --relocation-model=pic --filetype=obj  host.bc
 g++ -g -Wall -L../../build/runtime  -L../../build/runtime/threadPool -o 3D \
    -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread

 export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH
 ./3D 512 8 100 ../../rodinia-data/hotspot3D/power_512x8 ../../rodinia-data/hotspot3D/temp_512x8 output.out

 # Pass when the expected value appears in the first lines of the output.
 # -F matches the text literally, so "." is not treated as a wildcard.
 if head output.out | grep -qF "334.017"; then
    echo "Pass"
 else
    echo "Error result"
    exit 1
 fi
--- a/examples/huffman/comparison_helpers.h
+++ b/examples/huffman/comparison_helpers.h
@ -0,0 +1,24 @@
 #ifndef _COMPARISON_HELPERS_H_
 #define _COMPARISON_HELPERS_H_
 #include <stdio.h>
 /// Compare the first `size` elements of two arrays element by element.
 ///
 /// Every mismatching position is printed. When all elements match, a PASS
 /// line is printed and 0 is returned. When any element differs, a FAIL line
 /// is printed and the process is terminated with exit status 1.
 ///
 /// Values are printed as signed 64-bit integers, so T is expected to be an
 /// integer type.
 template <typename T>
 __inline int compare_vectors(T *data1, T *data2, unsigned int size) {
   printf("Comparing vectors: \n");
   bool match = true;
   for (unsigned int i = 0; i < size; i++) {
     if (data1[i] != data2[i]) {
       match = false;
       // The index is unsigned; the values are widened so the format
       // specifier matches the argument type for every integer T.
       printf("Diff: data1[%u]=%lld,  data2[%u]=%lld.\n", i,
              static_cast<long long>(data1[i]), i,
              static_cast<long long>(data2[i]));
     }
   }
   if (match) {
     printf("PASS! vectors are matching!\n");
     return 0;
   }
   printf("FAIL! vectors are NOT matching!\n");
   exit(1);
   return -1; // not reached: exit() does not return
 }
 #endif
--- a/examples/huffman/cpuencode.cpp
+++ b/examples/huffman/cpuencode.cpp
@ -0,0 +1,116 @@
 #include "stdafx.h"
 #include "cpuencode.h"
 #include "print_helpers.h"
 using namespace std;
 #if 1
 // Variable-length-encode `num_elements` 32-bit words into a packed
 // bitstream.
 //
 // Each input word is split into four byte symbols, most significant byte
 // first. Symbol s contributes the low codewordlens[s] bits of codewords[s];
 // codewords are packed back to back starting at the most significant bit of
 // each output word.
 //
 //   indata        input words
 //   num_elements  number of input words
 //   outdata       output words; the word after the last completely filled
 //                 one is also zeroed, so the buffer must have room for it
 //   outsize       receives the number of bytes produced, rounded up to a
 //                 whole byte
 //   codewords     codeword per byte symbol (256 entries are indexed)
 //   codewordlens  codeword length in bits per byte symbol, at most 32; a
 //                 codeword must have no bits set above its length
 extern "C" void cpu_vlc_encode(unsigned int *indata, unsigned int num_elements,
                                unsigned int *outdata, unsigned int *outsize,
                                unsigned int *codewords,
                                unsigned int *codewordlens) {
   unsigned int *bitstreamPt = outdata; // output word currently being filled
   *bitstreamPt = 0x00000000U;
   unsigned int startbit = 0;   // bits already occupied in *bitstreamPt
   unsigned int totalBytes = 0; // bytes in completely filled output words

   for (unsigned int k = 0; k < num_elements; k++) {
     const unsigned int val32 = indata[k];
     for (unsigned int i = 0; i < 4; i++) {
       const unsigned char symbol = (unsigned char)(val32 >> (8 * (3 - i)));
       const unsigned int cw32 = codewords[symbol];
       unsigned int numbits = codewordlens[symbol];

       while (numbits > 0) {
         const unsigned int room = 32 - startbit;
         const unsigned int writebits = (numbits < room) ? numbits : room;
         unsigned int mask32;
         if (numbits == writebits) {
           // Everything that is left fits: keep only the low `numbits` bits
           // and left-align them behind the bits already written. A full
           // 32-bit codeword needs an explicit all-ones mask because
           // shifting a 32-bit value by 32 is undefined.
           const unsigned int lowbits =
               (numbits >= 32) ? 0xFFFFFFFFU : ((1U << numbits) - 1U);
           mask32 = (cw32 & lowbits) << (32 - startbit - numbits);
         } else {
           // Only the leading `writebits` bits fit; the rest is written on
           // the next pass into the following word.
           mask32 = cw32 >> (numbits - writebits);
         }
         *bitstreamPt = (*bitstreamPt) | mask32;
         numbits = numbits - writebits;
         startbit = (startbit + writebits) % 32;
         if (startbit == 0) {
           // Current word is full: advance and clear the next one.
           bitstreamPt++;
           *bitstreamPt = 0x00000000;
           totalBytes += 4;
         }
       }
     }
   }
   totalBytes += (startbit / 8) +
                 ((startbit % 8 == 0) ? 0 : 1); // return aligned to 8-bits
   *outsize = totalBytes;
 }
 //////////////////////////////////////////////////////////////////////
 /// ALTERNATIVE CODER
 /// ASSUMPTION: The combined length of 4 codewords is at most 2x the original
 /// data, i.e. 64 bits
 ///////////////////////////////////////////////////////////////////////
 #else
 // Alternative encoder: the four codewords of one input word are first
 // concatenated into a 64-bit accumulator and then written out in pieces of
 // at most 32 bits.
 //
 // Each input word is split into four byte symbols, most significant byte
 // first. The combined length of the four codewords must not exceed 64 bits,
 // and a codeword must have no bits set above its length.
 //
 //   indata        input words
 //   num_elements  number of input words
 //   outdata       output words; the word after the last completely filled
 //                 one is also zeroed, so the buffer must have room for it
 //   outsize       receives the number of bytes produced, rounded up to a
 //                 whole byte
 //   codewords     codeword per byte symbol (256 entries are indexed)
 //   codewordlens  codeword length in bits per byte symbol
 extern "C" void cpu_vlc_encode(unsigned int *indata, unsigned int num_elements,
                                unsigned int *outdata, unsigned int *outsize,
                                unsigned int *codewords,
                                unsigned int *codewordlens) {
   unsigned int *bitstreamPt = outdata; // output word currently being filled
   // assume memset is done.
   *bitstreamPt = 0x00000000U;
   unsigned int startbit = 0;   // bits already occupied in *bitstreamPt
   unsigned int totalBytes = 0; // bytes in completely filled output words

   for (unsigned int k = 0; k < num_elements; k++) {
     const unsigned int val32 = indata[k];

     // Concatenate the four codewords of this word.
     unsigned long long cw64 = 0;
     unsigned int numbits = 0;
     for (unsigned int i = 0; i < 4; i++) {
       const unsigned char symbol = (unsigned char)(val32 >> (8 * (3 - i)));
       cw64 = (cw64 << codewordlens[symbol]) | codewords[symbol];
       numbits += codewordlens[symbol];
     }

     while (numbits > 0) {
       const unsigned int room = 32 - startbit;
       const unsigned int writebits = (numbits < room) ? numbits : room;
       unsigned int mask32;
       if (numbits == writebits) {
         // The remainder fits: left-align it behind the bits already written.
         mask32 = (unsigned int)cw64 << (32 - startbit - numbits);
       } else {
         // Emit the leading `writebits` bits and keep the rest for the next
         // pass. The keep-mask is built in 64 bits because 32 or more bits
         // can remain.
         const unsigned int remaining = numbits - writebits;
         mask32 = (unsigned int)(cw64 >> remaining);
         cw64 = cw64 & ((1ULL << remaining) - 1ULL);
       }
       *bitstreamPt = (*bitstreamPt) | mask32;
       numbits = numbits - writebits;
       startbit = (startbit + writebits) % 32;
       if (startbit == 0) {
         // Current word is full: advance and clear the next one.
         bitstreamPt++;
         *bitstreamPt = 0x00000000;
         totalBytes += 4;
       }
     }
   }
   totalBytes += (startbit / 8) +
                 ((startbit % 8 == 0) ? 0 : 1); // return aligned to 8-bits
   *outsize = totalBytes;
 }
 #endif
--- a/examples/huffman/cpuencode.h
+++ b/examples/huffman/cpuencode.h
@ -0,0 +1,8 @@
 #ifndef _CE_H_
 #define _CE_H_
 // CPU variable-length encoder (defined in cpuencode.cpp).
 // Encodes num_elements words from indata into the bitstream outdata using
 // the per-byte-symbol tables codewords / codewordlens, and stores the number
 // of bytes produced in *outsize.
 extern "C" void cpu_vlc_encode(unsigned int *indata, unsigned int num_elements,
                                unsigned int *outdata, unsigned int *outsize,
                                unsigned int *codewords,
                                unsigned int *codewordlens);
 #endif
--- a/Show More
+++ b/Show More