add backbone, including basic features for compilation
commit addf0a95b7

@@ -0,0 +1,41 @@
cmake_minimum_required(VERSION 3.1 FATAL_ERROR)

project(CudaOnX86)

set(CMAKE_PROJECT_DESCRIPTION "Executing CUDA on X86 architecture.")

set(CMAKE_CXX_STANDARD "14")

set(MAJOR_VERSION 0)
set(MINOR_VERSION 1)
set(VERSION_STRING ${MAJOR_VERSION}.${MINOR_VERSION})
set(COX_VERSION ${VERSION_STRING})

# Get the LLVM paths via llvm-config (used to locate headers and libraries)
if(DEFINED LLVM_CONFIG_PATH)
  if(IS_ABSOLUTE "${LLVM_CONFIG_PATH}")
    if(EXISTS "${LLVM_CONFIG_PATH}")
      set(LLVM_CONFIG "${LLVM_CONFIG_PATH}")
    else()
      message(FATAL_ERROR "llvm-config is not found in ${LLVM_CONFIG_PATH}")
    endif()
    message(STATUS "Using llvm-config: ${LLVM_CONFIG}")
    execute_process(
      COMMAND "${LLVM_CONFIG}" "--cxxflags"
      OUTPUT_VARIABLE LLVM_CXX_FLAG
      OUTPUT_STRIP_TRAILING_WHITESPACE)
    execute_process(
      COMMAND "${LLVM_CONFIG}" "--libdir"
      OUTPUT_VARIABLE LLVM_LIB_PATH
      OUTPUT_STRIP_TRAILING_WHITESPACE)
    execute_process(
      COMMAND "${LLVM_CONFIG}" "--libs"
      OUTPUT_VARIABLE LLVM_LINK_FLAG
      OUTPUT_STRIP_TRAILING_WHITESPACE)
  endif()
else()
  message(FATAL_ERROR "llvm-config is required")
endif()

set(CMAKE_CXX_FLAGS "${LLVM_CXX_FLAG} ${CMAKE_CXX_FLAGS}")

set(GCC_COVERAGE_LINK_FLAGS
    "-L${LLVM_LIB_PATH} ${LLVM_LINK_FLAG} -lz -lrt -ldl -ltinfo -lpthread -lm")

add_subdirectory(compilation)
@@ -0,0 +1,34 @@
# Contributing to COX

Thank you for your interest in contributing to COX!
We appreciate all contributions, including but not limited to:

- Adding documentation
- Adding new features and components
- Fixing bugs

## How to contribute?

0. (Optional) Open an issue and discuss your idea before starting
1. Fork the latest version of COX
2. Commit to the forked repo
3. Create a Pull Request to the COX main branch

## Code style

We follow the clang-format style in this repo.
To make sure your contribution follows the correct style,
we highly recommend installing [pre-commit](https://pre-commit.com/) before development.

```bash
# A Python environment is required
pip install pre-commit
```

Then, from the repository folder, execute the following instruction:

```bash
pre-commit install
```

With the pre-commit plugin installed, each local commit will be checked automatically.

@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2021 Ruobing Han

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

@@ -0,0 +1,62 @@
# COX: CUDA on X86

## Introduction

This project consists of two parts: a series of LLVM passes that
accept SPMD NVVM IR as input and output the corresponding
MPMD+SIMD LLVM IR, which can be executed on CPU devices.
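
To give a rough intuition of what the transformation does (this is only a hand-written
conceptual sketch, not the actual pass output; names and signatures are illustrative),
a SPMD CUDA kernel in which every thread runs the same body becomes a CPU function
that iterates over the block and thread indices explicitly:

```cpp
// CUDA (SPMD): one logical instance of the body per thread.
// __global__ void vecadd(const int *a, const int *b, int *c, int n) {
//   int i = blockIdx.x * blockDim.x + threadIdx.x;
//   if (i < n) c[i] = a[i] + b[i];
// }

// CPU (MPMD+SIMD), conceptually what the generated IR computes:
void vecadd_cpu(const int *a, const int *b, int *c, int n,
                int grid_size, int block_size) {
  for (int block_index = 0; block_index < grid_size; ++block_index) {
    // The inner loop over threads is a natural target for vectorization.
    for (int thread_index = 0; thread_index < block_size; ++thread_index) {
      int i = block_index * block_size + thread_index;
      if (i < n)
        c[i] = a[i] + b[i];
    }
  }
}
```

Barriers (`__syncthreads` and the warp-level sync intrinsics) require splitting the
kernel body into regions and wrapping each region in such loops, which is what the
`insert_sync` and `insert_warp_loop` passes in this commit take care of.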

## Install

### Prerequisites

* Linux: verified on Ubuntu 18.04
* LLVM 10.0
* NVIDIA CUDA toolkit
* x86 CPU
* pthread
* GCC 7.5.0

### Installation

1. Clone from GitHub

```bash
git clone https://github.com/drcut/open_source_template
cd open_source_template
```

2. Build the translator from NVVM IR to LLVM IR for X86

```bash
mkdir build && cd build
cmake .. -DLLVM_CONFIG_PATH=`which llvm-config` # needs the path to llvm-config
make
```

## Run the vecadd sample

```bash
# Generate bitcode from human-readable LLVM IR
llvm-as ../compilation/examples/vecadd/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
# Use LLVM passes to transform NVVM IR (SPMD) to LLVM IR (MPMD+SIMD).
# NOTE: the six trailing arguments are the grid size (1, 1, 1) and the
# block size (32, 1, 1), which are hard-coded into the generated LLVM IR
./compilation/kernelTranslator \
    ../compilation/examples/vecadd/kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc \
    kernel.bc 1 1 1 32 1 1
# Generate an object file from the LLVM IR
llc --filetype=obj kernel.bc
# Link the generated kernel function
# with the host function and generate the executable file
g++ ../compilation/examples/vecadd/host.cpp \
    kernel.o -lpthread -o vecadd_example
# Execute the executable file
./vecadd_example
```

## Author

[Ruobing Han](https://drcut.github.io/) is a CS PhD student at the
Georgia Institute of Technology, under the supervision
of Prof. [Hyesoon Kim](https://www.cc.gatech.edu/~hyesoon/).

@@ -0,0 +1,23 @@
cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
project(
  NVVM2X86
  DESCRIPTION "Translate NVVM IR to LLVM IR for X86"
  LANGUAGES CXX)

set(CMAKE_VERBOSE_MAKEFILE ON)

# compile kernel translator
include_directories(./KernelTranslation/include)
add_subdirectory(KernelTranslation)

add_executable(kernelTranslator KernelTranslation.cpp)
target_link_libraries(kernelTranslator spmd2mpmd ${GCC_COVERAGE_LINK_FLAGS})

# compile host translator
include_directories(./HostTranslation/include)
add_subdirectory(HostTranslation)

add_executable(hostTranslator HostTranslation.cpp)

target_link_libraries(hostTranslator spmd2mpmd cudaRuntime2cpuRuntime
                      ${GCC_COVERAGE_LINK_FLAGS})

@@ -0,0 +1,25 @@
#include "ReplaceKernelLaunch.h"
#include "tool.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include <assert.h>
#include <iostream>
#include <stdlib.h>

using namespace llvm;

int main(int argc, char **argv) {
  assert(argc == 3 && "incorrect number of arguments\n");

  char *input_host_path = argv[1];
  char *output_host_path = argv[2];

  // load the host-side LLVM module
  llvm::Module *hostModule = LoadModuleFromFilr(input_host_path);
  VerifyModule(hostModule);
  // process the host module: replace CUDA kernel launch targets
  ReplaceKernelLaunch(hostModule);
  VerifyModule(hostModule);
  DumpModule(hostModule, output_host_path);
  return 0;
}

@@ -0,0 +1,21 @@
cmake_minimum_required(VERSION 3.1)

# C++ project
project(
  HostTranslation
  DESCRIPTION "Translate CUDA host modules to CPU host modules,
  mainly replace CUDA Runtime APIs with CPU Runtime APIs"
  LANGUAGES CXX)

set(CMAKE_VERBOSE_MAKEFILE ON)
set(LIB_NAME cudaRuntime2cpuRuntime)

set(CMAKE_CXX_STANDARD 14)
set(CMAKE_BUILD_TYPE Debug)
include_directories(./include)

file(GLOB proj_HEADERS "include/*.h")
file(GLOB proj_SOURCES "lib/*.cpp")

# Add core library.
add_library(${LIB_NAME} SHARED ${proj_HEADERS} ${proj_SOURCES})

@@ -0,0 +1,11 @@
#ifndef __NVVM2x86_REPLACE_KERNEL_LAUNCH__
#define __NVVM2x86_REPLACE_KERNEL_LAUNCH__

#include "llvm/IR/Module.h"
/*
 * Change to: i8* bitcast (i8* (i8*)* @_Z9vecPKiS0_Pii_wrapper to i8*)
 * Original:  i8* bitcast (void (i32*, i32*, i32*, i32)* @_Z9vecPKiS0_Pii to i8*)
 */
void ReplaceKernelLaunch(llvm::Module *M);

#endif

@ -0,0 +1,94 @@
|
||||||
|
#include "ReplaceKernelLaunch.h"
|
||||||
|
#include "llvm/IR/Function.h"
|
||||||
|
#include "llvm/IR/GlobalValue.h"
|
||||||
|
#include "llvm/IR/IRBuilder.h"
|
||||||
|
#include "llvm/IR/Instructions.h"
|
||||||
|
#include "llvm/IR/LLVMContext.h"
|
||||||
|
#include "llvm/IR/Module.h"
|
||||||
|
#include <iostream>
|
||||||
|
#include <map>
|
||||||
|
#include <set>
|
||||||
|
|
||||||
|
using namespace llvm;
|
||||||
|
|
||||||
|
// Change to i8* bitcast (i8* (i8*)* @_Z9vecPKiS0_Pii_wrapper to i8*)
|
||||||
|
// Original: i8* bitcast (void (i32*, i32*, i32*, i32)* @_Z9vecPKiS0_Pii to i8*)
|
||||||
|
void ReplaceKernelLaunch(llvm::Module *M) {
|
||||||
|
LLVMContext &context = M->getContext();
|
||||||
|
auto VoidTy = llvm::Type::getVoidTy(context);
|
||||||
|
auto I8 = llvm::Type::getInt8PtrTy(context);
|
||||||
|
std::map<std::string, BitCastInst *> kernels;
|
||||||
|
|
||||||
|
LLVMContext *C = &M->getContext();
|
||||||
|
|
||||||
|
llvm::Type *Int32T = Type::getInt32Ty(*C);
|
||||||
|
llvm::Type *Int8T = Type::getInt8Ty(*C);
|
||||||
|
|
||||||
|
llvm::FunctionType *LauncherFuncT =
|
||||||
|
FunctionType::get(Type::getVoidTy(*C), NULL);
|
||||||
|
|
||||||
|
llvm::FunctionType *LaunchFun2 =
|
||||||
|
FunctionType::get(PointerType::get(PointerType::get(Int32T, 0), 0), NULL);
|
||||||
|
|
||||||
|
bool done = false;
|
||||||
|
|
||||||
|
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||||
|
Function *F = &(*i);
|
||||||
|
auto func_name = F->getName().str();
|
||||||
|
|
||||||
|
for (Function::iterator b = F->begin(); b != F->end(); ++b) {
|
||||||
|
BasicBlock *B = &(*b);
|
||||||
|
|
||||||
|
for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) {
|
||||||
|
Instruction *inst = &(*i);
|
||||||
|
|
||||||
|
if (llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(inst)) {
|
||||||
|
if (Function *calledFunction = callInst->getCalledFunction()) {
|
||||||
|
|
||||||
|
if (calledFunction->getName().startswith("cudaLaunchKernel")) {
|
||||||
|
|
||||||
|
Value *callOperand = callInst->getArgOperand(0);
|
||||||
|
|
||||||
|
Function *functionOperand =
|
||||||
|
dyn_cast<Function>(callInst->getArgOperand(0));
|
||||||
|
|
||||||
|
// the kernel function operand is wrapped in a bitcast
|
||||||
|
if (functionOperand == NULL) {
|
||||||
|
|
||||||
|
std::vector<size_t> arg_sizes;
|
||||||
|
functionOperand =
|
||||||
|
dyn_cast<Function>(callOperand->stripPointerCasts());
|
||||||
|
|
||||||
|
FunctionType *ft = calledFunction->getFunctionType();
|
||||||
|
std::cout << " Parent (Caller) Function Name: " << func_name
|
||||||
|
<< ", cudaLaunchKernel Function: "
|
||||||
|
<< functionOperand->getName().str() << ", args "
|
||||||
|
<< functionOperand->arg_size() << std::endl;
|
||||||
|
auto rep = kernels.find(functionOperand->getName().str());
|
||||||
|
if (rep != kernels.end()) {
|
||||||
|
|
||||||
|
callInst->setArgOperand(0, rep->second);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<Type *> Params;
|
||||||
|
Params.push_back(I8);
|
||||||
|
FunctionType *FT = FunctionType::get(VoidTy, Params, false);
|
||||||
|
std::string newName =
|
||||||
|
functionOperand->getName().str() + "_wrapper";
|
||||||
|
|
||||||
|
Function *F =
|
||||||
|
Function::Create(FT, Function::ExternalLinkage, newName, M);
|
||||||
|
F->setDSOLocal(true);
|
||||||
|
|
||||||
|
BitCastInst *BC = new BitCastInst(F, I8, "", callInst);
|
||||||
|
callInst->setArgOperand(0, BC);
|
||||||
|
kernels.insert({functionOperand->getName().str(), BC});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@@ -0,0 +1,53 @@
#include "generate_x86_format.h"
#include "handle_sync.h"
#include "init.h"
#include "insert_sync.h"
#include "insert_warp_loop.h"
#include "performance.h"
#include "tool.h"
#include "warp_func.h"
#include "llvm/IR/Module.h"
#include <assert.h>
#include <iostream>
#include <map>
#include <set>
#include <stdlib.h>

using namespace llvm;

int main(int argc, char **argv) {
  assert(argc == 9 && "incorrect number of arguments\n");
  llvm::Module *program = LoadModuleFromFilr(argv[1]);
  // get the grid and block sizes from the input arguments
  int *grid_dim = new int[3];
  int *block_dim = new int[3];
  grid_dim[0] = atoi(argv[3]);
  grid_dim[1] = atoi(argv[4]);
  grid_dim[2] = atoi(argv[5]);
  block_dim[0] = atoi(argv[6]);
  block_dim[1] = atoi(argv[7]);
  block_dim[2] = atoi(argv[8]);

  // inline, and create auxiliary global variables
  init_block(program);
  // insert a sync before each vote, and replace the
  // original vote function with a warp vote
  handle_warp_vote(program);
  // replace warp shuffle
  handle_warp_shfl(program);
  // insert sync
  insert_sync(program);
  // split blocks by sync
  split_block_by_sync(program);
  // add loops for intra- & inter-warp threads
  insert_warp_loop(program);
  // (TODO): replace this patch
  replace_built_in_function(program, grid_dim, block_dim);
  // VerifyModule(program);
  generate_x86_format(program);
  // performance optimization
  performance_optimization(program);

  DumpModule(program, argv[2]);
  return 0;
}

@@ -0,0 +1,21 @@
cmake_minimum_required(VERSION 3.1)

# C++ project
project(
  KernelTranslation
  DESCRIPTION
    "Translate SPMD Kernel to MPMD format with hierarchical collapsing"
  LANGUAGES CXX)

set(CMAKE_VERBOSE_MAKEFILE ON)
set(LIB_NAME spmd2mpmd)

set(CMAKE_CXX_STANDARD 14)
set(CMAKE_BUILD_TYPE Debug)
include_directories(./include)

file(GLOB proj_HEADERS "include/*.h")
file(GLOB proj_SOURCES "lib/*.cpp")

# Add core library.
add_library(${LIB_NAME} SHARED ${proj_HEADERS} ${proj_SOURCES})

@@ -0,0 +1,8 @@
#ifndef __NVVM2x86_GENERATE_X86_FORMAT__
#define __NVVM2x86_GENERATE_X86_FORMAT__

#include "llvm/IR/Module.h"

void generate_x86_format(llvm::Module *M);

#endif

@@ -0,0 +1,10 @@
#ifndef __NVVM2x86_HANDLE_SYNC__
#define __NVVM2x86_HANDLE_SYNC__

#include "llvm/IR/Module.h"

using namespace llvm;

void split_block_by_sync(llvm::Module *M);

#endif

@@ -0,0 +1,7 @@
#ifndef __NVVM2x86_INIT__
#define __NVVM2x86_INIT__

#include "llvm/IR/Module.h"

void init_block(llvm::Module *M);
#endif

@@ -0,0 +1,9 @@
#ifndef __NVVM2x86_INSERT_SYNC__
#define __NVVM2x86_INSERT_SYNC__

#include "llvm/IR/Function.h"

// insert extra barriers
void insert_sync(llvm::Module *M);

#endif

@@ -0,0 +1,12 @@
#ifndef __NVVM2x86_INSERT_WARP_LOOP__
#define __NVVM2x86_INSERT_WARP_LOOP__

#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"

using namespace llvm;

void insert_warp_loop(llvm::Module *M);

#endif

@@ -0,0 +1,9 @@
#ifndef __NVVM2x86_MEMORY_HIERARCHY__
#define __NVVM2x86_MEMORY_HIERARCHY__
#include "llvm/IR/Module.h"

using namespace llvm;

void mem_share2global(llvm::Module *M);

#endif

@@ -0,0 +1,7 @@
#ifndef __NVVM2x86_PERFORMANCE__
#define __NVVM2x86_PERFORMANCE__

#include "llvm/IR/Module.h"

void performance_optimization(llvm::Module *M);
#endif

@@ -0,0 +1,24 @@
#ifndef __NVVM2x86_TOOL__
#define __NVVM2x86_TOOL__

#include "llvm/IR/Module.h"
llvm::Module *LoadModuleFromFilr(char *file_name);
void DumpModule(llvm::Module *M, char *file_name);
bool isKernelFunction(llvm::Module *M, llvm::Function *F);
void replace_block(llvm::Function *F, llvm::BasicBlock *before,
                   llvm::BasicBlock *after);
llvm::CallInst *CreateInterWarpBarrier(llvm::Instruction *InsertBefore);
llvm::CallInst *CreateIntraWarpBarrier(llvm::Instruction *InsertBefore);
void VerifyModule(llvm::Module *);
void phi2alloc(llvm::Module *M);
void remove_cuda_built_in(llvm::Module *M);
void replace_built_in_function(llvm::Module *M, int *grid_dim, int *block_dim);
void replace_asm_call(llvm::Module *M);
bool find_block_barrier_in_region(llvm::BasicBlock *start,
                                  llvm::BasicBlock *end);
bool find_barrier_in_region(llvm::BasicBlock *start, llvm::BasicBlock *end);
bool has_warp_barrier(llvm::BasicBlock *B);
bool has_barrier(llvm::BasicBlock *B);
bool has_block_barrier(llvm::BasicBlock *B);
bool has_barrier(llvm::Function *F);
#endif

@@ -0,0 +1,10 @@
#ifndef __NVVM2x86_WARP_FUNC__
#define __NVVM2x86_WARP_FUNC__

#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"

void handle_warp_vote(llvm::Module *M);
void handle_warp_shfl(llvm::Module *M);
#endif

@ -0,0 +1,119 @@
|
||||||
|
#include "generate_x86_format.h"
|
||||||
|
#include "tool.h"
|
||||||
|
#include "llvm/Analysis/TargetLibraryInfo.h"
|
||||||
|
#include "llvm/Analysis/TargetTransformInfo.h"
|
||||||
|
#include "llvm/IR/Function.h"
|
||||||
|
#include "llvm/IR/GlobalValue.h"
|
||||||
|
#include "llvm/IR/IRBuilder.h"
|
||||||
|
#include "llvm/IR/Instructions.h"
|
||||||
|
#include "llvm/IR/LLVMContext.h"
|
||||||
|
#include "llvm/IR/LegacyPassManager.h"
|
||||||
|
#include "llvm/IR/Module.h"
|
||||||
|
#include "llvm/IR/Verifier.h"
|
||||||
|
#include "llvm/IRReader/IRReader.h"
|
||||||
|
#include "llvm/Support/TargetRegistry.h"
|
||||||
|
#include "llvm/Support/TargetSelect.h"
|
||||||
|
#include "llvm/Support/ToolOutputFile.h"
|
||||||
|
#include "llvm/Target/TargetMachine.h"
|
||||||
|
#include "llvm/Target/TargetOptions.h"
|
||||||
|
#include "llvm/Transforms/Utils/Cloning.h"
|
||||||
|
#include "llvm/Transforms/Utils/ValueMapper.h"
|
||||||
|
|
||||||
|
using namespace llvm;
|
||||||
|
|
||||||
|
void set_meta_data(llvm::Module *M) {
|
||||||
|
M->setTargetTriple("x86_64-unknown-linux-gnu");
|
||||||
|
M->setDataLayout(
|
||||||
|
"e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128");
|
||||||
|
}
|
||||||
|
|
||||||
|
// as pthread only accepts a single void* argument,
// we have to decode this packed argument inside the kernel wrapper
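//
// Illustration only (this launcher is hypothetical, not part of this file):
// the generated "<kernel>_wrapper" expects its single i8* argument to point
// to an array with one entry per kernel parameter, each entry being a
// pointer to that parameter's value. A host-side caller could pack the
// vecadd arguments roughly like this:
//
//   int *a, *b, *c; int n;                  // kernel parameters
//   void *args[4] = {&a, &b, &c, &n};       // one pointer per parameter
//   _Z9vecPKiS0_Pii_wrapper((char *)args);  // unpacked by the GEP/load
//                                           // sequence emitted below
//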
|
||||||
|
void decode_input(llvm::Module *M) {
|
||||||
|
|
||||||
|
std::set<llvm::Function *> need_remove;
|
||||||
|
|
||||||
|
LLVMContext *C = &M->getContext();
|
||||||
|
llvm::Type *Int32T = Type::getInt32Ty(*C);
|
||||||
|
llvm::Type *Int8T = Type::getInt8Ty(*C);
|
||||||
|
|
||||||
|
llvm::FunctionType *LauncherFuncT = FunctionType::get(
|
||||||
|
Type::getVoidTy(*C), {PointerType::get(Int8T, 0)}, false);
|
||||||
|
|
||||||
|
// generate Wrapper Function type
|
||||||
|
// now we only support a single int32*
|
||||||
|
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||||
|
Function *F = &(*i);
|
||||||
|
if (!isKernelFunction(M, F))
|
||||||
|
continue;
|
||||||
|
auto func_name = F->getName().str();
|
||||||
|
llvm::IRBuilder<> Builder(M->getContext());
|
||||||
|
|
||||||
|
FunctionCallee fc =
|
||||||
|
M->getOrInsertFunction(func_name + "_wrapper", LauncherFuncT);
|
||||||
|
Function *WorkGroup = dyn_cast<Function>(fc.getCallee());
|
||||||
|
|
||||||
|
BasicBlock *Block = BasicBlock::Create(M->getContext(), "", WorkGroup);
|
||||||
|
Builder.SetInsertPoint(Block);
|
||||||
|
|
||||||
|
// WorkGroup has only a single input
|
||||||
|
Function::arg_iterator ai = WorkGroup->arg_begin();
|
||||||
|
|
||||||
|
SmallVector<Value *, 8> Arguments;
|
||||||
|
Value *input_arg = &*ai;
|
||||||
|
// convert to int**
|
||||||
|
input_arg = Builder.CreateBitOrPointerCast(
|
||||||
|
input_arg, PointerType::get(PointerType::get(Int32T, 0), 0));
|
||||||
|
size_t idx = 0;
|
||||||
|
for (Function::const_arg_iterator ii = F->arg_begin(), ee = F->arg_end();
|
||||||
|
ii != ee; ++ii) {
|
||||||
|
Type *ArgType = ii->getType();
|
||||||
|
|
||||||
|
// calculate addr
|
||||||
|
Value *GEP = Builder.CreateGEP(input_arg, ConstantInt::get(Int32T, idx));
|
||||||
|
// load corresponding int*
|
||||||
|
GEP = Builder.CreateLoad(GEP);
|
||||||
|
// bitcast
|
||||||
|
GEP = Builder.CreateBitOrPointerCast(GEP, PointerType::get(ArgType, 0));
|
||||||
|
Value *Arg = Builder.CreateLoad(GEP);
|
||||||
|
Arguments.push_back(Arg);
|
||||||
|
++idx;
|
||||||
|
}
|
||||||
|
|
||||||
|
CallInst *c = Builder.CreateCall(F, ArrayRef<llvm::Value *>(Arguments));
|
||||||
|
Builder.CreateRetVoid();
|
||||||
|
}
|
||||||
|
for (auto f : need_remove) {
|
||||||
|
f->dropAllReferences();
|
||||||
|
f->eraseFromParent();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void remove_barrier(llvm::Module *M) {
|
||||||
|
std::vector<Instruction *> need_remove;
|
||||||
|
for (auto F = M->begin(); F != M->end(); ++F)
|
||||||
|
for (auto BB = F->begin(); BB != F->end(); ++BB) {
|
||||||
|
for (auto BI = BB->begin(); BI != BB->end(); BI++) {
|
||||||
|
if (auto Call = dyn_cast<CallInst>(BI)) {
|
||||||
|
auto func_name = Call->getCalledFunction()->getName().str();
|
||||||
|
if (func_name == "llvm.nvvm.bar.warp.sync" ||
|
||||||
|
func_name == "llvm.nvvm.barrier0" ||
|
||||||
|
func_name == "llvm.nvvm.barrier.sync") {
|
||||||
|
need_remove.push_back(Call);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (auto inst : need_remove) {
|
||||||
|
inst->eraseFromParent();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void generate_x86_format(llvm::Module *M) {
|
||||||
|
// change metadata
|
||||||
|
set_meta_data(M);
|
||||||
|
// decode argument
|
||||||
|
decode_input(M);
|
||||||
|
// remove barrier
|
||||||
|
remove_barrier(M);
|
||||||
|
}
|
|
@ -0,0 +1,57 @@
|
||||||
|
#include "handle_sync.h"
|
||||||
|
#include "tool.h"
|
||||||
|
#include "llvm/IR/Function.h"
|
||||||
|
#include "llvm/IR/GlobalValue.h"
|
||||||
|
#include "llvm/IR/IRBuilder.h"
|
||||||
|
#include "llvm/IR/InlineAsm.h"
|
||||||
|
#include "llvm/IR/Instructions.h"
|
||||||
|
#include "llvm/IR/LLVMContext.h"
|
||||||
|
#include "llvm/IR/Module.h"
|
||||||
|
#include <set>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
using namespace llvm;
|
||||||
|
|
||||||
|
void split_block_by_sync(llvm::Function *F) {
|
||||||
|
std::set<llvm::Instruction *> sync_inst;
|
||||||
|
bool jump_first_sync = 1;
|
||||||
|
for (Function::iterator b = F->begin(); b != F->end(); ++b) {
|
||||||
|
BasicBlock *B = &(*b);
|
||||||
|
for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) {
|
||||||
|
Instruction *inst = &(*i);
|
||||||
|
if (jump_first_sync) {
|
||||||
|
jump_first_sync = 0;
|
||||||
|
Instruction *next_inst = &(*std::next(i));
|
||||||
|
sync_inst.insert(next_inst);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
llvm::CallInst *Call = llvm::dyn_cast<llvm::CallInst>(inst);
|
||||||
|
if (Call) {
|
||||||
|
auto func_name = Call->getCalledFunction()->getName().str();
|
||||||
|
if (func_name == "llvm.nvvm.barrier0" ||
|
||||||
|
func_name == "llvm.nvvm.bar.warp.sync" ||
|
||||||
|
func_name == "llvm.nvvm.barrier.sync") {
|
||||||
|
sync_inst.insert(Call);
|
||||||
|
// we should also sync the next instruction
|
||||||
|
// so that we can get a block with sync inst only
|
||||||
|
Instruction *next_inst = &(*std::next(i));
|
||||||
|
sync_inst.insert(next_inst);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
int _tmp = 0;
|
||||||
|
for (auto inst : sync_inst) {
|
||||||
|
inst->getParent()->splitBasicBlock(
|
||||||
|
inst, inst->getParent()->getName().str() + "_after_block_sync_" +
|
||||||
|
std::to_string(_tmp++));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void split_block_by_sync(llvm::Module *M) {
|
||||||
|
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||||
|
Function *F = &(*i);
|
||||||
|
if (isKernelFunction(M, F))
|
||||||
|
split_block_by_sync(F);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,302 @@
|
||||||
|
#include "init.h"
|
||||||
|
#include "memory_hierarchy.h"
|
||||||
|
#include "tool.h"
|
||||||
|
#include <iostream>
|
||||||
|
#include <set>
|
||||||
|
|
||||||
|
#include "llvm/IR/Function.h"
|
||||||
|
#include "llvm/IR/GlobalValue.h"
|
||||||
|
#include "llvm/IR/IRBuilder.h"
|
||||||
|
#include "llvm/IR/InlineAsm.h"
|
||||||
|
#include "llvm/IR/Instructions.h"
|
||||||
|
#include "llvm/IR/LLVMContext.h"
|
||||||
|
#include "llvm/IR/LegacyPassManager.h"
|
||||||
|
#include "llvm/IR/Module.h"
|
||||||
|
#include "llvm/InitializePasses.h"
|
||||||
|
#include "llvm/PassInfo.h"
|
||||||
|
#include "llvm/PassRegistry.h"
|
||||||
|
#include "llvm/Support/CommandLine.h"
|
||||||
|
#include "llvm/Support/TargetSelect.h"
|
||||||
|
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
|
||||||
|
#include "llvm/Transforms/Utils/Cloning.h"
|
||||||
|
#include "llvm/Transforms/Utils/ValueMapper.h"
|
||||||
|
|
||||||
|
using namespace llvm;
|
||||||
|
|
||||||
|
void inline_func_vote(llvm::Module *M) {
|
||||||
|
std::set<llvm::Function *> need_remove;
|
||||||
|
|
||||||
|
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||||
|
Function *F = &(*i);
|
||||||
|
auto func_name = F->getName().str();
|
||||||
|
if (!isKernelFunction(M, F))
|
||||||
|
continue;
|
||||||
|
Function::iterator I = F->begin();
|
||||||
|
for (Function::iterator E = F->end(); I != E; ++I) {
|
||||||
|
for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
|
||||||
|
if (CallInst *c = dyn_cast<CallInst>(BI++)) {
|
||||||
|
if (c->getCalledFunction()) {
|
||||||
|
if (c->getCalledFunction()->getName().str() == "_Z10__any_syncji") {
|
||||||
|
InlineFunctionInfo IFI;
|
||||||
|
InlineFunction(c, IFI);
|
||||||
|
need_remove.insert(c->getCalledFunction());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (auto f : need_remove) {
|
||||||
|
f->dropAllReferences();
|
||||||
|
f->eraseFromParent();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void create_global_variable(llvm::Module *M) {
|
||||||
|
llvm::Type *I32 = llvm::Type::getInt32Ty(M->getContext());
|
||||||
|
llvm::Type *I8 = llvm::Type::getInt8Ty(M->getContext());
|
||||||
|
|
||||||
|
auto zero = llvm::ConstantInt::get(I32, 0, true);
|
||||||
|
|
||||||
|
// we need global variable used for warp shuffle
|
||||||
|
llvm::Type *WarpArrayType = llvm::ArrayType::get(I32, 32);
|
||||||
|
llvm::Type *VoteArrayType = llvm::ArrayType::get(I8, 32);
|
||||||
|
|
||||||
|
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
|
||||||
|
zero, "intra_warp_index", NULL,
|
||||||
|
llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
|
||||||
|
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
|
||||||
|
zero, "inter_warp_index", NULL,
|
||||||
|
llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
|
||||||
|
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
|
||||||
|
NULL, "block_size", NULL,
|
||||||
|
llvm::GlobalValue::NotThreadLocal, 0, false);
|
||||||
|
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
|
||||||
|
NULL, "block_size_x", NULL,
|
||||||
|
llvm::GlobalValue::NotThreadLocal, 0, false);
|
||||||
|
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
|
||||||
|
NULL, "block_size_y", NULL,
|
||||||
|
llvm::GlobalValue::NotThreadLocal, 0, false);
|
||||||
|
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
|
||||||
|
NULL, "block_size_z", NULL,
|
||||||
|
llvm::GlobalValue::NotThreadLocal, 0, false);
|
||||||
|
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
|
||||||
|
NULL, "grid_size", NULL,
|
||||||
|
llvm::GlobalValue::NotThreadLocal, 0, false);
|
||||||
|
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
|
||||||
|
NULL, "block_index", NULL,
|
||||||
|
llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
|
||||||
|
// TLS variable used for warp-level collective operators
|
||||||
|
new llvm::GlobalVariable(
|
||||||
|
*M, WarpArrayType, false, llvm::GlobalValue::ExternalLinkage, NULL,
|
||||||
|
"warp_shfl", NULL, llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
|
||||||
|
auto warp_vote = new llvm::GlobalVariable(
|
||||||
|
*M, VoteArrayType, false, llvm::GlobalValue::ExternalLinkage, NULL,
|
||||||
|
"warp_vote", NULL, llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
|
||||||
|
warp_vote->setAlignment(llvm::MaybeAlign(32));
|
||||||
|
}
|
||||||
|
|
||||||
|
void remove_metadata(llvm::Module *M) {
|
||||||
|
SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
|
||||||
|
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||||
|
Function *F = &(*i);
|
||||||
|
F->getAllMetadata(MDs);
|
||||||
|
for (auto &MD : MDs) {
|
||||||
|
F->setMetadata(MD.first, NULL);
|
||||||
|
}
|
||||||
|
F->removeFnAttr("target-features");
|
||||||
|
F->removeFnAttr("target-cpu");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void init_llvm_pass() {
|
||||||
|
|
||||||
|
InitializeAllTargets();
|
||||||
|
InitializeAllTargetMCs();
|
||||||
|
InitializeAllAsmPrinters();
|
||||||
|
InitializeAllAsmParsers();
|
||||||
|
|
||||||
|
PassRegistry &Registry = *PassRegistry::getPassRegistry();
|
||||||
|
|
||||||
|
initializeCore(Registry);
|
||||||
|
initializeScalarOpts(Registry);
|
||||||
|
initializeVectorization(Registry);
|
||||||
|
initializeIPO(Registry);
|
||||||
|
initializeAnalysis(Registry);
|
||||||
|
initializeTransformUtils(Registry);
|
||||||
|
initializeInstCombine(Registry);
|
||||||
|
initializeInstrumentation(Registry);
|
||||||
|
initializeTarget(Registry);
|
||||||
|
|
||||||
|
llvm::StringMap<llvm::cl::Option *> &opts = llvm::cl::getRegisteredOptions();
|
||||||
|
|
||||||
|
llvm::cl::Option *O = nullptr;
|
||||||
|
|
||||||
|
O = opts["scalarize-load-store"];
|
||||||
|
assert(O && "could not find LLVM option 'scalarize-load-store'");
|
||||||
|
O->addOccurrence(1, StringRef("scalarize-load-store"), StringRef("1"), false);
|
||||||
|
|
||||||
|
// The LLVM inner loop vectorizer does not check whether the loop is inside
// another loop, in which case even small trip count loops might be
// worthwhile to vectorize.
|
||||||
|
O = opts["vectorizer-min-trip-count"];
|
||||||
|
assert(O && "could not find LLVM option 'vectorizer-min-trip-count'");
|
||||||
|
O->addOccurrence(1, StringRef("vectorizer-min-trip-count"), StringRef("2"),
|
||||||
|
false);
|
||||||
|
|
||||||
|
// Disable jump threading optimization with following two options from
|
||||||
|
// duplicating blocks. Using jump threading will mess up parallel region
|
||||||
|
// construction especially when kernel contains barriers.
|
||||||
|
// TODO: If enabled then parallel region construction code needs
|
||||||
|
// improvements and make sure it doesn't disallow other optimizations like
|
||||||
|
// vectorization.
|
||||||
|
O = opts["jump-threading-threshold"];
|
||||||
|
assert(O && "could not find LLVM option 'jump-threading-threshold'");
|
||||||
|
O->addOccurrence(1, StringRef("jump-threading-threshold"), StringRef("0"),
|
||||||
|
false);
|
||||||
|
O = opts["jump-threading-implication-search-threshold"];
|
||||||
|
assert(O && "could not find LLVM option "
|
||||||
|
"'jump-threading-implication-search-threshold'");
|
||||||
|
O->addOccurrence(1, StringRef("jump-threading-implication-search-threshold"),
|
||||||
|
StringRef("0"), false);
|
||||||
|
|
||||||
|
// Enable diagnostics from the loop vectorizer.
|
||||||
|
O = opts["pass-remarks-missed"];
|
||||||
|
assert(O && "could not find LLVM option 'pass-remarks-missed'");
|
||||||
|
O->addOccurrence(1, StringRef("pass-remarks-missed"),
|
||||||
|
StringRef("loop-vectorize"), false);
|
||||||
|
O->addOccurrence(1, StringRef("pass-remarks-missed"),
|
||||||
|
StringRef("slp-vectorize"), false);
|
||||||
|
|
||||||
|
O = opts["pass-remarks-analysis"];
|
||||||
|
assert(O && "could not find LLVM option 'pass-remarks-analysis'");
|
||||||
|
O->addOccurrence(1, StringRef("pass-remarks-analysis"),
|
||||||
|
StringRef("loop-vectorize"), false);
|
||||||
|
O->addOccurrence(1, StringRef("pass-remarks-analysis"),
|
||||||
|
StringRef("slp-vectorize"), false);
|
||||||
|
|
||||||
|
O = opts["pass-remarks"];
|
||||||
|
assert(O && "could not find LLVM option 'pass-remarks'");
|
||||||
|
O->addOccurrence(1, StringRef("pass-remarks"), StringRef("loop-vectorize"),
|
||||||
|
false);
|
||||||
|
O->addOccurrence(1, StringRef("pass-remarks"), StringRef("slp-vectorize"),
|
||||||
|
false);
|
||||||
|
}
|
||||||
|
|
||||||
|
void llvm_preprocess(llvm::Module *M) {
|
||||||
|
init_llvm_pass();
|
||||||
|
|
||||||
|
auto Registry = PassRegistry::getPassRegistry();
|
||||||
|
|
||||||
|
llvm::legacy::PassManager Passes;
|
||||||
|
|
||||||
|
std::vector<std::string> passes;
|
||||||
|
passes.push_back("lowerswitch");
|
||||||
|
passes.push_back("mem2reg");
|
||||||
|
passes.push_back("simplifycfg");
|
||||||
|
passes.push_back("loop-simplify");
|
||||||
|
for (auto pass : passes) {
|
||||||
|
const PassInfo *PIs = Registry->getPassInfo(StringRef(pass));
|
||||||
|
if (PIs) {
|
||||||
|
Pass *thispass = PIs->createPass();
|
||||||
|
Passes.add(thispass);
|
||||||
|
} else {
|
||||||
|
printf("Pass: %s not found\n", pass.c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Passes.run(*M);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool lower_constant_expr(llvm::Module *M) {
|
||||||
|
bool modified = false;
|
||||||
|
LLVMContext &context = M->getContext();
|
||||||
|
auto I32 = llvm::Type::getInt32Ty(context);
|
||||||
|
std::vector<CallInst *> need_remove;
|
||||||
|
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||||
|
Function *F = &(*i);
|
||||||
|
auto func_name = F->getName().str();
|
||||||
|
if (!isKernelFunction(M, F))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
for (auto BB = F->begin(); BB != F->end(); ++BB) {
|
||||||
|
for (auto BI = BB->begin(); BI != BB->end(); BI++) {
|
||||||
|
if (auto load_inst = dyn_cast<llvm::LoadInst>(BI)) {
|
||||||
|
auto load_from = load_inst->getOperand(0);
|
||||||
|
if (auto get_element_ptr = dyn_cast<llvm::ConstantExpr>(load_from)) {
|
||||||
|
modified = true;
|
||||||
|
auto ReplInst = get_element_ptr->getAsInstruction();
|
||||||
|
ReplInst->insertBefore(load_inst);
|
||||||
|
std::vector<Instruction *> Users;
|
||||||
|
// Do not replace use during iteration of use. Do it in another loop
|
||||||
|
for (auto U : get_element_ptr->users()) {
|
||||||
|
if (auto InstUser = dyn_cast<Instruction>(U)) {
|
||||||
|
Users.push_back(InstUser);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (auto &User : Users)
|
||||||
|
User->replaceUsesOfWith(get_element_ptr, ReplInst);
|
||||||
|
}
|
||||||
|
} else if (auto store_inst = dyn_cast<llvm::StoreInst>(BI)) {
|
||||||
|
auto store_to = store_inst->getOperand(1);
|
||||||
|
if (auto addr_cast = dyn_cast<llvm::ConstantExpr>(store_to)) {
|
||||||
|
modified = true;
|
||||||
|
auto ReplInst = addr_cast->getAsInstruction();
|
||||||
|
ReplInst->insertBefore(store_inst);
|
||||||
|
std::vector<Instruction *> Users;
|
||||||
|
// Do not replace use during iteration of use. Do it in another loop
|
||||||
|
for (auto U : addr_cast->users()) {
|
||||||
|
if (auto InstUser = dyn_cast<Instruction>(U)) {
|
||||||
|
Users.push_back(InstUser);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (auto &User : Users)
|
||||||
|
User->replaceUsesOfWith(addr_cast, ReplInst);
|
||||||
|
}
|
||||||
|
} else if (auto get_element_ptr =
|
||||||
|
dyn_cast<llvm::GetElementPtrInst>(BI)) {
|
||||||
|
auto get_from = get_element_ptr->getOperand(0);
|
||||||
|
if (auto addr_cast = dyn_cast<llvm::ConstantExpr>(get_from)) {
|
||||||
|
modified = true;
|
||||||
|
auto ReplInst = addr_cast->getAsInstruction();
|
||||||
|
ReplInst->insertBefore(get_element_ptr);
|
||||||
|
std::vector<Instruction *> Users;
|
||||||
|
// Do not replace use during iteration of use. Do it in another loop
|
||||||
|
for (auto U : addr_cast->users()) {
|
||||||
|
if (auto InstUser = dyn_cast<Instruction>(U)) {
|
||||||
|
Users.push_back(InstUser);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (auto &User : Users)
|
||||||
|
User->replaceUsesOfWith(addr_cast, ReplInst);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return modified;
|
||||||
|
}
|
||||||
|
|
||||||
|
void init_block(llvm::Module *M) {
|
||||||
|
// using official llvm preprocess
|
||||||
|
llvm_preprocess(M);
|
||||||
|
// remove useless CUDA functions
|
||||||
|
remove_cuda_built_in(M);
|
||||||
|
|
||||||
|
// lower ConstantExpression
|
||||||
|
bool modified;
|
||||||
|
do {
|
||||||
|
modified = lower_constant_expr(M);
|
||||||
|
} while (modified);
|
||||||
|
// remove useless metadata
|
||||||
|
remove_metadata(M);
|
||||||
|
// inline vote function
|
||||||
|
inline_func_vote(M);
|
||||||
|
// create global variable for warp and vote
|
||||||
|
create_global_variable(M);
|
||||||
|
// replace phi with data load
|
||||||
|
phi2alloc(M);
|
||||||
|
// replace share memory
|
||||||
|
mem_share2global(M);
|
||||||
|
// replace asm Inline
|
||||||
|
replace_asm_call(M);
|
||||||
|
}
|
|
@ -0,0 +1,494 @@
|
||||||
|
#include "insert_sync.h"
|
||||||
|
#include "assert.h"
|
||||||
|
#include "handle_sync.h"
|
||||||
|
#include "tool.h"
|
||||||
|
#include "llvm/ADT/SmallVector.h"
|
||||||
|
#include "llvm/ADT/Statistic.h"
|
||||||
|
#include "llvm/Analysis/LoopInfo.h"
|
||||||
|
#include "llvm/Analysis/LoopPass.h"
|
||||||
|
#include "llvm/Analysis/PostDominators.h"
|
||||||
|
#include "llvm/IR/DataLayout.h"
|
||||||
|
#include "llvm/IR/Function.h"
|
||||||
|
#include "llvm/IR/GlobalValue.h"
|
||||||
|
#include "llvm/IR/IRBuilder.h"
|
||||||
|
#include "llvm/IR/InlineAsm.h"
|
||||||
|
#include "llvm/IR/Instructions.h"
|
||||||
|
#include "llvm/IR/LLVMContext.h"
|
||||||
|
#include "llvm/IR/LegacyPassManager.h"
|
||||||
|
#include "llvm/IR/Module.h"
|
||||||
|
#include "llvm/IR/ValueSymbolTable.h"
|
||||||
|
#include "llvm/InitializePasses.h"
|
||||||
|
#include "llvm/PassInfo.h"
|
||||||
|
#include "llvm/PassRegistry.h"
|
||||||
|
#include "llvm/Support/CommandLine.h"
|
||||||
|
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
|
||||||
|
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
|
||||||
|
#include "llvm/Transforms/Utils/Cloning.h"
|
||||||
|
#include "llvm/Transforms/Utils/ValueMapper.h"
|
||||||
|
#include <iostream>
|
||||||
|
#include <queue>
|
||||||
|
|
||||||
|
using namespace llvm;
|
||||||
|
|
||||||
|
class InsertBuiltInBarrier : public llvm::FunctionPass {
|
||||||
|
|
||||||
|
public:
|
||||||
|
static char ID;
|
||||||
|
|
||||||
|
InsertBuiltInBarrier() : FunctionPass(ID) {}
|
||||||
|
|
||||||
|
virtual bool runOnFunction(Function &F) {
|
||||||
|
if (!isKernelFunction(F.getParent(), &F))
|
||||||
|
return 0;
|
||||||
|
std::vector<llvm::Instruction *> insert_intra_warp_sync_before;
|
||||||
|
std::vector<llvm::Instruction *> insert_inter_warp_sync_before;
|
||||||
|
|
||||||
|
// insert sync in the entry
|
||||||
|
BasicBlock *entry = &(*F.begin());
|
||||||
|
for (auto i = entry->begin(); i != entry->end(); i++) {
|
||||||
|
if (!isa<AllocaInst>(i)) {
|
||||||
|
insert_inter_warp_sync_before.push_back(&(*(i)));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (Function::iterator I = F.begin(); I != F.end(); ++I) {
|
||||||
|
BasicBlock::iterator BI = I->begin();
|
||||||
|
|
||||||
|
// insert barrier before return
|
||||||
|
for (; BI != I->end(); BI++) {
|
||||||
|
llvm::ReturnInst *Ret = llvm::dyn_cast<llvm::ReturnInst>(&(*BI));
|
||||||
|
if (Ret) {
|
||||||
|
insert_inter_warp_sync_before.push_back(&(*BI));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (insert_intra_warp_sync_before.empty() &&
|
||||||
|
insert_inter_warp_sync_before.empty())
|
||||||
|
return 0;
|
||||||
|
for (auto inst : insert_intra_warp_sync_before) {
|
||||||
|
CreateIntraWarpBarrier(inst);
|
||||||
|
}
|
||||||
|
for (auto inst : insert_inter_warp_sync_before) {
|
||||||
|
CreateInterWarpBarrier(inst);
|
||||||
|
}
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
class InsertConditionalBarrier : public llvm::FunctionPass {
|
||||||
|
|
||||||
|
public:
|
||||||
|
static char ID;
|
||||||
|
|
||||||
|
InsertConditionalBarrier() : FunctionPass(ID) {}
|
||||||
|
|
||||||
|
virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const {
|
||||||
|
AU.addRequired<PostDominatorTreeWrapperPass>();
|
||||||
|
AU.addPreserved<PostDominatorTreeWrapperPass>();
|
||||||
|
AU.addRequired<DominatorTreeWrapperPass>();
|
||||||
|
AU.addPreserved<DominatorTreeWrapperPass>();
|
||||||
|
}
|
||||||
|
|
||||||
|
BasicBlock *firstNonBackedgePredecessor(llvm::BasicBlock *bb) {
|
||||||
|
|
||||||
|
DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
|
||||||
|
|
||||||
|
pred_iterator I = pred_begin(bb), E = pred_end(bb);
|
||||||
|
if (I == E)
|
||||||
|
return NULL;
|
||||||
|
while (DT->dominates(bb, *I) && I != E)
|
||||||
|
++I;
|
||||||
|
if (I == E)
|
||||||
|
return NULL;
|
||||||
|
else
|
||||||
|
return *I;
|
||||||
|
}
|
||||||
|
|
||||||
|
BasicBlock *firstNonBackedgeSuccessor(llvm::BasicBlock *bb) {
|
||||||
|
DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
|
||||||
|
auto t = bb->getTerminator();
|
||||||
|
assert(t->getNumSuccessors() <= 2);
|
||||||
|
for (unsigned i = 0, e = t->getNumSuccessors(); i != e; ++i) {
|
||||||
|
BasicBlock *successor = t->getSuccessor(i);
|
||||||
|
bool isBackedge = DT->dominates(successor, bb);
|
||||||
|
if (isBackedge)
|
||||||
|
continue;
|
||||||
|
return successor;
|
||||||
|
}
|
||||||
|
    return NULL;
  };
|
||||||
|
|
||||||
|
virtual bool runOnFunction(Function &F) {
|
||||||
|
if (!isKernelFunction(F.getParent(), &F))
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
auto PDT = &getAnalysis<PostDominatorTreeWrapperPass>();
|
||||||
|
|
||||||
|
// first find all conditional barriers
|
||||||
|
std::vector<BasicBlock *> conditionalBarriers;
|
||||||
|
for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i) {
|
||||||
|
BasicBlock *b = &*i;
|
||||||
|
if (!has_barrier(b))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
// Unconditional barrier postdominates the entry node.
|
||||||
|
if (PDT->getPostDomTree().dominates(b, &F.getEntryBlock()))
|
||||||
|
continue;
|
||||||
|
conditionalBarriers.push_back(b);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (conditionalBarriers.size() == 0)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
bool changed = false;
|
||||||
|
|
||||||
|
while (!conditionalBarriers.empty()) {
|
||||||
|
BasicBlock *b = conditionalBarriers.back();
|
||||||
|
conditionalBarriers.pop_back();
|
||||||
|
|
||||||
|
// insert barrier in the start of if-condition
|
||||||
|
|
||||||
|
BasicBlock *pos = b;
|
||||||
|
BasicBlock *pred = firstNonBackedgePredecessor(b);
|
||||||
|
|
||||||
|
while (PDT->getPostDomTree().dominates(b, pred)) {
|
||||||
|
pos = pred;
|
||||||
|
// If our BB post dominates the given block, we know it is not the
|
||||||
|
// branching block that makes the barrier conditional.
|
||||||
|
pred = firstNonBackedgePredecessor(pred);
|
||||||
|
|
||||||
|
if (pred == b)
|
||||||
|
break; // Traced across a loop edge, skip this case.
|
||||||
|
}
|
||||||
|
// we should create warp/block barrier based on the conditional barrier
|
||||||
|
if (has_warp_barrier(b)) {
|
||||||
|
CreateIntraWarpBarrier(pred->getTerminator());
|
||||||
|
} else {
|
||||||
|
CreateInterWarpBarrier(pred->getTerminator());
|
||||||
|
}
|
||||||
|
changed = true;
|
||||||
|
|
||||||
|
// insert barrier in the merge point for then-else branches
|
||||||
|
// also insert barrier at the end of conditional branch
|
||||||
|
DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
|
||||||
|
std::queue<llvm::BasicBlock *> successor_queue;
|
||||||
|
for (int i = 0; i < pred->getTerminator()->getNumSuccessors(); i++) {
|
||||||
|
auto ss = pred->getTerminator()->getSuccessor(i);
|
||||||
|
if (!DT->dominates(ss, pred))
|
||||||
|
successor_queue.push(ss);
|
||||||
|
}
|
||||||
|
std::set<llvm::BasicBlock *> visited;
|
||||||
|
llvm::BasicBlock *merge_point = NULL;
|
||||||
|
while (!successor_queue.empty()) {
|
||||||
|
auto curr = successor_queue.front();
|
||||||
|
successor_queue.pop();
|
||||||
|
if (visited.find(curr) != visited.end())
|
||||||
|
continue;
|
||||||
|
|
||||||
|
visited.insert(curr);
|
||||||
|
if (PDT->getPostDomTree().dominates(curr, pred)) {
|
||||||
|
// find the truly merge point
|
||||||
|
merge_point = curr;
|
||||||
|
if (has_warp_barrier(b)) {
|
||||||
|
CreateIntraWarpBarrier(&(*curr->begin()));
|
||||||
|
for (BasicBlock *Pred : predecessors(curr)) {
|
||||||
|
CreateIntraWarpBarrier(&(*Pred->getTerminator()));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
CreateInterWarpBarrier(&(*curr->begin()));
|
||||||
|
for (BasicBlock *Pred : predecessors(curr)) {
|
||||||
|
CreateInterWarpBarrier(&(*Pred->getTerminator()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
for (int i = 0; i < curr->getTerminator()->getNumSuccessors(); i++) {
|
||||||
|
auto ss = curr->getTerminator()->getSuccessor(i);
|
||||||
|
if (!DT->dominates(ss, curr))
|
||||||
|
successor_queue.push(ss);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assert(merge_point && "do not find merge point\n");
|
||||||
|
changed = true;
|
||||||
|
|
||||||
|
// we may create a new conditional barrier after insert
|
||||||
|
if (!PDT->getPostDomTree().dominates(pred, &F.getEntryBlock()))
|
||||||
|
conditionalBarriers.push_back(pred);
|
||||||
|
|
||||||
|
// find any blocks which are not dominated by the header
// but are post-dominated by the merge point
|
||||||
|
std::queue<llvm::BasicBlock *> if_body;
|
||||||
|
std::set<llvm::BasicBlock *> visited_block;
|
||||||
|
for (int i = 0; i < pred->getTerminator()->getNumSuccessors(); i++) {
|
||||||
|
if_body.push(pred->getTerminator()->getSuccessor(i));
|
||||||
|
}
|
||||||
|
while (!if_body.empty()) {
|
||||||
|
auto curr = if_body.front();
|
||||||
|
if_body.pop();
|
||||||
|
if (visited_block.find(curr) != visited_block.end())
|
||||||
|
continue;
|
||||||
|
visited_block.insert(curr);
|
||||||
|
if (!PDT->getPostDomTree().dominates(merge_point, curr))
|
||||||
|
continue;
|
||||||
|
if (!DT->dominates(pred, curr) &&
|
||||||
|
PDT->getPostDomTree().dominates(merge_point, curr)) {
|
||||||
|
// we should insert barrier at the beginning and
|
||||||
|
// end of its predecessor
|
||||||
|
if (has_warp_barrier(b)) {
|
||||||
|
CreateIntraWarpBarrier(&(*curr->begin()));
|
||||||
|
for (BasicBlock *Pred : predecessors(curr)) {
|
||||||
|
CreateIntraWarpBarrier(&(*Pred->getTerminator()));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
CreateInterWarpBarrier(&(*curr->begin()));
|
||||||
|
for (BasicBlock *Pred : predecessors(curr)) {
|
||||||
|
CreateInterWarpBarrier(&(*Pred->getTerminator()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (int i = 0; i < curr->getTerminator()->getNumSuccessors(); i++) {
|
||||||
|
if_body.push(curr->getTerminator()->getSuccessor(i));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return changed;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
class InsertBarrierForSpecialCase : public llvm::FunctionPass {
|
||||||
|
public:
|
||||||
|
static char ID;
|
||||||
|
|
||||||
|
InsertBarrierForSpecialCase() : FunctionPass(ID) {}
|
||||||
|
|
||||||
|
virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const {
|
||||||
|
AU.addRequired<PostDominatorTreeWrapperPass>();
|
||||||
|
AU.addRequired<DominatorTreeWrapperPass>();
|
||||||
|
}
|
||||||
|
|
||||||
|
virtual bool runOnFunction(Function &F) {
|
||||||
|
if (!isKernelFunction(F.getParent(), &F))
|
||||||
|
return 0;
|
||||||
|
bool changed = false;
|
||||||
|
std::set<BasicBlock *> if_head;
|
||||||
|
// insert an extra block for the following case:
// 1) there is a merge point for an if-else branch,
//    but this merge point has another incoming edge
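//
// Illustrative sketch (comments only, not generated code) of the CFG case
// handled here:
//
//   before:   then   else          after:   then   else
//               \    /                        \    /
//      X ------> merge              X         if_end   <- barrier goes here
//                  |                 \           |
//                 ...                 `------> merge
//
// The then/else edges are redirected to a new `if_end` block that carries
// the barrier and then falls through to the original merge point, while the
// unrelated edge from X still enters `merge` directly.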
|
||||||
|
|
||||||
|
auto PDT = &getAnalysis<PostDominatorTreeWrapperPass>();
|
||||||
|
auto DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
|
||||||
|
|
||||||
|
for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i) {
|
||||||
|
BasicBlock *b = &*i;
|
||||||
|
BasicBlock *merge_point = NULL;
|
||||||
|
if (b->getTerminator()->getNumSuccessors() == 2) {
|
||||||
|
auto b1 = b->getTerminator()->getSuccessor(0);
|
||||||
|
auto b2 = b->getTerminator()->getSuccessor(1);
|
||||||
|
if (PDT->getPostDomTree().dominates(b1, b2)) {
|
||||||
|
merge_point = b1;
|
||||||
|
} else if (PDT->getPostDomTree().dominates(b2, b1)) {
|
||||||
|
merge_point = b2;
|
||||||
|
} else {
|
||||||
|
assert(0 && "find complex if-else branch\n");
|
||||||
|
}
|
||||||
|
std::cout << std::flush;
|
||||||
|
for (BasicBlock *Pred : predecessors(merge_point)) {
|
||||||
|
if (!DT->dominates(b, Pred)) {
|
||||||
|
// we need to insert an extra block to be the merge point
|
||||||
|
// for the if-branch
|
||||||
|
if_head.insert(b);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
auto M = F.getParent();
|
||||||
|
for (auto head : if_head) {
|
||||||
|
assert(head->getTerminator()->getNumSuccessors() == 2);
|
||||||
|
BasicBlock *merge_point = NULL;
|
||||||
|
auto s1 = head->getTerminator()->getSuccessor(0);
|
||||||
|
auto s2 = head->getTerminator()->getSuccessor(1);
|
||||||
|
if (PDT->getPostDomTree().dominates(s1, s2)) {
|
||||||
|
merge_point = s1;
|
||||||
|
} else {
|
||||||
|
merge_point = s2;
|
||||||
|
}
|
||||||
|
if (!find_barrier_in_region(head, merge_point)) {
|
||||||
|
printf("do not need to handle tri-income if: %s\n",
|
||||||
|
merge_point->getName().str().c_str());
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
BasicBlock *Block = BasicBlock::Create(M->getContext(), "if_end", &F);
|
||||||
|
llvm::IRBuilder<> Builder(M->getContext());
|
||||||
|
Builder.SetInsertPoint(Block);
|
||||||
|
auto br_inst = Builder.CreateBr(merge_point);
|
||||||
|
assert(has_barrier(head) && "preheader does not have barrier\n");
|
||||||
|
if (has_warp_barrier(head)) {
|
||||||
|
CreateIntraWarpBarrier(br_inst);
|
||||||
|
} else {
|
||||||
|
CreateInterWarpBarrier(br_inst);
|
||||||
|
}
|
||||||
|
// replace usage in if-branch
|
||||||
|
std::set<Instruction *> need_replace;
|
||||||
|
for (BasicBlock *Pred : predecessors(merge_point)) {
|
||||||
|
if (DT->dominates(head, Pred) && Pred != Block) {
|
||||||
|
need_replace.insert(Pred->getTerminator());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (auto inst : need_replace) {
|
||||||
|
inst->replaceUsesOfWith(merge_point, Block);
|
||||||
|
}
|
||||||
|
changed = 1;
|
||||||
|
}
|
||||||
|
return changed;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
class InsertConditionalForBarrier : public llvm::LoopPass {
|
||||||
|
|
||||||
|
public:
|
||||||
|
static char ID;
|
||||||
|
|
||||||
|
InsertConditionalForBarrier() : LoopPass(ID) {}
|
||||||
|
|
||||||
|
void getAnalysisUsage(AnalysisUsage &AU) const {
|
||||||
|
AU.addRequired<DominatorTreeWrapperPass>();
|
||||||
|
}
|
||||||
|
|
||||||
|
bool runOnLoop(Loop *L, LPPassManager &LPM) {
|
||||||
|
if (!isKernelFunction(L->getHeader()->getParent()->getParent(),
|
||||||
|
L->getHeader()->getParent()))
|
||||||
|
return 0;
|
||||||
|
// check whether this loop has barrier
|
||||||
|
bool is_conditional_loop = 0;
|
||||||
|
bool is_warp = 0;
|
||||||
|
for (Loop::block_iterator i = L->block_begin(), e = L->block_end(); i != e;
|
||||||
|
++i) {
|
||||||
|
for (BasicBlock::iterator j = (*i)->begin(), e = (*i)->end(); j != e;
|
||||||
|
++j) {
|
||||||
|
if (auto Call = dyn_cast<CallInst>(j)) {
|
||||||
|
auto func_name = Call->getCalledFunction()->getName().str();
|
||||||
|
if (func_name == "llvm.nvvm.barrier0" ||
|
||||||
|
func_name == "llvm.nvvm.bar.warp.sync" ||
|
||||||
|
func_name == "llvm.nvvm.barrier.sync") {
|
||||||
|
is_conditional_loop = true;
|
||||||
|
if (func_name == "llvm.nvvm.bar.warp.sync") {
|
||||||
|
is_warp = 1;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!is_conditional_loop)
|
||||||
|
return 0;
|
||||||
|
// insert barrier at the beginning of header
|
||||||
|
// and the end of pre header, so that we can get a
|
||||||
|
// single block connected with latch
|
||||||
|
if (!is_warp) {
|
||||||
|
auto prehead_block = L->getLoopPreheader();
|
||||||
|
CreateInterWarpBarrier(prehead_block->getTerminator());
|
||||||
|
auto header_block = L->getHeader();
|
||||||
|
CreateInterWarpBarrier(&(*header_block->begin()));
|
||||||
|
} else {
|
||||||
|
auto prehead_block = L->getLoopPreheader();
|
||||||
|
CreateIntraWarpBarrier(prehead_block->getTerminator());
|
||||||
|
auto header_block = L->getHeader();
|
||||||
|
CreateIntraWarpBarrier(&(*header_block->begin()));
|
||||||
|
}
|
||||||
|
|
||||||
|
// as we assume all loops are rotated, we have to insert
|
||||||
|
// barrier before the condition jump of the loop exit
|
||||||
|
|
||||||
|
if (auto exit_block = L->getExitingBlock()) {
|
||||||
|
auto conditional_br =
|
||||||
|
dyn_cast<llvm::BranchInst>(exit_block->getTerminator());
|
||||||
|
assert(conditional_br && conditional_br->isConditional());
|
||||||
|
// insert barrier at the beginning of successor of exit
|
||||||
|
if (!is_warp)
|
||||||
|
CreateInterWarpBarrier(conditional_br);
|
||||||
|
else
|
||||||
|
CreateIntraWarpBarrier(conditional_br);
|
||||||
|
} else {
|
||||||
|
// handle break in for-loop
|
||||||
|
printf("loop has multiply exists\n");
|
||||||
|
// this time, we have also insert sync before the for-body
|
||||||
|
auto header_block = L->getHeader();
|
||||||
|
assert(header_block->getTerminator()->getNumSuccessors() == 2 &&
|
||||||
|
"has more than 2 successors of the for-head\n");
|
||||||
|
BasicBlock *for_body = NULL;
|
||||||
|
for (int i = 0; i < header_block->getTerminator()->getNumSuccessors();
|
||||||
|
i++) {
|
||||||
|
auto bb = header_block->getTerminator()->getSuccessor(i);
|
||||||
|
if (L->contains(bb)) {
|
||||||
|
if (is_warp) {
|
||||||
|
CreateIntraWarpBarrier(&(*bb->begin()));
|
||||||
|
} else {
|
||||||
|
CreateInterWarpBarrier(&(*bb->begin()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
SmallVector<llvm::BasicBlock *, 8> ExitingBlocks;
|
||||||
|
|
||||||
|
L->getExitingBlocks(ExitingBlocks);
|
||||||
|
while (!ExitingBlocks.empty()) {
|
||||||
|
auto exit_block = ExitingBlocks.back();
|
||||||
|
ExitingBlocks.pop_back();
|
||||||
|
auto conditional_br =
|
||||||
|
dyn_cast<llvm::BranchInst>(exit_block->getTerminator());
|
||||||
|
assert(conditional_br && conditional_br->isConditional());
|
||||||
|
// insert a barrier right before the conditional branch of the exiting block
if (!is_warp)
|
||||||
|
CreateInterWarpBarrier(conditional_br);
|
||||||
|
else
|
||||||
|
CreateIntraWarpBarrier(conditional_br);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
char InsertBuiltInBarrier::ID = 0;
char InsertConditionalBarrier::ID = 0;
char InsertConditionalForBarrier::ID = 0;
char InsertBarrierForSpecialCase::ID = 0;

namespace {
static RegisterPass<InsertConditionalBarrier>
insert_conditional_barrier("insert-conditional-if-barriers",
"Insert conditional barriers for if body");
static RegisterPass<InsertConditionalForBarrier>
insert_conditional_for_barrier("insert-conditional-for-barriers",
"Insert conditional barriers for for loop");
static RegisterPass<InsertBarrierForSpecialCase>
insert_special_case("insert-special-case-barriers",
"Insert barriers for special cases");
static RegisterPass<InsertBuiltInBarrier>
insert_built_in_barrier("insert-built-in-barriers",
"Insert built in barriers");
} // namespace

void insert_sync(llvm::Module *M) {
auto Registry = PassRegistry::getPassRegistry();

llvm::legacy::PassManager Passes;

std::vector<std::string> passes;
passes.push_back("insert-built-in-barriers");
passes.push_back("insert-conditional-if-barriers");
passes.push_back("insert-conditional-for-barriers");
passes.push_back("insert-special-case-barriers");
for (auto pass : passes) {
const PassInfo *PIs = Registry->getPassInfo(StringRef(pass));
if (PIs) {
Pass *thispass = PIs->createPass();
Passes.add(thispass);
} else {
assert(0 && "Pass not found\n");
}
}
Passes.run(*M);
}
@ -0,0 +1,848 @@
|
||||||
|
|
||||||
|
#include "insert_warp_loop.h"
|
||||||
|
#include "handle_sync.h"
|
||||||
|
#include "tool.h"
|
||||||
|
#include <assert.h>
|
||||||
|
#include <iostream>
|
||||||
|
#include <set>
|
||||||
|
|
||||||
|
#include "llvm/ADT/Statistic.h"
|
||||||
|
#include "llvm/Analysis/LoopInfo.h"
|
||||||
|
#include "llvm/Analysis/LoopPass.h"
|
||||||
|
#include "llvm/Analysis/PostDominators.h"
|
||||||
|
#include "llvm/IR/CFG.h"
|
||||||
|
#include "llvm/IR/DataLayout.h"
|
||||||
|
#include "llvm/IR/Function.h"
|
||||||
|
#include "llvm/IR/GlobalValue.h"
|
||||||
|
#include "llvm/IR/IRBuilder.h"
|
||||||
|
#include "llvm/IR/InlineAsm.h"
|
||||||
|
#include "llvm/IR/Instructions.h"
|
||||||
|
#include "llvm/IR/LLVMContext.h"
|
||||||
|
#include "llvm/IR/LegacyPassManager.h"
|
||||||
|
#include "llvm/IR/Module.h"
|
||||||
|
#include "llvm/IR/ValueSymbolTable.h"
|
||||||
|
#include "llvm/InitializePasses.h"
|
||||||
|
#include "llvm/PassInfo.h"
|
||||||
|
#include "llvm/PassRegistry.h"
|
||||||
|
#include "llvm/Support/CommandLine.h"
|
||||||
|
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
|
||||||
|
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
|
||||||
|
#include "llvm/Transforms/Utils/Cloning.h"
|
||||||
|
#include "llvm/Transforms/Utils/ValueMapper.h"
|
||||||
|
#include <map>
|
||||||
|
#include <set>
|
||||||
|
#include <sstream>
|
||||||
|
#include <tuple>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
using namespace llvm;
|
||||||
|
|
||||||
|
struct ParallelRegion {
std::set<llvm::BasicBlock *> wrapped_block;
llvm::BasicBlock *successor_block;
llvm::BasicBlock *start_block;
llvm::BasicBlock *end_block;
bool inst_in_region(llvm::Instruction *inst) {
for (auto bb : wrapped_block) {
if (inst->getParent()->getName().str() == bb->getName().str())
return true;
}
return false;
}
bool inst_used_in_region(llvm::Instruction *inst) {
for (auto ui = inst->use_begin(); ui != inst->use_end(); ++ui) {
auto *user = dyn_cast<Instruction>(ui->getUser());
if (user == NULL)
continue;
if (inst_in_region(user)) {
return 1;
}
}
return 0;
}
};
|
||||||
|
std::map<llvm::Instruction *, unsigned> tempInstructionIds;
std::map<std::string, llvm::Instruction *> contextArrays;
int tempInstructionIndex = 0;
int need_nested_loop;
|
||||||
|
bool ShouldNotBeContextSaved(llvm::Instruction *instr) {
if (isa<BranchInst>(instr))
return true;

llvm::Module *M = instr->getParent()->getParent()->getParent();
llvm::LoadInst *load = dyn_cast<llvm::LoadInst>(instr);
if (load != NULL) {
auto load_addr = load->getPointerOperand();
if (load_addr == M->getGlobalVariable("intra_warp_index"))
return true;
if (load_addr == M->getGlobalVariable("inter_warp_index"))
return true;
if (load_addr == M->getGlobalVariable("warp_vote"))
return true;
}

// TODO: we should further analyze whether the local variable
// is the same among all threads within a warp
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// generate a counterpart alloca at the beginning of the Function
llvm::Instruction *GetContextArray(llvm::Instruction *instruction,
|
||||||
|
bool intra_warp_loop) {
|
||||||
|
std::ostringstream var;
|
||||||
|
|
||||||
|
if (std::string(instruction->getName().str()) != "") {
|
||||||
|
var << instruction->getName().str();
|
||||||
|
} else if (tempInstructionIds.find(instruction) != tempInstructionIds.end()) {
|
||||||
|
var << tempInstructionIds[instruction];
|
||||||
|
} else {
|
||||||
|
tempInstructionIds[instruction] = tempInstructionIndex++;
|
||||||
|
var << tempInstructionIds[instruction];
|
||||||
|
}
|
||||||
|
if (intra_warp_loop)
|
||||||
|
var << "_intra_warp_";
|
||||||
|
else
|
||||||
|
var << "_inter_warp_";
|
||||||
|
std::string varName = var.str();
|
||||||
|
|
||||||
|
if (contextArrays.find(varName) != contextArrays.end())
|
||||||
|
return contextArrays[varName];
|
||||||
|
|
||||||
|
BasicBlock &bb = instruction->getParent()->getParent()->getEntryBlock();
|
||||||
|
IRBuilder<> builder(&*(bb.getFirstInsertionPt()));
|
||||||
|
Function *FF = instruction->getParent()->getParent();
|
||||||
|
Module *M = instruction->getParent()->getParent()->getParent();
|
||||||
|
LLVMContext &C = M->getContext();
|
||||||
|
const llvm::DataLayout &Layout = M->getDataLayout();
|
||||||
|
|
||||||
|
llvm::Type *elementType;
|
||||||
|
if (isa<AllocaInst>(instruction)) {
|
||||||
|
elementType =
|
||||||
|
dyn_cast<AllocaInst>(instruction)->getType()->getElementType();
|
||||||
|
} else {
|
||||||
|
elementType = instruction->getType();
|
||||||
|
}
|
||||||
|
|
||||||
|
Type *AllocType = elementType;
|
||||||
|
AllocaInst *InstCast = dyn_cast<AllocaInst>(instruction);
|
||||||
|
if (InstCast) {
|
||||||
|
unsigned Alignment = InstCast->getAlignment();
|
||||||
|
|
||||||
|
uint64_t StoreSize = Layout.getTypeStoreSize(InstCast->getAllocatedType());
|
||||||
|
|
||||||
|
if ((Alignment > 1) && (StoreSize & (Alignment - 1))) {
|
||||||
|
uint64_t AlignedSize = (StoreSize & (~(Alignment - 1))) + Alignment;
|
||||||
|
assert(AlignedSize > StoreSize);
|
||||||
|
uint64_t RequiredExtraBytes = AlignedSize - StoreSize;
|
||||||
|
|
||||||
|
if (isa<ArrayType>(elementType)) {
|
||||||
|
|
||||||
|
ArrayType *StructPadding = ArrayType::get(
|
||||||
|
Type::getInt8Ty(M->getContext()), RequiredExtraBytes);
|
||||||
|
|
||||||
|
std::vector<Type *> PaddedStructElements;
|
||||||
|
PaddedStructElements.push_back(elementType);
|
||||||
|
PaddedStructElements.push_back(StructPadding);
|
||||||
|
const ArrayRef<Type *> NewStructElements(PaddedStructElements);
|
||||||
|
AllocType = StructType::get(M->getContext(), NewStructElements, true);
|
||||||
|
uint64_t NewStoreSize = Layout.getTypeStoreSize(AllocType);
|
||||||
|
assert(NewStoreSize == AlignedSize);
|
||||||
|
} else if (isa<StructType>(elementType)) {
|
||||||
|
StructType *OldStruct = dyn_cast<StructType>(elementType);
|
||||||
|
|
||||||
|
ArrayType *StructPadding = ArrayType::get(
|
||||||
|
Type::getInt8Ty(M->getContext()), RequiredExtraBytes);
|
||||||
|
std::vector<Type *> PaddedStructElements;
|
||||||
|
for (unsigned j = 0; j < OldStruct->getNumElements(); j++)
|
||||||
|
PaddedStructElements.push_back(OldStruct->getElementType(j));
|
||||||
|
PaddedStructElements.push_back(StructPadding);
|
||||||
|
const ArrayRef<Type *> NewStructElements(PaddedStructElements);
|
||||||
|
AllocType = StructType::get(OldStruct->getContext(), NewStructElements,
|
||||||
|
OldStruct->isPacked());
|
||||||
|
uint64_t NewStoreSize = Layout.getTypeStoreSize(AllocType);
|
||||||
|
assert(NewStoreSize == AlignedSize);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
llvm::Value *ItemSize = nullptr;
|
||||||
|
llvm::AllocaInst *Alloca = nullptr;
|
||||||
|
|
||||||
|
auto block_size_addr = M->getGlobalVariable("block_size");
|
||||||
|
auto block_size = builder.CreateLoad(block_size_addr);
|
||||||
|
Alloca = builder.CreateAlloca(AllocType, block_size, varName);
|
||||||
|
|
||||||
|
contextArrays[varName] = Alloca;
|
||||||
|
return Alloca;
|
||||||
|
}
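// Illustrative sketch (added for clarity, not in the original code): the
// padding above keeps every per-thread slot of the context array aligned.
// For example, with StoreSize = 12 bytes and Alignment = 16:
//   AlignedSize        = (12 & ~15) + 16 = 16
//   RequiredExtraBytes = 16 - 12         = 4
// so the element type is wrapped as { original, [4 x i8] } and the array
// stride becomes 16 bytes per thread.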
|
||||||
|
|
||||||
|
// save the local variable into replicated array
|
||||||
|
llvm::Instruction *AddContextSave(llvm::Instruction *instruction,
|
||||||
|
llvm::Instruction *alloca,
|
||||||
|
bool intra_warp_loop) {
|
||||||
|
|
||||||
|
if (isa<AllocaInst>(instruction)) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
llvm::Module *M = instruction->getParent()->getParent()->getParent();
|
||||||
|
LLVMContext &context = M->getContext();
|
||||||
|
auto I32 = llvm::Type::getInt32Ty(context);
|
||||||
|
|
||||||
|
/* Save the produced variable to the array. */
|
||||||
|
BasicBlock::iterator definition =
|
||||||
|
(dyn_cast<Instruction>(instruction))->getIterator();
|
||||||
|
++definition;
|
||||||
|
|
||||||
|
IRBuilder<> builder(&*definition);
|
||||||
|
std::vector<llvm::Value *> gepArgs;
|
||||||
|
|
||||||
|
auto inter_warp_index =
|
||||||
|
builder.CreateLoad(M->getGlobalVariable("inter_warp_index"));
|
||||||
|
auto intra_warp_index =
|
||||||
|
builder.CreateLoad(M->getGlobalVariable("intra_warp_index"));
|
||||||
|
auto thread_idx = builder.CreateBinOp(
|
||||||
|
Instruction::Add, intra_warp_index,
|
||||||
|
builder.CreateBinOp(Instruction::Mul, inter_warp_index,
|
||||||
|
ConstantInt::get(I32, 32)),
|
||||||
|
"thread_idx");
|
||||||
|
gepArgs.push_back(thread_idx);
|
||||||
|
|
||||||
|
return builder.CreateStore(instruction, builder.CreateGEP(alloca, gepArgs));
|
||||||
|
}
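// Illustrative sketch (added for clarity): the flattened thread index used to
// address the context array is computed from the two loop counters as
//
//   thread_idx = intra_warp_index + inter_warp_index * 32;
//
// e.g. inter_warp_index = 2 and intra_warp_index = 5 gives thread_idx = 69,
// so each of the block_size threads gets its own slot in the array.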
|
||||||
|
|
||||||
|
llvm::Instruction *AddContextRestore(llvm::Value *val,
|
||||||
|
llvm::Instruction *alloca,
|
||||||
|
llvm::Instruction *before, bool isAlloca,
|
||||||
|
bool intra_warp_loop) {
|
||||||
|
assert(val != NULL);
|
||||||
|
assert(alloca != NULL);
|
||||||
|
IRBuilder<> builder(alloca);
|
||||||
|
if (before != NULL) {
|
||||||
|
builder.SetInsertPoint(before);
|
||||||
|
} else if (isa<Instruction>(val)) {
|
||||||
|
builder.SetInsertPoint(dyn_cast<Instruction>(val));
|
||||||
|
before = dyn_cast<Instruction>(val);
|
||||||
|
} else {
|
||||||
|
assert(false && "Unknown context restore location!");
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<llvm::Value *> gepArgs;
|
||||||
|
|
||||||
|
auto M = before->getParent()->getParent()->getParent();
|
||||||
|
auto I32 = llvm::Type::getInt32Ty(M->getContext());
|
||||||
|
auto inter_warp_index =
|
||||||
|
builder.CreateLoad(M->getGlobalVariable("inter_warp_index"));
|
||||||
|
auto intra_warp_index =
|
||||||
|
builder.CreateLoad(M->getGlobalVariable("intra_warp_index"));
|
||||||
|
auto thread_idx = builder.CreateBinOp(
|
||||||
|
Instruction::Add, intra_warp_index,
|
||||||
|
builder.CreateBinOp(Instruction::Mul, inter_warp_index,
|
||||||
|
ConstantInt::get(I32, 32)),
|
||||||
|
"thread_idx");
|
||||||
|
gepArgs.push_back(thread_idx);
|
||||||
|
|
||||||
|
llvm::Instruction *gep =
|
||||||
|
dyn_cast<Instruction>(builder.CreateGEP(alloca, gepArgs));
|
||||||
|
if (isAlloca) {
|
||||||
|
return gep;
|
||||||
|
}
|
||||||
|
return builder.CreateLoad(gep);
|
||||||
|
}
|
||||||
|
|
||||||
|
void AddContextSaveRestore(llvm::Instruction *instruction,
|
||||||
|
bool intra_warp_loop) {
|
||||||
|
|
||||||
|
/* Allocate the context data array for the variable. */
|
||||||
|
llvm::Instruction *alloca = GetContextArray(instruction, intra_warp_loop);
|
||||||
|
|
||||||
|
llvm::Instruction *theStore =
|
||||||
|
AddContextSave(instruction, alloca, intra_warp_loop);
|
||||||
|
|
||||||
|
std::vector<Instruction *> uses;
|
||||||
|
|
||||||
|
for (Instruction::use_iterator ui = instruction->use_begin(),
|
||||||
|
ue = instruction->use_end();
|
||||||
|
ui != ue; ++ui) {
|
||||||
|
llvm::Instruction *user = cast<Instruction>(ui->getUser());
|
||||||
|
if (user == NULL)
|
||||||
|
continue;
|
||||||
|
if (user == theStore)
|
||||||
|
continue;
|
||||||
|
uses.push_back(user);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (auto user : uses) {
|
||||||
|
Instruction *contextRestoreLocation = user;
|
||||||
|
llvm::Value *loadedValue =
|
||||||
|
AddContextRestore(user, alloca, contextRestoreLocation,
|
||||||
|
isa<AllocaInst>(instruction), intra_warp_loop);
|
||||||
|
user->replaceUsesOfWith(instruction, loadedValue);
|
||||||
|
}
|
||||||
|
}
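// Illustrative sketch (added for clarity, hypothetical names): conceptually,
// a value that lives across a barrier is turned from a per-thread scalar into
// a per-thread array slot, e.g.
//
//   // before                  // after
//   int x = f(tid);            x_array[thread_idx] = f(thread_idx);
//   __syncthreads();           /* end of one parallel-region loop */
//   use(x);                    /* next parallel-region loop */
//                              use(x_array[thread_idx]);
//
// GetContextArray allocates x_array, AddContextSave stores into it right
// after the definition, and AddContextRestore reloads it at each use.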
|
||||||
|
|
||||||
|
void handle_alloc(llvm::Function *F) {
|
||||||
|
auto M = F->getParent();
|
||||||
|
LLVMContext &C = M->getContext();
|
||||||
|
auto I32 = llvm::Type::getInt32Ty(C);
|
||||||
|
|
||||||
|
std::vector<llvm::Instruction *> instruction_to_fix;
|
||||||
|
for (auto bb = F->begin(); bb != F->end(); bb++) {
|
||||||
|
for (auto ii = bb->begin(); ii != bb->end(); ii++) {
|
||||||
|
if (llvm::AllocaInst *i = dyn_cast<AllocaInst>(ii)) {
|
||||||
|
instruction_to_fix.push_back(i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<llvm::Instruction *> need_remove;
|
||||||
|
|
||||||
|
for (auto inst : instruction_to_fix) {
|
||||||
|
// generate a new alloc
|
||||||
|
auto block_size_addr = M->getGlobalVariable("block_size");
|
||||||
|
IRBuilder<> builder(inst);
|
||||||
|
auto block_size = builder.CreateLoad(block_size_addr);
|
||||||
|
|
||||||
|
llvm::Type *elementType = NULL;
|
||||||
|
if (dyn_cast<AllocaInst>(inst)->getType()->getElementType()) {
|
||||||
|
elementType = dyn_cast<AllocaInst>(inst)->getType()->getElementType();
|
||||||
|
}
|
||||||
|
assert(elementType != NULL);
|
||||||
|
|
||||||
|
auto Alloca = builder.CreateAlloca(elementType, block_size,
|
||||||
|
inst->getName().str() + "inter_warp");
|
||||||
|
|
||||||
|
// replace all usage
|
||||||
|
std::set<Instruction *> replace_user;
|
||||||
|
for (Instruction::use_iterator ui = inst->use_begin(), ue = inst->use_end();
|
||||||
|
ui != ue; ++ui) {
|
||||||
|
replace_user.insert(dyn_cast<Instruction>(ui->getUser()));
|
||||||
|
}
|
||||||
|
for (auto user : replace_user) {
|
||||||
|
|
||||||
|
IRBuilder<> builder(user);
|
||||||
|
// std::vector<llvm::Value *> gepArgs;
|
||||||
|
auto inter_warp_index =
|
||||||
|
builder.CreateLoad(M->getGlobalVariable("inter_warp_index"));
|
||||||
|
auto intra_warp_index =
|
||||||
|
builder.CreateLoad(M->getGlobalVariable("intra_warp_index"));
|
||||||
|
auto thread_idx = builder.CreateBinOp(
|
||||||
|
Instruction::Add, intra_warp_index,
|
||||||
|
builder.CreateBinOp(Instruction::Mul, inter_warp_index,
|
||||||
|
ConstantInt::get(I32, 32)),
|
||||||
|
"thread_idx");
|
||||||
|
|
||||||
|
auto gep = builder.CreateGEP(Alloca, thread_idx);
|
||||||
|
|
||||||
|
user->replaceUsesOfWith(inst, gep);
|
||||||
|
}
|
||||||
|
need_remove.push_back(inst);
|
||||||
|
}
|
||||||
|
for (auto inst : need_remove) {
|
||||||
|
inst->dropAllReferences();
|
||||||
|
inst->eraseFromParent();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void handle_local_variable_intra_warp(std::vector<ParallelRegion> PRs) {
|
||||||
|
bool intra_warp_loop = 1;
|
||||||
|
// we should handle allocation generated by PHI
|
||||||
|
{
|
||||||
|
std::vector<llvm::Instruction *> instruction_to_fix;
|
||||||
|
auto F = PRs[0].start_block->getParent();
|
||||||
|
for (auto bb = F->begin(); bb != F->end(); bb++) {
|
||||||
|
for (auto ii = bb->begin(); ii != bb->end(); ii++) {
|
||||||
|
if (isa<AllocaInst>(&(*ii)))
|
||||||
|
instruction_to_fix.push_back(&(*ii));
|
||||||
|
}
|
||||||
|
for (auto inst : instruction_to_fix) {
|
||||||
|
AddContextSaveRestore(inst, intra_warp_loop);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (auto parallel_regions : PRs) {
|
||||||
|
std::set<llvm::Instruction *> instruction_in_region;
|
||||||
|
std::vector<llvm::Instruction *> instruction_to_fix;
|
||||||
|
|
||||||
|
for (auto bb : parallel_regions.wrapped_block) {
|
||||||
|
for (llvm::BasicBlock::iterator instr = bb->begin(); instr != bb->end();
|
||||||
|
++instr) {
|
||||||
|
llvm::Instruction *instruction = &*instr;
|
||||||
|
instruction_in_region.insert(instruction);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/* Find all the instructions that define new values and
|
||||||
|
check if they need to be context saved. */
|
||||||
|
for (auto bb : parallel_regions.wrapped_block) {
|
||||||
|
for (llvm::BasicBlock::iterator instr = bb->begin(); instr != bb->end();
|
||||||
|
++instr) {
|
||||||
|
llvm::Instruction *instruction = &*instr;
|
||||||
|
|
||||||
|
if (ShouldNotBeContextSaved(instruction))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
for (Instruction::use_iterator ui = instruction->use_begin(),
|
||||||
|
ue = instruction->use_end();
|
||||||
|
ui != ue; ++ui) {
|
||||||
|
llvm::Instruction *user = dyn_cast<Instruction>(ui->getUser());
|
||||||
|
|
||||||
|
if (user == NULL)
|
||||||
|
continue;
|
||||||
|
if (isa<AllocaInst>(instruction) ||
|
||||||
|
(instruction_in_region.find(user) ==
|
||||||
|
instruction_in_region.end())) {
|
||||||
|
instruction_to_fix.push_back(instruction);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (auto inst : instruction_to_fix) {
|
||||||
|
AddContextSaveRestore(inst, intra_warp_loop);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
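// Note (added for clarity): an instruction is "fixed" here only if its value
// is either an alloca or is used outside its own parallel region, i.e. its
// lifetime crosses a barrier; everything else stays a plain per-iteration
// local of the inserted warp loop.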
|
||||||
|
|
||||||
|
BasicBlock *insert_loop_init(llvm::BasicBlock *InsertInitBefore,
|
||||||
|
bool intra_warp_loop) {
|
||||||
|
llvm::Module *M = InsertInitBefore->getParent()->getParent();
|
||||||
|
LLVMContext &context = M->getContext();
|
||||||
|
auto I32 = llvm::Type::getInt32Ty(context);
|
||||||
|
std::string block_name =
|
||||||
|
(intra_warp_loop) ? "intra_warp_init" : "inter_warp_init";
|
||||||
|
BasicBlock *loop_init = BasicBlock::Create(
|
||||||
|
context, block_name, InsertInitBefore->getParent(), InsertInitBefore);
|
||||||
|
IRBuilder<> builder(context);
|
||||||
|
builder.SetInsertPoint(loop_init);
|
||||||
|
if (intra_warp_loop) { // intra warp
|
||||||
|
auto intra_warp_index = M->getGlobalVariable("intra_warp_index");
|
||||||
|
builder.CreateStore(ConstantInt::get(I32, 0), intra_warp_index);
|
||||||
|
} else { // inter warp
|
||||||
|
auto inter_warp_index = M->getGlobalVariable("inter_warp_index");
|
||||||
|
builder.CreateStore(ConstantInt::get(I32, 0), inter_warp_index);
|
||||||
|
}
|
||||||
|
builder.CreateBr(InsertInitBefore);
|
||||||
|
return loop_init;
|
||||||
|
}
|
||||||
|
|
||||||
|
BasicBlock *insert_loop_cond(llvm::BasicBlock *InsertCondBefore,
|
||||||
|
llvm::BasicBlock *LoopEnd, bool intra_warp_loop) {
|
||||||
|
llvm::Module *M = InsertCondBefore->getParent()->getParent();
|
||||||
|
LLVMContext &context = M->getContext();
|
||||||
|
auto I32 = llvm::Type::getInt32Ty(context);
|
||||||
|
std::string block_name =
|
||||||
|
(intra_warp_loop) ? "intra_warp_cond" : "inter_warp_cond";
|
||||||
|
BasicBlock *loop_cond = BasicBlock::Create(
|
||||||
|
context, block_name, InsertCondBefore->getParent(), InsertCondBefore);
|
||||||
|
IRBuilder<> builder(context);
|
||||||
|
builder.SetInsertPoint(loop_cond);
|
||||||
|
|
||||||
|
llvm::Value *cmpResult = NULL;
|
||||||
|
if (!intra_warp_loop) {
|
||||||
|
auto inter_warp_index = M->getGlobalVariable("inter_warp_index");
|
||||||
|
auto block_size = M->getGlobalVariable("block_size");
|
||||||
|
auto warp_cnt =
|
||||||
|
builder.CreateBinOp(Instruction::SDiv, builder.CreateLoad(block_size),
|
||||||
|
ConstantInt::get(I32, 32), "warp_number");
|
||||||
|
|
||||||
|
cmpResult =
|
||||||
|
builder.CreateICmpULT(builder.CreateLoad(inter_warp_index), warp_cnt);
|
||||||
|
} else {
|
||||||
|
auto intra_warp_index = M->getGlobalVariable("intra_warp_index");
|
||||||
|
auto block_size = M->getGlobalVariable("block_size");
|
||||||
|
if (!need_nested_loop) {
|
||||||
|
cmpResult = builder.CreateICmpULT(builder.CreateLoad(intra_warp_index),
|
||||||
|
builder.CreateLoad(block_size));
|
||||||
|
} else {
|
||||||
|
cmpResult = builder.CreateICmpULT(builder.CreateLoad(intra_warp_index),
|
||||||
|
ConstantInt::get(I32, 32));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
builder.CreateCondBr(cmpResult, InsertCondBefore, LoopEnd);
|
||||||
|
return loop_cond;
|
||||||
|
}
|
||||||
|
|
||||||
|
BasicBlock *insert_loop_inc(llvm::BasicBlock *InsertIncBefore,
|
||||||
|
bool intra_warp_loop) {
|
||||||
|
llvm::Module *M = InsertIncBefore->getParent()->getParent();
|
||||||
|
LLVMContext &context = M->getContext();
|
||||||
|
auto I32 = llvm::Type::getInt32Ty(context);
|
||||||
|
std::string block_name =
|
||||||
|
(intra_warp_loop) ? "intra_warp_inc" : "inter_warp_inc";
|
||||||
|
BasicBlock *loop_inc = BasicBlock::Create(
|
||||||
|
context, block_name, InsertIncBefore->getParent(), InsertIncBefore);
|
||||||
|
IRBuilder<> builder(context);
|
||||||
|
builder.SetInsertPoint(loop_inc);
|
||||||
|
if (intra_warp_loop) { // intra warp
|
||||||
|
auto intra_warp_index = M->getGlobalVariable("intra_warp_index");
|
||||||
|
auto new_index = builder.CreateBinOp(
|
||||||
|
Instruction::Add, builder.CreateLoad(intra_warp_index),
|
||||||
|
ConstantInt::get(I32, 1), "intra_warp_index_increment");
|
||||||
|
builder.CreateStore(new_index, intra_warp_index);
|
||||||
|
} else { // inter warp
|
||||||
|
auto inter_warp_index = M->getGlobalVariable("inter_warp_index");
|
||||||
|
auto new_index = builder.CreateBinOp(
|
||||||
|
Instruction::Add, builder.CreateLoad(inter_warp_index),
|
||||||
|
ConstantInt::get(I32, 1), "inter_warp_index_increment");
|
||||||
|
builder.CreateStore(new_index, inter_warp_index);
|
||||||
|
}
|
||||||
|
builder.CreateBr(InsertIncBefore);
|
||||||
|
return loop_inc;
|
||||||
|
}
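// Illustrative sketch (added for clarity): insert_loop_init / insert_loop_cond
// / insert_loop_inc together wrap a parallel region so that it behaves like
//
//   for (intra_warp_index = 0; intra_warp_index < 32; ++intra_warp_index) {
//     /* parallel-region body: one iteration per CUDA thread in the warp */
//   }
//
// (the bound is block_size instead of 32 when no nested inter-warp loop is
// needed), with the counter kept in the intra_warp_index / inter_warp_index
// globals rather than in a local induction variable.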
|
||||||
|
|
||||||
|
void add_warp_loop(std::vector<ParallelRegion> parallel_regions,
|
||||||
|
bool intra_warp_loop) {
|
||||||
|
for (auto region : parallel_regions) {
|
||||||
|
auto start_block = region.start_block;
|
||||||
|
auto tail_block = region.end_block;
|
||||||
|
auto next_block = region.successor_block;
|
||||||
|
|
||||||
|
auto loop_cond = insert_loop_cond(start_block, next_block, intra_warp_loop);
|
||||||
|
auto loop_init = insert_loop_init(loop_cond, intra_warp_loop);
|
||||||
|
|
||||||
|
auto F = start_block->getParent();
|
||||||
|
for (Function::iterator i = F->begin(); i != F->end(); ++i) {
|
||||||
|
llvm::BasicBlock *bb = &(*i);
|
||||||
|
if (bb == loop_cond)
|
||||||
|
continue;
|
||||||
|
bb->getTerminator()->replaceUsesOfWith(start_block, loop_init);
|
||||||
|
}
|
||||||
|
auto loop_inc = insert_loop_inc(loop_cond, intra_warp_loop);
|
||||||
|
tail_block->getTerminator()->replaceUsesOfWith(next_block, loop_inc);
|
||||||
|
// we have to reset the inter/intra warp index to 0, as these may be used
// outside the PR when there is a conditional loop/branch
llvm::Module *M = start_block->getParent()->getParent();
|
||||||
|
LLVMContext &context = M->getContext();
|
||||||
|
auto I32 = llvm::Type::getInt32Ty(context);
|
||||||
|
BasicBlock *reset_index = BasicBlock::Create(start_block->getContext(),
|
||||||
|
"reset_block", F, next_block);
|
||||||
|
IRBuilder<> builder(start_block->getContext());
|
||||||
|
builder.SetInsertPoint(reset_index);
|
||||||
|
if (intra_warp_loop) { // intra warp
|
||||||
|
auto intra_warp_index = M->getGlobalVariable("intra_warp_index");
|
||||||
|
builder.CreateStore(ConstantInt::get(I32, 0), intra_warp_index);
|
||||||
|
} else { // inter warp
|
||||||
|
auto inter_warp_index = M->getGlobalVariable("inter_warp_index");
|
||||||
|
builder.CreateStore(ConstantInt::get(I32, 0), inter_warp_index);
|
||||||
|
}
|
||||||
|
builder.CreateBr(next_block);
|
||||||
|
loop_cond->getTerminator()->replaceUsesOfWith(next_block, reset_index);
|
||||||
|
// add metadata
|
||||||
|
MDNode *Dummy =
|
||||||
|
MDNode::getTemporary(context, ArrayRef<Metadata *>()).release();
|
||||||
|
MDNode *AccessGroupMD = MDNode::getDistinct(context, {});
|
||||||
|
MDNode *ParallelAccessMD = MDNode::get(
|
||||||
|
context,
|
||||||
|
{MDString::get(context, "llvm.loop.parallel_accesses"), AccessGroupMD});
|
||||||
|
MDNode *Root = MDNode::get(context, {Dummy, ParallelAccessMD});
|
||||||
|
|
||||||
|
Root->replaceOperandWith(0, Root);
|
||||||
|
MDNode::deleteTemporary(Dummy);
|
||||||
|
// We now have
|
||||||
|
// !1 = metadata !{metadata !1} <- self-referential root
|
||||||
|
loop_cond->getTerminator()->setMetadata("llvm.loop", Root);
|
||||||
|
|
||||||
|
for (auto bb : region.wrapped_block) {
|
||||||
|
for (BasicBlock::iterator ii = bb->begin(), ee = bb->end(); ii != ee;
|
||||||
|
ii++) {
|
||||||
|
if (!ii->mayReadOrWriteMemory()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
MDNode *NewMD = MDNode::get(bb->getContext(), AccessGroupMD);
|
||||||
|
MDNode *OldMD = ii->getMetadata("llvm.mem.parallel_loop_access");
|
||||||
|
if (OldMD != nullptr) {
|
||||||
|
NewMD = llvm::MDNode::concatenate(OldMD, NewMD);
|
||||||
|
}
|
||||||
|
ii->setMetadata("llvm.mem.parallel_loop_access", NewMD);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
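// Illustrative sketch (added for clarity): in textual IR the metadata built
// above looks roughly like
//
//   br i1 %cond, label %body, label %exit, !llvm.loop !1
//   !1 = !{!1, !2}                               ; self-referential root
//   !2 = !{!"llvm.loop.parallel_accesses", !3}
//   !3 = distinct !{}                            ; access group
//
// and every memory instruction inside the region carries a
// llvm.mem.parallel_loop_access node referring to the same group, which lets
// the vectorizer treat the inserted warp loop as parallel.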
|
||||||
|
|
||||||
|
void print_parallel_region(std::vector<ParallelRegion> parallel_regions) {
|
||||||
|
printf("get PR:\n");
|
||||||
|
for (auto region : parallel_regions) {
|
||||||
|
auto start = region.start_block;
|
||||||
|
auto end = region.end_block;
|
||||||
|
auto next = region.successor_block;
|
||||||
|
printf("parallel region: %s->%s next: %s\n", start->getName().str().c_str(),
|
||||||
|
end->getName().str().c_str(), next->getName().str().c_str());
|
||||||
|
printf("have: \n");
|
||||||
|
for (auto b : region.wrapped_block) {
|
||||||
|
printf("%s\n", b->getName().str().c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void remove_barrier(llvm::Function *F, bool intra_warp_loop) {
|
||||||
|
std::vector<Instruction *> need_remove;
|
||||||
|
for (auto BB = F->begin(); BB != F->end(); ++BB) {
|
||||||
|
for (auto BI = BB->begin(); BI != BB->end(); BI++) {
|
||||||
|
if (auto Call = dyn_cast<CallInst>(BI)) {
|
||||||
|
auto func_name = Call->getCalledFunction()->getName().str();
|
||||||
|
if (func_name == "llvm.nvvm.bar.warp.sync") {
|
||||||
|
need_remove.push_back(Call);
|
||||||
|
}
|
||||||
|
if (!intra_warp_loop && (func_name == "llvm.nvvm.barrier0" ||
|
||||||
|
func_name == "llvm.nvvm.barrier.sync")) {
|
||||||
|
need_remove.push_back(Call);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (auto inst : need_remove) {
|
||||||
|
inst->eraseFromParent();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
class InsertWarpLoopPass : public llvm::FunctionPass {
|
||||||
|
|
||||||
|
public:
|
||||||
|
static char ID;
|
||||||
|
bool intra_warp_loop;
|
||||||
|
DominatorTree *DT;
|
||||||
|
PostDominatorTree *PDT;
|
||||||
|
|
||||||
|
InsertWarpLoopPass(bool intra_warp = 0)
|
||||||
|
: FunctionPass(ID), intra_warp_loop(intra_warp) {}
|
||||||
|
|
||||||
|
virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const {
|
||||||
|
AU.addRequired<DominatorTreeWrapperPass>();
|
||||||
|
AU.addRequired<PostDominatorTreeWrapperPass>();
|
||||||
|
}
|
||||||
|
|
||||||
|
void getParallelRegionBefore(llvm::BasicBlock *B, bool intra_warp_loop,
|
||||||
|
std::vector<ParallelRegion> ¶llel_regions) {
|
||||||
|
ParallelRegion current_region;
|
||||||
|
|
||||||
|
SmallVector<BasicBlock *, 4> pending_blocks;
|
||||||
|
BasicBlock *region_entry_barrier = NULL;
|
||||||
|
BasicBlock *entry = NULL;
|
||||||
|
BasicBlock *exit = B->getSinglePredecessor();
|
||||||
|
for (BasicBlock *Pred : predecessors(B)) {
|
||||||
|
pending_blocks.push_back(Pred);
|
||||||
|
}
|
||||||
|
if (pending_blocks.size() > 1) {
|
||||||
|
// because we have already inserted the syncs and split the blocks by them,
// if B has several incoming edges, it must be a merge point
// of a conditional if. We can safely ignore it.
// TODO: we have to further check whether this conditional if
// is for inter warp or intra warp
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
while (!pending_blocks.empty()) {
|
||||||
|
BasicBlock *current = pending_blocks.back();
|
||||||
|
pending_blocks.pop_back();
|
||||||
|
|
||||||
|
// avoid infinite recursion of loops
|
||||||
|
if (current_region.wrapped_block.count(current) != 0) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// If we reach another barrier this must be the
|
||||||
|
// parallel region entry.
|
||||||
|
bool has_barrier = 0;
|
||||||
|
for (auto i = current->begin(), e = current->end(); i != e; ++i) {
|
||||||
|
if (llvm::CallInst *call_inst = llvm::dyn_cast<llvm::CallInst>(&(*i))) {
|
||||||
|
auto func_name = call_inst->getCalledFunction()->getName().str();
|
||||||
|
if (func_name == "llvm.nvvm.barrier0" ||
|
||||||
|
func_name == "llvm.nvvm.barrier.sync")
|
||||||
|
has_barrier = 1;
|
||||||
|
if (func_name == "llvm.nvvm.bar.warp.sync" && intra_warp_loop)
|
||||||
|
has_barrier = 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// if we reach a block which only has a single conditional branch,
// it is the start point of a B-condition, so we have to stop here
bool is_single_conditional_branch_block = 0;
|
||||||
|
if (auto br = dyn_cast<llvm::BranchInst>(current->getTerminator())) {
|
||||||
|
if (br->isConditional()) {
|
||||||
|
if (current->size() == 1) {
|
||||||
|
is_single_conditional_branch_block = 1;
|
||||||
|
} else {
|
||||||
|
// generated by replicating local variables
printf(
|
||||||
|
"[WARNING] match single conditional branch with HARD CODE\n");
|
||||||
|
bool branch_to_intra_init = false;
|
||||||
|
for (unsigned suc = 0; suc < br->getNumSuccessors(); ++suc) {
|
||||||
|
llvm::BasicBlock *entryCandidate = br->getSuccessor(suc);
|
||||||
|
auto block_name = entryCandidate->getName().str();
|
||||||
|
if (find_block_barrier_in_region(current, B)) {
|
||||||
|
if (block_name.find("warp_init") != block_name.npos) {
|
||||||
|
is_single_conditional_branch_block = 1;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (has_barrier || is_single_conditional_branch_block) {
|
||||||
|
if (region_entry_barrier == NULL)
|
||||||
|
region_entry_barrier = current;
|
||||||
|
else if (region_entry_barrier != current) {
|
||||||
|
// this means there is no PR before B, just return
return;
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Non-barrier block; this must be in the region.
current_region.wrapped_block.insert(current);
|
||||||
|
|
||||||
|
// Add predecessors to pending queue.
|
||||||
|
for (BasicBlock *Pred : predecessors(current)) {
|
||||||
|
pending_blocks.push_back(Pred);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (current_region.wrapped_block.empty()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// if we do not find an entry node, this means all predecessor
// blocks do not need to execute multiple times
if (region_entry_barrier == NULL) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
// Find the entry node.
|
||||||
|
assert(region_entry_barrier != NULL);
|
||||||
|
for (unsigned
|
||||||
|
suc = 0,
|
||||||
|
num = region_entry_barrier->getTerminator()->getNumSuccessors();
|
||||||
|
suc < num; ++suc) {
|
||||||
|
llvm::BasicBlock *entryCandidate =
|
||||||
|
region_entry_barrier->getTerminator()->getSuccessor(suc);
|
||||||
|
if (current_region.wrapped_block.count(entryCandidate) == 0)
|
||||||
|
continue;
|
||||||
|
entry = entryCandidate;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
// delete useless PRs, i.e., regions that only contain branches
if (entry == exit) {
|
||||||
|
if (entry->size() == 1 && isa<llvm::BranchInst>(entry->begin())) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
bool is_useless = true;
|
||||||
|
auto iter = entry;
|
||||||
|
do {
|
||||||
|
if (iter->size() != 1 || !isa<llvm::BranchInst>(entry->begin())) {
|
||||||
|
is_useless = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (iter->getTerminator()->getNumSuccessors() > 1) {
|
||||||
|
is_useless = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
iter = iter->getTerminator()->getSuccessor(0);
|
||||||
|
} while (iter != exit);
|
||||||
|
if (is_useless) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
assert(current_region.wrapped_block.count(entry) != 0);
|
||||||
|
current_region.start_block = entry;
|
||||||
|
current_region.end_block = exit;
|
||||||
|
current_region.successor_block = B;
|
||||||
|
parallel_regions.push_back(current_region);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<ParallelRegion> getParallelRegions(llvm::Function *F,
|
||||||
|
bool intra_warp_loop) {
|
||||||
|
std::vector<ParallelRegion> parallel_regions;
|
||||||
|
|
||||||
|
SmallVector<BasicBlock *, 4> exit_blocks;
|
||||||
|
for (Function::iterator s = F->begin(); s != F->end(); s++) {
|
||||||
|
if (llvm::CallInst *call_inst =
|
||||||
|
llvm::dyn_cast<llvm::CallInst>(s->begin())) {
|
||||||
|
auto func_name = call_inst->getCalledFunction()->getName().str();
|
||||||
|
if (func_name == "llvm.nvvm.barrier0" ||
|
||||||
|
func_name == "llvm.nvvm.barrier.sync") {
|
||||||
|
exit_blocks.push_back(&(*s));
|
||||||
|
}
|
||||||
|
// when handling the intra warp loop, we also need to split the blocks
// between warp-level barriers
if (intra_warp_loop && func_name == "llvm.nvvm.bar.warp.sync") {
|
||||||
|
exit_blocks.push_back(&(*s));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// First find all the ParallelRegions in the Function.
|
||||||
|
while (!exit_blocks.empty()) {
|
||||||
|
BasicBlock *exit = exit_blocks.back();
|
||||||
|
exit_blocks.pop_back();
|
||||||
|
getParallelRegionBefore(exit, intra_warp_loop, parallel_regions);
|
||||||
|
}
|
||||||
|
return parallel_regions;
|
||||||
|
}
|
||||||
|
|
||||||
|
virtual bool runOnFunction(Function &F) {
|
||||||
|
if (!isKernelFunction(F.getParent(), &F))
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
|
||||||
|
PDT = &getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
|
||||||
|
|
||||||
|
// find parallel region we need to wrap
|
||||||
|
auto parallel_regions = getParallelRegions(&F, intra_warp_loop);
|
||||||
|
assert(!parallel_regions.empty() && "can not find any parallel regions\n");
|
||||||
|
// print_parallel_region(parallel_regions);
|
||||||
|
add_warp_loop(parallel_regions, intra_warp_loop);
|
||||||
|
|
||||||
|
if (intra_warp_loop) {
|
||||||
|
handle_local_variable_intra_warp(parallel_regions);
|
||||||
|
}
|
||||||
|
remove_barrier(&F, intra_warp_loop);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
char InsertWarpLoopPass::ID = 0;
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
static RegisterPass<InsertWarpLoopPass> X("insert-warp-loop",
|
||||||
|
"Insert inter/intra warp loop");
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
bool has_warp_barrier(llvm::Module *M) {
|
||||||
|
for (auto F = M->begin(); F != M->end(); ++F)
|
||||||
|
for (auto BB = F->begin(); BB != F->end(); ++BB) {
|
||||||
|
for (auto BI = BB->begin(); BI != BB->end(); BI++) {
|
||||||
|
if (auto Call = dyn_cast<CallInst>(BI)) {
|
||||||
|
auto func_name = Call->getCalledFunction()->getName().str();
|
||||||
|
if (func_name == "llvm.nvvm.bar.warp.sync") {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
void insert_warp_loop(llvm::Module *M) {
llvm::legacy::PassManager Passes;
need_nested_loop = has_warp_barrier(M);
// use nested loops only when there are warp-level barriers
if (need_nested_loop) {
bool intra_warp = true;
Passes.add(new InsertWarpLoopPass(intra_warp));
// insert inter warp loop
Passes.add(new InsertWarpLoopPass(!intra_warp));
Passes.run(*M);
} else {
bool intra_warp = true;
// only need a single loop, with size=block_size
Passes.add(new InsertWarpLoopPass(intra_warp));
Passes.run(*M);
// remove all barriers
for (auto F = M->begin(); F != M->end(); ++F)
remove_barrier(dyn_cast<llvm::Function>(F), false);
}
}
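// Illustrative sketch (added for clarity, kernel_body is a hypothetical
// placeholder): with warp-level barriers the kernel body ends up inside a
// nested loop, roughly
//
//   for (inter = 0; inter < block_size / 32; ++inter)
//     for (intra = 0; intra < 32; ++intra)
//       kernel_body(inter * 32 + intra);
//
// while without them a single flat loop over block_size threads is enough:
//
//   for (intra = 0; intra < block_size; ++intra)
//     kernel_body(intra);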
|
|
@ -0,0 +1,126 @@
|
||||||
|
#include "memory_hierarchy.h"
|
||||||
|
#include "llvm/IR/CFG.h"
|
||||||
|
#include "llvm/IR/Function.h"
|
||||||
|
#include "llvm/IR/GlobalValue.h"
|
||||||
|
#include "llvm/IR/IRBuilder.h"
|
||||||
|
#include "llvm/IR/Instructions.h"
|
||||||
|
#include "llvm/IR/LLVMContext.h"
|
||||||
|
#include "llvm/IR/Module.h"
|
||||||
|
#include "llvm/Transforms/Utils/Cloning.h"
|
||||||
|
#include "llvm/Transforms/Utils/ValueMapper.h"
|
||||||
|
#include <assert.h>
|
||||||
|
#include <map>
|
||||||
|
#include <set>
|
||||||
|
#include <sstream>
|
||||||
|
#include <tuple>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
void mem_share2global(llvm::Module *M) {
|
||||||
|
LLVMContext *C = &M->getContext();
|
||||||
|
llvm::Type *Int32T = Type::getInt32Ty(*C);
|
||||||
|
llvm::Type *Int64T = Type::getInt64Ty(*C);
|
||||||
|
llvm::Type *Int8T = Type::getInt8Ty(*C);
|
||||||
|
|
||||||
|
std::map<GlobalVariable *, GlobalVariable *> corresponding_global_memory;
|
||||||
|
std::set<llvm::Instruction *> need_remove;
|
||||||
|
std::set<GlobalVariable *> need_remove_share_memory;
|
||||||
|
|
||||||
|
// find all shared memory and generate the corresponding global memory
for (auto I = M->global_begin(), E = M->global_end(); I != E; ++I) {
|
||||||
|
if (GlobalVariable *share_memory = dyn_cast<GlobalVariable>(I)) {
|
||||||
|
if (auto PT = dyn_cast<PointerType>(I->getType())) {
|
||||||
|
unsigned AS = PT->getAddressSpace();
|
||||||
|
if (AS == 3) { // find a share memory
|
||||||
|
need_remove_share_memory.insert(share_memory);
|
||||||
|
// generate the corresponding global memory variable
|
||||||
|
auto new_name = "wrapper_global_" + share_memory->getName().str();
|
||||||
|
auto element_type = PT->getElementType();
|
||||||
|
if (auto array_type = dyn_cast<ArrayType>(element_type)) {
|
||||||
|
llvm::GlobalVariable *global_memory = new llvm::GlobalVariable(
|
||||||
|
*M, array_type, false, llvm::GlobalValue::ExternalLinkage, NULL,
|
||||||
|
new_name, NULL, llvm::GlobalValue::GeneralDynamicTLSModel, 1);
|
||||||
|
ConstantAggregateZero *const_array =
|
||||||
|
ConstantAggregateZero::get(array_type);
|
||||||
|
global_memory->setInitializer(const_array);
|
||||||
|
corresponding_global_memory.insert(
|
||||||
|
std::pair<GlobalVariable *, GlobalVariable *>(share_memory,
|
||||||
|
global_memory));
|
||||||
|
} else if (auto int_type = dyn_cast<IntegerType>(element_type)) {
|
||||||
|
auto zero = llvm::ConstantInt::get(int_type, 0, true);
|
||||||
|
llvm::GlobalVariable *global_memory = new llvm::GlobalVariable(
|
||||||
|
*M, int_type, false, llvm::GlobalValue::ExternalLinkage, zero,
|
||||||
|
new_name, NULL, llvm::GlobalValue::GeneralDynamicTLSModel, 0,
|
||||||
|
false);
|
||||||
|
corresponding_global_memory.insert(
|
||||||
|
std::pair<GlobalVariable *, GlobalVariable *>(share_memory,
|
||||||
|
global_memory));
|
||||||
|
} else {
|
||||||
|
assert(0 && "The required Share Memory Type is not supported\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
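// Illustrative sketch (added for clarity, "buf" is a hypothetical name): a
// CUDA declaration such as
//
//   __shared__ float buf[256];   // an addrspace(3) global in the NVVM IR
//
// is rewritten into a zero-initialized module-level global named
// "wrapper_global_buf" that is marked thread_local
// (GeneralDynamicTLSModel), presumably so that each host thread executing a
// block sees its own copy of the former shared memory.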
|
||||||
|
|
||||||
|
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||||
|
Function *F = &(*i);
|
||||||
|
for (Function::iterator i = F->begin(), e = F->end(); i != e; ++i) {
|
||||||
|
BasicBlock *b = &*i;
|
||||||
|
for (BasicBlock::iterator i = b->begin(), e = b->end(); i != e; ++i) {
|
||||||
|
if (auto get_element_ptr = dyn_cast<llvm::GetElementPtrInst>(i)) {
|
||||||
|
auto read_array = get_element_ptr->getPointerOperand();
|
||||||
|
if (GlobalVariable *read_share_memory =
|
||||||
|
dyn_cast<llvm::GlobalVariable>(read_array)) {
|
||||||
|
// found a GetElementPtr which reads the shared memory
if (corresponding_global_memory.find(read_share_memory) !=
|
||||||
|
corresponding_global_memory.end()) {
|
||||||
|
std::vector<Value *> Indices;
|
||||||
|
for (int i = 0; i < get_element_ptr->getNumIndices(); i++)
|
||||||
|
Indices.push_back(get_element_ptr->getOperand(i + 1));
|
||||||
|
|
||||||
|
auto new_GEP = GetElementPtrInst::Create(
|
||||||
|
NULL, // Pointee type
|
||||||
|
corresponding_global_memory.find(read_share_memory)
|
||||||
|
->second, // Alloca
|
||||||
|
Indices, // Indices
|
||||||
|
"", get_element_ptr);
|
||||||
|
// replace all uses of get_element_ptr with new_GEP:
// we cannot directly use
// get_element_ptr->replaceAllUsesWith(new_GEP);
// as get_element_ptr and new_GEP have different return types
llvm::Type *original_type = get_element_ptr->getType();
|
||||||
|
auto FormatASC = CastInst::CreatePointerBitCastOrAddrSpaceCast(
|
||||||
|
new_GEP, original_type, "", get_element_ptr);
|
||||||
|
get_element_ptr->replaceAllUsesWith(FormatASC);
|
||||||
|
need_remove.insert(get_element_ptr);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if (auto addr_cast = dyn_cast<llvm::CastInst>(i)) {
|
||||||
|
auto read_array = addr_cast->getOperand(0);
|
||||||
|
if (GlobalVariable *read_share_memory =
|
||||||
|
dyn_cast<llvm::GlobalVariable>(read_array)) {
|
||||||
|
// found a cast instruction which reads the shared memory
if (corresponding_global_memory.find(read_share_memory) !=
|
||||||
|
corresponding_global_memory.end()) {
|
||||||
|
llvm::Type *original_type = addr_cast->getType();
|
||||||
|
auto FormatASC = CastInst::CreatePointerBitCastOrAddrSpaceCast(
|
||||||
|
corresponding_global_memory.find(read_share_memory)->second,
|
||||||
|
original_type, "", addr_cast);
|
||||||
|
addr_cast->replaceAllUsesWith(FormatASC);
|
||||||
|
need_remove.insert(addr_cast);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (auto i : need_remove) {
|
||||||
|
i->dropAllReferences();
|
||||||
|
i->eraseFromParent();
|
||||||
|
}
|
||||||
|
for (auto i : need_remove_share_memory) {
|
||||||
|
i->dropAllReferences();
|
||||||
|
i->eraseFromParent();
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,88 @@
|
||||||
|
#include "performance.h"
|
||||||
|
#include "llvm/ADT/Statistic.h"
|
||||||
|
#include "llvm/ADT/StringRef.h"
|
||||||
|
#include "llvm/ADT/Triple.h"
|
||||||
|
#include "llvm/Analysis/LoopInfo.h"
|
||||||
|
#include "llvm/Analysis/LoopPass.h"
|
||||||
|
#include "llvm/Analysis/PostDominators.h"
|
||||||
|
#include "llvm/Analysis/TargetLibraryInfo.h"
|
||||||
|
#include "llvm/Analysis/TargetTransformInfo.h"
|
||||||
|
#include "llvm/CodeGen/MachineModuleInfo.h"
|
||||||
|
#include "llvm/IR/CFG.h"
|
||||||
|
#include "llvm/IR/DataLayout.h"
|
||||||
|
#include "llvm/IR/Function.h"
|
||||||
|
#include "llvm/IR/GlobalValue.h"
|
||||||
|
#include "llvm/IR/IRBuilder.h"
|
||||||
|
#include "llvm/IR/InlineAsm.h"
|
||||||
|
#include "llvm/IR/Instructions.h"
|
||||||
|
#include "llvm/IR/LLVMContext.h"
|
||||||
|
#include "llvm/IR/LegacyPassManager.h"
|
||||||
|
#include "llvm/IR/Module.h"
|
||||||
|
#include "llvm/IR/ValueSymbolTable.h"
|
||||||
|
#include "llvm/InitializePasses.h"
|
||||||
|
#include "llvm/PassInfo.h"
|
||||||
|
#include "llvm/PassRegistry.h"
|
||||||
|
#include "llvm/Support/CommandLine.h"
|
||||||
|
#include "llvm/Support/TargetRegistry.h"
|
||||||
|
#include "llvm/Target/TargetMachine.h"
|
||||||
|
#include "llvm/Target/TargetOptions.h"
|
||||||
|
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
|
||||||
|
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
|
||||||
|
#include "llvm/Transforms/Utils/Cloning.h"
|
||||||
|
#include "llvm/Transforms/Utils/ValueMapper.h"
|
||||||
|
#include <map>
|
||||||
|
#include <set>
|
||||||
|
#include <sstream>
|
||||||
|
#include <tuple>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
using namespace llvm;
|
||||||
|
|
||||||
|
void performance_optimization(llvm::Module *M) {
|
||||||
|
for (auto F = M->begin(); F != M->end(); F++) {
|
||||||
|
for (auto I = F->arg_begin(); I != F->arg_end(); ++I) {
|
||||||
|
if (I->getType()->isPointerTy()) {
|
||||||
|
I->addAttr(llvm::Attribute::NoAlias);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
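// Note (added for clarity): marking every pointer argument NoAlias is
// analogous to declaring kernel parameters with the C/C++ restrict qualifier,
// e.g. (hypothetical signature)
//
//   void axpy(float *__restrict x, float *__restrict y, int n);
//
// which tells the optimizer the buffers never overlap and makes the O3 and
// vectorization pipeline configured below far more effective. This assumes
// kernel arguments do not alias, which the calling host code must guarantee.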
|
||||||
|
|
||||||
|
llvm::legacy::PassManager Passes;
|
||||||
|
|
||||||
|
// add target machine info
|
||||||
|
llvm::Triple triple("x86_64-unknown-linux-gnu");
|
||||||
|
|
||||||
|
std::string Error;
|
||||||
|
const Target *TheTarget = TargetRegistry::lookupTarget("", triple, Error);
|
||||||
|
if (!TheTarget) {
|
||||||
|
printf("Error: %s\n", Error.c_str());
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
|
llvm::TargetOptions Options;
|
||||||
|
Options.FloatABIType = FloatABI::Hard;
|
||||||
|
|
||||||
|
TargetMachine *TM = TheTarget->createTargetMachine(
|
||||||
|
triple.getTriple(), llvm::sys::getHostCPUName().str(), StringRef("+m,+f"),
|
||||||
|
Options, Reloc::PIC_, CodeModel::Small, CodeGenOpt::Aggressive);
|
||||||
|
assert(TM && "No Machine Information\n");
|
||||||
|
|
||||||
|
Passes.add(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis()));
|
||||||
|
|
||||||
|
TargetLibraryInfoImpl TLII(triple);
|
||||||
|
TLII.disableAllFunctions();
|
||||||
|
Passes.add(new TargetLibraryInfoWrapperPass(TLII));
|
||||||
|
|
||||||
|
// Add O3 optimization
|
||||||
|
llvm::PassManagerBuilder Builder;
|
||||||
|
Builder.OptLevel = 3;
|
||||||
|
Builder.SizeLevel = 0;
|
||||||
|
|
||||||
|
Builder.LoopVectorize = true;
|
||||||
|
Builder.SLPVectorize = true;
|
||||||
|
|
||||||
|
Builder.VerifyInput = true;
|
||||||
|
Builder.VerifyOutput = true;
|
||||||
|
|
||||||
|
Builder.populateModulePassManager(Passes);
|
||||||
|
Passes.run(*M);
|
||||||
|
}
|
|
@ -0,0 +1,480 @@
|
||||||
|
#include "tool.h"
|
||||||
|
#include "llvm/Bitcode/BitcodeWriter.h"
|
||||||
|
#include "llvm/IR/Function.h"
|
||||||
|
#include "llvm/IR/GlobalValue.h"
|
||||||
|
#include "llvm/IR/IRBuilder.h"
|
||||||
|
#include "llvm/IR/InlineAsm.h"
|
||||||
|
#include "llvm/IR/Instructions.h"
|
||||||
|
#include "llvm/IR/LLVMContext.h"
|
||||||
|
#include "llvm/IR/Module.h"
|
||||||
|
#include "llvm/IR/Verifier.h"
|
||||||
|
#include "llvm/IRReader/IRReader.h"
|
||||||
|
#include "llvm/Support/ToolOutputFile.h"
|
||||||
|
#include "llvm/Transforms/Utils/Cloning.h"
|
||||||
|
#include "llvm/Transforms/Utils/ValueMapper.h"
|
||||||
|
#include <iostream>
|
||||||
|
#include <set>
|
||||||
|
|
||||||
|
using namespace llvm;
|
||||||
|
|
||||||
|
llvm::Module *LoadModuleFromFilr(char *file_name) {
|
||||||
|
llvm::SMDiagnostic Err;
|
||||||
|
llvm::LLVMContext *globalContext = new llvm::LLVMContext;
|
||||||
|
auto program = parseIRFile(file_name, Err, *globalContext).release();
|
||||||
|
if (!program) {
|
||||||
|
printf("error when opening the bitcode\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
return program;
|
||||||
|
}
|
||||||
|
|
||||||
|
void VerifyModule(llvm::Module *program) {
|
||||||
|
std::string msg;
|
||||||
|
llvm::raw_string_ostream os(msg);
|
||||||
|
if (llvm::verifyModule(*program, &(llvm::errs())))
|
||||||
|
llvm::report_fatal_error(os.str().c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
void DumpModule(llvm::Module *M, char *file_name) {
|
||||||
|
// write the (possibly transformed) module out as a bitcode file
std::string msg;
|
||||||
|
llvm::raw_string_ostream os(msg);
|
||||||
|
std::error_code EC;
|
||||||
|
ToolOutputFile Out(file_name, EC, sys::fs::F_None);
|
||||||
|
if (EC) {
|
||||||
|
errs() << "Fails to open output file: " << EC.message();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
WriteBitcodeToFile(*M, Out.os());
|
||||||
|
Out.keep();
|
||||||
|
}
|
||||||
|
|
||||||
|
bool isKernelFunction(llvm::Module *M, llvm::Function *F) {
|
||||||
|
NamedMDNode *NamedMD = M->getNamedMetadata("nvvm.annotations");
|
||||||
|
if (!NamedMD) {
|
||||||
|
printf("there must be nvvm.annotations!\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
for (unsigned I = 0, E = NamedMD->getNumOperands(); I != E; ++I) {
|
||||||
|
MDNode *MD = NamedMD->getOperand(I);
|
||||||
|
if (!MD || MD->getNumOperands() == 0)
|
||||||
|
continue;
|
||||||
|
if (MD->getNumOperands() != 3)
|
||||||
|
continue;
|
||||||
|
Metadata *Op = MD->getOperand(1);
|
||||||
|
if (auto Str = llvm::cast<MDString>(Op)) {
|
||||||
|
if (Str->getString().str() != "kernel")
|
||||||
|
continue;
|
||||||
|
llvm::Value *meta =
|
||||||
|
dyn_cast<llvm::ValueAsMetadata>(MD->getOperand(0))->getValue();
|
||||||
|
Function *FF = llvm::cast<Function>(meta);
|
||||||
|
if (FF->getName().str() == F->getName().str())
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
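// Illustrative sketch (added for clarity, "foo" is a hypothetical kernel):
// the metadata walked above looks like this in textual IR:
//
//   !nvvm.annotations = !{!0}
//   !0 = !{void (float*)* @foo, !"kernel", i32 1}
//
// i.e. a 3-operand node whose second operand is the string "kernel" and whose
// first operand is the kernel function itself.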
|
||||||
|
|
||||||
|
void replace_block(llvm::Function *F, llvm::BasicBlock *before,
|
||||||
|
llvm::BasicBlock *after) {
|
||||||
|
for (Function::iterator i = F->begin(); i != F->end(); ++i) {
|
||||||
|
llvm::BasicBlock *bb = &(*i);
|
||||||
|
if (bb == after)
|
||||||
|
continue;
|
||||||
|
bb->getTerminator()->replaceUsesOfWith(before, after);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
llvm::CallInst *CreateInterWarpBarrier(llvm::Instruction *InsertBefore) {
|
||||||
|
llvm::Module *M = InsertBefore->getParent()->getParent()->getParent();
|
||||||
|
|
||||||
|
llvm::FunctionType *LauncherFuncT =
|
||||||
|
FunctionType::get(llvm::Type::getVoidTy(M->getContext()), {}, false);
|
||||||
|
|
||||||
|
llvm::FunctionCallee f =
|
||||||
|
M->getOrInsertFunction("llvm.nvvm.barrier0", LauncherFuncT);
|
||||||
|
llvm::Function *F = llvm::cast<llvm::Function>(f.getCallee());
|
||||||
|
return llvm::CallInst::Create(F, "", InsertBefore);
|
||||||
|
}
|
||||||
|
|
||||||
|
llvm::CallInst *CreateIntraWarpBarrier(llvm::Instruction *InsertBefore) {
|
||||||
|
llvm::Module *M = InsertBefore->getParent()->getParent()->getParent();
|
||||||
|
llvm::FunctionType *LauncherFuncT =
|
||||||
|
FunctionType::get(llvm::Type::getVoidTy(M->getContext()), {}, false);
|
||||||
|
llvm::FunctionCallee f =
|
||||||
|
M->getOrInsertFunction("llvm.nvvm.bar.warp.sync", LauncherFuncT);
|
||||||
|
llvm::Function *F = llvm::cast<llvm::Function>(f.getCallee());
|
||||||
|
return llvm::CallInst::Create(F, "", InsertBefore);
|
||||||
|
}
|
||||||
|
|
||||||
|
llvm::Instruction *BreakPHIToAllocas(PHINode *phi) {
|
||||||
|
|
||||||
|
std::string allocaName = std::string(phi->getName().str()) + ".ex_phi";
|
||||||
|
|
||||||
|
llvm::Function *function = phi->getParent()->getParent();
|
||||||
|
|
||||||
|
IRBuilder<> builder(&*(function->getEntryBlock().getFirstInsertionPt()));
|
||||||
|
|
||||||
|
llvm::Instruction *alloca =
|
||||||
|
builder.CreateAlloca(phi->getType(), 0, allocaName);
|
||||||
|
|
||||||
|
for (unsigned incoming = 0; incoming < phi->getNumIncomingValues();
|
||||||
|
++incoming) {
|
||||||
|
Value *val = phi->getIncomingValue(incoming);
|
||||||
|
BasicBlock *incomingBB = phi->getIncomingBlock(incoming);
|
||||||
|
builder.SetInsertPoint(incomingBB->getTerminator());
|
||||||
|
llvm::Instruction *store = builder.CreateStore(val, alloca);
|
||||||
|
}
|
||||||
|
builder.SetInsertPoint(phi);
|
||||||
|
|
||||||
|
llvm::Instruction *loadedValue = builder.CreateLoad(alloca);
|
||||||
|
phi->replaceAllUsesWith(loadedValue);
|
||||||
|
phi->eraseFromParent();
|
||||||
|
|
||||||
|
return loadedValue;
|
||||||
|
}
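// Illustrative sketch (added for clarity): BreakPHIToAllocas demotes
//
//   merge:  %x = phi i32 [ %a, %then ], [ %b, %else ]
//
// into an entry-block alloca that is stored in each predecessor and reloaded
// at the merge point:
//
//   entry:  %x.ex_phi = alloca i32
//   then:   store i32 %a, i32* %x.ex_phi
//   else:   store i32 %b, i32* %x.ex_phi
//   merge:  %x = load i32, i32* %x.ex_phi
//
// which keeps such values in memory so the later context-save rewriting does
// not have to reason about SSA phi nodes.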
|
||||||
|
|
||||||
|
void phi2alloc(llvm::Module *M) {
|
||||||
|
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||||
|
Function *F = &(*i);
|
||||||
|
auto func_name = F->getName().str();
|
||||||
|
if (!isKernelFunction(M, F))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
typedef std::vector<llvm::Instruction *> InstructionVec;
|
||||||
|
|
||||||
|
InstructionVec PHIs;
|
||||||
|
|
||||||
|
for (Function::iterator bb = F->begin(); bb != F->end(); ++bb) {
|
||||||
|
for (BasicBlock::iterator p = bb->begin(); p != bb->end(); ++p) {
|
||||||
|
Instruction *instr = &*p;
|
||||||
|
if (isa<PHINode>(instr)) {
|
||||||
|
PHIs.push_back(instr);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
bool changed = false;
|
||||||
|
for (InstructionVec::iterator i = PHIs.begin(); i != PHIs.end(); ++i) {
|
||||||
|
Instruction *instr = *i;
|
||||||
|
BreakPHIToAllocas(dyn_cast<PHINode>(instr));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void remove_cuda_built_in(llvm::Module *M) {
|
||||||
|
// initialize function name
|
||||||
|
std::set<std::string> useless_func_name;
|
||||||
|
useless_func_name.insert("cudaMalloc");
|
||||||
|
useless_func_name.insert("cudaFuncGetAttributes");
|
||||||
|
useless_func_name.insert("cudaGetDevice");
|
||||||
|
useless_func_name.insert("cudaDeviceGetAttribute");
|
||||||
|
useless_func_name.insert("cudaOccupancyMaxActiveBlocksPerMultiprocessor");
|
||||||
|
useless_func_name.insert(
|
||||||
|
"cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags");
|
||||||
|
|
||||||
|
std::set<llvm::Function *> need_remove;
|
||||||
|
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||||
|
Function *F = &(*i);
|
||||||
|
auto func_name = F->getName().str();
|
||||||
|
if (useless_func_name.find(func_name) != useless_func_name.end()) {
|
||||||
|
need_remove.insert(F);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (auto F : need_remove) {
|
||||||
|
F->dropAllReferences();
|
||||||
|
F->eraseFromParent();
|
||||||
|
}
|
||||||
|
}
void replace_built_in_function(llvm::Module *M, int *grid_dim,
                               int *block_dim) {
  LLVMContext &context = M->getContext();
  auto I32 = llvm::Type::getInt32Ty(context);
  std::vector<llvm::Instruction *> need_remove;
  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
    Function *F = &(*i);
    auto func_name = F->getName().str();
    if (!isKernelFunction(M, F))
      continue;

    IRBuilder<> builder(&*(F->getEntryBlock().getFirstInsertionPt()));
    auto global_intra_warp_idx =
        F->getParent()->getGlobalVariable("intra_warp_index");
    auto local_intra_warp_idx = builder.CreateAlloca(
        global_intra_warp_idx->getType()->getElementType(), 0,
        "local_intra_warp_idx");
    global_intra_warp_idx->replaceAllUsesWith(local_intra_warp_idx);
    auto global_inter_warp_idx =
        F->getParent()->getGlobalVariable("inter_warp_index");
    auto local_inter_warp_idx = builder.CreateAlloca(
        global_inter_warp_idx->getType()->getElementType(), 0,
        "local_inter_warp_idx");
    global_inter_warp_idx->replaceAllUsesWith(local_inter_warp_idx);

    for (auto BB = F->begin(); BB != F->end(); ++BB) {
      for (auto BI = BB->begin(); BI != BB->end(); BI++) {
        if (auto Load = dyn_cast<LoadInst>(BI)) {
          auto load_from = Load->getOperand(0);
          if (load_from == F->getParent()->getGlobalVariable("block_size")) {
            Load->replaceAllUsesWith(ConstantInt::get(
                I32, block_dim[0] * block_dim[1] * block_dim[2]));
            need_remove.push_back(Load);
          }
        } else if (auto Call = dyn_cast<CallInst>(BI)) {
          if (Call->getCalledFunction()) {
            auto func_name = Call->getCalledFunction()->getName().str();
            if (func_name == "llvm.nvvm.read.ptx.sreg.tid.x") {
              // replace it by warp_id
              IRBuilder<> builder(context);
              builder.SetInsertPoint(Call);

              auto thread_idx = builder.CreateBinOp(
                  Instruction::Mul, builder.CreateLoad(local_inter_warp_idx),
                  ConstantInt::get(I32, 32), "");
              thread_idx = builder.CreateBinOp(
                  Instruction::Add, builder.CreateLoad(local_intra_warp_idx),
                  thread_idx, "thread_idx");
              if (block_dim[1] != 1 || block_dim[2] != 1) {
                printf("block y: %d block z: %d\n", block_dim[1], block_dim[2]);
                thread_idx = builder.CreateBinOp(
                    Instruction::SRem, thread_idx,
                    ConstantInt::get(I32, block_dim[0]), "thread_id_x");
              }

              Call->replaceAllUsesWith(thread_idx);
              need_remove.push_back(Call);
            } else if (func_name == "llvm.nvvm.read.ptx.sreg.tid.y") {
              // replace it by warp_id
              IRBuilder<> builder(context);
              builder.SetInsertPoint(Call);

              auto thread_idx = builder.CreateBinOp(
                  Instruction::Mul, builder.CreateLoad(local_inter_warp_idx),
                  ConstantInt::get(I32, 32), "");
              thread_idx = builder.CreateBinOp(
                  Instruction::Add, builder.CreateLoad(local_intra_warp_idx),
                  thread_idx, "thread_idx");
              // tidy = tid / block_dim.x
              thread_idx = builder.CreateBinOp(
                  Instruction::SDiv, thread_idx,
                  ConstantInt::get(I32, block_dim[0]),
                  // builder.CreateLoad(M->getGlobalVariable("block_size_x")),
                  "thread_id_y");

              Call->replaceAllUsesWith(thread_idx);
              need_remove.push_back(Call);
            } else if (func_name == "llvm.nvvm.read.ptx.sreg.tid.z") {
              printf("[WARNING] We DO NOT support multi-dim block\n");
              auto zero = ConstantInt::get(I32, 0);
              Call->replaceAllUsesWith(zero);
              need_remove.push_back(Call);
            } else if (func_name == "llvm.nvvm.read.ptx.sreg.ctaid.x") {
              auto block_index_addr = M->getGlobalVariable("block_index");
              IRBuilder<> builder(context);
              builder.SetInsertPoint(Call);
              auto block_idx = builder.CreateLoad(block_index_addr);
              Call->replaceAllUsesWith(block_idx);
              need_remove.push_back(Call);
            } else if (func_name == "llvm.nvvm.read.ptx.sreg.ctaid.y" ||
                       func_name == "llvm.nvvm.read.ptx.sreg.ctaid.z") {
              printf("[WARNING] We DO NOT support multi-dim grid\n");
              auto zero = ConstantInt::get(I32, 0);
              Call->replaceAllUsesWith(zero);
              need_remove.push_back(Call);
            } else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.x") {
              auto block_size_addr = M->getGlobalVariable("block_size_x");
              IRBuilder<> builder(context);
              builder.SetInsertPoint(Call);
              auto block_size = ConstantInt::get(I32, block_dim[0]);
              Call->replaceAllUsesWith(block_size);
              need_remove.push_back(Call);
            } else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.y") {
              auto block_size_addr = M->getGlobalVariable("block_size_y");
              IRBuilder<> builder(context);
              builder.SetInsertPoint(Call);
              auto block_size = ConstantInt::get(I32, block_dim[1]);
              Call->replaceAllUsesWith(block_size);
              need_remove.push_back(Call);
            } else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.z") {
              auto block_size_addr = M->getGlobalVariable("block_size_z");
              IRBuilder<> builder(context);
              builder.SetInsertPoint(Call);
              auto block_size = ConstantInt::get(I32, block_dim[2]);
              Call->replaceAllUsesWith(block_size);
              need_remove.push_back(Call);
            } else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.x") {
              auto grid_size_addr = M->getGlobalVariable("grid_size");
              IRBuilder<> builder(context);
              builder.SetInsertPoint(Call);
              auto grid_size = ConstantInt::get(I32, grid_dim[0]);
              Call->replaceAllUsesWith(grid_size);
              need_remove.push_back(Call);
            } else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.y" ||
                       func_name == "llvm.nvvm.read.ptx.sreg.nctaid.z") {
              printf("[WARNING] We DO NOT support multi-dim grid\n");
              auto one = ConstantInt::get(I32, 1);
              Call->replaceAllUsesWith(one);
              need_remove.push_back(Call);
            }
          }
          if (Call->isInlineAsm()) {
            auto asm_inst = dyn_cast<InlineAsm>(Call->getCalledOperand());
            if (asm_inst->getAsmString() != "mov.u32 $0, %laneid;") {
              printf("unknown InlineAsm\n");
              exit(1);
            }
            // return the rank within the warp
            IRBuilder<> builder(context);
            builder.SetInsertPoint(Call);
            auto intra_warp_index = builder.CreateLoad(local_intra_warp_idx);
            Call->replaceAllUsesWith(intra_warp_index);
            need_remove.push_back(Call);
          }
        }
      }
    }
  }

  for (auto inst : need_remove) {
    inst->eraseFromParent();
  }
}
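
// Some kernels query the lane id through inline PTX rather than an intrinsic;
// replace the `mov.u32 $0, %laneid;` asm with a load of intra_warp_index.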
void replace_asm_call(llvm::Module *M) {
  LLVMContext &context = M->getContext();
  auto I32 = llvm::Type::getInt32Ty(context);
  std::vector<CallInst *> need_remove;
  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
    Function *F = &(*i);
    auto func_name = F->getName().str();
    if (!isKernelFunction(M, F))
      continue;

    for (auto BB = F->begin(); BB != F->end(); ++BB) {
      for (auto BI = BB->begin(); BI != BB->end(); BI++) {
        if (auto Call = dyn_cast<CallInst>(BI)) {
          if (Call->isInlineAsm()) {
            auto asm_inst = dyn_cast<InlineAsm>(Call->getCalledOperand());
            if (asm_inst->getAsmString() != "mov.u32 $0, %laneid;") {
              printf("unknown InlineAsm\n");
              exit(1);
            }
            // return the rank within the warp
            IRBuilder<> builder(context);
            builder.SetInsertPoint(Call);
            auto intra_warp_index_addr =
                M->getGlobalVariable("intra_warp_index");
            auto intra_warp_index = builder.CreateLoad(intra_warp_index_addr);
            Call->replaceAllUsesWith(intra_warp_index);
            need_remove.push_back(Call);
          }
        }
      }
    }
  }
  for (auto inst : need_remove) {
    inst->eraseFromParent();
  }
}
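
// Small helpers used by the control-flow transformations: detect whether a
// basic block (or a CFG region between two blocks) contains a warp-level or
// block-level barrier intrinsic.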
bool has_warp_barrier(llvm::BasicBlock *B) {
  for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) {
    Instruction *inst = &(*i);
    llvm::CallInst *Call = llvm::dyn_cast<llvm::CallInst>(inst);
    if (Call) {
      auto func_name = Call->getCalledFunction()->getName().str();
      if (func_name == "llvm.nvvm.bar.warp.sync") {
        return true;
      }
    }
  }
  return false;
}

bool has_barrier(llvm::BasicBlock *B) {
  for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) {
    Instruction *inst = &(*i);
    llvm::CallInst *Call = llvm::dyn_cast<llvm::CallInst>(inst);
    if (Call) {
      auto func_name = Call->getCalledFunction()->getName().str();
      if (func_name == "llvm.nvvm.barrier0" ||
          func_name == "llvm.nvvm.bar.warp.sync" ||
          func_name == "llvm.nvvm.barrier.sync") {
        return true;
      }
    }
  }
  return false;
}

bool has_block_barrier(llvm::BasicBlock *B) {
  for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) {
    Instruction *inst = &(*i);
    llvm::CallInst *Call = llvm::dyn_cast<llvm::CallInst>(inst);
    if (Call) {
      auto func_name = Call->getCalledFunction()->getName().str();
      if (func_name == "llvm.nvvm.barrier0" ||
          func_name == "llvm.nvvm.barrier.sync") {
        return true;
      }
    }
  }
  return false;
}

bool has_barrier(llvm::Function *F) {
  for (auto B = F->begin(); B != F->end(); B++) {
    if (has_barrier(&(*B)))
      return true;
  }
  return false;
}

bool find_block_barrier_in_region(llvm::BasicBlock *start,
                                  llvm::BasicBlock *end) {
  std::set<llvm::BasicBlock *> visit;
  std::vector<llvm::BasicBlock *> pending_blocks;
  for (int i = 0; i < start->getTerminator()->getNumSuccessors(); i++) {
    pending_blocks.push_back(start->getTerminator()->getSuccessor(i));
  }
  while (!pending_blocks.empty()) {
    BasicBlock *current = pending_blocks.back();
    pending_blocks.pop_back();
    if (visit.find(current) != visit.end())
      continue;
    visit.insert(current);
    if (current == end)
      continue;
    if (has_block_barrier(current)) {
      return true;
    }
    for (int i = 0; i < current->getTerminator()->getNumSuccessors(); i++) {
      pending_blocks.push_back(current->getTerminator()->getSuccessor(i));
    }
  }
  return false;
}

bool find_barrier_in_region(llvm::BasicBlock *start, llvm::BasicBlock *end) {
  std::set<llvm::BasicBlock *> visit;
  std::vector<llvm::BasicBlock *> pending_blocks;
  for (int i = 0; i < start->getTerminator()->getNumSuccessors(); i++) {
    pending_blocks.push_back(start->getTerminator()->getSuccessor(i));
  }
  while (!pending_blocks.empty()) {
    BasicBlock *current = pending_blocks.back();
    pending_blocks.pop_back();
    if (visit.find(current) != visit.end())
      continue;
    visit.insert(current);
    if (current == end)
      continue;
    if (has_barrier(current)) {
      return true;
    }
    for (int i = 0; i < current->getTerminator()->getNumSuccessors(); i++) {
      pending_blocks.push_back(current->getTerminator()->getSuccessor(i));
    }
  }
  return false;
}
@ -0,0 +1,217 @@
#include "warp_func.h"
#include "tool.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <iostream>
#include <set>

using namespace llvm;

/*
 * Insert a sync before each vote, and replace the
 * original vote intrinsic with the warp vote version
 */
void handle_warp_vote(llvm::Module *M) {
  std::set<llvm::CallInst *> need_replace;
  llvm::Type *Int1T = Type::getInt1Ty(M->getContext());
  llvm::Type *I32 = llvm::Type::getInt32Ty(M->getContext());
  llvm::Type *I8 = llvm::Type::getInt8Ty(M->getContext());
  auto zero = llvm::ConstantInt::get(I32, 0, true);
  auto one = llvm::ConstantInt::get(I32, 1, true);
  llvm::Type *VoteArrayType = llvm::ArrayType::get(I8, 32)->getPointerTo();

  llvm::FunctionType *LauncherFuncT =
      FunctionType::get(Int1T, {VoteArrayType}, false);
  llvm::FunctionCallee _f = M->getOrInsertFunction("warp_any", LauncherFuncT);
  llvm::Function *func_warp_any = llvm::cast<llvm::Function>(_f.getCallee());
  _f = M->getOrInsertFunction("warp_all", LauncherFuncT);
  llvm::Function *func_warp_all = llvm::cast<llvm::Function>(_f.getCallee());

  // collect llvm.nvvm.vote.{any,all}.sync calls in kernel functions
  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
    Function *F = &(*i);
    auto func_name = F->getName().str();
    if (!isKernelFunction(M, F))
      continue;
    Function::iterator I = F->begin();
    for (Function::iterator E = F->end(); I != E; ++I) {
      for (BasicBlock::iterator BI = I->begin(); BI != I->end(); BI++) {
        if (CallInst *vote_any_sync = dyn_cast<CallInst>(BI)) {
          auto func_name = vote_any_sync->getCalledFunction()->getName();
          if (func_name == "llvm.nvvm.vote.any.sync" ||
              func_name == "llvm.nvvm.vote.all.sync") {
            // insert sync before call
            need_replace.insert(vote_any_sync);
          }
        }
      }
    }
  }
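
  // Rewrite phase: for each collected vote, synchronize the warp, publish the
  // lane's predicate into warp_vote[intra_warp_index] (widened to i8, since
  // the AVX helpers work on one byte per lane), then call warp_any/warp_all
  // on the shared buffer and substitute the result for the original call.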
  GlobalVariable *warp_vote_ptr = M->getNamedGlobal("warp_vote");
  for (auto sync_inst : need_replace) {
    // create barrier
    CreateIntraWarpBarrier(sync_inst);
    /*
     * store into warp_vote[tid]
     */
    assert(warp_vote_ptr != NULL);
    auto intra_warp_index_addr = M->getGlobalVariable("intra_warp_index");
    auto intra_warp_index =
        new LoadInst(intra_warp_index_addr, "intra_warp_index", sync_inst);

    auto GEP = GetElementPtrInst::Create(NULL,                     // Pointee type
                                         warp_vote_ptr,            // Alloca
                                         {zero, intra_warp_index}, // Indices
                                         "", sync_inst);

    // AVX only provides 8 bits per lane, so cast the predicate into int8
    auto predict = llvm::CastInst::CreateIntegerCast(
        sync_inst->getArgOperand(1), I8, false, "", sync_inst);
    // take the thread mask into account
    auto mask = llvm::CastInst::CreateIntegerCast(sync_inst->getArgOperand(0),
                                                  I32, false, "", sync_inst);
    auto bit_flag = BinaryOperator::Create(Instruction::LShr, mask,
                                           intra_warp_index, "", sync_inst);
    auto valid =
        BinaryOperator::Create(Instruction::And, one, bit_flag, "", sync_inst);
    auto valid_8bit =
        llvm::CastInst::CreateIntegerCast(valid, I8, false, "", sync_inst);

    llvm::Instruction *res;
    if (sync_inst->getCalledFunction()->getName() ==
        "llvm.nvvm.vote.any.sync") {
      res = BinaryOperator::Create(Instruction::Mul, valid_8bit, predict, "",
                                   sync_inst);
    } else if (sync_inst->getCalledFunction()->getName() ==
               "llvm.nvvm.vote.all.sync") {
      auto reverse_valid = BinaryOperator::CreateNot(valid_8bit, "", sync_inst);
      res = BinaryOperator::Create(Instruction::Or, reverse_valid, predict, "",
                                   sync_inst);
      // AVX has no "all" reduction, so reverse the result
      // and use the AVX "any" reduction instead
      res = BinaryOperator::CreateNot(res, "", sync_inst);
    }

    auto store_mask = new llvm::StoreInst(res, GEP, "", sync_inst);
    // create barrier
    CreateIntraWarpBarrier(sync_inst);
    /*
     * replace llvm.nvvm.vote.any.sync(i32 mask, i1 predict)
     * with warp_any(i8* warp_vote)
     */
    std::vector<Value *> args;
    // args.push_back(mask);
    args.push_back(warp_vote_ptr);
    llvm::Instruction *warp_inst;
    if (sync_inst->getCalledFunction()->getName() ==
        "llvm.nvvm.vote.any.sync") {
      warp_inst = llvm::CallInst::Create(func_warp_any, args, "", sync_inst);
    } else if (sync_inst->getCalledFunction()->getName() ==
               "llvm.nvvm.vote.all.sync") {
      warp_inst = llvm::CallInst::Create(func_warp_all, args, "", sync_inst);
    }
    sync_inst->replaceAllUsesWith(warp_inst);
    sync_inst->eraseFromParent();
  }
}
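
// Rewrite llvm.nvvm.shfl.sync.{down,up,bfly}.i32: each lane publishes its
// value into the per-warp exchange buffer warp_shfl[32], the warp is
// synchronized, and the result is re-read from the shifted/xor-ed slot.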
void handle_warp_shfl(llvm::Module *M) {
  std::set<llvm::CallInst *> need_replace;

  llvm::Type *I32 = llvm::Type::getInt32Ty(M->getContext());
  auto ZERO = llvm::ConstantInt::get(I32, 0, true);
  // collect the warp shuffle intrinsics used in kernel functions
  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
    Function *F = &(*i);
    if (!isKernelFunction(M, F))
      continue;
    Function::iterator I = F->begin();
    for (Function::iterator E = F->end(); I != E; ++I) {
      for (BasicBlock::iterator BI = I->begin(); BI != I->end(); BI++) {
        if (CallInst *warp_shfl = dyn_cast<CallInst>(BI)) {
          auto func_name = warp_shfl->getCalledFunction()->getName();
          if (func_name == "llvm.nvvm.shfl.sync.down.i32" ||
              func_name == "llvm.nvvm.shfl.sync.up.i32" ||
              func_name == "llvm.nvvm.shfl.sync.bfly.i32") {
            // insert sync before call
            need_replace.insert(warp_shfl);
          }
        }
      }
    }
  }

  GlobalVariable *warp_shfl_ptr = M->getNamedGlobal("warp_shfl");
  for (auto shfl_inst : need_replace) {
    /*
     * %10 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add32,
     *                                                   i32 16, i32 31)
     * ->
     * warp_shfl[warp_id] = add32
     * warp.barrier()
     * %10 = warp_shfl[warp_id + offset]
     */
    IRBuilder<> builder(shfl_inst);

    auto shfl_variable = shfl_inst->getArgOperand(1);
    auto shfl_offset = shfl_inst->getArgOperand(2);

    auto intra_warp_index =
        builder.CreateLoad(M->getGlobalVariable("intra_warp_index"));
    builder.CreateStore(
        shfl_variable,
        builder.CreateGEP(warp_shfl_ptr, {ZERO, intra_warp_index}));
    // we should create barrier before store
    CreateIntraWarpBarrier(intra_warp_index);
    // load shuffled data
    auto new_intra_warp_index =
        builder.CreateLoad(M->getGlobalVariable("intra_warp_index"));
    auto shfl_name = shfl_inst->getCalledFunction()->getName().str();
    if (shfl_name.find("down") != shfl_name.npos) {
      auto calculate_offset = builder.CreateBinOp(
          Instruction::Add, new_intra_warp_index, shfl_offset);
      auto new_index = builder.CreateBinOp(Instruction::SRem, calculate_offset,
                                           ConstantInt::get(I32, 32));
      auto gep = builder.CreateGEP(warp_shfl_ptr, {ZERO, new_index});
      auto load_inst = builder.CreateLoad(gep);

      // create barrier
      CreateIntraWarpBarrier(new_intra_warp_index);
      shfl_inst->replaceAllUsesWith(load_inst);
      shfl_inst->eraseFromParent();
    } else if (shfl_name.find("up") != shfl_name.npos) {
      auto calculate_offset = builder.CreateBinOp(
          Instruction::Sub, new_intra_warp_index, shfl_offset);
      auto new_index = builder.CreateBinOp(Instruction::SRem, calculate_offset,
                                           ConstantInt::get(I32, 32));
      auto gep = builder.CreateGEP(warp_shfl_ptr, {ZERO, new_index});
      auto load_inst = builder.CreateLoad(gep);

      // create barrier
      CreateIntraWarpBarrier(new_intra_warp_index);
      shfl_inst->replaceAllUsesWith(load_inst);
      shfl_inst->eraseFromParent();
    } else if (shfl_name.find("bfly") != shfl_name.npos) {
      auto calculate_offset = builder.CreateBinOp(
          Instruction::Xor, new_intra_warp_index, shfl_offset);
      auto new_index = builder.CreateBinOp(Instruction::SRem, calculate_offset,
                                           ConstantInt::get(I32, 32));
      auto gep = builder.CreateGEP(warp_shfl_ptr, {ZERO, new_index});
      auto load_inst = builder.CreateLoad(gep);

      // create barrier
      CreateIntraWarpBarrier(new_intra_warp_index);
      shfl_inst->replaceAllUsesWith(load_inst);
      shfl_inst->eraseFromParent();
    }
  }
}
|
|
@ -0,0 +1,82 @@
|
||||||
|
#include <assert.h>
|
||||||
|
#include <pthread.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <sys/time.h>
|
||||||
|
|
||||||
|
#define NUM_WARP 2
|
||||||
|
#define NUM_BLOCK 1
|
||||||
|
|
||||||
|
int block_size = 32 * NUM_WARP;
|
||||||
|
int block_size_x = block_size;
|
||||||
|
int block_size_y = 1;
|
||||||
|
int block_size_z = 1;
|
||||||
|
__thread int block_index = 0;
|
||||||
|
int grid_size = NUM_BLOCK;
|
||||||
|
|
||||||
|
extern "C" {
|
||||||
|
void *_Z7reduce0PiS_j_wrapper(void *);
|
||||||
|
__thread int warp_shfl[32];
|
||||||
|
}
|
||||||
|
|
||||||
|
void *wrap(void *p) {
|
||||||
|
int **res = (int **)p;
|
||||||
|
block_index = (*(int *)res[3]);
|
||||||
|
_Z7reduce0PiS_j_wrapper(p);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
void *gen_input(int bid, int *g_idata, int *g_odata, unsigned int n) {
|
||||||
|
int **ret = new int *[4];
|
||||||
|
|
||||||
|
int **p0 = new int *;
|
||||||
|
*p0 = g_idata;
|
||||||
|
ret[0] = (int *)(p0);
|
||||||
|
|
||||||
|
int **p1 = new int *;
|
||||||
|
*p1 = g_odata;
|
||||||
|
ret[1] = (int *)(p1);
|
||||||
|
|
||||||
|
unsigned int *p2 = new unsigned int;
|
||||||
|
*p2 = n;
|
||||||
|
ret[2] = (int *)p2;
|
||||||
|
|
||||||
|
int *p3 = new int;
|
||||||
|
*p3 = bid;
|
||||||
|
ret[3] = (int *)p3;
|
||||||
|
|
||||||
|
return (void *)ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char *argv[]) {
|
||||||
|
int *g_idata;
|
||||||
|
|
||||||
|
int size = block_size * NUM_BLOCK;
|
||||||
|
g_idata = new int[size * 2];
|
||||||
|
int *res = new int[size];
|
||||||
|
|
||||||
|
for (int i = 0; i < size; i++) {
|
||||||
|
g_idata[i] = i;
|
||||||
|
}
|
||||||
|
|
||||||
|
pthread_t threads[NUM_BLOCK];
|
||||||
|
|
||||||
|
void *inp[NUM_BLOCK];
|
||||||
|
for (long t = 0; t < NUM_BLOCK; t++) {
|
||||||
|
inp[t] = gen_input(t, g_idata, res, size);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (long t = 0; t < NUM_BLOCK; t++) {
|
||||||
|
pthread_create(&threads[t], NULL, wrap, inp[t]);
|
||||||
|
}
|
||||||
|
for (long t = 0; t < NUM_BLOCK; t++)
|
||||||
|
pthread_join(threads[t], NULL);
|
||||||
|
int gold = 0;
|
||||||
|
for (int i = 0; i < size; i++) {
|
||||||
|
gold += g_idata[i];
|
||||||
|
}
|
||||||
|
assert(*res == gold && "Incorrect res\n");
|
||||||
|
printf("PASS\n");
|
||||||
|
|
||||||
|
pthread_exit(NULL);
|
||||||
|
}
|
|
@ -0,0 +1,150 @@
|
||||||
|
; ModuleID = 'kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc'
|
||||||
|
source_filename = "kernel.cu"
|
||||||
|
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
|
||||||
|
target triple = "nvptx64-nvidia-cuda"
|
||||||
|
|
||||||
|
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
|
||||||
|
|
||||||
|
@_ZZ7reduce0PiS_jE5sdata = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
|
||||||
|
|
||||||
|
; Function Attrs: nounwind
|
||||||
|
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) local_unnamed_addr #0 {
|
||||||
|
entry:
|
||||||
|
ret i32 999
|
||||||
|
}
|
||||||
|
|
||||||
|
; Function Attrs: nounwind
|
||||||
|
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) local_unnamed_addr #0 {
|
||||||
|
entry:
|
||||||
|
ret i32 999
|
||||||
|
}
|
||||||
|
|
||||||
|
; Function Attrs: nounwind
|
||||||
|
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) local_unnamed_addr #0 {
|
||||||
|
entry:
|
||||||
|
ret i32 999
|
||||||
|
}
|
||||||
|
|
||||||
|
; Function Attrs: nounwind
|
||||||
|
define weak dso_local i32 @cudaGetDevice(i32* %device) local_unnamed_addr #0 {
|
||||||
|
entry:
|
||||||
|
ret i32 999
|
||||||
|
}
|
||||||
|
|
||||||
|
; Function Attrs: nounwind
|
||||||
|
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) local_unnamed_addr #0 {
|
||||||
|
entry:
|
||||||
|
ret i32 999
|
||||||
|
}
|
||||||
|
|
||||||
|
; Function Attrs: nounwind
|
||||||
|
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) local_unnamed_addr #0 {
|
||||||
|
entry:
|
||||||
|
ret i32 999
|
||||||
|
}
|
||||||
|
|
||||||
|
; Function Attrs: convergent nounwind
|
||||||
|
define dso_local void @_Z7reduce0PiS_j(i32* nocapture readonly %g_idata, i32* nocapture %g_odata, i32 %n) local_unnamed_addr #1 {
|
||||||
|
entry:
|
||||||
|
%0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #4, !range !10
|
||||||
|
%1 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #4, !range !11
|
||||||
|
%2 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #4, !range !12
|
||||||
|
%mul = mul i32 %2, %1
|
||||||
|
%add = add i32 %mul, %0
|
||||||
|
%cmp = icmp ult i32 %add, %n
|
||||||
|
br i1 %cmp, label %cond.true, label %cond.end
|
||||||
|
|
||||||
|
cond.true: ; preds = %entry
|
||||||
|
%idxprom = zext i32 %add to i64
|
||||||
|
%arrayidx = getelementptr inbounds i32, i32* %g_idata, i64 %idxprom
|
||||||
|
%3 = load i32, i32* %arrayidx, align 4, !tbaa !13
|
||||||
|
br label %cond.end
|
||||||
|
|
||||||
|
cond.end: ; preds = %entry, %cond.true
|
||||||
|
%cond = phi i32 [ %3, %cond.true ], [ 0, %entry ]
|
||||||
|
%idxprom5 = zext i32 %0 to i64
|
||||||
|
%arrayidx635 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @_ZZ7reduce0PiS_jE5sdata, i64 0, i64 %idxprom5
|
||||||
|
%arrayidx6 = addrspacecast i32 addrspace(3)* %arrayidx635 to i32*
|
||||||
|
store i32 %cond, i32* %arrayidx6, align 4, !tbaa !13
|
||||||
|
tail call void @llvm.nvvm.barrier.sync(i32 0) #4
|
||||||
|
%cmp839 = icmp ugt i32 %2, 1
|
||||||
|
br i1 %cmp839, label %for.body, label %for.cond.cleanup
|
||||||
|
|
||||||
|
for.cond.cleanup: ; preds = %if.end, %cond.end
|
||||||
|
%cmp18 = icmp eq i32 %0, 0
|
||||||
|
br i1 %cmp18, label %if.then19, label %if.end23
|
||||||
|
|
||||||
|
for.body: ; preds = %cond.end, %if.end
|
||||||
|
%s.040 = phi i32 [ %mul9, %if.end ], [ 1, %cond.end ]
|
||||||
|
%mul9 = shl nuw nsw i32 %s.040, 1
|
||||||
|
%rem = urem i32 %0, %mul9
|
||||||
|
%cmp10 = icmp eq i32 %rem, 0
|
||||||
|
br i1 %cmp10, label %if.then, label %if.end
|
||||||
|
|
||||||
|
if.then: ; preds = %for.body
|
||||||
|
%add11 = add i32 %s.040, %0
|
||||||
|
%idxprom12 = zext i32 %add11 to i64
|
||||||
|
%arrayidx1336 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @_ZZ7reduce0PiS_jE5sdata, i64 0, i64 %idxprom12
|
||||||
|
%arrayidx13 = addrspacecast i32 addrspace(3)* %arrayidx1336 to i32*
|
||||||
|
%4 = load i32, i32* %arrayidx13, align 4, !tbaa !13
|
||||||
|
%5 = load i32, i32* %arrayidx6, align 4, !tbaa !13
|
||||||
|
%add16 = add nsw i32 %5, %4
|
||||||
|
store i32 %add16, i32* %arrayidx6, align 4, !tbaa !13
|
||||||
|
br label %if.end
|
||||||
|
|
||||||
|
if.end: ; preds = %if.then, %for.body
|
||||||
|
tail call void @llvm.nvvm.barrier.sync(i32 0) #4
|
||||||
|
%cmp8 = icmp ult i32 %mul9, %2
|
||||||
|
br i1 %cmp8, label %for.body, label %for.cond.cleanup
|
||||||
|
|
||||||
|
if.then19: ; preds = %for.cond.cleanup
|
||||||
|
%idxprom21 = zext i32 %1 to i64
|
||||||
|
%arrayidx22 = getelementptr inbounds i32, i32* %g_odata, i64 %idxprom21
|
||||||
|
%6 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* addrspacecast ([64 x i32] addrspace(3)* @_ZZ7reduce0PiS_jE5sdata to [64 x i32]*), i64 0, i64 0), align 4, !tbaa !13
|
||||||
|
store i32 %6, i32* %arrayidx22, align 4, !tbaa !13
|
||||||
|
br label %if.end23
|
||||||
|
|
||||||
|
if.end23: ; preds = %if.then19, %for.cond.cleanup
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; Function Attrs: nounwind readnone
|
||||||
|
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
|
||||||
|
|
||||||
|
; Function Attrs: nounwind readnone
|
||||||
|
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
|
||||||
|
|
||||||
|
; Function Attrs: nounwind readnone
|
||||||
|
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2
|
||||||
|
|
||||||
|
; Function Attrs: convergent nounwind
|
||||||
|
declare void @llvm.nvvm.barrier.sync(i32) #3
|
||||||
|
|
||||||
|
attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||||
|
attributes #1 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||||
|
attributes #2 = { nounwind readnone }
|
||||||
|
attributes #3 = { convergent nounwind }
|
||||||
|
attributes #4 = { nounwind }
|
||||||
|
|
||||||
|
!llvm.module.flags = !{!0, !1, !2}
|
||||||
|
!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
|
||||||
|
!llvm.ident = !{!8}
|
||||||
|
!nvvmir.version = !{!9}
|
||||||
|
|
||||||
|
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
|
||||||
|
!1 = !{i32 1, !"wchar_size", i32 4}
|
||||||
|
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
|
||||||
|
!3 = !{void (i32*, i32*, i32)* @_Z7reduce0PiS_j, !"kernel", i32 1}
|
||||||
|
!4 = !{null, !"align", i32 8}
|
||||||
|
!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
|
||||||
|
!6 = !{null, !"align", i32 16}
|
||||||
|
!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
|
||||||
|
!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
|
||||||
|
!9 = !{i32 1, i32 4}
|
||||||
|
!10 = !{i32 0, i32 1024}
|
||||||
|
!11 = !{i32 0, i32 2147483647}
|
||||||
|
!12 = !{i32 1, i32 1025}
|
||||||
|
!13 = !{!14, !14, i64 0}
|
||||||
|
!14 = !{!"int", !15, i64 0}
|
||||||
|
!15 = !{!"omnipotent char", !16, i64 0}
|
||||||
|
!16 = !{!"Simple C++ TBAA"}
|
|
@ -0,0 +1,6 @@
#!/bin/bash
llvm-as kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
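# The six trailing integers are presumably gridDim x/y/z followed by
# blockDim x/y/z (here: a single block of 64 threads, matching the host code).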
../../../build/compilation/kernelTranslator kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc 1 1 1 64 1 1
llc --filetype=obj kernel.bc
g++ host.cpp kernel.o -lpthread -o test
./test
|
|
@ -0,0 +1,82 @@
|
||||||
|
#include <assert.h>
|
||||||
|
#include <pthread.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <sys/time.h>
|
||||||
|
|
||||||
|
#define NUM_WARP 2
|
||||||
|
#define NUM_BLOCK 1
|
||||||
|
|
||||||
|
int block_size = 32 * NUM_WARP;
|
||||||
|
int block_size_x = block_size;
|
||||||
|
int block_size_y = 1;
|
||||||
|
int block_size_z = 1;
|
||||||
|
__thread int block_index = 0;
|
||||||
|
int grid_size = NUM_BLOCK;
|
||||||
|
|
||||||
|
extern "C" {
|
||||||
|
void *_Z7reduce5PiS_j_wrapper(void *);
|
||||||
|
__thread int warp_shfl[32];
|
||||||
|
}
|
||||||
|
|
||||||
|
void *wrap(void *p) {
|
||||||
|
int **res = (int **)p;
|
||||||
|
block_index = (*(int *)res[3]);
|
||||||
|
_Z7reduce5PiS_j_wrapper(p);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
void *gen_input(int bid, int *g_idata, int *g_odata, unsigned int n) {
|
||||||
|
int **ret = new int *[4];
|
||||||
|
|
||||||
|
int **p0 = new int *;
|
||||||
|
*p0 = g_idata;
|
||||||
|
ret[0] = (int *)(p0);
|
||||||
|
|
||||||
|
int **p1 = new int *;
|
||||||
|
*p1 = g_odata;
|
||||||
|
ret[1] = (int *)(p1);
|
||||||
|
|
||||||
|
unsigned int *p2 = new unsigned int;
|
||||||
|
*p2 = n;
|
||||||
|
ret[2] = (int *)p2;
|
||||||
|
|
||||||
|
int *p3 = new int;
|
||||||
|
*p3 = bid;
|
||||||
|
ret[3] = (int *)p3;
|
||||||
|
|
||||||
|
return (void *)ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char *argv[]) {
|
||||||
|
int *g_idata;
|
||||||
|
|
||||||
|
int size = block_size * NUM_BLOCK;
|
||||||
|
g_idata = new int[size * 2];
|
||||||
|
int *res = new int[size];
|
||||||
|
|
||||||
|
for (int i = 0; i < size; i++) {
|
||||||
|
g_idata[i] = i;
|
||||||
|
}
|
||||||
|
|
||||||
|
pthread_t threads[NUM_BLOCK];
|
||||||
|
|
||||||
|
void *inp[NUM_BLOCK];
|
||||||
|
for (long t = 0; t < NUM_BLOCK; t++) {
|
||||||
|
inp[t] = gen_input(t, g_idata, res, size);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (long t = 0; t < NUM_BLOCK; t++) {
|
||||||
|
pthread_create(&threads[t], NULL, wrap, inp[t]);
|
||||||
|
}
|
||||||
|
for (long t = 0; t < NUM_BLOCK; t++)
|
||||||
|
pthread_join(threads[t], NULL);
|
||||||
|
int gold = 0;
|
||||||
|
for (int i = 0; i < size; i++) {
|
||||||
|
gold += g_idata[i];
|
||||||
|
}
|
||||||
|
assert(*res == gold && "Incorrect res\n");
|
||||||
|
printf("PASS\n");
|
||||||
|
|
||||||
|
pthread_exit(NULL);
|
||||||
|
}
|
|
@ -0,0 +1,179 @@
|
||||||
|
; ModuleID = 'kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc'
|
||||||
|
source_filename = "kernel.cu"
|
||||||
|
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
|
||||||
|
target triple = "nvptx64-nvidia-cuda"
|
||||||
|
|
||||||
|
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
|
||||||
|
|
||||||
|
@_ZZ7reduce5PiS_jE5sdata = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
|
||||||
|
|
||||||
|
; Function Attrs: nounwind
|
||||||
|
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) local_unnamed_addr #0 {
|
||||||
|
entry:
|
||||||
|
ret i32 999
|
||||||
|
}
|
||||||
|
|
||||||
|
; Function Attrs: nounwind
|
||||||
|
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) local_unnamed_addr #0 {
|
||||||
|
entry:
|
||||||
|
ret i32 999
|
||||||
|
}
|
||||||
|
|
||||||
|
; Function Attrs: nounwind
|
||||||
|
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) local_unnamed_addr #0 {
|
||||||
|
entry:
|
||||||
|
ret i32 999
|
||||||
|
}
|
||||||
|
|
||||||
|
; Function Attrs: nounwind
|
||||||
|
define weak dso_local i32 @cudaGetDevice(i32* %device) local_unnamed_addr #0 {
|
||||||
|
entry:
|
||||||
|
ret i32 999
|
||||||
|
}
|
||||||
|
|
||||||
|
; Function Attrs: nounwind
|
||||||
|
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) local_unnamed_addr #0 {
|
||||||
|
entry:
|
||||||
|
ret i32 999
|
||||||
|
}
|
||||||
|
|
||||||
|
; Function Attrs: nounwind
|
||||||
|
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) local_unnamed_addr #0 {
|
||||||
|
entry:
|
||||||
|
ret i32 999
|
||||||
|
}
|
||||||
|
|
||||||
|
; Function Attrs: convergent nounwind
|
||||||
|
define dso_local void @_Z7reduce5PiS_j(i32* nocapture readonly %g_idata, i32* nocapture %g_odata, i32 %n) local_unnamed_addr #1 {
|
||||||
|
entry:
|
||||||
|
%0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #5, !range !10
|
||||||
|
%1 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #5, !range !11
|
||||||
|
%mul = shl i32 %1, 7
|
||||||
|
%add = add i32 %mul, %0
|
||||||
|
%cmp = icmp ult i32 %add, %n
|
||||||
|
br i1 %cmp, label %cond.true, label %cond.end
|
||||||
|
|
||||||
|
cond.true: ; preds = %entry
|
||||||
|
%idxprom = zext i32 %add to i64
|
||||||
|
%arrayidx = getelementptr inbounds i32, i32* %g_idata, i64 %idxprom
|
||||||
|
%2 = load i32, i32* %arrayidx, align 4, !tbaa !12
|
||||||
|
br label %cond.end
|
||||||
|
|
||||||
|
cond.end: ; preds = %entry, %cond.true
|
||||||
|
%cond = phi i32 [ %2, %cond.true ], [ 0, %entry ]
|
||||||
|
%add4 = add i32 %add, 64
|
||||||
|
%cmp5 = icmp ult i32 %add4, %n
|
||||||
|
br i1 %cmp5, label %if.then, label %if.end
|
||||||
|
|
||||||
|
if.then: ; preds = %cond.end
|
||||||
|
%idxprom7 = zext i32 %add4 to i64
|
||||||
|
%arrayidx8 = getelementptr inbounds i32, i32* %g_idata, i64 %idxprom7
|
||||||
|
%3 = load i32, i32* %arrayidx8, align 4, !tbaa !12
|
||||||
|
%add9 = add nsw i32 %3, %cond
|
||||||
|
br label %if.end
|
||||||
|
|
||||||
|
if.end: ; preds = %if.then, %cond.end
|
||||||
|
%mySum.0 = phi i32 [ %add9, %if.then ], [ %cond, %cond.end ]
|
||||||
|
%idxprom10 = zext i32 %0 to i64
|
||||||
|
%arrayidx1150 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @_ZZ7reduce5PiS_jE5sdata, i64 0, i64 %idxprom10
|
||||||
|
%arrayidx11 = addrspacecast i32 addrspace(3)* %arrayidx1150 to i32*
|
||||||
|
store i32 %mySum.0, i32* %arrayidx11, align 4, !tbaa !12
|
||||||
|
tail call void @llvm.nvvm.barrier.sync(i32 0) #5
|
||||||
|
tail call void @llvm.nvvm.barrier.sync(i32 0) #5
|
||||||
|
tail call void @llvm.nvvm.barrier.sync(i32 0) #5
|
||||||
|
tail call void @llvm.nvvm.barrier.sync(i32 0) #5
|
||||||
|
%4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.z() #5, !range !16
|
||||||
|
%5 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.y() #5, !range !17
|
||||||
|
%mul.i.i52 = mul nuw nsw i32 %5, %4
|
||||||
|
%6 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #5, !range !17
|
||||||
|
%7 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.y() #5, !range !10
|
||||||
|
%mul39.i.i53 = add nuw nsw i32 %7, %mul.i.i52
|
||||||
|
%add.i.i54 = mul nuw nsw i32 %mul39.i.i53, %6
|
||||||
|
%add8.i.i55 = add nuw nsw i32 %add.i.i54, %0
|
||||||
|
%cmp14 = icmp ult i32 %add8.i.i55, 32
|
||||||
|
br i1 %cmp14, label %if.then15, label %if.end32
|
||||||
|
|
||||||
|
if.then15: ; preds = %if.end
|
||||||
|
%add16 = add nuw nsw i32 %0, 32
|
||||||
|
%idxprom17 = zext i32 %add16 to i64
|
||||||
|
%arrayidx1851 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @_ZZ7reduce5PiS_jE5sdata, i64 0, i64 %idxprom17
|
||||||
|
%arrayidx18 = addrspacecast i32 addrspace(3)* %arrayidx1851 to i32*
|
||||||
|
%8 = load i32, i32* %arrayidx18, align 4, !tbaa !12
|
||||||
|
%add19 = add nsw i32 %8, %mySum.0
|
||||||
|
%9 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add19, i32 16, i32 31) #5
|
||||||
|
%add23 = add nsw i32 %9, %add19
|
||||||
|
%10 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add23, i32 8, i32 31) #5
|
||||||
|
%add23.1 = add nsw i32 %10, %add23
|
||||||
|
%11 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add23.1, i32 4, i32 31) #5
|
||||||
|
%add23.2 = add nsw i32 %11, %add23.1
|
||||||
|
%12 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add23.2, i32 2, i32 31) #5
|
||||||
|
%add23.3 = add nsw i32 %12, %add23.2
|
||||||
|
%13 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add23.3, i32 1, i32 31) #5
|
||||||
|
%cmp27 = icmp eq i32 %add8.i.i55, 0
|
||||||
|
br i1 %cmp27, label %if.then28, label %if.end32
|
||||||
|
|
||||||
|
if.then28: ; preds = %if.then15
|
||||||
|
%add23.4 = add nsw i32 %13, %add23.3
|
||||||
|
%idxprom30 = zext i32 %1 to i64
|
||||||
|
%arrayidx31 = getelementptr inbounds i32, i32* %g_odata, i64 %idxprom30
|
||||||
|
store i32 %add23.4, i32* %arrayidx31, align 4, !tbaa !12
|
||||||
|
br label %if.end32
|
||||||
|
|
||||||
|
if.end32: ; preds = %if.end, %if.then28, %if.then15
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; Function Attrs: nounwind readnone
|
||||||
|
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
|
||||||
|
|
||||||
|
; Function Attrs: nounwind readnone
|
||||||
|
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
|
||||||
|
|
||||||
|
; Function Attrs: nounwind readnone
|
||||||
|
declare i32 @llvm.nvvm.read.ptx.sreg.tid.z() #2
|
||||||
|
|
||||||
|
; Function Attrs: nounwind readnone
|
||||||
|
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y() #2
|
||||||
|
|
||||||
|
; Function Attrs: nounwind readnone
|
||||||
|
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2
|
||||||
|
|
||||||
|
; Function Attrs: nounwind readnone
|
||||||
|
declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #2
|
||||||
|
|
||||||
|
; Function Attrs: convergent nounwind
|
||||||
|
declare void @llvm.nvvm.barrier.sync(i32) #3
|
||||||
|
|
||||||
|
; Function Attrs: convergent inaccessiblememonly nounwind
|
||||||
|
declare i32 @llvm.nvvm.shfl.sync.down.i32(i32, i32, i32, i32) #4
|
||||||
|
|
||||||
|
attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||||
|
attributes #1 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||||
|
attributes #2 = { nounwind readnone }
|
||||||
|
attributes #3 = { convergent nounwind }
|
||||||
|
attributes #4 = { convergent inaccessiblememonly nounwind }
|
||||||
|
attributes #5 = { nounwind }
|
||||||
|
|
||||||
|
!llvm.module.flags = !{!0, !1, !2}
|
||||||
|
!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
|
||||||
|
!llvm.ident = !{!8}
|
||||||
|
!nvvmir.version = !{!9}
|
||||||
|
|
||||||
|
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
|
||||||
|
!1 = !{i32 1, !"wchar_size", i32 4}
|
||||||
|
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
|
||||||
|
!3 = !{void (i32*, i32*, i32)* @_Z7reduce5PiS_j, !"kernel", i32 1}
|
||||||
|
!4 = !{null, !"align", i32 8}
|
||||||
|
!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
|
||||||
|
!6 = !{null, !"align", i32 16}
|
||||||
|
!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
|
||||||
|
!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
|
||||||
|
!9 = !{i32 1, i32 4}
|
||||||
|
!10 = !{i32 0, i32 1024}
|
||||||
|
!11 = !{i32 0, i32 2147483647}
|
||||||
|
!12 = !{!13, !13, i64 0}
|
||||||
|
!13 = !{!"int", !14, i64 0}
|
||||||
|
!14 = !{!"omnipotent char", !15, i64 0}
|
||||||
|
!15 = !{!"Simple C++ TBAA"}
|
||||||
|
!16 = !{i32 0, i32 64}
|
||||||
|
!17 = !{i32 1, i32 1025}
|
|
@ -0,0 +1,6 @@
#!/bin/bash
llvm-as kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
../../../build/compilation/kernelTranslator kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc 1 1 1 64 1 1
llc --filetype=obj kernel.bc
g++ host.cpp kernel.o -lpthread -o test
./test
@ -0,0 +1,11 @@
#!/bin/sh
for file in ./*
do
  if test -d "$file"
  then
    echo executing "$file"
    cd "$file"
    bash run.sh
    cd ..
  fi
done
|
|
@ -0,0 +1,84 @@
|
||||||
|
#include <assert.h>
|
||||||
|
#include <pthread.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <time.h>
|
||||||
|
|
||||||
|
#define NUM_BLOCK 1
|
||||||
|
int N = 32;
|
||||||
|
|
||||||
|
int block_size = 32;
|
||||||
|
int block_size_x = block_size;
|
||||||
|
int block_size_y = 1;
|
||||||
|
int block_size_z = 1;
|
||||||
|
__thread int block_index = 0;
|
||||||
|
int grid_size = NUM_BLOCK;
|
||||||
|
|
||||||
|
extern "C" {
|
||||||
|
void *_Z9vectorAddPKfS0_Pfi_wrapper(void *);
|
||||||
|
}
|
||||||
|
|
||||||
|
void *wrap(void *p) {
|
||||||
|
int **res = (int **)p;
|
||||||
|
block_index = (*(int *)res[4]);
|
||||||
|
_Z9vectorAddPKfS0_Pfi_wrapper(p);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
void *gen_input(int bid, float *A, float *B, float *C, int N) {
|
||||||
|
int **ret = new int *[5];
|
||||||
|
|
||||||
|
float **p0 = new float *;
|
||||||
|
*p0 = A;
|
||||||
|
ret[0] = (int *)(p0);
|
||||||
|
|
||||||
|
float **p1 = new float *;
|
||||||
|
*p1 = B;
|
||||||
|
ret[1] = (int *)(p1);
|
||||||
|
|
||||||
|
float **p2 = new float *;
|
||||||
|
*p2 = C;
|
||||||
|
ret[2] = (int *)(p2);
|
||||||
|
|
||||||
|
int *p3 = new int;
|
||||||
|
*p3 = N;
|
||||||
|
ret[3] = (int *)p3;
|
||||||
|
|
||||||
|
int *p4 = new int;
|
||||||
|
*p4 = bid;
|
||||||
|
ret[4] = (int *)p4;
|
||||||
|
|
||||||
|
return (void *)ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
int main() {
|
||||||
|
float *A, *B, *C;
|
||||||
|
|
||||||
|
A = new float[N];
|
||||||
|
B = new float[N];
|
||||||
|
C = new float[N];
|
||||||
|
|
||||||
|
for (int i = 0; i < N; i++) {
|
||||||
|
A[i] = i;
|
||||||
|
B[i] = 1;
|
||||||
|
C[i] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
pthread_t threads[NUM_BLOCK];
|
||||||
|
|
||||||
|
int rc;
|
||||||
|
for (long t = 0; t < NUM_BLOCK; t++) {
|
||||||
|
void *inp = gen_input(t, A, B, C, N);
|
||||||
|
rc = pthread_create(&threads[t], NULL, wrap, inp);
|
||||||
|
}
|
||||||
|
clock_t t1 = clock();
|
||||||
|
/* Last thing that main() should do */
|
||||||
|
for (long t = 0; t < NUM_BLOCK; t++)
|
||||||
|
pthread_join(threads[t], NULL);
|
||||||
|
|
||||||
|
for (int i = 0; i < N; i++) {
|
||||||
|
assert(C[i] == (A[i] + B[i]));
|
||||||
|
}
|
||||||
|
printf("PASS\n");
|
||||||
|
pthread_exit(NULL);
|
||||||
|
}
|
|
@ -0,0 +1,86 @@
|
||||||
|
; ModuleID = 'kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc'
|
||||||
|
source_filename = "kernel.cu"
|
||||||
|
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
|
||||||
|
target triple = "nvptx64-nvidia-cuda"
|
||||||
|
|
||||||
|
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
|
||||||
|
|
||||||
|
; Function Attrs: nounwind
|
||||||
|
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) local_unnamed_addr #0 {
|
||||||
|
entry:
|
||||||
|
ret i32 999
|
||||||
|
}
|
||||||
|
|
||||||
|
; Function Attrs: nounwind
|
||||||
|
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) local_unnamed_addr #0 {
|
||||||
|
entry:
|
||||||
|
ret i32 999
|
||||||
|
}
|
||||||
|
|
||||||
|
; Function Attrs: nounwind
|
||||||
|
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) local_unnamed_addr #0 {
|
||||||
|
entry:
|
||||||
|
ret i32 999
|
||||||
|
}
|
||||||
|
|
||||||
|
; Function Attrs: nounwind
|
||||||
|
define weak dso_local i32 @cudaGetDevice(i32* %device) local_unnamed_addr #0 {
|
||||||
|
entry:
|
||||||
|
ret i32 999
|
||||||
|
}
|
||||||
|
|
||||||
|
; Function Attrs: nounwind
|
||||||
|
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) local_unnamed_addr #0 {
|
||||||
|
entry:
|
||||||
|
ret i32 999
|
||||||
|
}
|
||||||
|
|
||||||
|
; Function Attrs: nounwind
|
||||||
|
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) local_unnamed_addr #0 {
|
||||||
|
entry:
|
||||||
|
ret i32 999
|
||||||
|
}
|
||||||
|
|
||||||
|
; Function Attrs: nofree nounwind
|
||||||
|
define dso_local void @_Z9vectorAddPKfS0_Pfi(float* nocapture readonly %A, float* nocapture readonly %B, float* nocapture %C, i32 %numElements) local_unnamed_addr #1 {
|
||||||
|
entry:
|
||||||
|
%0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3, !range !10
|
||||||
|
%idxprom8 = zext i32 %0 to i64
|
||||||
|
%arrayidx = getelementptr inbounds float, float* %A, i64 %idxprom8
|
||||||
|
%1 = load float, float* %arrayidx, align 4, !tbaa !11
|
||||||
|
%arrayidx2 = getelementptr inbounds float, float* %B, i64 %idxprom8
|
||||||
|
%2 = load float, float* %arrayidx2, align 4, !tbaa !11
|
||||||
|
%add = fadd contract float %1, %2
|
||||||
|
%arrayidx4 = getelementptr inbounds float, float* %C, i64 %idxprom8
|
||||||
|
store float %add, float* %arrayidx4, align 4, !tbaa !11
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; Function Attrs: nounwind readnone
|
||||||
|
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
|
||||||
|
|
||||||
|
attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||||
|
attributes #1 = { nofree nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||||
|
attributes #2 = { nounwind readnone }
|
||||||
|
attributes #3 = { nounwind }
|
||||||
|
|
||||||
|
!llvm.module.flags = !{!0, !1, !2}
|
||||||
|
!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
|
||||||
|
!llvm.ident = !{!8}
|
||||||
|
!nvvmir.version = !{!9}
|
||||||
|
|
||||||
|
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
|
||||||
|
!1 = !{i32 1, !"wchar_size", i32 4}
|
||||||
|
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
|
||||||
|
!3 = !{void (float*, float*, float*, i32)* @_Z9vectorAddPKfS0_Pfi, !"kernel", i32 1}
|
||||||
|
!4 = !{null, !"align", i32 8}
|
||||||
|
!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
|
||||||
|
!6 = !{null, !"align", i32 16}
|
||||||
|
!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
|
||||||
|
!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
|
||||||
|
!9 = !{i32 1, i32 4}
|
||||||
|
!10 = !{i32 0, i32 1024}
|
||||||
|
!11 = !{!12, !12, i64 0}
|
||||||
|
!12 = !{!"float", !13, i64 0}
|
||||||
|
!13 = !{!"omnipotent char", !14, i64 0}
|
||||||
|
!14 = !{!"Simple C++ TBAA"}
|
|
@ -0,0 +1,6 @@
#!/bin/bash
llvm-as kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
../../../build/compilation/kernelTranslator kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc 1 1 1 32 1 1
llc --filetype=obj kernel.bc
g++ host.cpp kernel.o -lpthread -o test
./test
@ -0,0 +1,16 @@
cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
project(
  X86runtime
  DESCRIPTION "Implementation of the CUDA runtime API on x86"
  LANGUAGES CXX)
set(LIB_NAME x86Runtime)
set(CMAKE_VERBOSE_MAKEFILE ON)

# compile threadPool implementation
add_subdirectory(threadPool)

# compile x86 runtime library
include_directories(./include)
include_directories(./threadPool/include)
file(GLOB proj_SOURCES "lib/*.cpp")
add_library(${LIB_NAME} SHARED ${proj_SOURCES})
@ -0,0 +1,19 @@
#ifndef __RUNTIME_IMPL__
#define __RUNTIME_IMPL__
#include "cudaStatus.h"
#include "structures.h"
cudaError_t cudaDeviceReset(void);
cudaError_t cudaDeviceSynchronize(void);
cudaError_t cudaFree(void *devPtr);
cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
                             void **args, size_t sharedMem,
                             cudaStream_t stream);
cudaError_t cudaMalloc(void **devPtr, size_t size);
cudaError_t cudaMemcpy(void *dst, const void *src, size_t count,
                       cudaMemcpyKind kind);
cudaError_t cudaSetDevice(int device);
cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src);
cudaError_t cudaStreamCreate(cudaStream_t *pStream);
cudaError_t cudaStreamDestroy(cudaStream_t stream);
cudaError_t cudaStreamSynchronize(cudaStream_t stream);
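
// Usage sketch (hypothetical): after kernelTranslator has produced an x86
// wrapper for a kernel, the host launches it through this API, e.g.
//   cudaLaunchKernel((const void *)wrapper, gridDim, blockDim, args, 0, stream);
// where gridDim/blockDim are dim3 values and args packs the kernel arguments.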
#endif
@ -0,0 +1,18 @@
#ifndef __RUNTIME_STATUS__
#define __RUNTIME_STATUS__
#include <stdio.h>

// error codes follow the CUDA naming used by cudaRuntimeImpl.cpp
enum cudaError_t {
  cudaSuccess = 0,
  cudaErrorInvalidValue = 1,
  cudaErrorMemoryAllocation = 2,
};

enum cudaMemcpyKind {
  cudaMemcpyHostToHost = 0,
  cudaMemcpyHostToDevice = 1,
  cudaMemcpyDeviceToHost = 2,
  cudaMemcpyDeviceToDevice = 3,
  cudaMemcpyDefault = 4,
};
#endif
@ -0,0 +1,100 @@
#include "cudaRuntimeImpl.h"
#include "api.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
cudaError_t cudaDeviceReset(void) {
  scheduler_uninit();
  return cudaSuccess;
}
cudaError_t cudaDeviceSynchronize(void) {
  cuSynchronizeBarrier();
  return cudaSuccess;
}
cudaError_t cudaFree(void *devPtr) {
  free(devPtr);
  return cudaSuccess;
}
cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
                             void **args, size_t sharedMem,
                             cudaStream_t stream) {
  // if scheduler is null init device

  cu_kernel *ker =
      create_kernel(func, gridDim, blockDim, &args, sharedMem, stream);

  int lstatus = cuLaunchKernel(&ker);
  return lstatus == 0 ? cudaSuccess : cudaErrorInvalidValue; // 0 == C_SUCCESS
}
cudaError_t cudaMalloc(void **devPtr, size_t size) {
  *devPtr = malloc(size);
  if (*devPtr == NULL)
    return cudaErrorMemoryAllocation;
  return cudaSuccess;
}
cudaError_t cudaMemcpy(void *dst, const void *src, size_t count,
                       cudaMemcpyKind kind) {
  if (kind == cudaMemcpyHostToHost) {
    memcpy(dst, src, count);
  } else if (kind == cudaMemcpyHostToDevice) {
    // how does the code know which device accessing the memory
    memcpy(dst, src, count);
  } else if (kind == cudaMemcpyDeviceToHost) {
    // how does the code know which device accessing the memory
    memcpy(dst, src, count);
  } else if (kind == cudaMemcpyDeviceToDevice) {
    memcpy(dst, src, count);
  } else if (kind == cudaMemcpyDefault) {
    memcpy(dst, src, count);
  }
  return cudaSuccess;
}
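
// On this backend, host and "device" memory live in the same address space,
// which is why every cudaMemcpyKind above reduces to a plain memcpy.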
|
||||||
|
|
||||||
|
cudaError_t cudaSetDevice(int device) {
|
||||||
|
// error checking
|
||||||
|
init_device();
|
||||||
|
}
|
||||||
|
|
||||||
|
cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src) {
|
||||||
|
cstreamData *dst_stream = (cstreamData *)dst;
|
||||||
|
cstreamData *src_stream = (cstreamData *)src;
|
||||||
|
|
||||||
|
if (dst_stream == NULL || src_stream == NULL) {
|
||||||
|
return cudaErrorInvalidValue; // 1
|
||||||
|
}
|
||||||
|
|
||||||
|
dst_stream->stream_priority = src_stream->stream_priority;
|
||||||
|
dst_stream->stream_flags = src_stream->stream_flags;
|
||||||
|
|
||||||
|
return cudaSuccess; // 0
|
||||||
|
}
|
||||||
|
|
||||||
|
cudaError_t cudaStreamCreate(cudaStream_t *pStream) {
|
||||||
|
cstreamData *s = (cstreamData *)calloc(1, sizeof(cstreamData));
|
||||||
|
if (s == NULL)
|
||||||
|
return cudaErrorMemoryAllocation;
|
||||||
|
s->ev.status = C_RUN;
|
||||||
|
s->id = stream_counter;
|
||||||
|
stream_counter++;
|
||||||
|
s->stream_priority = DEFAULT;
|
||||||
|
create_KernelQueue(&(s->kernelQueue));
|
||||||
|
|
||||||
|
INIT_LOCK(s->stream_lock);
|
||||||
|
*pStream = (cudaStream_t)(s);
|
||||||
|
|
||||||
|
return cudaSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
|
cudaError_t cudaStreamDestroy(cudaStream_t stream) {
|
||||||
|
cstreamData *s = (cstreamData *)(stream);
|
||||||
|
|
||||||
|
free(s->kernelQueue);
|
||||||
|
|
||||||
|
DESTROY_LOCK(s->stream_lock);
|
||||||
|
|
||||||
|
free(s);
|
||||||
|
|
||||||
|
return cudaSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
|
cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
|
||||||
|
cstreamData *e = ((cstreamData *)(stream));
|
||||||
|
MUTEX_LOCK(e->stream_lock);
|
||||||
|
|
||||||
|
e->ev.status = C_SYNCHRONIZE;
|
||||||
|
e->ev.numKernelsToWait = e->kernelQueue->waiting_count;
|
||||||
|
MUTEX_UNLOCK(e->stream_lock);
|
||||||
|
}
|
|
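/*
  Illustrative host-side usage of these wrappers (a sketch, not part of the
  committed sources; `vecAddWrapper` stands in for a kernel wrapper produced
  by the compilation pass):

    float host_a[1024] = {0};
    float *a;
    cudaMalloc((void **)&a, sizeof(host_a));
    cudaMemcpy(a, host_a, sizeof(host_a), cudaMemcpyHostToDevice);
    void *args[] = {&a};
    cudaLaunchKernel((const void *)vecAddWrapper, dim3(8), dim3(128), args, 0, 0);
    cudaDeviceSynchronize();
*/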
@ -0,0 +1,17 @@
cmake_minimum_required(VERSION 3.1)

# C++ project
project(
  ThreadPool
  DESCRIPTION "Using pthreads to implement a thread pool"
  LANGUAGES CXX)

set(CMAKE_VERBOSE_MAKEFILE ON)
set(LIB_NAME threadPool)

set(CMAKE_CXX_STANDARD 14)
set(CMAKE_BUILD_TYPE Debug)
include_directories(./include)

file(GLOB proj_SOURCES "lib/*.cpp")
add_library(${LIB_NAME} SHARED ${proj_SOURCES})
@ -0,0 +1,25 @@
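// api.h: C interface between the CUDA runtime wrappers and the pthread-based
// scheduler: kernel creation and launch, kernel-queue management, scheduler
// lifecycle, and the synchronization barrier.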
#ifndef C_API_H
#define C_API_H

#include "structures.h"

// matches the definition in the thread-pool scheduler sources
cu_kernel *create_kernel(const void *func, dim3 gridDim, dim3 blockDim,
                         void ***args, size_t sharedMem, cstreamData *stream);
int getWorkItem(struct kernel_queue **qu, cu_kernel **kern, int blockId);
int create_KernelQueue(kernel_queue **q);

int dequeKernelLL(struct kernel_queue **qu);

int dequeKernel(struct kernel_queue **qu, cu_kernel *ker);
int enqueueKernel(struct kernel_queue **qu, cu_kernel **ker);

// device initialization, kernel launch, and scheduler lifecycle
int init_device();
int cuLaunchKernel(cu_kernel **ker);
int scheduler_init(cu_device device);
void scheduler_uninit();
void cuSynchronizeBarrier();

int set_kernel_arguments(cu_kernel **k, unsigned int arg_num, void **arg_value);

int setKernelDimensions(cu_kernel *k, struct argument **arg,
                        void **totalBlockSize, void *blockId);

#endif
@ -0,0 +1,26 @@
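// def.h: numeric status codes shared by the scheduler and the runtime
// wrappers (success / error, kernel execution states, stream states).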
#ifndef C_DEF_H
#define C_DEF_H

// error codes
#define C_SUCCESS 0x0
#define C_ERROR 0x1

// execution status
#define C_COMPLETE 0x2
#define C_RUNNING 0x3
#define C_SUBMITTED 0x4
#define C_QUEUED 0x5
#define C_CREATED 0x5

// stream status
#define C_RUN 0x1
#define C_WAIT 0x2
#define C_SYNCHRONIZE 0x3

// not-initialized / allocation errors
#define C_ERROR_NOT_INITIALIZED 0x6
#define C_ERROR_MEMALLOC 0x7

#define C_QUEUE_EMPTY 0x8

#endif
@ -0,0 +1,38 @@
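// macros.h: thin wrappers around the pthread mutex calls that assert on any
// non-zero return code.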
#ifndef C_MACROS_H
#define C_MACROS_H
#include <assert.h>
#include <pthread.h>

#define INIT_LOCK(__LOCK__)                                                    \
  do {                                                                         \
    int r = pthread_mutex_init(&(__LOCK__), NULL);                             \
    assert(r == 0);                                                            \
  } while (0)

#define MUTEX_LOCK(__LOCK__)                                                   \
  do {                                                                         \
    int r = pthread_mutex_lock(&(__LOCK__));                                   \
    assert(r == 0);                                                            \
  } while (0)

#define MUTEX_UNLOCK(__LOCK__)                                                 \
  do {                                                                         \
    int r = pthread_mutex_unlock(&(__LOCK__));                                 \
    assert(r == 0);                                                            \
  } while (0)

#define DESTROY_LOCK(__LOCK__)                                                 \
  do {                                                                         \
    int r = pthread_mutex_destroy(&(__LOCK__));                                \
    assert(r == 0);                                                            \
  } while (0)

#endif // C_MACROS_H
@ -0,0 +1,191 @@
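// structures.h: core data structures shared by the runtime and the scheduler:
// the device descriptor, worker threads, the scheduler pool, kernel queues,
// streams, and the kernel descriptor itself.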
#ifndef C_STRUCTURES_H
#define C_STRUCTURES_H

#include <pthread.h>

// a CUDA stream handle is a pointer to the internal stream record defined
// below
typedef struct streamData *cudaStream_t;

typedef struct device {
  int max_compute_units;
  int device_id;
} cu_device;

typedef struct c_thread {
  pthread_t thread;
  unsigned long executed_commands;
  unsigned index;
  bool exit;
} cu_ptd;

typedef struct scheduler_pool {

  struct c_thread *thread_pool;

  size_t num_worker_threads;
  size_t local_mem_size;
  int num_kernel_launch;
  int num_kernel_finished;
  int num_kernel_queued;
  size_t idle_threads;

  pthread_cond_t wake_pool;

  int threadpool_shutdown_requested;

  // lock for the scheduler
  pthread_mutex_t work_queue_lock;

  // user kernel queue, for user-called functions only
  struct kernel_queue *kernelQueue;

} cu_pool;

struct kernel_queue {

  struct kernel *head;
  struct kernel *tail;

  // finished command count
  unsigned long finish_count;

  // waiting to be run on threads
  unsigned long waiting_count;

  // running count
  unsigned long running_count;

  // total count
  unsigned long kernel_count;

  // current index of the task to be run
  unsigned long current_index;
};

typedef struct command {

  struct kernel *ker;

  struct command *next;
  struct command *prev;

} cu_command;

typedef struct argument {
  // size of the argument allocation
  size_t size;
  void *value;
  unsigned int index;
} cu_argument;

typedef struct input_arg {
  // real values for the input
  char *p;
  struct argument *argus[];
  // (TODO): implement meta_data
  // the type of metadata will need to change to a list of ints or similar
  // so that the arguments p can be parsed
} cu_input;

struct dim3 {
  size_t x;
  size_t y;
  size_t z;
  dim3(int d1) {
    x = d1;
    y = z = 1;
  }
  dim3() { x = y = z = 1; }
};

enum StreamType {
  DEFAULT,
  LOW,
  HIGH,
  EXT,
};

struct cStreamDataInternal {
  /*
    status of the stream (run, wait)
    Run: the stream asynchronously hands kernels assigned to it to the scheduler
    Wait: the stream holds further kernels back from the scheduler
  */
  int status;
  /*
    if status == wait, wait for this number of kernels to reach 0
  */
  unsigned long numKernelsToWait;
  unsigned int lastKernelIdToWait;
  unsigned int count; // number of tasks left in the stream
};

typedef struct streamData {

  // execution status of the current event monitor
  struct cStreamDataInternal ev;
  pthread_mutex_t stream_lock; // lock on the stream
  StreamType stream_priority;
  unsigned int id;
  unsigned int stream_flags;

  // queue of the kernels in this stream
  struct kernel_queue *kernelQueue;

} cstreamData;

// kernel information
typedef struct kernel {

  void *(*start_routine)(void *);

  void **args;

  dim3 gridDim;
  dim3 blockDim;

  struct kernel *next;
  struct kernel *prev;

  size_t shared_mem;

  cstreamData *stream;

  struct event *barrier;

  int status;

  int totalBlocks;
  int N;

  int blockSize;
  int kernelId;

  // current blockId
  int blockId;

  void *shared_mem_loc;

} cu_kernel;

typedef struct asyncKernel {
  unsigned int numBlocks;
  unsigned int numThreads;
  struct event *evt;
  struct kernel *ker;

  struct asyncKernel *prev;
  struct asyncKernel *next;

} asyncKernel;

// command queue of command nodes

typedef struct kernel_arg_array {
  size_t size;
  unsigned int index;
} karg_arr;

typedef struct kernel_image_arg {
  size_t size;
  unsigned int index;
} k_arg;

#endif // C_STRUCTURES_H
@ -0,0 +1,456 @@
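// Scheduler implementation: a pool of pthread workers pulls one block of a
// queued kernel at a time, runs the kernel body with the thread-local block
// index, and reports completion so cudaDeviceSynchronize and stream
// synchronization can make progress.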
#include <stdio.h>
#include <stdlib.h>
#include <thread>

#include "api.h"
#include "def.h"
#include "macros.h"
#include "structures.h"

/*
  Initialize the device
*/
int init_device() {
  cu_device *device = (cu_device *)calloc(1, sizeof(cu_device));
  if (device == NULL)
    return C_ERROR_MEMALLOC;

  device->max_compute_units = std::thread::hardware_concurrency();

  // initialize the scheduler
  int ret = scheduler_init(*device);
  free(device);

  return ret;
}

/*
  Create Kernel
*/
static int kernelIds = 0;
cu_kernel *create_kernel(const void *func, dim3 gridDim, dim3 blockDim,
                         void ***args, size_t sharedMem, cstreamData *stream) {
  cu_kernel *ker = (cu_kernel *)calloc(1, sizeof(cu_kernel));
  if (ker == NULL)
    return NULL;

  // set the function pointer
  ker->start_routine = (void *(*)(void *))func;
  ker->args = *args;

  ker->gridDim = gridDim;
  ker->blockDim = blockDim;

  ker->shared_mem = sharedMem;

  // heap-backed dynamic shared memory for this launch
  ker->shared_mem_loc = calloc(1, sharedMem);

  ker->stream = stream;

  ker->blockId = 0;
  ker->totalBlocks = gridDim.x;

  ker->N = blockDim.x;
  ker->blockSize = blockDim.x;

  ker->kernelId = kernelIds;
  kernelIds += 1;

  return ker;
}

/*
  Create Kernel Queue
*/
int create_KernelQueue(kernel_queue **q) {
  *q = (kernel_queue *)calloc(1, sizeof(kernel_queue));
  if (*q == NULL) {
    return C_ERROR_MEMALLOC;
  }

  (*q)->kernel_count = 0;
  (*q)->running_count = 0;
  (*q)->waiting_count = 0;
  (*q)->finish_count = 0;
  (*q)->current_index = 0;

  return C_SUCCESS;
}

// pop the kernel at the head of the queue (the caller must hold the lock that
// guards the queue)
int dequeKernelLL(struct kernel_queue **qu) {
  struct kernel_queue *q = *qu;
  q->finish_count += 1;

  if (q->head == NULL) {
    return C_QUEUE_EMPTY;
  } else {
    q->head = (q->head)->next;
    if (q->head != NULL) {
      q->head->prev = NULL;
    }
  }

  return C_SUCCESS;
}
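// Append a kernel to the tail of a queue. Callers are expected to hold the
// lock that guards the queue (the scheduler's work_queue_lock or the stream's
// stream_lock).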
int enqueueKernel(struct kernel_queue **qu, cu_kernel **ker) {
  struct kernel_queue *q = *qu;
  cu_kernel *p = *ker;

  p->next = NULL;
  if (q->head == NULL) {
    p->prev = NULL;
    q->head = p;
    q->tail = p;
  } else {
    p->prev = q->tail;
    q->tail->next = p;
    q->tail = p;
  }
  q->kernel_count += 1;
  q->waiting_count += 1;

  return C_SUCCESS;
}

// scheduler
static cu_pool *scheduler;

// per-worker state: the block currently being executed and the size of the
// kernel's dynamic shared memory
__thread int block_index = 0;
__thread int thread_memory_size = 0;

/*
  Enqueue Kernel (k) to the scheduler kernelQueue
*/
int schedulerEnqueueKernel(cu_kernel **k) {
  cu_kernel *ker = *k;
  MUTEX_LOCK(scheduler->work_queue_lock);

  enqueueKernel(&scheduler->kernelQueue, &ker);

  // wake any workers parked on the empty queue
  pthread_cond_broadcast(&(scheduler->wake_pool));
  MUTEX_UNLOCK(scheduler->work_queue_lock);
  return C_SUCCESS;
}
/*
  Kernel Launch with numBlocks and numThreadsPerBlock
*/
int cuLaunchKernel(cu_kernel **k) {
  cu_kernel *ker = *k;

  MUTEX_LOCK(scheduler->work_queue_lock);
  scheduler->num_kernel_queued += 1;
  MUTEX_UNLOCK(scheduler->work_queue_lock);

  // stream == 0: add directly to the scheduler's kernelQueue
  if (ker->stream == 0) {
    schedulerEnqueueKernel(&ker);
  } else {
    // otherwise add to its stream queue;
    // a stream queue can be waiting or running, with or without tasks
    cstreamData *e = (cstreamData *)(ker->stream);
    MUTEX_LOCK(e->stream_lock);

    // synchronize was requested after the last job in the queue, so the
    // stream is stuck on synchronize; release it once every queued kernel has
    // finished
    if (e->ev.status == C_SYNCHRONIZE) {
      if ((e->kernelQueue->finish_count) == (e->kernelQueue->kernel_count)) {
        e->ev.status = C_RUN;
      }
    }

    if (e->ev.status == C_RUN) {
      // first kernel of the stream: mark the stream as waiting and hand the
      // kernel to the scheduler queue
      e->ev.status = C_WAIT;
      MUTEX_UNLOCK(e->stream_lock);

      schedulerEnqueueKernel(&ker);
    } else {
      // the stream is already waiting, so just enqueue to the stream queue
      enqueueKernel(&e->kernelQueue, &ker);
      MUTEX_UNLOCK(e->stream_lock);
    }
  }
  return C_SUCCESS;
}
/*
  Get Work Item: get the kernel from the queue and increment blockId
*/
int getWorkItem(struct kernel_queue **qu, cu_kernel **kern, int blockId) {
  struct kernel_queue *q = *qu;
  if (q->waiting_count > 0) {
    *kern = q->head;
    cu_kernel *ker = *kern;
    if (blockId + 1 == q->head->totalBlocks) {
      // deque the head
      dequeKernelLL(qu);
      ker->status = C_COMPLETE;
      q->waiting_count -= 1;
    } else {
      q->head->blockId += 1;
    }
    q->finish_count += 1;
  } else {
    return C_QUEUE_EMPTY;
  }
  return C_SUCCESS;
}
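// Worker protocol: under the scheduler lock a worker claims the next block of
// the kernel at the head of the queue (dequeuing the kernel once its last
// block has been claimed), releases the lock, runs the kernel body with the
// thread-local block index, and finally either returns for more work or parks
// on wake_pool when the queue is empty.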
/*
  Thread Gets Work
*/
int get_work(c_thread *th) {
  cu_kernel ker;

  MUTEX_LOCK(scheduler->work_queue_lock);

RETRY:

  int is_exit = 0;
  int is_command_not_null = 0;

  int blockId = 0;
  int status = C_QUEUE_EMPTY;

  is_exit = scheduler->threadpool_shutdown_requested;

  MUTEX_UNLOCK(scheduler->work_queue_lock);

  if (!is_exit) {
    MUTEX_LOCK(scheduler->work_queue_lock);

    // if the number of kernels waiting to be completed is not zero
    if (scheduler->kernelQueue->waiting_count > 0) {
      blockId = scheduler->kernelQueue->head->blockId;
      // successfully claimed a block
      status = C_SUCCESS;
      ker = *(scheduler->kernelQueue->head);
      // if blockId + 1 equals the total block count, this is the last block
      if (blockId + 1 == scheduler->kernelQueue->head->totalBlocks) {
        // deque the head
        dequeKernelLL(&scheduler->kernelQueue);
        ker.status = C_COMPLETE;
        scheduler->kernelQueue->waiting_count -= 1;
      } else {
        // otherwise just claim the next block
        scheduler->kernelQueue->head->blockId += 1;
      }
    } else {
      status = C_QUEUE_EMPTY;
    }
    MUTEX_UNLOCK(scheduler->work_queue_lock);
  }

  if (status != C_QUEUE_EMPTY) {
    // run one block of the kernel on this worker
    block_index = blockId;
    thread_memory_size = ker.shared_mem;
    ker.start_routine(ker.args);

    is_command_not_null = 1;
    if (ker.status == C_COMPLETE) {
      // check whether this kernel's stream has more jobs to run
      // (and if so, enqueue the next one)
      if (ker.stream != NULL) {
        cstreamData *stream = (cstreamData *)(ker.stream);
        bool synchronize = false;

        MUTEX_LOCK(stream->stream_lock);

        if (stream->ev.status == C_SYNCHRONIZE) {
          // one more kernel of this stream has finished
          if (stream->ev.numKernelsToWait > 0) {
            stream->ev.numKernelsToWait -= 1;
          }
          // keep synchronizing until every queued kernel has finished
          synchronize = (stream->ev.numKernelsToWait != 0);
        }

        if (synchronize == false) {
          if (stream->kernelQueue->waiting_count > 0) {
            stream->ev.status = C_WAIT;
            MUTEX_UNLOCK(stream->stream_lock);

            // hand the stream's next kernel to the scheduler
            cu_kernel *kern = stream->kernelQueue->head;
            schedulerEnqueueKernel(&kern);
            dequeKernelLL(&stream->kernelQueue);
          } else {
            // switch the stream back to run to allow the next launch
            stream->ev.status = C_RUN;
            MUTEX_UNLOCK(stream->stream_lock);
          }
        } else {
          MUTEX_UNLOCK(stream->stream_lock);
        }
      }
      MUTEX_LOCK(scheduler->work_queue_lock);
      scheduler->num_kernel_finished += 1;
      MUTEX_UNLOCK(scheduler->work_queue_lock);
    }
  }

  MUTEX_LOCK(scheduler->work_queue_lock);

  if (is_exit == 0 && is_command_not_null == 0) {
    // nothing to do: park this worker until new work is enqueued
    scheduler->idle_threads += 1;
    pthread_cond_wait(&(scheduler->wake_pool), &(scheduler->work_queue_lock));
    scheduler->idle_threads -= 1;
    goto RETRY;
  }
  MUTEX_UNLOCK(scheduler->work_queue_lock);

  return is_exit;
}
void *driver_thread(void *p) {
  struct c_thread *td = (struct c_thread *)p;
  int is_exit = 0;
  td->exit = false;

  while (1) {
    // get work
    is_exit = get_work(td);

    // exit the routine
    if (is_exit) {
      td->exit = true;
      pthread_exit(NULL);
    }
  }
}

/*
  Initialize the scheduler
*/
int scheduler_init(cu_device device) {
  scheduler = (cu_pool *)calloc(1, sizeof(cu_pool));
  scheduler->num_worker_threads = device.max_compute_units;

  scheduler->thread_pool = (struct c_thread *)calloc(
      scheduler->num_worker_threads, sizeof(c_thread));
  kernel_queue *asq;
  create_KernelQueue(&asq);
  scheduler->kernelQueue = asq;

  INIT_LOCK(scheduler->work_queue_lock);
  pthread_cond_init(&scheduler->wake_pool, NULL);
  scheduler->idle_threads = 0;

  // one worker thread per hardware compute unit
  for (size_t i = 0; i < scheduler->num_worker_threads; i++) {
    scheduler->thread_pool[i].index = i;
    pthread_create(&scheduler->thread_pool[i].thread, NULL, driver_thread,
                   (void *)&scheduler->thread_pool[i]);
  }

  return C_SUCCESS;
}

void scheduler_uninit() {
  unsigned i;

  MUTEX_LOCK(scheduler->work_queue_lock);
  scheduler->threadpool_shutdown_requested = 1;
  pthread_cond_broadcast(&scheduler->wake_pool);
  MUTEX_UNLOCK(scheduler->work_queue_lock);

  for (i = 0; i < scheduler->num_worker_threads; i++) {
    pthread_join(scheduler->thread_pool[i].thread, NULL);
  }
  free(scheduler->thread_pool);
  free(scheduler->kernelQueue);

  pthread_mutex_destroy(&scheduler->work_queue_lock);
  pthread_cond_destroy(&scheduler->wake_pool);

  scheduler->threadpool_shutdown_requested = 0;
}

// busy-wait until the stream has completed
int cuWait(cstreamData *evt) {
AGAIN:
  MUTEX_LOCK(evt->stream_lock);
  if (evt->ev.status != C_COMPLETE) {
    MUTEX_UNLOCK(evt->stream_lock);
    goto AGAIN;
  }
  MUTEX_UNLOCK(evt->stream_lock);
  return C_SUCCESS;
}

/*
  Barrier for Kernel Launch

  During kernel launch, increment the number of work items required to finish.
  Each kernel points to the same event.

  While running commands, decrement the event's work-item count; when the
  count reaches 0, all work items for this kernel launch are finished.

  Essentially a sense-style counting barrier: spin until every queued kernel
  has finished and every worker thread is idle.
*/
void cuSynchronizeBarrier() {
AGAIN:
  MUTEX_LOCK(scheduler->work_queue_lock);

  if (scheduler->num_kernel_finished != scheduler->num_kernel_queued ||
      scheduler->idle_threads != scheduler->num_worker_threads) {
    MUTEX_UNLOCK(scheduler->work_queue_lock);
    goto AGAIN;
  }
  MUTEX_UNLOCK(scheduler->work_queue_lock);
}