Refactor the codebase: remove unused variables, add comments, remove unused header includes, and remove hard-coded values so that both x86 and ARM CPUs are supported

2023-12-13 14:29:17 -05:00 · 2023-12-13 14:29:17 -05:00 · fd56811650
parent 50d615da64
commit fd56811650
50 changed files with 249 additions and 531 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -51,7 +51,7 @@ jobs:
          hostTranslator reverse-host-x86_64-unknown-linux-gnu.bc host.bc
          llc --relocation-model=pic --filetype=obj  kernel.bc
          llc --relocation-model=pic --filetype=obj  host.bc
-          g++ -o reverse -fPIC -no-pie -L${{ github.workspace }}/build/runtime   -L${{ github.workspace }}/build/runtime/threadPool   host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread
+          g++ -o reverse -fPIC -no-pie -L${{ github.workspace }}/build/runtime   -L${{ github.workspace }}/build/runtime/threadPool   host.o kernel.o -lc -lCPUruntime -lthreadPool -lpthread
          ./reverse
      - name: Execute the dynamic shared memory demo
        run: |
@ -63,7 +63,7 @@ jobs:
          hostTranslator reverse-host-x86_64-unknown-linux-gnu.bc host.bc
          llc --relocation-model=pic --filetype=obj  kernel.bc
          llc --relocation-model=pic --filetype=obj  host.bc
-          g++ -o reverse -fPIC -no-pie -L${{ github.workspace }}/build/runtime   -L${{ github.workspace }}/build/runtime/threadPool   host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread
+          g++ -o reverse -fPIC -no-pie -L${{ github.workspace }}/build/runtime   -L${{ github.workspace }}/build/runtime/threadPool   host.o kernel.o -lc -lCPUruntime -lthreadPool -lpthread
          ./reverse
      - name: Execute Hetero-mark benchmark
        run: |
@ -79,5 +79,5 @@ jobs:
          hostTranslator kernel_gpu_cuda_wrapper-host-x86_64-unknown-linux-gnu.bc host.bc
          llc --relocation-model=pic --filetype=obj  kernel.bc
          llc --relocation-model=pic --filetype=obj  host.bc
-          g++ -o lavaMD -fPIC -no-pie -I${{ github.workspace }}/runtime/threadPool/include -I${{ github.workspace }}/cuda-10.1/include -L${{ github.workspace }}/build/runtime   -L${{ github.workspace }}/build/runtime/threadPool   main.c host.o kernel.o util/timer/timer.c util/num/num.c -lpthread -lc -lx86Runtime -lthreadPool -pthread
+          g++ -o lavaMD -fPIC -no-pie -I${{ github.workspace }}/runtime/threadPool/include -I${{ github.workspace }}/cuda-10.1/include -L${{ github.workspace }}/build/runtime   -L${{ github.workspace }}/build/runtime/threadPool   main.c host.o kernel.o util/timer/timer.c util/num/num.c -lpthread -lc -lCPUruntime -lthreadPool -pthread
          ./lavaMD -boxes1d 10
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,13 +1,9 @@
 cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
-project(CudaOnX86)
+project(CuPBoP)
-set(CMAKE_PROJECT_DESCRIPTION "Executing CUDA on X86 architecture.")
+set(CMAKE_PROJECT_DESCRIPTION "Executing CUDA on non-NVIDIA architecture.")
 set(CMAKE_CXX_STANDARD "14")
-set(MAJOR_VERSION 0)
+
 set(MINOR_VERSION 1)
 set(VERSION_STRING ${MAJOR_VERSION}.${MINOR_VERSION})
 set(COX_VERSION ${VERSION_STRING})
 # get the LLVM path and the path to its header files
 if(DEFINED LLVM_CONFIG_PATH)
  if(IS_ABSOLUTE "${LLVM_CONFIG_PATH}")
    if(EXISTS "${LLVM_CONFIG_PATH}")
@ -32,7 +28,7 @@ if(DEFINED LLVM_CONFIG_PATH)
 else()
  message(FATAL_ERROR "llvm-config is required")
 endif()
-# get CUDA PATH
+
 if(DEFINED CUDA_PATH)
  message(STATUS "Using CUDA: ${CUDA_PATH}")
 else()
@ -45,7 +41,7 @@ if(DEBUG)
 endif()
 set(CMAKE_CXX_FLAGS
-    "-I${CUDA_PATH}/include ${LLVM_CXX_FLAG} ${CMAKE_CXX_FLAGS}")
+    "-I${CUDA_PATH}/include ${LLVM_CXX_FLAG} ${CMAKE_CXX_FLAGS} -Wunused")
 set(GCC_COVERAGE_LINK_FLAGS
    "-L${LLVM_LIB_PATH} ${LLVM_LINK_FLAG} -lz -lrt -ldl -ltinfo -lpthread -lm")
@ -54,5 +50,8 @@ add_subdirectory(compilation)
 add_subdirectory(runtime)
 enable_testing()
-option(HETERO_MARK_DATA "The path to download hetero-mark dataset." /tmp/data)
+set(HETERO_MARK_DATA
    "/tmp/data"
    CACHE PATH "The path to download hetero-mark dataset.")
 add_subdirectory(test)
--- a/README.md
+++ b/README.md
@ -75,7 +75,7 @@ g++ -o vecadd -fPIC -no-pie \
      -L$CuPBoP_PATH/build/runtime  \
      -L$CuPBoP_PATH/build/runtime/threadPool \
      host.o kernel.o \
-      -I../.. -lc -lx86Runtime -lthreadPool -lpthread
+      -I../.. -lc -lCPUruntime -lthreadPool -lpthread
 # Execute
 ./vecadd
 ```
--- a/compilation/CMakeLists.txt
+++ b/compilation/CMakeLists.txt
@ -1,20 +1,16 @@
 cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
 project(
  NVVM2X86
  DESCRIPTION "Translate NVVM IR to LLVM IR for X86 backend"
  LANGUAGES CXX)
 set(CMAKE_VERBOSE_MAKEFILE ON)
-# compile kernel translator
+# build kernel translator
-include_directories(./KernelTranslation/include/x86)
+include_directories(./KernelTranslation/include/cpu)
 add_subdirectory(KernelTranslation)
 add_executable(kernelTranslator KernelTranslation.cpp)
 target_link_libraries(kernelTranslator spmd2mpmd ${GCC_COVERAGE_LINK_FLAGS})
-# compile host translator
+# build host translator
-include_directories(./HostTranslation/include/x86)
+include_directories(./HostTranslation/include/cpu)
 add_subdirectory(HostTranslation)
 add_executable(hostTranslator HostTranslation.cpp)
--- a/compilation/HostTranslation.cpp
+++ b/compilation/HostTranslation.cpp
@ -4,12 +4,7 @@
 #include "ReplaceCudaBuiltin.h"
 #include "ReplaceKernelArgs.h"
 #include "tool.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Verifier.h"
 #include <assert.h>
 #include <fstream>
 #include <iostream>
 #include <stdlib.h>
 using namespace llvm;
--- a/compilation/HostTranslation/CMakeLists.txt
+++ b/compilation/HostTranslation/CMakeLists.txt
@ -11,12 +11,11 @@ set(CMAKE_VERBOSE_MAKEFILE ON)
 set(LIB_NAME cudaRuntime2cpuRuntime)
 set(CMAKE_CXX_STANDARD 14)
-set(CMAKE_BUILD_TYPE Debug)
+include_directories(./include/cpu)
 include_directories(./include/x86)
 include_directories(../../common)
-file(GLOB proj_HEADERS "include/x86/*.h")
+file(GLOB proj_HEADERS "include/cpu/*.h")
-file(GLOB proj_SOURCES "src/x86/*.cpp")
+file(GLOB proj_SOURCES "src/cpu/*.cpp")
 # Add core library.
 add_library(${LIB_NAME} SHARED ${proj_HEADERS} ${proj_SOURCES})
--- a/compilation/HostTranslation/include/cpu/RemoveCudaBuiltin.h
+++ b/compilation/HostTranslation/include/cpu/RemoveCudaBuiltin.h
--- a/compilation/HostTranslation/include/cpu/RemoveMetadata.h
+++ b/compilation/HostTranslation/include/cpu/RemoveMetadata.h
--- a/compilation/HostTranslation/include/cpu/ReplaceConstantMemory.h
+++ b/compilation/HostTranslation/include/cpu/ReplaceConstantMemory.h
--- a/compilation/HostTranslation/include/cpu/ReplaceCudaBuiltin.h
+++ b/compilation/HostTranslation/include/cpu/ReplaceCudaBuiltin.h
--- a/compilation/HostTranslation/include/cpu/ReplaceKernelArgs.h
+++ b/compilation/HostTranslation/include/cpu/ReplaceKernelArgs.h
--- a/compilation/HostTranslation/src/cpu/RemoveCudaBuiltin.cpp
+++ b/compilation/HostTranslation/src/cpu/RemoveCudaBuiltin.cpp
@ -3,16 +3,8 @@
 */
 #include "RemoveCudaBuiltin.h"
 #include "debug.hpp"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/ToolOutputFile.h"
 #include "llvm/Transforms/Utils/CtorUtils.h"
 #include <iostream>
 #include <map>
 #include <set>
 using namespace llvm;
--- a/compilation/HostTranslation/src/cpu/RemoveMetadata.cpp
+++ b/compilation/HostTranslation/src/cpu/RemoveMetadata.cpp
@ -1,16 +1,14 @@
 #include "RemoveMetadata.h"
-#include "llvm/IR/Function.h"
+#include "llvm/Support/Host.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/ToolOutputFile.h"
 #include <iostream>
 using namespace llvm;
 void RemoveMetadata(llvm::Module *M) {
  // change the target triple to the host triple
  M->setTargetTriple(llvm::sys::getProcessTriple());
  // use the default DataLayout
  M->setDataLayout("");
  SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
    Function *F = &(*i);
@ -22,5 +20,7 @@ void RemoveMetadata(llvm::Module *M) {
    F->removeFnAttr("min-legal-vector-width");
    F->removeFnAttr("no-trapping-math");
    F->removeFnAttr(llvm::Attribute::OptimizeNone);
    F->removeFnAttr("target-cpu");
    F->removeFnAttr("target-features");
  }
 }
--- a/compilation/HostTranslation/src/cpu/ReplaceConstantMemory.cpp
+++ b/compilation/HostTranslation/src/cpu/ReplaceConstantMemory.cpp
@ -1,12 +1,7 @@
 #include "ReplaceConstantMemory.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include <assert.h>
 #include <fstream>
 #include <iostream>
 #include <map>
 #include <set>
--- a/compilation/HostTranslation/src/cpu/ReplaceCudaBuiltin.cpp
+++ b/compilation/HostTranslation/src/cpu/ReplaceCudaBuiltin.cpp
@ -1,13 +1,6 @@
 #include "ReplaceCudaBuiltin.h"
 #include "debug.hpp"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/ToolOutputFile.h"
 #include <iostream>
 #include <map>
 #include <regex>
 #include <set>
@ -63,18 +56,6 @@ void ReplaceKernelLaunch(llvm::Module *M) {
  std::map<std::string, Function *> kernels;
  std::set<llvm::Function *> need_remove;
  LLVMContext *C = &M->getContext();
  llvm::Type *Int32T = Type::getInt32Ty(*C);
  llvm::Type *Int8T = Type::getInt8Ty(*C);
  llvm::FunctionType *LauncherFuncT =
      FunctionType::get(Type::getVoidTy(*C), NULL);
  llvm::FunctionType *LaunchFun2 =
      FunctionType::get(PointerType::get(PointerType::get(Int32T, 0), 0), NULL);
  bool done = false;
  std::set<std::string> cuda_register_kernel_names;
@ -160,8 +141,6 @@ void ReplaceKernelLaunch(llvm::Module *M) {
                std::vector<size_t> arg_sizes;
                functionOperand =
                    dyn_cast<Function>(callOperand->stripPointerCasts());
                FunctionType *ft = calledFunction->getFunctionType();
                DEBUG_INFO("Parent (Caller) Function Name: %s, "
                           "cudaLaunchKernel Function: %s, args : %d\n",
                           func_name.c_str(),
--- a/compilation/HostTranslation/src/cpu/ReplaceKernelArgs.cpp
+++ b/compilation/HostTranslation/src/cpu/ReplaceKernelArgs.cpp
@ -1,12 +1,5 @@
 #include "ReplaceKernelArgs.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/ToolOutputFile.h"
 #include <iostream>
 #include <map>
 #include <set>
@ -23,12 +16,9 @@ using namespace llvm;
 // to use use-analysis to find the arguments in the future
 void ReplaceKernelArg(llvm::Module *M) {
  LLVMContext &context = M->getContext();
  auto VoidTy = llvm::Type::getVoidTy(context);
  auto I8 = llvm::Type::getInt8PtrTy(context);
  std::map<std::string, Function *> kernels;
  std::set<llvm::Function *> need_replace;
  LLVMContext *C = &M->getContext();
  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
    Function *F = &(*i);
--- a/compilation/KernelTranslation.cpp
+++ b/compilation/KernelTranslation.cpp
@ -1,4 +1,4 @@
-#include "generate_x86_format.h"
+#include "generate_cpu_format.h"
 #include "handle_sync.h"
 #include "init.h"
 #include "insert_sync.h"
@ -6,17 +6,14 @@
 #include "performance.h"
 #include "tool.h"
 #include "warp_func.h"
 #include "llvm/IR/Module.h"
 #include <assert.h>
 #include <fstream>
 #include <iostream>
 #include <llvm/Support/raw_ostream.h>
 #include <map>
 #include <set>
 #include <stdlib.h>
 using namespace llvm;
 // to support constant memory variables, we need to convert information
 // from kernelTranslator to HostTranslator, since HostTranslator knows nothing
 // about the kernel functions, we need to write the information to a file
 // by KernelTranslator and read it in HostTranslator
 std::string PATH = "kernel_meta.log";
 int main(int argc, char **argv) {
@ -26,8 +23,9 @@ int main(int argc, char **argv) {
  std::ofstream fout;
  fout.open(PATH);
-  // inline, and create auxiliary global variables
+  // inline __device__ functions, and create auxiliary global variables
  init_block(program, fout);
  // insert sync before each vote, and replace the
  // original vote function to warp vote
  handle_warp_vote(program);
@ -40,17 +38,18 @@ int main(int argc, char **argv) {
  // split block by sync
  split_block_by_sync(program);
-  // add loop for intra&intera thread
+
  // add loops for intra- & inter-warp threads; this corresponds to
  // 'hierarchical collapsing' in the COX paper.
  insert_warp_loop(program);
  // (TODO): replace this patch
  replace_built_in_function(program);
-  // TODO: replace with a more general function
+  // the input kernel programs have NVIDIA metadata, they need to be replaced to
-  // Not only for x86 backend
+  // CPU metadata
-  generate_x86_format(program);
+  generate_cpu_format(program);
-  // performance optimization
+  // execute O3 pipeline on the transformed program
  performance_optimization(program);
  VerifyModule(program);
--- a/compilation/KernelTranslation/CMakeLists.txt
+++ b/compilation/KernelTranslation/CMakeLists.txt
@ -11,12 +11,11 @@ set(CMAKE_VERBOSE_MAKEFILE ON)
 set(LIB_NAME spmd2mpmd)
 set(CMAKE_CXX_STANDARD 14)
-set(CMAKE_BUILD_TYPE Debug)
+include_directories(./include/cpu)
 include_directories(./include/x86)
 include_directories(../../common)
-file(GLOB proj_HEADERS "include/x86/*.h")
+file(GLOB proj_HEADERS "include/cpu/*.h")
-file(GLOB proj_SOURCES "src/x86/*.cpp")
+file(GLOB proj_SOURCES "src/cpu/*.cpp")
 # Add core library.
 add_library(${LIB_NAME} STATIC ${proj_HEADERS} ${proj_SOURCES})
--- a/compilation/KernelTranslation/include/cpu/generate_cpu_format.h
+++ b/compilation/KernelTranslation/include/cpu/generate_cpu_format.h
@ -0,0 +1,10 @@
 #ifndef __NVVM2CPU_GENERATE_CPU_FORMAT__
 #define __NVVM2CPU_GENERATE_CPU_FORMAT__
 #include "llvm/IR/Module.h"
 // Rewrite the translated kernel module for the host CPU: update the target
 // triple / DataLayout, create the "<kernel>_wrapper" functions, and erase the
 // NVVM barrier calls and the warp index globals.
 void generate_cpu_format(llvm::Module *M);
 // Set the module's target triple to the host process triple and reset its
 // DataLayout to the default.
 void set_meta_data(llvm::Module *M);
 #endif
--- a/compilation/KernelTranslation/include/cpu/handle_sync.h
+++ b/compilation/KernelTranslation/include/cpu/handle_sync.h
--- a/compilation/KernelTranslation/include/cpu/init.h
+++ b/compilation/KernelTranslation/include/cpu/init.h
--- a/compilation/KernelTranslation/include/cpu/insert_sync.h
+++ b/compilation/KernelTranslation/include/cpu/insert_sync.h
--- a/compilation/KernelTranslation/include/cpu/insert_warp_loop.h
+++ b/compilation/KernelTranslation/include/cpu/insert_warp_loop.h
--- a/compilation/KernelTranslation/include/cpu/memory_hierarchy.h
+++ b/compilation/KernelTranslation/include/cpu/memory_hierarchy.h
--- a/compilation/KernelTranslation/include/cpu/performance.h
+++ b/compilation/KernelTranslation/include/cpu/performance.h
--- a/compilation/KernelTranslation/include/cpu/tool.h
+++ b/compilation/KernelTranslation/include/cpu/tool.h
--- a/compilation/KernelTranslation/include/cpu/warp_func.h
+++ b/compilation/KernelTranslation/include/cpu/warp_func.h
--- a/compilation/KernelTranslation/include/x86/generate_x86_format.h
+++ b/compilation/KernelTranslation/include/x86/generate_x86_format.h
@ -1,10 +0,0 @@
 #ifndef __NVVM2x86_GENERATE_X86_FORMAT__
 #define __NVVM2x86_GENERATE_X86_FORMAT__
 #include "llvm/IR/Module.h"
 void generate_x86_format(llvm::Module *M);
 void set_meta_data(llvm::Module *M);
 #endif
--- a/compilation/KernelTranslation/src/cpu/generate_cpu_format.cpp
+++ b/compilation/KernelTranslation/src/cpu/generate_cpu_format.cpp
@ -0,0 +1,125 @@
 #include "generate_cpu_format.h"
 #include "debug.hpp"
 #include "tool.h"
 #include "llvm/Support/Host.h"
 using namespace llvm;
 // Retarget the module to the host: the target triple becomes the process
 // triple and the DataLayout string is reset.
 void set_meta_data(llvm::Module *M) {
  const std::string host_triple = llvm::sys::getProcessTriple();
  M->setTargetTriple(host_triple);
  // an empty string means "use the default DataLayout"
  M->setDataLayout("");
 }
 // pthread only accepts a single void* as input, so the kernel arguments
 // arrive packed and have to be decoded on the kernel side. For every kernel
 // function this emits a "<name>_wrapper" function of type void(i8*) that
 // unpacks the arguments and calls the kernel with them.
 void decode_input(llvm::Module *M) {
  llvm::Type *Int32T = Type::getInt32Ty(M->getContext());
  llvm::Type *Int8T = Type::getInt8Ty(M->getContext());
  // type shared by all wrappers: void(i8*)
  llvm::FunctionType *LauncherFuncT = FunctionType::get(
      Type::getVoidTy(M->getContext()), {PointerType::get(Int8T, 0)}, false);
  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
    Function *F = &(*i);
    if (!isKernelFunction(M, F))
      continue;
    auto func_name = F->getName().str();
    // filter out _Z24 and other mangled prefix: starting at index 2, skip the
    // digits and keep the name from the first non-digit character on
    for (size_t pos = 2; pos < func_name.length(); pos++) {
      if (func_name[pos] >= '0' && func_name[pos] <= '9')
        continue;
      func_name = func_name.substr(pos);
      break;
    }
    llvm::IRBuilder<> Builder(M->getContext());
    FunctionCallee fc =
        M->getOrInsertFunction(func_name + "_wrapper", LauncherFuncT);
    // The callee is expected to be a Function; cast<> asserts on that instead
    // of silently handing a null pointer to the code below.
    Function *WorkGroup = cast<Function>(fc.getCallee());
    BasicBlock *Block = BasicBlock::Create(M->getContext(), "", WorkGroup);
    Builder.SetInsertPoint(Block);
    // WorkGroup has only a single input
    Function::arg_iterator ai = WorkGroup->arg_begin();
    SmallVector<Value *, 8> Arguments;
    Value *input_arg = &*ai;
    // convert to int**
    input_arg = Builder.CreateBitOrPointerCast(
        input_arg, PointerType::get(PointerType::get(Int32T, 0), 0));
    size_t idx = 0;
    // replace original arguments with the unpacked values
    // for example, for a function f(int* a, char* b),
    // we will generate a function f_wrapper(int** input)
    // and replace the original arguments with the unpacked values
    // e.g., a = (int*)input[0], b = (char*)input[1]
    for (Function::const_arg_iterator ii = F->arg_begin(), ee = F->arg_end();
         ii != ee; ++ii) {
      Type *ArgType = ii->getType();
      // address of slot idx in the packed argument array
      Value *GEP = createGEP(Builder, input_arg, ConstantInt::get(Int32T, idx));
      // load the pointer stored in that slot
      GEP = createLoad(Builder, GEP);
      // reinterpret it as a pointer to the kernel's argument type
      GEP = Builder.CreateBitOrPointerCast(GEP, PointerType::get(ArgType, 0));
      // load the actual argument value
      Value *Arg = createLoad(Builder, GEP);
      Arguments.push_back(Arg);
      ++idx;
    }
    Builder.CreateCall(F, ArrayRef<llvm::Value *>(Arguments));
    Builder.CreateRetVoid();
  }
 }
 // after flat/hierarchical collapsing, the barrier instructions are useless:
 // erase every direct call to the NVVM barrier intrinsics from the module
 void remove_barrier(llvm::Module *M) {
  // Collect first and erase afterwards, so no instruction is removed while its
  // basic block is still being walked.
  std::vector<Instruction *> need_remove;
  for (Function &F : *M) {
    for (BasicBlock &BB : F) {
      for (Instruction &Inst : BB) {
        auto *Call = dyn_cast<CallInst>(&Inst);
        if (!Call || Call->isInlineAsm())
          continue;
        // getCalledFunction() returns nullptr for indirect calls; such a call
        // cannot be matched by name, so skip it instead of dereferencing null.
        Function *Callee = Call->getCalledFunction();
        if (!Callee)
          continue;
        StringRef func_name = Callee->getName();
        if (func_name == "llvm.nvvm.bar.warp.sync" ||
            func_name == "llvm.nvvm.barrier0" ||
            func_name == "llvm.nvvm.barrier.sync") {
          need_remove.push_back(Call);
        }
      }
    }
  }
  for (Instruction *inst : need_remove) {
    inst->eraseFromParent();
  }
 }
 // Erase the globals "intra_warp_index" and "inter_warp_index" from the
 // module. A global that is not found is skipped.
 void remove_useless_var(llvm::Module *M) {
  static const char *const useless_globals[] = {"intra_warp_index",
                                                "inter_warp_index"};
  for (const char *name : useless_globals) {
    // getGlobalVariable() returns nullptr when the lookup fails, so guard the
    // erase instead of dereferencing unconditionally
    if (GlobalVariable *var = M->getGlobalVariable(name))
      var->eraseFromParent();
  }
 }
 // Convert the transformed kernel module into a form that targets the host
 // CPU. The helpers below run in this fixed order.
 void generate_cpu_format(llvm::Module *M) {
  DEBUG_INFO("generate cpu format\n");
  // retarget: host target triple and default DataLayout
  set_meta_data(M);
  // create a "<kernel>_wrapper" function per kernel that unpacks the single
  // packed input argument
  decode_input(M);
  // erase the calls to the llvm.nvvm barrier intrinsics
  remove_barrier(M);
  // erase the intra_warp_index / inter_warp_index globals
  remove_useless_var(M);
 }
--- a/compilation/KernelTranslation/src/cpu/handle_sync.cpp
+++ b/compilation/KernelTranslation/src/cpu/handle_sync.cpp
@ -1,13 +1,7 @@
 #include "handle_sync.h"
 #include "debug.hpp"
 #include "tool.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include <set>
 #include <string>
--- a/compilation/KernelTranslation/src/cpu/init.cpp
+++ b/compilation/KernelTranslation/src/cpu/init.cpp
@ -2,26 +2,11 @@
 #include "debug.hpp"
 #include "memory_hierarchy.h"
 #include "tool.h"
 #include <fstream>
 #include <iostream>
 #include <set>
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/PassInfo.h"
 #include "llvm/PassRegistry.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Transforms/IPO/PassManagerBuilder.h"
 #include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <set>
 using namespace llvm;
@ -31,11 +16,9 @@ bool inline_warp_level_func(llvm::Module *M) {
  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
    Function *F = &(*i);
    auto func_name = F->getName().str();
    if (!isKernelFunction(M, F))
      continue;
-    Function::iterator I = F->begin();
+    for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) {
    for (Function::iterator E = F->end(); I != E; ++I) {
      for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
        if (CallInst *c = dyn_cast<CallInst>(BI++)) {
          if (c->getCalledFunction()) {
@ -60,8 +43,7 @@ bool inline_warp_level_func(llvm::Module *M) {
 }
 bool find_sreg_inst(llvm::Function *F) {
-  Function::iterator I = F->begin();
+  for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) {
  for (Function::iterator E = F->end(); I != E; ++I) {
    for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
      if (CallInst *c = dyn_cast<CallInst>(BI++)) {
        if (c->getCalledFunction()) {
@ -229,14 +211,12 @@ void llvm_preprocess(llvm::Module *M) {
  Passes.run(*M);
 }
 // transform constant expression into sequence of instructions
 bool lower_constant_expr(llvm::Module *M) {
  bool modified = false;
  LLVMContext &context = M->getContext();
  auto I32 = llvm::Type::getInt32Ty(context);
  std::vector<CallInst *> need_remove;
  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
    Function *F = &(*i);
    auto func_name = F->getName().str();
    if (!isKernelFunction(M, F))
      continue;
@ -301,8 +281,8 @@ bool lower_constant_expr(llvm::Module *M) {
  return modified;
 }
 // replace _ZL3expd, just delete its body
 void replace_cuda_math_built_in(llvm::Module *M) {
  // replace _ZL3expd, just delete its body
  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
    Function *F = &(*i);
    auto func_name = F->getName().str();
--- a/compilation/KernelTranslation/src/cpu/insert_sync.cpp
+++ b/compilation/KernelTranslation/src/cpu/insert_sync.cpp
@ -4,29 +4,9 @@
 #include "handle_sync.h"
 #include "tool.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/PostDominators.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/ValueSymbolTable.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/PassInfo.h"
 #include "llvm/PassRegistry.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/IPO/PassManagerBuilder.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
 #include <iostream>
 #include <queue>
 using namespace llvm;
@ -44,7 +24,7 @@ public:
    std::vector<llvm::Instruction *> insert_intra_warp_sync_before;
    std::vector<llvm::Instruction *> insert_inter_warp_sync_before;
-    // insert sync in the entry
+    // insert sync after the entry and before the first non-AllocaInst
    BasicBlock *entry = &(*F.begin());
    for (auto i = entry->begin(); i != entry->end(); i++) {
      if (!isa<AllocaInst>(i)) {
@ -54,10 +34,8 @@ public:
    }
    for (Function::iterator I = F.begin(); I != F.end(); ++I) {
      BasicBlock::iterator BI = I->begin();
      // insert barrier before return
-      for (; BI != I->end(); BI++) {
+      for (BasicBlock::iterator BI = I->begin(); BI != I->end(); BI++) {
        llvm::ReturnInst *Ret = llvm::dyn_cast<llvm::ReturnInst>(&(*BI));
        if (Ret) {
          insert_inter_warp_sync_before.push_back(&(*BI));
@ -125,7 +103,7 @@ public:
    auto PDT = &getAnalysis<PostDominatorTreeWrapperPass>();
-    // first find all conditional barriers
+    // find all conditional barriers
    std::vector<BasicBlock *> conditionalBarriers;
    for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i) {
      BasicBlock *b = &*i;
@ -148,12 +126,9 @@ public:
      conditionalBarriers.pop_back();
      // insert barrier in the start of if-condition
      BasicBlock *pos = b;
      BasicBlock *pred = firstNonBackedgePredecessor(b);
      while (PDT->getPostDomTree().dominates(b, pred)) {
        pos = pred;
        // If our BB post dominates the given block, we know it is not the
        // branching block that makes the barrier conditional.
        pred = firstNonBackedgePredecessor(pred);
@ -468,7 +443,6 @@ public:
      auto header_block = L->getHeader();
      assert(header_block->getTerminator()->getNumSuccessors() == 2 &&
             "has more than 2 successors of the for-head\n");
      BasicBlock *for_body = NULL;
      for (int i = 0; i < header_block->getTerminator()->getNumSuccessors();
           i++) {
        auto bb = header_block->getTerminator()->getSuccessor(i);
--- a/compilation/KernelTranslation/src/cpu/insert_warp_loop.cpp
+++ b/compilation/KernelTranslation/src/cpu/insert_warp_loop.cpp
@ -4,43 +4,20 @@
 #include "handle_sync.h"
 #include "tool.h"
 #include <assert.h>
 #include <iostream>
 #include <set>
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/Analysis/DivergenceAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/PostDominators.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/ValueSymbolTable.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/PassInfo.h"
 #include "llvm/PassRegistry.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Transforms/IPO/PassManagerBuilder.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
 #include <map>
 #include <set>
 #include <sstream>
 #include <tuple>
 #include <vector>
 using namespace llvm;
@ -115,10 +92,7 @@ llvm::Instruction *GetContextArray(llvm::Instruction *instruction,
  BasicBlock &bb = instruction->getParent()->getParent()->getEntryBlock();
  IRBuilder<> builder(&*(bb.getFirstInsertionPt()));
  Function *FF = instruction->getParent()->getParent();
  Module *M = instruction->getParent()->getParent()->getParent();
  LLVMContext &C = M->getContext();
  const llvm::DataLayout &Layout = M->getDataLayout();
  llvm::Type *elementType;
  if (isa<AllocaInst>(instruction)) {
@ -129,8 +103,6 @@ llvm::Instruction *GetContextArray(llvm::Instruction *instruction,
  }
  Type *AllocType = elementType;
  AllocaInst *InstCast = dyn_cast<AllocaInst>(instruction);
  llvm::Value *ItemSize = nullptr;
  llvm::AllocaInst *Alloca = nullptr;
  auto block_size_addr = M->getGlobalVariable("block_size");
@ -697,9 +669,6 @@ public:
            is_single_conditional_branch_block = 1;
          } else {
            // generate by replicate local variable
            printf(
                "[WARNING] match single conditional branch with HARD CODE\n");
            bool branch_to_intra_init = false;
            for (unsigned suc = 0; suc < br->getNumSuccessors(); ++suc) {
              llvm::BasicBlock *entryCandidate = br->getSuccessor(suc);
              auto block_name = entryCandidate->getName().str();
@ -755,7 +724,7 @@ public:
      entry = entryCandidate;
      break;
    }
-    // delete useless PR, those PRs only have branch
+    // delete useless PR, those PRs only have branch instructions
    if (entry == exit) {
      if (entry->size() == 1 && isa<llvm::BranchInst>(entry->begin())) {
        return;
--- a/compilation/KernelTranslation/src/cpu/memory_hierarchy.cpp
+++ b/compilation/KernelTranslation/src/cpu/memory_hierarchy.cpp
@ -1,29 +1,10 @@
 #include "memory_hierarchy.h"
 #include "debug.hpp"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
 #include <assert.h>
 #include <fstream>
 #include <iostream>
 #include <map>
 #include <set>
 #include <sstream>
 #include <tuple>
 #include <vector>
 void mem_share2global(llvm::Module *M) {
  LLVMContext *C = &M->getContext();
  llvm::Type *Int32T = Type::getInt32Ty(*C);
  llvm::Type *Int64T = Type::getInt64Ty(*C);
  llvm::Type *Int8T = Type::getInt8Ty(*C);
  std::map<GlobalVariable *, GlobalVariable *> corresponding_global_memory;
  std::set<llvm::Instruction *> need_remove;
  std::set<GlobalVariable *> need_remove_share_memory;
@ -45,7 +26,6 @@ void mem_share2global(llvm::Module *M) {
              // generate global type pointer
              PointerType *PointerTy =
                  PointerType::get(array_type->getElementType(), 0);
              llvm::Constant *x1 = ConstantPointerNull::get(PointerTy);
              llvm::GlobalVariable *global_ptr = new llvm::GlobalVariable(
                  *M, PointerTy, false, llvm::GlobalValue::ExternalLinkage,
                  NULL, "dynamic_shared_memory", NULL,
@ -75,7 +55,7 @@ void mem_share2global(llvm::Module *M) {
                std::pair<GlobalVariable *, GlobalVariable *>(share_memory,
                                                              global_memory));
          } else if (element_type->isFloatTy()) {
-            auto FP_type = llvm::Type::getFloatTy(*C);
+            auto FP_type = llvm::Type::getFloatTy(M->getContext());
            auto zero = llvm::ConstantFP::get(FP_type, 0);
            llvm::GlobalVariable *global_memory = new llvm::GlobalVariable(
                *M, FP_type, false, llvm::GlobalValue::ExternalLinkage, zero,
@ -128,11 +108,6 @@ void mem_share2global(llvm::Module *M) {
 }
 void mem_constant2global(llvm::Module *M, std::ofstream &fout) {
  LLVMContext *C = &M->getContext();
  llvm::Type *Int32T = Type::getInt32Ty(*C);
  llvm::Type *Int64T = Type::getInt64Ty(*C);
  llvm::Type *Int8T = Type::getInt8Ty(*C);
  std::map<GlobalVariable *, GlobalVariable *> corresponding_global_memory;
  std::set<llvm::Instruction *> need_remove;
  std::set<GlobalVariable *> need_remove_constant_memory;
@ -142,7 +117,7 @@ void mem_constant2global(llvm::Module *M, std::ofstream &fout) {
    if (GlobalVariable *constant_memory = dyn_cast<GlobalVariable>(I)) {
      if (auto PT = dyn_cast<PointerType>(I->getType())) {
        unsigned AS = PT->getAddressSpace();
-        if (AS == 4) { // find a share memory
+        if (AS == 4) { // find a constant memory
          need_remove_constant_memory.insert(constant_memory);
          // generate the corresponding global memory variable
          auto new_name = "wrapper_global_" + constant_memory->getName().str();
@ -150,7 +125,7 @@ void mem_constant2global(llvm::Module *M, std::ofstream &fout) {
          if (auto array_type = dyn_cast<ArrayType>(element_type)) {
            if (constant_memory->hasExternalLinkage() &&
                array_type->getArrayNumElements() == 0) {
-              // external shared memory of []
+              // external constant memory of []
              // generate global type pointer
              PointerType *PointerTy =
                  PointerType::get(array_type->getElementType(), 0);
--- a/compilation/KernelTranslation/src/cpu/performance.cpp
+++ b/compilation/KernelTranslation/src/cpu/performance.cpp
@ -1,43 +1,13 @@
 #include "performance.h"
 #include "debug.hpp"
 #include "tool.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/PostDominators.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/ValueSymbolTable.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/PassInfo.h"
 #include "llvm/PassRegistry.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Host.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Transforms/IPO/PassManagerBuilder.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
 #include <map>
 #include <set>
 #include <sstream>
 #include <tuple>
 #include <vector>
 using namespace llvm;
@ -53,7 +23,7 @@ void performance_optimization(llvm::Module *M) {
  llvm::legacy::PassManager Passes;
  // add target machine info
-  llvm::Triple triple("x86_64-unknown-linux-gnu");
+  llvm::Triple triple(llvm::sys::getProcessTriple());
  std::string Error;
  const Target *TheTarget = TargetRegistry::lookupTarget("", triple, Error);
@ -62,7 +32,7 @@ void performance_optimization(llvm::Module *M) {
  Options.FloatABIType = FloatABI::Hard;
  TargetMachine *TM = TheTarget->createTargetMachine(
-      triple.getTriple(), llvm::sys::getHostCPUName().str(), StringRef("+m,+f"),
+      triple.getTriple(), llvm::sys::getHostCPUName().str(), StringRef(""),
      Options, Reloc::PIC_, CodeModel::Small, CodeGenOpt::Aggressive);
  assert(TM && "No Machine Information\n");
@ -80,9 +50,6 @@ void performance_optimization(llvm::Module *M) {
  Builder.LoopVectorize = true;
  Builder.SLPVectorize = true;
  Builder.VerifyInput = true;
  Builder.VerifyOutput = true;
  Builder.populateModulePassManager(Passes);
  Passes.run(*M);
 }
--- a/compilation/KernelTranslation/src/cpu/tool.cpp
+++ b/compilation/KernelTranslation/src/cpu/tool.cpp
@ -1,29 +1,13 @@
 #include "tool.h"
 #include "debug.hpp"
 #include "llvm/Bitcode/BitcodeWriter.h"
 #include "llvm/Config/llvm-config.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/IRReader/IRReader.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorOr.h"
 #include "llvm/Support/FileSystem.h"
-#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/ToolOutputFile.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
 #include <iostream>
 #include <set>
 using namespace llvm;
@ -133,7 +117,7 @@ llvm::Instruction *BreakPHIToAllocas(PHINode *phi) {
    Value *val = phi->getIncomingValue(incoming);
    BasicBlock *incomingBB = phi->getIncomingBlock(incoming);
    builder.SetInsertPoint(incomingBB->getTerminator());
-    llvm::Instruction *store = builder.CreateStore(val, alloca);
+    builder.CreateStore(val, alloca);
  }
  builder.SetInsertPoint(phi);
@ -164,7 +148,6 @@ void phi2alloc(llvm::Module *M) {
      }
    }
    bool changed = false;
    for (InstructionVec::iterator i = PHIs.begin(); i != PHIs.end(); ++i) {
      Instruction *instr = *i;
      BreakPHIToAllocas(dyn_cast<PHINode>(instr));
@ -279,9 +262,7 @@ void replace_built_in_function(llvm::Module *M) {
    for (auto BB = F->begin(); BB != F->end(); ++BB) {
      for (auto BI = BB->begin(); BI != BB->end(); BI++) {
-        if (auto Load = dyn_cast<LoadInst>(BI)) {
+        if (auto Call = dyn_cast<CallInst>(BI)) {
          auto load_from = Load->getOperand(0);
        } else if (auto Call = dyn_cast<CallInst>(BI)) {
          if (Call->getCalledFunction()) {
            auto func_name = Call->getCalledFunction()->getName().str();
            if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.x" ||
@ -425,7 +406,6 @@ void replace_built_in_function(llvm::Module *M) {
        if (auto Call = dyn_cast<CallInst>(BI)) {
          if (Call->getCalledFunction()) {
            auto func_name = Call->getCalledFunction()->getName().str();
            auto callFn = Call->getCalledFunction();
            if (func_name == "vprintf") {
              /*
               * replace CUDA's printf to C's printf
@ -458,7 +438,7 @@ void replace_built_in_function(llvm::Module *M) {
                    dyn_cast<PointerType>(BC->getOperand(0)->getType());
                auto SrcTy = SrcPointTy->getElementType();
                // reverse the bitcast
-                auto reverse_BC = new BitCastInst(BC, SrcPointTy, "", Call);
+                new BitCastInst(BC, SrcPointTy, "", Call);
                assert(SrcTy->isStructTy() == 1);
                auto StructTy = dyn_cast<StructType>(SrcTy);
                for (int i = 0; i < StructTy->getNumElements(); i++) {
@ -528,7 +508,6 @@ void replace_built_in_function(llvm::Module *M) {
 void replace_asm_call(llvm::Module *M) {
  LLVMContext &context = M->getContext();
  auto I32 = llvm::Type::getInt32Ty(context);
  std::vector<CallInst *> need_remove;
  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
    Function *F = &(*i);
--- a/compilation/KernelTranslation/src/cpu/warp_func.cpp
+++ b/compilation/KernelTranslation/src/cpu/warp_func.cpp
@ -2,15 +2,6 @@
 #include "warp_func.h"
 #include "debug.hpp"
 #include "tool.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
 #include <iostream>
 #include <set>
 using namespace llvm;
@ -107,7 +98,7 @@ void handle_warp_vote(llvm::Module *M) {
      res = BinaryOperator::CreateNot(res, "", sync_inst);
    }
-    auto sotre_mask = new llvm::StoreInst(res, GEP, "", sync_inst);
+    new llvm::StoreInst(res, GEP, "", sync_inst);
    // create barrier
    CreateIntraWarpBarrier(sync_inst);
    /*
--- a/compilation/KernelTranslation/src/x86/generate_x86_format.cpp
+++ b/compilation/KernelTranslation/src/x86/generate_x86_format.cpp
@ -1,186 +0,0 @@
 #include "generate_x86_format.h"
 #include "debug.hpp"
 #include "tool.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/IRReader/IRReader.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Support/ToolOutputFile.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
 #include <iostream>
 #include <map>
 using namespace llvm;
 // Stamp the module with a fixed x86-64 Linux target triple and the matching
 // data-layout string. Both values are hard-coded here.
 void set_meta_data(llvm::Module *M) {
  const char *const target_triple = "x86_64-unknown-linux-gnu";
  const char *const data_layout =
      "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128";
  M->setTargetTriple(target_triple);
  M->setDataLayout(data_layout);
 }
 // As pthread only accepts a single void* for input, we have to decode this
 // input inside the kernel.
 //
 // For every function F for which isKernelFunction(M, F) holds, this emits a
 // function "<name>_wrapper" of type void(i8*). The wrapper treats its single
 // argument as an array of pointers, loads one value per parameter of F through
 // those pointers, and then calls F with the loaded values.
 void decode_input(llvm::Module *M) {
  // NOTE(review): nothing in this function inserts into need_remove, so the
  // erase loop at the end never runs.
  std::set<llvm::Function *> need_remove;
  LLVMContext *C = &M->getContext();
  llvm::Type *Int32T = Type::getInt32Ty(*C);
  llvm::Type *Int8T = Type::getInt8Ty(*C);
  // Wrapper signature: void(i8*)
  llvm::FunctionType *LauncherFuncT = FunctionType::get(
      Type::getVoidTy(*C), {PointerType::get(Int8T, 0)}, false);
  // NOTE(review): dynmaic_memory is only written to and
  // corres_dynamic_memory_load_address is never used in this function.
  std::set<GlobalVariable *> dynmaic_memory;
  std::map<GlobalVariable *, Value *> corres_dynamic_memory_load_address;
  // generate Wrapper Function type
  // now we only support a single int32*
  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
    Function *F = &(*i);
    if (!isKernelFunction(M, F))
      continue;
    auto func_name = F->getName().str();
    // remove mangle prefix
    // remove _Z24
    // Skips the first two characters, then any run of decimal digits, and
    // keeps the rest of the name starting at the first non-digit character.
    for (int pos = 2; pos < func_name.length(); pos++) {
      if (func_name[pos] >= '0' && func_name[pos] <= '9')
        continue;
      func_name = func_name.substr(pos);
      break;
    }
    llvm::IRBuilder<> Builder(M->getContext());

    // Declare (or fetch) "<name>_wrapper" and start its single basic block.
    FunctionCallee fc =
        M->getOrInsertFunction(func_name + "_wrapper", LauncherFuncT);
    Function *WorkGroup = dyn_cast<Function>(fc.getCallee());

    BasicBlock *Block = BasicBlock::Create(M->getContext(), "", WorkGroup);
    Builder.SetInsertPoint(Block);

    // WorkGroup has only a single input
    Function::arg_iterator ai = WorkGroup->arg_begin();

    SmallVector<Value *, 8> Arguments;
    Value *input_arg = &*ai;
    // convert to int**
    input_arg = Builder.CreateBitOrPointerCast(
        input_arg, PointerType::get(PointerType::get(Int32T, 0), 0));

    // dynamic memory load in the wrapper function
    GlobalVariable *share_memory = M->getGlobalVariable("wrapper_global_data");
    if (share_memory != NULL) {
      dynmaic_memory.insert(share_memory);
      // External, thread-local i32 global "thread_memory_size"; its loaded
      // value is used below as the element count of the i8 alloca.
      llvm::GlobalVariable *global_mem = new llvm::GlobalVariable(
          *M, Int32T, false, llvm::GlobalValue::ExternalLinkage, NULL,
          "thread_memory_size", NULL, llvm::GlobalValue::GeneralDynamicTLSModel,
          0, false);
      Value *loadedValue = createLoad(Builder, global_mem);
      // Emit an accessor function "_wrapper_global_data" that returns
      // share_memory, with weak ODR linkage, hidden visibility and an
      // "any"-selection comdat.
      llvm::FunctionType *LaunchFun2 = FunctionType::get(
          PointerType::get(PointerType::get(Int32T, 0), 0), NULL);
      FunctionCallee fc2 =
          M->getOrInsertFunction("_wrapper_global_data", LaunchFun2);
      Function *WorkGroup2 = dyn_cast<Function>(fc2.getCallee());
      WorkGroup2->setLinkage(GlobalValue::WeakODRLinkage);
      WorkGroup2->setVisibility(GlobalValue::HiddenVisibility);
      Comdat *co = M->getOrInsertComdat("_wrapper_global_data");
      co->setSelectionKind(Comdat::SelectionKind::Any);
      WorkGroup2->setComdat(co);
      BasicBlock *Block2 = BasicBlock::Create(M->getContext(), "", WorkGroup2);
      llvm::IRBuilder<> Builder2(M->getContext());
      Builder2.SetInsertPoint(Block2);
      Builder2.CreateRet(share_memory);
      // NOTE(review): PT and element_type are computed but not used afterwards.
      auto PT = dyn_cast<PointerType>(share_memory->getType());
      auto element_type = PT->getElementType();

      // Allocate the i8 buffer inside the wrapper and store its address into
      // share_memory (viewed as an i8** slot).
      AllocaInst *new_arr = Builder.CreateAlloca(Int8T, loadedValue, "new_arr");
      Value *new_ar = new_arr;
      Value *gptr = Builder.CreateBitOrPointerCast(
          share_memory, PointerType::get(PointerType::get(Int8T, 0), 0));

      Builder.CreateStore(new_ar, gptr);
    }
    // Unpack one argument per kernel parameter from the pointer array.
    size_t idx = 0;
    for (Function::const_arg_iterator ii = F->arg_begin(), ee = F->arg_end();
         ii != ee; ++ii) {
      Type *ArgType = ii->getType();

      // calculate addr
      Value *GEP = createGEP(Builder, input_arg, ConstantInt::get(Int32T, idx));
      // load corresponding int*
      GEP = createLoad(Builder, GEP);
      // bitcast
      GEP = Builder.CreateBitOrPointerCast(GEP, PointerType::get(ArgType, 0));
      Value *Arg = createLoad(Builder, GEP);
      Arguments.push_back(Arg);
      ++idx;
    }
    // Forward the decoded arguments to the original kernel and return.
    CallInst *c = Builder.CreateCall(F, ArrayRef<llvm::Value *>(Arguments));
    Builder.CreateRetVoid();
  }
  for (auto f : need_remove) {
    f->dropAllReferences();
    f->eraseFromParent();
  }
 }
 // Delete every call to the NVVM barrier intrinsics
 // (llvm.nvvm.bar.warp.sync, llvm.nvvm.barrier0, llvm.nvvm.barrier.sync)
 // from all functions in the module.
 //
 // Matching calls are collected first and erased afterwards so that the
 // instruction iterators are not invalidated while walking the basic blocks.
 void remove_barrier(llvm::Module *M) {
  std::vector<Instruction *> need_remove;
  for (auto F = M->begin(); F != M->end(); ++F)
    for (auto BB = F->begin(); BB != F->end(); ++BB) {
      for (auto BI = BB->begin(); BI != BB->end(); BI++) {
        if (auto Call = dyn_cast<CallInst>(BI)) {
          if (Call->isInlineAsm())
            continue;
          // getCalledFunction() returns null for indirect calls; such calls
          // cannot be one of the barrier intrinsics, so skip them instead of
          // dereferencing a null pointer.
          auto Callee = Call->getCalledFunction();
          if (!Callee)
            continue;
          auto func_name = Callee->getName().str();
          if (func_name == "llvm.nvvm.bar.warp.sync" ||
              func_name == "llvm.nvvm.barrier0" ||
              func_name == "llvm.nvvm.barrier.sync") {
            need_remove.push_back(Call);
          }
        }
      }
    }
  for (auto inst : need_remove) {
    inst->eraseFromParent();
  }
 }
 // Erase the helper globals "intra_warp_index" and "inter_warp_index" from the
 // module. A global that is not present in the module is skipped, because
 // getGlobalVariable() returns null when the name is not found.
 void remove_useless_var(llvm::Module *M) {
  if (auto *intra_warp = M->getGlobalVariable("intra_warp_index"))
    intra_warp->eraseFromParent();
  if (auto *inter_warp = M->getGlobalVariable("inter_warp_index"))
    inter_warp->eraseFromParent();
 }
 // Entry point of the x86 lowering step: runs the four module-level
 // transformations defined in this file, in a fixed order.
 void generate_x86_format(llvm::Module *M) {
  DEBUG_INFO("generate x86 format\n");
  // change metadata
  set_meta_data(M);
  // decode argument
  decode_input(M);
  // remove barrier
  remove_barrier(M);
  // remove useless func/variable
  remove_useless_var(M);
 }
--- a/runtime/CMakeLists.txt
+++ b/runtime/CMakeLists.txt
@ -1,9 +1,9 @@
 cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
 project(
-  X86runtime
+  CPUruntime
-  DESCRIPTION "Implementation CUDA runtime API with x86"
+  DESCRIPTION "Implementation CUDA runtime API with CPUs"
  LANGUAGES CXX)
-set(LIB_NAME x86Runtime)
+set(LIB_NAME CPUruntime)
 set(CMAKE_VERBOSE_MAKEFILE ON)
 # compile threadPool implementation
@ -12,9 +12,9 @@ add_subdirectory(threadPool)
 # compile CPU runtime library
 include_directories(../common)
 include_directories(./include/)
-include_directories(./include/x86)
+include_directories(./include/cpu)
 include_directories(./threadPool/include/)
-include_directories(./threadPool/include/x86)
+include_directories(./threadPool/include/cpu)
 include_directories(../external/moodycamel/)
-file(GLOB proj_SOURCES "src/x86/*.cpp")
+file(GLOB proj_SOURCES "src/cpu/*.cpp")
 add_library(${LIB_NAME} SHARED ${proj_SOURCES})
--- a/runtime/include/cpu/cudaKernelImpl.h
+++ b/runtime/include/cpu/cudaKernelImpl.h
@ -1,4 +1,4 @@
-#ifndef __RUNTIME_IMPL__
+#ifndef __KERNEL_IMPL__
 #define __KERNEL_IMPL__
 #include "structures.h"
 #include <stdint.h>
--- a/runtime/include/cpu/cudaRuntimeImpl.h
+++ b/runtime/include/cpu/cudaRuntimeImpl.h
--- a/runtime/src/cpu/cudaKernelImpl.cpp
+++ b/runtime/src/cpu/cudaKernelImpl.cpp
--- a/runtime/src/cpu/cudaRuntimeImpl.cpp
+++ b/runtime/src/cpu/cudaRuntimeImpl.cpp
@ -10,27 +10,31 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 // Always reports device 0 through devPtr and returns cudaSuccess.
 cudaError_t cudaGetDevice(int *devPtr) {
  *devPtr = 0;
  return cudaSuccess;
 }
 // NOTE(review): the error argument is ignored; the same string is returned
 // for every error code.
 const char *cudaGetErrorName(cudaError_t error) { return "SUCCESS\n"; }
-cudaError_t cudaDeviceReset(void) {
+
-  scheduler_uninit();
+cudaError_t cudaDeviceReset(void) { return cudaSuccess; }
-  return cudaSuccess;
+
 }
 // Calls cuSynchronizeBarrier() and then returns cudaSuccess.
 cudaError_t cudaDeviceSynchronize(void) {
  cuSynchronizeBarrier();
  return cudaSuccess;
 }
 // Same implementation as cudaDeviceSynchronize(): calls
 // cuSynchronizeBarrier() and then returns cudaSuccess.
 cudaError_t cudaThreadSynchronize(void) {
  cuSynchronizeBarrier();
  return cudaSuccess;
 }
 // Releases devPtr with free() and always returns cudaSuccess.
 cudaError_t cudaFree(void *devPtr) {
  free(devPtr);
  return cudaSuccess;
 }
 cudaError_t cudaFreeHost(void *devPtr) {
  free(devPtr);
  return cudaSuccess;
@ -47,20 +51,22 @@ cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
  cu_kernel *ker =
      create_kernel(func, gridDim, blockDim, args, sharedMem, stream);
-  int lstatus = cuLaunchKernel(&ker);
+  cuLaunchKernel(&ker);
  return cudaSuccess;
 }
 // Allocate `size` bytes with malloc() and return the pointer through devPtr.
 //
 // Returns cudaErrorMemoryAllocation when the allocation fails, otherwise
 // cudaSuccess. The check is made on the allocated pointer (*devPtr), not on
 // the output parameter itself, which has already been dereferenced by then.
 // A zero-byte request is not treated as a failure even if malloc(0) yields
 // NULL, since the C standard allows that result.
 cudaError_t cudaMalloc(void **devPtr, size_t size) {
  *devPtr = malloc(size);
  if (*devPtr == NULL && size != 0)
    return cudaErrorMemoryAllocation;
  return cudaSuccess;
 }
 // Fills count bytes at devPtr with value via memset() and always returns
 // cudaSuccess.
 cudaError_t cudaMemset(void *devPtr, int value, size_t count) {
  memset(devPtr, value, count);
  return cudaSuccess;
 }
 cudaError_t cudaMemcpy(void *dst, const void *src, size_t count,
                       cudaMemcpyKind kind) {
  if (kind == cudaMemcpyHostToHost) {
@ -105,7 +111,6 @@ cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src) {
  return cudaSuccess;
 }
 static int stream_counter = 1;
 /*
 From our evaluation, CPU backend can gain little benefit
 from multi stream. Thus, we only use single stream
@ -159,6 +164,8 @@ static cudaError_t lastError = cudaSuccess;
 // Map an error code to one of two fixed messages: one for cudaSuccess and
 // one shared by every other code.
 const char *cudaGetErrorString(cudaError_t error) {
  return (error == cudaSuccess) ? "Cuda Get Error Success"
                                : "Cuda Get Error Failed";
 }
--- a/runtime/threadPool/CMakeLists.txt
+++ b/runtime/threadPool/CMakeLists.txt
@ -10,11 +10,10 @@ set(CMAKE_VERBOSE_MAKEFILE ON)
 set(LIB_NAME threadPool)
 set(CMAKE_CXX_STANDARD 14)
 set(CMAKE_BUILD_TYPE Debug)
 include_directories(../../common)
 include_directories(./include)
-include_directories(./include/x86)
+include_directories(./include/cpu)
 include_directories(../../external/moodycamel)
-file(GLOB proj_SOURCES "src/x86/*.cpp")
+file(GLOB proj_SOURCES "src/cpu/*.cpp")
 add_library(${LIB_NAME} SHARED ${proj_SOURCES})
--- a/runtime/threadPool/include/cpu/api.h
+++ b/runtime/threadPool/include/cpu/api.h
--- a/runtime/threadPool/include/cpu/def.h
+++ b/runtime/threadPool/include/cpu/def.h
--- a/runtime/threadPool/include/cpu/macros.h
+++ b/runtime/threadPool/include/cpu/macros.h
--- a/runtime/threadPool/include/cpu/structures.h
+++ b/runtime/threadPool/include/cpu/structures.h
--- a/runtime/threadPool/src/cpu/api.cpp
+++ b/runtime/threadPool/src/cpu/api.cpp
@ -1,3 +1,19 @@
 /*
  This file contains the implementation of the CPU thread pool. For a kernel
  launch, the host thread will enqueue the kernel to the kernelQueue, and the
  threads in the thread pool will try to fetch work from the queue. After a
  thread fetches a kernel from the queue, it will execute the kernel. After the
  kernel execution, the thread will try to fetch another kernel from the queue.
  If the queue is empty, the thread will wait for the next kernel launch.
  By default, we try to use all CPU cores for execution. Thus, for a kernel
  launch, the host thread pushes P kernel variables to the queue, where P is the
  number of CPU cores.
  For some lightweight kernels, using fewer CPU cores can reduce the overall
  execution time, because fewer CPU cores lead to lower synchronization overhead.
 */
 #include "api.h"
 #include "blockingconcurrentqueue.h"
 #include "debug.hpp"
@ -9,9 +25,6 @@
 #include <stdlib.h>
 #include <thread>
 /*
 Initialize the device
 */
 int device_max_compute_units = 1;
 bool device_initilized = false;
 int init_device() {
@ -32,7 +45,6 @@ int init_device() {
 }
 // Create Kernel
 static int kernelIds = 0;
 cu_kernel *create_kernel(const void *func, dim3 gridDim, dim3 blockDim,
                         void **args, size_t sharedMem, cudaStream_t stream) {
  cu_kernel *ker = (cu_kernel *)calloc(1, sizeof(cu_kernel));
@ -71,13 +83,11 @@ __thread int warp_shfl[32] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 };
 /*
    Enqueue Kernel (k) to the scheduler kernelQueue
 */
 int TaskToExecute;
 // Enqueue Kernel to the scheduler kernelQueue
 int schedulerEnqueueKernel(cu_kernel *k) {
-  int totalBlocks =
+  int totalBlocks = k->totalBlocks;
-      k->totalBlocks; // calculate gpu_block_to_execute_per_cpu_thread
+  // by default, all CPU cores are used to execute GPU blocks equally
  int gpuBlockToExecutePerCpuThread =
      (totalBlocks + device_max_compute_units - 1) / device_max_compute_units;
  TaskToExecute = (totalBlocks + gpuBlockToExecutePerCpuThread - 1) /
@ -93,28 +103,21 @@ int schedulerEnqueueKernel(cu_kernel *k) {
  return C_SUCCESS;
 }
-/*
+// Push kernel to the kernelQueue
  Kernel Launch with numBlocks and numThreadsPerBlock
 */
 int cuLaunchKernel(cu_kernel **k) {
  if (!device_initilized) {
    init_device();
  }
  // Calculate Block Size N/numBlocks
  cu_kernel *ker = *k;
  int status = C_RUN;
  // set complete to false, this variable is used for sync
  for (int i = 0; i < scheduler->num_worker_threads; i++) {
    scheduler->thread_pool[i].completeTask = 0;
  }
-  schedulerEnqueueKernel(ker);
+  schedulerEnqueueKernel(*k);
  return 0;
 }
-/*
+// threads in thread-pool try to fetch work from the queue
    Thread Gets Work
 */
 int get_work(c_thread *th) {
  int dynamic_shared_mem_size = 0;
  dim3 gridDim;
@ -136,6 +139,7 @@ int get_work(c_thread *th) {
      grid_size_x = gridDim.x;
      grid_size_y = gridDim.y;
      grid_size_z = gridDim.z;
      // allocate dynamic shared memory
      if (dynamic_shared_mem_size > 0)
        dynamic_shared_memory = (int *)malloc(dynamic_shared_mem_size);
      // execute GPU blocks
@ -153,7 +157,8 @@ int get_work(c_thread *th) {
    }
    // if cannot get tasks, check whether programs stop
    if (scheduler->threadpool_shutdown_requested) {
-      return true; // thread exit
+      // thread exit
      break;
    }
  }
  return 0;
@ -176,9 +181,7 @@ void *driver_thread(void *p) {
  }
 }
-/*
+// Initialize the scheduler
 Initialize the scheduler
 */
 int scheduler_init(cu_device device) {
  scheduler = (cu_pool *)calloc(1, sizeof(cu_pool));
  scheduler->num_worker_threads = device.max_compute_units;
@ -198,8 +201,6 @@ int scheduler_init(cu_device device) {
  return C_SUCCESS;
 }
 // Not implemented: the body is a single assert(0), so calling this fails the
 // assertion whenever assertions are compiled in.
 void scheduler_uninit() { assert(0 && "Scheduler Unitit no Implemente\n"); }
 /*
  Barrier for Kernel Launch
 */
--- a/test/runHeteroMark.sh
+++ b/test/runHeteroMark.sh
@ -30,7 +30,7 @@ g++ -o $1 -fPIC -no-pie \
    $HeteroMark_PATH/src/$1/cuda/main.cc host.o kernel.o $HeteroMark_PATH/src/$1/*.cc  $HeteroMark_PATH/src/common/benchmark/*.cc \
    $HeteroMark_PATH/src/common/command_line_option/*.cc  $HeteroMark_PATH/src/common/time_measurement/*.cc \
    -L$CuPBoP_BUILD_PATH/runtime   -L$CuPBoP_BUILD_PATH/runtime/threadPool \
-    -I$HeteroMark_PATH -I$CUDA_PATH/include -lpthread -lc -lx86Runtime -lthreadPool
+    -I$HeteroMark_PATH -I$CUDA_PATH/include -lpthread -lc -lCPUruntime -lthreadPool
 case $1 in
  aes)