Refactor the codebase: remove unused variables, add comments, remove unneeded header files, and remove hard-coded values to support both x86 and ARM CPUs.

This commit is contained in:
Ruobing Han 2023-12-13 14:29:17 -05:00
parent 50d615da64
commit fd56811650
50 changed files with 249 additions and 531 deletions

View File

@ -51,7 +51,7 @@ jobs:
hostTranslator reverse-host-x86_64-unknown-linux-gnu.bc host.bc
llc --relocation-model=pic --filetype=obj kernel.bc
llc --relocation-model=pic --filetype=obj host.bc
g++ -o reverse -fPIC -no-pie -L${{ github.workspace }}/build/runtime -L${{ github.workspace }}/build/runtime/threadPool host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread
g++ -o reverse -fPIC -no-pie -L${{ github.workspace }}/build/runtime -L${{ github.workspace }}/build/runtime/threadPool host.o kernel.o -lc -lCPUruntime -lthreadPool -lpthread
./reverse
- name: Execute the dynamic shared memory demo
run: |
@ -63,7 +63,7 @@ jobs:
hostTranslator reverse-host-x86_64-unknown-linux-gnu.bc host.bc
llc --relocation-model=pic --filetype=obj kernel.bc
llc --relocation-model=pic --filetype=obj host.bc
g++ -o reverse -fPIC -no-pie -L${{ github.workspace }}/build/runtime -L${{ github.workspace }}/build/runtime/threadPool host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread
g++ -o reverse -fPIC -no-pie -L${{ github.workspace }}/build/runtime -L${{ github.workspace }}/build/runtime/threadPool host.o kernel.o -lc -lCPUruntime -lthreadPool -lpthread
./reverse
- name: Execute Hetero-mark benchmark
run: |
@ -79,5 +79,5 @@ jobs:
hostTranslator kernel_gpu_cuda_wrapper-host-x86_64-unknown-linux-gnu.bc host.bc
llc --relocation-model=pic --filetype=obj kernel.bc
llc --relocation-model=pic --filetype=obj host.bc
g++ -o lavaMD -fPIC -no-pie -I${{ github.workspace }}/runtime/threadPool/include -I${{ github.workspace }}/cuda-10.1/include -L${{ github.workspace }}/build/runtime -L${{ github.workspace }}/build/runtime/threadPool main.c host.o kernel.o util/timer/timer.c util/num/num.c -lpthread -lc -lx86Runtime -lthreadPool -pthread
g++ -o lavaMD -fPIC -no-pie -I${{ github.workspace }}/runtime/threadPool/include -I${{ github.workspace }}/cuda-10.1/include -L${{ github.workspace }}/build/runtime -L${{ github.workspace }}/build/runtime/threadPool main.c host.o kernel.o util/timer/timer.c util/num/num.c -lpthread -lc -lCPUruntime -lthreadPool -pthread
./lavaMD -boxes1d 10

View File

@ -1,13 +1,9 @@
cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
project(CudaOnX86)
set(CMAKE_PROJECT_DESCRIPTION "Executing CUDA on X86 architecture.")
project(CuPBoP)
set(CMAKE_PROJECT_DESCRIPTION "Executing CUDA on non-NVIDIA architecture.")
set(CMAKE_CXX_STANDARD "14")
set(MAJOR_VERSION 0)
set(MINOR_VERSION 1)
set(VERSION_STRING ${MAJOR_VERSION}.${MINOR_VERSION})
set(COX_VERSION ${VERSION_STRING})
# Locate LLVM and derive the include path for its header files
if(DEFINED LLVM_CONFIG_PATH)
if(IS_ABSOLUTE "${LLVM_CONFIG_PATH}")
if(EXISTS "${LLVM_CONFIG_PATH}")
@ -32,7 +28,7 @@ if(DEFINED LLVM_CONFIG_PATH)
else()
message(FATAL_ERROR "llvm-config is required")
endif()
# get CUDA PATH
if(DEFINED CUDA_PATH)
message(STATUS "Using CUDA: ${CUDA_PATH}")
else()
@ -45,7 +41,7 @@ if(DEBUG)
endif()
set(CMAKE_CXX_FLAGS
"-I${CUDA_PATH}/include ${LLVM_CXX_FLAG} ${CMAKE_CXX_FLAGS}")
"-I${CUDA_PATH}/include ${LLVM_CXX_FLAG} ${CMAKE_CXX_FLAGS} -Wunused")
set(GCC_COVERAGE_LINK_FLAGS
"-L${LLVM_LIB_PATH} ${LLVM_LINK_FLAG} -lz -lrt -ldl -ltinfo -lpthread -lm")
@ -54,5 +50,8 @@ add_subdirectory(compilation)
add_subdirectory(runtime)
enable_testing()
option(HETERO_MARK_DATA "The path to download hetero-mark dataset." /tmp/data)
set(HETERO_MARK_DATA
"/tmp/data"
CACHE PATH "The path to download hetero-mark dataset.")
add_subdirectory(test)

View File

@ -75,7 +75,7 @@ g++ -o vecadd -fPIC -no-pie \
-L$CuPBoP_PATH/build/runtime \
-L$CuPBoP_PATH/build/runtime/threadPool \
host.o kernel.o \
-I../.. -lc -lx86Runtime -lthreadPool -lpthread
-I../.. -lc -lCPUruntime -lthreadPool -lpthread
# Execute
./vecadd
```

View File

@ -1,20 +1,16 @@
cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
project(
NVVM2X86
DESCRIPTION "Translate NVVM IR to LLVM IR for X86 backend"
LANGUAGES CXX)
set(CMAKE_VERBOSE_MAKEFILE ON)
# compile kernel translator
include_directories(./KernelTranslation/include/x86)
# build kernel translator
include_directories(./KernelTranslation/include/cpu)
add_subdirectory(KernelTranslation)
add_executable(kernelTranslator KernelTranslation.cpp)
target_link_libraries(kernelTranslator spmd2mpmd ${GCC_COVERAGE_LINK_FLAGS})
# compile host translator
include_directories(./HostTranslation/include/x86)
# build host translator
include_directories(./HostTranslation/include/cpu)
add_subdirectory(HostTranslation)
add_executable(hostTranslator HostTranslation.cpp)

View File

@ -4,12 +4,7 @@
#include "ReplaceCudaBuiltin.h"
#include "ReplaceKernelArgs.h"
#include "tool.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include <assert.h>
#include <fstream>
#include <iostream>
#include <stdlib.h>
using namespace llvm;

View File

@ -11,12 +11,11 @@ set(CMAKE_VERBOSE_MAKEFILE ON)
set(LIB_NAME cudaRuntime2cpuRuntime)
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_BUILD_TYPE Debug)
include_directories(./include/x86)
include_directories(./include/cpu)
include_directories(../../common)
file(GLOB proj_HEADERS "include/x86/*.h")
file(GLOB proj_SOURCES "src/x86/*.cpp")
file(GLOB proj_HEADERS "include/cpu/*.h")
file(GLOB proj_SOURCES "src/cpu/*.cpp")
# Add core library.
add_library(${LIB_NAME} SHARED ${proj_HEADERS} ${proj_SOURCES})

View File

@ -3,16 +3,8 @@
*/
#include "RemoveCudaBuiltin.h"
#include "debug.hpp"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/ToolOutputFile.h"
#include "llvm/Transforms/Utils/CtorUtils.h"
#include <iostream>
#include <map>
#include <set>
using namespace llvm;

View File

@ -1,16 +1,14 @@
#include "RemoveMetadata.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/ToolOutputFile.h"
#include <iostream>
#include "llvm/Support/Host.h"
using namespace llvm;
void RemoveMetadata(llvm::Module *M) {
// change the target triple to the host triple
M->setTargetTriple(llvm::sys::getProcessTriple());
// use the default DataLayout
M->setDataLayout("");
SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
Function *F = &(*i);
@ -22,5 +20,7 @@ void RemoveMetadata(llvm::Module *M) {
F->removeFnAttr("min-legal-vector-width");
F->removeFnAttr("no-trapping-math");
F->removeFnAttr(llvm::Attribute::OptimizeNone);
F->removeFnAttr("target-cpu");
F->removeFnAttr("target-features");
}
}

View File

@ -1,12 +1,7 @@
#include "ReplaceConstantMemory.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include <assert.h>
#include <fstream>
#include <iostream>
#include <map>
#include <set>

View File

@ -1,13 +1,6 @@
#include "ReplaceCudaBuiltin.h"
#include "debug.hpp"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/ToolOutputFile.h"
#include <iostream>
#include <map>
#include <regex>
#include <set>
@ -63,18 +56,6 @@ void ReplaceKernelLaunch(llvm::Module *M) {
std::map<std::string, Function *> kernels;
std::set<llvm::Function *> need_remove;
LLVMContext *C = &M->getContext();
llvm::Type *Int32T = Type::getInt32Ty(*C);
llvm::Type *Int8T = Type::getInt8Ty(*C);
llvm::FunctionType *LauncherFuncT =
FunctionType::get(Type::getVoidTy(*C), NULL);
llvm::FunctionType *LaunchFun2 =
FunctionType::get(PointerType::get(PointerType::get(Int32T, 0), 0), NULL);
bool done = false;
std::set<std::string> cuda_register_kernel_names;
@ -160,8 +141,6 @@ void ReplaceKernelLaunch(llvm::Module *M) {
std::vector<size_t> arg_sizes;
functionOperand =
dyn_cast<Function>(callOperand->stripPointerCasts());
FunctionType *ft = calledFunction->getFunctionType();
DEBUG_INFO("Parent (Caller) Function Name: %s, "
"cudaLaunchKernel Function: %s, args : %d\n",
func_name.c_str(),

View File

@ -1,12 +1,5 @@
#include "ReplaceKernelArgs.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/ToolOutputFile.h"
#include <iostream>
#include <map>
#include <set>
@ -23,12 +16,9 @@ using namespace llvm;
// to use use-analysis to find the arguments in the future
void ReplaceKernelArg(llvm::Module *M) {
LLVMContext &context = M->getContext();
auto VoidTy = llvm::Type::getVoidTy(context);
auto I8 = llvm::Type::getInt8PtrTy(context);
std::map<std::string, Function *> kernels;
std::set<llvm::Function *> need_replace;
LLVMContext *C = &M->getContext();
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
Function *F = &(*i);

View File

@ -1,4 +1,4 @@
#include "generate_x86_format.h"
#include "generate_cpu_format.h"
#include "handle_sync.h"
#include "init.h"
#include "insert_sync.h"
@ -6,17 +6,14 @@
#include "performance.h"
#include "tool.h"
#include "warp_func.h"
#include "llvm/IR/Module.h"
#include <assert.h>
#include <fstream>
#include <iostream>
#include <llvm/Support/raw_ostream.h>
#include <map>
#include <set>
#include <stdlib.h>
using namespace llvm;
// to support constant memory variables, we need to convert information
// from kernelTranslator to HostTranslator, since HostTranslator knows nothing
// about the kernel functions, we need to write the information to a file
// by KernelTranslator and read it in HostTranslator
std::string PATH = "kernel_meta.log";
int main(int argc, char **argv) {
@ -26,8 +23,9 @@ int main(int argc, char **argv) {
std::ofstream fout;
fout.open(PATH);
// inline, and create auxiliary global variables
// inline __device__ functions, and create auxiliary global variables
init_block(program, fout);
// insert sync before each vote, and replace the
// original vote function to warp vote
handle_warp_vote(program);
@ -40,17 +38,18 @@ int main(int argc, char **argv) {
// split block by sync
split_block_by_sync(program);
// add loop for intra&intera thread
// add loops for intra- & inter-warp threads; this refers to the
// 'hierarchical collapsing' described in the COX paper.
insert_warp_loop(program);
// (TODO): replace this patch
replace_built_in_function(program);
// TODO: replace with a more general function
// Not only for x86 backend
generate_x86_format(program);
// the input kernel programs have NVIDIA metadata, they need to be replaced to
// CPU metadata
generate_cpu_format(program);
// performance optimization
// execute O3 pipeline on the transformed program
performance_optimization(program);
VerifyModule(program);

View File

@ -11,12 +11,11 @@ set(CMAKE_VERBOSE_MAKEFILE ON)
set(LIB_NAME spmd2mpmd)
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_BUILD_TYPE Debug)
include_directories(./include/x86)
include_directories(./include/cpu)
include_directories(../../common)
file(GLOB proj_HEADERS "include/x86/*.h")
file(GLOB proj_SOURCES "src/x86/*.cpp")
file(GLOB proj_HEADERS "include/cpu/*.h")
file(GLOB proj_SOURCES "src/cpu/*.cpp")
# Add core library.
add_library(${LIB_NAME} STATIC ${proj_HEADERS} ${proj_SOURCES})

View File

@ -0,0 +1,10 @@
// Public interface of the CPU-format generation pass.
// NOTE: guard renamed — identifiers beginning with a double underscore
// are reserved for the implementation in C++ ([lex.name]).
#ifndef NVVM2CPU_GENERATE_CPU_FORMAT_H
#define NVVM2CPU_GENERATE_CPU_FORMAT_H
#include "llvm/IR/Module.h"
// Rewrite a translated kernel module (in-place) into host-CPU form:
// retarget metadata, wrap kernels for pthread-style launch, and strip
// translation-only intrinsics/globals.
void generate_cpu_format(llvm::Module *M);
// Set the module's TargetTriple/DataLayout to match the host process.
void set_meta_data(llvm::Module *M);
#endif

View File

@ -1,10 +0,0 @@
#ifndef __NVVM2x86_GENERATE_X86_FORMAT__
#define __NVVM2x86_GENERATE_X86_FORMAT__
#include "llvm/IR/Module.h"
void generate_x86_format(llvm::Module *M);
void set_meta_data(llvm::Module *M);
#endif

View File

@ -0,0 +1,125 @@
#include "generate_cpu_format.h"
#include "debug.hpp"
#include "tool.h"
#include "llvm/Support/Host.h"
using namespace llvm;
// Retarget the module to the machine running this translator: the
// triple comes from the current process, so the same binary works on
// x86, ARM, or any other host LLVM supports.
void set_meta_data(llvm::Module *M) {
  // An empty DataLayout string tells LLVM to use the target's default.
  M->setDataLayout("");
  M->setTargetTriple(llvm::sys::getProcessTriple());
}
// A pthread entry point receives a single void*, so for every kernel
// we emit a "<name>_wrapper(i8*)" that reinterprets that pointer as an
// array of argument pointers, unpacks each value, and forwards them to
// the real kernel.
void decode_input(llvm::Module *M) {
  llvm::Type *Int32T = Type::getInt32Ty(M->getContext());
  llvm::Type *Int8T = Type::getInt8Ty(M->getContext());
  // Wrapper signature: void <name>_wrapper(i8*)
  llvm::FunctionType *LauncherFuncT = FunctionType::get(
      Type::getVoidTy(M->getContext()), {PointerType::get(Int8T, 0)}, false);
  // generate Wrapper Function type
  // now we only support a single int32*
  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
    Function *F = &(*i);
    if (!isKernelFunction(M, F))
      continue;
    auto func_name = F->getName().str();
    // Strip the Itanium mangling prefix (e.g. "_Z24...") by skipping the
    // leading two characters plus the run of length digits.
    // NOTE(review): assumes the kernel name is mangled; confirm behavior
    // for extern "C" kernels whose names do not start with "_Z<digits>".
    for (size_t pos = 2; pos < func_name.length(); pos++) {
      if (func_name[pos] >= '0' && func_name[pos] <= '9')
        continue;
      func_name = func_name.substr(pos);
      break;
    }
    llvm::IRBuilder<> Builder(M->getContext());
    FunctionCallee fc =
        M->getOrInsertFunction(func_name + "_wrapper", LauncherFuncT);
    Function *WorkGroup = dyn_cast<Function>(fc.getCallee());
    BasicBlock *Block = BasicBlock::Create(M->getContext(), "", WorkGroup);
    Builder.SetInsertPoint(Block);
    // WorkGroup has only a single input
    Function::arg_iterator ai = WorkGroup->arg_begin();
    SmallVector<Value *, 8> Arguments;
    Value *input_arg = &*ai;
    // convert to int**
    input_arg = Builder.CreateBitOrPointerCast(
        input_arg, PointerType::get(PointerType::get(Int32T, 0), 0));
    size_t idx = 0;
    // replace original arguments with the unpacked values
    // for example, for a function f(int* a, char* b),
    // we will generate a function f_wrapper(int** input)
    // and replace the original arguments with the unpacked values
    // e.g., a = (int*)input[0], b = (char*)input[1]
    for (Function::const_arg_iterator ii = F->arg_begin(), ee = F->arg_end();
         ii != ee; ++ii) {
      Type *ArgType = ii->getType();
      // calculate addr of the idx-th slot in the packed argument array
      Value *GEP = createGEP(Builder, input_arg, ConstantInt::get(Int32T, idx));
      // load the slot (a pointer to the actual argument value)
      GEP = createLoad(Builder, GEP);
      // bitcast to a pointer of the kernel's declared argument type
      GEP = Builder.CreateBitOrPointerCast(GEP, PointerType::get(ArgType, 0));
      Value *Arg = createLoad(Builder, GEP);
      Arguments.push_back(Arg);
      ++idx;
    }
    Builder.CreateCall(F, ArrayRef<llvm::Value *>(Arguments));
    Builder.CreateRetVoid();
  }
  // NOTE: the previous version kept a `need_remove` set here that was
  // never populated; the dead erase loop has been removed.
}
// After flat/hierarchical collapsing the NVVM barrier intrinsics are
// redundant; collect and erase every call to them.
void remove_barrier(llvm::Module *M) {
  std::vector<Instruction *> need_remove;
  for (auto F = M->begin(); F != M->end(); ++F)
    for (auto BB = F->begin(); BB != F->end(); ++BB) {
      for (auto Inst = BB->begin(); Inst != BB->end(); Inst++) {
        if (auto Call = dyn_cast<CallInst>(Inst)) {
          if (Call->isInlineAsm())
            continue;
          // getCalledFunction() is null for indirect calls; skip them
          // instead of dereferencing a null pointer.
          Function *callee = Call->getCalledFunction();
          if (!callee)
            continue;
          auto func_name = callee->getName().str();
          if (func_name == "llvm.nvvm.bar.warp.sync" ||
              func_name == "llvm.nvvm.barrier0" ||
              func_name == "llvm.nvvm.barrier.sync") {
            need_remove.push_back(Call);
          }
        }
      }
    }
  // Erase after the walk so basic-block iterators stay valid.
  for (auto inst : need_remove) {
    inst->eraseFromParent();
  }
}
// The warp-index globals inserted during hierarchical collapsing are
// translation-only helpers; drop them from the final module.
void remove_useless_var(llvm::Module *M) {
  // Guard each lookup: getGlobalVariable returns null if the variable
  // was never created (e.g. a kernel that needed no warp loops), and an
  // unconditional eraseFromParent() would then crash.
  if (auto *intra = M->getGlobalVariable("intra_warp_index"))
    intra->eraseFromParent();
  if (auto *inter = M->getGlobalVariable("inter_warp_index"))
    inter->eraseFromParent();
}
// Entry point of the pass: rewrite a translated kernel module, in
// place, into host-CPU form. The four steps run in this fixed order.
void generate_cpu_format(llvm::Module *M) {
  DEBUG_INFO("generate cpu format\n");
  // retarget TargetTriple/DataLayout to the host process
  set_meta_data(M);
  // wrap each kernel so a single packed void* argument can be unpacked
  decode_input(M);
  // drop NVVM barrier intrinsic calls made redundant by collapsing
  remove_barrier(M);
  // erase translation-only globals (intra/inter warp indices)
  remove_useless_var(M);
}

View File

@ -1,13 +1,7 @@
#include "handle_sync.h"
#include "debug.hpp"
#include "tool.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include <set>
#include <string>

View File

@ -2,26 +2,11 @@
#include "debug.hpp"
#include "memory_hierarchy.h"
#include "tool.h"
#include <fstream>
#include <iostream>
#include <set>
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/InitializePasses.h"
#include "llvm/PassInfo.h"
#include "llvm/PassRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <set>
using namespace llvm;
@ -31,11 +16,9 @@ bool inline_warp_level_func(llvm::Module *M) {
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
Function *F = &(*i);
auto func_name = F->getName().str();
if (!isKernelFunction(M, F))
continue;
Function::iterator I = F->begin();
for (Function::iterator E = F->end(); I != E; ++I) {
for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) {
for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
if (CallInst *c = dyn_cast<CallInst>(BI++)) {
if (c->getCalledFunction()) {
@ -60,8 +43,7 @@ bool inline_warp_level_func(llvm::Module *M) {
}
bool find_sreg_inst(llvm::Function *F) {
Function::iterator I = F->begin();
for (Function::iterator E = F->end(); I != E; ++I) {
for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) {
for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
if (CallInst *c = dyn_cast<CallInst>(BI++)) {
if (c->getCalledFunction()) {
@ -229,14 +211,12 @@ void llvm_preprocess(llvm::Module *M) {
Passes.run(*M);
}
// transform constant expression into sequence of instructions
bool lower_constant_expr(llvm::Module *M) {
bool modified = false;
LLVMContext &context = M->getContext();
auto I32 = llvm::Type::getInt32Ty(context);
std::vector<CallInst *> need_remove;
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
Function *F = &(*i);
auto func_name = F->getName().str();
if (!isKernelFunction(M, F))
continue;
@ -301,8 +281,8 @@ bool lower_constant_expr(llvm::Module *M) {
return modified;
}
void replace_cuda_math_built_in(llvm::Module *M) {
// replace _ZL3expd, just delete its body
void replace_cuda_math_built_in(llvm::Module *M) {
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
Function *F = &(*i);
auto func_name = F->getName().str();

View File

@ -4,29 +4,9 @@
#include "handle_sync.h"
#include "tool.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ValueSymbolTable.h"
#include "llvm/InitializePasses.h"
#include "llvm/PassInfo.h"
#include "llvm/PassRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <iostream>
#include <queue>
using namespace llvm;
@ -44,7 +24,7 @@ public:
std::vector<llvm::Instruction *> insert_intra_warp_sync_before;
std::vector<llvm::Instruction *> insert_inter_warp_sync_before;
// insert sync in the entry
// insert sync after the entry and before the first non-AllocaInst
BasicBlock *entry = &(*F.begin());
for (auto i = entry->begin(); i != entry->end(); i++) {
if (!isa<AllocaInst>(i)) {
@ -54,10 +34,8 @@ public:
}
for (Function::iterator I = F.begin(); I != F.end(); ++I) {
BasicBlock::iterator BI = I->begin();
// insert barrier before return
for (; BI != I->end(); BI++) {
for (BasicBlock::iterator BI = I->begin(); BI != I->end(); BI++) {
llvm::ReturnInst *Ret = llvm::dyn_cast<llvm::ReturnInst>(&(*BI));
if (Ret) {
insert_inter_warp_sync_before.push_back(&(*BI));
@ -125,7 +103,7 @@ public:
auto PDT = &getAnalysis<PostDominatorTreeWrapperPass>();
// first find all conditional barriers
// find all conditional barriers
std::vector<BasicBlock *> conditionalBarriers;
for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i) {
BasicBlock *b = &*i;
@ -148,12 +126,9 @@ public:
conditionalBarriers.pop_back();
// insert barrier in the start of if-condition
BasicBlock *pos = b;
BasicBlock *pred = firstNonBackedgePredecessor(b);
while (PDT->getPostDomTree().dominates(b, pred)) {
pos = pred;
// If our BB post dominates the given block, we know it is not the
// branching block that makes the barrier conditional.
pred = firstNonBackedgePredecessor(pred);
@ -468,7 +443,6 @@ public:
auto header_block = L->getHeader();
assert(header_block->getTerminator()->getNumSuccessors() == 2 &&
"has more than 2 successors of the for-head\n");
BasicBlock *for_body = NULL;
for (int i = 0; i < header_block->getTerminator()->getNumSuccessors();
i++) {
auto bb = header_block->getTerminator()->getSuccessor(i);

View File

@ -4,43 +4,20 @@
#include "handle_sync.h"
#include "tool.h"
#include <assert.h>
#include <iostream>
#include <set>
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ValueSymbolTable.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/PassInfo.h"
#include "llvm/PassRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <map>
#include <set>
#include <sstream>
#include <tuple>
#include <vector>
using namespace llvm;
@ -115,10 +92,7 @@ llvm::Instruction *GetContextArray(llvm::Instruction *instruction,
BasicBlock &bb = instruction->getParent()->getParent()->getEntryBlock();
IRBuilder<> builder(&*(bb.getFirstInsertionPt()));
Function *FF = instruction->getParent()->getParent();
Module *M = instruction->getParent()->getParent()->getParent();
LLVMContext &C = M->getContext();
const llvm::DataLayout &Layout = M->getDataLayout();
llvm::Type *elementType;
if (isa<AllocaInst>(instruction)) {
@ -129,8 +103,6 @@ llvm::Instruction *GetContextArray(llvm::Instruction *instruction,
}
Type *AllocType = elementType;
AllocaInst *InstCast = dyn_cast<AllocaInst>(instruction);
llvm::Value *ItemSize = nullptr;
llvm::AllocaInst *Alloca = nullptr;
auto block_size_addr = M->getGlobalVariable("block_size");
@ -697,9 +669,6 @@ public:
is_single_conditional_branch_block = 1;
} else {
// generate by replicate local variable
printf(
"[WARNING] match single conditional branch with HARD CODE\n");
bool branch_to_intra_init = false;
for (unsigned suc = 0; suc < br->getNumSuccessors(); ++suc) {
llvm::BasicBlock *entryCandidate = br->getSuccessor(suc);
auto block_name = entryCandidate->getName().str();
@ -755,7 +724,7 @@ public:
entry = entryCandidate;
break;
}
// delete useless PR, those PRs only have branch
// delete useless PR, those PRs only have branch instructions
if (entry == exit) {
if (entry->size() == 1 && isa<llvm::BranchInst>(entry->begin())) {
return;

View File

@ -1,29 +1,10 @@
#include "memory_hierarchy.h"
#include "debug.hpp"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <assert.h>
#include <fstream>
#include <iostream>
#include <map>
#include <set>
#include <sstream>
#include <tuple>
#include <vector>
void mem_share2global(llvm::Module *M) {
LLVMContext *C = &M->getContext();
llvm::Type *Int32T = Type::getInt32Ty(*C);
llvm::Type *Int64T = Type::getInt64Ty(*C);
llvm::Type *Int8T = Type::getInt8Ty(*C);
std::map<GlobalVariable *, GlobalVariable *> corresponding_global_memory;
std::set<llvm::Instruction *> need_remove;
std::set<GlobalVariable *> need_remove_share_memory;
@ -45,7 +26,6 @@ void mem_share2global(llvm::Module *M) {
// generate global type pointer
PointerType *PointerTy =
PointerType::get(array_type->getElementType(), 0);
llvm::Constant *x1 = ConstantPointerNull::get(PointerTy);
llvm::GlobalVariable *global_ptr = new llvm::GlobalVariable(
*M, PointerTy, false, llvm::GlobalValue::ExternalLinkage,
NULL, "dynamic_shared_memory", NULL,
@ -75,7 +55,7 @@ void mem_share2global(llvm::Module *M) {
std::pair<GlobalVariable *, GlobalVariable *>(share_memory,
global_memory));
} else if (element_type->isFloatTy()) {
auto FP_type = llvm::Type::getFloatTy(*C);
auto FP_type = llvm::Type::getFloatTy(M->getContext());
auto zero = llvm::ConstantFP::get(FP_type, 0);
llvm::GlobalVariable *global_memory = new llvm::GlobalVariable(
*M, FP_type, false, llvm::GlobalValue::ExternalLinkage, zero,
@ -128,11 +108,6 @@ void mem_share2global(llvm::Module *M) {
}
void mem_constant2global(llvm::Module *M, std::ofstream &fout) {
LLVMContext *C = &M->getContext();
llvm::Type *Int32T = Type::getInt32Ty(*C);
llvm::Type *Int64T = Type::getInt64Ty(*C);
llvm::Type *Int8T = Type::getInt8Ty(*C);
std::map<GlobalVariable *, GlobalVariable *> corresponding_global_memory;
std::set<llvm::Instruction *> need_remove;
std::set<GlobalVariable *> need_remove_constant_memory;
@ -142,7 +117,7 @@ void mem_constant2global(llvm::Module *M, std::ofstream &fout) {
if (GlobalVariable *constant_memory = dyn_cast<GlobalVariable>(I)) {
if (auto PT = dyn_cast<PointerType>(I->getType())) {
unsigned AS = PT->getAddressSpace();
if (AS == 4) { // find a share memory
if (AS == 4) { // find a constant memory
need_remove_constant_memory.insert(constant_memory);
// generate the corresponding global memory variable
auto new_name = "wrapper_global_" + constant_memory->getName().str();
@ -150,7 +125,7 @@ void mem_constant2global(llvm::Module *M, std::ofstream &fout) {
if (auto array_type = dyn_cast<ArrayType>(element_type)) {
if (constant_memory->hasExternalLinkage() &&
array_type->getArrayNumElements() == 0) {
// external shared memory of []
// external constant memory of []
// generate global type pointer
PointerType *PointerTy =
PointerType::get(array_type->getElementType(), 0);

View File

@ -1,43 +1,13 @@
#include "performance.h"
#include "debug.hpp"
#include "tool.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ValueSymbolTable.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/PassInfo.h"
#include "llvm/PassRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Host.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <map>
#include <set>
#include <sstream>
#include <tuple>
#include <vector>
using namespace llvm;
@ -53,7 +23,7 @@ void performance_optimization(llvm::Module *M) {
llvm::legacy::PassManager Passes;
// add target machine info
llvm::Triple triple("x86_64-unknown-linux-gnu");
llvm::Triple triple(llvm::sys::getProcessTriple());
std::string Error;
const Target *TheTarget = TargetRegistry::lookupTarget("", triple, Error);
@ -62,7 +32,7 @@ void performance_optimization(llvm::Module *M) {
Options.FloatABIType = FloatABI::Hard;
TargetMachine *TM = TheTarget->createTargetMachine(
triple.getTriple(), llvm::sys::getHostCPUName().str(), StringRef("+m,+f"),
triple.getTriple(), llvm::sys::getHostCPUName().str(), StringRef(""),
Options, Reloc::PIC_, CodeModel::Small, CodeGenOpt::Aggressive);
assert(TM && "No Machine Information\n");
@ -80,9 +50,6 @@ void performance_optimization(llvm::Module *M) {
Builder.LoopVectorize = true;
Builder.SLPVectorize = true;
Builder.VerifyInput = true;
Builder.VerifyOutput = true;
Builder.populateModulePassManager(Passes);
Passes.run(*M);
}

View File

@ -1,29 +1,13 @@
#include "tool.h"
#include "debug.hpp"
#include "llvm/Bitcode/BitcodeWriter.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorOr.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/ToolOutputFile.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <iostream>
#include <set>
using namespace llvm;
@ -133,7 +117,7 @@ llvm::Instruction *BreakPHIToAllocas(PHINode *phi) {
Value *val = phi->getIncomingValue(incoming);
BasicBlock *incomingBB = phi->getIncomingBlock(incoming);
builder.SetInsertPoint(incomingBB->getTerminator());
llvm::Instruction *store = builder.CreateStore(val, alloca);
builder.CreateStore(val, alloca);
}
builder.SetInsertPoint(phi);
@ -164,7 +148,6 @@ void phi2alloc(llvm::Module *M) {
}
}
bool changed = false;
for (InstructionVec::iterator i = PHIs.begin(); i != PHIs.end(); ++i) {
Instruction *instr = *i;
BreakPHIToAllocas(dyn_cast<PHINode>(instr));
@ -279,9 +262,7 @@ void replace_built_in_function(llvm::Module *M) {
for (auto BB = F->begin(); BB != F->end(); ++BB) {
for (auto BI = BB->begin(); BI != BB->end(); BI++) {
if (auto Load = dyn_cast<LoadInst>(BI)) {
auto load_from = Load->getOperand(0);
} else if (auto Call = dyn_cast<CallInst>(BI)) {
if (auto Call = dyn_cast<CallInst>(BI)) {
if (Call->getCalledFunction()) {
auto func_name = Call->getCalledFunction()->getName().str();
if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.x" ||
@ -425,7 +406,6 @@ void replace_built_in_function(llvm::Module *M) {
if (auto Call = dyn_cast<CallInst>(BI)) {
if (Call->getCalledFunction()) {
auto func_name = Call->getCalledFunction()->getName().str();
auto callFn = Call->getCalledFunction();
if (func_name == "vprintf") {
/*
* replace CUDA's printf to C's printf
@ -458,7 +438,7 @@ void replace_built_in_function(llvm::Module *M) {
dyn_cast<PointerType>(BC->getOperand(0)->getType());
auto SrcTy = SrcPointTy->getElementType();
// reverse the bitcast
auto reverse_BC = new BitCastInst(BC, SrcPointTy, "", Call);
new BitCastInst(BC, SrcPointTy, "", Call);
assert(SrcTy->isStructTy() == 1);
auto StructTy = dyn_cast<StructType>(SrcTy);
for (int i = 0; i < StructTy->getNumElements(); i++) {
@ -528,7 +508,6 @@ void replace_built_in_function(llvm::Module *M) {
void replace_asm_call(llvm::Module *M) {
LLVMContext &context = M->getContext();
auto I32 = llvm::Type::getInt32Ty(context);
std::vector<CallInst *> need_remove;
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
Function *F = &(*i);

View File

@ -2,15 +2,6 @@
#include "warp_func.h"
#include "debug.hpp"
#include "tool.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <iostream>
#include <set>
using namespace llvm;
@ -107,7 +98,7 @@ void handle_warp_vote(llvm::Module *M) {
res = BinaryOperator::CreateNot(res, "", sync_inst);
}
auto sotre_mask = new llvm::StoreInst(res, GEP, "", sync_inst);
new llvm::StoreInst(res, GEP, "", sync_inst);
// create barrier
CreateIntraWarpBarrier(sync_inst);
/*

View File

@ -1,186 +0,0 @@
#include "generate_x86_format.h"
#include "debug.hpp"
#include "tool.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/ToolOutputFile.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <iostream>
#include <map>
using namespace llvm;
// Set the module's target triple and data layout so the translated
// kernel is emitted as ordinary x86-64 Linux code.
// NOTE(review): the triple/layout are hard-coded for x86-64; an ARM
// target would need different values.
void set_meta_data(llvm::Module *M) {
M->setTargetTriple("x86_64-unknown-linux-gnu");
M->setDataLayout(
"e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128");
}
// as pthread only accepts a single void* as input,
// we have to decode this packed input inside the kernel wrapper
// For each kernel function, generate a "<name>_wrapper" function with the
// pthread-compatible signature void(int8*). The wrapper unpacks the single
// packed pointer array into the kernel's real arguments and calls the kernel.
// Also wires up per-thread dynamic shared memory when the module uses it.
void decode_input(llvm::Module *M) {
  LLVMContext *C = &M->getContext();
  llvm::Type *Int32T = Type::getInt32Ty(*C);
  llvm::Type *Int8T = Type::getInt8Ty(*C);
  // wrapper signature: void wrapper(int8*)
  llvm::FunctionType *LauncherFuncT = FunctionType::get(
      Type::getVoidTy(*C), {PointerType::get(Int8T, 0)}, false);
  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
    Function *F = &(*i);
    if (!isKernelFunction(M, F))
      continue;
    auto func_name = F->getName().str();
    // strip the C++ mangling prefix (e.g. "_Z24") to recover the
    // original kernel name: skip the leading "_Z" plus the digit run
    for (size_t pos = 2; pos < func_name.length(); pos++) {
      if (func_name[pos] >= '0' && func_name[pos] <= '9')
        continue;
      func_name = func_name.substr(pos);
      break;
    }
    llvm::IRBuilder<> Builder(M->getContext());
    FunctionCallee fc =
        M->getOrInsertFunction(func_name + "_wrapper", LauncherFuncT);
    Function *WorkGroup = dyn_cast<Function>(fc.getCallee());
    BasicBlock *Block = BasicBlock::Create(M->getContext(), "", WorkGroup);
    Builder.SetInsertPoint(Block);
    // the wrapper has exactly one argument: the packed pointer array
    Function::arg_iterator ai = WorkGroup->arg_begin();
    SmallVector<Value *, 8> Arguments;
    Value *input_arg = &*ai;
    // reinterpret the packed buffer as int**
    input_arg = Builder.CreateBitOrPointerCast(
        input_arg, PointerType::get(PointerType::get(Int32T, 0), 0));
    // dynamic shared memory: allocate a per-thread buffer in the wrapper
    GlobalVariable *share_memory = M->getGlobalVariable("wrapper_global_data");
    if (share_memory != NULL) {
      // TLS variable holding the per-thread dynamic shared-memory size;
      // its value is set by the runtime before the wrapper runs
      llvm::GlobalVariable *global_mem = new llvm::GlobalVariable(
          *M, Int32T, false, llvm::GlobalValue::ExternalLinkage, NULL,
          "thread_memory_size", NULL, llvm::GlobalValue::GeneralDynamicTLSModel,
          0, false);
      Value *loadedValue = createLoad(Builder, global_mem);
      // emit an accessor "_wrapper_global_data" (weak ODR + comdat so
      // multiple translation units can share one definition)
      llvm::FunctionType *LaunchFun2 = FunctionType::get(
          PointerType::get(PointerType::get(Int32T, 0), 0), NULL);
      FunctionCallee fc2 =
          M->getOrInsertFunction("_wrapper_global_data", LaunchFun2);
      Function *WorkGroup2 = dyn_cast<Function>(fc2.getCallee());
      WorkGroup2->setLinkage(GlobalValue::WeakODRLinkage);
      WorkGroup2->setVisibility(GlobalValue::HiddenVisibility);
      Comdat *co = M->getOrInsertComdat("_wrapper_global_data");
      co->setSelectionKind(Comdat::SelectionKind::Any);
      WorkGroup2->setComdat(co);
      BasicBlock *Block2 = BasicBlock::Create(M->getContext(), "", WorkGroup2);
      llvm::IRBuilder<> Builder2(M->getContext());
      Builder2.SetInsertPoint(Block2);
      Builder2.CreateRet(share_memory);
      // allocate the buffer sized by thread_memory_size and publish it
      // through the global so the kernel body can reach it
      AllocaInst *new_arr = Builder.CreateAlloca(Int8T, loadedValue, "new_arr");
      Value *gptr = Builder.CreateBitOrPointerCast(
          share_memory, PointerType::get(PointerType::get(Int8T, 0), 0));
      Builder.CreateStore(new_arr, gptr);
    }
    // unpack each kernel argument from the pointer array
    size_t idx = 0;
    for (Function::const_arg_iterator ii = F->arg_begin(), ee = F->arg_end();
         ii != ee; ++ii) {
      Type *ArgType = ii->getType();
      // address of the idx-th slot
      Value *GEP = createGEP(Builder, input_arg, ConstantInt::get(Int32T, idx));
      // load the stored pointer
      GEP = createLoad(Builder, GEP);
      // cast it to a pointer to the real argument type and load the value
      GEP = Builder.CreateBitOrPointerCast(GEP, PointerType::get(ArgType, 0));
      Value *Arg = createLoad(Builder, GEP);
      Arguments.push_back(Arg);
      ++idx;
    }
    Builder.CreateCall(F, ArrayRef<llvm::Value *>(Arguments));
    Builder.CreateRetVoid();
  }
}
// Delete the NVVM barrier intrinsic calls (warp sync / block barriers);
// they are collected first and erased afterwards so the basic-block
// iterators stay valid during the scan.
void remove_barrier(llvm::Module *M) {
  std::vector<Instruction *> need_remove;
  for (auto F = M->begin(); F != M->end(); ++F)
    for (auto BB = F->begin(); BB != F->end(); ++BB) {
      for (auto BI = BB->begin(); BI != BB->end(); BI++) {
        if (auto Call = dyn_cast<CallInst>(BI)) {
          if (Call->isInlineAsm())
            continue;
          // indirect calls have no callee Function; skip them instead of
          // dereferencing a null pointer below
          if (!Call->getCalledFunction())
            continue;
          auto func_name = Call->getCalledFunction()->getName().str();
          if (func_name == "llvm.nvvm.bar.warp.sync" ||
              func_name == "llvm.nvvm.barrier0" ||
              func_name == "llvm.nvvm.barrier.sync") {
            need_remove.push_back(Call);
          }
        }
      }
    }
  for (auto inst : need_remove) {
    inst->eraseFromParent();
  }
}
// Erase the warp-index globals that are only needed while transforming
// the kernel, not in the final CPU module. Guard against modules where
// a variable was never created: getGlobalVariable returns null then,
// and the original code would dereference it.
void remove_useless_var(llvm::Module *M) {
  if (auto *intra = M->getGlobalVariable("intra_warp_index"))
    intra->eraseFromParent();
  if (auto *inter = M->getGlobalVariable("inter_warp_index"))
    inter->eraseFromParent();
}
// Entry point of this pass: lower the transformed kernel module into a
// plain x86 module that the host toolchain can compile and link.
void generate_x86_format(llvm::Module *M) {
DEBUG_INFO("generate x86 format\n");
// change metadata: set the x86-64 target triple and data layout
set_meta_data(M);
// decode argument: build *_wrapper functions that unpack the single
// void* pthread argument into the kernel's real arguments
decode_input(M);
// remove barrier: NVVM barrier intrinsics are no longer needed
remove_barrier(M);
// remove useless variables left over from the transformation
remove_useless_var(M);
}

View File

@ -1,9 +1,9 @@
cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
project(
X86runtime
DESCRIPTION "Implementation CUDA runtime API with x86"
CPUruntime
DESCRIPTION "Implementation CUDA runtime API with CPUs"
LANGUAGES CXX)
set(LIB_NAME x86Runtime)
set(LIB_NAME CPUruntime)
set(CMAKE_VERBOSE_MAKEFILE ON)
# compile threadPool implementation
@ -12,9 +12,9 @@ add_subdirectory(threadPool)
# compile x86 runtime library
include_directories(../common)
include_directories(./include/)
include_directories(./include/x86)
include_directories(./include/cpu)
include_directories(./threadPool/include/)
include_directories(./threadPool/include/x86)
include_directories(./threadPool/include/cpu)
include_directories(../external/moodycamel/)
file(GLOB proj_SOURCES "src/x86/*.cpp")
file(GLOB proj_SOURCES "src/cpu/*.cpp")
add_library(${LIB_NAME} SHARED ${proj_SOURCES})

View File

@ -1,4 +1,4 @@
#ifndef __RUNTIME_IMPL__
#ifndef __KERNEL_IMPL__
#define __KERNEL_IMPL__
#include "structures.h"
#include <stdint.h>

View File

@ -10,27 +10,31 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
cudaError_t cudaGetDevice(int *devPtr) {
*devPtr = 0;
return cudaSuccess;
}
const char *cudaGetErrorName(cudaError_t error) { return "SUCCESS\n"; }
cudaError_t cudaDeviceReset(void) {
scheduler_uninit();
return cudaSuccess;
}
cudaError_t cudaDeviceReset(void) { return cudaSuccess; }
cudaError_t cudaDeviceSynchronize(void) {
cuSynchronizeBarrier();
return cudaSuccess;
}
cudaError_t cudaThreadSynchronize(void) {
cuSynchronizeBarrier();
return cudaSuccess;
}
cudaError_t cudaFree(void *devPtr) {
free(devPtr);
return cudaSuccess;
}
cudaError_t cudaFreeHost(void *devPtr) {
free(devPtr);
return cudaSuccess;
@ -47,20 +51,22 @@ cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
cu_kernel *ker =
create_kernel(func, gridDim, blockDim, args, sharedMem, stream);
int lstatus = cuLaunchKernel(&ker);
cuLaunchKernel(&ker);
return cudaSuccess;
}
cudaError_t cudaMalloc(void **devPtr, size_t size) {
*devPtr = malloc(size);
if (devPtr == NULL)
return cudaErrorMemoryAllocation;
return cudaSuccess;
}
cudaError_t cudaMemset(void *devPtr, int value, size_t count) {
memset(devPtr, value, count);
return cudaSuccess;
}
cudaError_t cudaMemcpy(void *dst, const void *src, size_t count,
cudaMemcpyKind kind) {
if (kind == cudaMemcpyHostToHost) {
@ -105,7 +111,6 @@ cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src) {
return cudaSuccess;
}
static int stream_counter = 1;
/*
From our evaluation, CPU backend can gain little benefit
from multi stream. Thus, we only use single stream
@ -159,6 +164,8 @@ static cudaError_t lastError = cudaSuccess;
const char *cudaGetErrorString(cudaError_t error) {
if (error == cudaSuccess) {
return "Cuda Get Error Success";
} else {
return "Cuda Get Error Failed";
}
}

View File

@ -10,11 +10,10 @@ set(CMAKE_VERBOSE_MAKEFILE ON)
set(LIB_NAME threadPool)
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_BUILD_TYPE Debug)
include_directories(../../common)
include_directories(./include)
include_directories(./include/x86)
include_directories(./include/cpu)
include_directories(../../external/moodycamel)
file(GLOB proj_SOURCES "src/x86/*.cpp")
file(GLOB proj_SOURCES "src/cpu/*.cpp")
add_library(${LIB_NAME} SHARED ${proj_SOURCES})

View File

@ -1,3 +1,19 @@
/*
This file contains the implementation of the CPU thread pool. For a kernel
launch, the host thread will enqueue the kernel to the kernelQueue, and the
threads in the thread pool will try to fetch work from the queue. After a
thread fetches a kernel from the queue, it will execute the kernel. After the
kernel execution, the thread will try to fetch another kernel from the queue.
If the queue is empty, the thread will wait for the next kernel launch.
By default, we try to use all CPU cores for execution. Thus, for a kernel
launch, the host thread pushes P kernel variables to the queue, where P is the
number of CPU cores.
For some lightweight kernels, using fewer CPU cores can speed up the overall
execution, because fewer CPU cores lead to lower synchronization overhead.
*/
#include "api.h"
#include "blockingconcurrentqueue.h"
#include "debug.hpp"
@ -9,9 +25,6 @@
#include <stdlib.h>
#include <thread>
/*
Initialize the device
*/
int device_max_compute_units = 1;
bool device_initilized = false;
int init_device() {
@ -32,7 +45,6 @@ int init_device() {
}
// Create Kernel
static int kernelIds = 0;
cu_kernel *create_kernel(const void *func, dim3 gridDim, dim3 blockDim,
void **args, size_t sharedMem, cudaStream_t stream) {
cu_kernel *ker = (cu_kernel *)calloc(1, sizeof(cu_kernel));
@ -71,13 +83,11 @@ __thread int warp_shfl[32] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
/*
Enqueue Kernel (k) to the scheduler kernelQueue
*/
int TaskToExecute;
// Enqueue Kernel to the scheduler kernelQueue
int schedulerEnqueueKernel(cu_kernel *k) {
int totalBlocks =
k->totalBlocks; // calculate gpu_block_to_execute_per_cpu_thread
int totalBlocks = k->totalBlocks;
// by default, all CPU cores are used to execute GPU blocks equally
int gpuBlockToExecutePerCpuThread =
(totalBlocks + device_max_compute_units - 1) / device_max_compute_units;
TaskToExecute = (totalBlocks + gpuBlockToExecutePerCpuThread - 1) /
@ -93,28 +103,21 @@ int schedulerEnqueueKernel(cu_kernel *k) {
return C_SUCCESS;
}
/*
Kernel Launch with numBlocks and numThreadsPerBlock
*/
// Push kernel to the kernelQueue
int cuLaunchKernel(cu_kernel **k) {
if (!device_initilized) {
init_device();
}
// Calculate Block Size N/numBlocks
cu_kernel *ker = *k;
int status = C_RUN;
// set complete to false, this variable is used for sync
for (int i = 0; i < scheduler->num_worker_threads; i++) {
scheduler->thread_pool[i].completeTask = 0;
}
schedulerEnqueueKernel(ker);
schedulerEnqueueKernel(*k);
return 0;
}
/*
Thread Gets Work
*/
// threads in thread-pool try to fetch work from the queue
int get_work(c_thread *th) {
int dynamic_shared_mem_size = 0;
dim3 gridDim;
@ -136,6 +139,7 @@ int get_work(c_thread *th) {
grid_size_x = gridDim.x;
grid_size_y = gridDim.y;
grid_size_z = gridDim.z;
// allocate dynamic shared memory
if (dynamic_shared_mem_size > 0)
dynamic_shared_memory = (int *)malloc(dynamic_shared_mem_size);
// execute GPU blocks
@ -153,7 +157,8 @@ int get_work(c_thread *th) {
}
// if cannot get tasks, check whether programs stop
if (scheduler->threadpool_shutdown_requested) {
return true; // thread exit
// thread exit
break;
}
}
return 0;
@ -176,9 +181,7 @@ void *driver_thread(void *p) {
}
}
/*
Initialize the scheduler
*/
// Initialize the scheduler
int scheduler_init(cu_device device) {
scheduler = (cu_pool *)calloc(1, sizeof(cu_pool));
scheduler->num_worker_threads = device.max_compute_units;
@ -198,8 +201,6 @@ int scheduler_init(cu_device device) {
return C_SUCCESS;
}
void scheduler_uninit() { assert(0 && "Scheduler Unitit no Implemente\n"); }
/*
Barrier for Kernel Launch
*/

View File

@ -30,7 +30,7 @@ g++ -o $1 -fPIC -no-pie \
$HeteroMark_PATH/src/$1/cuda/main.cc host.o kernel.o $HeteroMark_PATH/src/$1/*.cc $HeteroMark_PATH/src/common/benchmark/*.cc \
$HeteroMark_PATH/src/common/command_line_option/*.cc $HeteroMark_PATH/src/common/time_measurement/*.cc \
-L$CuPBoP_BUILD_PATH/runtime -L$CuPBoP_BUILD_PATH/runtime/threadPool \
-I$HeteroMark_PATH -I$CUDA_PATH/include -lpthread -lc -lx86Runtime -lthreadPool
-I$HeteroMark_PATH -I$CUDA_PATH/include -lpthread -lc -lCPUruntime -lthreadPool
case $1 in
aes)