From fd56811650dbdea1254bf80a46e748eecfa62ff3 Mon Sep 17 00:00:00 2001
From: Ruobing Han <hanruobing@gatech.edu>
Date: Wed, 13 Dec 2023 14:29:17 -0500
Subject: [PATCH] Refactoring the codebase. Remove useless variables; Add
 comments; Remove useless header files; Remove hard code and support both x86
 and ARM CPU

---
 .github/workflows/build.yml                   |   6 +-
 CMakeLists.txt                                |  19 +-
 README.md                                     |   2 +-
 compilation/CMakeLists.txt                    |  12 +-
 compilation/HostTranslation.cpp               |   5 -
 compilation/HostTranslation/CMakeLists.txt    |   7 +-
 .../include/{x86 => cpu}/RemoveCudaBuiltin.h  |   0
 .../include/{x86 => cpu}/RemoveMetadata.h     |   0
 .../{x86 => cpu}/ReplaceConstantMemory.h      |   0
 .../include/{x86 => cpu}/ReplaceCudaBuiltin.h |   0
 .../include/{x86 => cpu}/ReplaceKernelArgs.h  |   0
 .../src/{x86 => cpu}/RemoveCudaBuiltin.cpp    |   8 -
 .../src/{x86 => cpu}/RemoveMetadata.cpp       |  16 +-
 .../{x86 => cpu}/ReplaceConstantMemory.cpp    |   5 -
 .../src/{x86 => cpu}/ReplaceCudaBuiltin.cpp   |  21 --
 .../src/{x86 => cpu}/ReplaceKernelArgs.cpp    |  10 -
 compilation/KernelTranslation.cpp             |  29 ++-
 compilation/KernelTranslation/CMakeLists.txt  |   7 +-
 .../include/cpu/generate_cpu_format.h         |  10 +
 .../include/{x86 => cpu}/handle_sync.h        |   0
 .../include/{x86 => cpu}/init.h               |   0
 .../include/{x86 => cpu}/insert_sync.h        |   0
 .../include/{x86 => cpu}/insert_warp_loop.h   |   0
 .../include/{x86 => cpu}/memory_hierarchy.h   |   0
 .../include/{x86 => cpu}/performance.h        |   0
 .../include/{x86 => cpu}/tool.h               |   0
 .../include/{x86 => cpu}/warp_func.h          |   0
 .../include/x86/generate_x86_format.h         |  10 -
 .../src/cpu/generate_cpu_format.cpp           | 125 ++++++++++++
 .../src/{x86 => cpu}/handle_sync.cpp          |   6 -
 .../src/{x86 => cpu}/init.cpp                 |  30 +--
 .../src/{x86 => cpu}/insert_sync.cpp          |  32 +--
 .../src/{x86 => cpu}/insert_warp_loop.cpp     |  33 +---
 .../src/{x86 => cpu}/memory_hierarchy.cpp     |  31 +--
 .../src/{x86 => cpu}/performance.cpp          |  37 +---
 .../src/{x86 => cpu}/tool.cpp                 |  29 +--
 .../src/{x86 => cpu}/warp_func.cpp            |  11 +-
 .../src/x86/generate_x86_format.cpp           | 186 ------------------
 runtime/CMakeLists.txt                        |  12 +-
 runtime/include/{x86 => cpu}/cudaKernelImpl.h |   2 +-
 .../include/{x86 => cpu}/cudaRuntimeImpl.h    |   0
 runtime/src/{x86 => cpu}/cudaKernelImpl.cpp   |   0
 runtime/src/{x86 => cpu}/cudaRuntimeImpl.cpp  |  21 +-
 runtime/threadPool/CMakeLists.txt             |   5 +-
 runtime/threadPool/include/{x86 => cpu}/api.h |   0
 runtime/threadPool/include/{x86 => cpu}/def.h |   0
 .../threadPool/include/{x86 => cpu}/macros.h  |   0
 .../include/{x86 => cpu}/structures.h         |   0
 runtime/threadPool/src/{x86 => cpu}/api.cpp   |  51 ++---
 test/runHeteroMark.sh                         |   2 +-
 50 files changed, 249 insertions(+), 531 deletions(-)
 rename compilation/HostTranslation/include/{x86 => cpu}/RemoveCudaBuiltin.h (100%)
 rename compilation/HostTranslation/include/{x86 => cpu}/RemoveMetadata.h (100%)
 rename compilation/HostTranslation/include/{x86 => cpu}/ReplaceConstantMemory.h (100%)
 rename compilation/HostTranslation/include/{x86 => cpu}/ReplaceCudaBuiltin.h (100%)
 rename compilation/HostTranslation/include/{x86 => cpu}/ReplaceKernelArgs.h (100%)
 rename compilation/HostTranslation/src/{x86 => cpu}/RemoveCudaBuiltin.cpp (93%)
 rename compilation/HostTranslation/src/{x86 => cpu}/RemoveMetadata.cpp (66%)
 rename compilation/HostTranslation/src/{x86 => cpu}/ReplaceConstantMemory.cpp (96%)
 rename compilation/HostTranslation/src/{x86 => cpu}/ReplaceCudaBuiltin.cpp (94%)
 rename compilation/HostTranslation/src/{x86 => cpu}/ReplaceKernelArgs.cpp (88%)
 create mode 100644 compilation/KernelTranslation/include/cpu/generate_cpu_format.h
 rename compilation/KernelTranslation/include/{x86 => cpu}/handle_sync.h (100%)
 rename compilation/KernelTranslation/include/{x86 => cpu}/init.h (100%)
 rename compilation/KernelTranslation/include/{x86 => cpu}/insert_sync.h (100%)
 rename compilation/KernelTranslation/include/{x86 => cpu}/insert_warp_loop.h (100%)
 rename compilation/KernelTranslation/include/{x86 => cpu}/memory_hierarchy.h (100%)
 rename compilation/KernelTranslation/include/{x86 => cpu}/performance.h (100%)
 rename compilation/KernelTranslation/include/{x86 => cpu}/tool.h (100%)
 rename compilation/KernelTranslation/include/{x86 => cpu}/warp_func.h (100%)
 delete mode 100644 compilation/KernelTranslation/include/x86/generate_x86_format.h
 create mode 100644 compilation/KernelTranslation/src/cpu/generate_cpu_format.cpp
 rename compilation/KernelTranslation/src/{x86 => cpu}/handle_sync.cpp (90%)
 rename compilation/KernelTranslation/src/{x86 => cpu}/init.cpp (93%)
 rename compilation/KernelTranslation/src/{x86 => cpu}/insert_sync.cpp (95%)
 rename compilation/KernelTranslation/src/{x86 => cpu}/insert_warp_loop.cpp (96%)
 rename compilation/KernelTranslation/src/{x86 => cpu}/memory_hierarchy.cpp (90%)
 rename compilation/KernelTranslation/src/{x86 => cpu}/performance.cpp (60%)
 rename compilation/KernelTranslation/src/{x86 => cpu}/tool.cpp (96%)
 rename compilation/KernelTranslation/src/{x86 => cpu}/warp_func.cpp (95%)
 delete mode 100644 compilation/KernelTranslation/src/x86/generate_x86_format.cpp
 rename runtime/include/{x86 => cpu}/cudaKernelImpl.h (96%)
 rename runtime/include/{x86 => cpu}/cudaRuntimeImpl.h (100%)
 rename runtime/src/{x86 => cpu}/cudaKernelImpl.cpp (100%)
 rename runtime/src/{x86 => cpu}/cudaRuntimeImpl.cpp (97%)
 rename runtime/threadPool/include/{x86 => cpu}/api.h (100%)
 rename runtime/threadPool/include/{x86 => cpu}/def.h (100%)
 rename runtime/threadPool/include/{x86 => cpu}/macros.h (100%)
 rename runtime/threadPool/include/{x86 => cpu}/structures.h (100%)
 rename runtime/threadPool/src/{x86 => cpu}/api.cpp (82%)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 58fe7c7..8a77a6c 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -51,7 +51,7 @@ jobs:
           hostTranslator reverse-host-x86_64-unknown-linux-gnu.bc host.bc
           llc --relocation-model=pic --filetype=obj  kernel.bc
           llc --relocation-model=pic --filetype=obj  host.bc
-          g++ -o reverse -fPIC -no-pie -L${{ github.workspace }}/build/runtime   -L${{ github.workspace }}/build/runtime/threadPool   host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread
+          g++ -o reverse -fPIC -no-pie -L${{ github.workspace }}/build/runtime   -L${{ github.workspace }}/build/runtime/threadPool   host.o kernel.o -lc -lCPUruntime -lthreadPool -lpthread
           ./reverse
       - name: Execute the dynamic shared memory demo
         run: |
@@ -63,7 +63,7 @@ jobs:
           hostTranslator reverse-host-x86_64-unknown-linux-gnu.bc host.bc
           llc --relocation-model=pic --filetype=obj  kernel.bc
           llc --relocation-model=pic --filetype=obj  host.bc
-          g++ -o reverse -fPIC -no-pie -L${{ github.workspace }}/build/runtime   -L${{ github.workspace }}/build/runtime/threadPool   host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread
+          g++ -o reverse -fPIC -no-pie -L${{ github.workspace }}/build/runtime   -L${{ github.workspace }}/build/runtime/threadPool   host.o kernel.o -lc -lCPUruntime -lthreadPool -lpthread
           ./reverse
       - name: Execute Hetero-mark benchmark
         run: |
@@ -79,5 +79,5 @@ jobs:
           hostTranslator kernel_gpu_cuda_wrapper-host-x86_64-unknown-linux-gnu.bc host.bc
           llc --relocation-model=pic --filetype=obj  kernel.bc
           llc --relocation-model=pic --filetype=obj  host.bc
-          g++ -o lavaMD -fPIC -no-pie -I${{ github.workspace }}/runtime/threadPool/include -I${{ github.workspace }}/cuda-10.1/include -L${{ github.workspace }}/build/runtime   -L${{ github.workspace }}/build/runtime/threadPool   main.c host.o kernel.o util/timer/timer.c util/num/num.c -lpthread -lc -lx86Runtime -lthreadPool -pthread
+          g++ -o lavaMD -fPIC -no-pie -I${{ github.workspace }}/runtime/threadPool/include -I${{ github.workspace }}/cuda-10.1/include -L${{ github.workspace }}/build/runtime   -L${{ github.workspace }}/build/runtime/threadPool   main.c host.o kernel.o util/timer/timer.c util/num/num.c -lpthread -lc -lCPUruntime -lthreadPool -pthread
           ./lavaMD -boxes1d 10
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d889303..cf10d4a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,13 +1,9 @@
 cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
 
-project(CudaOnX86)
-set(CMAKE_PROJECT_DESCRIPTION "Executing CUDA on X86 architecture.")
+project(CuPBoP)
+set(CMAKE_PROJECT_DESCRIPTION "Executing CUDA on non-NVIDIA architecture.")
 set(CMAKE_CXX_STANDARD "14")
-set(MAJOR_VERSION 0)
-set(MINOR_VERSION 1)
-set(VERSION_STRING ${MAJOR_VERSION}.${MINOR_VERSION})
-set(COX_VERSION ${VERSION_STRING})
-# get LLVM PATH get PATH for head file
+
 if(DEFINED LLVM_CONFIG_PATH)
   if(IS_ABSOLUTE "${LLVM_CONFIG_PATH}")
     if(EXISTS "${LLVM_CONFIG_PATH}")
@@ -32,7 +28,7 @@ if(DEFINED LLVM_CONFIG_PATH)
 else()
   message(FATAL_ERROR "llvm-config is required")
 endif()
-# get CUDA PATH
+
 if(DEFINED CUDA_PATH)
   message(STATUS "Using CUDA: ${CUDA_PATH}")
 else()
@@ -45,7 +41,7 @@ if(DEBUG)
 endif()
 
 set(CMAKE_CXX_FLAGS
-    "-I${CUDA_PATH}/include ${LLVM_CXX_FLAG} ${CMAKE_CXX_FLAGS}")
+    "-I${CUDA_PATH}/include ${LLVM_CXX_FLAG} ${CMAKE_CXX_FLAGS} -Wunused")
 
 set(GCC_COVERAGE_LINK_FLAGS
     "-L${LLVM_LIB_PATH} ${LLVM_LINK_FLAG} -lz -lrt -ldl -ltinfo -lpthread -lm")
@@ -54,5 +50,8 @@ add_subdirectory(compilation)
 add_subdirectory(runtime)
 enable_testing()
 
-option(HETERO_MARK_DATA "The path to download hetero-mark dataset." /tmp/data)
+set(HETERO_MARK_DATA
+    "/tmp/data"
+    CACHE PATH "The path to download hetero-mark dataset.")
+
 add_subdirectory(test)
diff --git a/README.md b/README.md
index 435d225..34986f2 100644
--- a/README.md
+++ b/README.md
@@ -75,7 +75,7 @@ g++ -o vecadd -fPIC -no-pie \
       -L$CuPBoP_PATH/build/runtime  \
       -L$CuPBoP_PATH/build/runtime/threadPool \
       host.o kernel.o \
-      -I../.. -lc -lx86Runtime -lthreadPool -lpthread
+      -I../.. -lc -lCPUruntime -lthreadPool -lpthread
 # Execute
 ./vecadd
 ```
diff --git a/compilation/CMakeLists.txt b/compilation/CMakeLists.txt
index e0f8bbc..7c54c29 100644
--- a/compilation/CMakeLists.txt
+++ b/compilation/CMakeLists.txt
@@ -1,20 +1,16 @@
 cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
-project(
-  NVVM2X86
-  DESCRIPTION "Translate NVVM IR to LLVM IR for X86 backend"
-  LANGUAGES CXX)
 
 set(CMAKE_VERBOSE_MAKEFILE ON)
 
-# compile kernel translator
-include_directories(./KernelTranslation/include/x86)
+# build kernel translator
+include_directories(./KernelTranslation/include/cpu)
 add_subdirectory(KernelTranslation)
 
 add_executable(kernelTranslator KernelTranslation.cpp)
 target_link_libraries(kernelTranslator spmd2mpmd ${GCC_COVERAGE_LINK_FLAGS})
 
-# compile host translator
-include_directories(./HostTranslation/include/x86)
+# build host translator
+include_directories(./HostTranslation/include/cpu)
 add_subdirectory(HostTranslation)
 
 add_executable(hostTranslator HostTranslation.cpp)
diff --git a/compilation/HostTranslation.cpp b/compilation/HostTranslation.cpp
index cb32b34..02a7d9c 100644
--- a/compilation/HostTranslation.cpp
+++ b/compilation/HostTranslation.cpp
@@ -4,12 +4,7 @@
 #include "ReplaceCudaBuiltin.h"
 #include "ReplaceKernelArgs.h"
 #include "tool.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Verifier.h"
 #include <assert.h>
-#include <fstream>
-#include <iostream>
-#include <stdlib.h>
 
 using namespace llvm;
 
diff --git a/compilation/HostTranslation/CMakeLists.txt b/compilation/HostTranslation/CMakeLists.txt
index cf893ac..4fd9cb8 100644
--- a/compilation/HostTranslation/CMakeLists.txt
+++ b/compilation/HostTranslation/CMakeLists.txt
@@ -11,12 +11,11 @@ set(CMAKE_VERBOSE_MAKEFILE ON)
 set(LIB_NAME cudaRuntime2cpuRuntime)
 
 set(CMAKE_CXX_STANDARD 14)
-set(CMAKE_BUILD_TYPE Debug)
-include_directories(./include/x86)
+include_directories(./include/cpu)
 include_directories(../../common)
 
-file(GLOB proj_HEADERS "include/x86/*.h")
-file(GLOB proj_SOURCES "src/x86/*.cpp")
+file(GLOB proj_HEADERS "include/cpu/*.h")
+file(GLOB proj_SOURCES "src/cpu/*.cpp")
 
 # Add core library.
 add_library(${LIB_NAME} SHARED ${proj_HEADERS} ${proj_SOURCES})
diff --git a/compilation/HostTranslation/include/x86/RemoveCudaBuiltin.h b/compilation/HostTranslation/include/cpu/RemoveCudaBuiltin.h
similarity index 100%
rename from compilation/HostTranslation/include/x86/RemoveCudaBuiltin.h
rename to compilation/HostTranslation/include/cpu/RemoveCudaBuiltin.h
diff --git a/compilation/HostTranslation/include/x86/RemoveMetadata.h b/compilation/HostTranslation/include/cpu/RemoveMetadata.h
similarity index 100%
rename from compilation/HostTranslation/include/x86/RemoveMetadata.h
rename to compilation/HostTranslation/include/cpu/RemoveMetadata.h
diff --git a/compilation/HostTranslation/include/x86/ReplaceConstantMemory.h b/compilation/HostTranslation/include/cpu/ReplaceConstantMemory.h
similarity index 100%
rename from compilation/HostTranslation/include/x86/ReplaceConstantMemory.h
rename to compilation/HostTranslation/include/cpu/ReplaceConstantMemory.h
diff --git a/compilation/HostTranslation/include/x86/ReplaceCudaBuiltin.h b/compilation/HostTranslation/include/cpu/ReplaceCudaBuiltin.h
similarity index 100%
rename from compilation/HostTranslation/include/x86/ReplaceCudaBuiltin.h
rename to compilation/HostTranslation/include/cpu/ReplaceCudaBuiltin.h
diff --git a/compilation/HostTranslation/include/x86/ReplaceKernelArgs.h b/compilation/HostTranslation/include/cpu/ReplaceKernelArgs.h
similarity index 100%
rename from compilation/HostTranslation/include/x86/ReplaceKernelArgs.h
rename to compilation/HostTranslation/include/cpu/ReplaceKernelArgs.h
diff --git a/compilation/HostTranslation/src/x86/RemoveCudaBuiltin.cpp b/compilation/HostTranslation/src/cpu/RemoveCudaBuiltin.cpp
similarity index 93%
rename from compilation/HostTranslation/src/x86/RemoveCudaBuiltin.cpp
rename to compilation/HostTranslation/src/cpu/RemoveCudaBuiltin.cpp
index f74b816..acb8024 100644
--- a/compilation/HostTranslation/src/x86/RemoveCudaBuiltin.cpp
+++ b/compilation/HostTranslation/src/cpu/RemoveCudaBuiltin.cpp
@@ -3,16 +3,8 @@
  */
 #include "RemoveCudaBuiltin.h"
 #include "debug.hpp"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
 #include "llvm/Support/ToolOutputFile.h"
-#include "llvm/Transforms/Utils/CtorUtils.h"
-#include <iostream>
-#include <map>
 #include <set>
 
 using namespace llvm;
diff --git a/compilation/HostTranslation/src/x86/RemoveMetadata.cpp b/compilation/HostTranslation/src/cpu/RemoveMetadata.cpp
similarity index 66%
rename from compilation/HostTranslation/src/x86/RemoveMetadata.cpp
rename to compilation/HostTranslation/src/cpu/RemoveMetadata.cpp
index 515befc..4d7ddc3 100644
--- a/compilation/HostTranslation/src/x86/RemoveMetadata.cpp
+++ b/compilation/HostTranslation/src/cpu/RemoveMetadata.cpp
@@ -1,16 +1,14 @@
 #include "RemoveMetadata.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/ToolOutputFile.h"
-#include <iostream>
+#include "llvm/Support/Host.h"
 
 using namespace llvm;
 
 void RemoveMetadata(llvm::Module *M) {
+  // change the target triple to the host triple
+  M->setTargetTriple(llvm::sys::getProcessTriple());
+  // use the default DataLayout
+  M->setDataLayout("");
+
   SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
   for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
     Function *F = &(*i);
@@ -22,5 +20,7 @@ void RemoveMetadata(llvm::Module *M) {
     F->removeFnAttr("min-legal-vector-width");
     F->removeFnAttr("no-trapping-math");
     F->removeFnAttr(llvm::Attribute::OptimizeNone);
+    F->removeFnAttr("target-cpu");
+    F->removeFnAttr("target-features");
   }
 }
diff --git a/compilation/HostTranslation/src/x86/ReplaceConstantMemory.cpp b/compilation/HostTranslation/src/cpu/ReplaceConstantMemory.cpp
similarity index 96%
rename from compilation/HostTranslation/src/x86/ReplaceConstantMemory.cpp
rename to compilation/HostTranslation/src/cpu/ReplaceConstantMemory.cpp
index 48b0a0f..adb191b 100644
--- a/compilation/HostTranslation/src/x86/ReplaceConstantMemory.cpp
+++ b/compilation/HostTranslation/src/cpu/ReplaceConstantMemory.cpp
@@ -1,12 +1,7 @@
 #include "ReplaceConstantMemory.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
 #include <assert.h>
 #include <fstream>
-#include <iostream>
 #include <map>
 #include <set>
 
diff --git a/compilation/HostTranslation/src/x86/ReplaceCudaBuiltin.cpp b/compilation/HostTranslation/src/cpu/ReplaceCudaBuiltin.cpp
similarity index 94%
rename from compilation/HostTranslation/src/x86/ReplaceCudaBuiltin.cpp
rename to compilation/HostTranslation/src/cpu/ReplaceCudaBuiltin.cpp
index 5c1ea5d..23f2df3 100644
--- a/compilation/HostTranslation/src/x86/ReplaceCudaBuiltin.cpp
+++ b/compilation/HostTranslation/src/cpu/ReplaceCudaBuiltin.cpp
@@ -1,13 +1,6 @@
 #include "ReplaceCudaBuiltin.h"
 #include "debug.hpp"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/ToolOutputFile.h"
-#include <iostream>
 #include <map>
 #include <regex>
 #include <set>
@@ -63,18 +56,6 @@ void ReplaceKernelLaunch(llvm::Module *M) {
   std::map<std::string, Function *> kernels;
 
   std::set<llvm::Function *> need_remove;
-  LLVMContext *C = &M->getContext();
-
-  llvm::Type *Int32T = Type::getInt32Ty(*C);
-  llvm::Type *Int8T = Type::getInt8Ty(*C);
-
-  llvm::FunctionType *LauncherFuncT =
-      FunctionType::get(Type::getVoidTy(*C), NULL);
-
-  llvm::FunctionType *LaunchFun2 =
-      FunctionType::get(PointerType::get(PointerType::get(Int32T, 0), 0), NULL);
-
-  bool done = false;
 
   std::set<std::string> cuda_register_kernel_names;
 
@@ -160,8 +141,6 @@ void ReplaceKernelLaunch(llvm::Module *M) {
                 std::vector<size_t> arg_sizes;
                 functionOperand =
                     dyn_cast<Function>(callOperand->stripPointerCasts());
-
-                FunctionType *ft = calledFunction->getFunctionType();
                 DEBUG_INFO("Parent (Caller) Function Name: %s, "
                            "cudaLaunchKernel Function: %s, args : %d\n",
                            func_name.c_str(),
diff --git a/compilation/HostTranslation/src/x86/ReplaceKernelArgs.cpp b/compilation/HostTranslation/src/cpu/ReplaceKernelArgs.cpp
similarity index 88%
rename from compilation/HostTranslation/src/x86/ReplaceKernelArgs.cpp
rename to compilation/HostTranslation/src/cpu/ReplaceKernelArgs.cpp
index 501a783..6d6212c 100644
--- a/compilation/HostTranslation/src/x86/ReplaceKernelArgs.cpp
+++ b/compilation/HostTranslation/src/cpu/ReplaceKernelArgs.cpp
@@ -1,12 +1,5 @@
 #include "ReplaceKernelArgs.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/ToolOutputFile.h"
-#include <iostream>
 #include <map>
 #include <set>
 
@@ -23,12 +16,9 @@ using namespace llvm;
 // to use use-analysis to find the arguments in the future
 void ReplaceKernelArg(llvm::Module *M) {
   LLVMContext &context = M->getContext();
-  auto VoidTy = llvm::Type::getVoidTy(context);
-  auto I8 = llvm::Type::getInt8PtrTy(context);
   std::map<std::string, Function *> kernels;
 
   std::set<llvm::Function *> need_replace;
-  LLVMContext *C = &M->getContext();
 
   for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
     Function *F = &(*i);
diff --git a/compilation/KernelTranslation.cpp b/compilation/KernelTranslation.cpp
index 2dd318b..bb2d0b1 100644
--- a/compilation/KernelTranslation.cpp
+++ b/compilation/KernelTranslation.cpp
@@ -1,4 +1,4 @@
-#include "generate_x86_format.h"
+#include "generate_cpu_format.h"
 #include "handle_sync.h"
 #include "init.h"
 #include "insert_sync.h"
@@ -6,17 +6,14 @@
 #include "performance.h"
 #include "tool.h"
 #include "warp_func.h"
-#include "llvm/IR/Module.h"
 #include <assert.h>
-#include <fstream>
-#include <iostream>
-#include <llvm/Support/raw_ostream.h>
-#include <map>
-#include <set>
-#include <stdlib.h>
 
 using namespace llvm;
 
+// to support constant memory variables, we need to convert information
+// from kernelTranslator to HostTranslator, since HostTranslator knows nothing
+// about the kernel functions, we need to write the information to a file
+// by KernelTranslator and read it in HostTranslator
 std::string PATH = "kernel_meta.log";
 
 int main(int argc, char **argv) {
@@ -26,8 +23,9 @@ int main(int argc, char **argv) {
   std::ofstream fout;
   fout.open(PATH);
 
-  // inline, and create auxiliary global variables
+  // inline __device__ functions, and create auxiliary global variables
   init_block(program, fout);
+
   // insert sync before each vote, and replace the
   // original vote function to warp vote
   handle_warp_vote(program);
@@ -40,17 +38,18 @@ int main(int argc, char **argv) {
 
   // split block by sync
   split_block_by_sync(program);
-  // add loop for intra&intera thread
+
+  // add loop for intra&intera thread, it refers 'hierarchical collapsing' in
+  // COX paper.
   insert_warp_loop(program);
 
-  // (TODO): replace this patch
   replace_built_in_function(program);
 
-  // TODO: replace with a more general function
-  // Not only for x86 backend
-  generate_x86_format(program);
+  // the input kernel programs have NVIDIA metadata, they need to be replaced to
+  // CPU metadata
+  generate_cpu_format(program);
 
-  // performance optimization
+  // execute O3 pipeline on the transformed program
   performance_optimization(program);
 
   VerifyModule(program);
diff --git a/compilation/KernelTranslation/CMakeLists.txt b/compilation/KernelTranslation/CMakeLists.txt
index a5d71b9..e2e7633 100644
--- a/compilation/KernelTranslation/CMakeLists.txt
+++ b/compilation/KernelTranslation/CMakeLists.txt
@@ -11,12 +11,11 @@ set(CMAKE_VERBOSE_MAKEFILE ON)
 set(LIB_NAME spmd2mpmd)
 
 set(CMAKE_CXX_STANDARD 14)
-set(CMAKE_BUILD_TYPE Debug)
-include_directories(./include/x86)
+include_directories(./include/cpu)
 include_directories(../../common)
 
-file(GLOB proj_HEADERS "include/x86/*.h")
-file(GLOB proj_SOURCES "src/x86/*.cpp")
+file(GLOB proj_HEADERS "include/cpu/*.h")
+file(GLOB proj_SOURCES "src/cpu/*.cpp")
 
 # Add core library.
 add_library(${LIB_NAME} STATIC ${proj_HEADERS} ${proj_SOURCES})
diff --git a/compilation/KernelTranslation/include/cpu/generate_cpu_format.h b/compilation/KernelTranslation/include/cpu/generate_cpu_format.h
new file mode 100644
index 0000000..86e6abc
--- /dev/null
+++ b/compilation/KernelTranslation/include/cpu/generate_cpu_format.h
@@ -0,0 +1,10 @@
+#ifndef __NVVM2CPU_GENERATE_CPU_FORMAT__
+#define __NVVM2CPU_GENERATE_CPU_FORMAT__
+
+#include "llvm/IR/Module.h"
+
+void generate_cpu_format(llvm::Module *M);
+
+void set_meta_data(llvm::Module *M);
+
+#endif
diff --git a/compilation/KernelTranslation/include/x86/handle_sync.h b/compilation/KernelTranslation/include/cpu/handle_sync.h
similarity index 100%
rename from compilation/KernelTranslation/include/x86/handle_sync.h
rename to compilation/KernelTranslation/include/cpu/handle_sync.h
diff --git a/compilation/KernelTranslation/include/x86/init.h b/compilation/KernelTranslation/include/cpu/init.h
similarity index 100%
rename from compilation/KernelTranslation/include/x86/init.h
rename to compilation/KernelTranslation/include/cpu/init.h
diff --git a/compilation/KernelTranslation/include/x86/insert_sync.h b/compilation/KernelTranslation/include/cpu/insert_sync.h
similarity index 100%
rename from compilation/KernelTranslation/include/x86/insert_sync.h
rename to compilation/KernelTranslation/include/cpu/insert_sync.h
diff --git a/compilation/KernelTranslation/include/x86/insert_warp_loop.h b/compilation/KernelTranslation/include/cpu/insert_warp_loop.h
similarity index 100%
rename from compilation/KernelTranslation/include/x86/insert_warp_loop.h
rename to compilation/KernelTranslation/include/cpu/insert_warp_loop.h
diff --git a/compilation/KernelTranslation/include/x86/memory_hierarchy.h b/compilation/KernelTranslation/include/cpu/memory_hierarchy.h
similarity index 100%
rename from compilation/KernelTranslation/include/x86/memory_hierarchy.h
rename to compilation/KernelTranslation/include/cpu/memory_hierarchy.h
diff --git a/compilation/KernelTranslation/include/x86/performance.h b/compilation/KernelTranslation/include/cpu/performance.h
similarity index 100%
rename from compilation/KernelTranslation/include/x86/performance.h
rename to compilation/KernelTranslation/include/cpu/performance.h
diff --git a/compilation/KernelTranslation/include/x86/tool.h b/compilation/KernelTranslation/include/cpu/tool.h
similarity index 100%
rename from compilation/KernelTranslation/include/x86/tool.h
rename to compilation/KernelTranslation/include/cpu/tool.h
diff --git a/compilation/KernelTranslation/include/x86/warp_func.h b/compilation/KernelTranslation/include/cpu/warp_func.h
similarity index 100%
rename from compilation/KernelTranslation/include/x86/warp_func.h
rename to compilation/KernelTranslation/include/cpu/warp_func.h
diff --git a/compilation/KernelTranslation/include/x86/generate_x86_format.h b/compilation/KernelTranslation/include/x86/generate_x86_format.h
deleted file mode 100644
index 747b1b1..0000000
--- a/compilation/KernelTranslation/include/x86/generate_x86_format.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef __NVVM2x86_GENERATE_X86_FORMAT__
-#define __NVVM2x86_GENERATE_X86_FORMAT__
-
-#include "llvm/IR/Module.h"
-
-void generate_x86_format(llvm::Module *M);
-
-void set_meta_data(llvm::Module *M);
-
-#endif
diff --git a/compilation/KernelTranslation/src/cpu/generate_cpu_format.cpp b/compilation/KernelTranslation/src/cpu/generate_cpu_format.cpp
new file mode 100644
index 0000000..9ee10ff
--- /dev/null
+++ b/compilation/KernelTranslation/src/cpu/generate_cpu_format.cpp
@@ -0,0 +1,125 @@
+#include "generate_cpu_format.h"
+#include "debug.hpp"
+#include "tool.h"
+#include "llvm/Support/Host.h"
+
+using namespace llvm;
+
+// set TargetTriple and DataLayout same as the host CPU
+void set_meta_data(llvm::Module *M) {
+  M->setTargetTriple(llvm::sys::getProcessTriple());
+  // use the default DataLayout
+  M->setDataLayout("");
+}
+
+// as pthread only accept a single void* for input
+// we have to decode this input inside the kernel
+void decode_input(llvm::Module *M) {
+
+  std::set<llvm::Function *> need_remove;
+
+  llvm::Type *Int32T = Type::getInt32Ty(M->getContext());
+  llvm::Type *Int8T = Type::getInt8Ty(M->getContext());
+
+  llvm::FunctionType *LauncherFuncT = FunctionType::get(
+      Type::getVoidTy(M->getContext()), {PointerType::get(Int8T, 0)}, false);
+
+  // generate Wrapper Function type
+  // now we only support a single int32*
+  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
+    Function *F = &(*i);
+    if (!isKernelFunction(M, F))
+      continue;
+    auto func_name = F->getName().str();
+    // filter out _Z24 and other mangled prefix
+    for (int pos = 2; pos < func_name.length(); pos++) {
+      if (func_name[pos] >= '0' && func_name[pos] <= '9')
+        continue;
+      func_name = func_name.substr(pos);
+      break;
+    }
+    llvm::IRBuilder<> Builder(M->getContext());
+
+    FunctionCallee fc =
+        M->getOrInsertFunction(func_name + "_wrapper", LauncherFuncT);
+    Function *WorkGroup = dyn_cast<Function>(fc.getCallee());
+
+    BasicBlock *Block = BasicBlock::Create(M->getContext(), "", WorkGroup);
+    Builder.SetInsertPoint(Block);
+
+    // WorkGroup has only a single input
+    Function::arg_iterator ai = WorkGroup->arg_begin();
+
+    SmallVector<Value *, 8> Arguments;
+    Value *input_arg = &*ai;
+    // convert to int**
+    input_arg = Builder.CreateBitOrPointerCast(
+        input_arg, PointerType::get(PointerType::get(Int32T, 0), 0));
+
+    size_t idx = 0;
+    // replace original arguments with the unpacked values
+    // for example, for a function f(int* a, char* b),
+    // we will generate a function f_wrapper(int** input)
+    // and replace the original arguments with the unpacked values
+    // e.g., a = (int*)input[0], b = (char*)input[1]
+    for (Function::const_arg_iterator ii = F->arg_begin(), ee = F->arg_end();
+         ii != ee; ++ii) {
+      Type *ArgType = ii->getType();
+      // calculate addr
+      Value *GEP = createGEP(Builder, input_arg, ConstantInt::get(Int32T, idx));
+      // load corresponding int*
+      GEP = createLoad(Builder, GEP);
+      // bitcast
+      GEP = Builder.CreateBitOrPointerCast(GEP, PointerType::get(ArgType, 0));
+      Value *Arg = createLoad(Builder, GEP);
+      Arguments.push_back(Arg);
+      ++idx;
+    }
+    Builder.CreateCall(F, ArrayRef<llvm::Value *>(Arguments));
+    Builder.CreateRetVoid();
+  }
+  for (auto f : need_remove) {
+    f->dropAllReferences();
+    f->eraseFromParent();
+  }
+}
+
+// after flat/hierarchical collapsing, the barrier instructions are useless
+void remove_barrier(llvm::Module *M) {
+  std::vector<Instruction *> need_remove;
+  for (auto F = M->begin(); F != M->end(); ++F)
+    for (auto BB = F->begin(); BB != F->end(); ++BB) {
+      for (auto Inst = BB->begin(); Inst != BB->end(); Inst++) {
+        if (auto Call = dyn_cast<CallInst>(Inst)) {
+          if (Call->isInlineAsm())
+            continue;
+          auto func_name = Call->getCalledFunction()->getName().str();
+          if (func_name == "llvm.nvvm.bar.warp.sync" ||
+              func_name == "llvm.nvvm.barrier0" ||
+              func_name == "llvm.nvvm.barrier.sync") {
+            need_remove.push_back(Call);
+          }
+        }
+      }
+    }
+  for (auto inst : need_remove) {
+    inst->eraseFromParent();
+  }
+}
+
+void remove_useless_var(llvm::Module *M) {
+  M->getGlobalVariable("intra_warp_index")->eraseFromParent();
+  M->getGlobalVariable("inter_warp_index")->eraseFromParent();
+}
+
+void generate_cpu_format(llvm::Module *M) {
+  DEBUG_INFO("generate cpu format\n");
+  // change metadata
+  set_meta_data(M);
+  // decode argument
+  decode_input(M);
+  // remove barrier
+  remove_barrier(M);
+  // remove useless func/variable
+  remove_useless_var(M);
+}
diff --git a/compilation/KernelTranslation/src/x86/handle_sync.cpp b/compilation/KernelTranslation/src/cpu/handle_sync.cpp
similarity index 90%
rename from compilation/KernelTranslation/src/x86/handle_sync.cpp
rename to compilation/KernelTranslation/src/cpu/handle_sync.cpp
index f9f4b5c..4806e58 100644
--- a/compilation/KernelTranslation/src/x86/handle_sync.cpp
+++ b/compilation/KernelTranslation/src/cpu/handle_sync.cpp
@@ -1,13 +1,7 @@
 #include "handle_sync.h"
 #include "debug.hpp"
 #include "tool.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
 #include <set>
 #include <string>
 
diff --git a/compilation/KernelTranslation/src/x86/init.cpp b/compilation/KernelTranslation/src/cpu/init.cpp
similarity index 93%
rename from compilation/KernelTranslation/src/x86/init.cpp
rename to compilation/KernelTranslation/src/cpu/init.cpp
index 049d4b2..117de31 100644
--- a/compilation/KernelTranslation/src/x86/init.cpp
+++ b/compilation/KernelTranslation/src/cpu/init.cpp
@@ -2,26 +2,11 @@
 #include "debug.hpp"
 #include "memory_hierarchy.h"
 #include "tool.h"
-#include <fstream>
-#include <iostream>
-#include <set>
-
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/LegacyPassManager.h"
-#include "llvm/IR/Module.h"
 #include "llvm/InitializePasses.h"
-#include "llvm/PassInfo.h"
-#include "llvm/PassRegistry.h"
-#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/TargetSelect.h"
-#include "llvm/Transforms/IPO/PassManagerBuilder.h"
 #include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <set>
 
 using namespace llvm;
 
@@ -31,11 +16,9 @@ bool inline_warp_level_func(llvm::Module *M) {
 
   for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
     Function *F = &(*i);
-    auto func_name = F->getName().str();
     if (!isKernelFunction(M, F))
       continue;
-    Function::iterator I = F->begin();
-    for (Function::iterator E = F->end(); I != E; ++I) {
+    for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) {
       for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
         if (CallInst *c = dyn_cast<CallInst>(BI++)) {
           if (c->getCalledFunction()) {
@@ -60,8 +43,7 @@ bool inline_warp_level_func(llvm::Module *M) {
 }
 
 bool find_sreg_inst(llvm::Function *F) {
-  Function::iterator I = F->begin();
-  for (Function::iterator E = F->end(); I != E; ++I) {
+  for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) {
     for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
       if (CallInst *c = dyn_cast<CallInst>(BI++)) {
         if (c->getCalledFunction()) {
@@ -229,14 +211,12 @@ void llvm_preprocess(llvm::Module *M) {
   Passes.run(*M);
 }
 
+// transform constant expression into sequence of instructions
 bool lower_constant_expr(llvm::Module *M) {
   bool modified = false;
-  LLVMContext &context = M->getContext();
-  auto I32 = llvm::Type::getInt32Ty(context);
   std::vector<CallInst *> need_remove;
   for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
     Function *F = &(*i);
-    auto func_name = F->getName().str();
     if (!isKernelFunction(M, F))
       continue;
 
@@ -301,8 +281,8 @@ bool lower_constant_expr(llvm::Module *M) {
   return modified;
 }
 
+// replace _ZL3expd, just delete its body
 void replace_cuda_math_built_in(llvm::Module *M) {
-  // replace _ZL3expd, just delete its body
   for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
     Function *F = &(*i);
     auto func_name = F->getName().str();
diff --git a/compilation/KernelTranslation/src/x86/insert_sync.cpp b/compilation/KernelTranslation/src/cpu/insert_sync.cpp
similarity index 95%
rename from compilation/KernelTranslation/src/x86/insert_sync.cpp
rename to compilation/KernelTranslation/src/cpu/insert_sync.cpp
index aa2be0c..a2c5311 100644
--- a/compilation/KernelTranslation/src/x86/insert_sync.cpp
+++ b/compilation/KernelTranslation/src/cpu/insert_sync.cpp
@@ -4,29 +4,9 @@
 #include "handle_sync.h"
 #include "tool.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/PostDominators.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/LegacyPassManager.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/ValueSymbolTable.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/PassInfo.h"
-#include "llvm/PassRegistry.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Transforms/IPO/PassManagerBuilder.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include <iostream>
 #include <queue>
 
 using namespace llvm;
@@ -44,7 +24,7 @@ public:
     std::vector<llvm::Instruction *> insert_intra_warp_sync_before;
     std::vector<llvm::Instruction *> insert_inter_warp_sync_before;
 
-    // insert sync in the entry
+    // insert sync after the entry and before the first non-AllocaInst
     BasicBlock *entry = &(*F.begin());
     for (auto i = entry->begin(); i != entry->end(); i++) {
       if (!isa<AllocaInst>(i)) {
@@ -54,10 +34,8 @@ public:
     }
 
     for (Function::iterator I = F.begin(); I != F.end(); ++I) {
-      BasicBlock::iterator BI = I->begin();
-
       // insert barrier before return
-      for (; BI != I->end(); BI++) {
+      for (BasicBlock::iterator BI = I->begin(); BI != I->end(); BI++) {
         llvm::ReturnInst *Ret = llvm::dyn_cast<llvm::ReturnInst>(&(*BI));
         if (Ret) {
           insert_inter_warp_sync_before.push_back(&(*BI));
@@ -125,7 +103,7 @@ public:
 
     auto PDT = &getAnalysis<PostDominatorTreeWrapperPass>();
 
-    // first find all conditional barriers
+    // find all conditional barriers
     std::vector<BasicBlock *> conditionalBarriers;
     for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i) {
       BasicBlock *b = &*i;
@@ -148,12 +126,9 @@ public:
       conditionalBarriers.pop_back();
 
       // insert barrier in the start of if-condition
-
-      BasicBlock *pos = b;
       BasicBlock *pred = firstNonBackedgePredecessor(b);
 
       while (PDT->getPostDomTree().dominates(b, pred)) {
-        pos = pred;
         // If our BB post dominates the given block, we know it is not the
         // branching block that makes the barrier conditional.
         pred = firstNonBackedgePredecessor(pred);
@@ -468,7 +443,6 @@ public:
       auto header_block = L->getHeader();
       assert(header_block->getTerminator()->getNumSuccessors() == 2 &&
              "has more than 2 successors of the for-head\n");
-      BasicBlock *for_body = NULL;
       for (int i = 0; i < header_block->getTerminator()->getNumSuccessors();
            i++) {
         auto bb = header_block->getTerminator()->getSuccessor(i);
diff --git a/compilation/KernelTranslation/src/x86/insert_warp_loop.cpp b/compilation/KernelTranslation/src/cpu/insert_warp_loop.cpp
similarity index 96%
rename from compilation/KernelTranslation/src/x86/insert_warp_loop.cpp
rename to compilation/KernelTranslation/src/cpu/insert_warp_loop.cpp
index 0c11c2f..50aa3e0 100644
--- a/compilation/KernelTranslation/src/x86/insert_warp_loop.cpp
+++ b/compilation/KernelTranslation/src/cpu/insert_warp_loop.cpp
@@ -4,43 +4,20 @@
 #include "handle_sync.h"
 #include "tool.h"
 #include <assert.h>
-#include <iostream>
 #include <set>
 
-#include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/Analysis/DivergenceAnalysis.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/PostDominators.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/LegacyPassManager.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/ValueSymbolTable.h"
-#include "llvm/InitializePasses.h"
 #include "llvm/MC/TargetRegistry.h"
-#include "llvm/PassInfo.h"
-#include "llvm/PassRegistry.h"
-#include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Transforms/IPO/PassManagerBuilder.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
 #include <map>
 #include <set>
 #include <sstream>
-#include <tuple>
-#include <vector>
 
 using namespace llvm;
 
@@ -115,10 +92,7 @@ llvm::Instruction *GetContextArray(llvm::Instruction *instruction,
   BasicBlock &bb = instruction->getParent()->getParent()->getEntryBlock();
 
   IRBuilder<> builder(&*(bb.getFirstInsertionPt()));
-  Function *FF = instruction->getParent()->getParent();
   Module *M = instruction->getParent()->getParent()->getParent();
-  LLVMContext &C = M->getContext();
-  const llvm::DataLayout &Layout = M->getDataLayout();
 
   llvm::Type *elementType;
   if (isa<AllocaInst>(instruction)) {
@@ -129,8 +103,6 @@ llvm::Instruction *GetContextArray(llvm::Instruction *instruction,
   }
 
   Type *AllocType = elementType;
-  AllocaInst *InstCast = dyn_cast<AllocaInst>(instruction);
-  llvm::Value *ItemSize = nullptr;
   llvm::AllocaInst *Alloca = nullptr;
 
   auto block_size_addr = M->getGlobalVariable("block_size");
@@ -697,9 +669,6 @@ public:
             is_single_conditional_branch_block = 1;
           } else {
             // generate by replicate local variable
-            printf(
-                "[WARNING] match single conditional branch with HARD CODE\n");
-            bool branch_to_intra_init = false;
             for (unsigned suc = 0; suc < br->getNumSuccessors(); ++suc) {
               llvm::BasicBlock *entryCandidate = br->getSuccessor(suc);
               auto block_name = entryCandidate->getName().str();
@@ -755,7 +724,7 @@ public:
       entry = entryCandidate;
       break;
     }
-    // delete useless PR, those PRs only have branch
+    // delete useless PR, those PRs only have branch instructions
     if (entry == exit) {
       if (entry->size() == 1 && isa<llvm::BranchInst>(entry->begin())) {
         return;
diff --git a/compilation/KernelTranslation/src/x86/memory_hierarchy.cpp b/compilation/KernelTranslation/src/cpu/memory_hierarchy.cpp
similarity index 90%
rename from compilation/KernelTranslation/src/x86/memory_hierarchy.cpp
rename to compilation/KernelTranslation/src/cpu/memory_hierarchy.cpp
index 5bc4e01..90f5a99 100644
--- a/compilation/KernelTranslation/src/x86/memory_hierarchy.cpp
+++ b/compilation/KernelTranslation/src/cpu/memory_hierarchy.cpp
@@ -1,29 +1,10 @@
 #include "memory_hierarchy.h"
 #include "debug.hpp"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include <assert.h>
-#include <fstream>
-#include <iostream>
 #include <map>
 #include <set>
-#include <sstream>
-#include <tuple>
-#include <vector>
 
 void mem_share2global(llvm::Module *M) {
-  LLVMContext *C = &M->getContext();
-  llvm::Type *Int32T = Type::getInt32Ty(*C);
-  llvm::Type *Int64T = Type::getInt64Ty(*C);
-  llvm::Type *Int8T = Type::getInt8Ty(*C);
-
   std::map<GlobalVariable *, GlobalVariable *> corresponding_global_memory;
   std::set<llvm::Instruction *> need_remove;
   std::set<GlobalVariable *> need_remove_share_memory;
@@ -45,7 +26,6 @@ void mem_share2global(llvm::Module *M) {
               // generate global type pointer
               PointerType *PointerTy =
                   PointerType::get(array_type->getElementType(), 0);
-              llvm::Constant *x1 = ConstantPointerNull::get(PointerTy);
               llvm::GlobalVariable *global_ptr = new llvm::GlobalVariable(
                   *M, PointerTy, false, llvm::GlobalValue::ExternalLinkage,
                   NULL, "dynamic_shared_memory", NULL,
@@ -75,7 +55,7 @@ void mem_share2global(llvm::Module *M) {
                 std::pair<GlobalVariable *, GlobalVariable *>(share_memory,
                                                               global_memory));
           } else if (element_type->isFloatTy()) {
-            auto FP_type = llvm::Type::getFloatTy(*C);
+            auto FP_type = llvm::Type::getFloatTy(M->getContext());
             auto zero = llvm::ConstantFP::get(FP_type, 0);
             llvm::GlobalVariable *global_memory = new llvm::GlobalVariable(
                 *M, FP_type, false, llvm::GlobalValue::ExternalLinkage, zero,
@@ -128,11 +108,6 @@ void mem_share2global(llvm::Module *M) {
 }
 
 void mem_constant2global(llvm::Module *M, std::ofstream &fout) {
-  LLVMContext *C = &M->getContext();
-  llvm::Type *Int32T = Type::getInt32Ty(*C);
-  llvm::Type *Int64T = Type::getInt64Ty(*C);
-  llvm::Type *Int8T = Type::getInt8Ty(*C);
-
   std::map<GlobalVariable *, GlobalVariable *> corresponding_global_memory;
   std::set<llvm::Instruction *> need_remove;
   std::set<GlobalVariable *> need_remove_constant_memory;
@@ -142,7 +117,7 @@ void mem_constant2global(llvm::Module *M, std::ofstream &fout) {
     if (GlobalVariable *constant_memory = dyn_cast<GlobalVariable>(I)) {
       if (auto PT = dyn_cast<PointerType>(I->getType())) {
         unsigned AS = PT->getAddressSpace();
-        if (AS == 4) { // find a share memory
+        if (AS == 4) { // find a constant memory
           need_remove_constant_memory.insert(constant_memory);
           // generate the corresponding global memory variable
           auto new_name = "wrapper_global_" + constant_memory->getName().str();
@@ -150,7 +125,7 @@ void mem_constant2global(llvm::Module *M, std::ofstream &fout) {
           if (auto array_type = dyn_cast<ArrayType>(element_type)) {
             if (constant_memory->hasExternalLinkage() &&
                 array_type->getArrayNumElements() == 0) {
-              // external shared memory of []
+              // external constant memory of []
               // generate global type pointer
               PointerType *PointerTy =
                   PointerType::get(array_type->getElementType(), 0);
diff --git a/compilation/KernelTranslation/src/x86/performance.cpp b/compilation/KernelTranslation/src/cpu/performance.cpp
similarity index 60%
rename from compilation/KernelTranslation/src/x86/performance.cpp
rename to compilation/KernelTranslation/src/cpu/performance.cpp
index 2d79384..fa00643 100644
--- a/compilation/KernelTranslation/src/x86/performance.cpp
+++ b/compilation/KernelTranslation/src/cpu/performance.cpp
@@ -1,43 +1,13 @@
 #include "performance.h"
 #include "debug.hpp"
 #include "tool.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/PostDominators.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/LegacyPassManager.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/ValueSymbolTable.h"
-#include "llvm/InitializePasses.h"
 #include "llvm/MC/TargetRegistry.h"
-#include "llvm/PassInfo.h"
-#include "llvm/PassRegistry.h"
-#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Host.h"
 #include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetOptions.h"
 #include "llvm/Transforms/IPO/PassManagerBuilder.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include <map>
-#include <set>
-#include <sstream>
-#include <tuple>
-#include <vector>
 
 using namespace llvm;
 
@@ -53,7 +23,7 @@ void performance_optimization(llvm::Module *M) {
   llvm::legacy::PassManager Passes;
 
   // add target machine info
-  llvm::Triple triple("x86_64-unknown-linux-gnu");
+  llvm::Triple triple(llvm::sys::getProcessTriple());
 
   std::string Error;
   const Target *TheTarget = TargetRegistry::lookupTarget("", triple, Error);
@@ -62,7 +32,7 @@ void performance_optimization(llvm::Module *M) {
   Options.FloatABIType = FloatABI::Hard;
 
   TargetMachine *TM = TheTarget->createTargetMachine(
-      triple.getTriple(), llvm::sys::getHostCPUName().str(), StringRef("+m,+f"),
+      triple.getTriple(), llvm::sys::getHostCPUName().str(), StringRef(""),
       Options, Reloc::PIC_, CodeModel::Small, CodeGenOpt::Aggressive);
   assert(TM && "No Machine Information\n");
 
@@ -80,9 +50,6 @@ void performance_optimization(llvm::Module *M) {
   Builder.LoopVectorize = true;
   Builder.SLPVectorize = true;
 
-  Builder.VerifyInput = true;
-  Builder.VerifyOutput = true;
-
   Builder.populateModulePassManager(Passes);
   Passes.run(*M);
 }
diff --git a/compilation/KernelTranslation/src/x86/tool.cpp b/compilation/KernelTranslation/src/cpu/tool.cpp
similarity index 96%
rename from compilation/KernelTranslation/src/x86/tool.cpp
rename to compilation/KernelTranslation/src/cpu/tool.cpp
index d3024ac..32ab62e 100644
--- a/compilation/KernelTranslation/src/x86/tool.cpp
+++ b/compilation/KernelTranslation/src/cpu/tool.cpp
@@ -1,29 +1,13 @@
 #include "tool.h"
 #include "debug.hpp"
 #include "llvm/Bitcode/BitcodeWriter.h"
-#include "llvm/Config/llvm-config.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/IRReader/IRReader.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/ErrorOr.h"
 #include "llvm/Support/FileSystem.h"
-#include "llvm/Support/ManagedStatic.h"
-#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/ToolOutputFile.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-
-#include <iostream>
 #include <set>
 
 using namespace llvm;
@@ -133,7 +117,7 @@ llvm::Instruction *BreakPHIToAllocas(PHINode *phi) {
     Value *val = phi->getIncomingValue(incoming);
     BasicBlock *incomingBB = phi->getIncomingBlock(incoming);
     builder.SetInsertPoint(incomingBB->getTerminator());
-    llvm::Instruction *store = builder.CreateStore(val, alloca);
+    builder.CreateStore(val, alloca);
   }
   builder.SetInsertPoint(phi);
 
@@ -164,7 +148,6 @@ void phi2alloc(llvm::Module *M) {
       }
     }
 
-    bool changed = false;
     for (InstructionVec::iterator i = PHIs.begin(); i != PHIs.end(); ++i) {
       Instruction *instr = *i;
       BreakPHIToAllocas(dyn_cast<PHINode>(instr));
@@ -279,9 +262,7 @@ void replace_built_in_function(llvm::Module *M) {
 
     for (auto BB = F->begin(); BB != F->end(); ++BB) {
       for (auto BI = BB->begin(); BI != BB->end(); BI++) {
-        if (auto Load = dyn_cast<LoadInst>(BI)) {
-          auto load_from = Load->getOperand(0);
-        } else if (auto Call = dyn_cast<CallInst>(BI)) {
+        if (auto Call = dyn_cast<CallInst>(BI)) {
           if (Call->getCalledFunction()) {
             auto func_name = Call->getCalledFunction()->getName().str();
             if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.x" ||
@@ -425,7 +406,6 @@ void replace_built_in_function(llvm::Module *M) {
         if (auto Call = dyn_cast<CallInst>(BI)) {
           if (Call->getCalledFunction()) {
             auto func_name = Call->getCalledFunction()->getName().str();
-            auto callFn = Call->getCalledFunction();
             if (func_name == "vprintf") {
               /*
                * replace CUDA's printf to C's printf
@@ -458,7 +438,7 @@ void replace_built_in_function(llvm::Module *M) {
                     dyn_cast<PointerType>(BC->getOperand(0)->getType());
                 auto SrcTy = SrcPointTy->getElementType();
                 // reverse the bitcast
-                auto reverse_BC = new BitCastInst(BC, SrcPointTy, "", Call);
+                new BitCastInst(BC, SrcPointTy, "", Call);
                 assert(SrcTy->isStructTy() == 1);
                 auto StructTy = dyn_cast<StructType>(SrcTy);
                 for (int i = 0; i < StructTy->getNumElements(); i++) {
@@ -528,7 +508,6 @@ void replace_built_in_function(llvm::Module *M) {
 
 void replace_asm_call(llvm::Module *M) {
   LLVMContext &context = M->getContext();
-  auto I32 = llvm::Type::getInt32Ty(context);
   std::vector<CallInst *> need_remove;
   for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
     Function *F = &(*i);
diff --git a/compilation/KernelTranslation/src/x86/warp_func.cpp b/compilation/KernelTranslation/src/cpu/warp_func.cpp
similarity index 95%
rename from compilation/KernelTranslation/src/x86/warp_func.cpp
rename to compilation/KernelTranslation/src/cpu/warp_func.cpp
index d388f11..ed406c9 100644
--- a/compilation/KernelTranslation/src/x86/warp_func.cpp
+++ b/compilation/KernelTranslation/src/cpu/warp_func.cpp
@@ -2,15 +2,6 @@
 #include "warp_func.h"
 #include "debug.hpp"
 #include "tool.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include <iostream>
 #include <set>
 
 using namespace llvm;
@@ -107,7 +98,7 @@ void handle_warp_vote(llvm::Module *M) {
       res = BinaryOperator::CreateNot(res, "", sync_inst);
     }
 
-    auto sotre_mask = new llvm::StoreInst(res, GEP, "", sync_inst);
+    new llvm::StoreInst(res, GEP, "", sync_inst);
     // create barrier
     CreateIntraWarpBarrier(sync_inst);
     /*
diff --git a/compilation/KernelTranslation/src/x86/generate_x86_format.cpp b/compilation/KernelTranslation/src/x86/generate_x86_format.cpp
deleted file mode 100644
index 23c97d0..0000000
--- a/compilation/KernelTranslation/src/x86/generate_x86_format.cpp
+++ /dev/null
@@ -1,186 +0,0 @@
-#include "generate_x86_format.h"
-#include "debug.hpp"
-#include "tool.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Verifier.h"
-#include "llvm/IRReader/IRReader.h"
-#include "llvm/MC/TargetRegistry.h"
-#include "llvm/Support/TargetSelect.h"
-#include "llvm/Support/ToolOutputFile.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetOptions.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include <iostream>
-#include <map>
-
-using namespace llvm;
-
-void set_meta_data(llvm::Module *M) {
-  M->setTargetTriple("x86_64-unknown-linux-gnu");
-  M->setDataLayout(
-      "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128");
-}
-
-// as pthread only accept a single void* for input
-// we have to decode this input inside the kernel
-void decode_input(llvm::Module *M) {
-
-  std::set<llvm::Function *> need_remove;
-
-  LLVMContext *C = &M->getContext();
-  llvm::Type *Int32T = Type::getInt32Ty(*C);
-  llvm::Type *Int8T = Type::getInt8Ty(*C);
-
-  llvm::FunctionType *LauncherFuncT = FunctionType::get(
-      Type::getVoidTy(*C), {PointerType::get(Int8T, 0)}, false);
-
-  std::set<GlobalVariable *> dynmaic_memory;
-
-  std::map<GlobalVariable *, Value *> corres_dynamic_memory_load_address;
-
-  // generate Wrapper Function type
-  // now we only support a single int32*
-  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
-    Function *F = &(*i);
-    if (!isKernelFunction(M, F))
-      continue;
-    auto func_name = F->getName().str();
-    // remove mangle prefix
-    // remove _Z24
-    for (int pos = 2; pos < func_name.length(); pos++) {
-      if (func_name[pos] >= '0' && func_name[pos] <= '9')
-        continue;
-      func_name = func_name.substr(pos);
-      break;
-    }
-    llvm::IRBuilder<> Builder(M->getContext());
-
-    FunctionCallee fc =
-        M->getOrInsertFunction(func_name + "_wrapper", LauncherFuncT);
-    Function *WorkGroup = dyn_cast<Function>(fc.getCallee());
-
-    BasicBlock *Block = BasicBlock::Create(M->getContext(), "", WorkGroup);
-    Builder.SetInsertPoint(Block);
-
-    // WorkGroup has only a single input
-    Function::arg_iterator ai = WorkGroup->arg_begin();
-
-    SmallVector<Value *, 8> Arguments;
-    Value *input_arg = &*ai;
-    // convert to int**
-    input_arg = Builder.CreateBitOrPointerCast(
-        input_arg, PointerType::get(PointerType::get(Int32T, 0), 0));
-
-    // dynamic memory load in the wrapper function
-    GlobalVariable *share_memory = M->getGlobalVariable("wrapper_global_data");
-    if (share_memory != NULL) {
-      dynmaic_memory.insert(share_memory);
-      llvm::GlobalVariable *global_mem = new llvm::GlobalVariable(
-          *M, Int32T, false, llvm::GlobalValue::ExternalLinkage, NULL,
-          "thread_memory_size", NULL, llvm::GlobalValue::GeneralDynamicTLSModel,
-          0, false);
-      Value *loadedValue = createLoad(Builder, global_mem);
-
-      llvm::FunctionType *LaunchFun2 = FunctionType::get(
-          PointerType::get(PointerType::get(Int32T, 0), 0), NULL);
-
-      FunctionCallee fc2 =
-          M->getOrInsertFunction("_wrapper_global_data", LaunchFun2);
-
-      Function *WorkGroup2 = dyn_cast<Function>(fc2.getCallee());
-
-      WorkGroup2->setLinkage(GlobalValue::WeakODRLinkage);
-      WorkGroup2->setVisibility(GlobalValue::HiddenVisibility);
-      Comdat *co = M->getOrInsertComdat("_wrapper_global_data");
-      co->setSelectionKind(Comdat::SelectionKind::Any);
-      WorkGroup2->setComdat(co);
-
-      BasicBlock *Block2 = BasicBlock::Create(M->getContext(), "", WorkGroup2);
-
-      llvm::IRBuilder<> Builder2(M->getContext());
-      Builder2.SetInsertPoint(Block2);
-      Builder2.CreateRet(share_memory);
-
-      auto PT = dyn_cast<PointerType>(share_memory->getType());
-      auto element_type = PT->getElementType();
-
-      AllocaInst *new_arr = Builder.CreateAlloca(Int8T, loadedValue, "new_arr");
-      Value *new_ar = new_arr;
-      Value *gptr = Builder.CreateBitOrPointerCast(
-          share_memory, PointerType::get(PointerType::get(Int8T, 0), 0));
-
-      Builder.CreateStore(new_ar, gptr);
-    }
-
-    size_t idx = 0;
-    for (Function::const_arg_iterator ii = F->arg_begin(), ee = F->arg_end();
-         ii != ee; ++ii) {
-      Type *ArgType = ii->getType();
-
-      // calculate addr
-      Value *GEP = createGEP(Builder, input_arg, ConstantInt::get(Int32T, idx));
-      // load corresponding int*
-      GEP = createLoad(Builder, GEP);
-      // bitcast
-      GEP = Builder.CreateBitOrPointerCast(GEP, PointerType::get(ArgType, 0));
-      Value *Arg = createLoad(Builder, GEP);
-      Arguments.push_back(Arg);
-      ++idx;
-    }
-
-    CallInst *c = Builder.CreateCall(F, ArrayRef<llvm::Value *>(Arguments));
-    Builder.CreateRetVoid();
-  }
-  for (auto f : need_remove) {
-    f->dropAllReferences();
-    f->eraseFromParent();
-  }
-}
-
-void remove_barrier(llvm::Module *M) {
-  std::vector<Instruction *> need_remove;
-  for (auto F = M->begin(); F != M->end(); ++F)
-    for (auto BB = F->begin(); BB != F->end(); ++BB) {
-      for (auto BI = BB->begin(); BI != BB->end(); BI++) {
-        if (auto Call = dyn_cast<CallInst>(BI)) {
-          if (Call->isInlineAsm())
-            continue;
-          auto func_name = Call->getCalledFunction()->getName().str();
-          if (func_name == "llvm.nvvm.bar.warp.sync" ||
-              func_name == "llvm.nvvm.barrier0" ||
-              func_name == "llvm.nvvm.barrier.sync") {
-            need_remove.push_back(Call);
-          }
-        }
-      }
-    }
-  for (auto inst : need_remove) {
-    inst->eraseFromParent();
-  }
-}
-
-void remove_useless_var(llvm::Module *M) {
-  M->getGlobalVariable("intra_warp_index")->eraseFromParent();
-  M->getGlobalVariable("inter_warp_index")->eraseFromParent();
-}
-
-void generate_x86_format(llvm::Module *M) {
-  DEBUG_INFO("generate x86 format\n");
-  // change metadata
-  set_meta_data(M);
-  // decode argument
-  decode_input(M);
-  // remove barrier
-  remove_barrier(M);
-  // remove useless func/variable
-  remove_useless_var(M);
-}
diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt
index 4c4e8e7..36d86ad 100644
--- a/runtime/CMakeLists.txt
+++ b/runtime/CMakeLists.txt
@@ -1,9 +1,9 @@
 cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
 project(
-  X86runtime
-  DESCRIPTION "Implementation CUDA runtime API with x86"
+  CPUruntime
+  DESCRIPTION "Implementation CUDA runtime API with CPUs"
   LANGUAGES CXX)
-set(LIB_NAME x86Runtime)
+set(LIB_NAME CPUruntime)
 set(CMAKE_VERBOSE_MAKEFILE ON)
 
 # compile threadPool implementation
@@ -12,9 +12,9 @@ add_subdirectory(threadPool)
 # compile x86 runtime library
 include_directories(../common)
 include_directories(./include/)
-include_directories(./include/x86)
+include_directories(./include/cpu)
 include_directories(./threadPool/include/)
-include_directories(./threadPool/include/x86)
+include_directories(./threadPool/include/cpu)
 include_directories(../external/moodycamel/)
-file(GLOB proj_SOURCES "src/x86/*.cpp")
+file(GLOB proj_SOURCES "src/cpu/*.cpp")
 add_library(${LIB_NAME} SHARED ${proj_SOURCES})
diff --git a/runtime/include/x86/cudaKernelImpl.h b/runtime/include/cpu/cudaKernelImpl.h
similarity index 96%
rename from runtime/include/x86/cudaKernelImpl.h
rename to runtime/include/cpu/cudaKernelImpl.h
index e65d0c1..1abbfae 100644
--- a/runtime/include/x86/cudaKernelImpl.h
+++ b/runtime/include/cpu/cudaKernelImpl.h
@@ -1,4 +1,4 @@
-#ifndef __RUNTIME_IMPL__
+#ifndef __KERNEL_IMPL__
 #define __KERNEL_IMPL__
 #include "structures.h"
 #include <stdint.h>
diff --git a/runtime/include/x86/cudaRuntimeImpl.h b/runtime/include/cpu/cudaRuntimeImpl.h
similarity index 100%
rename from runtime/include/x86/cudaRuntimeImpl.h
rename to runtime/include/cpu/cudaRuntimeImpl.h
diff --git a/runtime/src/x86/cudaKernelImpl.cpp b/runtime/src/cpu/cudaKernelImpl.cpp
similarity index 100%
rename from runtime/src/x86/cudaKernelImpl.cpp
rename to runtime/src/cpu/cudaKernelImpl.cpp
diff --git a/runtime/src/x86/cudaRuntimeImpl.cpp b/runtime/src/cpu/cudaRuntimeImpl.cpp
similarity index 97%
rename from runtime/src/x86/cudaRuntimeImpl.cpp
rename to runtime/src/cpu/cudaRuntimeImpl.cpp
index 8f60525..c281845 100644
--- a/runtime/src/x86/cudaRuntimeImpl.cpp
+++ b/runtime/src/cpu/cudaRuntimeImpl.cpp
@@ -10,27 +10,31 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+
 cudaError_t cudaGetDevice(int *devPtr) {
   *devPtr = 0;
   return cudaSuccess;
 }
+
 const char *cudaGetErrorName(cudaError_t error) { return "SUCCESS\n"; }
-cudaError_t cudaDeviceReset(void) {
-  scheduler_uninit();
-  return cudaSuccess;
-}
+
+cudaError_t cudaDeviceReset(void) { return cudaSuccess; }
+
 cudaError_t cudaDeviceSynchronize(void) {
   cuSynchronizeBarrier();
   return cudaSuccess;
 }
+
 cudaError_t cudaThreadSynchronize(void) {
   cuSynchronizeBarrier();
   return cudaSuccess;
 }
+
 cudaError_t cudaFree(void *devPtr) {
   free(devPtr);
   return cudaSuccess;
 }
+
 cudaError_t cudaFreeHost(void *devPtr) {
   free(devPtr);
   return cudaSuccess;
@@ -47,20 +51,22 @@ cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
   cu_kernel *ker =
       create_kernel(func, gridDim, blockDim, args, sharedMem, stream);
 
-  int lstatus = cuLaunchKernel(&ker);
-
+  cuLaunchKernel(&ker);
   return cudaSuccess;
 }
+
 cudaError_t cudaMalloc(void **devPtr, size_t size) {
   *devPtr = malloc(size);
   if (devPtr == NULL)
     return cudaErrorMemoryAllocation;
   return cudaSuccess;
 }
+
 cudaError_t cudaMemset(void *devPtr, int value, size_t count) {
   memset(devPtr, value, count);
   return cudaSuccess;
 }
+
 cudaError_t cudaMemcpy(void *dst, const void *src, size_t count,
                        cudaMemcpyKind kind) {
   if (kind == cudaMemcpyHostToHost) {
@@ -105,7 +111,6 @@ cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src) {
   return cudaSuccess;
 }
 
-static int stream_counter = 1;
 /*
 From our evaluation, CPU backend can gain little benefit
 from multi stream. Thus, we only use single stream
@@ -159,6 +164,8 @@ static cudaError_t lastError = cudaSuccess;
 const char *cudaGetErrorString(cudaError_t error) {
   if (error == cudaSuccess) {
     return "Cuda Get Error Success";
+  } else {
+    return "Cuda Get Error Failed";
   }
 }
 
diff --git a/runtime/threadPool/CMakeLists.txt b/runtime/threadPool/CMakeLists.txt
index 5d53566..3f54f99 100644
--- a/runtime/threadPool/CMakeLists.txt
+++ b/runtime/threadPool/CMakeLists.txt
@@ -10,11 +10,10 @@ set(CMAKE_VERBOSE_MAKEFILE ON)
 set(LIB_NAME threadPool)
 
 set(CMAKE_CXX_STANDARD 14)
-set(CMAKE_BUILD_TYPE Debug)
 include_directories(../../common)
 include_directories(./include)
-include_directories(./include/x86)
+include_directories(./include/cpu)
 include_directories(../../external/moodycamel)
 
-file(GLOB proj_SOURCES "src/x86/*.cpp")
+file(GLOB proj_SOURCES "src/cpu/*.cpp")
 add_library(${LIB_NAME} SHARED ${proj_SOURCES})
diff --git a/runtime/threadPool/include/x86/api.h b/runtime/threadPool/include/cpu/api.h
similarity index 100%
rename from runtime/threadPool/include/x86/api.h
rename to runtime/threadPool/include/cpu/api.h
diff --git a/runtime/threadPool/include/x86/def.h b/runtime/threadPool/include/cpu/def.h
similarity index 100%
rename from runtime/threadPool/include/x86/def.h
rename to runtime/threadPool/include/cpu/def.h
diff --git a/runtime/threadPool/include/x86/macros.h b/runtime/threadPool/include/cpu/macros.h
similarity index 100%
rename from runtime/threadPool/include/x86/macros.h
rename to runtime/threadPool/include/cpu/macros.h
diff --git a/runtime/threadPool/include/x86/structures.h b/runtime/threadPool/include/cpu/structures.h
similarity index 100%
rename from runtime/threadPool/include/x86/structures.h
rename to runtime/threadPool/include/cpu/structures.h
diff --git a/runtime/threadPool/src/x86/api.cpp b/runtime/threadPool/src/cpu/api.cpp
similarity index 82%
rename from runtime/threadPool/src/x86/api.cpp
rename to runtime/threadPool/src/cpu/api.cpp
index 4a319fc..7ab36e9 100644
--- a/runtime/threadPool/src/x86/api.cpp
+++ b/runtime/threadPool/src/cpu/api.cpp
@@ -1,3 +1,19 @@
+/*
+  This file contains the implementation of the CPU thread pool. For a kernel
+  launch, the host thread will enqueue the kernel to the kernelQueue, and the
+  threads in the thread pool will try to fetch work from the queue. After a
+  thread fetches a kernel from the queue, it will execute the kernel. After the
+  kernel execution, the thread will try to fetch another kernel from the queue.
+  If the queue is empty, the thread will wait for the next kernel launch.
+
+  By default, we try to use all CPU cores for execution. Thus, for a kernel
+  launch, the host thread pushes P kernel variables to the queue, where P is the
+  number of CPU cores.
+
+  For some lightweight kernels, useing fewer CPU cores can speed up the overall
+  execution time, due to fewer CPU cores lead to lower synchronization overhead.
+*/
+
 #include "api.h"
 #include "blockingconcurrentqueue.h"
 #include "debug.hpp"
@@ -9,9 +25,6 @@
 #include <stdlib.h>
 #include <thread>
 
-/*
-Initialize the device
-*/
 int device_max_compute_units = 1;
 bool device_initilized = false;
 int init_device() {
@@ -32,7 +45,6 @@ int init_device() {
 }
 
 // Create Kernel
-static int kernelIds = 0;
 cu_kernel *create_kernel(const void *func, dim3 gridDim, dim3 blockDim,
                          void **args, size_t sharedMem, cudaStream_t stream) {
   cu_kernel *ker = (cu_kernel *)calloc(1, sizeof(cu_kernel));
@@ -71,13 +83,11 @@ __thread int warp_shfl[32] = {
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 };
 
-/*
-    Enqueue Kernel (k) to the scheduler kernelQueue
-*/
 int TaskToExecute;
+// Enqueue Kernel to the scheduler kernelQueue
 int schedulerEnqueueKernel(cu_kernel *k) {
-  int totalBlocks =
-      k->totalBlocks; // calculate gpu_block_to_execute_per_cpu_thread
+  int totalBlocks = k->totalBlocks;
+  // by default, all CPU cores are used to execute GPU blocks equally
   int gpuBlockToExecutePerCpuThread =
       (totalBlocks + device_max_compute_units - 1) / device_max_compute_units;
   TaskToExecute = (totalBlocks + gpuBlockToExecutePerCpuThread - 1) /
@@ -93,28 +103,21 @@ int schedulerEnqueueKernel(cu_kernel *k) {
   return C_SUCCESS;
 }
 
-/*
-  Kernel Launch with numBlocks and numThreadsPerBlock
-*/
+// Push kernel to the kernelQueue
 int cuLaunchKernel(cu_kernel **k) {
   if (!device_initilized) {
     init_device();
   }
-  // Calculate Block Size N/numBlocks
-  cu_kernel *ker = *k;
-  int status = C_RUN;
   // set complete to false, this variable is used for sync
   for (int i = 0; i < scheduler->num_worker_threads; i++) {
     scheduler->thread_pool[i].completeTask = 0;
   }
-  schedulerEnqueueKernel(ker);
+  schedulerEnqueueKernel(*k);
 
   return 0;
 }
 
-/*
-    Thread Gets Work
-*/
+// threads in thread-pool try to fetch work from the queue
 int get_work(c_thread *th) {
   int dynamic_shared_mem_size = 0;
   dim3 gridDim;
@@ -136,6 +139,7 @@ int get_work(c_thread *th) {
       grid_size_x = gridDim.x;
       grid_size_y = gridDim.y;
       grid_size_z = gridDim.z;
+      // allocate dynamic shared memory
       if (dynamic_shared_mem_size > 0)
         dynamic_shared_memory = (int *)malloc(dynamic_shared_mem_size);
       // execute GPU blocks
@@ -153,7 +157,8 @@ int get_work(c_thread *th) {
     }
     // if cannot get tasks, check whether programs stop
     if (scheduler->threadpool_shutdown_requested) {
-      return true; // thread exit
+      // thread exit
+      break;
     }
   }
   return 0;
@@ -176,9 +181,7 @@ void *driver_thread(void *p) {
   }
 }
 
-/*
-Initialize the scheduler
-*/
+// Initialize the scheduler
 int scheduler_init(cu_device device) {
   scheduler = (cu_pool *)calloc(1, sizeof(cu_pool));
   scheduler->num_worker_threads = device.max_compute_units;
@@ -198,8 +201,6 @@ int scheduler_init(cu_device device) {
   return C_SUCCESS;
 }
 
-void scheduler_uninit() { assert(0 && "Scheduler Unitit no Implemente\n"); }
-
 /*
   Barrier for Kernel Launch
 */
diff --git a/test/runHeteroMark.sh b/test/runHeteroMark.sh
index 416e1ed..f9d4c97 100644
--- a/test/runHeteroMark.sh
+++ b/test/runHeteroMark.sh
@@ -30,7 +30,7 @@ g++ -o $1 -fPIC -no-pie \
     $HeteroMark_PATH/src/$1/cuda/main.cc host.o kernel.o $HeteroMark_PATH/src/$1/*.cc  $HeteroMark_PATH/src/common/benchmark/*.cc \
     $HeteroMark_PATH/src/common/command_line_option/*.cc  $HeteroMark_PATH/src/common/time_measurement/*.cc \
     -L$CuPBoP_BUILD_PATH/runtime   -L$CuPBoP_BUILD_PATH/runtime/threadPool \
-    -I$HeteroMark_PATH -I$CUDA_PATH/include -lpthread -lc -lx86Runtime -lthreadPool
+    -I$HeteroMark_PATH -I$CUDA_PATH/include -lpthread -lc -lCPUruntime -lthreadPool
 
 case $1 in
   aes)