From addf0a95b730f25f586322c984a306141606881f Mon Sep 17 00:00:00 2001 From: RobinHan Date: Tue, 11 Jan 2022 11:01:42 -0500 Subject: [PATCH] add backbone, including basic features for compilation --- CMakeLists.txt | 41 + CONTRIBUTING.md | 34 + LICENSE | 21 + README.md | 62 ++ compilation/CMakeLists.txt | 23 + compilation/HostTranslation.cpp | 25 + compilation/HostTranslation/CMakeLists.txt | 21 + .../include/ReplaceKernelLaunch.h | 11 + .../lib/ReplaceKernelLaunch.cpp | 94 ++ compilation/KernelTranslation.cpp | 53 ++ compilation/KernelTranslation/CMakeLists.txt | 21 + .../include/generate_x86_format.h | 8 + .../KernelTranslation/include/handle_sync.h | 10 + compilation/KernelTranslation/include/init.h | 7 + .../KernelTranslation/include/insert_sync.h | 9 + .../include/insert_warp_loop.h | 12 + .../include/memory_hierarchy.h | 9 + .../KernelTranslation/include/performance.h | 7 + compilation/KernelTranslation/include/tool.h | 24 + .../KernelTranslation/include/warp_func.h | 10 + .../lib/generate_x86_format.cpp | 119 +++ .../KernelTranslation/lib/handle_sync.cpp | 57 ++ compilation/KernelTranslation/lib/init.cpp | 302 +++++++ .../KernelTranslation/lib/insert_sync.cpp | 494 ++++++++++ .../lib/insert_warp_loop.cpp | 848 ++++++++++++++++++ .../lib/memory_hierarchy.cpp | 126 +++ .../KernelTranslation/lib/performance.cpp | 88 ++ compilation/KernelTranslation/lib/tool.cpp | 480 ++++++++++ .../KernelTranslation/lib/warp_func.cpp | 217 +++++ compilation/examples/reduce/host.cpp | 82 ++ .../kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll | 150 ++++ compilation/examples/reduce/run.sh | 6 + compilation/examples/reduce_shuffle/host.cpp | 82 ++ .../kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll | 179 ++++ compilation/examples/reduce_shuffle/run.sh | 6 + compilation/examples/run_example.sh | 11 + compilation/examples/vecadd/host.cpp | 84 ++ .../kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll | 86 ++ compilation/examples/vecadd/run.sh | 6 + runtime/CMakeLists.txt | 16 + runtime/include/cudaRuntimeImpl.h | 19 + runtime/include/cudaStatus.h | 18 + runtime/lib/cudaRuntimeImpl.cpp | 100 +++ runtime/threadPool/CMakeLists.txt | 17 + runtime/threadPool/include/api.h | 25 + runtime/threadPool/include/def.h | 26 + runtime/threadPool/include/macros.h | 38 + runtime/threadPool/include/structures.h | 191 ++++ runtime/threadPool/lib/api.cpp | 456 ++++++++++ 49 files changed, 4831 insertions(+) create mode 100644 CMakeLists.txt create mode 100644 CONTRIBUTING.md create mode 100644 LICENSE create mode 100644 README.md create mode 100644 compilation/CMakeLists.txt create mode 100644 compilation/HostTranslation.cpp create mode 100644 compilation/HostTranslation/CMakeLists.txt create mode 100644 compilation/HostTranslation/include/ReplaceKernelLaunch.h create mode 100644 compilation/HostTranslation/lib/ReplaceKernelLaunch.cpp create mode 100644 compilation/KernelTranslation.cpp create mode 100644 compilation/KernelTranslation/CMakeLists.txt create mode 100644 compilation/KernelTranslation/include/generate_x86_format.h create mode 100644 compilation/KernelTranslation/include/handle_sync.h create mode 100644 compilation/KernelTranslation/include/init.h create mode 100644 compilation/KernelTranslation/include/insert_sync.h create mode 100644 compilation/KernelTranslation/include/insert_warp_loop.h create mode 100644 compilation/KernelTranslation/include/memory_hierarchy.h create mode 100644 compilation/KernelTranslation/include/performance.h create mode 100644 compilation/KernelTranslation/include/tool.h create mode 100644 
compilation/KernelTranslation/include/warp_func.h
 create mode 100644 compilation/KernelTranslation/lib/generate_x86_format.cpp
 create mode 100644 compilation/KernelTranslation/lib/handle_sync.cpp
 create mode 100644 compilation/KernelTranslation/lib/init.cpp
 create mode 100644 compilation/KernelTranslation/lib/insert_sync.cpp
 create mode 100644 compilation/KernelTranslation/lib/insert_warp_loop.cpp
 create mode 100644 compilation/KernelTranslation/lib/memory_hierarchy.cpp
 create mode 100644 compilation/KernelTranslation/lib/performance.cpp
 create mode 100644 compilation/KernelTranslation/lib/tool.cpp
 create mode 100644 compilation/KernelTranslation/lib/warp_func.cpp
 create mode 100644 compilation/examples/reduce/host.cpp
 create mode 100644 compilation/examples/reduce/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
 create mode 100644 compilation/examples/reduce/run.sh
 create mode 100644 compilation/examples/reduce_shuffle/host.cpp
 create mode 100644 compilation/examples/reduce_shuffle/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
 create mode 100644 compilation/examples/reduce_shuffle/run.sh
 create mode 100644 compilation/examples/run_example.sh
 create mode 100644 compilation/examples/vecadd/host.cpp
 create mode 100644 compilation/examples/vecadd/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
 create mode 100644 compilation/examples/vecadd/run.sh
 create mode 100644 runtime/CMakeLists.txt
 create mode 100644 runtime/include/cudaRuntimeImpl.h
 create mode 100644 runtime/include/cudaStatus.h
 create mode 100644 runtime/lib/cudaRuntimeImpl.cpp
 create mode 100644 runtime/threadPool/CMakeLists.txt
 create mode 100644 runtime/threadPool/include/api.h
 create mode 100644 runtime/threadPool/include/def.h
 create mode 100644 runtime/threadPool/include/macros.h
 create mode 100644 runtime/threadPool/include/structures.h
 create mode 100644 runtime/threadPool/lib/api.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..7b08bef
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,41 @@
+cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
+
+project(CudaOnX86)
+set(CMAKE_PROJECT_DESCRIPTION "Executing CUDA on X86 architecture.")
+set(CMAKE_CXX_STANDARD "14")
+set(MAJOR_VERSION 0)
+set(MINOR_VERSION 1)
+set(VERSION_STRING ${MAJOR_VERSION}.${MINOR_VERSION})
+set(COX_VERSION ${VERSION_STRING})
+# locate llvm-config and query it for the header path and link flags
+if(DEFINED LLVM_CONFIG_PATH)
+  if(IS_ABSOLUTE "${LLVM_CONFIG_PATH}")
+    if(EXISTS "${LLVM_CONFIG_PATH}")
+      set(LLVM_CONFIG "${LLVM_CONFIG_PATH}")
+    else()
+      message(FATAL_ERROR "llvm-config is not found in ${LLVM_CONFIG_PATH}")
+    endif()
+    message(STATUS "Using llvm-config: ${LLVM_CONFIG}")
+    execute_process(
+      COMMAND "${LLVM_CONFIG}" "--cxxflags"
+      OUTPUT_VARIABLE LLVM_CXX_FLAG
+      OUTPUT_STRIP_TRAILING_WHITESPACE)
+    execute_process(
+      COMMAND "${LLVM_CONFIG}" "--libdir"
+      OUTPUT_VARIABLE LLVM_LIB_PATH
+      OUTPUT_STRIP_TRAILING_WHITESPACE)
+    execute_process(
+      COMMAND "${LLVM_CONFIG}" "--libs"
+      OUTPUT_VARIABLE LLVM_LINK_FLAG
+      OUTPUT_STRIP_TRAILING_WHITESPACE)
+  endif()
+else()
+  message(FATAL_ERROR "llvm-config is required")
+endif()
+
+set(CMAKE_CXX_FLAGS "${LLVM_CXX_FLAG} ${CMAKE_CXX_FLAGS}")
+
+set(GCC_COVERAGE_LINK_FLAGS
+    "-L${LLVM_LIB_PATH} ${LLVM_LINK_FLAG} -lz -lrt -ldl -ltinfo -lpthread -lm")
+
+add_subdirectory(compilation)
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..90d3f91
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,34 @@
+# Contributing to COX
+
+Thank you for your interest in contributing to COX!
+We appreciate all contributions, including but not limited to:
+
+- Adding documentation
+- Adding new features and components
+- Fixing bugs
+
+## How to contribute?
+
+0. (Optional) Open an issue and discuss your idea before you start
+1. Fork the latest version of COX
+2. Commit your changes to the forked repo
+3. Create a Pull Request against the COX main branch
+
+## Code style
+
+We follow clang-format style in this repo.
+To make sure your contribution follows the correct style,
+we highly recommend installing [pre-commit](https://pre-commit.com/)
+before development.
+
+```bash
+# a Python environment is required
+pip install pre-commit
+```
+
+Then, from the repository folder, execute the following command:
+
+```bash
+pre-commit install
+```
+
+With the pre-commit plugin, each local commit is checked automatically.
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..9864eaa
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2021 Ruobing Han
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..34461f1
--- /dev/null
+++ b/README.md
@@ -0,0 +1,62 @@
+# COX: CUDA on X86
+
+## Introduction
+
+This project consists of two parts: a series of LLVM passes that
+accept SPMD NVVM IR as input and emit the corresponding MPMD+SIMD
+LLVM IR, which can be executed on CPU devices, and a runtime that
+launches the translated kernels on CPU threads.
+
+## Install
+
+### Prerequisites
+
+* Linux: verified on Ubuntu 18.04
+* LLVM 10.0
+* NVIDIA CUDA toolkit
+* x86 CPU
+* pthread
+* GCC 7.5.0
+
+### Installation
+
+1. Clone from GitHub
+
+   ```bash
+   git clone https://github.com/drcut/open_source_template
+   cd open_source_template
+   ```
+
+2. Build the translator from NVVM IR to LLVM IR for x86
+
+   ```bash
+   mkdir build && cd build
+   cmake .. -DLLVM_CONFIG_PATH=`which llvm-config` # needs the path to llvm-config
+   make
+   ```
+
+## Run the vecadd sample
+
+```bash
+# Generate bitcode from human-readable LLVM IR
+llvm-as ../compilation/examples/vecadd/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
+# Use LLVM passes to transform NVVM IR (SPMD) into LLVM IR (MPMD+SIMD).
+# NOTE: the grid size (1, 1, 1) and block size (32, 1, 1) passed below
+# are hard-coded into the generated LLVM IR
+./compilation/kernelTranslator \
+    ../compilation/examples/vecadd/kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc \
+    kernel.bc 1 1 1 32 1 1
+# generate an object file from the LLVM IR
+llc --filetype=obj kernel.bc
+# link the generated kernel function with the host function
+# and produce an executable
+g++ ../compilation/examples/vecadd/host.cpp \
+    kernel.o -lpthread -o vecadd_example
+# run the executable
+./vecadd_example
+```
+
+## Author
+
+[Ruobing Han](https://drcut.github.io/) is a CS PhD student at the
+Georgia Institute of Technology, under the supervision of
+Prof. [Hyesoon Kim](https://www.cc.gatech.edu/~hyesoon/).
diff --git a/compilation/CMakeLists.txt b/compilation/CMakeLists.txt
new file mode 100644
index 0000000..868f9c4
--- /dev/null
+++ b/compilation/CMakeLists.txt
@@ -0,0 +1,23 @@
+cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
+project(
+  NVVM2X86
+  DESCRIPTION "Translate NVVM IR to LLVM IR for X86"
+  LANGUAGES CXX)
+
+set(CMAKE_VERBOSE_MAKEFILE ON)
+
+# compile the kernel translator
+include_directories(./KernelTranslation/include)
+add_subdirectory(KernelTranslation)
+
+add_executable(kernelTranslator KernelTranslation.cpp)
+target_link_libraries(kernelTranslator spmd2mpmd ${GCC_COVERAGE_LINK_FLAGS})
+
+# compile the host translator
+include_directories(./HostTranslation/include)
+add_subdirectory(HostTranslation)
+
+add_executable(hostTranslator HostTranslation.cpp)
+
+target_link_libraries(hostTranslator spmd2mpmd cudaRuntime2cpuRuntime
+                      ${GCC_COVERAGE_LINK_FLAGS})
diff --git a/compilation/HostTranslation.cpp b/compilation/HostTranslation.cpp
new file mode 100644
index 0000000..9695b56
--- /dev/null
+++ b/compilation/HostTranslation.cpp
@@ -0,0 +1,25 @@
+#include "ReplaceKernelLaunch.h"
+#include "tool.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Verifier.h"
+#include <assert.h>
+#include <iostream>
+#include <string>
+
+using namespace llvm;
+
+int main(int argc, char **argv) {
+  assert(argc == 3 && "incorrect number of arguments\n");
+
+  char *input_host_path = argv[1];
+  char *output_host_path = argv[2];
+
+  // load the host LLVM module
+  llvm::Module *hostModule = LoadModuleFromFilr(input_host_path);
+  VerifyModule(hostModule);
+  // process the host module
+  ReplaceKernelLaunch(hostModule);
+  VerifyModule(hostModule);
+  DumpModule(hostModule, output_host_path);
+  return 0;
+}
diff --git a/compilation/HostTranslation/CMakeLists.txt b/compilation/HostTranslation/CMakeLists.txt
new file mode 100644
index 0000000..2e97dd6
--- /dev/null
+++ b/compilation/HostTranslation/CMakeLists.txt
@@ -0,0 +1,21 @@
+cmake_minimum_required(VERSION 3.1)
+
+# C++ project
+project(
+  HostTranslation
+  DESCRIPTION "Translate CUDA host modules to CPU host modules,
+  mainly replacing CUDA Runtime APIs with CPU Runtime APIs"
+  LANGUAGES CXX)
+
+set(CMAKE_VERBOSE_MAKEFILE ON)
+set(LIB_NAME cudaRuntime2cpuRuntime)
+
+set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_BUILD_TYPE Debug)
+include_directories(./include)
+
+file(GLOB proj_HEADERS "include/*.h")
+file(GLOB proj_SOURCES "lib/*.cpp")
+
+# Add core library.
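+# The cudaRuntime2cpuRuntime library built here is linked, together with
+# spmd2mpmd, into the hostTranslator driver by ../CMakeLists.txt.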
+add_library(${LIB_NAME} SHARED ${proj_HEADERS} ${proj_SOURCES})
diff --git a/compilation/HostTranslation/include/ReplaceKernelLaunch.h b/compilation/HostTranslation/include/ReplaceKernelLaunch.h
new file mode 100644
index 0000000..769489c
--- /dev/null
+++ b/compilation/HostTranslation/include/ReplaceKernelLaunch.h
@@ -0,0 +1,11 @@
+#ifndef __NVVM2x86_REPLACE_KERNEL_LAUNCH__
+#define __NVVM2x86_REPLACE_KERNEL_LAUNCH__
+
+#include "llvm/IR/Module.h"
+/*
+ * Original:   i8* bitcast (void (i32*, i32*, i32*, i32)* @_Z9vecPKiS0_Pii to i8*)
+ * Changed to: i8* bitcast (i8* (i8*)* @_Z9vecPKiS0_Pii_wrapper to i8*)
+ */
+void ReplaceKernelLaunch(llvm::Module *M);
+
+#endif
diff --git a/compilation/HostTranslation/lib/ReplaceKernelLaunch.cpp b/compilation/HostTranslation/lib/ReplaceKernelLaunch.cpp
new file mode 100644
index 0000000..67525d7
--- /dev/null
+++ b/compilation/HostTranslation/lib/ReplaceKernelLaunch.cpp
@@ -0,0 +1,94 @@
+#include "ReplaceKernelLaunch.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include <iostream>
+#include <map>
+#include <vector>
+
+using namespace llvm;
+
+// Original:   i8* bitcast (void (i32*, i32*, i32*, i32)* @_Z9vecPKiS0_Pii to i8*)
+// Changed to: i8* bitcast (i8* (i8*)* @_Z9vecPKiS0_Pii_wrapper to i8*)
+void ReplaceKernelLaunch(llvm::Module *M) {
+  LLVMContext &context = M->getContext();
+  auto VoidTy = llvm::Type::getVoidTy(context);
+  auto I8Ptr = llvm::Type::getInt8PtrTy(context);
+  std::map<std::string, BitCastInst *> kernels;
+
+  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
+    Function *F = &(*i);
+    auto func_name = F->getName().str();
+
+    for (Function::iterator b = F->begin(); b != F->end(); ++b) {
+      BasicBlock *B = &(*b);
+
+      for (BasicBlock::iterator bi = B->begin(); bi != B->end(); ++bi) {
+        Instruction *inst = &(*bi);
+
+        if (llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(inst)) {
+          if (Function *calledFunction = callInst->getCalledFunction()) {
+
+            if (calledFunction->getName().startswith("cudaLaunchKernel")) {
+
+              Value *callOperand = callInst->getArgOperand(0);
+
+              Function *functionOperand =
+                  dyn_cast<Function>(callInst->getArgOperand(0));
+
+              // the launched function is wrapped in a bitcast
+              if (functionOperand == NULL) {
+
+                functionOperand =
+                    dyn_cast<Function>(callOperand->stripPointerCasts());
+
+                std::cout << " Parent (Caller) Function Name: " << func_name
+                          << ", cudaLaunchKernel Function: "
+                          << functionOperand->getName().str() << ", args "
+                          << functionOperand->arg_size() << std::endl;
+                auto rep = kernels.find(functionOperand->getName().str());
+                if (rep != kernels.end()) {
+                  callInst->setArgOperand(0, rep->second);
+                  continue;
+                }
+
+                std::vector<Type *> Params;
+                Params.push_back(I8Ptr);
+                FunctionType *FT = FunctionType::get(VoidTy, Params, false);
+                std::string newName =
+                    functionOperand->getName().str() + "_wrapper";
+
+                Function *wrapper =
+                    Function::Create(FT, Function::ExternalLinkage, newName, M);
+                wrapper->setDSOLocal(true);
+
+                BitCastInst *BC = new BitCastInst(wrapper, I8Ptr, "", callInst);
+                callInst->setArgOperand(0, BC);
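+                // Cache the wrapper bitcast so later launches of the same
+                // kernel reuse it instead of declaring a duplicate wrapper.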
+                kernels.insert({functionOperand->getName().str(), BC});
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/compilation/KernelTranslation.cpp b/compilation/KernelTranslation.cpp
new file mode 100644
index 0000000..1d24c7b
--- /dev/null
+++ b/compilation/KernelTranslation.cpp
@@ -0,0 +1,53 @@
+#include "generate_x86_format.h"
+#include "handle_sync.h"
+#include "init.h"
+#include "insert_sync.h"
+#include "insert_warp_loop.h"
+#include "performance.h"
+#include "tool.h"
+#include "warp_func.h"
+#include "llvm/IR/Module.h"
+#include <assert.h>
+#include <fstream>
+#include <iostream>
+#include <stdlib.h>
+#include <string>
+
+using namespace llvm;
+
+int main(int argc, char **argv) {
+  assert(argc == 9 && "incorrect number of arguments\n");
+  llvm::Module *program = LoadModuleFromFilr(argv[1]);
+  // get the grid and block dimensions from the input arguments
+  int *grid_dim = new int[3];
+  int *block_dim = new int[3];
+  grid_dim[0] = atoi(argv[3]);
+  grid_dim[1] = atoi(argv[4]);
+  grid_dim[2] = atoi(argv[5]);
+  block_dim[0] = atoi(argv[6]);
+  block_dim[1] = atoi(argv[7]);
+  block_dim[2] = atoi(argv[8]);
+
+  // inline functions and create auxiliary global variables
+  init_block(program);
+  // insert sync before each vote, and replace the
+  // original vote function with a warp-level vote
+  handle_warp_vote(program);
+  // replace warp shuffle
+  handle_warp_shfl(program);
+  // insert sync
+  insert_sync(program);
+  // split blocks at sync points
+  split_block_by_sync(program);
+  // add loops for intra- and inter-warp threads
+  insert_warp_loop(program);
+  // TODO: replace this workaround
+  replace_built_in_function(program, grid_dim, block_dim);
+  // VerifyModule(program);
+  generate_x86_format(program);
+  // performance optimization
+  performance_optimization(program);
+
+  DumpModule(program, argv[2]);
+  return 0;
+}
diff --git a/compilation/KernelTranslation/CMakeLists.txt b/compilation/KernelTranslation/CMakeLists.txt
new file mode 100644
index 0000000..7ec3898
--- /dev/null
+++ b/compilation/KernelTranslation/CMakeLists.txt
@@ -0,0 +1,21 @@
+cmake_minimum_required(VERSION 3.1)
+
+# C++ project
+project(
+  KernelTranslation
+  DESCRIPTION
+    "Translate SPMD Kernel to MPMD format with hierarchical collapsing"
+  LANGUAGES CXX)
+
+set(CMAKE_VERBOSE_MAKEFILE ON)
+set(LIB_NAME spmd2mpmd)
+
+set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_BUILD_TYPE Debug)
+include_directories(./include)
+
+file(GLOB proj_HEADERS "include/*.h")
+file(GLOB proj_SOURCES "lib/*.cpp")
+
+# Add core library.
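+# The spmd2mpmd library built here is linked into both the kernelTranslator
+# and hostTranslator drivers by ../CMakeLists.txt.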
+add_library(${LIB_NAME} SHARED ${proj_HEADERS} ${proj_SOURCES}) diff --git a/compilation/KernelTranslation/include/generate_x86_format.h b/compilation/KernelTranslation/include/generate_x86_format.h new file mode 100644 index 0000000..dff3694 --- /dev/null +++ b/compilation/KernelTranslation/include/generate_x86_format.h @@ -0,0 +1,8 @@ +#ifndef __NVVM2x86_GENERATE_X86_FORMAT__ +#define __NVVM2x86_GENERATE_X86_FORMAT__ + +#include "llvm/IR/Module.h" + +void generate_x86_format(llvm::Module *M); + +#endif diff --git a/compilation/KernelTranslation/include/handle_sync.h b/compilation/KernelTranslation/include/handle_sync.h new file mode 100644 index 0000000..e007f96 --- /dev/null +++ b/compilation/KernelTranslation/include/handle_sync.h @@ -0,0 +1,10 @@ +#ifndef __NVVM2x86_HANDLE_SYNC__ +#define __NVVM2x86_HANDLE_SYNC__ + +#include "llvm/IR/Module.h" + +using namespace llvm; + +void split_block_by_sync(llvm::Module *M); + +#endif diff --git a/compilation/KernelTranslation/include/init.h b/compilation/KernelTranslation/include/init.h new file mode 100644 index 0000000..10f5186 --- /dev/null +++ b/compilation/KernelTranslation/include/init.h @@ -0,0 +1,7 @@ +#ifndef __NVVM2x86_INIT__ +#define __NVVM2x86_INIT__ + +#include "llvm/IR/Module.h" + +void init_block(llvm::Module *M); +#endif diff --git a/compilation/KernelTranslation/include/insert_sync.h b/compilation/KernelTranslation/include/insert_sync.h new file mode 100644 index 0000000..a7e2d8a --- /dev/null +++ b/compilation/KernelTranslation/include/insert_sync.h @@ -0,0 +1,9 @@ +#ifndef __NVVM2x86_INSERT_SYNC__ +#define __NVVM2x86_INSERT_SYNC__ + +#include "llvm/IR/Function.h" + +// insert extra barrier +void insert_sync(llvm::Module *M); + +#endif diff --git a/compilation/KernelTranslation/include/insert_warp_loop.h b/compilation/KernelTranslation/include/insert_warp_loop.h new file mode 100644 index 0000000..da1a708 --- /dev/null +++ b/compilation/KernelTranslation/include/insert_warp_loop.h @@ -0,0 +1,12 @@ +#ifndef __NVVM2x86_INSERT_WARP_LOOP__ +#define __NVVM2x86_INSERT_WARP_LOOP__ + +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" + +using namespace llvm; + +void insert_warp_loop(llvm::Module *M); + +#endif diff --git a/compilation/KernelTranslation/include/memory_hierarchy.h b/compilation/KernelTranslation/include/memory_hierarchy.h new file mode 100644 index 0000000..1f8495c --- /dev/null +++ b/compilation/KernelTranslation/include/memory_hierarchy.h @@ -0,0 +1,9 @@ +#ifndef __NVVM2x86_MEMORY_HIERARCHY__ +#define __NVVM2x86_MEMORY_HIERARCHY__ +#include "llvm/IR/Module.h" + +using namespace llvm; + +void mem_share2global(llvm::Module *M); + +#endif diff --git a/compilation/KernelTranslation/include/performance.h b/compilation/KernelTranslation/include/performance.h new file mode 100644 index 0000000..bb9bf88 --- /dev/null +++ b/compilation/KernelTranslation/include/performance.h @@ -0,0 +1,7 @@ +#ifndef __NVVM2x86_PERFORMANCE__ +#define __NVVM2x86_PERFORMANCE__ + +#include "llvm/IR/Module.h" + +void performance_optimization(llvm::Module *M); +#endif diff --git a/compilation/KernelTranslation/include/tool.h b/compilation/KernelTranslation/include/tool.h new file mode 100644 index 0000000..cb1963f --- /dev/null +++ b/compilation/KernelTranslation/include/tool.h @@ -0,0 +1,24 @@ +#ifndef __NVVM2x86_TOOL__ +#define __NVVM2x86_TOOL__ + +#include "llvm/IR/Module.h" +llvm::Module *LoadModuleFromFilr(char *file_name); +void DumpModule(llvm::Module *M, char *file_name); +bool 
isKernelFunction(llvm::Module *M, llvm::Function *F);
+void replace_block(llvm::Function *F, llvm::BasicBlock *before,
+                   llvm::BasicBlock *after);
+llvm::CallInst *CreateInterWarpBarrier(llvm::Instruction *InsertBefore);
+llvm::CallInst *CreateIntraWarpBarrier(llvm::Instruction *InsertBefore);
+void VerifyModule(llvm::Module *);
+void phi2alloc(llvm::Module *M);
+void remove_cuda_built_in(llvm::Module *M);
+void replace_built_in_function(llvm::Module *M, int *grid_dim, int *block_dim);
+void replace_asm_call(llvm::Module *M);
+bool find_block_barrier_in_region(llvm::BasicBlock *start,
+                                  llvm::BasicBlock *end);
+bool find_barrier_in_region(llvm::BasicBlock *start, llvm::BasicBlock *end);
+bool has_warp_barrier(llvm::BasicBlock *B);
+bool has_barrier(llvm::BasicBlock *B);
+bool has_block_barrier(llvm::BasicBlock *B);
+bool has_barrier(llvm::Function *F);
+#endif
diff --git a/compilation/KernelTranslation/include/warp_func.h b/compilation/KernelTranslation/include/warp_func.h
new file mode 100644
index 0000000..1fc0554
--- /dev/null
+++ b/compilation/KernelTranslation/include/warp_func.h
@@ -0,0 +1,10 @@
+#ifndef __NVVM2x86_WARP_FUNC__
+#define __NVVM2x86_WARP_FUNC__
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+
+void handle_warp_vote(llvm::Module *M);
+void handle_warp_shfl(llvm::Module *M);
+#endif
diff --git a/compilation/KernelTranslation/lib/generate_x86_format.cpp b/compilation/KernelTranslation/lib/generate_x86_format.cpp
new file mode 100644
index 0000000..dbfacbc
--- /dev/null
+++ b/compilation/KernelTranslation/lib/generate_x86_format.cpp
@@ -0,0 +1,119 @@
+#include "generate_x86_format.h"
+#include "tool.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/ToolOutputFile.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+
+using namespace llvm;
+
+void set_meta_data(llvm::Module *M) {
+  M->setTargetTriple("x86_64-unknown-linux-gnu");
+  M->setDataLayout(
+      "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128");
+}
+
+// pthread accepts only a single void* argument, so the packed
+// kernel arguments have to be decoded inside the generated wrapper
+void decode_input(llvm::Module *M) {
+
+  std::set<llvm::Function *> need_remove;
+
+  LLVMContext *C = &M->getContext();
+  llvm::Type *Int32T = Type::getInt32Ty(*C);
+  llvm::Type *Int8T = Type::getInt8Ty(*C);
+
+  llvm::FunctionType *LauncherFuncT = FunctionType::get(
+      Type::getVoidTy(*C), {PointerType::get(Int8T, 0)}, false);
+
+  // generate the wrapper function type;
+  // for now, each argument is decoded from a single int32* slot
+  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
+    Function *F = &(*i);
+    if (!isKernelFunction(M, F))
+      continue;
+    auto func_name = F->getName().str();
+    llvm::IRBuilder<> Builder(M->getContext());
+
+    FunctionCallee fc =
+        M->getOrInsertFunction(func_name + "_wrapper", LauncherFuncT);
+    Function *WorkGroup = dyn_cast<Function>(fc.getCallee());
+
+    BasicBlock *Block = BasicBlock::Create(M->getContext(), "", WorkGroup);
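+    // The wrapper body built below treats its single i8* argument as an
+    // array of pointers: slot i is loaded, cast to the type of kernel
+    // parameter i, and dereferenced before calling the real kernel.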
+    Builder.SetInsertPoint(Block);
+
+    // WorkGroup has only a single input
+    Function::arg_iterator ai = WorkGroup->arg_begin();
+
+    SmallVector<Value *, 8> Arguments;
+    Value *input_arg = &*ai;
+    // convert to int**
+    input_arg = Builder.CreateBitOrPointerCast(
+        input_arg, PointerType::get(PointerType::get(Int32T, 0), 0));
+    size_t idx = 0;
+    for (Function::const_arg_iterator ii = F->arg_begin(), ee = F->arg_end();
+         ii != ee; ++ii) {
+      Type *ArgType = ii->getType();
+
+      // calculate the address of the idx-th slot
+      Value *GEP = Builder.CreateGEP(input_arg, ConstantInt::get(Int32T, idx));
+      // load the corresponding int*
+      GEP = Builder.CreateLoad(GEP);
+      // bitcast
+      GEP = Builder.CreateBitOrPointerCast(GEP, PointerType::get(ArgType, 0));
+      Value *Arg = Builder.CreateLoad(GEP);
+      Arguments.push_back(Arg);
+      ++idx;
+    }
+
+    Builder.CreateCall(F, ArrayRef<Value *>(Arguments));
+    Builder.CreateRetVoid();
+  }
+  for (auto f : need_remove) {
+    f->dropAllReferences();
+    f->eraseFromParent();
+  }
+}
+
+void remove_barrier(llvm::Module *M) {
+  std::vector<llvm::Instruction *> need_remove;
+  for (auto F = M->begin(); F != M->end(); ++F)
+    for (auto BB = F->begin(); BB != F->end(); ++BB) {
+      for (auto BI = BB->begin(); BI != BB->end(); BI++) {
+        if (auto Call = dyn_cast<CallInst>(BI)) {
+          if (!Call->getCalledFunction())
+            continue;
+          auto func_name = Call->getCalledFunction()->getName().str();
+          if (func_name == "llvm.nvvm.bar.warp.sync" ||
+              func_name == "llvm.nvvm.barrier0" ||
+              func_name == "llvm.nvvm.barrier.sync") {
+            need_remove.push_back(Call);
+          }
+        }
+      }
+    }
+  for (auto inst : need_remove) {
+    inst->eraseFromParent();
+  }
+}
+
+void generate_x86_format(llvm::Module *M) {
+  // change metadata
+  set_meta_data(M);
+  // decode arguments
+  decode_input(M);
+  // remove barriers
+  remove_barrier(M);
+}
diff --git a/compilation/KernelTranslation/lib/handle_sync.cpp b/compilation/KernelTranslation/lib/handle_sync.cpp
new file mode 100644
index 0000000..e0fb19b
--- /dev/null
+++ b/compilation/KernelTranslation/lib/handle_sync.cpp
@@ -0,0 +1,57 @@
+#include "handle_sync.h"
+#include "tool.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include <set>
+#include <string>
+
+using namespace llvm;
+
+void split_block_by_sync(llvm::Function *F) {
+  std::set<llvm::Instruction *> sync_inst;
+  bool jump_first_sync = true;
+  for (Function::iterator b = F->begin(); b != F->end(); ++b) {
+    BasicBlock *B = &(*b);
+    for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) {
+      Instruction *inst = &(*i);
+      if (jump_first_sync) {
+        jump_first_sync = false;
+        Instruction *next_inst = &(*std::next(i));
+        sync_inst.insert(next_inst);
+        continue;
+      }
+      llvm::CallInst *Call = llvm::dyn_cast<llvm::CallInst>(inst);
+      if (Call && Call->getCalledFunction()) {
+        auto func_name = Call->getCalledFunction()->getName().str();
+        if (func_name == "llvm.nvvm.barrier0" ||
+            func_name == "llvm.nvvm.bar.warp.sync" ||
+            func_name == "llvm.nvvm.barrier.sync") {
+          sync_inst.insert(Call);
+          // also split at the next instruction,
+          // so that each sync ends up in a block of its own
+          Instruction *next_inst = &(*std::next(i));
+          sync_inst.insert(next_inst);
+        }
+      }
+    }
+  }
+  int _tmp = 0;
+  for (auto inst : sync_inst) {
+    inst->getParent()->splitBasicBlock(
+        inst, inst->getParent()->getName().str() + "_after_block_sync_" +
+                  std::to_string(_tmp++));
+  }
+}
+
+void split_block_by_sync(llvm::Module *M) {
+  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
+    Function *F = &(*i);
+    if (isKernelFunction(M, F))
+      split_block_by_sync(F);
+  }
+}
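+// For example, a block holding
+//   %a = ...; call void @llvm.nvvm.barrier0(); %b = ...
+// is split into three blocks {%a}, {barrier0} and
+// "<name>_after_block_sync_N" holding {%b}, so every barrier ends up
+// alone in its own basic block, which later passes use as a region boundary.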
diff --git a/compilation/KernelTranslation/lib/init.cpp b/compilation/KernelTranslation/lib/init.cpp
new file mode 100644
index 0000000..8007470
--- /dev/null
+++ b/compilation/KernelTranslation/lib/init.cpp
@@ -0,0 +1,302 @@
+#include "init.h"
+#include "memory_hierarchy.h"
+#include "tool.h"
+#include <set>
+#include <stdio.h>
+#include <vector>
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/PassInfo.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Transforms/IPO/PassManagerBuilder.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+
+using namespace llvm;
+
+void inline_func_vote(llvm::Module *M) {
+  std::set<llvm::Function *> need_remove;
+
+  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
+    Function *F = &(*i);
+    auto func_name = F->getName().str();
+    if (!isKernelFunction(M, F))
+      continue;
+    Function::iterator I = F->begin();
+    for (Function::iterator E = F->end(); I != E; ++I) {
+      for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
+        if (CallInst *c = dyn_cast<CallInst>(BI++)) {
+          if (c->getCalledFunction()) {
+            if (c->getCalledFunction()->getName().str() == "_Z10__any_syncji") {
+              // record the callee before inlining, as the call
+              // instruction is no longer valid afterwards
+              Function *callee = c->getCalledFunction();
+              InlineFunctionInfo IFI;
+              InlineFunction(c, IFI);
+              need_remove.insert(callee);
+            }
+          }
+        }
+      }
+    }
+  }
+  for (auto f : need_remove) {
+    f->dropAllReferences();
+    f->eraseFromParent();
+  }
+}
+
+void create_global_variable(llvm::Module *M) {
+  llvm::Type *I32 = llvm::Type::getInt32Ty(M->getContext());
+  llvm::Type *I8 = llvm::Type::getInt8Ty(M->getContext());
+
+  auto zero = llvm::ConstantInt::get(I32, 0, true);
+
+  // array types for the warp-level shuffle and vote buffers
+  llvm::Type *WarpArrayType = llvm::ArrayType::get(I32, 32);
+  llvm::Type *VoteArrayType = llvm::ArrayType::get(I8, 32);
+
+  new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
+                           zero, "intra_warp_index", NULL,
+                           llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
+  new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
+                           zero, "inter_warp_index", NULL,
+                           llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
+  new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
+                           NULL, "block_size", NULL,
+                           llvm::GlobalValue::NotThreadLocal, 0, false);
+  new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
+                           NULL, "block_size_x", NULL,
+                           llvm::GlobalValue::NotThreadLocal, 0, false);
+  new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
+                           NULL, "block_size_y", NULL,
+                           llvm::GlobalValue::NotThreadLocal, 0, false);
+  new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
+                           NULL, "block_size_z", NULL,
+                           llvm::GlobalValue::NotThreadLocal, 0, false);
+  new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
+                           NULL, "grid_size", NULL,
+                           llvm::GlobalValue::NotThreadLocal, 0, false);
+  new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
+                           NULL, "block_index", NULL,
+                           llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
+  // TLS variables used for warp-level collective operators
+  new llvm::GlobalVariable(
+      *M, WarpArrayType, false, llvm::GlobalValue::ExternalLinkage, NULL,
+      "warp_shfl", NULL, llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
+  auto warp_vote = new llvm::GlobalVariable(
+      *M, VoteArrayType, false, llvm::GlobalValue::ExternalLinkage, NULL,
+      "warp_vote", NULL, llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
+  warp_vote->setAlignment(llvm::MaybeAlign(32));
+}
+
+void remove_metadata(llvm::Module *M) {
+  SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
+  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
+    Function *F = &(*i);
+    F->getAllMetadata(MDs);
+    for (auto &MD : MDs) {
+      F->setMetadata(MD.first, NULL);
+    }
+    F->removeFnAttr("target-features");
+    F->removeFnAttr("target-cpu");
+  }
+}
+
+void init_llvm_pass() {
+
+  InitializeAllTargets();
+  InitializeAllTargetMCs();
+  InitializeAllAsmPrinters();
+  InitializeAllAsmParsers();
+
+  PassRegistry &Registry = *PassRegistry::getPassRegistry();
+
+  initializeCore(Registry);
+  initializeScalarOpts(Registry);
+  initializeVectorization(Registry);
+  initializeIPO(Registry);
+  initializeAnalysis(Registry);
+  initializeTransformUtils(Registry);
+  initializeInstCombine(Registry);
+  initializeInstrumentation(Registry);
+  initializeTarget(Registry);
+
+  llvm::StringMap<llvm::cl::Option *> &opts = llvm::cl::getRegisteredOptions();
+
+  llvm::cl::Option *O = nullptr;
+
+  O = opts["scalarize-load-store"];
+  assert(O && "could not find LLVM option 'scalarize-load-store'");
+  O->addOccurrence(1, StringRef("scalarize-load-store"), StringRef("1"), false);
+
+  // The LLVM inner-loop vectorizer does not check whether a loop is nested
+  // inside another loop, in which case even small trip-count loops might be
+  // worthwhile to vectorize.
+  O = opts["vectorizer-min-trip-count"];
+  assert(O && "could not find LLVM option 'vectorizer-min-trip-count'");
+  O->addOccurrence(1, StringRef("vectorizer-min-trip-count"), StringRef("2"),
+                   false);
+
+  // Use the following two options to keep the jump-threading optimization
+  // from duplicating blocks. Jump threading would mess up parallel region
+  // construction, especially when a kernel contains barriers.
+  // TODO: if it is re-enabled, the parallel region construction code needs
+  // improvements to make sure it does not disallow other optimizations
+  // such as vectorization.
+  O = opts["jump-threading-threshold"];
+  assert(O && "could not find LLVM option 'jump-threading-threshold'");
+  O->addOccurrence(1, StringRef("jump-threading-threshold"), StringRef("0"),
+                   false);
+  O = opts["jump-threading-implication-search-threshold"];
+  assert(O && "could not find LLVM option "
+              "'jump-threading-implication-search-threshold'");
+  O->addOccurrence(1, StringRef("jump-threading-implication-search-threshold"),
+                   StringRef("0"), false);
+
+  // Enable diagnostics from the loop and SLP vectorizers.
+  O = opts["pass-remarks-missed"];
+  assert(O && "could not find LLVM option 'pass-remarks-missed'");
+  O->addOccurrence(1, StringRef("pass-remarks-missed"),
+                   StringRef("loop-vectorize"), false);
+  O->addOccurrence(1, StringRef("pass-remarks-missed"),
+                   StringRef("slp-vectorize"), false);
+
+  O = opts["pass-remarks-analysis"];
+  assert(O && "could not find LLVM option 'pass-remarks-analysis'");
+  O->addOccurrence(1, StringRef("pass-remarks-analysis"),
+                   StringRef("loop-vectorize"), false);
+  O->addOccurrence(1, StringRef("pass-remarks-analysis"),
+                   StringRef("slp-vectorize"), false);
+
+  O = opts["pass-remarks"];
+  assert(O && "could not find LLVM option 'pass-remarks'");
+  O->addOccurrence(1, StringRef("pass-remarks"), StringRef("loop-vectorize"),
+                   false);
+  O->addOccurrence(1, StringRef("pass-remarks"), StringRef("slp-vectorize"),
+                   false);
+}
+
+void llvm_preprocess(llvm::Module *M) {
+  init_llvm_pass();
+
+  auto Registry = PassRegistry::getPassRegistry();
+
+  llvm::legacy::PassManager Passes;
+
+  std::vector<std::string> passes;
+  passes.push_back("lowerswitch");
+  passes.push_back("mem2reg");
+  passes.push_back("simplifycfg");
+  passes.push_back("loop-simplify");
+  for (auto pass : passes) {
+    const PassInfo *PIs = Registry->getPassInfo(StringRef(pass));
+    if (PIs) {
+      Pass *thispass = PIs->createPass();
+      Passes.add(thispass);
+    } else {
+      printf("Pass: %s not found\n", pass.c_str());
+    }
+  }
+  Passes.run(*M);
+}
+
+bool lower_constant_expr(llvm::Module *M) {
+  bool modified = false;
+  LLVMContext &context = M->getContext();
+  auto I32 = llvm::Type::getInt32Ty(context);
+  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
+    Function *F = &(*i);
+    auto func_name = F->getName().str();
+    if (!isKernelFunction(M, F))
+      continue;
+
+    for (auto BB = F->begin(); BB != F->end(); ++BB) {
+      for (auto BI = BB->begin(); BI != BB->end(); BI++) {
+        if (auto load_inst = dyn_cast<llvm::LoadInst>(BI)) {
+          auto load_from = load_inst->getOperand(0);
+          if (auto get_element_ptr = dyn_cast<llvm::ConstantExpr>(load_from)) {
+            modified = true;
+            auto ReplInst = get_element_ptr->getAsInstruction();
+            ReplInst->insertBefore(load_inst);
+            std::vector<Instruction *> Users;
+            // do not replace a use while iterating over the use list;
+            // collect the users first and rewrite them in a second loop
+            for (auto U : get_element_ptr->users()) {
+              if (auto InstUser = dyn_cast<Instruction>(U)) {
+                Users.push_back(InstUser);
+              }
+            }
+            for (auto &User : Users)
+              User->replaceUsesOfWith(get_element_ptr, ReplInst);
+          }
+        } else if (auto store_inst = dyn_cast<llvm::StoreInst>(BI)) {
+          auto store_to = store_inst->getOperand(1);
+          if (auto addr_cast = dyn_cast<llvm::ConstantExpr>(store_to)) {
+            modified = true;
+            auto ReplInst = addr_cast->getAsInstruction();
+            ReplInst->insertBefore(store_inst);
+            std::vector<Instruction *> Users;
+            // do not replace a use while iterating over the use list
+            for (auto U : addr_cast->users()) {
+              if (auto InstUser = dyn_cast<Instruction>(U)) {
+                Users.push_back(InstUser);
+              }
+            }
+            for (auto &User : Users)
+              User->replaceUsesOfWith(addr_cast, ReplInst);
+          }
+        } else if (auto get_element_ptr =
+                       dyn_cast<llvm::GetElementPtrInst>(BI)) {
+          auto get_from = get_element_ptr->getOperand(0);
+          if (auto addr_cast = dyn_cast<llvm::ConstantExpr>(get_from)) {
+            modified = true;
+            auto ReplInst = addr_cast->getAsInstruction();
+            ReplInst->insertBefore(get_element_ptr);
+            std::vector<Instruction *> Users;
+            // do not replace a use while iterating over the use list
+            for (auto U : addr_cast->users()) {
+              if (auto InstUser = dyn_cast<Instruction>(U)) {
+                Users.push_back(InstUser);
+              }
+            }
+            for (auto &User : Users)
+              User->replaceUsesOfWith(addr_cast, ReplInst);
+          }
+        }
+      }
+    }
+  }
+  return modified;
+}
+
+void init_block(llvm::Module *M) {
+  // run the official LLVM preprocessing passes
+  llvm_preprocess(M);
+  // remove useless CUDA built-in functions
+  remove_cuda_built_in(M);
+
+  // lower ConstantExprs
+  bool modified;
+  do {
+    modified = lower_constant_expr(M);
+  } while (modified);
+  // remove useless metadata
+  remove_metadata(M);
+  // inline the warp-vote functions
+  inline_func_vote(M);
+  // create global variables for warp shuffle and vote
+  create_global_variable(M);
+  // replace phi nodes with data loads
+  phi2alloc(M);
+  // replace shared memory with global variables
+  mem_share2global(M);
+  // replace inline asm calls
+  replace_asm_call(M);
+}
diff --git a/compilation/KernelTranslation/lib/insert_sync.cpp b/compilation/KernelTranslation/lib/insert_sync.cpp
new file mode 100644
index 0000000..dfe0676
--- /dev/null
+++ b/compilation/KernelTranslation/lib/insert_sync.cpp
@@ -0,0 +1,494 @@
+#include "insert_sync.h"
+#include "assert.h"
+#include "handle_sync.h"
+#include "tool.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/PassInfo.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/IPO/PassManagerBuilder.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <iostream>
+#include <queue>
+#include <set>
+#include <vector>
+
+using namespace llvm;
+
+class InsertBuiltInBarrier : public llvm::FunctionPass {
+
+public:
+  static char ID;
+
+  InsertBuiltInBarrier() : FunctionPass(ID) {}
+
+  virtual bool runOnFunction(Function &F) {
+    if (!isKernelFunction(F.getParent(), &F))
+      return false;
+    std::vector<llvm::Instruction *> insert_intra_warp_sync_before;
+    std::vector<llvm::Instruction *> insert_inter_warp_sync_before;
+
+    // insert a sync at the entry, after any allocas
+    BasicBlock *entry = &(*F.begin());
+    for (auto i = entry->begin(); i != entry->end(); i++) {
+      if (!isa<AllocaInst>(i)) {
+        insert_inter_warp_sync_before.push_back(&(*(i)));
+        break;
+      }
+    }
+
+    for (Function::iterator I = F.begin(); I != F.end(); ++I) {
+      BasicBlock::iterator BI = I->begin();
+
+      // insert a barrier before each return
+      for (; BI != I->end(); BI++) {
+        llvm::ReturnInst *Ret = llvm::dyn_cast<llvm::ReturnInst>(&(*BI));
+        if (Ret) {
+          insert_inter_warp_sync_before.push_back(&(*BI));
+        }
+      }
+    }
+    if (insert_intra_warp_sync_before.empty() &&
+        insert_inter_warp_sync_before.empty())
+      return false;
+    for (auto inst : insert_intra_warp_sync_before) {
+      CreateIntraWarpBarrier(inst);
+    }
+    for (auto inst : insert_inter_warp_sync_before) {
+      CreateInterWarpBarrier(inst);
+    }
+    return true;
+  }
+};
+
+class InsertConditionalBarrier : public llvm::FunctionPass {
+
+public:
+  static char ID;
+
+  InsertConditionalBarrier() : FunctionPass(ID) {}
+
+  virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const {
+    AU.addRequired<PostDominatorTreeWrapperPass>();
+    AU.addPreserved<PostDominatorTreeWrapperPass>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+  }
+
+  BasicBlock *firstNonBackedgePredecessor(llvm::BasicBlock *bb) {
+
+    DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+    pred_iterator I = pred_begin(bb), E = pred_end(bb);
+    if (I == E)
+      return NULL;
+    while (I != E && DT->dominates(bb, *I))
+      ++I;
+    if (I == E)
+      return NULL;
+    else
+      return *I;
+  }
+
+  BasicBlock *firstNonBackedgeSuccessor(llvm::BasicBlock *bb) {
+    DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+    auto t = bb->getTerminator();
+    assert(t->getNumSuccessors() <= 2);
+    for (unsigned i = 0, e = t->getNumSuccessors(); i != e; ++i) {
+      BasicBlock *successor = t->getSuccessor(i);
+      bool isBackedge = DT->dominates(successor, bb);
+      if (isBackedge)
+        continue;
+      return successor;
+    }
+    return NULL;
+  }
+
+  virtual bool runOnFunction(Function &F) {
+    if (!isKernelFunction(F.getParent(), &F))
+      return false;
+
+    auto PDT = &getAnalysis<PostDominatorTreeWrapperPass>();
+
+    // first find all conditional barriers
+    std::vector<llvm::BasicBlock *> conditionalBarriers;
+    for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i) {
+      BasicBlock *b = &*i;
+      if (!has_barrier(b))
+        continue;
+
+      // an unconditional barrier postdominates the entry node
+      if (PDT->getPostDomTree().dominates(b, &F.getEntryBlock()))
+        continue;
+      conditionalBarriers.push_back(b);
+    }
+
+    if (conditionalBarriers.size() == 0)
+      return false;
+
+    bool changed = false;
+
+    while (!conditionalBarriers.empty()) {
+      BasicBlock *b = conditionalBarriers.back();
+      conditionalBarriers.pop_back();
+
+      // insert a barrier at the start of the if-condition
+
+      BasicBlock *pos = b;
+      BasicBlock *pred = firstNonBackedgePredecessor(b);
+
+      while (PDT->getPostDomTree().dominates(b, pred)) {
+        pos = pred;
+        // If our BB post dominates the given block, we know it is not the
+        // branching block that makes the barrier conditional.
+        pred = firstNonBackedgePredecessor(pred);
+
+        if (pred == b)
+          break; // Traced across a loop edge, skip this case.
+      }
+      // create a warp/block barrier matching the conditional barrier
+      if (has_warp_barrier(b)) {
+        CreateIntraWarpBarrier(pred->getTerminator());
+      } else {
+        CreateInterWarpBarrier(pred->getTerminator());
+      }
+      changed = true;
+
+      // insert barriers at the merge point of the then/else branches,
+      // and at the end of each conditional branch
+      DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+      std::queue<llvm::BasicBlock *> successor_queue;
+      for (int i = 0; i < pred->getTerminator()->getNumSuccessors(); i++) {
+        auto ss = pred->getTerminator()->getSuccessor(i);
+        if (!DT->dominates(ss, pred))
+          successor_queue.push(ss);
+      }
+      std::set<llvm::BasicBlock *> visited;
+      llvm::BasicBlock *merge_point = NULL;
+      while (!successor_queue.empty()) {
+        auto curr = successor_queue.front();
+        successor_queue.pop();
+        if (visited.find(curr) != visited.end())
+          continue;
+
+        visited.insert(curr);
+        if (PDT->getPostDomTree().dominates(curr, pred)) {
+          // found the true merge point
+          merge_point = curr;
+          if (has_warp_barrier(b)) {
+            CreateIntraWarpBarrier(&(*curr->begin()));
+            for (BasicBlock *Pred : predecessors(curr)) {
+              CreateIntraWarpBarrier(&(*Pred->getTerminator()));
+            }
+          } else {
+            CreateInterWarpBarrier(&(*curr->begin()));
+            for (BasicBlock *Pred : predecessors(curr)) {
+              CreateInterWarpBarrier(&(*Pred->getTerminator()));
+            }
+          }
+          break;
+        }
+        for (int i = 0; i < curr->getTerminator()->getNumSuccessors(); i++) {
+          auto ss = curr->getTerminator()->getSuccessor(i);
+          if (!DT->dominates(ss, curr))
+            successor_queue.push(ss);
+        }
+      }
+      assert(merge_point && "did not find a merge point\n");
+      changed = true;
+
+      // inserting may create a new conditional barrier
+      if (!PDT->getPostDomTree().dominates(pred, &F.getEntryBlock()))
+        conditionalBarriers.push_back(pred);
+
+      // find any block which is not dominated by the header
+      // but is postdominated by the merge point
+      std::queue<llvm::BasicBlock *> if_body;
+      std::set<llvm::BasicBlock *> visited_block;
+      for (int i = 0; i < pred->getTerminator()->getNumSuccessors(); i++) {
+        if_body.push(pred->getTerminator()->getSuccessor(i));
+      }
+      while (!if_body.empty()) {
+        auto curr = if_body.front();
+        if_body.pop();
+        if (visited_block.find(curr) != visited_block.end())
+          continue;
+        visited_block.insert(curr);
+        if (!PDT->getPostDomTree().dominates(merge_point, curr))
+          continue;
+        if (!DT->dominates(pred, curr) &&
+            PDT->getPostDomTree().dominates(merge_point, curr)) {
+          // insert barriers at the beginning of this block and at the
+          // end of each of its predecessors
+          if (has_warp_barrier(b)) {
+            CreateIntraWarpBarrier(&(*curr->begin()));
+            for (BasicBlock *Pred : predecessors(curr)) {
+              CreateIntraWarpBarrier(&(*Pred->getTerminator()));
+            }
+          } else {
+            CreateInterWarpBarrier(&(*curr->begin()));
+            for (BasicBlock *Pred : predecessors(curr)) {
+              CreateInterWarpBarrier(&(*Pred->getTerminator()));
+            }
+          }
+        }
+        for (int i = 0; i < curr->getTerminator()->getNumSuccessors(); i++) {
+          if_body.push(curr->getTerminator()->getSuccessor(i));
+        }
+      }
+    }
+    return changed;
+  }
+};
+
+class InsertBarrierForSpecialCase : public llvm::FunctionPass {
+public:
+  static char ID;
+
+  InsertBarrierForSpecialCase() : FunctionPass(ID) {}
+
+  virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const {
+    AU.addRequired<PostDominatorTreeWrapperPass>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+  }
+
+  virtual bool runOnFunction(Function &F) {
+    if (!isKernelFunction(F.getParent(), &F))
+      return false;
+    bool changed = false;
+    std::set<llvm::BasicBlock *> if_head;
+    // insert an extra block for the following case:
+    // there is a merge point for an if-else branch,
+    // but this merge point has another incoming edge
+
+    auto PDT = &getAnalysis<PostDominatorTreeWrapperPass>();
+    auto DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+    for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i) {
+      BasicBlock *b = &*i;
+      BasicBlock *merge_point = NULL;
+      if (b->getTerminator()->getNumSuccessors() == 2) {
+        auto b1 = b->getTerminator()->getSuccessor(0);
+        auto b2 = b->getTerminator()->getSuccessor(1);
+        if (PDT->getPostDomTree().dominates(b1, b2)) {
+          merge_point = b1;
+        } else if (PDT->getPostDomTree().dominates(b2, b1)) {
+          merge_point = b2;
+        } else {
+          assert(0 && "found a complex if-else branch\n");
+        }
+        std::cout << std::flush;
+        for (BasicBlock *Pred : predecessors(merge_point)) {
+          if (!DT->dominates(b, Pred)) {
+            // we need to insert an extra block to act as the merge point
+            // of the if-branch
+            if_head.insert(b);
+          }
+        }
+      }
+    }
+
+    auto M = F.getParent();
+    for (auto head : if_head) {
+      assert(head->getTerminator()->getNumSuccessors() == 2);
+      BasicBlock *merge_point = NULL;
+      auto s1 = head->getTerminator()->getSuccessor(0);
+      auto s2 = head->getTerminator()->getSuccessor(1);
+      if (PDT->getPostDomTree().dominates(s1, s2)) {
+        merge_point = s1;
+      } else {
+        merge_point = s2;
+      }
+      if (!find_barrier_in_region(head, merge_point)) {
+        printf("no need to handle the tri-income if: %s\n",
+               merge_point->getName().str().c_str());
+        continue;
+      }
+
+      BasicBlock *Block = BasicBlock::Create(M->getContext(), "if_end", &F);
+      llvm::IRBuilder<> Builder(M->getContext());
+      Builder.SetInsertPoint(Block);
+      auto br_inst = Builder.CreateBr(merge_point);
+      assert(has_barrier(head) && "preheader does not have a barrier\n");
+      if (has_warp_barrier(head)) {
+        CreateIntraWarpBarrier(br_inst);
+      } else {
+        CreateInterWarpBarrier(br_inst);
+      }
+      // replace usage in the if-branch
+      std::set<llvm::Instruction *> need_replace;
+      for (BasicBlock *Pred : predecessors(merge_point)) {
+        if (DT->dominates(head, Pred) && Pred != Block) {
+          need_replace.insert(Pred->getTerminator());
+        }
+      }
+      for (auto inst : need_replace) {
+        inst->replaceUsesOfWith(merge_point, Block);
+      }
+      changed = true;
+    }
+    return changed;
+  }
+};
+
+class InsertConditionalForBarrier : public llvm::LoopPass {
+
+public:
+  static char ID;
+
+  InsertConditionalForBarrier() : LoopPass(ID) {}
+
+  void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.addRequired<LoopInfoWrapperPass>();
+  }
+
+  bool runOnLoop(Loop *L, LPPassManager &LPM) {
+    if (!isKernelFunction(L->getHeader()->getParent()->getParent(),
+                          L->getHeader()->getParent()))
+      return false;
+    // check whether this loop contains a barrier
+    bool is_conditional_loop = false;
+    bool is_warp = false;
+    for (Loop::block_iterator i = L->block_begin(), e = L->block_end(); i != e;
+         ++i) {
+      for (BasicBlock::iterator j = (*i)->begin(), e = (*i)->end(); j != e;
+           ++j) {
+        if (auto Call = dyn_cast<CallInst>(j)) {
+          if (!Call->getCalledFunction())
+            continue;
+          auto func_name = Call->getCalledFunction()->getName().str();
+          if (func_name == "llvm.nvvm.barrier0" ||
+              func_name == "llvm.nvvm.bar.warp.sync" ||
+              func_name == "llvm.nvvm.barrier.sync") {
+            is_conditional_loop = true;
+            if (func_name == "llvm.nvvm.bar.warp.sync") {
+              is_warp = true;
+            }
+            break;
+          }
+        }
+      }
+    }
+    if (!is_conditional_loop)
+      return false;
+    // insert barriers at the beginning of the header and at the end of the
+    // preheader, so that we get a single block connected with the latch
+    if (!is_warp) {
+      auto prehead_block = L->getLoopPreheader();
+      CreateInterWarpBarrier(prehead_block->getTerminator());
+      auto header_block = L->getHeader();
+      CreateInterWarpBarrier(&(*header_block->begin()));
+    } else {
+      auto prehead_block = L->getLoopPreheader();
+      CreateIntraWarpBarrier(prehead_block->getTerminator());
+      auto header_block = L->getHeader();
+      CreateIntraWarpBarrier(&(*header_block->begin()));
+    }
+
+    // as we assume all loops are rotated, we have to insert a
+    // barrier before the conditional jump of the loop exit
+
+    if (auto exit_block = L->getExitingBlock()) {
+      auto conditional_br =
+          dyn_cast<llvm::BranchInst>(exit_block->getTerminator());
+      assert(conditional_br && conditional_br->isConditional());
+      // insert a barrier right before the conditional branch
+      if (!is_warp)
+        CreateInterWarpBarrier(conditional_br);
+      else
+        CreateIntraWarpBarrier(conditional_br);
+    } else {
+      // handle break statements in for-loops
+      printf("loop has multiple exits\n");
+      // in this case, we also insert a sync at the head of the for-body
+      auto header_block = L->getHeader();
+      assert(header_block->getTerminator()->getNumSuccessors() == 2 &&
+             "the for-head has more than 2 successors\n");
+      for (int i = 0; i < header_block->getTerminator()->getNumSuccessors();
+           i++) {
+        auto bb = header_block->getTerminator()->getSuccessor(i);
+        if (L->contains(bb)) {
+          if (is_warp) {
+            CreateIntraWarpBarrier(&(*bb->begin()));
+          } else {
+            CreateInterWarpBarrier(&(*bb->begin()));
+          }
+        }
+      }
+
+      SmallVector<BasicBlock *, 8> ExitingBlocks;
+
+      L->getExitingBlocks(ExitingBlocks);
+      while (!ExitingBlocks.empty()) {
+        auto exit_block = ExitingBlocks.back();
+        ExitingBlocks.pop_back();
+        auto conditional_br =
+            dyn_cast<llvm::BranchInst>(exit_block->getTerminator());
+        assert(conditional_br && conditional_br->isConditional());
+        if (!is_warp)
+          CreateInterWarpBarrier(conditional_br);
+        else
+          CreateIntraWarpBarrier(conditional_br);
+      }
+    }
+
+    return true;
+  }
+};
+
+char InsertBuiltInBarrier::ID = 0;
+char InsertConditionalBarrier::ID = 0;
+char InsertConditionalForBarrier::ID = 0;
+char InsertBarrierForSpecialCase::ID = 0;
+
+namespace {
+static RegisterPass<InsertConditionalBarrier>
+    insert_conditional_barrier("insert-conditional-if-barriers",
+                               "Insert conditional barriers for if bodies");
+static RegisterPass<InsertConditionalForBarrier>
+    insert_conditional_for_barrier("insert-conditional-for-barriers",
+                                   "Insert conditional barriers for for-loops");
+static RegisterPass<InsertBarrierForSpecialCase>
+    insert_special_case("insert-special-case-barriers",
+                        "Insert barriers for special cases");
+static RegisterPass<InsertBuiltInBarrier>
+    insert_built_in_barrier("insert-built-in-barriers",
+                            "Insert built-in barriers");
+} // namespace
+
+void insert_sync(llvm::Module *M) {
+  auto Registry = PassRegistry::getPassRegistry();
+
+  llvm::legacy::PassManager Passes;
+
+  std::vector<std::string> passes;
+  passes.push_back("insert-built-in-barriers");
+  passes.push_back("insert-conditional-if-barriers");
+  passes.push_back("insert-conditional-for-barriers");
+  passes.push_back("insert-special-case-barriers");
+  for (auto pass : passes) {
+    const PassInfo *PIs = Registry->getPassInfo(StringRef(pass));
+    if (PIs) {
+      Pass *thispass = PIs->createPass();
+      Passes.add(thispass);
+    } else {
+      assert(0 && "Pass not found\n");
+    }
+  }
+  Passes.run(*M);
+}
diff --git a/compilation/KernelTranslation/lib/insert_warp_loop.cpp b/compilation/KernelTranslation/lib/insert_warp_loop.cpp
new file mode 100644
index 0000000..f4023f0
--- /dev/null
+++ b/compilation/KernelTranslation/lib/insert_warp_loop.cpp
@@ -0,0 +1,848 @@
+
+#include "insert_warp_loop.h"
+#include "handle_sync.h"
+#include "tool.h"
+#include <map>
+#include <set>
+#include <sstream>
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/ValueSymbolTable.h" +#include "llvm/InitializePasses.h" +#include "llvm/PassInfo.h" +#include "llvm/PassRegistry.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/IPO/PassManagerBuilder.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include +#include +#include +#include +#include + +using namespace llvm; + +struct ParallelRegion { + std::set wrapped_block; + llvm::BasicBlock *successor_block; + llvm::BasicBlock *start_block; + llvm::BasicBlock *end_block; + bool inst_in_region(llvm::Instruction *inst) { + for (auto bb : wrapped_block) { + if (inst->getParent()->getName().str() == bb->getName().str()) + return true; + } + return false; + } + bool inst_used_in_region(llvm::Instruction *inst) { + for (auto ui = inst->use_begin(); ui != inst->use_end(); ++ui) { + auto *user = dyn_cast(ui->getUser()); + if (user == NULL) + continue; + if (inst_in_region(user)) { + return 1; + } + } + return 0; + } +}; + +std::map tempInstructionIds; +std::map contextArrays; +int tempInstructionIndex = 0; +int need_nested_loop; + +bool ShouldNotBeContextSaved(llvm::Instruction *instr) { + if (isa(instr)) + return true; + + llvm::Module *M = instr->getParent()->getParent()->getParent(); + llvm::LoadInst *load = dyn_cast(instr); + if (load != NULL) { + auto load_addr = load->getPointerOperand(); + if (load_addr == M->getGlobalVariable("intra_warp_index")) + return true; + if (load_addr == M->getGlobalVariable("inter_warp_index")) + return true; + if (load_addr == M->getGlobalVariable("warp_vote")) + return true; + } + + // TODO: we should further analyze whether the local variable + // is same among all threads within a wrap + return false; +} + +// generate countpart alloc in the beginning of the Function +llvm::Instruction *GetContextArray(llvm::Instruction *instruction, + bool intra_warp_loop) { + std::ostringstream var; + + if (std::string(instruction->getName().str()) != "") { + var << instruction->getName().str(); + } else if (tempInstructionIds.find(instruction) != tempInstructionIds.end()) { + var << tempInstructionIds[instruction]; + } else { + tempInstructionIds[instruction] = tempInstructionIndex++; + var << tempInstructionIds[instruction]; + } + if (intra_warp_loop) + var << "_intra_warp_"; + else + var << "_inter_warp_"; + std::string varName = var.str(); + + if (contextArrays.find(varName) != contextArrays.end()) + return contextArrays[varName]; + + BasicBlock &bb = instruction->getParent()->getParent()->getEntryBlock(); + IRBuilder<> builder(&*(bb.getFirstInsertionPt())); + Function *FF = instruction->getParent()->getParent(); + Module *M = instruction->getParent()->getParent()->getParent(); + LLVMContext &C = M->getContext(); + const llvm::DataLayout &Layout = M->getDataLayout(); + + llvm::Type *elementType; + if (isa(instruction)) { + elementType = + dyn_cast(instruction)->getType()->getElementType(); + } else { + elementType = instruction->getType(); + } + + Type *AllocType = elementType; + AllocaInst *InstCast = dyn_cast(instruction); + if (InstCast) { + unsigned Alignment = InstCast->getAlignment(); + + uint64_t StoreSize = Layout.getTypeStoreSize(InstCast->getAllocatedType()); + + if ((Alignment > 1) && (StoreSize & 
(Alignment - 1))) { + uint64_t AlignedSize = (StoreSize & (~(Alignment - 1))) + Alignment; + assert(AlignedSize > StoreSize); + uint64_t RequiredExtraBytes = AlignedSize - StoreSize; + + if (isa(elementType)) { + + ArrayType *StructPadding = ArrayType::get( + Type::getInt8Ty(M->getContext()), RequiredExtraBytes); + + std::vector PaddedStructElements; + PaddedStructElements.push_back(elementType); + PaddedStructElements.push_back(StructPadding); + const ArrayRef NewStructElements(PaddedStructElements); + AllocType = StructType::get(M->getContext(), NewStructElements, true); + uint64_t NewStoreSize = Layout.getTypeStoreSize(AllocType); + assert(NewStoreSize == AlignedSize); + } else if (isa(elementType)) { + StructType *OldStruct = dyn_cast(elementType); + + ArrayType *StructPadding = ArrayType::get( + Type::getInt8Ty(M->getContext()), RequiredExtraBytes); + std::vector PaddedStructElements; + for (unsigned j = 0; j < OldStruct->getNumElements(); j++) + PaddedStructElements.push_back(OldStruct->getElementType(j)); + PaddedStructElements.push_back(StructPadding); + const ArrayRef NewStructElements(PaddedStructElements); + AllocType = StructType::get(OldStruct->getContext(), NewStructElements, + OldStruct->isPacked()); + uint64_t NewStoreSize = Layout.getTypeStoreSize(AllocType); + assert(NewStoreSize == AlignedSize); + } + } + } + + llvm::Value *ItemSize = nullptr; + llvm::AllocaInst *Alloca = nullptr; + + auto block_size_addr = M->getGlobalVariable("block_size"); + auto block_size = builder.CreateLoad(block_size_addr); + Alloca = builder.CreateAlloca(AllocType, block_size, varName); + + contextArrays[varName] = Alloca; + return Alloca; +} + +// save the local variable into replicated array +llvm::Instruction *AddContextSave(llvm::Instruction *instruction, + llvm::Instruction *alloca, + bool intra_warp_loop) { + + if (isa(instruction)) { + return NULL; + } + + llvm::Module *M = instruction->getParent()->getParent()->getParent(); + LLVMContext &context = M->getContext(); + auto I32 = llvm::Type::getInt32Ty(context); + + /* Save the produced variable to the array. 
*/ + BasicBlock::iterator definition = + (dyn_cast(instruction))->getIterator(); + ++definition; + + IRBuilder<> builder(&*definition); + std::vector gepArgs; + + auto inter_warp_index = + builder.CreateLoad(M->getGlobalVariable("inter_warp_index")); + auto intra_warp_index = + builder.CreateLoad(M->getGlobalVariable("intra_warp_index")); + auto thread_idx = builder.CreateBinOp( + Instruction::Add, intra_warp_index, + builder.CreateBinOp(Instruction::Mul, inter_warp_index, + ConstantInt::get(I32, 32)), + "thread_idx"); + gepArgs.push_back(thread_idx); + + return builder.CreateStore(instruction, builder.CreateGEP(alloca, gepArgs)); +} + +llvm::Instruction *AddContextRestore(llvm::Value *val, + llvm::Instruction *alloca, + llvm::Instruction *before, bool isAlloca, + bool intra_warp_loop) { + assert(val != NULL); + assert(alloca != NULL); + IRBuilder<> builder(alloca); + if (before != NULL) { + builder.SetInsertPoint(before); + } else if (isa(val)) { + builder.SetInsertPoint(dyn_cast(val)); + before = dyn_cast(val); + } else { + assert(false && "Unknown context restore location!"); + } + + std::vector gepArgs; + + auto M = before->getParent()->getParent()->getParent(); + auto I32 = llvm::Type::getInt32Ty(M->getContext()); + auto inter_warp_index = + builder.CreateLoad(M->getGlobalVariable("inter_warp_index")); + auto intra_warp_index = + builder.CreateLoad(M->getGlobalVariable("intra_warp_index")); + auto thread_idx = builder.CreateBinOp( + Instruction::Add, intra_warp_index, + builder.CreateBinOp(Instruction::Mul, inter_warp_index, + ConstantInt::get(I32, 32)), + "thread_idx"); + gepArgs.push_back(thread_idx); + + llvm::Instruction *gep = + dyn_cast(builder.CreateGEP(alloca, gepArgs)); + if (isAlloca) { + return gep; + } + return builder.CreateLoad(gep); +} + +void AddContextSaveRestore(llvm::Instruction *instruction, + bool intra_warp_loop) { + + /* Allocate the context data array for the variable. 
*/ + llvm::Instruction *alloca = GetContextArray(instruction, intra_warp_loop); + + llvm::Instruction *theStore = + AddContextSave(instruction, alloca, intra_warp_loop); + + std::vector uses; + + for (Instruction::use_iterator ui = instruction->use_begin(), + ue = instruction->use_end(); + ui != ue; ++ui) { + llvm::Instruction *user = cast(ui->getUser()); + if (user == NULL) + continue; + if (user == theStore) + continue; + uses.push_back(user); + } + + for (auto user : uses) { + Instruction *contextRestoreLocation = user; + llvm::Value *loadedValue = + AddContextRestore(user, alloca, contextRestoreLocation, + isa(instruction), intra_warp_loop); + user->replaceUsesOfWith(instruction, loadedValue); + } +} + +void handle_alloc(llvm::Function *F) { + auto M = F->getParent(); + LLVMContext &C = M->getContext(); + auto I32 = llvm::Type::getInt32Ty(C); + + std::vector instruction_to_fix; + for (auto bb = F->begin(); bb != F->end(); bb++) { + for (auto ii = bb->begin(); ii != bb->end(); ii++) { + if (llvm::AllocaInst *i = dyn_cast(ii)) { + instruction_to_fix.push_back(i); + } + } + } + + std::vector need_remove; + + for (auto inst : instruction_to_fix) { + // generate a new alloc + auto block_size_addr = M->getGlobalVariable("block_size"); + IRBuilder<> builder(inst); + auto block_size = builder.CreateLoad(block_size_addr); + + llvm::Type *elementType = NULL; + if (dyn_cast(inst)->getType()->getElementType()) { + elementType = dyn_cast(inst)->getType()->getElementType(); + } + assert(elementType != NULL); + + auto Alloca = builder.CreateAlloca(elementType, block_size, + inst->getName().str() + "inter_warp"); + + // replace all usage + std::set replace_user; + for (Instruction::use_iterator ui = inst->use_begin(), ue = inst->use_end(); + ui != ue; ++ui) { + replace_user.insert(dyn_cast(ui->getUser())); + } + for (auto user : replace_user) { + + IRBuilder<> builder(user); + // std::vector gepArgs; + auto inter_warp_index = + builder.CreateLoad(M->getGlobalVariable("inter_warp_index")); + auto intra_warp_index = + builder.CreateLoad(M->getGlobalVariable("intra_warp_index")); + auto thread_idx = builder.CreateBinOp( + Instruction::Add, intra_warp_index, + builder.CreateBinOp(Instruction::Mul, inter_warp_index, + ConstantInt::get(I32, 32)), + "thread_idx"); + + auto gep = builder.CreateGEP(Alloca, thread_idx); + + user->replaceUsesOfWith(inst, gep); + } + need_remove.push_back(inst); + } + for (auto inst : need_remove) { + inst->dropAllReferences(); + inst->eraseFromParent(); + } +} + +void handle_local_variable_intra_warp(std::vector PRs) { + bool intra_warp_loop = 1; + // we should handle allocation generated by PHI + { + std::vector instruction_to_fix; + auto F = PRs[0].start_block->getParent(); + for (auto bb = F->begin(); bb != F->end(); bb++) { + for (auto ii = bb->begin(); ii != bb->end(); ii++) { + if (isa(&(*ii))) + instruction_to_fix.push_back(&(*ii)); + } + for (auto inst : instruction_to_fix) { + AddContextSaveRestore(inst, intra_warp_loop); + } + } + } + + for (auto parallel_regions : PRs) { + std::set instruction_in_region; + std::vector instruction_to_fix; + + for (auto bb : parallel_regions.wrapped_block) { + for (llvm::BasicBlock::iterator instr = bb->begin(); instr != bb->end(); + ++instr) { + llvm::Instruction *instruction = &*instr; + instruction_in_region.insert(instruction); + } + } + /* Find all the instructions that define new values and + check if they need to be context saved. 
*/ + for (auto bb : parallel_regions.wrapped_block) { + for (llvm::BasicBlock::iterator instr = bb->begin(); instr != bb->end(); + ++instr) { + llvm::Instruction *instruction = &*instr; + + if (ShouldNotBeContextSaved(instruction)) + continue; + + for (Instruction::use_iterator ui = instruction->use_begin(), + ue = instruction->use_end(); + ui != ue; ++ui) { + llvm::Instruction *user = dyn_cast(ui->getUser()); + + if (user == NULL) + continue; + if (isa(instruction) || + (instruction_in_region.find(user) == + instruction_in_region.end())) { + instruction_to_fix.push_back(instruction); + break; + } + } + } + } + for (auto inst : instruction_to_fix) { + AddContextSaveRestore(inst, intra_warp_loop); + } + } +} + +BasicBlock *insert_loop_init(llvm::BasicBlock *InsertInitBefore, + bool intra_warp_loop) { + llvm::Module *M = InsertInitBefore->getParent()->getParent(); + LLVMContext &context = M->getContext(); + auto I32 = llvm::Type::getInt32Ty(context); + std::string block_name = + (intra_warp_loop) ? "intra_warp_init" : "inter_warp_init"; + BasicBlock *loop_init = BasicBlock::Create( + context, block_name, InsertInitBefore->getParent(), InsertInitBefore); + IRBuilder<> builder(context); + builder.SetInsertPoint(loop_init); + if (intra_warp_loop) { // intra warp + auto intra_warp_index = M->getGlobalVariable("intra_warp_index"); + builder.CreateStore(ConstantInt::get(I32, 0), intra_warp_index); + } else { // inter warp + auto inter_warp_index = M->getGlobalVariable("inter_warp_index"); + builder.CreateStore(ConstantInt::get(I32, 0), inter_warp_index); + } + builder.CreateBr(InsertInitBefore); + return loop_init; +} + +BasicBlock *insert_loop_cond(llvm::BasicBlock *InsertCondBefore, + llvm::BasicBlock *LoopEnd, bool intra_warp_loop) { + llvm::Module *M = InsertCondBefore->getParent()->getParent(); + LLVMContext &context = M->getContext(); + auto I32 = llvm::Type::getInt32Ty(context); + std::string block_name = + (intra_warp_loop) ? "intra_warp_cond" : "inter_warp_cond"; + BasicBlock *loop_cond = BasicBlock::Create( + context, block_name, InsertCondBefore->getParent(), InsertCondBefore); + IRBuilder<> builder(context); + builder.SetInsertPoint(loop_cond); + + llvm::Value *cmpResult = NULL; + if (!intra_warp_loop) { + auto inter_warp_index = M->getGlobalVariable("inter_warp_index"); + auto block_size = M->getGlobalVariable("block_size"); + auto warp_cnt = + builder.CreateBinOp(Instruction::SDiv, builder.CreateLoad(block_size), + ConstantInt::get(I32, 32), "warp_number"); + + cmpResult = + builder.CreateICmpULT(builder.CreateLoad(inter_warp_index), warp_cnt); + } else { + auto intra_warp_index = M->getGlobalVariable("intra_warp_index"); + auto block_size = M->getGlobalVariable("block_size"); + if (!need_nested_loop) { + cmpResult = builder.CreateICmpULT(builder.CreateLoad(intra_warp_index), + builder.CreateLoad(block_size)); + } else { + cmpResult = builder.CreateICmpULT(builder.CreateLoad(intra_warp_index), + ConstantInt::get(I32, 32)); + } + } + builder.CreateCondBr(cmpResult, InsertCondBefore, LoopEnd); + return loop_cond; +} + +BasicBlock *insert_loop_inc(llvm::BasicBlock *InsertIncBefore, + bool intra_warp_loop) { + llvm::Module *M = InsertIncBefore->getParent()->getParent(); + LLVMContext &context = M->getContext(); + auto I32 = llvm::Type::getInt32Ty(context); + std::string block_name = + (intra_warp_loop) ? 
"intra_warp_inc" : "inter_warp_inc"; + BasicBlock *loop_inc = BasicBlock::Create( + context, block_name, InsertIncBefore->getParent(), InsertIncBefore); + IRBuilder<> builder(context); + builder.SetInsertPoint(loop_inc); + if (intra_warp_loop) { // intra warp + auto intra_warp_index = M->getGlobalVariable("intra_warp_index"); + auto new_index = builder.CreateBinOp( + Instruction::Add, builder.CreateLoad(intra_warp_index), + ConstantInt::get(I32, 1), "intra_warp_index_increment"); + builder.CreateStore(new_index, intra_warp_index); + } else { // inter warp + auto inter_warp_index = M->getGlobalVariable("inter_warp_index"); + auto new_index = builder.CreateBinOp( + Instruction::Add, builder.CreateLoad(inter_warp_index), + ConstantInt::get(I32, 1), "inter_warp_index_increment"); + builder.CreateStore(new_index, inter_warp_index); + } + builder.CreateBr(InsertIncBefore); + return loop_inc; +} + +void add_warp_loop(std::vector parallel_regions, + bool intra_warp_loop) { + for (auto region : parallel_regions) { + auto start_block = region.start_block; + auto tail_block = region.end_block; + auto next_block = region.successor_block; + + auto loop_cond = insert_loop_cond(start_block, next_block, intra_warp_loop); + auto loop_init = insert_loop_init(loop_cond, intra_warp_loop); + + auto F = start_block->getParent(); + for (Function::iterator i = F->begin(); i != F->end(); ++i) { + llvm::BasicBlock *bb = &(*i); + if (bb == loop_cond) + continue; + bb->getTerminator()->replaceUsesOfWith(start_block, loop_init); + } + auto loop_inc = insert_loop_inc(loop_cond, intra_warp_loop); + tail_block->getTerminator()->replaceUsesOfWith(next_block, loop_inc); + // we have to reset inter/intra warp index to 0, as these maybe used + // outside PR when there are conditional loop/branch + llvm::Module *M = start_block->getParent()->getParent(); + LLVMContext &context = M->getContext(); + auto I32 = llvm::Type::getInt32Ty(context); + BasicBlock *reset_index = BasicBlock::Create(start_block->getContext(), + "reset_block", F, next_block); + IRBuilder<> builder(start_block->getContext()); + builder.SetInsertPoint(reset_index); + if (intra_warp_loop) { // intra warp + auto intra_warp_index = M->getGlobalVariable("intra_warp_index"); + builder.CreateStore(ConstantInt::get(I32, 0), intra_warp_index); + } else { // inter warp + auto inter_warp_index = M->getGlobalVariable("inter_warp_index"); + builder.CreateStore(ConstantInt::get(I32, 0), inter_warp_index); + } + builder.CreateBr(next_block); + loop_cond->getTerminator()->replaceUsesOfWith(next_block, reset_index); + // add metadata + MDNode *Dummy = + MDNode::getTemporary(context, ArrayRef()).release(); + MDNode *AccessGroupMD = MDNode::getDistinct(context, {}); + MDNode *ParallelAccessMD = MDNode::get( + context, + {MDString::get(context, "llvm.loop.parallel_accesses"), AccessGroupMD}); + MDNode *Root = MDNode::get(context, {Dummy, ParallelAccessMD}); + + Root->replaceOperandWith(0, Root); + MDNode::deleteTemporary(Dummy); + // We now have + // !1 = metadata !{metadata !1} <- self-referential root + loop_cond->getTerminator()->setMetadata("llvm.loop", Root); + + for (auto bb : region.wrapped_block) { + for (BasicBlock::iterator ii = bb->begin(), ee = bb->end(); ii != ee; + ii++) { + if (!ii->mayReadOrWriteMemory()) { + continue; + } + MDNode *NewMD = MDNode::get(bb->getContext(), AccessGroupMD); + MDNode *OldMD = ii->getMetadata("llvm.mem.parallel_loop_access"); + if (OldMD != nullptr) { + NewMD = llvm::MDNode::concatenate(OldMD, NewMD); + } + 
        ii->setMetadata("llvm.mem.parallel_loop_access", NewMD);
+      }
+    }
+  }
+}
+
+void print_parallel_region(std::vector<ParallelRegion> parallel_regions) {
+  printf("get PR:\n");
+  for (auto region : parallel_regions) {
+    auto start = region.start_block;
+    auto end = region.end_block;
+    auto next = region.successor_block;
+    printf("parallel region: %s->%s next: %s\n",
+           start->getName().str().c_str(), end->getName().str().c_str(),
+           next->getName().str().c_str());
+    printf("have: \n");
+    for (auto b : region.wrapped_block) {
+      printf("%s\n", b->getName().str().c_str());
+    }
+  }
+}
+
+void remove_barrier(llvm::Function *F, bool intra_warp_loop) {
+  std::vector<llvm::Instruction *> need_remove;
+  for (auto BB = F->begin(); BB != F->end(); ++BB) {
+    for (auto BI = BB->begin(); BI != BB->end(); BI++) {
+      if (auto Call = dyn_cast<llvm::CallInst>(&(*BI))) {
+        auto func_name = Call->getCalledFunction()->getName().str();
+        if (func_name == "llvm.nvvm.bar.warp.sync") {
+          need_remove.push_back(Call);
+        }
+        if (!intra_warp_loop && (func_name == "llvm.nvvm.barrier0" ||
+                                 func_name == "llvm.nvvm.barrier.sync")) {
+          need_remove.push_back(Call);
+        }
+      }
+    }
+  }
+  for (auto inst : need_remove) {
+    inst->eraseFromParent();
+  }
+}
+
+class InsertWarpLoopPass : public llvm::FunctionPass {
+
+public:
+  static char ID;
+  bool intra_warp_loop;
+  DominatorTree *DT;
+  PostDominatorTree *PDT;
+
+  InsertWarpLoopPass(bool intra_warp = 0)
+      : FunctionPass(ID), intra_warp_loop(intra_warp) {}
+
+  virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const {
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<PostDominatorTreeWrapperPass>();
+  }
+
+  void getParallelRegionBefore(llvm::BasicBlock *B, bool intra_warp_loop,
+                               std::vector<ParallelRegion> &parallel_regions) {
+    ParallelRegion current_region;
+
+    SmallVector<BasicBlock *, 4> pending_blocks;
+    BasicBlock *region_entry_barrier = NULL;
+    BasicBlock *entry = NULL;
+    BasicBlock *exit = B->getSinglePredecessor();
+    for (BasicBlock *Pred : predecessors(B)) {
+      pending_blocks.push_back(Pred);
+    }
+    if (pending_blocks.size() > 1) {
+      // because we have inserted the syncs and split the blocks at them,
+      // if B has several incoming edges, it must be the merge point
+      // of a conditional if. We can safely ignore it
+      // TODO: we have to further check whether this conditional if
+      // is for inter warp or intra warp
+      return;
+    }
+
+    while (!pending_blocks.empty()) {
+      BasicBlock *current = pending_blocks.back();
+      pending_blocks.pop_back();
+
+      // avoid infinite recursion of loops
+      if (current_region.wrapped_block.count(current) != 0) {
+        continue;
+      }
+
+      // If we reach another barrier, this must be the
+      // parallel region entry.
+      bool has_barrier = 0;
+      for (auto i = current->begin(), e = current->end(); i != e; ++i) {
+        if (llvm::CallInst *call_inst =
+                llvm::dyn_cast<llvm::CallInst>(&(*i))) {
+          auto func_name = call_inst->getCalledFunction()->getName().str();
+          if (func_name == "llvm.nvvm.barrier0" ||
+              func_name == "llvm.nvvm.barrier.sync")
+            has_barrier = 1;
+          if (func_name == "llvm.nvvm.bar.warp.sync" && intra_warp_loop)
+            has_barrier = 1;
+        }
+      }
+
+      // if we reach a block which only contains a single conditional branch,
+      // it is the branch point of a barrier-carrying conditional,
+      // so we have to stop here
+      bool is_single_conditional_branch_block = 0;
+      if (auto br = dyn_cast<BranchInst>(current->getTerminator())) {
+        if (br->isConditional()) {
+          if (current->size() == 1) {
+            is_single_conditional_branch_block = 1;
+          } else {
+            // generated when replicating local variables
+            printf("[WARNING] matching single conditional branch by "
+                   "hard-coded block names\n");
+            bool branch_to_intra_init = false;
+            for (unsigned suc = 0; suc < br->getNumSuccessors(); ++suc) {
+              llvm::BasicBlock *entryCandidate = br->getSuccessor(suc);
+              auto block_name = entryCandidate->getName().str();
+              if (find_block_barrier_in_region(current, B)) {
+                if (block_name.find("warp_init") != block_name.npos) {
+                  is_single_conditional_branch_block = 1;
+                  break;
+                }
+              }
+            }
+          }
+        }
+      }
+
+      if (has_barrier || is_single_conditional_branch_block) {
+        if (region_entry_barrier == NULL)
+          region_entry_barrier = current;
+        else if (region_entry_barrier != current) {
+          // this means there is no PR before B, just return
+          return;
+        }
+        continue;
+      }
+
+      // Non-barrier block, this must be on the region.
+      current_region.wrapped_block.insert(current);
+
+      // Add predecessors to pending queue.
+      for (BasicBlock *Pred : predecessors(current)) {
+        pending_blocks.push_back(Pred);
+      }
+    }
+
+    if (current_region.wrapped_block.empty()) {
+      return;
+    }
+
+    // if we do not find an entry node, all predecessor
+    // blocks need not be executed multiple times
+    if (region_entry_barrier == NULL) {
+      return;
+    }
+    // Find the entry node.
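+    // Illustrative example (added for clarity, not in the original pass):
+    // for a CFG of the form
+    //   barrier_block -> body1 -> body2 -> B
+    // the backward walk from B stops at barrier_block, so the region
+    // becomes {body1, body2} with entry = body1 and exit = body2.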
+    assert(region_entry_barrier != NULL);
+    for (unsigned
+             suc = 0,
+             num = region_entry_barrier->getTerminator()->getNumSuccessors();
+         suc < num; ++suc) {
+      llvm::BasicBlock *entryCandidate =
+          region_entry_barrier->getTerminator()->getSuccessor(suc);
+      if (current_region.wrapped_block.count(entryCandidate) == 0)
+        continue;
+      entry = entryCandidate;
+      break;
+    }
+    // delete useless PRs: regions whose blocks contain nothing but a branch
+    if (entry == exit) {
+      if (entry->size() == 1 && isa<llvm::BranchInst>(&(*entry->begin()))) {
+        return;
+      }
+    }
+    bool is_useless = true;
+    auto iter = entry;
+    do {
+      if (iter->size() != 1 || !isa<llvm::BranchInst>(&(*iter->begin()))) {
+        is_useless = false;
+        break;
+      }
+      if (iter->getTerminator()->getNumSuccessors() > 1) {
+        is_useless = false;
+        break;
+      }
+      iter = iter->getTerminator()->getSuccessor(0);
+    } while (iter != exit);
+    if (is_useless) {
+      return;
+    }
+    assert(current_region.wrapped_block.count(entry) != 0);
+    current_region.start_block = entry;
+    current_region.end_block = exit;
+    current_region.successor_block = B;
+    parallel_regions.push_back(current_region);
+  }
+
+  std::vector<ParallelRegion> getParallelRegions(llvm::Function *F,
+                                                 bool intra_warp_loop) {
+    std::vector<ParallelRegion> parallel_regions;
+
+    SmallVector<BasicBlock *, 4> exit_blocks;
+    for (Function::iterator s = F->begin(); s != F->end(); s++) {
+      if (llvm::CallInst *call_inst =
+              llvm::dyn_cast<llvm::CallInst>(&(*s->begin()))) {
+        auto func_name = call_inst->getCalledFunction()->getName().str();
+        if (func_name == "llvm.nvvm.barrier0" ||
+            func_name == "llvm.nvvm.barrier.sync") {
+          exit_blocks.push_back(&(*s));
+        }
+        // when handling the intra-warp loop, we also need to split the
+        // blocks between warp barriers
+        if (intra_warp_loop && func_name == "llvm.nvvm.bar.warp.sync") {
+          exit_blocks.push_back(&(*s));
+        }
+      }
+    }
+
+    // First find all the ParallelRegions in the Function.
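+    // Illustrative sketch (added for clarity): each collected exit block
+    // starts with a barrier, and every region found between two consecutive
+    // barriers is later wrapped by add_warp_loop roughly as
+    //   for (inter_warp_index = 0; inter_warp_index < block_size / 32; ++inter_warp_index)
+    //     for (intra_warp_index = 0; intra_warp_index < 32; ++intra_warp_index)
+    //       region_body();
+    // so a single CPU thread replays the region once per CUDA thread.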
+ while (!exit_blocks.empty()) { + BasicBlock *exit = exit_blocks.back(); + exit_blocks.pop_back(); + getParallelRegionBefore(exit, intra_warp_loop, parallel_regions); + } + return parallel_regions; + } + + virtual bool runOnFunction(Function &F) { + if (!isKernelFunction(F.getParent(), &F)) + return 0; + + DT = &getAnalysis().getDomTree(); + PDT = &getAnalysis().getPostDomTree(); + + // find parallel region we need to wrap + auto parallel_regions = getParallelRegions(&F, intra_warp_loop); + assert(!parallel_regions.empty() && "can not find any parallel regions\n"); + // print_parallel_region(parallel_regions); + add_warp_loop(parallel_regions, intra_warp_loop); + + if (intra_warp_loop) { + handle_local_variable_intra_warp(parallel_regions); + } + remove_barrier(&F, intra_warp_loop); + return 1; + } +}; + +char InsertWarpLoopPass::ID = 0; + +namespace { +static RegisterPass X("insert-warp-loop", + "Insert inter/intra warp loop"); +} // namespace + +bool has_warp_barrier(llvm::Module *M) { + for (auto F = M->begin(); F != M->end(); ++F) + for (auto BB = F->begin(); BB != F->end(); ++BB) { + for (auto BI = BB->begin(); BI != BB->end(); BI++) { + if (auto Call = dyn_cast(BI)) { + auto func_name = Call->getCalledFunction()->getName().str(); + if (func_name == "llvm.nvvm.bar.warp.sync") { + return true; + } + } + } + } + return false; +} + +void insert_warp_loop(llvm::Module *M) { + llvm::legacy::PassManager Passes; + need_nested_loop = has_warp_barrier(M); + // use nested loop only when there are warp-level barrier + if (need_nested_loop) { + bool intra_warp = true; + Passes.add(new InsertWarpLoopPass(intra_warp)); + // insert inter warp loop + Passes.add(new InsertWarpLoopPass(!intra_warp)); + Passes.run(*M); + } else { + bool intra_warp = true; + // only need a single loop, with size=block_size + Passes.add(new InsertWarpLoopPass(intra_warp)); + Passes.run(*M); + // remove all barriers + for (auto F = M->begin(); F != M->end(); ++F) + remove_barrier(dyn_cast(F), false); + } +} diff --git a/compilation/KernelTranslation/lib/memory_hierarchy.cpp b/compilation/KernelTranslation/lib/memory_hierarchy.cpp new file mode 100644 index 0000000..9152500 --- /dev/null +++ b/compilation/KernelTranslation/lib/memory_hierarchy.cpp @@ -0,0 +1,126 @@ +#include "memory_hierarchy.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include +#include +#include +#include +#include +#include + +void mem_share2global(llvm::Module *M) { + LLVMContext *C = &M->getContext(); + llvm::Type *Int32T = Type::getInt32Ty(*C); + llvm::Type *Int64T = Type::getInt64Ty(*C); + llvm::Type *Int8T = Type::getInt8Ty(*C); + + std::map corresponding_global_memory; + std::set need_remove; + std::set need_remove_share_memory; + + // find all share memory and generate corresponding global memory + for (auto I = M->global_begin(), E = M->global_end(); I != E; ++I) { + if (GlobalVariable *share_memory = dyn_cast(I)) { + if (auto PT = dyn_cast(I->getType())) { + unsigned AS = PT->getAddressSpace(); + if (AS == 3) { // find a share memory + need_remove_share_memory.insert(share_memory); + // generate the corresponding global memory variable + auto new_name = "wrapper_global_" + share_memory->getName().str(); + auto element_type = PT->getElementType(); + if (auto array_type = 
dyn_cast(element_type)) { + llvm::GlobalVariable *global_memory = new llvm::GlobalVariable( + *M, array_type, false, llvm::GlobalValue::ExternalLinkage, NULL, + new_name, NULL, llvm::GlobalValue::GeneralDynamicTLSModel, 1); + ConstantAggregateZero *const_array = + ConstantAggregateZero::get(array_type); + global_memory->setInitializer(const_array); + corresponding_global_memory.insert( + std::pair(share_memory, + global_memory)); + } else if (auto int_type = dyn_cast(element_type)) { + auto zero = llvm::ConstantInt::get(int_type, 0, true); + llvm::GlobalVariable *global_memory = new llvm::GlobalVariable( + *M, int_type, false, llvm::GlobalValue::ExternalLinkage, zero, + new_name, NULL, llvm::GlobalValue::GeneralDynamicTLSModel, 0, + false); + corresponding_global_memory.insert( + std::pair(share_memory, + global_memory)); + } else { + assert(0 && "The required Share Memory Type is not supported\n"); + } + } + } + } + } + + for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { + Function *F = &(*i); + for (Function::iterator i = F->begin(), e = F->end(); i != e; ++i) { + BasicBlock *b = &*i; + for (BasicBlock::iterator i = b->begin(), e = b->end(); i != e; ++i) { + if (auto get_element_ptr = dyn_cast(i)) { + auto read_array = get_element_ptr->getPointerOperand(); + if (GlobalVariable *read_share_memory = + dyn_cast(read_array)) { + // find a GetElementPtr which read share memory + if (corresponding_global_memory.find(read_share_memory) != + corresponding_global_memory.end()) { + std::vector Indices; + for (int i = 0; i < get_element_ptr->getNumIndices(); i++) + Indices.push_back(get_element_ptr->getOperand(i + 1)); + + auto new_GEP = GetElementPtrInst::Create( + NULL, // Pointee type + corresponding_global_memory.find(read_share_memory) + ->second, // Alloca + Indices, // Indices + "", get_element_ptr); + // replace all get_element_ptr with new_GEP: + // we can not directly use: + // get_element_ptr->replaceAllUsesWith(new_GEP); + // as get_element_ptr and new_GEP have different return type + llvm::Type *original_type = get_element_ptr->getType(); + auto FormatASC = CastInst::CreatePointerBitCastOrAddrSpaceCast( + new_GEP, original_type, "", get_element_ptr); + get_element_ptr->replaceAllUsesWith(FormatASC); + need_remove.insert(get_element_ptr); + } + } + } else if (auto addr_cast = dyn_cast(i)) { + auto read_array = addr_cast->getOperand(0); + if (GlobalVariable *read_share_memory = + dyn_cast(read_array)) { + // find a GetElementPtr which read share memory + if (corresponding_global_memory.find(read_share_memory) != + corresponding_global_memory.end()) { + llvm::Type *original_type = addr_cast->getType(); + auto FormatASC = CastInst::CreatePointerBitCastOrAddrSpaceCast( + corresponding_global_memory.find(read_share_memory)->second, + original_type, "", addr_cast); + addr_cast->replaceAllUsesWith(FormatASC); + need_remove.insert(addr_cast); + } + } + } + } + } + } + + for (auto i : need_remove) { + i->dropAllReferences(); + i->eraseFromParent(); + } + for (auto i : need_remove_share_memory) { + i->dropAllReferences(); + i->eraseFromParent(); + } +} diff --git a/compilation/KernelTranslation/lib/performance.cpp b/compilation/KernelTranslation/lib/performance.cpp new file mode 100644 index 0000000..11d30e4 --- /dev/null +++ b/compilation/KernelTranslation/lib/performance.cpp @@ -0,0 +1,88 @@ +#include "performance.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Triple.h" +#include "llvm/Analysis/LoopInfo.h" +#include 
"llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/PostDominators.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/ValueSymbolTable.h" +#include "llvm/InitializePasses.h" +#include "llvm/PassInfo.h" +#include "llvm/PassRegistry.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Transforms/IPO/PassManagerBuilder.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include +#include +#include +#include +#include + +using namespace llvm; + +void performance_optimization(llvm::Module *M) { + for (auto F = M->begin(); F != M->end(); F++) { + for (auto I = F->arg_begin(); I != F->arg_end(); ++I) { + if (I->getType()->isPointerTy()) { + I->addAttr(llvm::Attribute::NoAlias); + } + } + } + + llvm::legacy::PassManager Passes; + + // add target machine info + llvm::Triple triple("x86_64-unknown-linux-gnu"); + + std::string Error; + const Target *TheTarget = TargetRegistry::lookupTarget("", triple, Error); + if (!TheTarget) { + printf("Error: %s\n", Error.c_str()); + assert(0); + } + llvm::TargetOptions Options; + Options.FloatABIType = FloatABI::Hard; + + TargetMachine *TM = TheTarget->createTargetMachine( + triple.getTriple(), llvm::sys::getHostCPUName().str(), StringRef("+m,+f"), + Options, Reloc::PIC_, CodeModel::Small, CodeGenOpt::Aggressive); + assert(TM && "No Machine Information\n"); + + Passes.add(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis())); + + TargetLibraryInfoImpl TLII(triple); + TLII.disableAllFunctions(); + Passes.add(new TargetLibraryInfoWrapperPass(TLII)); + + // Add O3 optimization + llvm::PassManagerBuilder Builder; + Builder.OptLevel = 3; + Builder.SizeLevel = 0; + + Builder.LoopVectorize = true; + Builder.SLPVectorize = true; + + Builder.VerifyInput = true; + Builder.VerifyOutput = true; + + Builder.populateModulePassManager(Passes); + Passes.run(*M); +} diff --git a/compilation/KernelTranslation/lib/tool.cpp b/compilation/KernelTranslation/lib/tool.cpp new file mode 100644 index 0000000..c3c379e --- /dev/null +++ b/compilation/KernelTranslation/lib/tool.cpp @@ -0,0 +1,480 @@ +#include "tool.h" +#include "llvm/Bitcode/BitcodeWriter.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Verifier.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/Support/ToolOutputFile.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include +#include + +using namespace llvm; + +llvm::Module *LoadModuleFromFilr(char *file_name) { + llvm::SMDiagnostic Err; + llvm::LLVMContext *globalContext = new llvm::LLVMContext; + auto program = parseIRFile(file_name, Err, *globalContext).release(); + if (!program) { + printf("error when opening the bitcode\n"); + exit(1); + } + return program; +} + 
+void VerifyModule(llvm::Module *program) { + std::string msg; + llvm::raw_string_ostream os(msg); + if (llvm::verifyModule(*program, &(llvm::errs()))) + llvm::report_fatal_error(os.str().c_str()); +} + +void DumpModule(llvm::Module *M, char *file_name) { + // modify the program, add a wrapper + std::string msg; + llvm::raw_string_ostream os(msg); + std::error_code EC; + ToolOutputFile Out(file_name, EC, sys::fs::F_None); + if (EC) { + errs() << "Fails to open output file: " << EC.message(); + return; + } + WriteBitcodeToFile(*M, Out.os()); + Out.keep(); +} + +bool isKernelFunction(llvm::Module *M, llvm::Function *F) { + NamedMDNode *NamedMD = M->getNamedMetadata("nvvm.annotations"); + if (!NamedMD) { + printf("there must be nvvm.annotations!\n"); + exit(1); + } + for (unsigned I = 0, E = NamedMD->getNumOperands(); I != E; ++I) { + MDNode *MD = NamedMD->getOperand(I); + if (!MD || MD->getNumOperands() == 0) + continue; + if (MD->getNumOperands() != 3) + continue; + Metadata *Op = MD->getOperand(1); + if (auto Str = llvm::cast(Op)) { + if (Str->getString().str() != "kernel") + continue; + llvm::Value *meta = + dyn_cast(MD->getOperand(0))->getValue(); + Function *FF = llvm::cast(meta); + if (FF->getName().str() == F->getName().str()) + return true; + } + } + return false; +} + +void replace_block(llvm::Function *F, llvm::BasicBlock *before, + llvm::BasicBlock *after) { + for (Function::iterator i = F->begin(); i != F->end(); ++i) { + llvm::BasicBlock *bb = &(*i); + if (bb == after) + continue; + bb->getTerminator()->replaceUsesOfWith(before, after); + } +} + +llvm::CallInst *CreateInterWarpBarrier(llvm::Instruction *InsertBefore) { + llvm::Module *M = InsertBefore->getParent()->getParent()->getParent(); + + llvm::FunctionType *LauncherFuncT = + FunctionType::get(llvm::Type::getVoidTy(M->getContext()), {}, false); + + llvm::FunctionCallee f = + M->getOrInsertFunction("llvm.nvvm.barrier0", LauncherFuncT); + llvm::Function *F = llvm::cast(f.getCallee()); + return llvm::CallInst::Create(F, "", InsertBefore); +} + +llvm::CallInst *CreateIntraWarpBarrier(llvm::Instruction *InsertBefore) { + llvm::Module *M = InsertBefore->getParent()->getParent()->getParent(); + llvm::FunctionType *LauncherFuncT = + FunctionType::get(llvm::Type::getVoidTy(M->getContext()), {}, false); + llvm::FunctionCallee f = + M->getOrInsertFunction("llvm.nvvm.bar.warp.sync", LauncherFuncT); + llvm::Function *F = llvm::cast(f.getCallee()); + return llvm::CallInst::Create(F, "", InsertBefore); +} + +llvm::Instruction *BreakPHIToAllocas(PHINode *phi) { + + std::string allocaName = std::string(phi->getName().str()) + ".ex_phi"; + + llvm::Function *function = phi->getParent()->getParent(); + + IRBuilder<> builder(&*(function->getEntryBlock().getFirstInsertionPt())); + + llvm::Instruction *alloca = + builder.CreateAlloca(phi->getType(), 0, allocaName); + + for (unsigned incoming = 0; incoming < phi->getNumIncomingValues(); + ++incoming) { + Value *val = phi->getIncomingValue(incoming); + BasicBlock *incomingBB = phi->getIncomingBlock(incoming); + builder.SetInsertPoint(incomingBB->getTerminator()); + llvm::Instruction *store = builder.CreateStore(val, alloca); + } + builder.SetInsertPoint(phi); + + llvm::Instruction *loadedValue = builder.CreateLoad(alloca); + phi->replaceAllUsesWith(loadedValue); + phi->eraseFromParent(); + + return loadedValue; +} + +void phi2alloc(llvm::Module *M) { + for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { + Function *F = &(*i); + auto func_name = F->getName().str(); + if 
(!isKernelFunction(M, F)) + continue; + + typedef std::vector InstructionVec; + + InstructionVec PHIs; + + for (Function::iterator bb = F->begin(); bb != F->end(); ++bb) { + for (BasicBlock::iterator p = bb->begin(); p != bb->end(); ++p) { + Instruction *instr = &*p; + if (isa(instr)) { + PHIs.push_back(instr); + } + } + } + + bool changed = false; + for (InstructionVec::iterator i = PHIs.begin(); i != PHIs.end(); ++i) { + Instruction *instr = *i; + BreakPHIToAllocas(dyn_cast(instr)); + } + } +} + +void remove_cuda_built_in(llvm::Module *M) { + // initialize function name + std::set useless_func_name; + useless_func_name.insert("cudaMalloc"); + useless_func_name.insert("cudaFuncGetAttributes"); + useless_func_name.insert("cudaGetDevice"); + useless_func_name.insert("cudaDeviceGetAttribute"); + useless_func_name.insert("cudaOccupancyMaxActiveBlocksPerMultiprocessor"); + useless_func_name.insert( + "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags"); + + std::set need_remove; + for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { + Function *F = &(*i); + auto func_name = F->getName().str(); + if (useless_func_name.find(func_name) != useless_func_name.end()) { + need_remove.insert(F); + } + } + for (auto F : need_remove) { + F->dropAllReferences(); + F->eraseFromParent(); + } +} + +void replace_built_in_function(llvm::Module *M, int *grid_dim, int *block_dim) { + LLVMContext &context = M->getContext(); + auto I32 = llvm::Type::getInt32Ty(context); + std::vector need_remove; + for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { + Function *F = &(*i); + auto func_name = F->getName().str(); + if (!isKernelFunction(M, F)) + continue; + + IRBuilder<> builder(&*(F->getEntryBlock().getFirstInsertionPt())); + auto global_intra_warp_idx = + F->getParent()->getGlobalVariable("intra_warp_index"); + auto local_intra_warp_idx = + builder.CreateAlloca(global_intra_warp_idx->getType()->getElementType(), + 0, "local_intra_warp_idx"); + global_intra_warp_idx->replaceAllUsesWith(local_intra_warp_idx); + auto global_inter_warp_idx = + F->getParent()->getGlobalVariable("inter_warp_index"); + auto local_inter_warp_idx = + builder.CreateAlloca(global_inter_warp_idx->getType()->getElementType(), + 0, "local_inter_warp_idx"); + global_inter_warp_idx->replaceAllUsesWith(local_inter_warp_idx); + + for (auto BB = F->begin(); BB != F->end(); ++BB) { + for (auto BI = BB->begin(); BI != BB->end(); BI++) { + if (auto Load = dyn_cast(BI)) { + auto load_from = Load->getOperand(0); + if (load_from == F->getParent()->getGlobalVariable("block_size")) { + Load->replaceAllUsesWith(ConstantInt::get( + I32, block_dim[0] * block_dim[1] * block_dim[2])); + need_remove.push_back(Load); + } + } else if (auto Call = dyn_cast(BI)) { + if (Call->getCalledFunction()) { + auto func_name = Call->getCalledFunction()->getName().str(); + if (func_name == "llvm.nvvm.read.ptx.sreg.tid.x") { + // replace it by warp_id + IRBuilder<> builder(context); + builder.SetInsertPoint(Call); + + auto thread_idx = builder.CreateBinOp( + Instruction::Mul, builder.CreateLoad(local_inter_warp_idx), + ConstantInt::get(I32, 32), ""); + thread_idx = builder.CreateBinOp( + Instruction::Add, builder.CreateLoad(local_intra_warp_idx), + thread_idx, "thread_idx"); + if (block_dim[1] != 1 || block_dim[2] != 1) { + printf("block y: %d block z: %d\n", block_dim[1], block_dim[2]); + thread_idx = builder.CreateBinOp( + Instruction::SRem, thread_idx, + ConstantInt::get(I32, block_dim[0]), "thread_id_x"); + } + + 
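+            // Worked example (illustrative): with block_dim = (16, 4, 1),
+            // the flat id 37 = inter_warp 1 * 32 + intra_warp 5 yields
+            // tid.x = 37 % 16 = 5, matching the CUDA numbering.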
            Call->replaceAllUsesWith(thread_idx);
+            need_remove.push_back(Call);
+          } else if (func_name == "llvm.nvvm.read.ptx.sreg.tid.y") {
+            // compute the flat thread id, then derive tid.y from it
+            IRBuilder<> builder(context);
+            builder.SetInsertPoint(Call);
+
+            auto thread_idx = builder.CreateBinOp(
+                Instruction::Mul, builder.CreateLoad(local_inter_warp_idx),
+                ConstantInt::get(I32, 32), "");
+            thread_idx = builder.CreateBinOp(
+                Instruction::Add, builder.CreateLoad(local_intra_warp_idx),
+                thread_idx, "thread_idx");
+            // tid.y = tid / block_dim.x
+            thread_idx = builder.CreateBinOp(
+                Instruction::SDiv, thread_idx,
+                ConstantInt::get(I32, block_dim[0]),
+                // builder.CreateLoad(M->getGlobalVariable("block_size_x")),
+                "thread_id_y");
+
+            Call->replaceAllUsesWith(thread_idx);
+            need_remove.push_back(Call);
+          } else if (func_name == "llvm.nvvm.read.ptx.sreg.tid.z") {
+            printf("[WARNING] We DO NOT support multi-dim block\n");
+            auto zero = ConstantInt::get(I32, 0);
+            Call->replaceAllUsesWith(zero);
+            need_remove.push_back(Call);
+          } else if (func_name == "llvm.nvvm.read.ptx.sreg.ctaid.x") {
+            auto block_index_addr = M->getGlobalVariable("block_index");
+            IRBuilder<> builder(context);
+            builder.SetInsertPoint(Call);
+            auto block_idx = builder.CreateLoad(block_index_addr);
+            Call->replaceAllUsesWith(block_idx);
+            need_remove.push_back(Call);
+          } else if (func_name == "llvm.nvvm.read.ptx.sreg.ctaid.y" ||
+                     func_name == "llvm.nvvm.read.ptx.sreg.ctaid.z") {
+            printf("[WARNING] We DO NOT support multi-dim grid\n");
+            auto zero = ConstantInt::get(I32, 0);
+            Call->replaceAllUsesWith(zero);
+            need_remove.push_back(Call);
+          } else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.x") {
+            auto block_size_addr = M->getGlobalVariable("block_size_x");
+            IRBuilder<> builder(context);
+            builder.SetInsertPoint(Call);
+            auto block_size = ConstantInt::get(I32, block_dim[0]);
+            Call->replaceAllUsesWith(block_size);
+            need_remove.push_back(Call);
+          } else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.y") {
+            auto block_size_addr = M->getGlobalVariable("block_size_y");
+            IRBuilder<> builder(context);
+            builder.SetInsertPoint(Call);
+            auto block_size = ConstantInt::get(I32, block_dim[1]);
+            Call->replaceAllUsesWith(block_size);
+            need_remove.push_back(Call);
+          } else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.z") {
+            auto block_size_addr = M->getGlobalVariable("block_size_z");
+            IRBuilder<> builder(context);
+            builder.SetInsertPoint(Call);
+            auto block_size = ConstantInt::get(I32, block_dim[2]);
+            Call->replaceAllUsesWith(block_size);
+            need_remove.push_back(Call);
+          } else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.x") {
+            auto grid_size_addr = M->getGlobalVariable("grid_size");
+            IRBuilder<> builder(context);
+            builder.SetInsertPoint(Call);
+            auto grid_size = ConstantInt::get(I32, grid_dim[0]);
+            Call->replaceAllUsesWith(grid_size);
+            need_remove.push_back(Call);
+          } else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.y" ||
+                     func_name == "llvm.nvvm.read.ptx.sreg.nctaid.z") {
+            printf("[WARNING] We DO NOT support multi-dim grid\n");
+            auto one = ConstantInt::get(I32, 1);
+            Call->replaceAllUsesWith(one);
+            need_remove.push_back(Call);
+          }
+        }
+        if (Call->isInlineAsm()) {
+          auto asm_inst = dyn_cast<InlineAsm>(Call->getCalledOperand());
+          if (asm_inst->getAsmString() != "mov.u32 $0, %laneid;") {
+            printf("unknown InlineAsm\n");
+            exit(1);
+          }
+          // return the rank within the warp
+          IRBuilder<> builder(context);
+          builder.SetInsertPoint(Call);
+          auto intra_warp_index = builder.CreateLoad(local_intra_warp_idx);
+          Call->replaceAllUsesWith(intra_warp_index);
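+          // note (illustrative): %laneid is exactly the intra-warp loop
+          // index, since each emulated warp executes its 32 lanes one by one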
need_remove.push_back(Call); + } + } + } + } + } + + for (auto inst : need_remove) { + inst->eraseFromParent(); + } +} + +void replace_asm_call(llvm::Module *M) { + LLVMContext &context = M->getContext(); + auto I32 = llvm::Type::getInt32Ty(context); + std::vector need_remove; + for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { + Function *F = &(*i); + auto func_name = F->getName().str(); + if (!isKernelFunction(M, F)) + continue; + + for (auto BB = F->begin(); BB != F->end(); ++BB) { + for (auto BI = BB->begin(); BI != BB->end(); BI++) { + if (auto Call = dyn_cast(BI)) { + if (Call->isInlineAsm()) { + auto asm_inst = dyn_cast(Call->getCalledOperand()); + if (asm_inst->getAsmString() != "mov.u32 $0, %laneid;") { + printf("unknown InlineAsm\n"); + exit(1); + } + // return the rank within the warp + IRBuilder<> builder(context); + builder.SetInsertPoint(Call); + auto intra_warp_index_addr = + M->getGlobalVariable("intra_warp_index"); + auto intra_warp_index = builder.CreateLoad(intra_warp_index_addr); + Call->replaceAllUsesWith(intra_warp_index); + need_remove.push_back(Call); + } + } + } + } + } + for (auto inst : need_remove) { + inst->eraseFromParent(); + } +} + +bool has_warp_barrier(llvm::BasicBlock *B) { + for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) { + Instruction *inst = &(*i); + llvm::CallInst *Call = llvm::dyn_cast(inst); + if (Call) { + auto func_name = Call->getCalledFunction()->getName().str(); + if (func_name == "llvm.nvvm.bar.warp.sync") { + return true; + } + } + } + return false; +} + +bool has_barrier(llvm::BasicBlock *B) { + for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) { + Instruction *inst = &(*i); + llvm::CallInst *Call = llvm::dyn_cast(inst); + if (Call) { + auto func_name = Call->getCalledFunction()->getName().str(); + if (func_name == "llvm.nvvm.barrier0" || + func_name == "llvm.nvvm.bar.warp.sync" || + func_name == "llvm.nvvm.barrier.sync") { + return true; + } + } + } + return false; +} + +bool has_block_barrier(llvm::BasicBlock *B) { + for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) { + Instruction *inst = &(*i); + llvm::CallInst *Call = llvm::dyn_cast(inst); + if (Call) { + auto func_name = Call->getCalledFunction()->getName().str(); + if (func_name == "llvm.nvvm.barrier0" || + func_name == "llvm.nvvm.barrier.sync") { + return true; + } + } + } + return false; +} + +bool has_barrier(llvm::Function *F) { + for (auto B = F->begin(); B != F->end(); B++) { + if (has_barrier(&(*B))) + return true; + } + return false; +} + +bool find_block_barrier_in_region(llvm::BasicBlock *start, + llvm::BasicBlock *end) { + std::set visit; + std::vector pending_blocks; + for (int i = 0; i < start->getTerminator()->getNumSuccessors(); i++) { + pending_blocks.push_back(start->getTerminator()->getSuccessor(i)); + } + while (!pending_blocks.empty()) { + BasicBlock *current = pending_blocks.back(); + pending_blocks.pop_back(); + if (visit.find(current) != visit.end()) + continue; + visit.insert(current); + if (current == end) + continue; + if (has_block_barrier(current)) { + return 1; + } + for (int i = 0; i < current->getTerminator()->getNumSuccessors(); i++) { + pending_blocks.push_back(current->getTerminator()->getSuccessor(i)); + } + } + return 0; +} + +bool find_barrier_in_region(llvm::BasicBlock *start, llvm::BasicBlock *end) { + std::set visit; + std::vector pending_blocks; + for (int i = 0; i < start->getTerminator()->getNumSuccessors(); i++) { + pending_blocks.push_back(start->getTerminator()->getSuccessor(i)); + } 
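+  // illustrative note: this is an iterative DFS over the CFG successors;
+  // `end` acts as a sink so the search stays inside the start->end region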
+ while (!pending_blocks.empty()) { + BasicBlock *current = pending_blocks.back(); + pending_blocks.pop_back(); + if (visit.find(current) != visit.end()) + continue; + visit.insert(current); + if (current == end) + continue; + if (has_barrier(current)) { + return 1; + } + for (int i = 0; i < current->getTerminator()->getNumSuccessors(); i++) { + pending_blocks.push_back(current->getTerminator()->getSuccessor(i)); + } + } + return 0; +} diff --git a/compilation/KernelTranslation/lib/warp_func.cpp b/compilation/KernelTranslation/lib/warp_func.cpp new file mode 100644 index 0000000..9708d74 --- /dev/null +++ b/compilation/KernelTranslation/lib/warp_func.cpp @@ -0,0 +1,217 @@ + +#include "warp_func.h" +#include "tool.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include +#include + +using namespace llvm; + +/* + * Insert sync before each vote, and replace the + * original vote function to warp vote version + */ +void handle_warp_vote(llvm::Module *M) { + std::set need_replace; + llvm::Type *Int1T = Type::getInt1Ty(M->getContext()); + llvm::Type *I32 = llvm::Type::getInt32Ty(M->getContext()); + llvm::Type *I8 = llvm::Type::getInt8Ty(M->getContext()); + auto zero = llvm::ConstantInt::get(I32, 0, true); + auto one = llvm::ConstantInt::get(I32, 1, true); + llvm::Type *VoteArrayType = llvm::ArrayType::get(I8, 32)->getPointerTo(); + + llvm::FunctionType *LauncherFuncT = + FunctionType::get(Int1T, {VoteArrayType}, false); + llvm::FunctionCallee _f = M->getOrInsertFunction("warp_any", LauncherFuncT); + llvm::Function *func_warp_any = llvm::cast(_f.getCallee()); + _f = M->getOrInsertFunction("warp_all", LauncherFuncT); + llvm::Function *func_warp_all = llvm::cast(_f.getCallee()); + + // replace llvm.nvvm.vote.any.sync to warp vote function + for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { + Function *F = &(*i); + auto func_name = F->getName().str(); + if (!isKernelFunction(M, F)) + continue; + Function::iterator I = F->begin(); + for (Function::iterator E = F->end(); I != E; ++I) { + for (BasicBlock::iterator BI = I->begin(); BI != I->end(); BI++) { + if (CallInst *vote_any_sync = dyn_cast(BI)) { + auto func_name = vote_any_sync->getCalledFunction()->getName(); + if (func_name == "llvm.nvvm.vote.any.sync" || + func_name == "llvm.nvvm.vote.all.sync") { + // insert sync before call + need_replace.insert(vote_any_sync); + } + } + } + } + } + + GlobalVariable *warp_vote_ptr = M->getNamedGlobal("warp_vote"); + for (auto sync_inst : need_replace) { + // create barrier + CreateIntraWarpBarrier(sync_inst); + /* + * store into warp_vote[tid] + */ + assert(warp_vote_ptr != NULL); + auto intra_warp_index_addr = M->getGlobalVariable("intra_warp_index"); + auto intra_warp_index = + new LoadInst(intra_warp_index_addr, "intra_warp_index", sync_inst); + + auto GEP = GetElementPtrInst::Create(NULL, // Pointee type + warp_vote_ptr, // Alloca + {zero, intra_warp_index}, // Indices + "", sync_inst); + + // as AVX only support 8bit for each thread + // so we have to cast the predict into int8 + auto predict = llvm::CastInst::CreateIntegerCast( + sync_inst->getArgOperand(1), I8, false, "", sync_inst); + // we need to concern mask + auto mask = llvm::CastInst::CreateIntegerCast(sync_inst->getArgOperand(0), + I32, false, "", sync_inst); + auto bit_flag = 
BinaryOperator::Create(Instruction::LShr, mask,
+                                           intra_warp_index, "", sync_inst);
+    auto valid =
+        BinaryOperator::Create(Instruction::And, one, bit_flag, "", sync_inst);
+    auto valid_8bit =
+        llvm::CastInst::CreateIntegerCast(valid, I8, false, "", sync_inst);
+
+    llvm::Instruction *res;
+    if (sync_inst->getCalledFunction()->getName() ==
+        "llvm.nvvm.vote.any.sync") {
+      res = BinaryOperator::Create(Instruction::Mul, valid_8bit, predict, "",
+                                   sync_inst);
+    } else if (sync_inst->getCalledFunction()->getName() ==
+               "llvm.nvvm.vote.all.sync") {
+      auto reverse_valid = BinaryOperator::CreateNot(valid_8bit, "", sync_inst);
+      res = BinaryOperator::Create(Instruction::Or, reverse_valid, predict, "",
+                                   sync_inst);
+      // as AVX does not provide an 'all' reduction, we have to
+      // invert the inputs and reuse the AVX 'any' reduction instead
+      res = BinaryOperator::CreateNot(res, "", sync_inst);
+    }
+
+    auto store_mask =
+        new llvm::StoreInst(res, GEP, /*isVolatile=*/true, sync_inst);
+    // create barrier
+    CreateIntraWarpBarrier(sync_inst);
+    /*
+     * replace llvm.nvvm.vote.any.sync(i32 mask, i1 predicate)
+     * with warp_any(i8* warp_vote)
+     */
+    std::vector<llvm::Value *> args;
+    // args.push_back(mask);
+    args.push_back(warp_vote_ptr);
+    llvm::Instruction *warp_inst;
+    if (sync_inst->getCalledFunction()->getName() ==
+        "llvm.nvvm.vote.any.sync") {
+      warp_inst = llvm::CallInst::Create(func_warp_any, args, "", sync_inst);
+    } else if (sync_inst->getCalledFunction()->getName() ==
+               "llvm.nvvm.vote.all.sync") {
+      warp_inst = llvm::CallInst::Create(func_warp_all, args, "", sync_inst);
+    }
+    sync_inst->replaceAllUsesWith(warp_inst);
+    sync_inst->eraseFromParent();
+  }
+}
+
+void handle_warp_shfl(llvm::Module *M) {
+  std::set<llvm::CallInst *> need_replace;
+
+  llvm::Type *I32 = llvm::Type::getInt32Ty(M->getContext());
+  auto ZERO = llvm::ConstantInt::get(I32, 0, true);
+  // collect all warp-shuffle intrinsics that need rewriting
+  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
+    Function *F = &(*i);
+    if (!isKernelFunction(M, F))
+      continue;
+    Function::iterator I = F->begin();
+    for (Function::iterator E = F->end(); I != E; ++I) {
+      for (BasicBlock::iterator BI = I->begin(); BI != I->end(); BI++) {
+        if (CallInst *warp_shfl = dyn_cast<CallInst>(&(*BI))) {
+          auto func_name = warp_shfl->getCalledFunction()->getName();
+          if (func_name == "llvm.nvvm.shfl.sync.down.i32" ||
+              func_name == "llvm.nvvm.shfl.sync.up.i32" ||
+              func_name == "llvm.nvvm.shfl.sync.bfly.i32") {
+            need_replace.insert(warp_shfl);
+          }
+        }
+      }
+    }
+  }
+
+  GlobalVariable *warp_shfl_ptr = M->getNamedGlobal("warp_shfl");
+  for (auto shfl_inst : need_replace) {
+    /*
+     * %10 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add32,
+     *                                                   i32 16, i32 31)
+     * ->
+     * warp_shfl[lane_id] = %add32
+     * warp.barrier()
+     * %10 = warp_shfl[lane_id + offset]
+     */
+    IRBuilder<> builder(shfl_inst);
+
+    auto shfl_variable = shfl_inst->getArgOperand(1);
+    auto shfl_offset = shfl_inst->getArgOperand(2);
+
+    auto intra_warp_index =
+        builder.CreateLoad(M->getGlobalVariable("intra_warp_index"));
+    builder.CreateStore(
+        shfl_variable,
+        builder.CreateGEP(warp_shfl_ptr, {ZERO, intra_warp_index}));
+    // create a barrier before the store, so earlier reads have finished
+    CreateIntraWarpBarrier(intra_warp_index);
+    // load the shuffled data
+    auto new_intra_warp_index =
+        builder.CreateLoad(M->getGlobalVariable("intra_warp_index"));
+    auto shfl_name = shfl_inst->getCalledFunction()->getName().str();
+    if (shfl_name.find("down") != shfl_name.npos) {
+      auto calculate_offset = builder.CreateBinOp(
+          Instruction::Add, new_intra_warp_index,
shfl_offset); + auto new_index = builder.CreateBinOp(Instruction::SRem, calculate_offset, + ConstantInt::get(I32, 32)); + auto gep = builder.CreateGEP(warp_shfl_ptr, {ZERO, new_index}); + auto load_inst = builder.CreateLoad(gep); + + // create barrier + CreateIntraWarpBarrier(new_intra_warp_index); + shfl_inst->replaceAllUsesWith(load_inst); + shfl_inst->eraseFromParent(); + } else if (shfl_name.find("up") != shfl_name.npos) { + auto calculate_offset = builder.CreateBinOp( + Instruction::Sub, new_intra_warp_index, shfl_offset); + auto new_index = builder.CreateBinOp(Instruction::SRem, calculate_offset, + ConstantInt::get(I32, 32)); + auto gep = builder.CreateGEP(warp_shfl_ptr, {ZERO, new_index}); + auto load_inst = builder.CreateLoad(gep); + + // create barrier + CreateIntraWarpBarrier(new_intra_warp_index); + shfl_inst->replaceAllUsesWith(load_inst); + shfl_inst->eraseFromParent(); + } else if (shfl_name.find("bfly") != shfl_name.npos) { + auto calculate_offset = builder.CreateBinOp( + Instruction::Xor, new_intra_warp_index, shfl_offset); + auto new_index = builder.CreateBinOp(Instruction::SRem, calculate_offset, + ConstantInt::get(I32, 32)); + auto gep = builder.CreateGEP(warp_shfl_ptr, {ZERO, new_index}); + auto load_inst = builder.CreateLoad(gep); + + // create barrier + CreateIntraWarpBarrier(new_intra_warp_index); + shfl_inst->replaceAllUsesWith(load_inst); + shfl_inst->eraseFromParent(); + } + } +} diff --git a/compilation/examples/reduce/host.cpp b/compilation/examples/reduce/host.cpp new file mode 100644 index 0000000..297ff17 --- /dev/null +++ b/compilation/examples/reduce/host.cpp @@ -0,0 +1,82 @@ +#include +#include +#include +#include +#include + +#define NUM_WARP 2 +#define NUM_BLOCK 1 + +int block_size = 32 * NUM_WARP; +int block_size_x = block_size; +int block_size_y = 1; +int block_size_z = 1; +__thread int block_index = 0; +int grid_size = NUM_BLOCK; + +extern "C" { +void *_Z7reduce0PiS_j_wrapper(void *); +__thread int warp_shfl[32]; +} + +void *wrap(void *p) { + int **res = (int **)p; + block_index = (*(int *)res[3]); + _Z7reduce0PiS_j_wrapper(p); + return NULL; +} + +void *gen_input(int bid, int *g_idata, int *g_odata, unsigned int n) { + int **ret = new int *[4]; + + int **p0 = new int *; + *p0 = g_idata; + ret[0] = (int *)(p0); + + int **p1 = new int *; + *p1 = g_odata; + ret[1] = (int *)(p1); + + unsigned int *p2 = new unsigned int; + *p2 = n; + ret[2] = (int *)p2; + + int *p3 = new int; + *p3 = bid; + ret[3] = (int *)p3; + + return (void *)ret; +} + +int main(int argc, char *argv[]) { + int *g_idata; + + int size = block_size * NUM_BLOCK; + g_idata = new int[size * 2]; + int *res = new int[size]; + + for (int i = 0; i < size; i++) { + g_idata[i] = i; + } + + pthread_t threads[NUM_BLOCK]; + + void *inp[NUM_BLOCK]; + for (long t = 0; t < NUM_BLOCK; t++) { + inp[t] = gen_input(t, g_idata, res, size); + } + + for (long t = 0; t < NUM_BLOCK; t++) { + pthread_create(&threads[t], NULL, wrap, inp[t]); + } + for (long t = 0; t < NUM_BLOCK; t++) + pthread_join(threads[t], NULL); + int gold = 0; + for (int i = 0; i < size; i++) { + gold += g_idata[i]; + } + assert(*res == gold && "Incorrect res\n"); + printf("PASS\n"); + + pthread_exit(NULL); +} diff --git a/compilation/examples/reduce/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll b/compilation/examples/reduce/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll new file mode 100644 index 0000000..50b112d --- /dev/null +++ b/compilation/examples/reduce/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll @@ -0,0 +1,150 @@ +; ModuleID = 
'kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc' +source_filename = "kernel.cu" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 } + +@_ZZ7reduce0PiS_jE5sdata = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4 + +; Function Attrs: nounwind +define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) local_unnamed_addr #0 { +entry: + ret i32 999 +} + +; Function Attrs: nounwind +define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) local_unnamed_addr #0 { +entry: + ret i32 999 +} + +; Function Attrs: nounwind +define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) local_unnamed_addr #0 { +entry: + ret i32 999 +} + +; Function Attrs: nounwind +define weak dso_local i32 @cudaGetDevice(i32* %device) local_unnamed_addr #0 { +entry: + ret i32 999 +} + +; Function Attrs: nounwind +define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) local_unnamed_addr #0 { +entry: + ret i32 999 +} + +; Function Attrs: nounwind +define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) local_unnamed_addr #0 { +entry: + ret i32 999 +} + +; Function Attrs: convergent nounwind +define dso_local void @_Z7reduce0PiS_j(i32* nocapture readonly %g_idata, i32* nocapture %g_odata, i32 %n) local_unnamed_addr #1 { +entry: + %0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #4, !range !10 + %1 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #4, !range !11 + %2 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #4, !range !12 + %mul = mul i32 %2, %1 + %add = add i32 %mul, %0 + %cmp = icmp ult i32 %add, %n + br i1 %cmp, label %cond.true, label %cond.end + +cond.true: ; preds = %entry + %idxprom = zext i32 %add to i64 + %arrayidx = getelementptr inbounds i32, i32* %g_idata, i64 %idxprom + %3 = load i32, i32* %arrayidx, align 4, !tbaa !13 + br label %cond.end + +cond.end: ; preds = %entry, %cond.true + %cond = phi i32 [ %3, %cond.true ], [ 0, %entry ] + %idxprom5 = zext i32 %0 to i64 + %arrayidx635 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @_ZZ7reduce0PiS_jE5sdata, i64 0, i64 %idxprom5 + %arrayidx6 = addrspacecast i32 addrspace(3)* %arrayidx635 to i32* + store i32 %cond, i32* %arrayidx6, align 4, !tbaa !13 + tail call void @llvm.nvvm.barrier.sync(i32 0) #4 + %cmp839 = icmp ugt i32 %2, 1 + br i1 %cmp839, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %if.end, %cond.end + %cmp18 = icmp eq i32 %0, 0 + br i1 %cmp18, label %if.then19, label %if.end23 + +for.body: ; preds = %cond.end, %if.end + %s.040 = phi i32 [ %mul9, %if.end ], [ 1, %cond.end ] + %mul9 = shl nuw nsw i32 %s.040, 1 + %rem = urem i32 %0, %mul9 + %cmp10 = icmp eq i32 %rem, 0 + br i1 %cmp10, label %if.then, label %if.end + +if.then: ; preds = %for.body + %add11 = add i32 %s.040, %0 + %idxprom12 = zext i32 %add11 to i64 + %arrayidx1336 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @_ZZ7reduce0PiS_jE5sdata, i64 0, i64 %idxprom12 + %arrayidx13 = addrspacecast i32 addrspace(3)* %arrayidx1336 to i32* + %4 = load i32, i32* %arrayidx13, align 4, !tbaa !13 + %5 = load i32, i32* %arrayidx6, align 4, !tbaa !13 + %add16 = add nsw i32 %5, %4 + store i32 %add16, i32* %arrayidx6, align 4, !tbaa !13 + br label %if.end + 
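+; note (added for clarity): this kernel is the classic interleaved-addressing
+; tree reduction: on each pass, threads whose tid is a multiple of 2*s
+; accumulate sdata[tid + s] into sdata[tid], with a barrier between passes.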
+if.end: ; preds = %if.then, %for.body + tail call void @llvm.nvvm.barrier.sync(i32 0) #4 + %cmp8 = icmp ult i32 %mul9, %2 + br i1 %cmp8, label %for.body, label %for.cond.cleanup + +if.then19: ; preds = %for.cond.cleanup + %idxprom21 = zext i32 %1 to i64 + %arrayidx22 = getelementptr inbounds i32, i32* %g_odata, i64 %idxprom21 + %6 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* addrspacecast ([64 x i32] addrspace(3)* @_ZZ7reduce0PiS_jE5sdata to [64 x i32]*), i64 0, i64 0), align 4, !tbaa !13 + store i32 %6, i32* %arrayidx22, align 4, !tbaa !13 + br label %if.end23 + +if.end23: ; preds = %if.then19, %for.cond.cleanup + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2 + +; Function Attrs: convergent nounwind +declare void @llvm.nvvm.barrier.sync(i32) #3 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind readnone } +attributes #3 = { convergent nounwind } +attributes #4 = { nounwind } + +!llvm.module.flags = !{!0, !1, !2} +!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6} +!llvm.ident = !{!8} +!nvvmir.version = !{!9} + +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0} +!3 = !{void (i32*, i32*, i32)* @_Z7reduce0PiS_j, !"kernel", i32 1} +!4 = !{null, !"align", i32 8} +!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!6 = !{null, !"align", i32 16} +!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"} +!9 = !{i32 1, i32 4} +!10 = !{i32 0, i32 1024} +!11 = !{i32 0, i32 2147483647} +!12 = !{i32 1, i32 1025} +!13 = !{!14, !14, i64 0} +!14 = !{!"int", !15, i64 0} +!15 = !{!"omnipotent char", !16, i64 0} +!16 = !{!"Simple C++ TBAA"} diff --git a/compilation/examples/reduce/run.sh b/compilation/examples/reduce/run.sh new file mode 100644 index 0000000..93cd3fd --- /dev/null +++ b/compilation/examples/reduce/run.sh @@ -0,0 +1,6 @@ +#!/bin/bash +llvm-as kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll +../../../build/compilation/kernelTranslator kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc 1 1 1 64 1 1 +llc --filetype=obj kernel.bc +g++ host.cpp kernel.o -lpthread -o test +./test diff --git a/compilation/examples/reduce_shuffle/host.cpp b/compilation/examples/reduce_shuffle/host.cpp new file mode 
100644
index 0000000..41c5ae8
--- /dev/null
+++ b/compilation/examples/reduce_shuffle/host.cpp
@@ -0,0 +1,82 @@
+#include <assert.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#define NUM_WARP 2
+#define NUM_BLOCK 1
+
+int block_size = 32 * NUM_WARP;
+int block_size_x = block_size;
+int block_size_y = 1;
+int block_size_z = 1;
+__thread int block_index = 0;
+int grid_size = NUM_BLOCK;
+
+extern "C" {
+void *_Z7reduce5PiS_j_wrapper(void *);
+__thread int warp_shfl[32];
+}
+
+void *wrap(void *p) {
+  int **res = (int **)p;
+  block_index = (*(int *)res[3]);
+  _Z7reduce5PiS_j_wrapper(p);
+  return NULL;
+}
+
+void *gen_input(int bid, int *g_idata, int *g_odata, unsigned int n) {
+  int **ret = new int *[4];
+
+  int **p0 = new int *;
+  *p0 = g_idata;
+  ret[0] = (int *)(p0);
+
+  int **p1 = new int *;
+  *p1 = g_odata;
+  ret[1] = (int *)(p1);
+
+  unsigned int *p2 = new unsigned int;
+  *p2 = n;
+  ret[2] = (int *)p2;
+
+  int *p3 = new int;
+  *p3 = bid;
+  ret[3] = (int *)p3;
+
+  return (void *)ret;
+}
+
+int main(int argc, char *argv[]) {
+  int *g_idata;
+
+  int size = block_size * NUM_BLOCK;
+  g_idata = new int[size * 2];
+  int *res = new int[size];
+
+  for (int i = 0; i < size; i++) {
+    g_idata[i] = i;
+  }
+
+  pthread_t threads[NUM_BLOCK];
+
+  void *inp[NUM_BLOCK];
+  for (long t = 0; t < NUM_BLOCK; t++) {
+    inp[t] = gen_input(t, g_idata, res, size);
+  }
+
+  for (long t = 0; t < NUM_BLOCK; t++) {
+    pthread_create(&threads[t], NULL, wrap, inp[t]);
+  }
+  for (long t = 0; t < NUM_BLOCK; t++)
+    pthread_join(threads[t], NULL);
+  int gold = 0;
+  for (int i = 0; i < size; i++) {
+    gold += g_idata[i];
+  }
+  assert(*res == gold && "Incorrect res\n");
+  printf("PASS\n");
+
+  pthread_exit(NULL);
+}
diff --git a/compilation/examples/reduce_shuffle/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll b/compilation/examples/reduce_shuffle/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
new file mode 100644
index 0000000..7d056fd
--- /dev/null
+++ b/compilation/examples/reduce_shuffle/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
@@ -0,0 +1,179 @@
+; ModuleID = 'kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc'
+source_filename = "kernel.cu"
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
+
+@_ZZ7reduce5PiS_jE5sdata = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
+
+; Function Attrs: nounwind
+define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) local_unnamed_addr #0 {
+entry:
+  ret i32 999
+}
+
+; Function Attrs: nounwind
+define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) local_unnamed_addr #0 {
+entry:
+  ret i32 999
+}
+
+; Function Attrs: nounwind
+define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) local_unnamed_addr #0 {
+entry:
+  ret i32 999
+}
+
+; Function Attrs: nounwind
+define weak dso_local i32 @cudaGetDevice(i32* %device) local_unnamed_addr #0 {
+entry:
+  ret i32 999
+}
+
+; Function Attrs: nounwind
+define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) local_unnamed_addr #0 {
+entry:
+  ret i32 999
+}
+
+; Function Attrs: nounwind
+define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) local_unnamed_addr #0 {
+entry:
+  ret i32 999
+}
+
+; Function Attrs: convergent nounwind
+define dso_local void @_Z7reduce5PiS_j(i32*
nocapture readonly %g_idata, i32* nocapture %g_odata, i32 %n) local_unnamed_addr #1 { +entry: + %0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #5, !range !10 + %1 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #5, !range !11 + %mul = shl i32 %1, 7 + %add = add i32 %mul, %0 + %cmp = icmp ult i32 %add, %n + br i1 %cmp, label %cond.true, label %cond.end + +cond.true: ; preds = %entry + %idxprom = zext i32 %add to i64 + %arrayidx = getelementptr inbounds i32, i32* %g_idata, i64 %idxprom + %2 = load i32, i32* %arrayidx, align 4, !tbaa !12 + br label %cond.end + +cond.end: ; preds = %entry, %cond.true + %cond = phi i32 [ %2, %cond.true ], [ 0, %entry ] + %add4 = add i32 %add, 64 + %cmp5 = icmp ult i32 %add4, %n + br i1 %cmp5, label %if.then, label %if.end + +if.then: ; preds = %cond.end + %idxprom7 = zext i32 %add4 to i64 + %arrayidx8 = getelementptr inbounds i32, i32* %g_idata, i64 %idxprom7 + %3 = load i32, i32* %arrayidx8, align 4, !tbaa !12 + %add9 = add nsw i32 %3, %cond + br label %if.end + +if.end: ; preds = %if.then, %cond.end + %mySum.0 = phi i32 [ %add9, %if.then ], [ %cond, %cond.end ] + %idxprom10 = zext i32 %0 to i64 + %arrayidx1150 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @_ZZ7reduce5PiS_jE5sdata, i64 0, i64 %idxprom10 + %arrayidx11 = addrspacecast i32 addrspace(3)* %arrayidx1150 to i32* + store i32 %mySum.0, i32* %arrayidx11, align 4, !tbaa !12 + tail call void @llvm.nvvm.barrier.sync(i32 0) #5 + tail call void @llvm.nvvm.barrier.sync(i32 0) #5 + tail call void @llvm.nvvm.barrier.sync(i32 0) #5 + tail call void @llvm.nvvm.barrier.sync(i32 0) #5 + %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.z() #5, !range !16 + %5 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.y() #5, !range !17 + %mul.i.i52 = mul nuw nsw i32 %5, %4 + %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #5, !range !17 + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.y() #5, !range !10 + %mul39.i.i53 = add nuw nsw i32 %7, %mul.i.i52 + %add.i.i54 = mul nuw nsw i32 %mul39.i.i53, %6 + %add8.i.i55 = add nuw nsw i32 %add.i.i54, %0 + %cmp14 = icmp ult i32 %add8.i.i55, 32 + br i1 %cmp14, label %if.then15, label %if.end32 + +if.then15: ; preds = %if.end + %add16 = add nuw nsw i32 %0, 32 + %idxprom17 = zext i32 %add16 to i64 + %arrayidx1851 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @_ZZ7reduce5PiS_jE5sdata, i64 0, i64 %idxprom17 + %arrayidx18 = addrspacecast i32 addrspace(3)* %arrayidx1851 to i32* + %8 = load i32, i32* %arrayidx18, align 4, !tbaa !12 + %add19 = add nsw i32 %8, %mySum.0 + %9 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add19, i32 16, i32 31) #5 + %add23 = add nsw i32 %9, %add19 + %10 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add23, i32 8, i32 31) #5 + %add23.1 = add nsw i32 %10, %add23 + %11 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add23.1, i32 4, i32 31) #5 + %add23.2 = add nsw i32 %11, %add23.1 + %12 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add23.2, i32 2, i32 31) #5 + %add23.3 = add nsw i32 %12, %add23.2 + %13 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add23.3, i32 1, i32 31) #5 + %cmp27 = icmp eq i32 %add8.i.i55, 0 + br i1 %cmp27, label %if.then28, label %if.end32 + +if.then28: ; preds = %if.then15 + %add23.4 = add nsw i32 %13, %add23.3 + %idxprom30 = zext i32 %1 to i64 + %arrayidx31 = getelementptr inbounds i32, i32* %g_odata, i64 %idxprom30 + store i32 %add23.4, i32* %arrayidx31, align 4, !tbaa !12 + br label %if.end32 + +if.end32: ; preds = %if.end, %if.then28, 
%if.then15
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.z() #2
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y() #2
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #2
+
+; Function Attrs: convergent nounwind
+declare void @llvm.nvvm.barrier.sync(i32) #3
+
+; Function Attrs: convergent inaccessiblememonly nounwind
+declare i32 @llvm.nvvm.shfl.sync.down.i32(i32, i32, i32, i32) #4
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone }
+attributes #3 = { convergent nounwind }
+attributes #4 = { convergent inaccessiblememonly nounwind }
+attributes #5 = { nounwind }
+
+!llvm.module.flags = !{!0, !1, !2}
+!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
+!llvm.ident = !{!8}
+!nvvmir.version = !{!9}
+
+!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
+!1 = !{i32 1, !"wchar_size", i32 4}
+!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
+!3 = !{void (i32*, i32*, i32)* @_Z7reduce5PiS_j, !"kernel", i32 1}
+!4 = !{null, !"align", i32 8}
+!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
+!6 = !{null, !"align", i32 16}
+!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
+!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
+!9 = !{i32 1, i32 4}
+!10 = !{i32 0, i32 1024}
+!11 = !{i32 0, i32 2147483647}
+!12 = !{!13, !13, i64 0}
+!13 = !{!"int", !14, i64 0}
+!14 = !{!"omnipotent char", !15, i64 0}
+!15 = !{!"Simple C++ TBAA"}
+!16 = !{i32 0, i32 64}
+!17 = !{i32 1, i32 1025}
diff --git a/compilation/examples/reduce_shuffle/run.sh b/compilation/examples/reduce_shuffle/run.sh
new file mode 100644
index 0000000..93cd3fd
--- /dev/null
+++ b/compilation/examples/reduce_shuffle/run.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+llvm-as kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
+../../../build/compilation/kernelTranslator kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc 1 1 1 64 1 1
+llc --filetype=obj kernel.bc
+g++ host.cpp kernel.o -lpthread -o test
+./test
diff --git a/compilation/examples/run_example.sh b/compilation/examples/run_example.sh
new file mode 100644
index 0000000..8600e28
--- /dev/null
+++ b/compilation/examples/run_example.sh
@@ -0,0 +1,11 @@
+#!/bin/sh
+for file in ./*
+do
+  if test -d "$file"
+  then
+    echo executing "$file"
+    cd "$file"
+    bash run.sh
+    cd ..
+  fi
+done
diff --git a/compilation/examples/vecadd/host.cpp b/compilation/examples/vecadd/host.cpp
new file mode 100644
index 0000000..153d7d5
--- /dev/null
+++ b/compilation/examples/vecadd/host.cpp
@@ -0,0 +1,84 @@
+#include <assert.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#define NUM_BLOCK 1
+int N = 32;
+
+int block_size = 32;
+int block_size_x = block_size;
+int block_size_y = 1;
+int block_size_z = 1;
+__thread int block_index = 0;
+int grid_size = NUM_BLOCK;
+
+extern "C" {
+void *_Z9vectorAddPKfS0_Pfi_wrapper(void *);
+}
+
+void *wrap(void *p) {
+  int **res = (int **)p;
+  block_index = (*(int *)res[4]);
+  _Z9vectorAddPKfS0_Pfi_wrapper(p);
+  return NULL;
+}
+
+void *gen_input(int bid, float *A, float *B, float *C, int N) {
+  int **ret = new int *[5];
+
+  float **p0 = new float *;
+  *p0 = A;
+  ret[0] = (int *)(p0);
+
+  float **p1 = new float *;
+  *p1 = B;
+  ret[1] = (int *)(p1);
+
+  float **p2 = new float *;
+  *p2 = C;
+  ret[2] = (int *)(p2);
+
+  int *p3 = new int;
+  *p3 = N;
+  ret[3] = (int *)p3;
+
+  int *p4 = new int;
+  *p4 = bid;
+  ret[4] = (int *)p4;
+
+  return (void *)ret;
+}
+
+int main() {
+  float *A, *B, *C;
+
+  A = new float[N];
+  B = new float[N];
+  C = new float[N];
+
+  for (int i = 0; i < N; i++) {
+    A[i] = i;
+    B[i] = 1;
+    C[i] = 0;
+  }
+
+  pthread_t threads[NUM_BLOCK];
+
+  int rc;
+  for (long t = 0; t < NUM_BLOCK; t++) {
+    void *inp = gen_input(t, A, B, C, N);
+    rc = pthread_create(&threads[t], NULL, wrap, inp);
+  }
+  clock_t t1 = clock();
+  /* wait for all launched blocks to finish */
+  for (long t = 0; t < NUM_BLOCK; t++)
+    pthread_join(threads[t], NULL);
+
+  for (int i = 0; i < N; i++) {
+    assert(C[i] == (A[i] + B[i]));
+  }
+  printf("PASS\n");
+  pthread_exit(NULL);
+}
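+// Conventions shared by all three examples (illustrative note, not part of
+// the generated code): kernelTranslator emits a <kernel>_wrapper symbol
+// that consumes the argument array built by gen_input, one slot per kernel
+// parameter with the block id appended last, and the thread-local
+// block_index tells the translated kernel which CUDA block a pthread is
+// executing. In each run.sh, the six integers passed to kernelTranslator
+// are the launch configuration: gridDim x y z, then blockDim x y z
+// (e.g. 1 1 1 32 1 1 here).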
diff --git a/compilation/examples/vecadd/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll b/compilation/examples/vecadd/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
new file mode 100644
index 0000000..57d6b64
--- /dev/null
+++ b/compilation/examples/vecadd/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
@@ -0,0 +1,86 @@
+; ModuleID = 'kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc'
+source_filename = "kernel.cu"
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
+
+; Function Attrs: nounwind
+define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) local_unnamed_addr #0 {
+entry:
+  ret i32 999
+}
+
+; Function Attrs: nounwind
+define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) local_unnamed_addr #0 {
+entry:
+  ret i32 999
+}
+
+; Function Attrs: nounwind
+define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) local_unnamed_addr #0 {
+entry:
+  ret i32 999
+}
+
+; Function Attrs: nounwind
+define weak dso_local i32 @cudaGetDevice(i32* %device) local_unnamed_addr #0 {
+entry:
+  ret i32 999
+}
+
+; Function Attrs: nounwind
+define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) local_unnamed_addr #0 {
+entry:
+  ret i32 999
+}
+
+; Function Attrs: nounwind
+define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) local_unnamed_addr #0 {
+entry:
+  ret i32 999
+}
+
+; Function Attrs: nofree nounwind
+define dso_local void @_Z9vectorAddPKfS0_Pfi(float* nocapture readonly %A, float* nocapture readonly %B, float* nocapture %C, i32 %numElements) local_unnamed_addr #1 {
+entry:
+  %0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3, !range !10
+  %idxprom8 = zext i32 %0 to i64
+  %arrayidx = getelementptr inbounds float, float* %A, i64 %idxprom8
+  %1 = load float, float* %arrayidx, align 4, !tbaa !11
+  %arrayidx2 = getelementptr inbounds float, float* %B, i64 %idxprom8
+  %2 = load float, float* %arrayidx2, align 4, !tbaa !11
+  %add = fadd contract float %1, %2
+  %arrayidx4 = getelementptr inbounds float, float* %C, i64 %idxprom8
+  store float %add, float* %arrayidx4, align 4, !tbaa !11
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nofree nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone }
+attributes #3 = { nounwind }
+
+!llvm.module.flags = !{!0, !1, !2}
+!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
+!llvm.ident = !{!8}
+!nvvmir.version = !{!9}
+
+!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
+!1 = !{i32 1, !"wchar_size", i32 4}
+!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
+!3 = !{void (float*, float*, float*, i32)* @_Z9vectorAddPKfS0_Pfi, !"kernel", i32 1}
+!4 = !{null, !"align", i32 8}
+!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
+!6 = !{null, !"align", i32 16}
+!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
+!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
+!9 = !{i32 1, i32 4}
+!10 = !{i32 0, i32 1024}
+!11 = !{!12, !12, i64 0}
+!12 = !{!"float", !13, i64 0}
+!13 = !{!"omnipotent char", !14, i64 0}
+!14 = !{!"Simple C++ TBAA"}
diff --git a/compilation/examples/vecadd/run.sh b/compilation/examples/vecadd/run.sh
new file mode 100644
index 0000000..973a99c
--- /dev/null
+++ b/compilation/examples/vecadd/run.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+llvm-as kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
+../../../build/compilation/kernelTranslator kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc 1 1 1 32 1 1
+llc --filetype=obj kernel.bc
+g++ host.cpp kernel.o -lpthread -o test
+./test
diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt
new file mode 100644
index 0000000..31591ae
--- /dev/null
+++ b/runtime/CMakeLists.txt
@@ -0,0 +1,16 @@
+cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
+project(
+  X86runtime
+  DESCRIPTION "Implementation of the CUDA runtime API on x86"
+  LANGUAGES CXX)
+set(LIB_NAME x86Runtime)
+set(CMAKE_VERBOSE_MAKEFILE ON)
+
+# compile threadPool implementation
+add_subdirectory(threadPool)
+
+# compile x86 runtime library
+include_directories(./include)
+include_directories(./threadPool/include)
+file(GLOB proj_SOURCES "lib/*.cpp")
+add_library(${LIB_NAME} SHARED ${proj_SOURCES})
+target_link_libraries(${LIB_NAME} threadPool pthread)
diff --git a/runtime/include/cudaRuntimeImpl.h b/runtime/include/cudaRuntimeImpl.h
new file mode 100644
index 0000000..0f5b8ae
--- /dev/null
+++ b/runtime/include/cudaRuntimeImpl.h
@@ -0,0 +1,19 @@
+#ifndef __RUNTIME_IMPL__
+#define __RUNTIME_IMPL__
+#include "cudaStatus.h"
+#include "structures.h"
+cudaError_t cudaDeviceReset(void);
+cudaError_t cudaDeviceSynchronize(void);
+cudaError_t cudaFree(void *devPtr);
+cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
+                             void **args, size_t sharedMem,
+                             cudaStream_t stream);
+cudaError_t cudaMalloc(void **devPtr, size_t size);
+cudaError_t cudaMemcpy(void *dst, const void *src, size_t count,
+                       cudaMemcpyKind kind);
+cudaError_t cudaSetDevice(int device);
+cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src);
+cudaError_t cudaStreamCreate(cudaStream_t *pStream);
+cudaError_t cudaStreamDestroy(cudaStream_t stream);
+cudaError_t cudaStreamSynchronize(cudaStream_t stream);
+#endif
diff --git a/runtime/include/cudaStatus.h b/runtime/include/cudaStatus.h
new file mode 100644
index 0000000..8b60edc
--- /dev/null
+++ b/runtime/include/cudaStatus.h
@@ -0,0 +1,18 @@
+#ifndef __RUNTIME_STATUS__
+#define __RUNTIME_STATUS__
+#include <stddef.h>
+
+enum cudaError_t {
+  cudaSuccess = 0,
+  cudaErrorInvalidValue = 1,
+  cudaErrorMemoryAllocation = 2,
+};
+
+enum cudaMemcpyKind {
+  cudaMemcpyHostToHost = 0,
+  cudaMemcpyHostToDevice = 1,
+  cudaMemcpyDeviceToHost = 2,
+  cudaMemcpyDeviceToDevice = 3,
+  cudaMemcpyDefault = 4,
+};
+#endif
diff --git a/runtime/lib/cudaRuntimeImpl.cpp b/runtime/lib/cudaRuntimeImpl.cpp
new file mode 100644
index 0000000..83054f3
--- /dev/null
+++ b/runtime/lib/cudaRuntimeImpl.cpp
@@ -0,0 +1,100 @@
+#include "cudaRuntimeImpl.h"
+#include "api.h"
+#include "def.h"
+#include "macros.h"
+#include <stdlib.h>
+#include <string.h>
+cudaError_t cudaDeviceReset(void) {
+  scheduler_uninit();
+  return cudaSuccess;
+}
+cudaError_t cudaDeviceSynchronize(void) {
+  cuSynchronizeBarrier();
+  return cudaSuccess;
+}
+cudaError_t cudaFree(void *devPtr) {
+  free(devPtr);
+  return cudaSuccess;
+}
+cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
+                             void **args, size_t sharedMem,
+                             cudaStream_t stream) {
+  // if scheduler is null init device
+
+  cu_kernel *ker =
+      create_kernel(func, gridDim, blockDim, &args, sharedMem, stream);
+
+  int lstatus = cuLaunchKernel(&ker);
+  return lstatus == C_SUCCESS ? cudaSuccess : cudaErrorInvalidValue;
+}
+cudaError_t cudaMalloc(void **devPtr, size_t size) {
+  *devPtr = malloc(size);
+  if (*devPtr == NULL)
+    return cudaErrorMemoryAllocation;
+  return cudaSuccess;
+}
+cudaError_t cudaMemcpy(void *dst, const void *src, size_t count,
+                       cudaMemcpyKind kind) {
+  // the host and the emulated device share a single address space, so
+  // every direction reduces to a plain memcpy
+  if (kind == cudaMemcpyHostToHost) {
+    memcpy(dst, src, count);
+  } else if (kind == cudaMemcpyHostToDevice) {
+    memcpy(dst, src, count);
+  } else if (kind == cudaMemcpyDeviceToHost) {
+    memcpy(dst, src, count);
+  } else if (kind == cudaMemcpyDeviceToDevice) {
+    memcpy(dst, src, count);
+  } else if (kind == cudaMemcpyDefault) {
+    memcpy(dst, src, count);
+  }
+  return cudaSuccess;
+}
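+
+// Usage sketch (illustrative only; `my_kernel_wrapper`, `args`, `h_a` and
+// `n` are hypothetical names, not symbols defined in this patch): a
+// translated host program is expected to drive the runtime like this:
+//   float *d_a;
+//   cudaMalloc((void **)&d_a, n * sizeof(float));
+//   cudaMemcpy(d_a, h_a, n * sizeof(float), cudaMemcpyHostToDevice);
+//   cudaLaunchKernel((const void *)my_kernel_wrapper, dim3(numBlocks),
+//                    dim3(blockSize), args, 0, 0);
+//   cudaDeviceSynchronize();
+//   cudaMemcpy(h_a, d_a, n * sizeof(float), cudaMemcpyDeviceToHost);
+//   cudaFree(d_a);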
+
+cudaError_t cudaSetDevice(int device) {
+  // error checking
+  init_device();
+  return cudaSuccess;
+}
+
+cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src) {
+  cstreamData *dst_stream = (cstreamData *)dst;
+  cstreamData *src_stream = (cstreamData *)src;
+
+  if (dst_stream == NULL || src_stream == NULL) {
+    return cudaErrorInvalidValue; // 1
+  }
+
+  dst_stream->stream_priority = src_stream->stream_priority;
+  dst_stream->stream_flags = src_stream->stream_flags;
+
+  return cudaSuccess; // 0
+}
+
+static unsigned int stream_counter = 0;
+
+cudaError_t cudaStreamCreate(cudaStream_t *pStream) {
+  cstreamData *s = (cstreamData *)calloc(1, sizeof(cstreamData));
+  if (s == NULL)
+    return cudaErrorMemoryAllocation;
+  s->ev.status = C_RUN;
+  s->id = stream_counter;
+  stream_counter++;
+  s->stream_priority = DEFAULT;
+  create_KernelQueue(&(s->kernelQueue));
+
+  INIT_LOCK(s->stream_lock);
+  *pStream = (cudaStream_t)(s);
+
+  return cudaSuccess;
+}
+
+cudaError_t cudaStreamDestroy(cudaStream_t stream) {
+  cstreamData *s = (cstreamData *)(stream);
+
+  free(s->kernelQueue);
+
+  DESTROY_LOCK(s->stream_lock);
+
+  free(s);
+
+  return cudaSuccess;
+}
+
+cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
+  cstreamData *e = ((cstreamData *)(stream));
+  MUTEX_LOCK(e->stream_lock);
+
+  e->ev.status = C_SYNCHRONIZE;
+  e->ev.numKernelsToWait = e->kernelQueue->waiting_count;
+  MUTEX_UNLOCK(e->stream_lock);
+  return cudaSuccess;
+}
diff --git a/runtime/threadPool/CMakeLists.txt b/runtime/threadPool/CMakeLists.txt
new file mode 100644
index 0000000..ed599bf
--- /dev/null
+++ b/runtime/threadPool/CMakeLists.txt
@@ -0,0 +1,17 @@
+cmake_minimum_required(VERSION 3.1)
+
+# C++ project
+project(
+  ThreadPool
+  DESCRIPTION "A thread pool implemented with pthreads"
+  LANGUAGES CXX)
+
+set(CMAKE_VERBOSE_MAKEFILE ON)
+set(LIB_NAME threadPool)
+
+set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_BUILD_TYPE Debug)
+include_directories(./include)
+
+file(GLOB proj_SOURCES "lib/*.cpp")
+add_library(${LIB_NAME} SHARED ${proj_SOURCES})
diff --git a/runtime/threadPool/include/api.h b/runtime/threadPool/include/api.h
new file mode 100644
index 0000000..0c0aad6
--- /dev/null
+++ b/runtime/threadPool/include/api.h
@@ -0,0 +1,25 @@
+#ifndef C_API_H
+#define C_API_H
+
+#include "structures.h"
+
+int init_device();
+cu_kernel *create_kernel(const void *func, dim3 gridDim, dim3 blockDim,
+                         void ***args, size_t sharedMem, cstreamData *stream);
+int getWorkItem(struct kernel_queue **qu, cu_kernel **kern, int blockId);
+int create_KernelQueue(kernel_queue **q);
+
+int dequeKernelLL(struct kernel_queue **qu);
+
+int dequeKernel(struct kernel_queue **qu, cu_kernel *ker);
+int enqueueKernel(struct kernel_queue **qu, cu_kernel **ker);
+
+int schedulerEnqueueKernel(cu_kernel **k);
+int cuLaunchKernel(cu_kernel **k);
+
+int scheduler_init(cu_device device);
+void scheduler_uninit();
+void cuSynchronizeBarrier();
+
+int set_kernel_arguments(cu_kernel **k, unsigned int arg_num, void **arg_value);
+
+int setKernelDimensions(cu_kernel *k, struct argument **arg,
+                        void **totalBlockSize, void *blockId);
+
+#endif
diff --git a/runtime/threadPool/include/def.h b/runtime/threadPool/include/def.h
new file mode 100644
index 0000000..259733e
--- /dev/null
+++ b/runtime/threadPool/include/def.h
@@ -0,0 +1,26 @@
+#ifndef C_DEF_H
+#define C_DEF_H
+
+// Error
+#define C_SUCCESS 0x0
+#define C_ERROR 0x1
+
+// execution status
+#define C_COMPLETE 0x2
+#define C_RUNNING 0x3
+#define C_SUBMITTED 0x4
+#define C_QUEUED 0x5
+#define C_CREATED 0x9
+
+// stream status
+#define C_RUN 0x1
+#define C_WAIT 0x2
+#define C_SYNCHRONIZE 0x3
+
+// Not Initialized Error
+#define C_ERROR_NOT_INITIALIZED 0x6
+#define C_ERROR_MEMALLOC 0x7
+
+#define C_QUEUE_EMPTY 0x8
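+
+// Intended lifecycle (illustrative; this patch only sets C_COMPLETE
+// explicitly): create_kernel -> C_CREATED, enqueueKernel -> C_QUEUED,
+// a worker claiming blocks -> C_RUNNING, last block done -> C_COMPLETE.
+// A stream moves between C_RUN and C_WAIT while kernels are in flight,
+// and sits in C_SYNCHRONIZE while a synchronize drains its queue.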
+
+#endif
diff --git a/runtime/threadPool/include/macros.h b/runtime/threadPool/include/macros.h
new file mode 100644
index 0000000..b047c28
--- /dev/null
+++ b/runtime/threadPool/include/macros.h
@@ -0,0 +1,38 @@
+#ifndef C_MACROS_H
+#define C_MACROS_H
+#include "assert.h"
+#include <pthread.h>
+
+#define INIT_LOCK(__LOCK__)                                                    \
+  do {                                                                         \
+    int r = pthread_mutex_init(&(__LOCK__), NULL);                             \
+    assert(r == 0);                                                            \
+  } while (0)
+
+#define MUTEX_LOCK(__LOCK__)                                                   \
+  do {                                                                         \
+    int r = pthread_mutex_lock(&(__LOCK__));                                   \
+    assert(r == 0);                                                            \
+  } while (0)
+
+#define MUTEX_UNLOCK(__LOCK__)                                                 \
+  do {                                                                         \
+    int r = pthread_mutex_unlock(&(__LOCK__));                                 \
+    assert(r == 0);                                                            \
+  } while (0)
+
+#define DESTROY_LOCK(__LOCK__)                                                 \
+  do {                                                                         \
+    int r = pthread_mutex_destroy(&(__LOCK__));                                \
+    assert(r == 0);                                                            \
+  } while (0)
+
+#endif // HEADER_FILE
diff --git a/runtime/threadPool/include/structures.h b/runtime/threadPool/include/structures.h
new file mode 100644
index 0000000..d78a034
--- /dev/null
+++ b/runtime/threadPool/include/structures.h
@@ -0,0 +1,191 @@
+#ifndef C_STRUCTURES_H
+#define C_STRUCTURES_H
+
+#include "pthread.h"
+// a cudaStream_t is an opaque handle to a stream object
+#define cudaStream_t cstreamData *
+typedef struct device {
+  int max_compute_units;
+  int device_id;
+} cu_device;
+
+typedef struct c_thread {
+  pthread_t thread;
+  unsigned long executed_commands;
+  unsigned index;
+  bool exit;
+} cu_ptd;
+
+typedef struct scheduler_pool {
+
+  struct c_thread *thread_pool;
+
+  size_t num_worker_threads;
+  size_t local_mem_size;
+  int num_kernel_launch;
+  int num_kernel_finished;
+  int num_kernel_queued;
+  size_t idle_threads;
+
+  pthread_cond_t wake_pool;
+
+  int threadpool_shutdown_requested;
+
+  // lock for scheduler
+  pthread_mutex_t work_queue_lock;
+
+  // C99 array at the end
+  // user kernel queue for only user called functions
+  struct kernel_queue *kernelQueue;
+
+} cu_pool;
+
+struct kernel_queue {
+
+  struct kernel *head;
+  struct kernel *tail;
+
+  // finish command count
+  unsigned long finish_count;
+
+  // waiting to be run on threads
+  unsigned long waiting_count;
+
+  // running count
+  unsigned long running_count;
+
+  // total count
+  unsigned long kernel_count;
+
+  // current index for task to be run
+  unsigned long current_index;
+};
+
+typedef struct command {
+
+  struct kernel *ker;
+
+  struct command *next;
+  struct command *prev;
+
+} cu_command;
+
+typedef struct argument {
+  // size of the argument to allocation
+  size_t size;
+  void *value;
+  unsigned int index;
+} cu_argument;
+
+typedef struct input_arg {
+  // real values for the input
+  char *p;
+  struct argument *argus[];
+  // (TODO): implement meta_data
+  // the type of metadata will need to change to list of ints or something
+  // so that we can parse the arguments p
+} cu_input;
+
+struct dim3 {
+  size_t x;
+  size_t y;
+  size_t z;
+  dim3(int d1) {
+    x = d1;
+    y = z = 1;
+  }
+  dim3() { x = y = z = 1; }
+};
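+
+// dim3 mirrors the CUDA built-in: a bare integer converts to an x-only
+// extent, e.g. (illustrative)
+//   dim3 grid(NUM_BLOCK); // grid.x == NUM_BLOCK, grid.y == grid.z == 1
+//   dim3 one;             // 1 x 1 x 1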
+
+enum StreamType {
+  DEFAULT,
+  LOW,
+  HIGH,
+  EXT,
+};
+
+struct cStreamDataInternal {
+  /*
+  status of the stream (run, wait)
+  Run: the stream asynchronously hands kernels assigned to it to the scheduler
+  Wait: the stream holds further kernels back until the running ones finish
+  */
+  int status;
+  /*
+  if status == wait, wait for the number of outstanding kernels to become 0
+  */
+  unsigned long numKernelsToWait;
+  unsigned int lastKernelIdToWait;
+  unsigned int count; // number of tasks left in the stream
+};
+
+typedef struct streamData {
+
+  // execution status of current event monitor
+  struct cStreamDataInternal ev;
+  pthread_mutex_t stream_lock; // lock on the stream
+  StreamType stream_priority;
+  unsigned int id;
+  unsigned int stream_flags;
+
+  // queue of the kernels in this stream
+  struct kernel_queue *kernelQueue;
+
+} cstreamData;
+// kernel information
+typedef struct kernel {
+
+  void *(*start_routine)(void *);
+
+  void **args;
+
+  dim3 gridDim;
+  dim3 blockDim;
+
+  struct kernel *next;
+  struct kernel *prev;
+
+  size_t shared_mem;
+
+  cstreamData *stream;
+
+  struct event *barrier;
+
+  int status;
+
+  int totalBlocks;
+  int N;
+
+  int blockSize;
+  int kernelId;
+
+  // current blockId
+  int blockId;
+
+  void *shared_mem_loc;
+
+} cu_kernel;
+
+typedef struct asyncKernel {
+  unsigned int numBlocks;
+  unsigned int numThreads;
+  struct event *evt;
+  struct kernel *ker;
+
+  struct asyncKernel *prev;
+  struct asyncKernel *next;
+
+} asyncKernel;
+
+// command queue of command nodes
+
+typedef struct kernel_arg_array {
+  size_t size;
+  unsigned int index;
+} karg_arr;
+
+typedef struct kernel_image_arg {
+  size_t size;
+  unsigned int index;
+} k_arg;
+
+#endif // HEADER_FILE
diff --git a/runtime/threadPool/lib/api.cpp b/runtime/threadPool/lib/api.cpp
new file mode 100644
index 0000000..491d0d7
--- /dev/null
+++ b/runtime/threadPool/lib/api.cpp
@@ -0,0 +1,456 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <thread>
+
+#include "api.h"
+#include "def.h"
+#include "macros.h"
+#include "structures.h"
+
+/*
+Initialize the device
+*/
+int init_device() {
+
+  cu_device *device = (cu_device *)calloc(1, sizeof(cu_device));
+  if (device == NULL)
+    return C_ERROR_MEMALLOC;
+
+  device->max_compute_units = std::thread::hardware_concurrency();
+
+  // initialize scheduler
+  int ret = scheduler_init(*device);
+
+  if (ret != C_SUCCESS)
+    return ret;
+
+  return C_SUCCESS;
+}
+
+/*
+  Create Kernel
+*/
+static int kernelIds = 0;
+cu_kernel *create_kernel(const void *func, dim3 gridDim, dim3 blockDim,
+                         void ***args, size_t sharedMem, cstreamData *stream) {
+  cu_kernel *ker = (cu_kernel *)calloc(1, sizeof(cu_kernel));
+
+  // set the function pointer
+  ker->start_routine = (void *(*)(void *))func;
+  ker->args = *args;
+
+  ker->gridDim = gridDim;
+  ker->blockDim = blockDim;
+
+  ker->shared_mem = sharedMem;
+
+  // dynamic shared memory is allocated from the heap; each worker thread
+  // points at it after the task is submitted
+  ker->shared_mem_loc = calloc(1, sharedMem);
+
+  ker->stream = stream;
+
+  ker->blockId = 0;
+
+  ker->totalBlocks = gridDim.x;
+
+  ker->N = blockDim.x;
+
+  ker->kernelId = kernelIds;
+  kernelIds += 1;
+
+  ker->blockSize = blockDim.x;
+
+  return ker;
+}
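+
+// Illustrative: the launcher passes kernel arguments as an array of
+// pointers, one slot per kernel parameter with the block id appended
+// last -- the same layout the generated host code builds by hand (see
+// compilation/examples/*/host.cpp):
+//   void **args = ...; // args[0..n-1]: &param_i, args[n]: &blockId
+//   cu_kernel *k = create_kernel(func, gridDim, blockDim, &args, shmem, 0);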
+
+/*
+  Create Kernel Queue
+*/
+int create_KernelQueue(kernel_queue **q) {
+  *q = (kernel_queue *)calloc(1, sizeof(kernel_queue));
+
+  if (*q == NULL) {
+    return C_ERROR_MEMALLOC;
+  }
+
+  (*q)->kernel_count = 0;
+  (*q)->running_count = 0;
+  (*q)->waiting_count = 0;
+  (*q)->finish_count = 0;
+  (*q)->current_index = 0;
+
+  return C_SUCCESS;
+}
+
+int dequeKernelLL(struct kernel_queue **qu) {
+
+  struct kernel_queue *q = *qu;
+
+  if (q->head == NULL) {
+    return C_QUEUE_EMPTY;
+  }
+  q->finish_count += 1;
+  //*ker = *(q->head);
+  q->head = (q->head)->next;
+  if (q->head != NULL) {
+    q->head->prev = NULL;
+  }
+
+  return C_SUCCESS;
+}
+
+int enqueueKernel(struct kernel_queue **qu, cu_kernel **ker) {
+  struct kernel_queue *q = *qu;
+  cu_kernel *p = *ker;
+
+  if (q->head == NULL) {
+    q->head = p;
+    q->tail = p;
+  } else {
+    p->prev = q->tail;
+    q->tail->next = p;
+    q->tail = p;
+    p->next = NULL;
+  }
+  q->kernel_count += 1;
+  q->waiting_count += 1;
+
+  // user kernel command
+
+  return C_SUCCESS;
+}
+
+// scheduler
+static cu_pool *scheduler;
+
+__thread int block_index = 0;
+__thread int thread_memory_size = 0;
+
+/*
+  Enqueue Kernel (k) to the scheduler kernelQueue
+*/
+int schedulerEnqueueKernel(cu_kernel **k) {
+  cu_kernel *ker = *k;
+  MUTEX_LOCK(scheduler->work_queue_lock);
+
+  enqueueKernel(&scheduler->kernelQueue, &ker);
+
+  pthread_cond_broadcast(&(scheduler->wake_pool));
+  MUTEX_UNLOCK(scheduler->work_queue_lock);
+  return C_SUCCESS;
+}
+
+/*
+  Kernel Launch with numBlocks and numThreadsPerBlock
+*/
+int cuLaunchKernel(cu_kernel **k) {
+
+  cu_kernel *ker = *k;
+
+  MUTEX_LOCK(scheduler->work_queue_lock);
+  scheduler->num_kernel_queued += 1;
+  MUTEX_UNLOCK(scheduler->work_queue_lock);
+
+  // stream == 0: add to the scheduler kernelQueue directly
+  if (ker->stream == 0) {
+    schedulerEnqueueKernel(&ker);
+  } else {
+    // add to its stream queue; a stream queue can be waiting or running,
+    // with or without tasks
+    MUTEX_LOCK(((cstreamData *)(ker->stream))->stream_lock);
+    cstreamData *e = ((cstreamData *)(ker->stream));
+    // synchronize was called with no job in the queue, so the stream is
+    // stuck on synchronize; release it
+    if (e->ev.status == C_SYNCHRONIZE) {
+      if ((e->kernelQueue->finish_count) == (e->kernelQueue->kernel_count)) {
+        e->ev.status = C_RUN;
+      }
+    }
+
+    if (e->ev.status == C_RUN) {
+      // first kernel for this stream: mark the stream busy and hand the
+      // kernel to the scheduler queue
+      e->ev.status = C_WAIT;
+      MUTEX_UNLOCK(((cstreamData *)(ker->stream))->stream_lock);
+
+      schedulerEnqueueKernel(&ker);
+    } else {
+      // the stream is waiting, so just park the kernel in the stream queue
+      enqueueKernel(&((cstreamData *)(ker->stream))->kernelQueue, &ker);
+      MUTEX_UNLOCK(((cstreamData *)(ker->stream))->stream_lock);
+    }
+  }
+  return C_SUCCESS;
+}
+
+/*
+  Get Work Item: get the kernel from the queue and increment blockId
+  (currently unused; get_work inlines this logic)
+*/
+int getWorkItem(struct kernel_queue **qu, cu_kernel **kern, int blockId) {
+  struct kernel_queue *q = *qu;
+  if (q->waiting_count > 0) {
+    *kern = q->head;
+    cu_kernel *ker = *kern;
+    if (blockId + 1 == q->head->totalBlocks) {
+      // deque the head
+      dequeKernelLL(qu);
+      ker->status = C_COMPLETE;
+      q->waiting_count -= 1;
+    } else {
+      q->head->blockId += 1;
+    }
+  } else {
+    return C_QUEUE_EMPTY;
+  }
+  return C_SUCCESS;
+}
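+
+// Work distribution (illustrative walkthrough): a kernel launched with
+// gridDim.x == 4 stays at the queue head while workers claim blockId
+// 0, 1, 2; the worker that claims the final block (blockId 3) dequeues
+// the kernel and marks it C_COMPLETE, so one launch is executed as
+// totalBlocks independent work items.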
+
+/*
+  Thread Gets Work
+*/
+int get_work(c_thread *th) {
+
+  cu_kernel ker;
+
+  MUTEX_LOCK(scheduler->work_queue_lock);
+
+RETRY:
+
+  int is_exit = 0;
+  int is_command_not_null = 0;
+
+  int blockId;
+  int status = C_QUEUE_EMPTY;
+
+  is_exit = scheduler->threadpool_shutdown_requested;
+
+  MUTEX_UNLOCK(scheduler->work_queue_lock);
+
+  if (!is_exit) {
+
+    MUTEX_LOCK(scheduler->work_queue_lock);
+
+    // if the count of kernels waiting to complete is not zero
+    if (scheduler->kernelQueue->waiting_count > 0) {
+      blockId = scheduler->kernelQueue->head->blockId;
+      // successfully claimed a block
+      status = C_SUCCESS;
+      ker = *(scheduler->kernelQueue->head);
+      // if blockId + 1 equals totalBlocks, this is the last block
+      if (blockId + 1 == scheduler->kernelQueue->head->totalBlocks) {
+        // deque the head
+        dequeKernelLL(&scheduler->kernelQueue);
+
+        ker.status = C_COMPLETE;
+        scheduler->kernelQueue->waiting_count -= 1;
+      } else {
+        // increment the blockId
+        scheduler->kernelQueue->head->blockId =
+            scheduler->kernelQueue->head->blockId + 1;
+      }
+    } else {
+      status = C_QUEUE_EMPTY;
+    }
+    MUTEX_UNLOCK(scheduler->work_queue_lock);
+  }
+
+  if (status != C_QUEUE_EMPTY) {
+
+    block_index = blockId;
+    thread_memory_size = ker.shared_mem;
+    ker.start_routine(ker.args);
+
+    is_command_not_null = 1;
+    if (ker.status == C_COMPLETE) {
+
+      // check if this kernel's stream has more jobs to run (enqueue the
+      // next job)
+      if (ker.stream != NULL) {
+        bool synchronize = false;
+
+        MUTEX_LOCK(((cstreamData *)(ker.stream))->stream_lock);
+
+        if (((cstreamData *)(ker.stream))->ev.status == C_SYNCHRONIZE) {
+          // synchronize stream
+          if (((cstreamData *)(ker.stream))->ev.numKernelsToWait > 0) {
+            ((cstreamData *)(ker.stream))->ev.numKernelsToWait -= 1;
+          }
+          synchronize =
+              ((cstreamData *)(ker.stream))->ev.numKernelsToWait != 0;
+        }
+
+        if (synchronize == false) {
+          if (((cstreamData *)(ker.stream))->kernelQueue->waiting_count > 0) {
+            ((cstreamData *)(ker.stream))->ev.status = C_WAIT;
+
+            MUTEX_UNLOCK(((cstreamData *)(ker.stream))->stream_lock);
+            cu_kernel *kern =
+                ((cstreamData *)(ker.stream))->kernelQueue->head;
+            schedulerEnqueueKernel(&kern);
+            dequeKernelLL(&((cstreamData *)(ker.stream))->kernelQueue);
+
+          } else {
+
+            // switch the stream to run to allow for the next execution
+            ((cstreamData *)(ker.stream))->ev.status = C_RUN;
+
+            MUTEX_UNLOCK(((cstreamData *)(ker.stream))->stream_lock);
+          }
+        } else {
+          MUTEX_UNLOCK(((cstreamData *)(ker.stream))->stream_lock);
+        }
+      }
+      MUTEX_LOCK(scheduler->work_queue_lock);
+      scheduler->num_kernel_finished += 1;
+      MUTEX_UNLOCK(scheduler->work_queue_lock);
+    }
+  }
+
+  MUTEX_LOCK(scheduler->work_queue_lock);
+
+  if ((is_exit == 0 && is_command_not_null == 0)) {
+    // all threads in condition wait
+    scheduler->idle_threads += 1;
+    pthread_cond_wait(&(scheduler->wake_pool), &(scheduler->work_queue_lock));
+    scheduler->idle_threads -= 1;
+    goto RETRY;
+  }
+  MUTEX_UNLOCK(scheduler->work_queue_lock);
+
+  return is_exit;
+}
+
+void *driver_thread(void *p) {
+  struct c_thread *td = (struct c_thread *)p;
+  int is_exit = 0;
+  td->exit = false;
+
+  while (1) {
+    // get work
+    is_exit = get_work(td);
+
+    // exit the routine
+    if (is_exit) {
+      td->exit = true;
+      // pthread_exit
+      pthread_exit(NULL);
+    }
+  }
+}
+
+/*
+Initialize the scheduler
+*/
+int scheduler_init(cu_device device) {
+
+  scheduler = (cu_pool *)calloc(1, sizeof(cu_pool));
+  scheduler->num_worker_threads = device.max_compute_units;
+
+  scheduler->thread_pool = (struct c_thread *)calloc(
+      scheduler->num_worker_threads, sizeof(c_thread));
+  kernel_queue *asq;
+  create_KernelQueue(&asq);
+  scheduler->kernelQueue = asq;
+
+  INIT_LOCK(scheduler->work_queue_lock);
+  pthread_cond_init(&scheduler->wake_pool, NULL);
+  scheduler->idle_threads = 0;
+
+  for (int i = 0; i < scheduler->num_worker_threads; i++) {
+    scheduler->thread_pool[i].index = i;
+    pthread_create(&scheduler->thread_pool[i].thread, NULL, driver_thread,
+                   (void *)&scheduler->thread_pool[i]);
+  }
+
+  return C_SUCCESS;
+}
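+
+// scheduler_init spawns one worker per hardware thread (init_device sets
+// max_compute_units from std::thread::hardware_concurrency()), so each
+// CUDA thread block is executed by exactly one pthread at a time.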
+
+void scheduler_uninit() {
+  unsigned i;
+
+  int r = pthread_mutex_lock(&scheduler->work_queue_lock);
+  assert(r == 0);
+  scheduler->threadpool_shutdown_requested = 1;
+  pthread_cond_broadcast(&scheduler->wake_pool);
+
+  int r1 = pthread_mutex_unlock(&scheduler->work_queue_lock);
+  assert(r1 == 0);
+
+  for (i = 0; i < scheduler->num_worker_threads; i++) {
+    pthread_join(scheduler->thread_pool[i].thread, NULL);
+  }
+  free(scheduler->thread_pool);
+  free(scheduler->kernelQueue);
+
+  pthread_mutex_destroy(&scheduler->work_queue_lock);
+  pthread_cond_destroy(&scheduler->wake_pool);
+
+  scheduler->threadpool_shutdown_requested = 0;
+}
+
+int cuWait(cstreamData *evt) {
+
+AGAIN:
+  int r = pthread_mutex_lock(&evt->stream_lock);
+  assert(r == 0);
+  if (evt->ev.status != C_COMPLETE) {
+    int r1 = pthread_mutex_unlock(&evt->stream_lock);
+    assert(r1 == 0);
+    goto AGAIN;
+  }
+  int r2 = pthread_mutex_unlock(&evt->stream_lock);
+  assert(r2 == 0);
+  return C_SUCCESS;
+}
+
+/*
+  Barrier for Kernel Launch
+
+  During kernel launch, increment the number of work items required to
+  finish; each kernel points to the same event.
+
+  While running a command, decrement the event work-item count; when the
+  count reaches 0, all work items for this kernel launch are finished.
+
+  Essentially a counting barrier.
+*/
+void cuSynchronizeBarrier() {
+AGAIN:
+
+  MUTEX_LOCK(scheduler->work_queue_lock);
+
+  if (scheduler->num_kernel_finished != scheduler->num_kernel_queued ||
+      scheduler->idle_threads != scheduler->num_worker_threads) {
+    MUTEX_UNLOCK(scheduler->work_queue_lock);
+    goto AGAIN;
+  } else {
+    MUTEX_UNLOCK(scheduler->work_queue_lock);
+  }
+}
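+
+// End-to-end flow (illustrative): the pieces above compose as
+//   init_device();                       // build scheduler, spawn workers
+//   cu_kernel *k = create_kernel(func, gridDim, blockDim, &args, shmem, 0);
+//   cuLaunchKernel(&k);                  // enqueue; workers claim blocks
+//   cuSynchronizeBarrier();              // wait until all launches drain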