From d7668ccd867cf80e354cb372669e911aba76b5d5 Mon Sep 17 00:00:00 2001
From: RobinHan <hanruobing@pku.edu.cn>
Date: Fri, 17 Jun 2022 22:20:13 -0400
Subject: [PATCH] [WIP] migriate to LLVM14

---
 .../src/x86/ReplaceCudaBuiltin.cpp            | 21 ++++-
 compilation/KernelTranslation/CMakeLists.txt  |  2 +-
 .../KernelTranslation/include/x86/tool.h      |  8 ++
 .../src/x86/generate_x86_format.cpp           | 19 ++--
 .../KernelTranslation/src/x86/init.cpp        |  4 +-
 .../src/x86/insert_warp_loop.cpp              | 38 ++++----
 .../src/x86/memory_hierarchy.cpp              |  2 +-
 .../KernelTranslation/src/x86/performance.cpp |  3 +-
 .../KernelTranslation/src/x86/tool.cpp        | 90 ++++++++++---------
 .../KernelTranslation/src/x86/warp_func.cpp   | 24 ++---
 runtime/CMakeLists.txt                        |  4 +-
 runtime/src/x86/cudaRuntimeImpl.cpp           | 38 +++++---
 runtime/threadPool/CMakeLists.txt             |  3 +-
 runtime/threadPool/src/x86/api.cpp            |  7 +-
 14 files changed, 163 insertions(+), 100 deletions(-)
diff --git a/compilation/HostTranslation/src/x86/ReplaceCudaBuiltin.cpp b/compilation/HostTranslation/src/x86/ReplaceCudaBuiltin.cpp
index 01a34b6..3991c42 100644
--- a/compilation/HostTranslation/src/x86/ReplaceCudaBuiltin.cpp
+++ b/compilation/HostTranslation/src/x86/ReplaceCudaBuiltin.cpp
@@ -8,6 +8,7 @@
 #include "llvm/Support/ToolOutputFile.h"
 #include <iostream>
 #include <map>
+#include <regex>
 #include <set>
 
 using namespace llvm;
@@ -200,8 +201,23 @@ void ReplaceKernelLaunch(llvm::Module *M) {
                   prior name before _host is add
                 */
                 std::string oldName = functionOperand->getName().str();
+                // For LLVM>=14, it will add _device_stub prefix for the kernel
+                // name, thus, we need to remove the prefix
+                // example:
+                // from: _Z24__device_stub__HistogramPjS_jj
+                // to: HistogramPjS_jj
+                oldName = std::regex_replace(oldName,
+                                             std::regex("__device_stub__"), "");
+                // remove _Z24
+                for (int i = 2; i < oldName.length(); i++) {
+                  if (oldName[i] >= '0' && oldName[i] <= '9')
+                    continue;
+                  oldName = oldName.substr(i);
+                  break;
+                }
 
-                // if parent function is __host and same as the cudaKernelLaunch
+                // if parent function is __host and same as the
+                // cudaKernelLaunch
                 std::string newName = oldName + "_wrapper";
                 if (func_name == oldName && host_changed &&
                     oldName.find("_host") != std::string::npos) {
@@ -220,12 +236,11 @@ void ReplaceKernelLaunch(llvm::Module *M) {
                 kernels.insert({functionOperand->getName().str(), F});
               }
             } else if (cuda_register_kernel_names.find(
-                           calledFunction->getName()) !=
+                           calledFunction->getName().str()) !=
                        cuda_register_kernel_names.end()) {
               // if the called function collides with kernel definiton
               // TODO: some reason changes all occurences of the function name
               // for both cudaKernelLaunch calls and regular function call
-              // errs() << *inst;
               host_changed = true;
               calledFunction->setName(calledFunction->getName() + "_host");
               std::cout << std::endl;
diff --git a/compilation/KernelTranslation/CMakeLists.txt b/compilation/KernelTranslation/CMakeLists.txt
index 25968d6..a9b52dd 100644
--- a/compilation/KernelTranslation/CMakeLists.txt
+++ b/compilation/KernelTranslation/CMakeLists.txt
@@ -18,4 +18,4 @@ file(GLOB proj_HEADERS "include/x86/*.h")
 file(GLOB proj_SOURCES "src/x86/*.cpp")
 
 # Add core library.
-add_library(${LIB_NAME} SHARED ${proj_HEADERS} ${proj_SOURCES})
+add_library(${LIB_NAME} STATIC ${proj_HEADERS} ${proj_SOURCES})
diff --git a/compilation/KernelTranslation/include/x86/tool.h b/compilation/KernelTranslation/include/x86/tool.h
index e1b1e90..c4538f1 100644
--- a/compilation/KernelTranslation/include/x86/tool.h
+++ b/compilation/KernelTranslation/include/x86/tool.h
@@ -1,6 +1,8 @@
 #ifndef __NVVM2x86_TOOL__
 #define __NVVM2x86_TOOL__
 
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
 llvm::Module *LoadModuleFromFilr(char *file_name);
 void DumpModule(llvm::Module *M, char *file_name);
@@ -22,4 +24,10 @@ bool has_barrier(llvm::BasicBlock *B);
 bool has_block_barrier(llvm::BasicBlock *B);
 bool has_barrier(llvm::Function *F);
 void replace_dynamic_shared_memory(llvm::Module *M);
+llvm::LoadInst *createLoad(llvm::IRBuilder<> &B, llvm::Value *addr,
+                           bool isVolatile = false);
+llvm::Value *createInBoundsGEP(llvm::IRBuilder<> &B, llvm::Value *ptr,
+                               llvm::ArrayRef<llvm::Value *> idxlist);
+llvm::Value *createGEP(llvm::IRBuilder<> &B, llvm::Value *ptr,
+                       llvm::ArrayRef<llvm::Value *> idxlist);
 #endif
diff --git a/compilation/KernelTranslation/src/x86/generate_x86_format.cpp b/compilation/KernelTranslation/src/x86/generate_x86_format.cpp
index b2594cf..9d563ff 100644
--- a/compilation/KernelTranslation/src/x86/generate_x86_format.cpp
+++ b/compilation/KernelTranslation/src/x86/generate_x86_format.cpp
@@ -11,7 +11,7 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/IRReader/IRReader.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Support/ToolOutputFile.h"
 #include "llvm/Target/TargetMachine.h"
@@ -19,6 +19,7 @@
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
 #include <iostream>
+#include <map>
 
 using namespace llvm;
 
@@ -52,6 +53,14 @@ void decode_input(llvm::Module *M) {
     if (!isKernelFunction(M, F))
       continue;
     auto func_name = F->getName().str();
+    // remove mangle prefix
+    // remove _Z24
+    for (int pos = 2; pos < func_name.length(); pos++) {
+      if (func_name[pos] >= '0' && func_name[pos] <= '9')
+        continue;
+      func_name = func_name.substr(pos);
+      break;
+    }
     llvm::IRBuilder<> Builder(M->getContext());
 
     FunctionCallee fc =
@@ -78,7 +87,7 @@ void decode_input(llvm::Module *M) {
           *M, Int32T, false, llvm::GlobalValue::ExternalLinkage, NULL,
           "thread_memory_size", NULL, llvm::GlobalValue::GeneralDynamicTLSModel,
           0, false);
-      Value *loadedValue = Builder.CreateLoad(global_mem);
+      Value *loadedValue = createLoad(Builder, global_mem);
 
       llvm::FunctionType *LaunchFun2 = FunctionType::get(
           PointerType::get(PointerType::get(Int32T, 0), 0), NULL);
@@ -120,12 +129,12 @@ void decode_input(llvm::Module *M) {
       Type *ArgType = ii->getType();
 
       // calculate addr
-      Value *GEP = Builder.CreateGEP(input_arg, ConstantInt::get(Int32T, idx));
+      Value *GEP = createGEP(Builder, input_arg, ConstantInt::get(Int32T, idx));
       // load corresponding int*
-      GEP = Builder.CreateLoad(GEP);
+      GEP = createLoad(Builder, GEP);
       // bitcast
       GEP = Builder.CreateBitOrPointerCast(GEP, PointerType::get(ArgType, 0));
-      Value *Arg = Builder.CreateLoad(GEP);
+      Value *Arg = createLoad(Builder, GEP);
       Arguments.push_back(Arg);
       ++idx;
     }
diff --git a/compilation/KernelTranslation/src/x86/init.cpp b/compilation/KernelTranslation/src/x86/init.cpp
index 62ae6ff..541994a 100644
--- a/compilation/KernelTranslation/src/x86/init.cpp
+++ b/compilation/KernelTranslation/src/x86/init.cpp
@@ -42,7 +42,7 @@ bool inline_warp_level_func(llvm::Module *M) {
             if (func_name == "_Z10__any_syncji" ||
                 func_name.find("shfl_down_sync") != std::string::npos) {
               InlineFunctionInfo IFI;
-              InlineFunction(c, IFI);
+              InlineFunction(*c, IFI);
               need_remove.insert(c->getCalledFunction());
               changed = true;
             }
@@ -102,7 +102,7 @@ bool inline_func_with_tid(llvm::Module *M) {
   }
   for (auto c : need_inline) {
     InlineFunctionInfo IFI;
-    InlineFunction(c, IFI);
+    InlineFunction(*c, IFI);
   }
   return changed;
 }
diff --git a/compilation/KernelTranslation/src/x86/insert_warp_loop.cpp b/compilation/KernelTranslation/src/x86/insert_warp_loop.cpp
index 1c090bf..549ac31 100644
--- a/compilation/KernelTranslation/src/x86/insert_warp_loop.cpp
+++ b/compilation/KernelTranslation/src/x86/insert_warp_loop.cpp
@@ -179,7 +179,7 @@ llvm::Instruction *GetContextArray(llvm::Instruction *instruction,
   llvm::AllocaInst *Alloca = nullptr;
 
   auto block_size_addr = M->getGlobalVariable("block_size");
-  auto block_size = builder.CreateLoad(block_size_addr);
+  auto block_size = createLoad(builder, block_size_addr);
   Alloca = builder.CreateAlloca(AllocType, block_size, varName);
 
   contextArrays[varName] = Alloca;
@@ -208,9 +208,9 @@ llvm::Instruction *AddContextSave(llvm::Instruction *instruction,
   std::vector<llvm::Value *> gepArgs;
 
   auto inter_warp_index =
-      builder.CreateLoad(M->getGlobalVariable("inter_warp_index"));
+      createLoad(builder, M->getGlobalVariable("inter_warp_index"));
   auto intra_warp_index =
-      builder.CreateLoad(M->getGlobalVariable("intra_warp_index"));
+      createLoad(builder, M->getGlobalVariable("intra_warp_index"));
   auto thread_idx = builder.CreateBinOp(
       Instruction::Add, intra_warp_index,
       builder.CreateBinOp(Instruction::Mul, inter_warp_index,
@@ -218,7 +218,7 @@ llvm::Instruction *AddContextSave(llvm::Instruction *instruction,
       "thread_idx");
   gepArgs.push_back(thread_idx);
 
-  return builder.CreateStore(instruction, builder.CreateGEP(alloca, gepArgs));
+  return builder.CreateStore(instruction, createGEP(builder, alloca, gepArgs));
 }
 
 llvm::Instruction *AddContextRestore(llvm::Value *val,
@@ -242,9 +242,9 @@ llvm::Instruction *AddContextRestore(llvm::Value *val,
   auto M = before->getParent()->getParent()->getParent();
   auto I32 = llvm::Type::getInt32Ty(M->getContext());
   auto inter_warp_index =
-      builder.CreateLoad(M->getGlobalVariable("inter_warp_index"));
+      createLoad(builder, M->getGlobalVariable("inter_warp_index"));
   auto intra_warp_index =
-      builder.CreateLoad(M->getGlobalVariable("intra_warp_index"));
+      createLoad(builder, M->getGlobalVariable("intra_warp_index"));
   auto thread_idx = builder.CreateBinOp(
       Instruction::Add, intra_warp_index,
       builder.CreateBinOp(Instruction::Mul, inter_warp_index,
@@ -253,11 +253,11 @@ llvm::Instruction *AddContextRestore(llvm::Value *val,
   gepArgs.push_back(thread_idx);
 
   llvm::Instruction *gep =
-      dyn_cast<Instruction>(builder.CreateGEP(alloca, gepArgs));
+      dyn_cast<Instruction>(createGEP(builder, alloca, gepArgs));
   if (isAlloca) {
     return gep;
   }
-  return builder.CreateLoad(gep);
+  return createLoad(builder, gep);
 }
 
 void AddContextSaveRestore(llvm::Instruction *instruction,
@@ -316,7 +316,7 @@ void handle_alloc(llvm::Function *F) {
     // generate a new alloc
     auto block_size_addr = M->getGlobalVariable("block_size");
     IRBuilder<> builder(inst);
-    auto block_size = builder.CreateLoad(block_size_addr);
+    auto block_size = createLoad(builder, block_size_addr);
 
     llvm::Type *elementType = NULL;
     if (dyn_cast<AllocaInst>(inst)->getType()->getElementType()) {
@@ -338,16 +338,16 @@ void handle_alloc(llvm::Function *F) {
       IRBuilder<> builder(user);
       // std::vector<llvm::Value *> gepArgs;
       auto inter_warp_index =
-          builder.CreateLoad(M->getGlobalVariable("inter_warp_index"));
+          createLoad(builder, M->getGlobalVariable("inter_warp_index"));
       auto intra_warp_index =
-          builder.CreateLoad(M->getGlobalVariable("intra_warp_index"));
+          createLoad(builder, M->getGlobalVariable("intra_warp_index"));
       auto thread_idx = builder.CreateBinOp(
           Instruction::Add, intra_warp_index,
           builder.CreateBinOp(Instruction::Mul, inter_warp_index,
                               ConstantInt::get(I32, 32)),
           "thread_idx");
 
-      auto gep = builder.CreateGEP(Alloca, thread_idx);
+      auto gep = createGEP(builder, Alloca, thread_idx);
 
       user->replaceUsesOfWith(inst, gep);
     }
@@ -479,19 +479,19 @@ BasicBlock *insert_loop_cond(llvm::BasicBlock *InsertCondBefore,
     auto inter_warp_index = M->getGlobalVariable("inter_warp_index");
     auto block_size = M->getGlobalVariable("block_size");
     auto warp_cnt =
-        builder.CreateBinOp(Instruction::SDiv, builder.CreateLoad(block_size),
+        builder.CreateBinOp(Instruction::SDiv, createLoad(builder, block_size),
                             ConstantInt::get(I32, 32), "warp_number");
 
     cmpResult =
-        builder.CreateICmpULT(builder.CreateLoad(inter_warp_index), warp_cnt);
+        builder.CreateICmpULT(createLoad(builder, inter_warp_index), warp_cnt);
   } else {
     auto intra_warp_index = M->getGlobalVariable("intra_warp_index");
     auto block_size = M->getGlobalVariable("block_size");
     if (!need_nested_loop) {
-      cmpResult = builder.CreateICmpULT(builder.CreateLoad(intra_warp_index),
-                                        builder.CreateLoad(block_size));
+      cmpResult = builder.CreateICmpULT(createLoad(builder, intra_warp_index),
+                                        createLoad(builder, block_size));
     } else {
-      cmpResult = builder.CreateICmpULT(builder.CreateLoad(intra_warp_index),
+      cmpResult = builder.CreateICmpULT(createLoad(builder, intra_warp_index),
                                         ConstantInt::get(I32, 32));
     }
   }
@@ -513,13 +513,13 @@ BasicBlock *insert_loop_inc(llvm::BasicBlock *InsertIncBefore,
   if (intra_warp_loop) { // intra warp
     auto intra_warp_index = M->getGlobalVariable("intra_warp_index");
     auto new_index = builder.CreateBinOp(
-        Instruction::Add, builder.CreateLoad(intra_warp_index),
+        Instruction::Add, createLoad(builder, intra_warp_index),
         ConstantInt::get(I32, 1), "intra_warp_index_increment");
     builder.CreateStore(new_index, intra_warp_index);
   } else { // inter warp
     auto inter_warp_index = M->getGlobalVariable("inter_warp_index");
     auto new_index = builder.CreateBinOp(
-        Instruction::Add, builder.CreateLoad(inter_warp_index),
+        Instruction::Add, createLoad(builder, inter_warp_index),
         ConstantInt::get(I32, 1), "inter_warp_index_increment");
     builder.CreateStore(new_index, inter_warp_index);
   }
diff --git a/compilation/KernelTranslation/src/x86/memory_hierarchy.cpp b/compilation/KernelTranslation/src/x86/memory_hierarchy.cpp
index 30fe5ab..7c2bcd2 100644
--- a/compilation/KernelTranslation/src/x86/memory_hierarchy.cpp
+++ b/compilation/KernelTranslation/src/x86/memory_hierarchy.cpp
@@ -99,7 +99,7 @@ void mem_share2global(llvm::Module *M) {
             global_memory->setComdat(comdat);
             global_memory->setLinkage(llvm::GlobalValue::LinkOnceODRLinkage);
             global_memory->setInitializer(undef);
-            global_memory->setAlignment(share_memory->getAlignment());
+            global_memory->setAlignment(share_memory->getAlign());
             corresponding_global_memory.insert(
                 std::pair<GlobalVariable *, GlobalVariable *>(share_memory,
                                                               global_memory));
diff --git a/compilation/KernelTranslation/src/x86/performance.cpp b/compilation/KernelTranslation/src/x86/performance.cpp
index 11d30e4..06a9cba 100644
--- a/compilation/KernelTranslation/src/x86/performance.cpp
+++ b/compilation/KernelTranslation/src/x86/performance.cpp
@@ -20,10 +20,11 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/ValueSymbolTable.h"
 #include "llvm/InitializePasses.h"
+#include "llvm/MC/TargetRegistry.h"
 #include "llvm/PassInfo.h"
 #include "llvm/PassRegistry.h"
 #include "llvm/Support/CommandLine.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/Host.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Transforms/IPO/PassManagerBuilder.h"
diff --git a/compilation/KernelTranslation/src/x86/tool.cpp b/compilation/KernelTranslation/src/x86/tool.cpp
index c1652c5..fecca2f 100644
--- a/compilation/KernelTranslation/src/x86/tool.cpp
+++ b/compilation/KernelTranslation/src/x86/tool.cpp
@@ -1,8 +1,10 @@
 #include "tool.h"
 #include "llvm/Bitcode/BitcodeWriter.h"
+#include "llvm/Config/llvm-config.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Instructions.h"
@@ -10,9 +12,16 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/IRReader/IRReader.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/ToolOutputFile.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
+
 #include <iostream>
 #include <set>
 
@@ -41,7 +50,7 @@ void DumpModule(llvm::Module *M, char *file_name) {
   std::string msg;
   llvm::raw_string_ostream os(msg);
   std::error_code EC;
-  ToolOutputFile Out(file_name, EC, sys::fs::F_None);
+  ToolOutputFile Out(file_name, EC, sys::fs::OF_None);
   if (EC) {
     errs() << "Fails to open output file: " << EC.message();
     return;
@@ -128,7 +137,7 @@ llvm::Instruction *BreakPHIToAllocas(PHINode *phi) {
   }
   builder.SetInsertPoint(phi);
 
-  llvm::Instruction *loadedValue = builder.CreateLoad(alloca);
+  llvm::Instruction *loadedValue = createLoad(builder, alloca);
   phi->replaceAllUsesWith(loadedValue);
   phi->eraseFromParent();
 
@@ -219,13 +228,12 @@ void replace_dynamic_shared_memory(llvm::Module *M) {
     if (!dynamic_shared_memory_addr) {
       return;
     }
-    auto load_shared_memory =
-        new LoadInst(dynamic_shared_memory_addr, "new_load");
+    auto load_shared_memory = new LoadInst(
+        dynamic_shared_memory_addr->getType()->getPointerElementType(),
+        dynamic_shared_memory_addr, "new_load", &*F->begin()->begin());
     auto new_bit_cast =
         new BitCastInst(load_shared_memory,
                         dynamic_shared_memory_addr->getType(), "new_bit_cast");
-    new_bit_cast->insertBefore(&*F->begin()->begin());
-    load_shared_memory->insertBefore(new_bit_cast);
     dynamic_shared_memory_addr->replaceUsesWithIf(new_bit_cast, [&](Use &U) {
       auto *Instr = dyn_cast<Instruction>(U.getUser());
       return Instr != new_bit_cast && Instr != load_shared_memory;
@@ -281,21 +289,21 @@ void replace_built_in_function(llvm::Module *M) {
               auto block_size_addr = M->getGlobalVariable("block_size_x");
               IRBuilder<> builder(context);
               builder.SetInsertPoint(Call);
-              auto val = builder.CreateLoad(block_size_addr);
+              auto val = createLoad(builder, block_size_addr);
               Call->replaceAllUsesWith(val);
               need_remove.push_back(Call);
             } else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.y") {
               auto block_size_addr = M->getGlobalVariable("block_size_y");
               IRBuilder<> builder(context);
               builder.SetInsertPoint(Call);
-              auto val = builder.CreateLoad(block_size_addr);
+              auto val = createLoad(builder, block_size_addr);
               Call->replaceAllUsesWith(val);
               need_remove.push_back(Call);
             } else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.z") {
               auto block_size_addr = M->getGlobalVariable("block_size_z");
               IRBuilder<> builder(context);
               builder.SetInsertPoint(Call);
-              auto val = builder.CreateLoad(block_size_addr);
+              auto val = createLoad(builder, block_size_addr);
               Call->replaceAllUsesWith(val);
               need_remove.push_back(Call);
             } else if (func_name == "llvm.nvvm.read.ptx.sreg.tid.x" ||
@@ -307,15 +315,15 @@ void replace_built_in_function(llvm::Module *M) {
               builder.SetInsertPoint(Call);
 
               auto thread_idx = builder.CreateBinOp(
-                  Instruction::Mul, builder.CreateLoad(local_inter_warp_idx),
+                  Instruction::Mul, createLoad(builder, local_inter_warp_idx),
                   ConstantInt::get(I32, 32), "");
               thread_idx = builder.CreateBinOp(
-                  Instruction::Add, builder.CreateLoad(local_intra_warp_idx),
+                  Instruction::Add, createLoad(builder, local_intra_warp_idx),
                   thread_idx, "thread_idx");
 
               thread_idx = builder.CreateBinOp(
                   Instruction::SRem, thread_idx,
-                  builder.CreateLoad(M->getGlobalVariable("block_size_x")),
+                  createLoad(builder, M->getGlobalVariable("block_size_x")),
                   "thread_id_x");
 
               Call->replaceAllUsesWith(thread_idx);
@@ -326,15 +334,15 @@ void replace_built_in_function(llvm::Module *M) {
               builder.SetInsertPoint(Call);
 
               auto thread_idx = builder.CreateBinOp(
-                  Instruction::Mul, builder.CreateLoad(local_inter_warp_idx),
+                  Instruction::Mul, createLoad(builder, local_inter_warp_idx),
                   ConstantInt::get(I32, 32), "");
               thread_idx = builder.CreateBinOp(
-                  Instruction::Add, builder.CreateLoad(local_intra_warp_idx),
+                  Instruction::Add, createLoad(builder, local_intra_warp_idx),
                   thread_idx, "thread_idx");
               // tidy = tid / block_dim.x
               thread_idx = builder.CreateBinOp(
                   Instruction::SDiv, thread_idx,
-                  builder.CreateLoad(M->getGlobalVariable("block_size_x")),
+                  createLoad(builder, M->getGlobalVariable("block_size_x")),
                   "thread_id_y");
               Call->replaceAllUsesWith(thread_idx);
               need_remove.push_back(Call);
@@ -350,21 +358,21 @@ void replace_built_in_function(llvm::Module *M) {
               auto block_index_addr = M->getGlobalVariable("block_index_x");
               IRBuilder<> builder(context);
               builder.SetInsertPoint(Call);
-              auto block_idx = builder.CreateLoad(block_index_addr);
+              auto block_idx = createLoad(builder, block_index_addr);
               Call->replaceAllUsesWith(block_idx);
               need_remove.push_back(Call);
             } else if (func_name == "llvm.nvvm.read.ptx.sreg.ctaid.y") {
               auto block_index_addr = M->getGlobalVariable("block_index_y");
               IRBuilder<> builder(context);
               builder.SetInsertPoint(Call);
-              auto block_idx = builder.CreateLoad(block_index_addr);
+              auto block_idx = createLoad(builder, block_index_addr);
               Call->replaceAllUsesWith(block_idx);
               need_remove.push_back(Call);
             } else if (func_name == "llvm.nvvm.read.ptx.sreg.ctaid.z") {
               auto block_index_addr = M->getGlobalVariable("block_index_z");
               IRBuilder<> builder(context);
               builder.SetInsertPoint(Call);
-              auto block_idx = builder.CreateLoad(block_index_addr);
+              auto block_idx = createLoad(builder, block_index_addr);
               Call->replaceAllUsesWith(block_idx);
               need_remove.push_back(Call);
             } else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.x" ||
@@ -373,21 +381,21 @@ void replace_built_in_function(llvm::Module *M) {
               auto grid_size_addr = M->getGlobalVariable("grid_size_x");
               IRBuilder<> builder(context);
               builder.SetInsertPoint(Call);
-              auto grid_size = builder.CreateLoad(grid_size_addr);
+              auto grid_size = createLoad(builder, grid_size_addr);
               Call->replaceAllUsesWith(grid_size);
               need_remove.push_back(Call);
             } else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.y") {
               auto grid_size_addr = M->getGlobalVariable("grid_size_y");
               IRBuilder<> builder(context);
               builder.SetInsertPoint(Call);
-              auto grid_size = builder.CreateLoad(grid_size_addr);
+              auto grid_size = createLoad(builder, grid_size_addr);
               Call->replaceAllUsesWith(grid_size);
               need_remove.push_back(Call);
             } else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.z") {
               auto grid_size_addr = M->getGlobalVariable("grid_size_z");
               IRBuilder<> builder(context);
               builder.SetInsertPoint(Call);
-              auto grid_size = builder.CreateLoad(grid_size_addr);
+              auto grid_size = createLoad(builder, grid_size_addr);
               Call->replaceAllUsesWith(grid_size);
               need_remove.push_back(Call);
             }
@@ -401,7 +409,7 @@ void replace_built_in_function(llvm::Module *M) {
             // return the rank within the warp
             IRBuilder<> builder(context);
             builder.SetInsertPoint(Call);
-            auto intra_warp_index = builder.CreateLoad(local_intra_warp_idx);
+            auto intra_warp_index = createLoad(builder, local_intra_warp_idx);
             Call->replaceAllUsesWith(intra_warp_index);
             need_remove.push_back(Call);
           }
@@ -460,7 +468,9 @@ void replace_built_in_function(llvm::Module *M) {
                                                            src_alloc, // Alloca
                                                            Indices,   // Indices
                                                            "", Call);
-                  auto new_load = new LoadInst(new_GEP, "", Call);
+                  auto new_load =
+                      new LoadInst(new_GEP->getType()->getPointerElementType(),
+                                   new_GEP, "", Call);
                   printf_args.push_back(new_load);
                 }
               }
@@ -531,7 +541,7 @@ void replace_asm_call(llvm::Module *M) {
             builder.SetInsertPoint(Call);
             auto intra_warp_index_addr =
                 M->getGlobalVariable("intra_warp_index");
-            auto intra_warp_index = builder.CreateLoad(intra_warp_index_addr);
+            auto intra_warp_index = createLoad(builder, intra_warp_index_addr);
             Call->replaceAllUsesWith(intra_warp_index);
             need_remove.push_back(Call);
           }
@@ -652,20 +662,18 @@ bool find_barrier_in_region(llvm::BasicBlock *start, llvm::BasicBlock *end) {
   return 0;
 }
 
-/*
-  Print IR to String Output for Debugging Purposes
-*/
-// void printModule(llvm::Module *M) {
-//   std::string str;
-//   llvm::raw_string_ostream ss(str);
-//   std::cout << "### Printing Module ###" << std::endl;
-//   for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
-//     Function *F = &(*i);
-//     auto func_name = F->getName().str();
-//     std::cout << func_name << std::endl;
-//     for (Function::iterator b = F->begin(); b != F->end(); ++b) {
-//       BasicBlock *B = &(*b);
-//       errs() << *B;
-//     }
-//   }
-// }
+LoadInst *createLoad(IRBuilder<> &B, Value *addr, bool isVolatile) {
+  return B.CreateLoad(addr->getType()->getPointerElementType(), addr,
+                      isVolatile);
+}
+
+Value *createInBoundsGEP(IRBuilder<> &B, Value *ptr,
+                         ArrayRef<Value *> idxlist) {
+  return B.CreateInBoundsGEP(
+      ptr->getType()->getScalarType()->getPointerElementType(), ptr, idxlist);
+}
+
+Value *createGEP(IRBuilder<> &B, Value *ptr, ArrayRef<Value *> idxlist) {
+  return B.CreateGEP(ptr->getType()->getScalarType()->getPointerElementType(),
+                     ptr, idxlist);
+}
diff --git a/compilation/KernelTranslation/src/x86/warp_func.cpp b/compilation/KernelTranslation/src/x86/warp_func.cpp
index a25979a..cb81b6b 100644
--- a/compilation/KernelTranslation/src/x86/warp_func.cpp
+++ b/compilation/KernelTranslation/src/x86/warp_func.cpp
@@ -67,7 +67,8 @@ void handle_warp_vote(llvm::Module *M) {
     assert(warp_vote_ptr != NULL);
     auto intra_warp_index_addr = M->getGlobalVariable("intra_warp_index");
     auto intra_warp_index =
-        new LoadInst(intra_warp_index_addr, "intra_warp_index", sync_inst);
+        new LoadInst(intra_warp_index_addr->getType()->getPointerElementType(),
+                     intra_warp_index_addr, "intra_warp_index", sync_inst);
 
     auto GEP = GetElementPtrInst::Create(NULL,          // Pointee type
                                          warp_vote_ptr, // Alloca
@@ -168,23 +169,22 @@ void handle_warp_shfl(llvm::Module *M) {
     auto shfl_offset = shfl_inst->getArgOperand(2);
 
     auto intra_warp_index =
-        builder.CreateLoad(M->getGlobalVariable("intra_warp_index"));
-    builder.CreateStore(
-        shfl_variable,
-        builder.CreateGEP(warp_shfl_ptr, {ZERO, intra_warp_index}));
+        createLoad(builder, M->getGlobalVariable("intra_warp_index"));
+    builder.CreateStore(shfl_variable, createGEP(builder, warp_shfl_ptr,
+                                                 {ZERO, intra_warp_index}));
     // we should create barrier before store
     CreateIntraWarpBarrier(intra_warp_index);
     // load shuffled data
     auto new_intra_warp_index =
-        builder.CreateLoad(M->getGlobalVariable("intra_warp_index"));
+        createLoad(builder, M->getGlobalVariable("intra_warp_index"));
     auto shfl_name = shfl_inst->getCalledFunction()->getName().str();
     if (shfl_name.find("down") != shfl_name.npos) {
       auto calculate_offset = builder.CreateBinOp(
           Instruction::Add, new_intra_warp_index, shfl_offset);
       auto new_index = builder.CreateBinOp(Instruction::SRem, calculate_offset,
                                            ConstantInt::get(I32, 32));
-      auto gep = builder.CreateGEP(warp_shfl_ptr, {ZERO, new_index});
-      auto load_inst = builder.CreateLoad(gep);
+      auto gep = createGEP(builder, warp_shfl_ptr, {ZERO, new_index});
+      auto load_inst = createLoad(builder, gep);
 
       // create barrier
       CreateIntraWarpBarrier(new_intra_warp_index);
@@ -195,8 +195,8 @@ void handle_warp_shfl(llvm::Module *M) {
           Instruction::Sub, new_intra_warp_index, shfl_offset);
       auto new_index = builder.CreateBinOp(Instruction::SRem, calculate_offset,
                                            ConstantInt::get(I32, 32));
-      auto gep = builder.CreateGEP(warp_shfl_ptr, {ZERO, new_index});
-      auto load_inst = builder.CreateLoad(gep);
+      auto gep = createGEP(builder, warp_shfl_ptr, {ZERO, new_index});
+      auto load_inst = createLoad(builder, gep);
 
       // create barrier
       CreateIntraWarpBarrier(new_intra_warp_index);
@@ -207,8 +207,8 @@ void handle_warp_shfl(llvm::Module *M) {
           Instruction::Xor, new_intra_warp_index, shfl_offset);
       auto new_index = builder.CreateBinOp(Instruction::SRem, calculate_offset,
                                            ConstantInt::get(I32, 32));
-      auto gep = builder.CreateGEP(warp_shfl_ptr, {ZERO, new_index});
-      auto load_inst = builder.CreateLoad(gep);
+      auto gep = createGEP(builder, warp_shfl_ptr, {ZERO, new_index});
+      auto load_inst = createLoad(builder, gep);
 
       // create barrier
       CreateIntraWarpBarrier(new_intra_warp_index);
diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt
index 6385824..94248b3 100644
--- a/runtime/CMakeLists.txt
+++ b/runtime/CMakeLists.txt
@@ -10,7 +10,9 @@ set(CMAKE_VERBOSE_MAKEFILE ON)
 add_subdirectory(threadPool)
 
 # compile x86 runtime library
+include_directories(./include/)
 include_directories(./include/x86)
+include_directories(./threadPool/include/)
 include_directories(./threadPool/include/x86)
-file(GLOB proj_SOURCES "src/vortex/*.cpp")
+file(GLOB proj_SOURCES "src/x86/*.cpp")
 add_library(${LIB_NAME} SHARED ${proj_SOURCES})
diff --git a/runtime/src/x86/cudaRuntimeImpl.cpp b/runtime/src/x86/cudaRuntimeImpl.cpp
index d15dae1..3406285 100644
--- a/runtime/src/x86/cudaRuntimeImpl.cpp
+++ b/runtime/src/x86/cudaRuntimeImpl.cpp
@@ -9,13 +9,31 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-cudaError_t cudaGetDevice(int *devPtr) { *devPtr = 0; }
+cudaError_t cudaGetDevice(int *devPtr) {
+  *devPtr = 0;
+  return cudaSuccess;
+}
 const char *cudaGetErrorName(cudaError_t error) { return "SUCCESS\n"; }
-cudaError_t cudaDeviceReset(void) { scheduler_uninit(); }
-cudaError_t cudaDeviceSynchronize(void) { cuSynchronizeBarrier(); }
-cudaError_t cudaThreadSynchronize(void) { cuSynchronizeBarrier(); }
-cudaError_t cudaFree(void *devPtr) { free(devPtr); }
-cudaError_t cudaFreeHost(void *devPtr) { free(devPtr); }
+cudaError_t cudaDeviceReset(void) {
+  scheduler_uninit();
+  return cudaSuccess;
+}
+cudaError_t cudaDeviceSynchronize(void) {
+  cuSynchronizeBarrier();
+  return cudaSuccess;
+}
+cudaError_t cudaThreadSynchronize(void) {
+  cuSynchronizeBarrier();
+  return cudaSuccess;
+}
+cudaError_t cudaFree(void *devPtr) {
+  free(devPtr);
+  return cudaSuccess;
+}
+cudaError_t cudaFreeHost(void *devPtr) {
+  free(devPtr);
+  return cudaSuccess;
+}
 
 cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
                              void **args, size_t sharedMem,
@@ -31,7 +49,7 @@ cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
 
   int lstatus = cuLaunchKernel(&ker);
 
-  // std::cout << "ret cudaLKernel" << std::endl;
+  return cudaSuccess;
 }
 cudaError_t cudaMalloc(void **devPtr, size_t size) {
   *devPtr = malloc(size);
@@ -68,15 +86,13 @@ cudaError_t cudaMemcpy(void *dst, const void *src, size_t count,
 cudaError_t cudaMemcpyToSymbol_host(void *dst, const void *src, size_t count,
                                     size_t offset, cudaMemcpyKind kind) {
   assert(offset == 0 && "DO not support offset !=0\n");
-  memcpy(dst, src + offset, count);
+  memcpy(dst, (char *)src + offset, count);
   return cudaSuccess;
 }
 
 cudaError_t cudaSetDevice(int device) {
-  // error checking
-  // std::cout << "cudaSetDevice Called" << std::endl;
   init_device();
-  // std::cout << "cudaSetDevice Ret" << std::endl;
+  return cudaSuccess;
 }
 
 cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src) {
diff --git a/runtime/threadPool/CMakeLists.txt b/runtime/threadPool/CMakeLists.txt
index 2ad0b79..d807752 100644
--- a/runtime/threadPool/CMakeLists.txt
+++ b/runtime/threadPool/CMakeLists.txt
@@ -11,7 +11,8 @@ set(LIB_NAME threadPool)
 
 set(CMAKE_CXX_STANDARD 14)
 set(CMAKE_BUILD_TYPE Debug)
+include_directories(./include)
 include_directories(./include/x86)
 
-file(GLOB proj_SOURCES "src/vortex/*.cpp")
+file(GLOB proj_SOURCES "src/x86/*.cpp")
 add_library(${LIB_NAME} SHARED ${proj_SOURCES})
diff --git a/runtime/threadPool/src/x86/api.cpp b/runtime/threadPool/src/x86/api.cpp
index cd1c335..b952dba 100644
--- a/runtime/threadPool/src/x86/api.cpp
+++ b/runtime/threadPool/src/x86/api.cpp
@@ -29,7 +29,6 @@ int init_device() {
 
   // initialize scheduler
   int ret = scheduler_init(*device);
-
   if (ret != C_SUCCESS)
     return ret;
 
@@ -182,6 +181,7 @@ int schedulerEnqueueKernel(cu_kernel **k) {
 
   pthread_cond_broadcast(&(scheduler->wake_pool));
   MUTEX_UNLOCK(scheduler->work_queue_lock);
+  return 0;
 }
 
 /*
@@ -191,6 +191,7 @@ int cuLaunchKernel(cu_kernel **k) {
   if (!scheduler) {
     init_device();
   }
+  std::cout << "launch\n" << std::flush;
   // Calculate Block Size N/numBlocks
 
   cu_kernel *ker = *k;
@@ -238,6 +239,7 @@ int cuLaunchKernel(cu_kernel **k) {
       MUTEX_UNLOCK(((cstreamData *)(ker->stream))->stream_lock);
     }
   }
+  return 0;
 }
 
 /*
@@ -346,7 +348,8 @@ RETRY:
       grid_size_x = gridDim.x;
       grid_size_y = gridDim.y;
       grid_size_z = gridDim.z;
-      dynamic_shared_memory = (int *)malloc(dynamic_shared_mem_size);
+      if (dynamic_shared_mem_size > 0)
+        dynamic_shared_memory = (int *)malloc(dynamic_shared_mem_size);
       int tmp = block_index;
       block_index_x = tmp / (grid_size_y * grid_size_z);
       tmp = tmp % (grid_size_y * grid_size_z);