diff --git a/compilation/HostTranslation/src/x86/RemoveCudaBuiltin.cpp b/compilation/HostTranslation/src/x86/RemoveCudaBuiltin.cpp index bcd747c..f92acf5 100644 --- a/compilation/HostTranslation/src/x86/RemoveCudaBuiltin.cpp +++ b/compilation/HostTranslation/src/x86/RemoveCudaBuiltin.cpp @@ -9,20 +9,77 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/Support/ToolOutputFile.h" +#include "llvm/Transforms/Utils/CtorUtils.h" #include #include #include using namespace llvm; +/// Given a llvm.global_ctors list that we can understand, +/// return a list of the functions and null terminator as a vector. +static std::vector parseGlobalCtors(GlobalVariable *GV) { + if (GV->getInitializer()->isNullValue()) + return std::vector(); + ConstantArray *CA = cast(GV->getInitializer()); + std::vector Result; + Result.reserve(CA->getNumOperands()); + for (auto &V : CA->operands()) { + ConstantStruct *CS = cast(V); + Result.push_back(dyn_cast(CS->getOperand(1))); + } + return Result; +} + void RemoveCudaBuiltin(llvm::Module *M) { std::set need_remove; - if (GlobalVariable *gv = M->getGlobalVariable("llvm.global_ctors")) { - gv->dropAllReferences(); - gv->eraseFromParent(); + // remove cuda built-in from Ctors + if (GlobalVariable *GV = M->getGlobalVariable("llvm.global_ctors")) { + std::vector Ctors = parseGlobalCtors(GV); + if (!Ctors.empty()) { + ConstantArray *OldCA = cast(GV->getInitializer()); + SmallVector CAList; + for (int i = 0; i < OldCA->getNumOperands(); i++) { + if (!Ctors[i]) + continue; + if (Ctors[i]->hasName() && + Ctors[i]->getName().str().find("__cuda") == std::string::npos) { + std::cout << "keep: " << Ctors[i]->getName().str() << std::endl + << std::flush; + CAList.push_back(OldCA->getOperand(i)); + } + } + + // Create the new array initializer. 
+ ArrayType *ATy = + ArrayType::get(OldCA->getType()->getElementType(), CAList.size()); + Constant *CA = ConstantArray::get(ATy, CAList); + + // If we didn't change the number of elements, don't create a new GV. + if (CA->getType() == OldCA->getType()) { + GV->setInitializer(CA); + } else { + // Create the new global and insert it next to the existing list. + GlobalVariable *NGV = new GlobalVariable( + CA->getType(), GV->isConstant(), GV->getLinkage(), CA, "", + GV->getThreadLocalMode()); + GV->getParent()->getGlobalList().insert(GV->getIterator(), NGV); + NGV->takeName(GV); + + // Nuke the old list, replacing any uses with the new one. + if (!GV->use_empty()) { + Constant *V = NGV; + if (V->getType() != GV->getType()) + V = ConstantExpr::getBitCast(V, GV->getType()); + GV->replaceAllUsesWith(V); + } + GV->eraseFromParent(); + } + } } + Function *c_tor = NULL; if (c_tor = M->getFunction("__cuda_module_ctor")) { c_tor->dropAllReferences(); diff --git a/compilation/KernelTranslation/src/x86/init.cpp b/compilation/KernelTranslation/src/x86/init.cpp index 988e0a5..773ba9e 100644 --- a/compilation/KernelTranslation/src/x86/init.cpp +++ b/compilation/KernelTranslation/src/x86/init.cpp @@ -360,6 +360,9 @@ void replace_cuda_math_built_in(llvm::Module *M) { if (func_name.find("_ZL3expd") != std::string::npos) { F->deleteBody(); } + if (func_name.find("_ZL8copysigndd") != std::string::npos) { + F->deleteBody(); + } } } diff --git a/compilation/KernelTranslation/src/x86/tool.cpp b/compilation/KernelTranslation/src/x86/tool.cpp index fecca2f..44f5a7b 100644 --- 
a/compilation/KernelTranslation/src/x86/tool.cpp +++ b/compilation/KernelTranslation/src/x86/tool.cpp @@ -464,10 +464,12 @@ void replace_built_in_function(llvm::Module *M) { std::vector Indices; Indices.push_back(ConstantInt::get(I32, 0)); Indices.push_back(ConstantInt::get(I32, i)); - auto new_GEP = GetElementPtrInst::Create(NULL, // Pointee type - src_alloc, // Alloca - Indices, // Indices - "", Call); + auto new_GEP = GetElementPtrInst::Create( + cast(src_alloc->getType()->getScalarType()) + ->getElementType(), + src_alloc, // Alloca + Indices, // Indices + "", Call); auto new_load = new LoadInst(new_GEP->getType()->getPointerElementType(), new_GEP, "", Call); @@ -503,8 +505,14 @@ void replace_built_in_function(llvm::Module *M) { Call->getCalledFunction()->setName("__nvvm_lohi_i2d"); } else if (func_name == "llvm.nvvm.fabs.f") { Call->getCalledFunction()->setName("__nvvm_fabs_f"); + } else if (func_name == "llvm.nvvm.fabs.d") { + Call->getCalledFunction()->setName("__nv_fabsd"); } else if (func_name == "llvm.nvvm.mul24.i") { Call->getCalledFunction()->setName("__nvvm_mul24_i"); + } else if (func_name == "llvm.nvvm.fmin.d") { + Call->getCalledFunction()->setName("__nv_fmind"); + } else if (func_name == "llvm.nvvm.fmax.d") { + Call->getCalledFunction()->setName("__nv_fmaxd"); } } } diff --git a/compilation/KernelTranslation/src/x86/warp_func.cpp b/compilation/KernelTranslation/src/x86/warp_func.cpp index cb81b6b..50b7320 100644 --- a/compilation/KernelTranslation/src/x86/warp_func.cpp +++ b/compilation/KernelTranslation/src/x86/warp_func.cpp @@ -70,10 +70,12 @@ void handle_warp_vote(llvm::Module *M) { new LoadInst(intra_warp_index_addr->getType()->getPointerElementType(), intra_warp_index_addr, "intra_warp_index", sync_inst); - auto GEP = GetElementPtrInst::Create(NULL, // Pointee type - warp_vote_ptr, // Alloca - {zero, intra_warp_index}, // Indices - "", sync_inst); + auto GEP = GetElementPtrInst::Create( + cast(warp_vote_ptr->getType()->getScalarType()) + 
->getElementType(), + warp_vote_ptr, // Alloca + {zero, intra_warp_index}, // Indices + "", sync_inst); // as AVX only support 8bit for each thread // so we have to cast the predict into int8 diff --git a/runtime/include/x86/cudaKernelImpl.h b/runtime/include/x86/cudaKernelImpl.h index 4aa94de..fbccbe9 100644 --- a/runtime/include/x86/cudaKernelImpl.h +++ b/runtime/include/x86/cudaKernelImpl.h @@ -19,7 +19,11 @@ float __nv_fmodf(float, float); int __nv_isnanf(float); int __nv_isinff(float); float __nv_fabsf(float); +double __nv_fabsd(double); +double __nv_fmind(double, double); +double __nv_fmaxd(double, double); int __nvvm_mul24_i(int, int); double _ZL3expd(double); +double _ZL8copysigndd(double, double); } #endif diff --git a/runtime/src/x86/cudaKernelImpl.cpp b/runtime/src/x86/cudaKernelImpl.cpp index 56803c4..36eedf1 100644 --- a/runtime/src/x86/cudaKernelImpl.cpp +++ b/runtime/src/x86/cudaKernelImpl.cpp @@ -15,5 +15,9 @@ float __nv_fmodf(float x, float y) { return fmod(x, y); } int __nv_isnanf(float v) { return isnan(v); } int __nv_isinff(float v) { return isinf(v); } float __nv_fabsf(float v) { return abs(v); } +double __nv_fabsd(double v) { return fabs(v); } +double __nv_fmind(double a, double b) { return fmin(a, b); } +double __nv_fmaxd(double a, double b) { return fmax(a, b); } int __nvvm_mul24_i(int a, int b) { return a * b; } double _ZL3expd(double base) { return exp(base); } +double _ZL8copysigndd(double x, double y) { return copysign(x, y); }
diff --git a/runtime/threadPool/src/x86/api.cpp b/runtime/threadPool/src/x86/api.cpp index f0c0e5b..0095b37 100644 --- a/runtime/threadPool/src/x86/api.cpp +++ b/runtime/threadPool/src/x86/api.cpp @@ -17,7 +17,11 @@ Initialize the device */ int device_max_compute_units = 1; +bool device_initialized = false; int init_device() { + if (device_initialized) + return 0; + device_initialized = true; cu_device *device = (cu_device *)calloc(1, sizeof(cu_device)); if (device == NULL) return C_ERROR_MEMALLOC; @@ -231,6 +235,9 @@ void scheduler_uninit() { Counting Barrier basically */ void cuSynchronizeBarrier() { + if (!device_initialized) { + init_device(); + } while (1) { // (TODO): currently, we assume each kernel launch will have a // following sync