diff --git a/compilation/HostTranslation/src/x86/RemoveCudaBuiltin.cpp b/compilation/HostTranslation/src/x86/RemoveCudaBuiltin.cpp index bcd747c..f92acf5 100644 --- a/compilation/HostTranslation/src/x86/RemoveCudaBuiltin.cpp +++ b/compilation/HostTranslation/src/x86/RemoveCudaBuiltin.cpp @@ -9,20 +9,77 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/Support/ToolOutputFile.h" +#include "llvm/Transforms/Utils/CtorUtils.h" #include #include #include using namespace llvm; +/// Given a llvm.global_ctors list that we can understand, +/// return a list of the functions and null terminator as a vector. +static std::vector parseGlobalCtors(GlobalVariable *GV) { + if (GV->getInitializer()->isNullValue()) + return std::vector(); + ConstantArray *CA = cast(GV->getInitializer()); + std::vector Result; + Result.reserve(CA->getNumOperands()); + for (auto &V : CA->operands()) { + ConstantStruct *CS = cast(V); + Result.push_back(dyn_cast(CS->getOperand(1))); + } + return Result; +} + void RemoveCudaBuiltin(llvm::Module *M) { std::set need_remove; - if (GlobalVariable *gv = M->getGlobalVariable("llvm.global_ctors")) { - gv->dropAllReferences(); - gv->eraseFromParent(); + // remove cuda built-in from Ctors + if (GlobalVariable *GV = M->getGlobalVariable("llvm.global_ctors")) { + std::vector Ctors = parseGlobalCtors(GV); + if (!Ctors.empty()) { + ConstantArray *OldCA = cast(GV->getInitializer()); + SmallVector CAList; + for (int i = 0; i < OldCA->getNumOperands(); i++) { + if (!Ctors[i]) + continue; + if (Ctors[i]->hasName() && + Ctors[i]->getName().str().find("__cuda") == std::string::npos) { + std::cout << "keep: " << Ctors[i]->getName().str() << std::endl + << std::flush; + CAList.push_back(OldCA->getOperand(i)); + } + } + + // Create the new array initializer. 
+ ArrayType *ATy = + ArrayType::get(OldCA->getType()->getElementType(), CAList.size()); + Constant *CA = ConstantArray::get(ATy, CAList); + + // If we didn't change the number of elements, don't create a new GV. + if (CA->getType() == OldCA->getType()) { + GV->setInitializer(CA); + } else { + // Create the new global and insert it next to the existing list. + GlobalVariable *NGV = new GlobalVariable( + CA->getType(), GV->isConstant(), GV->getLinkage(), CA, "", + GV->getThreadLocalMode()); + GV->getParent()->getGlobalList().insert(GV->getIterator(), NGV); + NGV->takeName(GV); + + // Nuke the old list, replacing any uses with the new one. + if (!GV->use_empty()) { + Constant *V = NGV; + if (V->getType() != GV->getType()) + V = ConstantExpr::getBitCast(V, GV->getType()); + GV->replaceAllUsesWith(V); + } + GV->eraseFromParent(); + } + } } + Function *c_tor = NULL; if (c_tor = M->getFunction("__cuda_module_ctor")) { c_tor->dropAllReferences(); diff --git a/compilation/KernelTranslation/src/x86/init.cpp b/compilation/KernelTranslation/src/x86/init.cpp index 988e0a5..773ba9e 100644 --- a/compilation/KernelTranslation/src/x86/init.cpp +++ b/compilation/KernelTranslation/src/x86/init.cpp @@ -360,6 +360,9 @@ void replace_cuda_math_built_in(llvm::Module *M) { if (func_name.find("_ZL3expd") != std::string::npos) { F->deleteBody(); } + if (func_name.find("_ZL8copysigndd") != std::string::npos) { + F->deleteBody(); + } } } diff --git a/compilation/KernelTranslation/src/x86/tool.cpp b/compilation/KernelTranslation/src/x86/tool.cpp index fecca2f..44f5a7b 100644 --- 
a/compilation/KernelTranslation/src/x86/tool.cpp +++ b/compilation/KernelTranslation/src/x86/tool.cpp @@ -464,10 +464,12 @@ void replace_built_in_function(llvm::Module *M) { std::vector Indices; Indices.push_back(ConstantInt::get(I32, 0)); Indices.push_back(ConstantInt::get(I32, i)); - auto new_GEP = GetElementPtrInst::Create(NULL, // Pointee type - src_alloc, // Alloca - Indices, // Indices - "", Call); + auto new_GEP = GetElementPtrInst::Create( + cast(src_alloc->getType()->getScalarType()) + ->getElementType(), + src_alloc, // Alloca + Indices, // Indices + "", Call); auto new_load = new LoadInst(new_GEP->getType()->getPointerElementType(), new_GEP, "", Call); @@ -503,8 +505,14 @@ void replace_built_in_function(llvm::Module *M) { Call->getCalledFunction()->setName("__nvvm_lohi_i2d"); } else if (func_name == "llvm.nvvm.fabs.f") { Call->getCalledFunction()->setName("__nvvm_fabs_f"); + } else if (func_name == "llvm.nvvm.fabs.d") { + Call->getCalledFunction()->setName("__nv_fabsd"); } else if (func_name == "llvm.nvvm.mul24.i") { Call->getCalledFunction()->setName("__nvvm_mul24_i"); + } else if (func_name == "llvm.nvvm.fmin.d") { + Call->getCalledFunction()->setName("__nv_fmind"); + } else if (func_name == "llvm.nvvm.fmax.d") { + Call->getCalledFunction()->setName("__nv_fmaxd"); } } } diff --git a/compilation/KernelTranslation/src/x86/warp_func.cpp b/compilation/KernelTranslation/src/x86/warp_func.cpp index cb81b6b..50b7320 100644 --- a/compilation/KernelTranslation/src/x86/warp_func.cpp +++ b/compilation/KernelTranslation/src/x86/warp_func.cpp @@ -70,10 +70,12 @@ void handle_warp_vote(llvm::Module *M) { new LoadInst(intra_warp_index_addr->getType()->getPointerElementType(), intra_warp_index_addr, "intra_warp_index", sync_inst); - auto GEP = GetElementPtrInst::Create(NULL, // Pointee type - warp_vote_ptr, // Alloca - {zero, intra_warp_index}, // Indices - "", sync_inst); + auto GEP = GetElementPtrInst::Create( + cast(warp_vote_ptr->getType()->getScalarType()) + 
->getElementType(), + warp_vote_ptr, // Alloca + {zero, intra_warp_index}, // Indices + "", sync_inst); // as AVX only support 8bit for each thread // so we have to cast the predict into int8 diff --git a/runtime/include/x86/cudaKernelImpl.h b/runtime/include/x86/cudaKernelImpl.h index 4aa94de..fbccbe9 100644 --- a/runtime/include/x86/cudaKernelImpl.h +++ b/runtime/include/x86/cudaKernelImpl.h @@ -19,7 +19,11 @@ float __nv_fmodf(float, float); int __nv_isnanf(float); int __nv_isinff(float); float __nv_fabsf(float); +double __nv_fabsd(double); +double __nv_fmind(double, double); +double __nv_fmaxd(double, double); int __nvvm_mul24_i(int, int); double _ZL3expd(double); +double _ZL8copysigndd(double, double); } #endif diff --git a/runtime/src/x86/cudaKernelImpl.cpp b/runtime/src/x86/cudaKernelImpl.cpp index 56803c4..36eedf1 100644 --- a/runtime/src/x86/cudaKernelImpl.cpp +++ b/runtime/src/x86/cudaKernelImpl.cpp @@ -15,5 +15,9 @@ float __nv_fmodf(float x, float y) { return fmod(x, y); } int __nv_isnanf(float v) { return isnan(v); } int __nv_isinff(float v) { return isinf(v); } float __nv_fabsf(float v) { return abs(v); } +double __nv_fabsd(double v) { return fabs(v); } +double __nv_fmind(double a, double b) { return fmin(a, b); } +double __nv_fmaxd(double a, double b) { return fmax(a, b); } int __nvvm_mul24_i(int a, int b) { return a * b; } double _ZL3expd(double base) { return exp(base); } +double _ZL8copysigndd(double x, double y) { return copysign(x, y); }
diff --git a/runtime/threadPool/src/x86/api.cpp b/runtime/threadPool/src/x86/api.cpp index f0c0e5b..0095b37 100644 --- a/runtime/threadPool/src/x86/api.cpp +++ b/runtime/threadPool/src/x86/api.cpp @@ -17,7 +17,11 @@ Initialize the device */ int device_max_compute_units = 1; +bool device_initialized = false; int init_device() { + if (device_initialized) + return 0; + device_initialized = true; cu_device *device = (cu_device *)calloc(1, sizeof(cu_device)); if (device == NULL) return C_ERROR_MEMALLOC; @@ -231,6 +235,9 @@ void scheduler_uninit() { Counting Barrier basically */ void cuSynchronizeBarrier() { + if (!device_initialized) { + init_device(); + } while (1) { // (TODO): currently, we assume each kernel launch will have a // following sync