#include "ReplaceCudaBuiltin.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/Support/ToolOutputFile.h" #include #include #include using namespace llvm; /* insert sync after cudaKernel launch call void @_Z13staticReversePii(i32* %55, i32 64) %57 = call i32 @cudaDeviceSynchronize() */ void InsertSyncAfterKernelLaunch(llvm::Module *M) { LLVMContext *C = &M->getContext(); llvm::Type *Int32T = Type::getInt32Ty(*C); llvm::FunctionType *LauncherFuncT = FunctionType::get(Int32T, NULL); llvm::FunctionCallee _f = M->getOrInsertFunction("cudaDeviceSynchronize", LauncherFuncT); llvm::Function *func_launch = llvm::cast(_f.getCallee()); std::set launch_function_name; for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { Function *F = &(*i); auto func_name = F->getName().str(); for (Function::iterator b = F->begin(); b != F->end(); ++b) { BasicBlock *B = &(*b); for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) { Instruction *inst = &(*i); if (llvm::CallBase *callInst = llvm::dyn_cast(inst)) { if (Function *calledFunction = callInst->getCalledFunction()) { if (calledFunction->getName().startswith("cudaLaunchKernel")) { // F is a kernel launch function launch_function_name.insert(func_name); } } } } } } for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { Function *F = &(*i); for (Function::iterator b = F->begin(); b != F->end(); ++b) { BasicBlock *B = &(*b); for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) { Instruction *inst = &(*i); if (llvm::CallBase *callInst = llvm::dyn_cast(inst)) { if (Function *calledFunction = callInst->getCalledFunction()) { if (launch_function_name.find(calledFunction->getName().str()) != launch_function_name.end()) { // insert a sync after launch if (callInst->getNextNonDebugInstruction()) { llvm::CallInst::Create(func_launch, "", callInst->getNextNonDebugInstruction()); } } } } } } } } // Change to i8* bitcast (i8* (i8*)* @_Z9vecPKiS0_Pii_wrapper to i8*) // Original: i8* bitcast (void (i32*, i32*, i32*, i32)* @_Z9vecPKiS0_Pii to i8*) void ReplaceKernelLaunch(llvm::Module *M) { LLVMContext &context = M->getContext(); auto VoidTy = llvm::Type::getVoidTy(context); auto I8 = llvm::Type::getInt8PtrTy(context); std::map kernels; std::set need_remove; LLVMContext *C = &M->getContext(); llvm::Type *Int32T = Type::getInt32Ty(*C); llvm::Type *Int8T = Type::getInt8Ty(*C); llvm::FunctionType *LauncherFuncT = FunctionType::get(Type::getVoidTy(*C), NULL); llvm::FunctionType *LaunchFun2 = FunctionType::get(PointerType::get(PointerType::get(Int32T, 0), 0), NULL); bool done = false; std::set cuda_register_kernel_names; std::string str; llvm::raw_string_ostream ss(str); /* When using << >>, clang generates cudaPushCallConfiguration with the same function definition as the kernel definition in the kernel bitcode define internal void @__cuda_register_globals(i8** %0) { entry: %1 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void (float*, float*, i32, i32)* @_Z4Fan1PfS_ii to i8*), i8* getelementptr inbounds ([14 x i8], [14 x i8]* @0, i64 0, i64 0), i8* getelementptr inbounds ([14 x i8], [14 x i8]* @0, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null) %2 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void (float*, float*, float*, i32, i32, i32)* @_Z4Fan2PfS_S_iii to i8*), i8* getelementptr inbounds ([17 x i8], [17 x i8]* @1, i64 0, i64 0), i8* getelementptr inbounds ([17 x i8], [17 x i8]* @1, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null) ret void } */ Function *f_register_global = M->getFunction("__cuda_register_globals"); if (f_register_global) { for (Function::iterator b = f_register_global->begin(); b != f_register_global->end(); ++b) { BasicBlock *B = &(*b); for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) { Instruction *inst = &(*i); if (llvm::CallInst *callInst = llvm::dyn_cast(inst)) { if (Function *calledFunction = callInst->getCalledFunction()) { if (calledFunction->getName().str() == "__cudaRegisterFunction") { Value *callOperand = callInst->getArgOperand(1); Function *functionOperand = dyn_cast(callInst->getArgOperand(1)); // call function is wrapped in a bitcast if (functionOperand == NULL) { std::vector arg_sizes; functionOperand = dyn_cast(callOperand->stripPointerCasts()); cuda_register_kernel_names.insert( functionOperand->getName().str()); std::cout << "Cuda Register Global Kernel: " << functionOperand->getName().str() << std::endl; } } } } } } } bool host_changed = false; for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { Function *F = &(*i); auto func_name = F->getName().str(); for (Function::iterator b = F->begin(); b != F->end(); ++b) { BasicBlock *B = &(*b); for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) { Instruction *inst = &(*i); if (llvm::CallBase *callInst = llvm::dyn_cast(inst)) { if (Function *calledFunction = callInst->getCalledFunction()) { if (calledFunction->getName().startswith("cudaLaunchKernel")) { Value *callOperand = callInst->getArgOperand(0); Function *functionOperand = dyn_cast(callInst->getArgOperand(0)); // call function is wrapped in a bitcast if (functionOperand == NULL) { std::vector arg_sizes; functionOperand = dyn_cast(callOperand->stripPointerCasts()); FunctionType *ft = calledFunction->getFunctionType(); std::cout << " Parent (Caller) Function Name: " << func_name << ", cudaLaunchKernel Function: " << functionOperand->getName().str() << ", args " << functionOperand->arg_size() << std::endl; auto rep = kernels.find(functionOperand->getName().str()); if (rep != kernels.end()) { Function *FC = rep->second; BitCastInst *B = new BitCastInst(FC, I8, "", callInst); callInst->setArgOperand(0, B); continue; } std::vector Params; Params.push_back(I8); FunctionType *FT = FunctionType::get(VoidTy, Params, false); /* Because of the TODO in the 2nd if statement, need to get the prior name before _host is add */ std::string oldName = functionOperand->getName().str(); // if parent function is __host and same as the cudaKernelLaunch std::string newName = oldName + "_wrapper"; if (func_name == oldName && host_changed && oldName.find("_host") != std::string::npos) { newName = oldName.substr(0, oldName.length() - 5) + "_wrapper"; } std::cout << "Change Kernel Name to: " << newName << std::endl; Function *F = Function::Create(FT, Function::ExternalLinkage, newName, M); F->setDSOLocal(true); F->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global); BitCastInst *BC = new BitCastInst(F, I8, "", callInst); callInst->setArgOperand(0, BC); kernels.insert({functionOperand->getName().str(), F}); } } else if (cuda_register_kernel_names.find( calledFunction->getName()) != cuda_register_kernel_names.end()) { // if the called function collides with kernel definiton // TODO: some reason changes all occurences of the function name // for both cudaKernelLaunch calls and regular function call // errs() << *inst; host_changed = true; calledFunction->setName(calledFunction->getName() + "_host"); std::cout << std::endl; std::cout << "Change Host Function Name To: " << calledFunction->getName().str() << std::endl; } } } } } } } void ReplaceMemcpyToSymbol(llvm::Module *M) { LLVMContext &context = M->getContext(); auto I32 = llvm::Type::getInt32Ty(context); std::vector need_remove; for (Module::iterator F = M->begin(); F != M->end(); ++F) { for (auto BB = F->begin(); BB != F->end(); ++BB) { for (auto BI = BB->begin(); BI != BB->end(); BI++) { if (auto Call = dyn_cast(BI)) { if (Call->getCalledFunction()) { auto func_name = Call->getCalledFunction()->getName().str(); if (func_name == "cudaMemcpyToSymbol") { std::vector args; // i32 @cudaMemcpyToSymbol(i8* %1, i8* %2, i64 %3, i64 %4, i32 %5) args.push_back(llvm::Type::getInt8PtrTy(context)); args.push_back(llvm::Type::getInt8PtrTy(context)); args.push_back(llvm::Type::getInt64Ty(context)); args.push_back(llvm::Type::getInt64Ty(context)); args.push_back(llvm::Type::getInt32Ty(context)); llvm::FunctionType *func_Type = FunctionType::get(I32, args, false); llvm::FunctionCallee _f = M->getOrInsertFunction("cudaMemcpyToSymbol_host", func_Type); llvm::Function *func = llvm::cast(_f.getCallee()); // construct argument(s) std::vector func_args; func_args.push_back(Call->getArgOperand(0)); func_args.push_back(Call->getArgOperand(1)); func_args.push_back(Call->getArgOperand(2)); func_args.push_back(Call->getArgOperand(3)); func_args.push_back(Call->getArgOperand(4)); auto c_inst = llvm::CallInst::Create(func, func_args, "", Call); // insert Call->replaceAllUsesWith(c_inst); need_remove.push_back(Call); } } } } } } for (auto inst : need_remove) { inst->eraseFromParent(); } } void ReplaceCudaBuiltin(llvm::Module *M) { InsertSyncAfterKernelLaunch(M); ReplaceKernelLaunch(M); ReplaceMemcpyToSymbol(M); }