From d7668ccd867cf80e354cb372669e911aba76b5d5 Mon Sep 17 00:00:00 2001 From: RobinHan Date: Fri, 17 Jun 2022 22:20:13 -0400 Subject: [PATCH] [WIP] migriate to LLVM14 --- .../src/x86/ReplaceCudaBuiltin.cpp | 21 ++++- compilation/KernelTranslation/CMakeLists.txt | 2 +- .../KernelTranslation/include/x86/tool.h | 8 ++ .../src/x86/generate_x86_format.cpp | 19 ++-- .../KernelTranslation/src/x86/init.cpp | 4 +- .../src/x86/insert_warp_loop.cpp | 38 ++++---- .../src/x86/memory_hierarchy.cpp | 2 +- .../KernelTranslation/src/x86/performance.cpp | 3 +- .../KernelTranslation/src/x86/tool.cpp | 90 ++++++++++--------- .../KernelTranslation/src/x86/warp_func.cpp | 24 ++--- runtime/CMakeLists.txt | 4 +- runtime/src/x86/cudaRuntimeImpl.cpp | 38 +++++--- runtime/threadPool/CMakeLists.txt | 3 +- runtime/threadPool/src/x86/api.cpp | 7 +- 14 files changed, 163 insertions(+), 100 deletions(-) diff --git a/compilation/HostTranslation/src/x86/ReplaceCudaBuiltin.cpp b/compilation/HostTranslation/src/x86/ReplaceCudaBuiltin.cpp index 01a34b6..3991c42 100644 --- a/compilation/HostTranslation/src/x86/ReplaceCudaBuiltin.cpp +++ b/compilation/HostTranslation/src/x86/ReplaceCudaBuiltin.cpp @@ -8,6 +8,7 @@ #include "llvm/Support/ToolOutputFile.h" #include #include +#include #include using namespace llvm; @@ -200,8 +201,23 @@ void ReplaceKernelLaunch(llvm::Module *M) { prior name before _host is add */ std::string oldName = functionOperand->getName().str(); + // For LLVM>=14, it will add _device_stub prefix for the kernel + // name, thus, we need to remove the prefix + // example: + // from: _Z24__device_stub__HistogramPjS_jj + // to: HistogramPjS_jj + oldName = std::regex_replace(oldName, + std::regex("__device_stub__"), ""); + // remove _Z24 + for (int i = 2; i < oldName.length(); i++) { + if (oldName[i] >= '0' && oldName[i] <= '9') + continue; + oldName = oldName.substr(i); + break; + } - // if parent function is __host and same as the cudaKernelLaunch + // if parent function is __host and same as the + // cudaKernelLaunch std::string newName = oldName + "_wrapper"; if (func_name == oldName && host_changed && oldName.find("_host") != std::string::npos) { @@ -220,12 +236,11 @@ void ReplaceKernelLaunch(llvm::Module *M) { kernels.insert({functionOperand->getName().str(), F}); } } else if (cuda_register_kernel_names.find( - calledFunction->getName()) != + calledFunction->getName().str()) != cuda_register_kernel_names.end()) { // if the called function collides with kernel definiton // TODO: some reason changes all occurences of the function name // for both cudaKernelLaunch calls and regular function call - // errs() << *inst; host_changed = true; calledFunction->setName(calledFunction->getName() + "_host"); std::cout << std::endl; diff --git a/compilation/KernelTranslation/CMakeLists.txt b/compilation/KernelTranslation/CMakeLists.txt index 25968d6..a9b52dd 100644 --- a/compilation/KernelTranslation/CMakeLists.txt +++ b/compilation/KernelTranslation/CMakeLists.txt @@ -18,4 +18,4 @@ file(GLOB proj_HEADERS "include/x86/*.h") file(GLOB proj_SOURCES "src/x86/*.cpp") # Add core library. -add_library(${LIB_NAME} SHARED ${proj_HEADERS} ${proj_SOURCES}) +add_library(${LIB_NAME} STATIC ${proj_HEADERS} ${proj_SOURCES}) diff --git a/compilation/KernelTranslation/include/x86/tool.h b/compilation/KernelTranslation/include/x86/tool.h index e1b1e90..c4538f1 100644 --- a/compilation/KernelTranslation/include/x86/tool.h +++ b/compilation/KernelTranslation/include/x86/tool.h @@ -1,6 +1,8 @@ #ifndef __NVVM2x86_TOOL__ #define __NVVM2x86_TOOL__ +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" llvm::Module *LoadModuleFromFilr(char *file_name); void DumpModule(llvm::Module *M, char *file_name); @@ -22,4 +24,10 @@ bool has_barrier(llvm::BasicBlock *B); bool has_block_barrier(llvm::BasicBlock *B); bool has_barrier(llvm::Function *F); void replace_dynamic_shared_memory(llvm::Module *M); +llvm::LoadInst *createLoad(llvm::IRBuilder<> &B, llvm::Value *addr, + bool isVolatile = false); +llvm::Value *createInBoundsGEP(llvm::IRBuilder<> &B, llvm::Value *ptr, + llvm::ArrayRef idxlist); +llvm::Value *createGEP(llvm::IRBuilder<> &B, llvm::Value *ptr, + llvm::ArrayRef idxlist); #endif diff --git a/compilation/KernelTranslation/src/x86/generate_x86_format.cpp b/compilation/KernelTranslation/src/x86/generate_x86_format.cpp index b2594cf..9d563ff 100644 --- a/compilation/KernelTranslation/src/x86/generate_x86_format.cpp +++ b/compilation/KernelTranslation/src/x86/generate_x86_format.cpp @@ -11,7 +11,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/Verifier.h" #include "llvm/IRReader/IRReader.h" -#include "llvm/Support/TargetRegistry.h" +#include "llvm/MC/TargetRegistry.h" #include "llvm/Support/TargetSelect.h" #include "llvm/Support/ToolOutputFile.h" #include "llvm/Target/TargetMachine.h" @@ -19,6 +19,7 @@ #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/ValueMapper.h" #include +#include using namespace llvm; @@ -52,6 +53,14 @@ void decode_input(llvm::Module *M) { if (!isKernelFunction(M, F)) continue; auto func_name = F->getName().str(); + // remove mangle prefix + // remove _Z24 + for (int pos = 2; pos < func_name.length(); pos++) { + if (func_name[pos] >= '0' && func_name[pos] <= '9') + continue; + func_name = func_name.substr(pos); + break; + } llvm::IRBuilder<> Builder(M->getContext()); FunctionCallee fc = @@ -78,7 +87,7 @@ void decode_input(llvm::Module *M) { *M, Int32T, false, llvm::GlobalValue::ExternalLinkage, NULL, "thread_memory_size", NULL, llvm::GlobalValue::GeneralDynamicTLSModel, 0, false); - Value *loadedValue = Builder.CreateLoad(global_mem); + Value *loadedValue = createLoad(Builder, global_mem); llvm::FunctionType *LaunchFun2 = FunctionType::get( PointerType::get(PointerType::get(Int32T, 0), 0), NULL); @@ -120,12 +129,12 @@ void decode_input(llvm::Module *M) { Type *ArgType = ii->getType(); // calculate addr - Value *GEP = Builder.CreateGEP(input_arg, ConstantInt::get(Int32T, idx)); + Value *GEP = createGEP(Builder, input_arg, ConstantInt::get(Int32T, idx)); // load corresponding int* - GEP = Builder.CreateLoad(GEP); + GEP = createLoad(Builder, GEP); // bitcast GEP = Builder.CreateBitOrPointerCast(GEP, PointerType::get(ArgType, 0)); - Value *Arg = Builder.CreateLoad(GEP); + Value *Arg = createLoad(Builder, GEP); Arguments.push_back(Arg); ++idx; } diff --git a/compilation/KernelTranslation/src/x86/init.cpp b/compilation/KernelTranslation/src/x86/init.cpp index 62ae6ff..541994a 100644 --- a/compilation/KernelTranslation/src/x86/init.cpp +++ b/compilation/KernelTranslation/src/x86/init.cpp @@ -42,7 +42,7 @@ bool inline_warp_level_func(llvm::Module *M) { if (func_name == "_Z10__any_syncji" || func_name.find("shfl_down_sync") != std::string::npos) { InlineFunctionInfo IFI; - InlineFunction(c, IFI); + InlineFunction(*c, IFI); need_remove.insert(c->getCalledFunction()); changed = true; } @@ -102,7 +102,7 @@ bool inline_func_with_tid(llvm::Module *M) { } for (auto c : need_inline) { InlineFunctionInfo IFI; - InlineFunction(c, IFI); + InlineFunction(*c, IFI); } return changed; } diff --git a/compilation/KernelTranslation/src/x86/insert_warp_loop.cpp b/compilation/KernelTranslation/src/x86/insert_warp_loop.cpp index 1c090bf..549ac31 100644 --- a/compilation/KernelTranslation/src/x86/insert_warp_loop.cpp +++ b/compilation/KernelTranslation/src/x86/insert_warp_loop.cpp @@ -179,7 +179,7 @@ llvm::Instruction *GetContextArray(llvm::Instruction *instruction, llvm::AllocaInst *Alloca = nullptr; auto block_size_addr = M->getGlobalVariable("block_size"); - auto block_size = builder.CreateLoad(block_size_addr); + auto block_size = createLoad(builder, block_size_addr); Alloca = builder.CreateAlloca(AllocType, block_size, varName); contextArrays[varName] = Alloca; @@ -208,9 +208,9 @@ llvm::Instruction *AddContextSave(llvm::Instruction *instruction, std::vector gepArgs; auto inter_warp_index = - builder.CreateLoad(M->getGlobalVariable("inter_warp_index")); + createLoad(builder, M->getGlobalVariable("inter_warp_index")); auto intra_warp_index = - builder.CreateLoad(M->getGlobalVariable("intra_warp_index")); + createLoad(builder, M->getGlobalVariable("intra_warp_index")); auto thread_idx = builder.CreateBinOp( Instruction::Add, intra_warp_index, builder.CreateBinOp(Instruction::Mul, inter_warp_index, @@ -218,7 +218,7 @@ llvm::Instruction *AddContextSave(llvm::Instruction *instruction, "thread_idx"); gepArgs.push_back(thread_idx); - return builder.CreateStore(instruction, builder.CreateGEP(alloca, gepArgs)); + return builder.CreateStore(instruction, createGEP(builder, alloca, gepArgs)); } llvm::Instruction *AddContextRestore(llvm::Value *val, @@ -242,9 +242,9 @@ llvm::Instruction *AddContextRestore(llvm::Value *val, auto M = before->getParent()->getParent()->getParent(); auto I32 = llvm::Type::getInt32Ty(M->getContext()); auto inter_warp_index = - builder.CreateLoad(M->getGlobalVariable("inter_warp_index")); + createLoad(builder, M->getGlobalVariable("inter_warp_index")); auto intra_warp_index = - builder.CreateLoad(M->getGlobalVariable("intra_warp_index")); + createLoad(builder, M->getGlobalVariable("intra_warp_index")); auto thread_idx = builder.CreateBinOp( Instruction::Add, intra_warp_index, builder.CreateBinOp(Instruction::Mul, inter_warp_index, @@ -253,11 +253,11 @@ llvm::Instruction *AddContextRestore(llvm::Value *val, gepArgs.push_back(thread_idx); llvm::Instruction *gep = - dyn_cast(builder.CreateGEP(alloca, gepArgs)); + dyn_cast(createGEP(builder, alloca, gepArgs)); if (isAlloca) { return gep; } - return builder.CreateLoad(gep); + return createLoad(builder, gep); } void AddContextSaveRestore(llvm::Instruction *instruction, @@ -316,7 +316,7 @@ void handle_alloc(llvm::Function *F) { // generate a new alloc auto block_size_addr = M->getGlobalVariable("block_size"); IRBuilder<> builder(inst); - auto block_size = builder.CreateLoad(block_size_addr); + auto block_size = createLoad(builder, block_size_addr); llvm::Type *elementType = NULL; if (dyn_cast(inst)->getType()->getElementType()) { @@ -338,16 +338,16 @@ void handle_alloc(llvm::Function *F) { IRBuilder<> builder(user); // std::vector gepArgs; auto inter_warp_index = - builder.CreateLoad(M->getGlobalVariable("inter_warp_index")); + createLoad(builder, M->getGlobalVariable("inter_warp_index")); auto intra_warp_index = - builder.CreateLoad(M->getGlobalVariable("intra_warp_index")); + createLoad(builder, M->getGlobalVariable("intra_warp_index")); auto thread_idx = builder.CreateBinOp( Instruction::Add, intra_warp_index, builder.CreateBinOp(Instruction::Mul, inter_warp_index, ConstantInt::get(I32, 32)), "thread_idx"); - auto gep = builder.CreateGEP(Alloca, thread_idx); + auto gep = createGEP(builder, Alloca, thread_idx); user->replaceUsesOfWith(inst, gep); } @@ -479,19 +479,19 @@ BasicBlock *insert_loop_cond(llvm::BasicBlock *InsertCondBefore, auto inter_warp_index = M->getGlobalVariable("inter_warp_index"); auto block_size = M->getGlobalVariable("block_size"); auto warp_cnt = - builder.CreateBinOp(Instruction::SDiv, builder.CreateLoad(block_size), + builder.CreateBinOp(Instruction::SDiv, createLoad(builder, block_size), ConstantInt::get(I32, 32), "warp_number"); cmpResult = - builder.CreateICmpULT(builder.CreateLoad(inter_warp_index), warp_cnt); + builder.CreateICmpULT(createLoad(builder, inter_warp_index), warp_cnt); } else { auto intra_warp_index = M->getGlobalVariable("intra_warp_index"); auto block_size = M->getGlobalVariable("block_size"); if (!need_nested_loop) { - cmpResult = builder.CreateICmpULT(builder.CreateLoad(intra_warp_index), - builder.CreateLoad(block_size)); + cmpResult = builder.CreateICmpULT(createLoad(builder, intra_warp_index), + createLoad(builder, block_size)); } else { - cmpResult = builder.CreateICmpULT(builder.CreateLoad(intra_warp_index), + cmpResult = builder.CreateICmpULT(createLoad(builder, intra_warp_index), ConstantInt::get(I32, 32)); } } @@ -513,13 +513,13 @@ BasicBlock *insert_loop_inc(llvm::BasicBlock *InsertIncBefore, if (intra_warp_loop) { // intra warp auto intra_warp_index = M->getGlobalVariable("intra_warp_index"); auto new_index = builder.CreateBinOp( - Instruction::Add, builder.CreateLoad(intra_warp_index), + Instruction::Add, createLoad(builder, intra_warp_index), ConstantInt::get(I32, 1), "intra_warp_index_increment"); builder.CreateStore(new_index, intra_warp_index); } else { // inter warp auto inter_warp_index = M->getGlobalVariable("inter_warp_index"); auto new_index = builder.CreateBinOp( - Instruction::Add, builder.CreateLoad(inter_warp_index), + Instruction::Add, createLoad(builder, inter_warp_index), ConstantInt::get(I32, 1), "inter_warp_index_increment"); builder.CreateStore(new_index, inter_warp_index); } diff --git a/compilation/KernelTranslation/src/x86/memory_hierarchy.cpp b/compilation/KernelTranslation/src/x86/memory_hierarchy.cpp index 30fe5ab..7c2bcd2 100644 --- a/compilation/KernelTranslation/src/x86/memory_hierarchy.cpp +++ b/compilation/KernelTranslation/src/x86/memory_hierarchy.cpp @@ -99,7 +99,7 @@ void mem_share2global(llvm::Module *M) { global_memory->setComdat(comdat); global_memory->setLinkage(llvm::GlobalValue::LinkOnceODRLinkage); global_memory->setInitializer(undef); - global_memory->setAlignment(share_memory->getAlignment()); + global_memory->setAlignment(share_memory->getAlign()); corresponding_global_memory.insert( std::pair(share_memory, global_memory)); diff --git a/compilation/KernelTranslation/src/x86/performance.cpp b/compilation/KernelTranslation/src/x86/performance.cpp index 11d30e4..06a9cba 100644 --- a/compilation/KernelTranslation/src/x86/performance.cpp +++ b/compilation/KernelTranslation/src/x86/performance.cpp @@ -20,10 +20,11 @@ #include "llvm/IR/Module.h" #include "llvm/IR/ValueSymbolTable.h" #include "llvm/InitializePasses.h" +#include "llvm/MC/TargetRegistry.h" #include "llvm/PassInfo.h" #include "llvm/PassRegistry.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/Host.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/IPO/PassManagerBuilder.h" diff --git a/compilation/KernelTranslation/src/x86/tool.cpp b/compilation/KernelTranslation/src/x86/tool.cpp index c1652c5..fecca2f 100644 --- a/compilation/KernelTranslation/src/x86/tool.cpp +++ b/compilation/KernelTranslation/src/x86/tool.cpp @@ -1,8 +1,10 @@ #include "tool.h" #include "llvm/Bitcode/BitcodeWriter.h" +#include "llvm/Config/llvm-config.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" +#include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Instructions.h" @@ -10,9 +12,16 @@ #include "llvm/IR/Module.h" #include "llvm/IR/Verifier.h" #include "llvm/IRReader/IRReader.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/ManagedStatic.h" +#include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/ToolOutputFile.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/ValueMapper.h" + #include #include @@ -41,7 +50,7 @@ void DumpModule(llvm::Module *M, char *file_name) { std::string msg; llvm::raw_string_ostream os(msg); std::error_code EC; - ToolOutputFile Out(file_name, EC, sys::fs::F_None); + ToolOutputFile Out(file_name, EC, sys::fs::OF_None); if (EC) { errs() << "Fails to open output file: " << EC.message(); return; @@ -128,7 +137,7 @@ llvm::Instruction *BreakPHIToAllocas(PHINode *phi) { } builder.SetInsertPoint(phi); - llvm::Instruction *loadedValue = builder.CreateLoad(alloca); + llvm::Instruction *loadedValue = createLoad(builder, alloca); phi->replaceAllUsesWith(loadedValue); phi->eraseFromParent(); @@ -219,13 +228,12 @@ void replace_dynamic_shared_memory(llvm::Module *M) { if (!dynamic_shared_memory_addr) { return; } - auto load_shared_memory = - new LoadInst(dynamic_shared_memory_addr, "new_load"); + auto load_shared_memory = new LoadInst( + dynamic_shared_memory_addr->getType()->getPointerElementType(), + dynamic_shared_memory_addr, "new_load", &*F->begin()->begin()); auto new_bit_cast = new BitCastInst(load_shared_memory, dynamic_shared_memory_addr->getType(), "new_bit_cast"); - new_bit_cast->insertBefore(&*F->begin()->begin()); - load_shared_memory->insertBefore(new_bit_cast); dynamic_shared_memory_addr->replaceUsesWithIf(new_bit_cast, [&](Use &U) { auto *Instr = dyn_cast(U.getUser()); return Instr != new_bit_cast && Instr != load_shared_memory; @@ -281,21 +289,21 @@ void replace_built_in_function(llvm::Module *M) { auto block_size_addr = M->getGlobalVariable("block_size_x"); IRBuilder<> builder(context); builder.SetInsertPoint(Call); - auto val = builder.CreateLoad(block_size_addr); + auto val = createLoad(builder, block_size_addr); Call->replaceAllUsesWith(val); need_remove.push_back(Call); } else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.y") { auto block_size_addr = M->getGlobalVariable("block_size_y"); IRBuilder<> builder(context); builder.SetInsertPoint(Call); - auto val = builder.CreateLoad(block_size_addr); + auto val = createLoad(builder, block_size_addr); Call->replaceAllUsesWith(val); need_remove.push_back(Call); } else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.z") { auto block_size_addr = M->getGlobalVariable("block_size_z"); IRBuilder<> builder(context); builder.SetInsertPoint(Call); - auto val = builder.CreateLoad(block_size_addr); + auto val = createLoad(builder, block_size_addr); Call->replaceAllUsesWith(val); need_remove.push_back(Call); } else if (func_name == "llvm.nvvm.read.ptx.sreg.tid.x" || @@ -307,15 +315,15 @@ void replace_built_in_function(llvm::Module *M) { builder.SetInsertPoint(Call); auto thread_idx = builder.CreateBinOp( - Instruction::Mul, builder.CreateLoad(local_inter_warp_idx), + Instruction::Mul, createLoad(builder, local_inter_warp_idx), ConstantInt::get(I32, 32), ""); thread_idx = builder.CreateBinOp( - Instruction::Add, builder.CreateLoad(local_intra_warp_idx), + Instruction::Add, createLoad(builder, local_intra_warp_idx), thread_idx, "thread_idx"); thread_idx = builder.CreateBinOp( Instruction::SRem, thread_idx, - builder.CreateLoad(M->getGlobalVariable("block_size_x")), + createLoad(builder, M->getGlobalVariable("block_size_x")), "thread_id_x"); Call->replaceAllUsesWith(thread_idx); @@ -326,15 +334,15 @@ void replace_built_in_function(llvm::Module *M) { builder.SetInsertPoint(Call); auto thread_idx = builder.CreateBinOp( - Instruction::Mul, builder.CreateLoad(local_inter_warp_idx), + Instruction::Mul, createLoad(builder, local_inter_warp_idx), ConstantInt::get(I32, 32), ""); thread_idx = builder.CreateBinOp( - Instruction::Add, builder.CreateLoad(local_intra_warp_idx), + Instruction::Add, createLoad(builder, local_intra_warp_idx), thread_idx, "thread_idx"); // tidy = tid / block_dim.x thread_idx = builder.CreateBinOp( Instruction::SDiv, thread_idx, - builder.CreateLoad(M->getGlobalVariable("block_size_x")), + createLoad(builder, M->getGlobalVariable("block_size_x")), "thread_id_y"); Call->replaceAllUsesWith(thread_idx); need_remove.push_back(Call); @@ -350,21 +358,21 @@ void replace_built_in_function(llvm::Module *M) { auto block_index_addr = M->getGlobalVariable("block_index_x"); IRBuilder<> builder(context); builder.SetInsertPoint(Call); - auto block_idx = builder.CreateLoad(block_index_addr); + auto block_idx = createLoad(builder, block_index_addr); Call->replaceAllUsesWith(block_idx); need_remove.push_back(Call); } else if (func_name == "llvm.nvvm.read.ptx.sreg.ctaid.y") { auto block_index_addr = M->getGlobalVariable("block_index_y"); IRBuilder<> builder(context); builder.SetInsertPoint(Call); - auto block_idx = builder.CreateLoad(block_index_addr); + auto block_idx = createLoad(builder, block_index_addr); Call->replaceAllUsesWith(block_idx); need_remove.push_back(Call); } else if (func_name == "llvm.nvvm.read.ptx.sreg.ctaid.z") { auto block_index_addr = M->getGlobalVariable("block_index_z"); IRBuilder<> builder(context); builder.SetInsertPoint(Call); - auto block_idx = builder.CreateLoad(block_index_addr); + auto block_idx = createLoad(builder, block_index_addr); Call->replaceAllUsesWith(block_idx); need_remove.push_back(Call); } else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.x" || @@ -373,21 +381,21 @@ void replace_built_in_function(llvm::Module *M) { auto grid_size_addr = M->getGlobalVariable("grid_size_x"); IRBuilder<> builder(context); builder.SetInsertPoint(Call); - auto grid_size = builder.CreateLoad(grid_size_addr); + auto grid_size = createLoad(builder, grid_size_addr); Call->replaceAllUsesWith(grid_size); need_remove.push_back(Call); } else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.y") { auto grid_size_addr = M->getGlobalVariable("grid_size_y"); IRBuilder<> builder(context); builder.SetInsertPoint(Call); - auto grid_size = builder.CreateLoad(grid_size_addr); + auto grid_size = createLoad(builder, grid_size_addr); Call->replaceAllUsesWith(grid_size); need_remove.push_back(Call); } else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.z") { auto grid_size_addr = M->getGlobalVariable("grid_size_z"); IRBuilder<> builder(context); builder.SetInsertPoint(Call); - auto grid_size = builder.CreateLoad(grid_size_addr); + auto grid_size = createLoad(builder, grid_size_addr); Call->replaceAllUsesWith(grid_size); need_remove.push_back(Call); } @@ -401,7 +409,7 @@ void replace_built_in_function(llvm::Module *M) { // return the rank within the warp IRBuilder<> builder(context); builder.SetInsertPoint(Call); - auto intra_warp_index = builder.CreateLoad(local_intra_warp_idx); + auto intra_warp_index = createLoad(builder, local_intra_warp_idx); Call->replaceAllUsesWith(intra_warp_index); need_remove.push_back(Call); } @@ -460,7 +468,9 @@ void replace_built_in_function(llvm::Module *M) { src_alloc, // Alloca Indices, // Indices "", Call); - auto new_load = new LoadInst(new_GEP, "", Call); + auto new_load = + new LoadInst(new_GEP->getType()->getPointerElementType(), + new_GEP, "", Call); printf_args.push_back(new_load); } } @@ -531,7 +541,7 @@ void replace_asm_call(llvm::Module *M) { builder.SetInsertPoint(Call); auto intra_warp_index_addr = M->getGlobalVariable("intra_warp_index"); - auto intra_warp_index = builder.CreateLoad(intra_warp_index_addr); + auto intra_warp_index = createLoad(builder, intra_warp_index_addr); Call->replaceAllUsesWith(intra_warp_index); need_remove.push_back(Call); } @@ -652,20 +662,18 @@ bool find_barrier_in_region(llvm::BasicBlock *start, llvm::BasicBlock *end) { return 0; } -/* - Print IR to String Output for Debugging Purposes -*/ -// void printModule(llvm::Module *M) { -// std::string str; -// llvm::raw_string_ostream ss(str); -// std::cout << "### Printing Module ###" << std::endl; -// for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { -// Function *F = &(*i); -// auto func_name = F->getName().str(); -// std::cout << func_name << std::endl; -// for (Function::iterator b = F->begin(); b != F->end(); ++b) { -// BasicBlock *B = &(*b); -// errs() << *B; -// } -// } -// } +LoadInst *createLoad(IRBuilder<> &B, Value *addr, bool isVolatile) { + return B.CreateLoad(addr->getType()->getPointerElementType(), addr, + isVolatile); +} + +Value *createInBoundsGEP(IRBuilder<> &B, Value *ptr, + ArrayRef idxlist) { + return B.CreateInBoundsGEP( + ptr->getType()->getScalarType()->getPointerElementType(), ptr, idxlist); +} + +Value *createGEP(IRBuilder<> &B, Value *ptr, ArrayRef idxlist) { + return B.CreateGEP(ptr->getType()->getScalarType()->getPointerElementType(), + ptr, idxlist); +} diff --git a/compilation/KernelTranslation/src/x86/warp_func.cpp b/compilation/KernelTranslation/src/x86/warp_func.cpp index a25979a..cb81b6b 100644 --- a/compilation/KernelTranslation/src/x86/warp_func.cpp +++ b/compilation/KernelTranslation/src/x86/warp_func.cpp @@ -67,7 +67,8 @@ void handle_warp_vote(llvm::Module *M) { assert(warp_vote_ptr != NULL); auto intra_warp_index_addr = M->getGlobalVariable("intra_warp_index"); auto intra_warp_index = - new LoadInst(intra_warp_index_addr, "intra_warp_index", sync_inst); + new LoadInst(intra_warp_index_addr->getType()->getPointerElementType(), + intra_warp_index_addr, "intra_warp_index", sync_inst); auto GEP = GetElementPtrInst::Create(NULL, // Pointee type warp_vote_ptr, // Alloca @@ -168,23 +169,22 @@ void handle_warp_shfl(llvm::Module *M) { auto shfl_offset = shfl_inst->getArgOperand(2); auto intra_warp_index = - builder.CreateLoad(M->getGlobalVariable("intra_warp_index")); - builder.CreateStore( - shfl_variable, - builder.CreateGEP(warp_shfl_ptr, {ZERO, intra_warp_index})); + createLoad(builder, M->getGlobalVariable("intra_warp_index")); + builder.CreateStore(shfl_variable, createGEP(builder, warp_shfl_ptr, + {ZERO, intra_warp_index})); // we should create barrier before store CreateIntraWarpBarrier(intra_warp_index); // load shuffled data auto new_intra_warp_index = - builder.CreateLoad(M->getGlobalVariable("intra_warp_index")); + createLoad(builder, M->getGlobalVariable("intra_warp_index")); auto shfl_name = shfl_inst->getCalledFunction()->getName().str(); if (shfl_name.find("down") != shfl_name.npos) { auto calculate_offset = builder.CreateBinOp( Instruction::Add, new_intra_warp_index, shfl_offset); auto new_index = builder.CreateBinOp(Instruction::SRem, calculate_offset, ConstantInt::get(I32, 32)); - auto gep = builder.CreateGEP(warp_shfl_ptr, {ZERO, new_index}); - auto load_inst = builder.CreateLoad(gep); + auto gep = createGEP(builder, warp_shfl_ptr, {ZERO, new_index}); + auto load_inst = createLoad(builder, gep); // create barrier CreateIntraWarpBarrier(new_intra_warp_index); @@ -195,8 +195,8 @@ void handle_warp_shfl(llvm::Module *M) { Instruction::Sub, new_intra_warp_index, shfl_offset); auto new_index = builder.CreateBinOp(Instruction::SRem, calculate_offset, ConstantInt::get(I32, 32)); - auto gep = builder.CreateGEP(warp_shfl_ptr, {ZERO, new_index}); - auto load_inst = builder.CreateLoad(gep); + auto gep = createGEP(builder, warp_shfl_ptr, {ZERO, new_index}); + auto load_inst = createLoad(builder, gep); // create barrier CreateIntraWarpBarrier(new_intra_warp_index); @@ -207,8 +207,8 @@ void handle_warp_shfl(llvm::Module *M) { Instruction::Xor, new_intra_warp_index, shfl_offset); auto new_index = builder.CreateBinOp(Instruction::SRem, calculate_offset, ConstantInt::get(I32, 32)); - auto gep = builder.CreateGEP(warp_shfl_ptr, {ZERO, new_index}); - auto load_inst = builder.CreateLoad(gep); + auto gep = createGEP(builder, warp_shfl_ptr, {ZERO, new_index}); + auto load_inst = createLoad(builder, gep); // create barrier CreateIntraWarpBarrier(new_intra_warp_index); diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt index 6385824..94248b3 100644 --- a/runtime/CMakeLists.txt +++ b/runtime/CMakeLists.txt @@ -10,7 +10,9 @@ set(CMAKE_VERBOSE_MAKEFILE ON) add_subdirectory(threadPool) # compile x86 runtime library +include_directories(./include/) include_directories(./include/x86) +include_directories(./threadPool/include/) include_directories(./threadPool/include/x86) -file(GLOB proj_SOURCES "src/vortex/*.cpp") +file(GLOB proj_SOURCES "src/x86/*.cpp") add_library(${LIB_NAME} SHARED ${proj_SOURCES}) diff --git a/runtime/src/x86/cudaRuntimeImpl.cpp b/runtime/src/x86/cudaRuntimeImpl.cpp index d15dae1..3406285 100644 --- a/runtime/src/x86/cudaRuntimeImpl.cpp +++ b/runtime/src/x86/cudaRuntimeImpl.cpp @@ -9,13 +9,31 @@ #include #include #include -cudaError_t cudaGetDevice(int *devPtr) { *devPtr = 0; } +cudaError_t cudaGetDevice(int *devPtr) { + *devPtr = 0; + return cudaSuccess; +} const char *cudaGetErrorName(cudaError_t error) { return "SUCCESS\n"; } -cudaError_t cudaDeviceReset(void) { scheduler_uninit(); } -cudaError_t cudaDeviceSynchronize(void) { cuSynchronizeBarrier(); } -cudaError_t cudaThreadSynchronize(void) { cuSynchronizeBarrier(); } -cudaError_t cudaFree(void *devPtr) { free(devPtr); } -cudaError_t cudaFreeHost(void *devPtr) { free(devPtr); } +cudaError_t cudaDeviceReset(void) { + scheduler_uninit(); + return cudaSuccess; +} +cudaError_t cudaDeviceSynchronize(void) { + cuSynchronizeBarrier(); + return cudaSuccess; +} +cudaError_t cudaThreadSynchronize(void) { + cuSynchronizeBarrier(); + return cudaSuccess; +} +cudaError_t cudaFree(void *devPtr) { + free(devPtr); + return cudaSuccess; +} +cudaError_t cudaFreeHost(void *devPtr) { + free(devPtr); + return cudaSuccess; +} cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, @@ -31,7 +49,7 @@ cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, int lstatus = cuLaunchKernel(&ker); - // std::cout << "ret cudaLKernel" << std::endl; + return cudaSuccess; } cudaError_t cudaMalloc(void **devPtr, size_t size) { *devPtr = malloc(size); @@ -68,15 +86,13 @@ cudaError_t cudaMemcpy(void *dst, const void *src, size_t count, cudaError_t cudaMemcpyToSymbol_host(void *dst, const void *src, size_t count, size_t offset, cudaMemcpyKind kind) { assert(offset == 0 && "DO not support offset !=0\n"); - memcpy(dst, src + offset, count); + memcpy(dst, (char *)src + offset, count); return cudaSuccess; } cudaError_t cudaSetDevice(int device) { - // error checking - // std::cout << "cudaSetDevice Called" << std::endl; init_device(); - // std::cout << "cudaSetDevice Ret" << std::endl; + return cudaSuccess; } cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src) { diff --git a/runtime/threadPool/CMakeLists.txt b/runtime/threadPool/CMakeLists.txt index 2ad0b79..d807752 100644 --- a/runtime/threadPool/CMakeLists.txt +++ b/runtime/threadPool/CMakeLists.txt @@ -11,7 +11,8 @@ set(LIB_NAME threadPool) set(CMAKE_CXX_STANDARD 14) set(CMAKE_BUILD_TYPE Debug) +include_directories(./include) include_directories(./include/x86) -file(GLOB proj_SOURCES "src/vortex/*.cpp") +file(GLOB proj_SOURCES "src/x86/*.cpp") add_library(${LIB_NAME} SHARED ${proj_SOURCES}) diff --git a/runtime/threadPool/src/x86/api.cpp b/runtime/threadPool/src/x86/api.cpp index cd1c335..b952dba 100644 --- a/runtime/threadPool/src/x86/api.cpp +++ b/runtime/threadPool/src/x86/api.cpp @@ -29,7 +29,6 @@ int init_device() { // initialize scheduler int ret = scheduler_init(*device); - if (ret != C_SUCCESS) return ret; @@ -182,6 +181,7 @@ int schedulerEnqueueKernel(cu_kernel **k) { pthread_cond_broadcast(&(scheduler->wake_pool)); MUTEX_UNLOCK(scheduler->work_queue_lock); + return 0; } /* @@ -191,6 +191,7 @@ int cuLaunchKernel(cu_kernel **k) { if (!scheduler) { init_device(); } + std::cout << "launch\n" << std::flush; // Calculate Block Size N/numBlocks cu_kernel *ker = *k; @@ -238,6 +239,7 @@ int cuLaunchKernel(cu_kernel **k) { MUTEX_UNLOCK(((cstreamData *)(ker->stream))->stream_lock); } } + return 0; } /* @@ -346,7 +348,8 @@ RETRY: grid_size_x = gridDim.x; grid_size_y = gridDim.y; grid_size_z = gridDim.z; - dynamic_shared_memory = (int *)malloc(dynamic_shared_mem_size); + if (dynamic_shared_mem_size > 0) + dynamic_shared_memory = (int *)malloc(dynamic_shared_mem_size); int tmp = block_index; block_index_x = tmp / (grid_size_y * grid_size_z); tmp = tmp % (grid_size_y * grid_size_z);