[WIP] migrate to LLVM14
This commit is contained in:
parent
bcdcccecc9
commit
d7668ccd86
|
@ -8,6 +8,7 @@
|
|||
#include "llvm/Support/ToolOutputFile.h"
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <regex>
|
||||
#include <set>
|
||||
|
||||
using namespace llvm;
|
||||
|
@ -200,8 +201,23 @@ void ReplaceKernelLaunch(llvm::Module *M) {
|
|||
prior name before _host is add
|
||||
*/
|
||||
std::string oldName = functionOperand->getName().str();
|
||||
// For LLVM>=14, it will add _device_stub prefix for the kernel
|
||||
// name, thus, we need to remove the prefix
|
||||
// example:
|
||||
// from: _Z24__device_stub__HistogramPjS_jj
|
||||
// to: HistogramPjS_jj
|
||||
oldName = std::regex_replace(oldName,
|
||||
std::regex("__device_stub__"), "");
|
||||
// remove _Z24
|
||||
for (int i = 2; i < oldName.length(); i++) {
|
||||
if (oldName[i] >= '0' && oldName[i] <= '9')
|
||||
continue;
|
||||
oldName = oldName.substr(i);
|
||||
break;
|
||||
}
|
||||
|
||||
// if parent function is __host and same as the cudaKernelLaunch
|
||||
// if parent function is __host and same as the
|
||||
// cudaKernelLaunch
|
||||
std::string newName = oldName + "_wrapper";
|
||||
if (func_name == oldName && host_changed &&
|
||||
oldName.find("_host") != std::string::npos) {
|
||||
|
@ -220,12 +236,11 @@ void ReplaceKernelLaunch(llvm::Module *M) {
|
|||
kernels.insert({functionOperand->getName().str(), F});
|
||||
}
|
||||
} else if (cuda_register_kernel_names.find(
|
||||
calledFunction->getName()) !=
|
||||
calledFunction->getName().str()) !=
|
||||
cuda_register_kernel_names.end()) {
|
||||
// if the called function collides with kernel definition
|
||||
// TODO: for some reason this changes all occurrences of the function name
|
||||
// for both cudaKernelLaunch calls and regular function call
|
||||
// errs() << *inst;
|
||||
host_changed = true;
|
||||
calledFunction->setName(calledFunction->getName() + "_host");
|
||||
std::cout << std::endl;
|
||||
|
|
|
@ -18,4 +18,4 @@ file(GLOB proj_HEADERS "include/x86/*.h")
|
|||
file(GLOB proj_SOURCES "src/x86/*.cpp")
|
||||
|
||||
# Add core library.
|
||||
add_library(${LIB_NAME} SHARED ${proj_HEADERS} ${proj_SOURCES})
|
||||
add_library(${LIB_NAME} STATIC ${proj_HEADERS} ${proj_SOURCES})
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
#ifndef __NVVM2x86_TOOL__
|
||||
#define __NVVM2x86_TOOL__
|
||||
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
llvm::Module *LoadModuleFromFilr(char *file_name);
|
||||
void DumpModule(llvm::Module *M, char *file_name);
|
||||
|
@ -22,4 +24,10 @@ bool has_barrier(llvm::BasicBlock *B);
|
|||
bool has_block_barrier(llvm::BasicBlock *B);
|
||||
bool has_barrier(llvm::Function *F);
|
||||
void replace_dynamic_shared_memory(llvm::Module *M);
|
||||
llvm::LoadInst *createLoad(llvm::IRBuilder<> &B, llvm::Value *addr,
|
||||
bool isVolatile = false);
|
||||
llvm::Value *createInBoundsGEP(llvm::IRBuilder<> &B, llvm::Value *ptr,
|
||||
llvm::ArrayRef<llvm::Value *> idxlist);
|
||||
llvm::Value *createGEP(llvm::IRBuilder<> &B, llvm::Value *ptr,
|
||||
llvm::ArrayRef<llvm::Value *> idxlist);
|
||||
#endif
|
||||
|
|
|
@ -11,7 +11,7 @@
|
|||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/IR/Verifier.h"
|
||||
#include "llvm/IRReader/IRReader.h"
|
||||
#include "llvm/Support/TargetRegistry.h"
|
||||
#include "llvm/MC/TargetRegistry.h"
|
||||
#include "llvm/Support/TargetSelect.h"
|
||||
#include "llvm/Support/ToolOutputFile.h"
|
||||
#include "llvm/Target/TargetMachine.h"
|
||||
|
@ -19,6 +19,7 @@
|
|||
#include "llvm/Transforms/Utils/Cloning.h"
|
||||
#include "llvm/Transforms/Utils/ValueMapper.h"
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
|
@ -52,6 +53,14 @@ void decode_input(llvm::Module *M) {
|
|||
if (!isKernelFunction(M, F))
|
||||
continue;
|
||||
auto func_name = F->getName().str();
|
||||
// remove mangle prefix
|
||||
// remove _Z24
|
||||
for (int pos = 2; pos < func_name.length(); pos++) {
|
||||
if (func_name[pos] >= '0' && func_name[pos] <= '9')
|
||||
continue;
|
||||
func_name = func_name.substr(pos);
|
||||
break;
|
||||
}
|
||||
llvm::IRBuilder<> Builder(M->getContext());
|
||||
|
||||
FunctionCallee fc =
|
||||
|
@ -78,7 +87,7 @@ void decode_input(llvm::Module *M) {
|
|||
*M, Int32T, false, llvm::GlobalValue::ExternalLinkage, NULL,
|
||||
"thread_memory_size", NULL, llvm::GlobalValue::GeneralDynamicTLSModel,
|
||||
0, false);
|
||||
Value *loadedValue = Builder.CreateLoad(global_mem);
|
||||
Value *loadedValue = createLoad(Builder, global_mem);
|
||||
|
||||
llvm::FunctionType *LaunchFun2 = FunctionType::get(
|
||||
PointerType::get(PointerType::get(Int32T, 0), 0), NULL);
|
||||
|
@ -120,12 +129,12 @@ void decode_input(llvm::Module *M) {
|
|||
Type *ArgType = ii->getType();
|
||||
|
||||
// calculate addr
|
||||
Value *GEP = Builder.CreateGEP(input_arg, ConstantInt::get(Int32T, idx));
|
||||
Value *GEP = createGEP(Builder, input_arg, ConstantInt::get(Int32T, idx));
|
||||
// load corresponding int*
|
||||
GEP = Builder.CreateLoad(GEP);
|
||||
GEP = createLoad(Builder, GEP);
|
||||
// bitcast
|
||||
GEP = Builder.CreateBitOrPointerCast(GEP, PointerType::get(ArgType, 0));
|
||||
Value *Arg = Builder.CreateLoad(GEP);
|
||||
Value *Arg = createLoad(Builder, GEP);
|
||||
Arguments.push_back(Arg);
|
||||
++idx;
|
||||
}
|
||||
|
|
|
@ -42,7 +42,7 @@ bool inline_warp_level_func(llvm::Module *M) {
|
|||
if (func_name == "_Z10__any_syncji" ||
|
||||
func_name.find("shfl_down_sync") != std::string::npos) {
|
||||
InlineFunctionInfo IFI;
|
||||
InlineFunction(c, IFI);
|
||||
InlineFunction(*c, IFI);
|
||||
need_remove.insert(c->getCalledFunction());
|
||||
changed = true;
|
||||
}
|
||||
|
@ -102,7 +102,7 @@ bool inline_func_with_tid(llvm::Module *M) {
|
|||
}
|
||||
for (auto c : need_inline) {
|
||||
InlineFunctionInfo IFI;
|
||||
InlineFunction(c, IFI);
|
||||
InlineFunction(*c, IFI);
|
||||
}
|
||||
return changed;
|
||||
}
|
||||
|
|
|
@ -179,7 +179,7 @@ llvm::Instruction *GetContextArray(llvm::Instruction *instruction,
|
|||
llvm::AllocaInst *Alloca = nullptr;
|
||||
|
||||
auto block_size_addr = M->getGlobalVariable("block_size");
|
||||
auto block_size = builder.CreateLoad(block_size_addr);
|
||||
auto block_size = createLoad(builder, block_size_addr);
|
||||
Alloca = builder.CreateAlloca(AllocType, block_size, varName);
|
||||
|
||||
contextArrays[varName] = Alloca;
|
||||
|
@ -208,9 +208,9 @@ llvm::Instruction *AddContextSave(llvm::Instruction *instruction,
|
|||
std::vector<llvm::Value *> gepArgs;
|
||||
|
||||
auto inter_warp_index =
|
||||
builder.CreateLoad(M->getGlobalVariable("inter_warp_index"));
|
||||
createLoad(builder, M->getGlobalVariable("inter_warp_index"));
|
||||
auto intra_warp_index =
|
||||
builder.CreateLoad(M->getGlobalVariable("intra_warp_index"));
|
||||
createLoad(builder, M->getGlobalVariable("intra_warp_index"));
|
||||
auto thread_idx = builder.CreateBinOp(
|
||||
Instruction::Add, intra_warp_index,
|
||||
builder.CreateBinOp(Instruction::Mul, inter_warp_index,
|
||||
|
@ -218,7 +218,7 @@ llvm::Instruction *AddContextSave(llvm::Instruction *instruction,
|
|||
"thread_idx");
|
||||
gepArgs.push_back(thread_idx);
|
||||
|
||||
return builder.CreateStore(instruction, builder.CreateGEP(alloca, gepArgs));
|
||||
return builder.CreateStore(instruction, createGEP(builder, alloca, gepArgs));
|
||||
}
|
||||
|
||||
llvm::Instruction *AddContextRestore(llvm::Value *val,
|
||||
|
@ -242,9 +242,9 @@ llvm::Instruction *AddContextRestore(llvm::Value *val,
|
|||
auto M = before->getParent()->getParent()->getParent();
|
||||
auto I32 = llvm::Type::getInt32Ty(M->getContext());
|
||||
auto inter_warp_index =
|
||||
builder.CreateLoad(M->getGlobalVariable("inter_warp_index"));
|
||||
createLoad(builder, M->getGlobalVariable("inter_warp_index"));
|
||||
auto intra_warp_index =
|
||||
builder.CreateLoad(M->getGlobalVariable("intra_warp_index"));
|
||||
createLoad(builder, M->getGlobalVariable("intra_warp_index"));
|
||||
auto thread_idx = builder.CreateBinOp(
|
||||
Instruction::Add, intra_warp_index,
|
||||
builder.CreateBinOp(Instruction::Mul, inter_warp_index,
|
||||
|
@ -253,11 +253,11 @@ llvm::Instruction *AddContextRestore(llvm::Value *val,
|
|||
gepArgs.push_back(thread_idx);
|
||||
|
||||
llvm::Instruction *gep =
|
||||
dyn_cast<Instruction>(builder.CreateGEP(alloca, gepArgs));
|
||||
dyn_cast<Instruction>(createGEP(builder, alloca, gepArgs));
|
||||
if (isAlloca) {
|
||||
return gep;
|
||||
}
|
||||
return builder.CreateLoad(gep);
|
||||
return createLoad(builder, gep);
|
||||
}
|
||||
|
||||
void AddContextSaveRestore(llvm::Instruction *instruction,
|
||||
|
@ -316,7 +316,7 @@ void handle_alloc(llvm::Function *F) {
|
|||
// generate a new alloc
|
||||
auto block_size_addr = M->getGlobalVariable("block_size");
|
||||
IRBuilder<> builder(inst);
|
||||
auto block_size = builder.CreateLoad(block_size_addr);
|
||||
auto block_size = createLoad(builder, block_size_addr);
|
||||
|
||||
llvm::Type *elementType = NULL;
|
||||
if (dyn_cast<AllocaInst>(inst)->getType()->getElementType()) {
|
||||
|
@ -338,16 +338,16 @@ void handle_alloc(llvm::Function *F) {
|
|||
IRBuilder<> builder(user);
|
||||
// std::vector<llvm::Value *> gepArgs;
|
||||
auto inter_warp_index =
|
||||
builder.CreateLoad(M->getGlobalVariable("inter_warp_index"));
|
||||
createLoad(builder, M->getGlobalVariable("inter_warp_index"));
|
||||
auto intra_warp_index =
|
||||
builder.CreateLoad(M->getGlobalVariable("intra_warp_index"));
|
||||
createLoad(builder, M->getGlobalVariable("intra_warp_index"));
|
||||
auto thread_idx = builder.CreateBinOp(
|
||||
Instruction::Add, intra_warp_index,
|
||||
builder.CreateBinOp(Instruction::Mul, inter_warp_index,
|
||||
ConstantInt::get(I32, 32)),
|
||||
"thread_idx");
|
||||
|
||||
auto gep = builder.CreateGEP(Alloca, thread_idx);
|
||||
auto gep = createGEP(builder, Alloca, thread_idx);
|
||||
|
||||
user->replaceUsesOfWith(inst, gep);
|
||||
}
|
||||
|
@ -479,19 +479,19 @@ BasicBlock *insert_loop_cond(llvm::BasicBlock *InsertCondBefore,
|
|||
auto inter_warp_index = M->getGlobalVariable("inter_warp_index");
|
||||
auto block_size = M->getGlobalVariable("block_size");
|
||||
auto warp_cnt =
|
||||
builder.CreateBinOp(Instruction::SDiv, builder.CreateLoad(block_size),
|
||||
builder.CreateBinOp(Instruction::SDiv, createLoad(builder, block_size),
|
||||
ConstantInt::get(I32, 32), "warp_number");
|
||||
|
||||
cmpResult =
|
||||
builder.CreateICmpULT(builder.CreateLoad(inter_warp_index), warp_cnt);
|
||||
builder.CreateICmpULT(createLoad(builder, inter_warp_index), warp_cnt);
|
||||
} else {
|
||||
auto intra_warp_index = M->getGlobalVariable("intra_warp_index");
|
||||
auto block_size = M->getGlobalVariable("block_size");
|
||||
if (!need_nested_loop) {
|
||||
cmpResult = builder.CreateICmpULT(builder.CreateLoad(intra_warp_index),
|
||||
builder.CreateLoad(block_size));
|
||||
cmpResult = builder.CreateICmpULT(createLoad(builder, intra_warp_index),
|
||||
createLoad(builder, block_size));
|
||||
} else {
|
||||
cmpResult = builder.CreateICmpULT(builder.CreateLoad(intra_warp_index),
|
||||
cmpResult = builder.CreateICmpULT(createLoad(builder, intra_warp_index),
|
||||
ConstantInt::get(I32, 32));
|
||||
}
|
||||
}
|
||||
|
@ -513,13 +513,13 @@ BasicBlock *insert_loop_inc(llvm::BasicBlock *InsertIncBefore,
|
|||
if (intra_warp_loop) { // intra warp
|
||||
auto intra_warp_index = M->getGlobalVariable("intra_warp_index");
|
||||
auto new_index = builder.CreateBinOp(
|
||||
Instruction::Add, builder.CreateLoad(intra_warp_index),
|
||||
Instruction::Add, createLoad(builder, intra_warp_index),
|
||||
ConstantInt::get(I32, 1), "intra_warp_index_increment");
|
||||
builder.CreateStore(new_index, intra_warp_index);
|
||||
} else { // inter warp
|
||||
auto inter_warp_index = M->getGlobalVariable("inter_warp_index");
|
||||
auto new_index = builder.CreateBinOp(
|
||||
Instruction::Add, builder.CreateLoad(inter_warp_index),
|
||||
Instruction::Add, createLoad(builder, inter_warp_index),
|
||||
ConstantInt::get(I32, 1), "inter_warp_index_increment");
|
||||
builder.CreateStore(new_index, inter_warp_index);
|
||||
}
|
||||
|
|
|
@ -99,7 +99,7 @@ void mem_share2global(llvm::Module *M) {
|
|||
global_memory->setComdat(comdat);
|
||||
global_memory->setLinkage(llvm::GlobalValue::LinkOnceODRLinkage);
|
||||
global_memory->setInitializer(undef);
|
||||
global_memory->setAlignment(share_memory->getAlignment());
|
||||
global_memory->setAlignment(share_memory->getAlign());
|
||||
corresponding_global_memory.insert(
|
||||
std::pair<GlobalVariable *, GlobalVariable *>(share_memory,
|
||||
global_memory));
|
||||
|
|
|
@ -20,10 +20,11 @@
|
|||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/IR/ValueSymbolTable.h"
|
||||
#include "llvm/InitializePasses.h"
|
||||
#include "llvm/MC/TargetRegistry.h"
|
||||
#include "llvm/PassInfo.h"
|
||||
#include "llvm/PassRegistry.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Support/TargetRegistry.h"
|
||||
#include "llvm/Support/Host.h"
|
||||
#include "llvm/Target/TargetMachine.h"
|
||||
#include "llvm/Target/TargetOptions.h"
|
||||
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
|
||||
|
|
|
@ -1,8 +1,10 @@
|
|||
#include "tool.h"
|
||||
#include "llvm/Bitcode/BitcodeWriter.h"
|
||||
#include "llvm/Config/llvm-config.h"
|
||||
#include "llvm/IR/Constants.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/GlobalValue.h"
|
||||
#include "llvm/IR/GlobalVariable.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/InlineAsm.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
|
@ -10,9 +12,16 @@
|
|||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/IR/Verifier.h"
|
||||
#include "llvm/IRReader/IRReader.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Support/ErrorOr.h"
|
||||
#include "llvm/Support/FileSystem.h"
|
||||
#include "llvm/Support/ManagedStatic.h"
|
||||
#include "llvm/Support/MemoryBuffer.h"
|
||||
#include "llvm/Support/ToolOutputFile.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
#include "llvm/Transforms/Utils/Cloning.h"
|
||||
#include "llvm/Transforms/Utils/ValueMapper.h"
|
||||
|
||||
#include <iostream>
|
||||
#include <set>
|
||||
|
||||
|
@ -41,7 +50,7 @@ void DumpModule(llvm::Module *M, char *file_name) {
|
|||
std::string msg;
|
||||
llvm::raw_string_ostream os(msg);
|
||||
std::error_code EC;
|
||||
ToolOutputFile Out(file_name, EC, sys::fs::F_None);
|
||||
ToolOutputFile Out(file_name, EC, sys::fs::OF_None);
|
||||
if (EC) {
|
||||
errs() << "Fails to open output file: " << EC.message();
|
||||
return;
|
||||
|
@ -128,7 +137,7 @@ llvm::Instruction *BreakPHIToAllocas(PHINode *phi) {
|
|||
}
|
||||
builder.SetInsertPoint(phi);
|
||||
|
||||
llvm::Instruction *loadedValue = builder.CreateLoad(alloca);
|
||||
llvm::Instruction *loadedValue = createLoad(builder, alloca);
|
||||
phi->replaceAllUsesWith(loadedValue);
|
||||
phi->eraseFromParent();
|
||||
|
||||
|
@ -219,13 +228,12 @@ void replace_dynamic_shared_memory(llvm::Module *M) {
|
|||
if (!dynamic_shared_memory_addr) {
|
||||
return;
|
||||
}
|
||||
auto load_shared_memory =
|
||||
new LoadInst(dynamic_shared_memory_addr, "new_load");
|
||||
auto load_shared_memory = new LoadInst(
|
||||
dynamic_shared_memory_addr->getType()->getPointerElementType(),
|
||||
dynamic_shared_memory_addr, "new_load", &*F->begin()->begin());
|
||||
auto new_bit_cast =
|
||||
new BitCastInst(load_shared_memory,
|
||||
dynamic_shared_memory_addr->getType(), "new_bit_cast");
|
||||
new_bit_cast->insertBefore(&*F->begin()->begin());
|
||||
load_shared_memory->insertBefore(new_bit_cast);
|
||||
dynamic_shared_memory_addr->replaceUsesWithIf(new_bit_cast, [&](Use &U) {
|
||||
auto *Instr = dyn_cast<Instruction>(U.getUser());
|
||||
return Instr != new_bit_cast && Instr != load_shared_memory;
|
||||
|
@ -281,21 +289,21 @@ void replace_built_in_function(llvm::Module *M) {
|
|||
auto block_size_addr = M->getGlobalVariable("block_size_x");
|
||||
IRBuilder<> builder(context);
|
||||
builder.SetInsertPoint(Call);
|
||||
auto val = builder.CreateLoad(block_size_addr);
|
||||
auto val = createLoad(builder, block_size_addr);
|
||||
Call->replaceAllUsesWith(val);
|
||||
need_remove.push_back(Call);
|
||||
} else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.y") {
|
||||
auto block_size_addr = M->getGlobalVariable("block_size_y");
|
||||
IRBuilder<> builder(context);
|
||||
builder.SetInsertPoint(Call);
|
||||
auto val = builder.CreateLoad(block_size_addr);
|
||||
auto val = createLoad(builder, block_size_addr);
|
||||
Call->replaceAllUsesWith(val);
|
||||
need_remove.push_back(Call);
|
||||
} else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.z") {
|
||||
auto block_size_addr = M->getGlobalVariable("block_size_z");
|
||||
IRBuilder<> builder(context);
|
||||
builder.SetInsertPoint(Call);
|
||||
auto val = builder.CreateLoad(block_size_addr);
|
||||
auto val = createLoad(builder, block_size_addr);
|
||||
Call->replaceAllUsesWith(val);
|
||||
need_remove.push_back(Call);
|
||||
} else if (func_name == "llvm.nvvm.read.ptx.sreg.tid.x" ||
|
||||
|
@ -307,15 +315,15 @@ void replace_built_in_function(llvm::Module *M) {
|
|||
builder.SetInsertPoint(Call);
|
||||
|
||||
auto thread_idx = builder.CreateBinOp(
|
||||
Instruction::Mul, builder.CreateLoad(local_inter_warp_idx),
|
||||
Instruction::Mul, createLoad(builder, local_inter_warp_idx),
|
||||
ConstantInt::get(I32, 32), "");
|
||||
thread_idx = builder.CreateBinOp(
|
||||
Instruction::Add, builder.CreateLoad(local_intra_warp_idx),
|
||||
Instruction::Add, createLoad(builder, local_intra_warp_idx),
|
||||
thread_idx, "thread_idx");
|
||||
|
||||
thread_idx = builder.CreateBinOp(
|
||||
Instruction::SRem, thread_idx,
|
||||
builder.CreateLoad(M->getGlobalVariable("block_size_x")),
|
||||
createLoad(builder, M->getGlobalVariable("block_size_x")),
|
||||
"thread_id_x");
|
||||
|
||||
Call->replaceAllUsesWith(thread_idx);
|
||||
|
@ -326,15 +334,15 @@ void replace_built_in_function(llvm::Module *M) {
|
|||
builder.SetInsertPoint(Call);
|
||||
|
||||
auto thread_idx = builder.CreateBinOp(
|
||||
Instruction::Mul, builder.CreateLoad(local_inter_warp_idx),
|
||||
Instruction::Mul, createLoad(builder, local_inter_warp_idx),
|
||||
ConstantInt::get(I32, 32), "");
|
||||
thread_idx = builder.CreateBinOp(
|
||||
Instruction::Add, builder.CreateLoad(local_intra_warp_idx),
|
||||
Instruction::Add, createLoad(builder, local_intra_warp_idx),
|
||||
thread_idx, "thread_idx");
|
||||
// tidy = tid / block_dim.x
|
||||
thread_idx = builder.CreateBinOp(
|
||||
Instruction::SDiv, thread_idx,
|
||||
builder.CreateLoad(M->getGlobalVariable("block_size_x")),
|
||||
createLoad(builder, M->getGlobalVariable("block_size_x")),
|
||||
"thread_id_y");
|
||||
Call->replaceAllUsesWith(thread_idx);
|
||||
need_remove.push_back(Call);
|
||||
|
@ -350,21 +358,21 @@ void replace_built_in_function(llvm::Module *M) {
|
|||
auto block_index_addr = M->getGlobalVariable("block_index_x");
|
||||
IRBuilder<> builder(context);
|
||||
builder.SetInsertPoint(Call);
|
||||
auto block_idx = builder.CreateLoad(block_index_addr);
|
||||
auto block_idx = createLoad(builder, block_index_addr);
|
||||
Call->replaceAllUsesWith(block_idx);
|
||||
need_remove.push_back(Call);
|
||||
} else if (func_name == "llvm.nvvm.read.ptx.sreg.ctaid.y") {
|
||||
auto block_index_addr = M->getGlobalVariable("block_index_y");
|
||||
IRBuilder<> builder(context);
|
||||
builder.SetInsertPoint(Call);
|
||||
auto block_idx = builder.CreateLoad(block_index_addr);
|
||||
auto block_idx = createLoad(builder, block_index_addr);
|
||||
Call->replaceAllUsesWith(block_idx);
|
||||
need_remove.push_back(Call);
|
||||
} else if (func_name == "llvm.nvvm.read.ptx.sreg.ctaid.z") {
|
||||
auto block_index_addr = M->getGlobalVariable("block_index_z");
|
||||
IRBuilder<> builder(context);
|
||||
builder.SetInsertPoint(Call);
|
||||
auto block_idx = builder.CreateLoad(block_index_addr);
|
||||
auto block_idx = createLoad(builder, block_index_addr);
|
||||
Call->replaceAllUsesWith(block_idx);
|
||||
need_remove.push_back(Call);
|
||||
} else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.x" ||
|
||||
|
@ -373,21 +381,21 @@ void replace_built_in_function(llvm::Module *M) {
|
|||
auto grid_size_addr = M->getGlobalVariable("grid_size_x");
|
||||
IRBuilder<> builder(context);
|
||||
builder.SetInsertPoint(Call);
|
||||
auto grid_size = builder.CreateLoad(grid_size_addr);
|
||||
auto grid_size = createLoad(builder, grid_size_addr);
|
||||
Call->replaceAllUsesWith(grid_size);
|
||||
need_remove.push_back(Call);
|
||||
} else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.y") {
|
||||
auto grid_size_addr = M->getGlobalVariable("grid_size_y");
|
||||
IRBuilder<> builder(context);
|
||||
builder.SetInsertPoint(Call);
|
||||
auto grid_size = builder.CreateLoad(grid_size_addr);
|
||||
auto grid_size = createLoad(builder, grid_size_addr);
|
||||
Call->replaceAllUsesWith(grid_size);
|
||||
need_remove.push_back(Call);
|
||||
} else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.z") {
|
||||
auto grid_size_addr = M->getGlobalVariable("grid_size_z");
|
||||
IRBuilder<> builder(context);
|
||||
builder.SetInsertPoint(Call);
|
||||
auto grid_size = builder.CreateLoad(grid_size_addr);
|
||||
auto grid_size = createLoad(builder, grid_size_addr);
|
||||
Call->replaceAllUsesWith(grid_size);
|
||||
need_remove.push_back(Call);
|
||||
}
|
||||
|
@ -401,7 +409,7 @@ void replace_built_in_function(llvm::Module *M) {
|
|||
// return the rank within the warp
|
||||
IRBuilder<> builder(context);
|
||||
builder.SetInsertPoint(Call);
|
||||
auto intra_warp_index = builder.CreateLoad(local_intra_warp_idx);
|
||||
auto intra_warp_index = createLoad(builder, local_intra_warp_idx);
|
||||
Call->replaceAllUsesWith(intra_warp_index);
|
||||
need_remove.push_back(Call);
|
||||
}
|
||||
|
@ -460,7 +468,9 @@ void replace_built_in_function(llvm::Module *M) {
|
|||
src_alloc, // Alloca
|
||||
Indices, // Indices
|
||||
"", Call);
|
||||
auto new_load = new LoadInst(new_GEP, "", Call);
|
||||
auto new_load =
|
||||
new LoadInst(new_GEP->getType()->getPointerElementType(),
|
||||
new_GEP, "", Call);
|
||||
printf_args.push_back(new_load);
|
||||
}
|
||||
}
|
||||
|
@ -531,7 +541,7 @@ void replace_asm_call(llvm::Module *M) {
|
|||
builder.SetInsertPoint(Call);
|
||||
auto intra_warp_index_addr =
|
||||
M->getGlobalVariable("intra_warp_index");
|
||||
auto intra_warp_index = builder.CreateLoad(intra_warp_index_addr);
|
||||
auto intra_warp_index = createLoad(builder, intra_warp_index_addr);
|
||||
Call->replaceAllUsesWith(intra_warp_index);
|
||||
need_remove.push_back(Call);
|
||||
}
|
||||
|
@ -652,20 +662,18 @@ bool find_barrier_in_region(llvm::BasicBlock *start, llvm::BasicBlock *end) {
|
|||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
Print IR to String Output for Debugging Purposes
|
||||
*/
|
||||
// void printModule(llvm::Module *M) {
|
||||
// std::string str;
|
||||
// llvm::raw_string_ostream ss(str);
|
||||
// std::cout << "### Printing Module ###" << std::endl;
|
||||
// for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||
// Function *F = &(*i);
|
||||
// auto func_name = F->getName().str();
|
||||
// std::cout << func_name << std::endl;
|
||||
// for (Function::iterator b = F->begin(); b != F->end(); ++b) {
|
||||
// BasicBlock *B = &(*b);
|
||||
// errs() << *B;
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// Emits a load from `addr` through builder `B`. The loaded type is taken from
// the pointee type of `addr`'s pointer type, and `isVolatile` is forwarded to
// IRBuilder::CreateLoad unchanged. Returns the created LoadInst.
// NOTE(review): getPointerElementType() presumably requires typed (non-opaque)
// pointers -- confirm before enabling opaque pointers.
LoadInst *createLoad(IRBuilder<> &B, Value *addr, bool isVolatile) {
|
||||
return B.CreateLoad(addr->getType()->getPointerElementType(), addr,
|
||||
isVolatile);
|
||||
}
|
||||
|
||||
// Emits an inbounds GEP on `ptr` with indices `idxlist` through builder `B`.
// The source element type passed to IRBuilder::CreateInBoundsGEP is derived
// from `ptr` itself: the pointee type of its scalar type.
// NOTE(review): getPointerElementType() presumably requires typed (non-opaque)
// pointers -- confirm before enabling opaque pointers.
Value *createInBoundsGEP(IRBuilder<> &B, Value *ptr,
|
||||
ArrayRef<Value *> idxlist) {
|
||||
return B.CreateInBoundsGEP(
|
||||
ptr->getType()->getScalarType()->getPointerElementType(), ptr, idxlist);
|
||||
}
|
||||
|
||||
// Emits a (non-inbounds) GEP on `ptr` with indices `idxlist` through builder
// `B`. The source element type passed to IRBuilder::CreateGEP is derived from
// `ptr` itself: the pointee type of its scalar type.
// NOTE(review): getPointerElementType() presumably requires typed (non-opaque)
// pointers -- confirm before enabling opaque pointers.
Value *createGEP(IRBuilder<> &B, Value *ptr, ArrayRef<Value *> idxlist) {
|
||||
return B.CreateGEP(ptr->getType()->getScalarType()->getPointerElementType(),
|
||||
ptr, idxlist);
|
||||
}
|
||||
|
|
|
@ -67,7 +67,8 @@ void handle_warp_vote(llvm::Module *M) {
|
|||
assert(warp_vote_ptr != NULL);
|
||||
auto intra_warp_index_addr = M->getGlobalVariable("intra_warp_index");
|
||||
auto intra_warp_index =
|
||||
new LoadInst(intra_warp_index_addr, "intra_warp_index", sync_inst);
|
||||
new LoadInst(intra_warp_index_addr->getType()->getPointerElementType(),
|
||||
intra_warp_index_addr, "intra_warp_index", sync_inst);
|
||||
|
||||
auto GEP = GetElementPtrInst::Create(NULL, // Pointee type
|
||||
warp_vote_ptr, // Alloca
|
||||
|
@ -168,23 +169,22 @@ void handle_warp_shfl(llvm::Module *M) {
|
|||
auto shfl_offset = shfl_inst->getArgOperand(2);
|
||||
|
||||
auto intra_warp_index =
|
||||
builder.CreateLoad(M->getGlobalVariable("intra_warp_index"));
|
||||
builder.CreateStore(
|
||||
shfl_variable,
|
||||
builder.CreateGEP(warp_shfl_ptr, {ZERO, intra_warp_index}));
|
||||
createLoad(builder, M->getGlobalVariable("intra_warp_index"));
|
||||
builder.CreateStore(shfl_variable, createGEP(builder, warp_shfl_ptr,
|
||||
{ZERO, intra_warp_index}));
|
||||
// we should create barrier before store
|
||||
CreateIntraWarpBarrier(intra_warp_index);
|
||||
// load shuffled data
|
||||
auto new_intra_warp_index =
|
||||
builder.CreateLoad(M->getGlobalVariable("intra_warp_index"));
|
||||
createLoad(builder, M->getGlobalVariable("intra_warp_index"));
|
||||
auto shfl_name = shfl_inst->getCalledFunction()->getName().str();
|
||||
if (shfl_name.find("down") != shfl_name.npos) {
|
||||
auto calculate_offset = builder.CreateBinOp(
|
||||
Instruction::Add, new_intra_warp_index, shfl_offset);
|
||||
auto new_index = builder.CreateBinOp(Instruction::SRem, calculate_offset,
|
||||
ConstantInt::get(I32, 32));
|
||||
auto gep = builder.CreateGEP(warp_shfl_ptr, {ZERO, new_index});
|
||||
auto load_inst = builder.CreateLoad(gep);
|
||||
auto gep = createGEP(builder, warp_shfl_ptr, {ZERO, new_index});
|
||||
auto load_inst = createLoad(builder, gep);
|
||||
|
||||
// create barrier
|
||||
CreateIntraWarpBarrier(new_intra_warp_index);
|
||||
|
@ -195,8 +195,8 @@ void handle_warp_shfl(llvm::Module *M) {
|
|||
Instruction::Sub, new_intra_warp_index, shfl_offset);
|
||||
auto new_index = builder.CreateBinOp(Instruction::SRem, calculate_offset,
|
||||
ConstantInt::get(I32, 32));
|
||||
auto gep = builder.CreateGEP(warp_shfl_ptr, {ZERO, new_index});
|
||||
auto load_inst = builder.CreateLoad(gep);
|
||||
auto gep = createGEP(builder, warp_shfl_ptr, {ZERO, new_index});
|
||||
auto load_inst = createLoad(builder, gep);
|
||||
|
||||
// create barrier
|
||||
CreateIntraWarpBarrier(new_intra_warp_index);
|
||||
|
@ -207,8 +207,8 @@ void handle_warp_shfl(llvm::Module *M) {
|
|||
Instruction::Xor, new_intra_warp_index, shfl_offset);
|
||||
auto new_index = builder.CreateBinOp(Instruction::SRem, calculate_offset,
|
||||
ConstantInt::get(I32, 32));
|
||||
auto gep = builder.CreateGEP(warp_shfl_ptr, {ZERO, new_index});
|
||||
auto load_inst = builder.CreateLoad(gep);
|
||||
auto gep = createGEP(builder, warp_shfl_ptr, {ZERO, new_index});
|
||||
auto load_inst = createLoad(builder, gep);
|
||||
|
||||
// create barrier
|
||||
CreateIntraWarpBarrier(new_intra_warp_index);
|
||||
|
|
|
@ -10,7 +10,9 @@ set(CMAKE_VERBOSE_MAKEFILE ON)
|
|||
add_subdirectory(threadPool)
|
||||
|
||||
# compile x86 runtime library
|
||||
include_directories(./include/)
|
||||
include_directories(./include/x86)
|
||||
include_directories(./threadPool/include/)
|
||||
include_directories(./threadPool/include/x86)
|
||||
file(GLOB proj_SOURCES "src/vortex/*.cpp")
|
||||
file(GLOB proj_SOURCES "src/x86/*.cpp")
|
||||
add_library(${LIB_NAME} SHARED ${proj_SOURCES})
|
||||
|
|
|
@ -9,13 +9,31 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
cudaError_t cudaGetDevice(int *devPtr) { *devPtr = 0; }
|
||||
// Writes 0 to *devPtr as the current device index and returns cudaSuccess.
// devPtr is dereferenced without a null check.
cudaError_t cudaGetDevice(int *devPtr) {
|
||||
*devPtr = 0;
|
||||
return cudaSuccess;
|
||||
}
|
||||
// Returns the fixed string "SUCCESS\n" for every input; `error` is not read.
const char *cudaGetErrorName(cudaError_t error) { return "SUCCESS\n"; }
|
||||
cudaError_t cudaDeviceReset(void) { scheduler_uninit(); }
|
||||
cudaError_t cudaDeviceSynchronize(void) { cuSynchronizeBarrier(); }
|
||||
cudaError_t cudaThreadSynchronize(void) { cuSynchronizeBarrier(); }
|
||||
cudaError_t cudaFree(void *devPtr) { free(devPtr); }
|
||||
cudaError_t cudaFreeHost(void *devPtr) { free(devPtr); }
|
||||
// Calls scheduler_uninit() and then unconditionally returns cudaSuccess.
cudaError_t cudaDeviceReset(void) {
|
||||
scheduler_uninit();
|
||||
return cudaSuccess;
|
||||
}
|
||||
// Calls cuSynchronizeBarrier() and then unconditionally returns cudaSuccess.
cudaError_t cudaDeviceSynchronize(void) {
|
||||
cuSynchronizeBarrier();
|
||||
return cudaSuccess;
|
||||
}
|
||||
// Same body as cudaDeviceSynchronize: calls cuSynchronizeBarrier() and then
// unconditionally returns cudaSuccess.
cudaError_t cudaThreadSynchronize(void) {
|
||||
cuSynchronizeBarrier();
|
||||
return cudaSuccess;
|
||||
}
|
||||
// Releases devPtr with free() and returns cudaSuccess. cudaMalloc in this file
// obtains its memory with malloc(), so free() is the matching release call.
cudaError_t cudaFree(void *devPtr) {
|
||||
free(devPtr);
|
||||
return cudaSuccess;
|
||||
}
|
||||
// Releases devPtr with free() and returns cudaSuccess; the body is identical
// to cudaFree.
cudaError_t cudaFreeHost(void *devPtr) {
|
||||
free(devPtr);
|
||||
return cudaSuccess;
|
||||
}
|
||||
|
||||
cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
|
||||
void **args, size_t sharedMem,
|
||||
|
@ -31,7 +49,7 @@ cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
|
|||
|
||||
int lstatus = cuLaunchKernel(&ker);
|
||||
|
||||
// std::cout << "ret cudaLKernel" << std::endl;
|
||||
return cudaSuccess;
|
||||
}
|
||||
cudaError_t cudaMalloc(void **devPtr, size_t size) {
|
||||
*devPtr = malloc(size);
|
||||
|
@ -68,15 +86,13 @@ cudaError_t cudaMemcpy(void *dst, const void *src, size_t count,
|
|||
cudaError_t cudaMemcpyToSymbol_host(void *dst, const void *src, size_t count,
|
||||
size_t offset, cudaMemcpyKind kind) {
|
||||
assert(offset == 0 && "DO not support offset !=0\n");
|
||||
memcpy(dst, src + offset, count);
|
||||
memcpy(dst, (char *)src + offset, count);
|
||||
return cudaSuccess;
|
||||
}
|
||||
|
||||
// Calls init_device() and returns cudaSuccess. The `device` argument is not
// read, and the value returned by init_device() is discarded, so an
// initialization failure is not reported to the caller.
cudaError_t cudaSetDevice(int device) {
|
||||
// error checking
|
||||
// std::cout << "cudaSetDevice Called" << std::endl;
|
||||
init_device();
|
||||
// std::cout << "cudaSetDevice Ret" << std::endl;
|
||||
return cudaSuccess;
|
||||
}
|
||||
|
||||
cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src) {
|
||||
|
|
|
@ -11,7 +11,8 @@ set(LIB_NAME threadPool)
|
|||
|
||||
set(CMAKE_CXX_STANDARD 14)
|
||||
set(CMAKE_BUILD_TYPE Debug)
|
||||
include_directories(./include)
|
||||
include_directories(./include/x86)
|
||||
|
||||
file(GLOB proj_SOURCES "src/vortex/*.cpp")
|
||||
file(GLOB proj_SOURCES "src/x86/*.cpp")
|
||||
add_library(${LIB_NAME} SHARED ${proj_SOURCES})
|
||||
|
|
|
@ -29,7 +29,6 @@ int init_device() {
|
|||
|
||||
// initialize scheduler
|
||||
int ret = scheduler_init(*device);
|
||||
|
||||
if (ret != C_SUCCESS)
|
||||
return ret;
|
||||
|
||||
|
@ -182,6 +181,7 @@ int schedulerEnqueueKernel(cu_kernel **k) {
|
|||
|
||||
pthread_cond_broadcast(&(scheduler->wake_pool));
|
||||
MUTEX_UNLOCK(scheduler->work_queue_lock);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -191,6 +191,7 @@ int cuLaunchKernel(cu_kernel **k) {
|
|||
if (!scheduler) {
|
||||
init_device();
|
||||
}
|
||||
std::cout << "launch\n" << std::flush;
|
||||
// Calculate Block Size N/numBlocks
|
||||
|
||||
cu_kernel *ker = *k;
|
||||
|
@ -238,6 +239,7 @@ int cuLaunchKernel(cu_kernel **k) {
|
|||
MUTEX_UNLOCK(((cstreamData *)(ker->stream))->stream_lock);
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -346,7 +348,8 @@ RETRY:
|
|||
grid_size_x = gridDim.x;
|
||||
grid_size_y = gridDim.y;
|
||||
grid_size_z = gridDim.z;
|
||||
dynamic_shared_memory = (int *)malloc(dynamic_shared_mem_size);
|
||||
if (dynamic_shared_mem_size > 0)
|
||||
dynamic_shared_memory = (int *)malloc(dynamic_shared_mem_size);
|
||||
int tmp = block_index;
|
||||
block_index_x = tmp / (grid_size_y * grid_size_z);
|
||||
tmp = tmp % (grid_size_y * grid_size_z);
|
||||
|
|
Loading…
Reference in New Issue