Refactoring the codebase. Remove useless variables; Add comments; Remove useless header files; Remove hard code and support both x86 and ARM CPU

This commit is contained in:
Ruobing Han 2023-12-13 14:29:17 -05:00
parent 50d615da64
commit fd56811650
50 changed files with 249 additions and 531 deletions

View File

@ -51,7 +51,7 @@ jobs:
hostTranslator reverse-host-x86_64-unknown-linux-gnu.bc host.bc
llc --relocation-model=pic --filetype=obj kernel.bc
llc --relocation-model=pic --filetype=obj host.bc
g++ -o reverse -fPIC -no-pie -L${{ github.workspace }}/build/runtime -L${{ github.workspace }}/build/runtime/threadPool host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread
g++ -o reverse -fPIC -no-pie -L${{ github.workspace }}/build/runtime -L${{ github.workspace }}/build/runtime/threadPool host.o kernel.o -lc -lCPUruntime -lthreadPool -lpthread
./reverse
- name: Execute the dynamic shared memory demo
run: |
@ -63,7 +63,7 @@ jobs:
hostTranslator reverse-host-x86_64-unknown-linux-gnu.bc host.bc
llc --relocation-model=pic --filetype=obj kernel.bc
llc --relocation-model=pic --filetype=obj host.bc
g++ -o reverse -fPIC -no-pie -L${{ github.workspace }}/build/runtime -L${{ github.workspace }}/build/runtime/threadPool host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread
g++ -o reverse -fPIC -no-pie -L${{ github.workspace }}/build/runtime -L${{ github.workspace }}/build/runtime/threadPool host.o kernel.o -lc -lCPUruntime -lthreadPool -lpthread
./reverse
- name: Execute Hetero-mark benchmark
run: |
@ -79,5 +79,5 @@ jobs:
hostTranslator kernel_gpu_cuda_wrapper-host-x86_64-unknown-linux-gnu.bc host.bc
llc --relocation-model=pic --filetype=obj kernel.bc
llc --relocation-model=pic --filetype=obj host.bc
g++ -o lavaMD -fPIC -no-pie -I${{ github.workspace }}/runtime/threadPool/include -I${{ github.workspace }}/cuda-10.1/include -L${{ github.workspace }}/build/runtime -L${{ github.workspace }}/build/runtime/threadPool main.c host.o kernel.o util/timer/timer.c util/num/num.c -lpthread -lc -lx86Runtime -lthreadPool -pthread
g++ -o lavaMD -fPIC -no-pie -I${{ github.workspace }}/runtime/threadPool/include -I${{ github.workspace }}/cuda-10.1/include -L${{ github.workspace }}/build/runtime -L${{ github.workspace }}/build/runtime/threadPool main.c host.o kernel.o util/timer/timer.c util/num/num.c -lpthread -lc -lCPUruntime -lthreadPool -pthread
./lavaMD -boxes1d 10

View File

@ -1,13 +1,9 @@
cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
project(CudaOnX86)
set(CMAKE_PROJECT_DESCRIPTION "Executing CUDA on X86 architecture.")
project(CuPBoP)
set(CMAKE_PROJECT_DESCRIPTION "Executing CUDA on non-NVIDIA architecture.")
set(CMAKE_CXX_STANDARD "14")
set(MAJOR_VERSION 0)
set(MINOR_VERSION 1)
set(VERSION_STRING ${MAJOR_VERSION}.${MINOR_VERSION})
set(COX_VERSION ${VERSION_STRING})
# get LLVM PATH get PATH for head file
if(DEFINED LLVM_CONFIG_PATH)
if(IS_ABSOLUTE "${LLVM_CONFIG_PATH}")
if(EXISTS "${LLVM_CONFIG_PATH}")
@ -32,7 +28,7 @@ if(DEFINED LLVM_CONFIG_PATH)
else()
message(FATAL_ERROR "llvm-config is required")
endif()
# get CUDA PATH
if(DEFINED CUDA_PATH)
message(STATUS "Using CUDA: ${CUDA_PATH}")
else()
@ -45,7 +41,7 @@ if(DEBUG)
endif()
set(CMAKE_CXX_FLAGS
"-I${CUDA_PATH}/include ${LLVM_CXX_FLAG} ${CMAKE_CXX_FLAGS}")
"-I${CUDA_PATH}/include ${LLVM_CXX_FLAG} ${CMAKE_CXX_FLAGS} -Wunused")
set(GCC_COVERAGE_LINK_FLAGS
"-L${LLVM_LIB_PATH} ${LLVM_LINK_FLAG} -lz -lrt -ldl -ltinfo -lpthread -lm")
@ -54,5 +50,8 @@ add_subdirectory(compilation)
add_subdirectory(runtime)
enable_testing()
option(HETERO_MARK_DATA "The path to download hetero-mark dataset." /tmp/data)
set(HETERO_MARK_DATA
"/tmp/data"
CACHE PATH "The path to download hetero-mark dataset.")
add_subdirectory(test)

View File

@ -75,7 +75,7 @@ g++ -o vecadd -fPIC -no-pie \
-L$CuPBoP_PATH/build/runtime \
-L$CuPBoP_PATH/build/runtime/threadPool \
host.o kernel.o \
-I../.. -lc -lx86Runtime -lthreadPool -lpthread
-I../.. -lc -lCPUruntime -lthreadPool -lpthread
# Execute
./vecadd
```

View File

@ -1,20 +1,16 @@
cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
project(
NVVM2X86
DESCRIPTION "Translate NVVM IR to LLVM IR for X86 backend"
LANGUAGES CXX)
set(CMAKE_VERBOSE_MAKEFILE ON)
# compile kernel translator
include_directories(./KernelTranslation/include/x86)
# build kernel translator
include_directories(./KernelTranslation/include/cpu)
add_subdirectory(KernelTranslation)
add_executable(kernelTranslator KernelTranslation.cpp)
target_link_libraries(kernelTranslator spmd2mpmd ${GCC_COVERAGE_LINK_FLAGS})
# compile host translator
include_directories(./HostTranslation/include/x86)
# build host translator
include_directories(./HostTranslation/include/cpu)
add_subdirectory(HostTranslation)
add_executable(hostTranslator HostTranslation.cpp)

View File

@ -4,12 +4,7 @@
#include "ReplaceCudaBuiltin.h"
#include "ReplaceKernelArgs.h"
#include "tool.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include <assert.h>
#include <fstream>
#include <iostream>
#include <stdlib.h>
using namespace llvm;

View File

@ -11,12 +11,11 @@ set(CMAKE_VERBOSE_MAKEFILE ON)
set(LIB_NAME cudaRuntime2cpuRuntime)
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_BUILD_TYPE Debug)
include_directories(./include/x86)
include_directories(./include/cpu)
include_directories(../../common)
file(GLOB proj_HEADERS "include/x86/*.h")
file(GLOB proj_SOURCES "src/x86/*.cpp")
file(GLOB proj_HEADERS "include/cpu/*.h")
file(GLOB proj_SOURCES "src/cpu/*.cpp")
# Add core library.
add_library(${LIB_NAME} SHARED ${proj_HEADERS} ${proj_SOURCES})

View File

@ -3,16 +3,8 @@
*/
#include "RemoveCudaBuiltin.h"
#include "debug.hpp"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/ToolOutputFile.h"
#include "llvm/Transforms/Utils/CtorUtils.h"
#include <iostream>
#include <map>
#include <set>
using namespace llvm;

View File

@ -1,16 +1,14 @@
#include "RemoveMetadata.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/ToolOutputFile.h"
#include <iostream>
#include "llvm/Support/Host.h"
using namespace llvm;
void RemoveMetadata(llvm::Module *M) {
// change the target triple to the host triple
M->setTargetTriple(llvm::sys::getProcessTriple());
// use the default DataLayout
M->setDataLayout("");
SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
Function *F = &(*i);
@ -22,5 +20,7 @@ void RemoveMetadata(llvm::Module *M) {
F->removeFnAttr("min-legal-vector-width");
F->removeFnAttr("no-trapping-math");
F->removeFnAttr(llvm::Attribute::OptimizeNone);
F->removeFnAttr("target-cpu");
F->removeFnAttr("target-features");
}
}

View File

@ -1,12 +1,7 @@
#include "ReplaceConstantMemory.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include <assert.h>
#include <fstream>
#include <iostream>
#include <map>
#include <set>

View File

@ -1,13 +1,6 @@
#include "ReplaceCudaBuiltin.h"
#include "debug.hpp"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/ToolOutputFile.h"
#include <iostream>
#include <map>
#include <regex>
#include <set>
@ -63,18 +56,6 @@ void ReplaceKernelLaunch(llvm::Module *M) {
std::map<std::string, Function *> kernels;
std::set<llvm::Function *> need_remove;
LLVMContext *C = &M->getContext();
llvm::Type *Int32T = Type::getInt32Ty(*C);
llvm::Type *Int8T = Type::getInt8Ty(*C);
llvm::FunctionType *LauncherFuncT =
FunctionType::get(Type::getVoidTy(*C), NULL);
llvm::FunctionType *LaunchFun2 =
FunctionType::get(PointerType::get(PointerType::get(Int32T, 0), 0), NULL);
bool done = false;
std::set<std::string> cuda_register_kernel_names;
@ -160,8 +141,6 @@ void ReplaceKernelLaunch(llvm::Module *M) {
std::vector<size_t> arg_sizes;
functionOperand =
dyn_cast<Function>(callOperand->stripPointerCasts());
FunctionType *ft = calledFunction->getFunctionType();
DEBUG_INFO("Parent (Caller) Function Name: %s, "
"cudaLaunchKernel Function: %s, args : %d\n",
func_name.c_str(),

View File

@ -1,12 +1,5 @@
#include "ReplaceKernelArgs.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/ToolOutputFile.h"
#include <iostream>
#include <map>
#include <set>
@ -23,12 +16,9 @@ using namespace llvm;
// to use use-analysis to find the arguments in the future
void ReplaceKernelArg(llvm::Module *M) {
LLVMContext &context = M->getContext();
auto VoidTy = llvm::Type::getVoidTy(context);
auto I8 = llvm::Type::getInt8PtrTy(context);
std::map<std::string, Function *> kernels;
std::set<llvm::Function *> need_replace;
LLVMContext *C = &M->getContext();
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
Function *F = &(*i);

View File

@ -1,4 +1,4 @@
#include "generate_x86_format.h"
#include "generate_cpu_format.h"
#include "handle_sync.h"
#include "init.h"
#include "insert_sync.h"
@ -6,17 +6,14 @@
#include "performance.h"
#include "tool.h"
#include "warp_func.h"
#include "llvm/IR/Module.h"
#include <assert.h>
#include <fstream>
#include <iostream>
#include <llvm/Support/raw_ostream.h>
#include <map>
#include <set>
#include <stdlib.h>
using namespace llvm;
// to support constant memory variables, we need to convert information
// from kernelTranslator to HostTranslator, since HostTranslator knows nothing
// about the kernel functions, we need to write the information to a file
// by KernelTranslator and read it in HostTranslator
std::string PATH = "kernel_meta.log";
int main(int argc, char **argv) {
@ -26,8 +23,9 @@ int main(int argc, char **argv) {
std::ofstream fout;
fout.open(PATH);
// inline, and create auxiliary global variables
// inline __device__ functions, and create auxiliary global variables
init_block(program, fout);
// insert sync before each vote, and replace the
// original vote function to warp vote
handle_warp_vote(program);
@ -40,17 +38,18 @@ int main(int argc, char **argv) {
// split block by sync
split_block_by_sync(program);
// add loop for intra&intera thread
// add loop for intra&intera thread, it refers 'hierarchical collapsing' in
// COX paper.
insert_warp_loop(program);
// (TODO): replace this patch
replace_built_in_function(program);
// TODO: replace with a more general function
// Not only for x86 backend
generate_x86_format(program);
// the input kernel programs have NVIDIA metadata, they need to be replaced to
// CPU metadata
generate_cpu_format(program);
// performance optimization
// execute O3 pipeline on the transformed program
performance_optimization(program);
VerifyModule(program);

View File

@ -11,12 +11,11 @@ set(CMAKE_VERBOSE_MAKEFILE ON)
set(LIB_NAME spmd2mpmd)
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_BUILD_TYPE Debug)
include_directories(./include/x86)
include_directories(./include/cpu)
include_directories(../../common)
file(GLOB proj_HEADERS "include/x86/*.h")
file(GLOB proj_SOURCES "src/x86/*.cpp")
file(GLOB proj_HEADERS "include/cpu/*.h")
file(GLOB proj_SOURCES "src/cpu/*.cpp")
# Add core library.
add_library(${LIB_NAME} STATIC ${proj_HEADERS} ${proj_SOURCES})

View File

@ -0,0 +1,10 @@
#ifndef __NVVM2CPU_GENERATE_CPU_FORMAT__
#define __NVVM2CPU_GENERATE_CPU_FORMAT__
#include "llvm/IR/Module.h"
void generate_cpu_format(llvm::Module *M);
void set_meta_data(llvm::Module *M);
#endif

View File

@ -1,10 +0,0 @@
#ifndef __NVVM2x86_GENERATE_X86_FORMAT__
#define __NVVM2x86_GENERATE_X86_FORMAT__
#include "llvm/IR/Module.h"
void generate_x86_format(llvm::Module *M);
void set_meta_data(llvm::Module *M);
#endif

View File

@ -0,0 +1,125 @@
#include "generate_cpu_format.h"
#include "debug.hpp"
#include "tool.h"
#include "llvm/Support/Host.h"
using namespace llvm;
// set TargetTriple and DataLayout same as the host CPU
void set_meta_data(llvm::Module *M) {
M->setTargetTriple(llvm::sys::getProcessTriple());
// use the default DataLayout
M->setDataLayout("");
}
// as pthread only accept a single void* for input
// we have to decode this input inside the kernel
void decode_input(llvm::Module *M) {
std::set<llvm::Function *> need_remove;
llvm::Type *Int32T = Type::getInt32Ty(M->getContext());
llvm::Type *Int8T = Type::getInt8Ty(M->getContext());
llvm::FunctionType *LauncherFuncT = FunctionType::get(
Type::getVoidTy(M->getContext()), {PointerType::get(Int8T, 0)}, false);
// generate Wrapper Function type
// now we only support a single int32*
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
Function *F = &(*i);
if (!isKernelFunction(M, F))
continue;
auto func_name = F->getName().str();
// filter out _Z24 and other mangled prefix
for (int pos = 2; pos < func_name.length(); pos++) {
if (func_name[pos] >= '0' && func_name[pos] <= '9')
continue;
func_name = func_name.substr(pos);
break;
}
llvm::IRBuilder<> Builder(M->getContext());
FunctionCallee fc =
M->getOrInsertFunction(func_name + "_wrapper", LauncherFuncT);
Function *WorkGroup = dyn_cast<Function>(fc.getCallee());
BasicBlock *Block = BasicBlock::Create(M->getContext(), "", WorkGroup);
Builder.SetInsertPoint(Block);
// WorkGroup has only a single input
Function::arg_iterator ai = WorkGroup->arg_begin();
SmallVector<Value *, 8> Arguments;
Value *input_arg = &*ai;
// convert to int**
input_arg = Builder.CreateBitOrPointerCast(
input_arg, PointerType::get(PointerType::get(Int32T, 0), 0));
size_t idx = 0;
// replace original arguments with the unpacked values
// for example, for a function f(int* a, char* b),
// we will generate a function f_wrapper(int** input)
// and replace the original arguments with the unpacked values
// e.g., a = (int*)input[0], b = (char*)input[1]
for (Function::const_arg_iterator ii = F->arg_begin(), ee = F->arg_end();
ii != ee; ++ii) {
Type *ArgType = ii->getType();
// calculate addr
Value *GEP = createGEP(Builder, input_arg, ConstantInt::get(Int32T, idx));
// load corresponding int*
GEP = createLoad(Builder, GEP);
// bitcast
GEP = Builder.CreateBitOrPointerCast(GEP, PointerType::get(ArgType, 0));
Value *Arg = createLoad(Builder, GEP);
Arguments.push_back(Arg);
++idx;
}
Builder.CreateCall(F, ArrayRef<llvm::Value *>(Arguments));
Builder.CreateRetVoid();
}
for (auto f : need_remove) {
f->dropAllReferences();
f->eraseFromParent();
}
}
// after flat/hierarchical collapsing, the barrier instructions are useless
void remove_barrier(llvm::Module *M) {
std::vector<Instruction *> need_remove;
for (auto F = M->begin(); F != M->end(); ++F)
for (auto BB = F->begin(); BB != F->end(); ++BB) {
for (auto Inst = BB->begin(); Inst != BB->end(); Inst++) {
if (auto Call = dyn_cast<CallInst>(Inst)) {
if (Call->isInlineAsm())
continue;
auto func_name = Call->getCalledFunction()->getName().str();
if (func_name == "llvm.nvvm.bar.warp.sync" ||
func_name == "llvm.nvvm.barrier0" ||
func_name == "llvm.nvvm.barrier.sync") {
need_remove.push_back(Call);
}
}
}
}
for (auto inst : need_remove) {
inst->eraseFromParent();
}
}
void remove_useless_var(llvm::Module *M) {
M->getGlobalVariable("intra_warp_index")->eraseFromParent();
M->getGlobalVariable("inter_warp_index")->eraseFromParent();
}
void generate_cpu_format(llvm::Module *M) {
DEBUG_INFO("generate cpu format\n");
// change metadata
set_meta_data(M);
// decode argument
decode_input(M);
// remove barrier
remove_barrier(M);
// remove useless func/variable
remove_useless_var(M);
}

View File

@ -1,13 +1,7 @@
#include "handle_sync.h"
#include "debug.hpp"
#include "tool.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include <set>
#include <string>

View File

@ -2,26 +2,11 @@
#include "debug.hpp"
#include "memory_hierarchy.h"
#include "tool.h"
#include <fstream>
#include <iostream>
#include <set>
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/InitializePasses.h"
#include "llvm/PassInfo.h"
#include "llvm/PassRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <set>
using namespace llvm;
@ -31,11 +16,9 @@ bool inline_warp_level_func(llvm::Module *M) {
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
Function *F = &(*i);
auto func_name = F->getName().str();
if (!isKernelFunction(M, F))
continue;
Function::iterator I = F->begin();
for (Function::iterator E = F->end(); I != E; ++I) {
for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) {
for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
if (CallInst *c = dyn_cast<CallInst>(BI++)) {
if (c->getCalledFunction()) {
@ -60,8 +43,7 @@ bool inline_warp_level_func(llvm::Module *M) {
}
bool find_sreg_inst(llvm::Function *F) {
Function::iterator I = F->begin();
for (Function::iterator E = F->end(); I != E; ++I) {
for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) {
for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
if (CallInst *c = dyn_cast<CallInst>(BI++)) {
if (c->getCalledFunction()) {
@ -229,14 +211,12 @@ void llvm_preprocess(llvm::Module *M) {
Passes.run(*M);
}
// transform constant expression into sequence of instructions
bool lower_constant_expr(llvm::Module *M) {
bool modified = false;
LLVMContext &context = M->getContext();
auto I32 = llvm::Type::getInt32Ty(context);
std::vector<CallInst *> need_remove;
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
Function *F = &(*i);
auto func_name = F->getName().str();
if (!isKernelFunction(M, F))
continue;
@ -301,8 +281,8 @@ bool lower_constant_expr(llvm::Module *M) {
return modified;
}
// replace _ZL3expd, just delete its body
void replace_cuda_math_built_in(llvm::Module *M) {
// replace _ZL3expd, just delete its body
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
Function *F = &(*i);
auto func_name = F->getName().str();

View File

@ -4,29 +4,9 @@
#include "handle_sync.h"
#include "tool.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ValueSymbolTable.h"
#include "llvm/InitializePasses.h"
#include "llvm/PassInfo.h"
#include "llvm/PassRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <iostream>
#include <queue>
using namespace llvm;
@ -44,7 +24,7 @@ public:
std::vector<llvm::Instruction *> insert_intra_warp_sync_before;
std::vector<llvm::Instruction *> insert_inter_warp_sync_before;
// insert sync in the entry
// insert sync after the entry and before the first non-AllocaInst
BasicBlock *entry = &(*F.begin());
for (auto i = entry->begin(); i != entry->end(); i++) {
if (!isa<AllocaInst>(i)) {
@ -54,10 +34,8 @@ public:
}
for (Function::iterator I = F.begin(); I != F.end(); ++I) {
BasicBlock::iterator BI = I->begin();
// insert barrier before return
for (; BI != I->end(); BI++) {
for (BasicBlock::iterator BI = I->begin(); BI != I->end(); BI++) {
llvm::ReturnInst *Ret = llvm::dyn_cast<llvm::ReturnInst>(&(*BI));
if (Ret) {
insert_inter_warp_sync_before.push_back(&(*BI));
@ -125,7 +103,7 @@ public:
auto PDT = &getAnalysis<PostDominatorTreeWrapperPass>();
// first find all conditional barriers
// find all conditional barriers
std::vector<BasicBlock *> conditionalBarriers;
for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i) {
BasicBlock *b = &*i;
@ -148,12 +126,9 @@ public:
conditionalBarriers.pop_back();
// insert barrier in the start of if-condition
BasicBlock *pos = b;
BasicBlock *pred = firstNonBackedgePredecessor(b);
while (PDT->getPostDomTree().dominates(b, pred)) {
pos = pred;
// If our BB post dominates the given block, we know it is not the
// branching block that makes the barrier conditional.
pred = firstNonBackedgePredecessor(pred);
@ -468,7 +443,6 @@ public:
auto header_block = L->getHeader();
assert(header_block->getTerminator()->getNumSuccessors() == 2 &&
"has more than 2 successors of the for-head\n");
BasicBlock *for_body = NULL;
for (int i = 0; i < header_block->getTerminator()->getNumSuccessors();
i++) {
auto bb = header_block->getTerminator()->getSuccessor(i);

View File

@ -4,43 +4,20 @@
#include "handle_sync.h"
#include "tool.h"
#include <assert.h>
#include <iostream>
#include <set>
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ValueSymbolTable.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/PassInfo.h"
#include "llvm/PassRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <map>
#include <set>
#include <sstream>
#include <tuple>
#include <vector>
using namespace llvm;
@ -115,10 +92,7 @@ llvm::Instruction *GetContextArray(llvm::Instruction *instruction,
BasicBlock &bb = instruction->getParent()->getParent()->getEntryBlock();
IRBuilder<> builder(&*(bb.getFirstInsertionPt()));
Function *FF = instruction->getParent()->getParent();
Module *M = instruction->getParent()->getParent()->getParent();
LLVMContext &C = M->getContext();
const llvm::DataLayout &Layout = M->getDataLayout();
llvm::Type *elementType;
if (isa<AllocaInst>(instruction)) {
@ -129,8 +103,6 @@ llvm::Instruction *GetContextArray(llvm::Instruction *instruction,
}
Type *AllocType = elementType;
AllocaInst *InstCast = dyn_cast<AllocaInst>(instruction);
llvm::Value *ItemSize = nullptr;
llvm::AllocaInst *Alloca = nullptr;
auto block_size_addr = M->getGlobalVariable("block_size");
@ -697,9 +669,6 @@ public:
is_single_conditional_branch_block = 1;
} else {
// generate by replicate local variable
printf(
"[WARNING] match single conditional branch with HARD CODE\n");
bool branch_to_intra_init = false;
for (unsigned suc = 0; suc < br->getNumSuccessors(); ++suc) {
llvm::BasicBlock *entryCandidate = br->getSuccessor(suc);
auto block_name = entryCandidate->getName().str();
@ -755,7 +724,7 @@ public:
entry = entryCandidate;
break;
}
// delete useless PR, those PRs only have branch
// delete useless PR, those PRs only have branch instructions
if (entry == exit) {
if (entry->size() == 1 && isa<llvm::BranchInst>(entry->begin())) {
return;

View File

@ -1,29 +1,10 @@
#include "memory_hierarchy.h"
#include "debug.hpp"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <assert.h>
#include <fstream>
#include <iostream>
#include <map>
#include <set>
#include <sstream>
#include <tuple>
#include <vector>
void mem_share2global(llvm::Module *M) {
LLVMContext *C = &M->getContext();
llvm::Type *Int32T = Type::getInt32Ty(*C);
llvm::Type *Int64T = Type::getInt64Ty(*C);
llvm::Type *Int8T = Type::getInt8Ty(*C);
std::map<GlobalVariable *, GlobalVariable *> corresponding_global_memory;
std::set<llvm::Instruction *> need_remove;
std::set<GlobalVariable *> need_remove_share_memory;
@ -45,7 +26,6 @@ void mem_share2global(llvm::Module *M) {
// generate global type pointer
PointerType *PointerTy =
PointerType::get(array_type->getElementType(), 0);
llvm::Constant *x1 = ConstantPointerNull::get(PointerTy);
llvm::GlobalVariable *global_ptr = new llvm::GlobalVariable(
*M, PointerTy, false, llvm::GlobalValue::ExternalLinkage,
NULL, "dynamic_shared_memory", NULL,
@ -75,7 +55,7 @@ void mem_share2global(llvm::Module *M) {
std::pair<GlobalVariable *, GlobalVariable *>(share_memory,
global_memory));
} else if (element_type->isFloatTy()) {
auto FP_type = llvm::Type::getFloatTy(*C);
auto FP_type = llvm::Type::getFloatTy(M->getContext());
auto zero = llvm::ConstantFP::get(FP_type, 0);
llvm::GlobalVariable *global_memory = new llvm::GlobalVariable(
*M, FP_type, false, llvm::GlobalValue::ExternalLinkage, zero,
@ -128,11 +108,6 @@ void mem_share2global(llvm::Module *M) {
}
void mem_constant2global(llvm::Module *M, std::ofstream &fout) {
LLVMContext *C = &M->getContext();
llvm::Type *Int32T = Type::getInt32Ty(*C);
llvm::Type *Int64T = Type::getInt64Ty(*C);
llvm::Type *Int8T = Type::getInt8Ty(*C);
std::map<GlobalVariable *, GlobalVariable *> corresponding_global_memory;
std::set<llvm::Instruction *> need_remove;
std::set<GlobalVariable *> need_remove_constant_memory;
@ -142,7 +117,7 @@ void mem_constant2global(llvm::Module *M, std::ofstream &fout) {
if (GlobalVariable *constant_memory = dyn_cast<GlobalVariable>(I)) {
if (auto PT = dyn_cast<PointerType>(I->getType())) {
unsigned AS = PT->getAddressSpace();
if (AS == 4) { // find a share memory
if (AS == 4) { // find a constant memory
need_remove_constant_memory.insert(constant_memory);
// generate the corresponding global memory variable
auto new_name = "wrapper_global_" + constant_memory->getName().str();
@ -150,7 +125,7 @@ void mem_constant2global(llvm::Module *M, std::ofstream &fout) {
if (auto array_type = dyn_cast<ArrayType>(element_type)) {
if (constant_memory->hasExternalLinkage() &&
array_type->getArrayNumElements() == 0) {
// external shared memory of []
// external constant memory of []
// generate global type pointer
PointerType *PointerTy =
PointerType::get(array_type->getElementType(), 0);

View File

@ -1,43 +1,13 @@
#include "performance.h"
#include "debug.hpp"
#include "tool.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ValueSymbolTable.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/PassInfo.h"
#include "llvm/PassRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Host.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <map>
#include <set>
#include <sstream>
#include <tuple>
#include <vector>
using namespace llvm;
@ -53,7 +23,7 @@ void performance_optimization(llvm::Module *M) {
llvm::legacy::PassManager Passes;
// add target machine info
llvm::Triple triple("x86_64-unknown-linux-gnu");
llvm::Triple triple(llvm::sys::getProcessTriple());
std::string Error;
const Target *TheTarget = TargetRegistry::lookupTarget("", triple, Error);
@ -62,7 +32,7 @@ void performance_optimization(llvm::Module *M) {
Options.FloatABIType = FloatABI::Hard;
TargetMachine *TM = TheTarget->createTargetMachine(
triple.getTriple(), llvm::sys::getHostCPUName().str(), StringRef("+m,+f"),
triple.getTriple(), llvm::sys::getHostCPUName().str(), StringRef(""),
Options, Reloc::PIC_, CodeModel::Small, CodeGenOpt::Aggressive);
assert(TM && "No Machine Information\n");
@ -80,9 +50,6 @@ void performance_optimization(llvm::Module *M) {
Builder.LoopVectorize = true;
Builder.SLPVectorize = true;
Builder.VerifyInput = true;
Builder.VerifyOutput = true;
Builder.populateModulePassManager(Passes);
Passes.run(*M);
}

View File

@ -1,29 +1,13 @@
#include "tool.h"
#include "debug.hpp"
#include "llvm/Bitcode/BitcodeWriter.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorOr.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/ToolOutputFile.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <iostream>
#include <set>
using namespace llvm;
@ -133,7 +117,7 @@ llvm::Instruction *BreakPHIToAllocas(PHINode *phi) {
Value *val = phi->getIncomingValue(incoming);
BasicBlock *incomingBB = phi->getIncomingBlock(incoming);
builder.SetInsertPoint(incomingBB->getTerminator());
llvm::Instruction *store = builder.CreateStore(val, alloca);
builder.CreateStore(val, alloca);
}
builder.SetInsertPoint(phi);
@ -164,7 +148,6 @@ void phi2alloc(llvm::Module *M) {
}
}
bool changed = false;
for (InstructionVec::iterator i = PHIs.begin(); i != PHIs.end(); ++i) {
Instruction *instr = *i;
BreakPHIToAllocas(dyn_cast<PHINode>(instr));
@ -279,9 +262,7 @@ void replace_built_in_function(llvm::Module *M) {
for (auto BB = F->begin(); BB != F->end(); ++BB) {
for (auto BI = BB->begin(); BI != BB->end(); BI++) {
if (auto Load = dyn_cast<LoadInst>(BI)) {
auto load_from = Load->getOperand(0);
} else if (auto Call = dyn_cast<CallInst>(BI)) {
if (auto Call = dyn_cast<CallInst>(BI)) {
if (Call->getCalledFunction()) {
auto func_name = Call->getCalledFunction()->getName().str();
if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.x" ||
@ -425,7 +406,6 @@ void replace_built_in_function(llvm::Module *M) {
if (auto Call = dyn_cast<CallInst>(BI)) {
if (Call->getCalledFunction()) {
auto func_name = Call->getCalledFunction()->getName().str();
auto callFn = Call->getCalledFunction();
if (func_name == "vprintf") {
/*
* replace CUDA's printf to C's printf
@ -458,7 +438,7 @@ void replace_built_in_function(llvm::Module *M) {
dyn_cast<PointerType>(BC->getOperand(0)->getType());
auto SrcTy = SrcPointTy->getElementType();
// reverse the bitcast
auto reverse_BC = new BitCastInst(BC, SrcPointTy, "", Call);
new BitCastInst(BC, SrcPointTy, "", Call);
assert(SrcTy->isStructTy() == 1);
auto StructTy = dyn_cast<StructType>(SrcTy);
for (int i = 0; i < StructTy->getNumElements(); i++) {
@ -528,7 +508,6 @@ void replace_built_in_function(llvm::Module *M) {
void replace_asm_call(llvm::Module *M) {
LLVMContext &context = M->getContext();
auto I32 = llvm::Type::getInt32Ty(context);
std::vector<CallInst *> need_remove;
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
Function *F = &(*i);

View File

@ -2,15 +2,6 @@
#include "warp_func.h"
#include "debug.hpp"
#include "tool.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <iostream>
#include <set>
using namespace llvm;
@ -107,7 +98,7 @@ void handle_warp_vote(llvm::Module *M) {
res = BinaryOperator::CreateNot(res, "", sync_inst);
}
auto sotre_mask = new llvm::StoreInst(res, GEP, "", sync_inst);
new llvm::StoreInst(res, GEP, "", sync_inst);
// create barrier
CreateIntraWarpBarrier(sync_inst);
/*

View File

@ -1,186 +0,0 @@
#include "generate_x86_format.h"
#include "debug.hpp"
#include "tool.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/ToolOutputFile.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <iostream>
#include <map>
using namespace llvm;
void set_meta_data(llvm::Module *M) {
M->setTargetTriple("x86_64-unknown-linux-gnu");
M->setDataLayout(
"e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128");
}
// as pthread only accept a single void* for input
// we have to decode this input inside the kernel
void decode_input(llvm::Module *M) {
std::set<llvm::Function *> need_remove;
LLVMContext *C = &M->getContext();
llvm::Type *Int32T = Type::getInt32Ty(*C);
llvm::Type *Int8T = Type::getInt8Ty(*C);
llvm::FunctionType *LauncherFuncT = FunctionType::get(
Type::getVoidTy(*C), {PointerType::get(Int8T, 0)}, false);
std::set<GlobalVariable *> dynmaic_memory;
std::map<GlobalVariable *, Value *> corres_dynamic_memory_load_address;
// generate Wrapper Function type
// now we only support a single int32*
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
Function *F = &(*i);
if (!isKernelFunction(M, F))
continue;
auto func_name = F->getName().str();
// remove mangle prefix
// remove _Z24
for (int pos = 2; pos < func_name.length(); pos++) {
if (func_name[pos] >= '0' && func_name[pos] <= '9')
continue;
func_name = func_name.substr(pos);
break;
}
llvm::IRBuilder<> Builder(M->getContext());
FunctionCallee fc =
M->getOrInsertFunction(func_name + "_wrapper", LauncherFuncT);
Function *WorkGroup = dyn_cast<Function>(fc.getCallee());
BasicBlock *Block = BasicBlock::Create(M->getContext(), "", WorkGroup);
Builder.SetInsertPoint(Block);
// WorkGroup has only a single input
Function::arg_iterator ai = WorkGroup->arg_begin();
SmallVector<Value *, 8> Arguments;
Value *input_arg = &*ai;
// convert to int**
input_arg = Builder.CreateBitOrPointerCast(
input_arg, PointerType::get(PointerType::get(Int32T, 0), 0));
// dynamic memory load in the wrapper function
GlobalVariable *share_memory = M->getGlobalVariable("wrapper_global_data");
if (share_memory != NULL) {
dynmaic_memory.insert(share_memory);
llvm::GlobalVariable *global_mem = new llvm::GlobalVariable(
*M, Int32T, false, llvm::GlobalValue::ExternalLinkage, NULL,
"thread_memory_size", NULL, llvm::GlobalValue::GeneralDynamicTLSModel,
0, false);
Value *loadedValue = createLoad(Builder, global_mem);
llvm::FunctionType *LaunchFun2 = FunctionType::get(
PointerType::get(PointerType::get(Int32T, 0), 0), NULL);
FunctionCallee fc2 =
M->getOrInsertFunction("_wrapper_global_data", LaunchFun2);
Function *WorkGroup2 = dyn_cast<Function>(fc2.getCallee());
WorkGroup2->setLinkage(GlobalValue::WeakODRLinkage);
WorkGroup2->setVisibility(GlobalValue::HiddenVisibility);
Comdat *co = M->getOrInsertComdat("_wrapper_global_data");
co->setSelectionKind(Comdat::SelectionKind::Any);
WorkGroup2->setComdat(co);
BasicBlock *Block2 = BasicBlock::Create(M->getContext(), "", WorkGroup2);
llvm::IRBuilder<> Builder2(M->getContext());
Builder2.SetInsertPoint(Block2);
Builder2.CreateRet(share_memory);
auto PT = dyn_cast<PointerType>(share_memory->getType());
auto element_type = PT->getElementType();
AllocaInst *new_arr = Builder.CreateAlloca(Int8T, loadedValue, "new_arr");
Value *new_ar = new_arr;
Value *gptr = Builder.CreateBitOrPointerCast(
share_memory, PointerType::get(PointerType::get(Int8T, 0), 0));
Builder.CreateStore(new_ar, gptr);
}
size_t idx = 0;
for (Function::const_arg_iterator ii = F->arg_begin(), ee = F->arg_end();
ii != ee; ++ii) {
Type *ArgType = ii->getType();
// calculate addr
Value *GEP = createGEP(Builder, input_arg, ConstantInt::get(Int32T, idx));
// load corresponding int*
GEP = createLoad(Builder, GEP);
// bitcast
GEP = Builder.CreateBitOrPointerCast(GEP, PointerType::get(ArgType, 0));
Value *Arg = createLoad(Builder, GEP);
Arguments.push_back(Arg);
++idx;
}
CallInst *c = Builder.CreateCall(F, ArrayRef<llvm::Value *>(Arguments));
Builder.CreateRetVoid();
}
for (auto f : need_remove) {
f->dropAllReferences();
f->eraseFromParent();
}
}
void remove_barrier(llvm::Module *M) {
std::vector<Instruction *> need_remove;
for (auto F = M->begin(); F != M->end(); ++F)
for (auto BB = F->begin(); BB != F->end(); ++BB) {
for (auto BI = BB->begin(); BI != BB->end(); BI++) {
if (auto Call = dyn_cast<CallInst>(BI)) {
if (Call->isInlineAsm())
continue;
auto func_name = Call->getCalledFunction()->getName().str();
if (func_name == "llvm.nvvm.bar.warp.sync" ||
func_name == "llvm.nvvm.barrier0" ||
func_name == "llvm.nvvm.barrier.sync") {
need_remove.push_back(Call);
}
}
}
}
for (auto inst : need_remove) {
inst->eraseFromParent();
}
}
void remove_useless_var(llvm::Module *M) {
M->getGlobalVariable("intra_warp_index")->eraseFromParent();
M->getGlobalVariable("inter_warp_index")->eraseFromParent();
}
void generate_x86_format(llvm::Module *M) {
DEBUG_INFO("generate x86 format\n");
// change metadata
set_meta_data(M);
// decode argument
decode_input(M);
// remove barrier
remove_barrier(M);
// remove useless func/variable
remove_useless_var(M);
}

View File

@ -1,9 +1,9 @@
cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
project(
X86runtime
DESCRIPTION "Implementation CUDA runtime API with x86"
CPUruntime
DESCRIPTION "Implementation CUDA runtime API with CPUs"
LANGUAGES CXX)
set(LIB_NAME x86Runtime)
set(LIB_NAME CPUruntime)
set(CMAKE_VERBOSE_MAKEFILE ON)
# compile threadPool implementation
@ -12,9 +12,9 @@ add_subdirectory(threadPool)
# compile x86 runtime library
include_directories(../common)
include_directories(./include/)
include_directories(./include/x86)
include_directories(./include/cpu)
include_directories(./threadPool/include/)
include_directories(./threadPool/include/x86)
include_directories(./threadPool/include/cpu)
include_directories(../external/moodycamel/)
file(GLOB proj_SOURCES "src/x86/*.cpp")
file(GLOB proj_SOURCES "src/cpu/*.cpp")
add_library(${LIB_NAME} SHARED ${proj_SOURCES})

View File

@ -1,4 +1,4 @@
#ifndef __RUNTIME_IMPL__
#ifndef __KERNEL_IMPL__
#define __KERNEL_IMPL__
#include "structures.h"
#include <stdint.h>

View File

@ -10,27 +10,31 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
cudaError_t cudaGetDevice(int *devPtr) {
*devPtr = 0;
return cudaSuccess;
}
const char *cudaGetErrorName(cudaError_t error) { return "SUCCESS\n"; }
cudaError_t cudaDeviceReset(void) {
scheduler_uninit();
return cudaSuccess;
}
cudaError_t cudaDeviceReset(void) { return cudaSuccess; }
cudaError_t cudaDeviceSynchronize(void) {
cuSynchronizeBarrier();
return cudaSuccess;
}
cudaError_t cudaThreadSynchronize(void) {
cuSynchronizeBarrier();
return cudaSuccess;
}
cudaError_t cudaFree(void *devPtr) {
free(devPtr);
return cudaSuccess;
}
cudaError_t cudaFreeHost(void *devPtr) {
free(devPtr);
return cudaSuccess;
@ -47,20 +51,22 @@ cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
cu_kernel *ker =
create_kernel(func, gridDim, blockDim, args, sharedMem, stream);
int lstatus = cuLaunchKernel(&ker);
cuLaunchKernel(&ker);
return cudaSuccess;
}
cudaError_t cudaMalloc(void **devPtr, size_t size) {
*devPtr = malloc(size);
if (devPtr == NULL)
return cudaErrorMemoryAllocation;
return cudaSuccess;
}
cudaError_t cudaMemset(void *devPtr, int value, size_t count) {
memset(devPtr, value, count);
return cudaSuccess;
}
cudaError_t cudaMemcpy(void *dst, const void *src, size_t count,
cudaMemcpyKind kind) {
if (kind == cudaMemcpyHostToHost) {
@ -105,7 +111,6 @@ cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src) {
return cudaSuccess;
}
static int stream_counter = 1;
/*
From our evaluation, CPU backend can gain little benefit
from multi stream. Thus, we only use single stream
@ -159,6 +164,8 @@ static cudaError_t lastError = cudaSuccess;
const char *cudaGetErrorString(cudaError_t error) {
if (error == cudaSuccess) {
return "Cuda Get Error Success";
} else {
return "Cuda Get Error Failed";
}
}

View File

@ -10,11 +10,10 @@ set(CMAKE_VERBOSE_MAKEFILE ON)
set(LIB_NAME threadPool)
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_BUILD_TYPE Debug)
include_directories(../../common)
include_directories(./include)
include_directories(./include/x86)
include_directories(./include/cpu)
include_directories(../../external/moodycamel)
file(GLOB proj_SOURCES "src/x86/*.cpp")
file(GLOB proj_SOURCES "src/cpu/*.cpp")
add_library(${LIB_NAME} SHARED ${proj_SOURCES})

View File

@ -1,3 +1,19 @@
/*
This file contains the implementation of the CPU thread pool. For a kernel
launch, the host thread will enqueue the kernel to the kernelQueue, and the
threads in the thread pool will try to fetch work from the queue. After a
thread fetches a kernel from the queue, it will execute the kernel. After the
kernel execution, the thread will try to fetch another kernel from the queue.
If the queue is empty, the thread will wait for the next kernel launch.
By default, we try to use all CPU cores for execution. Thus, for a kernel
launch, the host thread pushes P kernel variables to the queue, where P is the
number of CPU cores.
For some lightweight kernels, useing fewer CPU cores can speed up the overall
execution time, due to fewer CPU cores lead to lower synchronization overhead.
*/
#include "api.h"
#include "blockingconcurrentqueue.h"
#include "debug.hpp"
@ -9,9 +25,6 @@
#include <stdlib.h>
#include <thread>
/*
Initialize the device
*/
int device_max_compute_units = 1;
bool device_initilized = false;
int init_device() {
@ -32,7 +45,6 @@ int init_device() {
}
// Create Kernel
static int kernelIds = 0;
cu_kernel *create_kernel(const void *func, dim3 gridDim, dim3 blockDim,
void **args, size_t sharedMem, cudaStream_t stream) {
cu_kernel *ker = (cu_kernel *)calloc(1, sizeof(cu_kernel));
@ -71,13 +83,11 @@ __thread int warp_shfl[32] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
/*
Enqueue Kernel (k) to the scheduler kernelQueue
*/
int TaskToExecute;
// Enqueue Kernel to the scheduler kernelQueue
int schedulerEnqueueKernel(cu_kernel *k) {
int totalBlocks =
k->totalBlocks; // calculate gpu_block_to_execute_per_cpu_thread
int totalBlocks = k->totalBlocks;
// by default, all CPU cores are used to execute GPU blocks equally
int gpuBlockToExecutePerCpuThread =
(totalBlocks + device_max_compute_units - 1) / device_max_compute_units;
TaskToExecute = (totalBlocks + gpuBlockToExecutePerCpuThread - 1) /
@ -93,28 +103,21 @@ int schedulerEnqueueKernel(cu_kernel *k) {
return C_SUCCESS;
}
/*
Kernel Launch with numBlocks and numThreadsPerBlock
*/
// Push kernel to the kernelQueue
int cuLaunchKernel(cu_kernel **k) {
if (!device_initilized) {
init_device();
}
// Calculate Block Size N/numBlocks
cu_kernel *ker = *k;
int status = C_RUN;
// set complete to false, this variable is used for sync
for (int i = 0; i < scheduler->num_worker_threads; i++) {
scheduler->thread_pool[i].completeTask = 0;
}
schedulerEnqueueKernel(ker);
schedulerEnqueueKernel(*k);
return 0;
}
/*
Thread Gets Work
*/
// threads in thread-pool try to fetch work from the queue
int get_work(c_thread *th) {
int dynamic_shared_mem_size = 0;
dim3 gridDim;
@ -136,6 +139,7 @@ int get_work(c_thread *th) {
grid_size_x = gridDim.x;
grid_size_y = gridDim.y;
grid_size_z = gridDim.z;
// allocate dynamic shared memory
if (dynamic_shared_mem_size > 0)
dynamic_shared_memory = (int *)malloc(dynamic_shared_mem_size);
// execute GPU blocks
@ -153,7 +157,8 @@ int get_work(c_thread *th) {
}
// if cannot get tasks, check whether programs stop
if (scheduler->threadpool_shutdown_requested) {
return true; // thread exit
// thread exit
break;
}
}
return 0;
@ -176,9 +181,7 @@ void *driver_thread(void *p) {
}
}
/*
Initialize the scheduler
*/
// Initialize the scheduler
int scheduler_init(cu_device device) {
scheduler = (cu_pool *)calloc(1, sizeof(cu_pool));
scheduler->num_worker_threads = device.max_compute_units;
@ -198,8 +201,6 @@ int scheduler_init(cu_device device) {
return C_SUCCESS;
}
void scheduler_uninit() { assert(0 && "Scheduler Unitit no Implemente\n"); }
/*
Barrier for Kernel Launch
*/

View File

@ -30,7 +30,7 @@ g++ -o $1 -fPIC -no-pie \
$HeteroMark_PATH/src/$1/cuda/main.cc host.o kernel.o $HeteroMark_PATH/src/$1/*.cc $HeteroMark_PATH/src/common/benchmark/*.cc \
$HeteroMark_PATH/src/common/command_line_option/*.cc $HeteroMark_PATH/src/common/time_measurement/*.cc \
-L$CuPBoP_BUILD_PATH/runtime -L$CuPBoP_BUILD_PATH/runtime/threadPool \
-I$HeteroMark_PATH -I$CUDA_PATH/include -lpthread -lc -lx86Runtime -lthreadPool
-I$HeteroMark_PATH -I$CUDA_PATH/include -lpthread -lc -lCPUruntime -lthreadPool
case $1 in
aes)