add backbone, including basic features for compilation

RobinHan 2022-01-11 11:01:42 -05:00
commit addf0a95b7
49 changed files with 4831 additions and 0 deletions

CMakeLists.txt (new file, 41 lines added)

@ -0,0 +1,41 @@
cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
project(CudaOnX86)
set(CMAKE_PROJECT_DESCRIPTION "Executing CUDA on X86 architecture.")
set(CMAKE_CXX_STANDARD "14")
set(MAJOR_VERSION 0)
set(MINOR_VERSION 1)
set(VERSION_STRING ${MAJOR_VERSION}.${MINOR_VERSION})
set(COX_VERSION ${VERSION_STRING})
# locate llvm-config and query compile flags, library path, and link flags
if(DEFINED LLVM_CONFIG_PATH)
if(IS_ABSOLUTE "${LLVM_CONFIG_PATH}")
if(EXISTS "${LLVM_CONFIG_PATH}")
set(LLVM_CONFIG "${LLVM_CONFIG_PATH}")
else()
message(FATAL_ERROR "llvm-config is not found in ${LLVM_CONFIG_PATH}")
endif()
message(STATUS "Using llvm-config: ${LLVM_CONFIG}")
execute_process(
COMMAND "${LLVM_CONFIG}" "--cxxflags"
OUTPUT_VARIABLE LLVM_CXX_FLAG
OUTPUT_STRIP_TRAILING_WHITESPACE)
execute_process(
COMMAND "${LLVM_CONFIG}" "--libdir"
OUTPUT_VARIABLE LLVM_LIB_PATH
OUTPUT_STRIP_TRAILING_WHITESPACE)
execute_process(
COMMAND "${LLVM_CONFIG}" "--libs"
OUTPUT_VARIABLE LLVM_LINK_FLAG
OUTPUT_STRIP_TRAILING_WHITESPACE)
else()
message(FATAL_ERROR "LLVM_CONFIG_PATH must be an absolute path")
endif()
else()
message(FATAL_ERROR "llvm-config is required")
endif()
set(CMAKE_CXX_FLAGS "${LLVM_CXX_FLAG} ${CMAKE_CXX_FLAGS}")
set(GCC_COVERAGE_LINK_FLAGS
"-L${LLVM_LIB_PATH} ${LLVM_LINK_FLAG} -lz -lrt -ldl -ltinfo -lpthread -lm")
add_subdirectory(compilation)

CONTRIBUTING.md (new file, 34 lines added)

@ -0,0 +1,34 @@
# Contributing to COX
Thank you for your interest in contributing to COX!
We appreciate all contributions, including but not limited to:
- Add documentation
- Add new features and components
- Fix bugs
## How to contribute?
0. (Optional) Open an issue and discuss your idea before you start
1. Fork the latest version of COX
2. Commit your changes to the forked repo
3. Create a pull request against the COX main branch
## Code style
We follow the Clang format in this repo.
To make sure your contribution follows the correct style,
we highly recommend installing [pre-commit](https://pre-commit.com/) before development.
```bash
# a Python environment is required
pip install pre-commit
```
Then, from the repository folder, run the following command:
```bash
pre-commit install
```
With the pre-commit hook installed, each local commit is checked automatically.

LICENSE (new file, 21 lines added)

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2021 Ruobing Han
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

README.md (new file, 62 lines added)

@ -0,0 +1,62 @@
# COX: CUDA on X86
## Introduction
This project consists of two parts: a kernel translator, a series of LLVM
passes that take SPMD NVVM IR as input and output the corresponding
MPMD+SIMD LLVM IR that can be executed on CPU devices, and a host
translator that replaces CUDA runtime APIs with their CPU counterparts.
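For intuition, the transformation can be pictured as follows. This is a
conceptual sketch only, with illustrative names; it is not the tool's actual
output:
```cpp
// SPMD input (CUDA): each GPU thread executes the body once.
__global__ void vecadd(const int *a, const int *b, int *c) {
  int i = threadIdx.x;
  c[i] = a[i] + b[i];
}

// MPMD+SIMD output (conceptual): one CPU thread iterates over all threads
// of a block; the inner loop is a candidate for vectorization.
void vecadd_cpu(const int *a, const int *b, int *c, int block_size) {
  for (int inter_warp = 0; inter_warp < block_size / 32; ++inter_warp) // warps
    for (int intra_warp = 0; intra_warp < 32; ++intra_warp) {          // lanes
      int i = inter_warp * 32 + intra_warp; // reconstructed threadIdx.x
      c[i] = a[i] + b[i];
    }
}
```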
## Install
### Prerequisites
* Linux: Verified on Ubuntu 18.04
* LLVM 10.0
* NVIDIA CUDA-toolkit
* x86 CPU
* pthread
* GCC 7.5.0
### Installation
1. Clone from GitHub
```bash
git clone https://github.com/drcut/open_source_template
cd open_source_template
```
2. Build the translator from NVVM IR to LLVM IR for X86
```bash
mkdir build && cd build
cmake .. -DLLVM_CONFIG_PATH=`which llvm-config` # need path to llvm-config
make
```
## Run the vecadd sample
```bash
# Generate bitcode from human-readable LLVM IR
llvm-as ../compilation/examples/vecadd/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
# use LLVM passes to transform NVVM IR (SPMD) to LLVM IR (MPMD+SIMD).
# NOTE: we hard-code the grid size (1, 1, 1)
# and block size (32, 1, 1) into the generated LLVM IR
./compilation/kernelTranslator \
../compilation/examples/vecadd/kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc \
kernel.bc 1 1 1 32 1 1
# generate object file from LLVM IR
llc --filetype=obj kernel.bc
# link the generated kernel object
# with the host code and produce an executable
g++ ../compilation/examples/vecadd/host.cpp \
kernel.o -lpthread -o vecadd_example
# run the executable
./vecadd_example
```
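The linked `host.cpp` is not shown in this commit. Under the calling
convention implemented by `decode_input` in `generate_x86_format.cpp` (each
translated kernel gets a `*_wrapper` entry taking a single `i8*` that points
to an array of argument pointers), a minimal host-side launch could look like
the sketch below; all names are illustrative:
```cpp
#include <pthread.h>

// Emitted by the kernel translator; the mangled name is illustrative.
extern "C" void _Z6vecaddPKiS0_Pi_wrapper(char *packed_args);

static void *launch(void *packed) {
  _Z6vecaddPKiS0_Pi_wrapper(static_cast<char *>(packed));
  return nullptr;
}

int main() {
  static int a[32], b[32], c[32];
  const int *pa = a, *pb = b;
  int *pc = c;
  // One slot per kernel parameter, each holding a pointer to the argument.
  void *args[3] = {&pa, &pb, &pc};
  // NOTE: a real host must also initialize the translator's bookkeeping
  // globals (block_size, grid_size, ...); that is omitted here.
  pthread_t t;
  pthread_create(&t, nullptr, launch, args);
  pthread_join(t, nullptr);
  return 0;
}
```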
## Author
[Ruobing Han](https://drcut.github.io/) is a CS PhD student at the
Georgia Institute of Technology, under the supervision
of Prof. [Hyesoon Kim](https://www.cc.gatech.edu/~hyesoon/).

compilation/CMakeLists.txt (new file, 23 lines added)

@ -0,0 +1,23 @@
cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
project(
NVVM2X86
DESCRIPTION "Translate NVVM IR to LLVM IR for X86"
LANGUAGES CXX)
set(CMAKE_VERBOSE_MAKEFILE ON)
# compile kernel translator
include_directories(./KernelTranslation/include)
add_subdirectory(KernelTranslation)
add_executable(kernelTranslator KernelTranslation.cpp)
target_link_libraries(kernelTranslator spmd2mpmd ${GCC_COVERAGE_LINK_FLAGS})
# compile host translator
include_directories(./HostTranslation/include)
add_subdirectory(HostTranslation)
add_executable(hostTranslator HostTranslation.cpp)
target_link_libraries(hostTranslator spmd2mpmd cudaRuntime2cpuRuntime
${GCC_COVERAGE_LINK_FLAGS})

compilation/HostTranslation.cpp (new file, 25 lines added)

@ -0,0 +1,25 @@
#include "ReplaceKernelLaunch.h"
#include "tool.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include <assert.h>
#include <iostream>
#include <stdlib.h>
using namespace llvm;
int main(int argc, char **argv) {
assert(argc == 3 && "incorrect number of arguments\n");
char *input_host_path = argv[1];
char *output_host_path = argv[2];
// load LLVM module(s)
llvm::Module *hostModule = LoadModuleFromFilr(input_host_path);
VerifyModule(hostModule);
// process host module
ReplaceKernelLaunch(hostModule);
VerifyModule(hostModule);
DumpModule(hostModule, output_host_path);
return 0;
}

compilation/HostTranslation/CMakeLists.txt (new file, 21 lines added)

@ -0,0 +1,21 @@
cmake_minimum_required(VERSION 3.1)
# C++ project
project(
HostTranslation
DESCRIPTION "Translate CUDA host modules to CPU host modules,
mainly replace CUDA Runtime APIs with CPU Runtime APIs"
LANGUAGES CXX)
set(CMAKE_VERBOSE_MAKEFILE ON)
set(LIB_NAME cudaRuntime2cpuRuntime)
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_BUILD_TYPE Debug)
include_directories(./include)
file(GLOB proj_HEADERS "include/*.h")
file(GLOB proj_SOURCES "lib/*.cpp")
# Add core library.
add_library(${LIB_NAME} SHARED ${proj_HEADERS} ${proj_SOURCES})

compilation/HostTranslation/include/ReplaceKernelLaunch.h (new file, 11 lines added)

@ -0,0 +1,11 @@
#ifndef __NVVM2x86_REPLACE_KERNEL_LAUNCH__
#define __NVVM2x86_REPLACE_KERNEL_LAUNCH__
#include "llvm/IR/Module.h"
/*
* Original:      i8* bitcast (void (i32*, i32*, i32*, i32)* @_Z9vecPKiS0_Pii to i8*)
* Replaced with: i8* bitcast (i8* (i8*)* @_Z9vecPKiS0_Pii_wrapper to i8*)
*/
void ReplaceKernelLaunch(llvm::Module *M);
#endif

compilation/HostTranslation/lib/ReplaceKernelLaunch.cpp (new file, 94 lines added)

@ -0,0 +1,94 @@
#include "ReplaceKernelLaunch.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include <iostream>
#include <map>
#include <set>
using namespace llvm;
// Original:      i8* bitcast (void (i32*, i32*, i32*, i32)* @_Z9vecPKiS0_Pii to i8*)
// Replaced with: i8* bitcast (i8* (i8*)* @_Z9vecPKiS0_Pii_wrapper to i8*)
void ReplaceKernelLaunch(llvm::Module *M) {
LLVMContext &context = M->getContext();
auto VoidTy = llvm::Type::getVoidTy(context);
auto I8 = llvm::Type::getInt8PtrTy(context);
std::map<std::string, BitCastInst *> kernels;
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
Function *F = &(*i);
auto func_name = F->getName().str();
for (Function::iterator b = F->begin(); b != F->end(); ++b) {
BasicBlock *B = &(*b);
for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) {
Instruction *inst = &(*i);
if (llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(inst)) {
if (Function *calledFunction = callInst->getCalledFunction()) {
if (calledFunction->getName().startswith("cudaLaunchKernel")) {
Value *callOperand = callInst->getArgOperand(0);
Function *functionOperand =
dyn_cast<Function>(callInst->getArgOperand(0));
// call function is wrapped in a bitcast
if (functionOperand == NULL) {
std::vector<size_t> arg_sizes;
functionOperand =
dyn_cast<Function>(callOperand->stripPointerCasts());
std::cout << " Parent (Caller) Function Name: " << func_name
<< ", cudaLaunchKernel Function: "
<< functionOperand->getName().str() << ", args "
<< functionOperand->arg_size() << std::endl;
auto rep = kernels.find(functionOperand->getName().str());
if (rep != kernels.end()) {
callInst->setArgOperand(0, rep->second);
continue;
}
std::vector<Type *> Params;
Params.push_back(I8);
FunctionType *FT = FunctionType::get(VoidTy, Params, false);
std::string newName =
functionOperand->getName().str() + "_wrapper";
Function *F =
Function::Create(FT, Function::ExternalLinkage, newName, M);
F->setDSOLocal(true);
BitCastInst *BC = new BitCastInst(F, I8, "", callInst);
callInst->setArgOperand(0, BC);
kernels.insert({functionOperand->getName().str(), BC});
}
}
}
}
}
}
}
}
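// Net effect at the CUDA source level (a sketch; the kernel name is
// illustrative):
//   before: cudaLaunchKernel((void *)vecadd, grid, block, args, shmem, stream);
//   after:  cudaLaunchKernel((void *)vecadd_wrapper, grid, block, args, shmem, stream);
// where vecadd_wrapper is the single-i8*-argument entry point that the kernel
// translator emits for the CPU runtime (see decode_input in
// KernelTranslation/lib/generate_x86_format.cpp).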

compilation/KernelTranslation.cpp (new file, 53 lines added)

@ -0,0 +1,53 @@
#include "generate_x86_format.h"
#include "handle_sync.h"
#include "init.h"
#include "insert_sync.h"
#include "insert_warp_loop.h"
#include "performance.h"
#include "tool.h"
#include "warp_func.h"
#include "llvm/IR/Module.h"
#include <assert.h>
#include <iostream>
#include <map>
#include <set>
#include <stdlib.h>
using namespace llvm;
int main(int argc, char **argv) {
assert(argc == 9 && "incorrect number of arguments\n");
llvm::Module *program = LoadModuleFromFilr(argv[1]);
// get grid and block dimensions from the input arguments
int *grid_dim = new int[3];
int *block_dim = new int[3];
grid_dim[0] = atoi(argv[3]);
grid_dim[1] = atoi(argv[4]);
grid_dim[2] = atoi(argv[5]);
block_dim[0] = atoi(argv[6]);
block_dim[1] = atoi(argv[7]);
block_dim[2] = atoi(argv[8]);
// inline, and create auxiliary global variables
init_block(program);
// insert sync before each vote, and replace the
// original vote function to warp vote
handle_warp_vote(program);
// replace warp shuffle
handle_warp_shfl(program);
// insert sync
insert_sync(program);
// split block by sync
split_block_by_sync(program);
// add loops over intra- and inter-warp threads
insert_warp_loop(program);
// (TODO): replace this patch
replace_built_in_function(program, grid_dim, block_dim);
// VerifyModule(program);
generate_x86_format(program);
// performance optimization
performance_optimization(program);
DumpModule(program, argv[2]);
return 0;
}

compilation/KernelTranslation/CMakeLists.txt (new file, 21 lines added)

@ -0,0 +1,21 @@
cmake_minimum_required(VERSION 3.1)
# C++ project
project(
KernelTranslation
DESCRIPTION
"Translate SPMD Kernel to MPMD format with hierarchical collapsing"
LANGUAGES CXX)
set(CMAKE_VERBOSE_MAKEFILE ON)
set(LIB_NAME spmd2mpmd)
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_BUILD_TYPE Debug)
include_directories(./include)
file(GLOB proj_HEADERS "include/*.h")
file(GLOB proj_SOURCES "lib/*.cpp")
# Add core library.
add_library(${LIB_NAME} SHARED ${proj_HEADERS} ${proj_SOURCES})

compilation/KernelTranslation/include/generate_x86_format.h (new file, 8 lines added)

@ -0,0 +1,8 @@
#ifndef __NVVM2x86_GENERATE_X86_FORMAT__
#define __NVVM2x86_GENERATE_X86_FORMAT__
#include "llvm/IR/Module.h"
void generate_x86_format(llvm::Module *M);
#endif

compilation/KernelTranslation/include/handle_sync.h (new file, 10 lines added)

@ -0,0 +1,10 @@
#ifndef __NVVM2x86_HANDLE_SYNC__
#define __NVVM2x86_HANDLE_SYNC__
#include "llvm/IR/Module.h"
using namespace llvm;
void split_block_by_sync(llvm::Module *M);
#endif

compilation/KernelTranslation/include/init.h (new file, 7 lines added)

@ -0,0 +1,7 @@
#ifndef __NVVM2x86_INIT__
#define __NVVM2x86_INIT__
#include "llvm/IR/Module.h"
void init_block(llvm::Module *M);
#endif

compilation/KernelTranslation/include/insert_sync.h (new file, 9 lines added)

@ -0,0 +1,9 @@
#ifndef __NVVM2x86_INSERT_SYNC__
#define __NVVM2x86_INSERT_SYNC__
#include "llvm/IR/Function.h"
// insert extra barrier
void insert_sync(llvm::Module *M);
#endif

compilation/KernelTranslation/include/insert_warp_loop.h (new file, 12 lines added)

@ -0,0 +1,12 @@
#ifndef __NVVM2x86_INSERT_WARP_LOOP__
#define __NVVM2x86_INSERT_WARP_LOOP__
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
using namespace llvm;
void insert_warp_loop(llvm::Module *M);
#endif

compilation/KernelTranslation/include/memory_hierarchy.h (new file, 9 lines added)

@ -0,0 +1,9 @@
#ifndef __NVVM2x86_MEMORY_HIERARCHY__
#define __NVVM2x86_MEMORY_HIERARCHY__
#include "llvm/IR/Module.h"
using namespace llvm;
void mem_share2global(llvm::Module *M);
#endif

compilation/KernelTranslation/include/performance.h (new file, 7 lines added)

@ -0,0 +1,7 @@
#ifndef __NVVM2x86_PERFORMANCE__
#define __NVVM2x86_PERFORMANCE__
#include "llvm/IR/Module.h"
void performance_optimization(llvm::Module *M);
#endif

compilation/KernelTranslation/include/tool.h (new file, 24 lines added)

@ -0,0 +1,24 @@
#ifndef __NVVM2x86_TOOL__
#define __NVVM2x86_TOOL__
#include "llvm/IR/Module.h"
llvm::Module *LoadModuleFromFilr(char *file_name);
void DumpModule(llvm::Module *M, char *file_name);
bool isKernelFunction(llvm::Module *M, llvm::Function *F);
void replace_block(llvm::Function *F, llvm::BasicBlock *before,
llvm::BasicBlock *after);
llvm::CallInst *CreateInterWarpBarrier(llvm::Instruction *InsertBefore);
llvm::CallInst *CreateIntraWarpBarrier(llvm::Instruction *InsertBefore);
void VerifyModule(llvm::Module *);
void phi2alloc(llvm::Module *M);
void remove_cuda_built_in(llvm::Module *M);
void replace_built_in_function(llvm::Module *M, int *grid_dim, int *block_dim);
void replace_asm_call(llvm::Module *M);
bool find_block_barrier_in_region(llvm::BasicBlock *start,
llvm::BasicBlock *end);
bool find_barrier_in_region(llvm::BasicBlock *start, llvm::BasicBlock *end);
bool has_warp_barrier(llvm::BasicBlock *B);
bool has_barrier(llvm::BasicBlock *B);
bool has_block_barrier(llvm::BasicBlock *B);
bool has_barrier(llvm::Function *F);
#endif

compilation/KernelTranslation/include/warp_func.h (new file, 10 lines added)

@ -0,0 +1,10 @@
#ifndef __NVVM2x86_WARP_FUNC__
#define __NVVM2x86_WARP_FUNC__
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
void handle_warp_vote(llvm::Module *M);
void handle_warp_shfl(llvm::Module *M);
#endif

compilation/KernelTranslation/lib/generate_x86_format.cpp (new file, 119 lines added)

@ -0,0 +1,119 @@
#include "generate_x86_format.h"
#include "tool.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/ToolOutputFile.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
using namespace llvm;
void set_meta_data(llvm::Module *M) {
M->setTargetTriple("x86_64-unknown-linux-gnu");
M->setDataLayout(
"e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128");
}
// as pthread accepts only a single void* argument,
// we have to decode the packed kernel arguments inside the wrapper
void decode_input(llvm::Module *M) {
std::set<llvm::Function *> need_remove;
LLVMContext *C = &M->getContext();
llvm::Type *Int32T = Type::getInt32Ty(*C);
llvm::Type *Int8T = Type::getInt8Ty(*C);
llvm::FunctionType *LauncherFuncT = FunctionType::get(
Type::getVoidTy(*C), {PointerType::get(Int8T, 0)}, false);
// generate the wrapper function type;
// for now we only support arguments packed as a single int32* array
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
Function *F = &(*i);
if (!isKernelFunction(M, F))
continue;
auto func_name = F->getName().str();
llvm::IRBuilder<> Builder(M->getContext());
FunctionCallee fc =
M->getOrInsertFunction(func_name + "_wrapper", LauncherFuncT);
Function *WorkGroup = dyn_cast<Function>(fc.getCallee());
BasicBlock *Block = BasicBlock::Create(M->getContext(), "", WorkGroup);
Builder.SetInsertPoint(Block);
// WorkGroup has only a single input
Function::arg_iterator ai = WorkGroup->arg_begin();
SmallVector<Value *, 8> Arguments;
Value *input_arg = &*ai;
// convert to int**
input_arg = Builder.CreateBitOrPointerCast(
input_arg, PointerType::get(PointerType::get(Int32T, 0), 0));
size_t idx = 0;
for (Function::const_arg_iterator ii = F->arg_begin(), ee = F->arg_end();
ii != ee; ++ii) {
Type *ArgType = ii->getType();
// calculate addr
Value *GEP = Builder.CreateGEP(input_arg, ConstantInt::get(Int32T, idx));
// load corresponding int*
GEP = Builder.CreateLoad(GEP);
// bitcast
GEP = Builder.CreateBitOrPointerCast(GEP, PointerType::get(ArgType, 0));
Value *Arg = Builder.CreateLoad(GEP);
Arguments.push_back(Arg);
++idx;
}
CallInst *c = Builder.CreateCall(F, ArrayRef<llvm::Value *>(Arguments));
Builder.CreateRetVoid();
}
for (auto f : need_remove) {
f->dropAllReferences();
f->eraseFromParent();
}
}
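// Conceptually, the wrapper built above is equivalent to the following C++
// (a sketch; names and argument types are illustrative, the real code is
// emitted directly as IR):
//   void foo_wrapper(char *packed) {
//     int **args = (int **)packed;  // one slot per kernel parameter
//     int *a = *(int **)args[0];    // load the 0th argument value
//     int n = *(int *)args[1];      // load the 1st argument value
//     foo(a, n);                    // call the original kernel function
//   }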
void remove_barrier(llvm::Module *M) {
std::vector<Instruction *> need_remove;
for (auto F = M->begin(); F != M->end(); ++F)
for (auto BB = F->begin(); BB != F->end(); ++BB) {
for (auto BI = BB->begin(); BI != BB->end(); BI++) {
if (auto Call = dyn_cast<CallInst>(BI)) {
if (!Call->getCalledFunction())
continue; // skip indirect calls
auto func_name = Call->getCalledFunction()->getName().str();
if (func_name == "llvm.nvvm.bar.warp.sync" ||
func_name == "llvm.nvvm.barrier0" ||
func_name == "llvm.nvvm.barrier.sync") {
need_remove.push_back(Call);
}
}
}
}
for (auto inst : need_remove) {
inst->eraseFromParent();
}
}
void generate_x86_format(llvm::Module *M) {
// change metadata
set_meta_data(M);
// decode argument
decode_input(M);
// remove barrier
remove_barrier(M);
}

compilation/KernelTranslation/lib/handle_sync.cpp (new file, 57 lines added)

@ -0,0 +1,57 @@
#include "handle_sync.h"
#include "tool.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include <set>
#include <string>
using namespace llvm;
void split_block_by_sync(llvm::Function *F) {
std::set<llvm::Instruction *> sync_inst;
bool jump_first_sync = 1;
for (Function::iterator b = F->begin(); b != F->end(); ++b) {
BasicBlock *B = &(*b);
for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) {
Instruction *inst = &(*i);
if (jump_first_sync) {
jump_first_sync = 0;
Instruction *next_inst = &(*std::next(i));
sync_inst.insert(next_inst);
continue;
}
llvm::CallInst *Call = llvm::dyn_cast<llvm::CallInst>(inst);
if (Call) {
if (!Call->getCalledFunction())
continue; // skip indirect calls
auto func_name = Call->getCalledFunction()->getName().str();
if (func_name == "llvm.nvvm.barrier0" ||
func_name == "llvm.nvvm.bar.warp.sync" ||
func_name == "llvm.nvvm.barrier.sync") {
sync_inst.insert(Call);
// also record the instruction after the sync, so that
// after splitting each sync sits in a basic block of its own
Instruction *next_inst = &(*std::next(i));
sync_inst.insert(next_inst);
}
}
}
}
int _tmp = 0;
for (auto inst : sync_inst) {
inst->getParent()->splitBasicBlock(
inst, inst->getParent()->getName().str() + "_after_block_sync_" +
std::to_string(_tmp++));
}
}
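// Example (a sketch): a block of the form
//   %a = ...
//   call void @llvm.nvvm.barrier0()
//   %b = ...
// is split into three blocks -- before the barrier, the barrier itself, and
// after it -- so that every barrier ends up alone in its own basic block,
// which is the shape insert_warp_loop later relies on when it forms parallel
// regions between barriers.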
void split_block_by_sync(llvm::Module *M) {
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
Function *F = &(*i);
if (isKernelFunction(M, F))
split_block_by_sync(F);
}
}

compilation/KernelTranslation/lib/init.cpp (new file, 302 lines added)

@ -0,0 +1,302 @@
#include "init.h"
#include "memory_hierarchy.h"
#include "tool.h"
#include <iostream>
#include <set>
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/InitializePasses.h"
#include "llvm/PassInfo.h"
#include "llvm/PassRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
using namespace llvm;
void inline_func_vote(llvm::Module *M) {
std::set<llvm::Function *> need_remove;
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
Function *F = &(*i);
auto func_name = F->getName().str();
if (!isKernelFunction(M, F))
continue;
Function::iterator I = F->begin();
for (Function::iterator E = F->end(); I != E; ++I) {
for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
if (CallInst *c = dyn_cast<CallInst>(BI++)) {
if (c->getCalledFunction()) {
if (c->getCalledFunction()->getName().str() == "_Z10__any_syncji") {
InlineFunctionInfo IFI;
InlineFunction(c, IFI);
need_remove.insert(c->getCalledFunction());
}
}
}
}
}
}
for (auto f : need_remove) {
f->dropAllReferences();
f->eraseFromParent();
}
}
void create_global_variable(llvm::Module *M) {
llvm::Type *I32 = llvm::Type::getInt32Ty(M->getContext());
llvm::Type *I8 = llvm::Type::getInt8Ty(M->getContext());
auto zero = llvm::ConstantInt::get(I32, 0, true);
// we need global variables used for warp shuffle and warp vote
llvm::Type *WarpArrayType = llvm::ArrayType::get(I32, 32);
llvm::Type *VoteArrayType = llvm::ArrayType::get(I8, 32);
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
zero, "intra_warp_index", NULL,
llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
zero, "inter_warp_index", NULL,
llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
NULL, "block_size", NULL,
llvm::GlobalValue::NotThreadLocal, 0, false);
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
NULL, "block_size_x", NULL,
llvm::GlobalValue::NotThreadLocal, 0, false);
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
NULL, "block_size_y", NULL,
llvm::GlobalValue::NotThreadLocal, 0, false);
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
NULL, "block_size_z", NULL,
llvm::GlobalValue::NotThreadLocal, 0, false);
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
NULL, "grid_size", NULL,
llvm::GlobalValue::NotThreadLocal, 0, false);
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
NULL, "block_index", NULL,
llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
// TLS variable used for warp-level collective operators
new llvm::GlobalVariable(
*M, WarpArrayType, false, llvm::GlobalValue::ExternalLinkage, NULL,
"warp_shfl", NULL, llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
auto warp_vote = new llvm::GlobalVariable(
*M, VoteArrayType, false, llvm::GlobalValue::ExternalLinkage, NULL,
"warp_vote", NULL, llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
warp_vote->setAlignment(llvm::MaybeAlign(32));
}
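// The globals created above correspond roughly to the following C++
// declarations (a sketch; the real variables exist only at the IR level):
//   thread_local int intra_warp_index;           // lane id within a warp
//   thread_local int inter_warp_index;           // warp id within a block
//   int block_size, block_size_x, block_size_y, block_size_z, grid_size;
//   thread_local int block_index;
//   thread_local int warp_shfl[32];              // scratch for warp shuffle
//   alignas(32) thread_local char warp_vote[32]; // scratch for warp vote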
void remove_metadata(llvm::Module *M) {
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
Function *F = &(*i);
// getAllMetadata appends, so use a fresh vector for each function
SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
F->getAllMetadata(MDs);
for (auto &MD : MDs) {
F->setMetadata(MD.first, NULL);
}
F->removeFnAttr("target-features");
F->removeFnAttr("target-cpu");
}
}
void init_llvm_pass() {
InitializeAllTargets();
InitializeAllTargetMCs();
InitializeAllAsmPrinters();
InitializeAllAsmParsers();
PassRegistry &Registry = *PassRegistry::getPassRegistry();
initializeCore(Registry);
initializeScalarOpts(Registry);
initializeVectorization(Registry);
initializeIPO(Registry);
initializeAnalysis(Registry);
initializeTransformUtils(Registry);
initializeInstCombine(Registry);
initializeInstrumentation(Registry);
initializeTarget(Registry);
llvm::StringMap<llvm::cl::Option *> &opts = llvm::cl::getRegisteredOptions();
llvm::cl::Option *O = nullptr;
O = opts["scalarize-load-store"];
assert(O && "could not find LLVM option 'scalarize-load-store'");
O->addOccurrence(1, StringRef("scalarize-load-store"), StringRef("1"), false);
// The LLVM inner-loop vectorizer does not check whether the loop is inside
// another loop, in which case even small trip count loops might be
// worthwhile to vectorize.
O = opts["vectorizer-min-trip-count"];
assert(O && "could not find LLVM option 'vectorizer-min-trip-count'");
O->addOccurrence(1, StringRef("vectorizer-min-trip-count"), StringRef("2"),
false);
// Disable the jump threading optimization (via the following two options)
// from duplicating blocks. Jump threading would mess up parallel region
// construction, especially when the kernel contains barriers.
// TODO: if re-enabled, the parallel region construction code needs
// improvements; also make sure it does not disallow other optimizations
// like vectorization.
O = opts["jump-threading-threshold"];
assert(O && "could not find LLVM option 'jump-threading-threshold'");
O->addOccurrence(1, StringRef("jump-threading-threshold"), StringRef("0"),
false);
O = opts["jump-threading-implication-search-threshold"];
assert(O && "could not find LLVM option "
"'jump-threading-implication-search-threshold'");
O->addOccurrence(1, StringRef("jump-threading-implication-search-threshold"),
StringRef("0"), false);
// Enable diagnostics from the loop vectorizer.
O = opts["pass-remarks-missed"];
assert(O && "could not find LLVM option 'pass-remarks-missed'");
O->addOccurrence(1, StringRef("pass-remarks-missed"),
StringRef("loop-vectorize"), false);
O->addOccurrence(1, StringRef("pass-remarks-missed"),
StringRef("slp-vectorize"), false);
O = opts["pass-remarks-analysis"];
assert(O && "could not find LLVM option 'pass-remarks-analysis'");
O->addOccurrence(1, StringRef("pass-remarks-analysis"),
StringRef("loop-vectorize"), false);
O->addOccurrence(1, StringRef("pass-remarks-analysis"),
StringRef("slp-vectorize"), false);
O = opts["pass-remarks"];
assert(O && "could not find LLVM option 'pass-remarks'");
O->addOccurrence(1, StringRef("pass-remarks"), StringRef("loop-vectorize"),
false);
O->addOccurrence(1, StringRef("pass-remarks"), StringRef("slp-vectorize"),
false);
}
void llvm_preprocess(llvm::Module *M) {
init_llvm_pass();
auto Registry = PassRegistry::getPassRegistry();
llvm::legacy::PassManager Passes;
std::vector<std::string> passes;
passes.push_back("lowerswitch");
passes.push_back("mem2reg");
passes.push_back("simplifycfg");
passes.push_back("loop-simplify");
for (auto pass : passes) {
const PassInfo *PIs = Registry->getPassInfo(StringRef(pass));
if (PIs) {
Pass *thispass = PIs->createPass();
Passes.add(thispass);
} else {
printf("Pass: %s not found\n", pass.c_str());
}
}
Passes.run(*M);
}
bool lower_constant_expr(llvm::Module *M) {
bool modified = false;
LLVMContext &context = M->getContext();
auto I32 = llvm::Type::getInt32Ty(context);
std::vector<CallInst *> need_remove;
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
Function *F = &(*i);
auto func_name = F->getName().str();
if (!isKernelFunction(M, F))
continue;
for (auto BB = F->begin(); BB != F->end(); ++BB) {
for (auto BI = BB->begin(); BI != BB->end(); BI++) {
if (auto load_inst = dyn_cast<llvm::LoadInst>(BI)) {
auto load_from = load_inst->getOperand(0);
if (auto get_element_ptr = dyn_cast<llvm::ConstantExpr>(load_from)) {
modified = true;
auto ReplInst = get_element_ptr->getAsInstruction();
ReplInst->insertBefore(load_inst);
std::vector<Instruction *> Users;
// Do not replace use during iteration of use. Do it in another loop
for (auto U : get_element_ptr->users()) {
if (auto InstUser = dyn_cast<Instruction>(U)) {
Users.push_back(InstUser);
}
}
for (auto &User : Users)
User->replaceUsesOfWith(get_element_ptr, ReplInst);
}
} else if (auto store_inst = dyn_cast<llvm::StoreInst>(BI)) {
auto store_to = store_inst->getOperand(1);
if (auto addr_cast = dyn_cast<llvm::ConstantExpr>(store_to)) {
modified = true;
auto ReplInst = addr_cast->getAsInstruction();
ReplInst->insertBefore(store_inst);
std::vector<Instruction *> Users;
// Do not replace use during iteration of use. Do it in another loop
for (auto U : addr_cast->users()) {
if (auto InstUser = dyn_cast<Instruction>(U)) {
Users.push_back(InstUser);
}
}
for (auto &User : Users)
User->replaceUsesOfWith(addr_cast, ReplInst);
}
} else if (auto get_element_ptr =
dyn_cast<llvm::GetElementPtrInst>(BI)) {
auto get_from = get_element_ptr->getOperand(0);
if (auto addr_cast = dyn_cast<llvm::ConstantExpr>(get_from)) {
modified = true;
auto ReplInst = addr_cast->getAsInstruction();
ReplInst->insertBefore(get_element_ptr);
std::vector<Instruction *> Users;
// Do not replace use during iteration of use. Do it in another loop
for (auto U : addr_cast->users()) {
if (auto InstUser = dyn_cast<Instruction>(U)) {
Users.push_back(InstUser);
}
}
for (auto &User : Users)
User->replaceUsesOfWith(addr_cast, ReplInst);
}
}
}
}
}
return modified;
}
void init_block(llvm::Module *M) {
// run standard LLVM preprocessing passes
llvm_preprocess(M);
// remove useless CUDA built-in functions
remove_cuda_built_in(M);
// lower ConstantExpression
bool modified;
do {
modified = lower_constant_expr(M);
} while (modified);
// remove useless metadata
remove_metadata(M);
// inline vote function
inline_func_vote(M);
// create global variable for warp and vote
create_global_variable(M);
// replace phi nodes with alloca/load/store
phi2alloc(M);
// replace shared memory with global variables
mem_share2global(M);
// replace inline asm calls
replace_asm_call(M);
}

compilation/KernelTranslation/lib/insert_sync.cpp (new file, 494 lines added)

@ -0,0 +1,494 @@
#include "insert_sync.h"
#include "assert.h"
#include "handle_sync.h"
#include "tool.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ValueSymbolTable.h"
#include "llvm/InitializePasses.h"
#include "llvm/PassInfo.h"
#include "llvm/PassRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <iostream>
#include <queue>
using namespace llvm;
class InsertBuiltInBarrier : public llvm::FunctionPass {
public:
static char ID;
InsertBuiltInBarrier() : FunctionPass(ID) {}
virtual bool runOnFunction(Function &F) {
if (!isKernelFunction(F.getParent(), &F))
return 0;
std::vector<llvm::Instruction *> insert_intra_warp_sync_before;
std::vector<llvm::Instruction *> insert_inter_warp_sync_before;
// insert sync in the entry
BasicBlock *entry = &(*F.begin());
for (auto i = entry->begin(); i != entry->end(); i++) {
if (!isa<AllocaInst>(i)) {
insert_inter_warp_sync_before.push_back(&(*(i)));
break;
}
}
for (Function::iterator I = F.begin(); I != F.end(); ++I) {
BasicBlock::iterator BI = I->begin();
// insert barrier before return
for (; BI != I->end(); BI++) {
llvm::ReturnInst *Ret = llvm::dyn_cast<llvm::ReturnInst>(&(*BI));
if (Ret) {
insert_inter_warp_sync_before.push_back(&(*BI));
}
}
}
if (insert_intra_warp_sync_before.empty() &&
insert_inter_warp_sync_before.empty())
return 0;
for (auto inst : insert_intra_warp_sync_before) {
CreateIntraWarpBarrier(inst);
}
for (auto inst : insert_inter_warp_sync_before) {
CreateInterWarpBarrier(inst);
}
return 1;
}
};
class InsertConditionalBarrier : public llvm::FunctionPass {
public:
static char ID;
InsertConditionalBarrier() : FunctionPass(ID) {}
virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const {
AU.addRequired<PostDominatorTreeWrapperPass>();
AU.addPreserved<PostDominatorTreeWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
}
BasicBlock *firstNonBackedgePredecessor(llvm::BasicBlock *bb) {
DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
pred_iterator I = pred_begin(bb), E = pred_end(bb);
if (I == E)
return NULL;
while (I != E && DT->dominates(bb, *I))
++I;
if (I == E)
return NULL;
else
return *I;
}
BasicBlock *firstNonBackedgeSuccessor(llvm::BasicBlock *bb) {
DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto t = bb->getTerminator();
assert(t->getNumSuccessors() <= 2);
for (unsigned i = 0, e = t->getNumSuccessors(); i != e; ++i) {
BasicBlock *successor = t->getSuccessor(i);
bool isBackedge = DT->dominates(successor, bb);
if (isBackedge)
continue;
return successor;
}
return NULL; // all successors are backedges
}
virtual bool runOnFunction(Function &F) {
if (!isKernelFunction(F.getParent(), &F))
return 0;
auto PDT = &getAnalysis<PostDominatorTreeWrapperPass>();
// first find all conditional barriers
std::vector<BasicBlock *> conditionalBarriers;
for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i) {
BasicBlock *b = &*i;
if (!has_barrier(b))
continue;
// Unconditional barrier postdominates the entry node.
if (PDT->getPostDomTree().dominates(b, &F.getEntryBlock()))
continue;
conditionalBarriers.push_back(b);
}
if (conditionalBarriers.size() == 0)
return 0;
bool changed = false;
while (!conditionalBarriers.empty()) {
BasicBlock *b = conditionalBarriers.back();
conditionalBarriers.pop_back();
// insert barrier in the start of if-condition
BasicBlock *pos = b;
BasicBlock *pred = firstNonBackedgePredecessor(b);
while (PDT->getPostDomTree().dominates(b, pred)) {
pos = pred;
// If our BB post dominates the given block, we know it is not the
// branching block that makes the barrier conditional.
pred = firstNonBackedgePredecessor(pred);
if (pred == b)
break; // Traced across a loop edge, skip this case.
}
// we should create warp/block barrier based on the conditional barrier
if (has_warp_barrier(b)) {
CreateIntraWarpBarrier(pred->getTerminator());
} else {
CreateInterWarpBarrier(pred->getTerminator());
}
changed = true;
// insert barrier in the merge point for then-else branches
// also insert barrier at the end of conditional branch
DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
std::queue<llvm::BasicBlock *> successor_queue;
for (int i = 0; i < pred->getTerminator()->getNumSuccessors(); i++) {
auto ss = pred->getTerminator()->getSuccessor(i);
if (!DT->dominates(ss, pred))
successor_queue.push(ss);
}
std::set<llvm::BasicBlock *> visited;
llvm::BasicBlock *merge_point = NULL;
while (!successor_queue.empty()) {
auto curr = successor_queue.front();
successor_queue.pop();
if (visited.find(curr) != visited.end())
continue;
visited.insert(curr);
if (PDT->getPostDomTree().dominates(curr, pred)) {
// found the true merge point
merge_point = curr;
if (has_warp_barrier(b)) {
CreateIntraWarpBarrier(&(*curr->begin()));
for (BasicBlock *Pred : predecessors(curr)) {
CreateIntraWarpBarrier(&(*Pred->getTerminator()));
}
} else {
CreateInterWarpBarrier(&(*curr->begin()));
for (BasicBlock *Pred : predecessors(curr)) {
CreateInterWarpBarrier(&(*Pred->getTerminator()));
}
}
break;
}
for (int i = 0; i < curr->getTerminator()->getNumSuccessors(); i++) {
auto ss = curr->getTerminator()->getSuccessor(i);
if (!DT->dominates(ss, curr))
successor_queue.push(ss);
}
}
assert(merge_point && "do not find merge point\n");
changed = true;
// we may create a new conditional barrier after insert
if (!PDT->getPostDomTree().dominates(pred, &F.getEntryBlock()))
conditionalBarriers.push_back(pred);
// find any block which is not dominated by the header
// but is postdominated by the merge point
std::queue<llvm::BasicBlock *> if_body;
std::set<llvm::BasicBlock *> visited_block;
for (int i = 0; i < pred->getTerminator()->getNumSuccessors(); i++) {
if_body.push(pred->getTerminator()->getSuccessor(i));
}
while (!if_body.empty()) {
auto curr = if_body.front();
if_body.pop();
if (visited_block.find(curr) != visited_block.end())
continue;
visited_block.insert(curr);
if (!PDT->getPostDomTree().dominates(merge_point, curr))
continue;
if (!DT->dominates(pred, curr) &&
PDT->getPostDomTree().dominates(merge_point, curr)) {
// we should insert barrier at the beginning and
// end of its predecessor
if (has_warp_barrier(b)) {
CreateIntraWarpBarrier(&(*curr->begin()));
for (BasicBlock *Pred : predecessors(curr)) {
CreateIntraWarpBarrier(&(*Pred->getTerminator()));
}
} else {
CreateInterWarpBarrier(&(*curr->begin()));
for (BasicBlock *Pred : predecessors(curr)) {
CreateInterWarpBarrier(&(*Pred->getTerminator()));
}
}
}
for (int i = 0; i < curr->getTerminator()->getNumSuccessors(); i++) {
if_body.push(curr->getTerminator()->getSuccessor(i));
}
}
}
return changed;
}
};
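// Example (a sketch) of what InsertConditionalBarrier does: given
//   if (cond) { ...; __syncthreads(); ... }
// it inserts a matching barrier before the branch and at the merge point, so
// that every path between two consecutive barriers is barrier-free and all
// threads of a block traverse the same sequence of barriers regardless of
// cond -- a precondition for wrapping the regions in thread loops.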
class InsertBarrierForSpecialCase : public llvm::FunctionPass {
public:
static char ID;
InsertBarrierForSpecialCase() : FunctionPass(ID) {}
virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const {
AU.addRequired<PostDominatorTreeWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
}
virtual bool runOnFunction(Function &F) {
if (!isKernelFunction(F.getParent(), &F))
return 0;
bool changed = false;
std::set<BasicBlock *> if_head;
// insert an extra block for the following case
// 1) there is a merge point for an if-else branch,
// but this merge point has other income edge
auto PDT = &getAnalysis<PostDominatorTreeWrapperPass>();
auto DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i) {
BasicBlock *b = &*i;
BasicBlock *merge_point = NULL;
if (b->getTerminator()->getNumSuccessors() == 2) {
auto b1 = b->getTerminator()->getSuccessor(0);
auto b2 = b->getTerminator()->getSuccessor(1);
if (PDT->getPostDomTree().dominates(b1, b2)) {
merge_point = b1;
} else if (PDT->getPostDomTree().dominates(b2, b1)) {
merge_point = b2;
} else {
assert(0 && "find complex if-else branch\n");
}
std::cout << std::flush;
for (BasicBlock *Pred : predecessors(merge_point)) {
if (!DT->dominates(b, Pred)) {
// we need to insert an extra block to be the merge point
// for the if-branch
if_head.insert(b);
}
}
}
}
auto M = F.getParent();
for (auto head : if_head) {
assert(head->getTerminator()->getNumSuccessors() == 2);
BasicBlock *merge_point = NULL;
auto s1 = head->getTerminator()->getSuccessor(0);
auto s2 = head->getTerminator()->getSuccessor(1);
if (PDT->getPostDomTree().dominates(s1, s2)) {
merge_point = s1;
} else {
merge_point = s2;
}
if (!find_barrier_in_region(head, merge_point)) {
printf("do not need to handle tri-income if: %s\n",
merge_point->getName().str().c_str());
continue;
}
BasicBlock *Block = BasicBlock::Create(M->getContext(), "if_end", &F);
llvm::IRBuilder<> Builder(M->getContext());
Builder.SetInsertPoint(Block);
auto br_inst = Builder.CreateBr(merge_point);
assert(has_barrier(head) && "preheader does not have barrier\n");
if (has_warp_barrier(head)) {
CreateIntraWarpBarrier(br_inst);
} else {
CreateInterWarpBarrier(br_inst);
}
// replace usage in if-branch
std::set<Instruction *> need_replace;
for (BasicBlock *Pred : predecessors(merge_point)) {
if (DT->dominates(head, Pred) && Pred != Block) {
need_replace.insert(Pred->getTerminator());
}
}
for (auto inst : need_replace) {
inst->replaceUsesOfWith(merge_point, Block);
}
changed = 1;
}
return changed;
}
};
class InsertConditionalForBarrier : public llvm::LoopPass {
public:
static char ID;
InsertConditionalForBarrier() : LoopPass(ID) {}
void getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<DominatorTreeWrapperPass>();
}
bool runOnLoop(Loop *L, LPPassManager &LPM) {
if (!isKernelFunction(L->getHeader()->getParent()->getParent(),
L->getHeader()->getParent()))
return 0;
// check whether this loop has barrier
bool is_conditional_loop = 0;
bool is_warp = 0;
for (Loop::block_iterator i = L->block_begin(), e = L->block_end(); i != e;
++i) {
for (BasicBlock::iterator j = (*i)->begin(), e = (*i)->end(); j != e;
++j) {
if (auto Call = dyn_cast<CallInst>(j)) {
if (!Call->getCalledFunction())
continue;
auto func_name = Call->getCalledFunction()->getName().str();
if (func_name == "llvm.nvvm.barrier0" ||
func_name == "llvm.nvvm.bar.warp.sync" ||
func_name == "llvm.nvvm.barrier.sync") {
is_conditional_loop = true;
if (func_name == "llvm.nvvm.bar.warp.sync") {
is_warp = 1;
}
break;
}
}
}
}
if (!is_conditional_loop)
return 0;
// insert barriers at the beginning of the header
// and at the end of the preheader, so that we get a
// single block connected with the latch
if (!is_warp) {
auto prehead_block = L->getLoopPreheader();
CreateInterWarpBarrier(prehead_block->getTerminator());
auto header_block = L->getHeader();
CreateInterWarpBarrier(&(*header_block->begin()));
} else {
auto prehead_block = L->getLoopPreheader();
CreateIntraWarpBarrier(prehead_block->getTerminator());
auto header_block = L->getHeader();
CreateIntraWarpBarrier(&(*header_block->begin()));
}
// as we assume all loops are rotated, we have to insert
// barrier before the condition jump of the loop exit
if (auto exit_block = L->getExitingBlock()) {
auto conditional_br =
dyn_cast<llvm::BranchInst>(exit_block->getTerminator());
assert(conditional_br && conditional_br->isConditional());
// insert barrier at the beginning of successor of exit
if (!is_warp)
CreateInterWarpBarrier(conditional_br);
else
CreateIntraWarpBarrier(conditional_br);
} else {
// handle break in for-loop
printf("loop has multiple exits\n");
// in this case, we also insert syncs before the for-body
auto header_block = L->getHeader();
assert(header_block->getTerminator()->getNumSuccessors() == 2 &&
"has more than 2 successors of the for-head\n");
BasicBlock *for_body = NULL;
for (int i = 0; i < header_block->getTerminator()->getNumSuccessors();
i++) {
auto bb = header_block->getTerminator()->getSuccessor(i);
if (L->contains(bb)) {
if (is_warp) {
CreateIntraWarpBarrier(&(*bb->begin()));
} else {
CreateInterWarpBarrier(&(*bb->begin()));
}
}
}
SmallVector<llvm::BasicBlock *, 8> ExitingBlocks;
L->getExitingBlocks(ExitingBlocks);
while (!ExitingBlocks.empty()) {
auto exit_block = ExitingBlocks.back();
ExitingBlocks.pop_back();
auto conditional_br =
dyn_cast<llvm::BranchInst>(exit_block->getTerminator());
assert(conditional_br && conditional_br->isConditional());
// insert barrier at the beginning of successor of exit
if (!is_warp)
CreateInterWarpBarrier(conditional_br);
else
CreateIntraWarpBarrier(conditional_br);
}
}
return 1;
}
};
char InsertBuiltInBarrier::ID = 0;
char InsertConditionalBarrier::ID = 0;
char InsertConditionalForBarrier::ID = 0;
char InsertBarrierForSpecialCase::ID = 0;
namespace {
static RegisterPass<InsertConditionalBarrier>
insert_conditional_barrier("insert-conditional-if-barriers",
"Insert conditional barriers for if body");
static RegisterPass<InsertConditionalForBarrier>
insert_conditional_for_barrier("insert-conditional-for-barriers",
"Insert conditional barriers for for loop");
static RegisterPass<InsertBarrierForSpecialCase>
insert_special_case("insert-special-case-barriers",
"Insert barriers for special cases");
static RegisterPass<InsertBuiltInBarrier>
insert_built_in_barrier("insert-built-in-barriers",
"Insert built in barriers");
} // namespace
void insert_sync(llvm::Module *M) {
auto Registry = PassRegistry::getPassRegistry();
llvm::legacy::PassManager Passes;
std::vector<std::string> passes;
passes.push_back("insert-built-in-barriers");
passes.push_back("insert-conditional-if-barriers");
passes.push_back("insert-conditional-for-barriers");
passes.push_back("insert-special-case-barriers");
for (auto pass : passes) {
const PassInfo *PIs = Registry->getPassInfo(StringRef(pass));
if (PIs) {
Pass *thispass = PIs->createPass();
Passes.add(thispass);
} else {
assert(0 && "Pass not found\n");
}
}
Passes.run(*M);
}

compilation/KernelTranslation/lib/insert_warp_loop.cpp (new file, 848 lines added)

@ -0,0 +1,848 @@
#include "insert_warp_loop.h"
#include "handle_sync.h"
#include "tool.h"
#include <assert.h>
#include <iostream>
#include <set>
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ValueSymbolTable.h"
#include "llvm/InitializePasses.h"
#include "llvm/PassInfo.h"
#include "llvm/PassRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <map>
#include <set>
#include <sstream>
#include <tuple>
#include <vector>
using namespace llvm;
struct ParallelRegion {
std::set<llvm::BasicBlock *> wrapped_block;
llvm::BasicBlock *successor_block;
llvm::BasicBlock *start_block;
llvm::BasicBlock *end_block;
bool inst_in_region(llvm::Instruction *inst) {
for (auto bb : wrapped_block) {
if (inst->getParent()->getName().str() == bb->getName().str())
return true;
}
return false;
}
bool inst_used_in_region(llvm::Instruction *inst) {
for (auto ui = inst->use_begin(); ui != inst->use_end(); ++ui) {
auto *user = dyn_cast<Instruction>(ui->getUser());
if (user == NULL)
continue;
if (inst_in_region(user)) {
return 1;
}
}
return 0;
}
};
std::map<llvm::Instruction *, unsigned> tempInstructionIds;
std::map<std::string, llvm::Instruction *> contextArrays;
int tempInstructionIndex = 0;
int need_nested_loop;
bool ShouldNotBeContextSaved(llvm::Instruction *instr) {
if (isa<BranchInst>(instr))
return true;
llvm::Module *M = instr->getParent()->getParent()->getParent();
llvm::LoadInst *load = dyn_cast<llvm::LoadInst>(instr);
if (load != NULL) {
auto load_addr = load->getPointerOperand();
if (load_addr == M->getGlobalVariable("intra_warp_index"))
return true;
if (load_addr == M->getGlobalVariable("inter_warp_index"))
return true;
if (load_addr == M->getGlobalVariable("warp_vote"))
return true;
}
// TODO: we should further analyze whether the local variable
// is the same among all threads within a warp
return false;
}
// generate a counterpart alloca at the beginning of the function
llvm::Instruction *GetContextArray(llvm::Instruction *instruction,
bool intra_warp_loop) {
std::ostringstream var;
if (std::string(instruction->getName().str()) != "") {
var << instruction->getName().str();
} else if (tempInstructionIds.find(instruction) != tempInstructionIds.end()) {
var << tempInstructionIds[instruction];
} else {
tempInstructionIds[instruction] = tempInstructionIndex++;
var << tempInstructionIds[instruction];
}
if (intra_warp_loop)
var << "_intra_warp_";
else
var << "_inter_warp_";
std::string varName = var.str();
if (contextArrays.find(varName) != contextArrays.end())
return contextArrays[varName];
BasicBlock &bb = instruction->getParent()->getParent()->getEntryBlock();
IRBuilder<> builder(&*(bb.getFirstInsertionPt()));
Function *FF = instruction->getParent()->getParent();
Module *M = instruction->getParent()->getParent()->getParent();
LLVMContext &C = M->getContext();
const llvm::DataLayout &Layout = M->getDataLayout();
llvm::Type *elementType;
if (isa<AllocaInst>(instruction)) {
elementType =
dyn_cast<AllocaInst>(instruction)->getType()->getElementType();
} else {
elementType = instruction->getType();
}
Type *AllocType = elementType;
AllocaInst *InstCast = dyn_cast<AllocaInst>(instruction);
if (InstCast) {
unsigned Alignment = InstCast->getAlignment();
uint64_t StoreSize = Layout.getTypeStoreSize(InstCast->getAllocatedType());
if ((Alignment > 1) && (StoreSize & (Alignment - 1))) {
uint64_t AlignedSize = (StoreSize & (~(Alignment - 1))) + Alignment;
assert(AlignedSize > StoreSize);
uint64_t RequiredExtraBytes = AlignedSize - StoreSize;
if (isa<ArrayType>(elementType)) {
ArrayType *StructPadding = ArrayType::get(
Type::getInt8Ty(M->getContext()), RequiredExtraBytes);
std::vector<Type *> PaddedStructElements;
PaddedStructElements.push_back(elementType);
PaddedStructElements.push_back(StructPadding);
const ArrayRef<Type *> NewStructElements(PaddedStructElements);
AllocType = StructType::get(M->getContext(), NewStructElements, true);
uint64_t NewStoreSize = Layout.getTypeStoreSize(AllocType);
assert(NewStoreSize == AlignedSize);
} else if (isa<StructType>(elementType)) {
StructType *OldStruct = dyn_cast<StructType>(elementType);
ArrayType *StructPadding = ArrayType::get(
Type::getInt8Ty(M->getContext()), RequiredExtraBytes);
std::vector<Type *> PaddedStructElements;
for (unsigned j = 0; j < OldStruct->getNumElements(); j++)
PaddedStructElements.push_back(OldStruct->getElementType(j));
PaddedStructElements.push_back(StructPadding);
const ArrayRef<Type *> NewStructElements(PaddedStructElements);
AllocType = StructType::get(OldStruct->getContext(), NewStructElements,
OldStruct->isPacked());
uint64_t NewStoreSize = Layout.getTypeStoreSize(AllocType);
assert(NewStoreSize == AlignedSize);
}
}
}
llvm::AllocaInst *Alloca = nullptr;
auto block_size_addr = M->getGlobalVariable("block_size");
auto block_size = builder.CreateLoad(block_size_addr);
Alloca = builder.CreateAlloca(AllocType, block_size, varName);
contextArrays[varName] = Alloca;
return Alloca;
}
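// Conceptually (a sketch): a per-thread value `T v` is replicated into
// `T v[block_size]`, and each definition/use is redirected to
//   v[thread_idx]  with  thread_idx = intra_warp_index + 32 * inter_warp_index
// so the value survives across iterations of the loops that add_warp_loop
// builds around each parallel region.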
// save the local variable into replicated array
llvm::Instruction *AddContextSave(llvm::Instruction *instruction,
llvm::Instruction *alloca,
bool intra_warp_loop) {
if (isa<AllocaInst>(instruction)) {
return NULL;
}
llvm::Module *M = instruction->getParent()->getParent()->getParent();
LLVMContext &context = M->getContext();
auto I32 = llvm::Type::getInt32Ty(context);
/* Save the produced variable to the array. */
BasicBlock::iterator definition =
(dyn_cast<Instruction>(instruction))->getIterator();
++definition;
IRBuilder<> builder(&*definition);
std::vector<llvm::Value *> gepArgs;
auto inter_warp_index =
builder.CreateLoad(M->getGlobalVariable("inter_warp_index"));
auto intra_warp_index =
builder.CreateLoad(M->getGlobalVariable("intra_warp_index"));
auto thread_idx = builder.CreateBinOp(
Instruction::Add, intra_warp_index,
builder.CreateBinOp(Instruction::Mul, inter_warp_index,
ConstantInt::get(I32, 32)),
"thread_idx");
gepArgs.push_back(thread_idx);
return builder.CreateStore(instruction, builder.CreateGEP(alloca, gepArgs));
}
llvm::Instruction *AddContextRestore(llvm::Value *val,
llvm::Instruction *alloca,
llvm::Instruction *before, bool isAlloca,
bool intra_warp_loop) {
assert(val != NULL);
assert(alloca != NULL);
IRBuilder<> builder(alloca);
if (before != NULL) {
builder.SetInsertPoint(before);
} else if (isa<Instruction>(val)) {
builder.SetInsertPoint(dyn_cast<Instruction>(val));
before = dyn_cast<Instruction>(val);
} else {
assert(false && "Unknown context restore location!");
}
std::vector<llvm::Value *> gepArgs;
auto M = before->getParent()->getParent()->getParent();
auto I32 = llvm::Type::getInt32Ty(M->getContext());
auto inter_warp_index =
builder.CreateLoad(M->getGlobalVariable("inter_warp_index"));
auto intra_warp_index =
builder.CreateLoad(M->getGlobalVariable("intra_warp_index"));
auto thread_idx = builder.CreateBinOp(
Instruction::Add, intra_warp_index,
builder.CreateBinOp(Instruction::Mul, inter_warp_index,
ConstantInt::get(I32, 32)),
"thread_idx");
gepArgs.push_back(thread_idx);
llvm::Instruction *gep =
dyn_cast<Instruction>(builder.CreateGEP(alloca, gepArgs));
if (isAlloca) {
return gep;
}
return builder.CreateLoad(gep);
}
void AddContextSaveRestore(llvm::Instruction *instruction,
bool intra_warp_loop) {
/* Allocate the context data array for the variable. */
llvm::Instruction *alloca = GetContextArray(instruction, intra_warp_loop);
llvm::Instruction *theStore =
AddContextSave(instruction, alloca, intra_warp_loop);
std::vector<Instruction *> uses;
for (Instruction::use_iterator ui = instruction->use_begin(),
ue = instruction->use_end();
ui != ue; ++ui) {
llvm::Instruction *user = cast<Instruction>(ui->getUser());
if (user == NULL)
continue;
if (user == theStore)
continue;
uses.push_back(user);
}
for (auto user : uses) {
Instruction *contextRestoreLocation = user;
llvm::Value *loadedValue =
AddContextRestore(user, alloca, contextRestoreLocation,
isa<AllocaInst>(instruction), intra_warp_loop);
user->replaceUsesOfWith(instruction, loadedValue);
}
}
void handle_alloc(llvm::Function *F) {
auto M = F->getParent();
LLVMContext &C = M->getContext();
auto I32 = llvm::Type::getInt32Ty(C);
std::vector<llvm::Instruction *> instruction_to_fix;
for (auto bb = F->begin(); bb != F->end(); bb++) {
for (auto ii = bb->begin(); ii != bb->end(); ii++) {
if (llvm::AllocaInst *i = dyn_cast<AllocaInst>(ii)) {
instruction_to_fix.push_back(i);
}
}
}
std::vector<llvm::Instruction *> need_remove;
for (auto inst : instruction_to_fix) {
// generate a new alloc
auto block_size_addr = M->getGlobalVariable("block_size");
IRBuilder<> builder(inst);
auto block_size = builder.CreateLoad(block_size_addr);
llvm::Type *elementType = NULL;
if (dyn_cast<AllocaInst>(inst)->getType()->getElementType()) {
elementType = dyn_cast<AllocaInst>(inst)->getType()->getElementType();
}
assert(elementType != NULL);
auto Alloca = builder.CreateAlloca(elementType, block_size,
inst->getName().str() + "inter_warp");
// replace all usage
std::set<Instruction *> replace_user;
for (Instruction::use_iterator ui = inst->use_begin(), ue = inst->use_end();
ui != ue; ++ui) {
replace_user.insert(dyn_cast<Instruction>(ui->getUser()));
}
for (auto user : replace_user) {
IRBuilder<> builder(user);
// std::vector<llvm::Value *> gepArgs;
auto inter_warp_index =
builder.CreateLoad(M->getGlobalVariable("inter_warp_index"));
auto intra_warp_index =
builder.CreateLoad(M->getGlobalVariable("intra_warp_index"));
auto thread_idx = builder.CreateBinOp(
Instruction::Add, intra_warp_index,
builder.CreateBinOp(Instruction::Mul, inter_warp_index,
ConstantInt::get(I32, 32)),
"thread_idx");
auto gep = builder.CreateGEP(Alloca, thread_idx);
user->replaceUsesOfWith(inst, gep);
}
need_remove.push_back(inst);
}
for (auto inst : need_remove) {
inst->dropAllReferences();
inst->eraseFromParent();
}
}
void handle_local_variable_intra_warp(std::vector<ParallelRegion> PRs) {
bool intra_warp_loop = 1;
// we should handle allocation generated by PHI
{
std::vector<llvm::Instruction *> instruction_to_fix;
auto F = PRs[0].start_block->getParent();
for (auto bb = F->begin(); bb != F->end(); bb++) {
for (auto ii = bb->begin(); ii != bb->end(); ii++) {
if (isa<AllocaInst>(&(*ii)))
instruction_to_fix.push_back(&(*ii));
}
}
// fix the collected allocas once, outside the block loop
for (auto inst : instruction_to_fix) {
AddContextSaveRestore(inst, intra_warp_loop);
}
}
for (auto parallel_regions : PRs) {
std::set<llvm::Instruction *> instruction_in_region;
std::vector<llvm::Instruction *> instruction_to_fix;
for (auto bb : parallel_regions.wrapped_block) {
for (llvm::BasicBlock::iterator instr = bb->begin(); instr != bb->end();
++instr) {
llvm::Instruction *instruction = &*instr;
instruction_in_region.insert(instruction);
}
}
/* Find all the instructions that define new values and
check if they need to be context saved. */
for (auto bb : parallel_regions.wrapped_block) {
for (llvm::BasicBlock::iterator instr = bb->begin(); instr != bb->end();
++instr) {
llvm::Instruction *instruction = &*instr;
if (ShouldNotBeContextSaved(instruction))
continue;
for (Instruction::use_iterator ui = instruction->use_begin(),
ue = instruction->use_end();
ui != ue; ++ui) {
llvm::Instruction *user = dyn_cast<Instruction>(ui->getUser());
if (user == NULL)
continue;
if (isa<AllocaInst>(instruction) ||
(instruction_in_region.find(user) ==
instruction_in_region.end())) {
instruction_to_fix.push_back(instruction);
break;
}
}
}
}
for (auto inst : instruction_to_fix) {
AddContextSaveRestore(inst, intra_warp_loop);
}
}
}
BasicBlock *insert_loop_init(llvm::BasicBlock *InsertInitBefore,
bool intra_warp_loop) {
llvm::Module *M = InsertInitBefore->getParent()->getParent();
LLVMContext &context = M->getContext();
auto I32 = llvm::Type::getInt32Ty(context);
std::string block_name =
(intra_warp_loop) ? "intra_warp_init" : "inter_warp_init";
BasicBlock *loop_init = BasicBlock::Create(
context, block_name, InsertInitBefore->getParent(), InsertInitBefore);
IRBuilder<> builder(context);
builder.SetInsertPoint(loop_init);
if (intra_warp_loop) { // intra warp
auto intra_warp_index = M->getGlobalVariable("intra_warp_index");
builder.CreateStore(ConstantInt::get(I32, 0), intra_warp_index);
} else { // inter warp
auto inter_warp_index = M->getGlobalVariable("inter_warp_index");
builder.CreateStore(ConstantInt::get(I32, 0), inter_warp_index);
}
builder.CreateBr(InsertInitBefore);
return loop_init;
}
BasicBlock *insert_loop_cond(llvm::BasicBlock *InsertCondBefore,
llvm::BasicBlock *LoopEnd, bool intra_warp_loop) {
llvm::Module *M = InsertCondBefore->getParent()->getParent();
LLVMContext &context = M->getContext();
auto I32 = llvm::Type::getInt32Ty(context);
std::string block_name =
(intra_warp_loop) ? "intra_warp_cond" : "inter_warp_cond";
BasicBlock *loop_cond = BasicBlock::Create(
context, block_name, InsertCondBefore->getParent(), InsertCondBefore);
IRBuilder<> builder(context);
builder.SetInsertPoint(loop_cond);
llvm::Value *cmpResult = NULL;
if (!intra_warp_loop) {
auto inter_warp_index = M->getGlobalVariable("inter_warp_index");
auto block_size = M->getGlobalVariable("block_size");
auto warp_cnt =
builder.CreateBinOp(Instruction::SDiv, builder.CreateLoad(block_size),
ConstantInt::get(I32, 32), "warp_number");
cmpResult =
builder.CreateICmpULT(builder.CreateLoad(inter_warp_index), warp_cnt);
} else {
auto intra_warp_index = M->getGlobalVariable("intra_warp_index");
auto block_size = M->getGlobalVariable("block_size");
if (!need_nested_loop) {
cmpResult = builder.CreateICmpULT(builder.CreateLoad(intra_warp_index),
builder.CreateLoad(block_size));
} else {
cmpResult = builder.CreateICmpULT(builder.CreateLoad(intra_warp_index),
ConstantInt::get(I32, 32));
}
}
builder.CreateCondBr(cmpResult, InsertCondBefore, LoopEnd);
return loop_cond;
}
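// Increment the current loop index by one, then jump back to the condition.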
BasicBlock *insert_loop_inc(llvm::BasicBlock *InsertIncBefore,
bool intra_warp_loop) {
llvm::Module *M = InsertIncBefore->getParent()->getParent();
LLVMContext &context = M->getContext();
auto I32 = llvm::Type::getInt32Ty(context);
std::string block_name =
(intra_warp_loop) ? "intra_warp_inc" : "inter_warp_inc";
BasicBlock *loop_inc = BasicBlock::Create(
context, block_name, InsertIncBefore->getParent(), InsertIncBefore);
IRBuilder<> builder(context);
builder.SetInsertPoint(loop_inc);
if (intra_warp_loop) { // intra warp
auto intra_warp_index = M->getGlobalVariable("intra_warp_index");
auto new_index = builder.CreateBinOp(
Instruction::Add, builder.CreateLoad(intra_warp_index),
ConstantInt::get(I32, 1), "intra_warp_index_increment");
builder.CreateStore(new_index, intra_warp_index);
} else { // inter warp
auto inter_warp_index = M->getGlobalVariable("inter_warp_index");
auto new_index = builder.CreateBinOp(
Instruction::Add, builder.CreateLoad(inter_warp_index),
ConstantInt::get(I32, 1), "inter_warp_index_increment");
builder.CreateStore(new_index, inter_warp_index);
}
builder.CreateBr(InsertIncBefore);
return loop_inc;
}
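// Wrap every parallel region with the loop skeleton built above, and tag the
// resulting loop as parallel (llvm.loop.parallel_accesses on the branch plus
// llvm.mem.parallel_loop_access on each memory access) so that later passes
// may vectorize it.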
void add_warp_loop(std::vector<ParallelRegion> parallel_regions,
bool intra_warp_loop) {
for (auto region : parallel_regions) {
auto start_block = region.start_block;
auto tail_block = region.end_block;
auto next_block = region.successor_block;
auto loop_cond = insert_loop_cond(start_block, next_block, intra_warp_loop);
auto loop_init = insert_loop_init(loop_cond, intra_warp_loop);
auto F = start_block->getParent();
for (Function::iterator i = F->begin(); i != F->end(); ++i) {
llvm::BasicBlock *bb = &(*i);
if (bb == loop_cond)
continue;
bb->getTerminator()->replaceUsesOfWith(start_block, loop_init);
}
auto loop_inc = insert_loop_inc(loop_cond, intra_warp_loop);
tail_block->getTerminator()->replaceUsesOfWith(next_block, loop_inc);
    // we have to reset the inter/intra warp index to 0, as it may be used
    // outside the PR when there are conditional loops/branches
llvm::Module *M = start_block->getParent()->getParent();
LLVMContext &context = M->getContext();
auto I32 = llvm::Type::getInt32Ty(context);
BasicBlock *reset_index = BasicBlock::Create(start_block->getContext(),
"reset_block", F, next_block);
IRBuilder<> builder(start_block->getContext());
builder.SetInsertPoint(reset_index);
if (intra_warp_loop) { // intra warp
auto intra_warp_index = M->getGlobalVariable("intra_warp_index");
builder.CreateStore(ConstantInt::get(I32, 0), intra_warp_index);
} else { // inter warp
auto inter_warp_index = M->getGlobalVariable("inter_warp_index");
builder.CreateStore(ConstantInt::get(I32, 0), inter_warp_index);
}
builder.CreateBr(next_block);
loop_cond->getTerminator()->replaceUsesOfWith(next_block, reset_index);
// add metadata
MDNode *Dummy =
MDNode::getTemporary(context, ArrayRef<Metadata *>()).release();
MDNode *AccessGroupMD = MDNode::getDistinct(context, {});
MDNode *ParallelAccessMD = MDNode::get(
context,
{MDString::get(context, "llvm.loop.parallel_accesses"), AccessGroupMD});
MDNode *Root = MDNode::get(context, {Dummy, ParallelAccessMD});
Root->replaceOperandWith(0, Root);
MDNode::deleteTemporary(Dummy);
// We now have
// !1 = metadata !{metadata !1} <- self-referential root
loop_cond->getTerminator()->setMetadata("llvm.loop", Root);
for (auto bb : region.wrapped_block) {
for (BasicBlock::iterator ii = bb->begin(), ee = bb->end(); ii != ee;
ii++) {
if (!ii->mayReadOrWriteMemory()) {
continue;
}
MDNode *NewMD = MDNode::get(bb->getContext(), AccessGroupMD);
MDNode *OldMD = ii->getMetadata("llvm.mem.parallel_loop_access");
if (OldMD != nullptr) {
NewMD = llvm::MDNode::concatenate(OldMD, NewMD);
}
ii->setMetadata("llvm.mem.parallel_loop_access", NewMD);
}
}
}
}
void print_parallel_region(std::vector<ParallelRegion> parallel_regions) {
printf("get PR:\n");
for (auto region : parallel_regions) {
auto start = region.start_block;
auto end = region.end_block;
auto next = region.successor_block;
printf("parallel region: %s->%s next: %s\n", start->getName().str().c_str(),
end->getName().str().c_str(), next->getName().str().c_str());
printf("have: \n");
for (auto b : region.wrapped_block) {
printf("%s\n", b->getName().str().c_str());
}
}
}
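// Once the thread loops are materialized the barrier intrinsics are no
// longer needed: warp barriers are always removed, while block barriers are
// only removed in the final (inter-warp) pass.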
void remove_barrier(llvm::Function *F, bool intra_warp_loop) {
std::vector<Instruction *> need_remove;
for (auto BB = F->begin(); BB != F->end(); ++BB) {
for (auto BI = BB->begin(); BI != BB->end(); BI++) {
      if (auto Call = dyn_cast<CallInst>(BI)) {
        if (!Call->getCalledFunction())
          continue; // skip indirect calls and inline asm
        auto func_name = Call->getCalledFunction()->getName().str();
if (func_name == "llvm.nvvm.bar.warp.sync") {
need_remove.push_back(Call);
}
if (!intra_warp_loop && (func_name == "llvm.nvvm.barrier0" ||
func_name == "llvm.nvvm.barrier.sync")) {
need_remove.push_back(Call);
}
}
}
}
for (auto inst : need_remove) {
inst->eraseFromParent();
}
}
class InsertWarpLoopPass : public llvm::FunctionPass {
public:
static char ID;
bool intra_warp_loop;
DominatorTree *DT;
PostDominatorTree *PDT;
InsertWarpLoopPass(bool intra_warp = 0)
: FunctionPass(ID), intra_warp_loop(intra_warp) {}
virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const {
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<PostDominatorTreeWrapperPass>();
}
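  // Discover the parallel region that ends right before block B: walk the
  // predecessor edges backwards until another barrier block (the region
  // entry) is reached; the blocks collected in between form the region body.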
void getParallelRegionBefore(llvm::BasicBlock *B, bool intra_warp_loop,
std::vector<ParallelRegion> &parallel_regions) {
ParallelRegion current_region;
SmallVector<BasicBlock *, 4> pending_blocks;
BasicBlock *region_entry_barrier = NULL;
BasicBlock *entry = NULL;
BasicBlock *exit = B->getSinglePredecessor();
for (BasicBlock *Pred : predecessors(B)) {
pending_blocks.push_back(Pred);
}
if (pending_blocks.size() > 1) {
      // because we have already inserted syncs and split blocks around them,
      // if B has several incoming edges it must be the merge point of a
      // conditional branch, so we can safely ignore it
      // TODO: we still have to check whether this conditional branch
      // is inter-warp or intra-warp
return;
}
while (!pending_blocks.empty()) {
BasicBlock *current = pending_blocks.back();
pending_blocks.pop_back();
// avoid infinite recursion of loops
if (current_region.wrapped_block.count(current) != 0) {
continue;
}
// If we reach another barrier this must be the
// parallel region entry.
bool has_barrier = 0;
for (auto i = current->begin(), e = current->end(); i != e; ++i) {
        if (llvm::CallInst *call_inst = llvm::dyn_cast<llvm::CallInst>(&(*i))) {
          if (!call_inst->getCalledFunction())
            continue; // skip indirect calls and inline asm
          auto func_name = call_inst->getCalledFunction()->getName().str();
if (func_name == "llvm.nvvm.barrier0" ||
func_name == "llvm.nvvm.barrier.sync")
has_barrier = 1;
if (func_name == "llvm.nvvm.bar.warp.sync" && intra_warp_loop)
has_barrier = 1;
}
}
      // if we reach a block which only has a single conditional branch,
      // it is the start point of a B-condition, so we have to stop here
bool is_single_conditional_branch_block = 0;
if (auto br = dyn_cast<llvm::BranchInst>(current->getTerminator())) {
if (br->isConditional()) {
if (current->size() == 1) {
is_single_conditional_branch_block = 1;
} else {
            // generated by replicating local variables
            printf(
                "[WARNING] matching single conditional branch by block name\n");
bool branch_to_intra_init = false;
for (unsigned suc = 0; suc < br->getNumSuccessors(); ++suc) {
llvm::BasicBlock *entryCandidate = br->getSuccessor(suc);
auto block_name = entryCandidate->getName().str();
if (find_block_barrier_in_region(current, B)) {
if (block_name.find("warp_init") != block_name.npos) {
is_single_conditional_branch_block = 1;
break;
}
}
}
}
}
}
if (has_barrier || is_single_conditional_branch_block) {
if (region_entry_barrier == NULL)
region_entry_barrier = current;
else if (region_entry_barrier != current) {
        // this means there is no PR before B, just return
return;
}
continue;
}
// Non-barrier block, this must be on the region.
current_region.wrapped_block.insert(current);
// Add predecessors to pending queue.
for (BasicBlock *Pred : predecessors(current)) {
pending_blocks.push_back(Pred);
}
}
if (current_region.wrapped_block.empty()) {
return;
}
    // if we do not find an entry node, this means all predecessor
    // blocks do not need to execute multiple times
if (region_entry_barrier == NULL) {
return;
}
// Find the entry node.
assert(region_entry_barrier != NULL);
for (unsigned
suc = 0,
num = region_entry_barrier->getTerminator()->getNumSuccessors();
suc < num; ++suc) {
llvm::BasicBlock *entryCandidate =
region_entry_barrier->getTerminator()->getSuccessor(suc);
if (current_region.wrapped_block.count(entryCandidate) == 0)
continue;
entry = entryCandidate;
break;
}
    // delete useless PRs: those which only contain a branch
if (entry == exit) {
if (entry->size() == 1 && isa<llvm::BranchInst>(entry->begin())) {
return;
}
}
bool is_useless = true;
auto iter = entry;
do {
      if (iter->size() != 1 || !isa<llvm::BranchInst>(iter->begin())) {
is_useless = false;
break;
}
if (iter->getTerminator()->getNumSuccessors() > 1) {
is_useless = false;
break;
}
iter = iter->getTerminator()->getSuccessor(0);
} while (iter != exit);
if (is_useless) {
return;
}
assert(current_region.wrapped_block.count(entry) != 0);
current_region.start_block = entry;
current_region.end_block = exit;
current_region.successor_block = B;
parallel_regions.push_back(current_region);
}
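  // Every block that starts with a barrier call is treated as a region
  // exit; a parallel region is then grown backwards from each such exit.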
std::vector<ParallelRegion> getParallelRegions(llvm::Function *F,
bool intra_warp_loop) {
std::vector<ParallelRegion> parallel_regions;
SmallVector<BasicBlock *, 4> exit_blocks;
for (Function::iterator s = F->begin(); s != F->end(); s++) {
      if (llvm::CallInst *call_inst =
              llvm::dyn_cast<llvm::CallInst>(&(*s->begin()))) {
        if (!call_inst->getCalledFunction())
          continue; // skip indirect calls and inline asm
        auto func_name = call_inst->getCalledFunction()->getName().str();
if (func_name == "llvm.nvvm.barrier0" ||
func_name == "llvm.nvvm.barrier.sync") {
exit_blocks.push_back(&(*s));
}
// when handling intra warp loop, we need also split the blocks
// between warp barrier
if (intra_warp_loop && func_name == "llvm.nvvm.bar.warp.sync") {
exit_blocks.push_back(&(*s));
}
}
}
// First find all the ParallelRegions in the Function.
while (!exit_blocks.empty()) {
BasicBlock *exit = exit_blocks.back();
exit_blocks.pop_back();
getParallelRegionBefore(exit, intra_warp_loop, parallel_regions);
}
return parallel_regions;
}
virtual bool runOnFunction(Function &F) {
if (!isKernelFunction(F.getParent(), &F))
return 0;
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
PDT = &getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
// find parallel region we need to wrap
auto parallel_regions = getParallelRegions(&F, intra_warp_loop);
    assert(!parallel_regions.empty() && "cannot find any parallel regions\n");
// print_parallel_region(parallel_regions);
add_warp_loop(parallel_regions, intra_warp_loop);
if (intra_warp_loop) {
handle_local_variable_intra_warp(parallel_regions);
}
remove_barrier(&F, intra_warp_loop);
return 1;
}
};
char InsertWarpLoopPass::ID = 0;
namespace {
static RegisterPass<InsertWarpLoopPass> X("insert-warp-loop",
"Insert inter/intra warp loop");
} // namespace
bool has_warp_barrier(llvm::Module *M) {
for (auto F = M->begin(); F != M->end(); ++F)
for (auto BB = F->begin(); BB != F->end(); ++BB) {
for (auto BI = BB->begin(); BI != BB->end(); BI++) {
        if (auto Call = dyn_cast<CallInst>(BI)) {
          if (!Call->getCalledFunction())
            continue; // skip indirect calls and inline asm
          auto func_name = Call->getCalledFunction()->getName().str();
if (func_name == "llvm.nvvm.bar.warp.sync") {
return true;
}
}
}
}
return false;
}
void insert_warp_loop(llvm::Module *M) {
llvm::legacy::PassManager Passes;
need_nested_loop = has_warp_barrier(M);
  // use nested loops only when there are warp-level barriers
if (need_nested_loop) {
bool intra_warp = true;
Passes.add(new InsertWarpLoopPass(intra_warp));
// insert inter warp loop
Passes.add(new InsertWarpLoopPass(!intra_warp));
Passes.run(*M);
} else {
bool intra_warp = true;
// only need a single loop, with size=block_size
Passes.add(new InsertWarpLoopPass(intra_warp));
Passes.run(*M);
// remove all barriers
for (auto F = M->begin(); F != M->end(); ++F)
remove_barrier(dyn_cast<llvm::Function>(F), false);
}
}


@ -0,0 +1,126 @@
#include "memory_hierarchy.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <assert.h>
#include <map>
#include <set>
#include <sstream>
#include <tuple>
#include <vector>
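// Rewrite CUDA __shared__ variables (addrspace(3) globals) as ordinary
// thread-local globals. A rough sketch of the rewrite:
//   @sdata = addrspace(3) global [64 x i32]
// becomes
//   @wrapper_global_sdata = thread_local global [64 x i32] zeroinitializer
// and every GEP/cast that touched the old variable is rebuilt against the
// new one, with an addrspace cast to preserve the original pointer type.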
void mem_share2global(llvm::Module *M) {
LLVMContext *C = &M->getContext();
std::map<GlobalVariable *, GlobalVariable *> corresponding_global_memory;
std::set<llvm::Instruction *> need_remove;
std::set<GlobalVariable *> need_remove_share_memory;
  // find all shared memory variables and generate corresponding globals
for (auto I = M->global_begin(), E = M->global_end(); I != E; ++I) {
if (GlobalVariable *share_memory = dyn_cast<GlobalVariable>(I)) {
if (auto PT = dyn_cast<PointerType>(I->getType())) {
unsigned AS = PT->getAddressSpace();
        if (AS == 3) { // found a shared-memory variable
need_remove_share_memory.insert(share_memory);
// generate the corresponding global memory variable
auto new_name = "wrapper_global_" + share_memory->getName().str();
auto element_type = PT->getElementType();
if (auto array_type = dyn_cast<ArrayType>(element_type)) {
llvm::GlobalVariable *global_memory = new llvm::GlobalVariable(
*M, array_type, false, llvm::GlobalValue::ExternalLinkage, NULL,
new_name, NULL, llvm::GlobalValue::GeneralDynamicTLSModel, 1);
ConstantAggregateZero *const_array =
ConstantAggregateZero::get(array_type);
global_memory->setInitializer(const_array);
corresponding_global_memory.insert(
std::pair<GlobalVariable *, GlobalVariable *>(share_memory,
global_memory));
} else if (auto int_type = dyn_cast<IntegerType>(element_type)) {
auto zero = llvm::ConstantInt::get(int_type, 0, true);
llvm::GlobalVariable *global_memory = new llvm::GlobalVariable(
*M, int_type, false, llvm::GlobalValue::ExternalLinkage, zero,
new_name, NULL, llvm::GlobalValue::GeneralDynamicTLSModel, 0,
false);
corresponding_global_memory.insert(
std::pair<GlobalVariable *, GlobalVariable *>(share_memory,
global_memory));
} else {
            assert(0 && "The required shared memory type is not supported\n");
}
}
}
}
}
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
Function *F = &(*i);
for (Function::iterator i = F->begin(), e = F->end(); i != e; ++i) {
BasicBlock *b = &*i;
for (BasicBlock::iterator i = b->begin(), e = b->end(); i != e; ++i) {
if (auto get_element_ptr = dyn_cast<llvm::GetElementPtrInst>(i)) {
auto read_array = get_element_ptr->getPointerOperand();
if (GlobalVariable *read_share_memory =
dyn_cast<llvm::GlobalVariable>(read_array)) {
            // found a GetElementPtr which reads shared memory
if (corresponding_global_memory.find(read_share_memory) !=
corresponding_global_memory.end()) {
std::vector<Value *> Indices;
              for (unsigned idx = 0; idx < get_element_ptr->getNumIndices();
                   idx++)
                Indices.push_back(get_element_ptr->getOperand(idx + 1));
auto new_GEP = GetElementPtrInst::Create(
NULL, // Pointee type
corresponding_global_memory.find(read_share_memory)
->second, // Alloca
Indices, // Indices
"", get_element_ptr);
              // replace all uses of get_element_ptr with new_GEP;
              // we cannot directly call
              //   get_element_ptr->replaceAllUsesWith(new_GEP);
              // as get_element_ptr and new_GEP have different return types
llvm::Type *original_type = get_element_ptr->getType();
auto FormatASC = CastInst::CreatePointerBitCastOrAddrSpaceCast(
new_GEP, original_type, "", get_element_ptr);
get_element_ptr->replaceAllUsesWith(FormatASC);
need_remove.insert(get_element_ptr);
}
}
} else if (auto addr_cast = dyn_cast<llvm::CastInst>(i)) {
auto read_array = addr_cast->getOperand(0);
if (GlobalVariable *read_share_memory =
dyn_cast<llvm::GlobalVariable>(read_array)) {
            // found a cast which reads shared memory
if (corresponding_global_memory.find(read_share_memory) !=
corresponding_global_memory.end()) {
llvm::Type *original_type = addr_cast->getType();
auto FormatASC = CastInst::CreatePointerBitCastOrAddrSpaceCast(
corresponding_global_memory.find(read_share_memory)->second,
original_type, "", addr_cast);
addr_cast->replaceAllUsesWith(FormatASC);
need_remove.insert(addr_cast);
}
}
}
}
}
}
for (auto i : need_remove) {
i->dropAllReferences();
i->eraseFromParent();
}
for (auto i : need_remove_share_memory) {
i->dropAllReferences();
i->eraseFromParent();
}
}


@ -0,0 +1,88 @@
#include "performance.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ValueSymbolTable.h"
#include "llvm/InitializePasses.h"
#include "llvm/PassInfo.h"
#include "llvm/PassRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <map>
#include <set>
#include <sstream>
#include <tuple>
#include <vector>
using namespace llvm;
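// Mark every pointer argument noalias, then run an O3 pipeline (with loop
// and SLP vectorization enabled) against an x86-64 target machine so that
// the serialized thread loops can be vectorized.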
void performance_optimization(llvm::Module *M) {
for (auto F = M->begin(); F != M->end(); F++) {
for (auto I = F->arg_begin(); I != F->arg_end(); ++I) {
if (I->getType()->isPointerTy()) {
I->addAttr(llvm::Attribute::NoAlias);
}
}
}
llvm::legacy::PassManager Passes;
// add target machine info
llvm::Triple triple("x86_64-unknown-linux-gnu");
std::string Error;
const Target *TheTarget = TargetRegistry::lookupTarget("", triple, Error);
if (!TheTarget) {
printf("Error: %s\n", Error.c_str());
assert(0);
}
llvm::TargetOptions Options;
Options.FloatABIType = FloatABI::Hard;
TargetMachine *TM = TheTarget->createTargetMachine(
triple.getTriple(), llvm::sys::getHostCPUName().str(), StringRef("+m,+f"),
Options, Reloc::PIC_, CodeModel::Small, CodeGenOpt::Aggressive);
assert(TM && "No Machine Information\n");
Passes.add(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis()));
TargetLibraryInfoImpl TLII(triple);
TLII.disableAllFunctions();
Passes.add(new TargetLibraryInfoWrapperPass(TLII));
// Add O3 optimization
llvm::PassManagerBuilder Builder;
Builder.OptLevel = 3;
Builder.SizeLevel = 0;
Builder.LoopVectorize = true;
Builder.SLPVectorize = true;
Builder.VerifyInput = true;
Builder.VerifyOutput = true;
Builder.populateModulePassManager(Passes);
Passes.run(*M);
}


@ -0,0 +1,480 @@
#include "tool.h"
#include "llvm/Bitcode/BitcodeWriter.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/Support/ToolOutputFile.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <iostream>
#include <set>
using namespace llvm;
llvm::Module *LoadModuleFromFilr(char *file_name) {
llvm::SMDiagnostic Err;
llvm::LLVMContext *globalContext = new llvm::LLVMContext;
auto program = parseIRFile(file_name, Err, *globalContext).release();
if (!program) {
printf("error when opening the bitcode\n");
exit(1);
}
return program;
}
void VerifyModule(llvm::Module *program) {
  std::string msg;
  llvm::raw_string_ostream os(msg);
  // capture the verifier output so the fatal error carries the details
  if (llvm::verifyModule(*program, &os))
    llvm::report_fatal_error(os.str().c_str());
}
void DumpModule(llvm::Module *M, char *file_name) {
  // write the transformed module out as a bitcode file
  std::error_code EC;
  ToolOutputFile Out(file_name, EC, sys::fs::F_None);
if (EC) {
errs() << "Fails to open output file: " << EC.message();
return;
}
WriteBitcodeToFile(*M, Out.os());
Out.keep();
}
bool isKernelFunction(llvm::Module *M, llvm::Function *F) {
NamedMDNode *NamedMD = M->getNamedMetadata("nvvm.annotations");
if (!NamedMD) {
printf("there must be nvvm.annotations!\n");
exit(1);
}
for (unsigned I = 0, E = NamedMD->getNumOperands(); I != E; ++I) {
MDNode *MD = NamedMD->getOperand(I);
if (!MD || MD->getNumOperands() == 0)
continue;
if (MD->getNumOperands() != 3)
continue;
Metadata *Op = MD->getOperand(1);
    if (auto Str = llvm::dyn_cast<MDString>(Op)) {
if (Str->getString().str() != "kernel")
continue;
llvm::Value *meta =
dyn_cast<llvm::ValueAsMetadata>(MD->getOperand(0))->getValue();
Function *FF = llvm::cast<Function>(meta);
if (FF->getName().str() == F->getName().str())
return true;
}
}
return false;
}
void replace_block(llvm::Function *F, llvm::BasicBlock *before,
llvm::BasicBlock *after) {
for (Function::iterator i = F->begin(); i != F->end(); ++i) {
llvm::BasicBlock *bb = &(*i);
if (bb == after)
continue;
bb->getTerminator()->replaceUsesOfWith(before, after);
}
}
llvm::CallInst *CreateInterWarpBarrier(llvm::Instruction *InsertBefore) {
llvm::Module *M = InsertBefore->getParent()->getParent()->getParent();
llvm::FunctionType *LauncherFuncT =
FunctionType::get(llvm::Type::getVoidTy(M->getContext()), {}, false);
llvm::FunctionCallee f =
M->getOrInsertFunction("llvm.nvvm.barrier0", LauncherFuncT);
llvm::Function *F = llvm::cast<llvm::Function>(f.getCallee());
return llvm::CallInst::Create(F, "", InsertBefore);
}
llvm::CallInst *CreateIntraWarpBarrier(llvm::Instruction *InsertBefore) {
llvm::Module *M = InsertBefore->getParent()->getParent()->getParent();
llvm::FunctionType *LauncherFuncT =
FunctionType::get(llvm::Type::getVoidTy(M->getContext()), {}, false);
llvm::FunctionCallee f =
M->getOrInsertFunction("llvm.nvvm.bar.warp.sync", LauncherFuncT);
llvm::Function *F = llvm::cast<llvm::Function>(f.getCallee());
return llvm::CallInst::Create(F, "", InsertBefore);
}
llvm::Instruction *BreakPHIToAllocas(PHINode *phi) {
std::string allocaName = std::string(phi->getName().str()) + ".ex_phi";
llvm::Function *function = phi->getParent()->getParent();
IRBuilder<> builder(&*(function->getEntryBlock().getFirstInsertionPt()));
llvm::Instruction *alloca =
builder.CreateAlloca(phi->getType(), 0, allocaName);
for (unsigned incoming = 0; incoming < phi->getNumIncomingValues();
++incoming) {
Value *val = phi->getIncomingValue(incoming);
BasicBlock *incomingBB = phi->getIncomingBlock(incoming);
builder.SetInsertPoint(incomingBB->getTerminator());
    builder.CreateStore(val, alloca);
}
builder.SetInsertPoint(phi);
llvm::Instruction *loadedValue = builder.CreateLoad(alloca);
phi->replaceAllUsesWith(loadedValue);
phi->eraseFromParent();
return loadedValue;
}
void phi2alloc(llvm::Module *M) {
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
Function *F = &(*i);
auto func_name = F->getName().str();
if (!isKernelFunction(M, F))
continue;
typedef std::vector<llvm::Instruction *> InstructionVec;
InstructionVec PHIs;
for (Function::iterator bb = F->begin(); bb != F->end(); ++bb) {
for (BasicBlock::iterator p = bb->begin(); p != bb->end(); ++p) {
Instruction *instr = &*p;
if (isa<PHINode>(instr)) {
PHIs.push_back(instr);
}
}
}
for (InstructionVec::iterator i = PHIs.begin(); i != PHIs.end(); ++i) {
Instruction *instr = *i;
BreakPHIToAllocas(dyn_cast<PHINode>(instr));
}
}
}
void remove_cuda_built_in(llvm::Module *M) {
  // CUDA runtime stubs that become useless after kernel translation
std::set<std::string> useless_func_name;
useless_func_name.insert("cudaMalloc");
useless_func_name.insert("cudaFuncGetAttributes");
useless_func_name.insert("cudaGetDevice");
useless_func_name.insert("cudaDeviceGetAttribute");
useless_func_name.insert("cudaOccupancyMaxActiveBlocksPerMultiprocessor");
useless_func_name.insert(
"cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags");
std::set<llvm::Function *> need_remove;
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
Function *F = &(*i);
auto func_name = F->getName().str();
if (useless_func_name.find(func_name) != useless_func_name.end()) {
need_remove.insert(F);
}
}
for (auto F : need_remove) {
F->dropAllReferences();
F->eraseFromParent();
}
}
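// Lower the PTX special-register intrinsics to plain arithmetic on the CPU
// loop indices. The flattened thread id is
//   thread_idx = intra_warp_index + 32 * inter_warp_index
// from which tid.x = thread_idx % blockDim.x and
// tid.y = thread_idx / blockDim.x are derived below.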
void replace_built_in_function(llvm::Module *M, int *grid_dim, int *block_dim) {
LLVMContext &context = M->getContext();
auto I32 = llvm::Type::getInt32Ty(context);
std::vector<llvm::Instruction *> need_remove;
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
Function *F = &(*i);
auto func_name = F->getName().str();
if (!isKernelFunction(M, F))
continue;
IRBuilder<> builder(&*(F->getEntryBlock().getFirstInsertionPt()));
auto global_intra_warp_idx =
F->getParent()->getGlobalVariable("intra_warp_index");
auto local_intra_warp_idx =
builder.CreateAlloca(global_intra_warp_idx->getType()->getElementType(),
0, "local_intra_warp_idx");
global_intra_warp_idx->replaceAllUsesWith(local_intra_warp_idx);
auto global_inter_warp_idx =
F->getParent()->getGlobalVariable("inter_warp_index");
auto local_inter_warp_idx =
builder.CreateAlloca(global_inter_warp_idx->getType()->getElementType(),
0, "local_inter_warp_idx");
global_inter_warp_idx->replaceAllUsesWith(local_inter_warp_idx);
for (auto BB = F->begin(); BB != F->end(); ++BB) {
for (auto BI = BB->begin(); BI != BB->end(); BI++) {
if (auto Load = dyn_cast<LoadInst>(BI)) {
auto load_from = Load->getOperand(0);
if (load_from == F->getParent()->getGlobalVariable("block_size")) {
Load->replaceAllUsesWith(ConstantInt::get(
I32, block_dim[0] * block_dim[1] * block_dim[2]));
need_remove.push_back(Load);
}
} else if (auto Call = dyn_cast<CallInst>(BI)) {
if (Call->getCalledFunction()) {
auto func_name = Call->getCalledFunction()->getName().str();
if (func_name == "llvm.nvvm.read.ptx.sreg.tid.x") {
            // replace tid.x with intra_warp_index + 32 * inter_warp_index
IRBuilder<> builder(context);
builder.SetInsertPoint(Call);
auto thread_idx = builder.CreateBinOp(
Instruction::Mul, builder.CreateLoad(local_inter_warp_idx),
ConstantInt::get(I32, 32), "");
thread_idx = builder.CreateBinOp(
Instruction::Add, builder.CreateLoad(local_intra_warp_idx),
thread_idx, "thread_idx");
if (block_dim[1] != 1 || block_dim[2] != 1) {
printf("block y: %d block z: %d\n", block_dim[1], block_dim[2]);
thread_idx = builder.CreateBinOp(
Instruction::SRem, thread_idx,
ConstantInt::get(I32, block_dim[0]), "thread_id_x");
}
Call->replaceAllUsesWith(thread_idx);
need_remove.push_back(Call);
} else if (func_name == "llvm.nvvm.read.ptx.sreg.tid.y") {
            // replace tid.y using the flattened thread index
IRBuilder<> builder(context);
builder.SetInsertPoint(Call);
auto thread_idx = builder.CreateBinOp(
Instruction::Mul, builder.CreateLoad(local_inter_warp_idx),
ConstantInt::get(I32, 32), "");
thread_idx = builder.CreateBinOp(
Instruction::Add, builder.CreateLoad(local_intra_warp_idx),
thread_idx, "thread_idx");
// tidy = tid / block_dim.x
thread_idx = builder.CreateBinOp(
Instruction::SDiv, thread_idx,
ConstantInt::get(I32, block_dim[0]),
// builder.CreateLoad(M->getGlobalVariable("block_size_x")),
"thread_id_y");
Call->replaceAllUsesWith(thread_idx);
need_remove.push_back(Call);
} else if (func_name == "llvm.nvvm.read.ptx.sreg.tid.z") {
printf("[WARNING] We DO NOT support multi-dim block\n");
auto zero = ConstantInt::get(I32, 0);
Call->replaceAllUsesWith(zero);
need_remove.push_back(Call);
} else if (func_name == "llvm.nvvm.read.ptx.sreg.ctaid.x") {
auto block_index_addr = M->getGlobalVariable("block_index");
IRBuilder<> builder(context);
builder.SetInsertPoint(Call);
auto block_idx = builder.CreateLoad(block_index_addr);
Call->replaceAllUsesWith(block_idx);
need_remove.push_back(Call);
} else if (func_name == "llvm.nvvm.read.ptx.sreg.ctaid.y" ||
func_name == "llvm.nvvm.read.ptx.sreg.ctaid.z") {
printf("[WARNING We DO NOT support multi-dim grid\n");
auto zero = ConstantInt::get(I32, 0);
Call->replaceAllUsesWith(zero);
need_remove.push_back(Call);
} else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.x") {
auto block_size = ConstantInt::get(I32, block_dim[0]);
Call->replaceAllUsesWith(block_size);
need_remove.push_back(Call);
} else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.y") {
auto block_size = ConstantInt::get(I32, block_dim[1]);
Call->replaceAllUsesWith(block_size);
need_remove.push_back(Call);
} else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.z") {
auto block_size = ConstantInt::get(I32, block_dim[2]);
Call->replaceAllUsesWith(block_size);
need_remove.push_back(Call);
} else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.x") {
auto grid_size = ConstantInt::get(I32, grid_dim[0]);
Call->replaceAllUsesWith(grid_size);
need_remove.push_back(Call);
} else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.y" ||
func_name == "llvm.nvvm.read.ptx.sreg.nctaid.z") {
printf("[WARNING We DO NOT support multi-dim grid\n");
auto one = ConstantInt::get(I32, 1);
Call->replaceAllUsesWith(one);
need_remove.push_back(Call);
}
}
if (Call->isInlineAsm()) {
auto asm_inst = dyn_cast<InlineAsm>(Call->getCalledOperand());
if (asm_inst->getAsmString() != "mov.u32 $0, %laneid;") {
printf("unknown InlineAsm\n");
exit(1);
}
// return the rank within the warp
IRBuilder<> builder(context);
builder.SetInsertPoint(Call);
auto intra_warp_index = builder.CreateLoad(local_intra_warp_idx);
Call->replaceAllUsesWith(intra_warp_index);
need_remove.push_back(Call);
}
}
}
}
}
for (auto inst : need_remove) {
inst->eraseFromParent();
}
}
void replace_asm_call(llvm::Module *M) {
LLVMContext &context = M->getContext();
auto I32 = llvm::Type::getInt32Ty(context);
std::vector<CallInst *> need_remove;
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
Function *F = &(*i);
auto func_name = F->getName().str();
if (!isKernelFunction(M, F))
continue;
for (auto BB = F->begin(); BB != F->end(); ++BB) {
for (auto BI = BB->begin(); BI != BB->end(); BI++) {
if (auto Call = dyn_cast<CallInst>(BI)) {
if (Call->isInlineAsm()) {
auto asm_inst = dyn_cast<InlineAsm>(Call->getCalledOperand());
if (asm_inst->getAsmString() != "mov.u32 $0, %laneid;") {
printf("unknown InlineAsm\n");
exit(1);
}
// return the rank within the warp
IRBuilder<> builder(context);
builder.SetInsertPoint(Call);
auto intra_warp_index_addr =
M->getGlobalVariable("intra_warp_index");
auto intra_warp_index = builder.CreateLoad(intra_warp_index_addr);
Call->replaceAllUsesWith(intra_warp_index);
need_remove.push_back(Call);
}
}
}
}
}
for (auto inst : need_remove) {
inst->eraseFromParent();
}
}
bool has_warp_barrier(llvm::BasicBlock *B) {
for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) {
Instruction *inst = &(*i);
llvm::CallInst *Call = llvm::dyn_cast<llvm::CallInst>(inst);
    if (Call && Call->getCalledFunction()) {
      auto func_name = Call->getCalledFunction()->getName().str();
if (func_name == "llvm.nvvm.bar.warp.sync") {
return true;
}
}
}
return false;
}
bool has_barrier(llvm::BasicBlock *B) {
for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) {
Instruction *inst = &(*i);
llvm::CallInst *Call = llvm::dyn_cast<llvm::CallInst>(inst);
    if (Call && Call->getCalledFunction()) {
      auto func_name = Call->getCalledFunction()->getName().str();
if (func_name == "llvm.nvvm.barrier0" ||
func_name == "llvm.nvvm.bar.warp.sync" ||
func_name == "llvm.nvvm.barrier.sync") {
return true;
}
}
}
return false;
}
bool has_block_barrier(llvm::BasicBlock *B) {
for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) {
Instruction *inst = &(*i);
llvm::CallInst *Call = llvm::dyn_cast<llvm::CallInst>(inst);
    if (Call && Call->getCalledFunction()) {
      auto func_name = Call->getCalledFunction()->getName().str();
if (func_name == "llvm.nvvm.barrier0" ||
func_name == "llvm.nvvm.barrier.sync") {
return true;
}
}
}
return false;
}
bool has_barrier(llvm::Function *F) {
for (auto B = F->begin(); B != F->end(); B++) {
if (has_barrier(&(*B)))
return true;
}
return false;
}
bool find_block_barrier_in_region(llvm::BasicBlock *start,
llvm::BasicBlock *end) {
std::set<llvm::BasicBlock *> visit;
std::vector<llvm::BasicBlock *> pending_blocks;
for (int i = 0; i < start->getTerminator()->getNumSuccessors(); i++) {
pending_blocks.push_back(start->getTerminator()->getSuccessor(i));
}
while (!pending_blocks.empty()) {
BasicBlock *current = pending_blocks.back();
pending_blocks.pop_back();
if (visit.find(current) != visit.end())
continue;
visit.insert(current);
if (current == end)
continue;
if (has_block_barrier(current)) {
return 1;
}
for (int i = 0; i < current->getTerminator()->getNumSuccessors(); i++) {
pending_blocks.push_back(current->getTerminator()->getSuccessor(i));
}
}
return 0;
}
bool find_barrier_in_region(llvm::BasicBlock *start, llvm::BasicBlock *end) {
std::set<llvm::BasicBlock *> visit;
std::vector<llvm::BasicBlock *> pending_blocks;
for (int i = 0; i < start->getTerminator()->getNumSuccessors(); i++) {
pending_blocks.push_back(start->getTerminator()->getSuccessor(i));
}
while (!pending_blocks.empty()) {
BasicBlock *current = pending_blocks.back();
pending_blocks.pop_back();
if (visit.find(current) != visit.end())
continue;
visit.insert(current);
if (current == end)
continue;
if (has_barrier(current)) {
return 1;
}
for (int i = 0; i < current->getTerminator()->getNumSuccessors(); i++) {
pending_blocks.push_back(current->getTerminator()->getSuccessor(i));
}
}
return 0;
}


@ -0,0 +1,217 @@
#include "warp_func.h"
#include "tool.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <iostream>
#include <set>
using namespace llvm;
/*
 * Insert a sync before each vote, and replace the
 * original vote intrinsic with the warp-vote version
 */
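/*
 * A rough sketch of the rewrite (one i8 slot per lane in warp_vote):
 *   %r = call i1 @llvm.nvvm.vote.any.sync(i32 %mask, i1 %p)
 * becomes
 *   warp.barrier()
 *   warp_vote[lane] = (i8)%p * ((%mask >> lane) & 1)
 *   warp.barrier()
 *   %r = call i1 @warp_any([32 x i8]* @warp_vote)
 */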
void handle_warp_vote(llvm::Module *M) {
std::set<llvm::CallInst *> need_replace;
llvm::Type *Int1T = Type::getInt1Ty(M->getContext());
llvm::Type *I32 = llvm::Type::getInt32Ty(M->getContext());
llvm::Type *I8 = llvm::Type::getInt8Ty(M->getContext());
auto zero = llvm::ConstantInt::get(I32, 0, true);
auto one = llvm::ConstantInt::get(I32, 1, true);
llvm::Type *VoteArrayType = llvm::ArrayType::get(I8, 32)->getPointerTo();
llvm::FunctionType *LauncherFuncT =
FunctionType::get(Int1T, {VoteArrayType}, false);
llvm::FunctionCallee _f = M->getOrInsertFunction("warp_any", LauncherFuncT);
llvm::Function *func_warp_any = llvm::cast<llvm::Function>(_f.getCallee());
_f = M->getOrInsertFunction("warp_all", LauncherFuncT);
llvm::Function *func_warp_all = llvm::cast<llvm::Function>(_f.getCallee());
  // collect the llvm.nvvm.vote.{any,all}.sync calls that need replacing
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
Function *F = &(*i);
auto func_name = F->getName().str();
if (!isKernelFunction(M, F))
continue;
Function::iterator I = F->begin();
for (Function::iterator E = F->end(); I != E; ++I) {
for (BasicBlock::iterator BI = I->begin(); BI != I->end(); BI++) {
if (CallInst *vote_any_sync = dyn_cast<CallInst>(BI)) {
          if (!vote_any_sync->getCalledFunction())
            continue; // skip indirect calls and inline asm
          auto func_name = vote_any_sync->getCalledFunction()->getName();
if (func_name == "llvm.nvvm.vote.any.sync" ||
func_name == "llvm.nvvm.vote.all.sync") {
// insert sync before call
need_replace.insert(vote_any_sync);
}
}
}
}
}
GlobalVariable *warp_vote_ptr = M->getNamedGlobal("warp_vote");
for (auto sync_inst : need_replace) {
// create barrier
CreateIntraWarpBarrier(sync_inst);
/*
* store into warp_vote[tid]
*/
assert(warp_vote_ptr != NULL);
auto intra_warp_index_addr = M->getGlobalVariable("intra_warp_index");
auto intra_warp_index =
new LoadInst(intra_warp_index_addr, "intra_warp_index", sync_inst);
auto GEP = GetElementPtrInst::Create(NULL, // Pointee type
warp_vote_ptr, // Alloca
{zero, intra_warp_index}, // Indices
"", sync_inst);
    // as AVX only supports 8 bits per lane,
    // we have to cast the predicate into an int8
    auto predict = llvm::CastInst::CreateIntegerCast(
        sync_inst->getArgOperand(1), I8, false, "", sync_inst);
    // we also have to honor the mask: lane i participates
    // only if bit i of the mask is set
auto mask = llvm::CastInst::CreateIntegerCast(sync_inst->getArgOperand(0),
I32, false, "", sync_inst);
auto bit_flag = BinaryOperator::Create(Instruction::LShr, mask,
intra_warp_index, "", sync_inst);
auto valid =
BinaryOperator::Create(Instruction::And, one, bit_flag, "", sync_inst);
auto valid_8bit =
llvm::CastInst::CreateIntegerCast(valid, I8, false, "", sync_inst);
llvm::Instruction *res;
if (sync_inst->getCalledFunction()->getName() ==
"llvm.nvvm.vote.any.sync") {
res = BinaryOperator::Create(Instruction::Mul, valid_8bit, predict, "",
sync_inst);
} else if (sync_inst->getCalledFunction()->getName() ==
"llvm.nvvm.vote.all.sync") {
auto reverse_valid = BinaryOperator::CreateNot(valid_8bit, "", sync_inst);
res = BinaryOperator::Create(Instruction::Or, reverse_valid, predict, "",
sync_inst);
      // as AVX does not provide an all-reduction, we
      // negate the result and call the any-reduction instead
res = BinaryOperator::CreateNot(res, "", sync_inst);
}
    new llvm::StoreInst(res, GEP, "", sync_inst);
// create barrier
CreateIntraWarpBarrier(sync_inst);
    /*
     * replace llvm.nvvm.vote.{any,all}.sync(i32 mask, i1 predicate)
     * with warp_any/warp_all(i8* warp_vote)
     */
std::vector<Value *> args;
// args.push_back(mask);
args.push_back(warp_vote_ptr);
llvm::Instruction *warp_inst;
if (sync_inst->getCalledFunction()->getName() ==
"llvm.nvvm.vote.any.sync") {
warp_inst = llvm::CallInst::Create(func_warp_any, args, "", sync_inst);
} else if (sync_inst->getCalledFunction()->getName() ==
"llvm.nvvm.vote.all.sync") {
warp_inst = llvm::CallInst::Create(func_warp_all, args, "", sync_inst);
}
sync_inst->replaceAllUsesWith(warp_inst);
sync_inst->eraseFromParent();
}
}
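/*
 * Lower warp shuffles by staging values through the thread-local warp_shfl
 * buffer: every lane stores its value, all lanes sync, then each lane loads
 * from the source slot (lane +/- offset for down/up, lane ^ offset for
 * bfly); see the sketch inside the loop below.
 */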
void handle_warp_shfl(llvm::Module *M) {
std::set<llvm::CallInst *> need_replace;
llvm::Type *I32 = llvm::Type::getInt32Ty(M->getContext());
auto ZERO = llvm::ConstantInt::get(I32, 0, true);
  // collect the llvm.nvvm.shfl.sync.{down,up,bfly}.i32 calls to replace
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
Function *F = &(*i);
if (!isKernelFunction(M, F))
continue;
Function::iterator I = F->begin();
for (Function::iterator E = F->end(); I != E; ++I) {
for (BasicBlock::iterator BI = I->begin(); BI != I->end(); BI++) {
if (CallInst *warp_shfl = dyn_cast<CallInst>(BI)) {
          if (!warp_shfl->getCalledFunction())
            continue; // skip indirect calls and inline asm
          auto func_name = warp_shfl->getCalledFunction()->getName();
if (func_name == "llvm.nvvm.shfl.sync.down.i32" ||
func_name == "llvm.nvvm.shfl.sync.up.i32" ||
func_name == "llvm.nvvm.shfl.sync.bfly.i32") {
// insert sync before call
need_replace.insert(warp_shfl);
}
}
}
}
}
GlobalVariable *warp_shfl_ptr = M->getNamedGlobal("warp_shfl");
for (auto shfl_inst : need_replace) {
/*
* %10 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add32, i32
* 16, i32 31)
* ->
* warp_shfl[warp_id] = add32
* warp.barrier()
* %10 = warp_shfl[warp_id + offset]
*/
IRBuilder<> builder(shfl_inst);
auto shfl_variable = shfl_inst->getArgOperand(1);
auto shfl_offset = shfl_inst->getArgOperand(2);
auto intra_warp_index =
builder.CreateLoad(M->getGlobalVariable("intra_warp_index"));
builder.CreateStore(
shfl_variable,
builder.CreateGEP(warp_shfl_ptr, {ZERO, intra_warp_index}));
// we should create barrier before store
CreateIntraWarpBarrier(intra_warp_index);
// load shuffled data
auto new_intra_warp_index =
builder.CreateLoad(M->getGlobalVariable("intra_warp_index"));
auto shfl_name = shfl_inst->getCalledFunction()->getName().str();
if (shfl_name.find("down") != shfl_name.npos) {
auto calculate_offset = builder.CreateBinOp(
Instruction::Add, new_intra_warp_index, shfl_offset);
auto new_index = builder.CreateBinOp(Instruction::SRem, calculate_offset,
ConstantInt::get(I32, 32));
auto gep = builder.CreateGEP(warp_shfl_ptr, {ZERO, new_index});
auto load_inst = builder.CreateLoad(gep);
// create barrier
CreateIntraWarpBarrier(new_intra_warp_index);
shfl_inst->replaceAllUsesWith(load_inst);
shfl_inst->eraseFromParent();
} else if (shfl_name.find("up") != shfl_name.npos) {
auto calculate_offset = builder.CreateBinOp(
Instruction::Sub, new_intra_warp_index, shfl_offset);
auto new_index = builder.CreateBinOp(Instruction::SRem, calculate_offset,
ConstantInt::get(I32, 32));
auto gep = builder.CreateGEP(warp_shfl_ptr, {ZERO, new_index});
auto load_inst = builder.CreateLoad(gep);
// create barrier
CreateIntraWarpBarrier(new_intra_warp_index);
shfl_inst->replaceAllUsesWith(load_inst);
shfl_inst->eraseFromParent();
} else if (shfl_name.find("bfly") != shfl_name.npos) {
auto calculate_offset = builder.CreateBinOp(
Instruction::Xor, new_intra_warp_index, shfl_offset);
auto new_index = builder.CreateBinOp(Instruction::SRem, calculate_offset,
ConstantInt::get(I32, 32));
auto gep = builder.CreateGEP(warp_shfl_ptr, {ZERO, new_index});
auto load_inst = builder.CreateLoad(gep);
// create barrier
CreateIntraWarpBarrier(new_intra_warp_index);
shfl_inst->replaceAllUsesWith(load_inst);
shfl_inst->eraseFromParent();
}
}
}


@ -0,0 +1,82 @@
#include <assert.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#define NUM_WARP 2
#define NUM_BLOCK 1
int block_size = 32 * NUM_WARP;
int block_size_x = block_size;
int block_size_y = 1;
int block_size_z = 1;
__thread int block_index = 0;
int grid_size = NUM_BLOCK;
extern "C" {
void *_Z7reduce0PiS_j_wrapper(void *);
__thread int warp_shfl[32];
}
void *wrap(void *p) {
int **res = (int **)p;
block_index = (*(int *)res[3]);
_Z7reduce0PiS_j_wrapper(p);
return NULL;
}
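// Pack the kernel arguments the way the generated wrapper expects them:
// ret[0..2] point at the three kernel parameters (g_idata, g_odata, n) and
// ret[3] carries the block index consumed by wrap() above.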
void *gen_input(int bid, int *g_idata, int *g_odata, unsigned int n) {
int **ret = new int *[4];
int **p0 = new int *;
*p0 = g_idata;
ret[0] = (int *)(p0);
int **p1 = new int *;
*p1 = g_odata;
ret[1] = (int *)(p1);
unsigned int *p2 = new unsigned int;
*p2 = n;
ret[2] = (int *)p2;
int *p3 = new int;
*p3 = bid;
ret[3] = (int *)p3;
return (void *)ret;
}
int main(int argc, char *argv[]) {
int *g_idata;
int size = block_size * NUM_BLOCK;
g_idata = new int[size * 2];
int *res = new int[size];
for (int i = 0; i < size; i++) {
g_idata[i] = i;
}
pthread_t threads[NUM_BLOCK];
void *inp[NUM_BLOCK];
for (long t = 0; t < NUM_BLOCK; t++) {
inp[t] = gen_input(t, g_idata, res, size);
}
for (long t = 0; t < NUM_BLOCK; t++) {
pthread_create(&threads[t], NULL, wrap, inp[t]);
}
for (long t = 0; t < NUM_BLOCK; t++)
pthread_join(threads[t], NULL);
int gold = 0;
for (int i = 0; i < size; i++) {
gold += g_idata[i];
}
assert(*res == gold && "Incorrect res\n");
printf("PASS\n");
pthread_exit(NULL);
}


@ -0,0 +1,150 @@
; ModuleID = 'kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc'
source_filename = "kernel.cu"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
@_ZZ7reduce0PiS_jE5sdata = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
; Function Attrs: nounwind
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) local_unnamed_addr #0 {
entry:
ret i32 999
}
; Function Attrs: nounwind
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) local_unnamed_addr #0 {
entry:
ret i32 999
}
; Function Attrs: nounwind
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) local_unnamed_addr #0 {
entry:
ret i32 999
}
; Function Attrs: nounwind
define weak dso_local i32 @cudaGetDevice(i32* %device) local_unnamed_addr #0 {
entry:
ret i32 999
}
; Function Attrs: nounwind
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) local_unnamed_addr #0 {
entry:
ret i32 999
}
; Function Attrs: nounwind
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) local_unnamed_addr #0 {
entry:
ret i32 999
}
; Function Attrs: convergent nounwind
define dso_local void @_Z7reduce0PiS_j(i32* nocapture readonly %g_idata, i32* nocapture %g_odata, i32 %n) local_unnamed_addr #1 {
entry:
%0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #4, !range !10
%1 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #4, !range !11
%2 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #4, !range !12
%mul = mul i32 %2, %1
%add = add i32 %mul, %0
%cmp = icmp ult i32 %add, %n
br i1 %cmp, label %cond.true, label %cond.end
cond.true: ; preds = %entry
%idxprom = zext i32 %add to i64
%arrayidx = getelementptr inbounds i32, i32* %g_idata, i64 %idxprom
%3 = load i32, i32* %arrayidx, align 4, !tbaa !13
br label %cond.end
cond.end: ; preds = %entry, %cond.true
%cond = phi i32 [ %3, %cond.true ], [ 0, %entry ]
%idxprom5 = zext i32 %0 to i64
%arrayidx635 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @_ZZ7reduce0PiS_jE5sdata, i64 0, i64 %idxprom5
%arrayidx6 = addrspacecast i32 addrspace(3)* %arrayidx635 to i32*
store i32 %cond, i32* %arrayidx6, align 4, !tbaa !13
tail call void @llvm.nvvm.barrier.sync(i32 0) #4
%cmp839 = icmp ugt i32 %2, 1
br i1 %cmp839, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %if.end, %cond.end
%cmp18 = icmp eq i32 %0, 0
br i1 %cmp18, label %if.then19, label %if.end23
for.body: ; preds = %cond.end, %if.end
%s.040 = phi i32 [ %mul9, %if.end ], [ 1, %cond.end ]
%mul9 = shl nuw nsw i32 %s.040, 1
%rem = urem i32 %0, %mul9
%cmp10 = icmp eq i32 %rem, 0
br i1 %cmp10, label %if.then, label %if.end
if.then: ; preds = %for.body
%add11 = add i32 %s.040, %0
%idxprom12 = zext i32 %add11 to i64
%arrayidx1336 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @_ZZ7reduce0PiS_jE5sdata, i64 0, i64 %idxprom12
%arrayidx13 = addrspacecast i32 addrspace(3)* %arrayidx1336 to i32*
%4 = load i32, i32* %arrayidx13, align 4, !tbaa !13
%5 = load i32, i32* %arrayidx6, align 4, !tbaa !13
%add16 = add nsw i32 %5, %4
store i32 %add16, i32* %arrayidx6, align 4, !tbaa !13
br label %if.end
if.end: ; preds = %if.then, %for.body
tail call void @llvm.nvvm.barrier.sync(i32 0) #4
%cmp8 = icmp ult i32 %mul9, %2
br i1 %cmp8, label %for.body, label %for.cond.cleanup
if.then19: ; preds = %for.cond.cleanup
%idxprom21 = zext i32 %1 to i64
%arrayidx22 = getelementptr inbounds i32, i32* %g_odata, i64 %idxprom21
%6 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* addrspacecast ([64 x i32] addrspace(3)* @_ZZ7reduce0PiS_jE5sdata to [64 x i32]*), i64 0, i64 0), align 4, !tbaa !13
store i32 %6, i32* %arrayidx22, align 4, !tbaa !13
br label %if.end23
if.end23: ; preds = %if.then19, %for.cond.cleanup
ret void
}
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2
; Function Attrs: convergent nounwind
declare void @llvm.nvvm.barrier.sync(i32) #3
attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind readnone }
attributes #3 = { convergent nounwind }
attributes #4 = { nounwind }
!llvm.module.flags = !{!0, !1, !2}
!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
!llvm.ident = !{!8}
!nvvmir.version = !{!9}
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
!3 = !{void (i32*, i32*, i32)* @_Z7reduce0PiS_j, !"kernel", i32 1}
!4 = !{null, !"align", i32 8}
!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
!6 = !{null, !"align", i32 16}
!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
!9 = !{i32 1, i32 4}
!10 = !{i32 0, i32 1024}
!11 = !{i32 0, i32 2147483647}
!12 = !{i32 1, i32 1025}
!13 = !{!14, !14, i64 0}
!14 = !{!"int", !15, i64 0}
!15 = !{!"omnipotent char", !16, i64 0}
!16 = !{!"Simple C++ TBAA"}


@ -0,0 +1,6 @@
#!/bin/bash
llvm-as kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
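# kernelTranslator <in.bc> <out.bc> <grid x y z> <block x y z>
# (argument order assumed from the host code: grid 1x1x1, block 64x1x1)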
../../../build/compilation/kernelTranslator kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc 1 1 1 64 1 1
llc --filetype=obj kernel.bc
g++ host.cpp kernel.o -lpthread -o test
./test


@ -0,0 +1,82 @@
#include <assert.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#define NUM_WARP 2
#define NUM_BLOCK 1
int block_size = 32 * NUM_WARP;
int block_size_x = block_size;
int block_size_y = 1;
int block_size_z = 1;
__thread int block_index = 0;
int grid_size = NUM_BLOCK;
extern "C" {
void *_Z7reduce5PiS_j_wrapper(void *);
__thread int warp_shfl[32];
}
void *wrap(void *p) {
int **res = (int **)p;
block_index = (*(int *)res[3]);
_Z7reduce5PiS_j_wrapper(p);
return NULL;
}
void *gen_input(int bid, int *g_idata, int *g_odata, unsigned int n) {
int **ret = new int *[4];
int **p0 = new int *;
*p0 = g_idata;
ret[0] = (int *)(p0);
int **p1 = new int *;
*p1 = g_odata;
ret[1] = (int *)(p1);
unsigned int *p2 = new unsigned int;
*p2 = n;
ret[2] = (int *)p2;
int *p3 = new int;
*p3 = bid;
ret[3] = (int *)p3;
return (void *)ret;
}
int main(int argc, char *argv[]) {
int *g_idata;
int size = block_size * NUM_BLOCK;
g_idata = new int[size * 2];
int *res = new int[size];
for (int i = 0; i < size; i++) {
g_idata[i] = i;
}
pthread_t threads[NUM_BLOCK];
void *inp[NUM_BLOCK];
for (long t = 0; t < NUM_BLOCK; t++) {
inp[t] = gen_input(t, g_idata, res, size);
}
for (long t = 0; t < NUM_BLOCK; t++) {
pthread_create(&threads[t], NULL, wrap, inp[t]);
}
for (long t = 0; t < NUM_BLOCK; t++)
pthread_join(threads[t], NULL);
int gold = 0;
for (int i = 0; i < size; i++) {
gold += g_idata[i];
}
assert(*res == gold && "Incorrect res\n");
printf("PASS\n");
pthread_exit(NULL);
}


@ -0,0 +1,179 @@
; ModuleID = 'kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc'
source_filename = "kernel.cu"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
@_ZZ7reduce5PiS_jE5sdata = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
; Function Attrs: nounwind
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) local_unnamed_addr #0 {
entry:
ret i32 999
}
; Function Attrs: nounwind
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) local_unnamed_addr #0 {
entry:
ret i32 999
}
; Function Attrs: nounwind
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) local_unnamed_addr #0 {
entry:
ret i32 999
}
; Function Attrs: nounwind
define weak dso_local i32 @cudaGetDevice(i32* %device) local_unnamed_addr #0 {
entry:
ret i32 999
}
; Function Attrs: nounwind
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) local_unnamed_addr #0 {
entry:
ret i32 999
}
; Function Attrs: nounwind
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) local_unnamed_addr #0 {
entry:
ret i32 999
}
; Function Attrs: convergent nounwind
define dso_local void @_Z7reduce5PiS_j(i32* nocapture readonly %g_idata, i32* nocapture %g_odata, i32 %n) local_unnamed_addr #1 {
entry:
%0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #5, !range !10
%1 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #5, !range !11
%mul = shl i32 %1, 7
%add = add i32 %mul, %0
%cmp = icmp ult i32 %add, %n
br i1 %cmp, label %cond.true, label %cond.end
cond.true: ; preds = %entry
%idxprom = zext i32 %add to i64
%arrayidx = getelementptr inbounds i32, i32* %g_idata, i64 %idxprom
%2 = load i32, i32* %arrayidx, align 4, !tbaa !12
br label %cond.end
cond.end: ; preds = %entry, %cond.true
%cond = phi i32 [ %2, %cond.true ], [ 0, %entry ]
%add4 = add i32 %add, 64
%cmp5 = icmp ult i32 %add4, %n
br i1 %cmp5, label %if.then, label %if.end
if.then: ; preds = %cond.end
%idxprom7 = zext i32 %add4 to i64
%arrayidx8 = getelementptr inbounds i32, i32* %g_idata, i64 %idxprom7
%3 = load i32, i32* %arrayidx8, align 4, !tbaa !12
%add9 = add nsw i32 %3, %cond
br label %if.end
if.end: ; preds = %if.then, %cond.end
%mySum.0 = phi i32 [ %add9, %if.then ], [ %cond, %cond.end ]
%idxprom10 = zext i32 %0 to i64
%arrayidx1150 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @_ZZ7reduce5PiS_jE5sdata, i64 0, i64 %idxprom10
%arrayidx11 = addrspacecast i32 addrspace(3)* %arrayidx1150 to i32*
store i32 %mySum.0, i32* %arrayidx11, align 4, !tbaa !12
tail call void @llvm.nvvm.barrier.sync(i32 0) #5
tail call void @llvm.nvvm.barrier.sync(i32 0) #5
tail call void @llvm.nvvm.barrier.sync(i32 0) #5
tail call void @llvm.nvvm.barrier.sync(i32 0) #5
%4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.z() #5, !range !16
%5 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.y() #5, !range !17
%mul.i.i52 = mul nuw nsw i32 %5, %4
%6 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #5, !range !17
%7 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.y() #5, !range !10
%mul39.i.i53 = add nuw nsw i32 %7, %mul.i.i52
%add.i.i54 = mul nuw nsw i32 %mul39.i.i53, %6
%add8.i.i55 = add nuw nsw i32 %add.i.i54, %0
%cmp14 = icmp ult i32 %add8.i.i55, 32
br i1 %cmp14, label %if.then15, label %if.end32
if.then15: ; preds = %if.end
%add16 = add nuw nsw i32 %0, 32
%idxprom17 = zext i32 %add16 to i64
%arrayidx1851 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @_ZZ7reduce5PiS_jE5sdata, i64 0, i64 %idxprom17
%arrayidx18 = addrspacecast i32 addrspace(3)* %arrayidx1851 to i32*
%8 = load i32, i32* %arrayidx18, align 4, !tbaa !12
%add19 = add nsw i32 %8, %mySum.0
%9 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add19, i32 16, i32 31) #5
%add23 = add nsw i32 %9, %add19
%10 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add23, i32 8, i32 31) #5
%add23.1 = add nsw i32 %10, %add23
%11 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add23.1, i32 4, i32 31) #5
%add23.2 = add nsw i32 %11, %add23.1
%12 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add23.2, i32 2, i32 31) #5
%add23.3 = add nsw i32 %12, %add23.2
%13 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add23.3, i32 1, i32 31) #5
%cmp27 = icmp eq i32 %add8.i.i55, 0
br i1 %cmp27, label %if.then28, label %if.end32
if.then28: ; preds = %if.then15
%add23.4 = add nsw i32 %13, %add23.3
%idxprom30 = zext i32 %1 to i64
%arrayidx31 = getelementptr inbounds i32, i32* %g_odata, i64 %idxprom30
store i32 %add23.4, i32* %arrayidx31, align 4, !tbaa !12
br label %if.end32
if.end32: ; preds = %if.end, %if.then28, %if.then15
ret void
}
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.z() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #2
; Function Attrs: convergent nounwind
declare void @llvm.nvvm.barrier.sync(i32) #3
; Function Attrs: convergent inaccessiblememonly nounwind
declare i32 @llvm.nvvm.shfl.sync.down.i32(i32, i32, i32, i32) #4
attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind readnone }
attributes #3 = { convergent nounwind }
attributes #4 = { convergent inaccessiblememonly nounwind }
attributes #5 = { nounwind }
!llvm.module.flags = !{!0, !1, !2}
!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
!llvm.ident = !{!8}
!nvvmir.version = !{!9}
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
!3 = !{void (i32*, i32*, i32)* @_Z7reduce5PiS_j, !"kernel", i32 1}
!4 = !{null, !"align", i32 8}
!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
!6 = !{null, !"align", i32 16}
!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
!9 = !{i32 1, i32 4}
!10 = !{i32 0, i32 1024}
!11 = !{i32 0, i32 2147483647}
!12 = !{!13, !13, i64 0}
!13 = !{!"int", !14, i64 0}
!14 = !{!"omnipotent char", !15, i64 0}
!15 = !{!"Simple C++ TBAA"}
!16 = !{i32 0, i32 64}
!17 = !{i32 1, i32 1025}

View File

@ -0,0 +1,6 @@
#!/bin/bash
llvm-as kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
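# the six trailing integers appear to be the launch dimensions:
# gridDim.x .y .z then blockDim.x .y .z (here: one block of 64 threads)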
../../../build/compilation/kernelTranslator kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc 1 1 1 64 1 1
llc --filetype=obj kernel.bc
g++ host.cpp kernel.o -lpthread -o test
./test

View File

@ -0,0 +1,11 @@
#!/bin/sh
for file in ./*
do
  if test -d "$file"
  then
    echo "executing $file"
    cd "$file"
    bash run.sh
    cd ..
  fi
done

View File

@ -0,0 +1,84 @@
#include <assert.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#define NUM_BLOCK 1
int N = 32;
int block_size = 32;
int block_size_x = block_size;
int block_size_y = 1;
int block_size_z = 1;
__thread int block_index = 0;
int grid_size = NUM_BLOCK;
extern "C" {
void *_Z9vectorAddPKfS0_Pfi_wrapper(void *);
}
void *wrap(void *p) {
int **res = (int **)p;
block_index = (*(int *)res[4]);
_Z9vectorAddPKfS0_Pfi_wrapper(p);
return NULL;
}
void *gen_input(int bid, float *A, float *B, float *C, int N) {
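  // box each argument behind its own heap pointer; slot 4 carries the block
  // index that wrap() copies into the thread-local block_index variable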
int **ret = new int *[5];
float **p0 = new float *;
*p0 = A;
ret[0] = (int *)(p0);
float **p1 = new float *;
*p1 = B;
ret[1] = (int *)(p1);
float **p2 = new float *;
*p2 = C;
ret[2] = (int *)(p2);
int *p3 = new int;
*p3 = N;
ret[3] = (int *)p3;
int *p4 = new int;
*p4 = bid;
ret[4] = (int *)p4;
return (void *)ret;
}
int main() {
float *A, *B, *C;
A = new float[N];
B = new float[N];
C = new float[N];
for (int i = 0; i < N; i++) {
A[i] = i;
B[i] = 1;
C[i] = 0;
}
pthread_t threads[NUM_BLOCK];
int rc;
for (long t = 0; t < NUM_BLOCK; t++) {
void *inp = gen_input(t, A, B, C, N);
rc = pthread_create(&threads[t], NULL, wrap, inp);
}
  /* wait for every block thread to finish */
  for (long t = 0; t < NUM_BLOCK; t++)
    pthread_join(threads[t], NULL);
for (int i = 0; i < N; i++) {
assert(C[i] == (A[i] + B[i]));
}
printf("PASS\n");
pthread_exit(NULL);
}

View File

@ -0,0 +1,86 @@
; ModuleID = 'kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc'
source_filename = "kernel.cu"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
; Function Attrs: nounwind
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) local_unnamed_addr #0 {
entry:
ret i32 999
}
; Function Attrs: nounwind
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) local_unnamed_addr #0 {
entry:
ret i32 999
}
; Function Attrs: nounwind
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) local_unnamed_addr #0 {
entry:
ret i32 999
}
; Function Attrs: nounwind
define weak dso_local i32 @cudaGetDevice(i32* %device) local_unnamed_addr #0 {
entry:
ret i32 999
}
; Function Attrs: nounwind
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) local_unnamed_addr #0 {
entry:
ret i32 999
}
; Function Attrs: nounwind
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) local_unnamed_addr #0 {
entry:
ret i32 999
}
; Function Attrs: nofree nounwind
define dso_local void @_Z9vectorAddPKfS0_Pfi(float* nocapture readonly %A, float* nocapture readonly %B, float* nocapture %C, i32 %numElements) local_unnamed_addr #1 {
entry:
%0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3, !range !10
%idxprom8 = zext i32 %0 to i64
%arrayidx = getelementptr inbounds float, float* %A, i64 %idxprom8
%1 = load float, float* %arrayidx, align 4, !tbaa !11
%arrayidx2 = getelementptr inbounds float, float* %B, i64 %idxprom8
%2 = load float, float* %arrayidx2, align 4, !tbaa !11
%add = fadd contract float %1, %2
%arrayidx4 = getelementptr inbounds float, float* %C, i64 %idxprom8
store float %add, float* %arrayidx4, align 4, !tbaa !11
ret void
}
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nofree nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind readnone }
attributes #3 = { nounwind }
!llvm.module.flags = !{!0, !1, !2}
!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
!llvm.ident = !{!8}
!nvvmir.version = !{!9}
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
!3 = !{void (float*, float*, float*, i32)* @_Z9vectorAddPKfS0_Pfi, !"kernel", i32 1}
!4 = !{null, !"align", i32 8}
!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
!6 = !{null, !"align", i32 16}
!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
!9 = !{i32 1, i32 4}
!10 = !{i32 0, i32 1024}
!11 = !{!12, !12, i64 0}
!12 = !{!"float", !13, i64 0}
!13 = !{!"omnipotent char", !14, i64 0}
!14 = !{!"Simple C++ TBAA"}

View File

@ -0,0 +1,6 @@
#!/bin/bash
llvm-as kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
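# as above, the six trailing integers appear to be gridDim.x .y .z and
# blockDim.x .y .z (one block of 32 threads, matching block_size in host.cpp)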
../../../build/compilation/kernelTranslator kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc 1 1 1 32 1 1
llc --filetype=obj kernel.bc
g++ host.cpp kernel.o -lpthread -o test
./test

16
runtime/CMakeLists.txt Normal file
View File

@ -0,0 +1,16 @@
cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
project(
X86runtime
DESCRIPTION "Implementation CUDA runtime API with x86"
LANGUAGES CXX)
set(LIB_NAME x86Runtime)
set(CMAKE_VERBOSE_MAKEFILE ON)
# compile threadPool implementation
add_subdirectory(threadPool)
# compile x86 runtime library
include_directories(./include)
include_directories(./threadPool/include)
file(GLOB proj_SOURCES "lib/*.cpp")
add_library(${LIB_NAME} SHARED ${proj_SOURCES})
target_link_libraries(${LIB_NAME} threadPool)

View File

@ -0,0 +1,19 @@
#ifndef CUDA_RUNTIME_IMPL_H
#define CUDA_RUNTIME_IMPL_H
#include "cudaStatus.h"
#include "structures.h"
cudaError_t cudaDeviceReset(void);
cudaError_t cudaDeviceSynchronize(void);
cudaError_t cudaFree(void *devPtr);
cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
void **args, size_t sharedMem,
cudaStream_t stream);
cudaError_t cudaMalloc(void **devPtr, size_t size);
cudaError_t cudaMemcpy(void *dst, const void *src, size_t count,
cudaMemcpyKind kind);
cudaError_t cudaSetDevice(int device);
cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src);
cudaError_t cudaStreamCreate(cudaStream_t *pStream);
cudaError_t cudaStreamDestroy(cudaStream_t stream);
cudaError_t cudaStreamSynchronize(cudaStream_t stream);
#endif

View File

@ -0,0 +1,18 @@
#ifndef CUDA_RUNTIME_STATUS_H
#define CUDA_RUNTIME_STATUS_H
#include <stdio.h>
enum cudaError_t {
  cudaSuccess = 0,
  cudaErrorInvalidValue = 1,
  cudaErrorMemoryAllocation = 2,
};
enum cudaMemcpyKind {
cudaMemcpyHostToHost = 0,
cudaMemcpyHostToDevice = 1,
cudaMemcpyDeviceToHost = 2,
cudaMemcpyDeviceToDevice = 3,
cudaMemcpyDefault = 4,
};
#endif
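
The runtime entry points below all return one of these codes. As a usage sketch (hypothetical, not part of this commit), a caller-side CUDA_CHECK helper could look like:

#include "cudaStatus.h"
#include <stdio.h>
#include <stdlib.h>
// hypothetical helper: abort with a source location on any failing call
#define CUDA_CHECK(call)                                                       \
  do {                                                                         \
    cudaError_t _e = (call);                                                   \
    if (_e != cudaSuccess) {                                                   \
      fprintf(stderr, "CUDA error %d at %s:%d\n", (int)_e, __FILE__,           \
              __LINE__);                                                       \
      exit(EXIT_FAILURE);                                                      \
    }                                                                          \
  } while (0)
// usage: CUDA_CHECK(cudaMalloc(&devPtr, bytes));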

View File

@ -0,0 +1,100 @@
#include "cudaRuntimeImpl.h"
#include "api.h"
#include <stdio.h>
#include <stdlib.h>
cudaError_t cudaDeviceReset(void) { scheduler_uninit(); }
cudaError_t cudaDeviceSynchronize(void) { cuSynchronizeBarrier(); }
cudaError_t cudaFree(void *devPtr) { free(devPtr); }
cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
                             void **args, size_t sharedMem,
                             cudaStream_t stream) {
  // TODO: if the scheduler is not initialized yet, init the device first
  cu_kernel *ker =
      create_kernel(func, gridDim, blockDim, &args, sharedMem, stream);
  if (cuLaunchKernel(&ker) != C_SUCCESS)
    return cudaErrorInvalidValue;
  return cudaSuccess;
}
cudaError_t cudaMalloc(void **devPtr, size_t size) {
  *devPtr = malloc(size);
  if (*devPtr == NULL)
    return cudaErrorMemoryAllocation;
  return cudaSuccess;
}
cudaError_t cudaMemcpy(void *dst, const void *src, size_t count,
                       cudaMemcpyKind kind) {
  // host and "device" share a single address space on x86, so every copy
  // direction reduces to a plain memcpy
  switch (kind) {
  case cudaMemcpyHostToHost:
  case cudaMemcpyHostToDevice:
  case cudaMemcpyDeviceToHost:
  case cudaMemcpyDeviceToDevice:
  case cudaMemcpyDefault:
    memcpy(dst, src, count);
    break;
  }
  return cudaSuccess;
}
cudaError_t cudaSetDevice(int device) {
  // TODO: error checking; only the single x86 "device" is supported
  init_device();
  return cudaSuccess;
}
cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src) {
cstreamData *dst_stream = (cstreamData *)dst;
cstreamData *src_stream = (cstreamData *)src;
if (dst_stream == NULL || src_stream == NULL) {
return cudaErrorInvalidValue; // 1
}
dst_stream->stream_priority = src_stream->stream_priority;
dst_stream->stream_flags = src_stream->stream_flags;
return cudaSuccess; // 0
}
cudaError_t cudaStreamCreate(cudaStream_t *pStream) {
cstreamData *s = (cstreamData *)calloc(1, sizeof(cstreamData));
if (s == NULL)
return cudaErrorMemoryAllocation;
s->ev.status = C_RUN;
s->id = stream_counter;
stream_counter++;
s->stream_priority = DEFAULT;
create_KernelQueue(&(s->kernelQueue));
INIT_LOCK(s->stream_lock);
*pStream = (cudaStream_t)(s);
return cudaSuccess;
}
cudaError_t cudaStreamDestroy(cudaStream_t stream) {
cstreamData *s = (cstreamData *)(stream);
free(s->kernelQueue);
DESTROY_LOCK(s->stream_lock);
free(s);
return cudaSuccess;
}
cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
  cstreamData *e = ((cstreamData *)(stream));
  MUTEX_LOCK(e->stream_lock);
  e->ev.status = C_SYNCHRONIZE;
  e->ev.numKernelsToWait = e->kernelQueue->waiting_count;
  MUTEX_UNLOCK(e->stream_lock);
  return cudaSuccess;
}

View File

@ -0,0 +1,17 @@
cmake_minimum_required(VERSION 3.1)
# C project
project(
ThreadPool
DESCRIPTION "Using pthread to implement ThreadPool"
LANGUAGES CXX)
set(CMAKE_VERBOSE_MAKEFILE ON)
set(LIB_NAME threadPool)
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_BUILD_TYPE Debug)
include_directories(./include)
file(GLOB proj_SOURCES "lib/*.cpp")
add_library(${LIB_NAME} SHARED ${proj_SOURCES})

View File

@ -0,0 +1,25 @@
#ifndef C_API_H
#define C_API_H
#include "structures.h"
int init_device();
cu_kernel *create_kernel(const void *func, dim3 gridDim, dim3 blockDim,
                         void ***args, size_t sharedMem, cstreamData *stream);
int cuLaunchKernel(cu_kernel **k);
int schedulerEnqueueKernel(cu_kernel **k);
int getWorkItem(struct kernel_queue **qu, cu_kernel **kern, int blockId);
int create_KernelQueue(kernel_queue **q);
int dequeKernelLL(struct kernel_queue **qu);
int dequeKernel(struct kernel_queue **qu, cu_kernel *ker);
int enqueueKernel(struct kernel_queue **qu, cu_kernel **ker);
int scheduler_init(cu_device device);
void scheduler_uninit();
void cuSynchronizeBarrier();
int set_kernel_arguments(cu_kernel **k, unsigned int arg_num, void **arg_value);
int setKernelDimensions(cu_kernel *k, struct argument **arg,
void **totalBlockSize, void *blockId);
#endif
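
For orientation, a minimal sketch of driving the thread pool through this API, assuming the declarations above; my_kernel_wrapper is a hypothetical stand-in for a wrapper emitted by kernelTranslator:

#include "api.h"
extern "C" void *my_kernel_wrapper(void *); // hypothetical translated kernel
int launch_once(void **args) {
  init_device();           // spawns one worker thread per hardware thread
  dim3 grid(4), block(64); // 4 blocks of 64 threads each
  cu_kernel *ker = create_kernel((const void *)my_kernel_wrapper, grid, block,
                                 &args, /*sharedMem=*/0, /*stream=*/NULL);
  cuLaunchKernel(&ker);    // workers pick up one block index at a time
  cuSynchronizeBarrier();  // spin until every queued block has finished
  scheduler_uninit();      // shut down and join the worker threads
  return 0;
}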

View File

@ -0,0 +1,26 @@
#ifndef C_DEF_H
#define C_DEF_H
// Error
#define C_SUCCESS 0x0
#define C_ERROR 0x1
// execution status
#define C_COMPLETE 0x2
#define C_RUNNING 0x3
#define C_SUBMITTED 0x4
#define C_QUEUED 0x5
#define C_CREATED 0x6
// stream status
#define C_RUN 0x1
#define C_WAIT 0x2
#define C_SYNCHRONIZE 0x3
// Not Initialized Error
#define C_ERROR_NOT_INITIALIZED 0x6
#define C_ERROR_MEMALLOC 0x7
#define C_QUEUE_EMPTY 0x8
#endif

View File

@ -0,0 +1,38 @@
#ifndef C_MACROS_H
#define C_MACROS_H
#include "assert.h"
#include <pthread.h>
#define INIT_LOCK(__LOCK__)                                                    \
  do {                                                                         \
    int r = pthread_mutex_init(&(__LOCK__), NULL);                             \
    assert(r == 0);                                                            \
  } while (0)
#define MUTEX_LOCK(__LOCK__)                                                   \
  do {                                                                         \
    int r = pthread_mutex_lock(&(__LOCK__));                                   \
    assert(r == 0);                                                            \
  } while (0)
#define MUTEX_UNLOCK(__LOCK__)                                                 \
  do {                                                                         \
    int r = pthread_mutex_unlock(&(__LOCK__));                                 \
    assert(r == 0);                                                            \
  } while (0)
#define DESTROY_LOCK(__LOCK__)                                                 \
  do {                                                                         \
    int r = pthread_mutex_destroy(&(__LOCK__));                                \
    assert(r == 0);                                                            \
  } while (0)
#endif // C_MACROS_H

View File

@ -0,0 +1,191 @@
#ifndef C_STRUCTURES_H
#define C_STRUCTURES_H
#include "pthread.h"
// streams are handed to users as opaque pointers, matching the CUDA API
typedef struct streamData cstreamData;
typedef cstreamData *cudaStream_t;
typedef struct device {
int max_compute_units;
int device_id;
} cu_device;
typedef struct c_thread {
pthread_t thread;
unsigned long executed_commands;
unsigned index;
bool exit;
} cu_ptd;
typedef struct scheduler_pool {
struct c_thread *thread_pool;
size_t num_worker_threads;
size_t local_mem_size;
int num_kernel_launch;
int num_kernel_finished;
int num_kernel_queued;
size_t idle_threads;
pthread_cond_t wake_pool;
int threadpool_shutdown_requested;
// lock for scheduler
pthread_mutex_t work_queue_lock;
  // user kernel queue, holding only user-submitted kernels
struct kernel_queue *kernelQueue;
} cu_pool;
struct kernel_queue {
struct kernel *head;
struct kernel *tail;
// finish command count
unsigned long finish_count;
// waiting to be run on threads
unsigned long waiting_count;
// running count
unsigned long running_count;
// total count
unsigned long kernel_count;
// current index for task to be run
unsigned long current_index;
};
typedef struct command {
struct kernel *ker;
struct command *next;
struct command *prev;
} cu_command;
typedef struct argument {
// size of the argument to allocation
size_t size;
void *value;
unsigned int index;
} cu_argument;
typedef struct input_arg {
// real values for the input
char *p;
struct argument *argus[];
// (TODO): implement meta_data
// the type of metadata will need to change to list of ints or something
// so that we can parse the arguments p
} cu_input;
struct dim3 {
size_t x;
size_t y;
size_t z;
  dim3(size_t d1 = 1, size_t d2 = 1, size_t d3 = 1) : x(d1), y(d2), z(d3) {}
};
enum StreamType {
DEFAULT,
LOW,
HIGH,
EXT,
};
struct cStreamDataInternal {
  /*
    status of the stream (run, wait)
    Run: the stream asynchronously hands its kernels to the scheduler
    Wait: the stream holds further kernels back until earlier ones finish
  */
int status;
/*
if status == wait, wait on the number of kernels to wait to become 0
*/
unsigned long numKernelsToWait;
unsigned int lastKernelIdToWait;
unsigned int count; // number of task left in the stream
};
typedef struct streamData {
// execution status of current event monitor
struct cStreamDataInternal ev;
pthread_mutex_t stream_lock; // lock on the stream
StreamType stream_priority;
unsigned int id;
unsigned int stream_flags;
// queue of the kernels in this stream
struct kernel_queue *kernelQueue;
} cstreamData;
// kernel information
typedef struct kernel {
void *(*start_routine)(void *);
void **args;
dim3 gridDim;
dim3 blockDim;
struct kernel *next;
struct kernel *prev;
size_t shared_mem;
cstreamData *stream;
struct event *barrier;
int status;
int totalBlocks;
int N;
int blockSize;
int kernelId;
// current blockId
int blockId;
void *shared_mem_loc;
} cu_kernel;
typedef struct asyncKernel {
unsigned int numBlocks;
unsigned int numThreads;
struct event *evt;
struct kernel *ker;
struct asyncKernel *prev;
struct asyncKernel *next;
} asyncKernel;
// command queue of command nodes
typedef struct kernel_arg_array {
size_t size;
unsigned int index;
} karg_arr;
typedef struct kernel_image_arg {
size_t size;
unsigned int index;
} k_arg;
#endif // C_STRUCTURES_H

View File

@ -0,0 +1,456 @@
#include <stdio.h>
#include <stdlib.h>
#include <thread>
#include "api.h"
#include "def.h"
#include "macros.h"
#include "structures.h"
/*
Initialize the device
*/
int init_device() {
  cu_device *device = (cu_device *)calloc(1, sizeof(cu_device));
  if (device == NULL)
    return C_ERROR_MEMALLOC;
  device->max_compute_units = std::thread::hardware_concurrency();
  // initialize the scheduler (it copies the struct, so free it afterwards)
  int ret = scheduler_init(*device);
  free(device);
  return ret;
}
/*
Create Kernel
*/
static int kernelIds = 0;
cu_kernel *create_kernel(const void *func, dim3 gridDim, dim3 blockDim,
void ***args, size_t sharedMem, cstreamData *stream) {
cu_kernel *ker = (cu_kernel *)calloc(1, sizeof(cu_kernel));
// set the function pointer
ker->start_routine = (void *(*)(void *))func;
// ker->start_routine(args);
ker->args = *args;
ker->gridDim = gridDim;
ker->blockDim = blockDim;
ker->shared_mem = sharedMem;
  // allocate the dynamic shared memory on the heap; eventually each worker
  // thread should create its own copy after task submission
ker->shared_mem_loc = calloc(1, sharedMem);
ker->stream = stream;
ker->blockId = 0;
ker->totalBlocks = gridDim.x;
ker->N = blockDim.x;
ker->kernelId = kernelIds;
kernelIds += 1;
ker->blockSize = blockDim.x;
return ker;
}
/*
Create Kernel Queue
*/
int create_KernelQueue(kernel_queue **q) {
*q = (kernel_queue *)calloc(1, sizeof(kernel_queue));
if (*q == NULL) {
return C_ERROR_MEMALLOC;
}
(*q)->kernel_count = 0;
(*q)->running_count = 0;
(*q)->waiting_count = 0;
(*q)->finish_count = 0;
(*q)->current_index = 0;
return C_SUCCESS;
}
int dequeKernelLL(struct kernel_queue **qu) {
  struct kernel_queue *q = *qu;
  if (q->head == NULL) {
    return C_QUEUE_EMPTY;
  }
  // pop the head; only count a kernel once it has actually been removed
  q->head = (q->head)->next;
  if (q->head != NULL) {
    q->head->prev = NULL;
  }
  q->finish_count += 1;
  return C_SUCCESS;
}
int enqueueKernel(struct kernel_queue **qu, cu_kernel **ker) {
struct kernel_queue *q = *qu;
cu_kernel *p = *ker;
if (q->head == NULL) {
q->head = p;
q->tail = p;
} else {
p->prev = q->tail;
q->tail->next = p;
q->tail = p;
p->next = NULL;
}
q->kernel_count += 1;
q->waiting_count += 1;
// user kernel command
return C_SUCCESS;
}
// scheduler
static cu_pool *scheduler;
__thread int block_index = 0;
__thread int thread_memory_size = 0;
/*
Enqueue Kernel (k) to the scheduler kernelQueue
*/
int schedulerEnqueueKernel(cu_kernel **k) {
cu_kernel *ker = *k;
MUTEX_LOCK(scheduler->work_queue_lock);
enqueueKernel(&scheduler->kernelQueue, &ker);
pthread_cond_broadcast(&(scheduler->wake_pool));
  MUTEX_UNLOCK(scheduler->work_queue_lock);
  return C_SUCCESS;
}
/*
Kernel Launch with numBlocks and numThreadsPerBlock
*/
int cuLaunchKernel(cu_kernel **k) {
  cu_kernel *ker = *k;
MUTEX_LOCK(scheduler->work_queue_lock);
scheduler->num_kernel_queued += 1;
MUTEX_UNLOCK(scheduler->work_queue_lock);
// stream == 0 add to the kernelQueue
if (ker->stream == 0) {
schedulerEnqueueKernel(&ker);
} else {
// add to it's stream queue
// stream queue can be waiting or running with or without tasks
MUTEX_LOCK(((cstreamData *)(ker->stream))->stream_lock);
// if stream queue status is run (first kernel) (enqueue to the kernel
// queue)
cstreamData *e = ((cstreamData *)(ker->stream));
// synchronized is called after no job in the queue so stream is stuck on
// synchronize
if (e->ev.status == C_SYNCHRONIZE) {
if ((e->kernelQueue->finish_count) == (e->kernelQueue->kernel_count)) {
e->ev.status = C_RUN;
}
}
if (e->ev.status == C_RUN) {
// change the status to wait
      e->ev.status = C_WAIT;
MUTEX_UNLOCK(((cstreamData *)(ker->stream))->stream_lock);
schedulerEnqueueKernel(&ker);
} else {
// the status of stream queue is wait so just enqueue to the stream
enqueueKernel(&((cstreamData *)(ker->stream))->kernelQueue, &ker);
MUTEX_UNLOCK(((cstreamData *)(ker->stream))->stream_lock);
}
}
  return C_SUCCESS;
}
/*
Get Work Item: get the kernel from the queue and increment blockId
*/
int getWorkItem(struct kernel_queue **qu, cu_kernel **kern, int blockId) {
struct kernel_queue *q = *qu;
if (q->waiting_count > 0) {
*kern = q->head;
cu_kernel *ker = *kern;
if (blockId + 1 == q->head->totalBlocks) {
// deque the head
dequeKernelLL(qu);
ker->status = C_COMPLETE;
q->waiting_count -= 1;
} else {
q->head->blockId += 1;
}
} else {
return C_QUEUE_EMPTY;
}
return C_SUCCESS;
}
/*
Thread Gets Work
*/
int get_work(c_thread *th) {
cu_kernel ker;
MUTEX_LOCK(scheduler->work_queue_lock);
RETRY:
  int is_exit = 0;
  int is_command_not_null = 0;
  int blockId = 0;
  int status = C_QUEUE_EMPTY;
is_exit = scheduler->threadpool_shutdown_requested;
MUTEX_UNLOCK(scheduler->work_queue_lock);
if (!is_exit) {
MUTEX_LOCK(scheduler->work_queue_lock);
// if kernel waiting to be complete is not zero
if (scheduler->kernelQueue->waiting_count > 0) {
blockId = scheduler->kernelQueue->head->blockId;
      // the fetch succeeded
      status = C_SUCCESS;
ker = *(scheduler->kernelQueue->head);
// if the blockId + 1 is equal to the goal block size ,
// then its the last block
if (blockId + 1 == scheduler->kernelQueue->head->totalBlocks) {
// deque the head
dequeKernelLL(&scheduler->kernelQueue);
ker.status = C_COMPLETE;
scheduler->kernelQueue->waiting_count -= 1;
} else {
// increment the blockId
scheduler->kernelQueue->head->blockId =
scheduler->kernelQueue->head->blockId + 1;
}
// status = getWorkItem(&scheduler->kernelQueue, &ker, blockId);
} else {
status = C_QUEUE_EMPTY;
}
MUTEX_UNLOCK(scheduler->work_queue_lock);
}
if (status != C_QUEUE_EMPTY) {
block_index = blockId;
thread_memory_size = ker.shared_mem;
ker.start_routine(ker.args);
is_command_not_null = 1;
if (ker.status == C_COMPLETE) {
// check if this kernel's stream has more jobs to run (enqueue the next
// job)
      if (ker.stream != NULL) {
        cstreamData *s = (cstreamData *)(ker.stream);
        bool synchronize = false;
        MUTEX_LOCK(s->stream_lock);
        if (s->ev.status == C_SYNCHRONIZE) {
          // synchronize stream: one of its kernels has just finished
          if (s->ev.numKernelsToWait > 0) {
            s->ev.numKernelsToWait -= 1;
          }
          synchronize = (s->ev.numKernelsToWait != 0);
        }
        if (synchronize == false) {
          if (s->kernelQueue->waiting_count > 0) {
            s->ev.status = C_WAIT;
            MUTEX_UNLOCK(s->stream_lock);
            cu_kernel *kern = s->kernelQueue->head;
            schedulerEnqueueKernel(&kern);
            dequeKernelLL(&s->kernelQueue);
          } else {
            // switch the stream to run to allow for the next execution
            s->ev.status = C_RUN;
            MUTEX_UNLOCK(s->stream_lock);
          }
        } else {
          MUTEX_UNLOCK(s->stream_lock);
        }
      }
MUTEX_LOCK(scheduler->work_queue_lock);
scheduler->num_kernel_finished += 1;
MUTEX_UNLOCK(scheduler->work_queue_lock);
}
}
MUTEX_LOCK(scheduler->work_queue_lock);
if ((is_exit == 0 && is_command_not_null == 0)) {
// all threads in condition wait
scheduler->idle_threads += 1;
pthread_cond_wait(&(scheduler->wake_pool), &(scheduler->work_queue_lock));
scheduler->idle_threads -= 1;
goto RETRY;
}
MUTEX_UNLOCK(scheduler->work_queue_lock);
return is_exit;
}
void *driver_thread(void *p) {
struct c_thread *td = (struct c_thread *)p;
int is_exit = 0;
td->exit = false;
while (1) {
// get work
is_exit = get_work(td);
// exit the routine
if (is_exit) {
td->exit = true;
// pthread_exit
pthread_exit(NULL);
}
}
}
/*
Initialize the scheduler
*/
int scheduler_init(cu_device device) {
scheduler = (cu_pool *)calloc(1, sizeof(cu_pool));
scheduler->num_worker_threads = device.max_compute_units;
scheduler->thread_pool = (struct c_thread *)calloc(
scheduler->num_worker_threads, sizeof(c_thread));
kernel_queue *asq;
create_KernelQueue(&asq);
scheduler->kernelQueue = asq;
INIT_LOCK(scheduler->work_queue_lock);
pthread_cond_init(&scheduler->wake_pool, NULL);
scheduler->idle_threads = 0;
for (int i = 0; i < scheduler->num_worker_threads; i++) {
scheduler->thread_pool[i].index = i;
pthread_create(&scheduler->thread_pool[i].thread, NULL, driver_thread,
(void *)&scheduler->thread_pool[i]);
}
return C_SUCCESS;
}
void scheduler_uninit() {
unsigned i;
int r = pthread_mutex_lock(&scheduler->work_queue_lock);
assert(r == 0);
scheduler->threadpool_shutdown_requested = 1;
pthread_cond_broadcast(&scheduler->wake_pool);
int r1 = pthread_mutex_unlock(&scheduler->work_queue_lock);
assert(r1 == 0);
for (i = 0; i < scheduler->num_worker_threads; i++) {
pthread_join(scheduler->thread_pool[i].thread, NULL);
}
free(scheduler->thread_pool);
free(scheduler->kernelQueue);
pthread_mutex_destroy(&scheduler->work_queue_lock);
pthread_cond_destroy(&scheduler->wake_pool);
scheduler->threadpool_shutdown_requested = 0;
}
int cuWait(cstreamData *evt) {
AGAIN:
int r = pthread_mutex_lock(&evt->stream_lock);
assert(r == 0);
if (evt->ev.status != C_COMPLETE) {
int r1 = pthread_mutex_unlock(&evt->stream_lock);
assert(r1 == 0);
goto AGAIN;
  }
  int r2 = pthread_mutex_unlock(&evt->stream_lock);
  assert(r2 == 0);
  return C_SUCCESS;
}
/*
Barrier for Kernel Launch
During kernel launch, increment the number of work items required to finish
Each kernel will point to the same event
During Running Command, decrement the event.work_item count
when count is 0, all work items for this kernel launch is finish
Sense Like Barrier
Counting Barrier basically
*/
void cuSynchronizeBarrier() {
AGAIN:
MUTEX_LOCK(scheduler->work_queue_lock);
if (scheduler->num_kernel_finished != scheduler->num_kernel_queued ||
scheduler->idle_threads != scheduler->num_worker_threads) {
MUTEX_UNLOCK(scheduler->work_queue_lock);
goto AGAIN;
} else {
MUTEX_UNLOCK(scheduler->work_queue_lock);
}
}
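
In caller terms, the barrier amounts to the following sketch (kernels and n are hypothetical; the counters are the scheduler fields used above):

// enqueue several kernels, then block the host until all of their blocks
// have executed and every worker thread has gone idle again
for (int i = 0; i < n; ++i)
  cuLaunchKernel(&kernels[i]); // num_kernel_queued += 1 per launch
cuSynchronizeBarrier();        // spins until finished == queued && all idle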