add backbone, including basic features for compilation
This commit is contained in:
commit
addf0a95b7
|
@ -0,0 +1,41 @@
|
|||
cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
|
||||
|
||||
project(CudaOnX86)
|
||||
set(CMAKE_PROJECT_DESCRIPTION "Executing CUDA on X86 architecture.")
|
||||
set(CMAKE_CXX_STANDARD "14")
|
||||
set(MAJOR_VERSION 0)
|
||||
set(MINOR_VERSION 1)
|
||||
set(VERSION_STRING ${MAJOR_VERSION}.${MINOR_VERSION})
|
||||
set(COX_VERSION ${VERSION_STRING})
|
||||
# get LLVM PATH get PATH for head file
|
||||
if(DEFINED LLVM_CONFIG_PATH)
|
||||
if(IS_ABSOLUTE "${LLVM_CONFIG_PATH}")
|
||||
if(EXISTS "${LLVM_CONFIG_PATH}")
|
||||
set(LLVM_CONFIG "${LLVM_CONFIG_PATH}")
|
||||
else()
|
||||
message(FATAL_ERROR "llvm-config is not found in ${LLVM_CONFIG_PATH}")
|
||||
endif()
|
||||
message(STATUS "Using llvm-config: ${LLVM_CONFIG}")
|
||||
execute_process(
|
||||
COMMAND "${LLVM_CONFIG}" "--cxxflags"
|
||||
OUTPUT_VARIABLE LLVM_CXX_FLAG
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
execute_process(
|
||||
COMMAND "${LLVM_CONFIG}" "--libdir"
|
||||
OUTPUT_VARIABLE LLVM_LIB_PATH
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
execute_process(
|
||||
COMMAND "${LLVM_CONFIG}" "--libs"
|
||||
OUTPUT_VARIABLE LLVM_LINK_FLAG
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
endif()
|
||||
else()
|
||||
message(FATAL_ERROR "llvm-config is required")
|
||||
endif()
|
||||
|
||||
set(CMAKE_CXX_FLAGS "${LLVM_CXX_FLAG} ${CMAKE_CXX_FLAGS}")
|
||||
|
||||
set(GCC_COVERAGE_LINK_FLAGS
|
||||
"-L${LLVM_LIB_PATH} ${LLVM_LINK_FLAG} -lz -lrt -ldl -ltinfo -lpthread -lm")
|
||||
|
||||
add_subdirectory(compilation)
|
|
@ -0,0 +1,34 @@
|
|||
# Contributing to COX
|
||||
|
||||
Thank you for your interest in contributing to COX!
|
||||
We appreciate all contributions, including but not limited to:
|
||||
|
||||
- Add documentation
|
||||
- Add new features and components
|
||||
- Fix bugs
|
||||
|
||||
## How to contribute?
|
||||
|
||||
0. (Optional) Open an issue and discuss your idea before start
|
||||
1. Fork the latest version COX
|
||||
2. Commit to the forked repo
|
||||
3. Create a Pull Request to COX main branch
|
||||
|
||||
## Code style
|
||||
|
||||
We follow the Clang format in this repo.
|
||||
To make sure your contribution is following the correct style,
|
||||
we highly recommend you to install [pre-commit](https://pre-commit.com/) before development.
|
||||
|
||||
```bash
|
||||
# Python environment is required
|
||||
pip install pre-commit
|
||||
```
|
||||
|
||||
Then, from the repository folder, execute the following instruction:
|
||||
|
||||
```bash
|
||||
pre-commit install
|
||||
```
|
||||
|
||||
With pre-commit plugin, each local commit will be automatically checked.
|
|
@ -0,0 +1,21 @@
|
|||
MIT License
|
||||
|
||||
Copyright (c) 2021 Ruobing Han
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
|
@ -0,0 +1,62 @@
|
|||
# COX: CUDA on X86
|
||||
|
||||
## Introduction
|
||||
|
||||
This project consists of two parts: a series of LLVM passes that
|
||||
achieve a SPMD NVVM IR as input, and output the corresponding
|
||||
MPMD+SIMD version of LLVM IR which can be execute on CPU devices.
|
||||
|
||||
## Install
|
||||
|
||||
### Prerequisites
|
||||
|
||||
* Linux: Verified on Ubuntu 18.04
|
||||
* LLVM10.0
|
||||
* NVIDIA CUDA-toolkit
|
||||
* x86 CPU
|
||||
* pthread
|
||||
* GCC 7.5.0
|
||||
|
||||
### Installation
|
||||
|
||||
1. Clone from github
|
||||
|
||||
```bash
|
||||
git clone https://github.com/drcut/open_source_template
|
||||
cd open_source_template
|
||||
```
|
||||
|
||||
2. Build the transformer for NVVM IR to LLVM IR for X86
|
||||
|
||||
```bash
|
||||
mkdir build && cd build
|
||||
cmake .. -DLLVM_CONFIG_PATH=`which llvm-config` # need path to llvm-config
|
||||
make
|
||||
```
|
||||
|
||||
## Run Vecadd samples
|
||||
|
||||
```bash
|
||||
# Generate bitcode from human-readable LLVM IR
|
||||
llvm-as ../compilation/examples/vecadd/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
|
||||
# use LLVM passes to transform NVVM IR (SPMD) to LLVM IR (MPMD+SIMD).
|
||||
# NOTE: we hard-code the grid size (1, 1, 1)
|
||||
# and block size (1024, 1, 1) into the generated LLVM IR
|
||||
./compilation/nvvm2x86 \
|
||||
../compilation/examples/vecadd/kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc \
|
||||
kernel.bc 1 1 1 32 1 1
|
||||
# generate object file from LLVM IR
|
||||
llc --filetype=obj kernel.bc
|
||||
# link generated kernel function
|
||||
# with host function and generate excutable file
|
||||
g++ ../compilation/examples/vecadd/host.cpp \
|
||||
kernel.o -lpthread -o vecadd_example
|
||||
# execute the executable file
|
||||
./vecadd_example
|
||||
```
|
||||
|
||||
## Author
|
||||
|
||||
[Ruobing Han](https://drcut.github.io/) is a CS phd student in
|
||||
Georgia Institute Technology, under the supervision
|
||||
of Prof. [Hyesoon Kim](https://www.cc.gatech.edu/~hyesoon/).
|
|
@ -0,0 +1,23 @@
|
|||
cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
|
||||
project(
|
||||
NVVM2X86
|
||||
DESCRIPTION "Translate NVVM IR to LLVM IR for X86"
|
||||
LANGUAGES CXX)
|
||||
|
||||
set(CMAKE_VERBOSE_MAKEFILE ON)
|
||||
|
||||
# compile kernel translator
|
||||
include_directories(./KernelTranslation/include)
|
||||
add_subdirectory(KernelTranslation)
|
||||
|
||||
add_executable(kernelTranslator KernelTranslation.cpp)
|
||||
target_link_libraries(kernelTranslator spmd2mpmd ${GCC_COVERAGE_LINK_FLAGS})
|
||||
|
||||
# compile host translator
|
||||
include_directories(./HostTranslation/include)
|
||||
add_subdirectory(HostTranslation)
|
||||
|
||||
add_executable(hostTranslator HostTranslation.cpp)
|
||||
|
||||
target_link_libraries(hostTranslator spmd2mpmd cudaRuntime2cpuRuntime
|
||||
${GCC_COVERAGE_LINK_FLAGS})
|
|
@ -0,0 +1,25 @@
|
|||
#include "ReplaceKernelLaunch.h"
|
||||
#include "tool.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/IR/Verifier.h"
|
||||
#include <assert.h>
|
||||
#include <iostream>
|
||||
#include <stdlib.h>
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
assert(argc == 3 && "incorrect number of arguments\n");
|
||||
|
||||
char *input_host_path = argv[1];
|
||||
char *output_host_path = argv[2];
|
||||
|
||||
// load LLVM module(s)
|
||||
llvm::Module *hostModule = LoadModuleFromFilr(input_host_path);
|
||||
VerifyModule(hostModule);
|
||||
// process host module
|
||||
ReplaceKernelLaunch(hostModule);
|
||||
VerifyModule(hostModule);
|
||||
DumpModule(hostModule, output_host_path);
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,21 @@
|
|||
cmake_minimum_required(VERSION 3.1)
|
||||
|
||||
# C project
|
||||
project(
|
||||
HostTranslation
|
||||
DESCRIPTION "Translate CUDA host modules to CPU host modules,
|
||||
mainly replace CUDA Runtime APIs with CPU Runtime APIs"
|
||||
LANGUAGES CXX)
|
||||
|
||||
set(CMAKE_VERBOSE_MAKEFILE ON)
|
||||
set(LIB_NAME cudaRuntime2cpuRuntime)
|
||||
|
||||
set(CMAKE_CXX_STANDARD 14)
|
||||
set(CMAKE_BUILD_TYPE Debug)
|
||||
include_directories(./include)
|
||||
|
||||
file(GLOB proj_HEADERS "include/*.h")
|
||||
file(GLOB proj_SOURCES "lib/*.cpp")
|
||||
|
||||
# Add core library.
|
||||
add_library(${LIB_NAME} SHARED ${proj_HEADERS} ${proj_SOURCES})
|
|
@ -0,0 +1,11 @@
|
|||
#ifndef __NVVM2x86_REPLACE_KERNEL_LAUNCH__
|
||||
#define __NVVM2x86_REPLACE_KERNEL_LAUNCH__
|
||||
|
||||
#include "llvm/IR/Module.h"
|
||||
/*
|
||||
* Change to i8* bitcast (i8* (i8*)* @_Z9vecPKiS0_Pii_wrapper to i8*)
|
||||
* Original: i8* bitcast (void (i32*, i32*, i32*, i32)* @_Z9vecPKiS0_Pii to i8*)
|
||||
*/
|
||||
void ReplaceKernelLaunch(llvm::Module *M);
|
||||
|
||||
#endif
|
|
@ -0,0 +1,94 @@
|
|||
#include "ReplaceKernelLaunch.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/GlobalValue.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/LLVMContext.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <set>
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
// Change to i8* bitcast (i8* (i8*)* @_Z9vecPKiS0_Pii_wrapper to i8*)
|
||||
// Original: i8* bitcast (void (i32*, i32*, i32*, i32)* @_Z9vecPKiS0_Pii to i8*)
|
||||
void ReplaceKernelLaunch(llvm::Module *M) {
|
||||
LLVMContext &context = M->getContext();
|
||||
auto VoidTy = llvm::Type::getVoidTy(context);
|
||||
auto I8 = llvm::Type::getInt8PtrTy(context);
|
||||
std::map<std::string, BitCastInst *> kernels;
|
||||
|
||||
LLVMContext *C = &M->getContext();
|
||||
|
||||
llvm::Type *Int32T = Type::getInt32Ty(*C);
|
||||
llvm::Type *Int8T = Type::getInt8Ty(*C);
|
||||
|
||||
llvm::FunctionType *LauncherFuncT =
|
||||
FunctionType::get(Type::getVoidTy(*C), NULL);
|
||||
|
||||
llvm::FunctionType *LaunchFun2 =
|
||||
FunctionType::get(PointerType::get(PointerType::get(Int32T, 0), 0), NULL);
|
||||
|
||||
bool done = false;
|
||||
|
||||
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||
Function *F = &(*i);
|
||||
auto func_name = F->getName().str();
|
||||
|
||||
for (Function::iterator b = F->begin(); b != F->end(); ++b) {
|
||||
BasicBlock *B = &(*b);
|
||||
|
||||
for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) {
|
||||
Instruction *inst = &(*i);
|
||||
|
||||
if (llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(inst)) {
|
||||
if (Function *calledFunction = callInst->getCalledFunction()) {
|
||||
|
||||
if (calledFunction->getName().startswith("cudaLaunchKernel")) {
|
||||
|
||||
Value *callOperand = callInst->getArgOperand(0);
|
||||
|
||||
Function *functionOperand =
|
||||
dyn_cast<Function>(callInst->getArgOperand(0));
|
||||
|
||||
// call function is wrapped in a bitcast
|
||||
if (functionOperand == NULL) {
|
||||
|
||||
std::vector<size_t> arg_sizes;
|
||||
functionOperand =
|
||||
dyn_cast<Function>(callOperand->stripPointerCasts());
|
||||
|
||||
FunctionType *ft = calledFunction->getFunctionType();
|
||||
std::cout << " Parent (Caller) Function Name: " << func_name
|
||||
<< ", cudaLaunchKernel Function: "
|
||||
<< functionOperand->getName().str() << ", args "
|
||||
<< functionOperand->arg_size() << std::endl;
|
||||
auto rep = kernels.find(functionOperand->getName().str());
|
||||
if (rep != kernels.end()) {
|
||||
|
||||
callInst->setArgOperand(0, rep->second);
|
||||
continue;
|
||||
}
|
||||
|
||||
std::vector<Type *> Params;
|
||||
Params.push_back(I8);
|
||||
FunctionType *FT = FunctionType::get(VoidTy, Params, false);
|
||||
std::string newName =
|
||||
functionOperand->getName().str() + "_wrapper";
|
||||
|
||||
Function *F =
|
||||
Function::Create(FT, Function::ExternalLinkage, newName, M);
|
||||
F->setDSOLocal(true);
|
||||
|
||||
BitCastInst *BC = new BitCastInst(F, I8, "", callInst);
|
||||
callInst->setArgOperand(0, BC);
|
||||
kernels.insert({functionOperand->getName().str(), BC});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,53 @@
|
|||
#include "generate_x86_format.h"
|
||||
#include "handle_sync.h"
|
||||
#include "init.h"
|
||||
#include "insert_sync.h"
|
||||
#include "insert_warp_loop.h"
|
||||
#include "performance.h"
|
||||
#include "tool.h"
|
||||
#include "warp_func.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include <assert.h>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <stdlib.h>
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
assert(argc == 9 && "incorrect number of arguments\n");
|
||||
llvm::Module *program = LoadModuleFromFilr(argv[1]);
|
||||
// get size of grid and dim from input arguments
|
||||
int *grid_dim = new int[3];
|
||||
int *block_dim = new int[3];
|
||||
grid_dim[0] = atoi(argv[3]);
|
||||
grid_dim[1] = atoi(argv[4]);
|
||||
grid_dim[2] = atoi(argv[5]);
|
||||
block_dim[0] = atoi(argv[6]);
|
||||
block_dim[1] = atoi(argv[7]);
|
||||
block_dim[2] = atoi(argv[8]);
|
||||
|
||||
// inline, and create auxiliary global variables
|
||||
init_block(program);
|
||||
// insert sync before each vote, and replace the
|
||||
// original vote function to warp vote
|
||||
handle_warp_vote(program);
|
||||
// replace warp shuffle
|
||||
handle_warp_shfl(program);
|
||||
// insert sync
|
||||
insert_sync(program);
|
||||
// split block by sync
|
||||
split_block_by_sync(program);
|
||||
// add loop for intra&intera thread
|
||||
insert_warp_loop(program);
|
||||
// (TODO): replace this patch
|
||||
replace_built_in_function(program, grid_dim, block_dim);
|
||||
// VerifyModule(program);
|
||||
generate_x86_format(program);
|
||||
// performance optimization
|
||||
performance_optimization(program);
|
||||
|
||||
DumpModule(program, argv[2]);
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,21 @@
|
|||
cmake_minimum_required(VERSION 3.1)
|
||||
|
||||
# C project
|
||||
project(
|
||||
KernelTranslation
|
||||
DESCRIPTION
|
||||
"Translate SPMD Kernel to MPMD format with hierarchical collapsing"
|
||||
LANGUAGES CXX)
|
||||
|
||||
set(CMAKE_VERBOSE_MAKEFILE ON)
|
||||
set(LIB_NAME spmd2mpmd)
|
||||
|
||||
set(CMAKE_CXX_STANDARD 14)
|
||||
set(CMAKE_BUILD_TYPE Debug)
|
||||
include_directories(./include)
|
||||
|
||||
file(GLOB proj_HEADERS "include/*.h")
|
||||
file(GLOB proj_SOURCES "lib/*.cpp")
|
||||
|
||||
# Add core library.
|
||||
add_library(${LIB_NAME} SHARED ${proj_HEADERS} ${proj_SOURCES})
|
|
@ -0,0 +1,8 @@
|
|||
#ifndef __NVVM2x86_GENERATE_X86_FORMAT__
|
||||
#define __NVVM2x86_GENERATE_X86_FORMAT__
|
||||
|
||||
#include "llvm/IR/Module.h"
|
||||
|
||||
void generate_x86_format(llvm::Module *M);
|
||||
|
||||
#endif
|
|
@ -0,0 +1,10 @@
|
|||
#ifndef __NVVM2x86_HANDLE_SYNC__
|
||||
#define __NVVM2x86_HANDLE_SYNC__
|
||||
|
||||
#include "llvm/IR/Module.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
void split_block_by_sync(llvm::Module *M);
|
||||
|
||||
#endif
|
|
@ -0,0 +1,7 @@
|
|||
#ifndef __NVVM2x86_INIT__
|
||||
#define __NVVM2x86_INIT__
|
||||
|
||||
#include "llvm/IR/Module.h"
|
||||
|
||||
void init_block(llvm::Module *M);
|
||||
#endif
|
|
@ -0,0 +1,9 @@
|
|||
#ifndef __NVVM2x86_INSERT_SYNC__
|
||||
#define __NVVM2x86_INSERT_SYNC__
|
||||
|
||||
#include "llvm/IR/Function.h"
|
||||
|
||||
// insert extra barrier
|
||||
void insert_sync(llvm::Module *M);
|
||||
|
||||
#endif
|
|
@ -0,0 +1,12 @@
|
|||
#ifndef __NVVM2x86_INSERT_WARP_LOOP__
|
||||
#define __NVVM2x86_INSERT_WARP_LOOP__
|
||||
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
void insert_warp_loop(llvm::Module *M);
|
||||
|
||||
#endif
|
|
@ -0,0 +1,9 @@
|
|||
#ifndef __NVVM2x86_MEMORY_HIERARCHY__
|
||||
#define __NVVM2x86_MEMORY_HIERARCHY__
|
||||
#include "llvm/IR/Module.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
void mem_share2global(llvm::Module *M);
|
||||
|
||||
#endif
|
|
@ -0,0 +1,7 @@
|
|||
#ifndef __NVVM2x86_PERFORMANCE__
|
||||
#define __NVVM2x86_PERFORMANCE__
|
||||
|
||||
#include "llvm/IR/Module.h"
|
||||
|
||||
void performance_optimization(llvm::Module *M);
|
||||
#endif
|
|
@ -0,0 +1,24 @@
|
|||
#ifndef __NVVM2x86_TOOL__
|
||||
#define __NVVM2x86_TOOL__
|
||||
|
||||
#include "llvm/IR/Module.h"
|
||||
llvm::Module *LoadModuleFromFilr(char *file_name);
|
||||
void DumpModule(llvm::Module *M, char *file_name);
|
||||
bool isKernelFunction(llvm::Module *M, llvm::Function *F);
|
||||
void replace_block(llvm::Function *F, llvm::BasicBlock *before,
|
||||
llvm::BasicBlock *after);
|
||||
llvm::CallInst *CreateInterWarpBarrier(llvm::Instruction *InsertBefore);
|
||||
llvm::CallInst *CreateIntraWarpBarrier(llvm::Instruction *InsertBefore);
|
||||
void VerifyModule(llvm::Module *);
|
||||
void phi2alloc(llvm::Module *M);
|
||||
void remove_cuda_built_in(llvm::Module *M);
|
||||
void replace_built_in_function(llvm::Module *M, int *grid_dim, int *block_dim);
|
||||
void replace_asm_call(llvm::Module *M);
|
||||
bool find_block_barrier_in_region(llvm::BasicBlock *start,
|
||||
llvm::BasicBlock *end);
|
||||
bool find_barrier_in_region(llvm::BasicBlock *start, llvm::BasicBlock *end);
|
||||
bool has_warp_barrier(llvm::BasicBlock *B);
|
||||
bool has_barrier(llvm::BasicBlock *B);
|
||||
bool has_block_barrier(llvm::BasicBlock *B);
|
||||
bool has_barrier(llvm::Function *F);
|
||||
#endif
|
|
@ -0,0 +1,10 @@
|
|||
#ifndef __NVVM2x86_WARP_FUNC__
|
||||
#define __NVVM2x86_WARP_FUNC__
|
||||
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
|
||||
void handle_warp_vote(llvm::Module *M);
|
||||
void handle_warp_shfl(llvm::Module *M);
|
||||
#endif
|
|
@ -0,0 +1,119 @@
|
|||
#include "generate_x86_format.h"
|
||||
#include "tool.h"
|
||||
#include "llvm/Analysis/TargetLibraryInfo.h"
|
||||
#include "llvm/Analysis/TargetTransformInfo.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/GlobalValue.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/LLVMContext.h"
|
||||
#include "llvm/IR/LegacyPassManager.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/IR/Verifier.h"
|
||||
#include "llvm/IRReader/IRReader.h"
|
||||
#include "llvm/Support/TargetRegistry.h"
|
||||
#include "llvm/Support/TargetSelect.h"
|
||||
#include "llvm/Support/ToolOutputFile.h"
|
||||
#include "llvm/Target/TargetMachine.h"
|
||||
#include "llvm/Target/TargetOptions.h"
|
||||
#include "llvm/Transforms/Utils/Cloning.h"
|
||||
#include "llvm/Transforms/Utils/ValueMapper.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
void set_meta_data(llvm::Module *M) {
|
||||
M->setTargetTriple("x86_64-unknown-linux-gnu");
|
||||
M->setDataLayout(
|
||||
"e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128");
|
||||
}
|
||||
|
||||
// as pthread only accept a single void* for input
|
||||
// we have to decode this input inside the kernel
|
||||
void decode_input(llvm::Module *M) {
|
||||
|
||||
std::set<llvm::Function *> need_remove;
|
||||
|
||||
LLVMContext *C = &M->getContext();
|
||||
llvm::Type *Int32T = Type::getInt32Ty(*C);
|
||||
llvm::Type *Int8T = Type::getInt8Ty(*C);
|
||||
|
||||
llvm::FunctionType *LauncherFuncT = FunctionType::get(
|
||||
Type::getVoidTy(*C), {PointerType::get(Int8T, 0)}, false);
|
||||
|
||||
// generate Wrapper Function type
|
||||
// now we only support a single int32*
|
||||
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||
Function *F = &(*i);
|
||||
if (!isKernelFunction(M, F))
|
||||
continue;
|
||||
auto func_name = F->getName().str();
|
||||
llvm::IRBuilder<> Builder(M->getContext());
|
||||
|
||||
FunctionCallee fc =
|
||||
M->getOrInsertFunction(func_name + "_wrapper", LauncherFuncT);
|
||||
Function *WorkGroup = dyn_cast<Function>(fc.getCallee());
|
||||
|
||||
BasicBlock *Block = BasicBlock::Create(M->getContext(), "", WorkGroup);
|
||||
Builder.SetInsertPoint(Block);
|
||||
|
||||
// WorkGroup has only a single input
|
||||
Function::arg_iterator ai = WorkGroup->arg_begin();
|
||||
|
||||
SmallVector<Value *, 8> Arguments;
|
||||
Value *input_arg = &*ai;
|
||||
// convert to int**
|
||||
input_arg = Builder.CreateBitOrPointerCast(
|
||||
input_arg, PointerType::get(PointerType::get(Int32T, 0), 0));
|
||||
size_t idx = 0;
|
||||
for (Function::const_arg_iterator ii = F->arg_begin(), ee = F->arg_end();
|
||||
ii != ee; ++ii) {
|
||||
Type *ArgType = ii->getType();
|
||||
|
||||
// calculate addr
|
||||
Value *GEP = Builder.CreateGEP(input_arg, ConstantInt::get(Int32T, idx));
|
||||
// load corresponding int*
|
||||
GEP = Builder.CreateLoad(GEP);
|
||||
// bitcast
|
||||
GEP = Builder.CreateBitOrPointerCast(GEP, PointerType::get(ArgType, 0));
|
||||
Value *Arg = Builder.CreateLoad(GEP);
|
||||
Arguments.push_back(Arg);
|
||||
++idx;
|
||||
}
|
||||
|
||||
CallInst *c = Builder.CreateCall(F, ArrayRef<llvm::Value *>(Arguments));
|
||||
Builder.CreateRetVoid();
|
||||
}
|
||||
for (auto f : need_remove) {
|
||||
f->dropAllReferences();
|
||||
f->eraseFromParent();
|
||||
}
|
||||
}
|
||||
|
||||
void remove_barrier(llvm::Module *M) {
|
||||
std::vector<Instruction *> need_remove;
|
||||
for (auto F = M->begin(); F != M->end(); ++F)
|
||||
for (auto BB = F->begin(); BB != F->end(); ++BB) {
|
||||
for (auto BI = BB->begin(); BI != BB->end(); BI++) {
|
||||
if (auto Call = dyn_cast<CallInst>(BI)) {
|
||||
auto func_name = Call->getCalledFunction()->getName().str();
|
||||
if (func_name == "llvm.nvvm.bar.warp.sync" ||
|
||||
func_name == "llvm.nvvm.barrier0" ||
|
||||
func_name == "llvm.nvvm.barrier.sync") {
|
||||
need_remove.push_back(Call);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (auto inst : need_remove) {
|
||||
inst->eraseFromParent();
|
||||
}
|
||||
}
|
||||
|
||||
void generate_x86_format(llvm::Module *M) {
|
||||
// change metadata
|
||||
set_meta_data(M);
|
||||
// decode argument
|
||||
decode_input(M);
|
||||
// remove barrier
|
||||
remove_barrier(M);
|
||||
}
|
|
@ -0,0 +1,57 @@
|
|||
#include "handle_sync.h"
|
||||
#include "tool.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/GlobalValue.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/InlineAsm.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/LLVMContext.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include <set>
|
||||
#include <string>
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
void split_block_by_sync(llvm::Function *F) {
|
||||
std::set<llvm::Instruction *> sync_inst;
|
||||
bool jump_first_sync = 1;
|
||||
for (Function::iterator b = F->begin(); b != F->end(); ++b) {
|
||||
BasicBlock *B = &(*b);
|
||||
for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) {
|
||||
Instruction *inst = &(*i);
|
||||
if (jump_first_sync) {
|
||||
jump_first_sync = 0;
|
||||
Instruction *next_inst = &(*std::next(i));
|
||||
sync_inst.insert(next_inst);
|
||||
continue;
|
||||
}
|
||||
llvm::CallInst *Call = llvm::dyn_cast<llvm::CallInst>(inst);
|
||||
if (Call) {
|
||||
auto func_name = Call->getCalledFunction()->getName().str();
|
||||
if (func_name == "llvm.nvvm.barrier0" ||
|
||||
func_name == "llvm.nvvm.bar.warp.sync" ||
|
||||
func_name == "llvm.nvvm.barrier.sync") {
|
||||
sync_inst.insert(Call);
|
||||
// we should also sync the next instruction
|
||||
// so that we can get a block with sync inst only
|
||||
Instruction *next_inst = &(*std::next(i));
|
||||
sync_inst.insert(next_inst);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
int _tmp = 0;
|
||||
for (auto inst : sync_inst) {
|
||||
inst->getParent()->splitBasicBlock(
|
||||
inst, inst->getParent()->getName().str() + "_after_block_sync_" +
|
||||
std::to_string(_tmp++));
|
||||
}
|
||||
}
|
||||
|
||||
void split_block_by_sync(llvm::Module *M) {
|
||||
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||
Function *F = &(*i);
|
||||
if (isKernelFunction(M, F))
|
||||
split_block_by_sync(F);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,302 @@
|
|||
#include "init.h"
|
||||
#include "memory_hierarchy.h"
|
||||
#include "tool.h"
|
||||
#include <iostream>
|
||||
#include <set>
|
||||
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/GlobalValue.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/InlineAsm.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/LLVMContext.h"
|
||||
#include "llvm/IR/LegacyPassManager.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/InitializePasses.h"
|
||||
#include "llvm/PassInfo.h"
|
||||
#include "llvm/PassRegistry.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Support/TargetSelect.h"
|
||||
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
|
||||
#include "llvm/Transforms/Utils/Cloning.h"
|
||||
#include "llvm/Transforms/Utils/ValueMapper.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
void inline_func_vote(llvm::Module *M) {
|
||||
std::set<llvm::Function *> need_remove;
|
||||
|
||||
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||
Function *F = &(*i);
|
||||
auto func_name = F->getName().str();
|
||||
if (!isKernelFunction(M, F))
|
||||
continue;
|
||||
Function::iterator I = F->begin();
|
||||
for (Function::iterator E = F->end(); I != E; ++I) {
|
||||
for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
|
||||
if (CallInst *c = dyn_cast<CallInst>(BI++)) {
|
||||
if (c->getCalledFunction()) {
|
||||
if (c->getCalledFunction()->getName().str() == "_Z10__any_syncji") {
|
||||
InlineFunctionInfo IFI;
|
||||
InlineFunction(c, IFI);
|
||||
need_remove.insert(c->getCalledFunction());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (auto f : need_remove) {
|
||||
f->dropAllReferences();
|
||||
f->eraseFromParent();
|
||||
}
|
||||
}
|
||||
|
||||
void create_global_variable(llvm::Module *M) {
|
||||
llvm::Type *I32 = llvm::Type::getInt32Ty(M->getContext());
|
||||
llvm::Type *I8 = llvm::Type::getInt8Ty(M->getContext());
|
||||
|
||||
auto zero = llvm::ConstantInt::get(I32, 0, true);
|
||||
|
||||
// we need global variable used for warp shuffle
|
||||
llvm::Type *WarpArrayType = llvm::ArrayType::get(I32, 32);
|
||||
llvm::Type *VoteArrayType = llvm::ArrayType::get(I8, 32);
|
||||
|
||||
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
|
||||
zero, "intra_warp_index", NULL,
|
||||
llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
|
||||
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
|
||||
zero, "inter_warp_index", NULL,
|
||||
llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
|
||||
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
|
||||
NULL, "block_size", NULL,
|
||||
llvm::GlobalValue::NotThreadLocal, 0, false);
|
||||
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
|
||||
NULL, "block_size_x", NULL,
|
||||
llvm::GlobalValue::NotThreadLocal, 0, false);
|
||||
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
|
||||
NULL, "block_size_y", NULL,
|
||||
llvm::GlobalValue::NotThreadLocal, 0, false);
|
||||
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
|
||||
NULL, "block_size_z", NULL,
|
||||
llvm::GlobalValue::NotThreadLocal, 0, false);
|
||||
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
|
||||
NULL, "grid_size", NULL,
|
||||
llvm::GlobalValue::NotThreadLocal, 0, false);
|
||||
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
|
||||
NULL, "block_index", NULL,
|
||||
llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
|
||||
// TLS variable used for warp-level collective operators
|
||||
new llvm::GlobalVariable(
|
||||
*M, WarpArrayType, false, llvm::GlobalValue::ExternalLinkage, NULL,
|
||||
"warp_shfl", NULL, llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
|
||||
auto warp_vote = new llvm::GlobalVariable(
|
||||
*M, VoteArrayType, false, llvm::GlobalValue::ExternalLinkage, NULL,
|
||||
"warp_vote", NULL, llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
|
||||
warp_vote->setAlignment(llvm::MaybeAlign(32));
|
||||
}
|
||||
|
||||
void remove_metadata(llvm::Module *M) {
|
||||
SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
|
||||
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||
Function *F = &(*i);
|
||||
F->getAllMetadata(MDs);
|
||||
for (auto &MD : MDs) {
|
||||
F->setMetadata(MD.first, NULL);
|
||||
}
|
||||
F->removeFnAttr("target-features");
|
||||
F->removeFnAttr("target-cpu");
|
||||
}
|
||||
}
|
||||
|
||||
void init_llvm_pass() {
|
||||
|
||||
InitializeAllTargets();
|
||||
InitializeAllTargetMCs();
|
||||
InitializeAllAsmPrinters();
|
||||
InitializeAllAsmParsers();
|
||||
|
||||
PassRegistry &Registry = *PassRegistry::getPassRegistry();
|
||||
|
||||
initializeCore(Registry);
|
||||
initializeScalarOpts(Registry);
|
||||
initializeVectorization(Registry);
|
||||
initializeIPO(Registry);
|
||||
initializeAnalysis(Registry);
|
||||
initializeTransformUtils(Registry);
|
||||
initializeInstCombine(Registry);
|
||||
initializeInstrumentation(Registry);
|
||||
initializeTarget(Registry);
|
||||
|
||||
llvm::StringMap<llvm::cl::Option *> &opts = llvm::cl::getRegisteredOptions();
|
||||
|
||||
llvm::cl::Option *O = nullptr;
|
||||
|
||||
O = opts["scalarize-load-store"];
|
||||
assert(O && "could not find LLVM option 'scalarize-load-store'");
|
||||
O->addOccurrence(1, StringRef("scalarize-load-store"), StringRef("1"), false);
|
||||
|
||||
// LLVM inner loop vectorizer does not check whether the loop inside
|
||||
// another loop, in which case even a small trip count loops might be
|
||||
// worthwhile to vectorize.
|
||||
O = opts["vectorizer-min-trip-count"];
|
||||
assert(O && "could not find LLVM option 'vectorizer-min-trip-count'");
|
||||
O->addOccurrence(1, StringRef("vectorizer-min-trip-count"), StringRef("2"),
|
||||
false);
|
||||
|
||||
// Disable jump threading optimization with following two options from
|
||||
// duplicating blocks. Using jump threading will mess up parallel region
|
||||
// construction especially when kernel contains barriers.
|
||||
// TODO: If enabled then parallel region construction code needs
|
||||
// improvements and make sure it doesn't disallow other optimizations like
|
||||
// vectorization.
|
||||
O = opts["jump-threading-threshold"];
|
||||
assert(O && "could not find LLVM option 'jump-threading-threshold'");
|
||||
O->addOccurrence(1, StringRef("jump-threading-threshold"), StringRef("0"),
|
||||
false);
|
||||
O = opts["jump-threading-implication-search-threshold"];
|
||||
assert(O && "could not find LLVM option "
|
||||
"'jump-threading-implication-search-threshold'");
|
||||
O->addOccurrence(1, StringRef("jump-threading-implication-search-threshold"),
|
||||
StringRef("0"), false);
|
||||
|
||||
// Enable diagnostics from the loop vectorizer.
|
||||
O = opts["pass-remarks-missed"];
|
||||
assert(O && "could not find LLVM option 'pass-remarks-missed'");
|
||||
O->addOccurrence(1, StringRef("pass-remarks-missed"),
|
||||
StringRef("loop-vectorize"), false);
|
||||
O->addOccurrence(1, StringRef("pass-remarks-missed"),
|
||||
StringRef("slp-vectorize"), false);
|
||||
|
||||
O = opts["pass-remarks-analysis"];
|
||||
assert(O && "could not find LLVM option 'pass-remarks-analysis'");
|
||||
O->addOccurrence(1, StringRef("pass-remarks-analysis"),
|
||||
StringRef("loop-vectorize"), false);
|
||||
O->addOccurrence(1, StringRef("pass-remarks-analysis"),
|
||||
StringRef("slp-vectorize"), false);
|
||||
|
||||
O = opts["pass-remarks"];
|
||||
assert(O && "could not find LLVM option 'pass-remarks'");
|
||||
O->addOccurrence(1, StringRef("pass-remarks"), StringRef("loop-vectorize"),
|
||||
false);
|
||||
O->addOccurrence(1, StringRef("pass-remarks"), StringRef("slp-vectorize"),
|
||||
false);
|
||||
}
|
||||
|
||||
void llvm_preprocess(llvm::Module *M) {
|
||||
init_llvm_pass();
|
||||
|
||||
auto Registry = PassRegistry::getPassRegistry();
|
||||
|
||||
llvm::legacy::PassManager Passes;
|
||||
|
||||
std::vector<std::string> passes;
|
||||
passes.push_back("lowerswitch");
|
||||
passes.push_back("mem2reg");
|
||||
passes.push_back("simplifycfg");
|
||||
passes.push_back("loop-simplify");
|
||||
for (auto pass : passes) {
|
||||
const PassInfo *PIs = Registry->getPassInfo(StringRef(pass));
|
||||
if (PIs) {
|
||||
Pass *thispass = PIs->createPass();
|
||||
Passes.add(thispass);
|
||||
} else {
|
||||
printf("Pass: %s not found\n", pass.c_str());
|
||||
}
|
||||
}
|
||||
Passes.run(*M);
|
||||
}
|
||||
|
||||
bool lower_constant_expr(llvm::Module *M) {
|
||||
bool modified = false;
|
||||
LLVMContext &context = M->getContext();
|
||||
auto I32 = llvm::Type::getInt32Ty(context);
|
||||
std::vector<CallInst *> need_remove;
|
||||
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||
Function *F = &(*i);
|
||||
auto func_name = F->getName().str();
|
||||
if (!isKernelFunction(M, F))
|
||||
continue;
|
||||
|
||||
for (auto BB = F->begin(); BB != F->end(); ++BB) {
|
||||
for (auto BI = BB->begin(); BI != BB->end(); BI++) {
|
||||
if (auto load_inst = dyn_cast<llvm::LoadInst>(BI)) {
|
||||
auto load_from = load_inst->getOperand(0);
|
||||
if (auto get_element_ptr = dyn_cast<llvm::ConstantExpr>(load_from)) {
|
||||
modified = true;
|
||||
auto ReplInst = get_element_ptr->getAsInstruction();
|
||||
ReplInst->insertBefore(load_inst);
|
||||
std::vector<Instruction *> Users;
|
||||
// Do not replace use during iteration of use. Do it in another loop
|
||||
for (auto U : get_element_ptr->users()) {
|
||||
if (auto InstUser = dyn_cast<Instruction>(U)) {
|
||||
Users.push_back(InstUser);
|
||||
}
|
||||
}
|
||||
for (auto &User : Users)
|
||||
User->replaceUsesOfWith(get_element_ptr, ReplInst);
|
||||
}
|
||||
} else if (auto store_inst = dyn_cast<llvm::StoreInst>(BI)) {
|
||||
auto store_to = store_inst->getOperand(1);
|
||||
if (auto addr_cast = dyn_cast<llvm::ConstantExpr>(store_to)) {
|
||||
modified = true;
|
||||
auto ReplInst = addr_cast->getAsInstruction();
|
||||
ReplInst->insertBefore(store_inst);
|
||||
std::vector<Instruction *> Users;
|
||||
// Do not replace use during iteration of use. Do it in another loop
|
||||
for (auto U : addr_cast->users()) {
|
||||
if (auto InstUser = dyn_cast<Instruction>(U)) {
|
||||
Users.push_back(InstUser);
|
||||
}
|
||||
}
|
||||
for (auto &User : Users)
|
||||
User->replaceUsesOfWith(addr_cast, ReplInst);
|
||||
}
|
||||
} else if (auto get_element_ptr =
|
||||
dyn_cast<llvm::GetElementPtrInst>(BI)) {
|
||||
auto get_from = get_element_ptr->getOperand(0);
|
||||
if (auto addr_cast = dyn_cast<llvm::ConstantExpr>(get_from)) {
|
||||
modified = true;
|
||||
auto ReplInst = addr_cast->getAsInstruction();
|
||||
ReplInst->insertBefore(get_element_ptr);
|
||||
std::vector<Instruction *> Users;
|
||||
// Do not replace use during iteration of use. Do it in another loop
|
||||
for (auto U : addr_cast->users()) {
|
||||
if (auto InstUser = dyn_cast<Instruction>(U)) {
|
||||
Users.push_back(InstUser);
|
||||
}
|
||||
}
|
||||
for (auto &User : Users)
|
||||
User->replaceUsesOfWith(addr_cast, ReplInst);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return modified;
|
||||
}
|
||||
|
||||
void init_block(llvm::Module *M) {
|
||||
// using official llvm preprocess
|
||||
llvm_preprocess(M);
|
||||
// remove useles Cuda function
|
||||
remove_cuda_built_in(M);
|
||||
|
||||
// lower ConstantExpression
|
||||
bool modified;
|
||||
do {
|
||||
modified = lower_constant_expr(M);
|
||||
} while (modified);
|
||||
// remove useless metadata
|
||||
remove_metadata(M);
|
||||
// inline vote function
|
||||
inline_func_vote(M);
|
||||
// create global variable for warp and vote
|
||||
create_global_variable(M);
|
||||
// replace phi with data load
|
||||
phi2alloc(M);
|
||||
// replace share memory
|
||||
mem_share2global(M);
|
||||
// replace asm Inline
|
||||
replace_asm_call(M);
|
||||
}
|
|
@ -0,0 +1,494 @@
|
|||
#include "insert_sync.h"
|
||||
#include "assert.h"
|
||||
#include "handle_sync.h"
|
||||
#include "tool.h"
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include "llvm/ADT/Statistic.h"
|
||||
#include "llvm/Analysis/LoopInfo.h"
|
||||
#include "llvm/Analysis/LoopPass.h"
|
||||
#include "llvm/Analysis/PostDominators.h"
|
||||
#include "llvm/IR/DataLayout.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/GlobalValue.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/InlineAsm.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/LLVMContext.h"
|
||||
#include "llvm/IR/LegacyPassManager.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/IR/ValueSymbolTable.h"
|
||||
#include "llvm/InitializePasses.h"
|
||||
#include "llvm/PassInfo.h"
|
||||
#include "llvm/PassRegistry.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
|
||||
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
|
||||
#include "llvm/Transforms/Utils/Cloning.h"
|
||||
#include "llvm/Transforms/Utils/ValueMapper.h"
|
||||
#include <iostream>
|
||||
#include <queue>
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
class InsertBuiltInBarrier : public llvm::FunctionPass {
|
||||
|
||||
public:
|
||||
static char ID;
|
||||
|
||||
InsertBuiltInBarrier() : FunctionPass(ID) {}
|
||||
|
||||
virtual bool runOnFunction(Function &F) {
|
||||
if (!isKernelFunction(F.getParent(), &F))
|
||||
return 0;
|
||||
std::vector<llvm::Instruction *> insert_intra_warp_sync_before;
|
||||
std::vector<llvm::Instruction *> insert_inter_warp_sync_before;
|
||||
|
||||
// insert sync in the entry
|
||||
BasicBlock *entry = &(*F.begin());
|
||||
for (auto i = entry->begin(); i != entry->end(); i++) {
|
||||
if (!isa<AllocaInst>(i)) {
|
||||
insert_inter_warp_sync_before.push_back(&(*(i)));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
for (Function::iterator I = F.begin(); I != F.end(); ++I) {
|
||||
BasicBlock::iterator BI = I->begin();
|
||||
|
||||
// insert barrier before return
|
||||
for (; BI != I->end(); BI++) {
|
||||
llvm::ReturnInst *Ret = llvm::dyn_cast<llvm::ReturnInst>(&(*BI));
|
||||
if (Ret) {
|
||||
insert_inter_warp_sync_before.push_back(&(*BI));
|
||||
}
|
||||
}
|
||||
}
|
||||
if (insert_intra_warp_sync_before.empty() &&
|
||||
insert_inter_warp_sync_before.empty())
|
||||
return 0;
|
||||
for (auto inst : insert_intra_warp_sync_before) {
|
||||
CreateIntraWarpBarrier(inst);
|
||||
}
|
||||
for (auto inst : insert_inter_warp_sync_before) {
|
||||
CreateInterWarpBarrier(inst);
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
};
|
||||
|
||||
class InsertConditionalBarrier : public llvm::FunctionPass {
|
||||
|
||||
public:
|
||||
static char ID;
|
||||
|
||||
InsertConditionalBarrier() : FunctionPass(ID) {}
|
||||
|
||||
virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const {
|
||||
AU.addRequired<PostDominatorTreeWrapperPass>();
|
||||
AU.addPreserved<PostDominatorTreeWrapperPass>();
|
||||
AU.addRequired<DominatorTreeWrapperPass>();
|
||||
AU.addPreserved<DominatorTreeWrapperPass>();
|
||||
}
|
||||
|
||||
BasicBlock *firstNonBackedgePredecessor(llvm::BasicBlock *bb) {
|
||||
|
||||
DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
|
||||
|
||||
pred_iterator I = pred_begin(bb), E = pred_end(bb);
|
||||
if (I == E)
|
||||
return NULL;
|
||||
while (DT->dominates(bb, *I) && I != E)
|
||||
++I;
|
||||
if (I == E)
|
||||
return NULL;
|
||||
else
|
||||
return *I;
|
||||
}
|
||||
|
||||
BasicBlock *firstNonBackedgeSuccessor(llvm::BasicBlock *bb) {
|
||||
DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
|
||||
auto t = bb->getTerminator();
|
||||
assert(t->getNumSuccessors() <= 2);
|
||||
for (unsigned i = 0, e = t->getNumSuccessors(); i != e; ++i) {
|
||||
BasicBlock *successor = t->getSuccessor(i);
|
||||
bool isBackedge = DT->dominates(successor, bb);
|
||||
if (isBackedge)
|
||||
continue;
|
||||
return successor;
|
||||
}
|
||||
};
|
||||
|
||||
virtual bool runOnFunction(Function &F) {
|
||||
if (!isKernelFunction(F.getParent(), &F))
|
||||
return 0;
|
||||
|
||||
auto PDT = &getAnalysis<PostDominatorTreeWrapperPass>();
|
||||
|
||||
// first find all conditional barriers
|
||||
std::vector<BasicBlock *> conditionalBarriers;
|
||||
for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i) {
|
||||
BasicBlock *b = &*i;
|
||||
if (!has_barrier(b))
|
||||
continue;
|
||||
|
||||
// Unconditional barrier postdominates the entry node.
|
||||
if (PDT->getPostDomTree().dominates(b, &F.getEntryBlock()))
|
||||
continue;
|
||||
conditionalBarriers.push_back(b);
|
||||
}
|
||||
|
||||
if (conditionalBarriers.size() == 0)
|
||||
return 0;
|
||||
|
||||
bool changed = false;
|
||||
|
||||
while (!conditionalBarriers.empty()) {
|
||||
BasicBlock *b = conditionalBarriers.back();
|
||||
conditionalBarriers.pop_back();
|
||||
|
||||
// insert barrier in the start of if-condition
|
||||
|
||||
BasicBlock *pos = b;
|
||||
BasicBlock *pred = firstNonBackedgePredecessor(b);
|
||||
|
||||
while (PDT->getPostDomTree().dominates(b, pred)) {
|
||||
pos = pred;
|
||||
// If our BB post dominates the given block, we know it is not the
|
||||
// branching block that makes the barrier conditional.
|
||||
pred = firstNonBackedgePredecessor(pred);
|
||||
|
||||
if (pred == b)
|
||||
break; // Traced across a loop edge, skip this case.
|
||||
}
|
||||
// we should create warp/block barrier based on the conditional barrier
|
||||
if (has_warp_barrier(b)) {
|
||||
CreateIntraWarpBarrier(pred->getTerminator());
|
||||
} else {
|
||||
CreateInterWarpBarrier(pred->getTerminator());
|
||||
}
|
||||
changed = true;
|
||||
|
||||
// insert barrier in the merge point for then-else branches
|
||||
// also insert barrier at the end of conditional branch
|
||||
DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
|
||||
std::queue<llvm::BasicBlock *> successor_queue;
|
||||
for (int i = 0; i < pred->getTerminator()->getNumSuccessors(); i++) {
|
||||
auto ss = pred->getTerminator()->getSuccessor(i);
|
||||
if (!DT->dominates(ss, pred))
|
||||
successor_queue.push(ss);
|
||||
}
|
||||
std::set<llvm::BasicBlock *> visited;
|
||||
llvm::BasicBlock *merge_point = NULL;
|
||||
while (!successor_queue.empty()) {
|
||||
auto curr = successor_queue.front();
|
||||
successor_queue.pop();
|
||||
if (visited.find(curr) != visited.end())
|
||||
continue;
|
||||
|
||||
visited.insert(curr);
|
||||
if (PDT->getPostDomTree().dominates(curr, pred)) {
|
||||
// find the truly merge point
|
||||
merge_point = curr;
|
||||
if (has_warp_barrier(b)) {
|
||||
CreateIntraWarpBarrier(&(*curr->begin()));
|
||||
for (BasicBlock *Pred : predecessors(curr)) {
|
||||
CreateIntraWarpBarrier(&(*Pred->getTerminator()));
|
||||
}
|
||||
} else {
|
||||
CreateInterWarpBarrier(&(*curr->begin()));
|
||||
for (BasicBlock *Pred : predecessors(curr)) {
|
||||
CreateInterWarpBarrier(&(*Pred->getTerminator()));
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
for (int i = 0; i < curr->getTerminator()->getNumSuccessors(); i++) {
|
||||
auto ss = curr->getTerminator()->getSuccessor(i);
|
||||
if (!DT->dominates(ss, curr))
|
||||
successor_queue.push(ss);
|
||||
}
|
||||
}
|
||||
assert(merge_point && "do not find merge point\n");
|
||||
changed = true;
|
||||
|
||||
// we may create a new conditional barrier after insert
|
||||
if (!PDT->getPostDomTree().dominates(pred, &F.getEntryBlock()))
|
||||
conditionalBarriers.push_back(pred);
|
||||
|
||||
// find any block which are not dominated by header
|
||||
// but be posdiminated by merge point
|
||||
std::queue<llvm::BasicBlock *> if_body;
|
||||
std::set<llvm::BasicBlock *> visited_block;
|
||||
for (int i = 0; i < pred->getTerminator()->getNumSuccessors(); i++) {
|
||||
if_body.push(pred->getTerminator()->getSuccessor(i));
|
||||
}
|
||||
while (!if_body.empty()) {
|
||||
auto curr = if_body.front();
|
||||
if_body.pop();
|
||||
if (visited_block.find(curr) != visited_block.end())
|
||||
continue;
|
||||
visited_block.insert(curr);
|
||||
if (!PDT->getPostDomTree().dominates(merge_point, curr))
|
||||
continue;
|
||||
if (!DT->dominates(pred, curr) &&
|
||||
PDT->getPostDomTree().dominates(merge_point, curr)) {
|
||||
// we should insert barrier at the beginning and
|
||||
// end of its predecessor
|
||||
if (has_warp_barrier(b)) {
|
||||
CreateIntraWarpBarrier(&(*curr->begin()));
|
||||
for (BasicBlock *Pred : predecessors(curr)) {
|
||||
CreateIntraWarpBarrier(&(*Pred->getTerminator()));
|
||||
}
|
||||
} else {
|
||||
CreateInterWarpBarrier(&(*curr->begin()));
|
||||
for (BasicBlock *Pred : predecessors(curr)) {
|
||||
CreateInterWarpBarrier(&(*Pred->getTerminator()));
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < curr->getTerminator()->getNumSuccessors(); i++) {
|
||||
if_body.push(curr->getTerminator()->getSuccessor(i));
|
||||
}
|
||||
}
|
||||
}
|
||||
return changed;
|
||||
}
|
||||
};
|
||||
|
||||
class InsertBarrierForSpecialCase : public llvm::FunctionPass {
|
||||
public:
|
||||
static char ID;
|
||||
|
||||
InsertBarrierForSpecialCase() : FunctionPass(ID) {}
|
||||
|
||||
virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const {
|
||||
AU.addRequired<PostDominatorTreeWrapperPass>();
|
||||
AU.addRequired<DominatorTreeWrapperPass>();
|
||||
}
|
||||
|
||||
virtual bool runOnFunction(Function &F) {
|
||||
if (!isKernelFunction(F.getParent(), &F))
|
||||
return 0;
|
||||
bool changed = false;
|
||||
std::set<BasicBlock *> if_head;
|
||||
// insert an extra block for the following case
|
||||
// 1) there is a merge point for an if-else branch,
|
||||
// but this merge point has other income edge
|
||||
|
||||
auto PDT = &getAnalysis<PostDominatorTreeWrapperPass>();
|
||||
auto DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
|
||||
|
||||
for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i) {
|
||||
BasicBlock *b = &*i;
|
||||
BasicBlock *merge_point = NULL;
|
||||
if (b->getTerminator()->getNumSuccessors() == 2) {
|
||||
auto b1 = b->getTerminator()->getSuccessor(0);
|
||||
auto b2 = b->getTerminator()->getSuccessor(1);
|
||||
if (PDT->getPostDomTree().dominates(b1, b2)) {
|
||||
merge_point = b1;
|
||||
} else if (PDT->getPostDomTree().dominates(b2, b2)) {
|
||||
merge_point = b2;
|
||||
} else {
|
||||
assert(0 && "find complex if-else branch\n");
|
||||
}
|
||||
std::cout << std::flush;
|
||||
for (BasicBlock *Pred : predecessors(merge_point)) {
|
||||
if (!DT->dominates(b, Pred)) {
|
||||
// we need to insert an extra block to be the merge point
|
||||
// for the if-branch
|
||||
if_head.insert(b);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
auto M = F.getParent();
|
||||
for (auto head : if_head) {
|
||||
assert(head->getTerminator()->getNumSuccessors() == 2);
|
||||
BasicBlock *merge_point = NULL;
|
||||
auto s1 = head->getTerminator()->getSuccessor(0);
|
||||
auto s2 = head->getTerminator()->getSuccessor(1);
|
||||
if (PDT->getPostDomTree().dominates(s1, s2)) {
|
||||
merge_point = s1;
|
||||
} else {
|
||||
merge_point = s2;
|
||||
}
|
||||
if (!find_barrier_in_region(head, merge_point)) {
|
||||
printf("do not need to handle tri-income if: %s\n",
|
||||
merge_point->getName().str().c_str());
|
||||
continue;
|
||||
}
|
||||
|
||||
BasicBlock *Block = BasicBlock::Create(M->getContext(), "if_end", &F);
|
||||
llvm::IRBuilder<> Builder(M->getContext());
|
||||
Builder.SetInsertPoint(Block);
|
||||
auto br_inst = Builder.CreateBr(merge_point);
|
||||
assert(has_barrier(head) && "preheader does not have barrier\n");
|
||||
if (has_warp_barrier(head)) {
|
||||
CreateIntraWarpBarrier(br_inst);
|
||||
} else {
|
||||
CreateInterWarpBarrier(br_inst);
|
||||
}
|
||||
// replace usage in if-branch
|
||||
std::set<Instruction *> need_replace;
|
||||
for (BasicBlock *Pred : predecessors(merge_point)) {
|
||||
if (DT->dominates(head, Pred) && Pred != Block) {
|
||||
need_replace.insert(Pred->getTerminator());
|
||||
}
|
||||
}
|
||||
for (auto inst : need_replace) {
|
||||
inst->replaceUsesOfWith(merge_point, Block);
|
||||
}
|
||||
changed = 1;
|
||||
}
|
||||
return changed;
|
||||
}
|
||||
};
|
||||
|
||||
class InsertConditionalForBarrier : public llvm::LoopPass {
|
||||
|
||||
public:
|
||||
static char ID;
|
||||
|
||||
InsertConditionalForBarrier() : LoopPass(ID) {}
|
||||
|
||||
void getAnalysisUsage(AnalysisUsage &AU) const {
|
||||
AU.addRequired<DominatorTreeWrapperPass>();
|
||||
}
|
||||
|
||||
bool runOnLoop(Loop *L, LPPassManager &LPM) {
|
||||
if (!isKernelFunction(L->getHeader()->getParent()->getParent(),
|
||||
L->getHeader()->getParent()))
|
||||
return 0;
|
||||
// check whether this loop has barrier
|
||||
bool is_conditional_loop = 0;
|
||||
bool is_warp = 0;
|
||||
for (Loop::block_iterator i = L->block_begin(), e = L->block_end(); i != e;
|
||||
++i) {
|
||||
for (BasicBlock::iterator j = (*i)->begin(), e = (*i)->end(); j != e;
|
||||
++j) {
|
||||
if (auto Call = dyn_cast<CallInst>(j)) {
|
||||
auto func_name = Call->getCalledFunction()->getName().str();
|
||||
if (func_name == "llvm.nvvm.barrier0" ||
|
||||
func_name == "llvm.nvvm.bar.warp.sync" ||
|
||||
func_name == "llvm.nvvm.barrier.sync") {
|
||||
is_conditional_loop = true;
|
||||
if (func_name == "llvm.nvvm.bar.warp.sync") {
|
||||
is_warp = 1;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!is_conditional_loop)
|
||||
return 0;
|
||||
// insert barrier at the beginning of header
|
||||
// and the end of pre header, so that we can get a
|
||||
// single block connected with latch
|
||||
if (!is_warp) {
|
||||
auto prehead_block = L->getLoopPreheader();
|
||||
CreateInterWarpBarrier(prehead_block->getTerminator());
|
||||
auto header_block = L->getHeader();
|
||||
CreateInterWarpBarrier(&(*header_block->begin()));
|
||||
} else {
|
||||
auto prehead_block = L->getLoopPreheader();
|
||||
CreateIntraWarpBarrier(prehead_block->getTerminator());
|
||||
auto header_block = L->getHeader();
|
||||
CreateIntraWarpBarrier(&(*header_block->begin()));
|
||||
}
|
||||
|
||||
// as we assume all loops are rotated, we have to insert
|
||||
// barrier before the condition jump of the loop exit
|
||||
|
||||
if (auto exit_block = L->getExitingBlock()) {
|
||||
auto conditional_br =
|
||||
dyn_cast<llvm::BranchInst>(exit_block->getTerminator());
|
||||
assert(conditional_br && conditional_br->isConditional());
|
||||
// insert barrier at the beginning of successor of exit
|
||||
if (!is_warp)
|
||||
CreateInterWarpBarrier(conditional_br);
|
||||
else
|
||||
CreateIntraWarpBarrier(conditional_br);
|
||||
} else {
|
||||
// handle break in for-loop
|
||||
printf("loop has multiply exists\n");
|
||||
// this time, we have also insert sync before the for-body
|
||||
auto header_block = L->getHeader();
|
||||
assert(header_block->getTerminator()->getNumSuccessors() == 2 &&
|
||||
"has more than 2 successors of the for-head\n");
|
||||
BasicBlock *for_body = NULL;
|
||||
for (int i = 0; i < header_block->getTerminator()->getNumSuccessors();
|
||||
i++) {
|
||||
auto bb = header_block->getTerminator()->getSuccessor(i);
|
||||
if (L->contains(bb)) {
|
||||
if (is_warp) {
|
||||
CreateIntraWarpBarrier(&(*bb->begin()));
|
||||
} else {
|
||||
CreateInterWarpBarrier(&(*bb->begin()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
SmallVector<llvm::BasicBlock *, 8> ExitingBlocks;
|
||||
|
||||
L->getExitingBlocks(ExitingBlocks);
|
||||
while (!ExitingBlocks.empty()) {
|
||||
auto exit_block = ExitingBlocks.back();
|
||||
ExitingBlocks.pop_back();
|
||||
auto conditional_br =
|
||||
dyn_cast<llvm::BranchInst>(exit_block->getTerminator());
|
||||
assert(conditional_br && conditional_br->isConditional());
|
||||
// insert barrier at the beginning of successor of exit
|
||||
if (!is_warp)
|
||||
CreateInterWarpBarrier(conditional_br);
|
||||
else
|
||||
CreateIntraWarpBarrier(conditional_br);
|
||||
}
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
};
|
||||
|
||||
char InsertBuiltInBarrier::ID = 0;
|
||||
char InsertConditionalBarrier::ID = 0;
|
||||
char InsertConditionalForBarrier::ID = 0;
|
||||
char InsertBarrierForSpecialCase::ID = 0;
|
||||
|
||||
namespace {
|
||||
static RegisterPass<InsertConditionalBarrier>
|
||||
insert_conditional_barrier("insert-conditional-if-barriers",
|
||||
"Insert conditional barriers for if body");
|
||||
static RegisterPass<InsertConditionalForBarrier>
|
||||
insert_conditional_for_barrier("insert-conditional-for-barriers",
|
||||
"Insert conditional barriers for for loop");
|
||||
static RegisterPass<InsertBarrierForSpecialCase>
|
||||
insert_special_case("insert-special-case-barriers",
|
||||
"Insert barriers for special cases");
|
||||
static RegisterPass<InsertBuiltInBarrier>
|
||||
insert_built_in_barrier("insert-built-in-barriers",
|
||||
"Insert built in barriers");
|
||||
} // namespace
|
||||
|
||||
void insert_sync(llvm::Module *M) {
|
||||
auto Registry = PassRegistry::getPassRegistry();
|
||||
|
||||
llvm::legacy::PassManager Passes;
|
||||
|
||||
std::vector<std::string> passes;
|
||||
passes.push_back("insert-built-in-barriers");
|
||||
passes.push_back("insert-conditional-if-barriers");
|
||||
passes.push_back("insert-conditional-for-barriers");
|
||||
passes.push_back("insert-special-case-barriers");
|
||||
for (auto pass : passes) {
|
||||
const PassInfo *PIs = Registry->getPassInfo(StringRef(pass));
|
||||
if (PIs) {
|
||||
Pass *thispass = PIs->createPass();
|
||||
Passes.add(thispass);
|
||||
} else {
|
||||
assert(0 && "Pass not found\n");
|
||||
}
|
||||
}
|
||||
Passes.run(*M);
|
||||
}
|
|
@ -0,0 +1,848 @@
|
|||
|
||||
#include "insert_warp_loop.h"
|
||||
#include "handle_sync.h"
|
||||
#include "tool.h"
|
||||
#include <assert.h>
|
||||
#include <iostream>
|
||||
#include <set>
|
||||
|
||||
#include "llvm/ADT/Statistic.h"
|
||||
#include "llvm/Analysis/LoopInfo.h"
|
||||
#include "llvm/Analysis/LoopPass.h"
|
||||
#include "llvm/Analysis/PostDominators.h"
|
||||
#include "llvm/IR/CFG.h"
|
||||
#include "llvm/IR/DataLayout.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/GlobalValue.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/InlineAsm.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/LLVMContext.h"
|
||||
#include "llvm/IR/LegacyPassManager.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/IR/ValueSymbolTable.h"
|
||||
#include "llvm/InitializePasses.h"
|
||||
#include "llvm/PassInfo.h"
|
||||
#include "llvm/PassRegistry.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
|
||||
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
|
||||
#include "llvm/Transforms/Utils/Cloning.h"
|
||||
#include "llvm/Transforms/Utils/ValueMapper.h"
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <sstream>
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
struct ParallelRegion {
|
||||
std::set<llvm::BasicBlock *> wrapped_block;
|
||||
llvm::BasicBlock *successor_block;
|
||||
llvm::BasicBlock *start_block;
|
||||
llvm::BasicBlock *end_block;
|
||||
bool inst_in_region(llvm::Instruction *inst) {
|
||||
for (auto bb : wrapped_block) {
|
||||
if (inst->getParent()->getName().str() == bb->getName().str())
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
bool inst_used_in_region(llvm::Instruction *inst) {
|
||||
for (auto ui = inst->use_begin(); ui != inst->use_end(); ++ui) {
|
||||
auto *user = dyn_cast<Instruction>(ui->getUser());
|
||||
if (user == NULL)
|
||||
continue;
|
||||
if (inst_in_region(user)) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
|
||||
std::map<llvm::Instruction *, unsigned> tempInstructionIds;
|
||||
std::map<std::string, llvm::Instruction *> contextArrays;
|
||||
int tempInstructionIndex = 0;
|
||||
int need_nested_loop;
|
||||
|
||||
bool ShouldNotBeContextSaved(llvm::Instruction *instr) {
|
||||
if (isa<BranchInst>(instr))
|
||||
return true;
|
||||
|
||||
llvm::Module *M = instr->getParent()->getParent()->getParent();
|
||||
llvm::LoadInst *load = dyn_cast<llvm::LoadInst>(instr);
|
||||
if (load != NULL) {
|
||||
auto load_addr = load->getPointerOperand();
|
||||
if (load_addr == M->getGlobalVariable("intra_warp_index"))
|
||||
return true;
|
||||
if (load_addr == M->getGlobalVariable("inter_warp_index"))
|
||||
return true;
|
||||
if (load_addr == M->getGlobalVariable("warp_vote"))
|
||||
return true;
|
||||
}
|
||||
|
||||
// TODO: we should further analyze whether the local variable
|
||||
// is same among all threads within a wrap
|
||||
return false;
|
||||
}
|
||||
|
||||
// generate countpart alloc in the beginning of the Function
|
||||
llvm::Instruction *GetContextArray(llvm::Instruction *instruction,
|
||||
bool intra_warp_loop) {
|
||||
std::ostringstream var;
|
||||
|
||||
if (std::string(instruction->getName().str()) != "") {
|
||||
var << instruction->getName().str();
|
||||
} else if (tempInstructionIds.find(instruction) != tempInstructionIds.end()) {
|
||||
var << tempInstructionIds[instruction];
|
||||
} else {
|
||||
tempInstructionIds[instruction] = tempInstructionIndex++;
|
||||
var << tempInstructionIds[instruction];
|
||||
}
|
||||
if (intra_warp_loop)
|
||||
var << "_intra_warp_";
|
||||
else
|
||||
var << "_inter_warp_";
|
||||
std::string varName = var.str();
|
||||
|
||||
if (contextArrays.find(varName) != contextArrays.end())
|
||||
return contextArrays[varName];
|
||||
|
||||
BasicBlock &bb = instruction->getParent()->getParent()->getEntryBlock();
|
||||
IRBuilder<> builder(&*(bb.getFirstInsertionPt()));
|
||||
Function *FF = instruction->getParent()->getParent();
|
||||
Module *M = instruction->getParent()->getParent()->getParent();
|
||||
LLVMContext &C = M->getContext();
|
||||
const llvm::DataLayout &Layout = M->getDataLayout();
|
||||
|
||||
llvm::Type *elementType;
|
||||
if (isa<AllocaInst>(instruction)) {
|
||||
elementType =
|
||||
dyn_cast<AllocaInst>(instruction)->getType()->getElementType();
|
||||
} else {
|
||||
elementType = instruction->getType();
|
||||
}
|
||||
|
||||
Type *AllocType = elementType;
|
||||
AllocaInst *InstCast = dyn_cast<AllocaInst>(instruction);
|
||||
if (InstCast) {
|
||||
unsigned Alignment = InstCast->getAlignment();
|
||||
|
||||
uint64_t StoreSize = Layout.getTypeStoreSize(InstCast->getAllocatedType());
|
||||
|
||||
if ((Alignment > 1) && (StoreSize & (Alignment - 1))) {
|
||||
uint64_t AlignedSize = (StoreSize & (~(Alignment - 1))) + Alignment;
|
||||
assert(AlignedSize > StoreSize);
|
||||
uint64_t RequiredExtraBytes = AlignedSize - StoreSize;
|
||||
|
||||
if (isa<ArrayType>(elementType)) {
|
||||
|
||||
ArrayType *StructPadding = ArrayType::get(
|
||||
Type::getInt8Ty(M->getContext()), RequiredExtraBytes);
|
||||
|
||||
std::vector<Type *> PaddedStructElements;
|
||||
PaddedStructElements.push_back(elementType);
|
||||
PaddedStructElements.push_back(StructPadding);
|
||||
const ArrayRef<Type *> NewStructElements(PaddedStructElements);
|
||||
AllocType = StructType::get(M->getContext(), NewStructElements, true);
|
||||
uint64_t NewStoreSize = Layout.getTypeStoreSize(AllocType);
|
||||
assert(NewStoreSize == AlignedSize);
|
||||
} else if (isa<StructType>(elementType)) {
|
||||
StructType *OldStruct = dyn_cast<StructType>(elementType);
|
||||
|
||||
ArrayType *StructPadding = ArrayType::get(
|
||||
Type::getInt8Ty(M->getContext()), RequiredExtraBytes);
|
||||
std::vector<Type *> PaddedStructElements;
|
||||
for (unsigned j = 0; j < OldStruct->getNumElements(); j++)
|
||||
PaddedStructElements.push_back(OldStruct->getElementType(j));
|
||||
PaddedStructElements.push_back(StructPadding);
|
||||
const ArrayRef<Type *> NewStructElements(PaddedStructElements);
|
||||
AllocType = StructType::get(OldStruct->getContext(), NewStructElements,
|
||||
OldStruct->isPacked());
|
||||
uint64_t NewStoreSize = Layout.getTypeStoreSize(AllocType);
|
||||
assert(NewStoreSize == AlignedSize);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
llvm::Value *ItemSize = nullptr;
|
||||
llvm::AllocaInst *Alloca = nullptr;
|
||||
|
||||
auto block_size_addr = M->getGlobalVariable("block_size");
|
||||
auto block_size = builder.CreateLoad(block_size_addr);
|
||||
Alloca = builder.CreateAlloca(AllocType, block_size, varName);
|
||||
|
||||
contextArrays[varName] = Alloca;
|
||||
return Alloca;
|
||||
}
|
||||
|
||||
// save the local variable into replicated array
|
||||
llvm::Instruction *AddContextSave(llvm::Instruction *instruction,
|
||||
llvm::Instruction *alloca,
|
||||
bool intra_warp_loop) {
|
||||
|
||||
if (isa<AllocaInst>(instruction)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
llvm::Module *M = instruction->getParent()->getParent()->getParent();
|
||||
LLVMContext &context = M->getContext();
|
||||
auto I32 = llvm::Type::getInt32Ty(context);
|
||||
|
||||
/* Save the produced variable to the array. */
|
||||
BasicBlock::iterator definition =
|
||||
(dyn_cast<Instruction>(instruction))->getIterator();
|
||||
++definition;
|
||||
|
||||
IRBuilder<> builder(&*definition);
|
||||
std::vector<llvm::Value *> gepArgs;
|
||||
|
||||
auto inter_warp_index =
|
||||
builder.CreateLoad(M->getGlobalVariable("inter_warp_index"));
|
||||
auto intra_warp_index =
|
||||
builder.CreateLoad(M->getGlobalVariable("intra_warp_index"));
|
||||
auto thread_idx = builder.CreateBinOp(
|
||||
Instruction::Add, intra_warp_index,
|
||||
builder.CreateBinOp(Instruction::Mul, inter_warp_index,
|
||||
ConstantInt::get(I32, 32)),
|
||||
"thread_idx");
|
||||
gepArgs.push_back(thread_idx);
|
||||
|
||||
return builder.CreateStore(instruction, builder.CreateGEP(alloca, gepArgs));
|
||||
}
|
||||
|
||||
llvm::Instruction *AddContextRestore(llvm::Value *val,
|
||||
llvm::Instruction *alloca,
|
||||
llvm::Instruction *before, bool isAlloca,
|
||||
bool intra_warp_loop) {
|
||||
assert(val != NULL);
|
||||
assert(alloca != NULL);
|
||||
IRBuilder<> builder(alloca);
|
||||
if (before != NULL) {
|
||||
builder.SetInsertPoint(before);
|
||||
} else if (isa<Instruction>(val)) {
|
||||
builder.SetInsertPoint(dyn_cast<Instruction>(val));
|
||||
before = dyn_cast<Instruction>(val);
|
||||
} else {
|
||||
assert(false && "Unknown context restore location!");
|
||||
}
|
||||
|
||||
std::vector<llvm::Value *> gepArgs;
|
||||
|
||||
auto M = before->getParent()->getParent()->getParent();
|
||||
auto I32 = llvm::Type::getInt32Ty(M->getContext());
|
||||
auto inter_warp_index =
|
||||
builder.CreateLoad(M->getGlobalVariable("inter_warp_index"));
|
||||
auto intra_warp_index =
|
||||
builder.CreateLoad(M->getGlobalVariable("intra_warp_index"));
|
||||
auto thread_idx = builder.CreateBinOp(
|
||||
Instruction::Add, intra_warp_index,
|
||||
builder.CreateBinOp(Instruction::Mul, inter_warp_index,
|
||||
ConstantInt::get(I32, 32)),
|
||||
"thread_idx");
|
||||
gepArgs.push_back(thread_idx);
|
||||
|
||||
llvm::Instruction *gep =
|
||||
dyn_cast<Instruction>(builder.CreateGEP(alloca, gepArgs));
|
||||
if (isAlloca) {
|
||||
return gep;
|
||||
}
|
||||
return builder.CreateLoad(gep);
|
||||
}
|
||||
|
||||
void AddContextSaveRestore(llvm::Instruction *instruction,
|
||||
bool intra_warp_loop) {
|
||||
|
||||
/* Allocate the context data array for the variable. */
|
||||
llvm::Instruction *alloca = GetContextArray(instruction, intra_warp_loop);
|
||||
|
||||
llvm::Instruction *theStore =
|
||||
AddContextSave(instruction, alloca, intra_warp_loop);
|
||||
|
||||
std::vector<Instruction *> uses;
|
||||
|
||||
for (Instruction::use_iterator ui = instruction->use_begin(),
|
||||
ue = instruction->use_end();
|
||||
ui != ue; ++ui) {
|
||||
llvm::Instruction *user = cast<Instruction>(ui->getUser());
|
||||
if (user == NULL)
|
||||
continue;
|
||||
if (user == theStore)
|
||||
continue;
|
||||
uses.push_back(user);
|
||||
}
|
||||
|
||||
for (auto user : uses) {
|
||||
Instruction *contextRestoreLocation = user;
|
||||
llvm::Value *loadedValue =
|
||||
AddContextRestore(user, alloca, contextRestoreLocation,
|
||||
isa<AllocaInst>(instruction), intra_warp_loop);
|
||||
user->replaceUsesOfWith(instruction, loadedValue);
|
||||
}
|
||||
}
|
||||
|
||||
void handle_alloc(llvm::Function *F) {
|
||||
auto M = F->getParent();
|
||||
LLVMContext &C = M->getContext();
|
||||
auto I32 = llvm::Type::getInt32Ty(C);
|
||||
|
||||
std::vector<llvm::Instruction *> instruction_to_fix;
|
||||
for (auto bb = F->begin(); bb != F->end(); bb++) {
|
||||
for (auto ii = bb->begin(); ii != bb->end(); ii++) {
|
||||
if (llvm::AllocaInst *i = dyn_cast<AllocaInst>(ii)) {
|
||||
instruction_to_fix.push_back(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<llvm::Instruction *> need_remove;
|
||||
|
||||
for (auto inst : instruction_to_fix) {
|
||||
// generate a new alloc
|
||||
auto block_size_addr = M->getGlobalVariable("block_size");
|
||||
IRBuilder<> builder(inst);
|
||||
auto block_size = builder.CreateLoad(block_size_addr);
|
||||
|
||||
llvm::Type *elementType = NULL;
|
||||
if (dyn_cast<AllocaInst>(inst)->getType()->getElementType()) {
|
||||
elementType = dyn_cast<AllocaInst>(inst)->getType()->getElementType();
|
||||
}
|
||||
assert(elementType != NULL);
|
||||
|
||||
auto Alloca = builder.CreateAlloca(elementType, block_size,
|
||||
inst->getName().str() + "inter_warp");
|
||||
|
||||
// replace all usage
|
||||
std::set<Instruction *> replace_user;
|
||||
for (Instruction::use_iterator ui = inst->use_begin(), ue = inst->use_end();
|
||||
ui != ue; ++ui) {
|
||||
replace_user.insert(dyn_cast<Instruction>(ui->getUser()));
|
||||
}
|
||||
for (auto user : replace_user) {
|
||||
|
||||
IRBuilder<> builder(user);
|
||||
// std::vector<llvm::Value *> gepArgs;
|
||||
auto inter_warp_index =
|
||||
builder.CreateLoad(M->getGlobalVariable("inter_warp_index"));
|
||||
auto intra_warp_index =
|
||||
builder.CreateLoad(M->getGlobalVariable("intra_warp_index"));
|
||||
auto thread_idx = builder.CreateBinOp(
|
||||
Instruction::Add, intra_warp_index,
|
||||
builder.CreateBinOp(Instruction::Mul, inter_warp_index,
|
||||
ConstantInt::get(I32, 32)),
|
||||
"thread_idx");
|
||||
|
||||
auto gep = builder.CreateGEP(Alloca, thread_idx);
|
||||
|
||||
user->replaceUsesOfWith(inst, gep);
|
||||
}
|
||||
need_remove.push_back(inst);
|
||||
}
|
||||
for (auto inst : need_remove) {
|
||||
inst->dropAllReferences();
|
||||
inst->eraseFromParent();
|
||||
}
|
||||
}
|
||||
|
||||
void handle_local_variable_intra_warp(std::vector<ParallelRegion> PRs) {
|
||||
bool intra_warp_loop = 1;
|
||||
// we should handle allocation generated by PHI
|
||||
{
|
||||
std::vector<llvm::Instruction *> instruction_to_fix;
|
||||
auto F = PRs[0].start_block->getParent();
|
||||
for (auto bb = F->begin(); bb != F->end(); bb++) {
|
||||
for (auto ii = bb->begin(); ii != bb->end(); ii++) {
|
||||
if (isa<AllocaInst>(&(*ii)))
|
||||
instruction_to_fix.push_back(&(*ii));
|
||||
}
|
||||
for (auto inst : instruction_to_fix) {
|
||||
AddContextSaveRestore(inst, intra_warp_loop);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (auto parallel_regions : PRs) {
|
||||
std::set<llvm::Instruction *> instruction_in_region;
|
||||
std::vector<llvm::Instruction *> instruction_to_fix;
|
||||
|
||||
for (auto bb : parallel_regions.wrapped_block) {
|
||||
for (llvm::BasicBlock::iterator instr = bb->begin(); instr != bb->end();
|
||||
++instr) {
|
||||
llvm::Instruction *instruction = &*instr;
|
||||
instruction_in_region.insert(instruction);
|
||||
}
|
||||
}
|
||||
/* Find all the instructions that define new values and
|
||||
check if they need to be context saved. */
|
||||
for (auto bb : parallel_regions.wrapped_block) {
|
||||
for (llvm::BasicBlock::iterator instr = bb->begin(); instr != bb->end();
|
||||
++instr) {
|
||||
llvm::Instruction *instruction = &*instr;
|
||||
|
||||
if (ShouldNotBeContextSaved(instruction))
|
||||
continue;
|
||||
|
||||
for (Instruction::use_iterator ui = instruction->use_begin(),
|
||||
ue = instruction->use_end();
|
||||
ui != ue; ++ui) {
|
||||
llvm::Instruction *user = dyn_cast<Instruction>(ui->getUser());
|
||||
|
||||
if (user == NULL)
|
||||
continue;
|
||||
if (isa<AllocaInst>(instruction) ||
|
||||
(instruction_in_region.find(user) ==
|
||||
instruction_in_region.end())) {
|
||||
instruction_to_fix.push_back(instruction);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (auto inst : instruction_to_fix) {
|
||||
AddContextSaveRestore(inst, intra_warp_loop);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
BasicBlock *insert_loop_init(llvm::BasicBlock *InsertInitBefore,
|
||||
bool intra_warp_loop) {
|
||||
llvm::Module *M = InsertInitBefore->getParent()->getParent();
|
||||
LLVMContext &context = M->getContext();
|
||||
auto I32 = llvm::Type::getInt32Ty(context);
|
||||
std::string block_name =
|
||||
(intra_warp_loop) ? "intra_warp_init" : "inter_warp_init";
|
||||
BasicBlock *loop_init = BasicBlock::Create(
|
||||
context, block_name, InsertInitBefore->getParent(), InsertInitBefore);
|
||||
IRBuilder<> builder(context);
|
||||
builder.SetInsertPoint(loop_init);
|
||||
if (intra_warp_loop) { // intra warp
|
||||
auto intra_warp_index = M->getGlobalVariable("intra_warp_index");
|
||||
builder.CreateStore(ConstantInt::get(I32, 0), intra_warp_index);
|
||||
} else { // inter warp
|
||||
auto inter_warp_index = M->getGlobalVariable("inter_warp_index");
|
||||
builder.CreateStore(ConstantInt::get(I32, 0), inter_warp_index);
|
||||
}
|
||||
builder.CreateBr(InsertInitBefore);
|
||||
return loop_init;
|
||||
}
|
||||
|
||||
BasicBlock *insert_loop_cond(llvm::BasicBlock *InsertCondBefore,
|
||||
llvm::BasicBlock *LoopEnd, bool intra_warp_loop) {
|
||||
llvm::Module *M = InsertCondBefore->getParent()->getParent();
|
||||
LLVMContext &context = M->getContext();
|
||||
auto I32 = llvm::Type::getInt32Ty(context);
|
||||
std::string block_name =
|
||||
(intra_warp_loop) ? "intra_warp_cond" : "inter_warp_cond";
|
||||
BasicBlock *loop_cond = BasicBlock::Create(
|
||||
context, block_name, InsertCondBefore->getParent(), InsertCondBefore);
|
||||
IRBuilder<> builder(context);
|
||||
builder.SetInsertPoint(loop_cond);
|
||||
|
||||
llvm::Value *cmpResult = NULL;
|
||||
if (!intra_warp_loop) {
|
||||
auto inter_warp_index = M->getGlobalVariable("inter_warp_index");
|
||||
auto block_size = M->getGlobalVariable("block_size");
|
||||
auto warp_cnt =
|
||||
builder.CreateBinOp(Instruction::SDiv, builder.CreateLoad(block_size),
|
||||
ConstantInt::get(I32, 32), "warp_number");
|
||||
|
||||
cmpResult =
|
||||
builder.CreateICmpULT(builder.CreateLoad(inter_warp_index), warp_cnt);
|
||||
} else {
|
||||
auto intra_warp_index = M->getGlobalVariable("intra_warp_index");
|
||||
auto block_size = M->getGlobalVariable("block_size");
|
||||
if (!need_nested_loop) {
|
||||
cmpResult = builder.CreateICmpULT(builder.CreateLoad(intra_warp_index),
|
||||
builder.CreateLoad(block_size));
|
||||
} else {
|
||||
cmpResult = builder.CreateICmpULT(builder.CreateLoad(intra_warp_index),
|
||||
ConstantInt::get(I32, 32));
|
||||
}
|
||||
}
|
||||
builder.CreateCondBr(cmpResult, InsertCondBefore, LoopEnd);
|
||||
return loop_cond;
|
||||
}
|
||||
|
||||
BasicBlock *insert_loop_inc(llvm::BasicBlock *InsertIncBefore,
|
||||
bool intra_warp_loop) {
|
||||
llvm::Module *M = InsertIncBefore->getParent()->getParent();
|
||||
LLVMContext &context = M->getContext();
|
||||
auto I32 = llvm::Type::getInt32Ty(context);
|
||||
std::string block_name =
|
||||
(intra_warp_loop) ? "intra_warp_inc" : "inter_warp_inc";
|
||||
BasicBlock *loop_inc = BasicBlock::Create(
|
||||
context, block_name, InsertIncBefore->getParent(), InsertIncBefore);
|
||||
IRBuilder<> builder(context);
|
||||
builder.SetInsertPoint(loop_inc);
|
||||
if (intra_warp_loop) { // intra warp
|
||||
auto intra_warp_index = M->getGlobalVariable("intra_warp_index");
|
||||
auto new_index = builder.CreateBinOp(
|
||||
Instruction::Add, builder.CreateLoad(intra_warp_index),
|
||||
ConstantInt::get(I32, 1), "intra_warp_index_increment");
|
||||
builder.CreateStore(new_index, intra_warp_index);
|
||||
} else { // inter warp
|
||||
auto inter_warp_index = M->getGlobalVariable("inter_warp_index");
|
||||
auto new_index = builder.CreateBinOp(
|
||||
Instruction::Add, builder.CreateLoad(inter_warp_index),
|
||||
ConstantInt::get(I32, 1), "inter_warp_index_increment");
|
||||
builder.CreateStore(new_index, inter_warp_index);
|
||||
}
|
||||
builder.CreateBr(InsertIncBefore);
|
||||
return loop_inc;
|
||||
}
|
||||
|
||||
void add_warp_loop(std::vector<ParallelRegion> parallel_regions,
|
||||
bool intra_warp_loop) {
|
||||
for (auto region : parallel_regions) {
|
||||
auto start_block = region.start_block;
|
||||
auto tail_block = region.end_block;
|
||||
auto next_block = region.successor_block;
|
||||
|
||||
auto loop_cond = insert_loop_cond(start_block, next_block, intra_warp_loop);
|
||||
auto loop_init = insert_loop_init(loop_cond, intra_warp_loop);
|
||||
|
||||
auto F = start_block->getParent();
|
||||
for (Function::iterator i = F->begin(); i != F->end(); ++i) {
|
||||
llvm::BasicBlock *bb = &(*i);
|
||||
if (bb == loop_cond)
|
||||
continue;
|
||||
bb->getTerminator()->replaceUsesOfWith(start_block, loop_init);
|
||||
}
|
||||
auto loop_inc = insert_loop_inc(loop_cond, intra_warp_loop);
|
||||
tail_block->getTerminator()->replaceUsesOfWith(next_block, loop_inc);
|
||||
// we have to reset inter/intra warp index to 0, as these maybe used
|
||||
// outside PR when there are conditional loop/branch
|
||||
llvm::Module *M = start_block->getParent()->getParent();
|
||||
LLVMContext &context = M->getContext();
|
||||
auto I32 = llvm::Type::getInt32Ty(context);
|
||||
BasicBlock *reset_index = BasicBlock::Create(start_block->getContext(),
|
||||
"reset_block", F, next_block);
|
||||
IRBuilder<> builder(start_block->getContext());
|
||||
builder.SetInsertPoint(reset_index);
|
||||
if (intra_warp_loop) { // intra warp
|
||||
auto intra_warp_index = M->getGlobalVariable("intra_warp_index");
|
||||
builder.CreateStore(ConstantInt::get(I32, 0), intra_warp_index);
|
||||
} else { // inter warp
|
||||
auto inter_warp_index = M->getGlobalVariable("inter_warp_index");
|
||||
builder.CreateStore(ConstantInt::get(I32, 0), inter_warp_index);
|
||||
}
|
||||
builder.CreateBr(next_block);
|
||||
loop_cond->getTerminator()->replaceUsesOfWith(next_block, reset_index);
|
||||
// add metadata
|
||||
MDNode *Dummy =
|
||||
MDNode::getTemporary(context, ArrayRef<Metadata *>()).release();
|
||||
MDNode *AccessGroupMD = MDNode::getDistinct(context, {});
|
||||
MDNode *ParallelAccessMD = MDNode::get(
|
||||
context,
|
||||
{MDString::get(context, "llvm.loop.parallel_accesses"), AccessGroupMD});
|
||||
MDNode *Root = MDNode::get(context, {Dummy, ParallelAccessMD});
|
||||
|
||||
Root->replaceOperandWith(0, Root);
|
||||
MDNode::deleteTemporary(Dummy);
|
||||
// We now have
|
||||
// !1 = metadata !{metadata !1} <- self-referential root
|
||||
loop_cond->getTerminator()->setMetadata("llvm.loop", Root);
|
||||
|
||||
for (auto bb : region.wrapped_block) {
|
||||
for (BasicBlock::iterator ii = bb->begin(), ee = bb->end(); ii != ee;
|
||||
ii++) {
|
||||
if (!ii->mayReadOrWriteMemory()) {
|
||||
continue;
|
||||
}
|
||||
MDNode *NewMD = MDNode::get(bb->getContext(), AccessGroupMD);
|
||||
MDNode *OldMD = ii->getMetadata("llvm.mem.parallel_loop_access");
|
||||
if (OldMD != nullptr) {
|
||||
NewMD = llvm::MDNode::concatenate(OldMD, NewMD);
|
||||
}
|
||||
ii->setMetadata("llvm.mem.parallel_loop_access", NewMD);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void print_parallel_region(std::vector<ParallelRegion> parallel_regions) {
|
||||
printf("get PR:\n");
|
||||
for (auto region : parallel_regions) {
|
||||
auto start = region.start_block;
|
||||
auto end = region.end_block;
|
||||
auto next = region.successor_block;
|
||||
printf("parallel region: %s->%s next: %s\n", start->getName().str().c_str(),
|
||||
end->getName().str().c_str(), next->getName().str().c_str());
|
||||
printf("have: \n");
|
||||
for (auto b : region.wrapped_block) {
|
||||
printf("%s\n", b->getName().str().c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void remove_barrier(llvm::Function *F, bool intra_warp_loop) {
|
||||
std::vector<Instruction *> need_remove;
|
||||
for (auto BB = F->begin(); BB != F->end(); ++BB) {
|
||||
for (auto BI = BB->begin(); BI != BB->end(); BI++) {
|
||||
if (auto Call = dyn_cast<CallInst>(BI)) {
|
||||
auto func_name = Call->getCalledFunction()->getName().str();
|
||||
if (func_name == "llvm.nvvm.bar.warp.sync") {
|
||||
need_remove.push_back(Call);
|
||||
}
|
||||
if (!intra_warp_loop && (func_name == "llvm.nvvm.barrier0" ||
|
||||
func_name == "llvm.nvvm.barrier.sync")) {
|
||||
need_remove.push_back(Call);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (auto inst : need_remove) {
|
||||
inst->eraseFromParent();
|
||||
}
|
||||
}
|
||||
|
||||
class InsertWarpLoopPass : public llvm::FunctionPass {
|
||||
|
||||
public:
|
||||
static char ID;
|
||||
bool intra_warp_loop;
|
||||
DominatorTree *DT;
|
||||
PostDominatorTree *PDT;
|
||||
|
||||
InsertWarpLoopPass(bool intra_warp = 0)
|
||||
: FunctionPass(ID), intra_warp_loop(intra_warp) {}
|
||||
|
||||
virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const {
|
||||
AU.addRequired<DominatorTreeWrapperPass>();
|
||||
AU.addRequired<PostDominatorTreeWrapperPass>();
|
||||
}
|
||||
|
||||
void getParallelRegionBefore(llvm::BasicBlock *B, bool intra_warp_loop,
|
||||
std::vector<ParallelRegion> ¶llel_regions) {
|
||||
ParallelRegion current_region;
|
||||
|
||||
SmallVector<BasicBlock *, 4> pending_blocks;
|
||||
BasicBlock *region_entry_barrier = NULL;
|
||||
BasicBlock *entry = NULL;
|
||||
BasicBlock *exit = B->getSinglePredecessor();
|
||||
for (BasicBlock *Pred : predecessors(B)) {
|
||||
pending_blocks.push_back(Pred);
|
||||
}
|
||||
if (pending_blocks.size() > 1) {
|
||||
// becuase we have insert the sync and split by them,
|
||||
// so if B has several income edges, it must be a merge point
|
||||
// for a conditional if. We can safely ignore it
|
||||
// TODO: we have to further check whether this conditional if
|
||||
// is for inter warp or intra warp
|
||||
return;
|
||||
}
|
||||
|
||||
while (!pending_blocks.empty()) {
|
||||
BasicBlock *current = pending_blocks.back();
|
||||
pending_blocks.pop_back();
|
||||
|
||||
// avoid infinite recursion of loops
|
||||
if (current_region.wrapped_block.count(current) != 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// If we reach another barrier this must be the
|
||||
// parallel region entry.
|
||||
bool has_barrier = 0;
|
||||
for (auto i = current->begin(), e = current->end(); i != e; ++i) {
|
||||
if (llvm::CallInst *call_inst = llvm::dyn_cast<llvm::CallInst>(&(*i))) {
|
||||
auto func_name = call_inst->getCalledFunction()->getName().str();
|
||||
if (func_name == "llvm.nvvm.barrier0" ||
|
||||
func_name == "llvm.nvvm.barrier.sync")
|
||||
has_barrier = 1;
|
||||
if (func_name == "llvm.nvvm.bar.warp.sync" && intra_warp_loop)
|
||||
has_barrier = 1;
|
||||
}
|
||||
}
|
||||
|
||||
// if we reach a block which only has a single condtional branch,
|
||||
// it is the start point of a B-condition, we have to stop here
|
||||
bool is_single_conditional_branch_block = 0;
|
||||
if (auto br = dyn_cast<llvm::BranchInst>(current->getTerminator())) {
|
||||
if (br->isConditional()) {
|
||||
if (current->size() == 1) {
|
||||
is_single_conditional_branch_block = 1;
|
||||
} else {
|
||||
// generate by replicate local variable
|
||||
printf(
|
||||
"[WARNING] match single conditional branch with HARD CODE\n");
|
||||
bool branch_to_intra_init = false;
|
||||
for (unsigned suc = 0; suc < br->getNumSuccessors(); ++suc) {
|
||||
llvm::BasicBlock *entryCandidate = br->getSuccessor(suc);
|
||||
auto block_name = entryCandidate->getName().str();
|
||||
if (find_block_barrier_in_region(current, B)) {
|
||||
if (block_name.find("warp_init") != block_name.npos) {
|
||||
is_single_conditional_branch_block = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (has_barrier || is_single_conditional_branch_block) {
|
||||
if (region_entry_barrier == NULL)
|
||||
region_entry_barrier = current;
|
||||
else if (region_entry_barrier != current) {
|
||||
// this means there is not PR before B, just return
|
||||
return;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// Non-barrier block, this must be on the region.
|
||||
current_region.wrapped_block.insert(current);
|
||||
|
||||
// Add predecessors to pending queue.
|
||||
for (BasicBlock *Pred : predecessors(current)) {
|
||||
pending_blocks.push_back(Pred);
|
||||
}
|
||||
}
|
||||
|
||||
if (current_region.wrapped_block.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
// if do not find entry node, this means all predecessor
|
||||
// blocks do not need to execute multiply times
|
||||
if (region_entry_barrier == NULL) {
|
||||
return;
|
||||
}
|
||||
// Find the entry node.
|
||||
assert(region_entry_barrier != NULL);
|
||||
for (unsigned
|
||||
suc = 0,
|
||||
num = region_entry_barrier->getTerminator()->getNumSuccessors();
|
||||
suc < num; ++suc) {
|
||||
llvm::BasicBlock *entryCandidate =
|
||||
region_entry_barrier->getTerminator()->getSuccessor(suc);
|
||||
if (current_region.wrapped_block.count(entryCandidate) == 0)
|
||||
continue;
|
||||
entry = entryCandidate;
|
||||
break;
|
||||
}
|
||||
// delete useless PR, those PRs only have branch
|
||||
if (entry == exit) {
|
||||
if (entry->size() == 1 && isa<llvm::BranchInst>(entry->begin())) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
bool is_useless = true;
|
||||
auto iter = entry;
|
||||
do {
|
||||
if (iter->size() != 1 || !isa<llvm::BranchInst>(entry->begin())) {
|
||||
is_useless = false;
|
||||
break;
|
||||
}
|
||||
if (iter->getTerminator()->getNumSuccessors() > 1) {
|
||||
is_useless = false;
|
||||
break;
|
||||
}
|
||||
iter = iter->getTerminator()->getSuccessor(0);
|
||||
} while (iter != exit);
|
||||
if (is_useless) {
|
||||
return;
|
||||
}
|
||||
assert(current_region.wrapped_block.count(entry) != 0);
|
||||
current_region.start_block = entry;
|
||||
current_region.end_block = exit;
|
||||
current_region.successor_block = B;
|
||||
parallel_regions.push_back(current_region);
|
||||
}
|
||||
|
||||
std::vector<ParallelRegion> getParallelRegions(llvm::Function *F,
|
||||
bool intra_warp_loop) {
|
||||
std::vector<ParallelRegion> parallel_regions;
|
||||
|
||||
SmallVector<BasicBlock *, 4> exit_blocks;
|
||||
for (Function::iterator s = F->begin(); s != F->end(); s++) {
|
||||
if (llvm::CallInst *call_inst =
|
||||
llvm::dyn_cast<llvm::CallInst>(s->begin())) {
|
||||
auto func_name = call_inst->getCalledFunction()->getName().str();
|
||||
if (func_name == "llvm.nvvm.barrier0" ||
|
||||
func_name == "llvm.nvvm.barrier.sync") {
|
||||
exit_blocks.push_back(&(*s));
|
||||
}
|
||||
// when handling intra warp loop, we need also split the blocks
|
||||
// between warp barrier
|
||||
if (intra_warp_loop && func_name == "llvm.nvvm.bar.warp.sync") {
|
||||
exit_blocks.push_back(&(*s));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// First find all the ParallelRegions in the Function.
|
||||
while (!exit_blocks.empty()) {
|
||||
BasicBlock *exit = exit_blocks.back();
|
||||
exit_blocks.pop_back();
|
||||
getParallelRegionBefore(exit, intra_warp_loop, parallel_regions);
|
||||
}
|
||||
return parallel_regions;
|
||||
}
|
||||
|
||||
virtual bool runOnFunction(Function &F) {
|
||||
if (!isKernelFunction(F.getParent(), &F))
|
||||
return 0;
|
||||
|
||||
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
|
||||
PDT = &getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
|
||||
|
||||
// find parallel region we need to wrap
|
||||
auto parallel_regions = getParallelRegions(&F, intra_warp_loop);
|
||||
assert(!parallel_regions.empty() && "can not find any parallel regions\n");
|
||||
// print_parallel_region(parallel_regions);
|
||||
add_warp_loop(parallel_regions, intra_warp_loop);
|
||||
|
||||
if (intra_warp_loop) {
|
||||
handle_local_variable_intra_warp(parallel_regions);
|
||||
}
|
||||
remove_barrier(&F, intra_warp_loop);
|
||||
return 1;
|
||||
}
|
||||
};
|
||||
|
||||
char InsertWarpLoopPass::ID = 0;
|
||||
|
||||
namespace {
|
||||
static RegisterPass<InsertWarpLoopPass> X("insert-warp-loop",
|
||||
"Insert inter/intra warp loop");
|
||||
} // namespace
|
||||
|
||||
bool has_warp_barrier(llvm::Module *M) {
|
||||
for (auto F = M->begin(); F != M->end(); ++F)
|
||||
for (auto BB = F->begin(); BB != F->end(); ++BB) {
|
||||
for (auto BI = BB->begin(); BI != BB->end(); BI++) {
|
||||
if (auto Call = dyn_cast<CallInst>(BI)) {
|
||||
auto func_name = Call->getCalledFunction()->getName().str();
|
||||
if (func_name == "llvm.nvvm.bar.warp.sync") {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void insert_warp_loop(llvm::Module *M) {
|
||||
llvm::legacy::PassManager Passes;
|
||||
need_nested_loop = has_warp_barrier(M);
|
||||
// use nested loop only when there are warp-level barrier
|
||||
if (need_nested_loop) {
|
||||
bool intra_warp = true;
|
||||
Passes.add(new InsertWarpLoopPass(intra_warp));
|
||||
// insert inter warp loop
|
||||
Passes.add(new InsertWarpLoopPass(!intra_warp));
|
||||
Passes.run(*M);
|
||||
} else {
|
||||
bool intra_warp = true;
|
||||
// only need a single loop, with size=block_size
|
||||
Passes.add(new InsertWarpLoopPass(intra_warp));
|
||||
Passes.run(*M);
|
||||
// remove all barriers
|
||||
for (auto F = M->begin(); F != M->end(); ++F)
|
||||
remove_barrier(dyn_cast<llvm::Function>(F), false);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,126 @@
|
|||
#include "memory_hierarchy.h"
|
||||
#include "llvm/IR/CFG.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/GlobalValue.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/LLVMContext.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/Transforms/Utils/Cloning.h"
|
||||
#include "llvm/Transforms/Utils/ValueMapper.h"
|
||||
#include <assert.h>
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <sstream>
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
|
||||
void mem_share2global(llvm::Module *M) {
|
||||
LLVMContext *C = &M->getContext();
|
||||
llvm::Type *Int32T = Type::getInt32Ty(*C);
|
||||
llvm::Type *Int64T = Type::getInt64Ty(*C);
|
||||
llvm::Type *Int8T = Type::getInt8Ty(*C);
|
||||
|
||||
std::map<GlobalVariable *, GlobalVariable *> corresponding_global_memory;
|
||||
std::set<llvm::Instruction *> need_remove;
|
||||
std::set<GlobalVariable *> need_remove_share_memory;
|
||||
|
||||
// find all share memory and generate corresponding global memory
|
||||
for (auto I = M->global_begin(), E = M->global_end(); I != E; ++I) {
|
||||
if (GlobalVariable *share_memory = dyn_cast<GlobalVariable>(I)) {
|
||||
if (auto PT = dyn_cast<PointerType>(I->getType())) {
|
||||
unsigned AS = PT->getAddressSpace();
|
||||
if (AS == 3) { // find a share memory
|
||||
need_remove_share_memory.insert(share_memory);
|
||||
// generate the corresponding global memory variable
|
||||
auto new_name = "wrapper_global_" + share_memory->getName().str();
|
||||
auto element_type = PT->getElementType();
|
||||
if (auto array_type = dyn_cast<ArrayType>(element_type)) {
|
||||
llvm::GlobalVariable *global_memory = new llvm::GlobalVariable(
|
||||
*M, array_type, false, llvm::GlobalValue::ExternalLinkage, NULL,
|
||||
new_name, NULL, llvm::GlobalValue::GeneralDynamicTLSModel, 1);
|
||||
ConstantAggregateZero *const_array =
|
||||
ConstantAggregateZero::get(array_type);
|
||||
global_memory->setInitializer(const_array);
|
||||
corresponding_global_memory.insert(
|
||||
std::pair<GlobalVariable *, GlobalVariable *>(share_memory,
|
||||
global_memory));
|
||||
} else if (auto int_type = dyn_cast<IntegerType>(element_type)) {
|
||||
auto zero = llvm::ConstantInt::get(int_type, 0, true);
|
||||
llvm::GlobalVariable *global_memory = new llvm::GlobalVariable(
|
||||
*M, int_type, false, llvm::GlobalValue::ExternalLinkage, zero,
|
||||
new_name, NULL, llvm::GlobalValue::GeneralDynamicTLSModel, 0,
|
||||
false);
|
||||
corresponding_global_memory.insert(
|
||||
std::pair<GlobalVariable *, GlobalVariable *>(share_memory,
|
||||
global_memory));
|
||||
} else {
|
||||
assert(0 && "The required Share Memory Type is not supported\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||
Function *F = &(*i);
|
||||
for (Function::iterator i = F->begin(), e = F->end(); i != e; ++i) {
|
||||
BasicBlock *b = &*i;
|
||||
for (BasicBlock::iterator i = b->begin(), e = b->end(); i != e; ++i) {
|
||||
if (auto get_element_ptr = dyn_cast<llvm::GetElementPtrInst>(i)) {
|
||||
auto read_array = get_element_ptr->getPointerOperand();
|
||||
if (GlobalVariable *read_share_memory =
|
||||
dyn_cast<llvm::GlobalVariable>(read_array)) {
|
||||
// find a GetElementPtr which read share memory
|
||||
if (corresponding_global_memory.find(read_share_memory) !=
|
||||
corresponding_global_memory.end()) {
|
||||
std::vector<Value *> Indices;
|
||||
for (int i = 0; i < get_element_ptr->getNumIndices(); i++)
|
||||
Indices.push_back(get_element_ptr->getOperand(i + 1));
|
||||
|
||||
auto new_GEP = GetElementPtrInst::Create(
|
||||
NULL, // Pointee type
|
||||
corresponding_global_memory.find(read_share_memory)
|
||||
->second, // Alloca
|
||||
Indices, // Indices
|
||||
"", get_element_ptr);
|
||||
// replace all get_element_ptr with new_GEP:
|
||||
// we can not directly use:
|
||||
// get_element_ptr->replaceAllUsesWith(new_GEP);
|
||||
// as get_element_ptr and new_GEP have different return type
|
||||
llvm::Type *original_type = get_element_ptr->getType();
|
||||
auto FormatASC = CastInst::CreatePointerBitCastOrAddrSpaceCast(
|
||||
new_GEP, original_type, "", get_element_ptr);
|
||||
get_element_ptr->replaceAllUsesWith(FormatASC);
|
||||
need_remove.insert(get_element_ptr);
|
||||
}
|
||||
}
|
||||
} else if (auto addr_cast = dyn_cast<llvm::CastInst>(i)) {
|
||||
auto read_array = addr_cast->getOperand(0);
|
||||
if (GlobalVariable *read_share_memory =
|
||||
dyn_cast<llvm::GlobalVariable>(read_array)) {
|
||||
// find a GetElementPtr which read share memory
|
||||
if (corresponding_global_memory.find(read_share_memory) !=
|
||||
corresponding_global_memory.end()) {
|
||||
llvm::Type *original_type = addr_cast->getType();
|
||||
auto FormatASC = CastInst::CreatePointerBitCastOrAddrSpaceCast(
|
||||
corresponding_global_memory.find(read_share_memory)->second,
|
||||
original_type, "", addr_cast);
|
||||
addr_cast->replaceAllUsesWith(FormatASC);
|
||||
need_remove.insert(addr_cast);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (auto i : need_remove) {
|
||||
i->dropAllReferences();
|
||||
i->eraseFromParent();
|
||||
}
|
||||
for (auto i : need_remove_share_memory) {
|
||||
i->dropAllReferences();
|
||||
i->eraseFromParent();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,88 @@
|
|||
#include "performance.h"
|
||||
#include "llvm/ADT/Statistic.h"
|
||||
#include "llvm/ADT/StringRef.h"
|
||||
#include "llvm/ADT/Triple.h"
|
||||
#include "llvm/Analysis/LoopInfo.h"
|
||||
#include "llvm/Analysis/LoopPass.h"
|
||||
#include "llvm/Analysis/PostDominators.h"
|
||||
#include "llvm/Analysis/TargetLibraryInfo.h"
|
||||
#include "llvm/Analysis/TargetTransformInfo.h"
|
||||
#include "llvm/CodeGen/MachineModuleInfo.h"
|
||||
#include "llvm/IR/CFG.h"
|
||||
#include "llvm/IR/DataLayout.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/GlobalValue.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/InlineAsm.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/LLVMContext.h"
|
||||
#include "llvm/IR/LegacyPassManager.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/IR/ValueSymbolTable.h"
|
||||
#include "llvm/InitializePasses.h"
|
||||
#include "llvm/PassInfo.h"
|
||||
#include "llvm/PassRegistry.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Support/TargetRegistry.h"
|
||||
#include "llvm/Target/TargetMachine.h"
|
||||
#include "llvm/Target/TargetOptions.h"
|
||||
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
|
||||
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
|
||||
#include "llvm/Transforms/Utils/Cloning.h"
|
||||
#include "llvm/Transforms/Utils/ValueMapper.h"
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <sstream>
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
void performance_optimization(llvm::Module *M) {
|
||||
for (auto F = M->begin(); F != M->end(); F++) {
|
||||
for (auto I = F->arg_begin(); I != F->arg_end(); ++I) {
|
||||
if (I->getType()->isPointerTy()) {
|
||||
I->addAttr(llvm::Attribute::NoAlias);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
llvm::legacy::PassManager Passes;
|
||||
|
||||
// add target machine info
|
||||
llvm::Triple triple("x86_64-unknown-linux-gnu");
|
||||
|
||||
std::string Error;
|
||||
const Target *TheTarget = TargetRegistry::lookupTarget("", triple, Error);
|
||||
if (!TheTarget) {
|
||||
printf("Error: %s\n", Error.c_str());
|
||||
assert(0);
|
||||
}
|
||||
llvm::TargetOptions Options;
|
||||
Options.FloatABIType = FloatABI::Hard;
|
||||
|
||||
TargetMachine *TM = TheTarget->createTargetMachine(
|
||||
triple.getTriple(), llvm::sys::getHostCPUName().str(), StringRef("+m,+f"),
|
||||
Options, Reloc::PIC_, CodeModel::Small, CodeGenOpt::Aggressive);
|
||||
assert(TM && "No Machine Information\n");
|
||||
|
||||
Passes.add(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis()));
|
||||
|
||||
TargetLibraryInfoImpl TLII(triple);
|
||||
TLII.disableAllFunctions();
|
||||
Passes.add(new TargetLibraryInfoWrapperPass(TLII));
|
||||
|
||||
// Add O3 optimization
|
||||
llvm::PassManagerBuilder Builder;
|
||||
Builder.OptLevel = 3;
|
||||
Builder.SizeLevel = 0;
|
||||
|
||||
Builder.LoopVectorize = true;
|
||||
Builder.SLPVectorize = true;
|
||||
|
||||
Builder.VerifyInput = true;
|
||||
Builder.VerifyOutput = true;
|
||||
|
||||
Builder.populateModulePassManager(Passes);
|
||||
Passes.run(*M);
|
||||
}
|
|
@ -0,0 +1,480 @@
|
|||
#include "tool.h"
|
||||
#include "llvm/Bitcode/BitcodeWriter.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/GlobalValue.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/InlineAsm.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/LLVMContext.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/IR/Verifier.h"
|
||||
#include "llvm/IRReader/IRReader.h"
|
||||
#include "llvm/Support/ToolOutputFile.h"
|
||||
#include "llvm/Transforms/Utils/Cloning.h"
|
||||
#include "llvm/Transforms/Utils/ValueMapper.h"
|
||||
#include <iostream>
|
||||
#include <set>
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
llvm::Module *LoadModuleFromFilr(char *file_name) {
|
||||
llvm::SMDiagnostic Err;
|
||||
llvm::LLVMContext *globalContext = new llvm::LLVMContext;
|
||||
auto program = parseIRFile(file_name, Err, *globalContext).release();
|
||||
if (!program) {
|
||||
printf("error when opening the bitcode\n");
|
||||
exit(1);
|
||||
}
|
||||
return program;
|
||||
}
|
||||
|
||||
void VerifyModule(llvm::Module *program) {
|
||||
std::string msg;
|
||||
llvm::raw_string_ostream os(msg);
|
||||
if (llvm::verifyModule(*program, &(llvm::errs())))
|
||||
llvm::report_fatal_error(os.str().c_str());
|
||||
}
|
||||
|
||||
void DumpModule(llvm::Module *M, char *file_name) {
|
||||
// modify the program, add a wrapper
|
||||
std::string msg;
|
||||
llvm::raw_string_ostream os(msg);
|
||||
std::error_code EC;
|
||||
ToolOutputFile Out(file_name, EC, sys::fs::F_None);
|
||||
if (EC) {
|
||||
errs() << "Fails to open output file: " << EC.message();
|
||||
return;
|
||||
}
|
||||
WriteBitcodeToFile(*M, Out.os());
|
||||
Out.keep();
|
||||
}
|
||||
|
||||
bool isKernelFunction(llvm::Module *M, llvm::Function *F) {
|
||||
NamedMDNode *NamedMD = M->getNamedMetadata("nvvm.annotations");
|
||||
if (!NamedMD) {
|
||||
printf("there must be nvvm.annotations!\n");
|
||||
exit(1);
|
||||
}
|
||||
for (unsigned I = 0, E = NamedMD->getNumOperands(); I != E; ++I) {
|
||||
MDNode *MD = NamedMD->getOperand(I);
|
||||
if (!MD || MD->getNumOperands() == 0)
|
||||
continue;
|
||||
if (MD->getNumOperands() != 3)
|
||||
continue;
|
||||
Metadata *Op = MD->getOperand(1);
|
||||
if (auto Str = llvm::cast<MDString>(Op)) {
|
||||
if (Str->getString().str() != "kernel")
|
||||
continue;
|
||||
llvm::Value *meta =
|
||||
dyn_cast<llvm::ValueAsMetadata>(MD->getOperand(0))->getValue();
|
||||
Function *FF = llvm::cast<Function>(meta);
|
||||
if (FF->getName().str() == F->getName().str())
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void replace_block(llvm::Function *F, llvm::BasicBlock *before,
|
||||
llvm::BasicBlock *after) {
|
||||
for (Function::iterator i = F->begin(); i != F->end(); ++i) {
|
||||
llvm::BasicBlock *bb = &(*i);
|
||||
if (bb == after)
|
||||
continue;
|
||||
bb->getTerminator()->replaceUsesOfWith(before, after);
|
||||
}
|
||||
}
|
||||
|
||||
llvm::CallInst *CreateInterWarpBarrier(llvm::Instruction *InsertBefore) {
|
||||
llvm::Module *M = InsertBefore->getParent()->getParent()->getParent();
|
||||
|
||||
llvm::FunctionType *LauncherFuncT =
|
||||
FunctionType::get(llvm::Type::getVoidTy(M->getContext()), {}, false);
|
||||
|
||||
llvm::FunctionCallee f =
|
||||
M->getOrInsertFunction("llvm.nvvm.barrier0", LauncherFuncT);
|
||||
llvm::Function *F = llvm::cast<llvm::Function>(f.getCallee());
|
||||
return llvm::CallInst::Create(F, "", InsertBefore);
|
||||
}
|
||||
|
||||
llvm::CallInst *CreateIntraWarpBarrier(llvm::Instruction *InsertBefore) {
|
||||
llvm::Module *M = InsertBefore->getParent()->getParent()->getParent();
|
||||
llvm::FunctionType *LauncherFuncT =
|
||||
FunctionType::get(llvm::Type::getVoidTy(M->getContext()), {}, false);
|
||||
llvm::FunctionCallee f =
|
||||
M->getOrInsertFunction("llvm.nvvm.bar.warp.sync", LauncherFuncT);
|
||||
llvm::Function *F = llvm::cast<llvm::Function>(f.getCallee());
|
||||
return llvm::CallInst::Create(F, "", InsertBefore);
|
||||
}
|
||||
|
||||
llvm::Instruction *BreakPHIToAllocas(PHINode *phi) {
|
||||
|
||||
std::string allocaName = std::string(phi->getName().str()) + ".ex_phi";
|
||||
|
||||
llvm::Function *function = phi->getParent()->getParent();
|
||||
|
||||
IRBuilder<> builder(&*(function->getEntryBlock().getFirstInsertionPt()));
|
||||
|
||||
llvm::Instruction *alloca =
|
||||
builder.CreateAlloca(phi->getType(), 0, allocaName);
|
||||
|
||||
for (unsigned incoming = 0; incoming < phi->getNumIncomingValues();
|
||||
++incoming) {
|
||||
Value *val = phi->getIncomingValue(incoming);
|
||||
BasicBlock *incomingBB = phi->getIncomingBlock(incoming);
|
||||
builder.SetInsertPoint(incomingBB->getTerminator());
|
||||
llvm::Instruction *store = builder.CreateStore(val, alloca);
|
||||
}
|
||||
builder.SetInsertPoint(phi);
|
||||
|
||||
llvm::Instruction *loadedValue = builder.CreateLoad(alloca);
|
||||
phi->replaceAllUsesWith(loadedValue);
|
||||
phi->eraseFromParent();
|
||||
|
||||
return loadedValue;
|
||||
}
|
||||
|
||||
void phi2alloc(llvm::Module *M) {
|
||||
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||
Function *F = &(*i);
|
||||
auto func_name = F->getName().str();
|
||||
if (!isKernelFunction(M, F))
|
||||
continue;
|
||||
|
||||
typedef std::vector<llvm::Instruction *> InstructionVec;
|
||||
|
||||
InstructionVec PHIs;
|
||||
|
||||
for (Function::iterator bb = F->begin(); bb != F->end(); ++bb) {
|
||||
for (BasicBlock::iterator p = bb->begin(); p != bb->end(); ++p) {
|
||||
Instruction *instr = &*p;
|
||||
if (isa<PHINode>(instr)) {
|
||||
PHIs.push_back(instr);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool changed = false;
|
||||
for (InstructionVec::iterator i = PHIs.begin(); i != PHIs.end(); ++i) {
|
||||
Instruction *instr = *i;
|
||||
BreakPHIToAllocas(dyn_cast<PHINode>(instr));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void remove_cuda_built_in(llvm::Module *M) {
|
||||
// initialize function name
|
||||
std::set<std::string> useless_func_name;
|
||||
useless_func_name.insert("cudaMalloc");
|
||||
useless_func_name.insert("cudaFuncGetAttributes");
|
||||
useless_func_name.insert("cudaGetDevice");
|
||||
useless_func_name.insert("cudaDeviceGetAttribute");
|
||||
useless_func_name.insert("cudaOccupancyMaxActiveBlocksPerMultiprocessor");
|
||||
useless_func_name.insert(
|
||||
"cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags");
|
||||
|
||||
std::set<llvm::Function *> need_remove;
|
||||
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||
Function *F = &(*i);
|
||||
auto func_name = F->getName().str();
|
||||
if (useless_func_name.find(func_name) != useless_func_name.end()) {
|
||||
need_remove.insert(F);
|
||||
}
|
||||
}
|
||||
for (auto F : need_remove) {
|
||||
F->dropAllReferences();
|
||||
F->eraseFromParent();
|
||||
}
|
||||
}
|
||||
|
||||
void replace_built_in_function(llvm::Module *M, int *grid_dim, int *block_dim) {
|
||||
LLVMContext &context = M->getContext();
|
||||
auto I32 = llvm::Type::getInt32Ty(context);
|
||||
std::vector<llvm::Instruction *> need_remove;
|
||||
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||
Function *F = &(*i);
|
||||
auto func_name = F->getName().str();
|
||||
if (!isKernelFunction(M, F))
|
||||
continue;
|
||||
|
||||
IRBuilder<> builder(&*(F->getEntryBlock().getFirstInsertionPt()));
|
||||
auto global_intra_warp_idx =
|
||||
F->getParent()->getGlobalVariable("intra_warp_index");
|
||||
auto local_intra_warp_idx =
|
||||
builder.CreateAlloca(global_intra_warp_idx->getType()->getElementType(),
|
||||
0, "local_intra_warp_idx");
|
||||
global_intra_warp_idx->replaceAllUsesWith(local_intra_warp_idx);
|
||||
auto global_inter_warp_idx =
|
||||
F->getParent()->getGlobalVariable("inter_warp_index");
|
||||
auto local_inter_warp_idx =
|
||||
builder.CreateAlloca(global_inter_warp_idx->getType()->getElementType(),
|
||||
0, "local_inter_warp_idx");
|
||||
global_inter_warp_idx->replaceAllUsesWith(local_inter_warp_idx);
|
||||
|
||||
for (auto BB = F->begin(); BB != F->end(); ++BB) {
|
||||
for (auto BI = BB->begin(); BI != BB->end(); BI++) {
|
||||
if (auto Load = dyn_cast<LoadInst>(BI)) {
|
||||
auto load_from = Load->getOperand(0);
|
||||
if (load_from == F->getParent()->getGlobalVariable("block_size")) {
|
||||
Load->replaceAllUsesWith(ConstantInt::get(
|
||||
I32, block_dim[0] * block_dim[1] * block_dim[2]));
|
||||
need_remove.push_back(Load);
|
||||
}
|
||||
} else if (auto Call = dyn_cast<CallInst>(BI)) {
|
||||
if (Call->getCalledFunction()) {
|
||||
auto func_name = Call->getCalledFunction()->getName().str();
|
||||
if (func_name == "llvm.nvvm.read.ptx.sreg.tid.x") {
|
||||
// replace it by warp_id
|
||||
IRBuilder<> builder(context);
|
||||
builder.SetInsertPoint(Call);
|
||||
|
||||
auto thread_idx = builder.CreateBinOp(
|
||||
Instruction::Mul, builder.CreateLoad(local_inter_warp_idx),
|
||||
ConstantInt::get(I32, 32), "");
|
||||
thread_idx = builder.CreateBinOp(
|
||||
Instruction::Add, builder.CreateLoad(local_intra_warp_idx),
|
||||
thread_idx, "thread_idx");
|
||||
if (block_dim[1] != 1 || block_dim[2] != 1) {
|
||||
printf("block y: %d block z: %d\n", block_dim[1], block_dim[2]);
|
||||
thread_idx = builder.CreateBinOp(
|
||||
Instruction::SRem, thread_idx,
|
||||
ConstantInt::get(I32, block_dim[0]), "thread_id_x");
|
||||
}
|
||||
|
||||
Call->replaceAllUsesWith(thread_idx);
|
||||
need_remove.push_back(Call);
|
||||
} else if (func_name == "llvm.nvvm.read.ptx.sreg.tid.y") {
|
||||
// replace it by warp_id
|
||||
IRBuilder<> builder(context);
|
||||
builder.SetInsertPoint(Call);
|
||||
|
||||
auto thread_idx = builder.CreateBinOp(
|
||||
Instruction::Mul, builder.CreateLoad(local_inter_warp_idx),
|
||||
ConstantInt::get(I32, 32), "");
|
||||
thread_idx = builder.CreateBinOp(
|
||||
Instruction::Add, builder.CreateLoad(local_intra_warp_idx),
|
||||
thread_idx, "thread_idx");
|
||||
// tidy = tid / block_dim.x
|
||||
thread_idx = builder.CreateBinOp(
|
||||
Instruction::SDiv, thread_idx,
|
||||
ConstantInt::get(I32, block_dim[0]),
|
||||
// builder.CreateLoad(M->getGlobalVariable("block_size_x")),
|
||||
"thread_id_y");
|
||||
|
||||
Call->replaceAllUsesWith(thread_idx);
|
||||
need_remove.push_back(Call);
|
||||
} else if (func_name == "llvm.nvvm.read.ptx.sreg.tid.z") {
|
||||
printf("[WARNING] We DO NOT support multi-dim block\n");
|
||||
auto zero = ConstantInt::get(I32, 0);
|
||||
Call->replaceAllUsesWith(zero);
|
||||
need_remove.push_back(Call);
|
||||
} else if (func_name == "llvm.nvvm.read.ptx.sreg.ctaid.x") {
|
||||
auto block_index_addr = M->getGlobalVariable("block_index");
|
||||
IRBuilder<> builder(context);
|
||||
builder.SetInsertPoint(Call);
|
||||
auto block_idx = builder.CreateLoad(block_index_addr);
|
||||
Call->replaceAllUsesWith(block_idx);
|
||||
need_remove.push_back(Call);
|
||||
} else if (func_name == "llvm.nvvm.read.ptx.sreg.ctaid.y" ||
|
||||
func_name == "llvm.nvvm.read.ptx.sreg.ctaid.z") {
|
||||
printf("[WARNING We DO NOT support multi-dim grid\n");
|
||||
auto zero = ConstantInt::get(I32, 0);
|
||||
Call->replaceAllUsesWith(zero);
|
||||
need_remove.push_back(Call);
|
||||
} else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.x") {
|
||||
auto block_size_addr = M->getGlobalVariable("block_size_x");
|
||||
IRBuilder<> builder(context);
|
||||
builder.SetInsertPoint(Call);
|
||||
auto block_size = ConstantInt::get(I32, block_dim[0]);
|
||||
Call->replaceAllUsesWith(block_size);
|
||||
need_remove.push_back(Call);
|
||||
} else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.y") {
|
||||
auto block_size_addr = M->getGlobalVariable("block_size_y");
|
||||
IRBuilder<> builder(context);
|
||||
builder.SetInsertPoint(Call);
|
||||
auto block_size = ConstantInt::get(I32, block_dim[1]);
|
||||
Call->replaceAllUsesWith(block_size);
|
||||
need_remove.push_back(Call);
|
||||
} else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.z") {
|
||||
auto block_size_addr = M->getGlobalVariable("block_size_z");
|
||||
IRBuilder<> builder(context);
|
||||
builder.SetInsertPoint(Call);
|
||||
auto block_size = ConstantInt::get(I32, block_dim[2]);
|
||||
Call->replaceAllUsesWith(block_size);
|
||||
need_remove.push_back(Call);
|
||||
} else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.x") {
|
||||
auto grid_size_addr = M->getGlobalVariable("grid_size");
|
||||
IRBuilder<> builder(context);
|
||||
builder.SetInsertPoint(Call);
|
||||
auto grid_size = ConstantInt::get(I32, grid_dim[0]);
|
||||
Call->replaceAllUsesWith(grid_size);
|
||||
need_remove.push_back(Call);
|
||||
} else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.y" ||
|
||||
func_name == "llvm.nvvm.read.ptx.sreg.nctaid.z") {
|
||||
printf("[WARNING We DO NOT support multi-dim grid\n");
|
||||
auto one = ConstantInt::get(I32, 1);
|
||||
Call->replaceAllUsesWith(one);
|
||||
need_remove.push_back(Call);
|
||||
}
|
||||
}
|
||||
if (Call->isInlineAsm()) {
|
||||
auto asm_inst = dyn_cast<InlineAsm>(Call->getCalledOperand());
|
||||
if (asm_inst->getAsmString() != "mov.u32 $0, %laneid;") {
|
||||
printf("unknown InlineAsm\n");
|
||||
exit(1);
|
||||
}
|
||||
// return the rank within the warp
|
||||
IRBuilder<> builder(context);
|
||||
builder.SetInsertPoint(Call);
|
||||
auto intra_warp_index = builder.CreateLoad(local_intra_warp_idx);
|
||||
Call->replaceAllUsesWith(intra_warp_index);
|
||||
need_remove.push_back(Call);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (auto inst : need_remove) {
|
||||
inst->eraseFromParent();
|
||||
}
|
||||
}
|
||||
|
||||
void replace_asm_call(llvm::Module *M) {
|
||||
LLVMContext &context = M->getContext();
|
||||
auto I32 = llvm::Type::getInt32Ty(context);
|
||||
std::vector<CallInst *> need_remove;
|
||||
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||
Function *F = &(*i);
|
||||
auto func_name = F->getName().str();
|
||||
if (!isKernelFunction(M, F))
|
||||
continue;
|
||||
|
||||
for (auto BB = F->begin(); BB != F->end(); ++BB) {
|
||||
for (auto BI = BB->begin(); BI != BB->end(); BI++) {
|
||||
if (auto Call = dyn_cast<CallInst>(BI)) {
|
||||
if (Call->isInlineAsm()) {
|
||||
auto asm_inst = dyn_cast<InlineAsm>(Call->getCalledOperand());
|
||||
if (asm_inst->getAsmString() != "mov.u32 $0, %laneid;") {
|
||||
printf("unknown InlineAsm\n");
|
||||
exit(1);
|
||||
}
|
||||
// return the rank within the warp
|
||||
IRBuilder<> builder(context);
|
||||
builder.SetInsertPoint(Call);
|
||||
auto intra_warp_index_addr =
|
||||
M->getGlobalVariable("intra_warp_index");
|
||||
auto intra_warp_index = builder.CreateLoad(intra_warp_index_addr);
|
||||
Call->replaceAllUsesWith(intra_warp_index);
|
||||
need_remove.push_back(Call);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (auto inst : need_remove) {
|
||||
inst->eraseFromParent();
|
||||
}
|
||||
}
|
||||
|
||||
bool has_warp_barrier(llvm::BasicBlock *B) {
|
||||
for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) {
|
||||
Instruction *inst = &(*i);
|
||||
llvm::CallInst *Call = llvm::dyn_cast<llvm::CallInst>(inst);
|
||||
if (Call) {
|
||||
auto func_name = Call->getCalledFunction()->getName().str();
|
||||
if (func_name == "llvm.nvvm.bar.warp.sync") {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool has_barrier(llvm::BasicBlock *B) {
|
||||
for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) {
|
||||
Instruction *inst = &(*i);
|
||||
llvm::CallInst *Call = llvm::dyn_cast<llvm::CallInst>(inst);
|
||||
if (Call) {
|
||||
auto func_name = Call->getCalledFunction()->getName().str();
|
||||
if (func_name == "llvm.nvvm.barrier0" ||
|
||||
func_name == "llvm.nvvm.bar.warp.sync" ||
|
||||
func_name == "llvm.nvvm.barrier.sync") {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool has_block_barrier(llvm::BasicBlock *B) {
|
||||
for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) {
|
||||
Instruction *inst = &(*i);
|
||||
llvm::CallInst *Call = llvm::dyn_cast<llvm::CallInst>(inst);
|
||||
if (Call) {
|
||||
auto func_name = Call->getCalledFunction()->getName().str();
|
||||
if (func_name == "llvm.nvvm.barrier0" ||
|
||||
func_name == "llvm.nvvm.barrier.sync") {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool has_barrier(llvm::Function *F) {
|
||||
for (auto B = F->begin(); B != F->end(); B++) {
|
||||
if (has_barrier(&(*B)))
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool find_block_barrier_in_region(llvm::BasicBlock *start,
|
||||
llvm::BasicBlock *end) {
|
||||
std::set<llvm::BasicBlock *> visit;
|
||||
std::vector<llvm::BasicBlock *> pending_blocks;
|
||||
for (int i = 0; i < start->getTerminator()->getNumSuccessors(); i++) {
|
||||
pending_blocks.push_back(start->getTerminator()->getSuccessor(i));
|
||||
}
|
||||
while (!pending_blocks.empty()) {
|
||||
BasicBlock *current = pending_blocks.back();
|
||||
pending_blocks.pop_back();
|
||||
if (visit.find(current) != visit.end())
|
||||
continue;
|
||||
visit.insert(current);
|
||||
if (current == end)
|
||||
continue;
|
||||
if (has_block_barrier(current)) {
|
||||
return 1;
|
||||
}
|
||||
for (int i = 0; i < current->getTerminator()->getNumSuccessors(); i++) {
|
||||
pending_blocks.push_back(current->getTerminator()->getSuccessor(i));
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool find_barrier_in_region(llvm::BasicBlock *start, llvm::BasicBlock *end) {
|
||||
std::set<llvm::BasicBlock *> visit;
|
||||
std::vector<llvm::BasicBlock *> pending_blocks;
|
||||
for (int i = 0; i < start->getTerminator()->getNumSuccessors(); i++) {
|
||||
pending_blocks.push_back(start->getTerminator()->getSuccessor(i));
|
||||
}
|
||||
while (!pending_blocks.empty()) {
|
||||
BasicBlock *current = pending_blocks.back();
|
||||
pending_blocks.pop_back();
|
||||
if (visit.find(current) != visit.end())
|
||||
continue;
|
||||
visit.insert(current);
|
||||
if (current == end)
|
||||
continue;
|
||||
if (has_barrier(current)) {
|
||||
return 1;
|
||||
}
|
||||
for (int i = 0; i < current->getTerminator()->getNumSuccessors(); i++) {
|
||||
pending_blocks.push_back(current->getTerminator()->getSuccessor(i));
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,217 @@
|
|||
|
||||
#include "warp_func.h"
|
||||
#include "tool.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/GlobalValue.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/LLVMContext.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/Transforms/Utils/Cloning.h"
|
||||
#include "llvm/Transforms/Utils/ValueMapper.h"
|
||||
#include <iostream>
|
||||
#include <set>
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
/*
|
||||
* Insert sync before each vote, and replace the
|
||||
* original vote function to warp vote version
|
||||
*/
|
||||
void handle_warp_vote(llvm::Module *M) {
|
||||
std::set<llvm::CallInst *> need_replace;
|
||||
llvm::Type *Int1T = Type::getInt1Ty(M->getContext());
|
||||
llvm::Type *I32 = llvm::Type::getInt32Ty(M->getContext());
|
||||
llvm::Type *I8 = llvm::Type::getInt8Ty(M->getContext());
|
||||
auto zero = llvm::ConstantInt::get(I32, 0, true);
|
||||
auto one = llvm::ConstantInt::get(I32, 1, true);
|
||||
llvm::Type *VoteArrayType = llvm::ArrayType::get(I8, 32)->getPointerTo();
|
||||
|
||||
llvm::FunctionType *LauncherFuncT =
|
||||
FunctionType::get(Int1T, {VoteArrayType}, false);
|
||||
llvm::FunctionCallee _f = M->getOrInsertFunction("warp_any", LauncherFuncT);
|
||||
llvm::Function *func_warp_any = llvm::cast<llvm::Function>(_f.getCallee());
|
||||
_f = M->getOrInsertFunction("warp_all", LauncherFuncT);
|
||||
llvm::Function *func_warp_all = llvm::cast<llvm::Function>(_f.getCallee());
|
||||
|
||||
// replace llvm.nvvm.vote.any.sync to warp vote function
|
||||
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||
Function *F = &(*i);
|
||||
auto func_name = F->getName().str();
|
||||
if (!isKernelFunction(M, F))
|
||||
continue;
|
||||
Function::iterator I = F->begin();
|
||||
for (Function::iterator E = F->end(); I != E; ++I) {
|
||||
for (BasicBlock::iterator BI = I->begin(); BI != I->end(); BI++) {
|
||||
if (CallInst *vote_any_sync = dyn_cast<CallInst>(BI)) {
|
||||
auto func_name = vote_any_sync->getCalledFunction()->getName();
|
||||
if (func_name == "llvm.nvvm.vote.any.sync" ||
|
||||
func_name == "llvm.nvvm.vote.all.sync") {
|
||||
// insert sync before call
|
||||
need_replace.insert(vote_any_sync);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
GlobalVariable *warp_vote_ptr = M->getNamedGlobal("warp_vote");
|
||||
for (auto sync_inst : need_replace) {
|
||||
// create barrier
|
||||
CreateIntraWarpBarrier(sync_inst);
|
||||
/*
|
||||
* store into warp_vote[tid]
|
||||
*/
|
||||
assert(warp_vote_ptr != NULL);
|
||||
auto intra_warp_index_addr = M->getGlobalVariable("intra_warp_index");
|
||||
auto intra_warp_index =
|
||||
new LoadInst(intra_warp_index_addr, "intra_warp_index", sync_inst);
|
||||
|
||||
auto GEP = GetElementPtrInst::Create(NULL, // Pointee type
|
||||
warp_vote_ptr, // Alloca
|
||||
{zero, intra_warp_index}, // Indices
|
||||
"", sync_inst);
|
||||
|
||||
// as AVX only support 8bit for each thread
|
||||
// so we have to cast the predict into int8
|
||||
auto predict = llvm::CastInst::CreateIntegerCast(
|
||||
sync_inst->getArgOperand(1), I8, false, "", sync_inst);
|
||||
// we need to concern mask
|
||||
auto mask = llvm::CastInst::CreateIntegerCast(sync_inst->getArgOperand(0),
|
||||
I32, false, "", sync_inst);
|
||||
auto bit_flag = BinaryOperator::Create(Instruction::LShr, mask,
|
||||
intra_warp_index, "", sync_inst);
|
||||
auto valid =
|
||||
BinaryOperator::Create(Instruction::And, one, bit_flag, "", sync_inst);
|
||||
auto valid_8bit =
|
||||
llvm::CastInst::CreateIntegerCast(valid, I8, false, "", sync_inst);
|
||||
|
||||
llvm::Instruction *res;
|
||||
if (sync_inst->getCalledFunction()->getName() ==
|
||||
"llvm.nvvm.vote.any.sync") {
|
||||
res = BinaryOperator::Create(Instruction::Mul, valid_8bit, predict, "",
|
||||
sync_inst);
|
||||
} else if (sync_inst->getCalledFunction()->getName() ==
|
||||
"llvm.nvvm.vote.all.sync") {
|
||||
auto reverse_valid = BinaryOperator::CreateNot(valid_8bit, "", sync_inst);
|
||||
res = BinaryOperator::Create(Instruction::Or, reverse_valid, predict, "",
|
||||
sync_inst);
|
||||
// as AVX do not have all, we have to
|
||||
// reverse the result and call AVX-any instead
|
||||
res = BinaryOperator::CreateNot(res, "", sync_inst);
|
||||
}
|
||||
|
||||
auto sotre_mask = new llvm::StoreInst(res, GEP, "", sync_inst);
|
||||
// create barrier
|
||||
CreateIntraWarpBarrier(sync_inst);
|
||||
/*
|
||||
* replace llvm.nvvm.vote.any.sync(i32 mask, i1 predict)
|
||||
* to warp_any(i32 mask, i8* predict)
|
||||
*/
|
||||
std::vector<Value *> args;
|
||||
// args.push_back(mask);
|
||||
args.push_back(warp_vote_ptr);
|
||||
llvm::Instruction *warp_inst;
|
||||
if (sync_inst->getCalledFunction()->getName() ==
|
||||
"llvm.nvvm.vote.any.sync") {
|
||||
warp_inst = llvm::CallInst::Create(func_warp_any, args, "", sync_inst);
|
||||
} else if (sync_inst->getCalledFunction()->getName() ==
|
||||
"llvm.nvvm.vote.all.sync") {
|
||||
warp_inst = llvm::CallInst::Create(func_warp_all, args, "", sync_inst);
|
||||
}
|
||||
sync_inst->replaceAllUsesWith(warp_inst);
|
||||
sync_inst->eraseFromParent();
|
||||
}
|
||||
}
|
||||
|
||||
void handle_warp_shfl(llvm::Module *M) {
|
||||
std::set<llvm::CallInst *> need_replace;
|
||||
|
||||
llvm::Type *I32 = llvm::Type::getInt32Ty(M->getContext());
|
||||
auto ZERO = llvm::ConstantInt::get(I32, 0, true);
|
||||
// replace llvm.nvvm.vote.any.sync to warp vote function
|
||||
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||
Function *F = &(*i);
|
||||
if (!isKernelFunction(M, F))
|
||||
continue;
|
||||
Function::iterator I = F->begin();
|
||||
for (Function::iterator E = F->end(); I != E; ++I) {
|
||||
for (BasicBlock::iterator BI = I->begin(); BI != I->end(); BI++) {
|
||||
if (CallInst *warp_shfl = dyn_cast<CallInst>(BI)) {
|
||||
auto func_name = warp_shfl->getCalledFunction()->getName();
|
||||
if (func_name == "llvm.nvvm.shfl.sync.down.i32" ||
|
||||
func_name == "llvm.nvvm.shfl.sync.up.i32" ||
|
||||
func_name == "llvm.nvvm.shfl.sync.bfly.i32") {
|
||||
// insert sync before call
|
||||
need_replace.insert(warp_shfl);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
GlobalVariable *warp_shfl_ptr = M->getNamedGlobal("warp_shfl");
|
||||
for (auto shfl_inst : need_replace) {
|
||||
/*
|
||||
* %10 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add32, i32
|
||||
* 16, i32 31)
|
||||
* ->
|
||||
* warp_shfl[warp_id] = add32
|
||||
* warp.barrier()
|
||||
* %10 = warp_shfl[warp_id + offset]
|
||||
*/
|
||||
IRBuilder<> builder(shfl_inst);
|
||||
|
||||
auto shfl_variable = shfl_inst->getArgOperand(1);
|
||||
auto shfl_offset = shfl_inst->getArgOperand(2);
|
||||
|
||||
auto intra_warp_index =
|
||||
builder.CreateLoad(M->getGlobalVariable("intra_warp_index"));
|
||||
builder.CreateStore(
|
||||
shfl_variable,
|
||||
builder.CreateGEP(warp_shfl_ptr, {ZERO, intra_warp_index}));
|
||||
// we should create barrier before store
|
||||
CreateIntraWarpBarrier(intra_warp_index);
|
||||
// load shuffled data
|
||||
auto new_intra_warp_index =
|
||||
builder.CreateLoad(M->getGlobalVariable("intra_warp_index"));
|
||||
auto shfl_name = shfl_inst->getCalledFunction()->getName().str();
|
||||
if (shfl_name.find("down") != shfl_name.npos) {
|
||||
auto calculate_offset = builder.CreateBinOp(
|
||||
Instruction::Add, new_intra_warp_index, shfl_offset);
|
||||
auto new_index = builder.CreateBinOp(Instruction::SRem, calculate_offset,
|
||||
ConstantInt::get(I32, 32));
|
||||
auto gep = builder.CreateGEP(warp_shfl_ptr, {ZERO, new_index});
|
||||
auto load_inst = builder.CreateLoad(gep);
|
||||
|
||||
// create barrier
|
||||
CreateIntraWarpBarrier(new_intra_warp_index);
|
||||
shfl_inst->replaceAllUsesWith(load_inst);
|
||||
shfl_inst->eraseFromParent();
|
||||
} else if (shfl_name.find("up") != shfl_name.npos) {
|
||||
auto calculate_offset = builder.CreateBinOp(
|
||||
Instruction::Sub, new_intra_warp_index, shfl_offset);
|
||||
auto new_index = builder.CreateBinOp(Instruction::SRem, calculate_offset,
|
||||
ConstantInt::get(I32, 32));
|
||||
auto gep = builder.CreateGEP(warp_shfl_ptr, {ZERO, new_index});
|
||||
auto load_inst = builder.CreateLoad(gep);
|
||||
|
||||
// create barrier
|
||||
CreateIntraWarpBarrier(new_intra_warp_index);
|
||||
shfl_inst->replaceAllUsesWith(load_inst);
|
||||
shfl_inst->eraseFromParent();
|
||||
} else if (shfl_name.find("bfly") != shfl_name.npos) {
|
||||
auto calculate_offset = builder.CreateBinOp(
|
||||
Instruction::Xor, new_intra_warp_index, shfl_offset);
|
||||
auto new_index = builder.CreateBinOp(Instruction::SRem, calculate_offset,
|
||||
ConstantInt::get(I32, 32));
|
||||
auto gep = builder.CreateGEP(warp_shfl_ptr, {ZERO, new_index});
|
||||
auto load_inst = builder.CreateLoad(gep);
|
||||
|
||||
// create barrier
|
||||
CreateIntraWarpBarrier(new_intra_warp_index);
|
||||
shfl_inst->replaceAllUsesWith(load_inst);
|
||||
shfl_inst->eraseFromParent();
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,82 @@
|
|||
#include <assert.h>
|
||||
#include <pthread.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <sys/time.h>
|
||||
|
||||
#define NUM_WARP 2
|
||||
#define NUM_BLOCK 1
|
||||
|
||||
int block_size = 32 * NUM_WARP;
|
||||
int block_size_x = block_size;
|
||||
int block_size_y = 1;
|
||||
int block_size_z = 1;
|
||||
__thread int block_index = 0;
|
||||
int grid_size = NUM_BLOCK;
|
||||
|
||||
extern "C" {
|
||||
void *_Z7reduce0PiS_j_wrapper(void *);
|
||||
__thread int warp_shfl[32];
|
||||
}
|
||||
|
||||
void *wrap(void *p) {
|
||||
int **res = (int **)p;
|
||||
block_index = (*(int *)res[3]);
|
||||
_Z7reduce0PiS_j_wrapper(p);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void *gen_input(int bid, int *g_idata, int *g_odata, unsigned int n) {
|
||||
int **ret = new int *[4];
|
||||
|
||||
int **p0 = new int *;
|
||||
*p0 = g_idata;
|
||||
ret[0] = (int *)(p0);
|
||||
|
||||
int **p1 = new int *;
|
||||
*p1 = g_odata;
|
||||
ret[1] = (int *)(p1);
|
||||
|
||||
unsigned int *p2 = new unsigned int;
|
||||
*p2 = n;
|
||||
ret[2] = (int *)p2;
|
||||
|
||||
int *p3 = new int;
|
||||
*p3 = bid;
|
||||
ret[3] = (int *)p3;
|
||||
|
||||
return (void *)ret;
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
int *g_idata;
|
||||
|
||||
int size = block_size * NUM_BLOCK;
|
||||
g_idata = new int[size * 2];
|
||||
int *res = new int[size];
|
||||
|
||||
for (int i = 0; i < size; i++) {
|
||||
g_idata[i] = i;
|
||||
}
|
||||
|
||||
pthread_t threads[NUM_BLOCK];
|
||||
|
||||
void *inp[NUM_BLOCK];
|
||||
for (long t = 0; t < NUM_BLOCK; t++) {
|
||||
inp[t] = gen_input(t, g_idata, res, size);
|
||||
}
|
||||
|
||||
for (long t = 0; t < NUM_BLOCK; t++) {
|
||||
pthread_create(&threads[t], NULL, wrap, inp[t]);
|
||||
}
|
||||
for (long t = 0; t < NUM_BLOCK; t++)
|
||||
pthread_join(threads[t], NULL);
|
||||
int gold = 0;
|
||||
for (int i = 0; i < size; i++) {
|
||||
gold += g_idata[i];
|
||||
}
|
||||
assert(*res == gold && "Incorrect res\n");
|
||||
printf("PASS\n");
|
||||
|
||||
pthread_exit(NULL);
|
||||
}
|
|
@ -0,0 +1,150 @@
|
|||
; ModuleID = 'kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc'
|
||||
source_filename = "kernel.cu"
|
||||
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
|
||||
target triple = "nvptx64-nvidia-cuda"
|
||||
|
||||
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
|
||||
|
||||
@_ZZ7reduce0PiS_jE5sdata = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
|
||||
|
||||
; Function Attrs: nounwind
|
||||
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) local_unnamed_addr #0 {
|
||||
entry:
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind
|
||||
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) local_unnamed_addr #0 {
|
||||
entry:
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind
|
||||
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) local_unnamed_addr #0 {
|
||||
entry:
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind
|
||||
define weak dso_local i32 @cudaGetDevice(i32* %device) local_unnamed_addr #0 {
|
||||
entry:
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind
|
||||
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) local_unnamed_addr #0 {
|
||||
entry:
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind
|
||||
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) local_unnamed_addr #0 {
|
||||
entry:
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent nounwind
|
||||
define dso_local void @_Z7reduce0PiS_j(i32* nocapture readonly %g_idata, i32* nocapture %g_odata, i32 %n) local_unnamed_addr #1 {
|
||||
entry:
|
||||
%0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #4, !range !10
|
||||
%1 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #4, !range !11
|
||||
%2 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #4, !range !12
|
||||
%mul = mul i32 %2, %1
|
||||
%add = add i32 %mul, %0
|
||||
%cmp = icmp ult i32 %add, %n
|
||||
br i1 %cmp, label %cond.true, label %cond.end
|
||||
|
||||
cond.true: ; preds = %entry
|
||||
%idxprom = zext i32 %add to i64
|
||||
%arrayidx = getelementptr inbounds i32, i32* %g_idata, i64 %idxprom
|
||||
%3 = load i32, i32* %arrayidx, align 4, !tbaa !13
|
||||
br label %cond.end
|
||||
|
||||
cond.end: ; preds = %entry, %cond.true
|
||||
%cond = phi i32 [ %3, %cond.true ], [ 0, %entry ]
|
||||
%idxprom5 = zext i32 %0 to i64
|
||||
%arrayidx635 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @_ZZ7reduce0PiS_jE5sdata, i64 0, i64 %idxprom5
|
||||
%arrayidx6 = addrspacecast i32 addrspace(3)* %arrayidx635 to i32*
|
||||
store i32 %cond, i32* %arrayidx6, align 4, !tbaa !13
|
||||
tail call void @llvm.nvvm.barrier.sync(i32 0) #4
|
||||
%cmp839 = icmp ugt i32 %2, 1
|
||||
br i1 %cmp839, label %for.body, label %for.cond.cleanup
|
||||
|
||||
for.cond.cleanup: ; preds = %if.end, %cond.end
|
||||
%cmp18 = icmp eq i32 %0, 0
|
||||
br i1 %cmp18, label %if.then19, label %if.end23
|
||||
|
||||
for.body: ; preds = %cond.end, %if.end
|
||||
%s.040 = phi i32 [ %mul9, %if.end ], [ 1, %cond.end ]
|
||||
%mul9 = shl nuw nsw i32 %s.040, 1
|
||||
%rem = urem i32 %0, %mul9
|
||||
%cmp10 = icmp eq i32 %rem, 0
|
||||
br i1 %cmp10, label %if.then, label %if.end
|
||||
|
||||
if.then: ; preds = %for.body
|
||||
%add11 = add i32 %s.040, %0
|
||||
%idxprom12 = zext i32 %add11 to i64
|
||||
%arrayidx1336 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @_ZZ7reduce0PiS_jE5sdata, i64 0, i64 %idxprom12
|
||||
%arrayidx13 = addrspacecast i32 addrspace(3)* %arrayidx1336 to i32*
|
||||
%4 = load i32, i32* %arrayidx13, align 4, !tbaa !13
|
||||
%5 = load i32, i32* %arrayidx6, align 4, !tbaa !13
|
||||
%add16 = add nsw i32 %5, %4
|
||||
store i32 %add16, i32* %arrayidx6, align 4, !tbaa !13
|
||||
br label %if.end
|
||||
|
||||
if.end: ; preds = %if.then, %for.body
|
||||
tail call void @llvm.nvvm.barrier.sync(i32 0) #4
|
||||
%cmp8 = icmp ult i32 %mul9, %2
|
||||
br i1 %cmp8, label %for.body, label %for.cond.cleanup
|
||||
|
||||
if.then19: ; preds = %for.cond.cleanup
|
||||
%idxprom21 = zext i32 %1 to i64
|
||||
%arrayidx22 = getelementptr inbounds i32, i32* %g_odata, i64 %idxprom21
|
||||
%6 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* addrspacecast ([64 x i32] addrspace(3)* @_ZZ7reduce0PiS_jE5sdata to [64 x i32]*), i64 0, i64 0), align 4, !tbaa !13
|
||||
store i32 %6, i32* %arrayidx22, align 4, !tbaa !13
|
||||
br label %if.end23
|
||||
|
||||
if.end23: ; preds = %if.then19, %for.cond.cleanup
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2
|
||||
|
||||
; Function Attrs: convergent nounwind
|
||||
declare void @llvm.nvvm.barrier.sync(i32) #3
|
||||
|
||||
attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #1 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #2 = { nounwind readnone }
|
||||
attributes #3 = { convergent nounwind }
|
||||
attributes #4 = { nounwind }
|
||||
|
||||
!llvm.module.flags = !{!0, !1, !2}
|
||||
!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
|
||||
!llvm.ident = !{!8}
|
||||
!nvvmir.version = !{!9}
|
||||
|
||||
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
|
||||
!1 = !{i32 1, !"wchar_size", i32 4}
|
||||
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
|
||||
!3 = !{void (i32*, i32*, i32)* @_Z7reduce0PiS_j, !"kernel", i32 1}
|
||||
!4 = !{null, !"align", i32 8}
|
||||
!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
|
||||
!6 = !{null, !"align", i32 16}
|
||||
!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
|
||||
!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
|
||||
!9 = !{i32 1, i32 4}
|
||||
!10 = !{i32 0, i32 1024}
|
||||
!11 = !{i32 0, i32 2147483647}
|
||||
!12 = !{i32 1, i32 1025}
|
||||
!13 = !{!14, !14, i64 0}
|
||||
!14 = !{!"int", !15, i64 0}
|
||||
!15 = !{!"omnipotent char", !16, i64 0}
|
||||
!16 = !{!"Simple C++ TBAA"}
|
|
@ -0,0 +1,6 @@
|
|||
#!/bin/bash
|
||||
llvm-as kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
|
||||
../../../build/compilation/kernelTranslator kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc 1 1 1 64 1 1
|
||||
llc --filetype=obj kernel.bc
|
||||
g++ host.cpp kernel.o -lpthread -o test
|
||||
./test
|
|
@ -0,0 +1,82 @@
|
|||
#include <assert.h>
|
||||
#include <pthread.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <sys/time.h>
|
||||
|
||||
#define NUM_WARP 2
|
||||
#define NUM_BLOCK 1
|
||||
|
||||
int block_size = 32 * NUM_WARP;
|
||||
int block_size_x = block_size;
|
||||
int block_size_y = 1;
|
||||
int block_size_z = 1;
|
||||
__thread int block_index = 0;
|
||||
int grid_size = NUM_BLOCK;
|
||||
|
||||
extern "C" {
|
||||
void *_Z7reduce5PiS_j_wrapper(void *);
|
||||
__thread int warp_shfl[32];
|
||||
}
|
||||
|
||||
void *wrap(void *p) {
|
||||
int **res = (int **)p;
|
||||
block_index = (*(int *)res[3]);
|
||||
_Z7reduce5PiS_j_wrapper(p);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void *gen_input(int bid, int *g_idata, int *g_odata, unsigned int n) {
|
||||
int **ret = new int *[4];
|
||||
|
||||
int **p0 = new int *;
|
||||
*p0 = g_idata;
|
||||
ret[0] = (int *)(p0);
|
||||
|
||||
int **p1 = new int *;
|
||||
*p1 = g_odata;
|
||||
ret[1] = (int *)(p1);
|
||||
|
||||
unsigned int *p2 = new unsigned int;
|
||||
*p2 = n;
|
||||
ret[2] = (int *)p2;
|
||||
|
||||
int *p3 = new int;
|
||||
*p3 = bid;
|
||||
ret[3] = (int *)p3;
|
||||
|
||||
return (void *)ret;
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
int *g_idata;
|
||||
|
||||
int size = block_size * NUM_BLOCK;
|
||||
g_idata = new int[size * 2];
|
||||
int *res = new int[size];
|
||||
|
||||
for (int i = 0; i < size; i++) {
|
||||
g_idata[i] = i;
|
||||
}
|
||||
|
||||
pthread_t threads[NUM_BLOCK];
|
||||
|
||||
void *inp[NUM_BLOCK];
|
||||
for (long t = 0; t < NUM_BLOCK; t++) {
|
||||
inp[t] = gen_input(t, g_idata, res, size);
|
||||
}
|
||||
|
||||
for (long t = 0; t < NUM_BLOCK; t++) {
|
||||
pthread_create(&threads[t], NULL, wrap, inp[t]);
|
||||
}
|
||||
for (long t = 0; t < NUM_BLOCK; t++)
|
||||
pthread_join(threads[t], NULL);
|
||||
int gold = 0;
|
||||
for (int i = 0; i < size; i++) {
|
||||
gold += g_idata[i];
|
||||
}
|
||||
assert(*res == gold && "Incorrect res\n");
|
||||
printf("PASS\n");
|
||||
|
||||
pthread_exit(NULL);
|
||||
}
|
|
@ -0,0 +1,179 @@
|
|||
; ModuleID = 'kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc'
|
||||
source_filename = "kernel.cu"
|
||||
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
|
||||
target triple = "nvptx64-nvidia-cuda"
|
||||
|
||||
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
|
||||
|
||||
@_ZZ7reduce5PiS_jE5sdata = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
|
||||
|
||||
; Function Attrs: nounwind
|
||||
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) local_unnamed_addr #0 {
|
||||
entry:
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind
|
||||
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) local_unnamed_addr #0 {
|
||||
entry:
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind
|
||||
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) local_unnamed_addr #0 {
|
||||
entry:
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind
|
||||
define weak dso_local i32 @cudaGetDevice(i32* %device) local_unnamed_addr #0 {
|
||||
entry:
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind
|
||||
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) local_unnamed_addr #0 {
|
||||
entry:
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind
|
||||
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) local_unnamed_addr #0 {
|
||||
entry:
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent nounwind
|
||||
define dso_local void @_Z7reduce5PiS_j(i32* nocapture readonly %g_idata, i32* nocapture %g_odata, i32 %n) local_unnamed_addr #1 {
|
||||
entry:
|
||||
%0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #5, !range !10
|
||||
%1 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #5, !range !11
|
||||
%mul = shl i32 %1, 7
|
||||
%add = add i32 %mul, %0
|
||||
%cmp = icmp ult i32 %add, %n
|
||||
br i1 %cmp, label %cond.true, label %cond.end
|
||||
|
||||
cond.true: ; preds = %entry
|
||||
%idxprom = zext i32 %add to i64
|
||||
%arrayidx = getelementptr inbounds i32, i32* %g_idata, i64 %idxprom
|
||||
%2 = load i32, i32* %arrayidx, align 4, !tbaa !12
|
||||
br label %cond.end
|
||||
|
||||
cond.end: ; preds = %entry, %cond.true
|
||||
%cond = phi i32 [ %2, %cond.true ], [ 0, %entry ]
|
||||
%add4 = add i32 %add, 64
|
||||
%cmp5 = icmp ult i32 %add4, %n
|
||||
br i1 %cmp5, label %if.then, label %if.end
|
||||
|
||||
if.then: ; preds = %cond.end
|
||||
%idxprom7 = zext i32 %add4 to i64
|
||||
%arrayidx8 = getelementptr inbounds i32, i32* %g_idata, i64 %idxprom7
|
||||
%3 = load i32, i32* %arrayidx8, align 4, !tbaa !12
|
||||
%add9 = add nsw i32 %3, %cond
|
||||
br label %if.end
|
||||
|
||||
if.end: ; preds = %if.then, %cond.end
|
||||
%mySum.0 = phi i32 [ %add9, %if.then ], [ %cond, %cond.end ]
|
||||
%idxprom10 = zext i32 %0 to i64
|
||||
%arrayidx1150 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @_ZZ7reduce5PiS_jE5sdata, i64 0, i64 %idxprom10
|
||||
%arrayidx11 = addrspacecast i32 addrspace(3)* %arrayidx1150 to i32*
|
||||
store i32 %mySum.0, i32* %arrayidx11, align 4, !tbaa !12
|
||||
tail call void @llvm.nvvm.barrier.sync(i32 0) #5
|
||||
tail call void @llvm.nvvm.barrier.sync(i32 0) #5
|
||||
tail call void @llvm.nvvm.barrier.sync(i32 0) #5
|
||||
tail call void @llvm.nvvm.barrier.sync(i32 0) #5
|
||||
%4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.z() #5, !range !16
|
||||
%5 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.y() #5, !range !17
|
||||
%mul.i.i52 = mul nuw nsw i32 %5, %4
|
||||
%6 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #5, !range !17
|
||||
%7 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.y() #5, !range !10
|
||||
%mul39.i.i53 = add nuw nsw i32 %7, %mul.i.i52
|
||||
%add.i.i54 = mul nuw nsw i32 %mul39.i.i53, %6
|
||||
%add8.i.i55 = add nuw nsw i32 %add.i.i54, %0
|
||||
%cmp14 = icmp ult i32 %add8.i.i55, 32
|
||||
br i1 %cmp14, label %if.then15, label %if.end32
|
||||
|
||||
if.then15: ; preds = %if.end
|
||||
%add16 = add nuw nsw i32 %0, 32
|
||||
%idxprom17 = zext i32 %add16 to i64
|
||||
%arrayidx1851 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @_ZZ7reduce5PiS_jE5sdata, i64 0, i64 %idxprom17
|
||||
%arrayidx18 = addrspacecast i32 addrspace(3)* %arrayidx1851 to i32*
|
||||
%8 = load i32, i32* %arrayidx18, align 4, !tbaa !12
|
||||
%add19 = add nsw i32 %8, %mySum.0
|
||||
%9 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add19, i32 16, i32 31) #5
|
||||
%add23 = add nsw i32 %9, %add19
|
||||
%10 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add23, i32 8, i32 31) #5
|
||||
%add23.1 = add nsw i32 %10, %add23
|
||||
%11 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add23.1, i32 4, i32 31) #5
|
||||
%add23.2 = add nsw i32 %11, %add23.1
|
||||
%12 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add23.2, i32 2, i32 31) #5
|
||||
%add23.3 = add nsw i32 %12, %add23.2
|
||||
%13 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add23.3, i32 1, i32 31) #5
|
||||
%cmp27 = icmp eq i32 %add8.i.i55, 0
|
||||
br i1 %cmp27, label %if.then28, label %if.end32
|
||||
|
||||
if.then28: ; preds = %if.then15
|
||||
%add23.4 = add nsw i32 %13, %add23.3
|
||||
%idxprom30 = zext i32 %1 to i64
|
||||
%arrayidx31 = getelementptr inbounds i32, i32* %g_odata, i64 %idxprom30
|
||||
store i32 %add23.4, i32* %arrayidx31, align 4, !tbaa !12
|
||||
br label %if.end32
|
||||
|
||||
if.end32: ; preds = %if.end, %if.then28, %if.then15
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.tid.z() #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y() #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #2
|
||||
|
||||
; Function Attrs: convergent nounwind
|
||||
declare void @llvm.nvvm.barrier.sync(i32) #3
|
||||
|
||||
; Function Attrs: convergent inaccessiblememonly nounwind
|
||||
declare i32 @llvm.nvvm.shfl.sync.down.i32(i32, i32, i32, i32) #4
|
||||
|
||||
attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #1 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #2 = { nounwind readnone }
|
||||
attributes #3 = { convergent nounwind }
|
||||
attributes #4 = { convergent inaccessiblememonly nounwind }
|
||||
attributes #5 = { nounwind }
|
||||
|
||||
!llvm.module.flags = !{!0, !1, !2}
|
||||
!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
|
||||
!llvm.ident = !{!8}
|
||||
!nvvmir.version = !{!9}
|
||||
|
||||
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
|
||||
!1 = !{i32 1, !"wchar_size", i32 4}
|
||||
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
|
||||
!3 = !{void (i32*, i32*, i32)* @_Z7reduce5PiS_j, !"kernel", i32 1}
|
||||
!4 = !{null, !"align", i32 8}
|
||||
!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
|
||||
!6 = !{null, !"align", i32 16}
|
||||
!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
|
||||
!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
|
||||
!9 = !{i32 1, i32 4}
|
||||
!10 = !{i32 0, i32 1024}
|
||||
!11 = !{i32 0, i32 2147483647}
|
||||
!12 = !{!13, !13, i64 0}
|
||||
!13 = !{!"int", !14, i64 0}
|
||||
!14 = !{!"omnipotent char", !15, i64 0}
|
||||
!15 = !{!"Simple C++ TBAA"}
|
||||
!16 = !{i32 0, i32 64}
|
||||
!17 = !{i32 1, i32 1025}
|
|
@ -0,0 +1,6 @@
|
|||
#!/bin/bash
|
||||
llvm-as kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
|
||||
../../../build/compilation/kernelTranslator kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc 1 1 1 64 1 1
|
||||
llc --filetype=obj kernel.bc
|
||||
g++ host.cpp kernel.o -lpthread -o test
|
||||
./test
|
|
@ -0,0 +1,11 @@
|
|||
#!bin/sh
|
||||
for file in ./*
|
||||
do
|
||||
if test -d $file
|
||||
then
|
||||
echo executing $file
|
||||
cd $file
|
||||
bash run.sh
|
||||
cd ..
|
||||
fi
|
||||
done
|
|
@ -0,0 +1,84 @@
|
|||
#include <assert.h>
|
||||
#include <pthread.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <time.h>
|
||||
|
||||
#define NUM_BLOCK 1
|
||||
int N = 32;
|
||||
|
||||
int block_size = 32;
|
||||
int block_size_x = block_size;
|
||||
int block_size_y = 1;
|
||||
int block_size_z = 1;
|
||||
__thread int block_index = 0;
|
||||
int grid_size = NUM_BLOCK;
|
||||
|
||||
extern "C" {
|
||||
void *_Z9vectorAddPKfS0_Pfi_wrapper(void *);
|
||||
}
|
||||
|
||||
void *wrap(void *p) {
|
||||
int **res = (int **)p;
|
||||
block_index = (*(int *)res[4]);
|
||||
_Z9vectorAddPKfS0_Pfi_wrapper(p);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void *gen_input(int bid, float *A, float *B, float *C, int N) {
|
||||
int **ret = new int *[5];
|
||||
|
||||
float **p0 = new float *;
|
||||
*p0 = A;
|
||||
ret[0] = (int *)(p0);
|
||||
|
||||
float **p1 = new float *;
|
||||
*p1 = B;
|
||||
ret[1] = (int *)(p1);
|
||||
|
||||
float **p2 = new float *;
|
||||
*p2 = C;
|
||||
ret[2] = (int *)(p2);
|
||||
|
||||
int *p3 = new int;
|
||||
*p3 = N;
|
||||
ret[3] = (int *)p3;
|
||||
|
||||
int *p4 = new int;
|
||||
*p4 = bid;
|
||||
ret[4] = (int *)p4;
|
||||
|
||||
return (void *)ret;
|
||||
}
|
||||
|
||||
int main() {
|
||||
float *A, *B, *C;
|
||||
|
||||
A = new float[N];
|
||||
B = new float[N];
|
||||
C = new float[N];
|
||||
|
||||
for (int i = 0; i < N; i++) {
|
||||
A[i] = i;
|
||||
B[i] = 1;
|
||||
C[i] = 0;
|
||||
}
|
||||
|
||||
pthread_t threads[NUM_BLOCK];
|
||||
|
||||
int rc;
|
||||
for (long t = 0; t < NUM_BLOCK; t++) {
|
||||
void *inp = gen_input(t, A, B, C, N);
|
||||
rc = pthread_create(&threads[t], NULL, wrap, inp);
|
||||
}
|
||||
clock_t t1 = clock();
|
||||
/* Last thing that main() should do */
|
||||
for (long t = 0; t < NUM_BLOCK; t++)
|
||||
pthread_join(threads[t], NULL);
|
||||
|
||||
for (int i = 0; i < N; i++) {
|
||||
assert(C[i] == (A[i] + B[i]));
|
||||
}
|
||||
printf("PASS\n");
|
||||
pthread_exit(NULL);
|
||||
}
|
|
@ -0,0 +1,86 @@
|
|||
; ModuleID = 'kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc'
|
||||
source_filename = "kernel.cu"
|
||||
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
|
||||
target triple = "nvptx64-nvidia-cuda"
|
||||
|
||||
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
|
||||
|
||||
; Function Attrs: nounwind
|
||||
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) local_unnamed_addr #0 {
|
||||
entry:
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind
|
||||
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) local_unnamed_addr #0 {
|
||||
entry:
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind
|
||||
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) local_unnamed_addr #0 {
|
||||
entry:
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind
|
||||
define weak dso_local i32 @cudaGetDevice(i32* %device) local_unnamed_addr #0 {
|
||||
entry:
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind
|
||||
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) local_unnamed_addr #0 {
|
||||
entry:
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind
|
||||
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) local_unnamed_addr #0 {
|
||||
entry:
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: nofree nounwind
|
||||
define dso_local void @_Z9vectorAddPKfS0_Pfi(float* nocapture readonly %A, float* nocapture readonly %B, float* nocapture %C, i32 %numElements) local_unnamed_addr #1 {
|
||||
entry:
|
||||
%0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3, !range !10
|
||||
%idxprom8 = zext i32 %0 to i64
|
||||
%arrayidx = getelementptr inbounds float, float* %A, i64 %idxprom8
|
||||
%1 = load float, float* %arrayidx, align 4, !tbaa !11
|
||||
%arrayidx2 = getelementptr inbounds float, float* %B, i64 %idxprom8
|
||||
%2 = load float, float* %arrayidx2, align 4, !tbaa !11
|
||||
%add = fadd contract float %1, %2
|
||||
%arrayidx4 = getelementptr inbounds float, float* %C, i64 %idxprom8
|
||||
store float %add, float* %arrayidx4, align 4, !tbaa !11
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
|
||||
|
||||
attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #1 = { nofree nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #2 = { nounwind readnone }
|
||||
attributes #3 = { nounwind }
|
||||
|
||||
!llvm.module.flags = !{!0, !1, !2}
|
||||
!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
|
||||
!llvm.ident = !{!8}
|
||||
!nvvmir.version = !{!9}
|
||||
|
||||
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
|
||||
!1 = !{i32 1, !"wchar_size", i32 4}
|
||||
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
|
||||
!3 = !{void (float*, float*, float*, i32)* @_Z9vectorAddPKfS0_Pfi, !"kernel", i32 1}
|
||||
!4 = !{null, !"align", i32 8}
|
||||
!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
|
||||
!6 = !{null, !"align", i32 16}
|
||||
!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
|
||||
!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
|
||||
!9 = !{i32 1, i32 4}
|
||||
!10 = !{i32 0, i32 1024}
|
||||
!11 = !{!12, !12, i64 0}
|
||||
!12 = !{!"float", !13, i64 0}
|
||||
!13 = !{!"omnipotent char", !14, i64 0}
|
||||
!14 = !{!"Simple C++ TBAA"}
|
|
@ -0,0 +1,6 @@
|
|||
#!/bin/bash
|
||||
llvm-as kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
|
||||
../../../build/compilation/kernelTranslator kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc 1 1 1 32 1 1
|
||||
llc --filetype=obj kernel.bc
|
||||
g++ host.cpp kernel.o -lpthread -o test
|
||||
./test
|
|
@ -0,0 +1,16 @@
|
|||
cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
|
||||
project(
|
||||
X86runtime
|
||||
DESCRIPTION "Implementation CUDA runtime API with x86"
|
||||
LANGUAGES CXX)
|
||||
set(LIB_NAME x86Runtime)
|
||||
set(CMAKE_VERBOSE_MAKEFILE ON)
|
||||
|
||||
# compile threadPool implementation
|
||||
add_subdirectory(threadPool)
|
||||
|
||||
# compile x86 runtime library
|
||||
include_directories(./include)
|
||||
include_directories(./threadPool/include)
|
||||
file(GLOB proj_SOURCES "lib/*.cpp")
|
||||
add_library(${LIB_NAME} SHARED ${proj_SOURCES})
|
|
@ -0,0 +1,19 @@
|
|||
#ifndef __RUNTIME_IMPL__
|
||||
#define __RUNTIME_IMPL__
|
||||
#include "cudaStatus.h"
|
||||
#include "structures.h"
|
||||
cudaError_t cudaDeviceReset(void);
|
||||
cudaError_t cudaDeviceSynchronize(void);
|
||||
cudaError_t cudaFree(void *devPtr);
|
||||
cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
|
||||
void **args, size_t sharedMem,
|
||||
cudaStream_t stream);
|
||||
cudaError_t cudaMalloc(void **devPtr, size_t size);
|
||||
cudaError_t cudaMemcpy(void *dst, const void *src, size_t count,
|
||||
cudaMemcpyKind kind);
|
||||
cudaError_t cudaSetDevice(int device);
|
||||
cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src);
|
||||
cudaError_t cudaStreamCreate(cudaStream_t *pStream);
|
||||
cudaError_t cudaStreamDestroy(cudaStream_t stream);
|
||||
cudaError_t cudaStreamSynchronize(cudaStream_t stream);
|
||||
#endif
|
|
@ -0,0 +1,18 @@
|
|||
#ifndef __RUNTIME_STATUS__
|
||||
#define __RUNTIME_STATUS__
|
||||
#include <stdio.h>
|
||||
|
||||
enum cudaError_t {
|
||||
CudaSuccess = 0,
|
||||
CudaErrorInvalidValue = 1,
|
||||
CudaErrorInvalidMemoryAllocation = 2,
|
||||
};
|
||||
|
||||
enum cudaMemcpyKind {
|
||||
cudaMemcpyHostToHost = 0,
|
||||
cudaMemcpyHostToDevice = 1,
|
||||
cudaMemcpyDeviceToHost = 2,
|
||||
cudaMemcpyDeviceToDevice = 3,
|
||||
cudaMemcpyDefault = 4,
|
||||
};
|
||||
#endif
|
|
@ -0,0 +1,100 @@
|
|||
#include "cudaRuntimeImpl.h"
|
||||
#include "api.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
cudaError_t cudaDeviceReset(void) { scheduler_uninit(); }
|
||||
cudaError_t cudaDeviceSynchronize(void) { cuSynchronizeBarrier(); }
|
||||
cudaError_t cudaFree(void *devPtr) { free(devPtr); }
|
||||
cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
|
||||
void **args, size_t sharedMem,
|
||||
cudaStream_t stream) {
|
||||
// if scheduler is null init device
|
||||
|
||||
cu_kernel *ker =
|
||||
create_kernel(func, gridDim, blockDim, &args, sharedMem, stream);
|
||||
|
||||
int lstatus = cuLaunchKernel(&ker);
|
||||
}
|
||||
cudaError_t cudaMalloc(void **devPtr, size_t size) {
|
||||
*devPtr = malloc(size);
|
||||
if (devPtr == NULL)
|
||||
return cudaErrorMemoryAllocation;
|
||||
return cudaSuccess;
|
||||
}
|
||||
cudaError_t cudaMemcpy(void *dst, const void *src, size_t count,
|
||||
cudaMemcpyKind kind) {
|
||||
if (kind == cudaMemcpyHostToHost) {
|
||||
memcpy(dst, src, count);
|
||||
} else if (kind == cudaMemcpyDeviceToHost) {
|
||||
// how does the code know which device accessing the memory
|
||||
memcpy(dst, src, count);
|
||||
} else if (kind == cudaMemcpyHostToDevice) {
|
||||
// how does the code know which device accessing the memory
|
||||
memcpy(dst, src, count);
|
||||
} else if (kind == cudaMemcpyDeviceToHost) {
|
||||
// how does the code know which device accessing the memory
|
||||
memcpy(dst, src, count);
|
||||
} else if (kind == cudaMemcpyDeviceToDevice) {
|
||||
|
||||
memcpy(dst, dst, count);
|
||||
} else if (kind == cudaMemcpyDefault) {
|
||||
memcpy(dst, src, count);
|
||||
}
|
||||
return cudaSuccess;
|
||||
}
|
||||
|
||||
cudaError_t cudaSetDevice(int device) {
|
||||
// error checking
|
||||
init_device();
|
||||
}
|
||||
|
||||
cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src) {
|
||||
cstreamData *dst_stream = (cstreamData *)dst;
|
||||
cstreamData *src_stream = (cstreamData *)src;
|
||||
|
||||
if (dst_stream == NULL || src_stream == NULL) {
|
||||
return cudaErrorInvalidValue; // 1
|
||||
}
|
||||
|
||||
dst_stream->stream_priority = src_stream->stream_priority;
|
||||
dst_stream->stream_flags = src_stream->stream_flags;
|
||||
|
||||
return cudaSuccess; // 0
|
||||
}
|
||||
|
||||
cudaError_t cudaStreamCreate(cudaStream_t *pStream) {
|
||||
cstreamData *s = (cstreamData *)calloc(1, sizeof(cstreamData));
|
||||
if (s == NULL)
|
||||
return cudaErrorMemoryAllocation;
|
||||
s->ev.status = C_RUN;
|
||||
s->id = stream_counter;
|
||||
stream_counter++;
|
||||
s->stream_priority = DEFAULT;
|
||||
create_KernelQueue(&(s->kernelQueue));
|
||||
|
||||
INIT_LOCK(s->stream_lock);
|
||||
*pStream = (cudaStream_t)(s);
|
||||
|
||||
return cudaSuccess;
|
||||
}
|
||||
|
||||
cudaError_t cudaStreamDestroy(cudaStream_t stream) {
|
||||
cstreamData *s = (cstreamData *)(stream);
|
||||
|
||||
free(s->kernelQueue);
|
||||
|
||||
DESTROY_LOCK(s->stream_lock);
|
||||
|
||||
free(s);
|
||||
|
||||
return cudaSuccess;
|
||||
}
|
||||
|
||||
cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
|
||||
cstreamData *e = ((cstreamData *)(stream));
|
||||
MUTEX_LOCK(e->stream_lock);
|
||||
|
||||
e->ev.status = C_SYNCHRONIZE;
|
||||
e->ev.numKernelsToWait = e->kernelQueue->waiting_count;
|
||||
MUTEX_UNLOCK(e->stream_lock);
|
||||
}
|
|
@ -0,0 +1,17 @@
|
|||
cmake_minimum_required(VERSION 3.1)
|
||||
|
||||
# C project
|
||||
project(
|
||||
ThreadPool
|
||||
DESCRIPTION "Using pthread to implement ThreadPool"
|
||||
LANGUAGES CXX)
|
||||
|
||||
set(CMAKE_VERBOSE_MAKEFILE ON)
|
||||
set(LIB_NAME threadPool)
|
||||
|
||||
set(CMAKE_CXX_STANDARD 14)
|
||||
set(CMAKE_BUILD_TYPE Debug)
|
||||
include_directories(./include)
|
||||
|
||||
file(GLOB proj_SOURCES "lib/*.cpp")
|
||||
add_library(${LIB_NAME} SHARED ${proj_SOURCES})
|
|
@ -0,0 +1,25 @@
|
|||
#ifndef C_API_H
|
||||
#define C_API_H
|
||||
|
||||
#include "structures.h"
|
||||
|
||||
cu_kernel *create_kernel(void (*wrap)(cu_argument *));
|
||||
int getWorkItem(struct kernel_queue **qu, cu_kernel *ker,
|
||||
struct argument *kernel_arg, int **blockId);
|
||||
int create_KernelQueue(kernel_queue **q);
|
||||
|
||||
int dequeKernelLL(struct kernel_queue **qu);
|
||||
|
||||
int dequeKernel(struct kernel_queue **qu, cu_kernel *ker);
|
||||
int enqueueKernel(struct kernel_queue **qu, cu_kernel **ker);
|
||||
|
||||
int scheduler_init(cu_device device);
|
||||
void scheduler_uninit();
|
||||
void cuSynchronizeBarrier();
|
||||
|
||||
int set_kernel_arguments(cu_kernel **k, unsigned int arg_num, void **arg_value);
|
||||
|
||||
int setKernelDimensions(cu_kernel *k, struct argument **arg,
|
||||
void **totalBlockSize, void *blockId);
|
||||
|
||||
#endif
|
|
@ -0,0 +1,26 @@
|
|||
#ifndef C_DEF_H
|
||||
#define C_DEF_H
|
||||
|
||||
// Error
|
||||
#define C_SUCCESS 0x0
|
||||
#define C_ERROR 0x1
|
||||
|
||||
// execution status
|
||||
#define C_COMPLETE 0x2
|
||||
#define C_RUNNING 0x3
|
||||
#define C_SUBMITTED 0x4
|
||||
#define C_QUEUED 0x5
|
||||
#define C_CREATED 0x5
|
||||
|
||||
// stream status
|
||||
#define C_RUN 0x1
|
||||
#define C_WAIT 0x2
|
||||
#define C_SYNCHRONIZE 0x3
|
||||
|
||||
// Not Initliazed Error
|
||||
#define C_ERROR_NOT_INITIALIZED 0x6
|
||||
#define C_ERROR_MEMALLOC 0x7
|
||||
|
||||
#define C_QUEUE_EMPTY 0x8
|
||||
|
||||
#endif
|
|
@ -0,0 +1,38 @@
|
|||
#ifndef C_MACROS_H
|
||||
#define C_MACROS_H
|
||||
#include "assert.h"
|
||||
#include <pthread.h>
|
||||
|
||||
#define INIT_LOCK(__LOCK__) \
|
||||
{ \
|
||||
do { \
|
||||
int r = pthread_mutex_init(&(__LOCK__), NULL); \
|
||||
assert(r == 0); \
|
||||
} while (0); \
|
||||
}
|
||||
|
||||
#define MUTEX_LOCK(__LOCK__) \
|
||||
{ \
|
||||
do { \
|
||||
int r = pthread_mutex_lock(&(__LOCK__)); \
|
||||
assert(r == 0); \
|
||||
} while (0); \
|
||||
}
|
||||
|
||||
#define MUTEX_UNLOCK(__LOCK__) \
|
||||
{ \
|
||||
do { \
|
||||
int r = pthread_mutex_unlock(&(__LOCK__)); \
|
||||
assert(r == 0); \
|
||||
} while (0); \
|
||||
}
|
||||
|
||||
#define DESTROY_LOCK(__LOCK__) \
|
||||
{ \
|
||||
do { \
|
||||
int r = pthread_mutex_destroy(&(__LOCK__)); \
|
||||
assert(r == 0); \
|
||||
} while (0); \
|
||||
}
|
||||
|
||||
#endif // HEADER_FILE
|
|
@ -0,0 +1,191 @@
|
|||
#ifndef C_STRUCTURES_H
|
||||
#define C_STRUCTURES_H
|
||||
|
||||
#include "pthread.h"
|
||||
#define cudaStream_t cstreamData
|
||||
typedef struct device {
|
||||
int max_compute_units;
|
||||
int device_id;
|
||||
} cu_device;
|
||||
|
||||
typedef struct c_thread {
|
||||
pthread_t thread;
|
||||
unsigned long executed_commands;
|
||||
unsigned index;
|
||||
bool exit;
|
||||
} cu_ptd;
|
||||
|
||||
typedef struct scheduler_pool {
|
||||
|
||||
struct c_thread *thread_pool;
|
||||
|
||||
size_t num_worker_threads;
|
||||
size_t local_mem_size;
|
||||
int num_kernel_launch;
|
||||
int num_kernel_finished;
|
||||
int num_kernel_queued;
|
||||
size_t idle_threads;
|
||||
|
||||
pthread_cond_t wake_pool;
|
||||
|
||||
int threadpool_shutdown_requested;
|
||||
|
||||
// lock for scheduler
|
||||
pthread_mutex_t work_queue_lock;
|
||||
|
||||
// C99 array at the end
|
||||
// user kernel queue for only user called functions
|
||||
struct kernel_queue *kernelQueue;
|
||||
|
||||
} cu_pool;
|
||||
|
||||
struct kernel_queue {
|
||||
|
||||
struct kernel *head;
|
||||
struct kernel *tail;
|
||||
|
||||
// finish command count
|
||||
unsigned long finish_count;
|
||||
|
||||
// waiting to be run on threads
|
||||
unsigned long waiting_count;
|
||||
|
||||
// running count
|
||||
unsigned long running_count;
|
||||
|
||||
// total count
|
||||
unsigned long kernel_count;
|
||||
|
||||
// current index for task to be run
|
||||
unsigned long current_index;
|
||||
};
|
||||
|
||||
typedef struct command {
|
||||
|
||||
struct kernel *ker;
|
||||
|
||||
struct command *next;
|
||||
struct command *prev;
|
||||
|
||||
} cu_command;
|
||||
|
||||
typedef struct argument {
|
||||
// size of the argument to allocation
|
||||
size_t size;
|
||||
void *value;
|
||||
unsigned int index;
|
||||
} cu_argument;
|
||||
|
||||
typedef struct input_arg {
|
||||
// real values for the input
|
||||
char *p;
|
||||
struct argument *argus[];
|
||||
// (TODO): implement meta_data
|
||||
// the type of metadata will need to change to list of ints or something
|
||||
// so that we can parse the arguments p
|
||||
} cu_input;
|
||||
|
||||
struct dim3 {
|
||||
size_t x;
|
||||
size_t y;
|
||||
size_t z;
|
||||
dim3(int d1) {
|
||||
x = d1;
|
||||
y = z = 1;
|
||||
}
|
||||
dim3() { x = y = z = 1; }
|
||||
};
|
||||
|
||||
enum StreamType {
|
||||
DEFAULT,
|
||||
LOW,
|
||||
HIGH,
|
||||
EXT,
|
||||
};
|
||||
|
||||
struct cStreamDataInternal {
|
||||
/*
|
||||
status of the stream (run , wait)
|
||||
Run: Stream will asynchronously assign the kernel assign with this stream
|
||||
Wait: Stream will halt kernels from exiting the scheduler
|
||||
*/
|
||||
int status;
|
||||
/*
|
||||
if status == wait, wait on the number of kernels to wait to become 0
|
||||
*/
|
||||
unsigned long numKernelsToWait;
|
||||
unsigned int lastKernelIdToWait;
|
||||
unsigned int count; // number of task left in the stream
|
||||
};
|
||||
|
||||
typedef struct streamData {
|
||||
|
||||
// execution status of current event monitor
|
||||
struct cStreamDataInternal ev;
|
||||
pthread_mutex_t stream_lock; // lock on the stream
|
||||
StreamType stream_priority;
|
||||
unsigned int id;
|
||||
unsigned int stream_flags;
|
||||
|
||||
// queue of the kernels in this stream
|
||||
struct kernel_queue *kernelQueue;
|
||||
|
||||
} cstreamData;
|
||||
// kernel information
|
||||
typedef struct kernel {
|
||||
|
||||
void *(*start_routine)(void *);
|
||||
|
||||
void **args;
|
||||
|
||||
dim3 gridDim;
|
||||
dim3 blockDim;
|
||||
|
||||
struct kernel *next;
|
||||
struct kernel *prev;
|
||||
|
||||
size_t shared_mem;
|
||||
|
||||
cstreamData *stream;
|
||||
|
||||
struct event *barrier;
|
||||
|
||||
int status;
|
||||
|
||||
int totalBlocks;
|
||||
int N;
|
||||
|
||||
int blockSize;
|
||||
int kernelId;
|
||||
|
||||
// current blockId
|
||||
int blockId;
|
||||
|
||||
void *shared_mem_loc;
|
||||
|
||||
} cu_kernel;
|
||||
|
||||
typedef struct asyncKernel {
|
||||
unsigned int numBlocks;
|
||||
unsigned int numThreads;
|
||||
struct event *evt;
|
||||
struct kernel *ker;
|
||||
|
||||
struct asyncKernel *prev;
|
||||
struct asyncKernel *next;
|
||||
|
||||
} asyncKernel;
|
||||
|
||||
// command queue of command nodes
|
||||
|
||||
typedef struct kernel_arg_array {
|
||||
size_t size;
|
||||
unsigned int index;
|
||||
} karg_arr;
|
||||
|
||||
typedef struct kernel_image_arg {
|
||||
size_t size;
|
||||
unsigned int index;
|
||||
} k_arg;
|
||||
|
||||
#endif // HEADER_FILE
|
|
@ -0,0 +1,456 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <thread>
|
||||
|
||||
#include "api.h"
|
||||
#include "def.h"
|
||||
#include "macros.h"
|
||||
#include "structures.h"
|
||||
|
||||
/*
|
||||
Initialize the device
|
||||
*/
|
||||
int init_device() {
|
||||
|
||||
cu_device *device = (cu_device *)calloc(1, sizeof(cu_device));
|
||||
if (device == NULL)
|
||||
return C_ERROR_MEMALLOC;
|
||||
|
||||
device->max_compute_units = std::thread::hardware_concurrency();
|
||||
|
||||
// initialize scheduler
|
||||
int ret = scheduler_init(*device);
|
||||
|
||||
if (ret != C_SUCCESS)
|
||||
return ret;
|
||||
|
||||
return C_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
Create Kernel
|
||||
|
||||
*/
|
||||
static int kernelIds = 0;
|
||||
cu_kernel *create_kernel(const void *func, dim3 gridDim, dim3 blockDim,
|
||||
void ***args, size_t sharedMem, cstreamData *stream) {
|
||||
cu_kernel *ker = (cu_kernel *)calloc(1, sizeof(cu_kernel));
|
||||
|
||||
// set the function pointer
|
||||
ker->start_routine = (void *(*)(void *))func;
|
||||
// ker->start_routine(args);
|
||||
ker->args = *args;
|
||||
|
||||
ker->gridDim = gridDim;
|
||||
ker->blockDim = blockDim;
|
||||
|
||||
ker->shared_mem = sharedMem;
|
||||
|
||||
// malloc shared memory dynamic (heap , needs to be on the stack)
|
||||
// each thread create their own shared memory // after the task submission
|
||||
ker->shared_mem_loc = calloc(1, sharedMem);
|
||||
|
||||
ker->stream = stream;
|
||||
|
||||
ker->blockId = 0;
|
||||
|
||||
ker->totalBlocks = gridDim.x;
|
||||
|
||||
ker->N = blockDim.x;
|
||||
|
||||
ker->kernelId = kernelIds;
|
||||
kernelIds += 1;
|
||||
|
||||
ker->blockSize = blockDim.x;
|
||||
|
||||
return ker;
|
||||
}
|
||||
|
||||
/*
|
||||
Create Kernel Queue
|
||||
*/
|
||||
int create_KernelQueue(kernel_queue **q) {
|
||||
*q = (kernel_queue *)calloc(1, sizeof(kernel_queue));
|
||||
|
||||
if (*q == NULL) {
|
||||
return C_ERROR_MEMALLOC;
|
||||
}
|
||||
|
||||
(*q)->kernel_count = 0;
|
||||
(*q)->running_count = 0;
|
||||
(*q)->waiting_count = 0;
|
||||
(*q)->finish_count = 0;
|
||||
(*q)->current_index = 0;
|
||||
|
||||
return C_SUCCESS;
|
||||
}
|
||||
|
||||
int dequeKernelLL(struct kernel_queue **qu) {
|
||||
|
||||
struct kernel_queue *q = *qu;
|
||||
q->finish_count += 1;
|
||||
|
||||
// free the pointer
|
||||
if (q->head == NULL) {
|
||||
return C_QUEUE_EMPTY;
|
||||
} else {
|
||||
//*ker = *(q->head);
|
||||
q->head = (q->head)->next;
|
||||
if (q->head != NULL) {
|
||||
q->head->prev = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
return C_SUCCESS;
|
||||
}
|
||||
|
||||
int enqueueKernel(struct kernel_queue **qu, cu_kernel **ker) {
|
||||
struct kernel_queue *q = *qu;
|
||||
cu_kernel *p = *ker;
|
||||
|
||||
if (q->head == NULL) {
|
||||
q->head = p;
|
||||
q->tail = p;
|
||||
} else {
|
||||
p->prev = q->tail;
|
||||
q->tail->next = p;
|
||||
q->tail = p;
|
||||
p->next = NULL;
|
||||
}
|
||||
q->kernel_count += 1;
|
||||
q->waiting_count += 1;
|
||||
|
||||
// user kernel command
|
||||
|
||||
return C_SUCCESS;
|
||||
}
|
||||
|
||||
// scheduler
|
||||
static cu_pool *scheduler;
|
||||
|
||||
__thread int block_index = 0;
|
||||
__thread int thread_memory_size = 0;
|
||||
|
||||
/*
|
||||
Enqueue Kernel (k) to the scheduler kernelQueue
|
||||
*/
|
||||
int schedulerEnqueueKernel(cu_kernel **k) {
|
||||
cu_kernel *ker = *k;
|
||||
MUTEX_LOCK(scheduler->work_queue_lock);
|
||||
|
||||
enqueueKernel(&scheduler->kernelQueue, &ker);
|
||||
|
||||
pthread_cond_broadcast(&(scheduler->wake_pool));
|
||||
MUTEX_UNLOCK(scheduler->work_queue_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
Kernel Launch with numBlocks and numThreadsPerBlock
|
||||
*/
|
||||
int cuLaunchKernel(cu_kernel **k) {
|
||||
|
||||
// Calculate Block Size N/numBlocks
|
||||
|
||||
cu_kernel *ker = *k;
|
||||
int status = C_RUN;
|
||||
|
||||
MUTEX_LOCK(scheduler->work_queue_lock);
|
||||
scheduler->num_kernel_queued += 1;
|
||||
MUTEX_UNLOCK(scheduler->work_queue_lock);
|
||||
|
||||
// stream == 0 add to the kernelQueue
|
||||
if (ker->stream == 0) {
|
||||
|
||||
schedulerEnqueueKernel(&ker);
|
||||
} else {
|
||||
// add to it's stream queue
|
||||
// stream queue can be waiting or running with or without tasks
|
||||
MUTEX_LOCK(((cstreamData *)(ker->stream))->stream_lock);
|
||||
status = ((cstreamData *)(ker->stream))->ev.status;
|
||||
|
||||
// if stream queue status is run (first kernel) (enqueue to the kernel
|
||||
// queue)
|
||||
cstreamData *e = ((cstreamData *)(ker->stream));
|
||||
// synchronized is called after no job in the queue so stream is stuck on
|
||||
// synchronize
|
||||
if (e->ev.status == C_SYNCHRONIZE) {
|
||||
if ((e->kernelQueue->finish_count) == (e->kernelQueue->kernel_count)) {
|
||||
e->ev.status = C_RUN;
|
||||
}
|
||||
}
|
||||
|
||||
if (e->ev.status == C_RUN) {
|
||||
// change the status to wait
|
||||
e->ev.status == C_WAIT;
|
||||
MUTEX_UNLOCK(((cstreamData *)(ker->stream))->stream_lock);
|
||||
|
||||
schedulerEnqueueKernel(&ker);
|
||||
} else {
|
||||
// the status of stream queue is wait so just enqueue to the stream
|
||||
enqueueKernel(&((cstreamData *)(ker->stream))->kernelQueue, &ker);
|
||||
MUTEX_UNLOCK(((cstreamData *)(ker->stream))->stream_lock);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
Get Work Item: get the kernel from the queue and increment blockId
|
||||
*/
|
||||
int getWorkItem(struct kernel_queue **qu, cu_kernel **kern, int blockId) {
|
||||
struct kernel_queue *q = *qu;
|
||||
if (q->waiting_count > 0) {
|
||||
*kern = q->head;
|
||||
cu_kernel *ker = *kern;
|
||||
if (blockId + 1 == q->head->totalBlocks) {
|
||||
// deque the head
|
||||
dequeKernelLL(qu);
|
||||
ker->status = C_COMPLETE;
|
||||
q->waiting_count -= 1;
|
||||
} else {
|
||||
q->head->blockId += 1;
|
||||
}
|
||||
q->finish_count += 1;
|
||||
} else {
|
||||
return C_QUEUE_EMPTY;
|
||||
}
|
||||
return C_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
Thread Gets Work
|
||||
*/
|
||||
int get_work(c_thread *th) {
|
||||
|
||||
cu_kernel ker;
|
||||
|
||||
MUTEX_LOCK(scheduler->work_queue_lock);
|
||||
|
||||
RETRY:
|
||||
|
||||
int is_exit = 0;
|
||||
int is_command_not_null = 0;
|
||||
|
||||
int blockId;
|
||||
int localBlockSize;
|
||||
int status;
|
||||
int completion_status = 0;
|
||||
|
||||
is_exit = scheduler->threadpool_shutdown_requested;
|
||||
|
||||
MUTEX_UNLOCK(scheduler->work_queue_lock);
|
||||
|
||||
if (!is_exit) {
|
||||
|
||||
MUTEX_LOCK(scheduler->work_queue_lock);
|
||||
|
||||
// if kernel waiting to be complete is not zero
|
||||
if (scheduler->kernelQueue->waiting_count > 0) {
|
||||
blockId = scheduler->kernelQueue->head->blockId;
|
||||
localBlockSize = scheduler->kernelQueue->head->blockSize;
|
||||
// set status as success fully queue
|
||||
status = C_SUCCESS;
|
||||
ker = *(scheduler->kernelQueue->head);
|
||||
// if the blockId + 1 is equal to the goal block size ,
|
||||
// then its the last block
|
||||
|
||||
if (blockId + 1 == scheduler->kernelQueue->head->totalBlocks) {
|
||||
// deque the head
|
||||
dequeKernelLL(&scheduler->kernelQueue);
|
||||
|
||||
ker.status = C_COMPLETE;
|
||||
scheduler->kernelQueue->waiting_count -= 1;
|
||||
} else {
|
||||
// increment the blockId
|
||||
scheduler->kernelQueue->head->blockId =
|
||||
scheduler->kernelQueue->head->blockId + 1;
|
||||
}
|
||||
// status = getWorkItem(&scheduler->kernelQueue, &ker, blockId);
|
||||
} else {
|
||||
status = C_QUEUE_EMPTY;
|
||||
}
|
||||
MUTEX_UNLOCK(scheduler->work_queue_lock);
|
||||
}
|
||||
|
||||
if (status != C_QUEUE_EMPTY) {
|
||||
|
||||
block_index = blockId;
|
||||
thread_memory_size = ker.shared_mem;
|
||||
ker.start_routine(ker.args);
|
||||
|
||||
is_command_not_null = 1;
|
||||
if (ker.status == C_COMPLETE) {
|
||||
|
||||
// check if this kernel's stream has more jobs to run (enqueue the next
|
||||
// job)
|
||||
if (ker.stream != NULL) {
|
||||
bool synchronize = false;
|
||||
|
||||
MUTEX_LOCK(((cstreamData *)(ker.stream))->stream_lock);
|
||||
|
||||
if (((cstreamData *)(ker.stream))->ev.status == C_SYNCHRONIZE) {
|
||||
// synchronize stream
|
||||
if (((cstreamData *)(ker.stream))->ev.numKernelsToWait > 0) {
|
||||
((cstreamData *)(ker.stream))->ev.numKernelsToWait -= 1;
|
||||
}
|
||||
|
||||
MUTEX_LOCK(((cstreamData *)(ker.stream))->stream_lock);
|
||||
|
||||
if (((cstreamData *)(ker.stream))->ev.status == C_SYNCHRONIZE) {
|
||||
// synchronize stream
|
||||
if (((cstreamData *)(ker.stream))->ev.numKernelsToWait > 0) {
|
||||
((cstreamData *)(ker.stream))->ev.numKernelsToWait -= 1;
|
||||
}
|
||||
|
||||
if (((cstreamData *)(ker.stream))->ev.numKernelsToWait == 0) {
|
||||
synchronize = false;
|
||||
} else {
|
||||
synchronize = true;
|
||||
}
|
||||
}
|
||||
if (synchronize == false) {
|
||||
if (((cstreamData *)(ker.stream))->kernelQueue->waiting_count > 0) {
|
||||
((cstreamData *)(ker.stream))->ev.status = C_WAIT;
|
||||
|
||||
MUTEX_UNLOCK(((cstreamData *)(ker.stream))->stream_lock);
|
||||
cu_kernel *kern =
|
||||
((cstreamData *)(ker.stream))->kernelQueue->head;
|
||||
schedulerEnqueueKernel(&kern);
|
||||
dequeKernelLL(&((cstreamData *)(ker.stream))->kernelQueue);
|
||||
|
||||
} else {
|
||||
|
||||
// switch the stream to run to allow for the next execution
|
||||
((cstreamData *)(ker.stream))->ev.status = C_RUN;
|
||||
|
||||
MUTEX_UNLOCK(((cstreamData *)(ker.stream))->stream_lock);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
MUTEX_LOCK(scheduler->work_queue_lock);
|
||||
scheduler->num_kernel_finished += 1;
|
||||
MUTEX_UNLOCK(scheduler->work_queue_lock);
|
||||
}
|
||||
}
|
||||
|
||||
MUTEX_LOCK(scheduler->work_queue_lock);
|
||||
|
||||
if ((is_exit == 0 && is_command_not_null == 0)) {
|
||||
// all threads in condition wait
|
||||
scheduler->idle_threads += 1;
|
||||
pthread_cond_wait(&(scheduler->wake_pool), &(scheduler->work_queue_lock));
|
||||
scheduler->idle_threads -= 1;
|
||||
goto RETRY;
|
||||
}
|
||||
MUTEX_UNLOCK(scheduler->work_queue_lock);
|
||||
|
||||
return is_exit;
|
||||
}
|
||||
|
||||
void *driver_thread(void *p) {
|
||||
struct c_thread *td = (struct c_thread *)p;
|
||||
int is_exit = 0;
|
||||
td->exit = false;
|
||||
|
||||
while (1) {
|
||||
// get work
|
||||
is_exit = get_work(td);
|
||||
|
||||
// exit the routine
|
||||
if (is_exit) {
|
||||
td->exit = true;
|
||||
// pthread_exit
|
||||
pthread_exit(NULL);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
Initialize the scheduler
|
||||
*/
|
||||
int scheduler_init(cu_device device) {
|
||||
|
||||
scheduler = (cu_pool *)calloc(1, sizeof(cu_pool));
|
||||
scheduler->num_worker_threads = device.max_compute_units;
|
||||
|
||||
scheduler->thread_pool = (struct c_thread *)calloc(
|
||||
scheduler->num_worker_threads, sizeof(c_thread));
|
||||
kernel_queue *asq;
|
||||
create_KernelQueue(&asq);
|
||||
scheduler->kernelQueue = asq;
|
||||
|
||||
INIT_LOCK(scheduler->work_queue_lock);
|
||||
pthread_cond_init(&scheduler->wake_pool, NULL);
|
||||
scheduler->idle_threads = 0;
|
||||
|
||||
for (int i = 0; i < scheduler->num_worker_threads; i++) {
|
||||
scheduler->thread_pool[i].index = i;
|
||||
pthread_create(&scheduler->thread_pool[i].thread, NULL, driver_thread,
|
||||
(void *)&scheduler->thread_pool[i]);
|
||||
}
|
||||
|
||||
return C_SUCCESS;
|
||||
}
|
||||
|
||||
void scheduler_uninit() {
|
||||
unsigned i;
|
||||
|
||||
int r = pthread_mutex_lock(&scheduler->work_queue_lock);
|
||||
assert(r == 0);
|
||||
scheduler->threadpool_shutdown_requested = 1;
|
||||
pthread_cond_broadcast(&scheduler->wake_pool);
|
||||
|
||||
int r1 = pthread_mutex_unlock(&scheduler->work_queue_lock);
|
||||
assert(r1 == 0);
|
||||
|
||||
for (i = 0; i < scheduler->num_worker_threads; i++) {
|
||||
|
||||
pthread_join(scheduler->thread_pool[i].thread, NULL);
|
||||
}
|
||||
free(scheduler->thread_pool);
|
||||
free(scheduler->kernelQueue);
|
||||
|
||||
pthread_mutex_destroy(&scheduler->work_queue_lock);
|
||||
pthread_cond_destroy(&scheduler->wake_pool);
|
||||
|
||||
scheduler->threadpool_shutdown_requested = 0;
|
||||
}
|
||||
|
||||
int cuWait(cstreamData *evt) {
|
||||
|
||||
AGAIN:
|
||||
int r = pthread_mutex_lock(&evt->stream_lock);
|
||||
assert(r == 0);
|
||||
if (evt->ev.status != C_COMPLETE) {
|
||||
int r1 = pthread_mutex_unlock(&evt->stream_lock);
|
||||
assert(r1 == 0);
|
||||
goto AGAIN;
|
||||
}
|
||||
return C_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
Barrier for Kernel Launch
|
||||
|
||||
During kernel launch, increment the number of work items required to finish
|
||||
Each kernel will point to the same event
|
||||
|
||||
During Running Command, decrement the event.work_item count
|
||||
when count is 0, all work items for this kernel launch is finish
|
||||
|
||||
Sense Like Barrier
|
||||
Counting Barrier basically
|
||||
*/
|
||||
void cuSynchronizeBarrier() {
|
||||
AGAIN:
|
||||
|
||||
MUTEX_LOCK(scheduler->work_queue_lock);
|
||||
|
||||
if (scheduler->num_kernel_finished != scheduler->num_kernel_queued ||
|
||||
scheduler->idle_threads != scheduler->num_worker_threads) {
|
||||
MUTEX_UNLOCK(scheduler->work_queue_lock);
|
||||
goto AGAIN;
|
||||
} else {
|
||||
MUTEX_UNLOCK(scheduler->work_queue_lock);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue