From addf0a95b730f25f586322c984a306141606881f Mon Sep 17 00:00:00 2001 From: RobinHan Date: Tue, 11 Jan 2022 11:01:42 -0500 Subject: [PATCH] add backbone, including basic features for compilation --- CMakeLists.txt | 41 + CONTRIBUTING.md | 34 + LICENSE | 21 + README.md | 62 ++ compilation/CMakeLists.txt | 23 + compilation/HostTranslation.cpp | 25 + compilation/HostTranslation/CMakeLists.txt | 21 + .../include/ReplaceKernelLaunch.h | 11 + .../lib/ReplaceKernelLaunch.cpp | 94 ++ compilation/KernelTranslation.cpp | 53 ++ compilation/KernelTranslation/CMakeLists.txt | 21 + .../include/generate_x86_format.h | 8 + .../KernelTranslation/include/handle_sync.h | 10 + compilation/KernelTranslation/include/init.h | 7 + .../KernelTranslation/include/insert_sync.h | 9 + .../include/insert_warp_loop.h | 12 + .../include/memory_hierarchy.h | 9 + .../KernelTranslation/include/performance.h | 7 + compilation/KernelTranslation/include/tool.h | 24 + .../KernelTranslation/include/warp_func.h | 10 + .../lib/generate_x86_format.cpp | 119 +++ .../KernelTranslation/lib/handle_sync.cpp | 57 ++ compilation/KernelTranslation/lib/init.cpp | 302 +++++++ .../KernelTranslation/lib/insert_sync.cpp | 494 ++++++++++ .../lib/insert_warp_loop.cpp | 848 ++++++++++++++++++ .../lib/memory_hierarchy.cpp | 126 +++ .../KernelTranslation/lib/performance.cpp | 88 ++ compilation/KernelTranslation/lib/tool.cpp | 480 ++++++++++ .../KernelTranslation/lib/warp_func.cpp | 217 +++++ compilation/examples/reduce/host.cpp | 82 ++ .../kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll | 150 ++++ compilation/examples/reduce/run.sh | 6 + compilation/examples/reduce_shuffle/host.cpp | 82 ++ .../kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll | 179 ++++ compilation/examples/reduce_shuffle/run.sh | 6 + compilation/examples/run_example.sh | 11 + compilation/examples/vecadd/host.cpp | 84 ++ .../kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll | 86 ++ compilation/examples/vecadd/run.sh | 6 + runtime/CMakeLists.txt | 16 + runtime/include/cudaRuntimeImpl.h | 19 + runtime/include/cudaStatus.h | 18 + runtime/lib/cudaRuntimeImpl.cpp | 100 +++ runtime/threadPool/CMakeLists.txt | 17 + runtime/threadPool/include/api.h | 25 + runtime/threadPool/include/def.h | 26 + runtime/threadPool/include/macros.h | 38 + runtime/threadPool/include/structures.h | 191 ++++ runtime/threadPool/lib/api.cpp | 456 ++++++++++ 49 files changed, 4831 insertions(+) create mode 100644 CMakeLists.txt create mode 100644 CONTRIBUTING.md create mode 100644 LICENSE create mode 100644 README.md create mode 100644 compilation/CMakeLists.txt create mode 100644 compilation/HostTranslation.cpp create mode 100644 compilation/HostTranslation/CMakeLists.txt create mode 100644 compilation/HostTranslation/include/ReplaceKernelLaunch.h create mode 100644 compilation/HostTranslation/lib/ReplaceKernelLaunch.cpp create mode 100644 compilation/KernelTranslation.cpp create mode 100644 compilation/KernelTranslation/CMakeLists.txt create mode 100644 compilation/KernelTranslation/include/generate_x86_format.h create mode 100644 compilation/KernelTranslation/include/handle_sync.h create mode 100644 compilation/KernelTranslation/include/init.h create mode 100644 compilation/KernelTranslation/include/insert_sync.h create mode 100644 compilation/KernelTranslation/include/insert_warp_loop.h create mode 100644 compilation/KernelTranslation/include/memory_hierarchy.h create mode 100644 compilation/KernelTranslation/include/performance.h create mode 100644 compilation/KernelTranslation/include/tool.h create mode 100644 
compilation/KernelTranslation/include/warp_func.h
 create mode 100644 compilation/KernelTranslation/lib/generate_x86_format.cpp
 create mode 100644 compilation/KernelTranslation/lib/handle_sync.cpp
 create mode 100644 compilation/KernelTranslation/lib/init.cpp
 create mode 100644 compilation/KernelTranslation/lib/insert_sync.cpp
 create mode 100644 compilation/KernelTranslation/lib/insert_warp_loop.cpp
 create mode 100644 compilation/KernelTranslation/lib/memory_hierarchy.cpp
 create mode 100644 compilation/KernelTranslation/lib/performance.cpp
 create mode 100644 compilation/KernelTranslation/lib/tool.cpp
 create mode 100644 compilation/KernelTranslation/lib/warp_func.cpp
 create mode 100644 compilation/examples/reduce/host.cpp
 create mode 100644 compilation/examples/reduce/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
 create mode 100644 compilation/examples/reduce/run.sh
 create mode 100644 compilation/examples/reduce_shuffle/host.cpp
 create mode 100644 compilation/examples/reduce_shuffle/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
 create mode 100644 compilation/examples/reduce_shuffle/run.sh
 create mode 100644 compilation/examples/run_example.sh
 create mode 100644 compilation/examples/vecadd/host.cpp
 create mode 100644 compilation/examples/vecadd/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
 create mode 100644 compilation/examples/vecadd/run.sh
 create mode 100644 runtime/CMakeLists.txt
 create mode 100644 runtime/include/cudaRuntimeImpl.h
 create mode 100644 runtime/include/cudaStatus.h
 create mode 100644 runtime/lib/cudaRuntimeImpl.cpp
 create mode 100644 runtime/threadPool/CMakeLists.txt
 create mode 100644 runtime/threadPool/include/api.h
 create mode 100644 runtime/threadPool/include/def.h
 create mode 100644 runtime/threadPool/include/macros.h
 create mode 100644 runtime/threadPool/include/structures.h
 create mode 100644 runtime/threadPool/lib/api.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..7b08bef
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,41 @@
+cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
+
+project(CudaOnX86)
+set(CMAKE_PROJECT_DESCRIPTION "Executing CUDA on X86 architecture.")
+set(CMAKE_CXX_STANDARD "14")
+set(MAJOR_VERSION 0)
+set(MINOR_VERSION 1)
+set(VERSION_STRING ${MAJOR_VERSION}.${MINOR_VERSION})
+set(COX_VERSION ${VERSION_STRING})
+# locate llvm-config and query it for the header path and link flags
+if(DEFINED LLVM_CONFIG_PATH)
+  if(IS_ABSOLUTE "${LLVM_CONFIG_PATH}")
+    if(EXISTS "${LLVM_CONFIG_PATH}")
+      set(LLVM_CONFIG "${LLVM_CONFIG_PATH}")
+    else()
+      message(FATAL_ERROR "llvm-config is not found in ${LLVM_CONFIG_PATH}")
+    endif()
+    message(STATUS "Using llvm-config: ${LLVM_CONFIG}")
+    execute_process(
+      COMMAND "${LLVM_CONFIG}" "--cxxflags"
+      OUTPUT_VARIABLE LLVM_CXX_FLAG
+      OUTPUT_STRIP_TRAILING_WHITESPACE)
+    execute_process(
+      COMMAND "${LLVM_CONFIG}" "--libdir"
+      OUTPUT_VARIABLE LLVM_LIB_PATH
+      OUTPUT_STRIP_TRAILING_WHITESPACE)
+    execute_process(
+      COMMAND "${LLVM_CONFIG}" "--libs"
+      OUTPUT_VARIABLE LLVM_LINK_FLAG
+      OUTPUT_STRIP_TRAILING_WHITESPACE)
+  endif()
+else()
+  message(FATAL_ERROR "llvm-config is required")
+endif()
+
+set(CMAKE_CXX_FLAGS "${LLVM_CXX_FLAG} ${CMAKE_CXX_FLAGS}")
+
+set(GCC_COVERAGE_LINK_FLAGS
+    "-L${LLVM_LIB_PATH} ${LLVM_LINK_FLAG} -lz -lrt -ldl -ltinfo -lpthread -lm")
+
+add_subdirectory(compilation)
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..90d3f91
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,34 @@
+# Contributing to COX
+
+Thank you for your interest in contributing to COX!
+We appreciate all contributions, including but not limited to:
+
+- Adding documentation
+- Adding new features and components
+- Fixing bugs
+
+## How to contribute?
+
+0. (Optional) Open an issue and discuss your idea before you start
+1. Fork the latest version of COX
+2. Commit your changes to the forked repo
+3. Create a Pull Request against the COX main branch
+
+## Code style
+
+We follow clang-format style in this repo.
+To make sure your contribution follows the correct style,
+we highly recommend installing [pre-commit](https://pre-commit.com/)
+before development.
+
+```bash
+# a Python environment is required
+pip install pre-commit
+```
+
+Then, from the repository folder, execute the following command:
+
+```bash
+pre-commit install
+```
+
+With the pre-commit plugin, each local commit is checked automatically.
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..9864eaa
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2021 Ruobing Han
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..34461f1
--- /dev/null
+++ b/README.md
@@ -0,0 +1,62 @@
+# COX: CUDA on X86
+
+## Introduction
+
+This project consists of two parts: a series of LLVM passes that
+accept SPMD NVVM IR as input and emit the corresponding MPMD+SIMD
+LLVM IR, which can be executed on CPU devices, and a runtime that
+launches the translated kernels on CPU threads.
+
+## Install
+
+### Prerequisites
+
+* Linux: verified on Ubuntu 18.04
+* LLVM 10.0
+* NVIDIA CUDA toolkit
+* x86 CPU
+* pthread
+* GCC 7.5.0
+
+### Installation
+
+1. Clone from GitHub
+
+   ```bash
+   git clone https://github.com/drcut/open_source_template
+   cd open_source_template
+   ```
+
+2. Build the translator from NVVM IR to LLVM IR for x86
+
+   ```bash
+   mkdir build && cd build
+   cmake .. -DLLVM_CONFIG_PATH=`which llvm-config` # needs the path to llvm-config
+   make
+   ```
+
+## Run the vecadd sample
+
+```bash
+# Generate bitcode from human-readable LLVM IR
+llvm-as ../compilation/examples/vecadd/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
+# Use LLVM passes to transform NVVM IR (SPMD) into LLVM IR (MPMD+SIMD).
+# NOTE: the grid size (1, 1, 1) and block size (32, 1, 1) passed below
+# are hard-coded into the generated LLVM IR
+./compilation/kernelTranslator \
+    ../compilation/examples/vecadd/kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc \
+    kernel.bc 1 1 1 32 1 1
+# generate an object file from the LLVM IR
+llc --filetype=obj kernel.bc
+# link the generated kernel function with the host function
+# and produce an executable
+g++ ../compilation/examples/vecadd/host.cpp \
+    kernel.o -lpthread -o vecadd_example
+# run the executable
+./vecadd_example
+```
+
+## Author
+
+[Ruobing Han](https://drcut.github.io/) is a CS PhD student at the
+Georgia Institute of Technology, under the supervision of
+Prof. [Hyesoon Kim](https://www.cc.gatech.edu/~hyesoon/).
diff --git a/compilation/CMakeLists.txt b/compilation/CMakeLists.txt
new file mode 100644
index 0000000..868f9c4
--- /dev/null
+++ b/compilation/CMakeLists.txt
@@ -0,0 +1,23 @@
+cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
+project(
+  NVVM2X86
+  DESCRIPTION "Translate NVVM IR to LLVM IR for X86"
+  LANGUAGES CXX)
+
+set(CMAKE_VERBOSE_MAKEFILE ON)
+
+# compile the kernel translator
+include_directories(./KernelTranslation/include)
+add_subdirectory(KernelTranslation)
+
+add_executable(kernelTranslator KernelTranslation.cpp)
+target_link_libraries(kernelTranslator spmd2mpmd ${GCC_COVERAGE_LINK_FLAGS})
+
+# compile the host translator
+include_directories(./HostTranslation/include)
+add_subdirectory(HostTranslation)
+
+add_executable(hostTranslator HostTranslation.cpp)
+
+target_link_libraries(hostTranslator spmd2mpmd cudaRuntime2cpuRuntime
+                      ${GCC_COVERAGE_LINK_FLAGS})
diff --git a/compilation/HostTranslation.cpp b/compilation/HostTranslation.cpp
new file mode 100644
index 0000000..9695b56
--- /dev/null
+++ b/compilation/HostTranslation.cpp
@@ -0,0 +1,25 @@
+#include "ReplaceKernelLaunch.h"
+#include "tool.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Verifier.h"
+#include <assert.h>
+#include <iostream>
+#include <string>
+
+using namespace llvm;
+
+int main(int argc, char **argv) {
+  assert(argc == 3 && "incorrect number of arguments\n");
+
+  char *input_host_path = argv[1];
+  char *output_host_path = argv[2];
+
+  // load the host LLVM module
+  llvm::Module *hostModule = LoadModuleFromFilr(input_host_path);
+  VerifyModule(hostModule);
+  // process the host module
+  ReplaceKernelLaunch(hostModule);
+  VerifyModule(hostModule);
+  DumpModule(hostModule, output_host_path);
+  return 0;
+}
diff --git a/compilation/HostTranslation/CMakeLists.txt b/compilation/HostTranslation/CMakeLists.txt
new file mode 100644
index 0000000..2e97dd6
--- /dev/null
+++ b/compilation/HostTranslation/CMakeLists.txt
@@ -0,0 +1,21 @@
+cmake_minimum_required(VERSION 3.1)
+
+# C++ project
+project(
+  HostTranslation
+  DESCRIPTION "Translate CUDA host modules to CPU host modules,
+  mainly replacing CUDA Runtime APIs with CPU Runtime APIs"
+  LANGUAGES CXX)
+
+set(CMAKE_VERBOSE_MAKEFILE ON)
+set(LIB_NAME cudaRuntime2cpuRuntime)
+
+set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_BUILD_TYPE Debug)
+include_directories(./include)
+
+file(GLOB proj_HEADERS "include/*.h")
+file(GLOB proj_SOURCES "lib/*.cpp")
+
+# Add core library.
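+# The cudaRuntime2cpuRuntime library built here is linked, together with
+# spmd2mpmd, into the hostTranslator driver by ../CMakeLists.txt.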
+add_library(${LIB_NAME} SHARED ${proj_HEADERS} ${proj_SOURCES})
diff --git a/compilation/HostTranslation/include/ReplaceKernelLaunch.h b/compilation/HostTranslation/include/ReplaceKernelLaunch.h
new file mode 100644
index 0000000..769489c
--- /dev/null
+++ b/compilation/HostTranslation/include/ReplaceKernelLaunch.h
@@ -0,0 +1,11 @@
+#ifndef __NVVM2x86_REPLACE_KERNEL_LAUNCH__
+#define __NVVM2x86_REPLACE_KERNEL_LAUNCH__
+
+#include "llvm/IR/Module.h"
+/*
+ * Original:   i8* bitcast (void (i32*, i32*, i32*, i32)* @_Z9vecPKiS0_Pii to i8*)
+ * Changed to: i8* bitcast (i8* (i8*)* @_Z9vecPKiS0_Pii_wrapper to i8*)
+ */
+void ReplaceKernelLaunch(llvm::Module *M);
+
+#endif
diff --git a/compilation/HostTranslation/lib/ReplaceKernelLaunch.cpp b/compilation/HostTranslation/lib/ReplaceKernelLaunch.cpp
new file mode 100644
index 0000000..67525d7
--- /dev/null
+++ b/compilation/HostTranslation/lib/ReplaceKernelLaunch.cpp
@@ -0,0 +1,94 @@
+#include "ReplaceKernelLaunch.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include <iostream>
+#include <map>
+#include <vector>
+
+using namespace llvm;
+
+// Original:   i8* bitcast (void (i32*, i32*, i32*, i32)* @_Z9vecPKiS0_Pii to i8*)
+// Changed to: i8* bitcast (i8* (i8*)* @_Z9vecPKiS0_Pii_wrapper to i8*)
+void ReplaceKernelLaunch(llvm::Module *M) {
+  LLVMContext &context = M->getContext();
+  auto VoidTy = llvm::Type::getVoidTy(context);
+  auto I8Ptr = llvm::Type::getInt8PtrTy(context);
+  std::map<std::string, BitCastInst *> kernels;
+
+  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
+    Function *F = &(*i);
+    auto func_name = F->getName().str();
+
+    for (Function::iterator b = F->begin(); b != F->end(); ++b) {
+      BasicBlock *B = &(*b);
+
+      for (BasicBlock::iterator bi = B->begin(); bi != B->end(); ++bi) {
+        Instruction *inst = &(*bi);
+
+        if (llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(inst)) {
+          if (Function *calledFunction = callInst->getCalledFunction()) {
+
+            if (calledFunction->getName().startswith("cudaLaunchKernel")) {
+
+              Value *callOperand = callInst->getArgOperand(0);
+
+              Function *functionOperand =
+                  dyn_cast<Function>(callInst->getArgOperand(0));
+
+              // the launched function is wrapped in a bitcast
+              if (functionOperand == NULL) {
+
+                functionOperand =
+                    dyn_cast<Function>(callOperand->stripPointerCasts());
+
+                std::cout << " Parent (Caller) Function Name: " << func_name
+                          << ", cudaLaunchKernel Function: "
+                          << functionOperand->getName().str() << ", args "
+                          << functionOperand->arg_size() << std::endl;
+                auto rep = kernels.find(functionOperand->getName().str());
+                if (rep != kernels.end()) {
+                  callInst->setArgOperand(0, rep->second);
+                  continue;
+                }
+
+                std::vector<Type *> Params;
+                Params.push_back(I8Ptr);
+                FunctionType *FT = FunctionType::get(VoidTy, Params, false);
+                std::string newName =
+                    functionOperand->getName().str() + "_wrapper";
+
+                Function *wrapper =
+                    Function::Create(FT, Function::ExternalLinkage, newName, M);
+                wrapper->setDSOLocal(true);
+
+                BitCastInst *BC = new BitCastInst(wrapper, I8Ptr, "", callInst);
+                callInst->setArgOperand(0, BC);
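+                // Cache the wrapper bitcast so later launches of the same
+                // kernel reuse it instead of declaring a duplicate wrapper.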
+                kernels.insert({functionOperand->getName().str(), BC});
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/compilation/KernelTranslation.cpp b/compilation/KernelTranslation.cpp
new file mode 100644
index 0000000..1d24c7b
--- /dev/null
+++ b/compilation/KernelTranslation.cpp
@@ -0,0 +1,53 @@
+#include "generate_x86_format.h"
+#include "handle_sync.h"
+#include "init.h"
+#include "insert_sync.h"
+#include "insert_warp_loop.h"
+#include "performance.h"
+#include "tool.h"
+#include "warp_func.h"
+#include "llvm/IR/Module.h"
+#include <assert.h>
+#include <fstream>
+#include <iostream>
+#include <stdlib.h>
+#include <string>
+
+using namespace llvm;
+
+int main(int argc, char **argv) {
+  assert(argc == 9 && "incorrect number of arguments\n");
+  llvm::Module *program = LoadModuleFromFilr(argv[1]);
+  // get the grid and block dimensions from the input arguments
+  int *grid_dim = new int[3];
+  int *block_dim = new int[3];
+  grid_dim[0] = atoi(argv[3]);
+  grid_dim[1] = atoi(argv[4]);
+  grid_dim[2] = atoi(argv[5]);
+  block_dim[0] = atoi(argv[6]);
+  block_dim[1] = atoi(argv[7]);
+  block_dim[2] = atoi(argv[8]);
+
+  // inline functions and create auxiliary global variables
+  init_block(program);
+  // insert sync before each vote, and replace the
+  // original vote function with a warp-level vote
+  handle_warp_vote(program);
+  // replace warp shuffle
+  handle_warp_shfl(program);
+  // insert sync
+  insert_sync(program);
+  // split blocks at sync points
+  split_block_by_sync(program);
+  // add loops for intra- and inter-warp threads
+  insert_warp_loop(program);
+  // TODO: replace this workaround
+  replace_built_in_function(program, grid_dim, block_dim);
+  // VerifyModule(program);
+  generate_x86_format(program);
+  // performance optimization
+  performance_optimization(program);
+
+  DumpModule(program, argv[2]);
+  return 0;
+}
diff --git a/compilation/KernelTranslation/CMakeLists.txt b/compilation/KernelTranslation/CMakeLists.txt
new file mode 100644
index 0000000..7ec3898
--- /dev/null
+++ b/compilation/KernelTranslation/CMakeLists.txt
@@ -0,0 +1,21 @@
+cmake_minimum_required(VERSION 3.1)
+
+# C++ project
+project(
+  KernelTranslation
+  DESCRIPTION
+    "Translate SPMD Kernel to MPMD format with hierarchical collapsing"
+  LANGUAGES CXX)
+
+set(CMAKE_VERBOSE_MAKEFILE ON)
+set(LIB_NAME spmd2mpmd)
+
+set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_BUILD_TYPE Debug)
+include_directories(./include)
+
+file(GLOB proj_HEADERS "include/*.h")
+file(GLOB proj_SOURCES "lib/*.cpp")
+
+# Add core library.
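+# The spmd2mpmd library built here is linked into both the kernelTranslator
+# and hostTranslator drivers by ../CMakeLists.txt.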
+add_library(${LIB_NAME} SHARED ${proj_HEADERS} ${proj_SOURCES}) diff --git a/compilation/KernelTranslation/include/generate_x86_format.h b/compilation/KernelTranslation/include/generate_x86_format.h new file mode 100644 index 0000000..dff3694 --- /dev/null +++ b/compilation/KernelTranslation/include/generate_x86_format.h @@ -0,0 +1,8 @@ +#ifndef __NVVM2x86_GENERATE_X86_FORMAT__ +#define __NVVM2x86_GENERATE_X86_FORMAT__ + +#include "llvm/IR/Module.h" + +void generate_x86_format(llvm::Module *M); + +#endif diff --git a/compilation/KernelTranslation/include/handle_sync.h b/compilation/KernelTranslation/include/handle_sync.h new file mode 100644 index 0000000..e007f96 --- /dev/null +++ b/compilation/KernelTranslation/include/handle_sync.h @@ -0,0 +1,10 @@ +#ifndef __NVVM2x86_HANDLE_SYNC__ +#define __NVVM2x86_HANDLE_SYNC__ + +#include "llvm/IR/Module.h" + +using namespace llvm; + +void split_block_by_sync(llvm::Module *M); + +#endif diff --git a/compilation/KernelTranslation/include/init.h b/compilation/KernelTranslation/include/init.h new file mode 100644 index 0000000..10f5186 --- /dev/null +++ b/compilation/KernelTranslation/include/init.h @@ -0,0 +1,7 @@ +#ifndef __NVVM2x86_INIT__ +#define __NVVM2x86_INIT__ + +#include "llvm/IR/Module.h" + +void init_block(llvm::Module *M); +#endif diff --git a/compilation/KernelTranslation/include/insert_sync.h b/compilation/KernelTranslation/include/insert_sync.h new file mode 100644 index 0000000..a7e2d8a --- /dev/null +++ b/compilation/KernelTranslation/include/insert_sync.h @@ -0,0 +1,9 @@ +#ifndef __NVVM2x86_INSERT_SYNC__ +#define __NVVM2x86_INSERT_SYNC__ + +#include "llvm/IR/Function.h" + +// insert extra barrier +void insert_sync(llvm::Module *M); + +#endif diff --git a/compilation/KernelTranslation/include/insert_warp_loop.h b/compilation/KernelTranslation/include/insert_warp_loop.h new file mode 100644 index 0000000..da1a708 --- /dev/null +++ b/compilation/KernelTranslation/include/insert_warp_loop.h @@ -0,0 +1,12 @@ +#ifndef __NVVM2x86_INSERT_WARP_LOOP__ +#define __NVVM2x86_INSERT_WARP_LOOP__ + +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" + +using namespace llvm; + +void insert_warp_loop(llvm::Module *M); + +#endif diff --git a/compilation/KernelTranslation/include/memory_hierarchy.h b/compilation/KernelTranslation/include/memory_hierarchy.h new file mode 100644 index 0000000..1f8495c --- /dev/null +++ b/compilation/KernelTranslation/include/memory_hierarchy.h @@ -0,0 +1,9 @@ +#ifndef __NVVM2x86_MEMORY_HIERARCHY__ +#define __NVVM2x86_MEMORY_HIERARCHY__ +#include "llvm/IR/Module.h" + +using namespace llvm; + +void mem_share2global(llvm::Module *M); + +#endif diff --git a/compilation/KernelTranslation/include/performance.h b/compilation/KernelTranslation/include/performance.h new file mode 100644 index 0000000..bb9bf88 --- /dev/null +++ b/compilation/KernelTranslation/include/performance.h @@ -0,0 +1,7 @@ +#ifndef __NVVM2x86_PERFORMANCE__ +#define __NVVM2x86_PERFORMANCE__ + +#include "llvm/IR/Module.h" + +void performance_optimization(llvm::Module *M); +#endif diff --git a/compilation/KernelTranslation/include/tool.h b/compilation/KernelTranslation/include/tool.h new file mode 100644 index 0000000..cb1963f --- /dev/null +++ b/compilation/KernelTranslation/include/tool.h @@ -0,0 +1,24 @@ +#ifndef __NVVM2x86_TOOL__ +#define __NVVM2x86_TOOL__ + +#include "llvm/IR/Module.h" +llvm::Module *LoadModuleFromFilr(char *file_name); +void DumpModule(llvm::Module *M, char *file_name); +bool 
isKernelFunction(llvm::Module *M, llvm::Function *F);
+void replace_block(llvm::Function *F, llvm::BasicBlock *before,
+                   llvm::BasicBlock *after);
+llvm::CallInst *CreateInterWarpBarrier(llvm::Instruction *InsertBefore);
+llvm::CallInst *CreateIntraWarpBarrier(llvm::Instruction *InsertBefore);
+void VerifyModule(llvm::Module *);
+void phi2alloc(llvm::Module *M);
+void remove_cuda_built_in(llvm::Module *M);
+void replace_built_in_function(llvm::Module *M, int *grid_dim, int *block_dim);
+void replace_asm_call(llvm::Module *M);
+bool find_block_barrier_in_region(llvm::BasicBlock *start,
+                                  llvm::BasicBlock *end);
+bool find_barrier_in_region(llvm::BasicBlock *start, llvm::BasicBlock *end);
+bool has_warp_barrier(llvm::BasicBlock *B);
+bool has_barrier(llvm::BasicBlock *B);
+bool has_block_barrier(llvm::BasicBlock *B);
+bool has_barrier(llvm::Function *F);
+#endif
diff --git a/compilation/KernelTranslation/include/warp_func.h b/compilation/KernelTranslation/include/warp_func.h
new file mode 100644
index 0000000..1fc0554
--- /dev/null
+++ b/compilation/KernelTranslation/include/warp_func.h
@@ -0,0 +1,10 @@
+#ifndef __NVVM2x86_WARP_FUNC__
+#define __NVVM2x86_WARP_FUNC__
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+
+void handle_warp_vote(llvm::Module *M);
+void handle_warp_shfl(llvm::Module *M);
+#endif
diff --git a/compilation/KernelTranslation/lib/generate_x86_format.cpp b/compilation/KernelTranslation/lib/generate_x86_format.cpp
new file mode 100644
index 0000000..dbfacbc
--- /dev/null
+++ b/compilation/KernelTranslation/lib/generate_x86_format.cpp
@@ -0,0 +1,119 @@
+#include "generate_x86_format.h"
+#include "tool.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/ToolOutputFile.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+
+using namespace llvm;
+
+void set_meta_data(llvm::Module *M) {
+  M->setTargetTriple("x86_64-unknown-linux-gnu");
+  M->setDataLayout(
+      "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128");
+}
+
+// pthread accepts only a single void* argument, so the packed
+// kernel arguments have to be decoded inside the generated wrapper
+void decode_input(llvm::Module *M) {
+
+  std::set<llvm::Function *> need_remove;
+
+  LLVMContext *C = &M->getContext();
+  llvm::Type *Int32T = Type::getInt32Ty(*C);
+  llvm::Type *Int8T = Type::getInt8Ty(*C);
+
+  llvm::FunctionType *LauncherFuncT = FunctionType::get(
+      Type::getVoidTy(*C), {PointerType::get(Int8T, 0)}, false);
+
+  // generate the wrapper function type;
+  // for now, each argument is decoded from a single int32* slot
+  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
+    Function *F = &(*i);
+    if (!isKernelFunction(M, F))
+      continue;
+    auto func_name = F->getName().str();
+    llvm::IRBuilder<> Builder(M->getContext());
+
+    FunctionCallee fc =
+        M->getOrInsertFunction(func_name + "_wrapper", LauncherFuncT);
+    Function *WorkGroup = dyn_cast<Function>(fc.getCallee());
+
+    BasicBlock *Block = BasicBlock::Create(M->getContext(), "", WorkGroup);
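+    // The wrapper body built below treats its single i8* argument as an
+    // array of pointers: slot i is loaded, cast to the type of kernel
+    // parameter i, and dereferenced before calling the real kernel.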
+    Builder.SetInsertPoint(Block);
+
+    // WorkGroup has only a single input
+    Function::arg_iterator ai = WorkGroup->arg_begin();
+
+    SmallVector<Value *, 8> Arguments;
+    Value *input_arg = &*ai;
+    // convert to int**
+    input_arg = Builder.CreateBitOrPointerCast(
+        input_arg, PointerType::get(PointerType::get(Int32T, 0), 0));
+    size_t idx = 0;
+    for (Function::const_arg_iterator ii = F->arg_begin(), ee = F->arg_end();
+         ii != ee; ++ii) {
+      Type *ArgType = ii->getType();
+
+      // calculate the address of the idx-th slot
+      Value *GEP = Builder.CreateGEP(input_arg, ConstantInt::get(Int32T, idx));
+      // load the corresponding int*
+      GEP = Builder.CreateLoad(GEP);
+      // bitcast
+      GEP = Builder.CreateBitOrPointerCast(GEP, PointerType::get(ArgType, 0));
+      Value *Arg = Builder.CreateLoad(GEP);
+      Arguments.push_back(Arg);
+      ++idx;
+    }
+
+    Builder.CreateCall(F, ArrayRef<Value *>(Arguments));
+    Builder.CreateRetVoid();
+  }
+  for (auto f : need_remove) {
+    f->dropAllReferences();
+    f->eraseFromParent();
+  }
+}
+
+void remove_barrier(llvm::Module *M) {
+  std::vector<llvm::Instruction *> need_remove;
+  for (auto F = M->begin(); F != M->end(); ++F)
+    for (auto BB = F->begin(); BB != F->end(); ++BB) {
+      for (auto BI = BB->begin(); BI != BB->end(); BI++) {
+        if (auto Call = dyn_cast<CallInst>(BI)) {
+          if (!Call->getCalledFunction())
+            continue;
+          auto func_name = Call->getCalledFunction()->getName().str();
+          if (func_name == "llvm.nvvm.bar.warp.sync" ||
+              func_name == "llvm.nvvm.barrier0" ||
+              func_name == "llvm.nvvm.barrier.sync") {
+            need_remove.push_back(Call);
+          }
+        }
+      }
+    }
+  for (auto inst : need_remove) {
+    inst->eraseFromParent();
+  }
+}
+
+void generate_x86_format(llvm::Module *M) {
+  // change metadata
+  set_meta_data(M);
+  // decode arguments
+  decode_input(M);
+  // remove barriers
+  remove_barrier(M);
+}
diff --git a/compilation/KernelTranslation/lib/handle_sync.cpp b/compilation/KernelTranslation/lib/handle_sync.cpp
new file mode 100644
index 0000000..e0fb19b
--- /dev/null
+++ b/compilation/KernelTranslation/lib/handle_sync.cpp
@@ -0,0 +1,57 @@
+#include "handle_sync.h"
+#include "tool.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include <set>
+#include <string>
+
+using namespace llvm;
+
+void split_block_by_sync(llvm::Function *F) {
+  std::set<llvm::Instruction *> sync_inst;
+  bool jump_first_sync = true;
+  for (Function::iterator b = F->begin(); b != F->end(); ++b) {
+    BasicBlock *B = &(*b);
+    for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) {
+      Instruction *inst = &(*i);
+      if (jump_first_sync) {
+        jump_first_sync = false;
+        Instruction *next_inst = &(*std::next(i));
+        sync_inst.insert(next_inst);
+        continue;
+      }
+      llvm::CallInst *Call = llvm::dyn_cast<llvm::CallInst>(inst);
+      if (Call && Call->getCalledFunction()) {
+        auto func_name = Call->getCalledFunction()->getName().str();
+        if (func_name == "llvm.nvvm.barrier0" ||
+            func_name == "llvm.nvvm.bar.warp.sync" ||
+            func_name == "llvm.nvvm.barrier.sync") {
+          sync_inst.insert(Call);
+          // also split at the next instruction,
+          // so that each sync ends up in a block of its own
+          Instruction *next_inst = &(*std::next(i));
+          sync_inst.insert(next_inst);
+        }
+      }
+    }
+  }
+  int _tmp = 0;
+  for (auto inst : sync_inst) {
+    inst->getParent()->splitBasicBlock(
+        inst, inst->getParent()->getName().str() + "_after_block_sync_" +
+                  std::to_string(_tmp++));
+  }
+}
+
+void split_block_by_sync(llvm::Module *M) {
+  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
+    Function *F = &(*i);
+    if (isKernelFunction(M, F))
+      split_block_by_sync(F);
+  }
+}
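+// For example, a block holding
+//   %a = ...; call void @llvm.nvvm.barrier0(); %b = ...
+// is split into three blocks {%a}, {barrier0} and
+// "<name>_after_block_sync_N" holding {%b}, so every barrier ends up
+// alone in its own basic block, which later passes use as a region boundary.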
diff --git a/compilation/KernelTranslation/lib/init.cpp b/compilation/KernelTranslation/lib/init.cpp
new file mode 100644
index 0000000..8007470
--- /dev/null
+++ b/compilation/KernelTranslation/lib/init.cpp
@@ -0,0 +1,302 @@
+#include "init.h"
+#include "memory_hierarchy.h"
+#include "tool.h"
+#include <set>
+#include <stdio.h>
+#include <vector>
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/PassInfo.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Transforms/IPO/PassManagerBuilder.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+
+using namespace llvm;
+
+void inline_func_vote(llvm::Module *M) {
+  std::set<llvm::Function *> need_remove;
+
+  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
+    Function *F = &(*i);
+    auto func_name = F->getName().str();
+    if (!isKernelFunction(M, F))
+      continue;
+    Function::iterator I = F->begin();
+    for (Function::iterator E = F->end(); I != E; ++I) {
+      for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
+        if (CallInst *c = dyn_cast<CallInst>(BI++)) {
+          if (c->getCalledFunction()) {
+            if (c->getCalledFunction()->getName().str() == "_Z10__any_syncji") {
+              // record the callee before inlining, as the call
+              // instruction is no longer valid afterwards
+              Function *callee = c->getCalledFunction();
+              InlineFunctionInfo IFI;
+              InlineFunction(c, IFI);
+              need_remove.insert(callee);
+            }
+          }
+        }
+      }
+    }
+  }
+  for (auto f : need_remove) {
+    f->dropAllReferences();
+    f->eraseFromParent();
+  }
+}
+
+void create_global_variable(llvm::Module *M) {
+  llvm::Type *I32 = llvm::Type::getInt32Ty(M->getContext());
+  llvm::Type *I8 = llvm::Type::getInt8Ty(M->getContext());
+
+  auto zero = llvm::ConstantInt::get(I32, 0, true);
+
+  // array types for the warp-level shuffle and vote buffers
+  llvm::Type *WarpArrayType = llvm::ArrayType::get(I32, 32);
+  llvm::Type *VoteArrayType = llvm::ArrayType::get(I8, 32);
+
+  new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
+                           zero, "intra_warp_index", NULL,
+                           llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
+  new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
+                           zero, "inter_warp_index", NULL,
+                           llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
+  new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
+                           NULL, "block_size", NULL,
+                           llvm::GlobalValue::NotThreadLocal, 0, false);
+  new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
+                           NULL, "block_size_x", NULL,
+                           llvm::GlobalValue::NotThreadLocal, 0, false);
+  new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
+                           NULL, "block_size_y", NULL,
+                           llvm::GlobalValue::NotThreadLocal, 0, false);
+  new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
+                           NULL, "block_size_z", NULL,
+                           llvm::GlobalValue::NotThreadLocal, 0, false);
+  new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
+                           NULL, "grid_size", NULL,
+                           llvm::GlobalValue::NotThreadLocal, 0, false);
+  new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
+                           NULL, "block_index", NULL,
+                           llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
+  // TLS variables used for warp-level collective operators
+  new llvm::GlobalVariable(
+      *M, WarpArrayType, false, llvm::GlobalValue::ExternalLinkage, NULL,
+      "warp_shfl", NULL, llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
+  auto warp_vote = new llvm::GlobalVariable(
+      *M, VoteArrayType, false, llvm::GlobalValue::ExternalLinkage, NULL,
+      "warp_vote", NULL, llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
+  warp_vote->setAlignment(llvm::MaybeAlign(32));
+}
+
+void remove_metadata(llvm::Module *M) {
+  SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
+  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
+    Function *F = &(*i);
+    F->getAllMetadata(MDs);
+    for (auto &MD : MDs) {
+      F->setMetadata(MD.first, NULL);
+    }
+    F->removeFnAttr("target-features");
+    F->removeFnAttr("target-cpu");
+  }
+}
+
+void init_llvm_pass() {
+
+  InitializeAllTargets();
+  InitializeAllTargetMCs();
+  InitializeAllAsmPrinters();
+  InitializeAllAsmParsers();
+
+  PassRegistry &Registry = *PassRegistry::getPassRegistry();
+
+  initializeCore(Registry);
+  initializeScalarOpts(Registry);
+  initializeVectorization(Registry);
+  initializeIPO(Registry);
+  initializeAnalysis(Registry);
+  initializeTransformUtils(Registry);
+  initializeInstCombine(Registry);
+  initializeInstrumentation(Registry);
+  initializeTarget(Registry);
+
+  llvm::StringMap<llvm::cl::Option *> &opts = llvm::cl::getRegisteredOptions();
+
+  llvm::cl::Option *O = nullptr;
+
+  O = opts["scalarize-load-store"];
+  assert(O && "could not find LLVM option 'scalarize-load-store'");
+  O->addOccurrence(1, StringRef("scalarize-load-store"), StringRef("1"), false);
+
+  // The LLVM inner-loop vectorizer does not check whether a loop is nested
+  // inside another loop, in which case even small trip-count loops might be
+  // worthwhile to vectorize.
+  O = opts["vectorizer-min-trip-count"];
+  assert(O && "could not find LLVM option 'vectorizer-min-trip-count'");
+  O->addOccurrence(1, StringRef("vectorizer-min-trip-count"), StringRef("2"),
+                   false);
+
+  // Use the following two options to keep the jump-threading optimization
+  // from duplicating blocks. Jump threading would mess up parallel region
+  // construction, especially when a kernel contains barriers.
+  // TODO: if it is re-enabled, the parallel region construction code needs
+  // improvements to make sure it does not disallow other optimizations
+  // such as vectorization.
+  O = opts["jump-threading-threshold"];
+  assert(O && "could not find LLVM option 'jump-threading-threshold'");
+  O->addOccurrence(1, StringRef("jump-threading-threshold"), StringRef("0"),
+                   false);
+  O = opts["jump-threading-implication-search-threshold"];
+  assert(O && "could not find LLVM option "
+              "'jump-threading-implication-search-threshold'");
+  O->addOccurrence(1, StringRef("jump-threading-implication-search-threshold"),
+                   StringRef("0"), false);
+
+  // Enable diagnostics from the loop and SLP vectorizers.
+  O = opts["pass-remarks-missed"];
+  assert(O && "could not find LLVM option 'pass-remarks-missed'");
+  O->addOccurrence(1, StringRef("pass-remarks-missed"),
+                   StringRef("loop-vectorize"), false);
+  O->addOccurrence(1, StringRef("pass-remarks-missed"),
+                   StringRef("slp-vectorize"), false);
+
+  O = opts["pass-remarks-analysis"];
+  assert(O && "could not find LLVM option 'pass-remarks-analysis'");
+  O->addOccurrence(1, StringRef("pass-remarks-analysis"),
+                   StringRef("loop-vectorize"), false);
+  O->addOccurrence(1, StringRef("pass-remarks-analysis"),
+                   StringRef("slp-vectorize"), false);
+
+  O = opts["pass-remarks"];
+  assert(O && "could not find LLVM option 'pass-remarks'");
+  O->addOccurrence(1, StringRef("pass-remarks"), StringRef("loop-vectorize"),
+                   false);
+  O->addOccurrence(1, StringRef("pass-remarks"), StringRef("slp-vectorize"),
+                   false);
+}
+
+void llvm_preprocess(llvm::Module *M) {
+  init_llvm_pass();
+
+  auto Registry = PassRegistry::getPassRegistry();
+
+  llvm::legacy::PassManager Passes;
+
+  std::vector<std::string> passes;
+  passes.push_back("lowerswitch");
+  passes.push_back("mem2reg");
+  passes.push_back("simplifycfg");
+  passes.push_back("loop-simplify");
+  for (auto pass : passes) {
+    const PassInfo *PIs = Registry->getPassInfo(StringRef(pass));
+    if (PIs) {
+      Pass *thispass = PIs->createPass();
+      Passes.add(thispass);
+    } else {
+      printf("Pass: %s not found\n", pass.c_str());
+    }
+  }
+  Passes.run(*M);
+}
+
+bool lower_constant_expr(llvm::Module *M) {
+  bool modified = false;
+  LLVMContext &context = M->getContext();
+  auto I32 = llvm::Type::getInt32Ty(context);
+  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
+    Function *F = &(*i);
+    auto func_name = F->getName().str();
+    if (!isKernelFunction(M, F))
+      continue;
+
+    for (auto BB = F->begin(); BB != F->end(); ++BB) {
+      for (auto BI = BB->begin(); BI != BB->end(); BI++) {
+        if (auto load_inst = dyn_cast<llvm::LoadInst>(BI)) {
+          auto load_from = load_inst->getOperand(0);
+          if (auto get_element_ptr = dyn_cast<llvm::ConstantExpr>(load_from)) {
+            modified = true;
+            auto ReplInst = get_element_ptr->getAsInstruction();
+            ReplInst->insertBefore(load_inst);
+            std::vector<Instruction *> Users;
+            // do not replace a use while iterating over the use list;
+            // collect the users first and rewrite them in a second loop
+            for (auto U : get_element_ptr->users()) {
+              if (auto InstUser = dyn_cast<Instruction>(U)) {
+                Users.push_back(InstUser);
+              }
+            }
+            for (auto &User : Users)
+              User->replaceUsesOfWith(get_element_ptr, ReplInst);
+          }
+        } else if (auto store_inst = dyn_cast<llvm::StoreInst>(BI)) {
+          auto store_to = store_inst->getOperand(1);
+          if (auto addr_cast = dyn_cast<llvm::ConstantExpr>(store_to)) {
+            modified = true;
+            auto ReplInst = addr_cast->getAsInstruction();
+            ReplInst->insertBefore(store_inst);
+            std::vector<Instruction *> Users;
+            // do not replace a use while iterating over the use list
+            for (auto U : addr_cast->users()) {
+              if (auto InstUser = dyn_cast<Instruction>(U)) {
+                Users.push_back(InstUser);
+              }
+            }
+            for (auto &User : Users)
+              User->replaceUsesOfWith(addr_cast, ReplInst);
+          }
+        } else if (auto get_element_ptr =
+                       dyn_cast<llvm::GetElementPtrInst>(BI)) {
+          auto get_from = get_element_ptr->getOperand(0);
+          if (auto addr_cast = dyn_cast<llvm::ConstantExpr>(get_from)) {
+            modified = true;
+            auto ReplInst = addr_cast->getAsInstruction();
+            ReplInst->insertBefore(get_element_ptr);
+            std::vector<Instruction *> Users;
+            // do not replace a use while iterating over the use list
+            for (auto U : addr_cast->users()) {
+              if (auto InstUser = dyn_cast<Instruction>(U)) {
+                Users.push_back(InstUser);
+              }
+            }
+            for (auto &User : Users)
+              User->replaceUsesOfWith(addr_cast, ReplInst);
+          }
+        }
+      }
+    }
+  }
+  return modified;
+}
+
+void init_block(llvm::Module *M) {
+  // run the official LLVM preprocessing passes
+  llvm_preprocess(M);
+  // remove useless CUDA built-in functions
+  remove_cuda_built_in(M);
+
+  // lower ConstantExprs
+  bool modified;
+  do {
+    modified = lower_constant_expr(M);
+  } while (modified);
+  // remove useless metadata
+  remove_metadata(M);
+  // inline the warp-vote functions
+  inline_func_vote(M);
+  // create global variables for warp shuffle and vote
+  create_global_variable(M);
+  // replace phi nodes with data loads
+  phi2alloc(M);
+  // replace shared memory with global variables
+  mem_share2global(M);
+  // replace inline asm calls
+  replace_asm_call(M);
+}
diff --git a/compilation/KernelTranslation/lib/insert_sync.cpp b/compilation/KernelTranslation/lib/insert_sync.cpp
new file mode 100644
index 0000000..dfe0676
--- /dev/null
+++ b/compilation/KernelTranslation/lib/insert_sync.cpp
@@ -0,0 +1,494 @@
+#include "insert_sync.h"
+#include "assert.h"
+#include "handle_sync.h"
+#include "tool.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/PassInfo.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/IPO/PassManagerBuilder.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <iostream>
+#include <queue>
+#include <set>
+#include <vector>
+
+using namespace llvm;
+
+class InsertBuiltInBarrier : public llvm::FunctionPass {
+
+public:
+  static char ID;
+
+  InsertBuiltInBarrier() : FunctionPass(ID) {}
+
+  virtual bool runOnFunction(Function &F) {
+    if (!isKernelFunction(F.getParent(), &F))
+      return false;
+    std::vector<llvm::Instruction *> insert_intra_warp_sync_before;
+    std::vector<llvm::Instruction *> insert_inter_warp_sync_before;
+
+    // insert a sync at the entry, after any allocas
+    BasicBlock *entry = &(*F.begin());
+    for (auto i = entry->begin(); i != entry->end(); i++) {
+      if (!isa<AllocaInst>(i)) {
+        insert_inter_warp_sync_before.push_back(&(*(i)));
+        break;
+      }
+    }
+
+    for (Function::iterator I = F.begin(); I != F.end(); ++I) {
+      BasicBlock::iterator BI = I->begin();
+
+      // insert a barrier before each return
+      for (; BI != I->end(); BI++) {
+        llvm::ReturnInst *Ret = llvm::dyn_cast<llvm::ReturnInst>(&(*BI));
+        if (Ret) {
+          insert_inter_warp_sync_before.push_back(&(*BI));
+        }
+      }
+    }
+    if (insert_intra_warp_sync_before.empty() &&
+        insert_inter_warp_sync_before.empty())
+      return false;
+    for (auto inst : insert_intra_warp_sync_before) {
+      CreateIntraWarpBarrier(inst);
+    }
+    for (auto inst : insert_inter_warp_sync_before) {
+      CreateInterWarpBarrier(inst);
+    }
+    return true;
+  }
+};
+
+class InsertConditionalBarrier : public llvm::FunctionPass {
+
+public:
+  static char ID;
+
+  InsertConditionalBarrier() : FunctionPass(ID) {}
+
+  virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const {
+    AU.addRequired<PostDominatorTreeWrapperPass>();
+    AU.addPreserved<PostDominatorTreeWrapperPass>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+  }
+
+  BasicBlock *firstNonBackedgePredecessor(llvm::BasicBlock *bb) {
+
+    DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+    pred_iterator I = pred_begin(bb), E = pred_end(bb);
+    if (I == E)
+      return NULL;
+    while (I != E && DT->dominates(bb, *I))
+      ++I;
+    if (I == E)
+      return NULL;
+    else
+      return *I;
+  }
+
+  BasicBlock *firstNonBackedgeSuccessor(llvm::BasicBlock *bb) {
+    DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+    auto t = bb->getTerminator();
+    assert(t->getNumSuccessors() <= 2);
+    for (unsigned i = 0, e = t->getNumSuccessors(); i != e; ++i) {
+      BasicBlock *successor = t->getSuccessor(i);
+      bool isBackedge = DT->dominates(successor, bb);
+      if (isBackedge)
+        continue;
+      return successor;
+    }
+    return NULL;
+  }
+
+  virtual bool runOnFunction(Function &F) {
+    if (!isKernelFunction(F.getParent(), &F))
+      return false;
+
+    auto PDT = &getAnalysis<PostDominatorTreeWrapperPass>();
+
+    // first find all conditional barriers
+    std::vector<llvm::BasicBlock *> conditionalBarriers;
+    for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i) {
+      BasicBlock *b = &*i;
+      if (!has_barrier(b))
+        continue;
+
+      // an unconditional barrier postdominates the entry node
+      if (PDT->getPostDomTree().dominates(b, &F.getEntryBlock()))
+        continue;
+      conditionalBarriers.push_back(b);
+    }
+
+    if (conditionalBarriers.size() == 0)
+      return false;
+
+    bool changed = false;
+
+    while (!conditionalBarriers.empty()) {
+      BasicBlock *b = conditionalBarriers.back();
+      conditionalBarriers.pop_back();
+
+      // insert a barrier at the start of the if-condition
+
+      BasicBlock *pos = b;
+      BasicBlock *pred = firstNonBackedgePredecessor(b);
+
+      while (PDT->getPostDomTree().dominates(b, pred)) {
+        pos = pred;
+        // If our BB post dominates the given block, we know it is not the
+        // branching block that makes the barrier conditional.
+        pred = firstNonBackedgePredecessor(pred);
+
+        if (pred == b)
+          break; // Traced across a loop edge, skip this case.
+      }
+      // create a warp/block barrier matching the conditional barrier
+      if (has_warp_barrier(b)) {
+        CreateIntraWarpBarrier(pred->getTerminator());
+      } else {
+        CreateInterWarpBarrier(pred->getTerminator());
+      }
+      changed = true;
+
+      // insert barriers at the merge point of the then/else branches,
+      // and at the end of each conditional branch
+      DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+      std::queue<llvm::BasicBlock *> successor_queue;
+      for (int i = 0; i < pred->getTerminator()->getNumSuccessors(); i++) {
+        auto ss = pred->getTerminator()->getSuccessor(i);
+        if (!DT->dominates(ss, pred))
+          successor_queue.push(ss);
+      }
+      std::set<llvm::BasicBlock *> visited;
+      llvm::BasicBlock *merge_point = NULL;
+      while (!successor_queue.empty()) {
+        auto curr = successor_queue.front();
+        successor_queue.pop();
+        if (visited.find(curr) != visited.end())
+          continue;
+
+        visited.insert(curr);
+        if (PDT->getPostDomTree().dominates(curr, pred)) {
+          // found the true merge point
+          merge_point = curr;
+          if (has_warp_barrier(b)) {
+            CreateIntraWarpBarrier(&(*curr->begin()));
+            for (BasicBlock *Pred : predecessors(curr)) {
+              CreateIntraWarpBarrier(&(*Pred->getTerminator()));
+            }
+          } else {
+            CreateInterWarpBarrier(&(*curr->begin()));
+            for (BasicBlock *Pred : predecessors(curr)) {
+              CreateInterWarpBarrier(&(*Pred->getTerminator()));
+            }
+          }
+          break;
+        }
+        for (int i = 0; i < curr->getTerminator()->getNumSuccessors(); i++) {
+          auto ss = curr->getTerminator()->getSuccessor(i);
+          if (!DT->dominates(ss, curr))
+            successor_queue.push(ss);
+        }
+      }
+      assert(merge_point && "did not find a merge point\n");
+      changed = true;
+
+      // inserting may create a new conditional barrier
+      if (!PDT->getPostDomTree().dominates(pred, &F.getEntryBlock()))
+        conditionalBarriers.push_back(pred);
+
+      // find any block which is not dominated by the header
+      // but is postdominated by the merge point
+      std::queue<llvm::BasicBlock *> if_body;
+      std::set<llvm::BasicBlock *> visited_block;
+      for (int i = 0; i < pred->getTerminator()->getNumSuccessors(); i++) {
+        if_body.push(pred->getTerminator()->getSuccessor(i));
+      }
+      while (!if_body.empty()) {
+        auto curr = if_body.front();
+        if_body.pop();
+        if (visited_block.find(curr) != visited_block.end())
+          continue;
+        visited_block.insert(curr);
+        if (!PDT->getPostDomTree().dominates(merge_point, curr))
+          continue;
+        if (!DT->dominates(pred, curr) &&
+            PDT->getPostDomTree().dominates(merge_point, curr)) {
+          // insert barriers at the beginning of this block and at the
+          // end of each of its predecessors
+          if (has_warp_barrier(b)) {
+            CreateIntraWarpBarrier(&(*curr->begin()));
+            for (BasicBlock *Pred : predecessors(curr)) {
+              CreateIntraWarpBarrier(&(*Pred->getTerminator()));
+            }
+          } else {
+            CreateInterWarpBarrier(&(*curr->begin()));
+            for (BasicBlock *Pred : predecessors(curr)) {
+              CreateInterWarpBarrier(&(*Pred->getTerminator()));
+            }
+          }
+        }
+        for (int i = 0; i < curr->getTerminator()->getNumSuccessors(); i++) {
+          if_body.push(curr->getTerminator()->getSuccessor(i));
+        }
+      }
+    }
+    return changed;
+  }
+};
+
+class InsertBarrierForSpecialCase : public llvm::FunctionPass {
+public:
+  static char ID;
+
+  InsertBarrierForSpecialCase() : FunctionPass(ID) {}
+
+  virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const {
+    AU.addRequired<PostDominatorTreeWrapperPass>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+  }
+
+  virtual bool runOnFunction(Function &F) {
+    if (!isKernelFunction(F.getParent(), &F))
+      return false;
+    bool changed = false;
+    std::set<llvm::BasicBlock *> if_head;
+    // insert an extra block for the following case:
+    // there is a merge point for an if-else branch,
+    // but this merge point has another incoming edge
+
+    auto PDT = &getAnalysis<PostDominatorTreeWrapperPass>();
+    auto DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+    for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i) {
+      BasicBlock *b = &*i;
+      BasicBlock *merge_point = NULL;
+      if (b->getTerminator()->getNumSuccessors() == 2) {
+        auto b1 = b->getTerminator()->getSuccessor(0);
+        auto b2 = b->getTerminator()->getSuccessor(1);
+        if (PDT->getPostDomTree().dominates(b1, b2)) {
+          merge_point = b1;
+        } else if (PDT->getPostDomTree().dominates(b2, b1)) {
+          merge_point = b2;
+        } else {
+          assert(0 && "found a complex if-else branch\n");
+        }
+        std::cout << std::flush;
+        for (BasicBlock *Pred : predecessors(merge_point)) {
+          if (!DT->dominates(b, Pred)) {
+            // we need to insert an extra block to act as the merge point
+            // of the if-branch
+            if_head.insert(b);
+          }
+        }
+      }
+    }
+
+    auto M = F.getParent();
+    for (auto head : if_head) {
+      assert(head->getTerminator()->getNumSuccessors() == 2);
+      BasicBlock *merge_point = NULL;
+      auto s1 = head->getTerminator()->getSuccessor(0);
+      auto s2 = head->getTerminator()->getSuccessor(1);
+      if (PDT->getPostDomTree().dominates(s1, s2)) {
+        merge_point = s1;
+      } else {
+        merge_point = s2;
+      }
+      if (!find_barrier_in_region(head, merge_point)) {
+        printf("no need to handle the tri-income if: %s\n",
+               merge_point->getName().str().c_str());
+        continue;
+      }
+
+      BasicBlock *Block = BasicBlock::Create(M->getContext(), "if_end", &F);
+      llvm::IRBuilder<> Builder(M->getContext());
+      Builder.SetInsertPoint(Block);
+      auto br_inst = Builder.CreateBr(merge_point);
+      assert(has_barrier(head) && "preheader does not have a barrier\n");
+      if (has_warp_barrier(head)) {
+        CreateIntraWarpBarrier(br_inst);
+      } else {
+        CreateInterWarpBarrier(br_inst);
+      }
+      // replace usage in the if-branch
+      std::set<llvm::Instruction *> need_replace;
+      for (BasicBlock *Pred : predecessors(merge_point)) {
+        if (DT->dominates(head, Pred) && Pred != Block) {
+          need_replace.insert(Pred->getTerminator());
+        }
+      }
+      for (auto inst : need_replace) {
+        inst->replaceUsesOfWith(merge_point, Block);
+      }
+      changed = true;
+    }
+    return changed;
+  }
+};
+
+class InsertConditionalForBarrier : public llvm::LoopPass {
+
+public:
+  static char ID;
+
+  InsertConditionalForBarrier() : LoopPass(ID) {}
+
+  void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.addRequired<LoopInfoWrapperPass>();
+  }
+
+  bool runOnLoop(Loop *L, LPPassManager &LPM) {
+    if (!isKernelFunction(L->getHeader()->getParent()->getParent(),
+                          L->getHeader()->getParent()))
+      return false;
+    // check whether this loop contains a barrier
+    bool is_conditional_loop = false;
+    bool is_warp = false;
+    for (Loop::block_iterator i = L->block_begin(), e = L->block_end(); i != e;
+         ++i) {
+      for (BasicBlock::iterator j = (*i)->begin(), e = (*i)->end(); j != e;
+           ++j) {
+        if (auto Call = dyn_cast<CallInst>(j)) {
+          if (!Call->getCalledFunction())
+            continue;
+          auto func_name = Call->getCalledFunction()->getName().str();
+          if (func_name == "llvm.nvvm.barrier0" ||
+              func_name == "llvm.nvvm.bar.warp.sync" ||
+              func_name == "llvm.nvvm.barrier.sync") {
+            is_conditional_loop = true;
+            if (func_name == "llvm.nvvm.bar.warp.sync") {
+              is_warp = true;
+            }
+            break;
+          }
+        }
+      }
+    }
+    if (!is_conditional_loop)
+      return false;
+    // insert barriers at the beginning of the header and at the end of the
+    // preheader, so that we get a single block connected with the latch
+    if (!is_warp) {
+      auto prehead_block = L->getLoopPreheader();
+      CreateInterWarpBarrier(prehead_block->getTerminator());
+      auto header_block = L->getHeader();
+      CreateInterWarpBarrier(&(*header_block->begin()));
+    } else {
+      auto prehead_block = L->getLoopPreheader();
+      CreateIntraWarpBarrier(prehead_block->getTerminator());
+      auto header_block = L->getHeader();
+      CreateIntraWarpBarrier(&(*header_block->begin()));
+    }
+
+    // as we assume all loops are rotated, we have to insert a
+    // barrier before the conditional jump of the loop exit
+
+    if (auto exit_block = L->getExitingBlock()) {
+      auto conditional_br =
+          dyn_cast<llvm::BranchInst>(exit_block->getTerminator());
+      assert(conditional_br && conditional_br->isConditional());
+      // insert a barrier right before the conditional branch
+      if (!is_warp)
+        CreateInterWarpBarrier(conditional_br);
+      else
+        CreateIntraWarpBarrier(conditional_br);
+    } else {
+      // handle break statements in for-loops
+      printf("loop has multiple exits\n");
+      // in this case, we also insert a sync at the head of the for-body
+      auto header_block = L->getHeader();
+      assert(header_block->getTerminator()->getNumSuccessors() == 2 &&
+             "the for-head has more than 2 successors\n");
+      for (int i = 0; i < header_block->getTerminator()->getNumSuccessors();
+           i++) {
+        auto bb = header_block->getTerminator()->getSuccessor(i);
+        if (L->contains(bb)) {
+          if (is_warp) {
+            CreateIntraWarpBarrier(&(*bb->begin()));
+          } else {
+            CreateInterWarpBarrier(&(*bb->begin()));
+          }
+        }
+      }
+
+      SmallVector<BasicBlock *, 8> ExitingBlocks;
+
+      L->getExitingBlocks(ExitingBlocks);
+      while (!ExitingBlocks.empty()) {
+        auto exit_block = ExitingBlocks.back();
+        ExitingBlocks.pop_back();
+        auto conditional_br =
+            dyn_cast<llvm::BranchInst>(exit_block->getTerminator());
+        assert(conditional_br && conditional_br->isConditional());
+        if (!is_warp)
+          CreateInterWarpBarrier(conditional_br);
+        else
+          CreateIntraWarpBarrier(conditional_br);
+      }
+    }
+
+    return true;
+  }
+};
+
+char InsertBuiltInBarrier::ID = 0;
+char InsertConditionalBarrier::ID = 0;
+char InsertConditionalForBarrier::ID = 0;
+char InsertBarrierForSpecialCase::ID = 0;
+
+namespace {
+static RegisterPass<InsertConditionalBarrier>
+    insert_conditional_barrier("insert-conditional-if-barriers",
+                               "Insert conditional barriers for if bodies");
+static RegisterPass<InsertConditionalForBarrier>
+    insert_conditional_for_barrier("insert-conditional-for-barriers",
+                                   "Insert conditional barriers for for-loops");
+static RegisterPass<InsertBarrierForSpecialCase>
+    insert_special_case("insert-special-case-barriers",
+                        "Insert barriers for special cases");
+static RegisterPass<InsertBuiltInBarrier>
+    insert_built_in_barrier("insert-built-in-barriers",
+                            "Insert built-in barriers");
+} // namespace
+
+void insert_sync(llvm::Module *M) {
+  auto Registry = PassRegistry::getPassRegistry();
+
+  llvm::legacy::PassManager Passes;
+
+  std::vector<std::string> passes;
+  passes.push_back("insert-built-in-barriers");
+  passes.push_back("insert-conditional-if-barriers");
+  passes.push_back("insert-conditional-for-barriers");
+  passes.push_back("insert-special-case-barriers");
+  for (auto pass : passes) {
+    const PassInfo *PIs = Registry->getPassInfo(StringRef(pass));
+    if (PIs) {
+      Pass *thispass = PIs->createPass();
+      Passes.add(thispass);
+    } else {
+      assert(0 && "Pass not found\n");
+    }
+  }
+  Passes.run(*M);
+}
diff --git a/compilation/KernelTranslation/lib/insert_warp_loop.cpp b/compilation/KernelTranslation/lib/insert_warp_loop.cpp
new file mode 100644
index 0000000..f4023f0
--- /dev/null
+++ b/compilation/KernelTranslation/lib/insert_warp_loop.cpp
@@ -0,0 +1,848 @@
+
+#include "insert_warp_loop.h"
+#include "handle_sync.h"
+#include "tool.h"
+#include <map>
+#include <set>
+#include <sstream>
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/ValueSymbolTable.h" +#include "llvm/InitializePasses.h" +#include "llvm/PassInfo.h" +#include "llvm/PassRegistry.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/IPO/PassManagerBuilder.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include +#include +#include +#include +#include + +using namespace llvm; + +struct ParallelRegion { + std::set wrapped_block; + llvm::BasicBlock *successor_block; + llvm::BasicBlock *start_block; + llvm::BasicBlock *end_block; + bool inst_in_region(llvm::Instruction *inst) { + for (auto bb : wrapped_block) { + if (inst->getParent()->getName().str() == bb->getName().str()) + return true; + } + return false; + } + bool inst_used_in_region(llvm::Instruction *inst) { + for (auto ui = inst->use_begin(); ui != inst->use_end(); ++ui) { + auto *user = dyn_cast(ui->getUser()); + if (user == NULL) + continue; + if (inst_in_region(user)) { + return 1; + } + } + return 0; + } +}; + +std::map tempInstructionIds; +std::map contextArrays; +int tempInstructionIndex = 0; +int need_nested_loop; + +bool ShouldNotBeContextSaved(llvm::Instruction *instr) { + if (isa(instr)) + return true; + + llvm::Module *M = instr->getParent()->getParent()->getParent(); + llvm::LoadInst *load = dyn_cast(instr); + if (load != NULL) { + auto load_addr = load->getPointerOperand(); + if (load_addr == M->getGlobalVariable("intra_warp_index")) + return true; + if (load_addr == M->getGlobalVariable("inter_warp_index")) + return true; + if (load_addr == M->getGlobalVariable("warp_vote")) + return true; + } + + // TODO: we should further analyze whether the local variable + // is same among all threads within a wrap + return false; +} + +// generate countpart alloc in the beginning of the Function +llvm::Instruction *GetContextArray(llvm::Instruction *instruction, + bool intra_warp_loop) { + std::ostringstream var; + + if (std::string(instruction->getName().str()) != "") { + var << instruction->getName().str(); + } else if (tempInstructionIds.find(instruction) != tempInstructionIds.end()) { + var << tempInstructionIds[instruction]; + } else { + tempInstructionIds[instruction] = tempInstructionIndex++; + var << tempInstructionIds[instruction]; + } + if (intra_warp_loop) + var << "_intra_warp_"; + else + var << "_inter_warp_"; + std::string varName = var.str(); + + if (contextArrays.find(varName) != contextArrays.end()) + return contextArrays[varName]; + + BasicBlock &bb = instruction->getParent()->getParent()->getEntryBlock(); + IRBuilder<> builder(&*(bb.getFirstInsertionPt())); + Function *FF = instruction->getParent()->getParent(); + Module *M = instruction->getParent()->getParent()->getParent(); + LLVMContext &C = M->getContext(); + const llvm::DataLayout &Layout = M->getDataLayout(); + + llvm::Type *elementType; + if (isa(instruction)) { + elementType = + dyn_cast(instruction)->getType()->getElementType(); + } else { + elementType = instruction->getType(); + } + + Type *AllocType = elementType; + AllocaInst *InstCast = dyn_cast(instruction); + if (InstCast) { + unsigned Alignment = InstCast->getAlignment(); + + uint64_t StoreSize = Layout.getTypeStoreSize(InstCast->getAllocatedType()); + + if ((Alignment > 1) && (StoreSize & 
(Alignment - 1))) { + uint64_t AlignedSize = (StoreSize & (~(Alignment - 1))) + Alignment; + assert(AlignedSize > StoreSize); + uint64_t RequiredExtraBytes = AlignedSize - StoreSize; + + if (isa(elementType)) { + + ArrayType *StructPadding = ArrayType::get( + Type::getInt8Ty(M->getContext()), RequiredExtraBytes); + + std::vector PaddedStructElements; + PaddedStructElements.push_back(elementType); + PaddedStructElements.push_back(StructPadding); + const ArrayRef NewStructElements(PaddedStructElements); + AllocType = StructType::get(M->getContext(), NewStructElements, true); + uint64_t NewStoreSize = Layout.getTypeStoreSize(AllocType); + assert(NewStoreSize == AlignedSize); + } else if (isa(elementType)) { + StructType *OldStruct = dyn_cast(elementType); + + ArrayType *StructPadding = ArrayType::get( + Type::getInt8Ty(M->getContext()), RequiredExtraBytes); + std::vector PaddedStructElements; + for (unsigned j = 0; j < OldStruct->getNumElements(); j++) + PaddedStructElements.push_back(OldStruct->getElementType(j)); + PaddedStructElements.push_back(StructPadding); + const ArrayRef NewStructElements(PaddedStructElements); + AllocType = StructType::get(OldStruct->getContext(), NewStructElements, + OldStruct->isPacked()); + uint64_t NewStoreSize = Layout.getTypeStoreSize(AllocType); + assert(NewStoreSize == AlignedSize); + } + } + } + + llvm::Value *ItemSize = nullptr; + llvm::AllocaInst *Alloca = nullptr; + + auto block_size_addr = M->getGlobalVariable("block_size"); + auto block_size = builder.CreateLoad(block_size_addr); + Alloca = builder.CreateAlloca(AllocType, block_size, varName); + + contextArrays[varName] = Alloca; + return Alloca; +} + +// save the local variable into replicated array +llvm::Instruction *AddContextSave(llvm::Instruction *instruction, + llvm::Instruction *alloca, + bool intra_warp_loop) { + + if (isa(instruction)) { + return NULL; + } + + llvm::Module *M = instruction->getParent()->getParent()->getParent(); + LLVMContext &context = M->getContext(); + auto I32 = llvm::Type::getInt32Ty(context); + + /* Save the produced variable to the array. 
*/ + BasicBlock::iterator definition = + (dyn_cast(instruction))->getIterator(); + ++definition; + + IRBuilder<> builder(&*definition); + std::vector gepArgs; + + auto inter_warp_index = + builder.CreateLoad(M->getGlobalVariable("inter_warp_index")); + auto intra_warp_index = + builder.CreateLoad(M->getGlobalVariable("intra_warp_index")); + auto thread_idx = builder.CreateBinOp( + Instruction::Add, intra_warp_index, + builder.CreateBinOp(Instruction::Mul, inter_warp_index, + ConstantInt::get(I32, 32)), + "thread_idx"); + gepArgs.push_back(thread_idx); + + return builder.CreateStore(instruction, builder.CreateGEP(alloca, gepArgs)); +} + +llvm::Instruction *AddContextRestore(llvm::Value *val, + llvm::Instruction *alloca, + llvm::Instruction *before, bool isAlloca, + bool intra_warp_loop) { + assert(val != NULL); + assert(alloca != NULL); + IRBuilder<> builder(alloca); + if (before != NULL) { + builder.SetInsertPoint(before); + } else if (isa(val)) { + builder.SetInsertPoint(dyn_cast(val)); + before = dyn_cast(val); + } else { + assert(false && "Unknown context restore location!"); + } + + std::vector gepArgs; + + auto M = before->getParent()->getParent()->getParent(); + auto I32 = llvm::Type::getInt32Ty(M->getContext()); + auto inter_warp_index = + builder.CreateLoad(M->getGlobalVariable("inter_warp_index")); + auto intra_warp_index = + builder.CreateLoad(M->getGlobalVariable("intra_warp_index")); + auto thread_idx = builder.CreateBinOp( + Instruction::Add, intra_warp_index, + builder.CreateBinOp(Instruction::Mul, inter_warp_index, + ConstantInt::get(I32, 32)), + "thread_idx"); + gepArgs.push_back(thread_idx); + + llvm::Instruction *gep = + dyn_cast(builder.CreateGEP(alloca, gepArgs)); + if (isAlloca) { + return gep; + } + return builder.CreateLoad(gep); +} + +void AddContextSaveRestore(llvm::Instruction *instruction, + bool intra_warp_loop) { + + /* Allocate the context data array for the variable. 
*/ + llvm::Instruction *alloca = GetContextArray(instruction, intra_warp_loop); + + llvm::Instruction *theStore = + AddContextSave(instruction, alloca, intra_warp_loop); + + std::vector uses; + + for (Instruction::use_iterator ui = instruction->use_begin(), + ue = instruction->use_end(); + ui != ue; ++ui) { + llvm::Instruction *user = cast(ui->getUser()); + if (user == NULL) + continue; + if (user == theStore) + continue; + uses.push_back(user); + } + + for (auto user : uses) { + Instruction *contextRestoreLocation = user; + llvm::Value *loadedValue = + AddContextRestore(user, alloca, contextRestoreLocation, + isa(instruction), intra_warp_loop); + user->replaceUsesOfWith(instruction, loadedValue); + } +} + +void handle_alloc(llvm::Function *F) { + auto M = F->getParent(); + LLVMContext &C = M->getContext(); + auto I32 = llvm::Type::getInt32Ty(C); + + std::vector instruction_to_fix; + for (auto bb = F->begin(); bb != F->end(); bb++) { + for (auto ii = bb->begin(); ii != bb->end(); ii++) { + if (llvm::AllocaInst *i = dyn_cast(ii)) { + instruction_to_fix.push_back(i); + } + } + } + + std::vector need_remove; + + for (auto inst : instruction_to_fix) { + // generate a new alloc + auto block_size_addr = M->getGlobalVariable("block_size"); + IRBuilder<> builder(inst); + auto block_size = builder.CreateLoad(block_size_addr); + + llvm::Type *elementType = NULL; + if (dyn_cast(inst)->getType()->getElementType()) { + elementType = dyn_cast(inst)->getType()->getElementType(); + } + assert(elementType != NULL); + + auto Alloca = builder.CreateAlloca(elementType, block_size, + inst->getName().str() + "inter_warp"); + + // replace all usage + std::set replace_user; + for (Instruction::use_iterator ui = inst->use_begin(), ue = inst->use_end(); + ui != ue; ++ui) { + replace_user.insert(dyn_cast(ui->getUser())); + } + for (auto user : replace_user) { + + IRBuilder<> builder(user); + // std::vector gepArgs; + auto inter_warp_index = + builder.CreateLoad(M->getGlobalVariable("inter_warp_index")); + auto intra_warp_index = + builder.CreateLoad(M->getGlobalVariable("intra_warp_index")); + auto thread_idx = builder.CreateBinOp( + Instruction::Add, intra_warp_index, + builder.CreateBinOp(Instruction::Mul, inter_warp_index, + ConstantInt::get(I32, 32)), + "thread_idx"); + + auto gep = builder.CreateGEP(Alloca, thread_idx); + + user->replaceUsesOfWith(inst, gep); + } + need_remove.push_back(inst); + } + for (auto inst : need_remove) { + inst->dropAllReferences(); + inst->eraseFromParent(); + } +} + +void handle_local_variable_intra_warp(std::vector PRs) { + bool intra_warp_loop = 1; + // we should handle allocation generated by PHI + { + std::vector instruction_to_fix; + auto F = PRs[0].start_block->getParent(); + for (auto bb = F->begin(); bb != F->end(); bb++) { + for (auto ii = bb->begin(); ii != bb->end(); ii++) { + if (isa(&(*ii))) + instruction_to_fix.push_back(&(*ii)); + } + for (auto inst : instruction_to_fix) { + AddContextSaveRestore(inst, intra_warp_loop); + } + } + } + + for (auto parallel_regions : PRs) { + std::set instruction_in_region; + std::vector instruction_to_fix; + + for (auto bb : parallel_regions.wrapped_block) { + for (llvm::BasicBlock::iterator instr = bb->begin(); instr != bb->end(); + ++instr) { + llvm::Instruction *instruction = &*instr; + instruction_in_region.insert(instruction); + } + } + /* Find all the instructions that define new values and + check if they need to be context saved. 
*/ + for (auto bb : parallel_regions.wrapped_block) { + for (llvm::BasicBlock::iterator instr = bb->begin(); instr != bb->end(); + ++instr) { + llvm::Instruction *instruction = &*instr; + + if (ShouldNotBeContextSaved(instruction)) + continue; + + for (Instruction::use_iterator ui = instruction->use_begin(), + ue = instruction->use_end(); + ui != ue; ++ui) { + llvm::Instruction *user = dyn_cast(ui->getUser()); + + if (user == NULL) + continue; + if (isa(instruction) || + (instruction_in_region.find(user) == + instruction_in_region.end())) { + instruction_to_fix.push_back(instruction); + break; + } + } + } + } + for (auto inst : instruction_to_fix) { + AddContextSaveRestore(inst, intra_warp_loop); + } + } +} + +BasicBlock *insert_loop_init(llvm::BasicBlock *InsertInitBefore, + bool intra_warp_loop) { + llvm::Module *M = InsertInitBefore->getParent()->getParent(); + LLVMContext &context = M->getContext(); + auto I32 = llvm::Type::getInt32Ty(context); + std::string block_name = + (intra_warp_loop) ? "intra_warp_init" : "inter_warp_init"; + BasicBlock *loop_init = BasicBlock::Create( + context, block_name, InsertInitBefore->getParent(), InsertInitBefore); + IRBuilder<> builder(context); + builder.SetInsertPoint(loop_init); + if (intra_warp_loop) { // intra warp + auto intra_warp_index = M->getGlobalVariable("intra_warp_index"); + builder.CreateStore(ConstantInt::get(I32, 0), intra_warp_index); + } else { // inter warp + auto inter_warp_index = M->getGlobalVariable("inter_warp_index"); + builder.CreateStore(ConstantInt::get(I32, 0), inter_warp_index); + } + builder.CreateBr(InsertInitBefore); + return loop_init; +} + +BasicBlock *insert_loop_cond(llvm::BasicBlock *InsertCondBefore, + llvm::BasicBlock *LoopEnd, bool intra_warp_loop) { + llvm::Module *M = InsertCondBefore->getParent()->getParent(); + LLVMContext &context = M->getContext(); + auto I32 = llvm::Type::getInt32Ty(context); + std::string block_name = + (intra_warp_loop) ? "intra_warp_cond" : "inter_warp_cond"; + BasicBlock *loop_cond = BasicBlock::Create( + context, block_name, InsertCondBefore->getParent(), InsertCondBefore); + IRBuilder<> builder(context); + builder.SetInsertPoint(loop_cond); + + llvm::Value *cmpResult = NULL; + if (!intra_warp_loop) { + auto inter_warp_index = M->getGlobalVariable("inter_warp_index"); + auto block_size = M->getGlobalVariable("block_size"); + auto warp_cnt = + builder.CreateBinOp(Instruction::SDiv, builder.CreateLoad(block_size), + ConstantInt::get(I32, 32), "warp_number"); + + cmpResult = + builder.CreateICmpULT(builder.CreateLoad(inter_warp_index), warp_cnt); + } else { + auto intra_warp_index = M->getGlobalVariable("intra_warp_index"); + auto block_size = M->getGlobalVariable("block_size"); + if (!need_nested_loop) { + cmpResult = builder.CreateICmpULT(builder.CreateLoad(intra_warp_index), + builder.CreateLoad(block_size)); + } else { + cmpResult = builder.CreateICmpULT(builder.CreateLoad(intra_warp_index), + ConstantInt::get(I32, 32)); + } + } + builder.CreateCondBr(cmpResult, InsertCondBefore, LoopEnd); + return loop_cond; +} + +BasicBlock *insert_loop_inc(llvm::BasicBlock *InsertIncBefore, + bool intra_warp_loop) { + llvm::Module *M = InsertIncBefore->getParent()->getParent(); + LLVMContext &context = M->getContext(); + auto I32 = llvm::Type::getInt32Ty(context); + std::string block_name = + (intra_warp_loop) ? 
"intra_warp_inc" : "inter_warp_inc"; + BasicBlock *loop_inc = BasicBlock::Create( + context, block_name, InsertIncBefore->getParent(), InsertIncBefore); + IRBuilder<> builder(context); + builder.SetInsertPoint(loop_inc); + if (intra_warp_loop) { // intra warp + auto intra_warp_index = M->getGlobalVariable("intra_warp_index"); + auto new_index = builder.CreateBinOp( + Instruction::Add, builder.CreateLoad(intra_warp_index), + ConstantInt::get(I32, 1), "intra_warp_index_increment"); + builder.CreateStore(new_index, intra_warp_index); + } else { // inter warp + auto inter_warp_index = M->getGlobalVariable("inter_warp_index"); + auto new_index = builder.CreateBinOp( + Instruction::Add, builder.CreateLoad(inter_warp_index), + ConstantInt::get(I32, 1), "inter_warp_index_increment"); + builder.CreateStore(new_index, inter_warp_index); + } + builder.CreateBr(InsertIncBefore); + return loop_inc; +} + +void add_warp_loop(std::vector parallel_regions, + bool intra_warp_loop) { + for (auto region : parallel_regions) { + auto start_block = region.start_block; + auto tail_block = region.end_block; + auto next_block = region.successor_block; + + auto loop_cond = insert_loop_cond(start_block, next_block, intra_warp_loop); + auto loop_init = insert_loop_init(loop_cond, intra_warp_loop); + + auto F = start_block->getParent(); + for (Function::iterator i = F->begin(); i != F->end(); ++i) { + llvm::BasicBlock *bb = &(*i); + if (bb == loop_cond) + continue; + bb->getTerminator()->replaceUsesOfWith(start_block, loop_init); + } + auto loop_inc = insert_loop_inc(loop_cond, intra_warp_loop); + tail_block->getTerminator()->replaceUsesOfWith(next_block, loop_inc); + // we have to reset inter/intra warp index to 0, as these maybe used + // outside PR when there are conditional loop/branch + llvm::Module *M = start_block->getParent()->getParent(); + LLVMContext &context = M->getContext(); + auto I32 = llvm::Type::getInt32Ty(context); + BasicBlock *reset_index = BasicBlock::Create(start_block->getContext(), + "reset_block", F, next_block); + IRBuilder<> builder(start_block->getContext()); + builder.SetInsertPoint(reset_index); + if (intra_warp_loop) { // intra warp + auto intra_warp_index = M->getGlobalVariable("intra_warp_index"); + builder.CreateStore(ConstantInt::get(I32, 0), intra_warp_index); + } else { // inter warp + auto inter_warp_index = M->getGlobalVariable("inter_warp_index"); + builder.CreateStore(ConstantInt::get(I32, 0), inter_warp_index); + } + builder.CreateBr(next_block); + loop_cond->getTerminator()->replaceUsesOfWith(next_block, reset_index); + // add metadata + MDNode *Dummy = + MDNode::getTemporary(context, ArrayRef()).release(); + MDNode *AccessGroupMD = MDNode::getDistinct(context, {}); + MDNode *ParallelAccessMD = MDNode::get( + context, + {MDString::get(context, "llvm.loop.parallel_accesses"), AccessGroupMD}); + MDNode *Root = MDNode::get(context, {Dummy, ParallelAccessMD}); + + Root->replaceOperandWith(0, Root); + MDNode::deleteTemporary(Dummy); + // We now have + // !1 = metadata !{metadata !1} <- self-referential root + loop_cond->getTerminator()->setMetadata("llvm.loop", Root); + + for (auto bb : region.wrapped_block) { + for (BasicBlock::iterator ii = bb->begin(), ee = bb->end(); ii != ee; + ii++) { + if (!ii->mayReadOrWriteMemory()) { + continue; + } + MDNode *NewMD = MDNode::get(bb->getContext(), AccessGroupMD); + MDNode *OldMD = ii->getMetadata("llvm.mem.parallel_loop_access"); + if (OldMD != nullptr) { + NewMD = llvm::MDNode::concatenate(OldMD, NewMD); + } + 
        ii->setMetadata("llvm.mem.parallel_loop_access", NewMD);
+      }
+    }
+  }
+}
+
+void print_parallel_region(std::vector<ParallelRegion> parallel_regions) {
+  printf("get PR:\n");
+  for (auto region : parallel_regions) {
+    auto start = region.start_block;
+    auto end = region.end_block;
+    auto next = region.successor_block;
+    printf("parallel region: %s->%s next: %s\n",
+           start->getName().str().c_str(), end->getName().str().c_str(),
+           next->getName().str().c_str());
+    printf("have: \n");
+    for (auto b : region.wrapped_block) {
+      printf("%s\n", b->getName().str().c_str());
+    }
+  }
+}
+
+void remove_barrier(llvm::Function *F, bool intra_warp_loop) {
+  std::vector<llvm::Instruction *> need_remove;
+  for (auto BB = F->begin(); BB != F->end(); ++BB) {
+    for (auto BI = BB->begin(); BI != BB->end(); BI++) {
+      if (auto Call = dyn_cast<llvm::CallInst>(&(*BI))) {
+        auto func_name = Call->getCalledFunction()->getName().str();
+        if (func_name == "llvm.nvvm.bar.warp.sync") {
+          need_remove.push_back(Call);
+        }
+        if (!intra_warp_loop && (func_name == "llvm.nvvm.barrier0" ||
+                                 func_name == "llvm.nvvm.barrier.sync")) {
+          need_remove.push_back(Call);
+        }
+      }
+    }
+  }
+  for (auto inst : need_remove) {
+    inst->eraseFromParent();
+  }
+}
+
+class InsertWarpLoopPass : public llvm::FunctionPass {
+
+public:
+  static char ID;
+  bool intra_warp_loop;
+  DominatorTree *DT;
+  PostDominatorTree *PDT;
+
+  InsertWarpLoopPass(bool intra_warp = 0)
+      : FunctionPass(ID), intra_warp_loop(intra_warp) {}
+
+  virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const {
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<PostDominatorTreeWrapperPass>();
+  }
+
+  void getParallelRegionBefore(llvm::BasicBlock *B, bool intra_warp_loop,
+                               std::vector<ParallelRegion> &parallel_regions) {
+    ParallelRegion current_region;
+
+    SmallVector<BasicBlock *, 4> pending_blocks;
+    BasicBlock *region_entry_barrier = NULL;
+    BasicBlock *entry = NULL;
+    BasicBlock *exit = B->getSinglePredecessor();
+    for (BasicBlock *Pred : predecessors(B)) {
+      pending_blocks.push_back(Pred);
+    }
+    if (pending_blocks.size() > 1) {
+      // because we have inserted the syncs and split the blocks at them,
+      // if B has several incoming edges, it must be the merge point
+      // of a conditional if. We can safely ignore it
+      // TODO: we have to further check whether this conditional if
+      // is for inter warp or intra warp
+      return;
+    }
+
+    while (!pending_blocks.empty()) {
+      BasicBlock *current = pending_blocks.back();
+      pending_blocks.pop_back();
+
+      // avoid infinite recursion of loops
+      if (current_region.wrapped_block.count(current) != 0) {
+        continue;
+      }
+
+      // If we reach another barrier, this must be the
+      // parallel region entry.
+      bool has_barrier = 0;
+      for (auto i = current->begin(), e = current->end(); i != e; ++i) {
+        if (llvm::CallInst *call_inst =
+                llvm::dyn_cast<llvm::CallInst>(&(*i))) {
+          auto func_name = call_inst->getCalledFunction()->getName().str();
+          if (func_name == "llvm.nvvm.barrier0" ||
+              func_name == "llvm.nvvm.barrier.sync")
+            has_barrier = 1;
+          if (func_name == "llvm.nvvm.bar.warp.sync" && intra_warp_loop)
+            has_barrier = 1;
+        }
+      }
+
+      // if we reach a block which only contains a single conditional branch,
+      // it is the branch point of a barrier-carrying conditional,
+      // so we have to stop here
+      bool is_single_conditional_branch_block = 0;
+      if (auto br = dyn_cast<BranchInst>(current->getTerminator())) {
+        if (br->isConditional()) {
+          if (current->size() == 1) {
+            is_single_conditional_branch_block = 1;
+          } else {
+            // generated when replicating local variables
+            printf("[WARNING] matching single conditional branch by "
+                   "hard-coded block names\n");
+            bool branch_to_intra_init = false;
+            for (unsigned suc = 0; suc < br->getNumSuccessors(); ++suc) {
+              llvm::BasicBlock *entryCandidate = br->getSuccessor(suc);
+              auto block_name = entryCandidate->getName().str();
+              if (find_block_barrier_in_region(current, B)) {
+                if (block_name.find("warp_init") != block_name.npos) {
+                  is_single_conditional_branch_block = 1;
+                  break;
+                }
+              }
+            }
+          }
+        }
+      }
+
+      if (has_barrier || is_single_conditional_branch_block) {
+        if (region_entry_barrier == NULL)
+          region_entry_barrier = current;
+        else if (region_entry_barrier != current) {
+          // this means there is no PR before B, just return
+          return;
+        }
+        continue;
+      }
+
+      // Non-barrier block, this must be on the region.
+      current_region.wrapped_block.insert(current);
+
+      // Add predecessors to pending queue.
+      for (BasicBlock *Pred : predecessors(current)) {
+        pending_blocks.push_back(Pred);
+      }
+    }
+
+    if (current_region.wrapped_block.empty()) {
+      return;
+    }
+
+    // if we do not find an entry node, all predecessor
+    // blocks need not be executed multiple times
+    if (region_entry_barrier == NULL) {
+      return;
+    }
+    // Find the entry node.
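+    // Illustrative example (added for clarity, not in the original pass):
+    // for a CFG of the form
+    //   barrier_block -> body1 -> body2 -> B
+    // the backward walk from B stops at barrier_block, so the region
+    // becomes {body1, body2} with entry = body1 and exit = body2.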
+    assert(region_entry_barrier != NULL);
+    for (unsigned
+             suc = 0,
+             num = region_entry_barrier->getTerminator()->getNumSuccessors();
+         suc < num; ++suc) {
+      llvm::BasicBlock *entryCandidate =
+          region_entry_barrier->getTerminator()->getSuccessor(suc);
+      if (current_region.wrapped_block.count(entryCandidate) == 0)
+        continue;
+      entry = entryCandidate;
+      break;
+    }
+    // delete useless PRs: regions whose blocks contain nothing but a branch
+    if (entry == exit) {
+      if (entry->size() == 1 && isa<llvm::BranchInst>(&(*entry->begin()))) {
+        return;
+      }
+    }
+    bool is_useless = true;
+    auto iter = entry;
+    do {
+      if (iter->size() != 1 || !isa<llvm::BranchInst>(&(*iter->begin()))) {
+        is_useless = false;
+        break;
+      }
+      if (iter->getTerminator()->getNumSuccessors() > 1) {
+        is_useless = false;
+        break;
+      }
+      iter = iter->getTerminator()->getSuccessor(0);
+    } while (iter != exit);
+    if (is_useless) {
+      return;
+    }
+    assert(current_region.wrapped_block.count(entry) != 0);
+    current_region.start_block = entry;
+    current_region.end_block = exit;
+    current_region.successor_block = B;
+    parallel_regions.push_back(current_region);
+  }
+
+  std::vector<ParallelRegion> getParallelRegions(llvm::Function *F,
+                                                 bool intra_warp_loop) {
+    std::vector<ParallelRegion> parallel_regions;
+
+    SmallVector<BasicBlock *, 4> exit_blocks;
+    for (Function::iterator s = F->begin(); s != F->end(); s++) {
+      if (llvm::CallInst *call_inst =
+              llvm::dyn_cast<llvm::CallInst>(&(*s->begin()))) {
+        auto func_name = call_inst->getCalledFunction()->getName().str();
+        if (func_name == "llvm.nvvm.barrier0" ||
+            func_name == "llvm.nvvm.barrier.sync") {
+          exit_blocks.push_back(&(*s));
+        }
+        // when handling the intra-warp loop, we also need to split the
+        // blocks between warp barriers
+        if (intra_warp_loop && func_name == "llvm.nvvm.bar.warp.sync") {
+          exit_blocks.push_back(&(*s));
+        }
+      }
+    }
+
+    // First find all the ParallelRegions in the Function.
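+    // Illustrative sketch (added for clarity): each collected exit block
+    // starts with a barrier, and every region found between two consecutive
+    // barriers is later wrapped by add_warp_loop roughly as
+    //   for (inter_warp_index = 0; inter_warp_index < block_size / 32; ++inter_warp_index)
+    //     for (intra_warp_index = 0; intra_warp_index < 32; ++intra_warp_index)
+    //       region_body();
+    // so a single CPU thread replays the region once per CUDA thread.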
+ while (!exit_blocks.empty()) { + BasicBlock *exit = exit_blocks.back(); + exit_blocks.pop_back(); + getParallelRegionBefore(exit, intra_warp_loop, parallel_regions); + } + return parallel_regions; + } + + virtual bool runOnFunction(Function &F) { + if (!isKernelFunction(F.getParent(), &F)) + return 0; + + DT = &getAnalysis().getDomTree(); + PDT = &getAnalysis().getPostDomTree(); + + // find parallel region we need to wrap + auto parallel_regions = getParallelRegions(&F, intra_warp_loop); + assert(!parallel_regions.empty() && "can not find any parallel regions\n"); + // print_parallel_region(parallel_regions); + add_warp_loop(parallel_regions, intra_warp_loop); + + if (intra_warp_loop) { + handle_local_variable_intra_warp(parallel_regions); + } + remove_barrier(&F, intra_warp_loop); + return 1; + } +}; + +char InsertWarpLoopPass::ID = 0; + +namespace { +static RegisterPass X("insert-warp-loop", + "Insert inter/intra warp loop"); +} // namespace + +bool has_warp_barrier(llvm::Module *M) { + for (auto F = M->begin(); F != M->end(); ++F) + for (auto BB = F->begin(); BB != F->end(); ++BB) { + for (auto BI = BB->begin(); BI != BB->end(); BI++) { + if (auto Call = dyn_cast(BI)) { + auto func_name = Call->getCalledFunction()->getName().str(); + if (func_name == "llvm.nvvm.bar.warp.sync") { + return true; + } + } + } + } + return false; +} + +void insert_warp_loop(llvm::Module *M) { + llvm::legacy::PassManager Passes; + need_nested_loop = has_warp_barrier(M); + // use nested loop only when there are warp-level barrier + if (need_nested_loop) { + bool intra_warp = true; + Passes.add(new InsertWarpLoopPass(intra_warp)); + // insert inter warp loop + Passes.add(new InsertWarpLoopPass(!intra_warp)); + Passes.run(*M); + } else { + bool intra_warp = true; + // only need a single loop, with size=block_size + Passes.add(new InsertWarpLoopPass(intra_warp)); + Passes.run(*M); + // remove all barriers + for (auto F = M->begin(); F != M->end(); ++F) + remove_barrier(dyn_cast(F), false); + } +} diff --git a/compilation/KernelTranslation/lib/memory_hierarchy.cpp b/compilation/KernelTranslation/lib/memory_hierarchy.cpp new file mode 100644 index 0000000..9152500 --- /dev/null +++ b/compilation/KernelTranslation/lib/memory_hierarchy.cpp @@ -0,0 +1,126 @@ +#include "memory_hierarchy.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include +#include +#include +#include +#include +#include + +void mem_share2global(llvm::Module *M) { + LLVMContext *C = &M->getContext(); + llvm::Type *Int32T = Type::getInt32Ty(*C); + llvm::Type *Int64T = Type::getInt64Ty(*C); + llvm::Type *Int8T = Type::getInt8Ty(*C); + + std::map corresponding_global_memory; + std::set need_remove; + std::set need_remove_share_memory; + + // find all share memory and generate corresponding global memory + for (auto I = M->global_begin(), E = M->global_end(); I != E; ++I) { + if (GlobalVariable *share_memory = dyn_cast(I)) { + if (auto PT = dyn_cast(I->getType())) { + unsigned AS = PT->getAddressSpace(); + if (AS == 3) { // find a share memory + need_remove_share_memory.insert(share_memory); + // generate the corresponding global memory variable + auto new_name = "wrapper_global_" + share_memory->getName().str(); + auto element_type = PT->getElementType(); + if (auto array_type = 
dyn_cast(element_type)) { + llvm::GlobalVariable *global_memory = new llvm::GlobalVariable( + *M, array_type, false, llvm::GlobalValue::ExternalLinkage, NULL, + new_name, NULL, llvm::GlobalValue::GeneralDynamicTLSModel, 1); + ConstantAggregateZero *const_array = + ConstantAggregateZero::get(array_type); + global_memory->setInitializer(const_array); + corresponding_global_memory.insert( + std::pair(share_memory, + global_memory)); + } else if (auto int_type = dyn_cast(element_type)) { + auto zero = llvm::ConstantInt::get(int_type, 0, true); + llvm::GlobalVariable *global_memory = new llvm::GlobalVariable( + *M, int_type, false, llvm::GlobalValue::ExternalLinkage, zero, + new_name, NULL, llvm::GlobalValue::GeneralDynamicTLSModel, 0, + false); + corresponding_global_memory.insert( + std::pair(share_memory, + global_memory)); + } else { + assert(0 && "The required Share Memory Type is not supported\n"); + } + } + } + } + } + + for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { + Function *F = &(*i); + for (Function::iterator i = F->begin(), e = F->end(); i != e; ++i) { + BasicBlock *b = &*i; + for (BasicBlock::iterator i = b->begin(), e = b->end(); i != e; ++i) { + if (auto get_element_ptr = dyn_cast(i)) { + auto read_array = get_element_ptr->getPointerOperand(); + if (GlobalVariable *read_share_memory = + dyn_cast(read_array)) { + // find a GetElementPtr which read share memory + if (corresponding_global_memory.find(read_share_memory) != + corresponding_global_memory.end()) { + std::vector Indices; + for (int i = 0; i < get_element_ptr->getNumIndices(); i++) + Indices.push_back(get_element_ptr->getOperand(i + 1)); + + auto new_GEP = GetElementPtrInst::Create( + NULL, // Pointee type + corresponding_global_memory.find(read_share_memory) + ->second, // Alloca + Indices, // Indices + "", get_element_ptr); + // replace all get_element_ptr with new_GEP: + // we can not directly use: + // get_element_ptr->replaceAllUsesWith(new_GEP); + // as get_element_ptr and new_GEP have different return type + llvm::Type *original_type = get_element_ptr->getType(); + auto FormatASC = CastInst::CreatePointerBitCastOrAddrSpaceCast( + new_GEP, original_type, "", get_element_ptr); + get_element_ptr->replaceAllUsesWith(FormatASC); + need_remove.insert(get_element_ptr); + } + } + } else if (auto addr_cast = dyn_cast(i)) { + auto read_array = addr_cast->getOperand(0); + if (GlobalVariable *read_share_memory = + dyn_cast(read_array)) { + // find a GetElementPtr which read share memory + if (corresponding_global_memory.find(read_share_memory) != + corresponding_global_memory.end()) { + llvm::Type *original_type = addr_cast->getType(); + auto FormatASC = CastInst::CreatePointerBitCastOrAddrSpaceCast( + corresponding_global_memory.find(read_share_memory)->second, + original_type, "", addr_cast); + addr_cast->replaceAllUsesWith(FormatASC); + need_remove.insert(addr_cast); + } + } + } + } + } + } + + for (auto i : need_remove) { + i->dropAllReferences(); + i->eraseFromParent(); + } + for (auto i : need_remove_share_memory) { + i->dropAllReferences(); + i->eraseFromParent(); + } +} diff --git a/compilation/KernelTranslation/lib/performance.cpp b/compilation/KernelTranslation/lib/performance.cpp new file mode 100644 index 0000000..11d30e4 --- /dev/null +++ b/compilation/KernelTranslation/lib/performance.cpp @@ -0,0 +1,88 @@ +#include "performance.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Triple.h" +#include "llvm/Analysis/LoopInfo.h" +#include 
"llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/PostDominators.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/ValueSymbolTable.h" +#include "llvm/InitializePasses.h" +#include "llvm/PassInfo.h" +#include "llvm/PassRegistry.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Transforms/IPO/PassManagerBuilder.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include +#include +#include +#include +#include + +using namespace llvm; + +void performance_optimization(llvm::Module *M) { + for (auto F = M->begin(); F != M->end(); F++) { + for (auto I = F->arg_begin(); I != F->arg_end(); ++I) { + if (I->getType()->isPointerTy()) { + I->addAttr(llvm::Attribute::NoAlias); + } + } + } + + llvm::legacy::PassManager Passes; + + // add target machine info + llvm::Triple triple("x86_64-unknown-linux-gnu"); + + std::string Error; + const Target *TheTarget = TargetRegistry::lookupTarget("", triple, Error); + if (!TheTarget) { + printf("Error: %s\n", Error.c_str()); + assert(0); + } + llvm::TargetOptions Options; + Options.FloatABIType = FloatABI::Hard; + + TargetMachine *TM = TheTarget->createTargetMachine( + triple.getTriple(), llvm::sys::getHostCPUName().str(), StringRef("+m,+f"), + Options, Reloc::PIC_, CodeModel::Small, CodeGenOpt::Aggressive); + assert(TM && "No Machine Information\n"); + + Passes.add(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis())); + + TargetLibraryInfoImpl TLII(triple); + TLII.disableAllFunctions(); + Passes.add(new TargetLibraryInfoWrapperPass(TLII)); + + // Add O3 optimization + llvm::PassManagerBuilder Builder; + Builder.OptLevel = 3; + Builder.SizeLevel = 0; + + Builder.LoopVectorize = true; + Builder.SLPVectorize = true; + + Builder.VerifyInput = true; + Builder.VerifyOutput = true; + + Builder.populateModulePassManager(Passes); + Passes.run(*M); +} diff --git a/compilation/KernelTranslation/lib/tool.cpp b/compilation/KernelTranslation/lib/tool.cpp new file mode 100644 index 0000000..c3c379e --- /dev/null +++ b/compilation/KernelTranslation/lib/tool.cpp @@ -0,0 +1,480 @@ +#include "tool.h" +#include "llvm/Bitcode/BitcodeWriter.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Verifier.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/Support/ToolOutputFile.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include +#include + +using namespace llvm; + +llvm::Module *LoadModuleFromFilr(char *file_name) { + llvm::SMDiagnostic Err; + llvm::LLVMContext *globalContext = new llvm::LLVMContext; + auto program = parseIRFile(file_name, Err, *globalContext).release(); + if (!program) { + printf("error when opening the bitcode\n"); + exit(1); + } + return program; +} + 
+void VerifyModule(llvm::Module *program) { + std::string msg; + llvm::raw_string_ostream os(msg); + if (llvm::verifyModule(*program, &(llvm::errs()))) + llvm::report_fatal_error(os.str().c_str()); +} + +void DumpModule(llvm::Module *M, char *file_name) { + // modify the program, add a wrapper + std::string msg; + llvm::raw_string_ostream os(msg); + std::error_code EC; + ToolOutputFile Out(file_name, EC, sys::fs::F_None); + if (EC) { + errs() << "Fails to open output file: " << EC.message(); + return; + } + WriteBitcodeToFile(*M, Out.os()); + Out.keep(); +} + +bool isKernelFunction(llvm::Module *M, llvm::Function *F) { + NamedMDNode *NamedMD = M->getNamedMetadata("nvvm.annotations"); + if (!NamedMD) { + printf("there must be nvvm.annotations!\n"); + exit(1); + } + for (unsigned I = 0, E = NamedMD->getNumOperands(); I != E; ++I) { + MDNode *MD = NamedMD->getOperand(I); + if (!MD || MD->getNumOperands() == 0) + continue; + if (MD->getNumOperands() != 3) + continue; + Metadata *Op = MD->getOperand(1); + if (auto Str = llvm::cast(Op)) { + if (Str->getString().str() != "kernel") + continue; + llvm::Value *meta = + dyn_cast(MD->getOperand(0))->getValue(); + Function *FF = llvm::cast(meta); + if (FF->getName().str() == F->getName().str()) + return true; + } + } + return false; +} + +void replace_block(llvm::Function *F, llvm::BasicBlock *before, + llvm::BasicBlock *after) { + for (Function::iterator i = F->begin(); i != F->end(); ++i) { + llvm::BasicBlock *bb = &(*i); + if (bb == after) + continue; + bb->getTerminator()->replaceUsesOfWith(before, after); + } +} + +llvm::CallInst *CreateInterWarpBarrier(llvm::Instruction *InsertBefore) { + llvm::Module *M = InsertBefore->getParent()->getParent()->getParent(); + + llvm::FunctionType *LauncherFuncT = + FunctionType::get(llvm::Type::getVoidTy(M->getContext()), {}, false); + + llvm::FunctionCallee f = + M->getOrInsertFunction("llvm.nvvm.barrier0", LauncherFuncT); + llvm::Function *F = llvm::cast(f.getCallee()); + return llvm::CallInst::Create(F, "", InsertBefore); +} + +llvm::CallInst *CreateIntraWarpBarrier(llvm::Instruction *InsertBefore) { + llvm::Module *M = InsertBefore->getParent()->getParent()->getParent(); + llvm::FunctionType *LauncherFuncT = + FunctionType::get(llvm::Type::getVoidTy(M->getContext()), {}, false); + llvm::FunctionCallee f = + M->getOrInsertFunction("llvm.nvvm.bar.warp.sync", LauncherFuncT); + llvm::Function *F = llvm::cast(f.getCallee()); + return llvm::CallInst::Create(F, "", InsertBefore); +} + +llvm::Instruction *BreakPHIToAllocas(PHINode *phi) { + + std::string allocaName = std::string(phi->getName().str()) + ".ex_phi"; + + llvm::Function *function = phi->getParent()->getParent(); + + IRBuilder<> builder(&*(function->getEntryBlock().getFirstInsertionPt())); + + llvm::Instruction *alloca = + builder.CreateAlloca(phi->getType(), 0, allocaName); + + for (unsigned incoming = 0; incoming < phi->getNumIncomingValues(); + ++incoming) { + Value *val = phi->getIncomingValue(incoming); + BasicBlock *incomingBB = phi->getIncomingBlock(incoming); + builder.SetInsertPoint(incomingBB->getTerminator()); + llvm::Instruction *store = builder.CreateStore(val, alloca); + } + builder.SetInsertPoint(phi); + + llvm::Instruction *loadedValue = builder.CreateLoad(alloca); + phi->replaceAllUsesWith(loadedValue); + phi->eraseFromParent(); + + return loadedValue; +} + +void phi2alloc(llvm::Module *M) { + for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { + Function *F = &(*i); + auto func_name = F->getName().str(); + if 
(!isKernelFunction(M, F)) + continue; + + typedef std::vector InstructionVec; + + InstructionVec PHIs; + + for (Function::iterator bb = F->begin(); bb != F->end(); ++bb) { + for (BasicBlock::iterator p = bb->begin(); p != bb->end(); ++p) { + Instruction *instr = &*p; + if (isa(instr)) { + PHIs.push_back(instr); + } + } + } + + bool changed = false; + for (InstructionVec::iterator i = PHIs.begin(); i != PHIs.end(); ++i) { + Instruction *instr = *i; + BreakPHIToAllocas(dyn_cast(instr)); + } + } +} + +void remove_cuda_built_in(llvm::Module *M) { + // initialize function name + std::set useless_func_name; + useless_func_name.insert("cudaMalloc"); + useless_func_name.insert("cudaFuncGetAttributes"); + useless_func_name.insert("cudaGetDevice"); + useless_func_name.insert("cudaDeviceGetAttribute"); + useless_func_name.insert("cudaOccupancyMaxActiveBlocksPerMultiprocessor"); + useless_func_name.insert( + "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags"); + + std::set need_remove; + for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { + Function *F = &(*i); + auto func_name = F->getName().str(); + if (useless_func_name.find(func_name) != useless_func_name.end()) { + need_remove.insert(F); + } + } + for (auto F : need_remove) { + F->dropAllReferences(); + F->eraseFromParent(); + } +} + +void replace_built_in_function(llvm::Module *M, int *grid_dim, int *block_dim) { + LLVMContext &context = M->getContext(); + auto I32 = llvm::Type::getInt32Ty(context); + std::vector need_remove; + for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { + Function *F = &(*i); + auto func_name = F->getName().str(); + if (!isKernelFunction(M, F)) + continue; + + IRBuilder<> builder(&*(F->getEntryBlock().getFirstInsertionPt())); + auto global_intra_warp_idx = + F->getParent()->getGlobalVariable("intra_warp_index"); + auto local_intra_warp_idx = + builder.CreateAlloca(global_intra_warp_idx->getType()->getElementType(), + 0, "local_intra_warp_idx"); + global_intra_warp_idx->replaceAllUsesWith(local_intra_warp_idx); + auto global_inter_warp_idx = + F->getParent()->getGlobalVariable("inter_warp_index"); + auto local_inter_warp_idx = + builder.CreateAlloca(global_inter_warp_idx->getType()->getElementType(), + 0, "local_inter_warp_idx"); + global_inter_warp_idx->replaceAllUsesWith(local_inter_warp_idx); + + for (auto BB = F->begin(); BB != F->end(); ++BB) { + for (auto BI = BB->begin(); BI != BB->end(); BI++) { + if (auto Load = dyn_cast(BI)) { + auto load_from = Load->getOperand(0); + if (load_from == F->getParent()->getGlobalVariable("block_size")) { + Load->replaceAllUsesWith(ConstantInt::get( + I32, block_dim[0] * block_dim[1] * block_dim[2])); + need_remove.push_back(Load); + } + } else if (auto Call = dyn_cast(BI)) { + if (Call->getCalledFunction()) { + auto func_name = Call->getCalledFunction()->getName().str(); + if (func_name == "llvm.nvvm.read.ptx.sreg.tid.x") { + // replace it by warp_id + IRBuilder<> builder(context); + builder.SetInsertPoint(Call); + + auto thread_idx = builder.CreateBinOp( + Instruction::Mul, builder.CreateLoad(local_inter_warp_idx), + ConstantInt::get(I32, 32), ""); + thread_idx = builder.CreateBinOp( + Instruction::Add, builder.CreateLoad(local_intra_warp_idx), + thread_idx, "thread_idx"); + if (block_dim[1] != 1 || block_dim[2] != 1) { + printf("block y: %d block z: %d\n", block_dim[1], block_dim[2]); + thread_idx = builder.CreateBinOp( + Instruction::SRem, thread_idx, + ConstantInt::get(I32, block_dim[0]), "thread_id_x"); + } + + 
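+            // Worked example (illustrative): with block_dim = (16, 4, 1),
+            // the flat id 37 = inter_warp 1 * 32 + intra_warp 5 yields
+            // tid.x = 37 % 16 = 5, matching the CUDA numbering.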
            Call->replaceAllUsesWith(thread_idx);
+            need_remove.push_back(Call);
+          } else if (func_name == "llvm.nvvm.read.ptx.sreg.tid.y") {
+            // compute the flat thread id, then derive tid.y from it
+            IRBuilder<> builder(context);
+            builder.SetInsertPoint(Call);
+
+            auto thread_idx = builder.CreateBinOp(
+                Instruction::Mul, builder.CreateLoad(local_inter_warp_idx),
+                ConstantInt::get(I32, 32), "");
+            thread_idx = builder.CreateBinOp(
+                Instruction::Add, builder.CreateLoad(local_intra_warp_idx),
+                thread_idx, "thread_idx");
+            // tid.y = tid / block_dim.x
+            thread_idx = builder.CreateBinOp(
+                Instruction::SDiv, thread_idx,
+                ConstantInt::get(I32, block_dim[0]),
+                // builder.CreateLoad(M->getGlobalVariable("block_size_x")),
+                "thread_id_y");
+
+            Call->replaceAllUsesWith(thread_idx);
+            need_remove.push_back(Call);
+          } else if (func_name == "llvm.nvvm.read.ptx.sreg.tid.z") {
+            printf("[WARNING] We DO NOT support multi-dim block\n");
+            auto zero = ConstantInt::get(I32, 0);
+            Call->replaceAllUsesWith(zero);
+            need_remove.push_back(Call);
+          } else if (func_name == "llvm.nvvm.read.ptx.sreg.ctaid.x") {
+            auto block_index_addr = M->getGlobalVariable("block_index");
+            IRBuilder<> builder(context);
+            builder.SetInsertPoint(Call);
+            auto block_idx = builder.CreateLoad(block_index_addr);
+            Call->replaceAllUsesWith(block_idx);
+            need_remove.push_back(Call);
+          } else if (func_name == "llvm.nvvm.read.ptx.sreg.ctaid.y" ||
+                     func_name == "llvm.nvvm.read.ptx.sreg.ctaid.z") {
+            printf("[WARNING] We DO NOT support multi-dim grid\n");
+            auto zero = ConstantInt::get(I32, 0);
+            Call->replaceAllUsesWith(zero);
+            need_remove.push_back(Call);
+          } else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.x") {
+            auto block_size_addr = M->getGlobalVariable("block_size_x");
+            IRBuilder<> builder(context);
+            builder.SetInsertPoint(Call);
+            auto block_size = ConstantInt::get(I32, block_dim[0]);
+            Call->replaceAllUsesWith(block_size);
+            need_remove.push_back(Call);
+          } else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.y") {
+            auto block_size_addr = M->getGlobalVariable("block_size_y");
+            IRBuilder<> builder(context);
+            builder.SetInsertPoint(Call);
+            auto block_size = ConstantInt::get(I32, block_dim[1]);
+            Call->replaceAllUsesWith(block_size);
+            need_remove.push_back(Call);
+          } else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.z") {
+            auto block_size_addr = M->getGlobalVariable("block_size_z");
+            IRBuilder<> builder(context);
+            builder.SetInsertPoint(Call);
+            auto block_size = ConstantInt::get(I32, block_dim[2]);
+            Call->replaceAllUsesWith(block_size);
+            need_remove.push_back(Call);
+          } else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.x") {
+            auto grid_size_addr = M->getGlobalVariable("grid_size");
+            IRBuilder<> builder(context);
+            builder.SetInsertPoint(Call);
+            auto grid_size = ConstantInt::get(I32, grid_dim[0]);
+            Call->replaceAllUsesWith(grid_size);
+            need_remove.push_back(Call);
+          } else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.y" ||
+                     func_name == "llvm.nvvm.read.ptx.sreg.nctaid.z") {
+            printf("[WARNING] We DO NOT support multi-dim grid\n");
+            auto one = ConstantInt::get(I32, 1);
+            Call->replaceAllUsesWith(one);
+            need_remove.push_back(Call);
+          }
+        }
+        if (Call->isInlineAsm()) {
+          auto asm_inst = dyn_cast<InlineAsm>(Call->getCalledOperand());
+          if (asm_inst->getAsmString() != "mov.u32 $0, %laneid;") {
+            printf("unknown InlineAsm\n");
+            exit(1);
+          }
+          // return the rank within the warp
+          IRBuilder<> builder(context);
+          builder.SetInsertPoint(Call);
+          auto intra_warp_index = builder.CreateLoad(local_intra_warp_idx);
+          Call->replaceAllUsesWith(intra_warp_index);
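+          // note (illustrative): %laneid is exactly the intra-warp loop
+          // index, since each emulated warp executes its 32 lanes one by one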
need_remove.push_back(Call); + } + } + } + } + } + + for (auto inst : need_remove) { + inst->eraseFromParent(); + } +} + +void replace_asm_call(llvm::Module *M) { + LLVMContext &context = M->getContext(); + auto I32 = llvm::Type::getInt32Ty(context); + std::vector need_remove; + for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { + Function *F = &(*i); + auto func_name = F->getName().str(); + if (!isKernelFunction(M, F)) + continue; + + for (auto BB = F->begin(); BB != F->end(); ++BB) { + for (auto BI = BB->begin(); BI != BB->end(); BI++) { + if (auto Call = dyn_cast(BI)) { + if (Call->isInlineAsm()) { + auto asm_inst = dyn_cast(Call->getCalledOperand()); + if (asm_inst->getAsmString() != "mov.u32 $0, %laneid;") { + printf("unknown InlineAsm\n"); + exit(1); + } + // return the rank within the warp + IRBuilder<> builder(context); + builder.SetInsertPoint(Call); + auto intra_warp_index_addr = + M->getGlobalVariable("intra_warp_index"); + auto intra_warp_index = builder.CreateLoad(intra_warp_index_addr); + Call->replaceAllUsesWith(intra_warp_index); + need_remove.push_back(Call); + } + } + } + } + } + for (auto inst : need_remove) { + inst->eraseFromParent(); + } +} + +bool has_warp_barrier(llvm::BasicBlock *B) { + for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) { + Instruction *inst = &(*i); + llvm::CallInst *Call = llvm::dyn_cast(inst); + if (Call) { + auto func_name = Call->getCalledFunction()->getName().str(); + if (func_name == "llvm.nvvm.bar.warp.sync") { + return true; + } + } + } + return false; +} + +bool has_barrier(llvm::BasicBlock *B) { + for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) { + Instruction *inst = &(*i); + llvm::CallInst *Call = llvm::dyn_cast(inst); + if (Call) { + auto func_name = Call->getCalledFunction()->getName().str(); + if (func_name == "llvm.nvvm.barrier0" || + func_name == "llvm.nvvm.bar.warp.sync" || + func_name == "llvm.nvvm.barrier.sync") { + return true; + } + } + } + return false; +} + +bool has_block_barrier(llvm::BasicBlock *B) { + for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) { + Instruction *inst = &(*i); + llvm::CallInst *Call = llvm::dyn_cast(inst); + if (Call) { + auto func_name = Call->getCalledFunction()->getName().str(); + if (func_name == "llvm.nvvm.barrier0" || + func_name == "llvm.nvvm.barrier.sync") { + return true; + } + } + } + return false; +} + +bool has_barrier(llvm::Function *F) { + for (auto B = F->begin(); B != F->end(); B++) { + if (has_barrier(&(*B))) + return true; + } + return false; +} + +bool find_block_barrier_in_region(llvm::BasicBlock *start, + llvm::BasicBlock *end) { + std::set visit; + std::vector pending_blocks; + for (int i = 0; i < start->getTerminator()->getNumSuccessors(); i++) { + pending_blocks.push_back(start->getTerminator()->getSuccessor(i)); + } + while (!pending_blocks.empty()) { + BasicBlock *current = pending_blocks.back(); + pending_blocks.pop_back(); + if (visit.find(current) != visit.end()) + continue; + visit.insert(current); + if (current == end) + continue; + if (has_block_barrier(current)) { + return 1; + } + for (int i = 0; i < current->getTerminator()->getNumSuccessors(); i++) { + pending_blocks.push_back(current->getTerminator()->getSuccessor(i)); + } + } + return 0; +} + +bool find_barrier_in_region(llvm::BasicBlock *start, llvm::BasicBlock *end) { + std::set visit; + std::vector pending_blocks; + for (int i = 0; i < start->getTerminator()->getNumSuccessors(); i++) { + pending_blocks.push_back(start->getTerminator()->getSuccessor(i)); + } 
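+  // illustrative note: this is an iterative DFS over the CFG successors;
+  // `end` acts as a sink so the search stays inside the start->end region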
+ while (!pending_blocks.empty()) { + BasicBlock *current = pending_blocks.back(); + pending_blocks.pop_back(); + if (visit.find(current) != visit.end()) + continue; + visit.insert(current); + if (current == end) + continue; + if (has_barrier(current)) { + return 1; + } + for (int i = 0; i < current->getTerminator()->getNumSuccessors(); i++) { + pending_blocks.push_back(current->getTerminator()->getSuccessor(i)); + } + } + return 0; +} diff --git a/compilation/KernelTranslation/lib/warp_func.cpp b/compilation/KernelTranslation/lib/warp_func.cpp new file mode 100644 index 0000000..9708d74 --- /dev/null +++ b/compilation/KernelTranslation/lib/warp_func.cpp @@ -0,0 +1,217 @@ + +#include "warp_func.h" +#include "tool.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include +#include + +using namespace llvm; + +/* + * Insert sync before each vote, and replace the + * original vote function to warp vote version + */ +void handle_warp_vote(llvm::Module *M) { + std::set need_replace; + llvm::Type *Int1T = Type::getInt1Ty(M->getContext()); + llvm::Type *I32 = llvm::Type::getInt32Ty(M->getContext()); + llvm::Type *I8 = llvm::Type::getInt8Ty(M->getContext()); + auto zero = llvm::ConstantInt::get(I32, 0, true); + auto one = llvm::ConstantInt::get(I32, 1, true); + llvm::Type *VoteArrayType = llvm::ArrayType::get(I8, 32)->getPointerTo(); + + llvm::FunctionType *LauncherFuncT = + FunctionType::get(Int1T, {VoteArrayType}, false); + llvm::FunctionCallee _f = M->getOrInsertFunction("warp_any", LauncherFuncT); + llvm::Function *func_warp_any = llvm::cast(_f.getCallee()); + _f = M->getOrInsertFunction("warp_all", LauncherFuncT); + llvm::Function *func_warp_all = llvm::cast(_f.getCallee()); + + // replace llvm.nvvm.vote.any.sync to warp vote function + for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { + Function *F = &(*i); + auto func_name = F->getName().str(); + if (!isKernelFunction(M, F)) + continue; + Function::iterator I = F->begin(); + for (Function::iterator E = F->end(); I != E; ++I) { + for (BasicBlock::iterator BI = I->begin(); BI != I->end(); BI++) { + if (CallInst *vote_any_sync = dyn_cast(BI)) { + auto func_name = vote_any_sync->getCalledFunction()->getName(); + if (func_name == "llvm.nvvm.vote.any.sync" || + func_name == "llvm.nvvm.vote.all.sync") { + // insert sync before call + need_replace.insert(vote_any_sync); + } + } + } + } + } + + GlobalVariable *warp_vote_ptr = M->getNamedGlobal("warp_vote"); + for (auto sync_inst : need_replace) { + // create barrier + CreateIntraWarpBarrier(sync_inst); + /* + * store into warp_vote[tid] + */ + assert(warp_vote_ptr != NULL); + auto intra_warp_index_addr = M->getGlobalVariable("intra_warp_index"); + auto intra_warp_index = + new LoadInst(intra_warp_index_addr, "intra_warp_index", sync_inst); + + auto GEP = GetElementPtrInst::Create(NULL, // Pointee type + warp_vote_ptr, // Alloca + {zero, intra_warp_index}, // Indices + "", sync_inst); + + // as AVX only support 8bit for each thread + // so we have to cast the predict into int8 + auto predict = llvm::CastInst::CreateIntegerCast( + sync_inst->getArgOperand(1), I8, false, "", sync_inst); + // we need to concern mask + auto mask = llvm::CastInst::CreateIntegerCast(sync_inst->getArgOperand(0), + I32, false, "", sync_inst); + auto bit_flag = 
BinaryOperator::Create(Instruction::LShr, mask,
+                                           intra_warp_index, "", sync_inst);
+    auto valid =
+        BinaryOperator::Create(Instruction::And, one, bit_flag, "", sync_inst);
+    auto valid_8bit =
+        llvm::CastInst::CreateIntegerCast(valid, I8, false, "", sync_inst);
+
+    llvm::Instruction *res;
+    if (sync_inst->getCalledFunction()->getName() ==
+        "llvm.nvvm.vote.any.sync") {
+      res = BinaryOperator::Create(Instruction::Mul, valid_8bit, predict, "",
+                                   sync_inst);
+    } else if (sync_inst->getCalledFunction()->getName() ==
+               "llvm.nvvm.vote.all.sync") {
+      auto reverse_valid = BinaryOperator::CreateNot(valid_8bit, "", sync_inst);
+      res = BinaryOperator::Create(Instruction::Or, reverse_valid, predict, "",
+                                   sync_inst);
+      // as AVX does not provide an 'all' reduction, we have to
+      // invert the inputs and reuse the AVX 'any' reduction instead
+      res = BinaryOperator::CreateNot(res, "", sync_inst);
+    }
+
+    auto store_mask =
+        new llvm::StoreInst(res, GEP, /*isVolatile=*/true, sync_inst);
+    // create barrier
+    CreateIntraWarpBarrier(sync_inst);
+    /*
+     * replace llvm.nvvm.vote.any.sync(i32 mask, i1 predicate)
+     * with warp_any(i8* warp_vote)
+     */
+    std::vector<llvm::Value *> args;
+    // args.push_back(mask);
+    args.push_back(warp_vote_ptr);
+    llvm::Instruction *warp_inst;
+    if (sync_inst->getCalledFunction()->getName() ==
+        "llvm.nvvm.vote.any.sync") {
+      warp_inst = llvm::CallInst::Create(func_warp_any, args, "", sync_inst);
+    } else if (sync_inst->getCalledFunction()->getName() ==
+               "llvm.nvvm.vote.all.sync") {
+      warp_inst = llvm::CallInst::Create(func_warp_all, args, "", sync_inst);
+    }
+    sync_inst->replaceAllUsesWith(warp_inst);
+    sync_inst->eraseFromParent();
+  }
+}
+
+void handle_warp_shfl(llvm::Module *M) {
+  std::set<llvm::CallInst *> need_replace;
+
+  llvm::Type *I32 = llvm::Type::getInt32Ty(M->getContext());
+  auto ZERO = llvm::ConstantInt::get(I32, 0, true);
+  // collect all warp-shuffle intrinsics that need rewriting
+  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
+    Function *F = &(*i);
+    if (!isKernelFunction(M, F))
+      continue;
+    Function::iterator I = F->begin();
+    for (Function::iterator E = F->end(); I != E; ++I) {
+      for (BasicBlock::iterator BI = I->begin(); BI != I->end(); BI++) {
+        if (CallInst *warp_shfl = dyn_cast<CallInst>(&(*BI))) {
+          auto func_name = warp_shfl->getCalledFunction()->getName();
+          if (func_name == "llvm.nvvm.shfl.sync.down.i32" ||
+              func_name == "llvm.nvvm.shfl.sync.up.i32" ||
+              func_name == "llvm.nvvm.shfl.sync.bfly.i32") {
+            need_replace.insert(warp_shfl);
+          }
+        }
+      }
+    }
+  }
+
+  GlobalVariable *warp_shfl_ptr = M->getNamedGlobal("warp_shfl");
+  for (auto shfl_inst : need_replace) {
+    /*
+     * %10 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add32,
+     *                                                   i32 16, i32 31)
+     * ->
+     * warp_shfl[lane_id] = %add32
+     * warp.barrier()
+     * %10 = warp_shfl[lane_id + offset]
+     */
+    IRBuilder<> builder(shfl_inst);
+
+    auto shfl_variable = shfl_inst->getArgOperand(1);
+    auto shfl_offset = shfl_inst->getArgOperand(2);
+
+    auto intra_warp_index =
+        builder.CreateLoad(M->getGlobalVariable("intra_warp_index"));
+    builder.CreateStore(
+        shfl_variable,
+        builder.CreateGEP(warp_shfl_ptr, {ZERO, intra_warp_index}));
+    // create a barrier before the store, so earlier reads have finished
+    CreateIntraWarpBarrier(intra_warp_index);
+    // load the shuffled data
+    auto new_intra_warp_index =
+        builder.CreateLoad(M->getGlobalVariable("intra_warp_index"));
+    auto shfl_name = shfl_inst->getCalledFunction()->getName().str();
+    if (shfl_name.find("down") != shfl_name.npos) {
+      auto calculate_offset = builder.CreateBinOp(
+          Instruction::Add, new_intra_warp_index,
shfl_offset); + auto new_index = builder.CreateBinOp(Instruction::SRem, calculate_offset, + ConstantInt::get(I32, 32)); + auto gep = builder.CreateGEP(warp_shfl_ptr, {ZERO, new_index}); + auto load_inst = builder.CreateLoad(gep); + + // create barrier + CreateIntraWarpBarrier(new_intra_warp_index); + shfl_inst->replaceAllUsesWith(load_inst); + shfl_inst->eraseFromParent(); + } else if (shfl_name.find("up") != shfl_name.npos) { + auto calculate_offset = builder.CreateBinOp( + Instruction::Sub, new_intra_warp_index, shfl_offset); + auto new_index = builder.CreateBinOp(Instruction::SRem, calculate_offset, + ConstantInt::get(I32, 32)); + auto gep = builder.CreateGEP(warp_shfl_ptr, {ZERO, new_index}); + auto load_inst = builder.CreateLoad(gep); + + // create barrier + CreateIntraWarpBarrier(new_intra_warp_index); + shfl_inst->replaceAllUsesWith(load_inst); + shfl_inst->eraseFromParent(); + } else if (shfl_name.find("bfly") != shfl_name.npos) { + auto calculate_offset = builder.CreateBinOp( + Instruction::Xor, new_intra_warp_index, shfl_offset); + auto new_index = builder.CreateBinOp(Instruction::SRem, calculate_offset, + ConstantInt::get(I32, 32)); + auto gep = builder.CreateGEP(warp_shfl_ptr, {ZERO, new_index}); + auto load_inst = builder.CreateLoad(gep); + + // create barrier + CreateIntraWarpBarrier(new_intra_warp_index); + shfl_inst->replaceAllUsesWith(load_inst); + shfl_inst->eraseFromParent(); + } + } +} diff --git a/compilation/examples/reduce/host.cpp b/compilation/examples/reduce/host.cpp new file mode 100644 index 0000000..297ff17 --- /dev/null +++ b/compilation/examples/reduce/host.cpp @@ -0,0 +1,82 @@ +#include +#include +#include +#include +#include + +#define NUM_WARP 2 +#define NUM_BLOCK 1 + +int block_size = 32 * NUM_WARP; +int block_size_x = block_size; +int block_size_y = 1; +int block_size_z = 1; +__thread int block_index = 0; +int grid_size = NUM_BLOCK; + +extern "C" { +void *_Z7reduce0PiS_j_wrapper(void *); +__thread int warp_shfl[32]; +} + +void *wrap(void *p) { + int **res = (int **)p; + block_index = (*(int *)res[3]); + _Z7reduce0PiS_j_wrapper(p); + return NULL; +} + +void *gen_input(int bid, int *g_idata, int *g_odata, unsigned int n) { + int **ret = new int *[4]; + + int **p0 = new int *; + *p0 = g_idata; + ret[0] = (int *)(p0); + + int **p1 = new int *; + *p1 = g_odata; + ret[1] = (int *)(p1); + + unsigned int *p2 = new unsigned int; + *p2 = n; + ret[2] = (int *)p2; + + int *p3 = new int; + *p3 = bid; + ret[3] = (int *)p3; + + return (void *)ret; +} + +int main(int argc, char *argv[]) { + int *g_idata; + + int size = block_size * NUM_BLOCK; + g_idata = new int[size * 2]; + int *res = new int[size]; + + for (int i = 0; i < size; i++) { + g_idata[i] = i; + } + + pthread_t threads[NUM_BLOCK]; + + void *inp[NUM_BLOCK]; + for (long t = 0; t < NUM_BLOCK; t++) { + inp[t] = gen_input(t, g_idata, res, size); + } + + for (long t = 0; t < NUM_BLOCK; t++) { + pthread_create(&threads[t], NULL, wrap, inp[t]); + } + for (long t = 0; t < NUM_BLOCK; t++) + pthread_join(threads[t], NULL); + int gold = 0; + for (int i = 0; i < size; i++) { + gold += g_idata[i]; + } + assert(*res == gold && "Incorrect res\n"); + printf("PASS\n"); + + pthread_exit(NULL); +} diff --git a/compilation/examples/reduce/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll b/compilation/examples/reduce/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll new file mode 100644 index 0000000..50b112d --- /dev/null +++ b/compilation/examples/reduce/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll @@ -0,0 +1,150 @@ +; ModuleID = 
'kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc' +source_filename = "kernel.cu" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 } + +@_ZZ7reduce0PiS_jE5sdata = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4 + +; Function Attrs: nounwind +define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) local_unnamed_addr #0 { +entry: + ret i32 999 +} + +; Function Attrs: nounwind +define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) local_unnamed_addr #0 { +entry: + ret i32 999 +} + +; Function Attrs: nounwind +define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) local_unnamed_addr #0 { +entry: + ret i32 999 +} + +; Function Attrs: nounwind +define weak dso_local i32 @cudaGetDevice(i32* %device) local_unnamed_addr #0 { +entry: + ret i32 999 +} + +; Function Attrs: nounwind +define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) local_unnamed_addr #0 { +entry: + ret i32 999 +} + +; Function Attrs: nounwind +define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) local_unnamed_addr #0 { +entry: + ret i32 999 +} + +; Function Attrs: convergent nounwind +define dso_local void @_Z7reduce0PiS_j(i32* nocapture readonly %g_idata, i32* nocapture %g_odata, i32 %n) local_unnamed_addr #1 { +entry: + %0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #4, !range !10 + %1 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #4, !range !11 + %2 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #4, !range !12 + %mul = mul i32 %2, %1 + %add = add i32 %mul, %0 + %cmp = icmp ult i32 %add, %n + br i1 %cmp, label %cond.true, label %cond.end + +cond.true: ; preds = %entry + %idxprom = zext i32 %add to i64 + %arrayidx = getelementptr inbounds i32, i32* %g_idata, i64 %idxprom + %3 = load i32, i32* %arrayidx, align 4, !tbaa !13 + br label %cond.end + +cond.end: ; preds = %entry, %cond.true + %cond = phi i32 [ %3, %cond.true ], [ 0, %entry ] + %idxprom5 = zext i32 %0 to i64 + %arrayidx635 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @_ZZ7reduce0PiS_jE5sdata, i64 0, i64 %idxprom5 + %arrayidx6 = addrspacecast i32 addrspace(3)* %arrayidx635 to i32* + store i32 %cond, i32* %arrayidx6, align 4, !tbaa !13 + tail call void @llvm.nvvm.barrier.sync(i32 0) #4 + %cmp839 = icmp ugt i32 %2, 1 + br i1 %cmp839, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %if.end, %cond.end + %cmp18 = icmp eq i32 %0, 0 + br i1 %cmp18, label %if.then19, label %if.end23 + +for.body: ; preds = %cond.end, %if.end + %s.040 = phi i32 [ %mul9, %if.end ], [ 1, %cond.end ] + %mul9 = shl nuw nsw i32 %s.040, 1 + %rem = urem i32 %0, %mul9 + %cmp10 = icmp eq i32 %rem, 0 + br i1 %cmp10, label %if.then, label %if.end + +if.then: ; preds = %for.body + %add11 = add i32 %s.040, %0 + %idxprom12 = zext i32 %add11 to i64 + %arrayidx1336 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @_ZZ7reduce0PiS_jE5sdata, i64 0, i64 %idxprom12 + %arrayidx13 = addrspacecast i32 addrspace(3)* %arrayidx1336 to i32* + %4 = load i32, i32* %arrayidx13, align 4, !tbaa !13 + %5 = load i32, i32* %arrayidx6, align 4, !tbaa !13 + %add16 = add nsw i32 %5, %4 + store i32 %add16, i32* %arrayidx6, align 4, !tbaa !13 + br label %if.end + 
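+; note (added for clarity): this kernel is the classic interleaved-addressing
+; tree reduction: on each pass, threads whose tid is a multiple of 2*s
+; accumulate sdata[tid + s] into sdata[tid], with a barrier between passes.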
+if.end: ; preds = %if.then, %for.body + tail call void @llvm.nvvm.barrier.sync(i32 0) #4 + %cmp8 = icmp ult i32 %mul9, %2 + br i1 %cmp8, label %for.body, label %for.cond.cleanup + +if.then19: ; preds = %for.cond.cleanup + %idxprom21 = zext i32 %1 to i64 + %arrayidx22 = getelementptr inbounds i32, i32* %g_odata, i64 %idxprom21 + %6 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* addrspacecast ([64 x i32] addrspace(3)* @_ZZ7reduce0PiS_jE5sdata to [64 x i32]*), i64 0, i64 0), align 4, !tbaa !13 + store i32 %6, i32* %arrayidx22, align 4, !tbaa !13 + br label %if.end23 + +if.end23: ; preds = %if.then19, %for.cond.cleanup + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2 + +; Function Attrs: convergent nounwind +declare void @llvm.nvvm.barrier.sync(i32) #3 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind readnone } +attributes #3 = { convergent nounwind } +attributes #4 = { nounwind } + +!llvm.module.flags = !{!0, !1, !2} +!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6} +!llvm.ident = !{!8} +!nvvmir.version = !{!9} + +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0} +!3 = !{void (i32*, i32*, i32)* @_Z7reduce0PiS_j, !"kernel", i32 1} +!4 = !{null, !"align", i32 8} +!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!6 = !{null, !"align", i32 16} +!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"} +!9 = !{i32 1, i32 4} +!10 = !{i32 0, i32 1024} +!11 = !{i32 0, i32 2147483647} +!12 = !{i32 1, i32 1025} +!13 = !{!14, !14, i64 0} +!14 = !{!"int", !15, i64 0} +!15 = !{!"omnipotent char", !16, i64 0} +!16 = !{!"Simple C++ TBAA"} diff --git a/compilation/examples/reduce/run.sh b/compilation/examples/reduce/run.sh new file mode 100644 index 0000000..93cd3fd --- /dev/null +++ b/compilation/examples/reduce/run.sh @@ -0,0 +1,6 @@ +#!/bin/bash +llvm-as kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll +../../../build/compilation/kernelTranslator kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc 1 1 1 64 1 1 +llc --filetype=obj kernel.bc +g++ host.cpp kernel.o -lpthread -o test +./test diff --git a/compilation/examples/reduce_shuffle/host.cpp b/compilation/examples/reduce_shuffle/host.cpp new file mode 
100644
index 0000000..41c5ae8
--- /dev/null
+++ b/compilation/examples/reduce_shuffle/host.cpp
@@ -0,0 +1,82 @@
+#include <assert.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#define NUM_WARP 2
+#define NUM_BLOCK 1
+
+int block_size = 32 * NUM_WARP;
+int block_size_x = block_size;
+int block_size_y = 1;
+int block_size_z = 1;
+__thread int block_index = 0;
+int grid_size = NUM_BLOCK;
+
+extern "C" {
+void *_Z7reduce5PiS_j_wrapper(void *);
+__thread int warp_shfl[32];
+}
+
+void *wrap(void *p) {
+  int **res = (int **)p;
+  block_index = (*(int *)res[3]);
+  _Z7reduce5PiS_j_wrapper(p);
+  return NULL;
+}
+
+void *gen_input(int bid, int *g_idata, int *g_odata, unsigned int n) {
+  int **ret = new int *[4];
+
+  int **p0 = new int *;
+  *p0 = g_idata;
+  ret[0] = (int *)(p0);
+
+  int **p1 = new int *;
+  *p1 = g_odata;
+  ret[1] = (int *)(p1);
+
+  unsigned int *p2 = new unsigned int;
+  *p2 = n;
+  ret[2] = (int *)p2;
+
+  int *p3 = new int;
+  *p3 = bid;
+  ret[3] = (int *)p3;
+
+  return (void *)ret;
+}
+
+int main(int argc, char *argv[]) {
+  int *g_idata;
+
+  int size = block_size * NUM_BLOCK;
+  g_idata = new int[size * 2];
+  int *res = new int[size];
+
+  for (int i = 0; i < size; i++) {
+    g_idata[i] = i;
+  }
+
+  pthread_t threads[NUM_BLOCK];
+
+  void *inp[NUM_BLOCK];
+  for (long t = 0; t < NUM_BLOCK; t++) {
+    inp[t] = gen_input(t, g_idata, res, size);
+  }
+
+  for (long t = 0; t < NUM_BLOCK; t++) {
+    pthread_create(&threads[t], NULL, wrap, inp[t]);
+  }
+  for (long t = 0; t < NUM_BLOCK; t++)
+    pthread_join(threads[t], NULL);
+  int gold = 0;
+  for (int i = 0; i < size; i++) {
+    gold += g_idata[i];
+  }
+  assert(*res == gold && "Incorrect res\n");
+  printf("PASS\n");
+
+  pthread_exit(NULL);
+}
diff --git a/compilation/examples/reduce_shuffle/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll b/compilation/examples/reduce_shuffle/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
new file mode 100644
index 0000000..7d056fd
--- /dev/null
+++ b/compilation/examples/reduce_shuffle/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
@@ -0,0 +1,179 @@
+; ModuleID = 'kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc'
+source_filename = "kernel.cu"
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
+
+@_ZZ7reduce5PiS_jE5sdata = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
+
+; Function Attrs: nounwind
+define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) local_unnamed_addr #0 {
+entry:
+  ret i32 999
+}
+
+; Function Attrs: nounwind
+define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) local_unnamed_addr #0 {
+entry:
+  ret i32 999
+}
+
+; Function Attrs: nounwind
+define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) local_unnamed_addr #0 {
+entry:
+  ret i32 999
+}
+
+; Function Attrs: nounwind
+define weak dso_local i32 @cudaGetDevice(i32* %device) local_unnamed_addr #0 {
+entry:
+  ret i32 999
+}
+
+; Function Attrs: nounwind
+define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) local_unnamed_addr #0 {
+entry:
+  ret i32 999
+}
+
+; Function Attrs: nounwind
+define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) local_unnamed_addr #0 {
+entry:
+  ret i32 999
+}
+
+; Function Attrs: convergent nounwind
+define dso_local void @_Z7reduce5PiS_j(i32*
nocapture readonly %g_idata, i32* nocapture %g_odata, i32 %n) local_unnamed_addr #1 { +entry: + %0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #5, !range !10 + %1 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #5, !range !11 + %mul = shl i32 %1, 7 + %add = add i32 %mul, %0 + %cmp = icmp ult i32 %add, %n + br i1 %cmp, label %cond.true, label %cond.end + +cond.true: ; preds = %entry + %idxprom = zext i32 %add to i64 + %arrayidx = getelementptr inbounds i32, i32* %g_idata, i64 %idxprom + %2 = load i32, i32* %arrayidx, align 4, !tbaa !12 + br label %cond.end + +cond.end: ; preds = %entry, %cond.true + %cond = phi i32 [ %2, %cond.true ], [ 0, %entry ] + %add4 = add i32 %add, 64 + %cmp5 = icmp ult i32 %add4, %n + br i1 %cmp5, label %if.then, label %if.end + +if.then: ; preds = %cond.end + %idxprom7 = zext i32 %add4 to i64 + %arrayidx8 = getelementptr inbounds i32, i32* %g_idata, i64 %idxprom7 + %3 = load i32, i32* %arrayidx8, align 4, !tbaa !12 + %add9 = add nsw i32 %3, %cond + br label %if.end + +if.end: ; preds = %if.then, %cond.end + %mySum.0 = phi i32 [ %add9, %if.then ], [ %cond, %cond.end ] + %idxprom10 = zext i32 %0 to i64 + %arrayidx1150 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @_ZZ7reduce5PiS_jE5sdata, i64 0, i64 %idxprom10 + %arrayidx11 = addrspacecast i32 addrspace(3)* %arrayidx1150 to i32* + store i32 %mySum.0, i32* %arrayidx11, align 4, !tbaa !12 + tail call void @llvm.nvvm.barrier.sync(i32 0) #5 + tail call void @llvm.nvvm.barrier.sync(i32 0) #5 + tail call void @llvm.nvvm.barrier.sync(i32 0) #5 + tail call void @llvm.nvvm.barrier.sync(i32 0) #5 + %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.z() #5, !range !16 + %5 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.y() #5, !range !17 + %mul.i.i52 = mul nuw nsw i32 %5, %4 + %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #5, !range !17 + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.y() #5, !range !10 + %mul39.i.i53 = add nuw nsw i32 %7, %mul.i.i52 + %add.i.i54 = mul nuw nsw i32 %mul39.i.i53, %6 + %add8.i.i55 = add nuw nsw i32 %add.i.i54, %0 + %cmp14 = icmp ult i32 %add8.i.i55, 32 + br i1 %cmp14, label %if.then15, label %if.end32 + +if.then15: ; preds = %if.end + %add16 = add nuw nsw i32 %0, 32 + %idxprom17 = zext i32 %add16 to i64 + %arrayidx1851 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @_ZZ7reduce5PiS_jE5sdata, i64 0, i64 %idxprom17 + %arrayidx18 = addrspacecast i32 addrspace(3)* %arrayidx1851 to i32* + %8 = load i32, i32* %arrayidx18, align 4, !tbaa !12 + %add19 = add nsw i32 %8, %mySum.0 + %9 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add19, i32 16, i32 31) #5 + %add23 = add nsw i32 %9, %add19 + %10 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add23, i32 8, i32 31) #5 + %add23.1 = add nsw i32 %10, %add23 + %11 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add23.1, i32 4, i32 31) #5 + %add23.2 = add nsw i32 %11, %add23.1 + %12 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add23.2, i32 2, i32 31) #5 + %add23.3 = add nsw i32 %12, %add23.2 + %13 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add23.3, i32 1, i32 31) #5 + %cmp27 = icmp eq i32 %add8.i.i55, 0 + br i1 %cmp27, label %if.then28, label %if.end32 + +if.then28: ; preds = %if.then15 + %add23.4 = add nsw i32 %13, %add23.3 + %idxprom30 = zext i32 %1 to i64 + %arrayidx31 = getelementptr inbounds i32, i32* %g_odata, i64 %idxprom30 + store i32 %add23.4, i32* %arrayidx31, align 4, !tbaa !12 + br label %if.end32 + +if.end32: ; preds = %if.end, %if.then28, 
%if.then15
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.z() #2
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y() #2
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #2
+
+; Function Attrs: convergent nounwind
+declare void @llvm.nvvm.barrier.sync(i32) #3
+
+; Function Attrs: convergent inaccessiblememonly nounwind
+declare i32 @llvm.nvvm.shfl.sync.down.i32(i32, i32, i32, i32) #4
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone }
+attributes #3 = { convergent nounwind }
+attributes #4 = { convergent inaccessiblememonly nounwind }
+attributes #5 = { nounwind }
+
+!llvm.module.flags = !{!0, !1, !2}
+!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
+!llvm.ident = !{!8}
+!nvvmir.version = !{!9}
+
+!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
+!1 = !{i32 1, !"wchar_size", i32 4}
+!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
+!3 = !{void (i32*, i32*, i32)* @_Z7reduce5PiS_j, !"kernel", i32 1}
+!4 = !{null, !"align", i32 8}
+!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
+!6 = !{null, !"align", i32 16}
+!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
+!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
+!9 = !{i32 1, i32 4}
+!10 = !{i32 0, i32 1024}
+!11 = !{i32 0, i32 2147483647}
+!12 = !{!13, !13, i64 0}
+!13 = !{!"int", !14, i64 0}
+!14 = !{!"omnipotent char", !15, i64 0}
+!15 = !{!"Simple C++ TBAA"}
+!16 = !{i32 0, i32 64}
+!17 = !{i32 1, i32 1025}
diff --git a/compilation/examples/reduce_shuffle/run.sh b/compilation/examples/reduce_shuffle/run.sh
new file mode 100644
index 0000000..93cd3fd
--- /dev/null
+++ b/compilation/examples/reduce_shuffle/run.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+llvm-as kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
+../../../build/compilation/kernelTranslator kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc 1 1 1 64 1 1
+llc --filetype=obj kernel.bc
+g++ host.cpp kernel.o -lpthread -o test
+./test
diff --git a/compilation/examples/run_example.sh b/compilation/examples/run_example.sh
new file mode 100644
index 0000000..8600e28
--- /dev/null
+++ b/compilation/examples/run_example.sh
@@ -0,0 +1,11 @@
+#!/bin/sh
+for file in ./*
+do
+  if test -d "$file"
+  then
+    echo executing "$file"
+    cd "$file"
+    bash run.sh
+    cd ..
+  fi
+done
diff --git a/compilation/examples/vecadd/host.cpp b/compilation/examples/vecadd/host.cpp
new file mode 100644
index 0000000..153d7d5
--- /dev/null
+++ b/compilation/examples/vecadd/host.cpp
@@ -0,0 +1,84 @@
+#include <assert.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#define NUM_BLOCK 1
+int N = 32;
+
+int block_size = 32;
+int block_size_x = block_size;
+int block_size_y = 1;
+int block_size_z = 1;
+__thread int block_index = 0;
+int grid_size = NUM_BLOCK;
+
+extern "C" {
+void *_Z9vectorAddPKfS0_Pfi_wrapper(void *);
+}
+
+void *wrap(void *p) {
+  int **res = (int **)p;
+  block_index = (*(int *)res[4]);
+  _Z9vectorAddPKfS0_Pfi_wrapper(p);
+  return NULL;
+}
+
+void *gen_input(int bid, float *A, float *B, float *C, int N) {
+  int **ret = new int *[5];
+
+  float **p0 = new float *;
+  *p0 = A;
+  ret[0] = (int *)(p0);
+
+  float **p1 = new float *;
+  *p1 = B;
+  ret[1] = (int *)(p1);
+
+  float **p2 = new float *;
+  *p2 = C;
+  ret[2] = (int *)(p2);
+
+  int *p3 = new int;
+  *p3 = N;
+  ret[3] = (int *)p3;
+
+  int *p4 = new int;
+  *p4 = bid;
+  ret[4] = (int *)p4;
+
+  return (void *)ret;
+}
+
+int main() {
+  float *A, *B, *C;
+
+  A = new float[N];
+  B = new float[N];
+  C = new float[N];
+
+  for (int i = 0; i < N; i++) {
+    A[i] = i;
+    B[i] = 1;
+    C[i] = 0;
+  }
+
+  pthread_t threads[NUM_BLOCK];
+
+  int rc;
+  for (long t = 0; t < NUM_BLOCK; t++) {
+    void *inp = gen_input(t, A, B, C, N);
+    rc = pthread_create(&threads[t], NULL, wrap, inp);
+  }
+  clock_t t1 = clock();
+  /* wait for all launched blocks to finish */
+  for (long t = 0; t < NUM_BLOCK; t++)
+    pthread_join(threads[t], NULL);
+
+  for (int i = 0; i < N; i++) {
+    assert(C[i] == (A[i] + B[i]));
+  }
+  printf("PASS\n");
+  pthread_exit(NULL);
+}
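+// Conventions shared by all three examples (illustrative note, not part of
+// the generated code): kernelTranslator emits a <kernel>_wrapper symbol
+// that consumes the argument array built by gen_input, one slot per kernel
+// parameter with the block id appended last, and the thread-local
+// block_index tells the translated kernel which CUDA block a pthread is
+// executing. In each run.sh, the six integers passed to kernelTranslator
+// are the launch configuration: gridDim x y z, then blockDim x y z
+// (e.g. 1 1 1 32 1 1 here).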
diff --git a/compilation/examples/vecadd/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll b/compilation/examples/vecadd/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
new file mode 100644
index 0000000..57d6b64
--- /dev/null
+++ b/compilation/examples/vecadd/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
@@ -0,0 +1,86 @@
+; ModuleID = 'kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc'
+source_filename = "kernel.cu"
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
+
+; Function Attrs: nounwind
+define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) local_unnamed_addr #0 {
+entry:
+  ret i32 999
+}
+
+; Function Attrs: nounwind
+define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) local_unnamed_addr #0 {
+entry:
+  ret i32 999
+}
+
+; Function Attrs: nounwind
+define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) local_unnamed_addr #0 {
+entry:
+  ret i32 999
+}
+
+; Function Attrs: nounwind
+define weak dso_local i32 @cudaGetDevice(i32* %device) local_unnamed_addr #0 {
+entry:
+  ret i32 999
+}
+
+; Function Attrs: nounwind
+define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) local_unnamed_addr #0 {
+entry:
+  ret i32 999
+}
+
+; Function Attrs: nounwind
+define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) local_unnamed_addr #0 {
+entry:
+  ret i32 999
+}
+
+; Function Attrs: nofree nounwind
+define dso_local void @_Z9vectorAddPKfS0_Pfi(float* nocapture readonly %A, float* nocapture readonly %B, float* nocapture %C, i32 %numElements) local_unnamed_addr #1 {
+entry:
+  %0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3, !range !10
+  %idxprom8 = zext i32 %0 to i64
+  %arrayidx = getelementptr inbounds float, float* %A, i64 %idxprom8
+  %1 = load float, float* %arrayidx, align 4, !tbaa !11
+  %arrayidx2 = getelementptr inbounds float, float* %B, i64 %idxprom8
+  %2 = load float, float* %arrayidx2, align 4, !tbaa !11
+  %add = fadd contract float %1, %2
+  %arrayidx4 = getelementptr inbounds float, float* %C, i64 %idxprom8
+  store float %add, float* %arrayidx4, align 4, !tbaa !11
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nofree nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone }
+attributes #3 = { nounwind }
+
+!llvm.module.flags = !{!0, !1, !2}
+!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
+!llvm.ident = !{!8}
+!nvvmir.version = !{!9}
+
+!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
+!1 = !{i32 1, !"wchar_size", i32 4}
+!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
+!3 = !{void (float*, float*, float*, i32)* @_Z9vectorAddPKfS0_Pfi, !"kernel", i32 1}
+!4 = !{null, !"align", i32 8}
+!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
+!6 = !{null, !"align", i32 16}
+!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
+!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
+!9 = !{i32 1, i32 4}
+!10 = !{i32 0, i32 1024}
+!11 = !{!12, !12, i64 0}
+!12 = !{!"float", !13, i64 0}
+!13 = !{!"omnipotent char", !14, i64 0}
+!14 = !{!"Simple C++ TBAA"}
diff --git a/compilation/examples/vecadd/run.sh b/compilation/examples/vecadd/run.sh
new file mode 100644
index 0000000..973a99c
--- /dev/null
+++ b/compilation/examples/vecadd/run.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+llvm-as kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
+../../../build/compilation/kernelTranslator kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc 1 1 1 32 1 1
+llc --filetype=obj kernel.bc
+g++ host.cpp kernel.o -lpthread -o test
+./test
diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt
new file mode 100644
index 0000000..31591ae
--- /dev/null
+++ b/runtime/CMakeLists.txt
@@ -0,0 +1,16 @@
+cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
+project(
+  X86runtime
+  DESCRIPTION "Implementation of the CUDA runtime API on x86"
+  LANGUAGES CXX)
+set(LIB_NAME x86Runtime)
+set(CMAKE_VERBOSE_MAKEFILE ON)
+
+# compile threadPool implementation
+add_subdirectory(threadPool)
+
+# compile x86 runtime library
+include_directories(./include)
+include_directories(./threadPool/include)
+file(GLOB proj_SOURCES "lib/*.cpp")
+add_library(${LIB_NAME} SHARED ${proj_SOURCES})
+target_link_libraries(${LIB_NAME} threadPool pthread)
diff --git a/runtime/include/cudaRuntimeImpl.h b/runtime/include/cudaRuntimeImpl.h
new file mode 100644
index 0000000..0f5b8ae
--- /dev/null
+++ b/runtime/include/cudaRuntimeImpl.h
@@ -0,0 +1,19 @@
+#ifndef __RUNTIME_IMPL__
+#define __RUNTIME_IMPL__
+#include "cudaStatus.h"
+#include "structures.h"
+cudaError_t cudaDeviceReset(void);
+cudaError_t cudaDeviceSynchronize(void);
+cudaError_t cudaFree(void *devPtr);
+cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
+                             void **args, size_t sharedMem,
+                             cudaStream_t stream);
+cudaError_t cudaMalloc(void **devPtr, size_t size);
+cudaError_t cudaMemcpy(void *dst, const void *src, size_t count,
+                       cudaMemcpyKind kind);
+cudaError_t cudaSetDevice(int device);
+cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src);
+cudaError_t cudaStreamCreate(cudaStream_t *pStream);
+cudaError_t cudaStreamDestroy(cudaStream_t stream);
+cudaError_t cudaStreamSynchronize(cudaStream_t stream);
+#endif
diff --git a/runtime/include/cudaStatus.h b/runtime/include/cudaStatus.h
new file mode 100644
index 0000000..8b60edc
--- /dev/null
+++ b/runtime/include/cudaStatus.h
@@ -0,0 +1,18 @@
+#ifndef __RUNTIME_STATUS__
+#define __RUNTIME_STATUS__
+#include <stddef.h>
+
+enum cudaError_t {
+  cudaSuccess = 0,
+  cudaErrorInvalidValue = 1,
+  cudaErrorMemoryAllocation = 2,
+};
+
+enum cudaMemcpyKind {
+  cudaMemcpyHostToHost = 0,
+  cudaMemcpyHostToDevice = 1,
+  cudaMemcpyDeviceToHost = 2,
+  cudaMemcpyDeviceToDevice = 3,
+  cudaMemcpyDefault = 4,
+};
+#endif
diff --git a/runtime/lib/cudaRuntimeImpl.cpp b/runtime/lib/cudaRuntimeImpl.cpp
new file mode 100644
index 0000000..83054f3
--- /dev/null
+++ b/runtime/lib/cudaRuntimeImpl.cpp
@@ -0,0 +1,100 @@
+#include "cudaRuntimeImpl.h"
+#include "api.h"
+#include "def.h"
+#include "macros.h"
+#include <stdlib.h>
+#include <string.h>
+cudaError_t cudaDeviceReset(void) {
+  scheduler_uninit();
+  return cudaSuccess;
+}
+cudaError_t cudaDeviceSynchronize(void) {
+  cuSynchronizeBarrier();
+  return cudaSuccess;
+}
+cudaError_t cudaFree(void *devPtr) {
+  free(devPtr);
+  return cudaSuccess;
+}
+cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
+                             void **args, size_t sharedMem,
+                             cudaStream_t stream) {
+  // if scheduler is null init device
+
+  cu_kernel *ker =
+      create_kernel(func, gridDim, blockDim, &args, sharedMem, stream);
+
+  int lstatus = cuLaunchKernel(&ker);
+  return lstatus == C_SUCCESS ? cudaSuccess : cudaErrorInvalidValue;
+}
+cudaError_t cudaMalloc(void **devPtr, size_t size) {
+  *devPtr = malloc(size);
+  if (*devPtr == NULL)
+    return cudaErrorMemoryAllocation;
+  return cudaSuccess;
+}
+cudaError_t cudaMemcpy(void *dst, const void *src, size_t count,
+                       cudaMemcpyKind kind) {
+  // the host and the emulated device share a single address space, so
+  // every direction reduces to a plain memcpy
+  if (kind == cudaMemcpyHostToHost) {
+    memcpy(dst, src, count);
+  } else if (kind == cudaMemcpyHostToDevice) {
+    memcpy(dst, src, count);
+  } else if (kind == cudaMemcpyDeviceToHost) {
+    memcpy(dst, src, count);
+  } else if (kind == cudaMemcpyDeviceToDevice) {
+    memcpy(dst, src, count);
+  } else if (kind == cudaMemcpyDefault) {
+    memcpy(dst, src, count);
+  }
+  return cudaSuccess;
+}
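+
+// Usage sketch (illustrative only; `my_kernel_wrapper`, `args`, `h_a` and
+// `n` are hypothetical names, not symbols defined in this patch): a
+// translated host program is expected to drive the runtime like this:
+//   float *d_a;
+//   cudaMalloc((void **)&d_a, n * sizeof(float));
+//   cudaMemcpy(d_a, h_a, n * sizeof(float), cudaMemcpyHostToDevice);
+//   cudaLaunchKernel((const void *)my_kernel_wrapper, dim3(numBlocks),
+//                    dim3(blockSize), args, 0, 0);
+//   cudaDeviceSynchronize();
+//   cudaMemcpy(h_a, d_a, n * sizeof(float), cudaMemcpyDeviceToHost);
+//   cudaFree(d_a);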
+
+cudaError_t cudaSetDevice(int device) {
+  // error checking
+  init_device();
+  return cudaSuccess;
+}
+
+cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src) {
+  cstreamData *dst_stream = (cstreamData *)dst;
+  cstreamData *src_stream = (cstreamData *)src;
+
+  if (dst_stream == NULL || src_stream == NULL) {
+    return cudaErrorInvalidValue; // 1
+  }
+
+  dst_stream->stream_priority = src_stream->stream_priority;
+  dst_stream->stream_flags = src_stream->stream_flags;
+
+  return cudaSuccess; // 0
+}
+
+static unsigned int stream_counter = 0;
+
+cudaError_t cudaStreamCreate(cudaStream_t *pStream) {
+  cstreamData *s = (cstreamData *)calloc(1, sizeof(cstreamData));
+  if (s == NULL)
+    return cudaErrorMemoryAllocation;
+  s->ev.status = C_RUN;
+  s->id = stream_counter;
+  stream_counter++;
+  s->stream_priority = DEFAULT;
+  create_KernelQueue(&(s->kernelQueue));
+
+  INIT_LOCK(s->stream_lock);
+  *pStream = (cudaStream_t)(s);
+
+  return cudaSuccess;
+}
+
+cudaError_t cudaStreamDestroy(cudaStream_t stream) {
+  cstreamData *s = (cstreamData *)(stream);
+
+  free(s->kernelQueue);
+
+  DESTROY_LOCK(s->stream_lock);
+
+  free(s);
+
+  return cudaSuccess;
+}
+
+cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
+  cstreamData *e = ((cstreamData *)(stream));
+  MUTEX_LOCK(e->stream_lock);
+
+  e->ev.status = C_SYNCHRONIZE;
+  e->ev.numKernelsToWait = e->kernelQueue->waiting_count;
+  MUTEX_UNLOCK(e->stream_lock);
+  return cudaSuccess;
+}
diff --git a/runtime/threadPool/CMakeLists.txt b/runtime/threadPool/CMakeLists.txt
new file mode 100644
index 0000000..ed599bf
--- /dev/null
+++ b/runtime/threadPool/CMakeLists.txt
@@ -0,0 +1,17 @@
+cmake_minimum_required(VERSION 3.1)
+
+# C++ project
+project(
+  ThreadPool
+  DESCRIPTION "A thread pool implemented with pthreads"
+  LANGUAGES CXX)
+
+set(CMAKE_VERBOSE_MAKEFILE ON)
+set(LIB_NAME threadPool)
+
+set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_BUILD_TYPE Debug)
+include_directories(./include)
+
+file(GLOB proj_SOURCES "lib/*.cpp")
+add_library(${LIB_NAME} SHARED ${proj_SOURCES})
diff --git a/runtime/threadPool/include/api.h b/runtime/threadPool/include/api.h
new file mode 100644
index 0000000..0c0aad6
--- /dev/null
+++ b/runtime/threadPool/include/api.h
@@ -0,0 +1,25 @@
+#ifndef C_API_H
+#define C_API_H
+
+#include "structures.h"
+
+int init_device();
+cu_kernel *create_kernel(const void *func, dim3 gridDim, dim3 blockDim,
+                         void ***args, size_t sharedMem, cstreamData *stream);
+int getWorkItem(struct kernel_queue **qu, cu_kernel **kern, int blockId);
+int create_KernelQueue(kernel_queue **q);
+
+int dequeKernelLL(struct kernel_queue **qu);
+
+int dequeKernel(struct kernel_queue **qu, cu_kernel *ker);
+int enqueueKernel(struct kernel_queue **qu, cu_kernel **ker);
+
+int schedulerEnqueueKernel(cu_kernel **k);
+int cuLaunchKernel(cu_kernel **k);
+
+int scheduler_init(cu_device device);
+void scheduler_uninit();
+void cuSynchronizeBarrier();
+
+int set_kernel_arguments(cu_kernel **k, unsigned int arg_num, void **arg_value);
+
+int setKernelDimensions(cu_kernel *k, struct argument **arg,
+                        void **totalBlockSize, void *blockId);
+
+#endif
diff --git a/runtime/threadPool/include/def.h b/runtime/threadPool/include/def.h
new file mode 100644
index 0000000..259733e
--- /dev/null
+++ b/runtime/threadPool/include/def.h
@@ -0,0 +1,26 @@
+#ifndef C_DEF_H
+#define C_DEF_H
+
+// Error
+#define C_SUCCESS 0x0
+#define C_ERROR 0x1
+
+// execution status
+#define C_COMPLETE 0x2
+#define C_RUNNING 0x3
+#define C_SUBMITTED 0x4
+#define C_QUEUED 0x5
+#define C_CREATED 0x9
+
+// stream status
+#define C_RUN 0x1
+#define C_WAIT 0x2
+#define C_SYNCHRONIZE 0x3
+
+// Not Initialized Error
+#define C_ERROR_NOT_INITIALIZED 0x6
+#define C_ERROR_MEMALLOC 0x7
+
+#define C_QUEUE_EMPTY 0x8
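+
+// Intended lifecycle (illustrative; this patch only sets C_COMPLETE
+// explicitly): create_kernel -> C_CREATED, enqueueKernel -> C_QUEUED,
+// a worker claiming blocks -> C_RUNNING, last block done -> C_COMPLETE.
+// A stream moves between C_RUN and C_WAIT while kernels are in flight,
+// and sits in C_SYNCHRONIZE while a synchronize drains its queue.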
+
+#endif
diff --git a/runtime/threadPool/include/macros.h b/runtime/threadPool/include/macros.h
new file mode 100644
index 0000000..b047c28
--- /dev/null
+++ b/runtime/threadPool/include/macros.h
@@ -0,0 +1,38 @@
+#ifndef C_MACROS_H
+#define C_MACROS_H
+#include "assert.h"
+#include <pthread.h>
+
+#define INIT_LOCK(__LOCK__)                                                    \
+  do {                                                                         \
+    int r = pthread_mutex_init(&(__LOCK__), NULL);                             \
+    assert(r == 0);                                                            \
+  } while (0)
+
+#define MUTEX_LOCK(__LOCK__)                                                   \
+  do {                                                                         \
+    int r = pthread_mutex_lock(&(__LOCK__));                                   \
+    assert(r == 0);                                                            \
+  } while (0)
+
+#define MUTEX_UNLOCK(__LOCK__)                                                 \
+  do {                                                                         \
+    int r = pthread_mutex_unlock(&(__LOCK__));                                 \
+    assert(r == 0);                                                            \
+  } while (0)
+
+#define DESTROY_LOCK(__LOCK__)                                                 \
+  do {                                                                         \
+    int r = pthread_mutex_destroy(&(__LOCK__));                                \
+    assert(r == 0);                                                            \
+  } while (0)
+
+#endif // HEADER_FILE
diff --git a/runtime/threadPool/include/structures.h b/runtime/threadPool/include/structures.h
new file mode 100644
index 0000000..d78a034
--- /dev/null
+++ b/runtime/threadPool/include/structures.h
@@ -0,0 +1,191 @@
+#ifndef C_STRUCTURES_H
+#define C_STRUCTURES_H
+
+#include "pthread.h"
+// a cudaStream_t is an opaque handle to a stream object
+#define cudaStream_t cstreamData *
+typedef struct device {
+  int max_compute_units;
+  int device_id;
+} cu_device;
+
+typedef struct c_thread {
+  pthread_t thread;
+  unsigned long executed_commands;
+  unsigned index;
+  bool exit;
+} cu_ptd;
+
+typedef struct scheduler_pool {
+
+  struct c_thread *thread_pool;
+
+  size_t num_worker_threads;
+  size_t local_mem_size;
+  int num_kernel_launch;
+  int num_kernel_finished;
+  int num_kernel_queued;
+  size_t idle_threads;
+
+  pthread_cond_t wake_pool;
+
+  int threadpool_shutdown_requested;
+
+  // lock for scheduler
+  pthread_mutex_t work_queue_lock;
+
+  // C99 array at the end
+  // user kernel queue for only user called functions
+  struct kernel_queue *kernelQueue;
+
+} cu_pool;
+
+struct kernel_queue {
+
+  struct kernel *head;
+  struct kernel *tail;
+
+  // finish command count
+  unsigned long finish_count;
+
+  // waiting to be run on threads
+  unsigned long waiting_count;
+
+  // running count
+  unsigned long running_count;
+
+  // total count
+  unsigned long kernel_count;
+
+  // current index for task to be run
+  unsigned long current_index;
+};
+
+typedef struct command {
+
+  struct kernel *ker;
+
+  struct command *next;
+  struct command *prev;
+
+} cu_command;
+
+typedef struct argument {
+  // size of the argument to allocation
+  size_t size;
+  void *value;
+  unsigned int index;
+} cu_argument;
+
+typedef struct input_arg {
+  // real values for the input
+  char *p;
+  struct argument *argus[];
+  // (TODO): implement meta_data
+  // the type of metadata will need to change to list of ints or something
+  // so that we can parse the arguments p
+} cu_input;
+
+struct dim3 {
+  size_t x;
+  size_t y;
+  size_t z;
+  dim3(int d1) {
+    x = d1;
+    y = z = 1;
+  }
+  dim3() { x = y = z = 1; }
+};
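+
+// dim3 mirrors the CUDA built-in: a bare integer converts to an x-only
+// extent, e.g. (illustrative)
+//   dim3 grid(NUM_BLOCK); // grid.x == NUM_BLOCK, grid.y == grid.z == 1
+//   dim3 one;             // 1 x 1 x 1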
+
+enum StreamType {
+  DEFAULT,
+  LOW,
+  HIGH,
+  EXT,
+};
+
+struct cStreamDataInternal {
+  /*
+  status of the stream (run, wait)
+  Run: the stream asynchronously hands kernels assigned to it to the scheduler
+  Wait: the stream holds further kernels back until the running ones finish
+  */
+  int status;
+  /*
+  if status == wait, wait for the number of outstanding kernels to become 0
+  */
+  unsigned long numKernelsToWait;
+  unsigned int lastKernelIdToWait;
+  unsigned int count; // number of tasks left in the stream
+};
+
+typedef struct streamData {
+
+  // execution status of current event monitor
+  struct cStreamDataInternal ev;
+  pthread_mutex_t stream_lock; // lock on the stream
+  StreamType stream_priority;
+  unsigned int id;
+  unsigned int stream_flags;
+
+  // queue of the kernels in this stream
+  struct kernel_queue *kernelQueue;
+
+} cstreamData;
+// kernel information
+typedef struct kernel {
+
+  void *(*start_routine)(void *);
+
+  void **args;
+
+  dim3 gridDim;
+  dim3 blockDim;
+
+  struct kernel *next;
+  struct kernel *prev;
+
+  size_t shared_mem;
+
+  cstreamData *stream;
+
+  struct event *barrier;
+
+  int status;
+
+  int totalBlocks;
+  int N;
+
+  int blockSize;
+  int kernelId;
+
+  // current blockId
+  int blockId;
+
+  void *shared_mem_loc;
+
+} cu_kernel;
+
+typedef struct asyncKernel {
+  unsigned int numBlocks;
+  unsigned int numThreads;
+  struct event *evt;
+  struct kernel *ker;
+
+  struct asyncKernel *prev;
+  struct asyncKernel *next;
+
+} asyncKernel;
+
+// command queue of command nodes
+
+typedef struct kernel_arg_array {
+  size_t size;
+  unsigned int index;
+} karg_arr;
+
+typedef struct kernel_image_arg {
+  size_t size;
+  unsigned int index;
+} k_arg;
+
+#endif // HEADER_FILE
diff --git a/runtime/threadPool/lib/api.cpp b/runtime/threadPool/lib/api.cpp
new file mode 100644
index 0000000..491d0d7
--- /dev/null
+++ b/runtime/threadPool/lib/api.cpp
@@ -0,0 +1,456 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <thread>
+
+#include "api.h"
+#include "def.h"
+#include "macros.h"
+#include "structures.h"
+
+/*
+Initialize the device
+*/
+int init_device() {
+
+  cu_device *device = (cu_device *)calloc(1, sizeof(cu_device));
+  if (device == NULL)
+    return C_ERROR_MEMALLOC;
+
+  device->max_compute_units = std::thread::hardware_concurrency();
+
+  // initialize scheduler
+  int ret = scheduler_init(*device);
+
+  if (ret != C_SUCCESS)
+    return ret;
+
+  return C_SUCCESS;
+}
+
+/*
+  Create Kernel
+*/
+static int kernelIds = 0;
+cu_kernel *create_kernel(const void *func, dim3 gridDim, dim3 blockDim,
+                         void ***args, size_t sharedMem, cstreamData *stream) {
+  cu_kernel *ker = (cu_kernel *)calloc(1, sizeof(cu_kernel));
+
+  // set the function pointer
+  ker->start_routine = (void *(*)(void *))func;
+  ker->args = *args;
+
+  ker->gridDim = gridDim;
+  ker->blockDim = blockDim;
+
+  ker->shared_mem = sharedMem;
+
+  // dynamic shared memory is allocated from the heap; each worker thread
+  // points at it after the task is submitted
+  ker->shared_mem_loc = calloc(1, sharedMem);
+
+  ker->stream = stream;
+
+  ker->blockId = 0;
+
+  ker->totalBlocks = gridDim.x;
+
+  ker->N = blockDim.x;
+
+  ker->kernelId = kernelIds;
+  kernelIds += 1;
+
+  ker->blockSize = blockDim.x;
+
+  return ker;
+}
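+
+// Illustrative: the launcher passes kernel arguments as an array of
+// pointers, one slot per kernel parameter with the block id appended
+// last -- the same layout the generated host code builds by hand (see
+// compilation/examples/*/host.cpp):
+//   void **args = ...; // args[0..n-1]: &param_i, args[n]: &blockId
+//   cu_kernel *k = create_kernel(func, gridDim, blockDim, &args, shmem, 0);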
+
+/*
+  Create Kernel Queue
+*/
+int create_KernelQueue(kernel_queue **q) {
+  *q = (kernel_queue *)calloc(1, sizeof(kernel_queue));
+
+  if (*q == NULL) {
+    return C_ERROR_MEMALLOC;
+  }
+
+  (*q)->kernel_count = 0;
+  (*q)->running_count = 0;
+  (*q)->waiting_count = 0;
+  (*q)->finish_count = 0;
+  (*q)->current_index = 0;
+
+  return C_SUCCESS;
+}
+
+int dequeKernelLL(struct kernel_queue **qu) {
+
+  struct kernel_queue *q = *qu;
+
+  if (q->head == NULL) {
+    return C_QUEUE_EMPTY;
+  }
+  q->finish_count += 1;
+  //*ker = *(q->head);
+  q->head = (q->head)->next;
+  if (q->head != NULL) {
+    q->head->prev = NULL;
+  }
+
+  return C_SUCCESS;
+}
+
+int enqueueKernel(struct kernel_queue **qu, cu_kernel **ker) {
+  struct kernel_queue *q = *qu;
+  cu_kernel *p = *ker;
+
+  if (q->head == NULL) {
+    q->head = p;
+    q->tail = p;
+  } else {
+    p->prev = q->tail;
+    q->tail->next = p;
+    q->tail = p;
+    p->next = NULL;
+  }
+  q->kernel_count += 1;
+  q->waiting_count += 1;
+
+  // user kernel command
+
+  return C_SUCCESS;
+}
+
+// scheduler
+static cu_pool *scheduler;
+
+__thread int block_index = 0;
+__thread int thread_memory_size = 0;
+
+/*
+  Enqueue Kernel (k) to the scheduler kernelQueue
+*/
+int schedulerEnqueueKernel(cu_kernel **k) {
+  cu_kernel *ker = *k;
+  MUTEX_LOCK(scheduler->work_queue_lock);
+
+  enqueueKernel(&scheduler->kernelQueue, &ker);
+
+  pthread_cond_broadcast(&(scheduler->wake_pool));
+  MUTEX_UNLOCK(scheduler->work_queue_lock);
+  return C_SUCCESS;
+}
+
+/*
+  Kernel Launch with numBlocks and numThreadsPerBlock
+*/
+int cuLaunchKernel(cu_kernel **k) {
+
+  cu_kernel *ker = *k;
+
+  MUTEX_LOCK(scheduler->work_queue_lock);
+  scheduler->num_kernel_queued += 1;
+  MUTEX_UNLOCK(scheduler->work_queue_lock);
+
+  // stream == 0: add to the scheduler kernelQueue directly
+  if (ker->stream == 0) {
+    schedulerEnqueueKernel(&ker);
+  } else {
+    // add to its stream queue; a stream queue can be waiting or running,
+    // with or without tasks
+    MUTEX_LOCK(((cstreamData *)(ker->stream))->stream_lock);
+    cstreamData *e = ((cstreamData *)(ker->stream));
+    // synchronize was called with no job in the queue, so the stream is
+    // stuck on synchronize; release it
+    if (e->ev.status == C_SYNCHRONIZE) {
+      if ((e->kernelQueue->finish_count) == (e->kernelQueue->kernel_count)) {
+        e->ev.status = C_RUN;
+      }
+    }
+
+    if (e->ev.status == C_RUN) {
+      // first kernel for this stream: mark the stream busy and hand the
+      // kernel to the scheduler queue
+      e->ev.status = C_WAIT;
+      MUTEX_UNLOCK(((cstreamData *)(ker->stream))->stream_lock);
+
+      schedulerEnqueueKernel(&ker);
+    } else {
+      // the stream is waiting, so just park the kernel in the stream queue
+      enqueueKernel(&((cstreamData *)(ker->stream))->kernelQueue, &ker);
+      MUTEX_UNLOCK(((cstreamData *)(ker->stream))->stream_lock);
+    }
+  }
+  return C_SUCCESS;
+}
+
+/*
+  Get Work Item: get the kernel from the queue and increment blockId
+  (currently unused; get_work inlines this logic)
+*/
+int getWorkItem(struct kernel_queue **qu, cu_kernel **kern, int blockId) {
+  struct kernel_queue *q = *qu;
+  if (q->waiting_count > 0) {
+    *kern = q->head;
+    cu_kernel *ker = *kern;
+    if (blockId + 1 == q->head->totalBlocks) {
+      // deque the head
+      dequeKernelLL(qu);
+      ker->status = C_COMPLETE;
+      q->waiting_count -= 1;
+    } else {
+      q->head->blockId += 1;
+    }
+  } else {
+    return C_QUEUE_EMPTY;
+  }
+  return C_SUCCESS;
+}
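+
+// Work distribution (illustrative walkthrough): a kernel launched with
+// gridDim.x == 4 stays at the queue head while workers claim blockId
+// 0, 1, 2; the worker that claims the final block (blockId 3) dequeues
+// the kernel and marks it C_COMPLETE, so one launch is executed as
+// totalBlocks independent work items.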
+
+/*
+  Thread Gets Work
+*/
+int get_work(c_thread *th) {
+
+  cu_kernel ker;
+
+  MUTEX_LOCK(scheduler->work_queue_lock);
+
+RETRY:
+
+  int is_exit = 0;
+  int is_command_not_null = 0;
+
+  int blockId;
+  int status = C_QUEUE_EMPTY;
+
+  is_exit = scheduler->threadpool_shutdown_requested;
+
+  MUTEX_UNLOCK(scheduler->work_queue_lock);
+
+  if (!is_exit) {
+
+    MUTEX_LOCK(scheduler->work_queue_lock);
+
+    // if the count of kernels waiting to complete is not zero
+    if (scheduler->kernelQueue->waiting_count > 0) {
+      blockId = scheduler->kernelQueue->head->blockId;
+      // successfully claimed a block
+      status = C_SUCCESS;
+      ker = *(scheduler->kernelQueue->head);
+      // if blockId + 1 equals totalBlocks, this is the last block
+      if (blockId + 1 == scheduler->kernelQueue->head->totalBlocks) {
+        // deque the head
+        dequeKernelLL(&scheduler->kernelQueue);
+
+        ker.status = C_COMPLETE;
+        scheduler->kernelQueue->waiting_count -= 1;
+      } else {
+        // increment the blockId
+        scheduler->kernelQueue->head->blockId =
+            scheduler->kernelQueue->head->blockId + 1;
+      }
+    } else {
+      status = C_QUEUE_EMPTY;
+    }
+    MUTEX_UNLOCK(scheduler->work_queue_lock);
+  }
+
+  if (status != C_QUEUE_EMPTY) {
+
+    block_index = blockId;
+    thread_memory_size = ker.shared_mem;
+    ker.start_routine(ker.args);
+
+    is_command_not_null = 1;
+    if (ker.status == C_COMPLETE) {
+
+      // check if this kernel's stream has more jobs to run (enqueue the
+      // next job)
+      if (ker.stream != NULL) {
+        bool synchronize = false;
+
+        MUTEX_LOCK(((cstreamData *)(ker.stream))->stream_lock);
+
+        if (((cstreamData *)(ker.stream))->ev.status == C_SYNCHRONIZE) {
+          // synchronize stream
+          if (((cstreamData *)(ker.stream))->ev.numKernelsToWait > 0) {
+            ((cstreamData *)(ker.stream))->ev.numKernelsToWait -= 1;
+          }
+          synchronize =
+              ((cstreamData *)(ker.stream))->ev.numKernelsToWait != 0;
+        }
+
+        if (synchronize == false) {
+          if (((cstreamData *)(ker.stream))->kernelQueue->waiting_count > 0) {
+            ((cstreamData *)(ker.stream))->ev.status = C_WAIT;
+
+            MUTEX_UNLOCK(((cstreamData *)(ker.stream))->stream_lock);
+            cu_kernel *kern =
+                ((cstreamData *)(ker.stream))->kernelQueue->head;
+            schedulerEnqueueKernel(&kern);
+            dequeKernelLL(&((cstreamData *)(ker.stream))->kernelQueue);
+
+          } else {
+
+            // switch the stream to run to allow for the next execution
+            ((cstreamData *)(ker.stream))->ev.status = C_RUN;
+
+            MUTEX_UNLOCK(((cstreamData *)(ker.stream))->stream_lock);
+          }
+        } else {
+          MUTEX_UNLOCK(((cstreamData *)(ker.stream))->stream_lock);
+        }
+      }
+      MUTEX_LOCK(scheduler->work_queue_lock);
+      scheduler->num_kernel_finished += 1;
+      MUTEX_UNLOCK(scheduler->work_queue_lock);
+    }
+  }
+
+  MUTEX_LOCK(scheduler->work_queue_lock);
+
+  if ((is_exit == 0 && is_command_not_null == 0)) {
+    // all threads in condition wait
+    scheduler->idle_threads += 1;
+    pthread_cond_wait(&(scheduler->wake_pool), &(scheduler->work_queue_lock));
+    scheduler->idle_threads -= 1;
+    goto RETRY;
+  }
+  MUTEX_UNLOCK(scheduler->work_queue_lock);
+
+  return is_exit;
+}
+
+void *driver_thread(void *p) {
+  struct c_thread *td = (struct c_thread *)p;
+  int is_exit = 0;
+  td->exit = false;
+
+  while (1) {
+    // get work
+    is_exit = get_work(td);
+
+    // exit the routine
+    if (is_exit) {
+      td->exit = true;
+      // pthread_exit
+      pthread_exit(NULL);
+    }
+  }
+}
+
+/*
+Initialize the scheduler
+*/
+int scheduler_init(cu_device device) {
+
+  scheduler = (cu_pool *)calloc(1, sizeof(cu_pool));
+  scheduler->num_worker_threads = device.max_compute_units;
+
+  scheduler->thread_pool = (struct c_thread *)calloc(
+      scheduler->num_worker_threads, sizeof(c_thread));
+  kernel_queue *asq;
+  create_KernelQueue(&asq);
+  scheduler->kernelQueue = asq;
+
+  INIT_LOCK(scheduler->work_queue_lock);
+  pthread_cond_init(&scheduler->wake_pool, NULL);
+  scheduler->idle_threads = 0;
+
+  for (int i = 0; i < scheduler->num_worker_threads; i++) {
+    scheduler->thread_pool[i].index = i;
+    pthread_create(&scheduler->thread_pool[i].thread, NULL, driver_thread,
+                   (void *)&scheduler->thread_pool[i]);
+  }
+
+  return C_SUCCESS;
+}
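+
+// scheduler_init spawns one worker per hardware thread (init_device sets
+// max_compute_units from std::thread::hardware_concurrency()), so each
+// CUDA thread block is executed by exactly one pthread at a time.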
+
+void scheduler_uninit() {
+  unsigned i;
+
+  int r = pthread_mutex_lock(&scheduler->work_queue_lock);
+  assert(r == 0);
+  scheduler->threadpool_shutdown_requested = 1;
+  pthread_cond_broadcast(&scheduler->wake_pool);
+
+  int r1 = pthread_mutex_unlock(&scheduler->work_queue_lock);
+  assert(r1 == 0);
+
+  for (i = 0; i < scheduler->num_worker_threads; i++) {
+    pthread_join(scheduler->thread_pool[i].thread, NULL);
+  }
+  free(scheduler->thread_pool);
+  free(scheduler->kernelQueue);
+
+  pthread_mutex_destroy(&scheduler->work_queue_lock);
+  pthread_cond_destroy(&scheduler->wake_pool);
+
+  scheduler->threadpool_shutdown_requested = 0;
+}
+
+int cuWait(cstreamData *evt) {
+
+AGAIN:
+  int r = pthread_mutex_lock(&evt->stream_lock);
+  assert(r == 0);
+  if (evt->ev.status != C_COMPLETE) {
+    int r1 = pthread_mutex_unlock(&evt->stream_lock);
+    assert(r1 == 0);
+    goto AGAIN;
+  }
+  int r2 = pthread_mutex_unlock(&evt->stream_lock);
+  assert(r2 == 0);
+  return C_SUCCESS;
+}
+
+/*
+  Barrier for Kernel Launch
+
+  During kernel launch, increment the number of work items required to
+  finish; each kernel points to the same event.
+
+  While running a command, decrement the event work-item count; when the
+  count reaches 0, all work items for this kernel launch are finished.
+
+  Essentially a counting barrier.
+*/
+void cuSynchronizeBarrier() {
+AGAIN:
+
+  MUTEX_LOCK(scheduler->work_queue_lock);
+
+  if (scheduler->num_kernel_finished != scheduler->num_kernel_queued ||
+      scheduler->idle_threads != scheduler->num_worker_threads) {
+    MUTEX_UNLOCK(scheduler->work_queue_lock);
+    goto AGAIN;
+  } else {
+    MUTEX_UNLOCK(scheduler->work_queue_lock);
+  }
+}
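+
+// End-to-end flow (illustrative): the pieces above compose as
+//   init_device();                       // build scheduler, spawn workers
+//   cu_kernel *k = create_kernel(func, gridDim, blockDim, &args, shmem, 0);
+//   cuLaunchKernel(&k);                  // enqueue; workers claim blocks
+//   cuSynchronizeBarrier();              // wait until all launches drain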