From fd56811650dbdea1254bf80a46e748eecfa62ff3 Mon Sep 17 00:00:00 2001 From: Ruobing Han Date: Wed, 13 Dec 2023 14:29:17 -0500 Subject: [PATCH] Refactoring the codebase. Remove useless variables; Add comments; Remove useless header files; Remove hard code and support both x86 and ARM CPU --- .github/workflows/build.yml | 6 +- CMakeLists.txt | 19 +- README.md | 2 +- compilation/CMakeLists.txt | 12 +- compilation/HostTranslation.cpp | 5 - compilation/HostTranslation/CMakeLists.txt | 7 +- .../include/{x86 => cpu}/RemoveCudaBuiltin.h | 0 .../include/{x86 => cpu}/RemoveMetadata.h | 0 .../{x86 => cpu}/ReplaceConstantMemory.h | 0 .../include/{x86 => cpu}/ReplaceCudaBuiltin.h | 0 .../include/{x86 => cpu}/ReplaceKernelArgs.h | 0 .../src/{x86 => cpu}/RemoveCudaBuiltin.cpp | 8 - .../src/{x86 => cpu}/RemoveMetadata.cpp | 16 +- .../{x86 => cpu}/ReplaceConstantMemory.cpp | 5 - .../src/{x86 => cpu}/ReplaceCudaBuiltin.cpp | 21 -- .../src/{x86 => cpu}/ReplaceKernelArgs.cpp | 10 - compilation/KernelTranslation.cpp | 29 ++- compilation/KernelTranslation/CMakeLists.txt | 7 +- .../include/cpu/generate_cpu_format.h | 10 + .../include/{x86 => cpu}/handle_sync.h | 0 .../include/{x86 => cpu}/init.h | 0 .../include/{x86 => cpu}/insert_sync.h | 0 .../include/{x86 => cpu}/insert_warp_loop.h | 0 .../include/{x86 => cpu}/memory_hierarchy.h | 0 .../include/{x86 => cpu}/performance.h | 0 .../include/{x86 => cpu}/tool.h | 0 .../include/{x86 => cpu}/warp_func.h | 0 .../include/x86/generate_x86_format.h | 10 - .../src/cpu/generate_cpu_format.cpp | 125 ++++++++++++ .../src/{x86 => cpu}/handle_sync.cpp | 6 - .../src/{x86 => cpu}/init.cpp | 30 +-- .../src/{x86 => cpu}/insert_sync.cpp | 32 +-- .../src/{x86 => cpu}/insert_warp_loop.cpp | 33 +--- .../src/{x86 => cpu}/memory_hierarchy.cpp | 31 +-- .../src/{x86 => cpu}/performance.cpp | 37 +--- .../src/{x86 => cpu}/tool.cpp | 29 +-- .../src/{x86 => cpu}/warp_func.cpp | 11 +- .../src/x86/generate_x86_format.cpp | 186 ------------------ runtime/CMakeLists.txt | 12 +- runtime/include/{x86 => cpu}/cudaKernelImpl.h | 2 +- .../include/{x86 => cpu}/cudaRuntimeImpl.h | 0 runtime/src/{x86 => cpu}/cudaKernelImpl.cpp | 0 runtime/src/{x86 => cpu}/cudaRuntimeImpl.cpp | 21 +- runtime/threadPool/CMakeLists.txt | 5 +- runtime/threadPool/include/{x86 => cpu}/api.h | 0 runtime/threadPool/include/{x86 => cpu}/def.h | 0 .../threadPool/include/{x86 => cpu}/macros.h | 0 .../include/{x86 => cpu}/structures.h | 0 runtime/threadPool/src/{x86 => cpu}/api.cpp | 51 ++--- test/runHeteroMark.sh | 2 +- 50 files changed, 249 insertions(+), 531 deletions(-) rename compilation/HostTranslation/include/{x86 => cpu}/RemoveCudaBuiltin.h (100%) rename compilation/HostTranslation/include/{x86 => cpu}/RemoveMetadata.h (100%) rename compilation/HostTranslation/include/{x86 => cpu}/ReplaceConstantMemory.h (100%) rename compilation/HostTranslation/include/{x86 => cpu}/ReplaceCudaBuiltin.h (100%) rename compilation/HostTranslation/include/{x86 => cpu}/ReplaceKernelArgs.h (100%) rename compilation/HostTranslation/src/{x86 => cpu}/RemoveCudaBuiltin.cpp (93%) rename compilation/HostTranslation/src/{x86 => cpu}/RemoveMetadata.cpp (66%) rename compilation/HostTranslation/src/{x86 => cpu}/ReplaceConstantMemory.cpp (96%) rename compilation/HostTranslation/src/{x86 => cpu}/ReplaceCudaBuiltin.cpp (94%) rename compilation/HostTranslation/src/{x86 => cpu}/ReplaceKernelArgs.cpp (88%) create mode 100644 compilation/KernelTranslation/include/cpu/generate_cpu_format.h rename compilation/KernelTranslation/include/{x86 => cpu}/handle_sync.h (100%) rename compilation/KernelTranslation/include/{x86 => cpu}/init.h (100%) rename compilation/KernelTranslation/include/{x86 => cpu}/insert_sync.h (100%) rename compilation/KernelTranslation/include/{x86 => cpu}/insert_warp_loop.h (100%) rename compilation/KernelTranslation/include/{x86 => cpu}/memory_hierarchy.h (100%) rename compilation/KernelTranslation/include/{x86 => cpu}/performance.h (100%) rename compilation/KernelTranslation/include/{x86 => cpu}/tool.h (100%) rename compilation/KernelTranslation/include/{x86 => cpu}/warp_func.h (100%) delete mode 100644 compilation/KernelTranslation/include/x86/generate_x86_format.h create mode 100644 compilation/KernelTranslation/src/cpu/generate_cpu_format.cpp rename compilation/KernelTranslation/src/{x86 => cpu}/handle_sync.cpp (90%) rename compilation/KernelTranslation/src/{x86 => cpu}/init.cpp (93%) rename compilation/KernelTranslation/src/{x86 => cpu}/insert_sync.cpp (95%) rename compilation/KernelTranslation/src/{x86 => cpu}/insert_warp_loop.cpp (96%) rename compilation/KernelTranslation/src/{x86 => cpu}/memory_hierarchy.cpp (90%) rename compilation/KernelTranslation/src/{x86 => cpu}/performance.cpp (60%) rename compilation/KernelTranslation/src/{x86 => cpu}/tool.cpp (96%) rename compilation/KernelTranslation/src/{x86 => cpu}/warp_func.cpp (95%) delete mode 100644 compilation/KernelTranslation/src/x86/generate_x86_format.cpp rename runtime/include/{x86 => cpu}/cudaKernelImpl.h (96%) rename runtime/include/{x86 => cpu}/cudaRuntimeImpl.h (100%) rename runtime/src/{x86 => cpu}/cudaKernelImpl.cpp (100%) rename runtime/src/{x86 => cpu}/cudaRuntimeImpl.cpp (97%) rename runtime/threadPool/include/{x86 => cpu}/api.h (100%) rename runtime/threadPool/include/{x86 => cpu}/def.h (100%) rename runtime/threadPool/include/{x86 => cpu}/macros.h (100%) rename runtime/threadPool/include/{x86 => cpu}/structures.h (100%) rename runtime/threadPool/src/{x86 => cpu}/api.cpp (82%) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 58fe7c7..8a77a6c 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -51,7 +51,7 @@ jobs: hostTranslator reverse-host-x86_64-unknown-linux-gnu.bc host.bc llc --relocation-model=pic --filetype=obj kernel.bc llc --relocation-model=pic --filetype=obj host.bc - g++ -o reverse -fPIC -no-pie -L${{ github.workspace }}/build/runtime -L${{ github.workspace }}/build/runtime/threadPool host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread + g++ -o reverse -fPIC -no-pie -L${{ github.workspace }}/build/runtime -L${{ github.workspace }}/build/runtime/threadPool host.o kernel.o -lc -lCPUruntime -lthreadPool -lpthread ./reverse - name: Execute the dynamic shared memory demo run: | @@ -63,7 +63,7 @@ jobs: hostTranslator reverse-host-x86_64-unknown-linux-gnu.bc host.bc llc --relocation-model=pic --filetype=obj kernel.bc llc --relocation-model=pic --filetype=obj host.bc - g++ -o reverse -fPIC -no-pie -L${{ github.workspace }}/build/runtime -L${{ github.workspace }}/build/runtime/threadPool host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread + g++ -o reverse -fPIC -no-pie -L${{ github.workspace }}/build/runtime -L${{ github.workspace }}/build/runtime/threadPool host.o kernel.o -lc -lCPUruntime -lthreadPool -lpthread ./reverse - name: Execute Hetero-mark benchmark run: | @@ -79,5 +79,5 @@ jobs: hostTranslator kernel_gpu_cuda_wrapper-host-x86_64-unknown-linux-gnu.bc host.bc llc --relocation-model=pic --filetype=obj kernel.bc llc --relocation-model=pic --filetype=obj host.bc - g++ -o lavaMD -fPIC -no-pie -I${{ github.workspace }}/runtime/threadPool/include -I${{ github.workspace }}/cuda-10.1/include -L${{ github.workspace }}/build/runtime -L${{ github.workspace }}/build/runtime/threadPool main.c host.o kernel.o util/timer/timer.c util/num/num.c -lpthread -lc -lx86Runtime -lthreadPool -pthread + g++ -o lavaMD -fPIC -no-pie -I${{ github.workspace }}/runtime/threadPool/include -I${{ github.workspace }}/cuda-10.1/include -L${{ github.workspace }}/build/runtime -L${{ github.workspace }}/build/runtime/threadPool main.c host.o kernel.o util/timer/timer.c util/num/num.c -lpthread -lc -lCPUruntime -lthreadPool -pthread ./lavaMD -boxes1d 10 diff --git a/CMakeLists.txt b/CMakeLists.txt index d889303..cf10d4a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,13 +1,9 @@ cmake_minimum_required(VERSION 3.1 FATAL_ERROR) -project(CudaOnX86) -set(CMAKE_PROJECT_DESCRIPTION "Executing CUDA on X86 architecture.") +project(CuPBoP) +set(CMAKE_PROJECT_DESCRIPTION "Executing CUDA on non-NVIDIA architecture.") set(CMAKE_CXX_STANDARD "14") -set(MAJOR_VERSION 0) -set(MINOR_VERSION 1) -set(VERSION_STRING ${MAJOR_VERSION}.${MINOR_VERSION}) -set(COX_VERSION ${VERSION_STRING}) -# get LLVM PATH get PATH for head file + if(DEFINED LLVM_CONFIG_PATH) if(IS_ABSOLUTE "${LLVM_CONFIG_PATH}") if(EXISTS "${LLVM_CONFIG_PATH}") @@ -32,7 +28,7 @@ if(DEFINED LLVM_CONFIG_PATH) else() message(FATAL_ERROR "llvm-config is required") endif() -# get CUDA PATH + if(DEFINED CUDA_PATH) message(STATUS "Using CUDA: ${CUDA_PATH}") else() @@ -45,7 +41,7 @@ if(DEBUG) endif() set(CMAKE_CXX_FLAGS - "-I${CUDA_PATH}/include ${LLVM_CXX_FLAG} ${CMAKE_CXX_FLAGS}") + "-I${CUDA_PATH}/include ${LLVM_CXX_FLAG} ${CMAKE_CXX_FLAGS} -Wunused") set(GCC_COVERAGE_LINK_FLAGS "-L${LLVM_LIB_PATH} ${LLVM_LINK_FLAG} -lz -lrt -ldl -ltinfo -lpthread -lm") @@ -54,5 +50,8 @@ add_subdirectory(compilation) add_subdirectory(runtime) enable_testing() -option(HETERO_MARK_DATA "The path to download hetero-mark dataset." /tmp/data) +set(HETERO_MARK_DATA + "/tmp/data" + CACHE PATH "The path to download hetero-mark dataset.") + add_subdirectory(test) diff --git a/README.md b/README.md index 435d225..34986f2 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ g++ -o vecadd -fPIC -no-pie \ -L$CuPBoP_PATH/build/runtime \ -L$CuPBoP_PATH/build/runtime/threadPool \ host.o kernel.o \ - -I../.. -lc -lx86Runtime -lthreadPool -lpthread + -I../.. -lc -lCPUruntime -lthreadPool -lpthread # Execute ./vecadd ``` diff --git a/compilation/CMakeLists.txt b/compilation/CMakeLists.txt index e0f8bbc..7c54c29 100644 --- a/compilation/CMakeLists.txt +++ b/compilation/CMakeLists.txt @@ -1,20 +1,16 @@ cmake_minimum_required(VERSION 3.1 FATAL_ERROR) -project( - NVVM2X86 - DESCRIPTION "Translate NVVM IR to LLVM IR for X86 backend" - LANGUAGES CXX) set(CMAKE_VERBOSE_MAKEFILE ON) -# compile kernel translator -include_directories(./KernelTranslation/include/x86) +# build kernel translator +include_directories(./KernelTranslation/include/cpu) add_subdirectory(KernelTranslation) add_executable(kernelTranslator KernelTranslation.cpp) target_link_libraries(kernelTranslator spmd2mpmd ${GCC_COVERAGE_LINK_FLAGS}) -# compile host translator -include_directories(./HostTranslation/include/x86) +# build host translator +include_directories(./HostTranslation/include/cpu) add_subdirectory(HostTranslation) add_executable(hostTranslator HostTranslation.cpp) diff --git a/compilation/HostTranslation.cpp b/compilation/HostTranslation.cpp index cb32b34..02a7d9c 100644 --- a/compilation/HostTranslation.cpp +++ b/compilation/HostTranslation.cpp @@ -4,12 +4,7 @@ #include "ReplaceCudaBuiltin.h" #include "ReplaceKernelArgs.h" #include "tool.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/Verifier.h" #include -#include -#include -#include using namespace llvm; diff --git a/compilation/HostTranslation/CMakeLists.txt b/compilation/HostTranslation/CMakeLists.txt index cf893ac..4fd9cb8 100644 --- a/compilation/HostTranslation/CMakeLists.txt +++ b/compilation/HostTranslation/CMakeLists.txt @@ -11,12 +11,11 @@ set(CMAKE_VERBOSE_MAKEFILE ON) set(LIB_NAME cudaRuntime2cpuRuntime) set(CMAKE_CXX_STANDARD 14) -set(CMAKE_BUILD_TYPE Debug) -include_directories(./include/x86) +include_directories(./include/cpu) include_directories(../../common) -file(GLOB proj_HEADERS "include/x86/*.h") -file(GLOB proj_SOURCES "src/x86/*.cpp") +file(GLOB proj_HEADERS "include/cpu/*.h") +file(GLOB proj_SOURCES "src/cpu/*.cpp") # Add core library. add_library(${LIB_NAME} SHARED ${proj_HEADERS} ${proj_SOURCES}) diff --git a/compilation/HostTranslation/include/x86/RemoveCudaBuiltin.h b/compilation/HostTranslation/include/cpu/RemoveCudaBuiltin.h similarity index 100% rename from compilation/HostTranslation/include/x86/RemoveCudaBuiltin.h rename to compilation/HostTranslation/include/cpu/RemoveCudaBuiltin.h diff --git a/compilation/HostTranslation/include/x86/RemoveMetadata.h b/compilation/HostTranslation/include/cpu/RemoveMetadata.h similarity index 100% rename from compilation/HostTranslation/include/x86/RemoveMetadata.h rename to compilation/HostTranslation/include/cpu/RemoveMetadata.h diff --git a/compilation/HostTranslation/include/x86/ReplaceConstantMemory.h b/compilation/HostTranslation/include/cpu/ReplaceConstantMemory.h similarity index 100% rename from compilation/HostTranslation/include/x86/ReplaceConstantMemory.h rename to compilation/HostTranslation/include/cpu/ReplaceConstantMemory.h diff --git a/compilation/HostTranslation/include/x86/ReplaceCudaBuiltin.h b/compilation/HostTranslation/include/cpu/ReplaceCudaBuiltin.h similarity index 100% rename from compilation/HostTranslation/include/x86/ReplaceCudaBuiltin.h rename to compilation/HostTranslation/include/cpu/ReplaceCudaBuiltin.h diff --git a/compilation/HostTranslation/include/x86/ReplaceKernelArgs.h b/compilation/HostTranslation/include/cpu/ReplaceKernelArgs.h similarity index 100% rename from compilation/HostTranslation/include/x86/ReplaceKernelArgs.h rename to compilation/HostTranslation/include/cpu/ReplaceKernelArgs.h diff --git a/compilation/HostTranslation/src/x86/RemoveCudaBuiltin.cpp b/compilation/HostTranslation/src/cpu/RemoveCudaBuiltin.cpp similarity index 93% rename from compilation/HostTranslation/src/x86/RemoveCudaBuiltin.cpp rename to compilation/HostTranslation/src/cpu/RemoveCudaBuiltin.cpp index f74b816..acb8024 100644 --- a/compilation/HostTranslation/src/x86/RemoveCudaBuiltin.cpp +++ b/compilation/HostTranslation/src/cpu/RemoveCudaBuiltin.cpp @@ -3,16 +3,8 @@ */ #include "RemoveCudaBuiltin.h" #include "debug.hpp" -#include "llvm/IR/Function.h" -#include "llvm/IR/GlobalValue.h" -#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" #include "llvm/Support/ToolOutputFile.h" -#include "llvm/Transforms/Utils/CtorUtils.h" -#include -#include #include using namespace llvm; diff --git a/compilation/HostTranslation/src/x86/RemoveMetadata.cpp b/compilation/HostTranslation/src/cpu/RemoveMetadata.cpp similarity index 66% rename from compilation/HostTranslation/src/x86/RemoveMetadata.cpp rename to compilation/HostTranslation/src/cpu/RemoveMetadata.cpp index 515befc..4d7ddc3 100644 --- a/compilation/HostTranslation/src/x86/RemoveMetadata.cpp +++ b/compilation/HostTranslation/src/cpu/RemoveMetadata.cpp @@ -1,16 +1,14 @@ #include "RemoveMetadata.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/GlobalValue.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" -#include "llvm/Support/ToolOutputFile.h" -#include +#include "llvm/Support/Host.h" using namespace llvm; void RemoveMetadata(llvm::Module *M) { + // change the target triple to the host triple + M->setTargetTriple(llvm::sys::getProcessTriple()); + // use the default DataLayout + M->setDataLayout(""); + SmallVector, 4> MDs; for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { Function *F = &(*i); @@ -22,5 +20,7 @@ void RemoveMetadata(llvm::Module *M) { F->removeFnAttr("min-legal-vector-width"); F->removeFnAttr("no-trapping-math"); F->removeFnAttr(llvm::Attribute::OptimizeNone); + F->removeFnAttr("target-cpu"); + F->removeFnAttr("target-features"); } } diff --git a/compilation/HostTranslation/src/x86/ReplaceConstantMemory.cpp b/compilation/HostTranslation/src/cpu/ReplaceConstantMemory.cpp similarity index 96% rename from compilation/HostTranslation/src/x86/ReplaceConstantMemory.cpp rename to compilation/HostTranslation/src/cpu/ReplaceConstantMemory.cpp index 48b0a0f..adb191b 100644 --- a/compilation/HostTranslation/src/x86/ReplaceConstantMemory.cpp +++ b/compilation/HostTranslation/src/cpu/ReplaceConstantMemory.cpp @@ -1,12 +1,7 @@ #include "ReplaceConstantMemory.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/GlobalValue.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" #include #include -#include #include #include diff --git a/compilation/HostTranslation/src/x86/ReplaceCudaBuiltin.cpp b/compilation/HostTranslation/src/cpu/ReplaceCudaBuiltin.cpp similarity index 94% rename from compilation/HostTranslation/src/x86/ReplaceCudaBuiltin.cpp rename to compilation/HostTranslation/src/cpu/ReplaceCudaBuiltin.cpp index 5c1ea5d..23f2df3 100644 --- a/compilation/HostTranslation/src/x86/ReplaceCudaBuiltin.cpp +++ b/compilation/HostTranslation/src/cpu/ReplaceCudaBuiltin.cpp @@ -1,13 +1,6 @@ #include "ReplaceCudaBuiltin.h" #include "debug.hpp" -#include "llvm/IR/Function.h" -#include "llvm/IR/GlobalValue.h" #include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" -#include "llvm/Support/ToolOutputFile.h" -#include #include #include #include @@ -63,18 +56,6 @@ void ReplaceKernelLaunch(llvm::Module *M) { std::map kernels; std::set need_remove; - LLVMContext *C = &M->getContext(); - - llvm::Type *Int32T = Type::getInt32Ty(*C); - llvm::Type *Int8T = Type::getInt8Ty(*C); - - llvm::FunctionType *LauncherFuncT = - FunctionType::get(Type::getVoidTy(*C), NULL); - - llvm::FunctionType *LaunchFun2 = - FunctionType::get(PointerType::get(PointerType::get(Int32T, 0), 0), NULL); - - bool done = false; std::set cuda_register_kernel_names; @@ -160,8 +141,6 @@ void ReplaceKernelLaunch(llvm::Module *M) { std::vector arg_sizes; functionOperand = dyn_cast(callOperand->stripPointerCasts()); - - FunctionType *ft = calledFunction->getFunctionType(); DEBUG_INFO("Parent (Caller) Function Name: %s, " "cudaLaunchKernel Function: %s, args : %d\n", func_name.c_str(), diff --git a/compilation/HostTranslation/src/x86/ReplaceKernelArgs.cpp b/compilation/HostTranslation/src/cpu/ReplaceKernelArgs.cpp similarity index 88% rename from compilation/HostTranslation/src/x86/ReplaceKernelArgs.cpp rename to compilation/HostTranslation/src/cpu/ReplaceKernelArgs.cpp index 501a783..6d6212c 100644 --- a/compilation/HostTranslation/src/x86/ReplaceKernelArgs.cpp +++ b/compilation/HostTranslation/src/cpu/ReplaceKernelArgs.cpp @@ -1,12 +1,5 @@ #include "ReplaceKernelArgs.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/GlobalValue.h" #include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" -#include "llvm/Support/ToolOutputFile.h" -#include #include #include @@ -23,12 +16,9 @@ using namespace llvm; // to use use-analysis to find the arguments in the future void ReplaceKernelArg(llvm::Module *M) { LLVMContext &context = M->getContext(); - auto VoidTy = llvm::Type::getVoidTy(context); - auto I8 = llvm::Type::getInt8PtrTy(context); std::map kernels; std::set need_replace; - LLVMContext *C = &M->getContext(); for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { Function *F = &(*i); diff --git a/compilation/KernelTranslation.cpp b/compilation/KernelTranslation.cpp index 2dd318b..bb2d0b1 100644 --- a/compilation/KernelTranslation.cpp +++ b/compilation/KernelTranslation.cpp @@ -1,4 +1,4 @@ -#include "generate_x86_format.h" +#include "generate_cpu_format.h" #include "handle_sync.h" #include "init.h" #include "insert_sync.h" @@ -6,17 +6,14 @@ #include "performance.h" #include "tool.h" #include "warp_func.h" -#include "llvm/IR/Module.h" #include -#include -#include -#include -#include -#include -#include using namespace llvm; +// to support constant memory variables, we need to convert information +// from kernelTranslator to HostTranslator, since HostTranslator knows nothing +// about the kernel functions, we need to write the information to a file +// by KernelTranslator and read it in HostTranslator std::string PATH = "kernel_meta.log"; int main(int argc, char **argv) { @@ -26,8 +23,9 @@ int main(int argc, char **argv) { std::ofstream fout; fout.open(PATH); - // inline, and create auxiliary global variables + // inline __device__ functions, and create auxiliary global variables init_block(program, fout); + // insert sync before each vote, and replace the // original vote function to warp vote handle_warp_vote(program); @@ -40,17 +38,18 @@ int main(int argc, char **argv) { // split block by sync split_block_by_sync(program); - // add loop for intra&intera thread + + // add loop for intra&intera thread, it refers 'hierarchical collapsing' in + // COX paper. insert_warp_loop(program); - // (TODO): replace this patch replace_built_in_function(program); - // TODO: replace with a more general function - // Not only for x86 backend - generate_x86_format(program); + // the input kernel programs have NVIDIA metadata, they need to be replaced to + // CPU metadata + generate_cpu_format(program); - // performance optimization + // execute O3 pipeline on the transformed program performance_optimization(program); VerifyModule(program); diff --git a/compilation/KernelTranslation/CMakeLists.txt b/compilation/KernelTranslation/CMakeLists.txt index a5d71b9..e2e7633 100644 --- a/compilation/KernelTranslation/CMakeLists.txt +++ b/compilation/KernelTranslation/CMakeLists.txt @@ -11,12 +11,11 @@ set(CMAKE_VERBOSE_MAKEFILE ON) set(LIB_NAME spmd2mpmd) set(CMAKE_CXX_STANDARD 14) -set(CMAKE_BUILD_TYPE Debug) -include_directories(./include/x86) +include_directories(./include/cpu) include_directories(../../common) -file(GLOB proj_HEADERS "include/x86/*.h") -file(GLOB proj_SOURCES "src/x86/*.cpp") +file(GLOB proj_HEADERS "include/cpu/*.h") +file(GLOB proj_SOURCES "src/cpu/*.cpp") # Add core library. add_library(${LIB_NAME} STATIC ${proj_HEADERS} ${proj_SOURCES}) diff --git a/compilation/KernelTranslation/include/cpu/generate_cpu_format.h b/compilation/KernelTranslation/include/cpu/generate_cpu_format.h new file mode 100644 index 0000000..86e6abc --- /dev/null +++ b/compilation/KernelTranslation/include/cpu/generate_cpu_format.h @@ -0,0 +1,10 @@ +#ifndef __NVVM2CPU_GENERATE_CPU_FORMAT__ +#define __NVVM2CPU_GENERATE_CPU_FORMAT__ + +#include "llvm/IR/Module.h" + +void generate_cpu_format(llvm::Module *M); + +void set_meta_data(llvm::Module *M); + +#endif diff --git a/compilation/KernelTranslation/include/x86/handle_sync.h b/compilation/KernelTranslation/include/cpu/handle_sync.h similarity index 100% rename from compilation/KernelTranslation/include/x86/handle_sync.h rename to compilation/KernelTranslation/include/cpu/handle_sync.h diff --git a/compilation/KernelTranslation/include/x86/init.h b/compilation/KernelTranslation/include/cpu/init.h similarity index 100% rename from compilation/KernelTranslation/include/x86/init.h rename to compilation/KernelTranslation/include/cpu/init.h diff --git a/compilation/KernelTranslation/include/x86/insert_sync.h b/compilation/KernelTranslation/include/cpu/insert_sync.h similarity index 100% rename from compilation/KernelTranslation/include/x86/insert_sync.h rename to compilation/KernelTranslation/include/cpu/insert_sync.h diff --git a/compilation/KernelTranslation/include/x86/insert_warp_loop.h b/compilation/KernelTranslation/include/cpu/insert_warp_loop.h similarity index 100% rename from compilation/KernelTranslation/include/x86/insert_warp_loop.h rename to compilation/KernelTranslation/include/cpu/insert_warp_loop.h diff --git a/compilation/KernelTranslation/include/x86/memory_hierarchy.h b/compilation/KernelTranslation/include/cpu/memory_hierarchy.h similarity index 100% rename from compilation/KernelTranslation/include/x86/memory_hierarchy.h rename to compilation/KernelTranslation/include/cpu/memory_hierarchy.h diff --git a/compilation/KernelTranslation/include/x86/performance.h b/compilation/KernelTranslation/include/cpu/performance.h similarity index 100% rename from compilation/KernelTranslation/include/x86/performance.h rename to compilation/KernelTranslation/include/cpu/performance.h diff --git a/compilation/KernelTranslation/include/x86/tool.h b/compilation/KernelTranslation/include/cpu/tool.h similarity index 100% rename from compilation/KernelTranslation/include/x86/tool.h rename to compilation/KernelTranslation/include/cpu/tool.h diff --git a/compilation/KernelTranslation/include/x86/warp_func.h b/compilation/KernelTranslation/include/cpu/warp_func.h similarity index 100% rename from compilation/KernelTranslation/include/x86/warp_func.h rename to compilation/KernelTranslation/include/cpu/warp_func.h diff --git a/compilation/KernelTranslation/include/x86/generate_x86_format.h b/compilation/KernelTranslation/include/x86/generate_x86_format.h deleted file mode 100644 index 747b1b1..0000000 --- a/compilation/KernelTranslation/include/x86/generate_x86_format.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef __NVVM2x86_GENERATE_X86_FORMAT__ -#define __NVVM2x86_GENERATE_X86_FORMAT__ - -#include "llvm/IR/Module.h" - -void generate_x86_format(llvm::Module *M); - -void set_meta_data(llvm::Module *M); - -#endif diff --git a/compilation/KernelTranslation/src/cpu/generate_cpu_format.cpp b/compilation/KernelTranslation/src/cpu/generate_cpu_format.cpp new file mode 100644 index 0000000..9ee10ff --- /dev/null +++ b/compilation/KernelTranslation/src/cpu/generate_cpu_format.cpp @@ -0,0 +1,125 @@ +#include "generate_cpu_format.h" +#include "debug.hpp" +#include "tool.h" +#include "llvm/Support/Host.h" + +using namespace llvm; + +// set TargetTriple and DataLayout same as the host CPU +void set_meta_data(llvm::Module *M) { + M->setTargetTriple(llvm::sys::getProcessTriple()); + // use the default DataLayout + M->setDataLayout(""); +} + +// as pthread only accept a single void* for input +// we have to decode this input inside the kernel +void decode_input(llvm::Module *M) { + + std::set need_remove; + + llvm::Type *Int32T = Type::getInt32Ty(M->getContext()); + llvm::Type *Int8T = Type::getInt8Ty(M->getContext()); + + llvm::FunctionType *LauncherFuncT = FunctionType::get( + Type::getVoidTy(M->getContext()), {PointerType::get(Int8T, 0)}, false); + + // generate Wrapper Function type + // now we only support a single int32* + for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { + Function *F = &(*i); + if (!isKernelFunction(M, F)) + continue; + auto func_name = F->getName().str(); + // filter out _Z24 and other mangled prefix + for (int pos = 2; pos < func_name.length(); pos++) { + if (func_name[pos] >= '0' && func_name[pos] <= '9') + continue; + func_name = func_name.substr(pos); + break; + } + llvm::IRBuilder<> Builder(M->getContext()); + + FunctionCallee fc = + M->getOrInsertFunction(func_name + "_wrapper", LauncherFuncT); + Function *WorkGroup = dyn_cast(fc.getCallee()); + + BasicBlock *Block = BasicBlock::Create(M->getContext(), "", WorkGroup); + Builder.SetInsertPoint(Block); + + // WorkGroup has only a single input + Function::arg_iterator ai = WorkGroup->arg_begin(); + + SmallVector Arguments; + Value *input_arg = &*ai; + // convert to int** + input_arg = Builder.CreateBitOrPointerCast( + input_arg, PointerType::get(PointerType::get(Int32T, 0), 0)); + + size_t idx = 0; + // replace original arguments with the unpacked values + // for example, for a function f(int* a, char* b), + // we will generate a function f_wrapper(int** input) + // and replace the original arguments with the unpacked values + // e.g., a = (int*)input[0], b = (char*)input[1] + for (Function::const_arg_iterator ii = F->arg_begin(), ee = F->arg_end(); + ii != ee; ++ii) { + Type *ArgType = ii->getType(); + // calculate addr + Value *GEP = createGEP(Builder, input_arg, ConstantInt::get(Int32T, idx)); + // load corresponding int* + GEP = createLoad(Builder, GEP); + // bitcast + GEP = Builder.CreateBitOrPointerCast(GEP, PointerType::get(ArgType, 0)); + Value *Arg = createLoad(Builder, GEP); + Arguments.push_back(Arg); + ++idx; + } + Builder.CreateCall(F, ArrayRef(Arguments)); + Builder.CreateRetVoid(); + } + for (auto f : need_remove) { + f->dropAllReferences(); + f->eraseFromParent(); + } +} + +// after flat/hierarchical collapsing, the barrier instructions are useless +void remove_barrier(llvm::Module *M) { + std::vector need_remove; + for (auto F = M->begin(); F != M->end(); ++F) + for (auto BB = F->begin(); BB != F->end(); ++BB) { + for (auto Inst = BB->begin(); Inst != BB->end(); Inst++) { + if (auto Call = dyn_cast(Inst)) { + if (Call->isInlineAsm()) + continue; + auto func_name = Call->getCalledFunction()->getName().str(); + if (func_name == "llvm.nvvm.bar.warp.sync" || + func_name == "llvm.nvvm.barrier0" || + func_name == "llvm.nvvm.barrier.sync") { + need_remove.push_back(Call); + } + } + } + } + for (auto inst : need_remove) { + inst->eraseFromParent(); + } +} + +void remove_useless_var(llvm::Module *M) { + M->getGlobalVariable("intra_warp_index")->eraseFromParent(); + M->getGlobalVariable("inter_warp_index")->eraseFromParent(); +} + +void generate_cpu_format(llvm::Module *M) { + DEBUG_INFO("generate cpu format\n"); + // change metadata + set_meta_data(M); + // decode argument + decode_input(M); + // remove barrier + remove_barrier(M); + // remove useless func/variable + remove_useless_var(M); +} diff --git a/compilation/KernelTranslation/src/x86/handle_sync.cpp b/compilation/KernelTranslation/src/cpu/handle_sync.cpp similarity index 90% rename from compilation/KernelTranslation/src/x86/handle_sync.cpp rename to compilation/KernelTranslation/src/cpu/handle_sync.cpp index f9f4b5c..4806e58 100644 --- a/compilation/KernelTranslation/src/x86/handle_sync.cpp +++ b/compilation/KernelTranslation/src/cpu/handle_sync.cpp @@ -1,13 +1,7 @@ #include "handle_sync.h" #include "debug.hpp" #include "tool.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/GlobalValue.h" #include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InlineAsm.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" #include #include diff --git a/compilation/KernelTranslation/src/x86/init.cpp b/compilation/KernelTranslation/src/cpu/init.cpp similarity index 93% rename from compilation/KernelTranslation/src/x86/init.cpp rename to compilation/KernelTranslation/src/cpu/init.cpp index 049d4b2..117de31 100644 --- a/compilation/KernelTranslation/src/x86/init.cpp +++ b/compilation/KernelTranslation/src/cpu/init.cpp @@ -2,26 +2,11 @@ #include "debug.hpp" #include "memory_hierarchy.h" #include "tool.h" -#include -#include -#include - -#include "llvm/IR/Function.h" -#include "llvm/IR/GlobalValue.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InlineAsm.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/LegacyPassManager.h" -#include "llvm/IR/Module.h" #include "llvm/InitializePasses.h" -#include "llvm/PassInfo.h" -#include "llvm/PassRegistry.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/TargetSelect.h" -#include "llvm/Transforms/IPO/PassManagerBuilder.h" #include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/ValueMapper.h" +#include using namespace llvm; @@ -31,11 +16,9 @@ bool inline_warp_level_func(llvm::Module *M) { for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { Function *F = &(*i); - auto func_name = F->getName().str(); if (!isKernelFunction(M, F)) continue; - Function::iterator I = F->begin(); - for (Function::iterator E = F->end(); I != E; ++I) { + for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) { for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) { if (CallInst *c = dyn_cast(BI++)) { if (c->getCalledFunction()) { @@ -60,8 +43,7 @@ bool inline_warp_level_func(llvm::Module *M) { } bool find_sreg_inst(llvm::Function *F) { - Function::iterator I = F->begin(); - for (Function::iterator E = F->end(); I != E; ++I) { + for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) { for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) { if (CallInst *c = dyn_cast(BI++)) { if (c->getCalledFunction()) { @@ -229,14 +211,12 @@ void llvm_preprocess(llvm::Module *M) { Passes.run(*M); } +// transform constant expression into sequence of instructions bool lower_constant_expr(llvm::Module *M) { bool modified = false; - LLVMContext &context = M->getContext(); - auto I32 = llvm::Type::getInt32Ty(context); std::vector need_remove; for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { Function *F = &(*i); - auto func_name = F->getName().str(); if (!isKernelFunction(M, F)) continue; @@ -301,8 +281,8 @@ bool lower_constant_expr(llvm::Module *M) { return modified; } +// replace _ZL3expd, just delete its body void replace_cuda_math_built_in(llvm::Module *M) { - // replace _ZL3expd, just delete its body for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { Function *F = &(*i); auto func_name = F->getName().str(); diff --git a/compilation/KernelTranslation/src/x86/insert_sync.cpp b/compilation/KernelTranslation/src/cpu/insert_sync.cpp similarity index 95% rename from compilation/KernelTranslation/src/x86/insert_sync.cpp rename to compilation/KernelTranslation/src/cpu/insert_sync.cpp index aa2be0c..a2c5311 100644 --- a/compilation/KernelTranslation/src/x86/insert_sync.cpp +++ b/compilation/KernelTranslation/src/cpu/insert_sync.cpp @@ -4,29 +4,9 @@ #include "handle_sync.h" #include "tool.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/PostDominators.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/GlobalValue.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InlineAsm.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/LegacyPassManager.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/ValueSymbolTable.h" -#include "llvm/InitializePasses.h" -#include "llvm/PassInfo.h" -#include "llvm/PassRegistry.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Transforms/IPO/PassManagerBuilder.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/ValueMapper.h" -#include #include using namespace llvm; @@ -44,7 +24,7 @@ public: std::vector insert_intra_warp_sync_before; std::vector insert_inter_warp_sync_before; - // insert sync in the entry + // insert sync after the entry and before the first non-AllocaInst BasicBlock *entry = &(*F.begin()); for (auto i = entry->begin(); i != entry->end(); i++) { if (!isa(i)) { @@ -54,10 +34,8 @@ public: } for (Function::iterator I = F.begin(); I != F.end(); ++I) { - BasicBlock::iterator BI = I->begin(); - // insert barrier before return - for (; BI != I->end(); BI++) { + for (BasicBlock::iterator BI = I->begin(); BI != I->end(); BI++) { llvm::ReturnInst *Ret = llvm::dyn_cast(&(*BI)); if (Ret) { insert_inter_warp_sync_before.push_back(&(*BI)); @@ -125,7 +103,7 @@ public: auto PDT = &getAnalysis(); - // first find all conditional barriers + // find all conditional barriers std::vector conditionalBarriers; for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i) { BasicBlock *b = &*i; @@ -148,12 +126,9 @@ public: conditionalBarriers.pop_back(); // insert barrier in the start of if-condition - - BasicBlock *pos = b; BasicBlock *pred = firstNonBackedgePredecessor(b); while (PDT->getPostDomTree().dominates(b, pred)) { - pos = pred; // If our BB post dominates the given block, we know it is not the // branching block that makes the barrier conditional. pred = firstNonBackedgePredecessor(pred); @@ -468,7 +443,6 @@ public: auto header_block = L->getHeader(); assert(header_block->getTerminator()->getNumSuccessors() == 2 && "has more than 2 successors of the for-head\n"); - BasicBlock *for_body = NULL; for (int i = 0; i < header_block->getTerminator()->getNumSuccessors(); i++) { auto bb = header_block->getTerminator()->getSuccessor(i); diff --git a/compilation/KernelTranslation/src/x86/insert_warp_loop.cpp b/compilation/KernelTranslation/src/cpu/insert_warp_loop.cpp similarity index 96% rename from compilation/KernelTranslation/src/x86/insert_warp_loop.cpp rename to compilation/KernelTranslation/src/cpu/insert_warp_loop.cpp index 0c11c2f..50aa3e0 100644 --- a/compilation/KernelTranslation/src/x86/insert_warp_loop.cpp +++ b/compilation/KernelTranslation/src/cpu/insert_warp_loop.cpp @@ -4,43 +4,20 @@ #include "handle_sync.h" #include "tool.h" #include -#include #include -#include "llvm/ADT/Statistic.h" #include "llvm/ADT/Triple.h" #include "llvm/Analysis/DivergenceAnalysis.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/IR/CFG.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/GlobalValue.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InlineAsm.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/LegacyPassManager.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/ValueSymbolTable.h" -#include "llvm/InitializePasses.h" #include "llvm/MC/TargetRegistry.h" -#include "llvm/PassInfo.h" -#include "llvm/PassRegistry.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/IPO/PassManagerBuilder.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/ValueMapper.h" #include #include #include -#include -#include using namespace llvm; @@ -115,10 +92,7 @@ llvm::Instruction *GetContextArray(llvm::Instruction *instruction, BasicBlock &bb = instruction->getParent()->getParent()->getEntryBlock(); IRBuilder<> builder(&*(bb.getFirstInsertionPt())); - Function *FF = instruction->getParent()->getParent(); Module *M = instruction->getParent()->getParent()->getParent(); - LLVMContext &C = M->getContext(); - const llvm::DataLayout &Layout = M->getDataLayout(); llvm::Type *elementType; if (isa(instruction)) { @@ -129,8 +103,6 @@ llvm::Instruction *GetContextArray(llvm::Instruction *instruction, } Type *AllocType = elementType; - AllocaInst *InstCast = dyn_cast(instruction); - llvm::Value *ItemSize = nullptr; llvm::AllocaInst *Alloca = nullptr; auto block_size_addr = M->getGlobalVariable("block_size"); @@ -697,9 +669,6 @@ public: is_single_conditional_branch_block = 1; } else { // generate by replicate local variable - printf( - "[WARNING] match single conditional branch with HARD CODE\n"); - bool branch_to_intra_init = false; for (unsigned suc = 0; suc < br->getNumSuccessors(); ++suc) { llvm::BasicBlock *entryCandidate = br->getSuccessor(suc); auto block_name = entryCandidate->getName().str(); @@ -755,7 +724,7 @@ public: entry = entryCandidate; break; } - // delete useless PR, those PRs only have branch + // delete useless PR, those PRs only have branch instructions if (entry == exit) { if (entry->size() == 1 && isa(entry->begin())) { return; diff --git a/compilation/KernelTranslation/src/x86/memory_hierarchy.cpp b/compilation/KernelTranslation/src/cpu/memory_hierarchy.cpp similarity index 90% rename from compilation/KernelTranslation/src/x86/memory_hierarchy.cpp rename to compilation/KernelTranslation/src/cpu/memory_hierarchy.cpp index 5bc4e01..90f5a99 100644 --- a/compilation/KernelTranslation/src/x86/memory_hierarchy.cpp +++ b/compilation/KernelTranslation/src/cpu/memory_hierarchy.cpp @@ -1,29 +1,10 @@ #include "memory_hierarchy.h" #include "debug.hpp" -#include "llvm/IR/CFG.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/GlobalValue.h" -#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" -#include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/ValueMapper.h" -#include -#include -#include #include #include -#include -#include -#include void mem_share2global(llvm::Module *M) { - LLVMContext *C = &M->getContext(); - llvm::Type *Int32T = Type::getInt32Ty(*C); - llvm::Type *Int64T = Type::getInt64Ty(*C); - llvm::Type *Int8T = Type::getInt8Ty(*C); - std::map corresponding_global_memory; std::set need_remove; std::set need_remove_share_memory; @@ -45,7 +26,6 @@ void mem_share2global(llvm::Module *M) { // generate global type pointer PointerType *PointerTy = PointerType::get(array_type->getElementType(), 0); - llvm::Constant *x1 = ConstantPointerNull::get(PointerTy); llvm::GlobalVariable *global_ptr = new llvm::GlobalVariable( *M, PointerTy, false, llvm::GlobalValue::ExternalLinkage, NULL, "dynamic_shared_memory", NULL, @@ -75,7 +55,7 @@ void mem_share2global(llvm::Module *M) { std::pair(share_memory, global_memory)); } else if (element_type->isFloatTy()) { - auto FP_type = llvm::Type::getFloatTy(*C); + auto FP_type = llvm::Type::getFloatTy(M->getContext()); auto zero = llvm::ConstantFP::get(FP_type, 0); llvm::GlobalVariable *global_memory = new llvm::GlobalVariable( *M, FP_type, false, llvm::GlobalValue::ExternalLinkage, zero, @@ -128,11 +108,6 @@ void mem_share2global(llvm::Module *M) { } void mem_constant2global(llvm::Module *M, std::ofstream &fout) { - LLVMContext *C = &M->getContext(); - llvm::Type *Int32T = Type::getInt32Ty(*C); - llvm::Type *Int64T = Type::getInt64Ty(*C); - llvm::Type *Int8T = Type::getInt8Ty(*C); - std::map corresponding_global_memory; std::set need_remove; std::set need_remove_constant_memory; @@ -142,7 +117,7 @@ void mem_constant2global(llvm::Module *M, std::ofstream &fout) { if (GlobalVariable *constant_memory = dyn_cast(I)) { if (auto PT = dyn_cast(I->getType())) { unsigned AS = PT->getAddressSpace(); - if (AS == 4) { // find a share memory + if (AS == 4) { // find a constant memory need_remove_constant_memory.insert(constant_memory); // generate the corresponding global memory variable auto new_name = "wrapper_global_" + constant_memory->getName().str(); @@ -150,7 +125,7 @@ void mem_constant2global(llvm::Module *M, std::ofstream &fout) { if (auto array_type = dyn_cast(element_type)) { if (constant_memory->hasExternalLinkage() && array_type->getArrayNumElements() == 0) { - // external shared memory of [] + // external constant memory of [] // generate global type pointer PointerType *PointerTy = PointerType::get(array_type->getElementType(), 0); diff --git a/compilation/KernelTranslation/src/x86/performance.cpp b/compilation/KernelTranslation/src/cpu/performance.cpp similarity index 60% rename from compilation/KernelTranslation/src/x86/performance.cpp rename to compilation/KernelTranslation/src/cpu/performance.cpp index 2d79384..fa00643 100644 --- a/compilation/KernelTranslation/src/x86/performance.cpp +++ b/compilation/KernelTranslation/src/cpu/performance.cpp @@ -1,43 +1,13 @@ #include "performance.h" #include "debug.hpp" #include "tool.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/ADT/Triple.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/IR/CFG.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/GlobalValue.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InlineAsm.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/LegacyPassManager.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/ValueSymbolTable.h" -#include "llvm/InitializePasses.h" #include "llvm/MC/TargetRegistry.h" -#include "llvm/PassInfo.h" -#include "llvm/PassRegistry.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/Host.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/IPO/PassManagerBuilder.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/ValueMapper.h" -#include -#include -#include -#include -#include using namespace llvm; @@ -53,7 +23,7 @@ void performance_optimization(llvm::Module *M) { llvm::legacy::PassManager Passes; // add target machine info - llvm::Triple triple("x86_64-unknown-linux-gnu"); + llvm::Triple triple(llvm::sys::getProcessTriple()); std::string Error; const Target *TheTarget = TargetRegistry::lookupTarget("", triple, Error); @@ -62,7 +32,7 @@ void performance_optimization(llvm::Module *M) { Options.FloatABIType = FloatABI::Hard; TargetMachine *TM = TheTarget->createTargetMachine( - triple.getTriple(), llvm::sys::getHostCPUName().str(), StringRef("+m,+f"), + triple.getTriple(), llvm::sys::getHostCPUName().str(), StringRef(""), Options, Reloc::PIC_, CodeModel::Small, CodeGenOpt::Aggressive); assert(TM && "No Machine Information\n"); @@ -80,9 +50,6 @@ void performance_optimization(llvm::Module *M) { Builder.LoopVectorize = true; Builder.SLPVectorize = true; - Builder.VerifyInput = true; - Builder.VerifyOutput = true; - Builder.populateModulePassManager(Passes); Passes.run(*M); } diff --git a/compilation/KernelTranslation/src/x86/tool.cpp b/compilation/KernelTranslation/src/cpu/tool.cpp similarity index 96% rename from compilation/KernelTranslation/src/x86/tool.cpp rename to compilation/KernelTranslation/src/cpu/tool.cpp index d3024ac..32ab62e 100644 --- a/compilation/KernelTranslation/src/x86/tool.cpp +++ b/compilation/KernelTranslation/src/cpu/tool.cpp @@ -1,29 +1,13 @@ #include "tool.h" #include "debug.hpp" #include "llvm/Bitcode/BitcodeWriter.h" -#include "llvm/Config/llvm-config.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/GlobalValue.h" -#include "llvm/IR/GlobalVariable.h" -#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InlineAsm.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" #include "llvm/IR/Verifier.h" #include "llvm/IRReader/IRReader.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/ErrorOr.h" #include "llvm/Support/FileSystem.h" -#include "llvm/Support/ManagedStatic.h" -#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/SourceMgr.h" #include "llvm/Support/ToolOutputFile.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/ValueMapper.h" - -#include #include using namespace llvm; @@ -133,7 +117,7 @@ llvm::Instruction *BreakPHIToAllocas(PHINode *phi) { Value *val = phi->getIncomingValue(incoming); BasicBlock *incomingBB = phi->getIncomingBlock(incoming); builder.SetInsertPoint(incomingBB->getTerminator()); - llvm::Instruction *store = builder.CreateStore(val, alloca); + builder.CreateStore(val, alloca); } builder.SetInsertPoint(phi); @@ -164,7 +148,6 @@ void phi2alloc(llvm::Module *M) { } } - bool changed = false; for (InstructionVec::iterator i = PHIs.begin(); i != PHIs.end(); ++i) { Instruction *instr = *i; BreakPHIToAllocas(dyn_cast(instr)); @@ -279,9 +262,7 @@ void replace_built_in_function(llvm::Module *M) { for (auto BB = F->begin(); BB != F->end(); ++BB) { for (auto BI = BB->begin(); BI != BB->end(); BI++) { - if (auto Load = dyn_cast(BI)) { - auto load_from = Load->getOperand(0); - } else if (auto Call = dyn_cast(BI)) { + if (auto Call = dyn_cast(BI)) { if (Call->getCalledFunction()) { auto func_name = Call->getCalledFunction()->getName().str(); if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.x" || @@ -425,7 +406,6 @@ void replace_built_in_function(llvm::Module *M) { if (auto Call = dyn_cast(BI)) { if (Call->getCalledFunction()) { auto func_name = Call->getCalledFunction()->getName().str(); - auto callFn = Call->getCalledFunction(); if (func_name == "vprintf") { /* * replace CUDA's printf to C's printf @@ -458,7 +438,7 @@ void replace_built_in_function(llvm::Module *M) { dyn_cast(BC->getOperand(0)->getType()); auto SrcTy = SrcPointTy->getElementType(); // reverse the bitcast - auto reverse_BC = new BitCastInst(BC, SrcPointTy, "", Call); + new BitCastInst(BC, SrcPointTy, "", Call); assert(SrcTy->isStructTy() == 1); auto StructTy = dyn_cast(SrcTy); for (int i = 0; i < StructTy->getNumElements(); i++) { @@ -528,7 +508,6 @@ void replace_built_in_function(llvm::Module *M) { void replace_asm_call(llvm::Module *M) { LLVMContext &context = M->getContext(); - auto I32 = llvm::Type::getInt32Ty(context); std::vector need_remove; for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { Function *F = &(*i); diff --git a/compilation/KernelTranslation/src/x86/warp_func.cpp b/compilation/KernelTranslation/src/cpu/warp_func.cpp similarity index 95% rename from compilation/KernelTranslation/src/x86/warp_func.cpp rename to compilation/KernelTranslation/src/cpu/warp_func.cpp index d388f11..ed406c9 100644 --- a/compilation/KernelTranslation/src/x86/warp_func.cpp +++ b/compilation/KernelTranslation/src/cpu/warp_func.cpp @@ -2,15 +2,6 @@ #include "warp_func.h" #include "debug.hpp" #include "tool.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/GlobalValue.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" -#include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/ValueMapper.h" -#include #include using namespace llvm; @@ -107,7 +98,7 @@ void handle_warp_vote(llvm::Module *M) { res = BinaryOperator::CreateNot(res, "", sync_inst); } - auto sotre_mask = new llvm::StoreInst(res, GEP, "", sync_inst); + new llvm::StoreInst(res, GEP, "", sync_inst); // create barrier CreateIntraWarpBarrier(sync_inst); /* diff --git a/compilation/KernelTranslation/src/x86/generate_x86_format.cpp b/compilation/KernelTranslation/src/x86/generate_x86_format.cpp deleted file mode 100644 index 23c97d0..0000000 --- a/compilation/KernelTranslation/src/x86/generate_x86_format.cpp +++ /dev/null @@ -1,186 +0,0 @@ -#include "generate_x86_format.h" -#include "debug.hpp" -#include "tool.h" -#include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/GlobalValue.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/LegacyPassManager.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/Verifier.h" -#include "llvm/IRReader/IRReader.h" -#include "llvm/MC/TargetRegistry.h" -#include "llvm/Support/TargetSelect.h" -#include "llvm/Support/ToolOutputFile.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetOptions.h" -#include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/ValueMapper.h" -#include -#include - -using namespace llvm; - -void set_meta_data(llvm::Module *M) { - M->setTargetTriple("x86_64-unknown-linux-gnu"); - M->setDataLayout( - "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"); -} - -// as pthread only accept a single void* for input -// we have to decode this input inside the kernel -void decode_input(llvm::Module *M) { - - std::set need_remove; - - LLVMContext *C = &M->getContext(); - llvm::Type *Int32T = Type::getInt32Ty(*C); - llvm::Type *Int8T = Type::getInt8Ty(*C); - - llvm::FunctionType *LauncherFuncT = FunctionType::get( - Type::getVoidTy(*C), {PointerType::get(Int8T, 0)}, false); - - std::set dynmaic_memory; - - std::map corres_dynamic_memory_load_address; - - // generate Wrapper Function type - // now we only support a single int32* - for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { - Function *F = &(*i); - if (!isKernelFunction(M, F)) - continue; - auto func_name = F->getName().str(); - // remove mangle prefix - // remove _Z24 - for (int pos = 2; pos < func_name.length(); pos++) { - if (func_name[pos] >= '0' && func_name[pos] <= '9') - continue; - func_name = func_name.substr(pos); - break; - } - llvm::IRBuilder<> Builder(M->getContext()); - - FunctionCallee fc = - M->getOrInsertFunction(func_name + "_wrapper", LauncherFuncT); - Function *WorkGroup = dyn_cast(fc.getCallee()); - - BasicBlock *Block = BasicBlock::Create(M->getContext(), "", WorkGroup); - Builder.SetInsertPoint(Block); - - // WorkGroup has only a single input - Function::arg_iterator ai = WorkGroup->arg_begin(); - - SmallVector Arguments; - Value *input_arg = &*ai; - // convert to int** - input_arg = Builder.CreateBitOrPointerCast( - input_arg, PointerType::get(PointerType::get(Int32T, 0), 0)); - - // dynamic memory load in the wrapper function - GlobalVariable *share_memory = M->getGlobalVariable("wrapper_global_data"); - if (share_memory != NULL) { - dynmaic_memory.insert(share_memory); - llvm::GlobalVariable *global_mem = new llvm::GlobalVariable( - *M, Int32T, false, llvm::GlobalValue::ExternalLinkage, NULL, - "thread_memory_size", NULL, llvm::GlobalValue::GeneralDynamicTLSModel, - 0, false); - Value *loadedValue = createLoad(Builder, global_mem); - - llvm::FunctionType *LaunchFun2 = FunctionType::get( - PointerType::get(PointerType::get(Int32T, 0), 0), NULL); - - FunctionCallee fc2 = - M->getOrInsertFunction("_wrapper_global_data", LaunchFun2); - - Function *WorkGroup2 = dyn_cast(fc2.getCallee()); - - WorkGroup2->setLinkage(GlobalValue::WeakODRLinkage); - WorkGroup2->setVisibility(GlobalValue::HiddenVisibility); - Comdat *co = M->getOrInsertComdat("_wrapper_global_data"); - co->setSelectionKind(Comdat::SelectionKind::Any); - WorkGroup2->setComdat(co); - - BasicBlock *Block2 = BasicBlock::Create(M->getContext(), "", WorkGroup2); - - llvm::IRBuilder<> Builder2(M->getContext()); - Builder2.SetInsertPoint(Block2); - Builder2.CreateRet(share_memory); - - auto PT = dyn_cast(share_memory->getType()); - auto element_type = PT->getElementType(); - - AllocaInst *new_arr = Builder.CreateAlloca(Int8T, loadedValue, "new_arr"); - Value *new_ar = new_arr; - Value *gptr = Builder.CreateBitOrPointerCast( - share_memory, PointerType::get(PointerType::get(Int8T, 0), 0)); - - Builder.CreateStore(new_ar, gptr); - } - - size_t idx = 0; - for (Function::const_arg_iterator ii = F->arg_begin(), ee = F->arg_end(); - ii != ee; ++ii) { - Type *ArgType = ii->getType(); - - // calculate addr - Value *GEP = createGEP(Builder, input_arg, ConstantInt::get(Int32T, idx)); - // load corresponding int* - GEP = createLoad(Builder, GEP); - // bitcast - GEP = Builder.CreateBitOrPointerCast(GEP, PointerType::get(ArgType, 0)); - Value *Arg = createLoad(Builder, GEP); - Arguments.push_back(Arg); - ++idx; - } - - CallInst *c = Builder.CreateCall(F, ArrayRef(Arguments)); - Builder.CreateRetVoid(); - } - for (auto f : need_remove) { - f->dropAllReferences(); - f->eraseFromParent(); - } -} - -void remove_barrier(llvm::Module *M) { - std::vector need_remove; - for (auto F = M->begin(); F != M->end(); ++F) - for (auto BB = F->begin(); BB != F->end(); ++BB) { - for (auto BI = BB->begin(); BI != BB->end(); BI++) { - if (auto Call = dyn_cast(BI)) { - if (Call->isInlineAsm()) - continue; - auto func_name = Call->getCalledFunction()->getName().str(); - if (func_name == "llvm.nvvm.bar.warp.sync" || - func_name == "llvm.nvvm.barrier0" || - func_name == "llvm.nvvm.barrier.sync") { - need_remove.push_back(Call); - } - } - } - } - for (auto inst : need_remove) { - inst->eraseFromParent(); - } -} - -void remove_useless_var(llvm::Module *M) { - M->getGlobalVariable("intra_warp_index")->eraseFromParent(); - M->getGlobalVariable("inter_warp_index")->eraseFromParent(); -} - -void generate_x86_format(llvm::Module *M) { - DEBUG_INFO("generate x86 format\n"); - // change metadata - set_meta_data(M); - // decode argument - decode_input(M); - // remove barrier - remove_barrier(M); - // remove useless func/variable - remove_useless_var(M); -} diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt index 4c4e8e7..36d86ad 100644 --- a/runtime/CMakeLists.txt +++ b/runtime/CMakeLists.txt @@ -1,9 +1,9 @@ cmake_minimum_required(VERSION 3.1 FATAL_ERROR) project( - X86runtime - DESCRIPTION "Implementation CUDA runtime API with x86" + CPUruntime + DESCRIPTION "Implementation CUDA runtime API with CPUs" LANGUAGES CXX) -set(LIB_NAME x86Runtime) +set(LIB_NAME CPUruntime) set(CMAKE_VERBOSE_MAKEFILE ON) # compile threadPool implementation @@ -12,9 +12,9 @@ add_subdirectory(threadPool) # compile x86 runtime library include_directories(../common) include_directories(./include/) -include_directories(./include/x86) +include_directories(./include/cpu) include_directories(./threadPool/include/) -include_directories(./threadPool/include/x86) +include_directories(./threadPool/include/cpu) include_directories(../external/moodycamel/) -file(GLOB proj_SOURCES "src/x86/*.cpp") +file(GLOB proj_SOURCES "src/cpu/*.cpp") add_library(${LIB_NAME} SHARED ${proj_SOURCES}) diff --git a/runtime/include/x86/cudaKernelImpl.h b/runtime/include/cpu/cudaKernelImpl.h similarity index 96% rename from runtime/include/x86/cudaKernelImpl.h rename to runtime/include/cpu/cudaKernelImpl.h index e65d0c1..1abbfae 100644 --- a/runtime/include/x86/cudaKernelImpl.h +++ b/runtime/include/cpu/cudaKernelImpl.h @@ -1,4 +1,4 @@ -#ifndef __RUNTIME_IMPL__ +#ifndef __KERNEL_IMPL__ #define __KERNEL_IMPL__ #include "structures.h" #include diff --git a/runtime/include/x86/cudaRuntimeImpl.h b/runtime/include/cpu/cudaRuntimeImpl.h similarity index 100% rename from runtime/include/x86/cudaRuntimeImpl.h rename to runtime/include/cpu/cudaRuntimeImpl.h diff --git a/runtime/src/x86/cudaKernelImpl.cpp b/runtime/src/cpu/cudaKernelImpl.cpp similarity index 100% rename from runtime/src/x86/cudaKernelImpl.cpp rename to runtime/src/cpu/cudaKernelImpl.cpp diff --git a/runtime/src/x86/cudaRuntimeImpl.cpp b/runtime/src/cpu/cudaRuntimeImpl.cpp similarity index 97% rename from runtime/src/x86/cudaRuntimeImpl.cpp rename to runtime/src/cpu/cudaRuntimeImpl.cpp index 8f60525..c281845 100644 --- a/runtime/src/x86/cudaRuntimeImpl.cpp +++ b/runtime/src/cpu/cudaRuntimeImpl.cpp @@ -10,27 +10,31 @@ #include #include #include + cudaError_t cudaGetDevice(int *devPtr) { *devPtr = 0; return cudaSuccess; } + const char *cudaGetErrorName(cudaError_t error) { return "SUCCESS\n"; } -cudaError_t cudaDeviceReset(void) { - scheduler_uninit(); - return cudaSuccess; -} + +cudaError_t cudaDeviceReset(void) { return cudaSuccess; } + cudaError_t cudaDeviceSynchronize(void) { cuSynchronizeBarrier(); return cudaSuccess; } + cudaError_t cudaThreadSynchronize(void) { cuSynchronizeBarrier(); return cudaSuccess; } + cudaError_t cudaFree(void *devPtr) { free(devPtr); return cudaSuccess; } + cudaError_t cudaFreeHost(void *devPtr) { free(devPtr); return cudaSuccess; @@ -47,20 +51,22 @@ cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, cu_kernel *ker = create_kernel(func, gridDim, blockDim, args, sharedMem, stream); - int lstatus = cuLaunchKernel(&ker); - + cuLaunchKernel(&ker); return cudaSuccess; } + cudaError_t cudaMalloc(void **devPtr, size_t size) { *devPtr = malloc(size); if (devPtr == NULL) return cudaErrorMemoryAllocation; return cudaSuccess; } + cudaError_t cudaMemset(void *devPtr, int value, size_t count) { memset(devPtr, value, count); return cudaSuccess; } + cudaError_t cudaMemcpy(void *dst, const void *src, size_t count, cudaMemcpyKind kind) { if (kind == cudaMemcpyHostToHost) { @@ -105,7 +111,6 @@ cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src) { return cudaSuccess; } -static int stream_counter = 1; /* From our evaluation, CPU backend can gain little benefit from multi stream. Thus, we only use single stream @@ -159,6 +164,8 @@ static cudaError_t lastError = cudaSuccess; const char *cudaGetErrorString(cudaError_t error) { if (error == cudaSuccess) { return "Cuda Get Error Success"; + } else { + return "Cuda Get Error Failed"; } } diff --git a/runtime/threadPool/CMakeLists.txt b/runtime/threadPool/CMakeLists.txt index 5d53566..3f54f99 100644 --- a/runtime/threadPool/CMakeLists.txt +++ b/runtime/threadPool/CMakeLists.txt @@ -10,11 +10,10 @@ set(CMAKE_VERBOSE_MAKEFILE ON) set(LIB_NAME threadPool) set(CMAKE_CXX_STANDARD 14) -set(CMAKE_BUILD_TYPE Debug) include_directories(../../common) include_directories(./include) -include_directories(./include/x86) +include_directories(./include/cpu) include_directories(../../external/moodycamel) -file(GLOB proj_SOURCES "src/x86/*.cpp") +file(GLOB proj_SOURCES "src/cpu/*.cpp") add_library(${LIB_NAME} SHARED ${proj_SOURCES}) diff --git a/runtime/threadPool/include/x86/api.h b/runtime/threadPool/include/cpu/api.h similarity index 100% rename from runtime/threadPool/include/x86/api.h rename to runtime/threadPool/include/cpu/api.h diff --git a/runtime/threadPool/include/x86/def.h b/runtime/threadPool/include/cpu/def.h similarity index 100% rename from runtime/threadPool/include/x86/def.h rename to runtime/threadPool/include/cpu/def.h diff --git a/runtime/threadPool/include/x86/macros.h b/runtime/threadPool/include/cpu/macros.h similarity index 100% rename from runtime/threadPool/include/x86/macros.h rename to runtime/threadPool/include/cpu/macros.h diff --git a/runtime/threadPool/include/x86/structures.h b/runtime/threadPool/include/cpu/structures.h similarity index 100% rename from runtime/threadPool/include/x86/structures.h rename to runtime/threadPool/include/cpu/structures.h diff --git a/runtime/threadPool/src/x86/api.cpp b/runtime/threadPool/src/cpu/api.cpp similarity index 82% rename from runtime/threadPool/src/x86/api.cpp rename to runtime/threadPool/src/cpu/api.cpp index 4a319fc..7ab36e9 100644 --- a/runtime/threadPool/src/x86/api.cpp +++ b/runtime/threadPool/src/cpu/api.cpp @@ -1,3 +1,19 @@ +/* + This file contains the implementation of the CPU thread pool. For a kernel + launch, the host thread will enqueue the kernel to the kernelQueue, and the + threads in the thread pool will try to fetch work from the queue. After a + thread fetches a kernel from the queue, it will execute the kernel. After the + kernel execution, the thread will try to fetch another kernel from the queue. + If the queue is empty, the thread will wait for the next kernel launch. + + By default, we try to use all CPU cores for execution. Thus, for a kernel + launch, the host thread pushes P kernel variables to the queue, where P is the + number of CPU cores. + + For some lightweight kernels, useing fewer CPU cores can speed up the overall + execution time, due to fewer CPU cores lead to lower synchronization overhead. +*/ + #include "api.h" #include "blockingconcurrentqueue.h" #include "debug.hpp" @@ -9,9 +25,6 @@ #include #include -/* -Initialize the device -*/ int device_max_compute_units = 1; bool device_initilized = false; int init_device() { @@ -32,7 +45,6 @@ int init_device() { } // Create Kernel -static int kernelIds = 0; cu_kernel *create_kernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) { cu_kernel *ker = (cu_kernel *)calloc(1, sizeof(cu_kernel)); @@ -71,13 +83,11 @@ __thread int warp_shfl[32] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; -/* - Enqueue Kernel (k) to the scheduler kernelQueue -*/ int TaskToExecute; +// Enqueue Kernel to the scheduler kernelQueue int schedulerEnqueueKernel(cu_kernel *k) { - int totalBlocks = - k->totalBlocks; // calculate gpu_block_to_execute_per_cpu_thread + int totalBlocks = k->totalBlocks; + // by default, all CPU cores are used to execute GPU blocks equally int gpuBlockToExecutePerCpuThread = (totalBlocks + device_max_compute_units - 1) / device_max_compute_units; TaskToExecute = (totalBlocks + gpuBlockToExecutePerCpuThread - 1) / @@ -93,28 +103,21 @@ int schedulerEnqueueKernel(cu_kernel *k) { return C_SUCCESS; } -/* - Kernel Launch with numBlocks and numThreadsPerBlock -*/ +// Push kernel to the kernelQueue int cuLaunchKernel(cu_kernel **k) { if (!device_initilized) { init_device(); } - // Calculate Block Size N/numBlocks - cu_kernel *ker = *k; - int status = C_RUN; // set complete to false, this variable is used for sync for (int i = 0; i < scheduler->num_worker_threads; i++) { scheduler->thread_pool[i].completeTask = 0; } - schedulerEnqueueKernel(ker); + schedulerEnqueueKernel(*k); return 0; } -/* - Thread Gets Work -*/ +// threads in thread-pool try to fetch work from the queue int get_work(c_thread *th) { int dynamic_shared_mem_size = 0; dim3 gridDim; @@ -136,6 +139,7 @@ int get_work(c_thread *th) { grid_size_x = gridDim.x; grid_size_y = gridDim.y; grid_size_z = gridDim.z; + // allocate dynamic shared memory if (dynamic_shared_mem_size > 0) dynamic_shared_memory = (int *)malloc(dynamic_shared_mem_size); // execute GPU blocks @@ -153,7 +157,8 @@ int get_work(c_thread *th) { } // if cannot get tasks, check whether programs stop if (scheduler->threadpool_shutdown_requested) { - return true; // thread exit + // thread exit + break; } } return 0; @@ -176,9 +181,7 @@ void *driver_thread(void *p) { } } -/* -Initialize the scheduler -*/ +// Initialize the scheduler int scheduler_init(cu_device device) { scheduler = (cu_pool *)calloc(1, sizeof(cu_pool)); scheduler->num_worker_threads = device.max_compute_units; @@ -198,8 +201,6 @@ int scheduler_init(cu_device device) { return C_SUCCESS; } -void scheduler_uninit() { assert(0 && "Scheduler Unitit no Implemente\n"); } - /* Barrier for Kernel Launch */ diff --git a/test/runHeteroMark.sh b/test/runHeteroMark.sh index 416e1ed..f9d4c97 100644 --- a/test/runHeteroMark.sh +++ b/test/runHeteroMark.sh @@ -30,7 +30,7 @@ g++ -o $1 -fPIC -no-pie \ $HeteroMark_PATH/src/$1/cuda/main.cc host.o kernel.o $HeteroMark_PATH/src/$1/*.cc $HeteroMark_PATH/src/common/benchmark/*.cc \ $HeteroMark_PATH/src/common/command_line_option/*.cc $HeteroMark_PATH/src/common/time_measurement/*.cc \ -L$CuPBoP_BUILD_PATH/runtime -L$CuPBoP_BUILD_PATH/runtime/threadPool \ - -I$HeteroMark_PATH -I$CUDA_PATH/include -lpthread -lc -lx86Runtime -lthreadPool + -I$HeteroMark_PATH -I$CUDA_PATH/include -lpthread -lc -lCPUruntime -lthreadPool case $1 in aes)