Refactoring the codebase. Remove useless variables; Add comments; Remove useless header files; Remove hard code and support both x86 and ARM CPU
This commit is contained in:
parent
50d615da64
commit
fd56811650
|
@ -51,7 +51,7 @@ jobs:
|
||||||
hostTranslator reverse-host-x86_64-unknown-linux-gnu.bc host.bc
|
hostTranslator reverse-host-x86_64-unknown-linux-gnu.bc host.bc
|
||||||
llc --relocation-model=pic --filetype=obj kernel.bc
|
llc --relocation-model=pic --filetype=obj kernel.bc
|
||||||
llc --relocation-model=pic --filetype=obj host.bc
|
llc --relocation-model=pic --filetype=obj host.bc
|
||||||
g++ -o reverse -fPIC -no-pie -L${{ github.workspace }}/build/runtime -L${{ github.workspace }}/build/runtime/threadPool host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread
|
g++ -o reverse -fPIC -no-pie -L${{ github.workspace }}/build/runtime -L${{ github.workspace }}/build/runtime/threadPool host.o kernel.o -lc -lCPUruntime -lthreadPool -lpthread
|
||||||
./reverse
|
./reverse
|
||||||
- name: Execute the dynamic shared memory demo
|
- name: Execute the dynamic shared memory demo
|
||||||
run: |
|
run: |
|
||||||
|
@ -63,7 +63,7 @@ jobs:
|
||||||
hostTranslator reverse-host-x86_64-unknown-linux-gnu.bc host.bc
|
hostTranslator reverse-host-x86_64-unknown-linux-gnu.bc host.bc
|
||||||
llc --relocation-model=pic --filetype=obj kernel.bc
|
llc --relocation-model=pic --filetype=obj kernel.bc
|
||||||
llc --relocation-model=pic --filetype=obj host.bc
|
llc --relocation-model=pic --filetype=obj host.bc
|
||||||
g++ -o reverse -fPIC -no-pie -L${{ github.workspace }}/build/runtime -L${{ github.workspace }}/build/runtime/threadPool host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread
|
g++ -o reverse -fPIC -no-pie -L${{ github.workspace }}/build/runtime -L${{ github.workspace }}/build/runtime/threadPool host.o kernel.o -lc -lCPUruntime -lthreadPool -lpthread
|
||||||
./reverse
|
./reverse
|
||||||
- name: Execute Hetero-mark benchmark
|
- name: Execute Hetero-mark benchmark
|
||||||
run: |
|
run: |
|
||||||
|
@ -79,5 +79,5 @@ jobs:
|
||||||
hostTranslator kernel_gpu_cuda_wrapper-host-x86_64-unknown-linux-gnu.bc host.bc
|
hostTranslator kernel_gpu_cuda_wrapper-host-x86_64-unknown-linux-gnu.bc host.bc
|
||||||
llc --relocation-model=pic --filetype=obj kernel.bc
|
llc --relocation-model=pic --filetype=obj kernel.bc
|
||||||
llc --relocation-model=pic --filetype=obj host.bc
|
llc --relocation-model=pic --filetype=obj host.bc
|
||||||
g++ -o lavaMD -fPIC -no-pie -I${{ github.workspace }}/runtime/threadPool/include -I${{ github.workspace }}/cuda-10.1/include -L${{ github.workspace }}/build/runtime -L${{ github.workspace }}/build/runtime/threadPool main.c host.o kernel.o util/timer/timer.c util/num/num.c -lpthread -lc -lx86Runtime -lthreadPool -pthread
|
g++ -o lavaMD -fPIC -no-pie -I${{ github.workspace }}/runtime/threadPool/include -I${{ github.workspace }}/cuda-10.1/include -L${{ github.workspace }}/build/runtime -L${{ github.workspace }}/build/runtime/threadPool main.c host.o kernel.o util/timer/timer.c util/num/num.c -lpthread -lc -lCPUruntime -lthreadPool -pthread
|
||||||
./lavaMD -boxes1d 10
|
./lavaMD -boxes1d 10
|
||||||
|
|
|
@ -1,13 +1,9 @@
|
||||||
cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
|
cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
|
||||||
|
|
||||||
project(CudaOnX86)
|
project(CuPBoP)
|
||||||
set(CMAKE_PROJECT_DESCRIPTION "Executing CUDA on X86 architecture.")
|
set(CMAKE_PROJECT_DESCRIPTION "Executing CUDA on non-NVIDIA architecture.")
|
||||||
set(CMAKE_CXX_STANDARD "14")
|
set(CMAKE_CXX_STANDARD "14")
|
||||||
set(MAJOR_VERSION 0)
|
|
||||||
set(MINOR_VERSION 1)
|
|
||||||
set(VERSION_STRING ${MAJOR_VERSION}.${MINOR_VERSION})
|
|
||||||
set(COX_VERSION ${VERSION_STRING})
|
|
||||||
# get LLVM PATH get PATH for head file
|
|
||||||
if(DEFINED LLVM_CONFIG_PATH)
|
if(DEFINED LLVM_CONFIG_PATH)
|
||||||
if(IS_ABSOLUTE "${LLVM_CONFIG_PATH}")
|
if(IS_ABSOLUTE "${LLVM_CONFIG_PATH}")
|
||||||
if(EXISTS "${LLVM_CONFIG_PATH}")
|
if(EXISTS "${LLVM_CONFIG_PATH}")
|
||||||
|
@ -32,7 +28,7 @@ if(DEFINED LLVM_CONFIG_PATH)
|
||||||
else()
|
else()
|
||||||
message(FATAL_ERROR "llvm-config is required")
|
message(FATAL_ERROR "llvm-config is required")
|
||||||
endif()
|
endif()
|
||||||
# get CUDA PATH
|
|
||||||
if(DEFINED CUDA_PATH)
|
if(DEFINED CUDA_PATH)
|
||||||
message(STATUS "Using CUDA: ${CUDA_PATH}")
|
message(STATUS "Using CUDA: ${CUDA_PATH}")
|
||||||
else()
|
else()
|
||||||
|
@ -45,7 +41,7 @@ if(DEBUG)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
set(CMAKE_CXX_FLAGS
|
set(CMAKE_CXX_FLAGS
|
||||||
"-I${CUDA_PATH}/include ${LLVM_CXX_FLAG} ${CMAKE_CXX_FLAGS}")
|
"-I${CUDA_PATH}/include ${LLVM_CXX_FLAG} ${CMAKE_CXX_FLAGS} -Wunused")
|
||||||
|
|
||||||
set(GCC_COVERAGE_LINK_FLAGS
|
set(GCC_COVERAGE_LINK_FLAGS
|
||||||
"-L${LLVM_LIB_PATH} ${LLVM_LINK_FLAG} -lz -lrt -ldl -ltinfo -lpthread -lm")
|
"-L${LLVM_LIB_PATH} ${LLVM_LINK_FLAG} -lz -lrt -ldl -ltinfo -lpthread -lm")
|
||||||
|
@ -54,5 +50,8 @@ add_subdirectory(compilation)
|
||||||
add_subdirectory(runtime)
|
add_subdirectory(runtime)
|
||||||
enable_testing()
|
enable_testing()
|
||||||
|
|
||||||
option(HETERO_MARK_DATA "The path to download hetero-mark dataset." /tmp/data)
|
set(HETERO_MARK_DATA
|
||||||
|
"/tmp/data"
|
||||||
|
CACHE PATH "The path to download hetero-mark dataset.")
|
||||||
|
|
||||||
add_subdirectory(test)
|
add_subdirectory(test)
|
||||||
|
|
|
@ -75,7 +75,7 @@ g++ -o vecadd -fPIC -no-pie \
|
||||||
-L$CuPBoP_PATH/build/runtime \
|
-L$CuPBoP_PATH/build/runtime \
|
||||||
-L$CuPBoP_PATH/build/runtime/threadPool \
|
-L$CuPBoP_PATH/build/runtime/threadPool \
|
||||||
host.o kernel.o \
|
host.o kernel.o \
|
||||||
-I../.. -lc -lx86Runtime -lthreadPool -lpthread
|
-I../.. -lc -lCPUruntime -lthreadPool -lpthread
|
||||||
# Execute
|
# Execute
|
||||||
./vecadd
|
./vecadd
|
||||||
```
|
```
|
||||||
|
|
|
@ -1,20 +1,16 @@
|
||||||
cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
|
cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
|
||||||
project(
|
|
||||||
NVVM2X86
|
|
||||||
DESCRIPTION "Translate NVVM IR to LLVM IR for X86 backend"
|
|
||||||
LANGUAGES CXX)
|
|
||||||
|
|
||||||
set(CMAKE_VERBOSE_MAKEFILE ON)
|
set(CMAKE_VERBOSE_MAKEFILE ON)
|
||||||
|
|
||||||
# compile kernel translator
|
# build kernel translator
|
||||||
include_directories(./KernelTranslation/include/x86)
|
include_directories(./KernelTranslation/include/cpu)
|
||||||
add_subdirectory(KernelTranslation)
|
add_subdirectory(KernelTranslation)
|
||||||
|
|
||||||
add_executable(kernelTranslator KernelTranslation.cpp)
|
add_executable(kernelTranslator KernelTranslation.cpp)
|
||||||
target_link_libraries(kernelTranslator spmd2mpmd ${GCC_COVERAGE_LINK_FLAGS})
|
target_link_libraries(kernelTranslator spmd2mpmd ${GCC_COVERAGE_LINK_FLAGS})
|
||||||
|
|
||||||
# compile host translator
|
# build host translator
|
||||||
include_directories(./HostTranslation/include/x86)
|
include_directories(./HostTranslation/include/cpu)
|
||||||
add_subdirectory(HostTranslation)
|
add_subdirectory(HostTranslation)
|
||||||
|
|
||||||
add_executable(hostTranslator HostTranslation.cpp)
|
add_executable(hostTranslator HostTranslation.cpp)
|
||||||
|
|
|
@ -4,12 +4,7 @@
|
||||||
#include "ReplaceCudaBuiltin.h"
|
#include "ReplaceCudaBuiltin.h"
|
||||||
#include "ReplaceKernelArgs.h"
|
#include "ReplaceKernelArgs.h"
|
||||||
#include "tool.h"
|
#include "tool.h"
|
||||||
#include "llvm/IR/Module.h"
|
|
||||||
#include "llvm/IR/Verifier.h"
|
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
#include <fstream>
|
|
||||||
#include <iostream>
|
|
||||||
#include <stdlib.h>
|
|
||||||
|
|
||||||
using namespace llvm;
|
using namespace llvm;
|
||||||
|
|
||||||
|
|
|
@ -11,12 +11,11 @@ set(CMAKE_VERBOSE_MAKEFILE ON)
|
||||||
set(LIB_NAME cudaRuntime2cpuRuntime)
|
set(LIB_NAME cudaRuntime2cpuRuntime)
|
||||||
|
|
||||||
set(CMAKE_CXX_STANDARD 14)
|
set(CMAKE_CXX_STANDARD 14)
|
||||||
set(CMAKE_BUILD_TYPE Debug)
|
include_directories(./include/cpu)
|
||||||
include_directories(./include/x86)
|
|
||||||
include_directories(../../common)
|
include_directories(../../common)
|
||||||
|
|
||||||
file(GLOB proj_HEADERS "include/x86/*.h")
|
file(GLOB proj_HEADERS "include/cpu/*.h")
|
||||||
file(GLOB proj_SOURCES "src/x86/*.cpp")
|
file(GLOB proj_SOURCES "src/cpu/*.cpp")
|
||||||
|
|
||||||
# Add core library.
|
# Add core library.
|
||||||
add_library(${LIB_NAME} SHARED ${proj_HEADERS} ${proj_SOURCES})
|
add_library(${LIB_NAME} SHARED ${proj_HEADERS} ${proj_SOURCES})
|
||||||
|
|
|
@ -3,16 +3,8 @@
|
||||||
*/
|
*/
|
||||||
#include "RemoveCudaBuiltin.h"
|
#include "RemoveCudaBuiltin.h"
|
||||||
#include "debug.hpp"
|
#include "debug.hpp"
|
||||||
#include "llvm/IR/Function.h"
|
|
||||||
#include "llvm/IR/GlobalValue.h"
|
|
||||||
#include "llvm/IR/IRBuilder.h"
|
|
||||||
#include "llvm/IR/Instructions.h"
|
#include "llvm/IR/Instructions.h"
|
||||||
#include "llvm/IR/LLVMContext.h"
|
|
||||||
#include "llvm/IR/Module.h"
|
|
||||||
#include "llvm/Support/ToolOutputFile.h"
|
#include "llvm/Support/ToolOutputFile.h"
|
||||||
#include "llvm/Transforms/Utils/CtorUtils.h"
|
|
||||||
#include <iostream>
|
|
||||||
#include <map>
|
|
||||||
#include <set>
|
#include <set>
|
||||||
|
|
||||||
using namespace llvm;
|
using namespace llvm;
|
|
@ -1,16 +1,14 @@
|
||||||
#include "RemoveMetadata.h"
|
#include "RemoveMetadata.h"
|
||||||
#include "llvm/IR/Function.h"
|
#include "llvm/Support/Host.h"
|
||||||
#include "llvm/IR/GlobalValue.h"
|
|
||||||
#include "llvm/IR/IRBuilder.h"
|
|
||||||
#include "llvm/IR/Instructions.h"
|
|
||||||
#include "llvm/IR/LLVMContext.h"
|
|
||||||
#include "llvm/IR/Module.h"
|
|
||||||
#include "llvm/Support/ToolOutputFile.h"
|
|
||||||
#include <iostream>
|
|
||||||
|
|
||||||
using namespace llvm;
|
using namespace llvm;
|
||||||
|
|
||||||
void RemoveMetadata(llvm::Module *M) {
|
void RemoveMetadata(llvm::Module *M) {
|
||||||
|
// change the target triple to the host triple
|
||||||
|
M->setTargetTriple(llvm::sys::getProcessTriple());
|
||||||
|
// use the default DataLayout
|
||||||
|
M->setDataLayout("");
|
||||||
|
|
||||||
SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
|
SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
|
||||||
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||||
Function *F = &(*i);
|
Function *F = &(*i);
|
||||||
|
@ -22,5 +20,7 @@ void RemoveMetadata(llvm::Module *M) {
|
||||||
F->removeFnAttr("min-legal-vector-width");
|
F->removeFnAttr("min-legal-vector-width");
|
||||||
F->removeFnAttr("no-trapping-math");
|
F->removeFnAttr("no-trapping-math");
|
||||||
F->removeFnAttr(llvm::Attribute::OptimizeNone);
|
F->removeFnAttr(llvm::Attribute::OptimizeNone);
|
||||||
|
F->removeFnAttr("target-cpu");
|
||||||
|
F->removeFnAttr("target-features");
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -1,12 +1,7 @@
|
||||||
#include "ReplaceConstantMemory.h"
|
#include "ReplaceConstantMemory.h"
|
||||||
#include "llvm/IR/Function.h"
|
|
||||||
#include "llvm/IR/GlobalValue.h"
|
|
||||||
#include "llvm/IR/Instructions.h"
|
#include "llvm/IR/Instructions.h"
|
||||||
#include "llvm/IR/LLVMContext.h"
|
|
||||||
#include "llvm/IR/Module.h"
|
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <iostream>
|
|
||||||
#include <map>
|
#include <map>
|
||||||
#include <set>
|
#include <set>
|
||||||
|
|
|
@ -1,13 +1,6 @@
|
||||||
#include "ReplaceCudaBuiltin.h"
|
#include "ReplaceCudaBuiltin.h"
|
||||||
#include "debug.hpp"
|
#include "debug.hpp"
|
||||||
#include "llvm/IR/Function.h"
|
|
||||||
#include "llvm/IR/GlobalValue.h"
|
|
||||||
#include "llvm/IR/IRBuilder.h"
|
#include "llvm/IR/IRBuilder.h"
|
||||||
#include "llvm/IR/Instructions.h"
|
|
||||||
#include "llvm/IR/LLVMContext.h"
|
|
||||||
#include "llvm/IR/Module.h"
|
|
||||||
#include "llvm/Support/ToolOutputFile.h"
|
|
||||||
#include <iostream>
|
|
||||||
#include <map>
|
#include <map>
|
||||||
#include <regex>
|
#include <regex>
|
||||||
#include <set>
|
#include <set>
|
||||||
|
@ -63,18 +56,6 @@ void ReplaceKernelLaunch(llvm::Module *M) {
|
||||||
std::map<std::string, Function *> kernels;
|
std::map<std::string, Function *> kernels;
|
||||||
|
|
||||||
std::set<llvm::Function *> need_remove;
|
std::set<llvm::Function *> need_remove;
|
||||||
LLVMContext *C = &M->getContext();
|
|
||||||
|
|
||||||
llvm::Type *Int32T = Type::getInt32Ty(*C);
|
|
||||||
llvm::Type *Int8T = Type::getInt8Ty(*C);
|
|
||||||
|
|
||||||
llvm::FunctionType *LauncherFuncT =
|
|
||||||
FunctionType::get(Type::getVoidTy(*C), NULL);
|
|
||||||
|
|
||||||
llvm::FunctionType *LaunchFun2 =
|
|
||||||
FunctionType::get(PointerType::get(PointerType::get(Int32T, 0), 0), NULL);
|
|
||||||
|
|
||||||
bool done = false;
|
|
||||||
|
|
||||||
std::set<std::string> cuda_register_kernel_names;
|
std::set<std::string> cuda_register_kernel_names;
|
||||||
|
|
||||||
|
@ -160,8 +141,6 @@ void ReplaceKernelLaunch(llvm::Module *M) {
|
||||||
std::vector<size_t> arg_sizes;
|
std::vector<size_t> arg_sizes;
|
||||||
functionOperand =
|
functionOperand =
|
||||||
dyn_cast<Function>(callOperand->stripPointerCasts());
|
dyn_cast<Function>(callOperand->stripPointerCasts());
|
||||||
|
|
||||||
FunctionType *ft = calledFunction->getFunctionType();
|
|
||||||
DEBUG_INFO("Parent (Caller) Function Name: %s, "
|
DEBUG_INFO("Parent (Caller) Function Name: %s, "
|
||||||
"cudaLaunchKernel Function: %s, args : %d\n",
|
"cudaLaunchKernel Function: %s, args : %d\n",
|
||||||
func_name.c_str(),
|
func_name.c_str(),
|
|
@ -1,12 +1,5 @@
|
||||||
#include "ReplaceKernelArgs.h"
|
#include "ReplaceKernelArgs.h"
|
||||||
#include "llvm/IR/Function.h"
|
|
||||||
#include "llvm/IR/GlobalValue.h"
|
|
||||||
#include "llvm/IR/IRBuilder.h"
|
#include "llvm/IR/IRBuilder.h"
|
||||||
#include "llvm/IR/Instructions.h"
|
|
||||||
#include "llvm/IR/LLVMContext.h"
|
|
||||||
#include "llvm/IR/Module.h"
|
|
||||||
#include "llvm/Support/ToolOutputFile.h"
|
|
||||||
#include <iostream>
|
|
||||||
#include <map>
|
#include <map>
|
||||||
#include <set>
|
#include <set>
|
||||||
|
|
||||||
|
@ -23,12 +16,9 @@ using namespace llvm;
|
||||||
// to use use-analysis to find the arguments in the future
|
// to use use-analysis to find the arguments in the future
|
||||||
void ReplaceKernelArg(llvm::Module *M) {
|
void ReplaceKernelArg(llvm::Module *M) {
|
||||||
LLVMContext &context = M->getContext();
|
LLVMContext &context = M->getContext();
|
||||||
auto VoidTy = llvm::Type::getVoidTy(context);
|
|
||||||
auto I8 = llvm::Type::getInt8PtrTy(context);
|
|
||||||
std::map<std::string, Function *> kernels;
|
std::map<std::string, Function *> kernels;
|
||||||
|
|
||||||
std::set<llvm::Function *> need_replace;
|
std::set<llvm::Function *> need_replace;
|
||||||
LLVMContext *C = &M->getContext();
|
|
||||||
|
|
||||||
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||||
Function *F = &(*i);
|
Function *F = &(*i);
|
|
@ -1,4 +1,4 @@
|
||||||
#include "generate_x86_format.h"
|
#include "generate_cpu_format.h"
|
||||||
#include "handle_sync.h"
|
#include "handle_sync.h"
|
||||||
#include "init.h"
|
#include "init.h"
|
||||||
#include "insert_sync.h"
|
#include "insert_sync.h"
|
||||||
|
@ -6,17 +6,14 @@
|
||||||
#include "performance.h"
|
#include "performance.h"
|
||||||
#include "tool.h"
|
#include "tool.h"
|
||||||
#include "warp_func.h"
|
#include "warp_func.h"
|
||||||
#include "llvm/IR/Module.h"
|
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
#include <fstream>
|
|
||||||
#include <iostream>
|
|
||||||
#include <llvm/Support/raw_ostream.h>
|
|
||||||
#include <map>
|
|
||||||
#include <set>
|
|
||||||
#include <stdlib.h>
|
|
||||||
|
|
||||||
using namespace llvm;
|
using namespace llvm;
|
||||||
|
|
||||||
|
// to support constant memory variables, we need to convert information
|
||||||
|
// from kernelTranslator to HostTranslator, since HostTranslator knows nothing
|
||||||
|
// about the kernel functions, we need to write the information to a file
|
||||||
|
// by KernelTranslator and read it in HostTranslator
|
||||||
std::string PATH = "kernel_meta.log";
|
std::string PATH = "kernel_meta.log";
|
||||||
|
|
||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv) {
|
||||||
|
@ -26,8 +23,9 @@ int main(int argc, char **argv) {
|
||||||
std::ofstream fout;
|
std::ofstream fout;
|
||||||
fout.open(PATH);
|
fout.open(PATH);
|
||||||
|
|
||||||
// inline, and create auxiliary global variables
|
// inline __device__ functions, and create auxiliary global variables
|
||||||
init_block(program, fout);
|
init_block(program, fout);
|
||||||
|
|
||||||
// insert sync before each vote, and replace the
|
// insert sync before each vote, and replace the
|
||||||
// original vote function to warp vote
|
// original vote function to warp vote
|
||||||
handle_warp_vote(program);
|
handle_warp_vote(program);
|
||||||
|
@ -40,17 +38,18 @@ int main(int argc, char **argv) {
|
||||||
|
|
||||||
// split block by sync
|
// split block by sync
|
||||||
split_block_by_sync(program);
|
split_block_by_sync(program);
|
||||||
// add loop for intra&intera thread
|
|
||||||
|
// add loop for intra&intera thread, it refers 'hierarchical collapsing' in
|
||||||
|
// COX paper.
|
||||||
insert_warp_loop(program);
|
insert_warp_loop(program);
|
||||||
|
|
||||||
// (TODO): replace this patch
|
|
||||||
replace_built_in_function(program);
|
replace_built_in_function(program);
|
||||||
|
|
||||||
// TODO: replace with a more general function
|
// the input kernel programs have NVIDIA metadata, they need to be replaced to
|
||||||
// Not only for x86 backend
|
// CPU metadata
|
||||||
generate_x86_format(program);
|
generate_cpu_format(program);
|
||||||
|
|
||||||
// performance optimization
|
// execute O3 pipeline on the transformed program
|
||||||
performance_optimization(program);
|
performance_optimization(program);
|
||||||
|
|
||||||
VerifyModule(program);
|
VerifyModule(program);
|
||||||
|
|
|
@ -11,12 +11,11 @@ set(CMAKE_VERBOSE_MAKEFILE ON)
|
||||||
set(LIB_NAME spmd2mpmd)
|
set(LIB_NAME spmd2mpmd)
|
||||||
|
|
||||||
set(CMAKE_CXX_STANDARD 14)
|
set(CMAKE_CXX_STANDARD 14)
|
||||||
set(CMAKE_BUILD_TYPE Debug)
|
include_directories(./include/cpu)
|
||||||
include_directories(./include/x86)
|
|
||||||
include_directories(../../common)
|
include_directories(../../common)
|
||||||
|
|
||||||
file(GLOB proj_HEADERS "include/x86/*.h")
|
file(GLOB proj_HEADERS "include/cpu/*.h")
|
||||||
file(GLOB proj_SOURCES "src/x86/*.cpp")
|
file(GLOB proj_SOURCES "src/cpu/*.cpp")
|
||||||
|
|
||||||
# Add core library.
|
# Add core library.
|
||||||
add_library(${LIB_NAME} STATIC ${proj_HEADERS} ${proj_SOURCES})
|
add_library(${LIB_NAME} STATIC ${proj_HEADERS} ${proj_SOURCES})
|
||||||
|
|
|
@ -0,0 +1,10 @@
|
||||||
|
#ifndef __NVVM2CPU_GENERATE_CPU_FORMAT__
|
||||||
|
#define __NVVM2CPU_GENERATE_CPU_FORMAT__
|
||||||
|
|
||||||
|
#include "llvm/IR/Module.h"
|
||||||
|
|
||||||
|
void generate_cpu_format(llvm::Module *M);
|
||||||
|
|
||||||
|
void set_meta_data(llvm::Module *M);
|
||||||
|
|
||||||
|
#endif
|
|
@ -1,10 +0,0 @@
|
||||||
#ifndef __NVVM2x86_GENERATE_X86_FORMAT__
|
|
||||||
#define __NVVM2x86_GENERATE_X86_FORMAT__
|
|
||||||
|
|
||||||
#include "llvm/IR/Module.h"
|
|
||||||
|
|
||||||
void generate_x86_format(llvm::Module *M);
|
|
||||||
|
|
||||||
void set_meta_data(llvm::Module *M);
|
|
||||||
|
|
||||||
#endif
|
|
|
@ -0,0 +1,125 @@
|
||||||
|
#include "generate_cpu_format.h"
|
||||||
|
#include "debug.hpp"
|
||||||
|
#include "tool.h"
|
||||||
|
#include "llvm/Support/Host.h"
|
||||||
|
|
||||||
|
using namespace llvm;
|
||||||
|
|
||||||
|
// set TargetTriple and DataLayout same as the host CPU
|
||||||
|
void set_meta_data(llvm::Module *M) {
|
||||||
|
M->setTargetTriple(llvm::sys::getProcessTriple());
|
||||||
|
// use the default DataLayout
|
||||||
|
M->setDataLayout("");
|
||||||
|
}
|
||||||
|
|
||||||
|
// as pthread only accept a single void* for input
|
||||||
|
// we have to decode this input inside the kernel
|
||||||
|
void decode_input(llvm::Module *M) {
|
||||||
|
|
||||||
|
std::set<llvm::Function *> need_remove;
|
||||||
|
|
||||||
|
llvm::Type *Int32T = Type::getInt32Ty(M->getContext());
|
||||||
|
llvm::Type *Int8T = Type::getInt8Ty(M->getContext());
|
||||||
|
|
||||||
|
llvm::FunctionType *LauncherFuncT = FunctionType::get(
|
||||||
|
Type::getVoidTy(M->getContext()), {PointerType::get(Int8T, 0)}, false);
|
||||||
|
|
||||||
|
// generate Wrapper Function type
|
||||||
|
// now we only support a single int32*
|
||||||
|
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||||
|
Function *F = &(*i);
|
||||||
|
if (!isKernelFunction(M, F))
|
||||||
|
continue;
|
||||||
|
auto func_name = F->getName().str();
|
||||||
|
// filter out _Z24 and other mangled prefix
|
||||||
|
for (int pos = 2; pos < func_name.length(); pos++) {
|
||||||
|
if (func_name[pos] >= '0' && func_name[pos] <= '9')
|
||||||
|
continue;
|
||||||
|
func_name = func_name.substr(pos);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
llvm::IRBuilder<> Builder(M->getContext());
|
||||||
|
|
||||||
|
FunctionCallee fc =
|
||||||
|
M->getOrInsertFunction(func_name + "_wrapper", LauncherFuncT);
|
||||||
|
Function *WorkGroup = dyn_cast<Function>(fc.getCallee());
|
||||||
|
|
||||||
|
BasicBlock *Block = BasicBlock::Create(M->getContext(), "", WorkGroup);
|
||||||
|
Builder.SetInsertPoint(Block);
|
||||||
|
|
||||||
|
// WorkGroup has only a single input
|
||||||
|
Function::arg_iterator ai = WorkGroup->arg_begin();
|
||||||
|
|
||||||
|
SmallVector<Value *, 8> Arguments;
|
||||||
|
Value *input_arg = &*ai;
|
||||||
|
// convert to int**
|
||||||
|
input_arg = Builder.CreateBitOrPointerCast(
|
||||||
|
input_arg, PointerType::get(PointerType::get(Int32T, 0), 0));
|
||||||
|
|
||||||
|
size_t idx = 0;
|
||||||
|
// replace original arguments with the unpacked values
|
||||||
|
// for example, for a function f(int* a, char* b),
|
||||||
|
// we will generate a function f_wrapper(int** input)
|
||||||
|
// and replace the original arguments with the unpacked values
|
||||||
|
// e.g., a = (int*)input[0], b = (char*)input[1]
|
||||||
|
for (Function::const_arg_iterator ii = F->arg_begin(), ee = F->arg_end();
|
||||||
|
ii != ee; ++ii) {
|
||||||
|
Type *ArgType = ii->getType();
|
||||||
|
// calculate addr
|
||||||
|
Value *GEP = createGEP(Builder, input_arg, ConstantInt::get(Int32T, idx));
|
||||||
|
// load corresponding int*
|
||||||
|
GEP = createLoad(Builder, GEP);
|
||||||
|
// bitcast
|
||||||
|
GEP = Builder.CreateBitOrPointerCast(GEP, PointerType::get(ArgType, 0));
|
||||||
|
Value *Arg = createLoad(Builder, GEP);
|
||||||
|
Arguments.push_back(Arg);
|
||||||
|
++idx;
|
||||||
|
}
|
||||||
|
Builder.CreateCall(F, ArrayRef<llvm::Value *>(Arguments));
|
||||||
|
Builder.CreateRetVoid();
|
||||||
|
}
|
||||||
|
for (auto f : need_remove) {
|
||||||
|
f->dropAllReferences();
|
||||||
|
f->eraseFromParent();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// after flat/hierarchical collapsing, the barrier instructions are useless
|
||||||
|
void remove_barrier(llvm::Module *M) {
|
||||||
|
std::vector<Instruction *> need_remove;
|
||||||
|
for (auto F = M->begin(); F != M->end(); ++F)
|
||||||
|
for (auto BB = F->begin(); BB != F->end(); ++BB) {
|
||||||
|
for (auto Inst = BB->begin(); Inst != BB->end(); Inst++) {
|
||||||
|
if (auto Call = dyn_cast<CallInst>(Inst)) {
|
||||||
|
if (Call->isInlineAsm())
|
||||||
|
continue;
|
||||||
|
auto func_name = Call->getCalledFunction()->getName().str();
|
||||||
|
if (func_name == "llvm.nvvm.bar.warp.sync" ||
|
||||||
|
func_name == "llvm.nvvm.barrier0" ||
|
||||||
|
func_name == "llvm.nvvm.barrier.sync") {
|
||||||
|
need_remove.push_back(Call);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (auto inst : need_remove) {
|
||||||
|
inst->eraseFromParent();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void remove_useless_var(llvm::Module *M) {
|
||||||
|
M->getGlobalVariable("intra_warp_index")->eraseFromParent();
|
||||||
|
M->getGlobalVariable("inter_warp_index")->eraseFromParent();
|
||||||
|
}
|
||||||
|
|
||||||
|
void generate_cpu_format(llvm::Module *M) {
|
||||||
|
DEBUG_INFO("generate cpu format\n");
|
||||||
|
// change metadata
|
||||||
|
set_meta_data(M);
|
||||||
|
// decode argument
|
||||||
|
decode_input(M);
|
||||||
|
// remove barrier
|
||||||
|
remove_barrier(M);
|
||||||
|
// remove useless func/variable
|
||||||
|
remove_useless_var(M);
|
||||||
|
}
|
|
@ -1,13 +1,7 @@
|
||||||
#include "handle_sync.h"
|
#include "handle_sync.h"
|
||||||
#include "debug.hpp"
|
#include "debug.hpp"
|
||||||
#include "tool.h"
|
#include "tool.h"
|
||||||
#include "llvm/IR/Function.h"
|
|
||||||
#include "llvm/IR/GlobalValue.h"
|
|
||||||
#include "llvm/IR/IRBuilder.h"
|
#include "llvm/IR/IRBuilder.h"
|
||||||
#include "llvm/IR/InlineAsm.h"
|
|
||||||
#include "llvm/IR/Instructions.h"
|
|
||||||
#include "llvm/IR/LLVMContext.h"
|
|
||||||
#include "llvm/IR/Module.h"
|
|
||||||
#include <set>
|
#include <set>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
|
@ -2,26 +2,11 @@
|
||||||
#include "debug.hpp"
|
#include "debug.hpp"
|
||||||
#include "memory_hierarchy.h"
|
#include "memory_hierarchy.h"
|
||||||
#include "tool.h"
|
#include "tool.h"
|
||||||
#include <fstream>
|
|
||||||
#include <iostream>
|
|
||||||
#include <set>
|
|
||||||
|
|
||||||
#include "llvm/IR/Function.h"
|
|
||||||
#include "llvm/IR/GlobalValue.h"
|
|
||||||
#include "llvm/IR/IRBuilder.h"
|
|
||||||
#include "llvm/IR/InlineAsm.h"
|
|
||||||
#include "llvm/IR/Instructions.h"
|
|
||||||
#include "llvm/IR/LLVMContext.h"
|
|
||||||
#include "llvm/IR/LegacyPassManager.h"
|
#include "llvm/IR/LegacyPassManager.h"
|
||||||
#include "llvm/IR/Module.h"
|
|
||||||
#include "llvm/InitializePasses.h"
|
#include "llvm/InitializePasses.h"
|
||||||
#include "llvm/PassInfo.h"
|
|
||||||
#include "llvm/PassRegistry.h"
|
|
||||||
#include "llvm/Support/CommandLine.h"
|
|
||||||
#include "llvm/Support/TargetSelect.h"
|
#include "llvm/Support/TargetSelect.h"
|
||||||
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
|
|
||||||
#include "llvm/Transforms/Utils/Cloning.h"
|
#include "llvm/Transforms/Utils/Cloning.h"
|
||||||
#include "llvm/Transforms/Utils/ValueMapper.h"
|
#include <set>
|
||||||
|
|
||||||
using namespace llvm;
|
using namespace llvm;
|
||||||
|
|
||||||
|
@ -31,11 +16,9 @@ bool inline_warp_level_func(llvm::Module *M) {
|
||||||
|
|
||||||
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||||
Function *F = &(*i);
|
Function *F = &(*i);
|
||||||
auto func_name = F->getName().str();
|
|
||||||
if (!isKernelFunction(M, F))
|
if (!isKernelFunction(M, F))
|
||||||
continue;
|
continue;
|
||||||
Function::iterator I = F->begin();
|
for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) {
|
||||||
for (Function::iterator E = F->end(); I != E; ++I) {
|
|
||||||
for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
|
for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
|
||||||
if (CallInst *c = dyn_cast<CallInst>(BI++)) {
|
if (CallInst *c = dyn_cast<CallInst>(BI++)) {
|
||||||
if (c->getCalledFunction()) {
|
if (c->getCalledFunction()) {
|
||||||
|
@ -60,8 +43,7 @@ bool inline_warp_level_func(llvm::Module *M) {
|
||||||
}
|
}
|
||||||
|
|
||||||
bool find_sreg_inst(llvm::Function *F) {
|
bool find_sreg_inst(llvm::Function *F) {
|
||||||
Function::iterator I = F->begin();
|
for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) {
|
||||||
for (Function::iterator E = F->end(); I != E; ++I) {
|
|
||||||
for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
|
for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
|
||||||
if (CallInst *c = dyn_cast<CallInst>(BI++)) {
|
if (CallInst *c = dyn_cast<CallInst>(BI++)) {
|
||||||
if (c->getCalledFunction()) {
|
if (c->getCalledFunction()) {
|
||||||
|
@ -229,14 +211,12 @@ void llvm_preprocess(llvm::Module *M) {
|
||||||
Passes.run(*M);
|
Passes.run(*M);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// transform constant expression into sequence of instructions
|
||||||
bool lower_constant_expr(llvm::Module *M) {
|
bool lower_constant_expr(llvm::Module *M) {
|
||||||
bool modified = false;
|
bool modified = false;
|
||||||
LLVMContext &context = M->getContext();
|
|
||||||
auto I32 = llvm::Type::getInt32Ty(context);
|
|
||||||
std::vector<CallInst *> need_remove;
|
std::vector<CallInst *> need_remove;
|
||||||
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||||
Function *F = &(*i);
|
Function *F = &(*i);
|
||||||
auto func_name = F->getName().str();
|
|
||||||
if (!isKernelFunction(M, F))
|
if (!isKernelFunction(M, F))
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
|
@ -301,8 +281,8 @@ bool lower_constant_expr(llvm::Module *M) {
|
||||||
return modified;
|
return modified;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// replace _ZL3expd, just delete its body
|
||||||
void replace_cuda_math_built_in(llvm::Module *M) {
|
void replace_cuda_math_built_in(llvm::Module *M) {
|
||||||
// replace _ZL3expd, just delete its body
|
|
||||||
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||||
Function *F = &(*i);
|
Function *F = &(*i);
|
||||||
auto func_name = F->getName().str();
|
auto func_name = F->getName().str();
|
|
@ -4,29 +4,9 @@
|
||||||
#include "handle_sync.h"
|
#include "handle_sync.h"
|
||||||
#include "tool.h"
|
#include "tool.h"
|
||||||
#include "llvm/ADT/SmallVector.h"
|
#include "llvm/ADT/SmallVector.h"
|
||||||
#include "llvm/ADT/Statistic.h"
|
|
||||||
#include "llvm/Analysis/LoopInfo.h"
|
|
||||||
#include "llvm/Analysis/LoopPass.h"
|
#include "llvm/Analysis/LoopPass.h"
|
||||||
#include "llvm/Analysis/PostDominators.h"
|
#include "llvm/Analysis/PostDominators.h"
|
||||||
#include "llvm/IR/DataLayout.h"
|
|
||||||
#include "llvm/IR/Function.h"
|
|
||||||
#include "llvm/IR/GlobalValue.h"
|
|
||||||
#include "llvm/IR/IRBuilder.h"
|
|
||||||
#include "llvm/IR/InlineAsm.h"
|
|
||||||
#include "llvm/IR/Instructions.h"
|
|
||||||
#include "llvm/IR/LLVMContext.h"
|
|
||||||
#include "llvm/IR/LegacyPassManager.h"
|
#include "llvm/IR/LegacyPassManager.h"
|
||||||
#include "llvm/IR/Module.h"
|
|
||||||
#include "llvm/IR/ValueSymbolTable.h"
|
|
||||||
#include "llvm/InitializePasses.h"
|
|
||||||
#include "llvm/PassInfo.h"
|
|
||||||
#include "llvm/PassRegistry.h"
|
|
||||||
#include "llvm/Support/CommandLine.h"
|
|
||||||
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
|
|
||||||
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
|
|
||||||
#include "llvm/Transforms/Utils/Cloning.h"
|
|
||||||
#include "llvm/Transforms/Utils/ValueMapper.h"
|
|
||||||
#include <iostream>
|
|
||||||
#include <queue>
|
#include <queue>
|
||||||
|
|
||||||
using namespace llvm;
|
using namespace llvm;
|
||||||
|
@ -44,7 +24,7 @@ public:
|
||||||
std::vector<llvm::Instruction *> insert_intra_warp_sync_before;
|
std::vector<llvm::Instruction *> insert_intra_warp_sync_before;
|
||||||
std::vector<llvm::Instruction *> insert_inter_warp_sync_before;
|
std::vector<llvm::Instruction *> insert_inter_warp_sync_before;
|
||||||
|
|
||||||
// insert sync in the entry
|
// insert sync after the entry and before the first non-AllocaInst
|
||||||
BasicBlock *entry = &(*F.begin());
|
BasicBlock *entry = &(*F.begin());
|
||||||
for (auto i = entry->begin(); i != entry->end(); i++) {
|
for (auto i = entry->begin(); i != entry->end(); i++) {
|
||||||
if (!isa<AllocaInst>(i)) {
|
if (!isa<AllocaInst>(i)) {
|
||||||
|
@ -54,10 +34,8 @@ public:
|
||||||
}
|
}
|
||||||
|
|
||||||
for (Function::iterator I = F.begin(); I != F.end(); ++I) {
|
for (Function::iterator I = F.begin(); I != F.end(); ++I) {
|
||||||
BasicBlock::iterator BI = I->begin();
|
|
||||||
|
|
||||||
// insert barrier before return
|
// insert barrier before return
|
||||||
for (; BI != I->end(); BI++) {
|
for (BasicBlock::iterator BI = I->begin(); BI != I->end(); BI++) {
|
||||||
llvm::ReturnInst *Ret = llvm::dyn_cast<llvm::ReturnInst>(&(*BI));
|
llvm::ReturnInst *Ret = llvm::dyn_cast<llvm::ReturnInst>(&(*BI));
|
||||||
if (Ret) {
|
if (Ret) {
|
||||||
insert_inter_warp_sync_before.push_back(&(*BI));
|
insert_inter_warp_sync_before.push_back(&(*BI));
|
||||||
|
@ -125,7 +103,7 @@ public:
|
||||||
|
|
||||||
auto PDT = &getAnalysis<PostDominatorTreeWrapperPass>();
|
auto PDT = &getAnalysis<PostDominatorTreeWrapperPass>();
|
||||||
|
|
||||||
// first find all conditional barriers
|
// find all conditional barriers
|
||||||
std::vector<BasicBlock *> conditionalBarriers;
|
std::vector<BasicBlock *> conditionalBarriers;
|
||||||
for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i) {
|
for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i) {
|
||||||
BasicBlock *b = &*i;
|
BasicBlock *b = &*i;
|
||||||
|
@ -148,12 +126,9 @@ public:
|
||||||
conditionalBarriers.pop_back();
|
conditionalBarriers.pop_back();
|
||||||
|
|
||||||
// insert barrier in the start of if-condition
|
// insert barrier in the start of if-condition
|
||||||
|
|
||||||
BasicBlock *pos = b;
|
|
||||||
BasicBlock *pred = firstNonBackedgePredecessor(b);
|
BasicBlock *pred = firstNonBackedgePredecessor(b);
|
||||||
|
|
||||||
while (PDT->getPostDomTree().dominates(b, pred)) {
|
while (PDT->getPostDomTree().dominates(b, pred)) {
|
||||||
pos = pred;
|
|
||||||
// If our BB post dominates the given block, we know it is not the
|
// If our BB post dominates the given block, we know it is not the
|
||||||
// branching block that makes the barrier conditional.
|
// branching block that makes the barrier conditional.
|
||||||
pred = firstNonBackedgePredecessor(pred);
|
pred = firstNonBackedgePredecessor(pred);
|
||||||
|
@ -468,7 +443,6 @@ public:
|
||||||
auto header_block = L->getHeader();
|
auto header_block = L->getHeader();
|
||||||
assert(header_block->getTerminator()->getNumSuccessors() == 2 &&
|
assert(header_block->getTerminator()->getNumSuccessors() == 2 &&
|
||||||
"has more than 2 successors of the for-head\n");
|
"has more than 2 successors of the for-head\n");
|
||||||
BasicBlock *for_body = NULL;
|
|
||||||
for (int i = 0; i < header_block->getTerminator()->getNumSuccessors();
|
for (int i = 0; i < header_block->getTerminator()->getNumSuccessors();
|
||||||
i++) {
|
i++) {
|
||||||
auto bb = header_block->getTerminator()->getSuccessor(i);
|
auto bb = header_block->getTerminator()->getSuccessor(i);
|
|
@ -4,43 +4,20 @@
|
||||||
#include "handle_sync.h"
|
#include "handle_sync.h"
|
||||||
#include "tool.h"
|
#include "tool.h"
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
#include <iostream>
|
|
||||||
#include <set>
|
#include <set>
|
||||||
|
|
||||||
#include "llvm/ADT/Statistic.h"
|
|
||||||
#include "llvm/ADT/Triple.h"
|
#include "llvm/ADT/Triple.h"
|
||||||
#include "llvm/Analysis/DivergenceAnalysis.h"
|
#include "llvm/Analysis/DivergenceAnalysis.h"
|
||||||
#include "llvm/Analysis/LoopInfo.h"
|
|
||||||
#include "llvm/Analysis/LoopPass.h"
|
|
||||||
#include "llvm/Analysis/PostDominators.h"
|
#include "llvm/Analysis/PostDominators.h"
|
||||||
#include "llvm/Analysis/TargetTransformInfo.h"
|
#include "llvm/Analysis/TargetTransformInfo.h"
|
||||||
#include "llvm/IR/CFG.h"
|
|
||||||
#include "llvm/IR/DataLayout.h"
|
|
||||||
#include "llvm/IR/Function.h"
|
|
||||||
#include "llvm/IR/GlobalValue.h"
|
|
||||||
#include "llvm/IR/IRBuilder.h"
|
|
||||||
#include "llvm/IR/InlineAsm.h"
|
|
||||||
#include "llvm/IR/Instructions.h"
|
|
||||||
#include "llvm/IR/LLVMContext.h"
|
|
||||||
#include "llvm/IR/LegacyPassManager.h"
|
#include "llvm/IR/LegacyPassManager.h"
|
||||||
#include "llvm/IR/Module.h"
|
|
||||||
#include "llvm/IR/ValueSymbolTable.h"
|
|
||||||
#include "llvm/InitializePasses.h"
|
|
||||||
#include "llvm/MC/TargetRegistry.h"
|
#include "llvm/MC/TargetRegistry.h"
|
||||||
#include "llvm/PassInfo.h"
|
|
||||||
#include "llvm/PassRegistry.h"
|
|
||||||
#include "llvm/Support/CommandLine.h"
|
|
||||||
#include "llvm/Target/TargetMachine.h"
|
#include "llvm/Target/TargetMachine.h"
|
||||||
#include "llvm/Target/TargetOptions.h"
|
#include "llvm/Target/TargetOptions.h"
|
||||||
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
|
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
|
||||||
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
|
|
||||||
#include "llvm/Transforms/Utils/Cloning.h"
|
|
||||||
#include "llvm/Transforms/Utils/ValueMapper.h"
|
|
||||||
#include <map>
|
#include <map>
|
||||||
#include <set>
|
#include <set>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
#include <tuple>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
using namespace llvm;
|
using namespace llvm;
|
||||||
|
|
||||||
|
@ -115,10 +92,7 @@ llvm::Instruction *GetContextArray(llvm::Instruction *instruction,
|
||||||
BasicBlock &bb = instruction->getParent()->getParent()->getEntryBlock();
|
BasicBlock &bb = instruction->getParent()->getParent()->getEntryBlock();
|
||||||
|
|
||||||
IRBuilder<> builder(&*(bb.getFirstInsertionPt()));
|
IRBuilder<> builder(&*(bb.getFirstInsertionPt()));
|
||||||
Function *FF = instruction->getParent()->getParent();
|
|
||||||
Module *M = instruction->getParent()->getParent()->getParent();
|
Module *M = instruction->getParent()->getParent()->getParent();
|
||||||
LLVMContext &C = M->getContext();
|
|
||||||
const llvm::DataLayout &Layout = M->getDataLayout();
|
|
||||||
|
|
||||||
llvm::Type *elementType;
|
llvm::Type *elementType;
|
||||||
if (isa<AllocaInst>(instruction)) {
|
if (isa<AllocaInst>(instruction)) {
|
||||||
|
@ -129,8 +103,6 @@ llvm::Instruction *GetContextArray(llvm::Instruction *instruction,
|
||||||
}
|
}
|
||||||
|
|
||||||
Type *AllocType = elementType;
|
Type *AllocType = elementType;
|
||||||
AllocaInst *InstCast = dyn_cast<AllocaInst>(instruction);
|
|
||||||
llvm::Value *ItemSize = nullptr;
|
|
||||||
llvm::AllocaInst *Alloca = nullptr;
|
llvm::AllocaInst *Alloca = nullptr;
|
||||||
|
|
||||||
auto block_size_addr = M->getGlobalVariable("block_size");
|
auto block_size_addr = M->getGlobalVariable("block_size");
|
||||||
|
@ -697,9 +669,6 @@ public:
|
||||||
is_single_conditional_branch_block = 1;
|
is_single_conditional_branch_block = 1;
|
||||||
} else {
|
} else {
|
||||||
// generate by replicate local variable
|
// generate by replicate local variable
|
||||||
printf(
|
|
||||||
"[WARNING] match single conditional branch with HARD CODE\n");
|
|
||||||
bool branch_to_intra_init = false;
|
|
||||||
for (unsigned suc = 0; suc < br->getNumSuccessors(); ++suc) {
|
for (unsigned suc = 0; suc < br->getNumSuccessors(); ++suc) {
|
||||||
llvm::BasicBlock *entryCandidate = br->getSuccessor(suc);
|
llvm::BasicBlock *entryCandidate = br->getSuccessor(suc);
|
||||||
auto block_name = entryCandidate->getName().str();
|
auto block_name = entryCandidate->getName().str();
|
||||||
|
@ -755,7 +724,7 @@ public:
|
||||||
entry = entryCandidate;
|
entry = entryCandidate;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
// delete useless PR, those PRs only have branch
|
// delete useless PR, those PRs only have branch instructions
|
||||||
if (entry == exit) {
|
if (entry == exit) {
|
||||||
if (entry->size() == 1 && isa<llvm::BranchInst>(entry->begin())) {
|
if (entry->size() == 1 && isa<llvm::BranchInst>(entry->begin())) {
|
||||||
return;
|
return;
|
|
@ -1,29 +1,10 @@
|
||||||
#include "memory_hierarchy.h"
|
#include "memory_hierarchy.h"
|
||||||
#include "debug.hpp"
|
#include "debug.hpp"
|
||||||
#include "llvm/IR/CFG.h"
|
|
||||||
#include "llvm/IR/Function.h"
|
|
||||||
#include "llvm/IR/GlobalValue.h"
|
|
||||||
#include "llvm/IR/IRBuilder.h"
|
|
||||||
#include "llvm/IR/Instructions.h"
|
#include "llvm/IR/Instructions.h"
|
||||||
#include "llvm/IR/LLVMContext.h"
|
|
||||||
#include "llvm/IR/Module.h"
|
|
||||||
#include "llvm/Transforms/Utils/Cloning.h"
|
|
||||||
#include "llvm/Transforms/Utils/ValueMapper.h"
|
|
||||||
#include <assert.h>
|
|
||||||
#include <fstream>
|
|
||||||
#include <iostream>
|
|
||||||
#include <map>
|
#include <map>
|
||||||
#include <set>
|
#include <set>
|
||||||
#include <sstream>
|
|
||||||
#include <tuple>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
void mem_share2global(llvm::Module *M) {
|
void mem_share2global(llvm::Module *M) {
|
||||||
LLVMContext *C = &M->getContext();
|
|
||||||
llvm::Type *Int32T = Type::getInt32Ty(*C);
|
|
||||||
llvm::Type *Int64T = Type::getInt64Ty(*C);
|
|
||||||
llvm::Type *Int8T = Type::getInt8Ty(*C);
|
|
||||||
|
|
||||||
std::map<GlobalVariable *, GlobalVariable *> corresponding_global_memory;
|
std::map<GlobalVariable *, GlobalVariable *> corresponding_global_memory;
|
||||||
std::set<llvm::Instruction *> need_remove;
|
std::set<llvm::Instruction *> need_remove;
|
||||||
std::set<GlobalVariable *> need_remove_share_memory;
|
std::set<GlobalVariable *> need_remove_share_memory;
|
||||||
|
@ -45,7 +26,6 @@ void mem_share2global(llvm::Module *M) {
|
||||||
// generate global type pointer
|
// generate global type pointer
|
||||||
PointerType *PointerTy =
|
PointerType *PointerTy =
|
||||||
PointerType::get(array_type->getElementType(), 0);
|
PointerType::get(array_type->getElementType(), 0);
|
||||||
llvm::Constant *x1 = ConstantPointerNull::get(PointerTy);
|
|
||||||
llvm::GlobalVariable *global_ptr = new llvm::GlobalVariable(
|
llvm::GlobalVariable *global_ptr = new llvm::GlobalVariable(
|
||||||
*M, PointerTy, false, llvm::GlobalValue::ExternalLinkage,
|
*M, PointerTy, false, llvm::GlobalValue::ExternalLinkage,
|
||||||
NULL, "dynamic_shared_memory", NULL,
|
NULL, "dynamic_shared_memory", NULL,
|
||||||
|
@ -75,7 +55,7 @@ void mem_share2global(llvm::Module *M) {
|
||||||
std::pair<GlobalVariable *, GlobalVariable *>(share_memory,
|
std::pair<GlobalVariable *, GlobalVariable *>(share_memory,
|
||||||
global_memory));
|
global_memory));
|
||||||
} else if (element_type->isFloatTy()) {
|
} else if (element_type->isFloatTy()) {
|
||||||
auto FP_type = llvm::Type::getFloatTy(*C);
|
auto FP_type = llvm::Type::getFloatTy(M->getContext());
|
||||||
auto zero = llvm::ConstantFP::get(FP_type, 0);
|
auto zero = llvm::ConstantFP::get(FP_type, 0);
|
||||||
llvm::GlobalVariable *global_memory = new llvm::GlobalVariable(
|
llvm::GlobalVariable *global_memory = new llvm::GlobalVariable(
|
||||||
*M, FP_type, false, llvm::GlobalValue::ExternalLinkage, zero,
|
*M, FP_type, false, llvm::GlobalValue::ExternalLinkage, zero,
|
||||||
|
@ -128,11 +108,6 @@ void mem_share2global(llvm::Module *M) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void mem_constant2global(llvm::Module *M, std::ofstream &fout) {
|
void mem_constant2global(llvm::Module *M, std::ofstream &fout) {
|
||||||
LLVMContext *C = &M->getContext();
|
|
||||||
llvm::Type *Int32T = Type::getInt32Ty(*C);
|
|
||||||
llvm::Type *Int64T = Type::getInt64Ty(*C);
|
|
||||||
llvm::Type *Int8T = Type::getInt8Ty(*C);
|
|
||||||
|
|
||||||
std::map<GlobalVariable *, GlobalVariable *> corresponding_global_memory;
|
std::map<GlobalVariable *, GlobalVariable *> corresponding_global_memory;
|
||||||
std::set<llvm::Instruction *> need_remove;
|
std::set<llvm::Instruction *> need_remove;
|
||||||
std::set<GlobalVariable *> need_remove_constant_memory;
|
std::set<GlobalVariable *> need_remove_constant_memory;
|
||||||
|
@ -142,7 +117,7 @@ void mem_constant2global(llvm::Module *M, std::ofstream &fout) {
|
||||||
if (GlobalVariable *constant_memory = dyn_cast<GlobalVariable>(I)) {
|
if (GlobalVariable *constant_memory = dyn_cast<GlobalVariable>(I)) {
|
||||||
if (auto PT = dyn_cast<PointerType>(I->getType())) {
|
if (auto PT = dyn_cast<PointerType>(I->getType())) {
|
||||||
unsigned AS = PT->getAddressSpace();
|
unsigned AS = PT->getAddressSpace();
|
||||||
if (AS == 4) { // find a share memory
|
if (AS == 4) { // find a constant memory
|
||||||
need_remove_constant_memory.insert(constant_memory);
|
need_remove_constant_memory.insert(constant_memory);
|
||||||
// generate the corresponding global memory variable
|
// generate the corresponding global memory variable
|
||||||
auto new_name = "wrapper_global_" + constant_memory->getName().str();
|
auto new_name = "wrapper_global_" + constant_memory->getName().str();
|
||||||
|
@ -150,7 +125,7 @@ void mem_constant2global(llvm::Module *M, std::ofstream &fout) {
|
||||||
if (auto array_type = dyn_cast<ArrayType>(element_type)) {
|
if (auto array_type = dyn_cast<ArrayType>(element_type)) {
|
||||||
if (constant_memory->hasExternalLinkage() &&
|
if (constant_memory->hasExternalLinkage() &&
|
||||||
array_type->getArrayNumElements() == 0) {
|
array_type->getArrayNumElements() == 0) {
|
||||||
// external shared memory of []
|
// external constant memory of []
|
||||||
// generate global type pointer
|
// generate global type pointer
|
||||||
PointerType *PointerTy =
|
PointerType *PointerTy =
|
||||||
PointerType::get(array_type->getElementType(), 0);
|
PointerType::get(array_type->getElementType(), 0);
|
|
@ -1,43 +1,13 @@
|
||||||
#include "performance.h"
|
#include "performance.h"
|
||||||
#include "debug.hpp"
|
#include "debug.hpp"
|
||||||
#include "tool.h"
|
#include "tool.h"
|
||||||
#include "llvm/ADT/Statistic.h"
|
|
||||||
#include "llvm/ADT/StringRef.h"
|
|
||||||
#include "llvm/ADT/Triple.h"
|
|
||||||
#include "llvm/Analysis/LoopInfo.h"
|
|
||||||
#include "llvm/Analysis/LoopPass.h"
|
|
||||||
#include "llvm/Analysis/PostDominators.h"
|
|
||||||
#include "llvm/Analysis/TargetLibraryInfo.h"
|
#include "llvm/Analysis/TargetLibraryInfo.h"
|
||||||
#include "llvm/Analysis/TargetTransformInfo.h"
|
#include "llvm/Analysis/TargetTransformInfo.h"
|
||||||
#include "llvm/CodeGen/MachineModuleInfo.h"
|
|
||||||
#include "llvm/IR/CFG.h"
|
|
||||||
#include "llvm/IR/DataLayout.h"
|
|
||||||
#include "llvm/IR/Function.h"
|
|
||||||
#include "llvm/IR/GlobalValue.h"
|
|
||||||
#include "llvm/IR/IRBuilder.h"
|
|
||||||
#include "llvm/IR/InlineAsm.h"
|
|
||||||
#include "llvm/IR/Instructions.h"
|
|
||||||
#include "llvm/IR/LLVMContext.h"
|
|
||||||
#include "llvm/IR/LegacyPassManager.h"
|
#include "llvm/IR/LegacyPassManager.h"
|
||||||
#include "llvm/IR/Module.h"
|
|
||||||
#include "llvm/IR/ValueSymbolTable.h"
|
|
||||||
#include "llvm/InitializePasses.h"
|
|
||||||
#include "llvm/MC/TargetRegistry.h"
|
#include "llvm/MC/TargetRegistry.h"
|
||||||
#include "llvm/PassInfo.h"
|
|
||||||
#include "llvm/PassRegistry.h"
|
|
||||||
#include "llvm/Support/CommandLine.h"
|
|
||||||
#include "llvm/Support/Host.h"
|
#include "llvm/Support/Host.h"
|
||||||
#include "llvm/Target/TargetMachine.h"
|
#include "llvm/Target/TargetMachine.h"
|
||||||
#include "llvm/Target/TargetOptions.h"
|
|
||||||
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
|
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
|
||||||
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
|
|
||||||
#include "llvm/Transforms/Utils/Cloning.h"
|
|
||||||
#include "llvm/Transforms/Utils/ValueMapper.h"
|
|
||||||
#include <map>
|
|
||||||
#include <set>
|
|
||||||
#include <sstream>
|
|
||||||
#include <tuple>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
using namespace llvm;
|
using namespace llvm;
|
||||||
|
|
||||||
|
@ -53,7 +23,7 @@ void performance_optimization(llvm::Module *M) {
|
||||||
llvm::legacy::PassManager Passes;
|
llvm::legacy::PassManager Passes;
|
||||||
|
|
||||||
// add target machine info
|
// add target machine info
|
||||||
llvm::Triple triple("x86_64-unknown-linux-gnu");
|
llvm::Triple triple(llvm::sys::getProcessTriple());
|
||||||
|
|
||||||
std::string Error;
|
std::string Error;
|
||||||
const Target *TheTarget = TargetRegistry::lookupTarget("", triple, Error);
|
const Target *TheTarget = TargetRegistry::lookupTarget("", triple, Error);
|
||||||
|
@ -62,7 +32,7 @@ void performance_optimization(llvm::Module *M) {
|
||||||
Options.FloatABIType = FloatABI::Hard;
|
Options.FloatABIType = FloatABI::Hard;
|
||||||
|
|
||||||
TargetMachine *TM = TheTarget->createTargetMachine(
|
TargetMachine *TM = TheTarget->createTargetMachine(
|
||||||
triple.getTriple(), llvm::sys::getHostCPUName().str(), StringRef("+m,+f"),
|
triple.getTriple(), llvm::sys::getHostCPUName().str(), StringRef(""),
|
||||||
Options, Reloc::PIC_, CodeModel::Small, CodeGenOpt::Aggressive);
|
Options, Reloc::PIC_, CodeModel::Small, CodeGenOpt::Aggressive);
|
||||||
assert(TM && "No Machine Information\n");
|
assert(TM && "No Machine Information\n");
|
||||||
|
|
||||||
|
@ -80,9 +50,6 @@ void performance_optimization(llvm::Module *M) {
|
||||||
Builder.LoopVectorize = true;
|
Builder.LoopVectorize = true;
|
||||||
Builder.SLPVectorize = true;
|
Builder.SLPVectorize = true;
|
||||||
|
|
||||||
Builder.VerifyInput = true;
|
|
||||||
Builder.VerifyOutput = true;
|
|
||||||
|
|
||||||
Builder.populateModulePassManager(Passes);
|
Builder.populateModulePassManager(Passes);
|
||||||
Passes.run(*M);
|
Passes.run(*M);
|
||||||
}
|
}
|
|
@ -1,29 +1,13 @@
|
||||||
#include "tool.h"
|
#include "tool.h"
|
||||||
#include "debug.hpp"
|
#include "debug.hpp"
|
||||||
#include "llvm/Bitcode/BitcodeWriter.h"
|
#include "llvm/Bitcode/BitcodeWriter.h"
|
||||||
#include "llvm/Config/llvm-config.h"
|
|
||||||
#include "llvm/IR/Constants.h"
|
|
||||||
#include "llvm/IR/Function.h"
|
|
||||||
#include "llvm/IR/GlobalValue.h"
|
|
||||||
#include "llvm/IR/GlobalVariable.h"
|
|
||||||
#include "llvm/IR/IRBuilder.h"
|
|
||||||
#include "llvm/IR/InlineAsm.h"
|
#include "llvm/IR/InlineAsm.h"
|
||||||
#include "llvm/IR/Instructions.h"
|
|
||||||
#include "llvm/IR/LLVMContext.h"
|
|
||||||
#include "llvm/IR/Module.h"
|
|
||||||
#include "llvm/IR/Verifier.h"
|
#include "llvm/IR/Verifier.h"
|
||||||
#include "llvm/IRReader/IRReader.h"
|
#include "llvm/IRReader/IRReader.h"
|
||||||
#include "llvm/Support/CommandLine.h"
|
|
||||||
#include "llvm/Support/ErrorOr.h"
|
|
||||||
#include "llvm/Support/FileSystem.h"
|
#include "llvm/Support/FileSystem.h"
|
||||||
#include "llvm/Support/ManagedStatic.h"
|
#include "llvm/Support/SourceMgr.h"
|
||||||
#include "llvm/Support/MemoryBuffer.h"
|
|
||||||
#include "llvm/Support/ToolOutputFile.h"
|
#include "llvm/Support/ToolOutputFile.h"
|
||||||
#include "llvm/Support/raw_ostream.h"
|
#include "llvm/Support/raw_ostream.h"
|
||||||
#include "llvm/Transforms/Utils/Cloning.h"
|
|
||||||
#include "llvm/Transforms/Utils/ValueMapper.h"
|
|
||||||
|
|
||||||
#include <iostream>
|
|
||||||
#include <set>
|
#include <set>
|
||||||
|
|
||||||
using namespace llvm;
|
using namespace llvm;
|
||||||
|
@ -133,7 +117,7 @@ llvm::Instruction *BreakPHIToAllocas(PHINode *phi) {
|
||||||
Value *val = phi->getIncomingValue(incoming);
|
Value *val = phi->getIncomingValue(incoming);
|
||||||
BasicBlock *incomingBB = phi->getIncomingBlock(incoming);
|
BasicBlock *incomingBB = phi->getIncomingBlock(incoming);
|
||||||
builder.SetInsertPoint(incomingBB->getTerminator());
|
builder.SetInsertPoint(incomingBB->getTerminator());
|
||||||
llvm::Instruction *store = builder.CreateStore(val, alloca);
|
builder.CreateStore(val, alloca);
|
||||||
}
|
}
|
||||||
builder.SetInsertPoint(phi);
|
builder.SetInsertPoint(phi);
|
||||||
|
|
||||||
|
@ -164,7 +148,6 @@ void phi2alloc(llvm::Module *M) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool changed = false;
|
|
||||||
for (InstructionVec::iterator i = PHIs.begin(); i != PHIs.end(); ++i) {
|
for (InstructionVec::iterator i = PHIs.begin(); i != PHIs.end(); ++i) {
|
||||||
Instruction *instr = *i;
|
Instruction *instr = *i;
|
||||||
BreakPHIToAllocas(dyn_cast<PHINode>(instr));
|
BreakPHIToAllocas(dyn_cast<PHINode>(instr));
|
||||||
|
@ -279,9 +262,7 @@ void replace_built_in_function(llvm::Module *M) {
|
||||||
|
|
||||||
for (auto BB = F->begin(); BB != F->end(); ++BB) {
|
for (auto BB = F->begin(); BB != F->end(); ++BB) {
|
||||||
for (auto BI = BB->begin(); BI != BB->end(); BI++) {
|
for (auto BI = BB->begin(); BI != BB->end(); BI++) {
|
||||||
if (auto Load = dyn_cast<LoadInst>(BI)) {
|
if (auto Call = dyn_cast<CallInst>(BI)) {
|
||||||
auto load_from = Load->getOperand(0);
|
|
||||||
} else if (auto Call = dyn_cast<CallInst>(BI)) {
|
|
||||||
if (Call->getCalledFunction()) {
|
if (Call->getCalledFunction()) {
|
||||||
auto func_name = Call->getCalledFunction()->getName().str();
|
auto func_name = Call->getCalledFunction()->getName().str();
|
||||||
if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.x" ||
|
if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.x" ||
|
||||||
|
@ -425,7 +406,6 @@ void replace_built_in_function(llvm::Module *M) {
|
||||||
if (auto Call = dyn_cast<CallInst>(BI)) {
|
if (auto Call = dyn_cast<CallInst>(BI)) {
|
||||||
if (Call->getCalledFunction()) {
|
if (Call->getCalledFunction()) {
|
||||||
auto func_name = Call->getCalledFunction()->getName().str();
|
auto func_name = Call->getCalledFunction()->getName().str();
|
||||||
auto callFn = Call->getCalledFunction();
|
|
||||||
if (func_name == "vprintf") {
|
if (func_name == "vprintf") {
|
||||||
/*
|
/*
|
||||||
* replace CUDA's printf to C's printf
|
* replace CUDA's printf to C's printf
|
||||||
|
@ -458,7 +438,7 @@ void replace_built_in_function(llvm::Module *M) {
|
||||||
dyn_cast<PointerType>(BC->getOperand(0)->getType());
|
dyn_cast<PointerType>(BC->getOperand(0)->getType());
|
||||||
auto SrcTy = SrcPointTy->getElementType();
|
auto SrcTy = SrcPointTy->getElementType();
|
||||||
// reverse the bitcast
|
// reverse the bitcast
|
||||||
auto reverse_BC = new BitCastInst(BC, SrcPointTy, "", Call);
|
new BitCastInst(BC, SrcPointTy, "", Call);
|
||||||
assert(SrcTy->isStructTy() == 1);
|
assert(SrcTy->isStructTy() == 1);
|
||||||
auto StructTy = dyn_cast<StructType>(SrcTy);
|
auto StructTy = dyn_cast<StructType>(SrcTy);
|
||||||
for (int i = 0; i < StructTy->getNumElements(); i++) {
|
for (int i = 0; i < StructTy->getNumElements(); i++) {
|
||||||
|
@ -528,7 +508,6 @@ void replace_built_in_function(llvm::Module *M) {
|
||||||
|
|
||||||
void replace_asm_call(llvm::Module *M) {
|
void replace_asm_call(llvm::Module *M) {
|
||||||
LLVMContext &context = M->getContext();
|
LLVMContext &context = M->getContext();
|
||||||
auto I32 = llvm::Type::getInt32Ty(context);
|
|
||||||
std::vector<CallInst *> need_remove;
|
std::vector<CallInst *> need_remove;
|
||||||
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||||
Function *F = &(*i);
|
Function *F = &(*i);
|
|
@ -2,15 +2,6 @@
|
||||||
#include "warp_func.h"
|
#include "warp_func.h"
|
||||||
#include "debug.hpp"
|
#include "debug.hpp"
|
||||||
#include "tool.h"
|
#include "tool.h"
|
||||||
#include "llvm/IR/Function.h"
|
|
||||||
#include "llvm/IR/GlobalValue.h"
|
|
||||||
#include "llvm/IR/IRBuilder.h"
|
|
||||||
#include "llvm/IR/Instructions.h"
|
|
||||||
#include "llvm/IR/LLVMContext.h"
|
|
||||||
#include "llvm/IR/Module.h"
|
|
||||||
#include "llvm/Transforms/Utils/Cloning.h"
|
|
||||||
#include "llvm/Transforms/Utils/ValueMapper.h"
|
|
||||||
#include <iostream>
|
|
||||||
#include <set>
|
#include <set>
|
||||||
|
|
||||||
using namespace llvm;
|
using namespace llvm;
|
||||||
|
@ -107,7 +98,7 @@ void handle_warp_vote(llvm::Module *M) {
|
||||||
res = BinaryOperator::CreateNot(res, "", sync_inst);
|
res = BinaryOperator::CreateNot(res, "", sync_inst);
|
||||||
}
|
}
|
||||||
|
|
||||||
auto sotre_mask = new llvm::StoreInst(res, GEP, "", sync_inst);
|
new llvm::StoreInst(res, GEP, "", sync_inst);
|
||||||
// create barrier
|
// create barrier
|
||||||
CreateIntraWarpBarrier(sync_inst);
|
CreateIntraWarpBarrier(sync_inst);
|
||||||
/*
|
/*
|
|
@ -1,186 +0,0 @@
|
||||||
#include "generate_x86_format.h"
|
|
||||||
#include "debug.hpp"
|
|
||||||
#include "tool.h"
|
|
||||||
#include "llvm/Analysis/TargetLibraryInfo.h"
|
|
||||||
#include "llvm/Analysis/TargetTransformInfo.h"
|
|
||||||
#include "llvm/IR/Function.h"
|
|
||||||
#include "llvm/IR/GlobalValue.h"
|
|
||||||
#include "llvm/IR/IRBuilder.h"
|
|
||||||
#include "llvm/IR/Instructions.h"
|
|
||||||
#include "llvm/IR/LLVMContext.h"
|
|
||||||
#include "llvm/IR/LegacyPassManager.h"
|
|
||||||
#include "llvm/IR/Module.h"
|
|
||||||
#include "llvm/IR/Verifier.h"
|
|
||||||
#include "llvm/IRReader/IRReader.h"
|
|
||||||
#include "llvm/MC/TargetRegistry.h"
|
|
||||||
#include "llvm/Support/TargetSelect.h"
|
|
||||||
#include "llvm/Support/ToolOutputFile.h"
|
|
||||||
#include "llvm/Target/TargetMachine.h"
|
|
||||||
#include "llvm/Target/TargetOptions.h"
|
|
||||||
#include "llvm/Transforms/Utils/Cloning.h"
|
|
||||||
#include "llvm/Transforms/Utils/ValueMapper.h"
|
|
||||||
#include <iostream>
|
|
||||||
#include <map>
|
|
||||||
|
|
||||||
using namespace llvm;
|
|
||||||
|
|
||||||
void set_meta_data(llvm::Module *M) {
|
|
||||||
M->setTargetTriple("x86_64-unknown-linux-gnu");
|
|
||||||
M->setDataLayout(
|
|
||||||
"e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128");
|
|
||||||
}
|
|
||||||
|
|
||||||
// as pthread only accept a single void* for input
|
|
||||||
// we have to decode this input inside the kernel
|
|
||||||
void decode_input(llvm::Module *M) {
|
|
||||||
|
|
||||||
std::set<llvm::Function *> need_remove;
|
|
||||||
|
|
||||||
LLVMContext *C = &M->getContext();
|
|
||||||
llvm::Type *Int32T = Type::getInt32Ty(*C);
|
|
||||||
llvm::Type *Int8T = Type::getInt8Ty(*C);
|
|
||||||
|
|
||||||
llvm::FunctionType *LauncherFuncT = FunctionType::get(
|
|
||||||
Type::getVoidTy(*C), {PointerType::get(Int8T, 0)}, false);
|
|
||||||
|
|
||||||
std::set<GlobalVariable *> dynmaic_memory;
|
|
||||||
|
|
||||||
std::map<GlobalVariable *, Value *> corres_dynamic_memory_load_address;
|
|
||||||
|
|
||||||
// generate Wrapper Function type
|
|
||||||
// now we only support a single int32*
|
|
||||||
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
|
||||||
Function *F = &(*i);
|
|
||||||
if (!isKernelFunction(M, F))
|
|
||||||
continue;
|
|
||||||
auto func_name = F->getName().str();
|
|
||||||
// remove mangle prefix
|
|
||||||
// remove _Z24
|
|
||||||
for (int pos = 2; pos < func_name.length(); pos++) {
|
|
||||||
if (func_name[pos] >= '0' && func_name[pos] <= '9')
|
|
||||||
continue;
|
|
||||||
func_name = func_name.substr(pos);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
llvm::IRBuilder<> Builder(M->getContext());
|
|
||||||
|
|
||||||
FunctionCallee fc =
|
|
||||||
M->getOrInsertFunction(func_name + "_wrapper", LauncherFuncT);
|
|
||||||
Function *WorkGroup = dyn_cast<Function>(fc.getCallee());
|
|
||||||
|
|
||||||
BasicBlock *Block = BasicBlock::Create(M->getContext(), "", WorkGroup);
|
|
||||||
Builder.SetInsertPoint(Block);
|
|
||||||
|
|
||||||
// WorkGroup has only a single input
|
|
||||||
Function::arg_iterator ai = WorkGroup->arg_begin();
|
|
||||||
|
|
||||||
SmallVector<Value *, 8> Arguments;
|
|
||||||
Value *input_arg = &*ai;
|
|
||||||
// convert to int**
|
|
||||||
input_arg = Builder.CreateBitOrPointerCast(
|
|
||||||
input_arg, PointerType::get(PointerType::get(Int32T, 0), 0));
|
|
||||||
|
|
||||||
// dynamic memory load in the wrapper function
|
|
||||||
GlobalVariable *share_memory = M->getGlobalVariable("wrapper_global_data");
|
|
||||||
if (share_memory != NULL) {
|
|
||||||
dynmaic_memory.insert(share_memory);
|
|
||||||
llvm::GlobalVariable *global_mem = new llvm::GlobalVariable(
|
|
||||||
*M, Int32T, false, llvm::GlobalValue::ExternalLinkage, NULL,
|
|
||||||
"thread_memory_size", NULL, llvm::GlobalValue::GeneralDynamicTLSModel,
|
|
||||||
0, false);
|
|
||||||
Value *loadedValue = createLoad(Builder, global_mem);
|
|
||||||
|
|
||||||
llvm::FunctionType *LaunchFun2 = FunctionType::get(
|
|
||||||
PointerType::get(PointerType::get(Int32T, 0), 0), NULL);
|
|
||||||
|
|
||||||
FunctionCallee fc2 =
|
|
||||||
M->getOrInsertFunction("_wrapper_global_data", LaunchFun2);
|
|
||||||
|
|
||||||
Function *WorkGroup2 = dyn_cast<Function>(fc2.getCallee());
|
|
||||||
|
|
||||||
WorkGroup2->setLinkage(GlobalValue::WeakODRLinkage);
|
|
||||||
WorkGroup2->setVisibility(GlobalValue::HiddenVisibility);
|
|
||||||
Comdat *co = M->getOrInsertComdat("_wrapper_global_data");
|
|
||||||
co->setSelectionKind(Comdat::SelectionKind::Any);
|
|
||||||
WorkGroup2->setComdat(co);
|
|
||||||
|
|
||||||
BasicBlock *Block2 = BasicBlock::Create(M->getContext(), "", WorkGroup2);
|
|
||||||
|
|
||||||
llvm::IRBuilder<> Builder2(M->getContext());
|
|
||||||
Builder2.SetInsertPoint(Block2);
|
|
||||||
Builder2.CreateRet(share_memory);
|
|
||||||
|
|
||||||
auto PT = dyn_cast<PointerType>(share_memory->getType());
|
|
||||||
auto element_type = PT->getElementType();
|
|
||||||
|
|
||||||
AllocaInst *new_arr = Builder.CreateAlloca(Int8T, loadedValue, "new_arr");
|
|
||||||
Value *new_ar = new_arr;
|
|
||||||
Value *gptr = Builder.CreateBitOrPointerCast(
|
|
||||||
share_memory, PointerType::get(PointerType::get(Int8T, 0), 0));
|
|
||||||
|
|
||||||
Builder.CreateStore(new_ar, gptr);
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t idx = 0;
|
|
||||||
for (Function::const_arg_iterator ii = F->arg_begin(), ee = F->arg_end();
|
|
||||||
ii != ee; ++ii) {
|
|
||||||
Type *ArgType = ii->getType();
|
|
||||||
|
|
||||||
// calculate addr
|
|
||||||
Value *GEP = createGEP(Builder, input_arg, ConstantInt::get(Int32T, idx));
|
|
||||||
// load corresponding int*
|
|
||||||
GEP = createLoad(Builder, GEP);
|
|
||||||
// bitcast
|
|
||||||
GEP = Builder.CreateBitOrPointerCast(GEP, PointerType::get(ArgType, 0));
|
|
||||||
Value *Arg = createLoad(Builder, GEP);
|
|
||||||
Arguments.push_back(Arg);
|
|
||||||
++idx;
|
|
||||||
}
|
|
||||||
|
|
||||||
CallInst *c = Builder.CreateCall(F, ArrayRef<llvm::Value *>(Arguments));
|
|
||||||
Builder.CreateRetVoid();
|
|
||||||
}
|
|
||||||
for (auto f : need_remove) {
|
|
||||||
f->dropAllReferences();
|
|
||||||
f->eraseFromParent();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void remove_barrier(llvm::Module *M) {
|
|
||||||
std::vector<Instruction *> need_remove;
|
|
||||||
for (auto F = M->begin(); F != M->end(); ++F)
|
|
||||||
for (auto BB = F->begin(); BB != F->end(); ++BB) {
|
|
||||||
for (auto BI = BB->begin(); BI != BB->end(); BI++) {
|
|
||||||
if (auto Call = dyn_cast<CallInst>(BI)) {
|
|
||||||
if (Call->isInlineAsm())
|
|
||||||
continue;
|
|
||||||
auto func_name = Call->getCalledFunction()->getName().str();
|
|
||||||
if (func_name == "llvm.nvvm.bar.warp.sync" ||
|
|
||||||
func_name == "llvm.nvvm.barrier0" ||
|
|
||||||
func_name == "llvm.nvvm.barrier.sync") {
|
|
||||||
need_remove.push_back(Call);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for (auto inst : need_remove) {
|
|
||||||
inst->eraseFromParent();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void remove_useless_var(llvm::Module *M) {
|
|
||||||
M->getGlobalVariable("intra_warp_index")->eraseFromParent();
|
|
||||||
M->getGlobalVariable("inter_warp_index")->eraseFromParent();
|
|
||||||
}
|
|
||||||
|
|
||||||
void generate_x86_format(llvm::Module *M) {
|
|
||||||
DEBUG_INFO("generate x86 format\n");
|
|
||||||
// change metadata
|
|
||||||
set_meta_data(M);
|
|
||||||
// decode argument
|
|
||||||
decode_input(M);
|
|
||||||
// remove barrier
|
|
||||||
remove_barrier(M);
|
|
||||||
// remove useless func/variable
|
|
||||||
remove_useless_var(M);
|
|
||||||
}
|
|
|
@ -1,9 +1,9 @@
|
||||||
cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
|
cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
|
||||||
project(
|
project(
|
||||||
X86runtime
|
CPUruntime
|
||||||
DESCRIPTION "Implementation CUDA runtime API with x86"
|
DESCRIPTION "Implementation CUDA runtime API with CPUs"
|
||||||
LANGUAGES CXX)
|
LANGUAGES CXX)
|
||||||
set(LIB_NAME x86Runtime)
|
set(LIB_NAME CPUruntime)
|
||||||
set(CMAKE_VERBOSE_MAKEFILE ON)
|
set(CMAKE_VERBOSE_MAKEFILE ON)
|
||||||
|
|
||||||
# compile threadPool implementation
|
# compile threadPool implementation
|
||||||
|
@ -12,9 +12,9 @@ add_subdirectory(threadPool)
|
||||||
# compile x86 runtime library
|
# compile x86 runtime library
|
||||||
include_directories(../common)
|
include_directories(../common)
|
||||||
include_directories(./include/)
|
include_directories(./include/)
|
||||||
include_directories(./include/x86)
|
include_directories(./include/cpu)
|
||||||
include_directories(./threadPool/include/)
|
include_directories(./threadPool/include/)
|
||||||
include_directories(./threadPool/include/x86)
|
include_directories(./threadPool/include/cpu)
|
||||||
include_directories(../external/moodycamel/)
|
include_directories(../external/moodycamel/)
|
||||||
file(GLOB proj_SOURCES "src/x86/*.cpp")
|
file(GLOB proj_SOURCES "src/cpu/*.cpp")
|
||||||
add_library(${LIB_NAME} SHARED ${proj_SOURCES})
|
add_library(${LIB_NAME} SHARED ${proj_SOURCES})
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
#ifndef __RUNTIME_IMPL__
|
#ifndef __KERNEL_IMPL__
|
||||||
#define __KERNEL_IMPL__
|
#define __KERNEL_IMPL__
|
||||||
#include "structures.h"
|
#include "structures.h"
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
|
@ -10,27 +10,31 @@
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
cudaError_t cudaGetDevice(int *devPtr) {
|
cudaError_t cudaGetDevice(int *devPtr) {
|
||||||
*devPtr = 0;
|
*devPtr = 0;
|
||||||
return cudaSuccess;
|
return cudaSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
const char *cudaGetErrorName(cudaError_t error) { return "SUCCESS\n"; }
|
const char *cudaGetErrorName(cudaError_t error) { return "SUCCESS\n"; }
|
||||||
cudaError_t cudaDeviceReset(void) {
|
|
||||||
scheduler_uninit();
|
cudaError_t cudaDeviceReset(void) { return cudaSuccess; }
|
||||||
return cudaSuccess;
|
|
||||||
}
|
|
||||||
cudaError_t cudaDeviceSynchronize(void) {
|
cudaError_t cudaDeviceSynchronize(void) {
|
||||||
cuSynchronizeBarrier();
|
cuSynchronizeBarrier();
|
||||||
return cudaSuccess;
|
return cudaSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
cudaError_t cudaThreadSynchronize(void) {
|
cudaError_t cudaThreadSynchronize(void) {
|
||||||
cuSynchronizeBarrier();
|
cuSynchronizeBarrier();
|
||||||
return cudaSuccess;
|
return cudaSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
cudaError_t cudaFree(void *devPtr) {
|
cudaError_t cudaFree(void *devPtr) {
|
||||||
free(devPtr);
|
free(devPtr);
|
||||||
return cudaSuccess;
|
return cudaSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
cudaError_t cudaFreeHost(void *devPtr) {
|
cudaError_t cudaFreeHost(void *devPtr) {
|
||||||
free(devPtr);
|
free(devPtr);
|
||||||
return cudaSuccess;
|
return cudaSuccess;
|
||||||
|
@ -47,20 +51,22 @@ cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
|
||||||
cu_kernel *ker =
|
cu_kernel *ker =
|
||||||
create_kernel(func, gridDim, blockDim, args, sharedMem, stream);
|
create_kernel(func, gridDim, blockDim, args, sharedMem, stream);
|
||||||
|
|
||||||
int lstatus = cuLaunchKernel(&ker);
|
cuLaunchKernel(&ker);
|
||||||
|
|
||||||
return cudaSuccess;
|
return cudaSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
cudaError_t cudaMalloc(void **devPtr, size_t size) {
|
cudaError_t cudaMalloc(void **devPtr, size_t size) {
|
||||||
*devPtr = malloc(size);
|
*devPtr = malloc(size);
|
||||||
if (devPtr == NULL)
|
if (devPtr == NULL)
|
||||||
return cudaErrorMemoryAllocation;
|
return cudaErrorMemoryAllocation;
|
||||||
return cudaSuccess;
|
return cudaSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
cudaError_t cudaMemset(void *devPtr, int value, size_t count) {
|
cudaError_t cudaMemset(void *devPtr, int value, size_t count) {
|
||||||
memset(devPtr, value, count);
|
memset(devPtr, value, count);
|
||||||
return cudaSuccess;
|
return cudaSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
cudaError_t cudaMemcpy(void *dst, const void *src, size_t count,
|
cudaError_t cudaMemcpy(void *dst, const void *src, size_t count,
|
||||||
cudaMemcpyKind kind) {
|
cudaMemcpyKind kind) {
|
||||||
if (kind == cudaMemcpyHostToHost) {
|
if (kind == cudaMemcpyHostToHost) {
|
||||||
|
@ -105,7 +111,6 @@ cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src) {
|
||||||
return cudaSuccess;
|
return cudaSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int stream_counter = 1;
|
|
||||||
/*
|
/*
|
||||||
From our evaluation, CPU backend can gain little benefit
|
From our evaluation, CPU backend can gain little benefit
|
||||||
from multi stream. Thus, we only use single stream
|
from multi stream. Thus, we only use single stream
|
||||||
|
@ -159,6 +164,8 @@ static cudaError_t lastError = cudaSuccess;
|
||||||
const char *cudaGetErrorString(cudaError_t error) {
|
const char *cudaGetErrorString(cudaError_t error) {
|
||||||
if (error == cudaSuccess) {
|
if (error == cudaSuccess) {
|
||||||
return "Cuda Get Error Success";
|
return "Cuda Get Error Success";
|
||||||
|
} else {
|
||||||
|
return "Cuda Get Error Failed";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -10,11 +10,10 @@ set(CMAKE_VERBOSE_MAKEFILE ON)
|
||||||
set(LIB_NAME threadPool)
|
set(LIB_NAME threadPool)
|
||||||
|
|
||||||
set(CMAKE_CXX_STANDARD 14)
|
set(CMAKE_CXX_STANDARD 14)
|
||||||
set(CMAKE_BUILD_TYPE Debug)
|
|
||||||
include_directories(../../common)
|
include_directories(../../common)
|
||||||
include_directories(./include)
|
include_directories(./include)
|
||||||
include_directories(./include/x86)
|
include_directories(./include/cpu)
|
||||||
include_directories(../../external/moodycamel)
|
include_directories(../../external/moodycamel)
|
||||||
|
|
||||||
file(GLOB proj_SOURCES "src/x86/*.cpp")
|
file(GLOB proj_SOURCES "src/cpu/*.cpp")
|
||||||
add_library(${LIB_NAME} SHARED ${proj_SOURCES})
|
add_library(${LIB_NAME} SHARED ${proj_SOURCES})
|
||||||
|
|
|
@ -1,3 +1,19 @@
|
||||||
|
/*
|
||||||
|
This file contains the implementation of the CPU thread pool. For a kernel
|
||||||
|
launch, the host thread will enqueue the kernel to the kernelQueue, and the
|
||||||
|
threads in the thread pool will try to fetch work from the queue. After a
|
||||||
|
thread fetches a kernel from the queue, it will execute the kernel. After the
|
||||||
|
kernel execution, the thread will try to fetch another kernel from the queue.
|
||||||
|
If the queue is empty, the thread will wait for the next kernel launch.
|
||||||
|
|
||||||
|
By default, we try to use all CPU cores for execution. Thus, for a kernel
|
||||||
|
launch, the host thread pushes P kernel variables to the queue, where P is the
|
||||||
|
number of CPU cores.
|
||||||
|
|
||||||
|
For some lightweight kernels, using fewer CPU cores can reduce the overall
|
||||||
|
execution time, because fewer CPU cores lead to lower synchronization overhead.
|
||||||
|
*/
|
||||||
|
|
||||||
#include "api.h"
|
#include "api.h"
|
||||||
#include "blockingconcurrentqueue.h"
|
#include "blockingconcurrentqueue.h"
|
||||||
#include "debug.hpp"
|
#include "debug.hpp"
|
||||||
|
@ -9,9 +25,6 @@
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <thread>
|
#include <thread>
|
||||||
|
|
||||||
/*
|
|
||||||
Initialize the device
|
|
||||||
*/
|
|
||||||
int device_max_compute_units = 1;
|
int device_max_compute_units = 1;
|
||||||
bool device_initilized = false;
|
bool device_initilized = false;
|
||||||
int init_device() {
|
int init_device() {
|
||||||
|
@ -32,7 +45,6 @@ int init_device() {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create Kernel
|
// Create Kernel
|
||||||
static int kernelIds = 0;
|
|
||||||
cu_kernel *create_kernel(const void *func, dim3 gridDim, dim3 blockDim,
|
cu_kernel *create_kernel(const void *func, dim3 gridDim, dim3 blockDim,
|
||||||
void **args, size_t sharedMem, cudaStream_t stream) {
|
void **args, size_t sharedMem, cudaStream_t stream) {
|
||||||
cu_kernel *ker = (cu_kernel *)calloc(1, sizeof(cu_kernel));
|
cu_kernel *ker = (cu_kernel *)calloc(1, sizeof(cu_kernel));
|
||||||
|
@ -71,13 +83,11 @@ __thread int warp_shfl[32] = {
|
||||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
};
|
};
|
||||||
|
|
||||||
/*
|
|
||||||
Enqueue Kernel (k) to the scheduler kernelQueue
|
|
||||||
*/
|
|
||||||
int TaskToExecute;
|
int TaskToExecute;
|
||||||
|
// Enqueue Kernel to the scheduler kernelQueue
|
||||||
int schedulerEnqueueKernel(cu_kernel *k) {
|
int schedulerEnqueueKernel(cu_kernel *k) {
|
||||||
int totalBlocks =
|
int totalBlocks = k->totalBlocks;
|
||||||
k->totalBlocks; // calculate gpu_block_to_execute_per_cpu_thread
|
// by default, all CPU cores are used to execute GPU blocks equally
|
||||||
int gpuBlockToExecutePerCpuThread =
|
int gpuBlockToExecutePerCpuThread =
|
||||||
(totalBlocks + device_max_compute_units - 1) / device_max_compute_units;
|
(totalBlocks + device_max_compute_units - 1) / device_max_compute_units;
|
||||||
TaskToExecute = (totalBlocks + gpuBlockToExecutePerCpuThread - 1) /
|
TaskToExecute = (totalBlocks + gpuBlockToExecutePerCpuThread - 1) /
|
||||||
|
@ -93,28 +103,21 @@ int schedulerEnqueueKernel(cu_kernel *k) {
|
||||||
return C_SUCCESS;
|
return C_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
// Push kernel to the kernelQueue
|
||||||
Kernel Launch with numBlocks and numThreadsPerBlock
|
|
||||||
*/
|
|
||||||
int cuLaunchKernel(cu_kernel **k) {
|
int cuLaunchKernel(cu_kernel **k) {
|
||||||
if (!device_initilized) {
|
if (!device_initilized) {
|
||||||
init_device();
|
init_device();
|
||||||
}
|
}
|
||||||
// Calculate Block Size N/numBlocks
|
|
||||||
cu_kernel *ker = *k;
|
|
||||||
int status = C_RUN;
|
|
||||||
// set complete to false, this variable is used for sync
|
// set complete to false, this variable is used for sync
|
||||||
for (int i = 0; i < scheduler->num_worker_threads; i++) {
|
for (int i = 0; i < scheduler->num_worker_threads; i++) {
|
||||||
scheduler->thread_pool[i].completeTask = 0;
|
scheduler->thread_pool[i].completeTask = 0;
|
||||||
}
|
}
|
||||||
schedulerEnqueueKernel(ker);
|
schedulerEnqueueKernel(*k);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
// threads in thread-pool try to fetch work from the queue
|
||||||
Thread Gets Work
|
|
||||||
*/
|
|
||||||
int get_work(c_thread *th) {
|
int get_work(c_thread *th) {
|
||||||
int dynamic_shared_mem_size = 0;
|
int dynamic_shared_mem_size = 0;
|
||||||
dim3 gridDim;
|
dim3 gridDim;
|
||||||
|
@ -136,6 +139,7 @@ int get_work(c_thread *th) {
|
||||||
grid_size_x = gridDim.x;
|
grid_size_x = gridDim.x;
|
||||||
grid_size_y = gridDim.y;
|
grid_size_y = gridDim.y;
|
||||||
grid_size_z = gridDim.z;
|
grid_size_z = gridDim.z;
|
||||||
|
// allocate dynamic shared memory
|
||||||
if (dynamic_shared_mem_size > 0)
|
if (dynamic_shared_mem_size > 0)
|
||||||
dynamic_shared_memory = (int *)malloc(dynamic_shared_mem_size);
|
dynamic_shared_memory = (int *)malloc(dynamic_shared_mem_size);
|
||||||
// execute GPU blocks
|
// execute GPU blocks
|
||||||
|
@ -153,7 +157,8 @@ int get_work(c_thread *th) {
|
||||||
}
|
}
|
||||||
// if cannot get tasks, check whether programs stop
|
// if cannot get tasks, check whether programs stop
|
||||||
if (scheduler->threadpool_shutdown_requested) {
|
if (scheduler->threadpool_shutdown_requested) {
|
||||||
return true; // thread exit
|
// thread exit
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -176,9 +181,7 @@ void *driver_thread(void *p) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
// Initialize the scheduler
|
||||||
Initialize the scheduler
|
|
||||||
*/
|
|
||||||
int scheduler_init(cu_device device) {
|
int scheduler_init(cu_device device) {
|
||||||
scheduler = (cu_pool *)calloc(1, sizeof(cu_pool));
|
scheduler = (cu_pool *)calloc(1, sizeof(cu_pool));
|
||||||
scheduler->num_worker_threads = device.max_compute_units;
|
scheduler->num_worker_threads = device.max_compute_units;
|
||||||
|
@ -198,8 +201,6 @@ int scheduler_init(cu_device device) {
|
||||||
return C_SUCCESS;
|
return C_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
void scheduler_uninit() { assert(0 && "Scheduler Unitit no Implemente\n"); }
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
Barrier for Kernel Launch
|
Barrier for Kernel Launch
|
||||||
*/
|
*/
|
|
@ -30,7 +30,7 @@ g++ -o $1 -fPIC -no-pie \
|
||||||
$HeteroMark_PATH/src/$1/cuda/main.cc host.o kernel.o $HeteroMark_PATH/src/$1/*.cc $HeteroMark_PATH/src/common/benchmark/*.cc \
|
$HeteroMark_PATH/src/$1/cuda/main.cc host.o kernel.o $HeteroMark_PATH/src/$1/*.cc $HeteroMark_PATH/src/common/benchmark/*.cc \
|
||||||
$HeteroMark_PATH/src/common/command_line_option/*.cc $HeteroMark_PATH/src/common/time_measurement/*.cc \
|
$HeteroMark_PATH/src/common/command_line_option/*.cc $HeteroMark_PATH/src/common/time_measurement/*.cc \
|
||||||
-L$CuPBoP_BUILD_PATH/runtime -L$CuPBoP_BUILD_PATH/runtime/threadPool \
|
-L$CuPBoP_BUILD_PATH/runtime -L$CuPBoP_BUILD_PATH/runtime/threadPool \
|
||||||
-I$HeteroMark_PATH -I$CUDA_PATH/include -lpthread -lc -lx86Runtime -lthreadPool
|
-I$HeteroMark_PATH -I$CUDA_PATH/include -lpthread -lc -lCPUruntime -lthreadPool
|
||||||
|
|
||||||
case $1 in
|
case $1 in
|
||||||
aes)
|
aes)
|
||||||
|
|
Loading…
Reference in New Issue