Refactoring the codebase. Remove useless variables; Add comments; Remove useless header files; Remove hard code and support both x86 and ARM CPU
This commit is contained in:
parent
50d615da64
commit
fd56811650
|
@ -51,7 +51,7 @@ jobs:
|
|||
hostTranslator reverse-host-x86_64-unknown-linux-gnu.bc host.bc
|
||||
llc --relocation-model=pic --filetype=obj kernel.bc
|
||||
llc --relocation-model=pic --filetype=obj host.bc
|
||||
g++ -o reverse -fPIC -no-pie -L${{ github.workspace }}/build/runtime -L${{ github.workspace }}/build/runtime/threadPool host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread
|
||||
g++ -o reverse -fPIC -no-pie -L${{ github.workspace }}/build/runtime -L${{ github.workspace }}/build/runtime/threadPool host.o kernel.o -lc -lCPUruntime -lthreadPool -lpthread
|
||||
./reverse
|
||||
- name: Execute the dynamic shared memory demo
|
||||
run: |
|
||||
|
@ -63,7 +63,7 @@ jobs:
|
|||
hostTranslator reverse-host-x86_64-unknown-linux-gnu.bc host.bc
|
||||
llc --relocation-model=pic --filetype=obj kernel.bc
|
||||
llc --relocation-model=pic --filetype=obj host.bc
|
||||
g++ -o reverse -fPIC -no-pie -L${{ github.workspace }}/build/runtime -L${{ github.workspace }}/build/runtime/threadPool host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread
|
||||
g++ -o reverse -fPIC -no-pie -L${{ github.workspace }}/build/runtime -L${{ github.workspace }}/build/runtime/threadPool host.o kernel.o -lc -lCPUruntime -lthreadPool -lpthread
|
||||
./reverse
|
||||
- name: Execute Hetero-mark benchmark
|
||||
run: |
|
||||
|
@ -79,5 +79,5 @@ jobs:
|
|||
hostTranslator kernel_gpu_cuda_wrapper-host-x86_64-unknown-linux-gnu.bc host.bc
|
||||
llc --relocation-model=pic --filetype=obj kernel.bc
|
||||
llc --relocation-model=pic --filetype=obj host.bc
|
||||
g++ -o lavaMD -fPIC -no-pie -I${{ github.workspace }}/runtime/threadPool/include -I${{ github.workspace }}/cuda-10.1/include -L${{ github.workspace }}/build/runtime -L${{ github.workspace }}/build/runtime/threadPool main.c host.o kernel.o util/timer/timer.c util/num/num.c -lpthread -lc -lx86Runtime -lthreadPool -pthread
|
||||
g++ -o lavaMD -fPIC -no-pie -I${{ github.workspace }}/runtime/threadPool/include -I${{ github.workspace }}/cuda-10.1/include -L${{ github.workspace }}/build/runtime -L${{ github.workspace }}/build/runtime/threadPool main.c host.o kernel.o util/timer/timer.c util/num/num.c -lpthread -lc -lCPUruntime -lthreadPool -pthread
|
||||
./lavaMD -boxes1d 10
|
||||
|
|
|
@ -1,13 +1,9 @@
|
|||
cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
|
||||
|
||||
project(CudaOnX86)
|
||||
set(CMAKE_PROJECT_DESCRIPTION "Executing CUDA on X86 architecture.")
|
||||
project(CuPBoP)
|
||||
set(CMAKE_PROJECT_DESCRIPTION "Executing CUDA on non-NVIDIA architecture.")
|
||||
set(CMAKE_CXX_STANDARD "14")
|
||||
set(MAJOR_VERSION 0)
|
||||
set(MINOR_VERSION 1)
|
||||
set(VERSION_STRING ${MAJOR_VERSION}.${MINOR_VERSION})
|
||||
set(COX_VERSION ${VERSION_STRING})
|
||||
# get LLVM PATH get PATH for head file
|
||||
|
||||
if(DEFINED LLVM_CONFIG_PATH)
|
||||
if(IS_ABSOLUTE "${LLVM_CONFIG_PATH}")
|
||||
if(EXISTS "${LLVM_CONFIG_PATH}")
|
||||
|
@ -32,7 +28,7 @@ if(DEFINED LLVM_CONFIG_PATH)
|
|||
else()
|
||||
message(FATAL_ERROR "llvm-config is required")
|
||||
endif()
|
||||
# get CUDA PATH
|
||||
|
||||
if(DEFINED CUDA_PATH)
|
||||
message(STATUS "Using CUDA: ${CUDA_PATH}")
|
||||
else()
|
||||
|
@ -45,7 +41,7 @@ if(DEBUG)
|
|||
endif()
|
||||
|
||||
set(CMAKE_CXX_FLAGS
|
||||
"-I${CUDA_PATH}/include ${LLVM_CXX_FLAG} ${CMAKE_CXX_FLAGS}")
|
||||
"-I${CUDA_PATH}/include ${LLVM_CXX_FLAG} ${CMAKE_CXX_FLAGS} -Wunused")
|
||||
|
||||
set(GCC_COVERAGE_LINK_FLAGS
|
||||
"-L${LLVM_LIB_PATH} ${LLVM_LINK_FLAG} -lz -lrt -ldl -ltinfo -lpthread -lm")
|
||||
|
@ -54,5 +50,8 @@ add_subdirectory(compilation)
|
|||
add_subdirectory(runtime)
|
||||
enable_testing()
|
||||
|
||||
option(HETERO_MARK_DATA "The path to download hetero-mark dataset." /tmp/data)
|
||||
set(HETERO_MARK_DATA
|
||||
"/tmp/data"
|
||||
CACHE PATH "The path to download hetero-mark dataset.")
|
||||
|
||||
add_subdirectory(test)
|
||||
|
|
|
@ -75,7 +75,7 @@ g++ -o vecadd -fPIC -no-pie \
|
|||
-L$CuPBoP_PATH/build/runtime \
|
||||
-L$CuPBoP_PATH/build/runtime/threadPool \
|
||||
host.o kernel.o \
|
||||
-I../.. -lc -lx86Runtime -lthreadPool -lpthread
|
||||
-I../.. -lc -lCPUruntime -lthreadPool -lpthread
|
||||
# Execute
|
||||
./vecadd
|
||||
```
|
||||
|
|
|
@ -1,20 +1,16 @@
|
|||
cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
|
||||
project(
|
||||
NVVM2X86
|
||||
DESCRIPTION "Translate NVVM IR to LLVM IR for X86 backend"
|
||||
LANGUAGES CXX)
|
||||
|
||||
set(CMAKE_VERBOSE_MAKEFILE ON)
|
||||
|
||||
# compile kernel translator
|
||||
include_directories(./KernelTranslation/include/x86)
|
||||
# build kernel translator
|
||||
include_directories(./KernelTranslation/include/cpu)
|
||||
add_subdirectory(KernelTranslation)
|
||||
|
||||
add_executable(kernelTranslator KernelTranslation.cpp)
|
||||
target_link_libraries(kernelTranslator spmd2mpmd ${GCC_COVERAGE_LINK_FLAGS})
|
||||
|
||||
# compile host translator
|
||||
include_directories(./HostTranslation/include/x86)
|
||||
# build host translator
|
||||
include_directories(./HostTranslation/include/cpu)
|
||||
add_subdirectory(HostTranslation)
|
||||
|
||||
add_executable(hostTranslator HostTranslation.cpp)
|
||||
|
|
|
@ -4,12 +4,7 @@
|
|||
#include "ReplaceCudaBuiltin.h"
|
||||
#include "ReplaceKernelArgs.h"
|
||||
#include "tool.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/IR/Verifier.h"
|
||||
#include <assert.h>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <stdlib.h>
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
|
|
|
@ -11,12 +11,11 @@ set(CMAKE_VERBOSE_MAKEFILE ON)
|
|||
set(LIB_NAME cudaRuntime2cpuRuntime)
|
||||
|
||||
set(CMAKE_CXX_STANDARD 14)
|
||||
set(CMAKE_BUILD_TYPE Debug)
|
||||
include_directories(./include/x86)
|
||||
include_directories(./include/cpu)
|
||||
include_directories(../../common)
|
||||
|
||||
file(GLOB proj_HEADERS "include/x86/*.h")
|
||||
file(GLOB proj_SOURCES "src/x86/*.cpp")
|
||||
file(GLOB proj_HEADERS "include/cpu/*.h")
|
||||
file(GLOB proj_SOURCES "src/cpu/*.cpp")
|
||||
|
||||
# Add core library.
|
||||
add_library(${LIB_NAME} SHARED ${proj_HEADERS} ${proj_SOURCES})
|
||||
|
|
|
@ -3,16 +3,8 @@
|
|||
*/
|
||||
#include "RemoveCudaBuiltin.h"
|
||||
#include "debug.hpp"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/GlobalValue.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/LLVMContext.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/Support/ToolOutputFile.h"
|
||||
#include "llvm/Transforms/Utils/CtorUtils.h"
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <set>
|
||||
|
||||
using namespace llvm;
|
|
@ -1,16 +1,14 @@
|
|||
#include "RemoveMetadata.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/GlobalValue.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/LLVMContext.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/Support/ToolOutputFile.h"
|
||||
#include <iostream>
|
||||
#include "llvm/Support/Host.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
void RemoveMetadata(llvm::Module *M) {
|
||||
// change the target triple to the host triple
|
||||
M->setTargetTriple(llvm::sys::getProcessTriple());
|
||||
// use the default DataLayout
|
||||
M->setDataLayout("");
|
||||
|
||||
SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
|
||||
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||
Function *F = &(*i);
|
||||
|
@ -22,5 +20,7 @@ void RemoveMetadata(llvm::Module *M) {
|
|||
F->removeFnAttr("min-legal-vector-width");
|
||||
F->removeFnAttr("no-trapping-math");
|
||||
F->removeFnAttr(llvm::Attribute::OptimizeNone);
|
||||
F->removeFnAttr("target-cpu");
|
||||
F->removeFnAttr("target-features");
|
||||
}
|
||||
}
|
|
@ -1,12 +1,7 @@
|
|||
#include "ReplaceConstantMemory.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/GlobalValue.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/LLVMContext.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include <assert.h>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <set>
|
||||
|
|
@ -1,13 +1,6 @@
|
|||
#include "ReplaceCudaBuiltin.h"
|
||||
#include "debug.hpp"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/GlobalValue.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/LLVMContext.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/Support/ToolOutputFile.h"
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <regex>
|
||||
#include <set>
|
||||
|
@ -63,18 +56,6 @@ void ReplaceKernelLaunch(llvm::Module *M) {
|
|||
std::map<std::string, Function *> kernels;
|
||||
|
||||
std::set<llvm::Function *> need_remove;
|
||||
LLVMContext *C = &M->getContext();
|
||||
|
||||
llvm::Type *Int32T = Type::getInt32Ty(*C);
|
||||
llvm::Type *Int8T = Type::getInt8Ty(*C);
|
||||
|
||||
llvm::FunctionType *LauncherFuncT =
|
||||
FunctionType::get(Type::getVoidTy(*C), NULL);
|
||||
|
||||
llvm::FunctionType *LaunchFun2 =
|
||||
FunctionType::get(PointerType::get(PointerType::get(Int32T, 0), 0), NULL);
|
||||
|
||||
bool done = false;
|
||||
|
||||
std::set<std::string> cuda_register_kernel_names;
|
||||
|
||||
|
@ -160,8 +141,6 @@ void ReplaceKernelLaunch(llvm::Module *M) {
|
|||
std::vector<size_t> arg_sizes;
|
||||
functionOperand =
|
||||
dyn_cast<Function>(callOperand->stripPointerCasts());
|
||||
|
||||
FunctionType *ft = calledFunction->getFunctionType();
|
||||
DEBUG_INFO("Parent (Caller) Function Name: %s, "
|
||||
"cudaLaunchKernel Function: %s, args : %d\n",
|
||||
func_name.c_str(),
|
|
@ -1,12 +1,5 @@
|
|||
#include "ReplaceKernelArgs.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/GlobalValue.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/LLVMContext.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/Support/ToolOutputFile.h"
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <set>
|
||||
|
||||
|
@ -23,12 +16,9 @@ using namespace llvm;
|
|||
// to use use-analysis to find the arguments in the future
|
||||
void ReplaceKernelArg(llvm::Module *M) {
|
||||
LLVMContext &context = M->getContext();
|
||||
auto VoidTy = llvm::Type::getVoidTy(context);
|
||||
auto I8 = llvm::Type::getInt8PtrTy(context);
|
||||
std::map<std::string, Function *> kernels;
|
||||
|
||||
std::set<llvm::Function *> need_replace;
|
||||
LLVMContext *C = &M->getContext();
|
||||
|
||||
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||
Function *F = &(*i);
|
|
@ -1,4 +1,4 @@
|
|||
#include "generate_x86_format.h"
|
||||
#include "generate_cpu_format.h"
|
||||
#include "handle_sync.h"
|
||||
#include "init.h"
|
||||
#include "insert_sync.h"
|
||||
|
@ -6,17 +6,14 @@
|
|||
#include "performance.h"
|
||||
#include "tool.h"
|
||||
#include "warp_func.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include <assert.h>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <llvm/Support/raw_ostream.h>
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <stdlib.h>
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
// to support constant memory variables, we need to convert information
|
||||
// from kernelTranslator to HostTranslator, since HostTranslator knows nothing
|
||||
// about the kernel functions, we need to write the information to a file
|
||||
// by KernelTranslator and read it in HostTranslator
|
||||
std::string PATH = "kernel_meta.log";
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
|
@ -26,8 +23,9 @@ int main(int argc, char **argv) {
|
|||
std::ofstream fout;
|
||||
fout.open(PATH);
|
||||
|
||||
// inline, and create auxiliary global variables
|
||||
// inline __device__ functions, and create auxiliary global variables
|
||||
init_block(program, fout);
|
||||
|
||||
// insert sync before each vote, and replace the
|
||||
// original vote function to warp vote
|
||||
handle_warp_vote(program);
|
||||
|
@ -40,17 +38,18 @@ int main(int argc, char **argv) {
|
|||
|
||||
// split block by sync
|
||||
split_block_by_sync(program);
|
||||
// add loop for intra&intera thread
|
||||
|
||||
// add loop for intra&intera thread, it refers 'hierarchical collapsing' in
|
||||
// COX paper.
|
||||
insert_warp_loop(program);
|
||||
|
||||
// (TODO): replace this patch
|
||||
replace_built_in_function(program);
|
||||
|
||||
// TODO: replace with a more general function
|
||||
// Not only for x86 backend
|
||||
generate_x86_format(program);
|
||||
// the input kernel programs have NVIDIA metadata, they need to be replaced to
|
||||
// CPU metadata
|
||||
generate_cpu_format(program);
|
||||
|
||||
// performance optimization
|
||||
// execute O3 pipeline on the transformed program
|
||||
performance_optimization(program);
|
||||
|
||||
VerifyModule(program);
|
||||
|
|
|
@ -11,12 +11,11 @@ set(CMAKE_VERBOSE_MAKEFILE ON)
|
|||
set(LIB_NAME spmd2mpmd)
|
||||
|
||||
set(CMAKE_CXX_STANDARD 14)
|
||||
set(CMAKE_BUILD_TYPE Debug)
|
||||
include_directories(./include/x86)
|
||||
include_directories(./include/cpu)
|
||||
include_directories(../../common)
|
||||
|
||||
file(GLOB proj_HEADERS "include/x86/*.h")
|
||||
file(GLOB proj_SOURCES "src/x86/*.cpp")
|
||||
file(GLOB proj_HEADERS "include/cpu/*.h")
|
||||
file(GLOB proj_SOURCES "src/cpu/*.cpp")
|
||||
|
||||
# Add core library.
|
||||
add_library(${LIB_NAME} STATIC ${proj_HEADERS} ${proj_SOURCES})
|
||||
|
|
|
@ -0,0 +1,10 @@
|
|||
#ifndef __NVVM2CPU_GENERATE_CPU_FORMAT__
|
||||
#define __NVVM2CPU_GENERATE_CPU_FORMAT__
|
||||
|
||||
#include "llvm/IR/Module.h"
|
||||
|
||||
void generate_cpu_format(llvm::Module *M);
|
||||
|
||||
void set_meta_data(llvm::Module *M);
|
||||
|
||||
#endif
|
|
@ -1,10 +0,0 @@
|
|||
#ifndef __NVVM2x86_GENERATE_X86_FORMAT__
|
||||
#define __NVVM2x86_GENERATE_X86_FORMAT__
|
||||
|
||||
#include "llvm/IR/Module.h"
|
||||
|
||||
void generate_x86_format(llvm::Module *M);
|
||||
|
||||
void set_meta_data(llvm::Module *M);
|
||||
|
||||
#endif
|
|
@ -0,0 +1,125 @@
|
|||
#include "generate_cpu_format.h"
|
||||
#include "debug.hpp"
|
||||
#include "tool.h"
|
||||
#include "llvm/Support/Host.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
// set TargetTriple and DataLayout same as the host CPU
|
||||
void set_meta_data(llvm::Module *M) {
|
||||
M->setTargetTriple(llvm::sys::getProcessTriple());
|
||||
// use the default DataLayout
|
||||
M->setDataLayout("");
|
||||
}
|
||||
|
||||
// as pthread only accept a single void* for input
|
||||
// we have to decode this input inside the kernel
|
||||
void decode_input(llvm::Module *M) {
|
||||
|
||||
std::set<llvm::Function *> need_remove;
|
||||
|
||||
llvm::Type *Int32T = Type::getInt32Ty(M->getContext());
|
||||
llvm::Type *Int8T = Type::getInt8Ty(M->getContext());
|
||||
|
||||
llvm::FunctionType *LauncherFuncT = FunctionType::get(
|
||||
Type::getVoidTy(M->getContext()), {PointerType::get(Int8T, 0)}, false);
|
||||
|
||||
// generate Wrapper Function type
|
||||
// now we only support a single int32*
|
||||
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||
Function *F = &(*i);
|
||||
if (!isKernelFunction(M, F))
|
||||
continue;
|
||||
auto func_name = F->getName().str();
|
||||
// filter out _Z24 and other mangled prefix
|
||||
for (int pos = 2; pos < func_name.length(); pos++) {
|
||||
if (func_name[pos] >= '0' && func_name[pos] <= '9')
|
||||
continue;
|
||||
func_name = func_name.substr(pos);
|
||||
break;
|
||||
}
|
||||
llvm::IRBuilder<> Builder(M->getContext());
|
||||
|
||||
FunctionCallee fc =
|
||||
M->getOrInsertFunction(func_name + "_wrapper", LauncherFuncT);
|
||||
Function *WorkGroup = dyn_cast<Function>(fc.getCallee());
|
||||
|
||||
BasicBlock *Block = BasicBlock::Create(M->getContext(), "", WorkGroup);
|
||||
Builder.SetInsertPoint(Block);
|
||||
|
||||
// WorkGroup has only a single input
|
||||
Function::arg_iterator ai = WorkGroup->arg_begin();
|
||||
|
||||
SmallVector<Value *, 8> Arguments;
|
||||
Value *input_arg = &*ai;
|
||||
// convert to int**
|
||||
input_arg = Builder.CreateBitOrPointerCast(
|
||||
input_arg, PointerType::get(PointerType::get(Int32T, 0), 0));
|
||||
|
||||
size_t idx = 0;
|
||||
// replace original arguments with the unpacked values
|
||||
// for example, for a function f(int* a, char* b),
|
||||
// we will generate a function f_wrapper(int** input)
|
||||
// and replace the original arguments with the unpacked values
|
||||
// e.g., a = (int*)input[0], b = (char*)input[1]
|
||||
for (Function::const_arg_iterator ii = F->arg_begin(), ee = F->arg_end();
|
||||
ii != ee; ++ii) {
|
||||
Type *ArgType = ii->getType();
|
||||
// calculate addr
|
||||
Value *GEP = createGEP(Builder, input_arg, ConstantInt::get(Int32T, idx));
|
||||
// load corresponding int*
|
||||
GEP = createLoad(Builder, GEP);
|
||||
// bitcast
|
||||
GEP = Builder.CreateBitOrPointerCast(GEP, PointerType::get(ArgType, 0));
|
||||
Value *Arg = createLoad(Builder, GEP);
|
||||
Arguments.push_back(Arg);
|
||||
++idx;
|
||||
}
|
||||
Builder.CreateCall(F, ArrayRef<llvm::Value *>(Arguments));
|
||||
Builder.CreateRetVoid();
|
||||
}
|
||||
for (auto f : need_remove) {
|
||||
f->dropAllReferences();
|
||||
f->eraseFromParent();
|
||||
}
|
||||
}
|
||||
|
||||
// after flat/hierarchical collapsing, the barrier instructions are useless
|
||||
void remove_barrier(llvm::Module *M) {
|
||||
std::vector<Instruction *> need_remove;
|
||||
for (auto F = M->begin(); F != M->end(); ++F)
|
||||
for (auto BB = F->begin(); BB != F->end(); ++BB) {
|
||||
for (auto Inst = BB->begin(); Inst != BB->end(); Inst++) {
|
||||
if (auto Call = dyn_cast<CallInst>(Inst)) {
|
||||
if (Call->isInlineAsm())
|
||||
continue;
|
||||
auto func_name = Call->getCalledFunction()->getName().str();
|
||||
if (func_name == "llvm.nvvm.bar.warp.sync" ||
|
||||
func_name == "llvm.nvvm.barrier0" ||
|
||||
func_name == "llvm.nvvm.barrier.sync") {
|
||||
need_remove.push_back(Call);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (auto inst : need_remove) {
|
||||
inst->eraseFromParent();
|
||||
}
|
||||
}
|
||||
|
||||
void remove_useless_var(llvm::Module *M) {
|
||||
M->getGlobalVariable("intra_warp_index")->eraseFromParent();
|
||||
M->getGlobalVariable("inter_warp_index")->eraseFromParent();
|
||||
}
|
||||
|
||||
void generate_cpu_format(llvm::Module *M) {
|
||||
DEBUG_INFO("generate cpu format\n");
|
||||
// change metadata
|
||||
set_meta_data(M);
|
||||
// decode argument
|
||||
decode_input(M);
|
||||
// remove barrier
|
||||
remove_barrier(M);
|
||||
// remove useless func/variable
|
||||
remove_useless_var(M);
|
||||
}
|
|
@ -1,13 +1,7 @@
|
|||
#include "handle_sync.h"
|
||||
#include "debug.hpp"
|
||||
#include "tool.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/GlobalValue.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/InlineAsm.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/LLVMContext.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include <set>
|
||||
#include <string>
|
||||
|
|
@ -2,26 +2,11 @@
|
|||
#include "debug.hpp"
|
||||
#include "memory_hierarchy.h"
|
||||
#include "tool.h"
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <set>
|
||||
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/GlobalValue.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/InlineAsm.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/LLVMContext.h"
|
||||
#include "llvm/IR/LegacyPassManager.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/InitializePasses.h"
|
||||
#include "llvm/PassInfo.h"
|
||||
#include "llvm/PassRegistry.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Support/TargetSelect.h"
|
||||
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
|
||||
#include "llvm/Transforms/Utils/Cloning.h"
|
||||
#include "llvm/Transforms/Utils/ValueMapper.h"
|
||||
#include <set>
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
|
@ -31,11 +16,9 @@ bool inline_warp_level_func(llvm::Module *M) {
|
|||
|
||||
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||
Function *F = &(*i);
|
||||
auto func_name = F->getName().str();
|
||||
if (!isKernelFunction(M, F))
|
||||
continue;
|
||||
Function::iterator I = F->begin();
|
||||
for (Function::iterator E = F->end(); I != E; ++I) {
|
||||
for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) {
|
||||
for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
|
||||
if (CallInst *c = dyn_cast<CallInst>(BI++)) {
|
||||
if (c->getCalledFunction()) {
|
||||
|
@ -60,8 +43,7 @@ bool inline_warp_level_func(llvm::Module *M) {
|
|||
}
|
||||
|
||||
bool find_sreg_inst(llvm::Function *F) {
|
||||
Function::iterator I = F->begin();
|
||||
for (Function::iterator E = F->end(); I != E; ++I) {
|
||||
for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) {
|
||||
for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
|
||||
if (CallInst *c = dyn_cast<CallInst>(BI++)) {
|
||||
if (c->getCalledFunction()) {
|
||||
|
@ -229,14 +211,12 @@ void llvm_preprocess(llvm::Module *M) {
|
|||
Passes.run(*M);
|
||||
}
|
||||
|
||||
// transform constant expression into sequence of instructions
|
||||
bool lower_constant_expr(llvm::Module *M) {
|
||||
bool modified = false;
|
||||
LLVMContext &context = M->getContext();
|
||||
auto I32 = llvm::Type::getInt32Ty(context);
|
||||
std::vector<CallInst *> need_remove;
|
||||
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||
Function *F = &(*i);
|
||||
auto func_name = F->getName().str();
|
||||
if (!isKernelFunction(M, F))
|
||||
continue;
|
||||
|
||||
|
@ -301,8 +281,8 @@ bool lower_constant_expr(llvm::Module *M) {
|
|||
return modified;
|
||||
}
|
||||
|
||||
// replace _ZL3expd, just delete its body
|
||||
void replace_cuda_math_built_in(llvm::Module *M) {
|
||||
// replace _ZL3expd, just delete its body
|
||||
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||
Function *F = &(*i);
|
||||
auto func_name = F->getName().str();
|
|
@ -4,29 +4,9 @@
|
|||
#include "handle_sync.h"
|
||||
#include "tool.h"
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include "llvm/ADT/Statistic.h"
|
||||
#include "llvm/Analysis/LoopInfo.h"
|
||||
#include "llvm/Analysis/LoopPass.h"
|
||||
#include "llvm/Analysis/PostDominators.h"
|
||||
#include "llvm/IR/DataLayout.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/GlobalValue.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/InlineAsm.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/LLVMContext.h"
|
||||
#include "llvm/IR/LegacyPassManager.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/IR/ValueSymbolTable.h"
|
||||
#include "llvm/InitializePasses.h"
|
||||
#include "llvm/PassInfo.h"
|
||||
#include "llvm/PassRegistry.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
|
||||
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
|
||||
#include "llvm/Transforms/Utils/Cloning.h"
|
||||
#include "llvm/Transforms/Utils/ValueMapper.h"
|
||||
#include <iostream>
|
||||
#include <queue>
|
||||
|
||||
using namespace llvm;
|
||||
|
@ -44,7 +24,7 @@ public:
|
|||
std::vector<llvm::Instruction *> insert_intra_warp_sync_before;
|
||||
std::vector<llvm::Instruction *> insert_inter_warp_sync_before;
|
||||
|
||||
// insert sync in the entry
|
||||
// insert sync after the entry and before the first non-AllocaInst
|
||||
BasicBlock *entry = &(*F.begin());
|
||||
for (auto i = entry->begin(); i != entry->end(); i++) {
|
||||
if (!isa<AllocaInst>(i)) {
|
||||
|
@ -54,10 +34,8 @@ public:
|
|||
}
|
||||
|
||||
for (Function::iterator I = F.begin(); I != F.end(); ++I) {
|
||||
BasicBlock::iterator BI = I->begin();
|
||||
|
||||
// insert barrier before return
|
||||
for (; BI != I->end(); BI++) {
|
||||
for (BasicBlock::iterator BI = I->begin(); BI != I->end(); BI++) {
|
||||
llvm::ReturnInst *Ret = llvm::dyn_cast<llvm::ReturnInst>(&(*BI));
|
||||
if (Ret) {
|
||||
insert_inter_warp_sync_before.push_back(&(*BI));
|
||||
|
@ -125,7 +103,7 @@ public:
|
|||
|
||||
auto PDT = &getAnalysis<PostDominatorTreeWrapperPass>();
|
||||
|
||||
// first find all conditional barriers
|
||||
// find all conditional barriers
|
||||
std::vector<BasicBlock *> conditionalBarriers;
|
||||
for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i) {
|
||||
BasicBlock *b = &*i;
|
||||
|
@ -148,12 +126,9 @@ public:
|
|||
conditionalBarriers.pop_back();
|
||||
|
||||
// insert barrier in the start of if-condition
|
||||
|
||||
BasicBlock *pos = b;
|
||||
BasicBlock *pred = firstNonBackedgePredecessor(b);
|
||||
|
||||
while (PDT->getPostDomTree().dominates(b, pred)) {
|
||||
pos = pred;
|
||||
// If our BB post dominates the given block, we know it is not the
|
||||
// branching block that makes the barrier conditional.
|
||||
pred = firstNonBackedgePredecessor(pred);
|
||||
|
@ -468,7 +443,6 @@ public:
|
|||
auto header_block = L->getHeader();
|
||||
assert(header_block->getTerminator()->getNumSuccessors() == 2 &&
|
||||
"has more than 2 successors of the for-head\n");
|
||||
BasicBlock *for_body = NULL;
|
||||
for (int i = 0; i < header_block->getTerminator()->getNumSuccessors();
|
||||
i++) {
|
||||
auto bb = header_block->getTerminator()->getSuccessor(i);
|
|
@ -4,43 +4,20 @@
|
|||
#include "handle_sync.h"
|
||||
#include "tool.h"
|
||||
#include <assert.h>
|
||||
#include <iostream>
|
||||
#include <set>
|
||||
|
||||
#include "llvm/ADT/Statistic.h"
|
||||
#include "llvm/ADT/Triple.h"
|
||||
#include "llvm/Analysis/DivergenceAnalysis.h"
|
||||
#include "llvm/Analysis/LoopInfo.h"
|
||||
#include "llvm/Analysis/LoopPass.h"
|
||||
#include "llvm/Analysis/PostDominators.h"
|
||||
#include "llvm/Analysis/TargetTransformInfo.h"
|
||||
#include "llvm/IR/CFG.h"
|
||||
#include "llvm/IR/DataLayout.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/GlobalValue.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/InlineAsm.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/LLVMContext.h"
|
||||
#include "llvm/IR/LegacyPassManager.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/IR/ValueSymbolTable.h"
|
||||
#include "llvm/InitializePasses.h"
|
||||
#include "llvm/MC/TargetRegistry.h"
|
||||
#include "llvm/PassInfo.h"
|
||||
#include "llvm/PassRegistry.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Target/TargetMachine.h"
|
||||
#include "llvm/Target/TargetOptions.h"
|
||||
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
|
||||
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
|
||||
#include "llvm/Transforms/Utils/Cloning.h"
|
||||
#include "llvm/Transforms/Utils/ValueMapper.h"
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <sstream>
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
|
@ -115,10 +92,7 @@ llvm::Instruction *GetContextArray(llvm::Instruction *instruction,
|
|||
BasicBlock &bb = instruction->getParent()->getParent()->getEntryBlock();
|
||||
|
||||
IRBuilder<> builder(&*(bb.getFirstInsertionPt()));
|
||||
Function *FF = instruction->getParent()->getParent();
|
||||
Module *M = instruction->getParent()->getParent()->getParent();
|
||||
LLVMContext &C = M->getContext();
|
||||
const llvm::DataLayout &Layout = M->getDataLayout();
|
||||
|
||||
llvm::Type *elementType;
|
||||
if (isa<AllocaInst>(instruction)) {
|
||||
|
@ -129,8 +103,6 @@ llvm::Instruction *GetContextArray(llvm::Instruction *instruction,
|
|||
}
|
||||
|
||||
Type *AllocType = elementType;
|
||||
AllocaInst *InstCast = dyn_cast<AllocaInst>(instruction);
|
||||
llvm::Value *ItemSize = nullptr;
|
||||
llvm::AllocaInst *Alloca = nullptr;
|
||||
|
||||
auto block_size_addr = M->getGlobalVariable("block_size");
|
||||
|
@ -697,9 +669,6 @@ public:
|
|||
is_single_conditional_branch_block = 1;
|
||||
} else {
|
||||
// generate by replicate local variable
|
||||
printf(
|
||||
"[WARNING] match single conditional branch with HARD CODE\n");
|
||||
bool branch_to_intra_init = false;
|
||||
for (unsigned suc = 0; suc < br->getNumSuccessors(); ++suc) {
|
||||
llvm::BasicBlock *entryCandidate = br->getSuccessor(suc);
|
||||
auto block_name = entryCandidate->getName().str();
|
||||
|
@ -755,7 +724,7 @@ public:
|
|||
entry = entryCandidate;
|
||||
break;
|
||||
}
|
||||
// delete useless PR, those PRs only have branch
|
||||
// delete useless PR, those PRs only have branch instructions
|
||||
if (entry == exit) {
|
||||
if (entry->size() == 1 && isa<llvm::BranchInst>(entry->begin())) {
|
||||
return;
|
|
@ -1,29 +1,10 @@
|
|||
#include "memory_hierarchy.h"
|
||||
#include "debug.hpp"
|
||||
#include "llvm/IR/CFG.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/GlobalValue.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/LLVMContext.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/Transforms/Utils/Cloning.h"
|
||||
#include "llvm/Transforms/Utils/ValueMapper.h"
|
||||
#include <assert.h>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <sstream>
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
|
||||
void mem_share2global(llvm::Module *M) {
|
||||
LLVMContext *C = &M->getContext();
|
||||
llvm::Type *Int32T = Type::getInt32Ty(*C);
|
||||
llvm::Type *Int64T = Type::getInt64Ty(*C);
|
||||
llvm::Type *Int8T = Type::getInt8Ty(*C);
|
||||
|
||||
std::map<GlobalVariable *, GlobalVariable *> corresponding_global_memory;
|
||||
std::set<llvm::Instruction *> need_remove;
|
||||
std::set<GlobalVariable *> need_remove_share_memory;
|
||||
|
@ -45,7 +26,6 @@ void mem_share2global(llvm::Module *M) {
|
|||
// generate global type pointer
|
||||
PointerType *PointerTy =
|
||||
PointerType::get(array_type->getElementType(), 0);
|
||||
llvm::Constant *x1 = ConstantPointerNull::get(PointerTy);
|
||||
llvm::GlobalVariable *global_ptr = new llvm::GlobalVariable(
|
||||
*M, PointerTy, false, llvm::GlobalValue::ExternalLinkage,
|
||||
NULL, "dynamic_shared_memory", NULL,
|
||||
|
@ -75,7 +55,7 @@ void mem_share2global(llvm::Module *M) {
|
|||
std::pair<GlobalVariable *, GlobalVariable *>(share_memory,
|
||||
global_memory));
|
||||
} else if (element_type->isFloatTy()) {
|
||||
auto FP_type = llvm::Type::getFloatTy(*C);
|
||||
auto FP_type = llvm::Type::getFloatTy(M->getContext());
|
||||
auto zero = llvm::ConstantFP::get(FP_type, 0);
|
||||
llvm::GlobalVariable *global_memory = new llvm::GlobalVariable(
|
||||
*M, FP_type, false, llvm::GlobalValue::ExternalLinkage, zero,
|
||||
|
@ -128,11 +108,6 @@ void mem_share2global(llvm::Module *M) {
|
|||
}
|
||||
|
||||
void mem_constant2global(llvm::Module *M, std::ofstream &fout) {
|
||||
LLVMContext *C = &M->getContext();
|
||||
llvm::Type *Int32T = Type::getInt32Ty(*C);
|
||||
llvm::Type *Int64T = Type::getInt64Ty(*C);
|
||||
llvm::Type *Int8T = Type::getInt8Ty(*C);
|
||||
|
||||
std::map<GlobalVariable *, GlobalVariable *> corresponding_global_memory;
|
||||
std::set<llvm::Instruction *> need_remove;
|
||||
std::set<GlobalVariable *> need_remove_constant_memory;
|
||||
|
@ -142,7 +117,7 @@ void mem_constant2global(llvm::Module *M, std::ofstream &fout) {
|
|||
if (GlobalVariable *constant_memory = dyn_cast<GlobalVariable>(I)) {
|
||||
if (auto PT = dyn_cast<PointerType>(I->getType())) {
|
||||
unsigned AS = PT->getAddressSpace();
|
||||
if (AS == 4) { // find a share memory
|
||||
if (AS == 4) { // find a constant memory
|
||||
need_remove_constant_memory.insert(constant_memory);
|
||||
// generate the corresponding global memory variable
|
||||
auto new_name = "wrapper_global_" + constant_memory->getName().str();
|
||||
|
@ -150,7 +125,7 @@ void mem_constant2global(llvm::Module *M, std::ofstream &fout) {
|
|||
if (auto array_type = dyn_cast<ArrayType>(element_type)) {
|
||||
if (constant_memory->hasExternalLinkage() &&
|
||||
array_type->getArrayNumElements() == 0) {
|
||||
// external shared memory of []
|
||||
// external constant memory of []
|
||||
// generate global type pointer
|
||||
PointerType *PointerTy =
|
||||
PointerType::get(array_type->getElementType(), 0);
|
|
@ -1,43 +1,13 @@
|
|||
#include "performance.h"
|
||||
#include "debug.hpp"
|
||||
#include "tool.h"
|
||||
#include "llvm/ADT/Statistic.h"
|
||||
#include "llvm/ADT/StringRef.h"
|
||||
#include "llvm/ADT/Triple.h"
|
||||
#include "llvm/Analysis/LoopInfo.h"
|
||||
#include "llvm/Analysis/LoopPass.h"
|
||||
#include "llvm/Analysis/PostDominators.h"
|
||||
#include "llvm/Analysis/TargetLibraryInfo.h"
|
||||
#include "llvm/Analysis/TargetTransformInfo.h"
|
||||
#include "llvm/CodeGen/MachineModuleInfo.h"
|
||||
#include "llvm/IR/CFG.h"
|
||||
#include "llvm/IR/DataLayout.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/GlobalValue.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/InlineAsm.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/LLVMContext.h"
|
||||
#include "llvm/IR/LegacyPassManager.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/IR/ValueSymbolTable.h"
|
||||
#include "llvm/InitializePasses.h"
|
||||
#include "llvm/MC/TargetRegistry.h"
|
||||
#include "llvm/PassInfo.h"
|
||||
#include "llvm/PassRegistry.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Support/Host.h"
|
||||
#include "llvm/Target/TargetMachine.h"
|
||||
#include "llvm/Target/TargetOptions.h"
|
||||
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
|
||||
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
|
||||
#include "llvm/Transforms/Utils/Cloning.h"
|
||||
#include "llvm/Transforms/Utils/ValueMapper.h"
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <sstream>
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
|
@ -53,7 +23,7 @@ void performance_optimization(llvm::Module *M) {
|
|||
llvm::legacy::PassManager Passes;
|
||||
|
||||
// add target machine info
|
||||
llvm::Triple triple("x86_64-unknown-linux-gnu");
|
||||
llvm::Triple triple(llvm::sys::getProcessTriple());
|
||||
|
||||
std::string Error;
|
||||
const Target *TheTarget = TargetRegistry::lookupTarget("", triple, Error);
|
||||
|
@ -62,7 +32,7 @@ void performance_optimization(llvm::Module *M) {
|
|||
Options.FloatABIType = FloatABI::Hard;
|
||||
|
||||
TargetMachine *TM = TheTarget->createTargetMachine(
|
||||
triple.getTriple(), llvm::sys::getHostCPUName().str(), StringRef("+m,+f"),
|
||||
triple.getTriple(), llvm::sys::getHostCPUName().str(), StringRef(""),
|
||||
Options, Reloc::PIC_, CodeModel::Small, CodeGenOpt::Aggressive);
|
||||
assert(TM && "No Machine Information\n");
|
||||
|
||||
|
@ -80,9 +50,6 @@ void performance_optimization(llvm::Module *M) {
|
|||
Builder.LoopVectorize = true;
|
||||
Builder.SLPVectorize = true;
|
||||
|
||||
Builder.VerifyInput = true;
|
||||
Builder.VerifyOutput = true;
|
||||
|
||||
Builder.populateModulePassManager(Passes);
|
||||
Passes.run(*M);
|
||||
}
|
|
@ -1,29 +1,13 @@
|
|||
#include "tool.h"
|
||||
#include "debug.hpp"
|
||||
#include "llvm/Bitcode/BitcodeWriter.h"
|
||||
#include "llvm/Config/llvm-config.h"
|
||||
#include "llvm/IR/Constants.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/GlobalValue.h"
|
||||
#include "llvm/IR/GlobalVariable.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/InlineAsm.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/LLVMContext.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/IR/Verifier.h"
|
||||
#include "llvm/IRReader/IRReader.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Support/ErrorOr.h"
|
||||
#include "llvm/Support/FileSystem.h"
|
||||
#include "llvm/Support/ManagedStatic.h"
|
||||
#include "llvm/Support/MemoryBuffer.h"
|
||||
#include "llvm/Support/SourceMgr.h"
|
||||
#include "llvm/Support/ToolOutputFile.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
#include "llvm/Transforms/Utils/Cloning.h"
|
||||
#include "llvm/Transforms/Utils/ValueMapper.h"
|
||||
|
||||
#include <iostream>
|
||||
#include <set>
|
||||
|
||||
using namespace llvm;
|
||||
|
@ -133,7 +117,7 @@ llvm::Instruction *BreakPHIToAllocas(PHINode *phi) {
|
|||
Value *val = phi->getIncomingValue(incoming);
|
||||
BasicBlock *incomingBB = phi->getIncomingBlock(incoming);
|
||||
builder.SetInsertPoint(incomingBB->getTerminator());
|
||||
llvm::Instruction *store = builder.CreateStore(val, alloca);
|
||||
builder.CreateStore(val, alloca);
|
||||
}
|
||||
builder.SetInsertPoint(phi);
|
||||
|
||||
|
@ -164,7 +148,6 @@ void phi2alloc(llvm::Module *M) {
|
|||
}
|
||||
}
|
||||
|
||||
bool changed = false;
|
||||
for (InstructionVec::iterator i = PHIs.begin(); i != PHIs.end(); ++i) {
|
||||
Instruction *instr = *i;
|
||||
BreakPHIToAllocas(dyn_cast<PHINode>(instr));
|
||||
|
@ -279,9 +262,7 @@ void replace_built_in_function(llvm::Module *M) {
|
|||
|
||||
for (auto BB = F->begin(); BB != F->end(); ++BB) {
|
||||
for (auto BI = BB->begin(); BI != BB->end(); BI++) {
|
||||
if (auto Load = dyn_cast<LoadInst>(BI)) {
|
||||
auto load_from = Load->getOperand(0);
|
||||
} else if (auto Call = dyn_cast<CallInst>(BI)) {
|
||||
if (auto Call = dyn_cast<CallInst>(BI)) {
|
||||
if (Call->getCalledFunction()) {
|
||||
auto func_name = Call->getCalledFunction()->getName().str();
|
||||
if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.x" ||
|
||||
|
@ -425,7 +406,6 @@ void replace_built_in_function(llvm::Module *M) {
|
|||
if (auto Call = dyn_cast<CallInst>(BI)) {
|
||||
if (Call->getCalledFunction()) {
|
||||
auto func_name = Call->getCalledFunction()->getName().str();
|
||||
auto callFn = Call->getCalledFunction();
|
||||
if (func_name == "vprintf") {
|
||||
/*
|
||||
* replace CUDA's printf to C's printf
|
||||
|
@ -458,7 +438,7 @@ void replace_built_in_function(llvm::Module *M) {
|
|||
dyn_cast<PointerType>(BC->getOperand(0)->getType());
|
||||
auto SrcTy = SrcPointTy->getElementType();
|
||||
// reverse the bitcast
|
||||
auto reverse_BC = new BitCastInst(BC, SrcPointTy, "", Call);
|
||||
new BitCastInst(BC, SrcPointTy, "", Call);
|
||||
assert(SrcTy->isStructTy() == 1);
|
||||
auto StructTy = dyn_cast<StructType>(SrcTy);
|
||||
for (int i = 0; i < StructTy->getNumElements(); i++) {
|
||||
|
@ -528,7 +508,6 @@ void replace_built_in_function(llvm::Module *M) {
|
|||
|
||||
void replace_asm_call(llvm::Module *M) {
|
||||
LLVMContext &context = M->getContext();
|
||||
auto I32 = llvm::Type::getInt32Ty(context);
|
||||
std::vector<CallInst *> need_remove;
|
||||
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||
Function *F = &(*i);
|
|
@ -2,15 +2,6 @@
|
|||
#include "warp_func.h"
|
||||
#include "debug.hpp"
|
||||
#include "tool.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/GlobalValue.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/LLVMContext.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/Transforms/Utils/Cloning.h"
|
||||
#include "llvm/Transforms/Utils/ValueMapper.h"
|
||||
#include <iostream>
|
||||
#include <set>
|
||||
|
||||
using namespace llvm;
|
||||
|
@ -107,7 +98,7 @@ void handle_warp_vote(llvm::Module *M) {
|
|||
res = BinaryOperator::CreateNot(res, "", sync_inst);
|
||||
}
|
||||
|
||||
auto sotre_mask = new llvm::StoreInst(res, GEP, "", sync_inst);
|
||||
new llvm::StoreInst(res, GEP, "", sync_inst);
|
||||
// create barrier
|
||||
CreateIntraWarpBarrier(sync_inst);
|
||||
/*
|
|
@ -1,186 +0,0 @@
|
|||
#include "generate_x86_format.h"
|
||||
#include "debug.hpp"
|
||||
#include "tool.h"
|
||||
#include "llvm/Analysis/TargetLibraryInfo.h"
|
||||
#include "llvm/Analysis/TargetTransformInfo.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/GlobalValue.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/LLVMContext.h"
|
||||
#include "llvm/IR/LegacyPassManager.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/IR/Verifier.h"
|
||||
#include "llvm/IRReader/IRReader.h"
|
||||
#include "llvm/MC/TargetRegistry.h"
|
||||
#include "llvm/Support/TargetSelect.h"
|
||||
#include "llvm/Support/ToolOutputFile.h"
|
||||
#include "llvm/Target/TargetMachine.h"
|
||||
#include "llvm/Target/TargetOptions.h"
|
||||
#include "llvm/Transforms/Utils/Cloning.h"
|
||||
#include "llvm/Transforms/Utils/ValueMapper.h"
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
void set_meta_data(llvm::Module *M) {
|
||||
M->setTargetTriple("x86_64-unknown-linux-gnu");
|
||||
M->setDataLayout(
|
||||
"e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128");
|
||||
}
|
||||
|
||||
// as pthread only accept a single void* for input
|
||||
// we have to decode this input inside the kernel
|
||||
void decode_input(llvm::Module *M) {
|
||||
|
||||
std::set<llvm::Function *> need_remove;
|
||||
|
||||
LLVMContext *C = &M->getContext();
|
||||
llvm::Type *Int32T = Type::getInt32Ty(*C);
|
||||
llvm::Type *Int8T = Type::getInt8Ty(*C);
|
||||
|
||||
llvm::FunctionType *LauncherFuncT = FunctionType::get(
|
||||
Type::getVoidTy(*C), {PointerType::get(Int8T, 0)}, false);
|
||||
|
||||
std::set<GlobalVariable *> dynmaic_memory;
|
||||
|
||||
std::map<GlobalVariable *, Value *> corres_dynamic_memory_load_address;
|
||||
|
||||
// generate Wrapper Function type
|
||||
// now we only support a single int32*
|
||||
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||
Function *F = &(*i);
|
||||
if (!isKernelFunction(M, F))
|
||||
continue;
|
||||
auto func_name = F->getName().str();
|
||||
// remove mangle prefix
|
||||
// remove _Z24
|
||||
for (int pos = 2; pos < func_name.length(); pos++) {
|
||||
if (func_name[pos] >= '0' && func_name[pos] <= '9')
|
||||
continue;
|
||||
func_name = func_name.substr(pos);
|
||||
break;
|
||||
}
|
||||
llvm::IRBuilder<> Builder(M->getContext());
|
||||
|
||||
FunctionCallee fc =
|
||||
M->getOrInsertFunction(func_name + "_wrapper", LauncherFuncT);
|
||||
Function *WorkGroup = dyn_cast<Function>(fc.getCallee());
|
||||
|
||||
BasicBlock *Block = BasicBlock::Create(M->getContext(), "", WorkGroup);
|
||||
Builder.SetInsertPoint(Block);
|
||||
|
||||
// WorkGroup has only a single input
|
||||
Function::arg_iterator ai = WorkGroup->arg_begin();
|
||||
|
||||
SmallVector<Value *, 8> Arguments;
|
||||
Value *input_arg = &*ai;
|
||||
// convert to int**
|
||||
input_arg = Builder.CreateBitOrPointerCast(
|
||||
input_arg, PointerType::get(PointerType::get(Int32T, 0), 0));
|
||||
|
||||
// dynamic memory load in the wrapper function
|
||||
GlobalVariable *share_memory = M->getGlobalVariable("wrapper_global_data");
|
||||
if (share_memory != NULL) {
|
||||
dynmaic_memory.insert(share_memory);
|
||||
llvm::GlobalVariable *global_mem = new llvm::GlobalVariable(
|
||||
*M, Int32T, false, llvm::GlobalValue::ExternalLinkage, NULL,
|
||||
"thread_memory_size", NULL, llvm::GlobalValue::GeneralDynamicTLSModel,
|
||||
0, false);
|
||||
Value *loadedValue = createLoad(Builder, global_mem);
|
||||
|
||||
llvm::FunctionType *LaunchFun2 = FunctionType::get(
|
||||
PointerType::get(PointerType::get(Int32T, 0), 0), NULL);
|
||||
|
||||
FunctionCallee fc2 =
|
||||
M->getOrInsertFunction("_wrapper_global_data", LaunchFun2);
|
||||
|
||||
Function *WorkGroup2 = dyn_cast<Function>(fc2.getCallee());
|
||||
|
||||
WorkGroup2->setLinkage(GlobalValue::WeakODRLinkage);
|
||||
WorkGroup2->setVisibility(GlobalValue::HiddenVisibility);
|
||||
Comdat *co = M->getOrInsertComdat("_wrapper_global_data");
|
||||
co->setSelectionKind(Comdat::SelectionKind::Any);
|
||||
WorkGroup2->setComdat(co);
|
||||
|
||||
BasicBlock *Block2 = BasicBlock::Create(M->getContext(), "", WorkGroup2);
|
||||
|
||||
llvm::IRBuilder<> Builder2(M->getContext());
|
||||
Builder2.SetInsertPoint(Block2);
|
||||
Builder2.CreateRet(share_memory);
|
||||
|
||||
auto PT = dyn_cast<PointerType>(share_memory->getType());
|
||||
auto element_type = PT->getElementType();
|
||||
|
||||
AllocaInst *new_arr = Builder.CreateAlloca(Int8T, loadedValue, "new_arr");
|
||||
Value *new_ar = new_arr;
|
||||
Value *gptr = Builder.CreateBitOrPointerCast(
|
||||
share_memory, PointerType::get(PointerType::get(Int8T, 0), 0));
|
||||
|
||||
Builder.CreateStore(new_ar, gptr);
|
||||
}
|
||||
|
||||
size_t idx = 0;
|
||||
for (Function::const_arg_iterator ii = F->arg_begin(), ee = F->arg_end();
|
||||
ii != ee; ++ii) {
|
||||
Type *ArgType = ii->getType();
|
||||
|
||||
// calculate addr
|
||||
Value *GEP = createGEP(Builder, input_arg, ConstantInt::get(Int32T, idx));
|
||||
// load corresponding int*
|
||||
GEP = createLoad(Builder, GEP);
|
||||
// bitcast
|
||||
GEP = Builder.CreateBitOrPointerCast(GEP, PointerType::get(ArgType, 0));
|
||||
Value *Arg = createLoad(Builder, GEP);
|
||||
Arguments.push_back(Arg);
|
||||
++idx;
|
||||
}
|
||||
|
||||
CallInst *c = Builder.CreateCall(F, ArrayRef<llvm::Value *>(Arguments));
|
||||
Builder.CreateRetVoid();
|
||||
}
|
||||
for (auto f : need_remove) {
|
||||
f->dropAllReferences();
|
||||
f->eraseFromParent();
|
||||
}
|
||||
}
|
||||
|
||||
void remove_barrier(llvm::Module *M) {
|
||||
std::vector<Instruction *> need_remove;
|
||||
for (auto F = M->begin(); F != M->end(); ++F)
|
||||
for (auto BB = F->begin(); BB != F->end(); ++BB) {
|
||||
for (auto BI = BB->begin(); BI != BB->end(); BI++) {
|
||||
if (auto Call = dyn_cast<CallInst>(BI)) {
|
||||
if (Call->isInlineAsm())
|
||||
continue;
|
||||
auto func_name = Call->getCalledFunction()->getName().str();
|
||||
if (func_name == "llvm.nvvm.bar.warp.sync" ||
|
||||
func_name == "llvm.nvvm.barrier0" ||
|
||||
func_name == "llvm.nvvm.barrier.sync") {
|
||||
need_remove.push_back(Call);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (auto inst : need_remove) {
|
||||
inst->eraseFromParent();
|
||||
}
|
||||
}
|
||||
|
||||
void remove_useless_var(llvm::Module *M) {
|
||||
M->getGlobalVariable("intra_warp_index")->eraseFromParent();
|
||||
M->getGlobalVariable("inter_warp_index")->eraseFromParent();
|
||||
}
|
||||
|
||||
void generate_x86_format(llvm::Module *M) {
|
||||
DEBUG_INFO("generate x86 format\n");
|
||||
// change metadata
|
||||
set_meta_data(M);
|
||||
// decode argument
|
||||
decode_input(M);
|
||||
// remove barrier
|
||||
remove_barrier(M);
|
||||
// remove useless func/variable
|
||||
remove_useless_var(M);
|
||||
}
|
|
@ -1,9 +1,9 @@
|
|||
cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
|
||||
project(
|
||||
X86runtime
|
||||
DESCRIPTION "Implementation CUDA runtime API with x86"
|
||||
CPUruntime
|
||||
DESCRIPTION "Implementation CUDA runtime API with CPUs"
|
||||
LANGUAGES CXX)
|
||||
set(LIB_NAME x86Runtime)
|
||||
set(LIB_NAME CPUruntime)
|
||||
set(CMAKE_VERBOSE_MAKEFILE ON)
|
||||
|
||||
# compile threadPool implementation
|
||||
|
@ -12,9 +12,9 @@ add_subdirectory(threadPool)
|
|||
# compile x86 runtime library
|
||||
include_directories(../common)
|
||||
include_directories(./include/)
|
||||
include_directories(./include/x86)
|
||||
include_directories(./include/cpu)
|
||||
include_directories(./threadPool/include/)
|
||||
include_directories(./threadPool/include/x86)
|
||||
include_directories(./threadPool/include/cpu)
|
||||
include_directories(../external/moodycamel/)
|
||||
file(GLOB proj_SOURCES "src/x86/*.cpp")
|
||||
file(GLOB proj_SOURCES "src/cpu/*.cpp")
|
||||
add_library(${LIB_NAME} SHARED ${proj_SOURCES})
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
#ifndef __RUNTIME_IMPL__
|
||||
#ifndef __KERNEL_IMPL__
|
||||
#define __KERNEL_IMPL__
|
||||
#include "structures.h"
|
||||
#include <stdint.h>
|
|
@ -10,27 +10,31 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
cudaError_t cudaGetDevice(int *devPtr) {
|
||||
*devPtr = 0;
|
||||
return cudaSuccess;
|
||||
}
|
||||
|
||||
const char *cudaGetErrorName(cudaError_t error) { return "SUCCESS\n"; }
|
||||
cudaError_t cudaDeviceReset(void) {
|
||||
scheduler_uninit();
|
||||
return cudaSuccess;
|
||||
}
|
||||
|
||||
cudaError_t cudaDeviceReset(void) { return cudaSuccess; }
|
||||
|
||||
cudaError_t cudaDeviceSynchronize(void) {
|
||||
cuSynchronizeBarrier();
|
||||
return cudaSuccess;
|
||||
}
|
||||
|
||||
cudaError_t cudaThreadSynchronize(void) {
|
||||
cuSynchronizeBarrier();
|
||||
return cudaSuccess;
|
||||
}
|
||||
|
||||
cudaError_t cudaFree(void *devPtr) {
|
||||
free(devPtr);
|
||||
return cudaSuccess;
|
||||
}
|
||||
|
||||
cudaError_t cudaFreeHost(void *devPtr) {
|
||||
free(devPtr);
|
||||
return cudaSuccess;
|
||||
|
@ -47,20 +51,22 @@ cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
|
|||
cu_kernel *ker =
|
||||
create_kernel(func, gridDim, blockDim, args, sharedMem, stream);
|
||||
|
||||
int lstatus = cuLaunchKernel(&ker);
|
||||
|
||||
cuLaunchKernel(&ker);
|
||||
return cudaSuccess;
|
||||
}
|
||||
|
||||
cudaError_t cudaMalloc(void **devPtr, size_t size) {
|
||||
*devPtr = malloc(size);
|
||||
if (devPtr == NULL)
|
||||
return cudaErrorMemoryAllocation;
|
||||
return cudaSuccess;
|
||||
}
|
||||
|
||||
cudaError_t cudaMemset(void *devPtr, int value, size_t count) {
|
||||
memset(devPtr, value, count);
|
||||
return cudaSuccess;
|
||||
}
|
||||
|
||||
cudaError_t cudaMemcpy(void *dst, const void *src, size_t count,
|
||||
cudaMemcpyKind kind) {
|
||||
if (kind == cudaMemcpyHostToHost) {
|
||||
|
@ -105,7 +111,6 @@ cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src) {
|
|||
return cudaSuccess;
|
||||
}
|
||||
|
||||
static int stream_counter = 1;
|
||||
/*
|
||||
From our evaluation, CPU backend can gain little benefit
|
||||
from multi stream. Thus, we only use single stream
|
||||
|
@ -159,6 +164,8 @@ static cudaError_t lastError = cudaSuccess;
|
|||
const char *cudaGetErrorString(cudaError_t error) {
|
||||
if (error == cudaSuccess) {
|
||||
return "Cuda Get Error Success";
|
||||
} else {
|
||||
return "Cuda Get Error Failed";
|
||||
}
|
||||
}
|
||||
|
|
@ -10,11 +10,10 @@ set(CMAKE_VERBOSE_MAKEFILE ON)
|
|||
set(LIB_NAME threadPool)
|
||||
|
||||
set(CMAKE_CXX_STANDARD 14)
|
||||
set(CMAKE_BUILD_TYPE Debug)
|
||||
include_directories(../../common)
|
||||
include_directories(./include)
|
||||
include_directories(./include/x86)
|
||||
include_directories(./include/cpu)
|
||||
include_directories(../../external/moodycamel)
|
||||
|
||||
file(GLOB proj_SOURCES "src/x86/*.cpp")
|
||||
file(GLOB proj_SOURCES "src/cpu/*.cpp")
|
||||
add_library(${LIB_NAME} SHARED ${proj_SOURCES})
|
||||
|
|
|
@ -1,3 +1,19 @@
|
|||
/*
|
||||
This file contains the implementation of the CPU thread pool. For a kernel
|
||||
launch, the host thread will enqueue the kernel to the kernelQueue, and the
|
||||
threads in the thread pool will try to fetch work from the queue. After a
|
||||
thread fetches a kernel from the queue, it will execute the kernel. After the
|
||||
kernel execution, the thread will try to fetch another kernel from the queue.
|
||||
If the queue is empty, the thread will wait for the next kernel launch.
|
||||
|
||||
By default, we try to use all CPU cores for execution. Thus, for a kernel
|
||||
launch, the host thread pushes P kernel variables to the queue, where P is the
|
||||
number of CPU cores.
|
||||
|
||||
For some lightweight kernels, useing fewer CPU cores can speed up the overall
|
||||
execution time, due to fewer CPU cores lead to lower synchronization overhead.
|
||||
*/
|
||||
|
||||
#include "api.h"
|
||||
#include "blockingconcurrentqueue.h"
|
||||
#include "debug.hpp"
|
||||
|
@ -9,9 +25,6 @@
|
|||
#include <stdlib.h>
|
||||
#include <thread>
|
||||
|
||||
/*
|
||||
Initialize the device
|
||||
*/
|
||||
int device_max_compute_units = 1;
|
||||
bool device_initilized = false;
|
||||
int init_device() {
|
||||
|
@ -32,7 +45,6 @@ int init_device() {
|
|||
}
|
||||
|
||||
// Create Kernel
|
||||
static int kernelIds = 0;
|
||||
cu_kernel *create_kernel(const void *func, dim3 gridDim, dim3 blockDim,
|
||||
void **args, size_t sharedMem, cudaStream_t stream) {
|
||||
cu_kernel *ker = (cu_kernel *)calloc(1, sizeof(cu_kernel));
|
||||
|
@ -71,13 +83,11 @@ __thread int warp_shfl[32] = {
|
|||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
};
|
||||
|
||||
/*
|
||||
Enqueue Kernel (k) to the scheduler kernelQueue
|
||||
*/
|
||||
int TaskToExecute;
|
||||
// Enqueue Kernel to the scheduler kernelQueue
|
||||
int schedulerEnqueueKernel(cu_kernel *k) {
|
||||
int totalBlocks =
|
||||
k->totalBlocks; // calculate gpu_block_to_execute_per_cpu_thread
|
||||
int totalBlocks = k->totalBlocks;
|
||||
// by default, all CPU cores are used to execute GPU blocks equally
|
||||
int gpuBlockToExecutePerCpuThread =
|
||||
(totalBlocks + device_max_compute_units - 1) / device_max_compute_units;
|
||||
TaskToExecute = (totalBlocks + gpuBlockToExecutePerCpuThread - 1) /
|
||||
|
@ -93,28 +103,21 @@ int schedulerEnqueueKernel(cu_kernel *k) {
|
|||
return C_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
Kernel Launch with numBlocks and numThreadsPerBlock
|
||||
*/
|
||||
// Push kernel to the kernelQueue
|
||||
int cuLaunchKernel(cu_kernel **k) {
|
||||
if (!device_initilized) {
|
||||
init_device();
|
||||
}
|
||||
// Calculate Block Size N/numBlocks
|
||||
cu_kernel *ker = *k;
|
||||
int status = C_RUN;
|
||||
// set complete to false, this variable is used for sync
|
||||
for (int i = 0; i < scheduler->num_worker_threads; i++) {
|
||||
scheduler->thread_pool[i].completeTask = 0;
|
||||
}
|
||||
schedulerEnqueueKernel(ker);
|
||||
schedulerEnqueueKernel(*k);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
Thread Gets Work
|
||||
*/
|
||||
// threads in thread-pool try to fetch work from the queue
|
||||
int get_work(c_thread *th) {
|
||||
int dynamic_shared_mem_size = 0;
|
||||
dim3 gridDim;
|
||||
|
@ -136,6 +139,7 @@ int get_work(c_thread *th) {
|
|||
grid_size_x = gridDim.x;
|
||||
grid_size_y = gridDim.y;
|
||||
grid_size_z = gridDim.z;
|
||||
// allocate dynamic shared memory
|
||||
if (dynamic_shared_mem_size > 0)
|
||||
dynamic_shared_memory = (int *)malloc(dynamic_shared_mem_size);
|
||||
// execute GPU blocks
|
||||
|
@ -153,7 +157,8 @@ int get_work(c_thread *th) {
|
|||
}
|
||||
// if cannot get tasks, check whether programs stop
|
||||
if (scheduler->threadpool_shutdown_requested) {
|
||||
return true; // thread exit
|
||||
// thread exit
|
||||
break;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
|
@ -176,9 +181,7 @@ void *driver_thread(void *p) {
|
|||
}
|
||||
}
|
||||
|
||||
/*
|
||||
Initialize the scheduler
|
||||
*/
|
||||
// Initialize the scheduler
|
||||
int scheduler_init(cu_device device) {
|
||||
scheduler = (cu_pool *)calloc(1, sizeof(cu_pool));
|
||||
scheduler->num_worker_threads = device.max_compute_units;
|
||||
|
@ -198,8 +201,6 @@ int scheduler_init(cu_device device) {
|
|||
return C_SUCCESS;
|
||||
}
|
||||
|
||||
void scheduler_uninit() { assert(0 && "Scheduler Unitit no Implemente\n"); }
|
||||
|
||||
/*
|
||||
Barrier for Kernel Launch
|
||||
*/
|
|
@ -30,7 +30,7 @@ g++ -o $1 -fPIC -no-pie \
|
|||
$HeteroMark_PATH/src/$1/cuda/main.cc host.o kernel.o $HeteroMark_PATH/src/$1/*.cc $HeteroMark_PATH/src/common/benchmark/*.cc \
|
||||
$HeteroMark_PATH/src/common/command_line_option/*.cc $HeteroMark_PATH/src/common/time_measurement/*.cc \
|
||||
-L$CuPBoP_BUILD_PATH/runtime -L$CuPBoP_BUILD_PATH/runtime/threadPool \
|
||||
-I$HeteroMark_PATH -I$CUDA_PATH/include -lpthread -lc -lx86Runtime -lthreadPool
|
||||
-I$HeteroMark_PATH -I$CUDA_PATH/include -lpthread -lc -lCPUruntime -lthreadPool
|
||||
|
||||
case $1 in
|
||||
aes)
|
||||
|
|
Loading…
Reference in New Issue