add codebase for TACO submission
This commit is contained in:
parent
897af29748
commit
f8e72916c1
|
@ -39,3 +39,4 @@ set(GCC_COVERAGE_LINK_FLAGS
|
|||
"-L${LLVM_LIB_PATH} ${LLVM_LINK_FLAG} -lz -lrt -ldl -ltinfo -lpthread -lm")
|
||||
|
||||
add_subdirectory(compilation)
|
||||
add_subdirectory(runtime)
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# Contributing to CuPBoP
|
||||
# Contributing to COX
|
||||
|
||||
Thank you for your interest in contributing to CuPBoP!
|
||||
Thank you for your interest in contributing to COX!
|
||||
We appreciate all contributions, including but not limited to:
|
||||
|
||||
- Add documentation
|
||||
|
@ -10,9 +10,9 @@ We appreciate all contributions, including but not limited to:
|
|||
## How to contribute?
|
||||
|
||||
0. (Optional) Open an issue and discuss your idea before you start
|
||||
1. Fork the latest version of CuPBoP
|
||||
1. Fork the latest version of COX
|
||||
2. Commit to the forked repo
|
||||
3. Create a Pull Request to CuPBoP main branch
|
||||
3. Create a Pull Request to COX main branch
|
||||
|
||||
## Code style
|
||||
|
||||
|
@ -21,14 +21,13 @@ To make sure your contribution is following the correct style,
|
|||
we highly recommend that you install [pre-commit](https://pre-commit.com/) before development.
|
||||
|
||||
```bash
|
||||
# Python3 environment is required
|
||||
# Python environment is required
|
||||
pip install pre-commit
|
||||
```
|
||||
|
||||
Then, from the repository folder, execute the following instruction:
|
||||
|
||||
```bash
|
||||
# execute in CuPBoP's root folder
|
||||
pre-commit install
|
||||
```
|
||||
|
22
README.md
22
README.md
|
@ -1,10 +1,10 @@
|
|||
# CuPBoP: Cuda for Parallelized and Broad-range Processors
|
||||
# COX: CUDA on X86
|
||||
|
||||
## Introduction
|
||||
|
||||
CuPBoP (Cuda for parallelized and broad-range processors) is a framework
|
||||
that aims to execute CUDA source code on non-NVIDIA devices,
|
||||
including CPU, GPU and other architectures.
|
||||
This project consists of two parts: a series of LLVM passes that
|
||||
take SPMD NVVM IR as input, and output the corresponding
|
||||
MPMD+SIMD version of LLVM IR, which can be executed on CPU devices.
|
||||
|
||||
## Install
|
||||
|
||||
|
@ -22,8 +22,8 @@ including CPU, GPU and other architectures.
|
|||
1. Clone from github
|
||||
|
||||
```bash
|
||||
git clone https://github.com/cupbop/CuPBoP
|
||||
cd CuPBoP
|
||||
git clone https://github.com/drcut/open_source_template
|
||||
cd open_source_template
|
||||
```
|
||||
|
||||
2. Build the transformer for NVVM IR to LLVM IR for X86
|
||||
|
@ -55,12 +55,8 @@ g++ ../compilation/examples/vecadd/host.cpp \
|
|||
./vecadd_example
|
||||
```
|
||||
|
||||
## Contribution
|
||||
|
||||
We sincerely appreciate all kinds of contributions.
|
||||
Please refer to [CONTRIBUTING](docs/CONTRIBUTING.md) for the contributing guideline.
|
||||
|
||||
## Author
|
||||
|
||||
* [Ruobing Han](https://drcut.github.io/)
|
||||
* [Hyesoon Kim](https://www.cc.gatech.edu/~hyesoon/)
|
||||
[Ruobing Han](https://drcut.github.io/) is a CS PhD student at
|
||||
Georgia Institute of Technology, under the supervision
|
||||
of Prof. [Hyesoon Kim](https://www.cc.gatech.edu/~hyesoon/).
|
||||
|
|
|
@ -1,25 +1,43 @@
|
|||
#include "ReplaceKernelLaunch.h"
|
||||
#include "RemoveCudaBuiltin.h"
|
||||
#include "ReplaceConstantMemory.h"
|
||||
#include "ReplaceCudaBuiltin.h"
|
||||
#include "ReplaceKernelArgs.h"
|
||||
#include "tool.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/IR/Verifier.h"
|
||||
#include <assert.h>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <stdlib.h>
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
std::string PATH = "kernel_meta.log";
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
assert(argc == 3 && "incorrect number of arguments\n");
|
||||
|
||||
char *input_host_path = argv[1];
|
||||
char *output_host_path = argv[2];
|
||||
|
||||
std::ifstream fin;
|
||||
fin.open(PATH);
|
||||
|
||||
// load LLVM module(s)
|
||||
llvm::Module *hostModule = LoadModuleFromFilr(input_host_path);
|
||||
VerifyModule(hostModule);
|
||||
// replace const memory
|
||||
ReplaceConstantMemory(hostModule, fin);
|
||||
// process host module
|
||||
ReplaceKernelLaunch(hostModule);
|
||||
ReplaceCudaBuiltin(hostModule);
|
||||
// remove builtin unuse functions and variables
|
||||
RemoveCudaBuiltin(hostModule);
|
||||
// replace arguments in kernel_arg, from alloc to malloc
|
||||
ReplaceKernelArg(hostModule);
|
||||
|
||||
VerifyModule(hostModule);
|
||||
DumpModule(hostModule, output_host_path);
|
||||
|
||||
fin.close();
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
#ifndef __NVVM2x86_REPLACE_KERNEL_LAUNCH__
|
||||
#define __NVVM2x86_REPLACE_KERNEL_LAUNCH__
|
||||
#ifndef __NVVM2x86_REMOVE_CUDABUILTIN__
|
||||
#define __NVVM2x86_REMOVE_CUDABUILTIN__
|
||||
|
||||
#include "llvm/IR/Module.h"
|
||||
/*
|
||||
* Change to i8* bitcast (i8* (i8*)* @_Z9vecPKiS0_Pii_wrapper to i8*)
|
||||
* Original: i8* bitcast (void (i32*, i32*, i32*, i32)* @_Z9vecPKiS0_Pii to i8*)
|
||||
*/
|
||||
void ReplaceKernelLaunch(llvm::Module *M);
|
||||
void RemoveCudaBuiltin(llvm::Module *M);
|
||||
|
||||
#endif
|
|
@ -0,0 +1,12 @@
|
|||
#ifndef __NVVM2x86_REPLACE_CONSTANT_MEMORY__
|
||||
#define __NVVM2x86_REPLACE_CONSTANT_MEMORY__
|
||||
|
||||
#include "llvm/IR/Module.h"
|
||||
#include <fstream>
|
||||
/*
|
||||
* From: @ff_variable = internal global [5 x float] undef, align 16
|
||||
* To: @wrapper_global_ff_variable = common global [5 x float] zeroinitializer
|
||||
*/
|
||||
void ReplaceConstantMemory(llvm::Module *M, std::ifstream &fin);
|
||||
|
||||
#endif
|
|
@ -0,0 +1,11 @@
|
|||
#ifndef __NVVM2x86_REPLACE_CUDA_BUILTIN__
|
||||
#define __NVVM2x86_REPLACE_CUDA_BUILTIN__
|
||||
|
||||
#include "llvm/IR/Module.h"
|
||||
/*
|
||||
* Rewrite the CUDA runtime usage of a host module. The implementation runs
* InsertSyncAfterKernelLaunch, ReplaceKernelLaunch and ReplaceMemcpyToSymbol,
* in that order.
*
* Example of the kernel-launch operand rewrite:
* Change to i8* bitcast (i8* (i8*)* @_Z9vecPKiS0_Pii_wrapper to i8*)
|
||||
* Original: i8* bitcast (void (i32*, i32*, i32*, i32)* @_Z9vecPKiS0_Pii to i8*)
|
||||
*/
|
||||
void ReplaceCudaBuiltin(llvm::Module *M);
|
||||
|
||||
#endif
|
|
@ -0,0 +1,14 @@
|
|||
#ifndef __NVVM2x86_REPLACE_KERNEL_ARGS__
|
||||
#define __NVVM2x86_REPLACE_KERNEL_ARGS__
|
||||
|
||||
#include "llvm/IR/Module.h"
|
||||
/*
|
||||
* before:
|
||||
* %m_cuda.addr = alloca float*, align 8
|
||||
* after:
|
||||
* %m_cuda.addr_tmp = call i8* @malloc(i64 8)
|
||||
* %m_cuda.addr = bitcast i8* %m_cuda.addr_tmp to float**
|
||||
*/
|
||||
void ReplaceKernelArg(llvm::Module *M);
|
||||
|
||||
#endif
|
|
@ -0,0 +1,7 @@
|
|||
/**
|
||||
* Generate a file for Cuda Kernel Function Attributes
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*/
|
|
@ -0,0 +1,6 @@
|
|||
/*
|
||||
|
||||
Initialize the cudaDevice as first statements if not set by the User
|
||||
(cudaSetDevice)
|
||||
|
||||
*/
|
|
@ -0,0 +1,59 @@
|
|||
/**
|
||||
* Remove Clang cuda builtin functions and variables
|
||||
*/
|
||||
#include "RemoveCudaBuiltin.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/GlobalValue.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/LLVMContext.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/Support/ToolOutputFile.h"
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <set>
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
/**
 * Strip the CUDA registration scaffolding from a host module.
 *
 * Erases the llvm.global_ctors variable and then every function from the
 * list below that is present in the module.
 *
 * NOTE(review): llvm.global_ctors is erased as a whole, so any entries it
 * holds besides the CUDA module constructor are dropped too - confirm that
 * this is intended.
 */
void RemoveCudaBuiltin(llvm::Module *M) {
  if (GlobalVariable *global_ctors =
          M->getGlobalVariable("llvm.global_ctors")) {
    global_ctors->dropAllReferences();
    global_ctors->eraseFromParent();
  }

  // The three module-level helpers come first, so their bodies are gone
  // before the runtime entry points listed after them are erased.
  static const char *const builtin_names[] = {
      "__cuda_module_ctor",         "__cuda_module_dtor",
      "__cuda_register_globals",    "__cudaRegisterFunction",
      "__cudaRegisterVar",          "__cudaRegisterFatBinary",
      "__cudaRegisterFatBinaryEnd", "__cudaUnregisterFatBinary"};

  for (const char *name : builtin_names) {
    Function *builtin = M->getFunction(name);
    if (!builtin) {
      continue;
    }
    builtin->dropAllReferences();
    builtin->eraseFromParent();
  }
}
|
|
@ -0,0 +1,93 @@
|
|||
#include "ReplaceConstantMemory.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/GlobalValue.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/LLVMContext.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include <assert.h>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <set>
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
// Split one mapping line into the constant name (text before the first
// space) and the global name (the remainder with its first three characters
// skipped, as the original parser did - presumably a "to " separator; confirm
// against the code that writes the log). Returns false for lines that do not
// have this shape.
static bool ParseConstantMapping(const std::string &line,
                                 std::string &constant_name,
                                 std::string &global_name) {
  const size_t space = line.find(' ');
  if (space == std::string::npos) {
    return false;
  }
  const std::string rest = line.substr(space + 1);
  if (rest.size() <= 3) {
    return false;
  }
  constant_name = line.substr(0, space);
  global_name = rest.substr(3);
  return true;
}

/**
 * Replace constant-memory globals of the host module by zero-initialized
 * globals with common linkage.
 *
 * @param M   module to rewrite.
 * @param fin stream that is scanned for a line containing
 *            "ConstMemory2GlobalMemory"; the following lines, up to one
 *            containing "END", each map a constant name to a global name.
 *
 * For every global variable whose name is a key of that mapping and whose
 * value type is an array or a struct, a new global with the mapped name is
 * created, all uses are redirected to it and the old variable is erased.
 * Malformed mapping lines are reported on stderr and skipped.
 */
void ReplaceConstantMemory(llvm::Module *M, std::ifstream &fin) {
  std::string line;
  bool find_constant_memory = false;
  while (getline(fin, line)) {
    if (line.find("ConstMemory2GlobalMemory") != std::string::npos) {
      find_constant_memory = true;
      break;
    }
  }
  if (!find_constant_memory) {
    assert(0 && "Do not find constant to global mapping\n");
    return; // nothing to do when assertions are compiled out
  }

  std::map<std::string, std::string> corresponding_global_memory;
  while (getline(fin, line)) {
    if (line.find("END") != std::string::npos) {
      break;
    }
    std::string constant_name;
    std::string global_name;
    if (!ParseConstantMapping(line, constant_name, global_name)) {
      std::cerr << "ReplaceConstantMemory: skip malformed mapping line: "
                << line << "\n";
      continue;
    }
    corresponding_global_memory.insert(
        std::pair<std::string, std::string>(constant_name, global_name));
  }

  std::set<llvm::GlobalVariable *> need_remove_constant_memory;
  // New globals are appended to the module's list while it is walked; they
  // are visited too but only matter if their name is itself a mapping key.
  for (llvm::GlobalVariable &constant_memory : M->globals()) {
    auto mapping =
        corresponding_global_memory.find(constant_memory.getName().str());
    if (mapping == corresponding_global_memory.end()) {
      continue;
    }

    llvm::Type *value_type = constant_memory.getValueType();
    if (!value_type->isArrayTy() && !value_type->isStructTy()) {
      assert(0 && "The required Constant Memory Type is not supported\n");
      // Leave the variable in place: it still has users that were not
      // redirected, so it must not be queued for erasure.
      continue;
    }

    // generate the corresponding global memory variable
    llvm::GlobalVariable *global_memory = new llvm::GlobalVariable(
        *M, value_type, false, llvm::GlobalValue::CommonLinkage, nullptr,
        mapping->second, nullptr, llvm::GlobalValue::NotThreadLocal, 0);
    global_memory->setInitializer(
        llvm::ConstantAggregateZero::get(value_type));

    constant_memory.replaceAllUsesWith(llvm::ConstantExpr::getPointerCast(
        global_memory, constant_memory.getType()));
    need_remove_constant_memory.insert(&constant_memory);
  }

  for (llvm::GlobalVariable *stale : need_remove_constant_memory) {
    stale->dropAllReferences();
    stale->eraseFromParent();
  }
}
|
|
@ -0,0 +1,292 @@
|
|||
#include "ReplaceCudaBuiltin.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/GlobalValue.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/LLVMContext.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/Support/ToolOutputFile.h"
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <set>
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
/*
|
||||
insert sync after cudaKernel launch
|
||||
call void @_Z13staticReversePii(i32* %55, i32 64)
|
||||
%57 = call i32 @cudaDeviceSynchronize()
|
||||
*/
|
||||
void InsertSyncAfterKernelLaunch(llvm::Module *M) {
|
||||
LLVMContext *C = &M->getContext();
|
||||
|
||||
llvm::Type *Int32T = Type::getInt32Ty(*C);
|
||||
llvm::FunctionType *LauncherFuncT = FunctionType::get(Int32T, NULL);
|
||||
llvm::FunctionCallee _f =
|
||||
M->getOrInsertFunction("cudaDeviceSynchronize", LauncherFuncT);
|
||||
llvm::Function *func_launch = llvm::cast<llvm::Function>(_f.getCallee());
|
||||
std::set<std::string> launch_function_name;
|
||||
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||
Function *F = &(*i);
|
||||
auto func_name = F->getName().str();
|
||||
|
||||
for (Function::iterator b = F->begin(); b != F->end(); ++b) {
|
||||
BasicBlock *B = &(*b);
|
||||
|
||||
for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) {
|
||||
Instruction *inst = &(*i);
|
||||
if (llvm::CallBase *callInst = llvm::dyn_cast<llvm::CallBase>(inst)) {
|
||||
if (Function *calledFunction = callInst->getCalledFunction()) {
|
||||
if (calledFunction->getName().startswith("cudaLaunchKernel")) {
|
||||
// F is a kernel launch function
|
||||
launch_function_name.insert(func_name);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||
Function *F = &(*i);
|
||||
for (Function::iterator b = F->begin(); b != F->end(); ++b) {
|
||||
BasicBlock *B = &(*b);
|
||||
|
||||
for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) {
|
||||
Instruction *inst = &(*i);
|
||||
if (llvm::CallBase *callInst = llvm::dyn_cast<llvm::CallBase>(inst)) {
|
||||
if (Function *calledFunction = callInst->getCalledFunction()) {
|
||||
if (launch_function_name.find(calledFunction->getName().str()) !=
|
||||
launch_function_name.end()) {
|
||||
// insert a sync after launch
|
||||
if (callInst->getNextNonDebugInstruction()) {
|
||||
llvm::CallInst::Create(func_launch, "",
|
||||
callInst->getNextNonDebugInstruction());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Change to i8* bitcast (i8* (i8*)* @_Z9vecPKiS0_Pii_wrapper to i8*)
|
||||
// Original: i8* bitcast (void (i32*, i32*, i32*, i32)* @_Z9vecPKiS0_Pii to i8*)
|
||||
void ReplaceKernelLaunch(llvm::Module *M) {
|
||||
LLVMContext &context = M->getContext();
|
||||
auto VoidTy = llvm::Type::getVoidTy(context);
|
||||
auto I8 = llvm::Type::getInt8PtrTy(context);
|
||||
std::map<std::string, Function *> kernels;
|
||||
|
||||
std::set<llvm::Function *> need_remove;
|
||||
LLVMContext *C = &M->getContext();
|
||||
|
||||
llvm::Type *Int32T = Type::getInt32Ty(*C);
|
||||
llvm::Type *Int8T = Type::getInt8Ty(*C);
|
||||
|
||||
llvm::FunctionType *LauncherFuncT =
|
||||
FunctionType::get(Type::getVoidTy(*C), NULL);
|
||||
|
||||
llvm::FunctionType *LaunchFun2 =
|
||||
FunctionType::get(PointerType::get(PointerType::get(Int32T, 0), 0), NULL);
|
||||
|
||||
bool done = false;
|
||||
|
||||
std::set<std::string> cuda_register_kernel_names;
|
||||
|
||||
std::string str;
|
||||
llvm::raw_string_ostream ss(str);
|
||||
|
||||
/*
|
||||
|
||||
When using << >>, clang generates cudaPushCallConfiguration with the same
|
||||
function definition as the kernel definition in the kernel bitcode
|
||||
|
||||
define internal void @__cuda_register_globals(i8** %0) {
|
||||
entry:
|
||||
%1 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void (float*,
|
||||
float*, i32, i32)* @_Z4Fan1PfS_ii to i8*), i8* getelementptr inbounds ([14 x
|
||||
i8], [14 x i8]* @0, i64 0, i64 0), i8* getelementptr inbounds ([14 x i8], [14
|
||||
x i8]* @0, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32*
|
||||
null) %2 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void
|
||||
(float*, float*, float*, i32, i32, i32)* @_Z4Fan2PfS_S_iii to i8*), i8*
|
||||
getelementptr inbounds ([17 x i8], [17 x i8]* @1, i64 0, i64 0), i8*
|
||||
getelementptr inbounds ([17 x i8], [17 x i8]* @1, i64 0, i64 0), i32 -1, i8*
|
||||
null, i8* null, i8* null, i8* null, i32* null) ret void
|
||||
}
|
||||
|
||||
*/
|
||||
Function *f_register_global = M->getFunction("__cuda_register_globals");
|
||||
if (f_register_global) {
|
||||
for (Function::iterator b = f_register_global->begin();
|
||||
b != f_register_global->end(); ++b) {
|
||||
BasicBlock *B = &(*b);
|
||||
for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) {
|
||||
Instruction *inst = &(*i);
|
||||
if (llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(inst)) {
|
||||
if (Function *calledFunction = callInst->getCalledFunction()) {
|
||||
if (calledFunction->getName().str() == "__cudaRegisterFunction") {
|
||||
Value *callOperand = callInst->getArgOperand(1);
|
||||
|
||||
Function *functionOperand =
|
||||
dyn_cast<Function>(callInst->getArgOperand(1));
|
||||
|
||||
// call function is wrapped in a bitcast
|
||||
if (functionOperand == NULL) {
|
||||
|
||||
std::vector<size_t> arg_sizes;
|
||||
functionOperand =
|
||||
dyn_cast<Function>(callOperand->stripPointerCasts());
|
||||
|
||||
cuda_register_kernel_names.insert(
|
||||
functionOperand->getName().str());
|
||||
std::cout << "Cuda Register Global Kernel: "
|
||||
<< functionOperand->getName().str() << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
bool host_changed = false;
|
||||
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||
Function *F = &(*i);
|
||||
auto func_name = F->getName().str();
|
||||
|
||||
for (Function::iterator b = F->begin(); b != F->end(); ++b) {
|
||||
BasicBlock *B = &(*b);
|
||||
|
||||
for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) {
|
||||
Instruction *inst = &(*i);
|
||||
|
||||
if (llvm::CallBase *callInst = llvm::dyn_cast<llvm::CallBase>(inst)) {
|
||||
if (Function *calledFunction = callInst->getCalledFunction()) {
|
||||
|
||||
if (calledFunction->getName().startswith("cudaLaunchKernel")) {
|
||||
|
||||
Value *callOperand = callInst->getArgOperand(0);
|
||||
|
||||
Function *functionOperand =
|
||||
dyn_cast<Function>(callInst->getArgOperand(0));
|
||||
|
||||
// call function is wrapped in a bitcast
|
||||
if (functionOperand == NULL) {
|
||||
|
||||
std::vector<size_t> arg_sizes;
|
||||
functionOperand =
|
||||
dyn_cast<Function>(callOperand->stripPointerCasts());
|
||||
|
||||
FunctionType *ft = calledFunction->getFunctionType();
|
||||
std::cout << " Parent (Caller) Function Name: " << func_name
|
||||
<< ", cudaLaunchKernel Function: "
|
||||
<< functionOperand->getName().str() << ", args "
|
||||
<< functionOperand->arg_size() << std::endl;
|
||||
auto rep = kernels.find(functionOperand->getName().str());
|
||||
if (rep != kernels.end()) {
|
||||
Function *FC = rep->second;
|
||||
BitCastInst *B = new BitCastInst(FC, I8, "", callInst);
|
||||
callInst->setArgOperand(0, B);
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
std::vector<Type *> Params;
|
||||
Params.push_back(I8);
|
||||
FunctionType *FT = FunctionType::get(VoidTy, Params, false);
|
||||
|
||||
/*
|
||||
Because of the TODO in the 2nd if statement, need to get the
|
||||
prior name before _host is add
|
||||
*/
|
||||
std::string oldName = functionOperand->getName().str();
|
||||
|
||||
// if parent function is __host and same as the cudaKernelLaunch
|
||||
std::string newName = oldName + "_wrapper";
|
||||
if (func_name == oldName && host_changed &&
|
||||
oldName.find("_host") != std::string::npos) {
|
||||
newName =
|
||||
oldName.substr(0, oldName.length() - 5) + "_wrapper";
|
||||
}
|
||||
std::cout << "Change Kernel Name to: " << newName << std::endl;
|
||||
|
||||
Function *F =
|
||||
Function::Create(FT, Function::ExternalLinkage, newName, M);
|
||||
F->setDSOLocal(true);
|
||||
F->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
|
||||
|
||||
BitCastInst *BC = new BitCastInst(F, I8, "", callInst);
|
||||
callInst->setArgOperand(0, BC);
|
||||
kernels.insert({functionOperand->getName().str(), F});
|
||||
}
|
||||
} else if (cuda_register_kernel_names.find(
|
||||
calledFunction->getName()) !=
|
||||
cuda_register_kernel_names.end()) {
|
||||
// if the called function collides with kernel definiton
|
||||
// TODO: some reason changes all occurences of the function name
|
||||
// for both cudaKernelLaunch calls and regular function call
|
||||
// errs() << *inst;
|
||||
host_changed = true;
|
||||
calledFunction->setName(calledFunction->getName() + "_host");
|
||||
std::cout << std::endl;
|
||||
std::cout << "Change Host Function Name To: "
|
||||
<< calledFunction->getName().str() << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Redirect every direct call to cudaMemcpyToSymbol to
 * cudaMemcpyToSymbol_host.
 *
 * For each such call instruction a new call with the same five arguments is
 * inserted in front of it, the uses of the old result are moved to the new
 * call, and the old call is erased once the walk over the module is done.
 */
void ReplaceMemcpyToSymbol(llvm::Module *M) {
  LLVMContext &context = M->getContext();
  auto I32 = llvm::Type::getInt32Ty(context);
  std::vector<llvm::Instruction *> need_remove;

  for (Function &function : *M) {
    for (BasicBlock &block : function) {
      for (Instruction &instruction : block) {
        auto *Call = dyn_cast<CallInst>(&instruction);
        if (!Call || !Call->getCalledFunction()) {
          continue;
        }
        if (Call->getCalledFunction()->getName().str() !=
            "cudaMemcpyToSymbol") {
          continue;
        }

        // i32 @cudaMemcpyToSymbol(i8* %1, i8* %2, i64 %3, i64 %4, i32 %5)
        llvm::Type *byte_ptr = llvm::Type::getInt8PtrTy(context);
        llvm::Type *int64 = llvm::Type::getInt64Ty(context);
        llvm::Type *int32 = llvm::Type::getInt32Ty(context);
        std::vector<llvm::Type *> param_types;
        param_types.push_back(byte_ptr);
        param_types.push_back(byte_ptr);
        param_types.push_back(int64);
        param_types.push_back(int64);
        param_types.push_back(int32);
        llvm::FunctionType *host_type =
            FunctionType::get(I32, param_types, false);

        llvm::FunctionCallee host_callee =
            M->getOrInsertFunction("cudaMemcpyToSymbol_host", host_type);
        llvm::Function *host_func =
            llvm::cast<llvm::Function>(host_callee.getCallee());

        // forward the original arguments unchanged
        std::vector<Value *> forwarded;
        for (unsigned index = 0; index < 5; ++index) {
          forwarded.push_back(Call->getArgOperand(index));
        }

        auto replacement =
            llvm::CallInst::Create(host_func, forwarded, "", Call);
        Call->replaceAllUsesWith(replacement);
        need_remove.push_back(Call);
      }
    }
  }

  for (llvm::Instruction *stale : need_remove) {
    stale->eraseFromParent();
  }
}
|
||||
/**
 * Run the host-side CUDA rewriting steps on a module, in this order:
 * insert a device sync after kernel launches, redirect kernel launches to
 * their wrapper functions, and replace cudaMemcpyToSymbol calls.
 */
void ReplaceCudaBuiltin(llvm::Module *M) {
  // step 1: synchronize after every kernel launch
  InsertSyncAfterKernelLaunch(M);

  // step 2: point cudaLaunchKernel at the "_wrapper" functions
  ReplaceKernelLaunch(M);

  // step 3: cudaMemcpyToSymbol -> cudaMemcpyToSymbol_host
  ReplaceMemcpyToSymbol(M);
}
|
|
@ -0,0 +1,90 @@
|
|||
#include "ReplaceKernelArgs.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/GlobalValue.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/LLVMContext.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/Support/ToolOutputFile.h"
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <set>
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
/*
|
||||
* before:
|
||||
* %m_cuda.addr = alloca float*, align 8
|
||||
* after:
|
||||
* %m_cuda.addr_tmp = call i8* @malloc(i64 8)
|
||||
* %m_cuda.addr = bitcast i8* %m_cuda.addr_tmp to float**
|
||||
*/
|
||||
// TODO: we use hard-code to implement this replacement,
|
||||
// to use use-analysis to find the arguments in the future
|
||||
void ReplaceKernelArg(llvm::Module *M) {
|
||||
LLVMContext &context = M->getContext();
|
||||
auto VoidTy = llvm::Type::getVoidTy(context);
|
||||
auto I8 = llvm::Type::getInt8PtrTy(context);
|
||||
std::map<std::string, Function *> kernels;
|
||||
|
||||
std::set<llvm::Function *> need_replace;
|
||||
LLVMContext *C = &M->getContext();
|
||||
|
||||
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||
Function *F = &(*i);
|
||||
for (Function::iterator b = F->begin(); b != F->end(); ++b) {
|
||||
BasicBlock *B = &(*b);
|
||||
for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) {
|
||||
Instruction *inst = &(*i);
|
||||
if (llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(inst)) {
|
||||
if (Function *calledFunction = callInst->getCalledFunction()) {
|
||||
if (calledFunction->getName().startswith("cudaLaunchKernel")) {
|
||||
need_replace.insert(F);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// find/create C's malloc function
|
||||
std::vector<llvm::Type *> args;
|
||||
args.push_back(llvm::Type::getInt8PtrTy(context));
|
||||
llvm::FunctionType *mallocFuncType =
|
||||
FunctionType::get(llvm::Type::getInt8PtrTy(context),
|
||||
{llvm::Type::getInt64Ty(context)}, false);
|
||||
|
||||
llvm::FunctionCallee _f = M->getOrInsertFunction("malloc", mallocFuncType);
|
||||
llvm::Function *func_malloc = llvm::cast<llvm::Function>(_f.getCallee());
|
||||
|
||||
for (auto F : need_replace) {
|
||||
std::set<const llvm::Value *> args_set;
|
||||
int arg_cnt = 0;
|
||||
for (Function::const_arg_iterator ii = F->arg_begin(), ee = F->arg_end();
|
||||
ii != ee; ++ii) {
|
||||
args_set.insert(&(*ii));
|
||||
arg_cnt++;
|
||||
}
|
||||
std::vector<llvm::Instruction *> need_remove;
|
||||
for (Function::iterator b = F->begin(); b != F->end(); ++b) {
|
||||
BasicBlock *B = &(*b);
|
||||
for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) {
|
||||
Instruction *inst = &(*i);
|
||||
if (llvm::AllocaInst *alloc = llvm::dyn_cast<llvm::AllocaInst>(inst)) {
|
||||
// just replace all alloc in that function
|
||||
auto c_malloc_inst = llvm::CallInst::Create(
|
||||
func_malloc,
|
||||
ConstantInt::get(llvm::Type::getInt64Ty(context), 256), "",
|
||||
alloc);
|
||||
auto bit_cast = new BitCastInst(c_malloc_inst, alloc->getType(),
|
||||
alloc->getName().str(), alloc);
|
||||
alloc->replaceAllUsesWith(bit_cast);
|
||||
need_remove.push_back(alloc);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (auto inst : need_remove) {
|
||||
inst->eraseFromParent();
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,94 +0,0 @@
|
|||
#include "ReplaceKernelLaunch.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/GlobalValue.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/LLVMContext.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <set>
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
// Change to i8* bitcast (i8* (i8*)* @_Z9vecPKiS0_Pii_wrapper to i8*)
|
||||
// Original: i8* bitcast (void (i32*, i32*, i32*, i32)* @_Z9vecPKiS0_Pii to i8*)
|
||||
void ReplaceKernelLaunch(llvm::Module *M) {
|
||||
LLVMContext &context = M->getContext();
|
||||
auto VoidTy = llvm::Type::getVoidTy(context);
|
||||
auto I8 = llvm::Type::getInt8PtrTy(context);
|
||||
std::map<std::string, BitCastInst *> kernels;
|
||||
|
||||
LLVMContext *C = &M->getContext();
|
||||
|
||||
llvm::Type *Int32T = Type::getInt32Ty(*C);
|
||||
llvm::Type *Int8T = Type::getInt8Ty(*C);
|
||||
|
||||
llvm::FunctionType *LauncherFuncT =
|
||||
FunctionType::get(Type::getVoidTy(*C), NULL);
|
||||
|
||||
llvm::FunctionType *LaunchFun2 =
|
||||
FunctionType::get(PointerType::get(PointerType::get(Int32T, 0), 0), NULL);
|
||||
|
||||
bool done = false;
|
||||
|
||||
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||
Function *F = &(*i);
|
||||
auto func_name = F->getName().str();
|
||||
|
||||
for (Function::iterator b = F->begin(); b != F->end(); ++b) {
|
||||
BasicBlock *B = &(*b);
|
||||
|
||||
for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) {
|
||||
Instruction *inst = &(*i);
|
||||
|
||||
if (llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(inst)) {
|
||||
if (Function *calledFunction = callInst->getCalledFunction()) {
|
||||
|
||||
if (calledFunction->getName().startswith("cudaLaunchKernel")) {
|
||||
|
||||
Value *callOperand = callInst->getArgOperand(0);
|
||||
|
||||
Function *functionOperand =
|
||||
dyn_cast<Function>(callInst->getArgOperand(0));
|
||||
|
||||
// call function is wrapped in a bitcast
|
||||
if (functionOperand == NULL) {
|
||||
|
||||
std::vector<size_t> arg_sizes;
|
||||
functionOperand =
|
||||
dyn_cast<Function>(callOperand->stripPointerCasts());
|
||||
|
||||
FunctionType *ft = calledFunction->getFunctionType();
|
||||
std::cout << " Parent (Caller) Function Name: " << func_name
|
||||
<< ", cudaLaunchKernel Function: "
|
||||
<< functionOperand->getName().str() << ", args "
|
||||
<< functionOperand->arg_size() << std::endl;
|
||||
auto rep = kernels.find(functionOperand->getName().str());
|
||||
if (rep != kernels.end()) {
|
||||
|
||||
callInst->setArgOperand(0, rep->second);
|
||||
continue;
|
||||
}
|
||||
|
||||
std::vector<Type *> Params;
|
||||
Params.push_back(I8);
|
||||
FunctionType *FT = FunctionType::get(VoidTy, Params, false);
|
||||
std::string newName =
|
||||
functionOperand->getName().str() + "_wrapper";
|
||||
|
||||
Function *F =
|
||||
Function::Create(FT, Function::ExternalLinkage, newName, M);
|
||||
F->setDSOLocal(true);
|
||||
|
||||
BitCastInst *BC = new BitCastInst(F, I8, "", callInst);
|
||||
callInst->setArgOperand(0, BC);
|
||||
kernels.insert({functionOperand->getName().str(), BC});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -8,46 +8,66 @@
|
|||
#include "warp_func.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include <assert.h>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <llvm/Support/raw_ostream.h>
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <stdlib.h>
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
std::string PATH = "kernel_meta.log";
|
||||
|
||||
// Kernel translator entry point: loads the module named by argv[1], runs the
// transformation steps listed below on it, verifies it and dumps the result
// to the path given by argv[2]. The stream opened on PATH is passed to
// init_block.
// NOTE(review): this listing contains two argc asserts (9 and 3) and two call
// forms each of init_block and replace_built_in_function; it looks like the
// removed and the added lines of a diff are shown together - confirm against
// the actual file before relying on the statement order.
int main(int argc, char **argv) {
|
||||
assert(argc == 9 && "incorrect number of arguments\n");
|
||||
assert(argc == 3 && "incorrect number of arguments\n");
|
||||
llvm::Module *program = LoadModuleFromFilr(argv[1]);
|
||||
// get size of grid and dim from input arguments
|
||||
int *grid_dim = new int[3];
|
||||
int *block_dim = new int[3];
|
||||
grid_dim[0] = atoi(argv[3]);
|
||||
grid_dim[1] = atoi(argv[4]);
|
||||
grid_dim[2] = atoi(argv[5]);
|
||||
block_dim[0] = atoi(argv[6]);
|
||||
block_dim[1] = atoi(argv[7]);
|
||||
block_dim[2] = atoi(argv[8]);
|
||||
|
||||
std::ofstream fout;
|
||||
fout.open(PATH);
|
||||
|
||||
// inline, and create auxiliary global variables
|
||||
init_block(program);
|
||||
init_block(program, fout);
|
||||
// insert sync before each vote, and replace the
|
||||
// original vote function to warp vote
|
||||
handle_warp_vote(program);
|
||||
|
||||
// replace warp shuffle
|
||||
// VerifyModule(program);
|
||||
handle_warp_shfl(program);
|
||||
// insert sync
|
||||
// VerifyModule(program);
|
||||
insert_sync(program);
|
||||
// split block by sync
|
||||
// VerifyModule(program);
|
||||
std::cout << "split\n" << std::flush;
|
||||
split_block_by_sync(program);
|
||||
// add loop for intra&intera thread
|
||||
insert_warp_loop(program);
|
||||
// (TODO): replace this patch
|
||||
replace_built_in_function(program, grid_dim, block_dim);
|
||||
|
||||
// VerifyModule(program);
|
||||
std::cout << "insert\n" << std::flush;
|
||||
insert_warp_loop(program);
|
||||
|
||||
// VerifyModule(program);
|
||||
|
||||
// (TODO): replace this patch
|
||||
std::cout << "replace\n" << std::flush;
|
||||
replace_built_in_function(program);
|
||||
|
||||
// VerifyModule(program);
|
||||
std::cout << "generate\n" << std::flush;
|
||||
generate_x86_format(program);
|
||||
|
||||
// VerifyModule(program);
|
||||
|
||||
// performance optimization
|
||||
performance_optimization(program);
|
||||
|
||||
VerifyModule(program);
|
||||
|
||||
DumpModule(program, argv[2]);
|
||||
|
||||
fout.close();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -5,4 +5,6 @@
|
|||
|
||||
void generate_x86_format(llvm::Module *M);
|
||||
|
||||
void set_meta_data(llvm::Module *M);
|
||||
|
||||
#endif
|
||||
|
|
|
@ -2,6 +2,6 @@
|
|||
#define __NVVM2x86_INIT__
|
||||
|
||||
#include "llvm/IR/Module.h"
|
||||
|
||||
void init_block(llvm::Module *M);
|
||||
#include <fstream>
|
||||
void init_block(llvm::Module *M, std::ofstream &fout);
|
||||
#endif
|
||||
|
|
|
@ -1,9 +1,10 @@
|
|||
#ifndef __NVVM2x86_MEMORY_HIERARCHY__
|
||||
#define __NVVM2x86_MEMORY_HIERARCHY__
|
||||
#include "llvm/IR/Module.h"
|
||||
|
||||
#include <fstream>
|
||||
using namespace llvm;
|
||||
|
||||
void mem_share2global(llvm::Module *M);
|
||||
void mem_constant2global(llvm::Module *M, std::ofstream &fout);
|
||||
|
||||
#endif
|
||||
|
|
|
@ -12,7 +12,7 @@ llvm::CallInst *CreateIntraWarpBarrier(llvm::Instruction *InsertBefore);
|
|||
void VerifyModule(llvm::Module *);
|
||||
void phi2alloc(llvm::Module *M);
|
||||
void remove_cuda_built_in(llvm::Module *M);
|
||||
void replace_built_in_function(llvm::Module *M, int *grid_dim, int *block_dim);
|
||||
void replace_built_in_function(llvm::Module *M);
|
||||
void replace_asm_call(llvm::Module *M);
|
||||
bool find_block_barrier_in_region(llvm::BasicBlock *start,
|
||||
llvm::BasicBlock *end);
|
||||
|
@ -21,4 +21,5 @@ bool has_warp_barrier(llvm::BasicBlock *B);
|
|||
bool has_barrier(llvm::BasicBlock *B);
|
||||
bool has_block_barrier(llvm::BasicBlock *B);
|
||||
bool has_barrier(llvm::Function *F);
|
||||
void replace_dynamic_shared_memory(llvm::Module *M);
|
||||
#endif
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
#include "llvm/Target/TargetOptions.h"
|
||||
#include "llvm/Transforms/Utils/Cloning.h"
|
||||
#include "llvm/Transforms/Utils/ValueMapper.h"
|
||||
#include <iostream>
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
|
@ -40,6 +41,10 @@ void decode_input(llvm::Module *M) {
|
|||
llvm::FunctionType *LauncherFuncT = FunctionType::get(
|
||||
Type::getVoidTy(*C), {PointerType::get(Int8T, 0)}, false);
|
||||
|
||||
std::set<GlobalVariable *> dynmaic_memory;
|
||||
|
||||
std::map<GlobalVariable *, Value *> corres_dynamic_memory_load_address;
|
||||
|
||||
// generate Wrapper Function type
|
||||
// now we only support a single int32*
|
||||
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||
|
@ -64,6 +69,51 @@ void decode_input(llvm::Module *M) {
|
|||
// convert to int**
|
||||
input_arg = Builder.CreateBitOrPointerCast(
|
||||
input_arg, PointerType::get(PointerType::get(Int32T, 0), 0));
|
||||
|
||||
// dynamic memory load in the wrapper function
|
||||
GlobalVariable *share_memory = M->getGlobalVariable("wrapper_global_data");
|
||||
if (share_memory != NULL) {
|
||||
dynmaic_memory.insert(share_memory);
|
||||
llvm::GlobalVariable *global_mem = new llvm::GlobalVariable(
|
||||
*M, Int32T, false, llvm::GlobalValue::ExternalLinkage, NULL,
|
||||
"thread_memory_size", NULL, llvm::GlobalValue::GeneralDynamicTLSModel,
|
||||
0, false);
|
||||
Value *loadedValue = Builder.CreateLoad(global_mem);
|
||||
|
||||
llvm::FunctionType *LaunchFun2 = FunctionType::get(
|
||||
PointerType::get(PointerType::get(Int32T, 0), 0), NULL);
|
||||
|
||||
FunctionCallee fc2 =
|
||||
M->getOrInsertFunction("_wrapper_global_data", LaunchFun2);
|
||||
|
||||
Function *WorkGroup2 = dyn_cast<Function>(fc2.getCallee());
|
||||
|
||||
WorkGroup2->setLinkage(GlobalValue::WeakODRLinkage);
|
||||
WorkGroup2->setVisibility(GlobalValue::HiddenVisibility);
|
||||
Comdat *co = M->getOrInsertComdat("_wrapper_global_data");
|
||||
co->setSelectionKind(Comdat::SelectionKind::Any);
|
||||
WorkGroup2->setComdat(co);
|
||||
|
||||
BasicBlock *Block2 = BasicBlock::Create(M->getContext(), "", WorkGroup2);
|
||||
|
||||
llvm::IRBuilder<> Builder2(M->getContext());
|
||||
Builder2.SetInsertPoint(Block2);
|
||||
Builder2.CreateRet(share_memory);
|
||||
|
||||
auto PT = dyn_cast<PointerType>(share_memory->getType());
|
||||
auto element_type = PT->getElementType();
|
||||
// std::cout << element_type->getTypeID() << " Got global memor $$$$$$"
|
||||
// << share_memory->getName().str() << std::endl;
|
||||
|
||||
AllocaInst *new_arr = Builder.CreateAlloca(Int8T, loadedValue, "new_arr");
|
||||
// new_arr->setAlignment(llvm::MaybeAlign(16));
|
||||
Value *new_ar = new_arr;
|
||||
Value *gptr = Builder.CreateBitOrPointerCast(
|
||||
share_memory, PointerType::get(PointerType::get(Int8T, 0), 0));
|
||||
|
||||
Builder.CreateStore(new_ar, gptr);
|
||||
}
|
||||
|
||||
size_t idx = 0;
|
||||
for (Function::const_arg_iterator ii = F->arg_begin(), ee = F->arg_end();
|
||||
ii != ee; ++ii) {
|
||||
|
@ -95,6 +145,8 @@ void remove_barrier(llvm::Module *M) {
|
|||
for (auto BB = F->begin(); BB != F->end(); ++BB) {
|
||||
for (auto BI = BB->begin(); BI != BB->end(); BI++) {
|
||||
if (auto Call = dyn_cast<CallInst>(BI)) {
|
||||
if (Call->isInlineAsm())
|
||||
continue;
|
||||
auto func_name = Call->getCalledFunction()->getName().str();
|
||||
if (func_name == "llvm.nvvm.bar.warp.sync" ||
|
||||
func_name == "llvm.nvvm.barrier0" ||
|
||||
|
@ -109,6 +161,11 @@ void remove_barrier(llvm::Module *M) {
|
|||
}
|
||||
}
|
||||
|
||||
// Erase the module-level globals "intra_warp_index" and "inter_warp_index".
//
// M: module to clean up; modified in place.
//
// Module::getGlobalVariable() yields a null pointer when no global of that
// name is visible, so each lookup is checked before erasing. A missing
// global is simply skipped.
void remove_useless_var(llvm::Module *M) {
  static const char *const kGlobalsToDrop[] = {"intra_warp_index",
                                               "inter_warp_index"};
  for (const char *global_name : kGlobalsToDrop) {
    if (llvm::GlobalVariable *GV = M->getGlobalVariable(global_name))
      GV->eraseFromParent();
  }
}
|
||||
|
||||
void generate_x86_format(llvm::Module *M) {
|
||||
// change metadata
|
||||
set_meta_data(M);
|
||||
|
@ -116,4 +173,6 @@ void generate_x86_format(llvm::Module *M) {
|
|||
decode_input(M);
|
||||
// remove barrier
|
||||
remove_barrier(M);
|
||||
// remove useless func/variable
|
||||
remove_useless_var(M);
|
||||
}
|
||||
|
|
|
@ -27,6 +27,8 @@ void split_block_by_sync(llvm::Function *F) {
|
|||
}
|
||||
llvm::CallInst *Call = llvm::dyn_cast<llvm::CallInst>(inst);
|
||||
if (Call) {
|
||||
if (Call->isInlineAsm())
|
||||
continue;
|
||||
auto func_name = Call->getCalledFunction()->getName().str();
|
||||
if (func_name == "llvm.nvvm.barrier0" ||
|
||||
func_name == "llvm.nvvm.bar.warp.sync" ||
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
#include "init.h"
|
||||
#include "memory_hierarchy.h"
|
||||
#include "tool.h"
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <set>
|
||||
|
||||
|
@ -23,7 +24,8 @@
|
|||
|
||||
using namespace llvm;
|
||||
|
||||
void inline_func_vote(llvm::Module *M) {
|
||||
bool inline_warp_level_func(llvm::Module *M) {
|
||||
bool changed = false;
|
||||
std::set<llvm::Function *> need_remove;
|
||||
|
||||
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||
|
@ -36,10 +38,13 @@ void inline_func_vote(llvm::Module *M) {
|
|||
for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
|
||||
if (CallInst *c = dyn_cast<CallInst>(BI++)) {
|
||||
if (c->getCalledFunction()) {
|
||||
if (c->getCalledFunction()->getName().str() == "_Z10__any_syncji") {
|
||||
auto func_name = c->getCalledFunction()->getName().str();
|
||||
if (func_name == "_Z10__any_syncji" ||
|
||||
func_name.find("shfl_down_sync") != std::string::npos) {
|
||||
InlineFunctionInfo IFI;
|
||||
InlineFunction(c, IFI);
|
||||
need_remove.insert(c->getCalledFunction());
|
||||
changed = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -50,6 +55,56 @@ void inline_func_vote(llvm::Module *M) {
|
|||
f->dropAllReferences();
|
||||
f->eraseFromParent();
|
||||
}
|
||||
return changed;
|
||||
}
|
||||
|
||||
// Report whether the body of F contains a direct call to a function whose
// name includes the text "llvm.nvvm.read.ptx.sreg.".
//
// Call instructions for which getCalledFunction() is null are ignored.
// Returns false when no matching call is found (including when F has no
// basic blocks).
bool find_sreg_inst(llvm::Function *F) {
  const std::string sreg_marker = "llvm.nvvm.read.ptx.sreg.";
  for (BasicBlock &block : *F) {
    for (Instruction &inst : block) {
      CallInst *call = dyn_cast<CallInst>(&inst);
      if (!call)
        continue;
      Function *callee = call->getCalledFunction();
      if (!callee)
        continue;
      const std::string callee_name = callee->getName().str();
      if (callee_name.find(sreg_marker) != std::string::npos)
        return true;
    }
  }
  return false;
}
|
||||
// Inline every direct call in M whose callee satisfies find_sreg_inst().
//
// M: module to transform; modified in place.
// Returns true only if at least one call site was actually inlined.
//
// The result used to be true whenever a candidate call was found, even when
// InlineFunction() declined to inline it. A driver that repeats this step
// until it returns false would then never terminate on such a call site.
bool inline_func_with_tid(llvm::Module *M) {
  // Collect the call sites first and inline afterwards: inlining rewrites
  // the instruction lists that are being walked here.
  std::set<CallInst *> need_inline;
  for (Function &F : *M) {
    for (BasicBlock &BB : F) {
      for (Instruction &inst : BB) {
        CallInst *c = dyn_cast<CallInst>(&inst);
        if (!c)
          continue;
        Function *callee = c->getCalledFunction();
        // Skip calls without a known callee and callees that
        // find_sreg_inst() rejects.
        if (!callee || !find_sreg_inst(callee))
          continue;
        printf("inline: %s\n", callee->getName().str().c_str());
        need_inline.insert(c);
      }
    }
  }

  bool changed = false;
  for (CallInst *c : need_inline) {
    InlineFunctionInfo IFI;
    // Only a successful inline counts as a change to the module.
    if (InlineFunction(c, IFI))
      changed = true;
  }
  return changed;
}
|
||||
|
||||
void create_global_variable(llvm::Module *M) {
|
||||
|
@ -70,21 +125,33 @@ void create_global_variable(llvm::Module *M) {
|
|||
llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
|
||||
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
|
||||
NULL, "block_size", NULL,
|
||||
llvm::GlobalValue::NotThreadLocal, 0, false);
|
||||
llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
|
||||
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
|
||||
NULL, "block_size_x", NULL,
|
||||
llvm::GlobalValue::NotThreadLocal, 0, false);
|
||||
llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
|
||||
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
|
||||
NULL, "block_size_y", NULL,
|
||||
llvm::GlobalValue::NotThreadLocal, 0, false);
|
||||
llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
|
||||
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
|
||||
NULL, "block_size_z", NULL,
|
||||
llvm::GlobalValue::NotThreadLocal, 0, false);
|
||||
llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
|
||||
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
|
||||
NULL, "grid_size", NULL,
|
||||
llvm::GlobalValue::NotThreadLocal, 0, false);
|
||||
NULL, "grid_size_x", NULL,
|
||||
llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
|
||||
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
|
||||
NULL, "block_index", NULL,
|
||||
NULL, "grid_size_y", NULL,
|
||||
llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
|
||||
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
|
||||
NULL, "grid_size_z", NULL,
|
||||
llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
|
||||
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
|
||||
NULL, "block_index_x", NULL,
|
||||
llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
|
||||
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
|
||||
NULL, "block_index_y", NULL,
|
||||
llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
|
||||
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
|
||||
NULL, "block_index_z", NULL,
|
||||
llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
|
||||
// TLS variable used for warp-level collective operators
|
||||
new llvm::GlobalVariable(
|
||||
|
@ -224,24 +291,23 @@ bool lower_constant_expr(llvm::Module *M) {
|
|||
auto load_from = load_inst->getOperand(0);
|
||||
if (auto get_element_ptr = dyn_cast<llvm::ConstantExpr>(load_from)) {
|
||||
modified = true;
|
||||
auto ReplInst = get_element_ptr->getAsInstruction();
|
||||
ReplInst->insertBefore(load_inst);
|
||||
std::vector<Instruction *> Users;
|
||||
// Do not replace use during iteration of use. Do it in another loop
|
||||
for (auto U : get_element_ptr->users()) {
|
||||
if (auto InstUser = dyn_cast<Instruction>(U)) {
|
||||
Users.push_back(InstUser);
|
||||
}
|
||||
}
|
||||
for (auto &User : Users)
|
||||
for (auto &User : Users) {
|
||||
auto ReplInst = get_element_ptr->getAsInstruction();
|
||||
ReplInst->insertBefore(User);
|
||||
User->replaceUsesOfWith(get_element_ptr, ReplInst);
|
||||
}
|
||||
}
|
||||
} else if (auto store_inst = dyn_cast<llvm::StoreInst>(BI)) {
|
||||
auto store_to = store_inst->getOperand(1);
|
||||
if (auto addr_cast = dyn_cast<llvm::ConstantExpr>(store_to)) {
|
||||
modified = true;
|
||||
auto ReplInst = addr_cast->getAsInstruction();
|
||||
ReplInst->insertBefore(store_inst);
|
||||
|
||||
std::vector<Instruction *> Users;
|
||||
// Do not replace use during iteration of use. Do it in another loop
|
||||
for (auto U : addr_cast->users()) {
|
||||
|
@ -249,16 +315,19 @@ bool lower_constant_expr(llvm::Module *M) {
|
|||
Users.push_back(InstUser);
|
||||
}
|
||||
}
|
||||
for (auto &User : Users)
|
||||
for (auto &User : Users) {
|
||||
auto ReplInst = addr_cast->getAsInstruction();
|
||||
ReplInst->insertBefore(User);
|
||||
User->replaceUsesOfWith(addr_cast, ReplInst);
|
||||
}
|
||||
}
|
||||
} else if (auto get_element_ptr =
|
||||
dyn_cast<llvm::GetElementPtrInst>(BI)) {
|
||||
auto get_from = get_element_ptr->getOperand(0);
|
||||
if (auto addr_cast = dyn_cast<llvm::ConstantExpr>(get_from)) {
|
||||
modified = true;
|
||||
auto ReplInst = addr_cast->getAsInstruction();
|
||||
ReplInst->insertBefore(get_element_ptr);
|
||||
// auto ReplInst = addr_cast->getAsInstruction();
|
||||
// ReplInst->insertBefore(get_element_ptr);
|
||||
std::vector<Instruction *> Users;
|
||||
// Do not replace use during iteration of use. Do it in another loop
|
||||
for (auto U : addr_cast->users()) {
|
||||
|
@ -266,21 +335,37 @@ bool lower_constant_expr(llvm::Module *M) {
|
|||
Users.push_back(InstUser);
|
||||
}
|
||||
}
|
||||
for (auto &User : Users)
|
||||
for (auto &User : Users) {
|
||||
auto ReplInst = addr_cast->getAsInstruction();
|
||||
ReplInst->insertBefore(User);
|
||||
User->replaceUsesOfWith(addr_cast, ReplInst);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return modified;
|
||||
}
|
||||
|
||||
void init_block(llvm::Module *M) {
|
||||
// Delete the body of every function in M whose name contains "_ZL3expd".
// The function itself stays in the module; only deleteBody() is called on it.
void replace_cuda_math_built_in(llvm::Module *M) {
  const std::string target = "_ZL3expd";
  for (Function &func : *M) {
    const std::string name = func.getName().str();
    if (name.find(target) == std::string::npos)
      continue;
    func.deleteBody();
  }
}
|
||||
|
||||
void init_block(llvm::Module *M, std::ofstream &fout) {
|
||||
// using official llvm preprocess
|
||||
llvm_preprocess(M);
|
||||
// remove useless CUDA functions
|
||||
remove_cuda_built_in(M);
|
||||
// replace CUDA math function, like expf
|
||||
replace_cuda_math_built_in(M);
|
||||
|
||||
// lower ConstantExpression
|
||||
bool modified;
|
||||
|
@ -289,14 +374,26 @@ void init_block(llvm::Module *M) {
|
|||
} while (modified);
|
||||
// remove useless metadata
|
||||
remove_metadata(M);
|
||||
// inline vote function
|
||||
inline_func_vote(M);
|
||||
// inline warp-level function
|
||||
while (1) {
|
||||
if (!inline_warp_level_func(M))
|
||||
break;
|
||||
}
|
||||
// TODO: remove the hardcode
|
||||
while (1) {
|
||||
if (!inline_func_with_tid(M))
|
||||
break;
|
||||
}
|
||||
// create global variable for warp and vote
|
||||
create_global_variable(M);
|
||||
// replace phi with data load
|
||||
phi2alloc(M);
|
||||
// replace share memory
|
||||
mem_share2global(M);
|
||||
// replace constant memory
|
||||
mem_constant2global(M, fout);
|
||||
// replace asm Inline
|
||||
replace_asm_call(M);
|
||||
// replace dynamic shared memory
|
||||
replace_dynamic_shared_memory(M);
|
||||
}
|
||||
|
|
|
@ -212,11 +212,22 @@ public:
|
|||
changed = true;
|
||||
|
||||
// we may create a new conditional barrier after insert
|
||||
if (!PDT->getPostDomTree().dominates(pred, &F.getEntryBlock()))
|
||||
if (!PDT->getPostDomTree().dominates(pred, &F.getEntryBlock())) {
|
||||
// if the block postdominates all its predecessor
|
||||
// then it is not a conditional barrier
|
||||
bool post_dominate_all = true;
|
||||
for (auto I = pred_begin(pred); I != pred_end(pred); I++) {
|
||||
if (!PDT->getPostDomTree().dominates(pred, *I)) {
|
||||
post_dominate_all = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!post_dominate_all)
|
||||
conditionalBarriers.push_back(pred);
|
||||
}
|
||||
|
||||
// find any block which are not dominated by header
|
||||
// but be posdiminated by merge point
|
||||
// but be postdominated by merge point
|
||||
std::queue<llvm::BasicBlock *> if_body;
|
||||
std::set<llvm::BasicBlock *> visited_block;
|
||||
for (int i = 0; i < pred->getTerminator()->getNumSuccessors(); i++) {
|
||||
|
@ -234,19 +245,26 @@ public:
|
|||
PDT->getPostDomTree().dominates(merge_point, curr)) {
|
||||
// we should insert barrier at the beginning and
|
||||
// end of its predecessor
|
||||
printf("insert [255]: %s\n", curr->getName().str().c_str());
|
||||
if (has_warp_barrier(b)) {
|
||||
CreateIntraWarpBarrier(&(*curr->begin()));
|
||||
for (BasicBlock *Pred : predecessors(curr)) {
|
||||
printf("insert [262]: %s\n", Pred->getName().str().c_str());
|
||||
CreateIntraWarpBarrier(&(*Pred->getTerminator()));
|
||||
}
|
||||
} else {
|
||||
CreateInterWarpBarrier(&(*curr->begin()));
|
||||
for (BasicBlock *Pred : predecessors(curr)) {
|
||||
printf("insert [268]: %s\n", Pred->getName().str().c_str());
|
||||
CreateInterWarpBarrier(&(*Pred->getTerminator()));
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < curr->getTerminator()->getNumSuccessors(); i++) {
|
||||
// avoid backedge
|
||||
if (DT->dominates(curr->getTerminator()->getSuccessor(i), pred)) {
|
||||
continue;
|
||||
}
|
||||
if_body.push(curr->getTerminator()->getSuccessor(i));
|
||||
}
|
||||
}
|
||||
|
@ -266,6 +284,32 @@ public:
|
|||
AU.addRequired<DominatorTreeWrapperPass>();
|
||||
}
|
||||
|
||||
// Breadth-first walk of the CFG, seeded with the successors of `start`, that
// returns the first dequeued block for which PDT.dominates(block, start)
// holds.
//
// start: block whose terminator must have exactly two successors (asserted).
// PDT:   post-dominator tree queried for each visited block.
// If the worklist drains without a match, the function asserts and returns
// NULL.
BasicBlock *find_merge_point(BasicBlock *start, PostDominatorTree &PDT) {
  assert(start->getTerminator()->getNumSuccessors() == 2);

  std::set<llvm::BasicBlock *> seen;
  std::queue<llvm::BasicBlock *> worklist;

  auto *start_term = start->getTerminator();
  for (unsigned idx = 0; idx < start_term->getNumSuccessors(); ++idx)
    worklist.push(start_term->getSuccessor(idx));

  while (!worklist.empty()) {
    BasicBlock *candidate = worklist.front();
    worklist.pop();

    // insert() reports whether the block is new; repeats are skipped.
    if (!seen.insert(candidate).second)
      continue;

    if (PDT.dominates(candidate, start))
      return candidate;

    auto *term = candidate->getTerminator();
    for (unsigned idx = 0; idx < term->getNumSuccessors(); ++idx) {
      BasicBlock *next = term->getSuccessor(idx);
      if (seen.count(next) == 0)
        worklist.push(next);
    }
  }
  assert(0 && "Do not find merge point\n");
  return NULL;
}
|
||||
virtual bool runOnFunction(Function &F) {
|
||||
if (!isKernelFunction(F.getParent(), &F))
|
||||
return 0;
|
||||
|
@ -280,18 +324,8 @@ public:
|
|||
|
||||
for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i) {
|
||||
BasicBlock *b = &*i;
|
||||
BasicBlock *merge_point = NULL;
|
||||
if (b->getTerminator()->getNumSuccessors() == 2) {
|
||||
auto b1 = b->getTerminator()->getSuccessor(0);
|
||||
auto b2 = b->getTerminator()->getSuccessor(1);
|
||||
if (PDT->getPostDomTree().dominates(b1, b2)) {
|
||||
merge_point = b1;
|
||||
} else if (PDT->getPostDomTree().dominates(b2, b2)) {
|
||||
merge_point = b2;
|
||||
} else {
|
||||
assert(0 && "find complex if-else branch\n");
|
||||
}
|
||||
std::cout << std::flush;
|
||||
auto merge_point = find_merge_point(b, PDT->getPostDomTree());
|
||||
for (BasicBlock *Pred : predecessors(merge_point)) {
|
||||
if (!DT->dominates(b, Pred)) {
|
||||
// we need to insert an extra block to be the merge point
|
||||
|
@ -305,14 +339,8 @@ public:
|
|||
auto M = F.getParent();
|
||||
for (auto head : if_head) {
|
||||
assert(head->getTerminator()->getNumSuccessors() == 2);
|
||||
BasicBlock *merge_point = NULL;
|
||||
auto s1 = head->getTerminator()->getSuccessor(0);
|
||||
auto s2 = head->getTerminator()->getSuccessor(1);
|
||||
if (PDT->getPostDomTree().dominates(s1, s2)) {
|
||||
merge_point = s1;
|
||||
} else {
|
||||
merge_point = s2;
|
||||
}
|
||||
BasicBlock *merge_point = find_merge_point(head, PDT->getPostDomTree());
|
||||
assert(PDT->getPostDomTree().dominates(merge_point, head));
|
||||
if (!find_barrier_in_region(head, merge_point)) {
|
||||
printf("do not need to handle tri-income if: %s\n",
|
||||
merge_point->getName().str().c_str());
|
||||
|
@ -368,6 +396,8 @@ public:
|
|||
for (BasicBlock::iterator j = (*i)->begin(), e = (*i)->end(); j != e;
|
||||
++j) {
|
||||
if (auto Call = dyn_cast<CallInst>(j)) {
|
||||
if (Call->isInlineAsm())
|
||||
continue;
|
||||
auto func_name = Call->getCalledFunction()->getName().str();
|
||||
if (func_name == "llvm.nvvm.barrier0" ||
|
||||
func_name == "llvm.nvvm.bar.warp.sync" ||
|
||||
|
@ -383,7 +413,7 @@ public:
|
|||
}
|
||||
if (!is_conditional_loop)
|
||||
return 0;
|
||||
// insert barrier at the beginning of header
|
||||
// insert barrier at the beginning of header (for_cond)
|
||||
// and the end of pre header, so that we can get a
|
||||
// single block connected with latch
|
||||
if (!is_warp) {
|
||||
|
@ -399,17 +429,40 @@ public:
|
|||
}
|
||||
|
||||
// as we assume all loops are rotated, we have to insert
|
||||
// barrier before the condition jump of the loop exit
|
||||
|
||||
if (auto exit_block = L->getExitingBlock()) {
|
||||
// barrier before the condition jump of the for_cond
|
||||
if (auto for_cond = L->getExitingBlock()) {
|
||||
assert(for_cond->getTerminator()->getNumSuccessors() == 2 &&
|
||||
"has more than 2 successors of the for-cond\n");
|
||||
auto conditional_br =
|
||||
dyn_cast<llvm::BranchInst>(exit_block->getTerminator());
|
||||
dyn_cast<llvm::BranchInst>(for_cond->getTerminator());
|
||||
assert(conditional_br && conditional_br->isConditional());
|
||||
// insert barrier at the beginning of successor of exit
|
||||
// insert barrier before the condition jump of the loop cond
|
||||
if (!is_warp)
|
||||
CreateInterWarpBarrier(conditional_br);
|
||||
else
|
||||
CreateIntraWarpBarrier(conditional_br);
|
||||
// insert barrier before the for_body
|
||||
auto for_body = for_cond->getTerminator()->getSuccessor(0);
|
||||
if (for_body == L->getExitBlock()) {
|
||||
for_body = for_cond->getTerminator()->getSuccessor(1);
|
||||
}
|
||||
// insert at the beginning of for_body
|
||||
if (!is_warp)
|
||||
CreateInterWarpBarrier(&(*for_body->begin()));
|
||||
else
|
||||
CreateIntraWarpBarrier(&(*for_body->begin()));
|
||||
// insert at the beginning and end in for_inc block
|
||||
if (auto for_inc = L->getLoopLatch()) {
|
||||
if (!is_warp) {
|
||||
CreateInterWarpBarrier(&(*for_inc->begin()));
|
||||
CreateInterWarpBarrier(for_inc->getTerminator());
|
||||
} else {
|
||||
CreateIntraWarpBarrier(&(*for_inc->begin()));
|
||||
CreateIntraWarpBarrier(for_inc->getTerminator());
|
||||
}
|
||||
} else {
|
||||
assert(0 && "has continue in a barrier loop\n");
|
||||
}
|
||||
} else {
|
||||
// handle break in for-loop
|
||||
printf("loop has multiply exists\n");
|
||||
|
|
|
@ -67,9 +67,15 @@ std::map<std::string, llvm::Instruction *> contextArrays;
|
|||
int tempInstructionIndex = 0;
|
||||
int need_nested_loop;
|
||||
|
||||
// support for multiple kernels in one file
|
||||
|
||||
bool ShouldNotBeContextSaved(llvm::Instruction *instr) {
|
||||
if (isa<BranchInst>(instr))
|
||||
return true;
|
||||
// if (isa<AddrSpaceCastInst>(instr))
|
||||
// return true;
|
||||
// if (isa<CastInst>(instr))
|
||||
// return true;
|
||||
|
||||
llvm::Module *M = instr->getParent()->getParent()->getParent();
|
||||
llvm::LoadInst *load = dyn_cast<llvm::LoadInst>(instr);
|
||||
|
@ -111,6 +117,7 @@ llvm::Instruction *GetContextArray(llvm::Instruction *instruction,
|
|||
return contextArrays[varName];
|
||||
|
||||
BasicBlock &bb = instruction->getParent()->getParent()->getEntryBlock();
|
||||
|
||||
IRBuilder<> builder(&*(bb.getFirstInsertionPt()));
|
||||
Function *FF = instruction->getParent()->getParent();
|
||||
Module *M = instruction->getParent()->getParent()->getParent();
|
||||
|
@ -127,6 +134,7 @@ llvm::Instruction *GetContextArray(llvm::Instruction *instruction,
|
|||
|
||||
Type *AllocType = elementType;
|
||||
AllocaInst *InstCast = dyn_cast<AllocaInst>(instruction);
|
||||
/*
|
||||
if (InstCast) {
|
||||
unsigned Alignment = InstCast->getAlignment();
|
||||
|
||||
|
@ -166,7 +174,7 @@ llvm::Instruction *GetContextArray(llvm::Instruction *instruction,
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
*/
|
||||
llvm::Value *ItemSize = nullptr;
|
||||
llvm::AllocaInst *Alloca = nullptr;
|
||||
|
||||
|
@ -354,14 +362,37 @@ void handle_local_variable_intra_warp(std::vector<ParallelRegion> PRs) {
|
|||
auto F = PRs[0].start_block->getParent();
|
||||
for (auto bb = F->begin(); bb != F->end(); bb++) {
|
||||
for (auto ii = bb->begin(); ii != bb->end(); ii++) {
|
||||
if (isa<AllocaInst>(&(*ii)))
|
||||
instruction_to_fix.push_back(&(*ii));
|
||||
if (isa<AllocaInst>(&(*ii))) {
|
||||
auto alloc = dyn_cast<AllocaInst>(&(*ii));
|
||||
// Do not duplicate var used outside PRs
|
||||
bool used_in_non_PR = false;
|
||||
for (Instruction::use_iterator ui = alloc->use_begin(),
|
||||
ue = alloc->use_end();
|
||||
ui != ue; ++ui) {
|
||||
llvm::Instruction *user = dyn_cast<Instruction>(ui->getUser());
|
||||
auto user_block = user->getParent();
|
||||
bool find_in_PR = false;
|
||||
for (auto PR : PRs) {
|
||||
if (PR.wrapped_block.find(user_block) != PR.wrapped_block.end()) {
|
||||
find_in_PR = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (find_in_PR == false) {
|
||||
used_in_non_PR = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!used_in_non_PR) {
|
||||
instruction_to_fix.push_back(alloc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (auto inst : instruction_to_fix) {
|
||||
AddContextSaveRestore(inst, intra_warp_loop);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (auto parallel_regions : PRs) {
|
||||
std::set<llvm::Instruction *> instruction_in_region;
|
||||
|
@ -380,10 +411,8 @@ void handle_local_variable_intra_warp(std::vector<ParallelRegion> PRs) {
|
|||
for (llvm::BasicBlock::iterator instr = bb->begin(); instr != bb->end();
|
||||
++instr) {
|
||||
llvm::Instruction *instruction = &*instr;
|
||||
|
||||
if (ShouldNotBeContextSaved(instruction))
|
||||
continue;
|
||||
|
||||
for (Instruction::use_iterator ui = instruction->use_begin(),
|
||||
ue = instruction->use_end();
|
||||
ui != ue; ++ui) {
|
||||
|
@ -582,6 +611,8 @@ void remove_barrier(llvm::Function *F, bool intra_warp_loop) {
|
|||
for (auto BB = F->begin(); BB != F->end(); ++BB) {
|
||||
for (auto BI = BB->begin(); BI != BB->end(); BI++) {
|
||||
if (auto Call = dyn_cast<CallInst>(BI)) {
|
||||
if (Call->isInlineAsm())
|
||||
continue;
|
||||
auto func_name = Call->getCalledFunction()->getName().str();
|
||||
if (func_name == "llvm.nvvm.bar.warp.sync") {
|
||||
need_remove.push_back(Call);
|
||||
|
@ -648,6 +679,8 @@ public:
|
|||
bool has_barrier = 0;
|
||||
for (auto i = current->begin(), e = current->end(); i != e; ++i) {
|
||||
if (llvm::CallInst *call_inst = llvm::dyn_cast<llvm::CallInst>(&(*i))) {
|
||||
if (call_inst->isInlineAsm())
|
||||
continue;
|
||||
auto func_name = call_inst->getCalledFunction()->getName().str();
|
||||
if (func_name == "llvm.nvvm.barrier0" ||
|
||||
func_name == "llvm.nvvm.barrier.sync")
|
||||
|
@ -761,6 +794,8 @@ public:
|
|||
for (Function::iterator s = F->begin(); s != F->end(); s++) {
|
||||
if (llvm::CallInst *call_inst =
|
||||
llvm::dyn_cast<llvm::CallInst>(s->begin())) {
|
||||
if (call_inst->isInlineAsm())
|
||||
continue;
|
||||
auto func_name = call_inst->getCalledFunction()->getName().str();
|
||||
if (func_name == "llvm.nvvm.barrier0" ||
|
||||
func_name == "llvm.nvvm.barrier.sync") {
|
||||
|
@ -787,6 +822,12 @@ public:
|
|||
if (!isKernelFunction(F.getParent(), &F))
|
||||
return 0;
|
||||
|
||||
auto func_name = (&F)->getName().str();
|
||||
// clear context array, temp variables for new kernel function
|
||||
contextArrays.clear();
|
||||
tempInstructionIds.clear();
|
||||
tempInstructionIndex = 0;
|
||||
|
||||
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
|
||||
PDT = &getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
|
||||
|
||||
|
@ -794,11 +835,11 @@ public:
|
|||
auto parallel_regions = getParallelRegions(&F, intra_warp_loop);
|
||||
assert(!parallel_regions.empty() && "can not find any parallel regions\n");
|
||||
// print_parallel_region(parallel_regions);
|
||||
add_warp_loop(parallel_regions, intra_warp_loop);
|
||||
|
||||
if (intra_warp_loop) {
|
||||
handle_local_variable_intra_warp(parallel_regions);
|
||||
}
|
||||
add_warp_loop(parallel_regions, intra_warp_loop);
|
||||
remove_barrier(&F, intra_warp_loop);
|
||||
return 1;
|
||||
}
|
||||
|
@ -816,6 +857,8 @@ bool has_warp_barrier(llvm::Module *M) {
|
|||
for (auto BB = F->begin(); BB != F->end(); ++BB) {
|
||||
for (auto BI = BB->begin(); BI != BB->end(); BI++) {
|
||||
if (auto Call = dyn_cast<CallInst>(BI)) {
|
||||
if (Call->isInlineAsm())
|
||||
continue;
|
||||
auto func_name = Call->getCalledFunction()->getName().str();
|
||||
if (func_name == "llvm.nvvm.bar.warp.sync") {
|
||||
return true;
|
||||
|
@ -841,8 +884,8 @@ void insert_warp_loop(llvm::Module *M) {
|
|||
// only need a single loop, with size=block_size
|
||||
Passes.add(new InsertWarpLoopPass(intra_warp));
|
||||
Passes.run(*M);
|
||||
}
|
||||
// remove all barriers
|
||||
for (auto F = M->begin(); F != M->end(); ++F)
|
||||
remove_barrier(dyn_cast<llvm::Function>(F), false);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -9,6 +9,8 @@
|
|||
#include "llvm/Transforms/Utils/Cloning.h"
|
||||
#include "llvm/Transforms/Utils/ValueMapper.h"
|
||||
#include <assert.h>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <sstream>
|
||||
|
@ -36,15 +38,35 @@ void mem_share2global(llvm::Module *M) {
|
|||
auto new_name = "wrapper_global_" + share_memory->getName().str();
|
||||
auto element_type = PT->getElementType();
|
||||
if (auto array_type = dyn_cast<ArrayType>(element_type)) {
|
||||
if (share_memory->hasExternalLinkage() &&
|
||||
array_type->getArrayNumElements() == 0) {
|
||||
// external shared memory of []
|
||||
// generate global type pointer
|
||||
PointerType *PointerTy =
|
||||
PointerType::get(array_type->getElementType(), 0);
|
||||
llvm::Constant *x1 = ConstantPointerNull::get(PointerTy);
|
||||
llvm::GlobalVariable *global_ptr = new llvm::GlobalVariable(
|
||||
*M, PointerTy, false, llvm::GlobalValue::CommonLinkage, x1,
|
||||
"wrapper_global_data", NULL,
|
||||
llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
|
||||
|
||||
global_ptr->setDSOLocal(true);
|
||||
|
||||
corresponding_global_memory.insert(
|
||||
std::pair<GlobalVariable *, GlobalVariable *>(share_memory,
|
||||
global_ptr));
|
||||
} else {
|
||||
llvm::GlobalVariable *global_memory = new llvm::GlobalVariable(
|
||||
*M, array_type, false, llvm::GlobalValue::ExternalLinkage, NULL,
|
||||
new_name, NULL, llvm::GlobalValue::GeneralDynamicTLSModel, 1);
|
||||
*M, array_type, false, llvm::GlobalValue::ExternalLinkage,
|
||||
NULL, new_name, NULL,
|
||||
llvm::GlobalValue::GeneralDynamicTLSModel, 1);
|
||||
ConstantAggregateZero *const_array =
|
||||
ConstantAggregateZero::get(array_type);
|
||||
global_memory->setInitializer(const_array);
|
||||
corresponding_global_memory.insert(
|
||||
std::pair<GlobalVariable *, GlobalVariable *>(share_memory,
|
||||
global_memory));
|
||||
}
|
||||
} else if (auto int_type = dyn_cast<IntegerType>(element_type)) {
|
||||
auto zero = llvm::ConstantInt::get(int_type, 0, true);
|
||||
llvm::GlobalVariable *global_memory = new llvm::GlobalVariable(
|
||||
|
@ -54,6 +76,16 @@ void mem_share2global(llvm::Module *M) {
|
|||
corresponding_global_memory.insert(
|
||||
std::pair<GlobalVariable *, GlobalVariable *>(share_memory,
|
||||
global_memory));
|
||||
} else if (element_type->isFloatTy()) {
|
||||
auto FP_type = llvm::Type::getFloatTy(*C);
|
||||
auto zero = llvm::ConstantFP::get(FP_type, 0);
|
||||
llvm::GlobalVariable *global_memory = new llvm::GlobalVariable(
|
||||
*M, FP_type, false, llvm::GlobalValue::ExternalLinkage, zero,
|
||||
new_name, NULL, llvm::GlobalValue::GeneralDynamicTLSModel, 0,
|
||||
false);
|
||||
corresponding_global_memory.insert(
|
||||
std::pair<GlobalVariable *, GlobalVariable *>(share_memory,
|
||||
global_memory));
|
||||
} else {
|
||||
assert(0 && "The required Share Memory Type is not supported\n");
|
||||
}
|
||||
|
@ -62,57 +94,11 @@ void mem_share2global(llvm::Module *M) {
|
|||
}
|
||||
}
|
||||
|
||||
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||
Function *F = &(*i);
|
||||
for (Function::iterator i = F->begin(), e = F->end(); i != e; ++i) {
|
||||
BasicBlock *b = &*i;
|
||||
for (BasicBlock::iterator i = b->begin(), e = b->end(); i != e; ++i) {
|
||||
if (auto get_element_ptr = dyn_cast<llvm::GetElementPtrInst>(i)) {
|
||||
auto read_array = get_element_ptr->getPointerOperand();
|
||||
if (GlobalVariable *read_share_memory =
|
||||
dyn_cast<llvm::GlobalVariable>(read_array)) {
|
||||
// find a GetElementPtr which read share memory
|
||||
if (corresponding_global_memory.find(read_share_memory) !=
|
||||
corresponding_global_memory.end()) {
|
||||
std::vector<Value *> Indices;
|
||||
for (int i = 0; i < get_element_ptr->getNumIndices(); i++)
|
||||
Indices.push_back(get_element_ptr->getOperand(i + 1));
|
||||
|
||||
auto new_GEP = GetElementPtrInst::Create(
|
||||
NULL, // Pointee type
|
||||
corresponding_global_memory.find(read_share_memory)
|
||||
->second, // Alloca
|
||||
Indices, // Indices
|
||||
"", get_element_ptr);
|
||||
// replace all get_element_ptr with new_GEP:
|
||||
// we can not directly use:
|
||||
// get_element_ptr->replaceAllUsesWith(new_GEP);
|
||||
// as get_element_ptr and new_GEP have different return type
|
||||
llvm::Type *original_type = get_element_ptr->getType();
|
||||
auto FormatASC = CastInst::CreatePointerBitCastOrAddrSpaceCast(
|
||||
new_GEP, original_type, "", get_element_ptr);
|
||||
get_element_ptr->replaceAllUsesWith(FormatASC);
|
||||
need_remove.insert(get_element_ptr);
|
||||
}
|
||||
}
|
||||
} else if (auto addr_cast = dyn_cast<llvm::CastInst>(i)) {
|
||||
auto read_array = addr_cast->getOperand(0);
|
||||
if (GlobalVariable *read_share_memory =
|
||||
dyn_cast<llvm::GlobalVariable>(read_array)) {
|
||||
// find a GetElementPtr which read share memory
|
||||
if (corresponding_global_memory.find(read_share_memory) !=
|
||||
corresponding_global_memory.end()) {
|
||||
llvm::Type *original_type = addr_cast->getType();
|
||||
auto FormatASC = CastInst::CreatePointerBitCastOrAddrSpaceCast(
|
||||
corresponding_global_memory.find(read_share_memory)->second,
|
||||
original_type, "", addr_cast);
|
||||
addr_cast->replaceAllUsesWith(FormatASC);
|
||||
need_remove.insert(addr_cast);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (auto k : corresponding_global_memory) {
|
||||
auto share_addr = k.first;
|
||||
auto global_addr = k.second;
|
||||
share_addr->replaceAllUsesWith(ConstantExpr::getPointerCast(
|
||||
global_addr, cast<PointerType>(share_addr->getType())));
|
||||
}
|
||||
|
||||
for (auto i : need_remove) {
|
||||
|
@ -124,3 +110,83 @@ void mem_share2global(llvm::Module *M) {
|
|||
i->eraseFromParent();
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Replace every CUDA constant-memory global (a global whose pointer type is in
 * address space 4) with a freshly created address-space-0 global, redirect all
 * uses to the replacement, and erase the original.
 *
 * M    - module to rewrite in place.
 * fout - receives the renaming table consumed by the host translator:
 *          ConstMemory2GlobalMemory
 *          <constant name> to <replacement name>     (one line per global)
 *          END
 *
 * Supported element types:
 *   - zero-length external array: replaced by a null-initialised, externally
 *     initialised element pointer named "wrapper_global_data";
 *   - any other array, or a struct: replaced by an external global of the same
 *     type named "wrapper_global_<original name>".
 * Any other element type trips the assert below. When asserts are compiled
 * out, such a global is left untouched instead of being erased while it still
 * has users.
 */
void mem_constant2global(llvm::Module *M, std::ofstream &fout) {
  // constant-memory global -> the address-space-0 global that replaces it
  std::map<GlobalVariable *, GlobalVariable *> corresponding_global_memory;

  // Pass 1: create a replacement for every constant-memory global.
  // Replacements live in address space 0, so they are never picked up here.
  for (auto I = M->global_begin(), E = M->global_end(); I != E; ++I) {
    GlobalVariable *constant_memory = &*I;
    auto PT = dyn_cast<PointerType>(constant_memory->getType());
    if (!PT || PT->getAddressSpace() != 4)
      continue; // not constant memory

    auto new_name = "wrapper_global_" + constant_memory->getName().str();
    auto element_type = PT->getElementType();
    GlobalVariable *replacement = nullptr;

    if (auto array_type = dyn_cast<ArrayType>(element_type)) {
      if (constant_memory->hasExternalLinkage() &&
          array_type->getArrayNumElements() == 0) {
        // external constant memory declared as T[]:
        // generate a global pointer to the element type instead
        PointerType *PointerTy =
            PointerType::get(array_type->getElementType(), 0);
        llvm::Constant *null_ptr = ConstantPointerNull::get(PointerTy);
        replacement = new llvm::GlobalVariable(
            *M, PointerTy, false, llvm::GlobalValue::ExternalLinkage, null_ptr,
            "wrapper_global_data", NULL, llvm::GlobalValue::NotThreadLocal, 0,
            true);
      } else {
        replacement = new llvm::GlobalVariable(
            *M, array_type, false, llvm::GlobalValue::ExternalLinkage, NULL,
            new_name, NULL, llvm::GlobalValue::NotThreadLocal, 0);
      }
    } else if (element_type->isStructTy()) {
      replacement = new llvm::GlobalVariable(
          *M, element_type, false, llvm::GlobalValue::ExternalLinkage, NULL,
          new_name, NULL, llvm::GlobalValue::NotThreadLocal, 0);
    } else {
      assert(0 && "The required Constant Memory Type is not supported\n");
    }

    // Only globals that actually received a replacement are rewritten and
    // erased later on.
    if (replacement)
      corresponding_global_memory[constant_memory] = replacement;
  }

  // Pass 2: redirect uses and emit the renaming table. Walking the module's
  // global list (rather than the pointer-keyed map) keeps the order of the
  // emitted lines independent of heap addresses.
  fout << "ConstMemory2GlobalMemory\n";
  for (auto I = M->global_begin(), E = M->global_end(); I != E; ++I) {
    GlobalVariable *const_addr = &*I;
    auto found = corresponding_global_memory.find(const_addr);
    if (found == corresponding_global_memory.end())
      continue;
    GlobalVariable *global_addr = found->second;
    const_addr->replaceAllUsesWith(ConstantExpr::getPointerCast(
        global_addr, cast<PointerType>(const_addr->getType())));
    // this file will be used by host translator
    fout << const_addr->getName().str().c_str() << " to "
         << global_addr->getName().str().c_str() << std::endl;
  }
  fout << "END\n";

  // Pass 3: the originals no longer have users and can be removed.
  for (auto &entry : corresponding_global_memory) {
    entry.first->dropAllReferences();
    entry.first->eraseFromParent();
  }
}
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
#include "tool.h"
|
||||
#include "llvm/Bitcode/BitcodeWriter.h"
|
||||
#include "llvm/IR/Constants.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/GlobalValue.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
|
@ -187,7 +188,52 @@ void remove_cuda_built_in(llvm::Module *M) {
|
|||
}
|
||||
}
|
||||
|
||||
void replace_built_in_function(llvm::Module *M, int *grid_dim, int *block_dim) {
|
||||
// copied from POCL
|
||||
static void breakConstantExpressions(llvm::Value *Val, llvm::Function *Func) {
|
||||
std::vector<llvm::Value *> Users(Val->user_begin(), Val->user_end());
|
||||
for (auto *U : Users) {
|
||||
if (auto *CE = llvm::dyn_cast<llvm::ConstantExpr>(U)) {
|
||||
// First, make sure no users of this constant expression are themselves
|
||||
// constant expressions.
|
||||
breakConstantExpressions(U, Func);
|
||||
// Convert this constant expression to an instruction.
|
||||
llvm::Instruction *I = CE->getAsInstruction();
|
||||
I->insertBefore(&*Func->begin()->begin());
|
||||
CE->replaceAllUsesWith(I);
|
||||
CE->destroyConstant();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// For each kernel function in M:
//   1. turn every constant expression that uses a module global into an
//      instruction at the kernel's entry (see breakConstantExpressions);
//   2. look up the global named "dynamic_shared_memory"; if the module has no
//      such global, stop processing altogether;
//   3. insert, at the kernel's entry, a load of that global followed by a
//      bitcast of the loaded value back to the global's own pointer type;
//   4. redirect every use of the global to the bitcast, except the uses made
//      by the freshly inserted load/bitcast themselves.
// NOTE(review): step 4 does not filter by function, so uses outside the
// current kernel are redirected too — confirm this is intended when a module
// holds more than one function touching the global.
void replace_dynamic_shared_memory(llvm::Module *M) {
  for (Module::iterator FnIt = M->begin(), FnEnd = M->end(); FnIt != FnEnd;
       ++FnIt) {
    Function *Kernel = &(*FnIt);
    if (!isKernelFunction(M, Kernel))
      continue;

    for (Module::global_iterator GlobIt = M->global_begin(),
                                 GlobEnd = M->global_end();
         GlobIt != GlobEnd; ++GlobIt) {
      breakConstantExpressions(&*GlobIt, Kernel);
    }

    auto SharedMemGV = M->getGlobalVariable("dynamic_shared_memory");
    if (!SharedMemGV) {
      return;
    }

    // Build load + bitcast detached, then splice them in at the kernel entry
    // so that the load ends up directly in front of the bitcast.
    auto LoadedPtr = new LoadInst(SharedMemGV, "new_load");
    auto CastedPtr =
        new BitCastInst(LoadedPtr, SharedMemGV->getType(), "new_bit_cast");
    CastedPtr->insertBefore(&*Kernel->begin()->begin());
    LoadedPtr->insertBefore(CastedPtr);

    SharedMemGV->replaceUsesWithIf(CastedPtr, [&](Use &U) {
      auto *UserInst = dyn_cast<Instruction>(U.getUser());
      // keep the two helper instructions pointing at the real global
      return !(UserInst == CastedPtr || UserInst == LoadedPtr);
    });
  }
}
|
||||
|
||||
void replace_built_in_function(llvm::Module *M) {
|
||||
LLVMContext &context = M->getContext();
|
||||
auto I32 = llvm::Type::getInt32Ty(context);
|
||||
std::vector<llvm::Instruction *> need_remove;
|
||||
|
@ -203,28 +249,60 @@ void replace_built_in_function(llvm::Module *M, int *grid_dim, int *block_dim) {
|
|||
auto local_intra_warp_idx =
|
||||
builder.CreateAlloca(global_intra_warp_idx->getType()->getElementType(),
|
||||
0, "local_intra_warp_idx");
|
||||
global_intra_warp_idx->replaceAllUsesWith(local_intra_warp_idx);
|
||||
global_intra_warp_idx->replaceUsesWithIf(local_intra_warp_idx, [&](Use &U) {
|
||||
auto *Instr = dyn_cast<Instruction>(U.getUser());
|
||||
return Instr->getParent()->getParent()->getName().str() == func_name;
|
||||
});
|
||||
|
||||
auto global_inter_warp_idx =
|
||||
F->getParent()->getGlobalVariable("inter_warp_index");
|
||||
|
||||
auto local_inter_warp_idx =
|
||||
builder.CreateAlloca(global_inter_warp_idx->getType()->getElementType(),
|
||||
0, "local_inter_warp_idx");
|
||||
global_inter_warp_idx->replaceAllUsesWith(local_inter_warp_idx);
|
||||
|
||||
builder.CreateStore(ConstantInt::get(I32, 0), local_inter_warp_idx);
|
||||
|
||||
global_inter_warp_idx->replaceUsesWithIf(local_inter_warp_idx, [&](Use &U) {
|
||||
auto *Instr = dyn_cast<Instruction>(U.getUser());
|
||||
return Instr->getParent()->getParent()->getName().str() == func_name;
|
||||
});
|
||||
|
||||
for (auto BB = F->begin(); BB != F->end(); ++BB) {
|
||||
for (auto BI = BB->begin(); BI != BB->end(); BI++) {
|
||||
if (auto Load = dyn_cast<LoadInst>(BI)) {
|
||||
auto load_from = Load->getOperand(0);
|
||||
if (load_from == F->getParent()->getGlobalVariable("block_size")) {
|
||||
Load->replaceAllUsesWith(ConstantInt::get(
|
||||
I32, block_dim[0] * block_dim[1] * block_dim[2]));
|
||||
need_remove.push_back(Load);
|
||||
}
|
||||
} else if (auto Call = dyn_cast<CallInst>(BI)) {
|
||||
if (Call->getCalledFunction()) {
|
||||
auto func_name = Call->getCalledFunction()->getName().str();
|
||||
if (func_name == "llvm.nvvm.read.ptx.sreg.tid.x") {
|
||||
if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.x" ||
|
||||
func_name ==
|
||||
"_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv") {
|
||||
auto block_size_addr = M->getGlobalVariable("block_size_x");
|
||||
IRBuilder<> builder(context);
|
||||
builder.SetInsertPoint(Call);
|
||||
auto val = builder.CreateLoad(block_size_addr);
|
||||
Call->replaceAllUsesWith(val);
|
||||
need_remove.push_back(Call);
|
||||
} else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.y") {
|
||||
auto block_size_addr = M->getGlobalVariable("block_size_y");
|
||||
IRBuilder<> builder(context);
|
||||
builder.SetInsertPoint(Call);
|
||||
auto val = builder.CreateLoad(block_size_addr);
|
||||
Call->replaceAllUsesWith(val);
|
||||
need_remove.push_back(Call);
|
||||
} else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.z") {
|
||||
auto block_size_addr = M->getGlobalVariable("block_size_z");
|
||||
IRBuilder<> builder(context);
|
||||
builder.SetInsertPoint(Call);
|
||||
auto val = builder.CreateLoad(block_size_addr);
|
||||
Call->replaceAllUsesWith(val);
|
||||
need_remove.push_back(Call);
|
||||
} else if (func_name == "llvm.nvvm.read.ptx.sreg.tid.x" ||
|
||||
func_name == "_ZN26__cuda_builtin_threadIdx_t17__fetch_"
|
||||
"builtin_xEv") {
|
||||
// replace it by warp_id
|
||||
|
||||
IRBuilder<> builder(context);
|
||||
builder.SetInsertPoint(Call);
|
||||
|
||||
|
@ -234,12 +312,11 @@ void replace_built_in_function(llvm::Module *M, int *grid_dim, int *block_dim) {
|
|||
thread_idx = builder.CreateBinOp(
|
||||
Instruction::Add, builder.CreateLoad(local_intra_warp_idx),
|
||||
thread_idx, "thread_idx");
|
||||
if (block_dim[1] != 1 || block_dim[2] != 1) {
|
||||
printf("block y: %d block z: %d\n", block_dim[1], block_dim[2]);
|
||||
|
||||
thread_idx = builder.CreateBinOp(
|
||||
Instruction::SRem, thread_idx,
|
||||
ConstantInt::get(I32, block_dim[0]), "thread_id_x");
|
||||
}
|
||||
builder.CreateLoad(M->getGlobalVariable("block_size_x")),
|
||||
"thread_id_x");
|
||||
|
||||
Call->replaceAllUsesWith(thread_idx);
|
||||
need_remove.push_back(Call);
|
||||
|
@ -257,63 +334,61 @@ void replace_built_in_function(llvm::Module *M, int *grid_dim, int *block_dim) {
|
|||
// tidy = tid / block_dim.x
|
||||
thread_idx = builder.CreateBinOp(
|
||||
Instruction::SDiv, thread_idx,
|
||||
ConstantInt::get(I32, block_dim[0]),
|
||||
// builder.CreateLoad(M->getGlobalVariable("block_size_x")),
|
||||
builder.CreateLoad(M->getGlobalVariable("block_size_x")),
|
||||
"thread_id_y");
|
||||
|
||||
Call->replaceAllUsesWith(thread_idx);
|
||||
need_remove.push_back(Call);
|
||||
} else if (func_name == "llvm.nvvm.read.ptx.sreg.tid.z") {
|
||||
printf("[WARNING] We DO NOT support multi-dim block\n");
|
||||
printf("[WARNING] We DO NOT support triple-dim block\n");
|
||||
exit(1);
|
||||
auto zero = ConstantInt::get(I32, 0);
|
||||
Call->replaceAllUsesWith(zero);
|
||||
need_remove.push_back(Call);
|
||||
} else if (func_name == "llvm.nvvm.read.ptx.sreg.ctaid.x") {
|
||||
auto block_index_addr = M->getGlobalVariable("block_index");
|
||||
} else if (func_name == "llvm.nvvm.read.ptx.sreg.ctaid.x" ||
|
||||
func_name == "_ZN25__cuda_builtin_blockIdx_t17__fetch_"
|
||||
"builtin_xEv") {
|
||||
auto block_index_addr = M->getGlobalVariable("block_index_x");
|
||||
IRBuilder<> builder(context);
|
||||
builder.SetInsertPoint(Call);
|
||||
auto block_idx = builder.CreateLoad(block_index_addr);
|
||||
Call->replaceAllUsesWith(block_idx);
|
||||
need_remove.push_back(Call);
|
||||
} else if (func_name == "llvm.nvvm.read.ptx.sreg.ctaid.y" ||
|
||||
func_name == "llvm.nvvm.read.ptx.sreg.ctaid.z") {
|
||||
printf("[WARNING We DO NOT support multi-dim grid\n");
|
||||
auto zero = ConstantInt::get(I32, 0);
|
||||
Call->replaceAllUsesWith(zero);
|
||||
need_remove.push_back(Call);
|
||||
} else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.x") {
|
||||
auto block_size_addr = M->getGlobalVariable("block_size_x");
|
||||
} else if (func_name == "llvm.nvvm.read.ptx.sreg.ctaid.y") {
|
||||
auto block_index_addr = M->getGlobalVariable("block_index_y");
|
||||
IRBuilder<> builder(context);
|
||||
builder.SetInsertPoint(Call);
|
||||
auto block_size = ConstantInt::get(I32, block_dim[0]);
|
||||
Call->replaceAllUsesWith(block_size);
|
||||
auto block_idx = builder.CreateLoad(block_index_addr);
|
||||
Call->replaceAllUsesWith(block_idx);
|
||||
need_remove.push_back(Call);
|
||||
} else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.y") {
|
||||
auto block_size_addr = M->getGlobalVariable("block_size_y");
|
||||
} else if (func_name == "llvm.nvvm.read.ptx.sreg.ctaid.z") {
|
||||
auto block_index_addr = M->getGlobalVariable("block_index_z");
|
||||
IRBuilder<> builder(context);
|
||||
builder.SetInsertPoint(Call);
|
||||
auto block_size = ConstantInt::get(I32, block_dim[1]);
|
||||
Call->replaceAllUsesWith(block_size);
|
||||
auto block_idx = builder.CreateLoad(block_index_addr);
|
||||
Call->replaceAllUsesWith(block_idx);
|
||||
need_remove.push_back(Call);
|
||||
} else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.z") {
|
||||
auto block_size_addr = M->getGlobalVariable("block_size_z");
|
||||
} else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.x" ||
|
||||
func_name == "_ZN24__cuda_builtin_gridDim_t17__fetch_"
|
||||
"builtin_xEv") {
|
||||
auto grid_size_addr = M->getGlobalVariable("grid_size_x");
|
||||
IRBuilder<> builder(context);
|
||||
builder.SetInsertPoint(Call);
|
||||
auto block_size = ConstantInt::get(I32, block_dim[2]);
|
||||
Call->replaceAllUsesWith(block_size);
|
||||
need_remove.push_back(Call);
|
||||
} else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.x") {
|
||||
auto grid_size_addr = M->getGlobalVariable("grid_size");
|
||||
IRBuilder<> builder(context);
|
||||
builder.SetInsertPoint(Call);
|
||||
auto grid_size = ConstantInt::get(I32, grid_dim[0]);
|
||||
auto grid_size = builder.CreateLoad(grid_size_addr);
|
||||
Call->replaceAllUsesWith(grid_size);
|
||||
need_remove.push_back(Call);
|
||||
} else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.y" ||
|
||||
func_name == "llvm.nvvm.read.ptx.sreg.nctaid.z") {
|
||||
printf("[WARNING We DO NOT support multi-dim grid\n");
|
||||
auto one = ConstantInt::get(I32, 1);
|
||||
Call->replaceAllUsesWith(one);
|
||||
} else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.y") {
|
||||
auto grid_size_addr = M->getGlobalVariable("grid_size_y");
|
||||
IRBuilder<> builder(context);
|
||||
builder.SetInsertPoint(Call);
|
||||
auto grid_size = builder.CreateLoad(grid_size_addr);
|
||||
Call->replaceAllUsesWith(grid_size);
|
||||
need_remove.push_back(Call);
|
||||
} else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.z") {
|
||||
auto grid_size_addr = M->getGlobalVariable("grid_size_z");
|
||||
IRBuilder<> builder(context);
|
||||
builder.SetInsertPoint(Call);
|
||||
auto grid_size = builder.CreateLoad(grid_size_addr);
|
||||
Call->replaceAllUsesWith(grid_size);
|
||||
need_remove.push_back(Call);
|
||||
}
|
||||
}
|
||||
|
@ -334,6 +409,98 @@ void replace_built_in_function(llvm::Module *M, int *grid_dim, int *block_dim) {
|
|||
}
|
||||
}
|
||||
}
|
||||
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||
Function *F = &(*i);
|
||||
for (auto BB = F->begin(); BB != F->end(); ++BB) {
|
||||
for (auto BI = BB->begin(); BI != BB->end(); BI++) {
|
||||
if (auto Call = dyn_cast<CallInst>(BI)) {
|
||||
if (Call->getCalledFunction()) {
|
||||
auto func_name = Call->getCalledFunction()->getName().str();
|
||||
auto callFn = Call->getCalledFunction();
|
||||
if (func_name == "vprintf") {
|
||||
/*
|
||||
* replace CUDA's printf to C's printf
|
||||
* CUDA:
|
||||
* %0 = tail call i32 @vprintf(i8* getelementptr inbounds ([19 x
|
||||
* i8], [19 x i8]* @.str, i64 0, i64 0), i8* null)
|
||||
* C: %call1 = call i32 (i8*, ...) @printf(i8* getelementptr
|
||||
* inbounds ([45 x i8], [45 x i8]* @.str.1, i64 0, i64 0))
|
||||
*/
|
||||
// find/create C's printf function
|
||||
std::vector<llvm::Type *> args;
|
||||
args.push_back(llvm::Type::getInt8PtrTy(context));
|
||||
llvm::FunctionType *printfType =
|
||||
FunctionType::get(I32, args, true);
|
||||
|
||||
llvm::FunctionCallee _f =
|
||||
M->getOrInsertFunction("printf", printfType);
|
||||
llvm::Function *func_printf =
|
||||
llvm::cast<llvm::Function>(_f.getCallee());
|
||||
// construct argument(s)
|
||||
std::vector<Value *> printf_args;
|
||||
// first argument is same between CUDA and C
|
||||
auto placeholder = Call->getArgOperand(0);
|
||||
printf_args.push_back(placeholder);
|
||||
// insert arguments
|
||||
auto compressed_args = Call->getArgOperand(1);
|
||||
if (auto BC = dyn_cast<BitCastInst>(compressed_args)) {
|
||||
auto src_alloc = BC->getOperand(0);
|
||||
auto SrcPointTy =
|
||||
dyn_cast<PointerType>(BC->getOperand(0)->getType());
|
||||
auto SrcTy = SrcPointTy->getElementType();
|
||||
// reverse the bitcast
|
||||
auto reverse_BC = new BitCastInst(BC, SrcPointTy, "", Call);
|
||||
assert(SrcTy->isStructTy() == 1);
|
||||
auto StructTy = dyn_cast<StructType>(SrcTy);
|
||||
for (int i = 0; i < StructTy->getNumElements(); i++) {
|
||||
std::vector<Value *> Indices;
|
||||
Indices.push_back(ConstantInt::get(I32, 0));
|
||||
Indices.push_back(ConstantInt::get(I32, i));
|
||||
auto new_GEP = GetElementPtrInst::Create(NULL, // Pointee type
|
||||
src_alloc, // Alloca
|
||||
Indices, // Indices
|
||||
"", Call);
|
||||
auto new_load = new LoadInst(new_GEP, "", Call);
|
||||
printf_args.push_back(new_load);
|
||||
}
|
||||
}
|
||||
auto c_printf_inst =
|
||||
llvm::CallInst::Create(func_printf, printf_args, "", Call);
|
||||
// insert
|
||||
Call->replaceAllUsesWith(c_printf_inst);
|
||||
need_remove.push_back(Call);
|
||||
} else if (func_name == "__nv_fast_log2f" ||
|
||||
func_name == "__nv_log2f" ||
|
||||
func_name == "__nv_fast_powf" ||
|
||||
func_name == "__nv_powf" || func_name == "__nv_logf" ||
|
||||
func_name == "__nv_expf" || func_name == "__nv_fabsf" ||
|
||||
func_name == "__nv_log10f" ||
|
||||
func_name == "__nv_fmodf" || func_name == "__nv_sqrt" ||
|
||||
func_name == "__nv_sqrtf" || func_name == "__nv_exp" ||
|
||||
func_name == "__nv_isnanf" ||
|
||||
func_name == "__nv_isinff" || func_name == "__nv_powi" ||
|
||||
func_name == "__nv_powif") {
|
||||
Call->getCalledFunction()->deleteBody();
|
||||
} else if (func_name == "llvm.nvvm.fma.rn.d") {
|
||||
Call->getCalledFunction()->setName("__nvvm_fma_rn_d");
|
||||
} else if (func_name == "llvm.nvvm.d2i.lo") {
|
||||
Call->getCalledFunction()->setName("__nvvm_d2i_lo");
|
||||
} else if (func_name == "llvm.nvvm.d2i.hi") {
|
||||
Call->getCalledFunction()->setName("__nvvm_d2i_hi");
|
||||
} else if (func_name == "llvm.nvvm.add.rn.d") {
|
||||
Call->getCalledFunction()->setName("__nvvm_add_rn_d");
|
||||
} else if (func_name == "llvm.nvvm.lohi.i2d") {
|
||||
Call->getCalledFunction()->setName("__nvvm_lohi_i2d");
|
||||
} else if (func_name == "llvm.nvvm.fabs.f") {
|
||||
Call->getCalledFunction()->setName("__nvvm_fabs_f");
|
||||
} else if (func_name == "llvm.nvvm.mul24.i") {
|
||||
Call->getCalledFunction()->setName("__nvvm_mul24_i");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (auto inst : need_remove) {
|
||||
inst->eraseFromParent();
|
||||
|
@ -382,6 +549,8 @@ bool has_warp_barrier(llvm::BasicBlock *B) {
|
|||
Instruction *inst = &(*i);
|
||||
llvm::CallInst *Call = llvm::dyn_cast<llvm::CallInst>(inst);
|
||||
if (Call) {
|
||||
if (Call->isInlineAsm())
|
||||
continue;
|
||||
auto func_name = Call->getCalledFunction()->getName().str();
|
||||
if (func_name == "llvm.nvvm.bar.warp.sync") {
|
||||
return true;
|
||||
|
@ -396,6 +565,8 @@ bool has_barrier(llvm::BasicBlock *B) {
|
|||
Instruction *inst = &(*i);
|
||||
llvm::CallInst *Call = llvm::dyn_cast<llvm::CallInst>(inst);
|
||||
if (Call) {
|
||||
if (Call->isInlineAsm())
|
||||
continue;
|
||||
auto func_name = Call->getCalledFunction()->getName().str();
|
||||
if (func_name == "llvm.nvvm.barrier0" ||
|
||||
func_name == "llvm.nvvm.bar.warp.sync" ||
|
||||
|
@ -412,6 +583,8 @@ bool has_block_barrier(llvm::BasicBlock *B) {
|
|||
Instruction *inst = &(*i);
|
||||
llvm::CallInst *Call = llvm::dyn_cast<llvm::CallInst>(inst);
|
||||
if (Call) {
|
||||
if (Call->isInlineAsm())
|
||||
continue;
|
||||
auto func_name = Call->getCalledFunction()->getName().str();
|
||||
if (func_name == "llvm.nvvm.barrier0" ||
|
||||
func_name == "llvm.nvvm.barrier.sync") {
|
||||
|
@ -478,3 +651,21 @@ bool find_barrier_in_region(llvm::BasicBlock *start, llvm::BasicBlock *end) {
|
|||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
Print IR to String Output for Debugging Purposes
|
||||
*/
|
||||
// void printModule(llvm::Module *M) {
|
||||
// std::string str;
|
||||
// llvm::raw_string_ostream ss(str);
|
||||
// std::cout << "### Printing Module ###" << std::endl;
|
||||
// for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
|
||||
// Function *F = &(*i);
|
||||
// auto func_name = F->getName().str();
|
||||
// std::cout << func_name << std::endl;
|
||||
// for (Function::iterator b = F->begin(); b != F->end(); ++b) {
|
||||
// BasicBlock *B = &(*b);
|
||||
// errs() << *B;
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
|
|
@ -44,6 +44,8 @@ void handle_warp_vote(llvm::Module *M) {
|
|||
for (Function::iterator E = F->end(); I != E; ++I) {
|
||||
for (BasicBlock::iterator BI = I->begin(); BI != I->end(); BI++) {
|
||||
if (CallInst *vote_any_sync = dyn_cast<CallInst>(BI)) {
|
||||
if (vote_any_sync->isInlineAsm())
|
||||
continue;
|
||||
auto func_name = vote_any_sync->getCalledFunction()->getName();
|
||||
if (func_name == "llvm.nvvm.vote.any.sync" ||
|
||||
func_name == "llvm.nvvm.vote.all.sync") {
|
||||
|
|
|
@ -1,82 +0,0 @@
|
|||
#include <assert.h>
|
||||
#include <pthread.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <sys/time.h>
|
||||
|
||||
// Launch configuration of this test driver: block_size is 32 * NUM_WARP and
// NUM_BLOCK is used for grid_size and for the number of pthreads in main().
#define NUM_WARP 2
|
||||
#define NUM_BLOCK 1
|
||||
|
||||
// Block dimensions: everything is placed on the x axis, y and z are 1.
int block_size = 32 * NUM_WARP;
|
||||
int block_size_x = block_size;
|
||||
int block_size_y = 1;
|
||||
int block_size_z = 1;
|
||||
// Thread-local block index; wrap() assigns it before calling the kernel wrapper.
__thread int block_index = 0;
|
||||
int grid_size = NUM_BLOCK;
|
||||
|
||||
// C-linkage symbols: the kernel wrapper is only declared here
// (presumably defined by the translated kernel object — confirm), and
// warp_shfl is a thread-local array of 32 ints.
extern "C" {
|
||||
void *_Z7reduce0PiS_j_wrapper(void *);
|
||||
__thread int warp_shfl[32];
|
||||
}
|
||||
|
||||
// pthread entry point: reads the block index out of slot 3 of the packed
// argument array, stores it in the thread-local block_index, then hands the
// untouched argument pointer to the kernel wrapper. Always returns NULL.
void *wrap(void *p) {
  int **packed_args = (int **)p;
  int *bid_slot = packed_args[3];
  block_index = *bid_slot;

  _Z7reduce0PiS_j_wrapper(p);
  return NULL;
}
|
||||
|
||||
// Packs the kernel arguments for one block into a heap-allocated array of four
// slots. Every slot points to its own heap cell holding one value:
//   slot 0 -> cell holding g_idata   (int *)
//   slot 1 -> cell holding g_odata   (int *)
//   slot 2 -> cell holding n         (unsigned int)
//   slot 3 -> cell holding bid       (int)
// Returns the slot array as void *; nothing allocated here is freed by this
// function.
void *gen_input(int bid, int *g_idata, int *g_odata, unsigned int n) {
  int **slots = new int *[4];

  int **input_cell = new int *(g_idata);
  slots[0] = (int *)input_cell;

  int **output_cell = new int *(g_odata);
  slots[1] = (int *)output_cell;

  unsigned int *count_cell = new unsigned int(n);
  slots[2] = (int *)count_cell;

  slots[3] = new int(bid);

  return slots;
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
int *g_idata;
|
||||
|
||||
int size = block_size * NUM_BLOCK;
|
||||
g_idata = new int[size * 2];
|
||||
int *res = new int[size];
|
||||
|
||||
for (int i = 0; i < size; i++) {
|
||||
g_idata[i] = i;
|
||||
}
|
||||
|
||||
pthread_t threads[NUM_BLOCK];
|
||||
|
||||
void *inp[NUM_BLOCK];
|
||||
for (long t = 0; t < NUM_BLOCK; t++) {
|
||||
inp[t] = gen_input(t, g_idata, res, size);
|
||||
}
|
||||
|
||||
for (long t = 0; t < NUM_BLOCK; t++) {
|
||||
pthread_create(&threads[t], NULL, wrap, inp[t]);
|
||||
}
|
||||
for (long t = 0; t < NUM_BLOCK; t++)
|
||||
pthread_join(threads[t], NULL);
|
||||
int gold = 0;
|
||||
for (int i = 0; i < size; i++) {
|
||||
gold += g_idata[i];
|
||||
}
|
||||
assert(*res == gold && "Incorrect res\n");
|
||||
printf("PASS\n");
|
||||
|
||||
pthread_exit(NULL);
|
||||
}
|
|
@ -1,150 +0,0 @@
|
|||
; ModuleID = 'kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc'
|
||||
source_filename = "kernel.cu"
|
||||
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
|
||||
target triple = "nvptx64-nvidia-cuda"
|
||||
|
||||
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
|
||||
|
||||
@_ZZ7reduce0PiS_jE5sdata = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
|
||||
|
||||
; Function Attrs: nounwind
|
||||
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) local_unnamed_addr #0 {
|
||||
entry:
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind
|
||||
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) local_unnamed_addr #0 {
|
||||
entry:
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind
|
||||
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) local_unnamed_addr #0 {
|
||||
entry:
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind
|
||||
define weak dso_local i32 @cudaGetDevice(i32* %device) local_unnamed_addr #0 {
|
||||
entry:
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind
|
||||
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) local_unnamed_addr #0 {
|
||||
entry:
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind
|
||||
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) local_unnamed_addr #0 {
|
||||
entry:
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent nounwind
|
||||
define dso_local void @_Z7reduce0PiS_j(i32* nocapture readonly %g_idata, i32* nocapture %g_odata, i32 %n) local_unnamed_addr #1 {
|
||||
entry:
|
||||
%0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #4, !range !10
|
||||
%1 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #4, !range !11
|
||||
%2 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #4, !range !12
|
||||
%mul = mul i32 %2, %1
|
||||
%add = add i32 %mul, %0
|
||||
%cmp = icmp ult i32 %add, %n
|
||||
br i1 %cmp, label %cond.true, label %cond.end
|
||||
|
||||
cond.true: ; preds = %entry
|
||||
%idxprom = zext i32 %add to i64
|
||||
%arrayidx = getelementptr inbounds i32, i32* %g_idata, i64 %idxprom
|
||||
%3 = load i32, i32* %arrayidx, align 4, !tbaa !13
|
||||
br label %cond.end
|
||||
|
||||
cond.end: ; preds = %entry, %cond.true
|
||||
%cond = phi i32 [ %3, %cond.true ], [ 0, %entry ]
|
||||
%idxprom5 = zext i32 %0 to i64
|
||||
%arrayidx635 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @_ZZ7reduce0PiS_jE5sdata, i64 0, i64 %idxprom5
|
||||
%arrayidx6 = addrspacecast i32 addrspace(3)* %arrayidx635 to i32*
|
||||
store i32 %cond, i32* %arrayidx6, align 4, !tbaa !13
|
||||
tail call void @llvm.nvvm.barrier.sync(i32 0) #4
|
||||
%cmp839 = icmp ugt i32 %2, 1
|
||||
br i1 %cmp839, label %for.body, label %for.cond.cleanup
|
||||
|
||||
for.cond.cleanup: ; preds = %if.end, %cond.end
|
||||
%cmp18 = icmp eq i32 %0, 0
|
||||
br i1 %cmp18, label %if.then19, label %if.end23
|
||||
|
||||
for.body: ; preds = %cond.end, %if.end
|
||||
%s.040 = phi i32 [ %mul9, %if.end ], [ 1, %cond.end ]
|
||||
%mul9 = shl nuw nsw i32 %s.040, 1
|
||||
%rem = urem i32 %0, %mul9
|
||||
%cmp10 = icmp eq i32 %rem, 0
|
||||
br i1 %cmp10, label %if.then, label %if.end
|
||||
|
||||
if.then: ; preds = %for.body
|
||||
%add11 = add i32 %s.040, %0
|
||||
%idxprom12 = zext i32 %add11 to i64
|
||||
%arrayidx1336 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @_ZZ7reduce0PiS_jE5sdata, i64 0, i64 %idxprom12
|
||||
%arrayidx13 = addrspacecast i32 addrspace(3)* %arrayidx1336 to i32*
|
||||
%4 = load i32, i32* %arrayidx13, align 4, !tbaa !13
|
||||
%5 = load i32, i32* %arrayidx6, align 4, !tbaa !13
|
||||
%add16 = add nsw i32 %5, %4
|
||||
store i32 %add16, i32* %arrayidx6, align 4, !tbaa !13
|
||||
br label %if.end
|
||||
|
||||
if.end: ; preds = %if.then, %for.body
|
||||
tail call void @llvm.nvvm.barrier.sync(i32 0) #4
|
||||
%cmp8 = icmp ult i32 %mul9, %2
|
||||
br i1 %cmp8, label %for.body, label %for.cond.cleanup
|
||||
|
||||
if.then19: ; preds = %for.cond.cleanup
|
||||
%idxprom21 = zext i32 %1 to i64
|
||||
%arrayidx22 = getelementptr inbounds i32, i32* %g_odata, i64 %idxprom21
|
||||
%6 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* addrspacecast ([64 x i32] addrspace(3)* @_ZZ7reduce0PiS_jE5sdata to [64 x i32]*), i64 0, i64 0), align 4, !tbaa !13
|
||||
store i32 %6, i32* %arrayidx22, align 4, !tbaa !13
|
||||
br label %if.end23
|
||||
|
||||
if.end23: ; preds = %if.then19, %for.cond.cleanup
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2
|
||||
|
||||
; Function Attrs: convergent nounwind
|
||||
declare void @llvm.nvvm.barrier.sync(i32) #3
|
||||
|
||||
attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #1 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #2 = { nounwind readnone }
|
||||
attributes #3 = { convergent nounwind }
|
||||
attributes #4 = { nounwind }
|
||||
|
||||
!llvm.module.flags = !{!0, !1, !2}
|
||||
!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
|
||||
!llvm.ident = !{!8}
|
||||
!nvvmir.version = !{!9}
|
||||
|
||||
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
|
||||
!1 = !{i32 1, !"wchar_size", i32 4}
|
||||
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
|
||||
!3 = !{void (i32*, i32*, i32)* @_Z7reduce0PiS_j, !"kernel", i32 1}
|
||||
!4 = !{null, !"align", i32 8}
|
||||
!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
|
||||
!6 = !{null, !"align", i32 16}
|
||||
!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
|
||||
!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
|
||||
!9 = !{i32 1, i32 4}
|
||||
!10 = !{i32 0, i32 1024}
|
||||
!11 = !{i32 0, i32 2147483647}
|
||||
!12 = !{i32 1, i32 1025}
|
||||
!13 = !{!14, !14, i64 0}
|
||||
!14 = !{!"int", !15, i64 0}
|
||||
!15 = !{!"omnipotent char", !16, i64 0}
|
||||
!16 = !{!"Simple C++ TBAA"}
|
|
@ -1,6 +0,0 @@
|
|||
#!/bin/bash
# Assemble the kernel IR, translate it with kernelTranslator, compile it to an
# object file, link it with the host driver and run the result.
# NOTE(review): the six trailing numbers are presumably grid and block
# dimensions -- confirm against kernelTranslator's argument parsing.

# Stop at the first failing step so a stale ./test from an earlier build is
# never executed.
set -e

llvm-as kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
../../../build/compilation/kernelTranslator kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc 1 1 1 64 1 1
llc --filetype=obj kernel.bc
g++ host.cpp kernel.o -lpthread -o test
./test
|
|
@ -1,82 +0,0 @@
|
|||
#include <assert.h>
|
||||
#include <pthread.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <sys/time.h>
|
||||
|
||||
#define NUM_WARP 2
|
||||
#define NUM_BLOCK 1
|
||||
|
||||
int block_size = 32 * NUM_WARP;
|
||||
int block_size_x = block_size;
|
||||
int block_size_y = 1;
|
||||
int block_size_z = 1;
|
||||
__thread int block_index = 0;
|
||||
int grid_size = NUM_BLOCK;
|
||||
|
||||
extern "C" {
|
||||
void *_Z7reduce5PiS_j_wrapper(void *);
|
||||
__thread int warp_shfl[32];
|
||||
}
|
||||
|
||||
// pthread entry point for one block: slot 3 of the argument table holds the
// block id, which is stored in the thread-local block_index before the
// generated kernel wrapper is invoked with the same table.
void *wrap(void *p) {
  int **args = static_cast<int **>(p);
  const int *bid_cell = args[3];
  block_index = *bid_cell;
  _Z7reduce5PiS_j_wrapper(p);
  return NULL;
}
|
||||
|
||||
// Builds the heap-allocated argument table for one block. Every slot points
// at its own heap cell holding a copy of one value:
//   [0] g_idata pointer, [1] g_odata pointer, [2] n, [3] bid.
// Nothing allocated here is released by this function.
void *gen_input(int bid, int *g_idata, int *g_odata, unsigned int n) {
  int **args = new int *[4];

  args[0] = reinterpret_cast<int *>(new int *(g_idata));
  args[1] = reinterpret_cast<int *>(new int *(g_odata));
  args[2] = reinterpret_cast<int *>(new unsigned int(n));
  args[3] = new int(bid);

  return args;
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
int *g_idata;
|
||||
|
||||
int size = block_size * NUM_BLOCK;
|
||||
g_idata = new int[size * 2];
|
||||
int *res = new int[size];
|
||||
|
||||
for (int i = 0; i < size; i++) {
|
||||
g_idata[i] = i;
|
||||
}
|
||||
|
||||
pthread_t threads[NUM_BLOCK];
|
||||
|
||||
void *inp[NUM_BLOCK];
|
||||
for (long t = 0; t < NUM_BLOCK; t++) {
|
||||
inp[t] = gen_input(t, g_idata, res, size);
|
||||
}
|
||||
|
||||
for (long t = 0; t < NUM_BLOCK; t++) {
|
||||
pthread_create(&threads[t], NULL, wrap, inp[t]);
|
||||
}
|
||||
for (long t = 0; t < NUM_BLOCK; t++)
|
||||
pthread_join(threads[t], NULL);
|
||||
int gold = 0;
|
||||
for (int i = 0; i < size; i++) {
|
||||
gold += g_idata[i];
|
||||
}
|
||||
assert(*res == gold && "Incorrect res\n");
|
||||
printf("PASS\n");
|
||||
|
||||
pthread_exit(NULL);
|
||||
}
|
|
@ -1,179 +0,0 @@
|
|||
; ModuleID = 'kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc'
|
||||
source_filename = "kernel.cu"
|
||||
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
|
||||
target triple = "nvptx64-nvidia-cuda"
|
||||
|
||||
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
|
||||
|
||||
@_ZZ7reduce5PiS_jE5sdata = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
|
||||
|
||||
; Function Attrs: nounwind
|
||||
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) local_unnamed_addr #0 {
|
||||
entry:
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind
|
||||
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) local_unnamed_addr #0 {
|
||||
entry:
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind
|
||||
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) local_unnamed_addr #0 {
|
||||
entry:
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind
|
||||
define weak dso_local i32 @cudaGetDevice(i32* %device) local_unnamed_addr #0 {
|
||||
entry:
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind
|
||||
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) local_unnamed_addr #0 {
|
||||
entry:
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind
|
||||
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) local_unnamed_addr #0 {
|
||||
entry:
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent nounwind
|
||||
define dso_local void @_Z7reduce5PiS_j(i32* nocapture readonly %g_idata, i32* nocapture %g_odata, i32 %n) local_unnamed_addr #1 {
|
||||
entry:
|
||||
%0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #5, !range !10
|
||||
%1 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #5, !range !11
|
||||
%mul = shl i32 %1, 7
|
||||
%add = add i32 %mul, %0
|
||||
%cmp = icmp ult i32 %add, %n
|
||||
br i1 %cmp, label %cond.true, label %cond.end
|
||||
|
||||
cond.true: ; preds = %entry
|
||||
%idxprom = zext i32 %add to i64
|
||||
%arrayidx = getelementptr inbounds i32, i32* %g_idata, i64 %idxprom
|
||||
%2 = load i32, i32* %arrayidx, align 4, !tbaa !12
|
||||
br label %cond.end
|
||||
|
||||
cond.end: ; preds = %entry, %cond.true
|
||||
%cond = phi i32 [ %2, %cond.true ], [ 0, %entry ]
|
||||
%add4 = add i32 %add, 64
|
||||
%cmp5 = icmp ult i32 %add4, %n
|
||||
br i1 %cmp5, label %if.then, label %if.end
|
||||
|
||||
if.then: ; preds = %cond.end
|
||||
%idxprom7 = zext i32 %add4 to i64
|
||||
%arrayidx8 = getelementptr inbounds i32, i32* %g_idata, i64 %idxprom7
|
||||
%3 = load i32, i32* %arrayidx8, align 4, !tbaa !12
|
||||
%add9 = add nsw i32 %3, %cond
|
||||
br label %if.end
|
||||
|
||||
if.end: ; preds = %if.then, %cond.end
|
||||
%mySum.0 = phi i32 [ %add9, %if.then ], [ %cond, %cond.end ]
|
||||
%idxprom10 = zext i32 %0 to i64
|
||||
%arrayidx1150 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @_ZZ7reduce5PiS_jE5sdata, i64 0, i64 %idxprom10
|
||||
%arrayidx11 = addrspacecast i32 addrspace(3)* %arrayidx1150 to i32*
|
||||
store i32 %mySum.0, i32* %arrayidx11, align 4, !tbaa !12
|
||||
tail call void @llvm.nvvm.barrier.sync(i32 0) #5
|
||||
tail call void @llvm.nvvm.barrier.sync(i32 0) #5
|
||||
tail call void @llvm.nvvm.barrier.sync(i32 0) #5
|
||||
tail call void @llvm.nvvm.barrier.sync(i32 0) #5
|
||||
%4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.z() #5, !range !16
|
||||
%5 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.y() #5, !range !17
|
||||
%mul.i.i52 = mul nuw nsw i32 %5, %4
|
||||
%6 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #5, !range !17
|
||||
%7 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.y() #5, !range !10
|
||||
%mul39.i.i53 = add nuw nsw i32 %7, %mul.i.i52
|
||||
%add.i.i54 = mul nuw nsw i32 %mul39.i.i53, %6
|
||||
%add8.i.i55 = add nuw nsw i32 %add.i.i54, %0
|
||||
%cmp14 = icmp ult i32 %add8.i.i55, 32
|
||||
br i1 %cmp14, label %if.then15, label %if.end32
|
||||
|
||||
if.then15: ; preds = %if.end
|
||||
%add16 = add nuw nsw i32 %0, 32
|
||||
%idxprom17 = zext i32 %add16 to i64
|
||||
%arrayidx1851 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @_ZZ7reduce5PiS_jE5sdata, i64 0, i64 %idxprom17
|
||||
%arrayidx18 = addrspacecast i32 addrspace(3)* %arrayidx1851 to i32*
|
||||
%8 = load i32, i32* %arrayidx18, align 4, !tbaa !12
|
||||
%add19 = add nsw i32 %8, %mySum.0
|
||||
%9 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add19, i32 16, i32 31) #5
|
||||
%add23 = add nsw i32 %9, %add19
|
||||
%10 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add23, i32 8, i32 31) #5
|
||||
%add23.1 = add nsw i32 %10, %add23
|
||||
%11 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add23.1, i32 4, i32 31) #5
|
||||
%add23.2 = add nsw i32 %11, %add23.1
|
||||
%12 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add23.2, i32 2, i32 31) #5
|
||||
%add23.3 = add nsw i32 %12, %add23.2
|
||||
%13 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add23.3, i32 1, i32 31) #5
|
||||
%cmp27 = icmp eq i32 %add8.i.i55, 0
|
||||
br i1 %cmp27, label %if.then28, label %if.end32
|
||||
|
||||
if.then28: ; preds = %if.then15
|
||||
%add23.4 = add nsw i32 %13, %add23.3
|
||||
%idxprom30 = zext i32 %1 to i64
|
||||
%arrayidx31 = getelementptr inbounds i32, i32* %g_odata, i64 %idxprom30
|
||||
store i32 %add23.4, i32* %arrayidx31, align 4, !tbaa !12
|
||||
br label %if.end32
|
||||
|
||||
if.end32: ; preds = %if.end, %if.then28, %if.then15
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.tid.z() #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y() #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #2
|
||||
|
||||
; Function Attrs: convergent nounwind
|
||||
declare void @llvm.nvvm.barrier.sync(i32) #3
|
||||
|
||||
; Function Attrs: convergent inaccessiblememonly nounwind
|
||||
declare i32 @llvm.nvvm.shfl.sync.down.i32(i32, i32, i32, i32) #4
|
||||
|
||||
attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #1 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #2 = { nounwind readnone }
|
||||
attributes #3 = { convergent nounwind }
|
||||
attributes #4 = { convergent inaccessiblememonly nounwind }
|
||||
attributes #5 = { nounwind }
|
||||
|
||||
!llvm.module.flags = !{!0, !1, !2}
|
||||
!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
|
||||
!llvm.ident = !{!8}
|
||||
!nvvmir.version = !{!9}
|
||||
|
||||
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
|
||||
!1 = !{i32 1, !"wchar_size", i32 4}
|
||||
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
|
||||
!3 = !{void (i32*, i32*, i32)* @_Z7reduce5PiS_j, !"kernel", i32 1}
|
||||
!4 = !{null, !"align", i32 8}
|
||||
!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
|
||||
!6 = !{null, !"align", i32 16}
|
||||
!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
|
||||
!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
|
||||
!9 = !{i32 1, i32 4}
|
||||
!10 = !{i32 0, i32 1024}
|
||||
!11 = !{i32 0, i32 2147483647}
|
||||
!12 = !{!13, !13, i64 0}
|
||||
!13 = !{!"int", !14, i64 0}
|
||||
!14 = !{!"omnipotent char", !15, i64 0}
|
||||
!15 = !{!"Simple C++ TBAA"}
|
||||
!16 = !{i32 0, i32 64}
|
||||
!17 = !{i32 1, i32 1025}
|
|
@ -1,6 +0,0 @@
|
|||
#!/bin/bash
# Assemble the kernel IR, translate it with kernelTranslator, compile it to an
# object file, link it with the host driver and run the result.
# NOTE(review): the six trailing numbers are presumably grid and block
# dimensions -- confirm against kernelTranslator's argument parsing.

# Stop at the first failing step so a stale ./test from an earlier build is
# never executed.
set -e

llvm-as kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
../../../build/compilation/kernelTranslator kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc 1 1 1 64 1 1
llc --filetype=obj kernel.bc
g++ host.cpp kernel.o -lpthread -o test
./test
|
|
@ -1,11 +0,0 @@
|
|||
#!/bin/sh
# Run run.sh inside every sub-directory of the current directory.
for file in ./*
do
    if test -d "$file"
    then
        echo "executing $file"
        # The subshell keeps this script's working directory unchanged, so a
        # failed cd can no longer make a later "cd .." climb out of the tree.
        (cd "$file" && bash run.sh)
    fi
done
|
|
@ -1,84 +0,0 @@
|
|||
#include <assert.h>
|
||||
#include <pthread.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <time.h>
|
||||
|
||||
#define NUM_BLOCK 1
|
||||
int N = 32;
|
||||
|
||||
int block_size = 32;
|
||||
int block_size_x = block_size;
|
||||
int block_size_y = 1;
|
||||
int block_size_z = 1;
|
||||
__thread int block_index = 0;
|
||||
int grid_size = NUM_BLOCK;
|
||||
|
||||
extern "C" {
|
||||
void *_Z9vectorAddPKfS0_Pfi_wrapper(void *);
|
||||
}
|
||||
|
||||
// pthread entry point for one block: slot 4 of the argument table holds the
// block id, which is stored in the thread-local block_index before the
// generated kernel wrapper is invoked with the same table.
void *wrap(void *p) {
  int **args = static_cast<int **>(p);
  const int *bid_cell = args[4];
  block_index = *bid_cell;
  _Z9vectorAddPKfS0_Pfi_wrapper(p);
  return NULL;
}
|
||||
|
||||
// Builds the heap-allocated argument table for one block. Every slot points
// at its own heap cell holding a copy of one value:
//   [0] A pointer, [1] B pointer, [2] C pointer, [3] N, [4] bid.
// Nothing allocated here is released by this function.
void *gen_input(int bid, float *A, float *B, float *C, int N) {
  int **args = new int *[5];

  args[0] = reinterpret_cast<int *>(new float *(A));
  args[1] = reinterpret_cast<int *>(new float *(B));
  args[2] = reinterpret_cast<int *>(new float *(C));
  args[3] = new int(N);
  args[4] = new int(bid);

  return args;
}
|
||||
|
||||
int main() {
|
||||
float *A, *B, *C;
|
||||
|
||||
A = new float[N];
|
||||
B = new float[N];
|
||||
C = new float[N];
|
||||
|
||||
for (int i = 0; i < N; i++) {
|
||||
A[i] = i;
|
||||
B[i] = 1;
|
||||
C[i] = 0;
|
||||
}
|
||||
|
||||
pthread_t threads[NUM_BLOCK];
|
||||
|
||||
int rc;
|
||||
for (long t = 0; t < NUM_BLOCK; t++) {
|
||||
void *inp = gen_input(t, A, B, C, N);
|
||||
rc = pthread_create(&threads[t], NULL, wrap, inp);
|
||||
}
|
||||
clock_t t1 = clock();
|
||||
/* Last thing that main() should do */
|
||||
for (long t = 0; t < NUM_BLOCK; t++)
|
||||
pthread_join(threads[t], NULL);
|
||||
|
||||
for (int i = 0; i < N; i++) {
|
||||
assert(C[i] == (A[i] + B[i]));
|
||||
}
|
||||
printf("PASS\n");
|
||||
pthread_exit(NULL);
|
||||
}
|
|
@ -1,86 +0,0 @@
|
|||
; ModuleID = 'kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc'
|
||||
source_filename = "kernel.cu"
|
||||
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
|
||||
target triple = "nvptx64-nvidia-cuda"
|
||||
|
||||
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
|
||||
|
||||
; Function Attrs: nounwind
|
||||
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) local_unnamed_addr #0 {
|
||||
entry:
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind
|
||||
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) local_unnamed_addr #0 {
|
||||
entry:
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind
|
||||
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) local_unnamed_addr #0 {
|
||||
entry:
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind
|
||||
define weak dso_local i32 @cudaGetDevice(i32* %device) local_unnamed_addr #0 {
|
||||
entry:
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind
|
||||
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) local_unnamed_addr #0 {
|
||||
entry:
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind
|
||||
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) local_unnamed_addr #0 {
|
||||
entry:
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: nofree nounwind
|
||||
define dso_local void @_Z9vectorAddPKfS0_Pfi(float* nocapture readonly %A, float* nocapture readonly %B, float* nocapture %C, i32 %numElements) local_unnamed_addr #1 {
|
||||
entry:
|
||||
%0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3, !range !10
|
||||
%idxprom8 = zext i32 %0 to i64
|
||||
%arrayidx = getelementptr inbounds float, float* %A, i64 %idxprom8
|
||||
%1 = load float, float* %arrayidx, align 4, !tbaa !11
|
||||
%arrayidx2 = getelementptr inbounds float, float* %B, i64 %idxprom8
|
||||
%2 = load float, float* %arrayidx2, align 4, !tbaa !11
|
||||
%add = fadd contract float %1, %2
|
||||
%arrayidx4 = getelementptr inbounds float, float* %C, i64 %idxprom8
|
||||
store float %add, float* %arrayidx4, align 4, !tbaa !11
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
|
||||
|
||||
attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #1 = { nofree nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #2 = { nounwind readnone }
|
||||
attributes #3 = { nounwind }
|
||||
|
||||
!llvm.module.flags = !{!0, !1, !2}
|
||||
!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
|
||||
!llvm.ident = !{!8}
|
||||
!nvvmir.version = !{!9}
|
||||
|
||||
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
|
||||
!1 = !{i32 1, !"wchar_size", i32 4}
|
||||
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
|
||||
!3 = !{void (float*, float*, float*, i32)* @_Z9vectorAddPKfS0_Pfi, !"kernel", i32 1}
|
||||
!4 = !{null, !"align", i32 8}
|
||||
!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
|
||||
!6 = !{null, !"align", i32 16}
|
||||
!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
|
||||
!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
|
||||
!9 = !{i32 1, i32 4}
|
||||
!10 = !{i32 0, i32 1024}
|
||||
!11 = !{!12, !12, i64 0}
|
||||
!12 = !{!"float", !13, i64 0}
|
||||
!13 = !{!"omnipotent char", !14, i64 0}
|
||||
!14 = !{!"Simple C++ TBAA"}
|
|
@ -1,6 +0,0 @@
|
|||
#!/bin/bash
# Assemble the kernel IR, translate it with kernelTranslator, compile it to an
# object file, link it with the host driver and run the result.
# NOTE(review): the six trailing numbers are presumably grid and block
# dimensions -- confirm against kernelTranslator's argument parsing.

# Stop at the first failing step so a stale ./test from an earlier build is
# never executed.
set -e

llvm-as kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
../../../build/compilation/kernelTranslator kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc 1 1 1 32 1 1
llc --filetype=obj kernel.bc
g++ host.cpp kernel.o -lpthread -o test
./test
|
Binary file not shown.
Before Width: | Height: | Size: 109 KiB |
|
@ -1,11 +0,0 @@
|
|||
# The workflow of CuPBoP
|
||||
|
||||
The workflow of CuPBoP is as follows:
|
||||
![The workflow of executing CUDA applications on CuPBoP.](figures/workflow.png)
|
||||
First, CuPBoP uses Clang to compile the CUDA source code into NVVM IR,
|
||||
which consists of two parts: Host part and Kernel Part.
|
||||
In the next step, CuPBoP-compilation parses and transforms these NVVM IRs
|
||||
to make them suitable for execution on specific architectures.
|
||||
The CuPBoP-runtime compiles the transformed Host IR and executes the generated programs,
|
||||
which will compile the transformed Kernel IR and
|
||||
upload the compiled kernel programs to specific architectures.
|
|
@ -0,0 +1,454 @@
|
|||
#include "backprop.h"
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
//#define OPEN
|
||||
|
||||
#define ABS(x) (((x) > 0.0) ? (x) : (-(x)))
|
||||
|
||||
#define fastcopy(to, from, len) \
|
||||
{ \
|
||||
register char *_to, *_from; \
|
||||
register int _i, _l; \
|
||||
_to = (char *)(to); \
|
||||
_from = (char *)(from); \
|
||||
_l = (len); \
|
||||
for (_i = 0; _i < _l; _i++) \
|
||||
*_to++ = *_from++; \
|
||||
}
|
||||
|
||||
/*** Return random number between 0.0 and 1.0 ***/
|
||||
float drnd() { return ((float)rand() / (float)BIGRND); }
|
||||
|
||||
/*** Pseudo-random float: drnd() stretched by 2 and shifted down by 1 ***/
float dpn1() {
  double stretched = drnd() * 2.0;
  return (stretched - 1.0);
}
|
||||
|
||||
/*** The squashing function. Currently, it's a sigmoid. ***/
|
||||
|
||||
float squash(x)
|
||||
float x;
|
||||
{
|
||||
float m;
|
||||
// x = -x;
|
||||
// m = 1 + x + x*x/2 + x*x*x/6 + x*x*x*x/24 + x*x*x*x*x/120;
|
||||
// return(1.0 / (1.0 + m));
|
||||
return (1.0 / (1.0 + exp(-x)));
|
||||
}
|
||||
|
||||
/*** Allocate 1d array of n floats (contents uninitialized).
     Returns NULL, after printing a message, when n is negative or the
     allocation fails. ***/

float *alloc_1d_dbl(n)
int n;
{
  float *arr;

  /* A negative count would wrap around to a huge unsigned size. */
  if (n < 0) {
    printf("ALLOC_1D_DBL: Couldn't allocate array of floats\n");
    return (NULL);
  }

  /* The byte count is computed in size_t; the former (unsigned) cast could
     truncate it on platforms where unsigned is narrower than size_t. */
  arr = (float *)malloc((size_t)n * sizeof(float));
  if (arr == NULL) {
    printf("ALLOC_1D_DBL: Couldn't allocate array of floats\n");
    return (NULL);
  }
  return (arr);
}
|
||||
|
||||
/*** Allocate 2d array of floats: a table of m row pointers, each row coming
     from alloc_1d_dbl(n). Returns NULL when m is negative, when the table
     cannot be allocated, or when a non-empty row cannot be allocated (rows
     obtained so far are released first). ***/

float **alloc_2d_dbl(m, n)
int m, n;
{
  int i;
  float **rows;

  /* A negative row count would wrap around to a huge unsigned size. */
  if (m < 0) {
    printf("ALLOC_2D_DBL: Couldn't allocate array of dbl ptrs\n");
    return (NULL);
  }

  /* size_t arithmetic: the former (unsigned) cast could truncate the size
     on platforms where unsigned is narrower than size_t. */
  rows = (float **)malloc((size_t)m * sizeof(float *));
  if (rows == NULL) {
    printf("ALLOC_2D_DBL: Couldn't allocate array of dbl ptrs\n");
    return (NULL);
  }

  for (i = 0; i < m; i++) {
    rows[i] = alloc_1d_dbl(n);
    /* Only n > 0 is treated as a failure: a zero-length request may
       legitimately come back as NULL. */
    if (rows[i] == NULL && n > 0) {
      while (i-- > 0) {
        free(rows[i]);
      }
      free(rows);
      return (NULL);
    }
  }

  return (rows);
}
|
||||
|
||||
/*** Fill w[i][j] with rand() / RAND_MAX for every i in 0..m and j in 0..n
     (both bounds inclusive, so w needs m + 1 rows of n + 1 floats).
     Declared void: the function never returned a value, and the former
     implicit-int return type is rejected by C99 and later. ***/
void bpnn_randomize_weights(w, m, n) float **w;
int m, n;
{
  int i, j;

  for (i = 0; i <= m; i++) {
    for (j = 0; j <= n; j++) {
      w[i][j] = (float)rand() / RAND_MAX;
      // w[i][j] = dpn1();
    }
  }
}
|
||||
|
||||
/*** Set w[0..m] (inclusive) to the constant 0.1; despite the name, the
     random initialisation is commented out.
     Declared void: the function never returned a value, and the former
     implicit-int return type is rejected by C99 and later. ***/
void bpnn_randomize_row(w, m) float *w;
int m;
{
  int i;
  for (i = 0; i <= m; i++) {
    // w[i] = (float) rand()/RAND_MAX;
    w[i] = 0.1;
  }
}
|
||||
|
||||
/*** Set w[i][j] to 0.0 for every i in 0..m and j in 0..n (both bounds
     inclusive, so w needs m + 1 rows of n + 1 floats).
     Declared void: the function never returned a value, and the former
     implicit-int return type is rejected by C99 and later. ***/
void bpnn_zero_weights(w, m, n) float **w;
int m, n;
{
  int i, j;

  for (i = 0; i <= m; i++) {
    for (j = 0; j <= n; j++) {
      w[i][j] = 0.0;
    }
  }
}
|
||||
|
||||
/*** Print the seed and use it to seed rand() via srand().
     The parameter is now declared explicitly; it used to rely on the
     implicit-int rule, which C99 and later reject. ***/
void bpnn_initialize(seed)
int seed;
{
  printf("Random number generator seed: %d\n", seed);
  srand(seed);
}
|
||||
|
||||
BPNN *bpnn_internal_create(n_in, n_hidden, n_out)
|
||||
int n_in, n_hidden, n_out;
|
||||
{
|
||||
BPNN *newnet;
|
||||
|
||||
newnet = (BPNN *)malloc(sizeof(BPNN));
|
||||
if (newnet == NULL) {
|
||||
printf("BPNN_CREATE: Couldn't allocate neural network\n");
|
||||
return (NULL);
|
||||
}
|
||||
|
||||
newnet->input_n = n_in;
|
||||
newnet->hidden_n = n_hidden;
|
||||
newnet->output_n = n_out;
|
||||
newnet->input_units = alloc_1d_dbl(n_in + 1);
|
||||
newnet->hidden_units = alloc_1d_dbl(n_hidden + 1);
|
||||
newnet->output_units = alloc_1d_dbl(n_out + 1);
|
||||
|
||||
newnet->hidden_delta = alloc_1d_dbl(n_hidden + 1);
|
||||
newnet->output_delta = alloc_1d_dbl(n_out + 1);
|
||||
newnet->target = alloc_1d_dbl(n_out + 1);
|
||||
|
||||
newnet->input_weights = alloc_2d_dbl(n_in + 1, n_hidden + 1);
|
||||
newnet->hidden_weights = alloc_2d_dbl(n_hidden + 1, n_out + 1);
|
||||
|
||||
newnet->input_prev_weights = alloc_2d_dbl(n_in + 1, n_hidden + 1);
|
||||
newnet->hidden_prev_weights = alloc_2d_dbl(n_hidden + 1, n_out + 1);
|
||||
|
||||
return (newnet);
|
||||
}
|
||||
|
||||
void bpnn_free(net) BPNN *net;
|
||||
{
|
||||
int n1, n2, i;
|
||||
|
||||
n1 = net->input_n;
|
||||
n2 = net->hidden_n;
|
||||
|
||||
free((char *)net->input_units);
|
||||
free((char *)net->hidden_units);
|
||||
free((char *)net->output_units);
|
||||
|
||||
free((char *)net->hidden_delta);
|
||||
free((char *)net->output_delta);
|
||||
free((char *)net->target);
|
||||
|
||||
for (i = 0; i <= n1; i++) {
|
||||
free((char *)net->input_weights[i]);
|
||||
free((char *)net->input_prev_weights[i]);
|
||||
}
|
||||
free((char *)net->input_weights);
|
||||
free((char *)net->input_prev_weights);
|
||||
|
||||
for (i = 0; i <= n2; i++) {
|
||||
free((char *)net->hidden_weights[i]);
|
||||
free((char *)net->hidden_prev_weights[i]);
|
||||
}
|
||||
free((char *)net->hidden_weights);
|
||||
free((char *)net->hidden_prev_weights);
|
||||
|
||||
free((char *)net);
|
||||
}
|
||||
|
||||
/*** Creates a new fully-connected network from scratch,
|
||||
with the given numbers of input, hidden, and output units.
|
||||
Threshold units are automatically included. All weights are
|
||||
randomly initialized.
|
||||
Space is also allocated for temporary storage (momentum weights,
|
||||
error computations, etc).
|
||||
***/
|
||||
|
||||
BPNN *bpnn_create(n_in, n_hidden, n_out)
|
||||
int n_in, n_hidden, n_out;
|
||||
{
|
||||
|
||||
BPNN *newnet;
|
||||
|
||||
newnet = bpnn_internal_create(n_in, n_hidden, n_out);
|
||||
|
||||
#ifdef INITZERO
|
||||
bpnn_zero_weights(newnet->input_weights, n_in, n_hidden);
|
||||
#else
|
||||
bpnn_randomize_weights(newnet->input_weights, n_in, n_hidden);
|
||||
#endif
|
||||
bpnn_randomize_weights(newnet->hidden_weights, n_hidden, n_out);
|
||||
bpnn_zero_weights(newnet->input_prev_weights, n_in, n_hidden);
|
||||
bpnn_zero_weights(newnet->hidden_prev_weights, n_hidden, n_out);
|
||||
bpnn_randomize_row(newnet->target, n_out);
|
||||
return (newnet);
|
||||
}
|
||||
|
||||
/*
 * Propagate activations from one layer to the next.
 *
 * For every unit j in 1..n2 of the second layer, computes
 *   l2[j] = squash( sum over k in 0..n1 of conn[k][j] * l1[k] ).
 * l1[0] is overwritten with 1.0 first so that row 0 of conn acts as the
 * threshold (bias) weight.
 *
 * l1:   source activations, indices 0..n1 (index 0 is written here)
 * l2:   destination activations, indices 1..n2 are written
 * conn: weights, indexed conn[source][destination]
 * n1:   highest source index
 * n2:   highest destination index
 */
void bpnn_layerforward(l1, l2, conn, n1, n2)
float *l1, *l2, **conn;
int n1, n2;
{
  float sum;
  int j, k;

  /*** Set up thresholding unit ***/
  l1[0] = 1.0;
#ifdef OPEN
  omp_set_num_threads(NUM_THREAD);
  /* `sum` is a per-iteration scratch accumulator (it is reset at the top of
     every j iteration), so each thread needs its own copy: private, not a
     reduction.  A reduction would also fold the uninitialised outer `sum`
     into the result at the end of the loop. */
#pragma omp parallel for shared(conn, n1, n2, l1, l2) private(j, k, sum) \
    schedule(static)
#endif
  /*** For each unit in second layer ***/
  for (j = 1; j <= n2; j++) {

    /*** Compute weighted sum of its inputs ***/
    sum = 0.0;
    for (k = 0; k <= n1; k++) {
      sum += conn[k][j] * l1[k];
    }
    l2[j] = squash(sum);
  }
}
|
||||
|
||||
// extern "C"
|
||||
/*
 * Compute the error term of every output unit.
 *
 * For j in 1..nj:  delta[j] = o * (1 - o) * (t - o), with o = output[j] and
 * t = target[j].  The sum of ABS(delta[j]) is stored in *err.
 *
 * delta:  receives the per-unit error terms (indices 1..nj)
 * target: desired outputs
 * output: actual outputs
 * nj:     highest output index
 * err:    receives the accumulated absolute error
 */
void bpnn_output_error(delta, target, output, nj, err)
float *delta, *target, *output, *err;
int nj;
{
  float total = 0.0;
  int unit = 1;

  while (unit <= nj) {
    const float actual = output[unit];
    const float wanted = target[unit];

    delta[unit] = actual * (1.0 - actual) * (wanted - actual);
    total += ABS(delta[unit]);
    unit++;
  }

  *err = total;
}
|
||||
|
||||
/*
 * Back-propagate the output error terms to the hidden layer.
 *
 * For j in 1..nh:
 *   delta_h[j] = h * (1 - h) * sum over k in 1..no of delta_o[k] * who[j][k]
 * with h = hidden[j].  The sum of ABS(delta_h[j]) is stored in *err.
 *
 * delta_h: receives the hidden error terms (indices 1..nh)
 * nh:      highest hidden index
 * delta_o: output error terms
 * no:      highest output index
 * who:     hidden->output weights, indexed who[hidden][output]
 * hidden:  hidden activations
 * err:     receives the accumulated absolute error
 */
void bpnn_hidden_error(delta_h, nh, delta_o, no, who, hidden, err)
float *delta_h, *delta_o, *hidden, **who, *err;
int nh, no;
{
  float total = 0.0;
  int unit;

  for (unit = 1; unit <= nh; ++unit) {
    const float activation = hidden[unit];
    float back = 0.0;
    int out;

    /* Error flowing back into this hidden unit from every output unit. */
    for (out = 1; out <= no; ++out) {
      back += delta_o[out] * who[unit][out];
    }

    delta_h[unit] = activation * (1.0 - activation) * back;
    total += ABS(delta_h[unit]);
  }

  *err = total;
}
|
||||
|
||||
void bpnn_adjust_weights(delta, ndelta, ly, nly, w, oldw) float *delta, *ly,
|
||||
**w, **oldw;
|
||||
{
|
||||
float new_dw;
|
||||
int k, j;
|
||||
ly[0] = 1.0;
|
||||
// eta = 0.3;
|
||||
// momentum = 0.3;
|
||||
|
||||
#ifdef OPEN
|
||||
omp_set_num_threads(NUM_THREAD);
|
||||
#pragma omp parallel for shared(oldw, w, delta) private(j, k, new_dw) \
|
||||
firstprivate(ndelta, nly, momentum)
|
||||
#endif
|
||||
for (j = 1; j <= ndelta; j++) {
|
||||
for (k = 0; k <= nly; k++) {
|
||||
new_dw = ((ETA * delta[j] * ly[k]) + (MOMENTUM * oldw[k][j]));
|
||||
w[k][j] += new_dw;
|
||||
oldw[k][j] = new_dw;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void bpnn_feedforward(net) BPNN *net;
|
||||
{
|
||||
int in, hid, out;
|
||||
|
||||
in = net->input_n;
|
||||
hid = net->hidden_n;
|
||||
out = net->output_n;
|
||||
|
||||
/*** Feed forward input activations. ***/
|
||||
bpnn_layerforward(net->input_units, net->hidden_units, net->input_weights, in,
|
||||
hid);
|
||||
bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights,
|
||||
hid, out);
|
||||
}
|
||||
|
||||
void bpnn_train(net, eo, eh) BPNN *net;
|
||||
float *eo, *eh;
|
||||
{
|
||||
int in, hid, out;
|
||||
float out_err, hid_err;
|
||||
|
||||
in = net->input_n;
|
||||
hid = net->hidden_n;
|
||||
out = net->output_n;
|
||||
|
||||
/*** Feed forward input activations. ***/
|
||||
bpnn_layerforward(net->input_units, net->hidden_units, net->input_weights, in,
|
||||
hid);
|
||||
bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights,
|
||||
hid, out);
|
||||
|
||||
/*** Compute error on output and hidden units. ***/
|
||||
bpnn_output_error(net->output_delta, net->target, net->output_units, out,
|
||||
&out_err);
|
||||
bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out,
|
||||
net->hidden_weights, net->hidden_units, &hid_err);
|
||||
*eo = out_err;
|
||||
*eh = hid_err;
|
||||
|
||||
/*** Adjust input and hidden weights. ***/
|
||||
bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid,
|
||||
net->hidden_weights, net->hidden_prev_weights);
|
||||
bpnn_adjust_weights(net->hidden_delta, hid, net->input_units, in,
|
||||
net->input_weights, net->input_prev_weights);
|
||||
}
|
||||
|
||||
void bpnn_save(net, filename) BPNN *net;
|
||||
char *filename;
|
||||
{
|
||||
int n1, n2, n3, i, j, memcnt;
|
||||
float dvalue, **w;
|
||||
char *mem;
|
||||
/// add//
|
||||
FILE *pFile;
|
||||
pFile = fopen(filename, "w+");
|
||||
///////
|
||||
/*
|
||||
if ((fd = creat(filename, 0644)) == -1) {
|
||||
printf("BPNN_SAVE: Cannot create '%s'\n", filename);
|
||||
return;
|
||||
}
|
||||
*/
|
||||
|
||||
n1 = net->input_n;
|
||||
n2 = net->hidden_n;
|
||||
n3 = net->output_n;
|
||||
printf("Saving %dx%dx%d network to '%s'\n", n1, n2, n3, filename);
|
||||
// fflush(stdout);
|
||||
|
||||
// write(fd, (char *) &n1, sizeof(int));
|
||||
// write(fd, (char *) &n2, sizeof(int));
|
||||
// write(fd, (char *) &n3, sizeof(int));
|
||||
|
||||
fwrite((char *)&n1, sizeof(char), sizeof(char), pFile);
|
||||
fwrite((char *)&n2, sizeof(char), sizeof(char), pFile);
|
||||
fwrite((char *)&n3, sizeof(char), sizeof(char), pFile);
|
||||
|
||||
memcnt = 0;
|
||||
w = net->input_weights;
|
||||
mem = (char *)malloc((unsigned)((n1 + 1) * (n2 + 1) * sizeof(float)));
|
||||
for (i = 0; i <= n1; i++) {
|
||||
for (j = 0; j <= n2; j++) {
|
||||
dvalue = w[i][j];
|
||||
fastcopy(&mem[memcnt], &dvalue, sizeof(float));
|
||||
memcnt += sizeof(float);
|
||||
}
|
||||
}
|
||||
// write(fd, mem, (n1+1) * (n2+1) * sizeof(float));
|
||||
fwrite(mem, (unsigned)(sizeof(float)),
|
||||
(unsigned)((n1 + 1) * (n2 + 1) * sizeof(float)), pFile);
|
||||
free(mem);
|
||||
|
||||
memcnt = 0;
|
||||
w = net->hidden_weights;
|
||||
mem = (char *)malloc((unsigned)((n2 + 1) * (n3 + 1) * sizeof(float)));
|
||||
for (i = 0; i <= n2; i++) {
|
||||
for (j = 0; j <= n3; j++) {
|
||||
dvalue = w[i][j];
|
||||
fastcopy(&mem[memcnt], &dvalue, sizeof(float));
|
||||
memcnt += sizeof(float);
|
||||
}
|
||||
}
|
||||
// write(fd, mem, (n2+1) * (n3+1) * sizeof(float));
|
||||
fwrite(mem, sizeof(float), (unsigned)((n2 + 1) * (n3 + 1) * sizeof(float)),
|
||||
pFile);
|
||||
free(mem);
|
||||
|
||||
fclose(pFile);
|
||||
return;
|
||||
}
|
||||
|
||||
BPNN *bpnn_read(filename)
|
||||
char *filename;
|
||||
{
|
||||
char *mem;
|
||||
BPNN *new;
|
||||
int fd, n1, n2, n3, i, j, memcnt;
|
||||
|
||||
if ((fd = open(filename, 0, 0644)) == -1) {
|
||||
return (NULL);
|
||||
}
|
||||
|
||||
printf("Reading '%s'\n", filename); // fflush(stdout);
|
||||
|
||||
read(fd, (char *)&n1, sizeof(int));
|
||||
read(fd, (char *)&n2, sizeof(int));
|
||||
read(fd, (char *)&n3, sizeof(int));
|
||||
new = bpnn_internal_create(n1, n2, n3);
|
||||
|
||||
printf("'%s' contains a %dx%dx%d network\n", filename, n1, n2, n3);
|
||||
printf("Reading input weights..."); // fflush(stdout);
|
||||
|
||||
memcnt = 0;
|
||||
mem = (char *)malloc((unsigned)((n1 + 1) * (n2 + 1) * sizeof(float)));
|
||||
read(fd, mem, (n1 + 1) * (n2 + 1) * sizeof(float));
|
||||
for (i = 0; i <= n1; i++) {
|
||||
for (j = 0; j <= n2; j++) {
|
||||
fastcopy(&(new->input_weights[i][j]), &mem[memcnt], sizeof(float));
|
||||
memcnt += sizeof(float);
|
||||
}
|
||||
}
|
||||
free(mem);
|
||||
|
||||
printf("Done\nReading hidden weights..."); // fflush(stdout);
|
||||
|
||||
memcnt = 0;
|
||||
mem = (char *)malloc((unsigned)((n2 + 1) * (n3 + 1) * sizeof(float)));
|
||||
read(fd, mem, (n2 + 1) * (n3 + 1) * sizeof(float));
|
||||
for (i = 0; i <= n2; i++) {
|
||||
for (j = 0; j <= n3; j++) {
|
||||
fastcopy(&(new->hidden_weights[i][j]), &mem[memcnt], sizeof(float));
|
||||
memcnt += sizeof(float);
|
||||
}
|
||||
}
|
||||
free(mem);
|
||||
close(fd);
|
||||
|
||||
printf("Done\n"); // fflush(stdout);
|
||||
|
||||
bpnn_zero_weights(new->input_prev_weights, n1, n2);
|
||||
bpnn_zero_weights(new->hidden_prev_weights, n2, n3);
|
||||
|
||||
return (new);
|
||||
}
|
|
@ -0,0 +1,50 @@
|
|||
#ifndef _BACKPROP_H_
#define _BACKPROP_H_

/* 2^31 - 1.  Presumably the scale of the random-number helpers -- confirm at
   the use sites, which are not part of this header. */
#define BIGRND 0x7fffffff

/* Build-configuration switch; defined without a value. */
#define GPU
/* NOTE(review): looks like a threads-per-block count -- confirm. */
#define THREADS 256
#define WIDTH 16  /* shared memory width */
#define HEIGHT 16 /* shared memory height */

#define ETA 0.3      /* eta value */
#define MOMENTUM 0.3 /* momentum value */
#define NUM_THREAD 4 /* OpenMP threads */

/*
 * A three-layer back-propagation network.  Member order is unchanged from
 * the one-declaration-per-line layout; only the grouping differs.
 */
typedef struct {
  /* Layer sizes. */
  int input_n,  /* number of input units */
      hidden_n, /* number of hidden units */
      output_n; /* number of output units */

  /* Activations. */
  float *input_units, /* the input units */
      *hidden_units,  /* the hidden units */
      *output_units;  /* the output units */

  /* Error terms. */
  float *hidden_delta, /* storage for hidden unit error */
      *output_delta;   /* storage for output unit error */

  float *target; /* storage for target vector */

  /* Weights. */
  float **input_weights, /* weights from input to hidden layer */
      **hidden_weights;  /* weights from hidden to output layer */

  /* Momentum terms. */
  float **input_prev_weights, /* previous change on input to hidden wgt */
      **hidden_prev_weights;  /* previous change on hidden to output wgt */
} BPNN;

/*
 * User-level entry points.  All are declared without parameter lists, as
 * before.
 */

/* Library set-up; see the definition for its argument. */
void bpnn_initialize();

/* Construction and destruction. */
BPNN *bpnn_create();
void bpnn_free();

/* Training and inference. */
void bpnn_train();
void bpnn_feedforward();

/* Persistence. */
void bpnn_save();
BPNN *bpnn_read();

#endif
|
|
@ -0,0 +1,615 @@
|
|||
; ModuleID = 'backprop_cuda-cuda-nvptx64-nvidia-cuda-sm_61.bc'
|
||||
source_filename = "backprop_cuda.cu"
|
||||
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
|
||||
target triple = "nvptx64-nvidia-cuda"
|
||||
|
||||
%struct.__cuda_builtin_blockIdx_t = type { i8 }
|
||||
%struct.__cuda_builtin_threadIdx_t = type { i8 }
|
||||
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
|
||||
|
||||
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv = comdat any
|
||||
|
||||
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
|
||||
|
||||
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv = comdat any
|
||||
|
||||
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
|
||||
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
|
||||
@_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE10input_node = internal addrspace(3) global [16 x float] undef, align 4
|
||||
@_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix = internal addrspace(3) global [16 x [16 x float]] undef, align 4
|
||||
@"$str" = private addrspace(1) constant [11 x i8] c"__CUDA_FTZ\00"
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Weak device-side stub for cudaMalloc(p, s).
; Both arguments are only spilled to stack slots; nothing is allocated or
; called, and the function always returns the constant 999
; (presumably cudaErrorUnknown -- confirm against the CUDA headers).
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
|
||||
entry:
|
||||
%p.addr = alloca i8**, align 8
|
||||
%s.addr = alloca i64, align 8
|
||||
store i8** %p, i8*** %p.addr, align 8
|
||||
store i64 %s, i64* %s.addr, align 8
|
||||
; Unconditional result: the stub never succeeds.
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Weak device-side stub for cudaFuncGetAttributes(p, c).
; The attribute structure behind %p is never written; the arguments are
; spilled to the stack and the constant 999 is returned
; (presumably cudaErrorUnknown -- confirm against the CUDA headers).
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
|
||||
entry:
|
||||
%p.addr = alloca %struct.cudaFuncAttributes*, align 8
|
||||
%c.addr = alloca i8*, align 8
|
||||
store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
|
||||
store i8* %c, i8** %c.addr, align 8
|
||||
; Unconditional result: the stub never succeeds.
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Weak device-side stub for cudaDeviceGetAttribute(value, attr, device).
; *value is never written; the arguments are spilled to the stack and the
; constant 999 is returned
; (presumably cudaErrorUnknown -- confirm against the CUDA headers).
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
|
||||
entry:
|
||||
%value.addr = alloca i32*, align 8
|
||||
%attr.addr = alloca i32, align 4
|
||||
%device.addr = alloca i32, align 4
|
||||
store i32* %value, i32** %value.addr, align 8
|
||||
store i32 %attr, i32* %attr.addr, align 4
|
||||
store i32 %device, i32* %device.addr, align 4
|
||||
; Unconditional result: the stub never succeeds.
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Weak device-side stub for cudaGetDevice(device).
; *device is never written; the pointer is spilled to the stack and the
; constant 999 is returned
; (presumably cudaErrorUnknown -- confirm against the CUDA headers).
define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
|
||||
entry:
|
||||
%device.addr = alloca i32*, align 8
|
||||
store i32* %device, i32** %device.addr, align 8
|
||||
; Unconditional result: the stub never succeeds.
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Weak device-side stub for
; cudaOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, func, blockSize,
; dynamicSmemSize).
; *numBlocks is never written; the arguments are spilled to the stack and the
; constant 999 is returned
; (presumably cudaErrorUnknown -- confirm against the CUDA headers).
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
|
||||
entry:
|
||||
%numBlocks.addr = alloca i32*, align 8
|
||||
%func.addr = alloca i8*, align 8
|
||||
%blockSize.addr = alloca i32, align 4
|
||||
%dynamicSmemSize.addr = alloca i64, align 8
|
||||
store i32* %numBlocks, i32** %numBlocks.addr, align 8
|
||||
store i8* %func, i8** %func.addr, align 8
|
||||
store i32 %blockSize, i32* %blockSize.addr, align 4
|
||||
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
|
||||
; Unconditional result: the stub never succeeds.
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Weak device-side stub for
; cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, func,
; blockSize, dynamicSmemSize, flags).
; *numBlocks is never written; the arguments are spilled to the stack and the
; constant 999 is returned
; (presumably cudaErrorUnknown -- confirm against the CUDA headers).
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
|
||||
entry:
|
||||
%numBlocks.addr = alloca i32*, align 8
|
||||
%func.addr = alloca i8*, align 8
|
||||
%blockSize.addr = alloca i32, align 4
|
||||
%dynamicSmemSize.addr = alloca i64, align 8
|
||||
%flags.addr = alloca i32, align 4
|
||||
store i32* %numBlocks, i32** %numBlocks.addr, align 8
|
||||
store i8* %func, i8** %func.addr, align 8
|
||||
store i32 %blockSize, i32* %blockSize.addr, align 4
|
||||
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
|
||||
store i32 %flags, i32* %flags.addr, align 4
|
||||
; Unconditional result: the stub never succeeds.
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Kernel bpnn_layerforward_CUDA(input_cuda, output_hidden_cuda,
;                               input_hidden_cuda, hidden_partial_sum, in, hid)
; (unoptimised code: every local lives in an alloca slot).
;
; Per thread, with by = ctaid.y, tx = tid.x, ty = tid.y:
;   1. threads with tx == 0 copy input_cuda[16*by + ty + 1] into input_node[ty]
;   2. weight_matrix[ty][tx] = input_hidden_cuda[index]
;   3. weight_matrix[ty][tx] *= input_node[ty]
;   4. for i = 1 .. log2(16): rows with ty % 2^i == 0 add the row 2^i / 2 below
;   5. input_hidden_cuda[index] = weight_matrix[ty][tx]
;   6. threads with tx == 0 store weight_matrix[tx][ty] into
;      hidden_partial_sum[by*hid + ty]
; input_node and weight_matrix are the two addrspace(3) globals of this module
; (addrspace(3) is shared memory on NVPTX).  %output_hidden_cuda and %in are
; stored to their slots but never read.
define dso_local void @_Z22bpnn_layerforward_CUDAPfS_S_S_ii(float* %input_cuda, float* %output_hidden_cuda, float* %input_hidden_cuda, float* %hidden_partial_sum, i32 %in, i32 %hid) #0 {
|
||||
entry:
|
||||
%input_cuda.addr = alloca float*, align 8
|
||||
%output_hidden_cuda.addr = alloca float*, align 8
|
||||
%input_hidden_cuda.addr = alloca float*, align 8
|
||||
%hidden_partial_sum.addr = alloca float*, align 8
|
||||
%in.addr = alloca i32, align 4
|
||||
%hid.addr = alloca i32, align 4
|
||||
%by = alloca i32, align 4
|
||||
%tx = alloca i32, align 4
|
||||
%ty = alloca i32, align 4
|
||||
%index = alloca i32, align 4
|
||||
%index_in = alloca i32, align 4
|
||||
%i = alloca i32, align 4
|
||||
%power_two = alloca i32, align 4
|
||||
store float* %input_cuda, float** %input_cuda.addr, align 8
|
||||
store float* %output_hidden_cuda, float** %output_hidden_cuda.addr, align 8
|
||||
store float* %input_hidden_cuda, float** %input_hidden_cuda.addr, align 8
|
||||
store float* %hidden_partial_sum, float** %hidden_partial_sum.addr, align 8
|
||||
store i32 %in, i32* %in.addr, align 4
|
||||
store i32 %hid, i32* %hid.addr, align 4
|
||||
; by = blockIdx.y, tx = threadIdx.x, ty = threadIdx.y
%call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #2
|
||||
store i32 %call, i32* %by, align 4
|
||||
%call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
|
||||
store i32 %call1, i32* %tx, align 4
|
||||
%call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #2
|
||||
store i32 %call2, i32* %ty, align 4
|
||||
; index = (hid + 1) * 16 * by + (hid + 1) * ty + tx + 1 + (hid + 1)
%0 = load i32, i32* %hid.addr, align 4
|
||||
%add = add nsw i32 %0, 1
|
||||
%mul = mul nsw i32 %add, 16
|
||||
%1 = load i32, i32* %by, align 4
|
||||
%mul3 = mul nsw i32 %mul, %1
|
||||
%2 = load i32, i32* %hid.addr, align 4
|
||||
%add4 = add nsw i32 %2, 1
|
||||
%3 = load i32, i32* %ty, align 4
|
||||
%mul5 = mul nsw i32 %add4, %3
|
||||
%add6 = add nsw i32 %mul3, %mul5
|
||||
%4 = load i32, i32* %tx, align 4
|
||||
%add7 = add nsw i32 %add6, %4
|
||||
%add8 = add nsw i32 %add7, 1
|
||||
%5 = load i32, i32* %hid.addr, align 4
|
||||
%add9 = add nsw i32 %5, 1
|
||||
%add10 = add nsw i32 %add8, %add9
|
||||
store i32 %add10, i32* %index, align 4
|
||||
; index_in = 16 * by + ty + 1
%6 = load i32, i32* %by, align 4
|
||||
%mul11 = mul nsw i32 16, %6
|
||||
%7 = load i32, i32* %ty, align 4
|
||||
%add12 = add nsw i32 %mul11, %7
|
||||
%add13 = add nsw i32 %add12, 1
|
||||
store i32 %add13, i32* %index_in, align 4
|
||||
; Step 1 guard: only threads with tx == 0 take the if.then branch.
%8 = load i32, i32* %tx, align 4
|
||||
%cmp = icmp eq i32 %8, 0
|
||||
br i1 %cmp, label %if.then, label %if.end
|
||||
|
||||
; input_node[ty] = input_cuda[index_in]
if.then: ; preds = %entry
|
||||
%9 = load float*, float** %input_cuda.addr, align 8
|
||||
%10 = load i32, i32* %index_in, align 4
|
||||
%idxprom = sext i32 %10 to i64
|
||||
%arrayidx = getelementptr inbounds float, float* %9, i64 %idxprom
|
||||
%11 = load float, float* %arrayidx, align 4
|
||||
%12 = load i32, i32* %ty, align 4
|
||||
%idxprom14 = sext i32 %12 to i64
|
||||
%arrayidx15 = getelementptr inbounds [16 x float], [16 x float]* addrspacecast ([16 x float] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE10input_node to [16 x float]*), i64 0, i64 %idxprom14
|
||||
store float %11, float* %arrayidx15, align 4
|
||||
br label %if.end
|
||||
|
||||
; Barrier, then step 2: weight_matrix[ty][tx] = input_hidden_cuda[index]
if.end: ; preds = %if.then, %entry
|
||||
call void @llvm.nvvm.barrier0()
|
||||
%13 = load float*, float** %input_hidden_cuda.addr, align 8
|
||||
%14 = load i32, i32* %index, align 4
|
||||
%idxprom16 = sext i32 %14 to i64
|
||||
%arrayidx17 = getelementptr inbounds float, float* %13, i64 %idxprom16
|
||||
%15 = load float, float* %arrayidx17, align 4
|
||||
%16 = load i32, i32* %ty, align 4
|
||||
%idxprom18 = sext i32 %16 to i64
|
||||
%arrayidx19 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom18
|
||||
%17 = load i32, i32* %tx, align 4
|
||||
%idxprom20 = sext i32 %17 to i64
|
||||
%arrayidx21 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx19, i64 0, i64 %idxprom20
|
||||
store float %15, float* %arrayidx21, align 4
|
||||
; Barrier, then step 3: weight_matrix[ty][tx] *= input_node[ty]
call void @llvm.nvvm.barrier0()
|
||||
%18 = load i32, i32* %ty, align 4
|
||||
%idxprom22 = sext i32 %18 to i64
|
||||
%arrayidx23 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom22
|
||||
%19 = load i32, i32* %tx, align 4
|
||||
%idxprom24 = sext i32 %19 to i64
|
||||
%arrayidx25 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx23, i64 0, i64 %idxprom24
|
||||
%20 = load float, float* %arrayidx25, align 4
|
||||
%21 = load i32, i32* %ty, align 4
|
||||
%idxprom26 = sext i32 %21 to i64
|
||||
%arrayidx27 = getelementptr inbounds [16 x float], [16 x float]* addrspacecast ([16 x float] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE10input_node to [16 x float]*), i64 0, i64 %idxprom26
|
||||
%22 = load float, float* %arrayidx27, align 4
|
||||
%mul28 = fmul contract float %20, %22
|
||||
%23 = load i32, i32* %ty, align 4
|
||||
%idxprom29 = sext i32 %23 to i64
|
||||
%arrayidx30 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom29
|
||||
%24 = load i32, i32* %tx, align 4
|
||||
%idxprom31 = sext i32 %24 to i64
|
||||
%arrayidx32 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx30, i64 0, i64 %idxprom31
|
||||
store float %mul28, float* %arrayidx32, align 4
|
||||
call void @llvm.nvvm.barrier0()
|
||||
; Step 4 loop counter starts at i = 1.
store i32 1, i32* %i, align 4
|
||||
br label %for.cond
|
||||
|
||||
; Loop test: continue while (float)i <= __log2f(16.0); the log is re-evaluated
; on every pass.
for.cond: ; preds = %for.inc, %if.end
|
||||
%25 = load i32, i32* %i, align 4
|
||||
%conv = sitofp i32 %25 to float
|
||||
%call33 = call float @_ZL7__log2ff(float 1.600000e+01) #2
|
||||
%cmp34 = fcmp ole float %conv, %call33
|
||||
br i1 %cmp34, label %for.body, label %for.end
|
||||
|
||||
; power_two = (int)__powf(2.0, (float)i); rows with ty % power_two == 0 add.
for.body: ; preds = %for.cond
|
||||
%26 = load i32, i32* %i, align 4
|
||||
%conv35 = sitofp i32 %26 to float
|
||||
%call36 = call float @_ZL6__powfff(float 2.000000e+00, float %conv35) #2
|
||||
%conv37 = fptosi float %call36 to i32
|
||||
store i32 %conv37, i32* %power_two, align 4
|
||||
%27 = load i32, i32* %ty, align 4
|
||||
%28 = load i32, i32* %power_two, align 4
|
||||
%rem = srem i32 %27, %28
|
||||
%cmp38 = icmp eq i32 %rem, 0
|
||||
br i1 %cmp38, label %if.then39, label %if.end54
|
||||
|
||||
; weight_matrix[ty][tx] += weight_matrix[ty + power_two / 2][tx]
if.then39: ; preds = %for.body
|
||||
%29 = load i32, i32* %ty, align 4
|
||||
%idxprom40 = sext i32 %29 to i64
|
||||
%arrayidx41 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom40
|
||||
%30 = load i32, i32* %tx, align 4
|
||||
%idxprom42 = sext i32 %30 to i64
|
||||
%arrayidx43 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx41, i64 0, i64 %idxprom42
|
||||
%31 = load float, float* %arrayidx43, align 4
|
||||
%32 = load i32, i32* %ty, align 4
|
||||
%33 = load i32, i32* %power_two, align 4
|
||||
%div = sdiv i32 %33, 2
|
||||
%add44 = add nsw i32 %32, %div
|
||||
%idxprom45 = sext i32 %add44 to i64
|
||||
%arrayidx46 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom45
|
||||
%34 = load i32, i32* %tx, align 4
|
||||
%idxprom47 = sext i32 %34 to i64
|
||||
%arrayidx48 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx46, i64 0, i64 %idxprom47
|
||||
%35 = load float, float* %arrayidx48, align 4
|
||||
%add49 = fadd contract float %31, %35
|
||||
%36 = load i32, i32* %ty, align 4
|
||||
%idxprom50 = sext i32 %36 to i64
|
||||
%arrayidx51 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom50
|
||||
%37 = load i32, i32* %tx, align 4
|
||||
%idxprom52 = sext i32 %37 to i64
|
||||
%arrayidx53 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx51, i64 0, i64 %idxprom52
|
||||
store float %add49, float* %arrayidx53, align 4
|
||||
br label %if.end54
|
||||
|
||||
; Every thread reaches this barrier, whether or not it added in this pass.
if.end54: ; preds = %if.then39, %for.body
|
||||
call void @llvm.nvvm.barrier0()
|
||||
br label %for.inc
|
||||
|
||||
; i++
for.inc: ; preds = %if.end54
|
||||
%38 = load i32, i32* %i, align 4
|
||||
%inc = add nsw i32 %38, 1
|
||||
store i32 %inc, i32* %i, align 4
|
||||
br label %for.cond
|
||||
|
||||
; Step 5: input_hidden_cuda[index] = weight_matrix[ty][tx]
for.end: ; preds = %for.cond
|
||||
%39 = load i32, i32* %ty, align 4
|
||||
%idxprom55 = sext i32 %39 to i64
|
||||
%arrayidx56 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom55
|
||||
%40 = load i32, i32* %tx, align 4
|
||||
%idxprom57 = sext i32 %40 to i64
|
||||
%arrayidx58 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx56, i64 0, i64 %idxprom57
|
||||
%41 = load float, float* %arrayidx58, align 4
|
||||
%42 = load float*, float** %input_hidden_cuda.addr, align 8
|
||||
%43 = load i32, i32* %index, align 4
|
||||
%idxprom59 = sext i32 %43 to i64
|
||||
%arrayidx60 = getelementptr inbounds float, float* %42, i64 %idxprom59
|
||||
store float %41, float* %arrayidx60, align 4
|
||||
call void @llvm.nvvm.barrier0()
|
||||
; Step 6 guard: only threads with tx == 0 publish a partial sum.
%44 = load i32, i32* %tx, align 4
|
||||
%cmp61 = icmp eq i32 %44, 0
|
||||
br i1 %cmp61, label %if.then62, label %if.end71
|
||||
|
||||
; hidden_partial_sum[by * hid + ty] = weight_matrix[tx][ty]
; (the row index is tx and the column index is ty here, unlike the accesses
; above).
if.then62: ; preds = %for.end
|
||||
%45 = load i32, i32* %tx, align 4
|
||||
%idxprom63 = sext i32 %45 to i64
|
||||
%arrayidx64 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom63
|
||||
%46 = load i32, i32* %ty, align 4
|
||||
%idxprom65 = sext i32 %46 to i64
|
||||
%arrayidx66 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx64, i64 0, i64 %idxprom65
|
||||
%47 = load float, float* %arrayidx66, align 4
|
||||
%48 = load float*, float** %hidden_partial_sum.addr, align 8
|
||||
%49 = load i32, i32* %by, align 4
|
||||
%50 = load i32, i32* %hid.addr, align 4
|
||||
%mul67 = mul nsw i32 %49, %50
|
||||
%51 = load i32, i32* %ty, align 4
|
||||
%add68 = add nsw i32 %mul67, %51
|
||||
%idxprom69 = sext i32 %add68 to i64
|
||||
%arrayidx70 = getelementptr inbounds float, float* %48, i64 %idxprom69
|
||||
store float %47, float* %arrayidx70, align 4
|
||||
br label %if.end71
|
||||
|
||||
if.end71: ; preds = %if.then62, %for.end
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: alwaysinline convergent nounwind
|
||||
; Accessor behind blockIdx.y: returns the value of the ctaid.y special
; register.  Takes no arguments and has no other effect.
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
|
||||
entry:
|
||||
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
|
||||
ret i32 %0
|
||||
}
|
||||
|
||||
; Function Attrs: alwaysinline convergent nounwind
|
||||
; Accessor behind threadIdx.x: returns the value of the tid.x special
; register.  Takes no arguments and has no other effect.
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
|
||||
entry:
|
||||
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||||
ret i32 %0
|
||||
}
|
||||
|
||||
; Function Attrs: alwaysinline convergent nounwind
|
||||
; Accessor behind threadIdx.y: returns the value of the tid.y special
; register.  Takes no arguments and has no other effect.
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
|
||||
entry:
|
||||
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
|
||||
ret i32 %0
|
||||
}
|
||||
|
||||
; Function Attrs: convergent nounwind
|
||||
declare void @llvm.nvvm.barrier0() #2
|
||||
|
||||
; Function Attrs: alwaysinline convergent nounwind
|
||||
; Internal wrapper __log2f(a): spills the argument to a stack slot, reloads
; it, and forwards it to @__nv_fast_log2f, whose result is returned unchanged.
define internal float @_ZL7__log2ff(float %__a) #1 {
|
||||
entry:
|
||||
%__a.addr = alloca float, align 4
|
||||
store float %__a, float* %__a.addr, align 4
|
||||
%0 = load float, float* %__a.addr, align 4
|
||||
%call = call float @__nv_fast_log2f(float %0) #2
|
||||
ret float %call
|
||||
}
|
||||
|
||||
; Function Attrs: alwaysinline convergent nounwind
|
||||
; Internal wrapper __powf(a, b): spills both arguments to stack slots, reloads
; them, and forwards them in the same order to @__nv_fast_powf, whose result
; is returned unchanged.
define internal float @_ZL6__powfff(float %__a, float %__b) #1 {
|
||||
entry:
|
||||
%__a.addr = alloca float, align 4
|
||||
%__b.addr = alloca float, align 4
|
||||
store float %__a, float* %__a.addr, align 4
|
||||
store float %__b, float* %__b.addr, align 4
|
||||
%0 = load float, float* %__a.addr, align 4
|
||||
%1 = load float, float* %__b.addr, align 4
|
||||
%call = call float @__nv_fast_powf(float %0, float %1) #2
|
||||
ret float %call
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
define dso_local void @_Z24bpnn_adjust_weights_cudaPfiS_iS_S_(float* %delta, i32 %hid, float* %ly, i32 %in, float* %w, float* %oldw) #0 {
|
||||
entry:
|
||||
%delta.addr = alloca float*, align 8
|
||||
%hid.addr = alloca i32, align 4
|
||||
%ly.addr = alloca float*, align 8
|
||||
%in.addr = alloca i32, align 4
|
||||
%w.addr = alloca float*, align 8
|
||||
%oldw.addr = alloca float*, align 8
|
||||
%by = alloca i32, align 4
|
||||
%tx = alloca i32, align 4
|
||||
%ty = alloca i32, align 4
|
||||
%index = alloca i32, align 4
|
||||
%index_y = alloca i32, align 4
|
||||
%index_x = alloca i32, align 4
|
||||
store float* %delta, float** %delta.addr, align 8
|
||||
store i32 %hid, i32* %hid.addr, align 4
|
||||
store float* %ly, float** %ly.addr, align 8
|
||||
store i32 %in, i32* %in.addr, align 4
|
||||
store float* %w, float** %w.addr, align 8
|
||||
store float* %oldw, float** %oldw.addr, align 8
|
||||
%call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #2
|
||||
store i32 %call, i32* %by, align 4
|
||||
%call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
|
||||
store i32 %call1, i32* %tx, align 4
|
||||
%call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #2
|
||||
store i32 %call2, i32* %ty, align 4
|
||||
%0 = load i32, i32* %hid.addr, align 4
|
||||
%add = add nsw i32 %0, 1
|
||||
%mul = mul nsw i32 %add, 16
|
||||
%1 = load i32, i32* %by, align 4
|
||||
%mul3 = mul nsw i32 %mul, %1
|
||||
%2 = load i32, i32* %hid.addr, align 4
|
||||
%add4 = add nsw i32 %2, 1
|
||||
%3 = load i32, i32* %ty, align 4
|
||||
%mul5 = mul nsw i32 %add4, %3
|
||||
%add6 = add nsw i32 %mul3, %mul5
|
||||
%4 = load i32, i32* %tx, align 4
|
||||
%add7 = add nsw i32 %add6, %4
|
||||
%add8 = add nsw i32 %add7, 1
|
||||
%5 = load i32, i32* %hid.addr, align 4
|
||||
%add9 = add nsw i32 %5, 1
|
||||
%add10 = add nsw i32 %add8, %add9
|
||||
store i32 %add10, i32* %index, align 4
|
||||
%6 = load i32, i32* %by, align 4
|
||||
%mul11 = mul nsw i32 16, %6
|
||||
%7 = load i32, i32* %ty, align 4
|
||||
%add12 = add nsw i32 %mul11, %7
|
||||
%add13 = add nsw i32 %add12, 1
|
||||
store i32 %add13, i32* %index_y, align 4
|
||||
%8 = load i32, i32* %tx, align 4
|
||||
%add14 = add nsw i32 %8, 1
|
||||
store i32 %add14, i32* %index_x, align 4
|
||||
%9 = load float*, float** %delta.addr, align 8
|
||||
%10 = load i32, i32* %index_x, align 4
|
||||
%idxprom = sext i32 %10 to i64
|
||||
%arrayidx = getelementptr inbounds float, float* %9, i64 %idxprom
|
||||
%11 = load float, float* %arrayidx, align 4
|
||||
%conv = fpext float %11 to double
|
||||
%mul15 = fmul contract double 3.000000e-01, %conv
|
||||
%12 = load float*, float** %ly.addr, align 8
|
||||
%13 = load i32, i32* %index_y, align 4
|
||||
%idxprom16 = sext i32 %13 to i64
|
||||
%arrayidx17 = getelementptr inbounds float, float* %12, i64 %idxprom16
|
||||
%14 = load float, float* %arrayidx17, align 4
|
||||
%conv18 = fpext float %14 to double
|
||||
%mul19 = fmul contract double %mul15, %conv18
|
||||
%15 = load float*, float** %oldw.addr, align 8
|
||||
%16 = load i32, i32* %index, align 4
|
||||
%idxprom20 = sext i32 %16 to i64
|
||||
%arrayidx21 = getelementptr inbounds float, float* %15, i64 %idxprom20
|
||||
%17 = load float, float* %arrayidx21, align 4
|
||||
%conv22 = fpext float %17 to double
|
||||
%mul23 = fmul contract double 3.000000e-01, %conv22
|
||||
%add24 = fadd contract double %mul19, %mul23
|
||||
%18 = load float*, float** %w.addr, align 8
|
||||
%19 = load i32, i32* %index, align 4
|
||||
%idxprom25 = sext i32 %19 to i64
|
||||
%arrayidx26 = getelementptr inbounds float, float* %18, i64 %idxprom25
|
||||
%20 = load float, float* %arrayidx26, align 4
|
||||
%conv27 = fpext float %20 to double
|
||||
%add28 = fadd contract double %conv27, %add24
|
||||
%conv29 = fptrunc double %add28 to float
|
||||
store float %conv29, float* %arrayidx26, align 4
|
||||
%21 = load float*, float** %delta.addr, align 8
|
||||
%22 = load i32, i32* %index_x, align 4
|
||||
%idxprom30 = sext i32 %22 to i64
|
||||
%arrayidx31 = getelementptr inbounds float, float* %21, i64 %idxprom30
|
||||
%23 = load float, float* %arrayidx31, align 4
|
||||
%conv32 = fpext float %23 to double
|
||||
%mul33 = fmul contract double 3.000000e-01, %conv32
|
||||
%24 = load float*, float** %ly.addr, align 8
|
||||
%25 = load i32, i32* %index_y, align 4
|
||||
%idxprom34 = sext i32 %25 to i64
|
||||
%arrayidx35 = getelementptr inbounds float, float* %24, i64 %idxprom34
|
||||
%26 = load float, float* %arrayidx35, align 4
|
||||
%conv36 = fpext float %26 to double
|
||||
%mul37 = fmul contract double %mul33, %conv36
|
||||
%27 = load float*, float** %oldw.addr, align 8
|
||||
%28 = load i32, i32* %index, align 4
|
||||
%idxprom38 = sext i32 %28 to i64
|
||||
%arrayidx39 = getelementptr inbounds float, float* %27, i64 %idxprom38
|
||||
%29 = load float, float* %arrayidx39, align 4
|
||||
%conv40 = fpext float %29 to double
|
||||
%mul41 = fmul contract double 3.000000e-01, %conv40
|
||||
%add42 = fadd contract double %mul37, %mul41
|
||||
%conv43 = fptrunc double %add42 to float
|
||||
%30 = load float*, float** %oldw.addr, align 8
|
||||
%31 = load i32, i32* %index, align 4
|
||||
%idxprom44 = sext i32 %31 to i64
|
||||
%arrayidx45 = getelementptr inbounds float, float* %30, i64 %idxprom44
|
||||
store float %conv43, float* %arrayidx45, align 4
|
||||
call void @llvm.nvvm.barrier0()
|
||||
%32 = load i32, i32* %ty, align 4
|
||||
%cmp = icmp eq i32 %32, 0
|
||||
br i1 %cmp, label %land.lhs.true, label %if.end
|
||||
|
||||
land.lhs.true: ; preds = %entry
|
||||
%33 = load i32, i32* %by, align 4
|
||||
%cmp46 = icmp eq i32 %33, 0
|
||||
br i1 %cmp46, label %if.then, label %if.end
|
||||
|
||||
if.then: ; preds = %land.lhs.true
|
||||
%34 = load float*, float** %delta.addr, align 8
|
||||
%35 = load i32, i32* %index_x, align 4
|
||||
%idxprom47 = sext i32 %35 to i64
|
||||
%arrayidx48 = getelementptr inbounds float, float* %34, i64 %idxprom47
|
||||
%36 = load float, float* %arrayidx48, align 4
|
||||
%conv49 = fpext float %36 to double
|
||||
%mul50 = fmul contract double 3.000000e-01, %conv49
|
||||
%37 = load float*, float** %oldw.addr, align 8
|
||||
%38 = load i32, i32* %index_x, align 4
|
||||
%idxprom51 = sext i32 %38 to i64
|
||||
%arrayidx52 = getelementptr inbounds float, float* %37, i64 %idxprom51
|
||||
%39 = load float, float* %arrayidx52, align 4
|
||||
%conv53 = fpext float %39 to double
|
||||
%mul54 = fmul contract double 3.000000e-01, %conv53
|
||||
%add55 = fadd contract double %mul50, %mul54
|
||||
%40 = load float*, float** %w.addr, align 8
|
||||
%41 = load i32, i32* %index_x, align 4
|
||||
%idxprom56 = sext i32 %41 to i64
|
||||
%arrayidx57 = getelementptr inbounds float, float* %40, i64 %idxprom56
|
||||
%42 = load float, float* %arrayidx57, align 4
|
||||
%conv58 = fpext float %42 to double
|
||||
%add59 = fadd contract double %conv58, %add55
|
||||
%conv60 = fptrunc double %add59 to float
|
||||
store float %conv60, float* %arrayidx57, align 4
|
||||
%43 = load float*, float** %delta.addr, align 8
|
||||
%44 = load i32, i32* %index_x, align 4
|
||||
%idxprom61 = sext i32 %44 to i64
|
||||
%arrayidx62 = getelementptr inbounds float, float* %43, i64 %idxprom61
|
||||
%45 = load float, float* %arrayidx62, align 4
|
||||
%conv63 = fpext float %45 to double
|
||||
%mul64 = fmul contract double 3.000000e-01, %conv63
|
||||
%46 = load float*, float** %oldw.addr, align 8
|
||||
%47 = load i32, i32* %index_x, align 4
|
||||
%idxprom65 = sext i32 %47 to i64
|
||||
%arrayidx66 = getelementptr inbounds float, float* %46, i64 %idxprom65
|
||||
%48 = load float, float* %arrayidx66, align 4
|
||||
%conv67 = fpext float %48 to double
|
||||
%mul68 = fmul contract double 3.000000e-01, %conv67
|
||||
%add69 = fadd contract double %mul64, %mul68
|
||||
%conv70 = fptrunc double %add69 to float
|
||||
%49 = load float*, float** %oldw.addr, align 8
|
||||
%50 = load i32, i32* %index_x, align 4
|
||||
%idxprom71 = sext i32 %50 to i64
|
||||
%arrayidx72 = getelementptr inbounds float, float* %49, i64 %idxprom71
|
||||
store float %conv70, float* %arrayidx72, align 4
|
||||
br label %if.end
|
||||
|
||||
if.end: ; preds = %if.then, %land.lhs.true, %entry
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #3
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #3
|
||||
|
||||
; Function Attrs: alwaysinline convergent inlinehint nounwind
|
||||
; Fast base-2 logarithm of %a.
; Queries @__nvvm_reflect with the @"$str" constant and picks the intrinsic
; accordingly: non-zero -> llvm.nvvm.lg2.approx.ftz.f, zero ->
; llvm.nvvm.lg2.approx.f. The two results are merged by the phi in the exit
; block and returned.
; NOTE(review): @"$str" is defined outside this view; presumably it is the
; flush-to-zero reflect key - confirm.
define internal float @__nv_fast_log2f(float %a) #4 {
|
||||
%call.i = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*))
|
||||
%1 = icmp ne i32 %call.i, 0
|
||||
br i1 %1, label %2, label %4
|
||||
|
||||
2: ; preds = %0
|
||||
%3 = call float @llvm.nvvm.lg2.approx.ftz.f(float %a)
|
||||
br label %__nvvm_builtin_log2f.exit
|
||||
|
||||
4: ; preds = %0
|
||||
%5 = call float @llvm.nvvm.lg2.approx.f(float %a)
|
||||
br label %__nvvm_builtin_log2f.exit
|
||||
|
||||
__nvvm_builtin_log2f.exit: ; preds = %4, %2
|
||||
%retval.0.i = phi float [ %3, %2 ], [ %5, %4 ]
|
||||
ret float %retval.0.i
|
||||
}
|
||||
|
||||
; Function Attrs: convergent nounwind
|
||||
declare i32 @__nvvm_reflect(i8*) #5
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare float @llvm.nvvm.lg2.approx.ftz.f(float) #3
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare float @llvm.nvvm.lg2.approx.f(float) #3
|
||||
|
||||
; Function Attrs: alwaysinline convergent inlinehint nounwind
|
||||
; Fast power function: returns ex2.approx(%b * lg2.approx(%a)).
; The body is two inlined helpers back to back (see the *.exit block names):
; first the log2 of %a, then the base-2 exponential of the product %6.
; Each half calls @__nvvm_reflect on @"$str" and selects the ".ftz" variant of
; the intrinsic when the result is non-zero, the plain variant otherwise.
; NOTE(review): @"$str" is defined outside this view; presumably it is the
; flush-to-zero reflect key - confirm.
define internal float @__nv_fast_powf(float %a, float %b) #4 {
|
||||
%call.i.i = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*))
|
||||
%1 = icmp ne i32 %call.i.i, 0
|
||||
br i1 %1, label %2, label %4
|
||||
|
||||
2: ; preds = %0
|
||||
%3 = call float @llvm.nvvm.lg2.approx.ftz.f(float %a)
|
||||
br label %__nv_fast_log2f.exit
|
||||
|
||||
4: ; preds = %0
|
||||
%5 = call float @llvm.nvvm.lg2.approx.f(float %a)
|
||||
br label %__nv_fast_log2f.exit
|
||||
|
||||
; %retval.0.i.i is log2(%a); %6 = %b * log2(%a) feeds the exponential below.
__nv_fast_log2f.exit: ; preds = %4, %2
|
||||
%retval.0.i.i = phi float [ %3, %2 ], [ %5, %4 ]
|
||||
%6 = fmul float %b, %retval.0.i.i
|
||||
%call.i.i1 = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*))
|
||||
%7 = icmp ne i32 %call.i.i1, 0
|
||||
br i1 %7, label %8, label %10
|
||||
|
||||
8: ; preds = %__nv_fast_log2f.exit
|
||||
%9 = call float @llvm.nvvm.ex2.approx.ftz.f(float %6)
|
||||
br label %__nv_exp2f.exit
|
||||
|
||||
10: ; preds = %__nv_fast_log2f.exit
|
||||
%11 = call float @llvm.nvvm.ex2.approx.f(float %6)
|
||||
br label %__nv_exp2f.exit
|
||||
|
||||
__nv_exp2f.exit: ; preds = %10, %8
|
||||
%retval.0.i.i2 = phi float [ %9, %8 ], [ %11, %10 ]
|
||||
ret float %retval.0.i.i2
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare float @llvm.nvvm.ex2.approx.ftz.f(float) #3
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare float @llvm.nvvm.ex2.approx.f(float) #3
|
||||
|
||||
attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #2 = { convergent nounwind }
|
||||
attributes #3 = { nounwind readnone }
|
||||
attributes #4 = { alwaysinline convergent inlinehint nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #5 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
|
||||
!llvm.module.flags = !{!0, !1, !2}
|
||||
!nvvm.annotations = !{!3, !4, !5, !6, !5, !7, !7, !7, !7, !8, !8, !7}
|
||||
!llvm.ident = !{!9}
|
||||
!nvvmir.version = !{!10}
|
||||
|
||||
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
|
||||
!1 = !{i32 1, !"wchar_size", i32 4}
|
||||
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
|
||||
!3 = !{void (float*, float*, float*, float*, i32, i32)* @_Z22bpnn_layerforward_CUDAPfS_S_S_ii, !"kernel", i32 1}
|
||||
!4 = !{void (float*, i32, float*, i32, float*, float*)* @_Z24bpnn_adjust_weights_cudaPfiS_iS_S_, !"kernel", i32 1}
|
||||
!5 = !{null, !"align", i32 8}
|
||||
!6 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
|
||||
!7 = !{null, !"align", i32 16}
|
||||
!8 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
|
||||
!9 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
|
||||
!10 = !{i32 1, i32 4}
|
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,195 @@
|
|||
#include <cuda.h>
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <sys/time.h>
|
||||
|
||||
// includes, kernels
|
||||
#include "backprop.h"
|
||||
#include "backprop_cuda_kernel.cu"
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
extern "C" void bpnn_layerforward(float *l1, float *l2, float **conn, int n1,
|
||||
int n2);
|
||||
|
||||
extern "C" void bpnn_output_error(float *delta, float *target, float *output,
|
||||
int nj, float *err);
|
||||
|
||||
extern "C" void bpnn_hidden_error(float *delta_h, int nh, float *delta_o,
|
||||
int no, float **who, float *hidden,
|
||||
float *err);
|
||||
|
||||
extern "C" void bpnn_adjust_weights(float *delta, int ndelta, float *ly,
|
||||
int nly, float **w, float **oldw);
|
||||
|
||||
extern "C" int setup(int argc, char **argv);
|
||||
|
||||
extern "C" float **alloc_2d_dbl(int m, int n);
|
||||
|
||||
extern "C" float squash(float x);
|
||||
|
||||
// Current time in seconds as a double, built from the seconds and
// microseconds fields reported by gettimeofday().
double gettime() {
  struct timeval now;
  gettimeofday(&now, NULL);

  const double micro_part = now.tv_usec * 1e-6;
  return now.tv_sec + micro_part;
}
|
||||
|
||||
unsigned int num_threads = 0;
|
||||
unsigned int num_blocks = 0;
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Program main
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Entry point: selects CUDA device 0, then hands the untouched command line
// to setup(), which drives the rest of the example.
int main(int argc, char **argv) {
  cudaSetDevice(0);

  setup(argc, argv);

  return 0;
}
|
||||
|
||||
// Runs one training pass over `net`.
//
// With GPU defined, the input->hidden forward pass and the input-weight
// adjustment are launched as CUDA kernels; with CPU defined the same two steps
// go through bpnn_layerforward()/bpnn_adjust_weights() on the host. The
// hidden->output forward pass, both error computations and the hidden-weight
// adjustment always run on the host.
//
// net: network handed to the host and device routines.
// eo:  if non-NULL, receives the value bpnn_output_error() stored in out_err.
// eh:  if non-NULL, receives the value bpnn_hidden_error() stored in hid_err.
//
// In the GPU build the updated input weights are copied back into a flat
// scratch buffer and printed to stdout; they are not written back into
// net->input_weights.
extern "C" void bpnn_train_cuda(BPNN *net, float *eo, float *eh) {
  int in, hid, out;
  float out_err, hid_err;

  in = net->input_n;
  hid = net->hidden_n;
  out = net->output_n;

#ifdef GPU
  float *input_hidden_cuda;
  float *input_cuda;
  float *output_hidden_cuda;
  float *partial_sum;
  float *hidden_partial_sum;
  float *hidden_delta_cuda;
  float *input_prev_weights_cuda;
  float sum;
  float *input_weights_one_dim;
  float *input_weights_prev_one_dim;

  num_blocks = in / 16;
  dim3 grid(1, num_blocks);
  dim3 threads(16, 16);

  // Buffer sizes, computed once in size_t so the element-count product cannot
  // overflow int.
  const size_t weight_count = (size_t)(in + 1) * (size_t)(hid + 1);
  const size_t weight_bytes = weight_count * sizeof(float);
  const size_t input_bytes = (size_t)(in + 1) * sizeof(float);
  const size_t hidden_bytes = (size_t)(hid + 1) * sizeof(float);
  const size_t partial_bytes = (size_t)num_blocks * WIDTH * sizeof(float);

  input_weights_one_dim = (float *)malloc(weight_bytes);
  input_weights_prev_one_dim = (float *)malloc(weight_bytes);
  partial_sum = (float *)malloc(partial_bytes);
  // malloc(0) may legitimately return NULL, so only a failed non-empty request
  // is treated as an error.
  if (input_weights_one_dim == NULL || input_weights_prev_one_dim == NULL ||
      (partial_bytes != 0 && partial_sum == NULL)) {
    fprintf(stderr, "bpnn_train_cuda: host allocation failed\n");
    exit(EXIT_FAILURE);
  }

  // this preprocessing stage is added to correct the bugs of wrong memcopy
  // using two-dimensional net->inputweights
  size_t m = 0;
  for (int k = 0; k <= in; k++) {
    for (int j = 0; j <= hid; j++) {
      input_weights_one_dim[m] = net->input_weights[k][j];
      input_weights_prev_one_dim[m] = net->input_prev_weights[k][j];
      m++;
    }
  }

  cudaMalloc((void **)&input_cuda, input_bytes);
  cudaMalloc((void **)&output_hidden_cuda, hidden_bytes);
  cudaMalloc((void **)&input_hidden_cuda, weight_bytes);
  cudaMalloc((void **)&hidden_partial_sum, partial_bytes);

#endif

#ifdef CPU

  printf("Performing CPU computation\n");
  bpnn_layerforward(net->input_units, net->hidden_units, net->input_weights, in,
                    hid);

#endif

#ifdef GPU

  printf("Performing GPU computation\n");

  cudaMemcpy(input_cuda, net->input_units, input_bytes,
             cudaMemcpyHostToDevice);
  cudaMemcpy(input_hidden_cuda, input_weights_one_dim, weight_bytes,
             cudaMemcpyHostToDevice);

  bpnn_layerforward_CUDA<<<grid, threads>>>(input_cuda, output_hidden_cuda,
                                            input_hidden_cuda,
                                            hidden_partial_sum, in, hid);

  // NOTE(review): cudaThreadSynchronize() is the deprecated spelling of
  // cudaDeviceSynchronize(); kept because the translated runtime may only
  // provide this entry point - confirm before switching.
  cudaThreadSynchronize();

  cudaError_t error = cudaGetLastError();
  if (error != cudaSuccess) {
    printf("bpnn kernel error: %s\n", cudaGetErrorString(error));
    exit(EXIT_FAILURE);
  }

  cudaMemcpy(partial_sum, hidden_partial_sum, partial_bytes,
             cudaMemcpyDeviceToHost);

  // Finish the reduction on the host: add the per-block partial sums for each
  // hidden unit, add the row-0 weight, and apply the logistic function.
  for (int j = 1; j <= hid; j++) {
    sum = 0.0;
    for (unsigned int k = 0; k < num_blocks; k++) {
      sum += partial_sum[k * hid + j - 1];
    }
    sum += net->input_weights[0][j];
    net->hidden_units[j] = float(1.0 / (1.0 + exp(-sum)));
  }
#endif

  bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights,
                    hid, out);
  bpnn_output_error(net->output_delta, net->target, net->output_units, out,
                    &out_err);
  bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out,
                    net->hidden_weights, net->hidden_units, &hid_err);

  // Hand the computed errors back to the caller; previously the two
  // out-parameters were silently left unset.
  if (eo != NULL) {
    *eo = out_err;
  }
  if (eh != NULL) {
    *eh = hid_err;
  }

  bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid,
                      net->hidden_weights, net->hidden_prev_weights);

#ifdef CPU

  bpnn_adjust_weights(net->hidden_delta, hid, net->input_units, in,
                      net->input_weights, net->input_prev_weights);

#endif

#ifdef GPU

  cudaMalloc((void **)&hidden_delta_cuda, hidden_bytes);
  cudaMalloc((void **)&input_prev_weights_cuda, weight_bytes);

  cudaMemcpy(hidden_delta_cuda, net->hidden_delta, hidden_bytes,
             cudaMemcpyHostToDevice);
  cudaMemcpy(input_prev_weights_cuda, input_weights_prev_one_dim, weight_bytes,
             cudaMemcpyHostToDevice);
  // The forward kernel overwrote the device copy of the weights with its
  // intermediate products, so the original weights are uploaded again.
  cudaMemcpy(input_hidden_cuda, input_weights_one_dim, weight_bytes,
             cudaMemcpyHostToDevice);

  bpnn_adjust_weights_cuda<<<grid, threads>>>(hidden_delta_cuda, hid,
                                              input_cuda, in, input_hidden_cuda,
                                              input_prev_weights_cuda);

  cudaMemcpy(net->input_units, input_cuda, input_bytes,
             cudaMemcpyDeviceToHost);
  cudaMemcpy(input_weights_one_dim, input_hidden_cuda, weight_bytes,
             cudaMemcpyDeviceToHost);

  // Dump every updated weight on one line of stdout.
  for (size_t i = 0; i < weight_count; i++) {
    printf("%f ", input_weights_one_dim[i]);
  }
  printf("\n");

  cudaFree(input_cuda);
  cudaFree(output_hidden_cuda);
  cudaFree(input_hidden_cuda);
  cudaFree(hidden_partial_sum);
  cudaFree(input_prev_weights_cuda);
  cudaFree(hidden_delta_cuda);

  free(partial_sum);
  free(input_weights_one_dim);
  free(input_weights_prev_one_dim);

#endif
}
|
|
@ -0,0 +1,96 @@
|
|||
#ifndef _BACKPROP_CUDA_KERNEL_H_
|
||||
#define _BACKPROP_CUDA_KERNEL_H_
|
||||
|
||||
#include "backprop.h"
|
||||
#include "cuda.h"
|
||||
#include "math.h"
|
||||
#include <stdio.h>
|
||||
|
||||
// Input->hidden forward pass for one tile of the weight matrix.
//
// Each thread (tx, ty) of block `by` multiplies one weight by its input unit;
// the products are then summed over ty in shared memory, and the column sums
// are written to hidden_partial_sum[by * hid + column].
//
// input_cuda:         input units, indexed from 1 (element 0 is skipped).
// output_hidden_cuda: unused by this kernel.
// input_hidden_cuda:  flattened weights with row stride (hid + 1); the entry
//                     of each thread is overwritten with its intermediate
//                     value, so callers must re-upload the weights afterwards.
// hidden_partial_sum: receives one partial sum per (block, column).
// in:                 unused by this kernel.
// hid:                number of hidden units.
//
// NOTE(review): HEIGHT and WIDTH come from backprop.h; the final write reads
// weight_matrix[0][ty], which assumes blockDim.y <= WIDTH - confirm.
__global__ void bpnn_layerforward_CUDA(float *input_cuda,
                                       float *output_hidden_cuda,
                                       float *input_hidden_cuda,
                                       float *hidden_partial_sum, int in,
                                       int hid) {
  int by = blockIdx.y;
  int tx = threadIdx.x;
  int ty = threadIdx.y;

  // Offset of this thread's weight; the trailing "+ 1 + (hid + 1)" skips
  // column 0 and row 0 of the flattened matrix.
  int index = (hid + 1) * HEIGHT * by + (hid + 1) * ty + tx + 1 + (hid + 1);

  int index_in = HEIGHT * by + ty + 1;

  __shared__ float input_node[HEIGHT];
  __shared__ float weight_matrix[HEIGHT][WIDTH];

  // One thread per row stages the input unit shared by that row.
  if (tx == 0)
    input_node[ty] = input_cuda[index_in];

  __syncthreads();

  weight_matrix[ty][tx] = input_hidden_cuda[index];

  __syncthreads();

  weight_matrix[ty][tx] = weight_matrix[ty][tx] * input_node[ty];

  __syncthreads();

  // Pairwise tree reduction over rows. The stride is doubled with exact
  // integer arithmetic instead of being derived from the approximate
  // __log2f/__powf intrinsics, whose results were truncated to int. The bound
  // check keeps the read inside the tile when HEIGHT is not a power of two.
  for (int stride = 1; stride < HEIGHT; stride *= 2) {
    if (ty % (2 * stride) == 0 && ty + stride < HEIGHT)
      weight_matrix[ty][tx] =
          weight_matrix[ty][tx] + weight_matrix[ty + stride][tx];

    __syncthreads();
  }

  input_hidden_cuda[index] = weight_matrix[ty][tx];

  __syncthreads();

  // Row 0 now holds the column sums; the tx == 0 threads publish one column
  // each, using ty as the column number.
  if (tx == 0) {
    hidden_partial_sum[by * hid + ty] = weight_matrix[0][ty];
  }
}
|
||||
|
||||
// Weight update for the flattened weight matrix `w` (row stride hid + 1).
//
// Every thread adds ETA * delta * ly + MOMENTUM * oldw to its own weight and
// stores that same increment in oldw. The ty == 0 threads of block 0 then
// update the row-0 entries w[1..blockDim.x] with ETA * delta + MOMENTUM * oldw.
// The parameter `in` is not used.
__global__ void bpnn_adjust_weights_cuda(float *delta, int hid, float *ly,
                                         int in, float *w, float *oldw) {
  const int block_row = blockIdx.y;
  const int col = threadIdx.x;
  const int row = threadIdx.y;

  // Same addressing as the forward kernel: skip column 0 and row 0.
  const int weight_idx =
      (hid + 1) * HEIGHT * block_row + (hid + 1) * row + col + 1 + (hid + 1);
  const int ly_idx = HEIGHT * block_row + row + 1;
  const int delta_idx = col + 1;
  // eta = 0.3;
  // momentum = 0.3;

  w[weight_idx] +=
      ((ETA * delta[delta_idx] * ly[ly_idx]) + (MOMENTUM * oldw[weight_idx]));
  oldw[weight_idx] =
      ((ETA * delta[delta_idx] * ly[ly_idx]) + (MOMENTUM * oldw[weight_idx]));

  __syncthreads();

  // Only the first thread row of the first block handles the row-0 weights.
  if (row != 0 || block_row != 0) {
    return;
  }

  w[delta_idx] += ((ETA * delta[delta_idx]) + (MOMENTUM * oldw[delta_idx]));
  oldw[delta_idx] = ((ETA * delta[delta_idx]) + (MOMENTUM * oldw[delta_idx]));
}
|
||||
#endif
|
|
@ -0,0 +1,48 @@
|
|||
#include "backprop.h"
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
extern char *strcpy();
|
||||
extern void exit();
|
||||
|
||||
int layer_size = 0;
|
||||
|
||||
backprop_face() {
|
||||
BPNN *net;
|
||||
int i;
|
||||
float out_err, hid_err;
|
||||
net = bpnn_create(layer_size, 16, 1); // (16, 1 can not be changed)
|
||||
|
||||
printf("Input layer size : %d\n", layer_size);
|
||||
load(net);
|
||||
// entering the training kernel, only one iteration
|
||||
printf("Starting training kernel\n");
|
||||
bpnn_train_cuda(net, &out_err, &hid_err);
|
||||
bpnn_free(net);
|
||||
printf("Training done\n");
|
||||
}
|
||||
|
||||
int setup(argc, argv)
|
||||
int argc;
|
||||
char *argv[];
|
||||
{
|
||||
|
||||
int seed;
|
||||
|
||||
if (argc != 2) {
|
||||
fprintf(stderr, "usage: backprop <num of input elements>\n");
|
||||
exit(0);
|
||||
}
|
||||
layer_size = atoi(argv[1]);
|
||||
if (layer_size % 16 != 0) {
|
||||
fprintf(stderr, "The number of input points must be divided by 16\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
seed = 7;
|
||||
bpnn_initialize(seed);
|
||||
backprop_face();
|
||||
|
||||
exit(0);
|
||||
}
|
|
@ -0,0 +1,22 @@
|
|||
#include "backprop.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
/* Input-layer size chosen on the command line; the type is now explicit
 * because implicit int declarations are not valid since C99. */
extern int layer_size;
|
||||
|
||||
/*
 * Fills net->input_units[1] .. net->input_units[layer_size] with values
 * (float)rand() / RAND_MAX; element 0 is left untouched.
 *
 * The previous version also computed "nr * nc" from an uninitialized nc
 * (undefined behaviour, result unused) and relied on an implicit int return
 * type without returning a value. The return type is now explicit and the
 * function always returns 0.
 */
int load(BPNN *net) {
  float *units = net->input_units;
  int i;

  /* Slot i + 1 is written because the units array is used 1-based. */
  for (i = 0; i < layer_size; i++) {
    units[i + 1] = (float)rand() / RAND_MAX;
  }

  return 0;
}
|
|
@ -0,0 +1,28 @@
|
|||
#!/bin/bash
# Builds the backprop example through the kernel/host translators, links it
# against the x86 runtime, runs it and checks the printed weights.
set -e

# Host-side C sources -> LLVM bitcode.
for c_unit in backprop facetrain imagenet; do
  clang -c -emit-llvm "${c_unit}.c"
done

# Checked-in device and host IR -> bitcode.
for ir_unit in backprop_cuda-cuda-nvptx64-nvidia-cuda-sm_61 \
               backprop_cuda-host-x86_64-unknown-linux-gnu; do
  llvm-as "${ir_unit}.ll"
done

# Translate the CUDA device/host modules.
../../build/compilation/kernelTranslator backprop_cuda-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
../../build/compilation/hostTranslator backprop_cuda-host-x86_64-unknown-linux-gnu.bc host.bc

# Bitcode -> position-independent object files.
for bc_unit in kernel host backprop facetrain imagenet; do
  llc --relocation-model=pic --filetype=obj "${bc_unit}.bc"
done

export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH
g++ -Wall -L../../build/runtime -L../../build/runtime/threadPool -o demo \
  -fPIC -no-pie host.o kernel.o backprop.o facetrain.o imagenet.o \
  -lc -lx86Runtime -lthreadPool -lpthread

./demo 1024 > res.log

# The run passes when the expected weight triple appears in the output.
if ! grep -q -e "0.173289 0.259645 0.350836" res.log; then
  echo "Error result"
  exit 1
fi
echo "Pass"
|
|
@ -0,0 +1,307 @@
|
|||
; ModuleID = 'bfs-cuda-nvptx64-nvidia-cuda-sm_61.bc'
|
||||
source_filename = "bfs.cu"
|
||||
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
|
||||
target triple = "nvptx64-nvidia-cuda"
|
||||
|
||||
%struct.__cuda_builtin_blockIdx_t = type { i8 }
|
||||
%struct.__cuda_builtin_threadIdx_t = type { i8 }
|
||||
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
|
||||
%struct.Node = type { i32, i32 }
|
||||
|
||||
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
|
||||
|
||||
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
|
||||
|
||||
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
|
||||
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Weak stub for cudaMalloc in the device module: it spills both arguments to
; stack slots and returns the constant 999 without allocating anything.
; NOTE(review): 999 is presumably the runtime's "unknown error" code - confirm
; against the CUDA headers.
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
|
||||
entry:
|
||||
%p.addr = alloca i8**, align 8
|
||||
%s.addr = alloca i64, align 8
|
||||
store i8** %p, i8*** %p.addr, align 8
|
||||
store i64 %s, i64* %s.addr, align 8
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Weak stub for cudaFuncGetAttributes in the device module: it spills both
; arguments to stack slots, never writes through %p, and returns the
; constant 999.
; NOTE(review): 999 is presumably the runtime's "unknown error" code - confirm.
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
|
||||
entry:
|
||||
%p.addr = alloca %struct.cudaFuncAttributes*, align 8
|
||||
%c.addr = alloca i8*, align 8
|
||||
store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
|
||||
store i8* %c, i8** %c.addr, align 8
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Weak stub for cudaDeviceGetAttribute in the device module: it spills the
; three arguments to stack slots, never writes through %value, and returns the
; constant 999.
; NOTE(review): 999 is presumably the runtime's "unknown error" code - confirm.
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
|
||||
entry:
|
||||
%value.addr = alloca i32*, align 8
|
||||
%attr.addr = alloca i32, align 4
|
||||
%device.addr = alloca i32, align 4
|
||||
store i32* %value, i32** %value.addr, align 8
|
||||
store i32 %attr, i32* %attr.addr, align 4
|
||||
store i32 %device, i32* %device.addr, align 4
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Weak stub for cudaGetDevice in the device module: it spills %device to a
; stack slot, never writes through it, and returns the constant 999.
; NOTE(review): 999 is presumably the runtime's "unknown error" code - confirm.
define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
|
||||
entry:
|
||||
%device.addr = alloca i32*, align 8
|
||||
store i32* %device, i32** %device.addr, align 8
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Weak stub for cudaOccupancyMaxActiveBlocksPerMultiprocessor in the device
; module: it spills the four arguments to stack slots, never writes through
; %numBlocks, and returns the constant 999.
; NOTE(review): 999 is presumably the runtime's "unknown error" code - confirm.
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
|
||||
entry:
|
||||
%numBlocks.addr = alloca i32*, align 8
|
||||
%func.addr = alloca i8*, align 8
|
||||
%blockSize.addr = alloca i32, align 4
|
||||
%dynamicSmemSize.addr = alloca i64, align 8
|
||||
store i32* %numBlocks, i32** %numBlocks.addr, align 8
|
||||
store i8* %func, i8** %func.addr, align 8
|
||||
store i32 %blockSize, i32* %blockSize.addr, align 4
|
||||
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Weak stub for cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags in the
; device module: it spills the five arguments to stack slots, never writes
; through %numBlocks, and returns the constant 999.
; NOTE(review): 999 is presumably the runtime's "unknown error" code - confirm.
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
|
||||
entry:
|
||||
%numBlocks.addr = alloca i32*, align 8
|
||||
%func.addr = alloca i8*, align 8
|
||||
%blockSize.addr = alloca i32, align 4
|
||||
%dynamicSmemSize.addr = alloca i64, align 8
|
||||
%flags.addr = alloca i32, align 4
|
||||
store i32* %numBlocks, i32** %numBlocks.addr, align 8
|
||||
store i8* %func, i8** %func.addr, align 8
|
||||
store i32 %blockSize, i32* %blockSize.addr, align 4
|
||||
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
|
||||
store i32 %flags, i32* %flags.addr, align 4
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; First kernel of bfs.cu; the mangled name decodes to
; Kernel(Node*, int*, bool*, bool*, bool*, int*, int).
;
; tid = ctaid.x * 512 + tid.x. When tid < no_of_nodes and g_graph_mask[tid] is
; set, the mask entry is cleared and the edge indices
; [nodes[tid].starting, nodes[tid].starting + nodes[tid].no_of_edges) are
; walked. For each neighbour id = g_graph_edges[i] whose g_graph_visited[id] is
; clear, g_cost[id] is set to g_cost[tid] + 1 and g_updating_graph_mask[id] is
; set to 1.
; The multiplier 512 is a literal in the code, so the index math matches
; launches with 512 threads per block.
define dso_local void @_Z6KernelP4NodePiPbS2_S2_S1_i(%struct.Node* %g_graph_nodes, i32* %g_graph_edges, i8* %g_graph_mask, i8* %g_updating_graph_mask, i8* %g_graph_visited, i32* %g_cost, i32 %no_of_nodes) #0 {
|
||||
entry:
|
||||
%g_graph_nodes.addr = alloca %struct.Node*, align 8
|
||||
%g_graph_edges.addr = alloca i32*, align 8
|
||||
%g_graph_mask.addr = alloca i8*, align 8
|
||||
%g_updating_graph_mask.addr = alloca i8*, align 8
|
||||
%g_graph_visited.addr = alloca i8*, align 8
|
||||
%g_cost.addr = alloca i32*, align 8
|
||||
%no_of_nodes.addr = alloca i32, align 4
|
||||
%tid = alloca i32, align 4
|
||||
%i = alloca i32, align 4
|
||||
%id = alloca i32, align 4
|
||||
store %struct.Node* %g_graph_nodes, %struct.Node** %g_graph_nodes.addr, align 8
|
||||
store i32* %g_graph_edges, i32** %g_graph_edges.addr, align 8
|
||||
store i8* %g_graph_mask, i8** %g_graph_mask.addr, align 8
|
||||
store i8* %g_updating_graph_mask, i8** %g_updating_graph_mask.addr, align 8
|
||||
store i8* %g_graph_visited, i8** %g_graph_visited.addr, align 8
|
||||
store i32* %g_cost, i32** %g_cost.addr, align 8
|
||||
store i32 %no_of_nodes, i32* %no_of_nodes.addr, align 4
|
||||
%call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
|
||||
%mul = mul i32 %call, 512
|
||||
%call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
|
||||
%add = add i32 %mul, %call1
|
||||
store i32 %add, i32* %tid, align 4
|
||||
%0 = load i32, i32* %tid, align 4
|
||||
%1 = load i32, i32* %no_of_nodes.addr, align 4
|
||||
%cmp = icmp slt i32 %0, %1
|
||||
br i1 %cmp, label %land.lhs.true, label %if.end26
|
||||
|
||||
; Second half of the guard: g_graph_mask[tid] must be set.
land.lhs.true: ; preds = %entry
|
||||
%2 = load i8*, i8** %g_graph_mask.addr, align 8
|
||||
%3 = load i32, i32* %tid, align 4
|
||||
%idxprom = sext i32 %3 to i64
|
||||
%arrayidx = getelementptr inbounds i8, i8* %2, i64 %idxprom
|
||||
%4 = load i8, i8* %arrayidx, align 1
|
||||
%tobool = trunc i8 %4 to i1
|
||||
br i1 %tobool, label %if.then, label %if.end26
|
||||
|
||||
; Clear g_graph_mask[tid] and start the edge loop at nodes[tid].starting.
if.then: ; preds = %land.lhs.true
|
||||
%5 = load i8*, i8** %g_graph_mask.addr, align 8
|
||||
%6 = load i32, i32* %tid, align 4
|
||||
%idxprom2 = sext i32 %6 to i64
|
||||
%arrayidx3 = getelementptr inbounds i8, i8* %5, i64 %idxprom2
|
||||
store i8 0, i8* %arrayidx3, align 1
|
||||
%7 = load %struct.Node*, %struct.Node** %g_graph_nodes.addr, align 8
|
||||
%8 = load i32, i32* %tid, align 4
|
||||
%idxprom4 = sext i32 %8 to i64
|
||||
%arrayidx5 = getelementptr inbounds %struct.Node, %struct.Node* %7, i64 %idxprom4
|
||||
%starting = getelementptr inbounds %struct.Node, %struct.Node* %arrayidx5, i32 0, i32 0
|
||||
%9 = load i32, i32* %starting, align 4
|
||||
store i32 %9, i32* %i, align 4
|
||||
br label %for.cond
|
||||
|
||||
; Loop test: continue while i < no_of_edges + starting.
for.cond: ; preds = %for.inc, %if.then
|
||||
%10 = load i32, i32* %i, align 4
|
||||
%11 = load %struct.Node*, %struct.Node** %g_graph_nodes.addr, align 8
|
||||
%12 = load i32, i32* %tid, align 4
|
||||
%idxprom6 = sext i32 %12 to i64
|
||||
%arrayidx7 = getelementptr inbounds %struct.Node, %struct.Node* %11, i64 %idxprom6
|
||||
%no_of_edges = getelementptr inbounds %struct.Node, %struct.Node* %arrayidx7, i32 0, i32 1
|
||||
%13 = load i32, i32* %no_of_edges, align 4
|
||||
%14 = load %struct.Node*, %struct.Node** %g_graph_nodes.addr, align 8
|
||||
%15 = load i32, i32* %tid, align 4
|
||||
%idxprom8 = sext i32 %15 to i64
|
||||
%arrayidx9 = getelementptr inbounds %struct.Node, %struct.Node* %14, i64 %idxprom8
|
||||
%starting10 = getelementptr inbounds %struct.Node, %struct.Node* %arrayidx9, i32 0, i32 0
|
||||
%16 = load i32, i32* %starting10, align 4
|
||||
%add11 = add nsw i32 %13, %16
|
||||
%cmp12 = icmp slt i32 %10, %add11
|
||||
br i1 %cmp12, label %for.body, label %for.end
|
||||
|
||||
; id = g_graph_edges[i]; skip the update when g_graph_visited[id] is set.
for.body: ; preds = %for.cond
|
||||
%17 = load i32*, i32** %g_graph_edges.addr, align 8
|
||||
%18 = load i32, i32* %i, align 4
|
||||
%idxprom13 = sext i32 %18 to i64
|
||||
%arrayidx14 = getelementptr inbounds i32, i32* %17, i64 %idxprom13
|
||||
%19 = load i32, i32* %arrayidx14, align 4
|
||||
store i32 %19, i32* %id, align 4
|
||||
%20 = load i8*, i8** %g_graph_visited.addr, align 8
|
||||
%21 = load i32, i32* %id, align 4
|
||||
%idxprom15 = sext i32 %21 to i64
|
||||
%arrayidx16 = getelementptr inbounds i8, i8* %20, i64 %idxprom15
|
||||
%22 = load i8, i8* %arrayidx16, align 1
|
||||
%tobool17 = trunc i8 %22 to i1
|
||||
br i1 %tobool17, label %if.end, label %if.then18
|
||||
|
||||
; g_cost[id] = g_cost[tid] + 1; g_updating_graph_mask[id] = 1.
if.then18: ; preds = %for.body
|
||||
%23 = load i32*, i32** %g_cost.addr, align 8
|
||||
%24 = load i32, i32* %tid, align 4
|
||||
%idxprom19 = sext i32 %24 to i64
|
||||
%arrayidx20 = getelementptr inbounds i32, i32* %23, i64 %idxprom19
|
||||
%25 = load i32, i32* %arrayidx20, align 4
|
||||
%add21 = add nsw i32 %25, 1
|
||||
%26 = load i32*, i32** %g_cost.addr, align 8
|
||||
%27 = load i32, i32* %id, align 4
|
||||
%idxprom22 = sext i32 %27 to i64
|
||||
%arrayidx23 = getelementptr inbounds i32, i32* %26, i64 %idxprom22
|
||||
store i32 %add21, i32* %arrayidx23, align 4
|
||||
%28 = load i8*, i8** %g_updating_graph_mask.addr, align 8
|
||||
%29 = load i32, i32* %id, align 4
|
||||
%idxprom24 = sext i32 %29 to i64
|
||||
%arrayidx25 = getelementptr inbounds i8, i8* %28, i64 %idxprom24
|
||||
store i8 1, i8* %arrayidx25, align 1
|
||||
br label %if.end
|
||||
|
||||
if.end: ; preds = %if.then18, %for.body
|
||||
br label %for.inc
|
||||
|
||||
for.inc: ; preds = %if.end
|
||||
%30 = load i32, i32* %i, align 4
|
||||
%inc = add nsw i32 %30, 1
|
||||
store i32 %inc, i32* %i, align 4
|
||||
br label %for.cond
|
||||
|
||||
for.end: ; preds = %for.cond
|
||||
br label %if.end26
|
||||
|
||||
if.end26: ; preds = %for.end, %land.lhs.true, %entry
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: alwaysinline convergent nounwind
|
||||
; Accessor __cuda_builtin_blockIdx_t::__fetch_builtin_x(): returns the value of
; the llvm.nvvm.read.ptx.sreg.ctaid.x intrinsic unchanged.
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
|
||||
entry:
|
||||
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
|
||||
ret i32 %0
|
||||
}
|
||||
|
||||
; Function Attrs: alwaysinline convergent nounwind
|
||||
; Accessor __cuda_builtin_threadIdx_t::__fetch_builtin_x(): returns the value
; of the llvm.nvvm.read.ptx.sreg.tid.x intrinsic unchanged.
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
|
||||
entry:
|
||||
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||||
ret i32 %0
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Kernel2(bool*, bool*, bool*, bool*, int) -- registered as a kernel by the
; !nvvm.annotations entry that names this function.
;
; Arguments (bool arrays are addressed as i8):
;   %g_graph_mask, %g_updating_graph_mask, %g_graph_visited : per-node flags
;   %g_over      : single flag, stored through directly (no indexing)
;   %no_of_nodes : upper bound for the computed index
;
; Flow: tid = blockIdx.x * 512 + threadIdx.x. When tid < no_of_nodes and
; g_updating_graph_mask[tid] is set, the function stores 1 to
; g_graph_mask[tid], g_graph_visited[tid] and *g_over, then stores 0 to
; g_updating_graph_mask[tid]. Otherwise it returns without storing anything.
; The code is unoptimised (optnone): every value round-trips through an alloca.
define dso_local void @_Z7Kernel2PbS_S_S_i(i8* %g_graph_mask, i8* %g_updating_graph_mask, i8* %g_graph_visited, i8* %g_over, i32 %no_of_nodes) #0 {
|
||||
entry:
|
||||
%g_graph_mask.addr = alloca i8*, align 8
|
||||
%g_updating_graph_mask.addr = alloca i8*, align 8
|
||||
%g_graph_visited.addr = alloca i8*, align 8
|
||||
%g_over.addr = alloca i8*, align 8
|
||||
%no_of_nodes.addr = alloca i32, align 4
|
||||
%tid = alloca i32, align 4
|
||||
store i8* %g_graph_mask, i8** %g_graph_mask.addr, align 8
|
||||
store i8* %g_updating_graph_mask, i8** %g_updating_graph_mask.addr, align 8
|
||||
store i8* %g_graph_visited, i8** %g_graph_visited.addr, align 8
|
||||
store i8* %g_over, i8** %g_over.addr, align 8
|
||||
store i32 %no_of_nodes, i32* %no_of_nodes.addr, align 4
; tid = blockIdx.x * 512 + threadIdx.x (512 is a literal in this module)
%call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
|
||||
%mul = mul i32 %call, 512
|
||||
%call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
|
||||
%add = add i32 %mul, %call1
|
||||
store i32 %add, i32* %tid, align 4
|
||||
; first half of the condition: signed compare tid < no_of_nodes
%0 = load i32, i32* %tid, align 4
|
||||
%1 = load i32, i32* %no_of_nodes.addr, align 4
|
||||
%cmp = icmp slt i32 %0, %1
|
||||
br i1 %cmp, label %land.lhs.true, label %if.end
|
||||
|
||||
; second half of the condition: g_updating_graph_mask[tid] (only read when tid is in range)
land.lhs.true: ; preds = %entry
|
||||
%2 = load i8*, i8** %g_updating_graph_mask.addr, align 8
|
||||
%3 = load i32, i32* %tid, align 4
|
||||
%idxprom = sext i32 %3 to i64
|
||||
%arrayidx = getelementptr inbounds i8, i8* %2, i64 %idxprom
|
||||
%4 = load i8, i8* %arrayidx, align 1
|
||||
%tobool = trunc i8 %4 to i1
|
||||
br i1 %tobool, label %if.then, label %if.end
|
||||
|
||||
; body: set mask[tid], visited[tid] and *g_over to 1, then clear updating_mask[tid]
if.then: ; preds = %land.lhs.true
|
||||
%5 = load i8*, i8** %g_graph_mask.addr, align 8
|
||||
%6 = load i32, i32* %tid, align 4
|
||||
%idxprom2 = sext i32 %6 to i64
|
||||
%arrayidx3 = getelementptr inbounds i8, i8* %5, i64 %idxprom2
|
||||
store i8 1, i8* %arrayidx3, align 1
|
||||
%7 = load i8*, i8** %g_graph_visited.addr, align 8
|
||||
%8 = load i32, i32* %tid, align 4
|
||||
%idxprom4 = sext i32 %8 to i64
|
||||
%arrayidx5 = getelementptr inbounds i8, i8* %7, i64 %idxprom4
|
||||
store i8 1, i8* %arrayidx5, align 1
|
||||
%9 = load i8*, i8** %g_over.addr, align 8
|
||||
store i8 1, i8* %9, align 1
|
||||
%10 = load i8*, i8** %g_updating_graph_mask.addr, align 8
|
||||
%11 = load i32, i32* %tid, align 4
|
||||
%idxprom6 = sext i32 %11 to i64
|
||||
%arrayidx7 = getelementptr inbounds i8, i8* %10, i64 %idxprom6
|
||||
store i8 0, i8* %arrayidx7, align 1
|
||||
br label %if.end
|
||||
|
||||
; common exit for all three paths
if.end: ; preds = %if.then, %land.lhs.true, %entry
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
|
||||
|
||||
attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #2 = { nounwind readnone }
|
||||
attributes #3 = { convergent nounwind }
|
||||
|
||||
!llvm.module.flags = !{!0, !1, !2}
|
||||
!nvvm.annotations = !{!3, !4, !5, !6, !5, !7, !7, !7, !7, !8, !8, !7}
|
||||
!llvm.ident = !{!9}
|
||||
!nvvmir.version = !{!10}
|
||||
|
||||
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
|
||||
!1 = !{i32 1, !"wchar_size", i32 4}
|
||||
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
|
||||
!3 = !{void (%struct.Node*, i32*, i8*, i8*, i8*, i32*, i32)* @_Z6KernelP4NodePiPbS2_S2_S1_i, !"kernel", i32 1}
|
||||
!4 = !{void (i8*, i8*, i8*, i8*, i32)* @_Z7Kernel2PbS_S_S_i, !"kernel", i32 1}
|
||||
!5 = !{null, !"align", i32 8}
|
||||
!6 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
|
||||
!7 = !{null, !"align", i32 16}
|
||||
!8 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
|
||||
!9 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
|
||||
!10 = !{i32 1, i32 4}
|
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,213 @@
|
|||
#include <cuda.h>
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#define MAX_THREADS_PER_BLOCK 512
|
||||
|
||||
int no_of_nodes;
|
||||
int edge_list_size;
|
||||
FILE *fp;
|
||||
|
||||
// Per-node record of the graph: locates the node's slice of the edge array.
struct Node {
  // Index of the node's first entry in the edge array.
  int starting;
  // Number of consecutive edge-array entries that belong to the node.
  int no_of_edges;
};
|
||||
|
||||
#include "kernel.cu"
|
||||
#include "kernel2.cu"
|
||||
|
||||
void BFSGraph(int argc, char **argv);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Main Program
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
int main(int argc, char **argv) {
  // Select device 0 before any other CUDA call is made.
  cudaSetDevice(0);

  // Reset the graph-size globals; BFSGraph overwrites them from the input.
  edge_list_size = 0;
  no_of_nodes = 0;

  BFSGraph(argc, argv);
  return 0;
}
|
||||
|
||||
// Prints the expected command line to stderr, using argv[0] as the program
// name. argc is not used.
void Usage(int argc, char **argv) {
  const char *program = argv[0];
  fprintf(stderr, "Usage: %s <input_file>\n", program);
}
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Apply BFS on a Graph using CUDA
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Runs a breadth-first search from node 0 over the graph stored in argv[1]
// and writes one "<node>) cost:<depth>" line per node to result.txt
// (depth is -1 for nodes that were never reached).
//
// Input, as consumed below (whitespace-separated decimal integers):
//   <node count>
//   <starting> <no_of_edges>      one pair per node
//   <source>                      read, then overridden with 0
//   <edge count>
//   <destination> <cost>          one pair per edge; cost is discarded
//
// argc/argv are the process arguments. When argc != 2 the usage text is
// printed and the process exits with status 0. When the file cannot be opened
// or does not match the format above, a message is printed and the function
// returns after releasing everything it acquired.
void BFSGraph(int argc, char **argv) {
  if (argc != 2) {
    Usage(argc, argv);
    exit(0);
  }

  const char *input_f = argv[1];
  printf("Reading File\n");
  // Read in Graph from a file
  fp = fopen(input_f, "r");
  if (!fp) {
    printf("Error Reading graph file\n");
    return;
  }

  // Host buffers start out NULL so that the cleanup helpers can run at any
  // point (free(NULL) is a no-op).
  Node *h_graph_nodes = NULL;
  bool *h_graph_mask = NULL;
  bool *h_updating_graph_mask = NULL;
  bool *h_graph_visited = NULL;
  int *h_graph_edges = NULL;
  int *h_cost = NULL;

  auto release_host = [&]() {
    free(h_graph_nodes);
    free(h_graph_edges);
    free(h_graph_mask);
    free(h_updating_graph_mask);
    free(h_graph_visited);
    free(h_cost);
  };
  // Shared failure path while the input file is still open.
  auto abort_load = [&](const char *reason) {
    fprintf(stderr, "Error Reading graph file: %s\n", reason);
    fclose(fp);
    fp = NULL;
    release_host();
  };

  // At least one node is required: node 0 is always used as the source.
  if (fscanf(fp, "%d", &no_of_nodes) != 1 || no_of_nodes < 1) {
    abort_load("missing or non-positive node count");
    return;
  }

  // Make execution parameters according to the number of nodes: a single
  // block when everything fits, otherwise enough full-size blocks to cover
  // every node (integer ceiling division).
  int num_of_blocks = 1;
  int num_of_threads_per_block = no_of_nodes;
  if (no_of_nodes > MAX_THREADS_PER_BLOCK) {
    num_of_blocks = no_of_nodes / MAX_THREADS_PER_BLOCK +
                    (no_of_nodes % MAX_THREADS_PER_BLOCK != 0 ? 1 : 0);
    num_of_threads_per_block = MAX_THREADS_PER_BLOCK;
  }

  // allocate host memory
  const size_t node_count = (size_t)no_of_nodes;
  h_graph_nodes = (Node *)malloc(sizeof(Node) * node_count);
  h_graph_mask = (bool *)malloc(sizeof(bool) * node_count);
  h_updating_graph_mask = (bool *)malloc(sizeof(bool) * node_count);
  h_graph_visited = (bool *)malloc(sizeof(bool) * node_count);
  h_cost = (int *)malloc(sizeof(int) * node_count);
  if (!h_graph_nodes || !h_graph_mask || !h_updating_graph_mask ||
      !h_graph_visited || !h_cost) {
    abort_load("out of host memory");
    return;
  }

  // initialize the per-node state while reading the node table
  for (int i = 0; i < no_of_nodes; i++) {
    int start, edgeno;
    if (fscanf(fp, "%d %d", &start, &edgeno) != 2) {
      abort_load("truncated node table");
      return;
    }
    h_graph_nodes[i].starting = start;
    h_graph_nodes[i].no_of_edges = edgeno;
    h_graph_mask[i] = false;
    h_updating_graph_mask[i] = false;
    h_graph_visited[i] = false;
    h_cost[i] = -1;
  }

  // read the source node from the file
  int source = 0;
  if (fscanf(fp, "%d", &source) != 1) {
    abort_load("missing source node");
    return;
  }
  // The value from the file is overridden: the search always starts at node 0.
  source = 0;

  // set the source node as true in the mask
  h_graph_mask[source] = true;
  h_graph_visited[source] = true;
  h_cost[source] = 0;

  if (fscanf(fp, "%d", &edge_list_size) != 1 || edge_list_size < 0) {
    abort_load("missing or negative edge count");
    return;
  }

  const size_t edge_count = (size_t)edge_list_size;
  h_graph_edges = (int *)malloc(sizeof(int) * edge_count);
  // malloc(0) may legitimately return NULL, so only a non-empty request
  // counts as a failure.
  if (edge_count > 0 && !h_graph_edges) {
    abort_load("out of host memory");
    return;
  }
  for (int i = 0; i < edge_list_size; i++) {
    int id, cost;
    if (fscanf(fp, "%d %d", &id, &cost) != 2) {
      abort_load("truncated edge list");
      return;
    }
    h_graph_edges[i] = id;  // the per-edge cost is not used by this BFS
  }

  fclose(fp);
  fp = NULL;

  printf("Read File\n");

  // Copy the Node list to device memory
  Node *d_graph_nodes;
  cudaMalloc((void **)&d_graph_nodes, sizeof(Node) * node_count);
  cudaMemcpy(d_graph_nodes, h_graph_nodes, sizeof(Node) * node_count,
             cudaMemcpyHostToDevice);

  // Copy the Edge List to device Memory
  int *d_graph_edges;
  cudaMalloc((void **)&d_graph_edges, sizeof(int) * edge_count);
  cudaMemcpy(d_graph_edges, h_graph_edges, sizeof(int) * edge_count,
             cudaMemcpyHostToDevice);

  // Copy the Mask to device memory
  bool *d_graph_mask;
  cudaMalloc((void **)&d_graph_mask, sizeof(bool) * node_count);
  cudaMemcpy(d_graph_mask, h_graph_mask, sizeof(bool) * node_count,
             cudaMemcpyHostToDevice);

  bool *d_updating_graph_mask;
  cudaMalloc((void **)&d_updating_graph_mask, sizeof(bool) * node_count);
  cudaMemcpy(d_updating_graph_mask, h_updating_graph_mask,
             sizeof(bool) * node_count, cudaMemcpyHostToDevice);

  // Copy the Visited nodes array to device memory
  bool *d_graph_visited;
  cudaMalloc((void **)&d_graph_visited, sizeof(bool) * node_count);
  cudaMemcpy(d_graph_visited, h_graph_visited, sizeof(bool) * node_count,
             cudaMemcpyHostToDevice);

  // allocate device memory for result
  int *d_cost;
  cudaMalloc((void **)&d_cost, sizeof(int) * node_count);
  cudaMemcpy(d_cost, h_cost, sizeof(int) * node_count, cudaMemcpyHostToDevice);

  // make a bool to check if the execution is over
  bool *d_over;
  cudaMalloc((void **)&d_over, sizeof(bool));

  printf("Copied Everything to GPU memory\n");

  // setup execution parameters
  dim3 grid(num_of_blocks, 1, 1);
  dim3 threads(num_of_threads_per_block, 1, 1);

  int k = 0;
  printf("Start traversing the tree\n");
  bool stop;
  // Call the kernels until an iteration adds no node to the frontier
  do {
    // if no thread changes this value then the loop stops
    stop = false;
    cudaMemcpy(d_over, &stop, sizeof(bool), cudaMemcpyHostToDevice);

    Kernel<<<grid, threads, 0>>>(d_graph_nodes, d_graph_edges, d_graph_mask,
                                 d_updating_graph_mask, d_graph_visited, d_cost,
                                 no_of_nodes);
    cudaDeviceSynchronize();

    Kernel2<<<grid, threads, 0>>>(d_graph_mask, d_updating_graph_mask,
                                  d_graph_visited, d_over, no_of_nodes);
    cudaDeviceSynchronize();

    cudaMemcpy(&stop, d_over, sizeof(bool), cudaMemcpyDeviceToHost);

    k++;
  } while (stop);

  printf("Kernel Executed %d times\n", k);

  // copy result from device to host
  cudaMemcpy(h_cost, d_cost, sizeof(int) * node_count, cudaMemcpyDeviceToHost);

  // Store the result into a file
  FILE *fpo = fopen("result.txt", "w");
  if (fpo) {
    for (int i = 0; i < no_of_nodes; i++)
      fprintf(fpo, "%d) cost:%d\n", i, h_cost[i]);
    fclose(fpo);
    printf("Result stored in result.txt\n");
  } else {
    fprintf(stderr, "Error: cannot open result.txt for writing\n");
  }

  // cleanup memory
  release_host();

  cudaFree(d_graph_nodes);
  cudaFree(d_graph_edges);
  cudaFree(d_graph_mask);
  cudaFree(d_updating_graph_mask);
  cudaFree(d_graph_visited);
  cudaFree(d_cost);
  cudaFree(d_over);  // was never released before
}
|
|
@ -0,0 +1,23 @@
|
|||
#ifndef _KERNEL_H_
|
||||
#define _KERNEL_H_
|
||||
|
||||
// BFS expansion step. The thread whose index maps to a node flagged in
// g_graph_mask clears that flag, then walks the node's slice of
// g_graph_edges. Every destination that is not yet flagged in
// g_graph_visited gets cost = cost[this node] + 1 and is flagged in
// g_updating_graph_mask.
__global__ void
Kernel( Node* g_graph_nodes, int* g_graph_edges, bool* g_graph_mask, bool* g_updating_graph_mask, bool *g_graph_visited, int* g_cost, int no_of_nodes)
{
  const int tid = blockIdx.x*MAX_THREADS_PER_BLOCK + threadIdx.x;

  // Nothing to do for out-of-range threads or nodes whose mask flag is clear.
  if( tid>=no_of_nodes || !g_graph_mask[tid])
    return;

  g_graph_mask[tid]=false;

  // Half-open range [first_edge, edge_end) of this node's edges.
  const int first_edge = g_graph_nodes[tid].starting;
  const int edge_end = g_graph_nodes[tid].no_of_edges + first_edge;
  for(int e=first_edge; e<edge_end; e++)
  {
    const int neighbor = g_graph_edges[e];
    if(g_graph_visited[neighbor])
      continue;

    g_cost[neighbor]=g_cost[tid]+1;
    g_updating_graph_mask[neighbor]=true;
  }
}
|
||||
|
||||
#endif
|
|
@ -0,0 +1,18 @@
|
|||
#ifndef _KERNEL2_H_
|
||||
#define _KERNEL2_H_
|
||||
|
||||
// BFS commit step. For each in-range node flagged in g_updating_graph_mask:
// flag it in g_graph_mask and g_graph_visited, raise *g_over, and clear its
// g_updating_graph_mask flag. *g_over is written only when at least one such
// node exists; the host resets it before each launch.
__global__ void
Kernel2( bool* g_graph_mask, bool *g_updating_graph_mask, bool* g_graph_visited, bool *g_over, int no_of_nodes)
{
  const int tid = blockIdx.x*MAX_THREADS_PER_BLOCK + threadIdx.x;

  // Out-of-range threads and nodes without a pending update do nothing.
  if( tid>=no_of_nodes || !g_updating_graph_mask[tid])
    return;

  g_graph_mask[tid]=true;
  g_graph_visited[tid]=true;
  *g_over=true;
  g_updating_graph_mask[tid]=false;
}
|
||||
|
||||
#endif
|
|
@ -0,0 +1,21 @@
|
|||
#!/bin/bash
# Build and smoke-test the BFS example:
#   1. assemble the device and host LLVM IR,
#   2. run them through kernelTranslator / hostTranslator,
#   3. compile both to objects and link against the runtime libraries,
#   4. run the binary and check that node 0 is reported with cost 0.
# All paths are relative, so run this from the directory that holds the .ll files.
set -e

llvm-as bfs-cuda-nvptx64-nvidia-cuda-sm_61.ll
llvm-as bfs-host-x86_64-unknown-linux-gnu.ll
../../build/compilation/kernelTranslator bfs-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
../../build/compilation/hostTranslator bfs-host-x86_64-unknown-linux-gnu.bc host.bc

llc --relocation-model=pic --filetype=obj kernel.bc
llc --relocation-model=pic --filetype=obj host.bc

g++ -Wall -L../../build/runtime -L../../build/runtime/threadPool \
  -o bfs.out -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread

# Append the previous value only when it is non-empty: a trailing ':' would
# add the current directory to the loader's search path.
export LD_LIBRARY_PATH="../../build/runtime:../../build/runtime/threadPool${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"

# The binary exits 0 even when it cannot read its input, so drop any result
# left over from an earlier run instead of letting it satisfy the check below.
rm -f result.txt
./bfs.out ../../rodinia-data/bfs/graph65536.txt

# -x/-F: require the exact whole line, so "10) cost:0" cannot match.
if grep -qxF "0) cost:0" result.txt; then
  echo "Pass"
else
  echo "Error result"
  exit 1
fi
|
|
@ -0,0 +1,343 @@
|
|||
// # ifdef __cplusplus
|
||||
// extern "C" {
|
||||
// # endif
|
||||
|
||||
// #ifndef LIST_H
|
||||
// # define LIST_H
|
||||
|
||||
//===============================================================================================================================================================================================================200
|
||||
// DEFINE/INCLUDE
|
||||
//===============================================================================================================================================================================================================200
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// INCLUDE (for some reason these are not recognized when defined in main
|
||||
// file before this one is included)
|
||||
//======================================================================================================================================================150
|
||||
|
||||
#include <stdbool.h> // (in path known to compiler) needed by true/false, bool
|
||||
#include <stdint.h> // (in path known to compiler) needed by uint32_t
|
||||
#include <stdlib.h> // (in path known to compiler) needed by malloc
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// DEFINE
|
||||
//======================================================================================================================================================150
|
||||
|
||||
// `fp` expands to `float`. Because it is a macro, every identifier spelled
// `fp` in a file that includes this header is rewritten as well.
#define fp float

#define Version "1.5"

// When WINDOWS is defined, bool/false/true are supplied as macros.
#if defined(WINDOWS)
#define bool char
#define false 0
#define true 1
#endif

// DEFAULT_ORDER sizes the key/index arrays of knode. It is taken from the
// first of RD_WG_SIZE_0_0, RD_WG_SIZE_0, RD_WG_SIZE that is defined, and
// falls back to 256 when none is.
#if defined(RD_WG_SIZE_0_0)
#define DEFAULT_ORDER RD_WG_SIZE_0_0
#elif defined(RD_WG_SIZE_0)
#define DEFAULT_ORDER RD_WG_SIZE_0
#elif defined(RD_WG_SIZE)
#define DEFAULT_ORDER RD_WG_SIZE
#else
#define DEFAULT_ORDER 256
#endif
|
||||
|
||||
/* #ifdef RD_WG_SIZE_1_0 */
|
||||
/* #define DEFAULT_ORDER_2 RD_WG_SIZE_1_0 */
|
||||
/* #elif defined(RD_WG_SIZE_1) */
|
||||
/* #define DEFAULT_ORDER_2 RD_WG_SIZE_1 */
|
||||
/* #elif defined(RD_WG_SIZE) */
|
||||
/* #define DEFAULT_ORDER_2 RD_WG_SIZE */
|
||||
/* #else */
|
||||
/* #define DEFAULT_ORDER_2 256 */
|
||||
/* #endif */
|
||||
|
||||
/* #define DEFAULT_ORDER 508 */
|
||||
|
||||
// Checked allocation: every malloc(size) written after this point expands to
// a ({ ... }) statement expression that calls the real malloc (a macro is not
// re-expanded inside its own expansion). On a NULL result it reports the call
// site on stderr and terminates the process with exit(-1); otherwise the
// expression yields the pointer. The expansion calls fprintf, so <stdio.h>
// must be visible wherever the macro is used.
#define malloc(size)                                                           \
  ({                                                                           \
    void *_tmp = malloc(size);                                                 \
                                                                               \
    if (_tmp == NULL) {                                                        \
      fprintf(stderr, "Allocation failed at %s:%d!\n", __FILE__, __LINE__);    \
      exit(-1);                                                                \
    }                                                                          \
                                                                               \
    _tmp;                                                                      \
  })
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// STRUCTURES
|
||||
//======================================================================================================================================================150
|
||||
|
||||
// struct list_item;
|
||||
// Forward declaration; struct list_item is defined further down in this file.
typedef struct list_item list_item_t;

// List header: pointers to the two end items, an item count, and two
// callbacks (a key comparison and a datum destructor).
typedef struct list_t {
  list_item_t *head;
  list_item_t *tail;
  uint32_t length;
  int32_t (*compare)(const void *key, const void *with);
  void (*datum_delete)(void *);
} list_t;

// Both iterator kinds are plain pointers to a list item.
typedef list_item_t *list_iterator_t;
typedef list_item_t *list_reverse_iterator_t;
|
||||
|
||||
/* The record that a key refers to.
 * A real B+ tree system would keep data (database) or a file (operating
 * system) here; this code stores a single int. Change this type to change
 * what a value holds.
 */
typedef struct record {
  int value;
} record;
|
||||
|
||||
/* A node of the B+ tree, used for both leaves and internal nodes.
 *
 * The core of a node is the keys array and the matching pointers array;
 * how the two relate depends on the node kind:
 *
 *  - Leaf: key i belongs to pointer i, with at most order - 1 key-pointer
 *    pairs. num_keys pointers refer to data, and the last pointer refers to
 *    the leaf to the right (NULL for the rightmost leaf).
 *
 *  - Internal node: pointer 0 refers to the subtree whose keys are smaller
 *    than the smallest key stored here, and pointer i + 1 refers to the
 *    subtree whose keys are greater than or equal to key i. The number of
 *    valid pointers is therefore always num_keys + 1.
 *
 * num_keys counts the valid keys.
 */
typedef struct node {
  void **pointers;
  int *keys;
  struct node *parent;
  bool is_leaf;
  int num_keys;
  struct node *next; // Used for queue.
} node;
|
||||
|
||||
//
|
||||
typedef struct knode {
|
||||
int location;
|
||||
int indices[DEFAULT_ORDER + 1];
|
||||
int keys[DEFAULT_ORDER + 1];
|
||||
bool is_leaf;
|
||||
int num_keys;
|
||||
} knode;
|
||||
|
||||
// One list element: links to the neighbouring items plus an untyped payload.
struct list_item {
  struct list_item *pred;
  struct list_item *next;
  void *datum;
};
|
||||
|
||||
//===============================================================================================================================================================================================================200
|
||||
// PROTOTYPES
|
||||
//===============================================================================================================================================================================================================200
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// Other
|
||||
//======================================================================================================================================================150
|
||||
|
||||
void list_item_init(list_item_t *li, void *datum);
|
||||
|
||||
void list_item_delete(list_item_t *li, void (*datum_delete)(void *datum));
|
||||
|
||||
void list_insert_item_tail(list_t *l, list_item_t *i);
|
||||
|
||||
void list_insert_item_before(list_t *l, list_item_t *next, list_item_t *i);
|
||||
|
||||
void list_insert_item_after(list_t *l, list_item_t *pred, list_item_t *i);
|
||||
|
||||
void list_insert_item_sorted(list_t *l, list_item_t *i);
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// List lifecycle, insertion/removal, lookup and iteration
|
||||
//======================================================================================================================================================150
|
||||
|
||||
void list_init(list_t *l, int32_t (*compare)(const void *key, const void *with),
|
||||
void (*datum_delete)(void *datum));
|
||||
|
||||
void list_delete(list_t *l);
|
||||
|
||||
void list_reset(list_t *l);
|
||||
|
||||
void list_insert_head(list_t *l, void *v);
|
||||
|
||||
void list_insert_tail(list_t *l, void *v);
|
||||
|
||||
void list_insert_before(list_t *l, list_item_t *next, void *v);
|
||||
|
||||
void list_insert_after(list_t *l, list_item_t *pred, void *v);
|
||||
|
||||
void list_insert_sorted(list_t *l, void *v);
|
||||
|
||||
void list_insert_item_head(list_t *l, list_item_t *i);
|
||||
|
||||
void list_remove_item(list_t *l, list_item_t *i);
|
||||
|
||||
void list_remove_head(list_t *l);
|
||||
|
||||
void list_remove_tail(list_t *l);
|
||||
|
||||
list_item_t *list_find_item(list_t *l, void *datum);
|
||||
|
||||
list_item_t *list_get_head_item(list_t *l);
|
||||
|
||||
list_item_t *list_get_tail_item(list_t *l);
|
||||
|
||||
void *list_find(list_t *l, void *datum);
|
||||
|
||||
void *list_get_head(list_t *l);
|
||||
|
||||
void *list_get_tail(list_t *l);
|
||||
|
||||
uint32_t list_get_length(list_t *l);
|
||||
|
||||
bool list_is_empty(list_t *l);
|
||||
|
||||
bool list_not_empty(list_t *l);
|
||||
|
||||
void list_visit_items(list_t *l, void (*visitor)(void *v));
|
||||
|
||||
void *list_item_get_datum(list_item_t *li);
|
||||
|
||||
void list_iterator_init(list_t *l, list_iterator_t *li);
|
||||
|
||||
void list_iterator_delete(list_iterator_t *li);
|
||||
|
||||
void list_iterator_next(list_iterator_t *li);
|
||||
|
||||
void list_iterator_prev(list_iterator_t *li);
|
||||
|
||||
void *list_iterator_get_datum(list_iterator_t *li);
|
||||
|
||||
bool list_iterator_is_valid(list_iterator_t *li);
|
||||
|
||||
void list_reverse_iterator_init(list_t *l, list_iterator_t *li);
|
||||
|
||||
void list_reverse_iterator_delete(list_iterator_t *li);
|
||||
|
||||
void list_reverse_iterator_next(list_iterator_t *li);
|
||||
|
||||
void list_reverse_iterator_prev(list_iterator_t *li);
|
||||
|
||||
void *list_reverse_iterator_get_datum(list_iterator_t *li);
|
||||
|
||||
bool list_reverse_iterator_is_valid(list_reverse_iterator_t *li);
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// Output and utility
|
||||
//======================================================================================================================================================150
|
||||
|
||||
void *kmalloc(int size);
|
||||
|
||||
long transform_to_cuda(node *n,
|
||||
bool verbose); // returns actual mem used in a long
|
||||
|
||||
void usage_1(void);
|
||||
|
||||
void usage_2(void);
|
||||
|
||||
void enqueue(node *new_node);
|
||||
|
||||
node *dequeue(void);
|
||||
|
||||
int height(node *root);
|
||||
|
||||
int path_to_root(node *root, node *child);
|
||||
|
||||
void print_leaves(node *root);
|
||||
|
||||
void print_tree(node *root);
|
||||
|
||||
node *find_leaf(node *root, int key, bool verbose);
|
||||
|
||||
record *find(node *root, int key, bool verbose);
|
||||
|
||||
int cut(int length);
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// Insertion
|
||||
//======================================================================================================================================================150
|
||||
|
||||
record *make_record(int value);
|
||||
|
||||
node *make_node(void);
|
||||
|
||||
node *make_leaf(void);
|
||||
|
||||
int get_left_index(node *parent, node *left);
|
||||
|
||||
node *insert_into_leaf(node *leaf, int key, record *pointer);
|
||||
|
||||
node *insert_into_leaf_after_splitting(node *root, node *leaf, int key,
|
||||
record *pointer);
|
||||
|
||||
node *insert_into_node(node *root, node *parent, int left_index, int key,
|
||||
node *right);
|
||||
|
||||
node *insert_into_node_after_splitting(node *root, node *parent, int left_index,
|
||||
int key, node *right);
|
||||
|
||||
node *insert_into_parent(node *root, node *left, int key, node *right);
|
||||
|
||||
node *insert_into_new_root(node *left, int key, node *right);
|
||||
|
||||
node *start_new_tree(int key, record *pointer);
|
||||
|
||||
node *insert(node *root, int key, int value);
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// Deletion
|
||||
//======================================================================================================================================================150
|
||||
|
||||
int get_neighbor_index(node *n);
|
||||
|
||||
node *adjust_root(node *root);
|
||||
|
||||
node *coalesce_nodes(node *root, node *n, node *neighbor, int neighbor_index,
|
||||
int k_prime);
|
||||
|
||||
node *redistribute_nodes(node *root, node *n, node *neighbor,
|
||||
int neighbor_index, int k_prime_index, int k_prime);
|
||||
|
||||
node *delete_entry(node *root, node *n, int key, void *pointer);
|
||||
|
||||
node *deleteVal(node *root, int key);
|
||||
|
||||
//===============================================================================================================================================================================================================200
|
||||
// HEADER
|
||||
//===============================================================================================================================================================================================================200
|
||||
|
||||
// int main( int argc,
|
||||
// char *argv []);
|
||||
|
||||
//===============================================================================================================================================================================================================200
|
||||
// END
|
||||
//===============================================================================================================================================================================================================200
|
||||
|
||||
// #endif
|
||||
|
||||
// # ifdef __cplusplus
|
||||
// }
|
||||
// # endif
|
|
@ -0,0 +1,54 @@
|
|||
//========================================================================================================================================================================================================200
|
||||
// findK function
|
||||
//========================================================================================================================================================================================================200
|
||||
|
||||
// Point lookup: block `bid` searches for keysD[bid] and, when a key of the
// final node equals it, stores the matching record's value in ansD[bid].
//
//   height       number of tree levels to descend
//   knodesD      node array; currKnodeD/offsetD hold indices into it
//   knodes_elem  number of elements in knodesD (child indices are checked
//                against it)
//   recordsD     record array addressed through the final node's indices[]
//   currKnodeD   per-block index of the node examined at the current level
//   offsetD      per-block index of the node chosen for the next level
//   keysD        per-block search key
//   ansD         per-block result; left untouched when no key matches
//
// Each thread examines key slot `thid` (and thid + 1) of the block's node.
// NOTE(review): the descent assumes currKnodeD[bid] == offsetD[bid] on entry
// (it holds after every level) -- confirm against the host code.
__global__ void
findK(	long height,
		knode *knodesD,
		long knodes_elem,
		record *recordsD,

		long *currKnodeD,
		long *offsetD,
		int *keysD,
		record *ansD)
{

	// private thread IDs
	int thid = threadIdx.x;
	int bid = blockIdx.x;

	// process tree levels
	int i;
	for(i = 0; i < height; i++){

		// Node examined at this level. currKnodeD[bid] is only written between
		// the two barriers below, so it is stable for the whole search phase.
		const knode &curr = knodesD[currKnodeD[bid]];

		// if value is between the two keys
		if((curr.keys[thid] <= keysD[bid]) && (curr.keys[thid+1] > keysD[bid])){
			// Guard against a bug in the original code: the values stored in
			// knodes->indices by the main function can exceed the bounds of
			// knodes, and following such an index on the next level would
			// cause a segmentation fault.
			// The child index is read from the same node whose keys were just
			// compared (as findRangeK does) rather than through offsetD[bid],
			// which another thread of the block may be overwriting right now.
			if(curr.indices[thid] < knodes_elem){
				offsetD[bid] = curr.indices[thid];
			}
		}
		__syncthreads();

		// set for next tree level
		if(thid==0){
			currKnodeD[bid] = offsetD[bid];
		}
		__syncthreads();

	}

	// At this point, we have a candidate leaf node which may contain
	// the target record. Check each key to hopefully find the record
	if(knodesD[currKnodeD[bid]].keys[thid] == keysD[bid]){
		ansD[bid].value = recordsD[knodesD[currKnodeD[bid]].indices[thid]].value;
	}

}
|
||||
|
||||
//========================================================================================================================================================================================================200
|
||||
// End
|
||||
//========================================================================================================================================================================================================200
|
|
@ -0,0 +1,70 @@
|
|||
//========================================================================================================================================================================================================200
|
||||
// findRangeK function
|
||||
//========================================================================================================================================================================================================200
|
||||
|
||||
// Range lookup: block `bid` descends the tree twice in lock-step, once
// towards startD[bid] (currKnodeD/offsetD) and once towards endD[bid]
// (lastKnodeD/offset_2D). It then stores the index found for the start key
// in RecstartD[bid] and (end index - start index + 1) in ReclenD[bid].
// Either output is left untouched when no key of the final node equals the
// requested bound. Each thread examines key slot `thid` (and thid + 1).
__global__ void
findRangeK(	long height,

			knode *knodesD,
			long knodes_elem,

			long *currKnodeD,
			long *offsetD,
			long *lastKnodeD,
			long *offset_2D,
			int *startD,
			int *endD,
			int *RecstartD,
			int *ReclenD)
{

	// private thread IDs
	const int thid = threadIdx.x;
	const int bid = blockIdx.x;

	// descend one tree level per iteration, on both paths at once
	for(int level = 0; level < height; level++){

		// Path towards the start key.
		const knode &startNode = knodesD[currKnodeD[bid]];
		if((startNode.keys[thid] <= startD[bid]) && (startNode.keys[thid+1] > startD[bid])){
			// Guard against a bug in the original code: the values stored in
			// knodes->indices by the main function can exceed the bounds of
			// knodes, and using such a value as "offset[bid]" later would
			// cause a segmentation fault.
			const int child = startNode.indices[thid];
			if(child < knodes_elem){
				offsetD[bid] = child;
			}
		}

		// Path towards the end key; same guard, applied to "offset_2[bid]".
		const knode &endNode = knodesD[lastKnodeD[bid]];
		if((endNode.keys[thid] <= endD[bid]) && (endNode.keys[thid+1] > endD[bid])){
			const int child = endNode.indices[thid];
			if(child < knodes_elem){
				offset_2D[bid] = child;
			}
		}
		__syncthreads();

		// set for next tree level
		if(thid==0){
			currKnodeD[bid] = offsetD[bid];
			lastKnodeD[bid] = offset_2D[bid];
		}
		__syncthreads();
	}

	// Find the index of the starting record
	const knode &firstLeaf = knodesD[currKnodeD[bid]];
	if(firstLeaf.keys[thid] == startD[bid]){
		RecstartD[bid] = firstLeaf.indices[thid];
	}
	// RecstartD[bid] must be visible to the whole block before it is read below.
	__syncthreads();

	// Find the index of the ending record
	const knode &lastLeaf = knodesD[lastKnodeD[bid]];
	if(lastLeaf.keys[thid] == endD[bid]){
		ReclenD[bid] = lastLeaf.indices[thid] - RecstartD[bid]+1;
	}

}
|
||||
|
||||
//========================================================================================================================================================================================================200
|
||||
// End
|
||||
//========================================================================================================================================================================================================200
|
|
@ -0,0 +1,292 @@
|
|||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//========================================================================================================================================================================================================200
|
||||
// DEFINE/INCLUDE
|
||||
//========================================================================================================================================================================================================200
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// COMMON
|
||||
//======================================================================================================================================================150
|
||||
|
||||
#include "../common.h" // (in main program directory) needed to recognize input variables
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// UTILITIES
|
||||
//======================================================================================================================================================150
|
||||
|
||||
#include "../util/cuda/cuda.h" // (in path specified to compiler) needed for device functions
|
||||
#include "../util/timer/timer.h" // (in path specified to compiler) needed by timer
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// KERNEL
|
||||
//======================================================================================================================================================150
|
||||
|
||||
#include "./kernel_gpu_cuda.cu" // (in current directory) GPU kernel, cannot include with header file because of complications with passing of constant memory variables
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// HEADER
|
||||
//======================================================================================================================================================150
|
||||
|
||||
#include "./kernel_gpu_cuda_wrapper.h" // (in current directory)
|
||||
|
||||
//========================================================================================================================================================================================================200
|
||||
// KERNEL_GPU_CUDA_WRAPPER FUNCTION
|
||||
//========================================================================================================================================================================================================200
|
||||
|
||||
void
|
||||
kernel_gpu_cuda_wrapper(record *records,
|
||||
long records_mem,
|
||||
knode *knodes,
|
||||
long knodes_elem,
|
||||
long knodes_mem,
|
||||
|
||||
int order,
|
||||
long maxheight,
|
||||
int count,
|
||||
|
||||
long *currKnode,
|
||||
long *offset,
|
||||
int *keys,
|
||||
record *ans)
|
||||
{
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// CPU VARIABLES
|
||||
//======================================================================================================================================================150
|
||||
|
||||
// timer
|
||||
long long time0;
|
||||
long long time1;
|
||||
long long time2;
|
||||
long long time3;
|
||||
long long time4;
|
||||
long long time5;
|
||||
long long time6;
|
||||
|
||||
time0 = get_time();
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// GPU SETUP
|
||||
//======================================================================================================================================================150
|
||||
|
||||
//====================================================================================================100
|
||||
// INITIAL DRIVER OVERHEAD
|
||||
//====================================================================================================100
|
||||
|
||||
cudaThreadSynchronize();
|
||||
|
||||
//====================================================================================================100
|
||||
// EXECUTION PARAMETERS
|
||||
//====================================================================================================100
|
||||
|
||||
int numBlocks;
|
||||
numBlocks = count; // max # of blocks can be 65,535
|
||||
int threadsPerBlock;
|
||||
threadsPerBlock = order < 1024 ? order : 1024;
|
||||
|
||||
printf("# of blocks = %d, # of threads/block = %d (ensure that device can handle)\n", numBlocks, threadsPerBlock);
|
||||
|
||||
time1 = get_time();
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// GPU MEMORY (MALLOC)
|
||||
//======================================================================================================================================================150
|
||||
|
||||
//====================================================================================================100
|
||||
// DEVICE IN
|
||||
//====================================================================================================100
|
||||
|
||||
//==================================================50
|
||||
// recordsD
|
||||
//==================================================50
|
||||
|
||||
record *recordsD;
|
||||
cudaMalloc((void**)&recordsD, records_mem);
|
||||
checkCUDAError("cudaMalloc recordsD");
|
||||
|
||||
//==================================================50
|
||||
// knodesD
|
||||
//==================================================50
|
||||
|
||||
knode *knodesD;
|
||||
cudaMalloc((void**)&knodesD, knodes_mem);
|
||||
checkCUDAError("cudaMalloc recordsD");
|
||||
|
||||
//==================================================50
|
||||
// currKnodeD
|
||||
//==================================================50
|
||||
|
||||
long *currKnodeD;
|
||||
cudaMalloc((void**)&currKnodeD, count*sizeof(long));
|
||||
checkCUDAError("cudaMalloc currKnodeD");
|
||||
|
||||
//==================================================50
|
||||
// offsetD
|
||||
//==================================================50
|
||||
|
||||
long *offsetD;
|
||||
cudaMalloc((void**)&offsetD, count*sizeof(long));
|
||||
checkCUDAError("cudaMalloc offsetD");
|
||||
|
||||
//==================================================50
|
||||
// keysD
|
||||
//==================================================50
|
||||
|
||||
int *keysD;
|
||||
cudaMalloc((void**)&keysD, count*sizeof(int));
|
||||
checkCUDAError("cudaMalloc keysD");
|
||||
|
||||
//====================================================================================================100
|
||||
// DEVICE IN/OUT
|
||||
//====================================================================================================100
|
||||
|
||||
//==================================================50
|
||||
// ansD
|
||||
//==================================================50
|
||||
|
||||
record *ansD;
|
||||
cudaMalloc((void**)&ansD, count*sizeof(record));
|
||||
checkCUDAError("cudaMalloc ansD");
|
||||
|
||||
time2 = get_time();
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// GPU MEMORY COPY
|
||||
//======================================================================================================================================================150
|
||||
|
||||
//====================================================================================================100
|
||||
// GPU MEMORY (MALLOC) COPY IN
|
||||
//====================================================================================================100
|
||||
|
||||
//==================================================50
|
||||
// recordsD
|
||||
//==================================================50
|
||||
|
||||
cudaMemcpy(recordsD, records, records_mem, cudaMemcpyHostToDevice);
|
||||
checkCUDAError("cudaMalloc cudaMemcpy memD");
|
||||
|
||||
//==================================================50
|
||||
// knodesD
|
||||
//==================================================50
|
||||
|
||||
cudaMemcpy(knodesD, knodes, knodes_mem, cudaMemcpyHostToDevice);
|
||||
checkCUDAError("cudaMalloc cudaMemcpy memD");
|
||||
|
||||
//==================================================50
|
||||
// currKnodeD
|
||||
//==================================================50
|
||||
|
||||
cudaMemcpy(currKnodeD, currKnode, count*sizeof(long), cudaMemcpyHostToDevice);
|
||||
checkCUDAError("cudaMalloc cudaMemcpy currKnodeD");
|
||||
|
||||
//==================================================50
|
||||
// offsetD
|
||||
//==================================================50
|
||||
|
||||
cudaMemcpy(offsetD, offset, count*sizeof(long), cudaMemcpyHostToDevice);
|
||||
checkCUDAError("cudaMalloc cudaMemcpy offsetD");
|
||||
|
||||
//==================================================50
|
||||
// keysD
|
||||
//==================================================50
|
||||
|
||||
cudaMemcpy(keysD, keys, count*sizeof(int), cudaMemcpyHostToDevice);
|
||||
checkCUDAError("cudaMalloc cudaMemcpy keysD");
|
||||
|
||||
//====================================================================================================100
|
||||
// DEVICE IN/OUT
|
||||
//====================================================================================================100
|
||||
|
||||
//==================================================50
|
||||
// ansD
|
||||
//==================================================50
|
||||
|
||||
cudaMemcpy(ansD, ans, count*sizeof(record), cudaMemcpyHostToDevice);
|
||||
checkCUDAError("cudaMalloc cudaMemcpy ansD");
|
||||
|
||||
time3 = get_time();
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// findK kernel
|
||||
//======================================================================================================================================================150
|
||||
|
||||
findK<<<numBlocks, threadsPerBlock>>>( maxheight,
|
||||
|
||||
knodesD,
|
||||
knodes_elem,
|
||||
|
||||
recordsD,
|
||||
|
||||
currKnodeD,
|
||||
offsetD,
|
||||
keysD,
|
||||
ansD);
|
||||
cudaThreadSynchronize();
|
||||
checkCUDAError("findK");
|
||||
|
||||
time4 = get_time();
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// GPU MEMORY COPY (CONTD.)
|
||||
//======================================================================================================================================================150
|
||||
|
||||
//====================================================================================================100
|
||||
// DEVICE IN/OUT
|
||||
//====================================================================================================100
|
||||
|
||||
//==================================================50
|
||||
// ansD
|
||||
//==================================================50
|
||||
|
||||
cudaMemcpy(ans, ansD, count*sizeof(record), cudaMemcpyDeviceToHost);
|
||||
checkCUDAError("cudaMemcpy ansD");
|
||||
|
||||
time5 = get_time();
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// GPU MEMORY DEALLOCATION
|
||||
//======================================================================================================================================================150
|
||||
|
||||
cudaFree(recordsD);
|
||||
cudaFree(knodesD);
|
||||
|
||||
cudaFree(currKnodeD);
|
||||
cudaFree(offsetD);
|
||||
cudaFree(keysD);
|
||||
cudaFree(ansD);
|
||||
|
||||
time6 = get_time();
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// DISPLAY TIMING
|
||||
//======================================================================================================================================================150
|
||||
|
||||
printf("Time spent in different stages of GPU_CUDA KERNEL:\n");
|
||||
|
||||
printf("%15.12f s, %15.12f % : GPU: SET DEVICE / DRIVER INIT\n", (float) (time1-time0) / 1000000, (float) (time1-time0) / (float) (time6-time0) * 100);
|
||||
printf("%15.12f s, %15.12f % : GPU MEM: ALO\n", (float) (time2-time1) / 1000000, (float) (time2-time1) / (float) (time6-time0) * 100);
|
||||
printf("%15.12f s, %15.12f % : GPU MEM: COPY IN\n", (float) (time3-time2) / 1000000, (float) (time3-time2) / (float) (time6-time0) * 100);
|
||||
|
||||
printf("%15.12f s, %15.12f % : GPU: KERNEL\n", (float) (time4-time3) / 1000000, (float) (time4-time3) / (float) (time6-time0) * 100);
|
||||
|
||||
printf("%15.12f s, %15.12f % : GPU MEM: COPY OUT\n", (float) (time5-time4) / 1000000, (float) (time5-time4) / (float) (time6-time0) * 100);
|
||||
printf("%15.12f s, %15.12f % : GPU MEM: FRE\n", (float) (time6-time5) / 1000000, (float) (time6-time5) / (float) (time6-time0) * 100);
|
||||
|
||||
printf("Total time:\n");
|
||||
printf("%.12f s\n", (float) (time6-time0) / 1000000);
|
||||
|
||||
//========================================================================================================================================================================================================200
|
||||
// End
|
||||
//========================================================================================================================================================================================================200
|
||||
|
||||
}
|
||||
|
||||
//========================================================================================================================================================================================================200
|
||||
// END
|
||||
//========================================================================================================================================================================================================200
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
|
@ -0,0 +1,23 @@
|
|||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//========================================================================================================================================================================================================200
|
||||
// KERNEL_GPU_CUDA_WRAPPER HEADER
|
||||
//========================================================================================================================================================================================================200
|
||||
|
||||
// Host-side entry point declared with C linkage when compiled as C++.
// Parameters, in order: the host record array and its size in bytes; the host
// knode array, a knode element count and the knode array size in bytes; the
// tree order, maximum height and number of queries (count); and the per-query
// host arrays currKnode, offset, keys and ans.
// NOTE(review): presumably ans receives one result record per query -- confirm
// against the definition in the matching wrapper source file.
void kernel_gpu_cuda_wrapper(record *records, long records_mem, knode *knodes,
|
||||
long knodes_elem, long knodes_mem,
|
||||
|
||||
int order, long maxheight, int count,
|
||||
|
||||
long *currKnode, long *offset, int *keys,
|
||||
record *ans);
|
||||
|
||||
//========================================================================================================================================================================================================200
|
||||
// End
|
||||
//========================================================================================================================================================================================================200
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
|
@ -0,0 +1,347 @@
|
|||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//========================================================================================================================================================================================================200
|
||||
// INCLUDE
|
||||
//========================================================================================================================================================================================================200
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// COMMON
|
||||
//======================================================================================================================================================150
|
||||
|
||||
#include "../common.h" // (in the main program folder) needed to recognize input parameters
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// UTILITIES
|
||||
//======================================================================================================================================================150
|
||||
|
||||
#include "../util/cuda/cuda.h" // (in library path specified to compiler) needed for device functions
|
||||
#include "../util/timer/timer.h" // (in library path specified to compiler) needed by timer
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// KERNEL
|
||||
//======================================================================================================================================================150
|
||||
|
||||
#include "./kernel_gpu_cuda_2.cu" // (in the current directory) GPU kernel, cannot include with header file because of complications with passing of constant memory variables
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// HEADER
|
||||
//======================================================================================================================================================150
|
||||
|
||||
#include "./kernel_gpu_cuda_wrapper_2.h" // (in the current directory)
|
||||
|
||||
//========================================================================================================================================================================================================200
|
||||
// FUNCTION
|
||||
//========================================================================================================================================================================================================200
|
||||
|
||||
void
|
||||
kernel_gpu_cuda_wrapper_2( knode *knodes,
|
||||
long knodes_elem,
|
||||
long knodes_mem,
|
||||
|
||||
int order,
|
||||
long maxheight,
|
||||
int count,
|
||||
|
||||
long *currKnode,
|
||||
long *offset,
|
||||
long *lastKnode,
|
||||
long *offset_2,
|
||||
int *start,
|
||||
int *end,
|
||||
int *recstart,
|
||||
int *reclength)
|
||||
{
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// CPU VARIABLES
|
||||
//======================================================================================================================================================150
|
||||
|
||||
// timer
|
||||
long long time0;
|
||||
long long time1;
|
||||
long long time2;
|
||||
long long time3;
|
||||
long long time4;
|
||||
long long time5;
|
||||
long long time6;
|
||||
|
||||
time0 = get_time();
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// GPU SETUP
|
||||
//======================================================================================================================================================150
|
||||
|
||||
//====================================================================================================100
|
||||
// INITIAL DRIVER OVERHEAD
|
||||
//====================================================================================================100
|
||||
|
||||
cudaThreadSynchronize();
|
||||
|
||||
//====================================================================================================100
|
||||
// EXECUTION PARAMETERS
|
||||
//====================================================================================================100
|
||||
|
||||
int numBlocks;
|
||||
numBlocks = count;
|
||||
int threadsPerBlock;
|
||||
threadsPerBlock = order < 1024 ? order : 1024;
|
||||
|
||||
printf("# of blocks = %d, # of threads/block = %d (ensure that device can handle)\n", numBlocks, threadsPerBlock);
|
||||
|
||||
time1 = get_time();
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// GPU MEMORY MALLOC
|
||||
//======================================================================================================================================================150
|
||||
|
||||
//====================================================================================================100
|
||||
// DEVICE IN
|
||||
//====================================================================================================100
|
||||
|
||||
//==================================================50
|
||||
// knodesD
|
||||
//==================================================50
|
||||
|
||||
knode *knodesD;
|
||||
cudaMalloc((void**)&knodesD, knodes_mem);
|
||||
checkCUDAError("cudaMalloc recordsD");
|
||||
|
||||
//==================================================50
|
||||
// currKnodeD
|
||||
//==================================================50
|
||||
|
||||
long *currKnodeD;
|
||||
cudaMalloc((void**)&currKnodeD, count*sizeof(long));
|
||||
checkCUDAError("cudaMalloc currKnodeD");
|
||||
|
||||
//==================================================50
|
||||
// offsetD
|
||||
//==================================================50
|
||||
|
||||
long *offsetD;
|
||||
cudaMalloc((void**)&offsetD, count*sizeof(long));
|
||||
checkCUDAError("cudaMalloc offsetD");
|
||||
|
||||
//==================================================50
|
||||
// lastKnodeD
|
||||
//==================================================50
|
||||
|
||||
long *lastKnodeD;
|
||||
cudaMalloc((void**)&lastKnodeD, count*sizeof(long));
|
||||
checkCUDAError("cudaMalloc lastKnodeD");
|
||||
|
||||
//==================================================50
|
||||
// offset_2D
|
||||
//==================================================50
|
||||
|
||||
long *offset_2D;
|
||||
cudaMalloc((void**)&offset_2D, count*sizeof(long));
|
||||
checkCUDAError("cudaMalloc offset_2D");
|
||||
|
||||
//==================================================50
|
||||
// startD
|
||||
//==================================================50
|
||||
|
||||
int *startD;
|
||||
cudaMalloc((void**)&startD, count*sizeof(int));
|
||||
checkCUDAError("cudaMalloc startD");
|
||||
|
||||
//==================================================50
|
||||
// endD
|
||||
//==================================================50
|
||||
|
||||
int *endD;
|
||||
cudaMalloc((void**)&endD, count*sizeof(int));
|
||||
checkCUDAError("cudaMalloc endD");
|
||||
|
||||
//====================================================================================================100
|
||||
// DEVICE IN/OUT
|
||||
//====================================================================================================100
|
||||
|
||||
//==================================================50
|
||||
// ansDStart
|
||||
//==================================================50
|
||||
|
||||
int *ansDStart;
|
||||
cudaMalloc((void**)&ansDStart, count*sizeof(int));
|
||||
checkCUDAError("cudaMalloc ansDStart");
|
||||
|
||||
//==================================================50
|
||||
// ansDLength
|
||||
//==================================================50
|
||||
|
||||
int *ansDLength;
|
||||
cudaMalloc((void**)&ansDLength, count*sizeof(int));
|
||||
checkCUDAError("cudaMalloc ansDLength");
|
||||
|
||||
time2 = get_time();
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// GPU MEMORY COPY
|
||||
//======================================================================================================================================================150
|
||||
|
||||
//====================================================================================================100
|
||||
// DEVICE IN
|
||||
//====================================================================================================100
|
||||
|
||||
//==================================================50
|
||||
// knodesD
|
||||
//==================================================50
|
||||
|
||||
cudaMemcpy(knodesD, knodes, knodes_mem, cudaMemcpyHostToDevice);
|
||||
checkCUDAError("cudaMalloc cudaMemcpy memD");
|
||||
|
||||
//==================================================50
|
||||
// currKnodeD
|
||||
//==================================================50
|
||||
|
||||
cudaMemcpy(currKnodeD, currKnode, count*sizeof(long), cudaMemcpyHostToDevice);
|
||||
checkCUDAError("cudaMalloc cudaMemcpy currKnodeD");
|
||||
|
||||
//==================================================50
|
||||
// offsetD
|
||||
//==================================================50
|
||||
|
||||
cudaMemcpy(offsetD, offset, count*sizeof(long), cudaMemcpyHostToDevice);
|
||||
checkCUDAError("cudaMalloc cudaMemcpy offsetD");
|
||||
|
||||
//==================================================50
|
||||
// lastKnodeD
|
||||
//==================================================50
|
||||
|
||||
cudaMemcpy(lastKnodeD, lastKnode, count*sizeof(long), cudaMemcpyHostToDevice);
|
||||
checkCUDAError("cudaMalloc cudaMemcpy lastKnodeD");
|
||||
|
||||
//==================================================50
|
||||
// offset_2D
|
||||
//==================================================50
|
||||
|
||||
cudaMemcpy(offset_2D, offset_2, count*sizeof(long), cudaMemcpyHostToDevice);
|
||||
checkCUDAError("cudaMalloc cudaMemcpy offset_2D");
|
||||
|
||||
//==================================================50
|
||||
// startD
|
||||
//==================================================50
|
||||
|
||||
cudaMemcpy(startD, start, count*sizeof(int), cudaMemcpyHostToDevice);
|
||||
checkCUDAError("cudaMemcpy startD");
|
||||
|
||||
//==================================================50
|
||||
// endD
|
||||
//==================================================50
|
||||
|
||||
cudaMemcpy(endD, end, count*sizeof(int), cudaMemcpyHostToDevice);
|
||||
checkCUDAError("cudaMemcpy endD");
|
||||
|
||||
//====================================================================================================100
|
||||
// DEVICE IN/OUT
|
||||
//====================================================================================================100
|
||||
|
||||
//==================================================50
|
||||
// ansDStart
|
||||
//==================================================50
|
||||
|
||||
cudaMemcpy(ansDStart, recstart, count*sizeof(int), cudaMemcpyHostToDevice);
|
||||
checkCUDAError("cudaMemcpy ansDStart");
|
||||
|
||||
//==================================================50
|
||||
// ansDLength
|
||||
//==================================================50
|
||||
|
||||
cudaMemcpy(ansDLength, reclength, count*sizeof(int), cudaMemcpyHostToDevice);
|
||||
checkCUDAError("cudaMemcpy ansDLength");
|
||||
|
||||
time3 = get_time();
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// KERNEL
|
||||
//======================================================================================================================================================150
|
||||
|
||||
// [GPU] findRangeK kernel
|
||||
findRangeK<<<numBlocks, threadsPerBlock>>>( maxheight,
|
||||
knodesD,
|
||||
knodes_elem,
|
||||
|
||||
currKnodeD,
|
||||
offsetD,
|
||||
lastKnodeD,
|
||||
offset_2D,
|
||||
startD,
|
||||
endD,
|
||||
ansDStart,
|
||||
ansDLength);
|
||||
cudaThreadSynchronize();
|
||||
checkCUDAError("findRangeK");
|
||||
|
||||
time4 = get_time();
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// GPU MEMORY COPY (CONTD.)
|
||||
//======================================================================================================================================================150
|
||||
|
||||
//====================================================================================================100
|
||||
// DEVICE IN/OUT
|
||||
//====================================================================================================100
|
||||
|
||||
//==================================================50
|
||||
// ansDStart
|
||||
//==================================================50
|
||||
|
||||
cudaMemcpy(recstart, ansDStart, count*sizeof(int), cudaMemcpyDeviceToHost);
|
||||
checkCUDAError("cudaMemcpy ansDStart");
|
||||
|
||||
//==================================================50
|
||||
// ansDLength
|
||||
//==================================================50
|
||||
|
||||
cudaMemcpy(reclength, ansDLength, count*sizeof(int), cudaMemcpyDeviceToHost);
|
||||
checkCUDAError("cudaMemcpy ansDLength");
|
||||
|
||||
time5 = get_time();
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// GPU MEMORY DEALLOCATION
|
||||
//======================================================================================================================================================150
|
||||
|
||||
cudaFree(knodesD);
|
||||
|
||||
cudaFree(currKnodeD);
|
||||
cudaFree(offsetD);
|
||||
cudaFree(lastKnodeD);
|
||||
cudaFree(offset_2D);
|
||||
cudaFree(startD);
|
||||
cudaFree(endD);
|
||||
cudaFree(ansDStart);
|
||||
cudaFree(ansDLength);
|
||||
|
||||
time6 = get_time();
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// DISPLAY TIMING
|
||||
//======================================================================================================================================================150
|
||||
|
||||
printf("Time spent in different stages of GPU_CUDA KERNEL:\n");
|
||||
|
||||
printf("%15.12f s, %15.12f % : GPU: SET DEVICE / DRIVER INIT\n", (float) (time1-time0) / 1000000, (float) (time1-time0) / (float) (time6-time0) * 100);
|
||||
printf("%15.12f s, %15.12f % : GPU MEM: ALO\n", (float) (time2-time1) / 1000000, (float) (time2-time1) / (float) (time6-time0) * 100);
|
||||
printf("%15.12f s, %15.12f % : GPU MEM: COPY IN\n", (float) (time3-time2) / 1000000, (float) (time3-time2) / (float) (time6-time0) * 100);
|
||||
|
||||
printf("%15.12f s, %15.12f % : GPU: KERNEL\n", (float) (time4-time3) / 1000000, (float) (time4-time3) / (float) (time6-time0) * 100);
|
||||
|
||||
printf("%15.12f s, %15.12f % : GPU MEM: COPY OUT\n", (float) (time5-time4) / 1000000, (float) (time5-time4) / (float) (time6-time0) * 100);
|
||||
printf("%15.12f s, %15.12f % : GPU MEM: FRE\n", (float) (time6-time5) / 1000000, (float) (time6-time5) / (float) (time6-time0) * 100);
|
||||
|
||||
printf("Total time:\n");
|
||||
printf("%.12f s\n", (float) (time6-time0) / 1000000);
|
||||
|
||||
}
|
||||
|
||||
//========================================================================================================================================================================================================200
|
||||
// END
|
||||
//========================================================================================================================================================================================================200
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
|
@ -0,0 +1,23 @@
|
|||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//========================================================================================================================================================================================================200
|
||||
// KERNEL_GPU_CUDA_WRAPPER_2 HEADER
|
||||
//========================================================================================================================================================================================================200
|
||||
|
||||
// Declaration of the host entry point kernel_gpu_cuda_wrapper_2 (given C linkage
// by the surrounding extern "C" guard when compiled as C++). It returns nothing.
// This header includes nothing itself, so the type knode must already be
// declared before the header is included.
// NOTE(review): the definition is not part of this header; the parameter notes
// below are inferred from the names only and should be confirmed there.
//   knodes, knodes_elem, knodes_mem        : presumably the node array, its element count and its size in bytes
//   order, maxheight                       : presumably the tree order and tree height
//   count                                  : presumably the number of queries processed
//   currKnode, offset, lastKnode, offset_2 : presumably per-query traversal state arrays
//   start, end                             : presumably per-query range bounds
//   recstart, reclength                    : presumably per-query result arrays
void kernel_gpu_cuda_wrapper_2(knode *knodes, long knodes_elem, long knodes_mem,
|
||||
|
||||
int order, long maxheight, int count,
|
||||
|
||||
long *currKnode, long *offset, long *lastKnode,
|
||||
long *offset_2, int *start, int *end,
|
||||
int *recstart, int *reclength);
|
||||
|
||||
//========================================================================================================================================================================================================200
|
||||
// End
|
||||
//========================================================================================================================================================================================================200
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
|
@ -0,0 +1,332 @@
|
|||
; ModuleID = 'kernel_gpu_cuda_wrapper-cuda-nvptx64-nvidia-cuda-sm_61.bc'
|
||||
source_filename = "kernel/kernel_gpu_cuda_wrapper.cu"
|
||||
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
|
||||
target triple = "nvptx64-nvidia-cuda"
|
||||
|
||||
%struct.__cuda_builtin_threadIdx_t = type { i8 }
|
||||
%struct.__cuda_builtin_blockIdx_t = type { i8 }
|
||||
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
|
||||
%struct.knode = type { i32, [257 x i32], [257 x i32], i8, i32 }
|
||||
%struct.record = type { i32 }
|
||||
|
||||
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
|
||||
|
||||
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
|
||||
|
||||
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
|
||||
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Weak definition of cudaMalloc inside this device module. It only spills its
; two arguments to stack slots and unconditionally returns the constant 999;
; nothing is allocated and nothing is written through %p.
; NOTE(review): presumably a compiler-provided device-side placeholder, with 999
; standing for an "unknown error" status - confirm against the CUDA headers.
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
|
||||
entry:
|
||||
%p.addr = alloca i8**, align 8
|
||||
%s.addr = alloca i64, align 8
|
||||
store i8** %p, i8*** %p.addr, align 8
|
||||
store i64 %s, i64* %s.addr, align 8
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Weak definition of cudaFuncGetAttributes inside this device module. It only
; spills its two pointer arguments to stack slots and unconditionally returns
; the constant 999; the attribute struct behind %p is never written.
; NOTE(review): presumably a compiler-provided device-side placeholder, with 999
; standing for an "unknown error" status - confirm against the CUDA headers.
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
|
||||
entry:
|
||||
%p.addr = alloca %struct.cudaFuncAttributes*, align 8
|
||||
%c.addr = alloca i8*, align 8
|
||||
store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
|
||||
store i8* %c, i8** %c.addr, align 8
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Weak definition of cudaDeviceGetAttribute inside this device module. It only
; spills its three arguments to stack slots and unconditionally returns the
; constant 999; nothing is written through %value.
; NOTE(review): presumably a compiler-provided device-side placeholder, with 999
; standing for an "unknown error" status - confirm against the CUDA headers.
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
|
||||
entry:
|
||||
%value.addr = alloca i32*, align 8
|
||||
%attr.addr = alloca i32, align 4
|
||||
%device.addr = alloca i32, align 4
|
||||
store i32* %value, i32** %value.addr, align 8
|
||||
store i32 %attr, i32* %attr.addr, align 4
|
||||
store i32 %device, i32* %device.addr, align 4
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Weak definition of cudaGetDevice inside this device module. It only spills
; its pointer argument to a stack slot and unconditionally returns the constant
; 999; nothing is written through %device.
; NOTE(review): presumably a compiler-provided device-side placeholder, with 999
; standing for an "unknown error" status - confirm against the CUDA headers.
define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
|
||||
entry:
|
||||
%device.addr = alloca i32*, align 8
|
||||
store i32* %device, i32** %device.addr, align 8
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Weak definition of cudaOccupancyMaxActiveBlocksPerMultiprocessor inside this
; device module. It only spills its four arguments to stack slots and
; unconditionally returns the constant 999; nothing is written through
; %numBlocks.
; NOTE(review): presumably a compiler-provided device-side placeholder, with 999
; standing for an "unknown error" status - confirm against the CUDA headers.
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
|
||||
entry:
|
||||
%numBlocks.addr = alloca i32*, align 8
|
||||
%func.addr = alloca i8*, align 8
|
||||
%blockSize.addr = alloca i32, align 4
|
||||
%dynamicSmemSize.addr = alloca i64, align 8
|
||||
store i32* %numBlocks, i32** %numBlocks.addr, align 8
|
||||
store i8* %func, i8** %func.addr, align 8
|
||||
store i32 %blockSize, i32* %blockSize.addr, align 4
|
||||
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Weak definition of cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
; inside this device module. It only spills its five arguments to stack slots
; and unconditionally returns the constant 999; nothing is written through
; %numBlocks.
; NOTE(review): presumably a compiler-provided device-side placeholder, with 999
; standing for an "unknown error" status - confirm against the CUDA headers.
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
|
||||
entry:
|
||||
%numBlocks.addr = alloca i32*, align 8
|
||||
%func.addr = alloca i8*, align 8
|
||||
%blockSize.addr = alloca i32, align 4
|
||||
%dynamicSmemSize.addr = alloca i64, align 8
|
||||
%flags.addr = alloca i32, align 4
|
||||
store i32* %numBlocks, i32** %numBlocks.addr, align 8
|
||||
store i8* %func, i8** %func.addr, align 8
|
||||
store i32 %blockSize, i32* %blockSize.addr, align 4
|
||||
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
|
||||
store i32 %flags, i32* %flags.addr, align 4
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; findK: device function listed as a "kernel" in this module's !nvvm.annotations.
; Unoptimized (optnone) code: every argument and local lives in a stack slot.
;
; thid = result of the threadIdx x fetch helper, bid = result of the blockIdx x
; fetch helper. In %struct.knode, field 1 is the "indices" array and field 2 is
; the "keys" array (names taken from the IR value names below).
;
; For i in [0, height):
;   - if knodesD[currKnodeD[bid]].keys[thid] <= keysD[bid]
;     and knodesD[currKnodeD[bid]].keys[thid+1] > keysD[bid]
;     and knodesD[offsetD[bid]].indices[thid] < knodes_elem,
;     then offsetD[bid] = knodesD[offsetD[bid]].indices[thid];
;   - barrier; thread 0 copies offsetD[bid] into currKnodeD[bid]; barrier.
; After the loop, if knodesD[currKnodeD[bid]].keys[thid] == keysD[bid], the thread
; stores recordsD[knodesD[currKnodeD[bid]].indices[thid]].value into ansD[bid].value.
define dso_local void @findK(i64 %height, %struct.knode* %knodesD, i64 %knodes_elem, %struct.record* %recordsD, i64* %currKnodeD, i64* %offsetD, i32* %keysD, %struct.record* %ansD) #0 {
|
||||
; entry: spill all arguments, read thid/bid, initialise the loop counter i = 0.
entry:
|
||||
%height.addr = alloca i64, align 8
|
||||
%knodesD.addr = alloca %struct.knode*, align 8
|
||||
%knodes_elem.addr = alloca i64, align 8
|
||||
%recordsD.addr = alloca %struct.record*, align 8
|
||||
%currKnodeD.addr = alloca i64*, align 8
|
||||
%offsetD.addr = alloca i64*, align 8
|
||||
%keysD.addr = alloca i32*, align 8
|
||||
%ansD.addr = alloca %struct.record*, align 8
|
||||
%thid = alloca i32, align 4
|
||||
%bid = alloca i32, align 4
|
||||
%i = alloca i32, align 4
|
||||
store i64 %height, i64* %height.addr, align 8
|
||||
store %struct.knode* %knodesD, %struct.knode** %knodesD.addr, align 8
|
||||
store i64 %knodes_elem, i64* %knodes_elem.addr, align 8
|
||||
store %struct.record* %recordsD, %struct.record** %recordsD.addr, align 8
|
||||
store i64* %currKnodeD, i64** %currKnodeD.addr, align 8
|
||||
store i64* %offsetD, i64** %offsetD.addr, align 8
|
||||
store i32* %keysD, i32** %keysD.addr, align 8
|
||||
store %struct.record* %ansD, %struct.record** %ansD.addr, align 8
|
||||
%call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
|
||||
store i32 %call, i32* %thid, align 4
|
||||
%call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
|
||||
store i32 %call1, i32* %bid, align 4
|
||||
store i32 0, i32* %i, align 4
|
||||
br label %for.cond
|
||||
|
||||
; Loop header: keep iterating while (i64) i < height (signed compare).
for.cond: ; preds = %for.inc, %entry
|
||||
%0 = load i32, i32* %i, align 4
|
||||
%conv = sext i32 %0 to i64
|
||||
%1 = load i64, i64* %height.addr, align 8
|
||||
%cmp = icmp slt i64 %conv, %1
|
||||
br i1 %cmp, label %for.body, label %for.end
|
||||
|
||||
; First half of the key test: knodesD[currKnodeD[bid]].keys[thid] <= keysD[bid].
for.body: ; preds = %for.cond
|
||||
%2 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
|
||||
%3 = load i64*, i64** %currKnodeD.addr, align 8
|
||||
%4 = load i32, i32* %bid, align 4
|
||||
%idxprom = sext i32 %4 to i64
|
||||
%arrayidx = getelementptr inbounds i64, i64* %3, i64 %idxprom
|
||||
%5 = load i64, i64* %arrayidx, align 8
|
||||
%arrayidx2 = getelementptr inbounds %struct.knode, %struct.knode* %2, i64 %5
|
||||
%keys = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx2, i32 0, i32 2
|
||||
%6 = load i32, i32* %thid, align 4
|
||||
%idxprom3 = sext i32 %6 to i64
|
||||
%arrayidx4 = getelementptr inbounds [257 x i32], [257 x i32]* %keys, i64 0, i64 %idxprom3
|
||||
%7 = load i32, i32* %arrayidx4, align 4
|
||||
%8 = load i32*, i32** %keysD.addr, align 8
|
||||
%9 = load i32, i32* %bid, align 4
|
||||
%idxprom5 = sext i32 %9 to i64
|
||||
%arrayidx6 = getelementptr inbounds i32, i32* %8, i64 %idxprom5
|
||||
%10 = load i32, i32* %arrayidx6, align 4
|
||||
%cmp7 = icmp sle i32 %7, %10
|
||||
br i1 %cmp7, label %land.lhs.true, label %if.end34
|
||||
|
||||
; Second half of the key test: knodesD[currKnodeD[bid]].keys[thid + 1] > keysD[bid].
land.lhs.true: ; preds = %for.body
|
||||
%11 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
|
||||
%12 = load i64*, i64** %currKnodeD.addr, align 8
|
||||
%13 = load i32, i32* %bid, align 4
|
||||
%idxprom8 = sext i32 %13 to i64
|
||||
%arrayidx9 = getelementptr inbounds i64, i64* %12, i64 %idxprom8
|
||||
%14 = load i64, i64* %arrayidx9, align 8
|
||||
%arrayidx10 = getelementptr inbounds %struct.knode, %struct.knode* %11, i64 %14
|
||||
%keys11 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx10, i32 0, i32 2
|
||||
%15 = load i32, i32* %thid, align 4
|
||||
%add = add nsw i32 %15, 1
|
||||
%idxprom12 = sext i32 %add to i64
|
||||
%arrayidx13 = getelementptr inbounds [257 x i32], [257 x i32]* %keys11, i64 0, i64 %idxprom12
|
||||
%16 = load i32, i32* %arrayidx13, align 4
|
||||
%17 = load i32*, i32** %keysD.addr, align 8
|
||||
%18 = load i32, i32* %bid, align 4
|
||||
%idxprom14 = sext i32 %18 to i64
|
||||
%arrayidx15 = getelementptr inbounds i32, i32* %17, i64 %idxprom14
|
||||
%19 = load i32, i32* %arrayidx15, align 4
|
||||
%cmp16 = icmp sgt i32 %16, %19
|
||||
br i1 %cmp16, label %if.then, label %if.end34
|
||||
|
||||
; Bounds check: knodesD[offsetD[bid]].indices[thid] (sign-extended) < knodes_elem.
; Note the node is selected through offsetD here, not currKnodeD.
if.then: ; preds = %land.lhs.true
|
||||
%20 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
|
||||
%21 = load i64*, i64** %offsetD.addr, align 8
|
||||
%22 = load i32, i32* %bid, align 4
|
||||
%idxprom17 = sext i32 %22 to i64
|
||||
%arrayidx18 = getelementptr inbounds i64, i64* %21, i64 %idxprom17
|
||||
%23 = load i64, i64* %arrayidx18, align 8
|
||||
%arrayidx19 = getelementptr inbounds %struct.knode, %struct.knode* %20, i64 %23
|
||||
%indices = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx19, i32 0, i32 1
|
||||
%24 = load i32, i32* %thid, align 4
|
||||
%idxprom20 = sext i32 %24 to i64
|
||||
%arrayidx21 = getelementptr inbounds [257 x i32], [257 x i32]* %indices, i64 0, i64 %idxprom20
|
||||
%25 = load i32, i32* %arrayidx21, align 4
|
||||
%conv22 = sext i32 %25 to i64
|
||||
%26 = load i64, i64* %knodes_elem.addr, align 8
|
||||
%cmp23 = icmp slt i64 %conv22, %26
|
||||
br i1 %cmp23, label %if.then24, label %if.end
|
||||
|
||||
; offsetD[bid] = (i64) knodesD[offsetD[bid]].indices[thid] (value is re-loaded).
if.then24: ; preds = %if.then
|
||||
%27 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
|
||||
%28 = load i64*, i64** %offsetD.addr, align 8
|
||||
%29 = load i32, i32* %bid, align 4
|
||||
%idxprom25 = sext i32 %29 to i64
|
||||
%arrayidx26 = getelementptr inbounds i64, i64* %28, i64 %idxprom25
|
||||
%30 = load i64, i64* %arrayidx26, align 8
|
||||
%arrayidx27 = getelementptr inbounds %struct.knode, %struct.knode* %27, i64 %30
|
||||
%indices28 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx27, i32 0, i32 1
|
||||
%31 = load i32, i32* %thid, align 4
|
||||
%idxprom29 = sext i32 %31 to i64
|
||||
%arrayidx30 = getelementptr inbounds [257 x i32], [257 x i32]* %indices28, i64 0, i64 %idxprom29
|
||||
%32 = load i32, i32* %arrayidx30, align 4
|
||||
%conv31 = sext i32 %32 to i64
|
||||
%33 = load i64*, i64** %offsetD.addr, align 8
|
||||
%34 = load i32, i32* %bid, align 4
|
||||
%idxprom32 = sext i32 %34 to i64
|
||||
%arrayidx33 = getelementptr inbounds i64, i64* %33, i64 %idxprom32
|
||||
store i64 %conv31, i64* %arrayidx33, align 8
|
||||
br label %if.end
|
||||
|
||||
if.end: ; preds = %if.then24, %if.then
|
||||
br label %if.end34
|
||||
|
||||
; Every path of the iteration reaches this barrier; afterwards only the thread
; with thid == 0 takes the if.then36 branch.
if.end34: ; preds = %if.end, %land.lhs.true, %for.body
|
||||
call void @llvm.nvvm.barrier0()
|
||||
%35 = load i32, i32* %thid, align 4
|
||||
%cmp35 = icmp eq i32 %35, 0
|
||||
br i1 %cmp35, label %if.then36, label %if.end41
|
||||
|
||||
; currKnodeD[bid] = offsetD[bid].
if.then36: ; preds = %if.end34
|
||||
%36 = load i64*, i64** %offsetD.addr, align 8
|
||||
%37 = load i32, i32* %bid, align 4
|
||||
%idxprom37 = sext i32 %37 to i64
|
||||
%arrayidx38 = getelementptr inbounds i64, i64* %36, i64 %idxprom37
|
||||
%38 = load i64, i64* %arrayidx38, align 8
|
||||
%39 = load i64*, i64** %currKnodeD.addr, align 8
|
||||
%40 = load i32, i32* %bid, align 4
|
||||
%idxprom39 = sext i32 %40 to i64
|
||||
%arrayidx40 = getelementptr inbounds i64, i64* %39, i64 %idxprom39
|
||||
store i64 %38, i64* %arrayidx40, align 8
|
||||
br label %if.end41
|
||||
|
||||
; Second barrier of the iteration, executed before the next iteration re-reads
; currKnodeD[bid].
if.end41: ; preds = %if.then36, %if.end34
|
||||
call void @llvm.nvvm.barrier0()
|
||||
br label %for.inc
|
||||
|
||||
; i = i + 1.
for.inc: ; preds = %if.end41
|
||||
%41 = load i32, i32* %i, align 4
|
||||
%inc = add nsw i32 %41, 1
|
||||
store i32 %inc, i32* %i, align 4
|
||||
br label %for.cond
|
||||
|
||||
; After the loop: test knodesD[currKnodeD[bid]].keys[thid] == keysD[bid].
for.end: ; preds = %for.cond
|
||||
%42 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
|
||||
%43 = load i64*, i64** %currKnodeD.addr, align 8
|
||||
%44 = load i32, i32* %bid, align 4
|
||||
%idxprom42 = sext i32 %44 to i64
|
||||
%arrayidx43 = getelementptr inbounds i64, i64* %43, i64 %idxprom42
|
||||
%45 = load i64, i64* %arrayidx43, align 8
|
||||
%arrayidx44 = getelementptr inbounds %struct.knode, %struct.knode* %42, i64 %45
|
||||
%keys45 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx44, i32 0, i32 2
|
||||
%46 = load i32, i32* %thid, align 4
|
||||
%idxprom46 = sext i32 %46 to i64
|
||||
%arrayidx47 = getelementptr inbounds [257 x i32], [257 x i32]* %keys45, i64 0, i64 %idxprom46
|
||||
%47 = load i32, i32* %arrayidx47, align 4
|
||||
%48 = load i32*, i32** %keysD.addr, align 8
|
||||
%49 = load i32, i32* %bid, align 4
|
||||
%idxprom48 = sext i32 %49 to i64
|
||||
%arrayidx49 = getelementptr inbounds i32, i32* %48, i64 %idxprom48
|
||||
%50 = load i32, i32* %arrayidx49, align 4
|
||||
%cmp50 = icmp eq i32 %47, %50
|
||||
br i1 %cmp50, label %if.then51, label %if.end63
|
||||
|
||||
; Key matched: ansD[bid].value = recordsD[knodesD[currKnodeD[bid]].indices[thid]].value.
if.then51: ; preds = %for.end
|
||||
%51 = load %struct.record*, %struct.record** %recordsD.addr, align 8
|
||||
%52 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
|
||||
%53 = load i64*, i64** %currKnodeD.addr, align 8
|
||||
%54 = load i32, i32* %bid, align 4
|
||||
%idxprom52 = sext i32 %54 to i64
|
||||
%arrayidx53 = getelementptr inbounds i64, i64* %53, i64 %idxprom52
|
||||
%55 = load i64, i64* %arrayidx53, align 8
|
||||
%arrayidx54 = getelementptr inbounds %struct.knode, %struct.knode* %52, i64 %55
|
||||
%indices55 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx54, i32 0, i32 1
|
||||
%56 = load i32, i32* %thid, align 4
|
||||
%idxprom56 = sext i32 %56 to i64
|
||||
%arrayidx57 = getelementptr inbounds [257 x i32], [257 x i32]* %indices55, i64 0, i64 %idxprom56
|
||||
%57 = load i32, i32* %arrayidx57, align 4
|
||||
%idxprom58 = sext i32 %57 to i64
|
||||
%arrayidx59 = getelementptr inbounds %struct.record, %struct.record* %51, i64 %idxprom58
|
||||
%value = getelementptr inbounds %struct.record, %struct.record* %arrayidx59, i32 0, i32 0
|
||||
%58 = load i32, i32* %value, align 4
|
||||
%59 = load %struct.record*, %struct.record** %ansD.addr, align 8
|
||||
%60 = load i32, i32* %bid, align 4
|
||||
%idxprom60 = sext i32 %60 to i64
|
||||
%arrayidx61 = getelementptr inbounds %struct.record, %struct.record* %59, i64 %idxprom60
|
||||
%value62 = getelementptr inbounds %struct.record, %struct.record* %arrayidx61, i32 0, i32 0
|
||||
store i32 %58, i32* %value62, align 4
|
||||
br label %if.end63
|
||||
|
||||
if.end63: ; preds = %if.then51, %for.end
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: alwaysinline convergent nounwind
|
||||
; linkonce_odr, alwaysinline helper: returns the result of the
; llvm.nvvm.read.ptx.sreg.tid.x intrinsic unchanged. findK in this module
; stores this value as its thread index (thid).
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
|
||||
entry:
|
||||
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||||
ret i32 %0
|
||||
}
|
||||
|
||||
; Function Attrs: alwaysinline convergent nounwind
|
||||
; linkonce_odr, alwaysinline helper: returns the result of the
; llvm.nvvm.read.ptx.sreg.ctaid.x intrinsic unchanged. findK in this module
; stores this value as its block index (bid).
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
|
||||
entry:
|
||||
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
|
||||
ret i32 %0
|
||||
}
|
||||
|
||||
; Function Attrs: convergent nounwind
|
||||
declare void @llvm.nvvm.barrier0() #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #3
|
||||
|
||||
attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #2 = { convergent nounwind }
|
||||
attributes #3 = { nounwind readnone }
|
||||
|
||||
!llvm.module.flags = !{!0, !1, !2}
|
||||
!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
|
||||
!llvm.ident = !{!8}
|
||||
!nvvmir.version = !{!9}
|
||||
|
||||
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
|
||||
!1 = !{i32 1, !"wchar_size", i32 4}
|
||||
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
|
||||
!3 = !{void (i64, %struct.knode*, i64, %struct.record*, i64*, i64*, i32*, %struct.record*)* @findK, !"kernel", i32 1}
|
||||
!4 = !{null, !"align", i32 8}
|
||||
!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
|
||||
!6 = !{null, !"align", i32 16}
|
||||
!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
|
||||
!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
|
||||
!9 = !{i32 1, i32 4}
|
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,475 @@
|
|||
; ModuleID = 'kernel_gpu_cuda_wrapper_2-cuda-nvptx64-nvidia-cuda-sm_61.bc'
|
||||
source_filename = "kernel/kernel_gpu_cuda_wrapper_2.cu"
|
||||
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
|
||||
target triple = "nvptx64-nvidia-cuda"
|
||||
|
||||
%struct.__cuda_builtin_threadIdx_t = type { i8 }
|
||||
%struct.__cuda_builtin_blockIdx_t = type { i8 }
|
||||
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
|
||||
%struct.knode = type { i32, [257 x i32], [257 x i32], i8, i32 }
|
||||
|
||||
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
|
||||
|
||||
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
|
||||
|
||||
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
|
||||
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Weak definition of cudaMalloc inside this device module. It only spills its
; two arguments to stack slots and unconditionally returns the constant 999;
; nothing is allocated and nothing is written through %p.
; NOTE(review): presumably a compiler-provided device-side placeholder, with 999
; standing for an "unknown error" status - confirm against the CUDA headers.
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
|
||||
entry:
|
||||
%p.addr = alloca i8**, align 8
|
||||
%s.addr = alloca i64, align 8
|
||||
store i8** %p, i8*** %p.addr, align 8
|
||||
store i64 %s, i64* %s.addr, align 8
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Weak definition of cudaFuncGetAttributes inside this device module. It only
; spills its two pointer arguments to stack slots and unconditionally returns
; the constant 999; the attribute struct behind %p is never written.
; NOTE(review): presumably a compiler-provided device-side placeholder, with 999
; standing for an "unknown error" status - confirm against the CUDA headers.
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
|
||||
entry:
|
||||
%p.addr = alloca %struct.cudaFuncAttributes*, align 8
|
||||
%c.addr = alloca i8*, align 8
|
||||
store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
|
||||
store i8* %c, i8** %c.addr, align 8
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Weak definition of cudaDeviceGetAttribute inside this device module. It only
; spills its three arguments to stack slots and unconditionally returns the
; constant 999; nothing is written through %value.
; NOTE(review): presumably a compiler-provided device-side placeholder, with 999
; standing for an "unknown error" status - confirm against the CUDA headers.
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
|
||||
entry:
|
||||
%value.addr = alloca i32*, align 8
|
||||
%attr.addr = alloca i32, align 4
|
||||
%device.addr = alloca i32, align 4
|
||||
store i32* %value, i32** %value.addr, align 8
|
||||
store i32 %attr, i32* %attr.addr, align 4
|
||||
store i32 %device, i32* %device.addr, align 4
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Weak definition of cudaGetDevice inside this device module. It only spills
; its pointer argument to a stack slot and unconditionally returns the constant
; 999; nothing is written through %device.
; NOTE(review): presumably a compiler-provided device-side placeholder, with 999
; standing for an "unknown error" status - confirm against the CUDA headers.
define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
|
||||
entry:
|
||||
%device.addr = alloca i32*, align 8
|
||||
store i32* %device, i32** %device.addr, align 8
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Weak definition of cudaOccupancyMaxActiveBlocksPerMultiprocessor inside this
; device module. It only spills its four arguments to stack slots and
; unconditionally returns the constant 999; nothing is written through
; %numBlocks.
; NOTE(review): presumably a compiler-provided device-side placeholder, with 999
; standing for an "unknown error" status - confirm against the CUDA headers.
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
|
||||
entry:
|
||||
%numBlocks.addr = alloca i32*, align 8
|
||||
%func.addr = alloca i8*, align 8
|
||||
%blockSize.addr = alloca i32, align 4
|
||||
%dynamicSmemSize.addr = alloca i64, align 8
|
||||
store i32* %numBlocks, i32** %numBlocks.addr, align 8
|
||||
store i8* %func, i8** %func.addr, align 8
|
||||
store i32 %blockSize, i32* %blockSize.addr, align 4
|
||||
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Weak definition of cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
; inside this device module. It only spills its five arguments to stack slots
; and unconditionally returns the constant 999; nothing is written through
; %numBlocks.
; NOTE(review): presumably a compiler-provided device-side placeholder, with 999
; standing for an "unknown error" status - confirm against the CUDA headers.
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
|
||||
entry:
|
||||
%numBlocks.addr = alloca i32*, align 8
|
||||
%func.addr = alloca i8*, align 8
|
||||
%blockSize.addr = alloca i32, align 4
|
||||
%dynamicSmemSize.addr = alloca i64, align 8
|
||||
%flags.addr = alloca i32, align 4
|
||||
store i32* %numBlocks, i32** %numBlocks.addr, align 8
|
||||
store i8* %func, i8** %func.addr, align 8
|
||||
store i32 %blockSize, i32* %blockSize.addr, align 4
|
||||
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
|
||||
store i32 %flags, i32* %flags.addr, align 4
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
define dso_local void @findRangeK(i64 %height, %struct.knode* %knodesD, i64 %knodes_elem, i64* %currKnodeD, i64* %offsetD, i64* %lastKnodeD, i64* %offset_2D, i32* %startD, i32* %endD, i32* %RecstartD, i32* %ReclenD) #0 {
|
||||
entry:
|
||||
%height.addr = alloca i64, align 8
|
||||
%knodesD.addr = alloca %struct.knode*, align 8
|
||||
%knodes_elem.addr = alloca i64, align 8
|
||||
%currKnodeD.addr = alloca i64*, align 8
|
||||
%offsetD.addr = alloca i64*, align 8
|
||||
%lastKnodeD.addr = alloca i64*, align 8
|
||||
%offset_2D.addr = alloca i64*, align 8
|
||||
%startD.addr = alloca i32*, align 8
|
||||
%endD.addr = alloca i32*, align 8
|
||||
%RecstartD.addr = alloca i32*, align 8
|
||||
%ReclenD.addr = alloca i32*, align 8
|
||||
%thid = alloca i32, align 4
|
||||
%bid = alloca i32, align 4
|
||||
%i = alloca i32, align 4
|
||||
store i64 %height, i64* %height.addr, align 8
|
||||
store %struct.knode* %knodesD, %struct.knode** %knodesD.addr, align 8
|
||||
store i64 %knodes_elem, i64* %knodes_elem.addr, align 8
|
||||
store i64* %currKnodeD, i64** %currKnodeD.addr, align 8
|
||||
store i64* %offsetD, i64** %offsetD.addr, align 8
|
||||
store i64* %lastKnodeD, i64** %lastKnodeD.addr, align 8
|
||||
store i64* %offset_2D, i64** %offset_2D.addr, align 8
|
||||
store i32* %startD, i32** %startD.addr, align 8
|
||||
store i32* %endD, i32** %endD.addr, align 8
|
||||
store i32* %RecstartD, i32** %RecstartD.addr, align 8
|
||||
store i32* %ReclenD, i32** %ReclenD.addr, align 8
|
||||
%call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
|
||||
store i32 %call, i32* %thid, align 4
|
||||
%call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
|
||||
store i32 %call1, i32* %bid, align 4
|
||||
store i32 0, i32* %i, align 4
|
||||
br label %for.cond
|
||||
|
||||
for.cond: ; preds = %for.inc, %entry
|
||||
%0 = load i32, i32* %i, align 4
|
||||
%conv = sext i32 %0 to i64
|
||||
%1 = load i64, i64* %height.addr, align 8
|
||||
%cmp = icmp slt i64 %conv, %1
|
||||
br i1 %cmp, label %for.body, label %for.end
|
||||
|
||||
for.body: ; preds = %for.cond
|
||||
%2 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
|
||||
%3 = load i64*, i64** %currKnodeD.addr, align 8
|
||||
%4 = load i32, i32* %bid, align 4
|
||||
%idxprom = sext i32 %4 to i64
|
||||
%arrayidx = getelementptr inbounds i64, i64* %3, i64 %idxprom
|
||||
%5 = load i64, i64* %arrayidx, align 8
|
||||
%arrayidx2 = getelementptr inbounds %struct.knode, %struct.knode* %2, i64 %5
|
||||
%keys = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx2, i32 0, i32 2
|
||||
%6 = load i32, i32* %thid, align 4
|
||||
%idxprom3 = sext i32 %6 to i64
|
||||
%arrayidx4 = getelementptr inbounds [257 x i32], [257 x i32]* %keys, i64 0, i64 %idxprom3
|
||||
%7 = load i32, i32* %arrayidx4, align 4
|
||||
%8 = load i32*, i32** %startD.addr, align 8
|
||||
%9 = load i32, i32* %bid, align 4
|
||||
%idxprom5 = sext i32 %9 to i64
|
||||
%arrayidx6 = getelementptr inbounds i32, i32* %8, i64 %idxprom5
|
||||
%10 = load i32, i32* %arrayidx6, align 4
|
||||
%cmp7 = icmp sle i32 %7, %10
|
||||
br i1 %cmp7, label %land.lhs.true, label %if.end34
|
||||
|
||||
land.lhs.true: ; preds = %for.body
|
||||
%11 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
|
||||
%12 = load i64*, i64** %currKnodeD.addr, align 8
|
||||
%13 = load i32, i32* %bid, align 4
|
||||
%idxprom8 = sext i32 %13 to i64
|
||||
%arrayidx9 = getelementptr inbounds i64, i64* %12, i64 %idxprom8
|
||||
%14 = load i64, i64* %arrayidx9, align 8
|
||||
%arrayidx10 = getelementptr inbounds %struct.knode, %struct.knode* %11, i64 %14
|
||||
%keys11 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx10, i32 0, i32 2
|
||||
%15 = load i32, i32* %thid, align 4
|
||||
%add = add nsw i32 %15, 1
|
||||
%idxprom12 = sext i32 %add to i64
|
||||
%arrayidx13 = getelementptr inbounds [257 x i32], [257 x i32]* %keys11, i64 0, i64 %idxprom12
|
||||
%16 = load i32, i32* %arrayidx13, align 4
|
||||
%17 = load i32*, i32** %startD.addr, align 8
|
||||
%18 = load i32, i32* %bid, align 4
|
||||
%idxprom14 = sext i32 %18 to i64
|
||||
%arrayidx15 = getelementptr inbounds i32, i32* %17, i64 %idxprom14
|
||||
%19 = load i32, i32* %arrayidx15, align 4
|
||||
%cmp16 = icmp sgt i32 %16, %19
|
||||
br i1 %cmp16, label %if.then, label %if.end34
|
||||
|
||||
if.then: ; preds = %land.lhs.true
|
||||
%20 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
|
||||
%21 = load i64*, i64** %currKnodeD.addr, align 8
|
||||
%22 = load i32, i32* %bid, align 4
|
||||
%idxprom17 = sext i32 %22 to i64
|
||||
%arrayidx18 = getelementptr inbounds i64, i64* %21, i64 %idxprom17
|
||||
%23 = load i64, i64* %arrayidx18, align 8
|
||||
%arrayidx19 = getelementptr inbounds %struct.knode, %struct.knode* %20, i64 %23
|
||||
%indices = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx19, i32 0, i32 1
|
||||
%24 = load i32, i32* %thid, align 4
|
||||
%idxprom20 = sext i32 %24 to i64
|
||||
%arrayidx21 = getelementptr inbounds [257 x i32], [257 x i32]* %indices, i64 0, i64 %idxprom20
|
||||
%25 = load i32, i32* %arrayidx21, align 4
|
||||
%conv22 = sext i32 %25 to i64
|
||||
%26 = load i64, i64* %knodes_elem.addr, align 8
|
||||
%cmp23 = icmp slt i64 %conv22, %26
|
||||
br i1 %cmp23, label %if.then24, label %if.end
|
||||
|
||||
if.then24: ; preds = %if.then
|
||||
%27 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
|
||||
%28 = load i64*, i64** %currKnodeD.addr, align 8
|
||||
%29 = load i32, i32* %bid, align 4
|
||||
%idxprom25 = sext i32 %29 to i64
|
||||
%arrayidx26 = getelementptr inbounds i64, i64* %28, i64 %idxprom25
|
||||
%30 = load i64, i64* %arrayidx26, align 8
|
||||
%arrayidx27 = getelementptr inbounds %struct.knode, %struct.knode* %27, i64 %30
|
||||
%indices28 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx27, i32 0, i32 1
|
||||
%31 = load i32, i32* %thid, align 4
|
||||
%idxprom29 = sext i32 %31 to i64
|
||||
%arrayidx30 = getelementptr inbounds [257 x i32], [257 x i32]* %indices28, i64 0, i64 %idxprom29
|
||||
%32 = load i32, i32* %arrayidx30, align 4
|
||||
%conv31 = sext i32 %32 to i64
|
||||
%33 = load i64*, i64** %offsetD.addr, align 8
|
||||
%34 = load i32, i32* %bid, align 4
|
||||
%idxprom32 = sext i32 %34 to i64
|
||||
%arrayidx33 = getelementptr inbounds i64, i64* %33, i64 %idxprom32
|
||||
store i64 %conv31, i64* %arrayidx33, align 8
|
||||
br label %if.end
|
||||
|
||||
if.end: ; preds = %if.then24, %if.then
|
||||
br label %if.end34
|
||||
|
||||
if.end34: ; preds = %if.end, %land.lhs.true, %for.body
|
||||
%35 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
|
||||
%36 = load i64*, i64** %lastKnodeD.addr, align 8
|
||||
%37 = load i32, i32* %bid, align 4
|
||||
%idxprom35 = sext i32 %37 to i64
|
||||
%arrayidx36 = getelementptr inbounds i64, i64* %36, i64 %idxprom35
|
||||
%38 = load i64, i64* %arrayidx36, align 8
|
||||
%arrayidx37 = getelementptr inbounds %struct.knode, %struct.knode* %35, i64 %38
|
||||
%keys38 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx37, i32 0, i32 2
|
||||
%39 = load i32, i32* %thid, align 4
|
||||
%idxprom39 = sext i32 %39 to i64
|
||||
%arrayidx40 = getelementptr inbounds [257 x i32], [257 x i32]* %keys38, i64 0, i64 %idxprom39
|
||||
%40 = load i32, i32* %arrayidx40, align 4
|
||||
%41 = load i32*, i32** %endD.addr, align 8
|
||||
%42 = load i32, i32* %bid, align 4
|
||||
%idxprom41 = sext i32 %42 to i64
|
||||
%arrayidx42 = getelementptr inbounds i32, i32* %41, i64 %idxprom41
|
||||
%43 = load i32, i32* %arrayidx42, align 4
|
||||
%cmp43 = icmp sle i32 %40, %43
|
||||
br i1 %cmp43, label %land.lhs.true44, label %if.end75
|
||||
|
||||
land.lhs.true44: ; preds = %if.end34
|
||||
%44 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
|
||||
%45 = load i64*, i64** %lastKnodeD.addr, align 8
|
||||
%46 = load i32, i32* %bid, align 4
|
||||
%idxprom45 = sext i32 %46 to i64
|
||||
%arrayidx46 = getelementptr inbounds i64, i64* %45, i64 %idxprom45
|
||||
%47 = load i64, i64* %arrayidx46, align 8
|
||||
%arrayidx47 = getelementptr inbounds %struct.knode, %struct.knode* %44, i64 %47
|
||||
%keys48 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx47, i32 0, i32 2
|
||||
%48 = load i32, i32* %thid, align 4
|
||||
%add49 = add nsw i32 %48, 1
|
||||
%idxprom50 = sext i32 %add49 to i64
|
||||
%arrayidx51 = getelementptr inbounds [257 x i32], [257 x i32]* %keys48, i64 0, i64 %idxprom50
|
||||
%49 = load i32, i32* %arrayidx51, align 4
|
||||
%50 = load i32*, i32** %endD.addr, align 8
|
||||
%51 = load i32, i32* %bid, align 4
|
||||
%idxprom52 = sext i32 %51 to i64
|
||||
%arrayidx53 = getelementptr inbounds i32, i32* %50, i64 %idxprom52
|
||||
%52 = load i32, i32* %arrayidx53, align 4
|
||||
%cmp54 = icmp sgt i32 %49, %52
|
||||
br i1 %cmp54, label %if.then55, label %if.end75
|
||||
|
||||
if.then55: ; preds = %land.lhs.true44
|
||||
%53 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
|
||||
%54 = load i64*, i64** %lastKnodeD.addr, align 8
|
||||
%55 = load i32, i32* %bid, align 4
|
||||
%idxprom56 = sext i32 %55 to i64
|
||||
%arrayidx57 = getelementptr inbounds i64, i64* %54, i64 %idxprom56
|
||||
%56 = load i64, i64* %arrayidx57, align 8
|
||||
%arrayidx58 = getelementptr inbounds %struct.knode, %struct.knode* %53, i64 %56
|
||||
%indices59 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx58, i32 0, i32 1
|
||||
%57 = load i32, i32* %thid, align 4
|
||||
%idxprom60 = sext i32 %57 to i64
|
||||
%arrayidx61 = getelementptr inbounds [257 x i32], [257 x i32]* %indices59, i64 0, i64 %idxprom60
|
||||
%58 = load i32, i32* %arrayidx61, align 4
|
||||
%conv62 = sext i32 %58 to i64
|
||||
%59 = load i64, i64* %knodes_elem.addr, align 8
|
||||
%cmp63 = icmp slt i64 %conv62, %59
|
||||
br i1 %cmp63, label %if.then64, label %if.end74
|
||||
|
||||
if.then64: ; preds = %if.then55
|
||||
%60 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
|
||||
%61 = load i64*, i64** %lastKnodeD.addr, align 8
|
||||
%62 = load i32, i32* %bid, align 4
|
||||
%idxprom65 = sext i32 %62 to i64
|
||||
%arrayidx66 = getelementptr inbounds i64, i64* %61, i64 %idxprom65
|
||||
%63 = load i64, i64* %arrayidx66, align 8
|
||||
%arrayidx67 = getelementptr inbounds %struct.knode, %struct.knode* %60, i64 %63
|
||||
%indices68 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx67, i32 0, i32 1
|
||||
%64 = load i32, i32* %thid, align 4
|
||||
%idxprom69 = sext i32 %64 to i64
|
||||
%arrayidx70 = getelementptr inbounds [257 x i32], [257 x i32]* %indices68, i64 0, i64 %idxprom69
|
||||
%65 = load i32, i32* %arrayidx70, align 4
|
||||
%conv71 = sext i32 %65 to i64
|
||||
%66 = load i64*, i64** %offset_2D.addr, align 8
|
||||
%67 = load i32, i32* %bid, align 4
|
||||
%idxprom72 = sext i32 %67 to i64
|
||||
%arrayidx73 = getelementptr inbounds i64, i64* %66, i64 %idxprom72
|
||||
store i64 %conv71, i64* %arrayidx73, align 8
|
||||
br label %if.end74
|
||||
|
||||
if.end74: ; preds = %if.then64, %if.then55
|
||||
br label %if.end75
|
||||
|
||||
if.end75: ; preds = %if.end74, %land.lhs.true44, %if.end34
|
||||
call void @llvm.nvvm.barrier0()
|
||||
%68 = load i32, i32* %thid, align 4
|
||||
%cmp76 = icmp eq i32 %68, 0
|
||||
br i1 %cmp76, label %if.then77, label %if.end86
|
||||
|
||||
if.then77: ; preds = %if.end75
|
||||
%69 = load i64*, i64** %offsetD.addr, align 8
|
||||
%70 = load i32, i32* %bid, align 4
|
||||
%idxprom78 = sext i32 %70 to i64
|
||||
%arrayidx79 = getelementptr inbounds i64, i64* %69, i64 %idxprom78
|
||||
%71 = load i64, i64* %arrayidx79, align 8
|
||||
%72 = load i64*, i64** %currKnodeD.addr, align 8
|
||||
%73 = load i32, i32* %bid, align 4
|
||||
%idxprom80 = sext i32 %73 to i64
|
||||
%arrayidx81 = getelementptr inbounds i64, i64* %72, i64 %idxprom80
|
||||
store i64 %71, i64* %arrayidx81, align 8
|
||||
%74 = load i64*, i64** %offset_2D.addr, align 8
|
||||
%75 = load i32, i32* %bid, align 4
|
||||
%idxprom82 = sext i32 %75 to i64
|
||||
%arrayidx83 = getelementptr inbounds i64, i64* %74, i64 %idxprom82
|
||||
%76 = load i64, i64* %arrayidx83, align 8
|
||||
%77 = load i64*, i64** %lastKnodeD.addr, align 8
|
||||
%78 = load i32, i32* %bid, align 4
|
||||
%idxprom84 = sext i32 %78 to i64
|
||||
%arrayidx85 = getelementptr inbounds i64, i64* %77, i64 %idxprom84
|
||||
store i64 %76, i64* %arrayidx85, align 8
|
||||
br label %if.end86
|
||||
|
||||
if.end86: ; preds = %if.then77, %if.end75
|
||||
call void @llvm.nvvm.barrier0()
|
||||
br label %for.inc
|
||||
|
||||
for.inc: ; preds = %if.end86
|
||||
%79 = load i32, i32* %i, align 4
|
||||
%inc = add nsw i32 %79, 1
|
||||
store i32 %inc, i32* %i, align 4
|
||||
br label %for.cond
|
||||
|
||||
for.end: ; preds = %for.cond
|
||||
%80 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
|
||||
%81 = load i64*, i64** %currKnodeD.addr, align 8
|
||||
%82 = load i32, i32* %bid, align 4
|
||||
%idxprom87 = sext i32 %82 to i64
|
||||
%arrayidx88 = getelementptr inbounds i64, i64* %81, i64 %idxprom87
|
||||
%83 = load i64, i64* %arrayidx88, align 8
|
||||
%arrayidx89 = getelementptr inbounds %struct.knode, %struct.knode* %80, i64 %83
|
||||
%keys90 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx89, i32 0, i32 2
|
||||
%84 = load i32, i32* %thid, align 4
|
||||
%idxprom91 = sext i32 %84 to i64
|
||||
%arrayidx92 = getelementptr inbounds [257 x i32], [257 x i32]* %keys90, i64 0, i64 %idxprom91
|
||||
%85 = load i32, i32* %arrayidx92, align 4
|
||||
%86 = load i32*, i32** %startD.addr, align 8
|
||||
%87 = load i32, i32* %bid, align 4
|
||||
%idxprom93 = sext i32 %87 to i64
|
||||
%arrayidx94 = getelementptr inbounds i32, i32* %86, i64 %idxprom93
|
||||
%88 = load i32, i32* %arrayidx94, align 4
|
||||
%cmp95 = icmp eq i32 %85, %88
|
||||
br i1 %cmp95, label %if.then96, label %if.end105
|
||||
|
||||
if.then96: ; preds = %for.end
|
||||
%89 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
|
||||
%90 = load i64*, i64** %currKnodeD.addr, align 8
|
||||
%91 = load i32, i32* %bid, align 4
|
||||
%idxprom97 = sext i32 %91 to i64
|
||||
%arrayidx98 = getelementptr inbounds i64, i64* %90, i64 %idxprom97
|
||||
%92 = load i64, i64* %arrayidx98, align 8
|
||||
%arrayidx99 = getelementptr inbounds %struct.knode, %struct.knode* %89, i64 %92
|
||||
%indices100 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx99, i32 0, i32 1
|
||||
%93 = load i32, i32* %thid, align 4
|
||||
%idxprom101 = sext i32 %93 to i64
|
||||
%arrayidx102 = getelementptr inbounds [257 x i32], [257 x i32]* %indices100, i64 0, i64 %idxprom101
|
||||
%94 = load i32, i32* %arrayidx102, align 4
|
||||
%95 = load i32*, i32** %RecstartD.addr, align 8
|
||||
%96 = load i32, i32* %bid, align 4
|
||||
%idxprom103 = sext i32 %96 to i64
|
||||
%arrayidx104 = getelementptr inbounds i32, i32* %95, i64 %idxprom103
|
||||
store i32 %94, i32* %arrayidx104, align 4
|
||||
br label %if.end105
|
||||
|
||||
if.end105: ; preds = %if.then96, %for.end
|
||||
call void @llvm.nvvm.barrier0()
|
||||
%97 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
|
||||
%98 = load i64*, i64** %lastKnodeD.addr, align 8
|
||||
%99 = load i32, i32* %bid, align 4
|
||||
%idxprom106 = sext i32 %99 to i64
|
||||
%arrayidx107 = getelementptr inbounds i64, i64* %98, i64 %idxprom106
|
||||
%100 = load i64, i64* %arrayidx107, align 8
|
||||
%arrayidx108 = getelementptr inbounds %struct.knode, %struct.knode* %97, i64 %100
|
||||
%keys109 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx108, i32 0, i32 2
|
||||
%101 = load i32, i32* %thid, align 4
|
||||
%idxprom110 = sext i32 %101 to i64
|
||||
%arrayidx111 = getelementptr inbounds [257 x i32], [257 x i32]* %keys109, i64 0, i64 %idxprom110
|
||||
%102 = load i32, i32* %arrayidx111, align 4
|
||||
%103 = load i32*, i32** %endD.addr, align 8
|
||||
%104 = load i32, i32* %bid, align 4
|
||||
%idxprom112 = sext i32 %104 to i64
|
||||
%arrayidx113 = getelementptr inbounds i32, i32* %103, i64 %idxprom112
|
||||
%105 = load i32, i32* %arrayidx113, align 4
|
||||
%cmp114 = icmp eq i32 %102, %105
|
||||
br i1 %cmp114, label %if.then115, label %if.end127
|
||||
|
||||
if.then115: ; preds = %if.end105
|
||||
%106 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
|
||||
%107 = load i64*, i64** %lastKnodeD.addr, align 8
|
||||
%108 = load i32, i32* %bid, align 4
|
||||
%idxprom116 = sext i32 %108 to i64
|
||||
%arrayidx117 = getelementptr inbounds i64, i64* %107, i64 %idxprom116
|
||||
%109 = load i64, i64* %arrayidx117, align 8
|
||||
%arrayidx118 = getelementptr inbounds %struct.knode, %struct.knode* %106, i64 %109
|
||||
%indices119 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx118, i32 0, i32 1
|
||||
%110 = load i32, i32* %thid, align 4
|
||||
%idxprom120 = sext i32 %110 to i64
|
||||
%arrayidx121 = getelementptr inbounds [257 x i32], [257 x i32]* %indices119, i64 0, i64 %idxprom120
|
||||
%111 = load i32, i32* %arrayidx121, align 4
|
||||
%112 = load i32*, i32** %RecstartD.addr, align 8
|
||||
%113 = load i32, i32* %bid, align 4
|
||||
%idxprom122 = sext i32 %113 to i64
|
||||
%arrayidx123 = getelementptr inbounds i32, i32* %112, i64 %idxprom122
|
||||
%114 = load i32, i32* %arrayidx123, align 4
|
||||
%sub = sub nsw i32 %111, %114
|
||||
%add124 = add nsw i32 %sub, 1
|
||||
%115 = load i32*, i32** %ReclenD.addr, align 8
|
||||
%116 = load i32, i32* %bid, align 4
|
||||
%idxprom125 = sext i32 %116 to i64
|
||||
%arrayidx126 = getelementptr inbounds i32, i32* %115, i64 %idxprom125
|
||||
store i32 %add124, i32* %arrayidx126, align 4
|
||||
br label %if.end127
|
||||
|
||||
if.end127: ; preds = %if.then115, %if.end105
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: alwaysinline convergent nounwind
|
||||
; Itanium-mangled __cuda_builtin_threadIdx_t::__fetch_builtin_x().
; Takes no arguments and returns, as an i32, the result of the
; @llvm.nvvm.read.ptx.sreg.tid.x intrinsic (the PTX "tid.x" special register).
; Attribute group #1 marks it alwaysinline.
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
|
||||
entry:
|
||||
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||||
ret i32 %0
|
||||
}
|
||||
|
||||
; Function Attrs: alwaysinline convergent nounwind
|
||||
; Itanium-mangled __cuda_builtin_blockIdx_t::__fetch_builtin_x().
; Takes no arguments and returns, as an i32, the result of the
; @llvm.nvvm.read.ptx.sreg.ctaid.x intrinsic (the PTX "ctaid.x" special register).
; Attribute group #1 marks it alwaysinline.
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
|
||||
entry:
|
||||
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
|
||||
ret i32 %0
|
||||
}
|
||||
|
||||
; Function Attrs: convergent nounwind
|
||||
declare void @llvm.nvvm.barrier0() #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #3
|
||||
|
||||
attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #2 = { convergent nounwind }
|
||||
attributes #3 = { nounwind readnone }
|
||||
|
||||
!llvm.module.flags = !{!0, !1, !2}
|
||||
!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
|
||||
!llvm.ident = !{!8}
|
||||
!nvvmir.version = !{!9}
|
||||
|
||||
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
|
||||
!1 = !{i32 1, !"wchar_size", i32 4}
|
||||
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
|
||||
!3 = !{void (i64, %struct.knode*, i64, i64*, i64*, i64*, i64*, i32*, i32*, i32*, i32*)* @findRangeK, !"kernel", i32 1}
|
||||
!4 = !{null, !"align", i32 8}
|
||||
!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
|
||||
!6 = !{null, !"align", i32 16}
|
||||
!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
|
||||
!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
|
||||
!9 = !{i32 1, i32 4}
|
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,40 @@
|
|||
#!/bin/bash
# Builds the b+tree example against the x86 runtime, runs it on the Rodinia
# data set and checks output.txt for the expected result line.
# All paths are relative, so run this from the example's own directory.
set -e

build_dir=../../build
data_dir=../../rodinia-data/b+tree
wrapper=kernel_gpu_cuda_wrapper
device_tag=cuda-nvptx64-nvidia-cuda-sm_61
host_tag=host-x86_64-unknown-linux-gnu

# Plain C sources -> LLVM bitcode (timer.bc, num.bc).
for c_source in util/timer/timer.c util/num/num.c; do
  clang -c -emit-llvm "${c_source}"
done

# Disabled CUDA front-end invocations, kept for reference.
# NOTE(review): the .ll files assembled below were presumably produced by the
# -save-temps commands here -- confirm before re-enabling.
#clang -c -emit-llvm util/cuda/cuda.cu --cuda-gpu-arch=sm_61
#clang -c -emit-llvm kernel/kernel_gpu_cuda_wrapper.cu --cuda-gpu-arch=sm_61
#clang++ kernel/kernel_gpu_cuda_wrapper.cu --cuda-path=/usr/local/cuda-10.1 --cuda-gpu-arch=sm_61 -L/usr/local/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v
#clang++ kernel/kernel_gpu_cuda_wrapper_2.cu --cuda-path=/usr/local/cuda-10.1 --cuda-gpu-arch=sm_61 -L/usr/local/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v
clang -c -emit-llvm main.c

# Textual IR -> bitcode for both kernels, device side first, then host side.
for ir_stem in \
  "${wrapper}-${device_tag}" \
  "${wrapper}_2-${device_tag}" \
  "${wrapper}-${host_tag}" \
  "${wrapper}_2-${host_tag}"; do
  llvm-as "${ir_stem}.ll"
done

# Run the project's translators over the device and host bitcode.
"${build_dir}/compilation/kernelTranslator" "${wrapper}-${device_tag}.bc" kernel1.bc
"${build_dir}/compilation/kernelTranslator" "${wrapper}_2-${device_tag}.bc" kernel2.bc
"${build_dir}/compilation/hostTranslator" "${wrapper}-${host_tag}.bc" host1.bc
"${build_dir}/compilation/hostTranslator" "${wrapper}_2-${host_tag}.bc" host2.bc

# Bitcode -> position-independent object files.
for module in main cuda num timer kernel1 kernel2 host1 host2; do
  llc --relocation-model=pic --filetype=obj "${module}.bc"
done

# Link against the runtime libraries and make them visible to the loader.
export LD_LIBRARY_PATH="${build_dir}/runtime:${build_dir}/runtime/threadPool:${LD_LIBRARY_PATH}"
g++ -Wall -L"${build_dir}/runtime" -L"${build_dir}/runtime/threadPool" -o b+tree.out \
  -fPIC -no-pie main.o host1.o host2.o kernel1.o kernel2.o cuda.o num.o timer.o \
  -lc -lx86Runtime -lthreadPool -lpthread

./b+tree.out file "${data_dir}/mil.txt" \
  command "${data_dir}/command.txt"

# The run is accepted only if output.txt contains the reference line.
if ! grep -q "0 840187 6001" output.txt; then
  echo "Error result"
  exit 1
fi
echo "Pass"
|
|
@ -0,0 +1,75 @@
|
|||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//===============================================================================================================================================================================================================200
|
||||
// SET_DEVICE CODE
|
||||
//===============================================================================================================================================================================================================200
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// INCLUDE/DEFINE
|
||||
//======================================================================================================================================================150
|
||||
|
||||
#include "cuda.h" // (in library path specified to compiler)
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// FUNCTIONS
|
||||
//======================================================================================================================================================150
|
||||
|
||||
//====================================================================================================100
|
||||
// SET DEVICE
|
||||
//====================================================================================================100
|
||||
|
||||
void setdevice(void){
|
||||
|
||||
// variables
|
||||
int num_devices;
|
||||
int device;
|
||||
|
||||
// work
|
||||
cudaGetDeviceCount(&num_devices);
|
||||
if (num_devices > 1) {
|
||||
|
||||
// variables
|
||||
int max_multiprocessors;
|
||||
int max_device;
|
||||
cudaDeviceProp properties;
|
||||
|
||||
// initialize variables
|
||||
max_multiprocessors = 0;
|
||||
max_device = 0;
|
||||
|
||||
for (device = 0; device < num_devices; device++) {
|
||||
cudaGetDeviceProperties(&properties, device);
|
||||
if (max_multiprocessors < properties.multiProcessorCount) {
|
||||
max_multiprocessors = properties.multiProcessorCount;
|
||||
max_device = device;
|
||||
}
|
||||
}
|
||||
cudaSetDevice(max_device);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//====================================================================================================100
|
||||
// GET LAST ERROR
|
||||
//====================================================================================================100
|
||||
|
||||
void checkCUDAError(const char *msg)
|
||||
{
|
||||
cudaError_t err = cudaGetLastError();
|
||||
if( cudaSuccess != err) {
|
||||
// fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
|
||||
printf("Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
|
||||
fflush(NULL);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
|
||||
//===============================================================================================================================================================================================================200
|
||||
// END
|
||||
//===============================================================================================================================================================================================================200
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
|
@ -0,0 +1,37 @@
|
|||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//===============================================================================================================================================================================================================200
|
||||
// SET_DEVICE HEADER
|
||||
//===============================================================================================================================================================================================================200
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// INCLUDE/DEFINE
|
||||
//======================================================================================================================================================150
|
||||
|
||||
#include <stdio.h> // (in library path known to compiler) needed by printf
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// FUNCTION PROTOTYPES
|
||||
//======================================================================================================================================================150
|
||||
|
||||
//====================================================================================================100
|
||||
// SET DEVICE
|
||||
//====================================================================================================100
|
||||
|
||||
// Queries the CUDA device count and, when more than one device is present,
// calls cudaSetDevice on the device with the largest multiProcessorCount.
void setdevice(void);
|
||||
|
||||
//====================================================================================================100
|
||||
// GET LAST ERROR
|
||||
//====================================================================================================100
|
||||
|
||||
// If cudaGetLastError() reports anything other than cudaSuccess, prints
// "Cuda error: <msg>: <error string>." with printf, flushes all streams and
// exits with EXIT_FAILURE; otherwise returns without printing.
void checkCUDAError(const char *msg);
|
||||
|
||||
//===============================================================================================================================================================================================================200
|
||||
// END SET_DEVICE HEADER
|
||||
//===============================================================================================================================================================================================================200
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
|
@ -0,0 +1,55 @@
|
|||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//===============================================================================================================================================================================================================200
|
||||
// DESCRIPTION
|
||||
//===============================================================================================================================================================================================================200
|
||||
|
||||
// Returns: 0 if string does not represent integer
|
||||
// 1 if string represents integer
|
||||
|
||||
//===============================================================================================================================================================================================================200
|
||||
// NUM CODE
|
||||
//===============================================================================================================================================================================================================200
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// ISINTEGER FUNCTION
|
||||
//======================================================================================================================================================150
|
||||
|
||||
int isInteger(char *str) {
|
||||
|
||||
//====================================================================================================100
|
||||
// make sure it's not empty
|
||||
//====================================================================================================100
|
||||
|
||||
if (*str == '\0') {
|
||||
return 0;
|
||||
}
|
||||
|
||||
//====================================================================================================100
|
||||
// if any digit is not a number, return false
|
||||
//====================================================================================================100
|
||||
|
||||
for (; *str != '\0'; str++) {
|
||||
if (*str < 48 ||
|
||||
*str >
|
||||
57) { // digit characters (need to include . if checking for float)
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
//====================================================================================================100
|
||||
// it got past all my checks so I think it's a number
|
||||
//====================================================================================================100
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
//===============================================================================================================================================================================================================200
|
||||
// END NUM CODE
|
||||
//===============================================================================================================================================================================================================200
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
|
@ -0,0 +1,21 @@
|
|||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//===============================================================================================================================================================================================================200
|
||||
// FILE HEADER
|
||||
//===============================================================================================================================================================================================================200
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// ISINTEGER FUNCTION PROTOTYPE
|
||||
//======================================================================================================================================================150
|
||||
|
||||
// Returns 1 if str is non-empty and consists only of the characters '0'..'9',
// 0 otherwise.
int isInteger(char *str);
|
||||
|
||||
//===============================================================================================================================================================================================================200
|
||||
// END FILE HEADER
|
||||
//===============================================================================================================================================================================================================200
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
|
@ -0,0 +1,36 @@
|
|||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//===============================================================================================================================================================================================================200
|
||||
// TIMER CODE
|
||||
//===============================================================================================================================================================================================================200
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// INCLUDE/DEFINE
|
||||
//======================================================================================================================================================150
|
||||
|
||||
#include <stdlib.h>
#include <sys/time.h> // struct timeval, gettimeofday
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// FUNCTIONS
|
||||
//======================================================================================================================================================150
|
||||
|
||||
//====================================================================================================100
|
||||
// DISPLAY TIME
|
||||
//====================================================================================================100
|
||||
|
||||
// Returns the current system time in microseconds
|
||||
// Returns the current system time (as reported by gettimeofday) in
// microseconds.
long long get_time() {
  struct timeval tv;
  gettimeofday(&tv, NULL);

  // Widen to long long before scaling: tv_sec has type time_t, and on targets
  // where that is 32 bits wide the product with 1000000 would overflow.
  return ((long long)tv.tv_sec * 1000000LL) + (long long)tv.tv_usec;
}
|
||||
|
||||
//===============================================================================================================================================================================================================200
|
||||
// END TIMER CODE
|
||||
//===============================================================================================================================================================================================================200
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
|
@ -0,0 +1,21 @@
|
|||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//===============================================================================================================================================================================================================200
|
||||
// TIMER HEADER
|
||||
//===============================================================================================================================================================================================================200
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// FUNCTION PROTOTYPES
|
||||
//======================================================================================================================================================150
|
||||
|
||||
// Returns the current system time in microseconds.
long long get_time();
|
||||
|
||||
//===============================================================================================================================================================================================================200
|
||||
// END TIMER HEADER
|
||||
//===============================================================================================================================================================================================================200
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
|
@ -0,0 +1,662 @@
|
|||
#include <fstream>
|
||||
#include <helper_cuda.h>
|
||||
#include <helper_timer.h>
|
||||
#include <iostream>
|
||||
|
||||
/*
|
||||
* Options
|
||||
*
|
||||
*/
|
||||
#define GAMMA 1.4f
|
||||
#define iterations 2
|
||||
// #ifndef block_length
|
||||
// #define block_length 192
|
||||
// #endif
|
||||
|
||||
#define NDIM 3
|
||||
#define NNB 4
|
||||
|
||||
#define RK 3 // 3rd order RK
|
||||
#define ff_mach 1.2f
|
||||
#define deg_angle_of_attack 0.0f
|
||||
|
||||
/*
|
||||
* not options
|
||||
*/
|
||||
|
||||
/*
 * Thread-block size selection.
 *
 * Each BLOCK_SIZE_<k> is resolved, in priority order, from
 *   RD_WG_SIZE_<k>_0, then RD_WG_SIZE_<k>, then RD_WG_SIZE,
 * and falls back to 192 when none of them is defined.
 *
 *   BLOCK_SIZE_0 : granularity to which main() pads the element count
 *   BLOCK_SIZE_1 : initialize_variables launch
 *   BLOCK_SIZE_2 : compute_step_factor launch
 *   BLOCK_SIZE_3 : compute_flux launch
 *   BLOCK_SIZE_4 : time_step launch
 *
 * NOTE(review): the launch wrappers use nelr / BLOCK_SIZE_<k> blocks while
 * nelr is only guaranteed to be a multiple of BLOCK_SIZE_0, so choosing
 * different sizes per kernel can leave trailing elements unprocessed.
 */
#ifdef RD_WG_SIZE_0_0
#define BLOCK_SIZE_0 RD_WG_SIZE_0_0
#elif defined(RD_WG_SIZE_0)
#define BLOCK_SIZE_0 RD_WG_SIZE_0
#elif defined(RD_WG_SIZE)
#define BLOCK_SIZE_0 RD_WG_SIZE
#else
#define BLOCK_SIZE_0 192
#endif

#ifdef RD_WG_SIZE_1_0
#define BLOCK_SIZE_1 RD_WG_SIZE_1_0
#elif defined(RD_WG_SIZE_1)
#define BLOCK_SIZE_1 RD_WG_SIZE_1
#elif defined(RD_WG_SIZE)
#define BLOCK_SIZE_1 RD_WG_SIZE
#else
#define BLOCK_SIZE_1 192
#endif

#ifdef RD_WG_SIZE_2_0
#define BLOCK_SIZE_2 RD_WG_SIZE_2_0
// Must test RD_WG_SIZE_2 (not RD_WG_SIZE_1): the branch expands to
// RD_WG_SIZE_2, so testing the wrong macro either ignores the override or
// expands to an undefined name.
#elif defined(RD_WG_SIZE_2)
#define BLOCK_SIZE_2 RD_WG_SIZE_2
#elif defined(RD_WG_SIZE)
#define BLOCK_SIZE_2 RD_WG_SIZE
#else
#define BLOCK_SIZE_2 192
#endif

#ifdef RD_WG_SIZE_3_0
#define BLOCK_SIZE_3 RD_WG_SIZE_3_0
#elif defined(RD_WG_SIZE_3)
#define BLOCK_SIZE_3 RD_WG_SIZE_3
#elif defined(RD_WG_SIZE)
#define BLOCK_SIZE_3 RD_WG_SIZE
#else
#define BLOCK_SIZE_3 192
#endif

#ifdef RD_WG_SIZE_4_0
#define BLOCK_SIZE_4 RD_WG_SIZE_4_0
#elif defined(RD_WG_SIZE_4)
#define BLOCK_SIZE_4 RD_WG_SIZE_4
#elif defined(RD_WG_SIZE)
#define BLOCK_SIZE_4 RD_WG_SIZE
#else
#define BLOCK_SIZE_4 192
#endif
|
||||
|
||||
// #if block_length > 128
|
||||
// #warning "the kernels may fail to launch on some systems if the block length
|
||||
// is too large" #endif
|
||||
|
||||
// Slot indices of the per-element state. Arrays are indexed as
// array[element + slot * nelr], i.e. one contiguous run of nelr values per
// slot.
#define VAR_DENSITY 0
|
||||
// First of NDIM consecutive momentum slots.
#define VAR_MOMENTUM 1
|
||||
// Slot directly after the momentum components.
#define VAR_DENSITY_ENERGY (VAR_MOMENTUM + NDIM)
|
||||
// Total number of slots stored per element.
#define NVAR (VAR_DENSITY_ENERGY + 1)
|
||||
|
||||
/*
|
||||
* Generic functions
|
||||
*/
|
||||
/*
 * Reserves device storage for N objects of type T via cudaMalloc and returns
 * the device pointer. The cudaMalloc status is passed to checkCudaErrors.
 */
template <typename T> T *alloc(int N) {
  void *device_mem = nullptr;
  checkCudaErrors(cudaMalloc(&device_mem, sizeof(T) * N));
  return static_cast<T *>(device_mem);
}
|
||||
|
||||
/*
 * Releases storage previously obtained from alloc(). The cudaFree status is
 * passed to checkCudaErrors.
 */
template <typename T> void dealloc(T *array) {
  void *const device_mem = (void *)array;
  checkCudaErrors(cudaFree(device_mem));
}
|
||||
|
||||
/*
 * Device-to-device copy of N elements of type T from src to dst.
 */
template <typename T> void copy(T *dst, T *src, int N) {
  void *const to = (void *)dst;
  void *const from = (void *)src;
  checkCudaErrors(
      cudaMemcpy(to, from, N * sizeof(T), cudaMemcpyDeviceToDevice));
}
|
||||
|
||||
/*
 * Host-to-device copy of N elements of type T: src is host memory, dst is
 * device memory.
 */
template <typename T> void upload(T *dst, T *src, int N) {
  void *const device_to = (void *)dst;
  void *const host_from = (void *)src;
  checkCudaErrors(cudaMemcpy(device_to, host_from, N * sizeof(T),
                             cudaMemcpyHostToDevice));
}
|
||||
|
||||
/*
 * Device-to-host copy of N elements of type T: src is device memory, dst is
 * host memory.
 */
template <typename T> void download(T *dst, T *src, int N) {
  void *const host_to = (void *)dst;
  void *const device_from = (void *)src;
  checkCudaErrors(cudaMemcpy(host_to, device_from, N * sizeof(T),
                             cudaMemcpyDeviceToHost));
}
|
||||
|
||||
void dump(float *variables, int nel, int nelr) {
|
||||
float *h_variables = new float[nelr * NVAR];
|
||||
download(h_variables, variables, nelr * NVAR);
|
||||
|
||||
{
|
||||
std::ofstream file("density");
|
||||
file << nel << " " << nelr << std::endl;
|
||||
for (int i = 0; i < nel; i++)
|
||||
file << h_variables[i + VAR_DENSITY * nelr] << std::endl;
|
||||
}
|
||||
|
||||
{
|
||||
std::ofstream file("momentum");
|
||||
file << nel << " " << nelr << std::endl;
|
||||
for (int i = 0; i < nel; i++) {
|
||||
for (int j = 0; j != NDIM; j++)
|
||||
file << h_variables[i + (VAR_MOMENTUM + j) * nelr] << " ";
|
||||
file << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
std::ofstream file("density_energy");
|
||||
file << nel << " " << nelr << std::endl;
|
||||
for (int i = 0; i < nel; i++)
|
||||
file << h_variables[i + VAR_DENSITY_ENERGY * nelr] << std::endl;
|
||||
}
|
||||
delete[] h_variables;
|
||||
}
|
||||
|
||||
/*
|
||||
* Element-based Cell-centered FVM solver functions
|
||||
*/
|
||||
// Far-field state, one value per slot. Written once by main() through
// cudaMemcpyToSymbol; read by cuda_initialize_variables() and by the
// far-field branch (nb == -2) of cuda_compute_flux().
__constant__ float ff_variable[NVAR];
|
||||
// Far-field flux contributions computed on the host by
// compute_flux_contribution() in main(); each is a single float3 stored as a
// one-element array and read as [0] in cuda_compute_flux().
__constant__ float3 ff_flux_contribution_momentum_x[1];
|
||||
__constant__ float3 ff_flux_contribution_momentum_y[1];
|
||||
__constant__ float3 ff_flux_contribution_momentum_z[1];
|
||||
__constant__ float3 ff_flux_contribution_density_energy[1];
|
||||
|
||||
/*
 * Kernel: one thread per element. Sets every slot of that element in
 * `variables` to the matching far-field value from ff_variable.
 */
__global__ void cuda_initialize_variables(int nelr, float *variables) {
  const int elem = (blockDim.x * blockIdx.x + threadIdx.x);
  for (int slot = 0; slot != NVAR; ++slot) {
    const int offset = elem + slot * nelr;
    variables[offset] = ff_variable[slot];
  }
}
|
||||
void initialize_variables(int nelr, float *variables) {
|
||||
dim3 Dg(nelr / BLOCK_SIZE_1), Db(BLOCK_SIZE_1);
|
||||
cuda_initialize_variables<<<Dg, Db>>>(nelr, variables);
|
||||
getLastCudaError("initialize_variables failed");
|
||||
}
|
||||
|
||||
/*
 * Fills the four flux-contribution vectors from a state given as momentum,
 * density energy, pressure and velocity.
 *
 * The three momentum outputs form a symmetric 3x3 table: entry (a, b) is
 * velocity[a] * momentum[b], with pressure added on the diagonal. The energy
 * output is velocity scaled by (density_energy + pressure).
 * `density` is accepted but not read.
 */
__device__ __host__ inline void compute_flux_contribution(
    float &density, float3 &momentum, float &density_energy, float &pressure,
    float3 &velocity, float3 &fc_momentum_x, float3 &fc_momentum_y,
    float3 &fc_momentum_z, float3 &fc_density_energy) {
  // off-diagonal entries, each shared by two of the output vectors
  const float flux_xy = velocity.x * momentum.y;
  const float flux_xz = velocity.x * momentum.z;
  const float flux_yz = velocity.y * momentum.z;

  fc_momentum_x.x = velocity.x * momentum.x + pressure;
  fc_momentum_x.y = flux_xy;
  fc_momentum_x.z = flux_xz;

  fc_momentum_y.x = flux_xy;
  fc_momentum_y.y = velocity.y * momentum.y + pressure;
  fc_momentum_y.z = flux_yz;

  fc_momentum_z.x = flux_xz;
  fc_momentum_z.y = flux_yz;
  fc_momentum_z.z = velocity.z * momentum.z + pressure;

  const float energy_plus_pressure = density_energy + pressure;
  fc_density_energy.x = velocity.x * energy_plus_pressure;
  fc_density_energy.y = velocity.y * energy_plus_pressure;
  fc_density_energy.z = velocity.z * energy_plus_pressure;
}
|
||||
|
||||
/*
 * Writes momentum / density, component by component, into `velocity`.
 */
__device__ inline void compute_velocity(float &density, float3 &momentum,
                                        float3 &velocity) {
  const float rho = density;
  velocity.x = momentum.x / rho;
  velocity.y = momentum.y / rho;
  velocity.z = momentum.z / rho;
}
|
||||
|
||||
/*
 * Returns the squared magnitude of `velocity`.
 */
__device__ inline float compute_speed_sqd(float3 &velocity) {
  const float vx = velocity.x;
  const float vy = velocity.y;
  const float vz = velocity.z;
  return vx * vx + vy * vy + vz * vz;
}
|
||||
|
||||
/*
 * Returns (GAMMA - 1) * (density_energy - 0.5 * density * speed_sqd).
 */
__device__ inline float compute_pressure(float &density, float &density_energy,
                                         float &speed_sqd) {
  const float gamma_minus_one = float(GAMMA) - float(1.0f);
  return gamma_minus_one *
         (density_energy - float(0.5f) * density * speed_sqd);
}
|
||||
|
||||
/*
 * Returns sqrt(GAMMA * pressure / density).
 */
__device__ inline float compute_speed_of_sound(float &density,
                                               float &pressure) {
  const float ratio = float(GAMMA) * pressure / density;
  return sqrtf(ratio);
}
|
||||
|
||||
/*
 * Kernel: one thread per element. Derives velocity, pressure and speed of
 * sound from the element's state in `variables` and stores
 *   0.5 / (sqrt(areas[i]) * (|v| + c))
 * in step_factors[i].
 */
__global__ void cuda_compute_step_factor(int nelr, float *variables,
                                         float *areas, float *step_factors) {
  const int elem = (blockDim.x * blockIdx.x + threadIdx.x);

  // gather this element's state
  float rho = variables[elem + VAR_DENSITY * nelr];
  float3 mom;
  mom.x = variables[elem + (VAR_MOMENTUM + 0) * nelr];
  mom.y = variables[elem + (VAR_MOMENTUM + 1) * nelr];
  mom.z = variables[elem + (VAR_MOMENTUM + 2) * nelr];
  float energy = variables[elem + VAR_DENSITY_ENERGY * nelr];

  // derived quantities
  float3 vel;
  compute_velocity(rho, mom, vel);
  float speed2 = compute_speed_sqd(vel);
  float p = compute_pressure(rho, energy, speed2);
  float c = compute_speed_of_sound(rho, p);

  // The time step would be 0.5 * sqrt(area) / (|v| + c); because the update
  // later divides by the area, both are folded into this single factor.
  step_factors[elem] =
      float(0.5f) / (sqrtf(areas[elem]) * (sqrtf(speed2) + c));
}
|
||||
void compute_step_factor(int nelr, float *variables, float *areas,
|
||||
float *step_factors) {
|
||||
dim3 Dg(nelr / BLOCK_SIZE_2), Db(BLOCK_SIZE_2);
|
||||
cuda_compute_step_factor<<<Dg, Db>>>(nelr, variables, areas, step_factors);
|
||||
getLastCudaError("compute_step_factor failed");
|
||||
}
|
||||
|
||||
/*
|
||||
*
|
||||
*
|
||||
*/
|
||||
/*
 * Kernel: one thread per element. Visits the element's NNB neighbour slots
 * and accumulates the flux of every state slot, then stores the totals in
 * `fluxes` (same element + slot * nelr layout as `variables`).
 *
 * Neighbour index meaning, as handled below:
 *   nb >= 0  : regular neighbour; adds an artificial-viscosity term and the
 *              averaged flux contribution of both elements
 *   nb == -1 : wing boundary; only the pressure term enters the momentum flux
 *   nb == -2 : far-field boundary; the neighbour state is taken from the
 *              ff_* constant symbols
 * Any other negative value contributes nothing.
 */
__global__ void cuda_compute_flux(int nelr, int *elements_surrounding_elements,
                                  float *normals, float *variables,
                                  float *fluxes) {
  const float viscosity_coeff = float(0.2f);
  const int elem = (blockDim.x * blockIdx.x + threadIdx.x);

  // ---- state of this element ----
  float rho_i = variables[elem + VAR_DENSITY * nelr];
  float3 mom_i;
  mom_i.x = variables[elem + (VAR_MOMENTUM + 0) * nelr];
  mom_i.y = variables[elem + (VAR_MOMENTUM + 1) * nelr];
  mom_i.z = variables[elem + (VAR_MOMENTUM + 2) * nelr];
  float energy_i = variables[elem + VAR_DENSITY_ENERGY * nelr];

  float3 vel_i;
  compute_velocity(rho_i, mom_i, vel_i);
  float speed2_i = compute_speed_sqd(vel_i);
  float speed_i = sqrtf(speed2_i);
  float p_i = compute_pressure(rho_i, energy_i, speed2_i);
  float c_i = compute_speed_of_sound(rho_i, p_i);

  float3 fc_i_mx, fc_i_my, fc_i_mz;
  float3 fc_i_e;
  compute_flux_contribution(rho_i, mom_i, energy_i, p_i, vel_i, fc_i_mx,
                            fc_i_my, fc_i_mz, fc_i_e);

  // ---- accumulators ----
  float acc_rho = float(0.0f);
  float3 acc_mom;
  acc_mom.x = float(0.0f);
  acc_mom.y = float(0.0f);
  acc_mom.z = float(0.0f);
  float acc_e = float(0.0f);

#pragma unroll
  for (int face = 0; face < NNB; face++) {
    const int nb = elements_surrounding_elements[elem + face * nelr];

    float3 normal;
    normal.x = normals[elem + (face + 0 * NNB) * nelr];
    normal.y = normals[elem + (face + 1 * NNB) * nelr];
    normal.z = normals[elem + (face + 2 * NNB) * nelr];
    const float normal_len =
        sqrtf(normal.x * normal.x + normal.y * normal.y + normal.z * normal.z);

    if (nb >= 0) {
      // ---- regular neighbour: load and derive its state ----
      float rho_nb = variables[nb + VAR_DENSITY * nelr];
      float3 mom_nb;
      mom_nb.x = variables[nb + (VAR_MOMENTUM + 0) * nelr];
      mom_nb.y = variables[nb + (VAR_MOMENTUM + 1) * nelr];
      mom_nb.z = variables[nb + (VAR_MOMENTUM + 2) * nelr];
      float energy_nb = variables[nb + VAR_DENSITY_ENERGY * nelr];

      float3 vel_nb;
      compute_velocity(rho_nb, mom_nb, vel_nb);
      float speed2_nb = compute_speed_sqd(vel_nb);
      float p_nb = compute_pressure(rho_nb, energy_nb, speed2_nb);
      float c_nb = compute_speed_of_sound(rho_nb, p_nb);

      float3 fc_nb_mx, fc_nb_my, fc_nb_mz;
      float3 fc_nb_e;
      compute_flux_contribution(rho_nb, mom_nb, energy_nb, p_nb, vel_nb,
                                fc_nb_mx, fc_nb_my, fc_nb_mz, fc_nb_e);

      // artificial viscosity
      float weight = -normal_len * viscosity_coeff * float(0.5f) *
                     (speed_i + sqrtf(speed2_nb) + c_i + c_nb);
      acc_rho += weight * (rho_i - rho_nb);
      acc_e += weight * (energy_i - energy_nb);
      acc_mom.x += weight * (mom_i.x - mom_nb.x);
      acc_mom.y += weight * (mom_i.y - mom_nb.y);
      acc_mom.z += weight * (mom_i.z - mom_nb.z);

      // cell-centred fluxes, x component of the normal
      weight = float(0.5f) * normal.x;
      acc_rho += weight * (mom_nb.x + mom_i.x);
      acc_e += weight * (fc_nb_e.x + fc_i_e.x);
      acc_mom.x += weight * (fc_nb_mx.x + fc_i_mx.x);
      acc_mom.y += weight * (fc_nb_my.x + fc_i_my.x);
      acc_mom.z += weight * (fc_nb_mz.x + fc_i_mz.x);

      // y component
      weight = float(0.5f) * normal.y;
      acc_rho += weight * (mom_nb.y + mom_i.y);
      acc_e += weight * (fc_nb_e.y + fc_i_e.y);
      acc_mom.x += weight * (fc_nb_mx.y + fc_i_mx.y);
      acc_mom.y += weight * (fc_nb_my.y + fc_i_my.y);
      acc_mom.z += weight * (fc_nb_mz.y + fc_i_mz.y);

      // z component
      weight = float(0.5f) * normal.z;
      acc_rho += weight * (mom_nb.z + mom_i.z);
      acc_e += weight * (fc_nb_e.z + fc_i_e.z);
      acc_mom.x += weight * (fc_nb_mx.z + fc_i_mx.z);
      acc_mom.y += weight * (fc_nb_my.z + fc_i_my.z);
      acc_mom.z += weight * (fc_nb_mz.z + fc_i_mz.z);
    } else if (nb == -1) {
      // ---- wing boundary ----
      acc_mom.x += normal.x * p_i;
      acc_mom.y += normal.y * p_i;
      acc_mom.z += normal.z * p_i;
    } else if (nb == -2) {
      // ---- far-field boundary: neighbour state comes from constants ----
      float weight = float(0.5f) * normal.x;
      acc_rho += weight * (ff_variable[VAR_MOMENTUM + 0] + mom_i.x);
      acc_e += weight * (ff_flux_contribution_density_energy[0].x + fc_i_e.x);
      acc_mom.x += weight * (ff_flux_contribution_momentum_x[0].x + fc_i_mx.x);
      acc_mom.y += weight * (ff_flux_contribution_momentum_y[0].x + fc_i_my.x);
      acc_mom.z += weight * (ff_flux_contribution_momentum_z[0].x + fc_i_mz.x);

      weight = float(0.5f) * normal.y;
      acc_rho += weight * (ff_variable[VAR_MOMENTUM + 1] + mom_i.y);
      acc_e += weight * (ff_flux_contribution_density_energy[0].y + fc_i_e.y);
      acc_mom.x += weight * (ff_flux_contribution_momentum_x[0].y + fc_i_mx.y);
      acc_mom.y += weight * (ff_flux_contribution_momentum_y[0].y + fc_i_my.y);
      acc_mom.z += weight * (ff_flux_contribution_momentum_z[0].y + fc_i_mz.y);

      weight = float(0.5f) * normal.z;
      acc_rho += weight * (ff_variable[VAR_MOMENTUM + 2] + mom_i.z);
      acc_e += weight * (ff_flux_contribution_density_energy[0].z + fc_i_e.z);
      acc_mom.x += weight * (ff_flux_contribution_momentum_x[0].z + fc_i_mx.z);
      acc_mom.y += weight * (ff_flux_contribution_momentum_y[0].z + fc_i_my.z);
      acc_mom.z += weight * (ff_flux_contribution_momentum_z[0].z + fc_i_mz.z);
    }
  }

  // ---- write back the accumulated fluxes ----
  fluxes[elem + VAR_DENSITY * nelr] = acc_rho;
  fluxes[elem + (VAR_MOMENTUM + 0) * nelr] = acc_mom.x;
  fluxes[elem + (VAR_MOMENTUM + 1) * nelr] = acc_mom.y;
  fluxes[elem + (VAR_MOMENTUM + 2) * nelr] = acc_mom.z;
  fluxes[elem + VAR_DENSITY_ENERGY * nelr] = acc_e;
}
|
||||
void compute_flux(int nelr, int *elements_surrounding_elements, float *normals,
|
||||
float *variables, float *fluxes) {
|
||||
dim3 Dg(nelr / BLOCK_SIZE_3), Db(BLOCK_SIZE_3);
|
||||
cuda_compute_flux<<<Dg, Db>>>(nelr, elements_surrounding_elements, normals,
|
||||
variables, fluxes);
|
||||
getLastCudaError("compute_flux failed");
|
||||
}
|
||||
|
||||
/*
 * Kernel: one thread per element. For stage j, sets every slot of the
 * element to
 *   old_variables + (step_factors[i] / (RK + 1 - j)) * fluxes.
 */
__global__ void cuda_time_step(int j, int nelr, float *old_variables,
                               float *variables, float *step_factors,
                               float *fluxes) {
  const int elem = (blockDim.x * blockIdx.x + threadIdx.x);
  const float scale = step_factors[elem] / float(RK + 1 - j);

  // The NVAR slots are exactly density, the three momentum components and
  // density energy, so one loop covers all of them.
  for (int slot = 0; slot < NVAR; ++slot) {
    const int offset = elem + slot * nelr;
    variables[offset] = old_variables[offset] + scale * fluxes[offset];
  }
}
|
||||
void time_step(int j, int nelr, float *old_variables, float *variables,
|
||||
float *step_factors, float *fluxes) {
|
||||
dim3 Dg(nelr / BLOCK_SIZE_4), Db(BLOCK_SIZE_4);
|
||||
cuda_time_step<<<Dg, Db>>>(j, nelr, old_variables, variables, step_factors,
|
||||
fluxes);
|
||||
getLastCudaError("update failed");
|
||||
}
|
||||
|
||||
/*
|
||||
* Main function
|
||||
*/
|
||||
int main(int argc, char **argv) {
|
||||
printf("WG size of kernel:initialize = %d, WG size of "
|
||||
"kernel:compute_step_factor = %d, WG size of kernel:compute_flux = "
|
||||
"%d, WG size of kernel:time_step = %d\n",
|
||||
BLOCK_SIZE_1, BLOCK_SIZE_2, BLOCK_SIZE_3, BLOCK_SIZE_4);
|
||||
|
||||
if (argc < 2) {
|
||||
std::cout << "specify data file name" << std::endl;
|
||||
return 0;
|
||||
}
|
||||
const char *data_file_name = argv[1];
|
||||
|
||||
cudaDeviceProp prop;
|
||||
int dev;
|
||||
|
||||
checkCudaErrors(cudaSetDevice(0));
|
||||
|
||||
// set far field conditions and load them into constant memory on the gpu
|
||||
{
|
||||
float h_ff_variable[NVAR];
|
||||
const float angle_of_attack =
|
||||
float(3.1415926535897931 / 180.0f) * float(deg_angle_of_attack);
|
||||
|
||||
h_ff_variable[VAR_DENSITY] = float(1.4);
|
||||
|
||||
float ff_pressure = float(1.0f);
|
||||
float ff_speed_of_sound =
|
||||
sqrt(GAMMA * ff_pressure / h_ff_variable[VAR_DENSITY]);
|
||||
float ff_speed = float(ff_mach) * ff_speed_of_sound;
|
||||
|
||||
float3 ff_velocity;
|
||||
ff_velocity.x = ff_speed * float(cos((float)angle_of_attack));
|
||||
ff_velocity.y = ff_speed * float(sin((float)angle_of_attack));
|
||||
ff_velocity.z = 0.0f;
|
||||
|
||||
h_ff_variable[VAR_MOMENTUM + 0] =
|
||||
h_ff_variable[VAR_DENSITY] * ff_velocity.x;
|
||||
h_ff_variable[VAR_MOMENTUM + 1] =
|
||||
h_ff_variable[VAR_DENSITY] * ff_velocity.y;
|
||||
h_ff_variable[VAR_MOMENTUM + 2] =
|
||||
h_ff_variable[VAR_DENSITY] * ff_velocity.z;
|
||||
|
||||
h_ff_variable[VAR_DENSITY_ENERGY] =
|
||||
h_ff_variable[VAR_DENSITY] * (float(0.5f) * (ff_speed * ff_speed)) +
|
||||
(ff_pressure / float(GAMMA - 1.0f));
|
||||
|
||||
float3 h_ff_momentum;
|
||||
h_ff_momentum.x = *(h_ff_variable + VAR_MOMENTUM + 0);
|
||||
h_ff_momentum.y = *(h_ff_variable + VAR_MOMENTUM + 1);
|
||||
h_ff_momentum.z = *(h_ff_variable + VAR_MOMENTUM + 2);
|
||||
float3 h_ff_flux_contribution_momentum_x;
|
||||
float3 h_ff_flux_contribution_momentum_y;
|
||||
float3 h_ff_flux_contribution_momentum_z;
|
||||
float3 h_ff_flux_contribution_density_energy;
|
||||
compute_flux_contribution(h_ff_variable[VAR_DENSITY], h_ff_momentum,
|
||||
h_ff_variable[VAR_DENSITY_ENERGY], ff_pressure,
|
||||
ff_velocity, h_ff_flux_contribution_momentum_x,
|
||||
h_ff_flux_contribution_momentum_y,
|
||||
h_ff_flux_contribution_momentum_z,
|
||||
h_ff_flux_contribution_density_energy);
|
||||
|
||||
// copy far field conditions to the gpu
|
||||
checkCudaErrors(
|
||||
cudaMemcpyToSymbol(ff_variable, h_ff_variable, NVAR * sizeof(float)));
|
||||
checkCudaErrors(cudaMemcpyToSymbol(ff_flux_contribution_momentum_x,
|
||||
&h_ff_flux_contribution_momentum_x,
|
||||
sizeof(float3)));
|
||||
checkCudaErrors(cudaMemcpyToSymbol(ff_flux_contribution_momentum_y,
|
||||
&h_ff_flux_contribution_momentum_y,
|
||||
sizeof(float3)));
|
||||
checkCudaErrors(cudaMemcpyToSymbol(ff_flux_contribution_momentum_z,
|
||||
&h_ff_flux_contribution_momentum_z,
|
||||
sizeof(float3)));
|
||||
|
||||
checkCudaErrors(cudaMemcpyToSymbol(ff_flux_contribution_density_energy,
|
||||
&h_ff_flux_contribution_density_energy,
|
||||
sizeof(float3)));
|
||||
}
|
||||
int nel;
|
||||
int nelr;
|
||||
|
||||
// read in domain geometry
|
||||
float *areas;
|
||||
int *elements_surrounding_elements;
|
||||
float *normals;
|
||||
{
|
||||
std::ifstream file(data_file_name);
|
||||
|
||||
file >> nel;
|
||||
nelr =
|
||||
BLOCK_SIZE_0 * ((nel / BLOCK_SIZE_0) + std::min(1, nel % BLOCK_SIZE_0));
|
||||
|
||||
float *h_areas = new float[nelr];
|
||||
int *h_elements_surrounding_elements = new int[nelr * NNB];
|
||||
float *h_normals = new float[nelr * NDIM * NNB];
|
||||
|
||||
// read in data
|
||||
for (int i = 0; i < nel; i++) {
|
||||
file >> h_areas[i];
|
||||
for (int j = 0; j < NNB; j++) {
|
||||
file >> h_elements_surrounding_elements[i + j * nelr];
|
||||
if (h_elements_surrounding_elements[i + j * nelr] < 0)
|
||||
h_elements_surrounding_elements[i + j * nelr] = -1;
|
||||
h_elements_surrounding_elements[i + j * nelr]--; // it's coming in with
|
||||
// Fortran numbering
|
||||
|
||||
for (int k = 0; k < NDIM; k++) {
|
||||
file >> h_normals[i + (j + k * NNB) * nelr];
|
||||
h_normals[i + (j + k * NNB) * nelr] =
|
||||
-h_normals[i + (j + k * NNB) * nelr];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// fill in remaining data
|
||||
int last = nel - 1;
|
||||
for (int i = nel; i < nelr; i++) {
|
||||
h_areas[i] = h_areas[last];
|
||||
for (int j = 0; j < NNB; j++) {
|
||||
// duplicate the last element
|
||||
h_elements_surrounding_elements[i + j * nelr] =
|
||||
h_elements_surrounding_elements[last + j * nelr];
|
||||
for (int k = 0; k < NDIM; k++)
|
||||
h_normals[last + (j + k * NNB) * nelr] =
|
||||
h_normals[last + (j + k * NNB) * nelr];
|
||||
}
|
||||
}
|
||||
|
||||
areas = alloc<float>(nelr);
|
||||
upload<float>(areas, h_areas, nelr);
|
||||
|
||||
elements_surrounding_elements = alloc<int>(nelr * NNB);
|
||||
upload<int>(elements_surrounding_elements, h_elements_surrounding_elements,
|
||||
nelr * NNB);
|
||||
|
||||
normals = alloc<float>(nelr * NDIM * NNB);
|
||||
upload<float>(normals, h_normals, nelr * NDIM * NNB);
|
||||
|
||||
delete[] h_areas;
|
||||
delete[] h_elements_surrounding_elements;
|
||||
delete[] h_normals;
|
||||
}
|
||||
|
||||
// Create arrays and set initial conditions
|
||||
float *variables = alloc<float>(nelr * NVAR);
|
||||
initialize_variables(nelr, variables);
|
||||
|
||||
float *old_variables = alloc<float>(nelr * NVAR);
|
||||
float *fluxes = alloc<float>(nelr * NVAR);
|
||||
float *step_factors = alloc<float>(nelr);
|
||||
|
||||
// make sure all memory is floatly allocated before we start timing
|
||||
initialize_variables(nelr, old_variables);
|
||||
initialize_variables(nelr, fluxes);
|
||||
cudaMemset((void *)step_factors, 0, sizeof(float) * nelr);
|
||||
// make sure CUDA isn't still doing something before we start timing
|
||||
cudaThreadSynchronize();
|
||||
|
||||
// these need to be computed the first time in order to compute time step
|
||||
std::cout << "Starting..." << std::endl;
|
||||
|
||||
StopWatchInterface *timer = 0;
|
||||
// unsigned int timer = 0;
|
||||
|
||||
// CUT_SAFE_CALL( cutCreateTimer( &timer));
|
||||
// CUT_SAFE_CALL( cutStartTimer( timer));
|
||||
sdkCreateTimer(&timer);
|
||||
sdkStartTimer(&timer);
|
||||
// Begin iterations
|
||||
for (int i = 0; i < iterations; i++) {
|
||||
copy<float>(old_variables, variables, nelr * NVAR);
|
||||
|
||||
// for the first iteration we compute the time step
|
||||
compute_step_factor(nelr, variables, areas, step_factors);
|
||||
getLastCudaError("compute_step_factor failed");
|
||||
|
||||
for (int j = 0; j < RK; j++) {
|
||||
compute_flux(nelr, elements_surrounding_elements, normals, variables,
|
||||
fluxes);
|
||||
getLastCudaError("compute_flux failed");
|
||||
time_step(j, nelr, old_variables, variables, step_factors, fluxes);
|
||||
getLastCudaError("time_step failed");
|
||||
}
|
||||
}
|
||||
|
||||
cudaThreadSynchronize();
|
||||
// CUT_SAFE_CALL( cutStopTimer(timer) );
|
||||
sdkStopTimer(&timer);
|
||||
|
||||
std::cout << (sdkGetAverageTimerValue(&timer) / 1000.0) / iterations
|
||||
<< " seconds per iteration" << std::endl;
|
||||
|
||||
std::cout << "Saving solution..." << std::endl;
|
||||
dump(variables, nel, nelr);
|
||||
std::cout << "Saved solution..." << std::endl;
|
||||
|
||||
std::cout << "Cleaning up..." << std::endl;
|
||||
dealloc<float>(areas);
|
||||
dealloc<int>(elements_surrounding_elements);
|
||||
dealloc<float>(normals);
|
||||
|
||||
dealloc<float>(variables);
|
||||
dealloc<float>(old_variables);
|
||||
dealloc<float>(fluxes);
|
||||
dealloc<float>(step_factors);
|
||||
|
||||
std::cout << "Done..." << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,15 @@
|
|||
#!/bin/bash
# Builds euler3d.cu for x86 through the translator pipeline and runs it.
#
# Every location can be overridden from the environment; the defaults are the
# values this script previously had hard-coded:
#   CUDA_PATH  CUDA toolkit root
#   GPU_ARCH   GPU architecture passed to clang (also part of the .bc name)
#   BUILD_DIR  build tree containing compilation/ and runtime/
#   DATA_FILE  input domain file
#
# NOTE(review): `set -e` is deliberately not enabled; confirm that every step
# is expected to succeed before making failures fatal.
CUDA_PATH="${CUDA_PATH:-/usr/local/cuda-10.1}"
GPU_ARCH="${GPU_ARCH:-sm_50}"
BUILD_DIR="${BUILD_DIR:-/home/robinhan/repo/open_source_template/build}"
DATA_FILE="${DATA_FILE:-../rodinia-data/cfd/fvcorr.domn.097K}"

# 1. Compile with clang; -save-temps keeps the device and host bitcode files.
clang++ euler3d.cu -I"${CUDA_PATH}/samples/common/inc" --cuda-path="${CUDA_PATH}" --cuda-gpu-arch="${GPU_ARCH}" -L"${CUDA_PATH}/lib64" -lcudart_static -ldl -lrt -pthread -save-temps -v

# 2. Translate the device and host bitcode.
"${BUILD_DIR}/compilation/kernelTranslator" "euler3d-cuda-nvptx64-nvidia-cuda-${GPU_ARCH}.bc" kernel.bc
"${BUILD_DIR}/compilation/hostTranslator" euler3d-host-x86_64-unknown-linux-gnu.bc host.bc

# 3. Lower both modules to position-independent object files.
llc --relocation-model=pic --filetype=obj kernel.bc
llc --relocation-model=pic --filetype=obj host.bc

# 4. Link against the x86 runtime and thread pool.
g++ -Wall -L"${BUILD_DIR}/runtime" -L"${BUILD_DIR}/runtime/threadPool" -o a.out -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread

# 5. Run.
./a.out "${DATA_FILE}"
# ./demo 1024
# # # ./demo -f ../../data/matrix3.txt
# # # run -f ../../data/gaussian/matrix3.txt
|
|
@ -0,0 +1,396 @@
|
|||
; ModuleID = 'gaussian-cuda-nvptx64-nvidia-cuda-sm_61.bc'
|
||||
source_filename = "gaussian.cu"
|
||||
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
|
||||
target triple = "nvptx64-nvidia-cuda"
|
||||
|
||||
%struct.__cuda_builtin_threadIdx_t = type { i8 }
|
||||
%struct.__cuda_builtin_blockIdx_t = type { i8 }
|
||||
%struct.__cuda_builtin_blockDim_t = type { i8 }
|
||||
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
|
||||
|
||||
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
|
||||
|
||||
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
|
||||
|
||||
$_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv = comdat any
|
||||
|
||||
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv = comdat any
|
||||
|
||||
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv = comdat any
|
||||
|
||||
$_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv = comdat any
|
||||
|
||||
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
|
||||
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
|
||||
@blockDim = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockDim_t, align 1
|
||||
|
||||
; Weak definition of cudaMalloc emitted into the device module.
; It only spills its two arguments (%p, %s) to stack slots and returns the
; constant 999; neither argument is read back.
; NOTE(review): 999 is presumably cudaErrorUnknown - confirm against the CUDA headers.
; Function Attrs: convergent noinline nounwind optnone
|
||||
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
|
||||
entry:
|
||||
%p.addr = alloca i8**, align 8
|
||||
%s.addr = alloca i64, align 8
|
||||
store i8** %p, i8*** %p.addr, align 8
|
||||
store i64 %s, i64* %s.addr, align 8
|
||||
; Unconditional constant result; no allocation is performed here.
ret i32 999
|
||||
}
|
||||
|
||||
; Weak definition of cudaFuncGetAttributes emitted into the device module.
; Spills %p and %c to stack slots and returns the constant 999; the
; cudaFuncAttributes structure pointed to by %p is never written.
; Function Attrs: convergent noinline nounwind optnone
|
||||
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
|
||||
entry:
|
||||
%p.addr = alloca %struct.cudaFuncAttributes*, align 8
|
||||
%c.addr = alloca i8*, align 8
|
||||
store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
|
||||
store i8* %c, i8** %c.addr, align 8
|
||||
; Unconditional constant result.
ret i32 999
|
||||
}
|
||||
|
||||
; Weak definition of cudaDeviceGetAttribute emitted into the device module.
; Spills %value, %attr and %device to stack slots and returns the constant
; 999; nothing is stored through %value.
; Function Attrs: convergent noinline nounwind optnone
|
||||
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
|
||||
entry:
|
||||
%value.addr = alloca i32*, align 8
|
||||
%attr.addr = alloca i32, align 4
|
||||
%device.addr = alloca i32, align 4
|
||||
store i32* %value, i32** %value.addr, align 8
|
||||
store i32 %attr, i32* %attr.addr, align 4
|
||||
store i32 %device, i32* %device.addr, align 4
|
||||
; Unconditional constant result.
ret i32 999
|
||||
}
|
||||
|
||||
; Weak definition of cudaGetDevice emitted into the device module.
; Spills %device to a stack slot and returns the constant 999; nothing is
; stored through %device.
; Function Attrs: convergent noinline nounwind optnone
|
||||
define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
|
||||
entry:
|
||||
%device.addr = alloca i32*, align 8
|
||||
store i32* %device, i32** %device.addr, align 8
|
||||
; Unconditional constant result.
ret i32 999
|
||||
}
|
||||
|
||||
; Weak definition of cudaOccupancyMaxActiveBlocksPerMultiprocessor emitted
; into the device module. Spills its four arguments to stack slots and
; returns the constant 999; nothing is stored through %numBlocks.
; Function Attrs: convergent noinline nounwind optnone
|
||||
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
|
||||
entry:
|
||||
%numBlocks.addr = alloca i32*, align 8
|
||||
%func.addr = alloca i8*, align 8
|
||||
%blockSize.addr = alloca i32, align 4
|
||||
%dynamicSmemSize.addr = alloca i64, align 8
|
||||
store i32* %numBlocks, i32** %numBlocks.addr, align 8
|
||||
store i8* %func, i8** %func.addr, align 8
|
||||
store i32 %blockSize, i32* %blockSize.addr, align 4
|
||||
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
|
||||
; Unconditional constant result.
ret i32 999
|
||||
}
|
||||
|
||||
; Weak definition of cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
; emitted into the device module. Spills its five arguments to stack slots
; and returns the constant 999; nothing is stored through %numBlocks.
; Function Attrs: convergent noinline nounwind optnone
|
||||
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
|
||||
entry:
|
||||
%numBlocks.addr = alloca i32*, align 8
|
||||
%func.addr = alloca i8*, align 8
|
||||
%blockSize.addr = alloca i32, align 4
|
||||
%dynamicSmemSize.addr = alloca i64, align 8
|
||||
%flags.addr = alloca i32, align 4
|
||||
store i32* %numBlocks, i32** %numBlocks.addr, align 8
|
||||
store i8* %func, i8** %func.addr, align 8
|
||||
store i32 %blockSize, i32* %blockSize.addr, align 4
|
||||
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
|
||||
store i32 %flags, i32* %flags.addr, align 4
|
||||
; Unconditional constant result.
ret i32 999
|
||||
}
|
||||
|
||||
; Kernel Fan1(float* m_cuda, float* a_cuda, i32 Size, i32 t), unoptimized
; (optnone) IR; registered as a kernel by the !nvvm.annotations metadata.
; With gid = tid.x + ctaid.x * ntid.x, the kernel returns immediately when
; gid >= Size - 1 - t (unsigned compare); otherwise it stores
;   a_cuda[Size * (gid + t + 1) + t] / a_cuda[Size * t + t]
; into m_cuda[Size * (gid + t + 1) + t].
; Function Attrs: convergent noinline nounwind optnone
|
||||
define dso_local void @_Z4Fan1PfS_ii(float* %m_cuda, float* %a_cuda, i32 %Size, i32 %t) #0 {
|
||||
entry:
|
||||
; Spill all four arguments to stack slots.
%m_cuda.addr = alloca float*, align 8
|
||||
%a_cuda.addr = alloca float*, align 8
|
||||
%Size.addr = alloca i32, align 4
|
||||
%t.addr = alloca i32, align 4
|
||||
store float* %m_cuda, float** %m_cuda.addr, align 8
|
||||
store float* %a_cuda, float** %a_cuda.addr, align 8
|
||||
store i32 %Size, i32* %Size.addr, align 4
|
||||
store i32 %t, i32* %t.addr, align 4
|
||||
; %add = threadIdx.x + blockIdx.x * blockDim.x
%call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
|
||||
%call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
|
||||
%call2 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
|
||||
%mul = mul i32 %call1, %call2
|
||||
%add = add i32 %call, %mul
|
||||
; %sub3 = Size - 1 - t
%0 = load i32, i32* %Size.addr, align 4
|
||||
%sub = sub nsw i32 %0, 1
|
||||
%1 = load i32, i32* %t.addr, align 4
|
||||
%sub3 = sub nsw i32 %sub, %1
|
||||
; Unsigned comparison: threads at or beyond the bound take the early exit.
%cmp = icmp uge i32 %add, %sub3
|
||||
br i1 %cmp, label %if.then, label %if.end
|
||||
|
||||
if.then: ; preds = %entry
|
||||
br label %return
|
||||
|
||||
if.end: ; preds = %entry
|
||||
; Numerator: a_cuda + Size * (blockDim.x * blockIdx.x + threadIdx.x + t + 1), then + t.
%2 = load float*, float** %a_cuda.addr, align 8
|
||||
%3 = load i32, i32* %Size.addr, align 4
|
||||
%call4 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
|
||||
%call5 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
|
||||
%mul6 = mul i32 %call4, %call5
|
||||
%call7 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
|
||||
%add8 = add i32 %mul6, %call7
|
||||
%4 = load i32, i32* %t.addr, align 4
|
||||
%add9 = add i32 %add8, %4
|
||||
%add10 = add i32 %add9, 1
|
||||
%mul11 = mul i32 %3, %add10
|
||||
; The row offset is zero-extended (unsigned arithmetic), the column t is sign-extended.
%idx.ext = zext i32 %mul11 to i64
|
||||
%add.ptr = getelementptr inbounds float, float* %2, i64 %idx.ext
|
||||
%5 = load i32, i32* %t.addr, align 4
|
||||
%idx.ext12 = sext i32 %5 to i64
|
||||
%add.ptr13 = getelementptr inbounds float, float* %add.ptr, i64 %idx.ext12
|
||||
%6 = load float, float* %add.ptr13, align 4
|
||||
; Denominator: a_cuda + Size * t, then + t.
%7 = load float*, float** %a_cuda.addr, align 8
|
||||
%8 = load i32, i32* %Size.addr, align 4
|
||||
%9 = load i32, i32* %t.addr, align 4
|
||||
%mul14 = mul nsw i32 %8, %9
|
||||
%idx.ext15 = sext i32 %mul14 to i64
|
||||
%add.ptr16 = getelementptr inbounds float, float* %7, i64 %idx.ext15
|
||||
%10 = load i32, i32* %t.addr, align 4
|
||||
%idx.ext17 = sext i32 %10 to i64
|
||||
%add.ptr18 = getelementptr inbounds float, float* %add.ptr16, i64 %idx.ext17
|
||||
%11 = load float, float* %add.ptr18, align 4
|
||||
%div = fdiv float %6, %11
|
||||
; Destination: m_cuda + Size * (blockDim.x * blockIdx.x + threadIdx.x + t + 1), then + t.
%12 = load float*, float** %m_cuda.addr, align 8
|
||||
%13 = load i32, i32* %Size.addr, align 4
|
||||
%call19 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
|
||||
%call20 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
|
||||
%mul21 = mul i32 %call19, %call20
|
||||
%call22 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
|
||||
%add23 = add i32 %mul21, %call22
|
||||
%14 = load i32, i32* %t.addr, align 4
|
||||
%add24 = add i32 %add23, %14
|
||||
%add25 = add i32 %add24, 1
|
||||
%mul26 = mul i32 %13, %add25
|
||||
%idx.ext27 = zext i32 %mul26 to i64
|
||||
%add.ptr28 = getelementptr inbounds float, float* %12, i64 %idx.ext27
|
||||
%15 = load i32, i32* %t.addr, align 4
|
||||
%idx.ext29 = sext i32 %15 to i64
|
||||
%add.ptr30 = getelementptr inbounds float, float* %add.ptr28, i64 %idx.ext29
|
||||
store float %div, float* %add.ptr30, align 4
|
||||
br label %return
|
||||
|
||||
return: ; preds = %if.end, %if.then
|
||||
ret void
|
||||
}
|
||||
|
||||
; Accessor behind threadIdx.x: returns the value of the
; llvm.nvvm.read.ptx.sreg.tid.x intrinsic unchanged.
; Function Attrs: alwaysinline convergent nounwind
|
||||
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
|
||||
entry:
|
||||
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||||
ret i32 %0
|
||||
}
|
||||
|
||||
; Accessor behind blockIdx.x: returns the value of the
; llvm.nvvm.read.ptx.sreg.ctaid.x intrinsic unchanged.
; Function Attrs: alwaysinline convergent nounwind
|
||||
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
|
||||
entry:
|
||||
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
|
||||
ret i32 %0
|
||||
}
|
||||
|
||||
; Accessor behind blockDim.x: returns the value of the
; llvm.nvvm.read.ptx.sreg.ntid.x intrinsic unchanged.
; Function Attrs: alwaysinline convergent nounwind
|
||||
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #1 comdat align 2 {
|
||||
entry:
|
||||
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
||||
ret i32 %0
|
||||
}
|
||||
|
||||
; Kernel Fan2(float* m_cuda, float* a_cuda, float* b_cuda, i32 Size, i32 j1,
; i32 t), unoptimized (optnone) IR; registered as a kernel by the
; !nvvm.annotations metadata. %j1 is spilled to %j1.addr but never loaded.
; With xidx/yidx the global x/y thread indices, threads with
; xidx >= Size - 1 - t or yidx >= Size - t (unsigned compares) exit early.
; Otherwise:
;   a_cuda[Size*(xidx+1+t) + (yidx+t)] -= m_cuda[Size*(xidx+1+t) + t] * a_cuda[Size*t + (yidx+t)]
; and, only when yidx == 0:
;   b_cuda[xidx+1+t] -= m_cuda[Size*(xidx+1+t) + (yidx+t)] * b_cuda[t]
; Function Attrs: convergent noinline nounwind optnone
|
||||
define dso_local void @_Z4Fan2PfS_S_iii(float* %m_cuda, float* %a_cuda, float* %b_cuda, i32 %Size, i32 %j1, i32 %t) #0 {
|
||||
entry:
|
||||
; Stack slots for the six arguments and the two locals xidx / yidx.
%m_cuda.addr = alloca float*, align 8
|
||||
%a_cuda.addr = alloca float*, align 8
|
||||
%b_cuda.addr = alloca float*, align 8
|
||||
%Size.addr = alloca i32, align 4
|
||||
%j1.addr = alloca i32, align 4
|
||||
%t.addr = alloca i32, align 4
|
||||
%xidx = alloca i32, align 4
|
||||
%yidx = alloca i32, align 4
|
||||
store float* %m_cuda, float** %m_cuda.addr, align 8
|
||||
store float* %a_cuda, float** %a_cuda.addr, align 8
|
||||
store float* %b_cuda, float** %b_cuda.addr, align 8
|
||||
store i32 %Size, i32* %Size.addr, align 4
|
||||
store i32 %j1, i32* %j1.addr, align 4
|
||||
store i32 %t, i32* %t.addr, align 4
|
||||
; First guard: threadIdx.x + blockIdx.x * blockDim.x >= Size - 1 - t (unsigned).
%call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
|
||||
%call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
|
||||
%call2 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
|
||||
%mul = mul i32 %call1, %call2
|
||||
%add = add i32 %call, %mul
|
||||
%0 = load i32, i32* %Size.addr, align 4
|
||||
%sub = sub nsw i32 %0, 1
|
||||
%1 = load i32, i32* %t.addr, align 4
|
||||
%sub3 = sub nsw i32 %sub, %1
|
||||
%cmp = icmp uge i32 %add, %sub3
|
||||
br i1 %cmp, label %if.then, label %if.end
|
||||
|
||||
if.then: ; preds = %entry
|
||||
br label %if.end58
|
||||
|
||||
if.end: ; preds = %entry
|
||||
; Second guard: threadIdx.y + blockIdx.y * blockDim.y >= Size - t (unsigned).
%call4 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #3
|
||||
%call5 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #3
|
||||
%call6 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #3
|
||||
%mul7 = mul i32 %call5, %call6
|
||||
%add8 = add i32 %call4, %mul7
|
||||
%2 = load i32, i32* %Size.addr, align 4
|
||||
%3 = load i32, i32* %t.addr, align 4
|
||||
%sub9 = sub nsw i32 %2, %3
|
||||
%cmp10 = icmp uge i32 %add8, %sub9
|
||||
br i1 %cmp10, label %if.then11, label %if.end12
|
||||
|
||||
if.then11: ; preds = %if.end
|
||||
br label %if.end58
|
||||
|
||||
if.end12: ; preds = %if.end
|
||||
; xidx = blockIdx.x * blockDim.x + threadIdx.x
%call13 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
|
||||
%call14 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
|
||||
%mul15 = mul i32 %call13, %call14
|
||||
%call16 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
|
||||
%add17 = add i32 %mul15, %call16
|
||||
store i32 %add17, i32* %xidx, align 4
|
||||
; yidx = blockIdx.y * blockDim.y + threadIdx.y
%call18 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #3
|
||||
%call19 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #3
|
||||
%mul20 = mul i32 %call18, %call19
|
||||
%call21 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #3
|
||||
%add22 = add i32 %mul20, %call21
|
||||
store i32 %add22, i32* %yidx, align 4
|
||||
; %9 = m_cuda[Size * (xidx + 1 + t) + t]
%4 = load float*, float** %m_cuda.addr, align 8
|
||||
%5 = load i32, i32* %Size.addr, align 4
|
||||
%6 = load i32, i32* %xidx, align 4
|
||||
%add23 = add nsw i32 %6, 1
|
||||
%7 = load i32, i32* %t.addr, align 4
|
||||
%add24 = add nsw i32 %add23, %7
|
||||
%mul25 = mul nsw i32 %5, %add24
|
||||
%8 = load i32, i32* %t.addr, align 4
|
||||
%add26 = add nsw i32 %mul25, %8
|
||||
%idxprom = sext i32 %add26 to i64
|
||||
%arrayidx = getelementptr inbounds float, float* %4, i64 %idxprom
|
||||
%9 = load float, float* %arrayidx, align 4
|
||||
; %15 = a_cuda[Size * t + (yidx + t)]
%10 = load float*, float** %a_cuda.addr, align 8
|
||||
%11 = load i32, i32* %Size.addr, align 4
|
||||
%12 = load i32, i32* %t.addr, align 4
|
||||
%mul27 = mul nsw i32 %11, %12
|
||||
%13 = load i32, i32* %yidx, align 4
|
||||
%14 = load i32, i32* %t.addr, align 4
|
||||
%add28 = add nsw i32 %13, %14
|
||||
%add29 = add nsw i32 %mul27, %add28
|
||||
%idxprom30 = sext i32 %add29 to i64
|
||||
%arrayidx31 = getelementptr inbounds float, float* %10, i64 %idxprom30
|
||||
%15 = load float, float* %arrayidx31, align 4
|
||||
%mul32 = fmul contract float %9, %15
|
||||
; Read-modify-write of a_cuda[Size * (xidx + 1 + t) + (yidx + t)].
%16 = load float*, float** %a_cuda.addr, align 8
|
||||
%17 = load i32, i32* %Size.addr, align 4
|
||||
%18 = load i32, i32* %xidx, align 4
|
||||
%add33 = add nsw i32 %18, 1
|
||||
%19 = load i32, i32* %t.addr, align 4
|
||||
%add34 = add nsw i32 %add33, %19
|
||||
%mul35 = mul nsw i32 %17, %add34
|
||||
%20 = load i32, i32* %yidx, align 4
|
||||
%21 = load i32, i32* %t.addr, align 4
|
||||
%add36 = add nsw i32 %20, %21
|
||||
%add37 = add nsw i32 %mul35, %add36
|
||||
%idxprom38 = sext i32 %add37 to i64
|
||||
%arrayidx39 = getelementptr inbounds float, float* %16, i64 %idxprom38
|
||||
%22 = load float, float* %arrayidx39, align 4
|
||||
%sub40 = fsub contract float %22, %mul32
|
||||
store float %sub40, float* %arrayidx39, align 4
|
||||
; Only threads with yidx == 0 go on to update b_cuda.
%23 = load i32, i32* %yidx, align 4
|
||||
%cmp41 = icmp eq i32 %23, 0
|
||||
br i1 %cmp41, label %if.then42, label %if.end58
|
||||
|
||||
if.then42: ; preds = %if.end12
|
||||
; %30 = m_cuda[Size * (xidx + 1 + t) + (yidx + t)]
%24 = load float*, float** %m_cuda.addr, align 8
|
||||
%25 = load i32, i32* %Size.addr, align 4
|
||||
%26 = load i32, i32* %xidx, align 4
|
||||
%add43 = add nsw i32 %26, 1
|
||||
%27 = load i32, i32* %t.addr, align 4
|
||||
%add44 = add nsw i32 %add43, %27
|
||||
%mul45 = mul nsw i32 %25, %add44
|
||||
%28 = load i32, i32* %yidx, align 4
|
||||
%29 = load i32, i32* %t.addr, align 4
|
||||
%add46 = add nsw i32 %28, %29
|
||||
%add47 = add nsw i32 %mul45, %add46
|
||||
%idxprom48 = sext i32 %add47 to i64
|
||||
%arrayidx49 = getelementptr inbounds float, float* %24, i64 %idxprom48
|
||||
%30 = load float, float* %arrayidx49, align 4
|
||||
; %33 = b_cuda[t]
%31 = load float*, float** %b_cuda.addr, align 8
|
||||
%32 = load i32, i32* %t.addr, align 4
|
||||
%idxprom50 = sext i32 %32 to i64
|
||||
%arrayidx51 = getelementptr inbounds float, float* %31, i64 %idxprom50
|
||||
%33 = load float, float* %arrayidx51, align 4
|
||||
%mul52 = fmul contract float %30, %33
|
||||
; Read-modify-write of b_cuda[xidx + 1 + t].
%34 = load float*, float** %b_cuda.addr, align 8
|
||||
%35 = load i32, i32* %xidx, align 4
|
||||
%add53 = add nsw i32 %35, 1
|
||||
%36 = load i32, i32* %t.addr, align 4
|
||||
%add54 = add nsw i32 %add53, %36
|
||||
%idxprom55 = sext i32 %add54 to i64
|
||||
%arrayidx56 = getelementptr inbounds float, float* %34, i64 %idxprom55
|
||||
%37 = load float, float* %arrayidx56, align 4
|
||||
%sub57 = fsub contract float %37, %mul52
|
||||
store float %sub57, float* %arrayidx56, align 4
|
||||
br label %if.end58
|
||||
|
||||
; Common exit for both early returns and the normal path.
if.end58: ; preds = %if.then, %if.then11, %if.then42, %if.end12
|
||||
ret void
|
||||
}
|
||||
|
||||
; Accessor behind threadIdx.y: returns the value of the
; llvm.nvvm.read.ptx.sreg.tid.y intrinsic unchanged.
; Function Attrs: alwaysinline convergent nounwind
|
||||
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
|
||||
entry:
|
||||
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
|
||||
ret i32 %0
|
||||
}
|
||||
|
||||
; Accessor behind blockIdx.y: returns the value of the
; llvm.nvvm.read.ptx.sreg.ctaid.y intrinsic unchanged.
; Function Attrs: alwaysinline convergent nounwind
|
||||
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
|
||||
entry:
|
||||
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
|
||||
ret i32 %0
|
||||
}
|
||||
|
||||
; Accessor behind blockDim.y: returns the value of the
; llvm.nvvm.read.ptx.sreg.ntid.y intrinsic unchanged.
; Function Attrs: alwaysinline convergent nounwind
|
||||
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #1 comdat align 2 {
|
||||
entry:
|
||||
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
|
||||
ret i32 %0
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y() #2
|
||||
|
||||
attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #2 = { nounwind readnone }
|
||||
attributes #3 = { convergent nounwind }
|
||||
|
||||
!llvm.module.flags = !{!0, !1, !2}
|
||||
!nvvm.annotations = !{!3, !4, !5, !6, !5, !7, !7, !7, !7, !8, !8, !7}
|
||||
!llvm.ident = !{!9}
|
||||
!nvvmir.version = !{!10}
|
||||
|
||||
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
|
||||
!1 = !{i32 1, !"wchar_size", i32 4}
|
||||
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
|
||||
!3 = !{void (float*, float*, i32, i32)* @_Z4Fan1PfS_ii, !"kernel", i32 1}
|
||||
!4 = !{void (float*, float*, float*, i32, i32, i32)* @_Z4Fan2PfS_S_iii, !"kernel", i32 1}
|
||||
!5 = !{null, !"align", i32 8}
|
||||
!6 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
|
||||
!7 = !{null, !"align", i32 16}
|
||||
!8 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
|
||||
!9 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
|
||||
!10 = !{i32 1, i32 4}
|
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,522 @@
|
|||
/*-----------------------------------------------------------
|
||||
** gaussian.cu -- The program is to solve a linear system Ax = b
|
||||
** by using Gaussian Elimination. The algorithm on page 101
|
||||
** ("Foundations of Parallel Programming") is used.
|
||||
** The sequential version is gaussian.c. This parallel
|
||||
** implementation converts three independent for() loops
|
||||
** into three Fans. Use the data file ge_3.dat to verify
|
||||
** the correctness of the output.
|
||||
**
|
||||
** Written by Andreas Kura, 02/15/95
|
||||
** Modified by Chong-wei Xu, 04/20/95
|
||||
** Modified by Chris Gregg for CUDA, 07/20/2009
|
||||
**-----------------------------------------------------------
|
||||
*/
|
||||
#include "cuda_runtime.h"
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <sys/time.h>
|
||||
|
||||
#ifdef TIMING
|
||||
#include "timing.h"
|
||||
#endif
|
||||
|
||||
#ifdef RD_WG_SIZE_0_0
|
||||
#define MAXBLOCKSIZE RD_WG_SIZE_0_0
|
||||
#elif defined(RD_WG_SIZE_0)
|
||||
#define MAXBLOCKSIZE RD_WG_SIZE_0
|
||||
#elif defined(RD_WG_SIZE)
|
||||
#define MAXBLOCKSIZE RD_WG_SIZE
|
||||
#else
|
||||
#define MAXBLOCKSIZE 512
|
||||
#endif
|
||||
|
||||
// 2D defines. Go from specific to general
|
||||
#ifdef RD_WG_SIZE_1_0
|
||||
#define BLOCK_SIZE_XY RD_WG_SIZE_1_0
|
||||
#elif defined(RD_WG_SIZE_1)
|
||||
#define BLOCK_SIZE_XY RD_WG_SIZE_1
|
||||
#elif defined(RD_WG_SIZE)
|
||||
#define BLOCK_SIZE_XY RD_WG_SIZE
|
||||
#else
|
||||
#define BLOCK_SIZE_XY 1
|
||||
#endif
|
||||
|
||||
#ifdef TIMING
|
||||
struct timeval tv;
|
||||
struct timeval tv_total_start, tv_total_end;
|
||||
struct timeval tv_h2d_start, tv_h2d_end;
|
||||
struct timeval tv_d2h_start, tv_d2h_end;
|
||||
struct timeval tv_kernel_start, tv_kernel_end;
|
||||
struct timeval tv_mem_alloc_start, tv_mem_alloc_end;
|
||||
struct timeval tv_close_start, tv_close_end;
|
||||
float init_time = 0, mem_alloc_time = 0, h2d_time = 0, kernel_time = 0,
|
||||
d2h_time = 0, close_time = 0, total_time = 0;
|
||||
#endif
|
||||
|
||||
int Size;
|
||||
float *a, *b, *finalVec;
|
||||
float *m;
|
||||
|
||||
FILE *fp;
|
||||
|
||||
void InitProblemOnce(char *filename);
|
||||
void InitPerRun();
|
||||
void ForwardSub();
|
||||
void BackSub();
|
||||
__global__ void Fan1(float *m, float *a, int Size, int t);
|
||||
__global__ void Fan2(float *m, float *a, float *b, int Size, int j1, int t);
|
||||
void InitMat(float *ary, int nrow, int ncol);
|
||||
void InitAry(float *ary, int ary_size);
|
||||
void PrintMat(float *ary, int nrow, int ncolumn);
|
||||
void PrintAry(float *ary, int ary_size);
|
||||
void PrintDeviceProperties();
|
||||
void checkCUDAError(const char *msg);
|
||||
|
||||
unsigned int totalKernelTime = 0;
|
||||
|
||||
// create both matrix and right hand side, Ke Wang 2013/08/12 11:51:06
|
||||
/**
 * Fills the size x size row-major matrix `m` with a symmetric Toeplitz
 * pattern: element (i, j) is 10 * exp(lamda * |i - j|) with lamda = -0.01,
 * so the diagonal is 10 and values decay with distance from the diagonal.
 *
 * @param m     destination buffer holding at least size * size floats
 * @param size  matrix dimension; nothing is written when size <= 0
 *
 * The coefficient table is heap-allocated instead of using a
 * variable-length array, which is not standard C++ and would live on the
 * stack (and have a negative length for size <= 0). The process exits with
 * status 1 if the table cannot be allocated.
 */
void create_matrix(float *m, int size) {
  if (size <= 0)
    return;

  const float lamda = -0.01;

  // coe[k] is the value shared by every element at distance k from the
  // diagonal; the original table of 2 * size - 1 entries was a mirror image
  // of these size values.
  float *coe = (float *)malloc((size_t)size * sizeof(float));
  if (coe == NULL) {
    fprintf(stderr, "create_matrix: out of memory (size = %d)\n", size);
    exit(1);
  }
  for (int k = 0; k < size; k++)
    coe[k] = 10 * exp(lamda * k);

  for (int i = 0; i < size; i++) {
    for (int j = 0; j < size; j++) {
      const int dist = (j >= i) ? (j - i) : (i - j);
      m[(size_t)i * size + j] = coe[dist];
    }
  }

  free(coe);
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
printf("WG size of kernel 1 = %d, WG size of kernel 2= %d X %d\n",
|
||||
MAXBLOCKSIZE, BLOCK_SIZE_XY, BLOCK_SIZE_XY);
|
||||
int verbose = 1;
|
||||
int i, j;
|
||||
char flag;
|
||||
if (argc < 2) {
|
||||
printf("Usage: gaussian -f filename / -s size [-q]\n\n");
|
||||
printf("-q (quiet) suppresses printing the matrix and result values.\n");
|
||||
printf("-f (filename) path of input file\n");
|
||||
printf(
|
||||
"-s (size) size of matrix. Create matrix and rhs in this program \n");
|
||||
printf(
|
||||
"The first line of the file contains the dimension of the matrix, n.");
|
||||
printf("The second line of the file is a newline.\n");
|
||||
printf("The next n lines contain n tab separated values for the matrix.");
|
||||
printf("The next line of the file is a newline.\n");
|
||||
printf("The next line of the file is a 1xn vector with tab separated "
|
||||
"values.\n");
|
||||
printf("The next line of the file is a newline. (optional)\n");
|
||||
printf("The final line of the file is the pre-computed solution. "
|
||||
"(optional)\n");
|
||||
printf("Example: matrix4.txt:\n");
|
||||
printf("4\n");
|
||||
printf("\n");
|
||||
printf("-0.6 -0.5 0.7 0.3\n");
|
||||
printf("-0.3 -0.9 0.3 0.7\n");
|
||||
printf("-0.4 -0.5 -0.3 -0.8\n");
|
||||
printf("0.0 -0.1 0.2 0.9\n");
|
||||
printf("\n");
|
||||
printf("-0.85 -0.68 0.24 -0.53\n");
|
||||
printf("\n");
|
||||
printf("0.7 0.0 -0.4 -0.5\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
cudaSetDevice(0);
|
||||
|
||||
PrintDeviceProperties();
|
||||
// char filename[100];
|
||||
// sprintf(filename,"matrices/matrix%d.txt",size);
|
||||
|
||||
for (i = 1; i < argc; i++) {
|
||||
if (argv[i][0] == '-') { // flag
|
||||
flag = argv[i][1];
|
||||
switch (flag) {
|
||||
case 's': // platform
|
||||
i++;
|
||||
Size = atoi(argv[i]);
|
||||
printf("Create matrix internally in parse, size = %d \n", Size);
|
||||
|
||||
a = (float *)malloc(Size * Size * sizeof(float));
|
||||
create_matrix(a, Size);
|
||||
|
||||
b = (float *)malloc(Size * sizeof(float));
|
||||
for (j = 0; j < Size; j++)
|
||||
b[j] = 1.0;
|
||||
|
||||
m = (float *)malloc(Size * Size * sizeof(float));
|
||||
break;
|
||||
case 'f': // platform
|
||||
i++;
|
||||
printf("Read file from %s \n", argv[i]);
|
||||
InitProblemOnce(argv[i]);
|
||||
break;
|
||||
case 'q': // quiet
|
||||
verbose = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// InitProblemOnce(filename);
|
||||
|
||||
InitPerRun();
|
||||
// begin timing
|
||||
struct timeval time_start;
|
||||
gettimeofday(&time_start, NULL);
|
||||
|
||||
// run kernels
|
||||
ForwardSub();
|
||||
|
||||
// end timing
|
||||
struct timeval time_end;
|
||||
gettimeofday(&time_end, NULL);
|
||||
unsigned int time_total = (time_end.tv_sec * 1000000 + time_end.tv_usec) -
|
||||
(time_start.tv_sec * 1000000 + time_start.tv_usec);
|
||||
|
||||
if (verbose) {
|
||||
printf("Matrix m is: \n");
|
||||
PrintMat(m, Size, Size);
|
||||
|
||||
printf("Matrix a is: \n");
|
||||
PrintMat(a, Size, Size);
|
||||
|
||||
printf("Array b is: \n");
|
||||
PrintAry(b, Size);
|
||||
}
|
||||
BackSub();
|
||||
if (verbose) {
|
||||
printf("The final solution is: \n");
|
||||
PrintAry(finalVec, Size);
|
||||
}
|
||||
printf("\nTime total (including memory transfers)\t%f sec\n",
|
||||
time_total * 1e-6);
|
||||
printf("Time for CUDA kernels:\t%f sec\n", totalKernelTime * 1e-6);
|
||||
|
||||
/*printf("%d,%d\n",size,time_total);
|
||||
fprintf(stderr,"%d,%d\n",size,time_total);*/
|
||||
|
||||
free(m);
|
||||
free(a);
|
||||
free(b);
|
||||
|
||||
#ifdef TIMING
|
||||
printf("Exec: %f\n", kernel_time);
|
||||
#endif
|
||||
}
|
||||
/*------------------------------------------------------
|
||||
** PrintDeviceProperties
|
||||
**-----------------------------------------------------
|
||||
*/
|
||||
void PrintDeviceProperties() {
|
||||
cudaDeviceProp deviceProp;
|
||||
int nDevCount = 0;
|
||||
|
||||
cudaGetDeviceCount(&nDevCount);
|
||||
printf("Total Device found: %d", nDevCount);
|
||||
for (int nDeviceIdx = 0; nDeviceIdx < nDevCount; ++nDeviceIdx) {
|
||||
memset(&deviceProp, 0, sizeof(deviceProp));
|
||||
if (cudaSuccess == cudaGetDeviceProperties(&deviceProp, nDeviceIdx)) {
|
||||
printf("\nDevice Name \t\t - %s ", deviceProp.name);
|
||||
printf("\n**************************************");
|
||||
printf("\nTotal Global Memory\t\t\t - %lu KB",
|
||||
deviceProp.totalGlobalMem / 1024);
|
||||
printf("\nShared memory available per block \t - %lu KB",
|
||||
deviceProp.sharedMemPerBlock / 1024);
|
||||
printf("\nNumber of registers per thread block \t - %d",
|
||||
deviceProp.regsPerBlock);
|
||||
printf("\nWarp size in threads \t\t\t - %d", deviceProp.warpSize);
|
||||
printf("\nMemory Pitch \t\t\t\t - %zu bytes", deviceProp.memPitch);
|
||||
printf("\nMaximum threads per block \t\t - %d",
|
||||
deviceProp.maxThreadsPerBlock);
|
||||
printf("\nMaximum Thread Dimension (block) \t - %d %d %d",
|
||||
deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1],
|
||||
deviceProp.maxThreadsDim[2]);
|
||||
printf("\nMaximum Thread Dimension (grid) \t - %d %d %d",
|
||||
deviceProp.maxGridSize[0], deviceProp.maxGridSize[1],
|
||||
deviceProp.maxGridSize[2]);
|
||||
printf("\nTotal constant memory \t\t\t - %zu bytes",
|
||||
deviceProp.totalConstMem);
|
||||
printf("\nCUDA ver \t\t\t\t - %d.%d", deviceProp.major, deviceProp.minor);
|
||||
printf("\nClock rate \t\t\t\t - %d KHz", deviceProp.clockRate);
|
||||
printf("\nTexture Alignment \t\t\t - %zu bytes",
|
||||
deviceProp.textureAlignment);
|
||||
printf("\nDevice Overlap \t\t\t\t - %s",
|
||||
deviceProp.deviceOverlap ? "Allowed" : "Not Allowed");
|
||||
printf("\nNumber of Multi processors \t\t - %d\n\n",
|
||||
deviceProp.multiProcessorCount);
|
||||
} else
|
||||
printf("\n%s", cudaGetErrorString(cudaGetLastError()));
|
||||
}
|
||||
}
|
||||
|
||||
/*------------------------------------------------------
|
||||
** InitProblemOnce -- Initialize all of matrices and
|
||||
** vectors by opening a data file specified by the user.
|
||||
**
|
||||
** We used dynamic array *a, *b, and *m to allocate
|
||||
** the memory storages.
|
||||
**------------------------------------------------------
|
||||
*/
|
||||
// Loads the problem from `filename`: reads the dimension into the global
// Size, allocates the global buffers a (Size x Size), b (Size) and
// m (Size x Size), fills a and b through InitMat() / InitAry(), and echoes
// both to stdout.
//
// Exits with status 1 when the file cannot be opened, when no positive
// dimension can be read from it, or when an allocation fails.
//
// The global fp is left open on return.
// NOTE(review): InitMat/InitAry take no FILE* argument, so they presumably
// read from the global fp - confirm before adding an fclose() here.
void InitProblemOnce(char *filename) {
  printf("The file name is: %s\n", filename);

  fp = fopen(filename, "r");
  if (fp == NULL) {
    perror(filename);
    exit(1);
  }

  if (fscanf(fp, "%d", &Size) != 1 || Size <= 0) {
    fprintf(stderr, "%s: could not read a positive matrix dimension\n",
            filename);
    exit(1);
  }

  // Do the size arithmetic in size_t so Size * Size cannot overflow int.
  a = (float *)malloc((size_t)Size * Size * sizeof(float));
  b = (float *)malloc((size_t)Size * sizeof(float));
  m = (float *)malloc((size_t)Size * Size * sizeof(float));
  if (a == NULL || b == NULL || m == NULL) {
    fprintf(stderr, "Out of memory allocating a %d x %d system\n", Size, Size);
    exit(1);
  }

  InitMat(a, Size, Size);
  printf("The input matrix a is:\n");
  PrintMat(a, Size, Size);

  InitAry(b, Size);
  printf("The input array b is:\n");
  PrintAry(b, Size);
}
|
||||
|
||||
/*------------------------------------------------------
|
||||
** InitPerRun() -- Initialize the contents of the
|
||||
** multiplier matrix **m
|
||||
**------------------------------------------------------
|
||||
*/
|
||||
void InitPerRun() {
|
||||
int i;
|
||||
for (i = 0; i < Size * Size; i++)
|
||||
*(m + i) = 0.0;
|
||||
}
|
||||
|
||||
/*-------------------------------------------------------
|
||||
** Fan1() -- Calculate multiplier matrix
|
||||
** Pay attention to the index. Index i gives the range,
|
||||
** which runs from 0 to range-1. The real values of
|
||||
** the index must be adjusted by the value
|
||||
** of t, which is defined in ForwardSub().
|
||||
**-------------------------------------------------------
|
||||
*/
|
||||
// Kernel: for global x index gid, writes the ratio of column-t entries
//   m_cuda[Size * (gid + t + 1) + t] =
//       a_cuda[Size * (gid + t + 1) + t] / a_cuda[Size * t + t]
// Threads whose gid is not below Size - 1 - t do nothing.
__global__ void Fan1(float *m_cuda, float *a_cuda, int Size, int t) {
  const unsigned int gid = threadIdx.x + blockIdx.x * blockDim.x;
  if (gid >= Size - 1 - t)
    return;

  // Row handled by this thread; kept unsigned, as in the index expression
  // built from the thread/block builtins.
  const unsigned int row = gid + t + 1;
  float *m_row = m_cuda + Size * row;
  float *a_row = a_cuda + Size * row;
  float *pivot_row = a_cuda + Size * t;

  m_row[t] = a_row[t] / pivot_row[t];
}
|
||||
|
||||
/*-------------------------------------------------------
|
||||
** Fan2() -- Modify the matrix A into LUD
|
||||
**-------------------------------------------------------
|
||||
*/
|
||||
|
||||
// Kernel: with (xidx, yidx) the global x/y thread indices, row = xidx + 1 + t
// and col = yidx + t, performs
//   a_cuda[Size * row + col] -= m_cuda[Size * row + t] * a_cuda[Size * t + col]
// and, for threads with yidx == 0 only,
//   b_cuda[row] -= m_cuda[Size * row + col] * b_cuda[t]
// Threads with xidx >= Size - 1 - t or yidx >= Size - t do nothing.
// The j1 parameter is not used.
__global__ void Fan2(float *m_cuda, float *a_cuda, float *b_cuda, int Size,
                     int j1, int t) {
  const unsigned int gx = threadIdx.x + blockIdx.x * blockDim.x;
  const unsigned int gy = threadIdx.y + blockIdx.y * blockDim.y;
  if (gx >= Size - 1 - t || gy >= Size - t)
    return;

  const int xidx = blockIdx.x * blockDim.x + threadIdx.x;
  const int yidx = blockIdx.y * blockDim.y + threadIdx.y;
  const int row = xidx + 1 + t;
  const int col = yidx + t;

  a_cuda[Size * row + col] -=
      m_cuda[Size * row + t] * a_cuda[Size * t + col];

  if (yidx != 0)
    return;

  // One thread per row (yidx == 0) also updates the right-hand side.
  b_cuda[row] -= m_cuda[Size * row + col] * b_cuda[t];
}
|
||||
|
||||
/*------------------------------------------------------
|
||||
** ForwardSub() -- Forward substitution of Gaussian
|
||||
** elimination.
|
||||
**------------------------------------------------------
|
||||
*/
|
||||
void ForwardSub() {
|
||||
int t;
|
||||
float *m_cuda, *a_cuda, *b_cuda;
|
||||
|
||||
int A = 1;
|
||||
int B = 2;
|
||||
int C = 3;
|
||||
int D = 4;
|
||||
int E = 5;
|
||||
int F = 6;
|
||||
// printf("blockIDx.x: %d, threadIdx.x: %d, Size: %d, t: %d, Size-1-t: %d\n",
|
||||
// A, B, C, D, E); printf("blockIdx.x: %d, threadIdx.x: %d, blockIdx.y: %d,
|
||||
// threadIdx.y: %d, blockDim.x: %d, blockDim.y: %d\n", A , B, C, D, E, F);
|
||||
|
||||
// allocate memory on GPU
|
||||
cudaMalloc((void **)&m_cuda, Size * Size * sizeof(float));
|
||||
|
||||
cudaMalloc((void **)&a_cuda, Size * Size * sizeof(float));
|
||||
|
||||
cudaMalloc((void **)&b_cuda, Size * sizeof(float));
|
||||
|
||||
// copy memory to GPU
|
||||
cudaMemcpy(m_cuda, m, Size * Size * sizeof(float), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(a_cuda, a, Size * Size * sizeof(float), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(b_cuda, b, Size * sizeof(float), cudaMemcpyHostToDevice);
|
||||
|
||||
int block_size, grid_size;
|
||||
|
||||
block_size = MAXBLOCKSIZE;
|
||||
grid_size = (Size / block_size) + (!(Size % block_size) ? 0 : 1);
|
||||
printf("1d grid size: %d\n", grid_size);
|
||||
|
||||
dim3 dimBlock(block_size);
|
||||
dim3 dimGrid(grid_size);
|
||||
// dim3 dimGrid( (N/dimBlock.x) + (!(N%dimBlock.x)?0:1) );
|
||||
|
||||
int blockSize2d, gridSize2d;
|
||||
blockSize2d = BLOCK_SIZE_XY;
|
||||
gridSize2d = (Size / blockSize2d) + (!(Size % blockSize2d ? 0 : 1));
|
||||
|
||||
dim3 dimBlockXY(blockSize2d, blockSize2d);
|
||||
|
||||
printf("BlockXY: %d \n", blockSize2d);
|
||||
dim3 dimGridXY(gridSize2d, gridSize2d);
|
||||
|
||||
#ifdef TIMING
|
||||
gettimeofday(&tv_kernel_start, NULL);
|
||||
#endif
|
||||
printf("first grid size: %d second: %d\n", grid_size, gridSize2d);
|
||||
// begin timing kernels
|
||||
struct timeval time_start;
|
||||
gettimeofday(&time_start, NULL);
|
||||
for (t = 0; t < (Size - 1); t++) {
|
||||
Fan1<<<dimGrid, dimBlock>>>(m_cuda, a_cuda, Size, t);
|
||||
cudaDeviceSynchronize();
|
||||
Fan2<<<dimGridXY, dimBlockXY>>>(m_cuda, a_cuda, b_cuda, Size, Size - t, t);
|
||||
cudaDeviceSynchronize();
|
||||
checkCUDAError("Fan2");
|
||||
}
|
||||
// end timing kernels
|
||||
struct timeval time_end;
|
||||
gettimeofday(&time_end, NULL);
|
||||
totalKernelTime = (time_end.tv_sec * 1000000 + time_end.tv_usec) -
|
||||
(time_start.tv_sec * 1000000 + time_start.tv_usec);
|
||||
|
||||
#ifdef TIMING
|
||||
tvsub(&time_end, &tv_kernel_start, &tv);
|
||||
kernel_time += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
|
||||
#endif
|
||||
|
||||
// copy memory back to CPU
|
||||
cudaMemcpy(m, m_cuda, Size * Size * sizeof(float), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(a, a_cuda, Size * Size * sizeof(float), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(b, b_cuda, Size * sizeof(float), cudaMemcpyDeviceToHost);
|
||||
cudaFree(m_cuda);
|
||||
cudaFree(a_cuda);
|
||||
cudaFree(b_cuda);
|
||||
}
|
||||
|
||||
/*------------------------------------------------------
|
||||
** BackSub() -- Backward substitution
|
||||
**------------------------------------------------------
|
||||
*/
|
||||
|
||||
void BackSub() {
|
||||
// create a new vector to hold the final answer
|
||||
finalVec = (float *)malloc(Size * sizeof(float));
|
||||
// solve "bottom up"
|
||||
int i, j;
|
||||
for (i = 0; i < Size; i++) {
|
||||
finalVec[Size - i - 1] = b[Size - i - 1];
|
||||
for (j = 0; j < i; j++) {
|
||||
finalVec[Size - i - 1] -= *(a + Size * (Size - i - 1) + (Size - j - 1)) *
|
||||
finalVec[Size - j - 1];
|
||||
}
|
||||
finalVec[Size - i - 1] =
|
||||
finalVec[Size - i - 1] / *(a + Size * (Size - i - 1) + (Size - i - 1));
|
||||
}
|
||||
}
|
||||
|
||||
void InitMat(float *ary, int nrow, int ncol) {
|
||||
int i, j;
|
||||
|
||||
for (i = 0; i < nrow; i++) {
|
||||
for (j = 0; j < ncol; j++) {
|
||||
fscanf(fp, "%f", ary + Size * i + j);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*------------------------------------------------------
|
||||
** PrintMat() -- Print the contents of the matrix
|
||||
**------------------------------------------------------
|
||||
*/
|
||||
void PrintMat(float *ary, int nrow, int ncol) {
|
||||
return;
|
||||
int i, j;
|
||||
|
||||
for (i = 0; i < nrow; i++) {
|
||||
for (j = 0; j < ncol; j++) {
|
||||
printf("%8.2f ", *(ary + Size * i + j));
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
/*------------------------------------------------------
|
||||
** InitAry() -- Initialize the array (vector) by reading
|
||||
** data from the data file
|
||||
**------------------------------------------------------
|
||||
*/
|
||||
void InitAry(float *ary, int ary_size) {
|
||||
int i;
|
||||
|
||||
for (i = 0; i < ary_size; i++) {
|
||||
fscanf(fp, "%f", &ary[i]);
|
||||
}
|
||||
}
|
||||
|
||||
/*------------------------------------------------------
 ** PrintAry() -- Print the contents of the array (vector)
 **
 ** Each of the ary_size elements is written to stdout with
 ** two decimals and a trailing space; a blank line follows.
 **------------------------------------------------------
 */
void PrintAry(float *ary, int ary_size) {
  int idx = 0;
  while (idx < ary_size) {
    printf("%.2f ", *(ary + idx));
    ++idx;
  }
  printf("\n\n");
}
|
||||
void checkCUDAError(const char *msg) {
|
||||
cudaError_t err = cudaGetLastError();
|
||||
if (cudaSuccess != err) {
|
||||
fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,23 @@
|
|||
#!/bin/bash
# Builds the gaussian benchmark from its LLVM IR, runs it on the 4x4 sample
# matrix and checks the printed solution vector.
set -e

# assemble the textual IR, then translate the kernel and host modules
llvm-as gaussian-cuda-nvptx64-nvidia-cuda-sm_61.ll
llvm-as gaussian-host-x86_64-unknown-linux-gnu.ll
../../build/compilation/kernelTranslator gaussian-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
../../build/compilation/hostTranslator gaussian-host-x86_64-unknown-linux-gnu.bc host.bc

# lower both modules to position-independent object files
llc --relocation-model=pic --filetype=obj kernel.bc
llc --relocation-model=pic --filetype=obj host.bc

g++ -Wall -L../../build/runtime \
  -L../../build/runtime/threadPool \
  -o gaussian -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread

# Append the previous value only when it is non-empty: a trailing ':' would
# add the current directory to the library search path.
export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}

# Truncate the log so that output left over from an earlier run cannot
# satisfy the check below.
./gaussian -f ../../rodinia-data/gaussian/matrix4.txt > res.log

# -F: match the expected vector literally ('.' must not act as a wildcard)
if grep -qF "0.70 0.00 -0.40 -0.50" res.log; then
  echo "Pass"
else
  echo "Error result"
  exit 1
fi
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,317 @@
|
|||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/*
|
||||
* avilib.h
|
||||
*
|
||||
* Copyright (C) Thomas Östreich - June 2001
|
||||
* multiple audio track support Copyright (C) 2002 Thomas Östreich
|
||||
*
|
||||
* Original code:
|
||||
* Copyright (C) 1999 Rainer Johanni <Rainer@Johanni.de>
|
||||
*
|
||||
* This file is part of transcode, a linux video stream processing tool
|
||||
*
|
||||
* transcode is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2, or (at your option)
|
||||
* any later version.
|
||||
*
|
||||
* transcode is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with GNU Make; see the file COPYING. If not, write to
|
||||
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <fcntl.h>
|
||||
#include <stdio.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
// #include <windows.h>
|
||||
#include <errno.h>
|
||||
#include <inttypes.h>
|
||||
#include <limits.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifndef AVILIB_H
|
||||
#define AVILIB_H
|
||||
|
||||
/* Number of slots in the audio track table of avi_t (avi_t.track). */
#define AVI_MAX_TRACKS 8
|
||||
|
||||
/*
 * Element type of avi_t.video_index.
 * NOTE(review): the field meanings are not stated in this header;
 * presumably key = key-frame marker, pos = file offset, len = chunk
 * length -- confirm against avilib.c.
 */
typedef struct {
  unsigned long key, pos, len;
} video_index_entry;
|
||||
|
||||
/*
 * Element type of track_t.audio_index.
 * NOTE(review): the field meanings are not stated in this header;
 * presumably pos = file offset, len = chunk length, tot = running byte
 * total -- confirm against avilib.c.
 */
typedef struct {
  unsigned long pos, len, tot;
} audio_index_entry;
|
||||
|
||||
typedef struct track_s {
|
||||
|
||||
long a_fmt; /* Audio format, see #defines below */
|
||||
long a_chans; /* Audio channels, 0 for no audio */
|
||||
long a_rate; /* Rate in Hz */
|
||||
long a_bits; /* bits per audio sample */
|
||||
long mp3rate; /* mp3 bitrate kbs*/
|
||||
|
||||
long audio_strn; /* Audio stream number */
|
||||
long audio_bytes; /* Total number of bytes of audio data */
|
||||
long audio_chunks; /* Chunks of audio data in the file */
|
||||
|
||||
char audio_tag[4]; /* Tag of audio data */
|
||||
long audio_posc; /* Audio position: chunk */
|
||||
long audio_posb; /* Audio position: byte within chunk */
|
||||
|
||||
long a_codech_off; /* absolut offset of audio codec information */
|
||||
long a_codecf_off; /* absolut offset of audio codec information */
|
||||
|
||||
audio_index_entry *audio_index;
|
||||
|
||||
} track_t;
|
||||
|
||||
typedef struct {
|
||||
|
||||
long fdes; /* File descriptor of AVI file */
|
||||
long mode; /* 0 for reading, 1 for writing */
|
||||
|
||||
long width; /* Width of a video frame */
|
||||
long height; /* Height of a video frame */
|
||||
double fps; /* Frames per second */
|
||||
char compressor[8]; /* Type of compressor, 4 bytes + padding for 0 byte */
|
||||
char compressor2[8]; /* Type of compressor, 4 bytes + padding for 0 byte */
|
||||
long video_strn; /* Video stream number */
|
||||
long video_frames; /* Number of video frames */
|
||||
char video_tag[4]; /* Tag of video data */
|
||||
long video_pos; /* Number of next frame to be read
|
||||
(if index present) */
|
||||
|
||||
unsigned long max_len; /* maximum video chunk present */
|
||||
|
||||
track_t track[AVI_MAX_TRACKS]; // up to AVI_MAX_TRACKS audio tracks supported
|
||||
|
||||
unsigned long pos; /* position in file */
|
||||
long n_idx; /* number of index entries actually filled */
|
||||
long max_idx; /* number of index entries actually allocated */
|
||||
|
||||
long v_codech_off; /* absolut offset of video codec (strh) info */
|
||||
long v_codecf_off; /* absolut offset of video codec (strf) info */
|
||||
|
||||
unsigned char (*idx)[16]; /* index entries (AVI idx1 tag) */
|
||||
video_index_entry *video_index;
|
||||
|
||||
unsigned long last_pos; /* Position of last frame written */
|
||||
unsigned long last_len; /* Length of last frame written */
|
||||
int must_use_index; /* Flag if frames are duplicated */
|
||||
unsigned long movi_start;
|
||||
|
||||
int anum; // total number of audio tracks
|
||||
int aptr; // current audio working track
|
||||
|
||||
} avi_t;
|
||||
|
||||
/* Open modes (presumably the values held in avi_t.mode -- confirm). */
#define AVI_MODE_WRITE 0
#define AVI_MODE_READ 1

/* The error codes delivered by avi_open_input_file */

/* The write of the data would exceed the maximum size of the AVI file.
   This is more a warning than an error since the file may be closed
   safely. */
#define AVI_ERR_SIZELIM 1

/* Error opening the AVI file - wrong path name or file not
   readable/writable. */
#define AVI_ERR_OPEN 2

/* Error reading from AVI file. */
#define AVI_ERR_READ 3

/* Error writing to AVI file, disk full ??? */
#define AVI_ERR_WRITE 4

/* Could not write index to AVI file during close, file may still be
   usable. */
#define AVI_ERR_WRITE_INDEX 5

/* Could not write header to AVI file or not truncate the file during
   close, file is most probably corrupted. */
#define AVI_ERR_CLOSE 6

/* Operation not permitted: trying to read from a file open for writing
   or vice versa. */
#define AVI_ERR_NOT_PERM 7

/* malloc failed. */
#define AVI_ERR_NO_MEM 8

/* Not an AVI file. */
#define AVI_ERR_NO_AVI 9

/* AVI file has no header list, corrupted ??? */
#define AVI_ERR_NO_HDRL 10

/* AVI file has no MOVI list, corrupted ??? */
#define AVI_ERR_NO_MOVI 11

/* AVI file contains no video data. */
#define AVI_ERR_NO_VIDS 12

/* The file has been opened with getIndex==0, but an operation has been
   performed that needs an index. */
#define AVI_ERR_NO_IDX 13
|
||||
|
||||
/* Possible Audio formats */

/* The whole table is skipped when WAVE_FORMAT_PCM is already defined,
   so an earlier definition of these tags takes precedence. */
#ifndef WAVE_FORMAT_PCM
#define WAVE_FORMAT_UNKNOWN        (0x0000)
#define WAVE_FORMAT_PCM            (0x0001)
#define WAVE_FORMAT_ADPCM          (0x0002)
#define WAVE_FORMAT_IBM_CVSD       (0x0005)
#define WAVE_FORMAT_ALAW           (0x0006)
#define WAVE_FORMAT_MULAW          (0x0007)
#define WAVE_FORMAT_OKI_ADPCM      (0x0010)
#define WAVE_FORMAT_DVI_ADPCM      (0x0011)
#define WAVE_FORMAT_DIGISTD        (0x0015)
#define WAVE_FORMAT_DIGIFIX        (0x0016)
#define WAVE_FORMAT_YAMAHA_ADPCM   (0x0020)
#define WAVE_FORMAT_DSP_TRUESPEECH (0x0022)
#define WAVE_FORMAT_GSM610         (0x0031)
#define IBM_FORMAT_MULAW           (0x0101)
#define IBM_FORMAT_ALAW            (0x0102)
#define IBM_FORMAT_ADPCM           (0x0103)
#endif /* WAVE_FORMAT_PCM */
|
||||
|
||||
/*
 * Public entry points of the AVI library.  Only the declarations live
 * here; the implementations are not part of this header.  The groups
 * below follow the blank-line grouping of the original list.
 */

/* --- output file: open, configure, write, close --- */
avi_t *AVI_open_output_file(char *filename);
void AVI_set_video(avi_t *AVI, int width, int height, double fps,
                   char *compressor);
void AVI_set_audio(avi_t *AVI, int channels, long rate, int bits,
                   int format, long mp3rate);
int AVI_write_frame(avi_t *AVI, char *data, long bytes, int keyframe);
int AVI_dup_frame(avi_t *AVI);
int AVI_write_audio(avi_t *AVI, char *data, long bytes);
int AVI_append_audio(avi_t *AVI, char *data, long bytes);
long AVI_bytes_remain(avi_t *AVI);
int AVI_close(avi_t *AVI);
long AVI_bytes_written(avi_t *AVI);

/* --- input file: open and query video properties --- */
avi_t *AVI_open_input_file(char *filename, int getIndex);
avi_t *AVI_open_fd(int fd, int getIndex);
int avi_parse_input_file(avi_t *AVI, int getIndex);
long AVI_audio_mp3rate(avi_t *AVI);
long AVI_video_frames(avi_t *AVI);
int AVI_video_width(avi_t *AVI);
int AVI_video_height(avi_t *AVI);
double AVI_frame_rate(avi_t *AVI);
char *AVI_video_compressor(avi_t *AVI);

/* --- audio properties --- */
int AVI_audio_channels(avi_t *AVI);
int AVI_audio_bits(avi_t *AVI);
int AVI_audio_format(avi_t *AVI);
long AVI_audio_rate(avi_t *AVI);
long AVI_audio_bytes(avi_t *AVI);
long AVI_audio_chunks(avi_t *AVI);

long AVI_max_video_chunk(avi_t *AVI);

/* --- frame sizes, positioning and reading --- */
long AVI_frame_size(avi_t *AVI, long frame);
long AVI_audio_size(avi_t *AVI, long frame);
int AVI_seek_start(avi_t *AVI);
int AVI_set_video_position(avi_t *AVI, long frame);
long AVI_get_video_position(avi_t *AVI, long frame);
long AVI_read_frame(avi_t *AVI, char *vidbuf, int *keyframe);

int AVI_set_audio_position(avi_t *AVI, long byte);
int AVI_set_audio_bitrate(avi_t *AVI, long bitrate);

long AVI_read_audio(avi_t *AVI, char *audbuf, long bytes);

/* --- codec information offsets --- */
long AVI_audio_codech_offset(avi_t *AVI);
long AVI_audio_codecf_offset(avi_t *AVI);
long AVI_video_codech_offset(avi_t *AVI);
long AVI_video_codecf_offset(avi_t *AVI);

int AVI_read_data(avi_t *AVI, char *vidbuf, long max_vidbuf,
                  char *audbuf, long max_audbuf, long *len);

/* --- error reporting --- */
void AVI_print_error(char *str);
char *AVI_strerror();
char *AVI_syserror();

/* --- whole-file utilities --- */
int AVI_scan(char *name);
int AVI_dump(char *name, int mode);

char *AVI_codec2str(short cc);
int AVI_file_check(char *import_file);

void AVI_info(avi_t *avifile);
uint64_t AVI_max_size();
int avi_update_header(avi_t *AVI);

/* --- audio track selection --- */
int AVI_set_audio_track(avi_t *AVI, int track);
int AVI_get_audio_track(avi_t *AVI);
int AVI_audio_tracks(avi_t *AVI);
|
||||
|
||||
struct riff_struct {
|
||||
unsigned char id[4]; /* RIFF */
|
||||
unsigned long len;
|
||||
unsigned char wave_id[4]; /* WAVE */
|
||||
};
|
||||
|
||||
struct chunk_struct {
|
||||
unsigned char id[4];
|
||||
unsigned long len;
|
||||
};
|
||||
|
||||
struct common_struct {
|
||||
unsigned short wFormatTag;
|
||||
unsigned short wChannels;
|
||||
unsigned long dwSamplesPerSec;
|
||||
unsigned long dwAvgBytesPerSec;
|
||||
unsigned short wBlockAlign;
|
||||
unsigned short wBitsPerSample; /* Only for PCM */
|
||||
};
|
||||
|
||||
struct wave_header {
|
||||
struct riff_struct riff;
|
||||
struct chunk_struct format;
|
||||
struct common_struct common;
|
||||
struct chunk_struct data;
|
||||
};
|
||||
|
||||
struct AVIStreamHeader {
|
||||
long fccType;
|
||||
long fccHandler;
|
||||
long dwFlags;
|
||||
long dwPriority;
|
||||
long dwInitialFrames;
|
||||
long dwScale;
|
||||
long dwRate;
|
||||
long dwStart;
|
||||
long dwLength;
|
||||
long dwSuggestedBufferSize;
|
||||
long dwQuality;
|
||||
long dwSampleSize;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
|
@ -0,0 +1,130 @@
|
|||
// #ifdef __cplusplus
|
||||
// extern "C" {
|
||||
// #endif
|
||||
|
||||
//===============================================================================================================================================================================================================
|
||||
// DEFINE / INCLUDE
|
||||
//===============================================================================================================================================================================================================
|
||||
#include "avimod.h"
|
||||
|
||||
//===============================================================================================================================================================================================================
|
||||
// FUNCTIONS
|
||||
//===============================================================================================================================================================================================================
|
||||
|
||||
// Flips the specified image and crops it to the specified dimensions
|
||||
// If scaled == true, all values are scaled to the range [0.0, 1.0
|
||||
fp *chop_flip_image(char *image, int height, int width, int cropped, int scaled,
|
||||
int converted) {
|
||||
|
||||
// fixed dimensions for cropping or not cropping, square vertices starting
|
||||
// from initial point in top left corner going down and right
|
||||
int top;
|
||||
int bottom;
|
||||
int left;
|
||||
int right;
|
||||
if (cropped == 1) {
|
||||
top = 0;
|
||||
bottom = 0;
|
||||
left = 0;
|
||||
right = 0;
|
||||
} else {
|
||||
top = 0;
|
||||
bottom = height - 1;
|
||||
left = 0;
|
||||
right = width - 1;
|
||||
}
|
||||
|
||||
// dimensions of new cropped image
|
||||
int height_new = bottom - top + 1;
|
||||
int width_new = right - left + 1;
|
||||
|
||||
// counters
|
||||
int i, j;
|
||||
|
||||
// allocate memory for cropped/flipped frame
|
||||
fp *result = (fp *)malloc(height_new * width_new * sizeof(fp));
|
||||
|
||||
// crop/flip and scale frame
|
||||
fp temp;
|
||||
if (scaled) {
|
||||
fp scale = 1.0 / 255.0;
|
||||
for (i = 0; i < height_new; i++) { // rows
|
||||
for (j = 0; j < width_new; j++) { // colums
|
||||
temp =
|
||||
(fp)image[((height - 1 - (i + top)) * width) + (j + left)] * scale;
|
||||
if (temp < 0) {
|
||||
result[i * width_new + j] = temp + 256;
|
||||
} else {
|
||||
result[i * width_new + j] = temp;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (i = 0; i < height_new; i++) { // rows
|
||||
for (j = 0; j < width_new; j++) { // colums
|
||||
temp = (fp)image[((height - 1 - (i + top)) * width) + (j + left)];
|
||||
if (temp < 0) {
|
||||
result[i * width_new + j] = temp + 256;
|
||||
} else {
|
||||
result[i * width_new + j] = temp;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// convert storage method (from row-major to column-major)
|
||||
fp *result_converted = (fp *)malloc(height_new * width_new * sizeof(fp));
|
||||
if (converted == 1) {
|
||||
for (i = 0; i < width_new; i++) { // rows
|
||||
for (j = 0; j < height_new; j++) { // colums
|
||||
result_converted[i * height_new + j] = result[j * width_new + i];
|
||||
}
|
||||
}
|
||||
} else {
|
||||
result_converted = result;
|
||||
}
|
||||
free(result);
|
||||
|
||||
// return
|
||||
return result_converted;
|
||||
}
|
||||
|
||||
// Returns the specified frame from the specified video file
|
||||
// If cropped == true, the frame is cropped to pre-determined dimensions
|
||||
// (hardcoded to the boundaries of the blood vessel in the test video)
|
||||
// If scaled == true, all values are scaled to the range [0.0, 1.0]
|
||||
fp *get_frame(avi_t *cell_file, int frame_num, int cropped, int scaled,
|
||||
int converted) {
|
||||
|
||||
// variable
|
||||
int dummy;
|
||||
int width = AVI_video_width(cell_file);
|
||||
int height = AVI_video_height(cell_file);
|
||||
int status;
|
||||
|
||||
// There are 600 frames in this file (i.e. frame_num = 600 causes an error)
|
||||
AVI_set_video_position(cell_file, frame_num);
|
||||
|
||||
// Read in the frame from the AVI
|
||||
char *image_buf = (char *)malloc(width * height * sizeof(char));
|
||||
status = AVI_read_frame(cell_file, image_buf, &dummy);
|
||||
if (status == -1) {
|
||||
AVI_print_error((char *)"Error with AVI_read_frame");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
// The image is read in upside-down, so we need to flip it
|
||||
fp *image_chopped;
|
||||
image_chopped =
|
||||
chop_flip_image(image_buf, height, width, cropped, scaled, converted);
|
||||
|
||||
// free image buffer
|
||||
free(image_buf);
|
||||
|
||||
// return
|
||||
return image_chopped;
|
||||
}
|
||||
|
||||
// #ifdef __cplusplus
|
||||
// }
|
||||
// #endif
|
|
@ -0,0 +1,24 @@
|
|||
// Include guard: repeated inclusion must not re-process the declarations.
#ifndef AVIMOD_H
#define AVIMOD_H

#ifdef __cplusplus
extern "C" {
#endif

//===============================================================================================================================================================================================================
//	DEFINE / INCLUDE
//===============================================================================================================================================================================================================
// element type of the image buffers returned by the functions below
#define fp float

#include "avilib.h"

//===============================================================================================================================================================================================================
//	FUNCTION PROTOTYPES
//===============================================================================================================================================================================================================

// Flips (and optionally crops, scales and transposes) a raw frame; defined in
// avimod.c.
fp *chop_flip_image(char *image, int height, int width, int cropped, int scaled,
                    int converted);

// Reads frame frame_num from cell_file and passes it through
// chop_flip_image(); defined in avimod.c.
fp *get_frame(avi_t *cell_file, int frame_num, int cropped, int scaled,
              int converted);

#ifdef __cplusplus
}
#endif

#endif  // AVIMOD_H
|
|
@ -0,0 +1,396 @@
|
|||
//===============================================================================================================================================================================================================
|
||||
//===============================================================================================================================================================================================================
|
||||
// DEFINE / INCLUDE
|
||||
//===============================================================================================================================================================================================================
|
||||
//===============================================================================================================================================================================================================
|
||||
|
||||
/* floating-point type used throughout this file */
#define fp float

/* #define NUMBER_THREADS 512 */
/* NUMBER_THREADS is taken from the first of RD_WG_SIZE_0_0, RD_WG_SIZE_0 and
   RD_WG_SIZE that is defined; without any of them it is 256. */
#if defined(RD_WG_SIZE_0_0)
#  define NUMBER_THREADS RD_WG_SIZE_0_0
#elif defined(RD_WG_SIZE_0)
#  define NUMBER_THREADS RD_WG_SIZE_0
#elif defined(RD_WG_SIZE)
#  define NUMBER_THREADS RD_WG_SIZE
#else
#  define NUMBER_THREADS 256
#endif

/* point counts; ALL_POINTS equals ENDO_POINTS + EPI_POINTS (20 + 31) */
#define ENDO_POINTS 20
#define EPI_POINTS 31
#define ALL_POINTS 51
|
||||
|
||||
//===============================================================================================================================================================================================================
|
||||
//===============================================================================================================================================================================================================
|
||||
// PARAMS_COMMON_CHANGE STRUCT
|
||||
//===============================================================================================================================================================================================================
|
||||
//===============================================================================================================================================================================================================
|
||||
|
||||
typedef struct params_common_change {
|
||||
|
||||
//======================================================================================================================================================
|
||||
// FRAME
|
||||
//======================================================================================================================================================
|
||||
|
||||
fp *d_frame;
|
||||
int frame_no;
|
||||
|
||||
} params_common_change;
|
||||
|
||||
//===============================================================================================================================================================================================================
|
||||
//===============================================================================================================================================================================================================
|
||||
// PARAMS_COMMON STRUCTURE
|
||||
//===============================================================================================================================================================================================================
|
||||
//===============================================================================================================================================================================================================
|
||||
|
||||
typedef struct params_common {
|
||||
|
||||
//======================================================================================================================================================
|
||||
// HARDCODED INPUTS FROM MATLAB
|
||||
//======================================================================================================================================================
|
||||
|
||||
//====================================================================================================
|
||||
// CONSTANTS
|
||||
//====================================================================================================
|
||||
|
||||
int sSize;
|
||||
int tSize;
|
||||
int maxMove;
|
||||
fp alpha;
|
||||
|
||||
//====================================================================================================
|
||||
// FRAME
|
||||
//====================================================================================================
|
||||
|
||||
int no_frames;
|
||||
int frame_rows;
|
||||
int frame_cols;
|
||||
int frame_elem;
|
||||
int frame_mem;
|
||||
|
||||
//====================================================================================================
|
||||
// ENDO POINTS
|
||||
//====================================================================================================
|
||||
|
||||
int endoPoints;
|
||||
int endo_mem;
|
||||
|
||||
int *endoRow;
|
||||
int *endoCol;
|
||||
int *tEndoRowLoc;
|
||||
int *tEndoColLoc;
|
||||
|
||||
int *d_endoRow;
|
||||
int *d_endoCol;
|
||||
int *d_tEndoRowLoc;
|
||||
int *d_tEndoColLoc;
|
||||
|
||||
fp *d_endoT;
|
||||
|
||||
//====================================================================================================
|
||||
// EPI POINTS
|
||||
//====================================================================================================
|
||||
int epiPoints;
|
||||
int epi_mem;
|
||||
|
||||
int *epiRow;
|
||||
int *epiCol;
|
||||
int *tEpiRowLoc;
|
||||
int *tEpiColLoc;
|
||||
|
||||
int *d_epiRow;
|
||||
int *d_epiCol;
|
||||
int *d_tEpiRowLoc;
|
||||
int *d_tEpiColLoc;
|
||||
|
||||
fp *d_epiT;
|
||||
|
||||
//====================================================================================================
|
||||
// ALL POINTS
|
||||
//====================================================================================================
|
||||
|
||||
int allPoints;
|
||||
|
||||
//======================================================================================================================================================
|
||||
// RIGHT TEMPLATE FROM TEMPLATE ARRAY
|
||||
//======================================================================================================================================================
|
||||
|
||||
int in_rows;
|
||||
int in_cols;
|
||||
int in_elem;
|
||||
int in_mem;
|
||||
|
||||
//======================================================================================================================================================
|
||||
// AREA AROUND POINT FROM FRAME
|
||||
//======================================================================================================================================================
|
||||
|
||||
int in2_rows;
|
||||
int in2_cols;
|
||||
int in2_elem;
|
||||
int in2_mem;
|
||||
|
||||
//======================================================================================================================================================
|
||||
// CONVOLUTION
|
||||
//======================================================================================================================================================
|
||||
|
||||
int conv_rows;
|
||||
int conv_cols;
|
||||
int conv_elem;
|
||||
int conv_mem;
|
||||
int ioffset;
|
||||
int joffset;
|
||||
|
||||
//======================================================================================================================================================
|
||||
// CUMULATIVE SUM 1
|
||||
//======================================================================================================================================================
|
||||
|
||||
//====================================================================================================
|
||||
// PAD ARRAY, VERTICAL CUMULATIVE SUM
|
||||
//====================================================================================================
|
||||
|
||||
int in2_pad_add_rows;
|
||||
int in2_pad_add_cols;
|
||||
int in2_pad_cumv_rows;
|
||||
int in2_pad_cumv_cols;
|
||||
int in2_pad_cumv_elem;
|
||||
int in2_pad_cumv_mem;
|
||||
|
||||
//====================================================================================================
|
||||
// SELECTION
|
||||
//====================================================================================================
|
||||
|
||||
int in2_pad_cumv_sel_rows;
|
||||
int in2_pad_cumv_sel_cols;
|
||||
int in2_pad_cumv_sel_elem;
|
||||
int in2_pad_cumv_sel_mem;
|
||||
int in2_pad_cumv_sel_rowlow;
|
||||
int in2_pad_cumv_sel_rowhig;
|
||||
int in2_pad_cumv_sel_collow;
|
||||
int in2_pad_cumv_sel_colhig;
|
||||
|
||||
//====================================================================================================
|
||||
// SELECTION 2, SUBTRACTION, HORIZONTAL CUMULATIVE SUM
|
||||
//====================================================================================================
|
||||
|
||||
int in2_pad_cumv_sel2_rowlow;
|
||||
int in2_pad_cumv_sel2_rowhig;
|
||||
int in2_pad_cumv_sel2_collow;
|
||||
int in2_pad_cumv_sel2_colhig;
|
||||
int in2_sub_cumh_rows;
|
||||
int in2_sub_cumh_cols;
|
||||
int in2_sub_cumh_elem;
|
||||
int in2_sub_cumh_mem;
|
||||
|
||||
//====================================================================================================
|
||||
// SELECTION
|
||||
//====================================================================================================
|
||||
|
||||
int in2_sub_cumh_sel_rows;
|
||||
int in2_sub_cumh_sel_cols;
|
||||
int in2_sub_cumh_sel_elem;
|
||||
int in2_sub_cumh_sel_mem;
|
||||
int in2_sub_cumh_sel_rowlow;
|
||||
int in2_sub_cumh_sel_rowhig;
|
||||
int in2_sub_cumh_sel_collow;
|
||||
int in2_sub_cumh_sel_colhig;
|
||||
|
||||
//====================================================================================================
|
||||
// SELECTION 2, SUBTRACTION
|
||||
//====================================================================================================
|
||||
|
||||
int in2_sub_cumh_sel2_rowlow;
|
||||
int in2_sub_cumh_sel2_rowhig;
|
||||
int in2_sub_cumh_sel2_collow;
|
||||
int in2_sub_cumh_sel2_colhig;
|
||||
int in2_sub2_rows;
|
||||
int in2_sub2_cols;
|
||||
int in2_sub2_elem;
|
||||
int in2_sub2_mem;
|
||||
|
||||
//======================================================================================================================================================
|
||||
// CUMULATIVE SUM 2
|
||||
//======================================================================================================================================================
|
||||
|
||||
//====================================================================================================
|
||||
// MULTIPLICATION
|
||||
//====================================================================================================
|
||||
|
||||
int in2_sqr_rows;
|
||||
int in2_sqr_cols;
|
||||
int in2_sqr_elem;
|
||||
int in2_sqr_mem;
|
||||
|
||||
//====================================================================================================
|
||||
// SELECTION 2, SUBTRACTION
|
||||
//====================================================================================================
|
||||
|
||||
int in2_sqr_sub2_rows;
|
||||
int in2_sqr_sub2_cols;
|
||||
int in2_sqr_sub2_elem;
|
||||
int in2_sqr_sub2_mem;
|
||||
|
||||
//======================================================================================================================================================
|
||||
// FINAL
|
||||
//======================================================================================================================================================
|
||||
|
||||
int in_sqr_rows;
|
||||
int in_sqr_cols;
|
||||
int in_sqr_elem;
|
||||
int in_sqr_mem;
|
||||
|
||||
//======================================================================================================================================================
|
||||
// TEMPLATE MASK CREATE
|
||||
//======================================================================================================================================================
|
||||
|
||||
int tMask_rows;
|
||||
int tMask_cols;
|
||||
int tMask_elem;
|
||||
int tMask_mem;
|
||||
|
||||
//======================================================================================================================================================
|
||||
// POINT MASK INITIALIZE
|
||||
//======================================================================================================================================================
|
||||
|
||||
int mask_rows;
|
||||
int mask_cols;
|
||||
int mask_elem;
|
||||
int mask_mem;
|
||||
|
||||
//======================================================================================================================================================
|
||||
// MASK CONVOLUTION
|
||||
//======================================================================================================================================================
|
||||
|
||||
int mask_conv_rows;
|
||||
int mask_conv_cols;
|
||||
int mask_conv_elem;
|
||||
int mask_conv_mem;
|
||||
int mask_conv_ioffset;
|
||||
int mask_conv_joffset;
|
||||
|
||||
} params_common;
|
||||
|
||||
//===============================================================================================================================================================================================================
|
||||
//===============================================================================================================================================================================================================
|
||||
// PARAMS_UNIQUE STRUCTURE
|
||||
//===============================================================================================================================================================================================================
|
||||
//===============================================================================================================================================================================================================
|
||||
|
||||
typedef struct params_unique {
|
||||
|
||||
//======================================================================================================================================================
|
||||
// POINT NUMBER
|
||||
//======================================================================================================================================================
|
||||
|
||||
int *d_Row;
|
||||
int *d_Col;
|
||||
int *d_tRowLoc;
|
||||
int *d_tColLoc;
|
||||
fp *d_T;
|
||||
|
||||
//======================================================================================================================================================
|
||||
// POINT NUMBER
|
||||
//======================================================================================================================================================
|
||||
|
||||
int point_no;
|
||||
|
||||
//======================================================================================================================================================
|
||||
// RIGHT TEMPLATE FROM TEMPLATE ARRAY
|
||||
//======================================================================================================================================================
|
||||
|
||||
int in_pointer;
|
||||
|
||||
//======================================================================================================================================================
|
||||
// AREA AROUND POINT FROM FRAME
|
||||
//======================================================================================================================================================
|
||||
|
||||
fp *d_in2;
|
||||
|
||||
//======================================================================================================================================================
|
||||
// CONVOLUTION
|
||||
//======================================================================================================================================================
|
||||
|
||||
fp *d_conv;
|
||||
fp *d_in_mod;
|
||||
|
||||
//======================================================================================================================================================
|
||||
// CUMULATIVE SUM
|
||||
//======================================================================================================================================================
|
||||
|
||||
//====================================================================================================
|
||||
// PAD ARRAY, VERTICAL CUMULATIVE SUM
|
||||
//====================================================================================================
|
||||
|
||||
fp *d_in2_pad_cumv;
|
||||
|
||||
//====================================================================================================
|
||||
// SELECTION
|
||||
//====================================================================================================
|
||||
|
||||
fp *d_in2_pad_cumv_sel;
|
||||
|
||||
//====================================================================================================
|
||||
// SELECTION 2, SUBTRACTION, HORIZONTAL CUMULATIVE SUM
|
||||
//====================================================================================================
|
||||
|
||||
fp *d_in2_sub_cumh;
|
||||
|
||||
//====================================================================================================
|
||||
// SELECTION
|
||||
//====================================================================================================
|
||||
|
||||
fp *d_in2_sub_cumh_sel;
|
||||
|
||||
//====================================================================================================
|
||||
// SELECTION 2, SUBTRACTION
|
||||
//====================================================================================================
|
||||
|
||||
fp *d_in2_sub2;
|
||||
|
||||
//======================================================================================================================================================
|
||||
// CUMULATIVE SUM 2
|
||||
//======================================================================================================================================================
|
||||
|
||||
//====================================================================================================
|
||||
// MULTIPLICATION
|
||||
//====================================================================================================
|
||||
|
||||
fp *d_in2_sqr;
|
||||
|
||||
//====================================================================================================
|
||||
// SELECTION 2, SUBTRACTION
|
||||
//====================================================================================================
|
||||
|
||||
fp *d_in2_sqr_sub2;
|
||||
|
||||
//======================================================================================================================================================
|
||||
// FINAL
|
||||
//======================================================================================================================================================
|
||||
|
||||
fp *d_in_sqr;
|
||||
|
||||
//======================================================================================================================================================
|
||||
// TEMPLATE MASK
|
||||
//======================================================================================================================================================
|
||||
|
||||
fp *d_tMask;
|
||||
|
||||
//======================================================================================================================================================
|
||||
// POINT MASK INITIALIZE
|
||||
//======================================================================================================================================================
|
||||
|
||||
fp *d_mask;
|
||||
|
||||
//======================================================================================================================================================
|
||||
// MASK CONVOLUTION
|
||||
//======================================================================================================================================================
|
||||
|
||||
fp *d_mask_conv;
|
||||
|
||||
} params_unique;
|
||||
|
||||
//===============================================================================================================================================================================================================
|
||||
//===============================================================================================================================================================================================================
|
||||
// END OF STRUCTURE
|
||||
//===============================================================================================================================================================================================================
|
||||
//===============================================================================================================================================================================================================
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,795 @@
|
|||
//===============================================================================================================================================================================================================
|
||||
//===============================================================================================================================================================================================================
|
||||
// DEFINE / INCLUDE
|
||||
//===============================================================================================================================================================================================================
|
||||
//===============================================================================================================================================================================================================
|
||||
|
||||
//======================================================================================================================================================
|
||||
// LIBRARIES
|
||||
//======================================================================================================================================================
|
||||
|
||||
#include <math.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <avilib.h>
|
||||
#include <avimod.h>
|
||||
#include <cuda.h>
|
||||
|
||||
//======================================================================================================================================================
|
||||
// STRUCTURES, GLOBAL STRUCTURE VARIABLES
|
||||
//======================================================================================================================================================
|
||||
|
||||
#include "define.c"
|
||||
|
||||
// ---- Host-side copies of the parameter structures ----
params_common_change common_change;
params_common common;
// One entry per point. The count cannot be determined dynamically, so the
// array is sized with ALL_POINTS, chosen larger than usually needed.
params_unique unique[ALL_POINTS];

// ---- Matching copies in CUDA constant memory ----
__constant__ params_common_change d_common_change;
__constant__ params_common d_common;
__constant__ params_unique d_unique[ALL_POINTS];
|
||||
|
||||
//======================================================================================================================================================
|
||||
// KERNEL CODE
|
||||
//======================================================================================================================================================
|
||||
|
||||
#include "kernel.cu"
|
||||
|
||||
// WRITE DATA FUNCTION
|
||||
//===============================================================================================================================================================================================================200
|
||||
|
||||
/**
 * Print one tab-separated line of values for a single frame.
 *
 * The arrays are laid out point-major: the value of point p at frame f is
 * stored at values[f + p * frameNo].
 */
static void write_data_row(FILE *fid, const int *values, int points, int frame,
                           int frameNo) {
  for (int point = 0; point < points; point++) {
    fprintf(fid, "%d\t", values[frame + point * frameNo]);
  }
}

/**
 * Write the per-frame point values to a text file.
 *
 * File layout: a four-line header (total frames, frames processed, endo point
 * count, epi point count), then for every processed frame an "--endo--"
 * section and an "--epi--" section. Each section holds two tab-separated
 * lines: the first from input_a / input_2a, the second from input_b /
 * input_2b.
 * NOTE(review): the two lines are presumably row and column locations -
 * confirm against the caller.
 *
 * @param filename          path of the output file (created or truncated)
 * @param frameNo           total number of frames; also the per-point stride
 *                          used to index the input arrays
 * @param frames_processed  number of frames to write (frames 0..n-1)
 * @param endoPoints        number of endo points
 * @param input_a           first endo array, indexed [frame + point * frameNo]
 * @param input_b           second endo array, same indexing
 * @param epiPoints         number of epi points
 * @param input_2a          first epi array, same indexing
 * @param input_2b          second epi array, same indexing
 *
 * If the file cannot be opened, a message is printed and nothing is written.
 */
void write_data(char *filename, int frameNo, int frames_processed,
                int endoPoints, int *input_a, int *input_b, int epiPoints,
                int *input_2a, int *input_2b) {

  // Open the file for writing ("w+" truncates any existing content).
  FILE *fid = fopen(filename, "w+");
  if (fid == NULL) {
    printf("The file was not opened for writing\n");
    return;
  }

  // Header.
  fprintf(fid, "Total AVI Frames: %d\n", frameNo);
  fprintf(fid, "Frames Processed: %d\n", frames_processed);
  fprintf(fid, "endoPoints: %d\n", endoPoints);
  fprintf(fid, "epiPoints: %d", epiPoints);

  // One endo section and one epi section per processed frame.
  for (int frame = 0; frame < frames_processed; frame++) {
    fprintf(fid, "\n---Frame %d---", frame);

    fprintf(fid, "\n--endo--\n");
    write_data_row(fid, input_a, endoPoints, frame, frameNo);
    fprintf(fid, "\n");
    write_data_row(fid, input_b, endoPoints, frame, frameNo);

    fprintf(fid, "\n--epi--\n");
    write_data_row(fid, input_2a, epiPoints, frame, frameNo);
    fprintf(fid, "\n");
    write_data_row(fid, input_2b, epiPoints, frame, frameNo);
  }

  // Buffered write errors surface at close time, so do not ignore the result.
  if (fclose(fid) != 0) {
    printf("The file was not closed properly after writing\n");
  }
}
|
||||
|
||||
//===============================================================================================================================================================================================================
|
||||
//===============================================================================================================================================================================================================
|
||||
// MAIN FUNCTION
|
||||
//===============================================================================================================================================================================================================
|
||||
//===============================================================================================================================================================================================================
|
||||
int main(int argc, char *argv[]) {
|
||||
cudaSetDevice(0);
|
||||
printf("WG size of kernel = %d \n", NUMBER_THREADS);
|
||||
//======================================================================================================================================================
|
||||
// VARIABLES
|
||||
//======================================================================================================================================================
|
||||
|
||||
// CUDA kernel execution parameters
|
||||
dim3 threads;
|
||||
dim3 blocks;
|
||||
|
||||
// counter
|
||||
int i;
|
||||
int frames_processed;
|
||||
|
||||
// frames
|
||||
char *video_file_name;
|
||||
avi_t *frames;
|
||||
fp *frame;
|
||||
|
||||
//======================================================================================================================================================
|
||||
// FRAME
|
||||
//======================================================================================================================================================
|
||||
|
||||
if (argc != 3) {
|
||||
printf("ERROR: usage: heartwall <inputfile> <num of frames>\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// open movie file
|
||||
video_file_name = argv[1];
|
||||
frames = (avi_t *)AVI_open_input_file(video_file_name, 1); // added casting
|
||||
if (frames == NULL) {
|
||||
AVI_print_error((char *)"Error with AVI_open_input_file");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// common
|
||||
common.no_frames = AVI_video_frames(frames);
|
||||
common.frame_rows = AVI_video_height(frames);
|
||||
common.frame_cols = AVI_video_width(frames);
|
||||
common.frame_elem = common.frame_rows * common.frame_cols;
|
||||
common.frame_mem = sizeof(fp) * common.frame_elem;
|
||||
|
||||
// pointers
|
||||
cudaMalloc((void **)&common_change.d_frame, common.frame_mem);
|
||||
|
||||
//======================================================================================================================================================
|
||||
// CHECK INPUT ARGUMENTS
|
||||
//======================================================================================================================================================
|
||||
|
||||
frames_processed = atoi(argv[2]);
|
||||
if (frames_processed < 0 || frames_processed > common.no_frames) {
|
||||
printf("ERROR: %d is an incorrect number of frames specified, select in "
|
||||
"the range of 0-%d\n",
|
||||
frames_processed, common.no_frames);
|
||||
return 0;
|
||||
}
|
||||
|
||||
//======================================================================================================================================================
|
||||
// HARDCODED INPUTS FROM MATLAB
|
||||
//======================================================================================================================================================
|
||||
|
||||
//====================================================================================================
|
||||
// CONSTANTS
|
||||
//====================================================================================================
|
||||
|
||||
common.sSize = 40;
|
||||
common.tSize = 25;
|
||||
common.maxMove = 10;
|
||||
common.alpha = 0.87;
|
||||
|
||||
//====================================================================================================
|
||||
// ENDO POINTS
|
||||
//====================================================================================================
|
||||
|
||||
common.endoPoints = ENDO_POINTS;
|
||||
common.endo_mem = sizeof(int) * common.endoPoints;
|
||||
|
||||
common.endoRow = (int *)malloc(common.endo_mem);
|
||||
common.endoRow[0] = 369;
|
||||
common.endoRow[1] = 400;
|
||||
common.endoRow[2] = 429;
|
||||
common.endoRow[3] = 452;
|
||||
common.endoRow[4] = 476;
|
||||
common.endoRow[5] = 486;
|
||||
common.endoRow[6] = 479;
|
||||
common.endoRow[7] = 458;
|
||||
common.endoRow[8] = 433;
|
||||
common.endoRow[9] = 404;
|
||||
common.endoRow[10] = 374;
|
||||
common.endoRow[11] = 346;
|
||||
common.endoRow[12] = 318;
|
||||
common.endoRow[13] = 294;
|
||||
common.endoRow[14] = 277;
|
||||
common.endoRow[15] = 269;
|
||||
common.endoRow[16] = 275;
|
||||
common.endoRow[17] = 287;
|
||||
common.endoRow[18] = 311;
|
||||
common.endoRow[19] = 339;
|
||||
cudaMalloc((void **)&common.d_endoRow, common.endo_mem);
|
||||
cudaMemcpy(common.d_endoRow, common.endoRow, common.endo_mem,
|
||||
cudaMemcpyHostToDevice);
|
||||
|
||||
common.endoCol = (int *)malloc(common.endo_mem);
|
||||
common.endoCol[0] = 408;
|
||||
common.endoCol[1] = 406;
|
||||
common.endoCol[2] = 397;
|
||||
common.endoCol[3] = 383;
|
||||
common.endoCol[4] = 354;
|
||||
common.endoCol[5] = 322;
|
||||
common.endoCol[6] = 294;
|
||||
common.endoCol[7] = 270;
|
||||
common.endoCol[8] = 250;
|
||||
common.endoCol[9] = 237;
|
||||
common.endoCol[10] = 235;
|
||||
common.endoCol[11] = 241;
|
||||
common.endoCol[12] = 254;
|
||||
common.endoCol[13] = 273;
|
||||
common.endoCol[14] = 300;
|
||||
common.endoCol[15] = 328;
|
||||
common.endoCol[16] = 356;
|
||||
common.endoCol[17] = 383;
|
||||
common.endoCol[18] = 401;
|
||||
common.endoCol[19] = 411;
|
||||
cudaMalloc((void **)&common.d_endoCol, common.endo_mem);
|
||||
cudaMemcpy(common.d_endoCol, common.endoCol, common.endo_mem,
|
||||
cudaMemcpyHostToDevice);
|
||||
|
||||
common.tEndoRowLoc = (int *)malloc(common.endo_mem * common.no_frames);
|
||||
cudaMalloc((void **)&common.d_tEndoRowLoc,
|
||||
common.endo_mem * common.no_frames);
|
||||
|
||||
common.tEndoColLoc = (int *)malloc(common.endo_mem * common.no_frames);
|
||||
cudaMalloc((void **)&common.d_tEndoColLoc,
|
||||
common.endo_mem * common.no_frames);
|
||||
|
||||
//====================================================================================================
|
||||
// EPI POINTS
|
||||
//====================================================================================================
|
||||
|
||||
common.epiPoints = EPI_POINTS;
|
||||
common.epi_mem = sizeof(int) * common.epiPoints;
|
||||
|
||||
common.epiRow = (int *)malloc(common.epi_mem);
|
||||
common.epiRow[0] = 390;
|
||||
common.epiRow[1] = 419;
|
||||
common.epiRow[2] = 448;
|
||||
common.epiRow[3] = 474;
|
||||
common.epiRow[4] = 501;
|
||||
common.epiRow[5] = 519;
|
||||
common.epiRow[6] = 535;
|
||||
common.epiRow[7] = 542;
|
||||
common.epiRow[8] = 543;
|
||||
common.epiRow[9] = 538;
|
||||
common.epiRow[10] = 528;
|
||||
common.epiRow[11] = 511;
|
||||
common.epiRow[12] = 491;
|
||||
common.epiRow[13] = 466;
|
||||
common.epiRow[14] = 438;
|
||||
common.epiRow[15] = 406;
|
||||
common.epiRow[16] = 376;
|
||||
common.epiRow[17] = 347;
|
||||
common.epiRow[18] = 318;
|
||||
common.epiRow[19] = 291;
|
||||
common.epiRow[20] = 275;
|
||||
common.epiRow[21] = 259;
|
||||
common.epiRow[22] = 256;
|
||||
common.epiRow[23] = 252;
|
||||
common.epiRow[24] = 252;
|
||||
common.epiRow[25] = 257;
|
||||
common.epiRow[26] = 266;
|
||||
common.epiRow[27] = 283;
|
||||
common.epiRow[28] = 305;
|
||||
common.epiRow[29] = 331;
|
||||
common.epiRow[30] = 360;
|
||||
cudaMalloc((void **)&common.d_epiRow, common.epi_mem);
|
||||
cudaMemcpy(common.d_epiRow, common.epiRow, common.epi_mem,
|
||||
cudaMemcpyHostToDevice);
|
||||
|
||||
common.epiCol = (int *)malloc(common.epi_mem);
|
||||
common.epiCol[0] = 457;
|
||||
common.epiCol[1] = 454;
|
||||
common.epiCol[2] = 446;
|
||||
common.epiCol[3] = 431;
|
||||
common.epiCol[4] = 411;
|
||||
common.epiCol[5] = 388;
|
||||
common.epiCol[6] = 361;
|
||||
common.epiCol[7] = 331;
|
||||
common.epiCol[8] = 301;
|
||||
common.epiCol[9] = 273;
|
||||
common.epiCol[10] = 243;
|
||||
common.epiCol[11] = 218;
|
||||
common.epiCol[12] = 196;
|
||||
common.epiCol[13] = 178;
|
||||
common.epiCol[14] = 166;
|
||||
common.epiCol[15] = 157;
|
||||
common.epiCol[16] = 155;
|
||||
common.epiCol[17] = 165;
|
||||
common.epiCol[18] = 177;
|
||||
common.epiCol[19] = 197;
|
||||
common.epiCol[20] = 218;
|
||||
common.epiCol[21] = 248;
|
||||
common.epiCol[22] = 276;
|
||||
common.epiCol[23] = 304;
|
||||
common.epiCol[24] = 333;
|
||||
common.epiCol[25] = 361;
|
||||
common.epiCol[26] = 391;
|
||||
common.epiCol[27] = 415;
|
||||
common.epiCol[28] = 434;
|
||||
common.epiCol[29] = 448;
|
||||
common.epiCol[30] = 455;
|
||||
cudaMalloc((void **)&common.d_epiCol, common.epi_mem);
|
||||
cudaMemcpy(common.d_epiCol, common.epiCol, common.epi_mem,
|
||||
cudaMemcpyHostToDevice);
|
||||
|
||||
common.tEpiRowLoc = (int *)malloc(common.epi_mem * common.no_frames);
|
||||
cudaMalloc((void **)&common.d_tEpiRowLoc, common.epi_mem * common.no_frames);
|
||||
|
||||
common.tEpiColLoc = (int *)malloc(common.epi_mem * common.no_frames);
|
||||
cudaMalloc((void **)&common.d_tEpiColLoc, common.epi_mem * common.no_frames);
|
||||
|
||||
//====================================================================================================
|
||||
// ALL POINTS
|
||||
//====================================================================================================
|
||||
|
||||
common.allPoints = ALL_POINTS;
|
||||
|
||||
//======================================================================================================================================================
|
||||
// TEMPLATE SIZES
|
||||
//======================================================================================================================================================
|
||||
|
||||
// common
|
||||
common.in_rows = common.tSize + 1 + common.tSize;
|
||||
common.in_cols = common.in_rows;
|
||||
common.in_elem = common.in_rows * common.in_cols;
|
||||
common.in_mem = sizeof(fp) * common.in_elem;
|
||||
|
||||
//======================================================================================================================================================
|
||||
// CREATE ARRAY OF TEMPLATES FOR ALL POINTS
|
||||
//======================================================================================================================================================
|
||||
|
||||
// common
|
||||
cudaMalloc((void **)&common.d_endoT, common.in_mem * common.endoPoints);
|
||||
cudaMalloc((void **)&common.d_epiT, common.in_mem * common.epiPoints);
|
||||
|
||||
//======================================================================================================================================================
|
||||
// SPECIFIC TO ENDO OR EPI TO BE SET HERE
|
||||
//======================================================================================================================================================
|
||||
|
||||
for (i = 0; i < common.endoPoints; i++) {
|
||||
unique[i].point_no = i;
|
||||
unique[i].d_Row = common.d_endoRow;
|
||||
unique[i].d_Col = common.d_endoCol;
|
||||
unique[i].d_tRowLoc = common.d_tEndoRowLoc;
|
||||
unique[i].d_tColLoc = common.d_tEndoColLoc;
|
||||
unique[i].d_T = common.d_endoT;
|
||||
}
|
||||
for (i = common.endoPoints; i < common.allPoints; i++) {
|
||||
unique[i].point_no = i - common.endoPoints;
|
||||
unique[i].d_Row = common.d_epiRow;
|
||||
unique[i].d_Col = common.d_epiCol;
|
||||
unique[i].d_tRowLoc = common.d_tEpiRowLoc;
|
||||
unique[i].d_tColLoc = common.d_tEpiColLoc;
|
||||
unique[i].d_T = common.d_epiT;
|
||||
}
|
||||
|
||||
//======================================================================================================================================================
|
||||
// RIGHT TEMPLATE FROM TEMPLATE ARRAY
|
||||
//======================================================================================================================================================
|
||||
|
||||
// pointers
|
||||
for (i = 0; i < common.allPoints; i++) {
|
||||
unique[i].in_pointer = unique[i].point_no * common.in_elem;
|
||||
}
|
||||
|
||||
//======================================================================================================================================================
|
||||
// AREA AROUND POINT FROM FRAME
|
||||
//======================================================================================================================================================
|
||||
|
||||
// common
|
||||
common.in2_rows = 2 * common.sSize + 1;
|
||||
common.in2_cols = 2 * common.sSize + 1;
|
||||
common.in2_elem = common.in2_rows * common.in2_cols;
|
||||
common.in2_mem = sizeof(float) * common.in2_elem;
|
||||
|
||||
// pointers
|
||||
for (i = 0; i < common.allPoints; i++) {
|
||||
cudaMalloc((void **)&unique[i].d_in2, common.in2_mem);
|
||||
}
|
||||
|
||||
//======================================================================================================================================================
|
||||
// CONVOLUTION
|
||||
//======================================================================================================================================================
|
||||
|
||||
// common
|
||||
common.conv_rows =
|
||||
common.in_rows + common.in2_rows - 1; // number of rows in I
|
||||
common.conv_cols =
|
||||
common.in_cols + common.in2_cols - 1; // number of columns in I
|
||||
common.conv_elem = common.conv_rows * common.conv_cols; // number of elements
|
||||
common.conv_mem = sizeof(float) * common.conv_elem;
|
||||
common.ioffset = 0;
|
||||
common.joffset = 0;
|
||||
|
||||
// pointers
|
||||
for (i = 0; i < common.allPoints; i++) {
|
||||
cudaMalloc((void **)&unique[i].d_conv, common.conv_mem);
|
||||
}
|
||||
|
||||
//======================================================================================================================================================
|
||||
// CUMULATIVE SUM
|
||||
//======================================================================================================================================================
|
||||
|
||||
//====================================================================================================
|
||||
// PADDING OF ARRAY, VERTICAL CUMULATIVE SUM
|
||||
//====================================================================================================
|
||||
|
||||
// common
|
||||
common.in2_pad_add_rows = common.in_rows;
|
||||
common.in2_pad_add_cols = common.in_cols;
|
||||
|
||||
common.in2_pad_cumv_rows = common.in2_rows + 2 * common.in2_pad_add_rows;
|
||||
common.in2_pad_cumv_cols = common.in2_cols + 2 * common.in2_pad_add_cols;
|
||||
common.in2_pad_cumv_elem =
|
||||
common.in2_pad_cumv_rows * common.in2_pad_cumv_cols;
|
||||
common.in2_pad_cumv_mem = sizeof(float) * common.in2_pad_cumv_elem;
|
||||
|
||||
// pointers
|
||||
for (i = 0; i < common.allPoints; i++) {
|
||||
cudaMalloc((void **)&unique[i].d_in2_pad_cumv, common.in2_pad_cumv_mem);
|
||||
}
|
||||
|
||||
//====================================================================================================
|
||||
// SELECTION
|
||||
//====================================================================================================
|
||||
|
||||
// common
|
||||
common.in2_pad_cumv_sel_rowlow = 1 + common.in_rows; // (1 to n+1)
|
||||
common.in2_pad_cumv_sel_rowhig = common.in2_pad_cumv_rows - 1;
|
||||
common.in2_pad_cumv_sel_collow = 1;
|
||||
common.in2_pad_cumv_sel_colhig = common.in2_pad_cumv_cols;
|
||||
common.in2_pad_cumv_sel_rows =
|
||||
common.in2_pad_cumv_sel_rowhig - common.in2_pad_cumv_sel_rowlow + 1;
|
||||
common.in2_pad_cumv_sel_cols =
|
||||
common.in2_pad_cumv_sel_colhig - common.in2_pad_cumv_sel_collow + 1;
|
||||
common.in2_pad_cumv_sel_elem =
|
||||
common.in2_pad_cumv_sel_rows * common.in2_pad_cumv_sel_cols;
|
||||
common.in2_pad_cumv_sel_mem = sizeof(float) * common.in2_pad_cumv_sel_elem;
|
||||
|
||||
// pointers
|
||||
for (i = 0; i < common.allPoints; i++) {
|
||||
cudaMalloc((void **)&unique[i].d_in2_pad_cumv_sel,
|
||||
common.in2_pad_cumv_sel_mem);
|
||||
}
|
||||
|
||||
//====================================================================================================
|
||||
// SELECTION 2, SUBTRACTION, HORIZONTAL CUMULATIVE SUM
|
||||
//====================================================================================================
|
||||
|
||||
// common
|
||||
common.in2_pad_cumv_sel2_rowlow = 1;
|
||||
common.in2_pad_cumv_sel2_rowhig =
|
||||
common.in2_pad_cumv_rows - common.in_rows - 1;
|
||||
common.in2_pad_cumv_sel2_collow = 1;
|
||||
common.in2_pad_cumv_sel2_colhig = common.in2_pad_cumv_cols;
|
||||
common.in2_sub_cumh_rows =
|
||||
common.in2_pad_cumv_sel2_rowhig - common.in2_pad_cumv_sel2_rowlow + 1;
|
||||
common.in2_sub_cumh_cols =
|
||||
common.in2_pad_cumv_sel2_colhig - common.in2_pad_cumv_sel2_collow + 1;
|
||||
common.in2_sub_cumh_elem =
|
||||
common.in2_sub_cumh_rows * common.in2_sub_cumh_cols;
|
||||
common.in2_sub_cumh_mem = sizeof(float) * common.in2_sub_cumh_elem;
|
||||
|
||||
// pointers
|
||||
for (i = 0; i < common.allPoints; i++) {
|
||||
cudaMalloc((void **)&unique[i].d_in2_sub_cumh, common.in2_sub_cumh_mem);
|
||||
}
|
||||
|
||||
//====================================================================================================
|
||||
// SELECTION
|
||||
//====================================================================================================
|
||||
|
||||
// common
|
||||
common.in2_sub_cumh_sel_rowlow = 1;
|
||||
common.in2_sub_cumh_sel_rowhig = common.in2_sub_cumh_rows;
|
||||
common.in2_sub_cumh_sel_collow = 1 + common.in_cols;
|
||||
common.in2_sub_cumh_sel_colhig = common.in2_sub_cumh_cols - 1;
|
||||
common.in2_sub_cumh_sel_rows =
|
||||
common.in2_sub_cumh_sel_rowhig - common.in2_sub_cumh_sel_rowlow + 1;
|
||||
common.in2_sub_cumh_sel_cols =
|
||||
common.in2_sub_cumh_sel_colhig - common.in2_sub_cumh_sel_collow + 1;
|
||||
common.in2_sub_cumh_sel_elem =
|
||||
common.in2_sub_cumh_sel_rows * common.in2_sub_cumh_sel_cols;
|
||||
common.in2_sub_cumh_sel_mem = sizeof(float) * common.in2_sub_cumh_sel_elem;
|
||||
|
||||
// pointers
|
||||
for (i = 0; i < common.allPoints; i++) {
|
||||
cudaMalloc((void **)&unique[i].d_in2_sub_cumh_sel,
|
||||
common.in2_sub_cumh_sel_mem);
|
||||
}
|
||||
|
||||
//====================================================================================================
|
||||
// SELECTION 2, SUBTRACTION
|
||||
//====================================================================================================
|
||||
|
||||
// common
|
||||
common.in2_sub_cumh_sel2_rowlow = 1;
|
||||
common.in2_sub_cumh_sel2_rowhig = common.in2_sub_cumh_rows;
|
||||
common.in2_sub_cumh_sel2_collow = 1;
|
||||
common.in2_sub_cumh_sel2_colhig =
|
||||
common.in2_sub_cumh_cols - common.in_cols - 1;
|
||||
common.in2_sub2_rows =
|
||||
common.in2_sub_cumh_sel2_rowhig - common.in2_sub_cumh_sel2_rowlow + 1;
|
||||
common.in2_sub2_cols =
|
||||
common.in2_sub_cumh_sel2_colhig - common.in2_sub_cumh_sel2_collow + 1;
|
||||
common.in2_sub2_elem = common.in2_sub2_rows * common.in2_sub2_cols;
|
||||
common.in2_sub2_mem = sizeof(float) * common.in2_sub2_elem;
|
||||
|
||||
// pointers
|
||||
for (i = 0; i < common.allPoints; i++) {
|
||||
cudaMalloc((void **)&unique[i].d_in2_sub2, common.in2_sub2_mem);
|
||||
}
|
||||
|
||||
//======================================================================================================================================================
|
||||
// CUMULATIVE SUM 2
|
||||
//======================================================================================================================================================
|
||||
|
||||
//====================================================================================================
|
||||
// MULTIPLICATION
|
||||
//====================================================================================================
|
||||
|
||||
// common
|
||||
common.in2_sqr_rows = common.in2_rows;
|
||||
common.in2_sqr_cols = common.in2_cols;
|
||||
common.in2_sqr_elem = common.in2_elem;
|
||||
common.in2_sqr_mem = common.in2_mem;
|
||||
|
||||
// pointers
|
||||
for (i = 0; i < common.allPoints; i++) {
|
||||
cudaMalloc((void **)&unique[i].d_in2_sqr, common.in2_sqr_mem);
|
||||
}
|
||||
|
||||
//====================================================================================================
|
||||
// SELECTION 2, SUBTRACTION
|
||||
//====================================================================================================
|
||||
|
||||
// common
|
||||
common.in2_sqr_sub2_rows = common.in2_sub2_rows;
|
||||
common.in2_sqr_sub2_cols = common.in2_sub2_cols;
|
||||
common.in2_sqr_sub2_elem = common.in2_sub2_elem;
|
||||
common.in2_sqr_sub2_mem = common.in2_sub2_mem;
|
||||
|
||||
// pointers
|
||||
for (i = 0; i < common.allPoints; i++) {
|
||||
cudaMalloc((void **)&unique[i].d_in2_sqr_sub2, common.in2_sqr_sub2_mem);
|
||||
}
|
||||
|
||||
//======================================================================================================================================================
|
||||
// FINAL
|
||||
//======================================================================================================================================================
|
||||
|
||||
// common
|
||||
common.in_sqr_rows = common.in_rows;
|
||||
common.in_sqr_cols = common.in_cols;
|
||||
common.in_sqr_elem = common.in_elem;
|
||||
common.in_sqr_mem = common.in_mem;
|
||||
|
||||
// pointers
|
||||
for (i = 0; i < common.allPoints; i++) {
|
||||
cudaMalloc((void **)&unique[i].d_in_sqr, common.in_sqr_mem);
|
||||
}
|
||||
|
||||
//======================================================================================================================================================
|
||||
// TEMPLATE MASK CREATE
|
||||
//======================================================================================================================================================
|
||||
|
||||
// common
|
||||
common.tMask_rows = common.in_rows + (common.sSize + 1 + common.sSize) - 1;
|
||||
common.tMask_cols = common.tMask_rows;
|
||||
common.tMask_elem = common.tMask_rows * common.tMask_cols;
|
||||
common.tMask_mem = sizeof(float) * common.tMask_elem;
|
||||
|
||||
// pointers
|
||||
for (i = 0; i < common.allPoints; i++) {
|
||||
cudaMalloc((void **)&unique[i].d_tMask, common.tMask_mem);
|
||||
}
|
||||
|
||||
//======================================================================================================================================================
|
||||
// POINT MASK INITIALIZE
|
||||
//======================================================================================================================================================
|
||||
|
||||
// common
|
||||
common.mask_rows = common.maxMove;
|
||||
common.mask_cols = common.mask_rows;
|
||||
common.mask_elem = common.mask_rows * common.mask_cols;
|
||||
common.mask_mem = sizeof(float) * common.mask_elem;
|
||||
|
||||
//======================================================================================================================================================
|
||||
// MASK CONVOLUTION
|
||||
//======================================================================================================================================================
|
||||
|
||||
// common
|
||||
common.mask_conv_rows = common.tMask_rows; // number of rows in I
|
||||
common.mask_conv_cols = common.tMask_cols; // number of columns in I
|
||||
common.mask_conv_elem =
|
||||
common.mask_conv_rows * common.mask_conv_cols; // number of elements
|
||||
common.mask_conv_mem = sizeof(float) * common.mask_conv_elem;
|
||||
common.mask_conv_ioffset = (common.mask_rows - 1) / 2;
|
||||
if ((common.mask_rows - 1) % 2 > 0.5) {
|
||||
common.mask_conv_ioffset = common.mask_conv_ioffset + 1;
|
||||
}
|
||||
common.mask_conv_joffset = (common.mask_cols - 1) / 2;
|
||||
if ((common.mask_cols - 1) % 2 > 0.5) {
|
||||
common.mask_conv_joffset = common.mask_conv_joffset + 1;
|
||||
}
|
||||
|
||||
// pointers
|
||||
for (i = 0; i < common.allPoints; i++) {
|
||||
cudaMalloc((void **)&unique[i].d_mask_conv, common.mask_conv_mem);
|
||||
}
|
||||
|
||||
//======================================================================================================================================================
|
||||
// KERNEL
|
||||
//======================================================================================================================================================
|
||||
|
||||
//====================================================================================================
|
||||
// THREAD BLOCK
|
||||
//====================================================================================================
|
||||
|
||||
// All kernels operations within kernel use same max size of threads. Size of
|
||||
// block size is set to the size appropriate for max size operation (on padded
|
||||
// matrix). Other use subsets of that.
|
||||
threads.x = NUMBER_THREADS; // define the number of threads in the block
|
||||
threads.y = 1;
|
||||
blocks.x = common.allPoints; // define the number of blocks in the grid
|
||||
blocks.y = 1;
|
||||
|
||||
//====================================================================================================
|
||||
// COPY ARGUMENTS
|
||||
//====================================================================================================
|
||||
|
||||
cudaMemcpyToSymbol(d_common, &common, sizeof(params_common));
|
||||
cudaMemcpyToSymbol(d_unique, &unique, sizeof(params_unique) * ALL_POINTS);
|
||||
|
||||
//====================================================================================================
|
||||
// PRINT FRAME PROGRESS START
|
||||
//====================================================================================================
|
||||
|
||||
printf("frame progress: ");
|
||||
fflush(NULL);
|
||||
|
||||
//====================================================================================================
|
||||
// LAUNCH
|
||||
//====================================================================================================
|
||||
|
||||
for (common_change.frame_no = 0; common_change.frame_no < frames_processed;
|
||||
common_change.frame_no++) {
|
||||
printf("get frame\n");
|
||||
// Extract a cropped version of the first frame from the video file
|
||||
frame = get_frame(
|
||||
frames, // pointer to video file
|
||||
common_change.frame_no, // number of frame that needs to be returned
|
||||
0, // cropped?
|
||||
0, // scaled?
|
||||
1); // converted
|
||||
printf("memcpy\n");
|
||||
// copy frame to GPU memory
|
||||
cudaMemcpy(common_change.d_frame, frame, common.frame_mem,
|
||||
cudaMemcpyHostToDevice);
|
||||
printf("toSymbol\n");
|
||||
cudaMemcpyToSymbol(d_common_change, &common_change,
|
||||
sizeof(params_common_change));
|
||||
|
||||
// launch GPU kernel
|
||||
printf("launch\n");
|
||||
kernel<<<1, 32>>>();
|
||||
cudaDeviceSynchronize();
|
||||
printf("return\n");
|
||||
// free frame after each loop iteration, since AVI library allocates memory
|
||||
// for every frame fetched
|
||||
printf("free\n");
|
||||
free(frame);
|
||||
|
||||
// print frame progress
|
||||
printf("%d ", common_change.frame_no);
|
||||
fflush(NULL);
|
||||
}
|
||||
|
||||
//====================================================================================================
|
||||
// PRINT FRAME PROGRESS END
|
||||
//====================================================================================================
|
||||
|
||||
printf("\n");
|
||||
fflush(NULL);
|
||||
|
||||
//====================================================================================================
|
||||
// OUTPUT
|
||||
//====================================================================================================
|
||||
|
||||
cudaMemcpy(common.tEndoRowLoc, common.d_tEndoRowLoc,
|
||||
common.endo_mem * common.no_frames, cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(common.tEndoColLoc, common.d_tEndoColLoc,
|
||||
common.endo_mem * common.no_frames, cudaMemcpyDeviceToHost);
|
||||
|
||||
cudaMemcpy(common.tEpiRowLoc, common.d_tEpiRowLoc,
|
||||
common.epi_mem * common.no_frames, cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(common.tEpiColLoc, common.d_tEpiColLoc,
|
||||
common.epi_mem * common.no_frames, cudaMemcpyDeviceToHost);
|
||||
|
||||
#ifdef OUTPUT
|
||||
|
||||
//==================================================50
|
||||
// DUMP DATA TO FILE
|
||||
//==================================================50
|
||||
write_data("result.txt", common.no_frames, frames_processed,
|
||||
common.endoPoints, common.tEndoRowLoc, common.tEndoColLoc,
|
||||
common.epiPoints, common.tEpiRowLoc, common.tEpiColLoc);
|
||||
|
||||
//==================================================50
|
||||
// End
|
||||
//==================================================50
|
||||
|
||||
#endif
|
||||
|
||||
//======================================================================================================================================================
|
||||
// DEALLOCATION
|
||||
//======================================================================================================================================================
|
||||
|
||||
//====================================================================================================
|
||||
// COMMON
|
||||
//====================================================================================================
|
||||
|
||||
// frame
|
||||
cudaFree(common_change.d_frame);
|
||||
|
||||
// endo points
|
||||
free(common.endoRow);
|
||||
free(common.endoCol);
|
||||
free(common.tEndoRowLoc);
|
||||
free(common.tEndoColLoc);
|
||||
|
||||
cudaFree(common.d_endoRow);
|
||||
cudaFree(common.d_endoCol);
|
||||
cudaFree(common.d_tEndoRowLoc);
|
||||
cudaFree(common.d_tEndoColLoc);
|
||||
|
||||
cudaFree(common.d_endoT);
|
||||
|
||||
// epi points
|
||||
free(common.epiRow);
|
||||
free(common.epiCol);
|
||||
free(common.tEpiRowLoc);
|
||||
free(common.tEpiColLoc);
|
||||
|
||||
cudaFree(common.d_epiRow);
|
||||
cudaFree(common.d_epiCol);
|
||||
cudaFree(common.d_tEpiRowLoc);
|
||||
cudaFree(common.d_tEpiColLoc);
|
||||
|
||||
cudaFree(common.d_epiT);
|
||||
|
||||
//====================================================================================================
|
||||
// POINTERS
|
||||
//====================================================================================================
|
||||
|
||||
for (i = 0; i < common.allPoints; i++) {
|
||||
cudaFree(unique[i].d_in2);
|
||||
|
||||
cudaFree(unique[i].d_conv);
|
||||
cudaFree(unique[i].d_in2_pad_cumv);
|
||||
cudaFree(unique[i].d_in2_pad_cumv_sel);
|
||||
cudaFree(unique[i].d_in2_sub_cumh);
|
||||
cudaFree(unique[i].d_in2_sub_cumh_sel);
|
||||
cudaFree(unique[i].d_in2_sub2);
|
||||
cudaFree(unique[i].d_in2_sqr);
|
||||
cudaFree(unique[i].d_in2_sqr_sub2);
|
||||
cudaFree(unique[i].d_in_sqr);
|
||||
|
||||
cudaFree(unique[i].d_tMask);
|
||||
cudaFree(unique[i].d_mask_conv);
|
||||
}
|
||||
}
|
||||
|
||||
//===============================================================================================================================================================================================================
|
||||
//===============================================================================================================================================================================================================
|
||||
// MAIN FUNCTION
|
||||
//===============================================================================================================================================================================================================
|
||||
//===============================================================================================================================================================================================================
|
|
@ -0,0 +1,17 @@
|
|||
#!/bin/bash
# Build the heartwall example through the CUDA-to-x86 pipeline and run it.
#
# Pipeline: AVI helper objects -> clang (device + host bitcode) ->
# kernelTranslator / hostTranslator -> llc -> g++ link -> run.
#
# Optional overrides (the defaults reproduce the original hard-coded setup):
#   COX_ROOT   checkout that contains build/ and runtime/
#   CUDA_PATH  CUDA toolkit root
#   $1         input AVI file
#   $2         number of frames to process

COX_ROOT="${COX_ROOT:-/home/robinhan/repo/open_source_template}"
CUDA_PATH="${CUDA_PATH:-/usr/local/cuda-10.1}"
INPUT_AVI="${1:-${COX_ROOT}/runtime/examples/rodinia-data/heartwall/test.avi}"
FRAMES="${2:-20}"

# Build the AVI helper objects. `make -C` replaces `cd AVI; make; cd ..;`:
# with the old form a failed `cd` made `make` run in the wrong directory and
# the trailing `cd ..` then left the script one level above where it started.
make -C AVI || exit 1

# Only the bitcode side products of -save-temps are consumed below, so a
# non-zero exit here is not fatal by itself; missing bitcode is caught by the
# translator steps that follow.
clang++ -DOUTPUT main.cu -I./AVI --cuda-path="${CUDA_PATH}" --cuda-gpu-arch=sm_61 -L"${CUDA_PATH}/lib64" -lcudart_static -ldl -lrt -pthread -save-temps -v

# From here on each step consumes the previous step's output: stop at the
# first failure instead of linking or running stale artifacts.
set -e

"${COX_ROOT}/build/compilation/kernelTranslator" main-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
"${COX_ROOT}/build/compilation/hostTranslator" main-host-x86_64-unknown-linux-gnu.bc host.bc

llc --relocation-model=pic --filetype=obj kernel.bc
llc --relocation-model=pic --filetype=obj host.bc

g++ -Wall -L"${COX_ROOT}/build/runtime" -L"${COX_ROOT}/build/runtime/threadPool" -o heartwall -fPIC -no-pie host.o kernel.o ./AVI/avilib.o ./AVI/avimod.o -lc -lx86Runtime -lthreadPool -lpthread

./heartwall "${INPUT_AVI}" "${FRAMES}"
|
|
@ -0,0 +1,5 @@
|
|||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Set Device
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Selects CUDA device 0 through cudaSetDevice. The status code returned by
// the call is deliberately not inspected, matching the existing behavior.
void setdevice(void) {
  const int device_ordinal = 0;
  (void)cudaSetDevice(device_ordinal);
}
|
|
@ -0,0 +1,719 @@
|
|||
; ModuleID = 'hotspot-cuda-nvptx64-nvidia-cuda-sm_61.bc'
|
||||
source_filename = "hotspot.cu"
|
||||
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
|
||||
target triple = "nvptx64-nvidia-cuda"
|
||||
|
||||
%struct.__cuda_builtin_blockIdx_t = type { i8 }
|
||||
%struct.__cuda_builtin_threadIdx_t = type { i8 }
|
||||
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
|
||||
|
||||
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
|
||||
|
||||
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv = comdat any
|
||||
|
||||
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
|
||||
|
||||
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv = comdat any
|
||||
|
||||
; Function-local statics of calculate_temp (the mangled names decode to
; calculate_temp(int, float*, float*, float*, int, int, int, int, float x6)::
; temp_on_cuda / power_on_cuda / temp_t). Each is a 16 x 16 float array with
; internal linkage, placed in addrspace(3) and left undef-initialized.
; NOTE(review): addrspace(3) is presumably the NVPTX shared-memory space —
; confirm against the NVPTX back-end documentation.
@_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda = internal addrspace(3) global [16 x [16 x float]] undef, align 4
|
||||
@_ZZ14calculate_tempiPfS_S_iiiiffffffE13power_on_cuda = internal addrspace(3) global [16 x [16 x float]] undef, align 4
|
||||
@_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t = internal addrspace(3) global [16 x [16 x float]] undef, align 4
|
||||
; blockIdx / threadIdx are declared extern_weak in addrspace(1) using the
; one-byte builtin wrapper struct types; they carry no initializer here.
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
|
||||
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Weak definition of cudaMalloc inside the nvptx64 device module. It performs
; no allocation: both arguments are spilled to stack slots and a constant
; status is returned.
;   %p  address of the pointer that would receive the allocation (never written)
;   %s  requested size in bytes (never read back)
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
|
||||
entry:
|
||||
%p.addr = alloca i8**, align 8
|
||||
%s.addr = alloca i64, align 8
|
||||
store i8** %p, i8*** %p.addr, align 8
|
||||
store i64 %s, i64* %s.addr, align 8
|
||||
; NOTE(review): 999 is presumably cudaErrorUnknown — confirm against the CUDA
; runtime headers this module was built with.
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Weak definition of cudaFuncGetAttributes inside the device module. The
; attribute struct behind %p is never filled in; the arguments are only
; spilled to stack slots before a constant status is returned.
;   %p  destination cudaFuncAttributes struct (left untouched)
;   %c  opaque function handle (unused)
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
|
||||
entry:
|
||||
%p.addr = alloca %struct.cudaFuncAttributes*, align 8
|
||||
%c.addr = alloca i8*, align 8
|
||||
store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
|
||||
store i8* %c, i8** %c.addr, align 8
|
||||
; NOTE(review): 999 is presumably cudaErrorUnknown — confirm against the CUDA
; runtime headers.
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Weak definition of cudaDeviceGetAttribute inside the device module. Nothing
; is stored through %value; the three arguments are spilled to stack slots and
; a constant status is returned.
;   %value   output location for the attribute (never written)
;   %attr    attribute selector (unused)
;   %device  device ordinal (unused)
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
|
||||
entry:
|
||||
%value.addr = alloca i32*, align 8
|
||||
%attr.addr = alloca i32, align 4
|
||||
%device.addr = alloca i32, align 4
|
||||
store i32* %value, i32** %value.addr, align 8
|
||||
store i32 %attr, i32* %attr.addr, align 4
|
||||
store i32 %device, i32* %device.addr, align 4
|
||||
; NOTE(review): 999 is presumably cudaErrorUnknown — confirm against the CUDA
; runtime headers.
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Weak definition of cudaGetDevice inside the device module. The output
; pointer %device is spilled to a stack slot but never written through; a
; constant status is returned.
define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
|
||||
entry:
|
||||
%device.addr = alloca i32*, align 8
|
||||
store i32* %device, i32** %device.addr, align 8
|
||||
; NOTE(review): 999 is presumably cudaErrorUnknown — confirm against the CUDA
; runtime headers.
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Weak definition of cudaOccupancyMaxActiveBlocksPerMultiprocessor inside the
; device module. No occupancy value is computed or stored through %numBlocks;
; the arguments are spilled to stack slots and a constant status is returned.
;   %numBlocks        output location (never written)
;   %func             opaque function handle (unused)
;   %blockSize        threads per block (unused)
;   %dynamicSmemSize  dynamic shared-memory bytes (unused)
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
|
||||
entry:
|
||||
%numBlocks.addr = alloca i32*, align 8
|
||||
%func.addr = alloca i8*, align 8
|
||||
%blockSize.addr = alloca i32, align 4
|
||||
%dynamicSmemSize.addr = alloca i64, align 8
|
||||
store i32* %numBlocks, i32** %numBlocks.addr, align 8
|
||||
store i8* %func, i8** %func.addr, align 8
|
||||
store i32 %blockSize, i32* %blockSize.addr, align 4
|
||||
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
|
||||
; NOTE(review): 999 is presumably cudaErrorUnknown — confirm against the CUDA
; runtime headers.
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Weak definition of cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
; inside the device module. Same shape as the variant without flags: all five
; arguments are spilled to stack slots, nothing is written through %numBlocks,
; and a constant status is returned.
;   %numBlocks        output location (never written)
;   %func             opaque function handle (unused)
;   %blockSize        threads per block (unused)
;   %dynamicSmemSize  dynamic shared-memory bytes (unused)
;   %flags            option bits (unused)
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
|
||||
entry:
|
||||
%numBlocks.addr = alloca i32*, align 8
|
||||
%func.addr = alloca i8*, align 8
|
||||
%blockSize.addr = alloca i32, align 4
|
||||
%dynamicSmemSize.addr = alloca i64, align 8
|
||||
%flags.addr = alloca i32, align 4
|
||||
store i32* %numBlocks, i32** %numBlocks.addr, align 8
|
||||
store i8* %func, i8** %func.addr, align 8
|
||||
store i32 %blockSize, i32* %blockSize.addr, align 4
|
||||
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
|
||||
store i32 %flags, i32* %flags.addr, align 4
|
||||
; NOTE(review): 999 is presumably cudaErrorUnknown — confirm against the CUDA
; runtime headers.
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
define dso_local void @_Z14calculate_tempiPfS_S_iiiiffffff(i32 %iteration, float* %power, float* %temp_src, float* %temp_dst, i32 %grid_cols, i32 %grid_rows, i32 %border_cols, i32 %border_rows, float %Cap, float %Rx, float %Ry, float %Rz, float %step, float %time_elapsed) #0 {
|
||||
entry:
|
||||
%iteration.addr = alloca i32, align 4
|
||||
%power.addr = alloca float*, align 8
|
||||
%temp_src.addr = alloca float*, align 8
|
||||
%temp_dst.addr = alloca float*, align 8
|
||||
%grid_cols.addr = alloca i32, align 4
|
||||
%grid_rows.addr = alloca i32, align 4
|
||||
%border_cols.addr = alloca i32, align 4
|
||||
%border_rows.addr = alloca i32, align 4
|
||||
%Cap.addr = alloca float, align 4
|
||||
%Rx.addr = alloca float, align 4
|
||||
%Ry.addr = alloca float, align 4
|
||||
%Rz.addr = alloca float, align 4
|
||||
%step.addr = alloca float, align 4
|
||||
%time_elapsed.addr = alloca float, align 4
|
||||
%amb_temp = alloca float, align 4
|
||||
%step_div_Cap = alloca float, align 4
|
||||
%Rx_1 = alloca float, align 4
|
||||
%Ry_1 = alloca float, align 4
|
||||
%Rz_1 = alloca float, align 4
|
||||
%bx = alloca i32, align 4
|
||||
%by = alloca i32, align 4
|
||||
%tx = alloca i32, align 4
|
||||
%ty = alloca i32, align 4
|
||||
%small_block_rows = alloca i32, align 4
|
||||
%small_block_cols = alloca i32, align 4
|
||||
%blkY = alloca i32, align 4
|
||||
%blkX = alloca i32, align 4
|
||||
%blkYmax = alloca i32, align 4
|
||||
%blkXmax = alloca i32, align 4
|
||||
%yidx = alloca i32, align 4
|
||||
%xidx = alloca i32, align 4
|
||||
%loadYidx = alloca i32, align 4
|
||||
%loadXidx = alloca i32, align 4
|
||||
%index = alloca i32, align 4
|
||||
%validYmin = alloca i32, align 4
|
||||
%validYmax = alloca i32, align 4
|
||||
%validXmin = alloca i32, align 4
|
||||
%validXmax = alloca i32, align 4
|
||||
%N = alloca i32, align 4
|
||||
%S = alloca i32, align 4
|
||||
%W = alloca i32, align 4
|
||||
%E = alloca i32, align 4
|
||||
%computed = alloca i8, align 1
|
||||
%i = alloca i32, align 4
|
||||
store i32 %iteration, i32* %iteration.addr, align 4
|
||||
store float* %power, float** %power.addr, align 8
|
||||
store float* %temp_src, float** %temp_src.addr, align 8
|
||||
store float* %temp_dst, float** %temp_dst.addr, align 8
|
||||
store i32 %grid_cols, i32* %grid_cols.addr, align 4
|
||||
store i32 %grid_rows, i32* %grid_rows.addr, align 4
|
||||
store i32 %border_cols, i32* %border_cols.addr, align 4
|
||||
store i32 %border_rows, i32* %border_rows.addr, align 4
|
||||
store float %Cap, float* %Cap.addr, align 4
|
||||
store float %Rx, float* %Rx.addr, align 4
|
||||
store float %Ry, float* %Ry.addr, align 4
|
||||
store float %Rz, float* %Rz.addr, align 4
|
||||
store float %step, float* %step.addr, align 4
|
||||
store float %time_elapsed, float* %time_elapsed.addr, align 4
|
||||
store float 8.000000e+01, float* %amb_temp, align 4
|
||||
%call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
|
||||
store i32 %call, i32* %bx, align 4
|
||||
%call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #2
|
||||
store i32 %call1, i32* %by, align 4
|
||||
%call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
|
||||
store i32 %call2, i32* %tx, align 4
|
||||
%call3 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #2
|
||||
store i32 %call3, i32* %ty, align 4
|
||||
%0 = load float, float* %step.addr, align 4
|
||||
%1 = load float, float* %Cap.addr, align 4
|
||||
%div = fdiv float %0, %1
|
||||
store float %div, float* %step_div_Cap, align 4
|
||||
%2 = load float, float* %Rx.addr, align 4
|
||||
%div4 = fdiv float 1.000000e+00, %2
|
||||
store float %div4, float* %Rx_1, align 4
|
||||
%3 = load float, float* %Ry.addr, align 4
|
||||
%div5 = fdiv float 1.000000e+00, %3
|
||||
store float %div5, float* %Ry_1, align 4
|
||||
%4 = load float, float* %Rz.addr, align 4
|
||||
%div6 = fdiv float 1.000000e+00, %4
|
||||
store float %div6, float* %Rz_1, align 4
|
||||
%5 = load i32, i32* %iteration.addr, align 4
|
||||
%mul = mul nsw i32 %5, 2
|
||||
%sub = sub nsw i32 16, %mul
|
||||
store i32 %sub, i32* %small_block_rows, align 4
|
||||
%6 = load i32, i32* %iteration.addr, align 4
|
||||
%mul7 = mul nsw i32 %6, 2
|
||||
%sub8 = sub nsw i32 16, %mul7
|
||||
store i32 %sub8, i32* %small_block_cols, align 4
|
||||
%7 = load i32, i32* %small_block_rows, align 4
|
||||
%8 = load i32, i32* %by, align 4
|
||||
%mul9 = mul nsw i32 %7, %8
|
||||
%9 = load i32, i32* %border_rows.addr, align 4
|
||||
%sub10 = sub nsw i32 %mul9, %9
|
||||
store i32 %sub10, i32* %blkY, align 4
|
||||
%10 = load i32, i32* %small_block_cols, align 4
|
||||
%11 = load i32, i32* %bx, align 4
|
||||
%mul11 = mul nsw i32 %10, %11
|
||||
%12 = load i32, i32* %border_cols.addr, align 4
|
||||
%sub12 = sub nsw i32 %mul11, %12
|
||||
store i32 %sub12, i32* %blkX, align 4
|
||||
%13 = load i32, i32* %blkY, align 4
|
||||
%add = add nsw i32 %13, 16
|
||||
%sub13 = sub nsw i32 %add, 1
|
||||
store i32 %sub13, i32* %blkYmax, align 4
|
||||
%14 = load i32, i32* %blkX, align 4
|
||||
%add14 = add nsw i32 %14, 16
|
||||
%sub15 = sub nsw i32 %add14, 1
|
||||
store i32 %sub15, i32* %blkXmax, align 4
|
||||
%15 = load i32, i32* %blkY, align 4
|
||||
%16 = load i32, i32* %ty, align 4
|
||||
%add16 = add nsw i32 %15, %16
|
||||
store i32 %add16, i32* %yidx, align 4
|
||||
%17 = load i32, i32* %blkX, align 4
|
||||
%18 = load i32, i32* %tx, align 4
|
||||
%add17 = add nsw i32 %17, %18
|
||||
store i32 %add17, i32* %xidx, align 4
|
||||
%19 = load i32, i32* %yidx, align 4
|
||||
store i32 %19, i32* %loadYidx, align 4
|
||||
%20 = load i32, i32* %xidx, align 4
|
||||
store i32 %20, i32* %loadXidx, align 4
|
||||
%21 = load i32, i32* %grid_cols.addr, align 4
|
||||
%22 = load i32, i32* %loadYidx, align 4
|
||||
%mul18 = mul nsw i32 %21, %22
|
||||
%23 = load i32, i32* %loadXidx, align 4
|
||||
%add19 = add nsw i32 %mul18, %23
|
||||
store i32 %add19, i32* %index, align 4
|
||||
%24 = load i32, i32* %loadYidx, align 4
|
||||
%cmp = icmp sge i32 %24, 0
|
||||
br i1 %cmp, label %land.lhs.true, label %if.end
|
||||
|
||||
land.lhs.true: ; preds = %entry
|
||||
%25 = load i32, i32* %loadYidx, align 4
|
||||
%26 = load i32, i32* %grid_rows.addr, align 4
|
||||
%sub20 = sub nsw i32 %26, 1
|
||||
%cmp21 = icmp sle i32 %25, %sub20
|
||||
br i1 %cmp21, label %land.lhs.true22, label %if.end
|
||||
|
||||
land.lhs.true22: ; preds = %land.lhs.true
|
||||
%27 = load i32, i32* %loadXidx, align 4
|
||||
%cmp23 = icmp sge i32 %27, 0
|
||||
br i1 %cmp23, label %land.lhs.true24, label %if.end
|
||||
|
||||
land.lhs.true24: ; preds = %land.lhs.true22
|
||||
%28 = load i32, i32* %loadXidx, align 4
|
||||
%29 = load i32, i32* %grid_cols.addr, align 4
|
||||
%sub25 = sub nsw i32 %29, 1
|
||||
%cmp26 = icmp sle i32 %28, %sub25
|
||||
br i1 %cmp26, label %if.then, label %if.end
|
||||
|
||||
if.then: ; preds = %land.lhs.true24
|
||||
%30 = load float*, float** %temp_src.addr, align 8
|
||||
%31 = load i32, i32* %index, align 4
|
||||
%idxprom = sext i32 %31 to i64
|
||||
%arrayidx = getelementptr inbounds float, float* %30, i64 %idxprom
|
||||
%32 = load float, float* %arrayidx, align 4
|
||||
%33 = load i32, i32* %ty, align 4
|
||||
%idxprom27 = sext i32 %33 to i64
|
||||
%arrayidx28 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom27
|
||||
%34 = load i32, i32* %tx, align 4
|
||||
%idxprom29 = sext i32 %34 to i64
|
||||
%arrayidx30 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx28, i64 0, i64 %idxprom29
|
||||
store float %32, float* %arrayidx30, align 4
|
||||
%35 = load float*, float** %power.addr, align 8
|
||||
%36 = load i32, i32* %index, align 4
|
||||
%idxprom31 = sext i32 %36 to i64
|
||||
%arrayidx32 = getelementptr inbounds float, float* %35, i64 %idxprom31
|
||||
%37 = load float, float* %arrayidx32, align 4
|
||||
%38 = load i32, i32* %ty, align 4
|
||||
%idxprom33 = sext i32 %38 to i64
|
||||
%arrayidx34 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE13power_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom33
|
||||
%39 = load i32, i32* %tx, align 4
|
||||
%idxprom35 = sext i32 %39 to i64
|
||||
%arrayidx36 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx34, i64 0, i64 %idxprom35
|
||||
store float %37, float* %arrayidx36, align 4
|
||||
br label %if.end
|
||||
|
||||
if.end: ; preds = %if.then, %land.lhs.true24, %land.lhs.true22, %land.lhs.true, %entry
|
||||
call void @llvm.nvvm.barrier0()
|
||||
%40 = load i32, i32* %blkY, align 4
|
||||
%cmp37 = icmp slt i32 %40, 0
|
||||
br i1 %cmp37, label %cond.true, label %cond.false
|
||||
|
||||
cond.true: ; preds = %if.end
|
||||
%41 = load i32, i32* %blkY, align 4
|
||||
%sub38 = sub nsw i32 0, %41
|
||||
br label %cond.end
|
||||
|
||||
cond.false: ; preds = %if.end
|
||||
br label %cond.end
|
||||
|
||||
cond.end: ; preds = %cond.false, %cond.true
|
||||
%cond = phi i32 [ %sub38, %cond.true ], [ 0, %cond.false ]
|
||||
store i32 %cond, i32* %validYmin, align 4
|
||||
%42 = load i32, i32* %blkYmax, align 4
|
||||
%43 = load i32, i32* %grid_rows.addr, align 4
|
||||
%sub39 = sub nsw i32 %43, 1
|
||||
%cmp40 = icmp sgt i32 %42, %sub39
|
||||
br i1 %cmp40, label %cond.true41, label %cond.false45
|
||||
|
||||
cond.true41: ; preds = %cond.end
|
||||
%44 = load i32, i32* %blkYmax, align 4
|
||||
%45 = load i32, i32* %grid_rows.addr, align 4
|
||||
%sub42 = sub nsw i32 %44, %45
|
||||
%add43 = add nsw i32 %sub42, 1
|
||||
%sub44 = sub nsw i32 15, %add43
|
||||
br label %cond.end46
|
||||
|
||||
cond.false45: ; preds = %cond.end
|
||||
br label %cond.end46
|
||||
|
||||
cond.end46: ; preds = %cond.false45, %cond.true41
|
||||
%cond47 = phi i32 [ %sub44, %cond.true41 ], [ 15, %cond.false45 ]
|
||||
store i32 %cond47, i32* %validYmax, align 4
|
||||
%46 = load i32, i32* %blkX, align 4
|
||||
%cmp48 = icmp slt i32 %46, 0
|
||||
br i1 %cmp48, label %cond.true49, label %cond.false51
|
||||
|
||||
cond.true49: ; preds = %cond.end46
|
||||
%47 = load i32, i32* %blkX, align 4
|
||||
%sub50 = sub nsw i32 0, %47
|
||||
br label %cond.end52
|
||||
|
||||
cond.false51: ; preds = %cond.end46
|
||||
br label %cond.end52
|
||||
|
||||
cond.end52: ; preds = %cond.false51, %cond.true49
|
||||
%cond53 = phi i32 [ %sub50, %cond.true49 ], [ 0, %cond.false51 ]
|
||||
store i32 %cond53, i32* %validXmin, align 4
|
||||
%48 = load i32, i32* %blkXmax, align 4
|
||||
%49 = load i32, i32* %grid_cols.addr, align 4
|
||||
%sub54 = sub nsw i32 %49, 1
|
||||
%cmp55 = icmp sgt i32 %48, %sub54
|
||||
br i1 %cmp55, label %cond.true56, label %cond.false60
|
||||
|
||||
cond.true56: ; preds = %cond.end52
|
||||
%50 = load i32, i32* %blkXmax, align 4
|
||||
%51 = load i32, i32* %grid_cols.addr, align 4
|
||||
%sub57 = sub nsw i32 %50, %51
|
||||
%add58 = add nsw i32 %sub57, 1
|
||||
%sub59 = sub nsw i32 15, %add58
|
||||
br label %cond.end61
|
||||
|
||||
cond.false60: ; preds = %cond.end52
|
||||
br label %cond.end61
|
||||
|
||||
cond.end61: ; preds = %cond.false60, %cond.true56
|
||||
%cond62 = phi i32 [ %sub59, %cond.true56 ], [ 15, %cond.false60 ]
|
||||
store i32 %cond62, i32* %validXmax, align 4
|
||||
%52 = load i32, i32* %ty, align 4
|
||||
%sub63 = sub nsw i32 %52, 1
|
||||
store i32 %sub63, i32* %N, align 4
|
||||
%53 = load i32, i32* %ty, align 4
|
||||
%add64 = add nsw i32 %53, 1
|
||||
store i32 %add64, i32* %S, align 4
|
||||
%54 = load i32, i32* %tx, align 4
|
||||
%sub65 = sub nsw i32 %54, 1
|
||||
store i32 %sub65, i32* %W, align 4
|
||||
%55 = load i32, i32* %tx, align 4
|
||||
%add66 = add nsw i32 %55, 1
|
||||
store i32 %add66, i32* %E, align 4
|
||||
%56 = load i32, i32* %N, align 4
|
||||
%57 = load i32, i32* %validYmin, align 4
|
||||
%cmp67 = icmp slt i32 %56, %57
|
||||
br i1 %cmp67, label %cond.true68, label %cond.false69
|
||||
|
||||
cond.true68: ; preds = %cond.end61
|
||||
%58 = load i32, i32* %validYmin, align 4
|
||||
br label %cond.end70
|
||||
|
||||
cond.false69: ; preds = %cond.end61
|
||||
%59 = load i32, i32* %N, align 4
|
||||
br label %cond.end70
|
||||
|
||||
cond.end70: ; preds = %cond.false69, %cond.true68
|
||||
%cond71 = phi i32 [ %58, %cond.true68 ], [ %59, %cond.false69 ]
|
||||
store i32 %cond71, i32* %N, align 4
|
||||
%60 = load i32, i32* %S, align 4
|
||||
%61 = load i32, i32* %validYmax, align 4
|
||||
%cmp72 = icmp sgt i32 %60, %61
|
||||
br i1 %cmp72, label %cond.true73, label %cond.false74
|
||||
|
||||
cond.true73: ; preds = %cond.end70
|
||||
%62 = load i32, i32* %validYmax, align 4
|
||||
br label %cond.end75
|
||||
|
||||
cond.false74: ; preds = %cond.end70
|
||||
%63 = load i32, i32* %S, align 4
|
||||
br label %cond.end75
|
||||
|
||||
cond.end75: ; preds = %cond.false74, %cond.true73
|
||||
%cond76 = phi i32 [ %62, %cond.true73 ], [ %63, %cond.false74 ]
|
||||
store i32 %cond76, i32* %S, align 4
|
||||
%64 = load i32, i32* %W, align 4
|
||||
%65 = load i32, i32* %validXmin, align 4
|
||||
%cmp77 = icmp slt i32 %64, %65
|
||||
br i1 %cmp77, label %cond.true78, label %cond.false79
|
||||
|
||||
cond.true78: ; preds = %cond.end75
|
||||
%66 = load i32, i32* %validXmin, align 4
|
||||
br label %cond.end80
|
||||
|
||||
cond.false79: ; preds = %cond.end75
|
||||
%67 = load i32, i32* %W, align 4
|
||||
br label %cond.end80
|
||||
|
||||
cond.end80: ; preds = %cond.false79, %cond.true78
|
||||
%cond81 = phi i32 [ %66, %cond.true78 ], [ %67, %cond.false79 ]
|
||||
store i32 %cond81, i32* %W, align 4
|
||||
%68 = load i32, i32* %E, align 4
|
||||
%69 = load i32, i32* %validXmax, align 4
|
||||
%cmp82 = icmp sgt i32 %68, %69
|
||||
br i1 %cmp82, label %cond.true83, label %cond.false84
|
||||
|
||||
cond.true83: ; preds = %cond.end80
|
||||
%70 = load i32, i32* %validXmax, align 4
|
||||
br label %cond.end85
|
||||
|
||||
cond.false84: ; preds = %cond.end80
|
||||
%71 = load i32, i32* %E, align 4
|
||||
br label %cond.end85
|
||||
|
||||
cond.end85: ; preds = %cond.false84, %cond.true83
|
||||
%cond86 = phi i32 [ %70, %cond.true83 ], [ %71, %cond.false84 ]
|
||||
store i32 %cond86, i32* %E, align 4
|
||||
store i32 0, i32* %i, align 4
|
||||
br label %for.cond
|
||||
|
||||
for.cond: ; preds = %for.inc, %cond.end85
|
||||
%72 = load i32, i32* %i, align 4
|
||||
%73 = load i32, i32* %iteration.addr, align 4
|
||||
%cmp87 = icmp slt i32 %72, %73
|
||||
br i1 %cmp87, label %for.body, label %for.end
|
||||
|
||||
for.body: ; preds = %for.cond
|
||||
store i8 0, i8* %computed, align 1
|
||||
%74 = load i32, i32* %tx, align 4
|
||||
%75 = load i32, i32* %i, align 4
|
||||
%add88 = add nsw i32 %75, 1
|
||||
%cmp89 = icmp sge i32 %74, %add88
|
||||
br i1 %cmp89, label %land.lhs.true90, label %if.end175
|
||||
|
||||
land.lhs.true90: ; preds = %for.body
|
||||
%76 = load i32, i32* %tx, align 4
|
||||
%77 = load i32, i32* %i, align 4
|
||||
%sub91 = sub nsw i32 16, %77
|
||||
%sub92 = sub nsw i32 %sub91, 2
|
||||
%cmp93 = icmp sle i32 %76, %sub92
|
||||
br i1 %cmp93, label %land.lhs.true94, label %if.end175
|
||||
|
||||
land.lhs.true94: ; preds = %land.lhs.true90
|
||||
%78 = load i32, i32* %ty, align 4
|
||||
%79 = load i32, i32* %i, align 4
|
||||
%add95 = add nsw i32 %79, 1
|
||||
%cmp96 = icmp sge i32 %78, %add95
|
||||
br i1 %cmp96, label %land.lhs.true97, label %if.end175
|
||||
|
||||
land.lhs.true97: ; preds = %land.lhs.true94
|
||||
%80 = load i32, i32* %ty, align 4
|
||||
%81 = load i32, i32* %i, align 4
|
||||
%sub98 = sub nsw i32 16, %81
|
||||
%sub99 = sub nsw i32 %sub98, 2
|
||||
%cmp100 = icmp sle i32 %80, %sub99
|
||||
br i1 %cmp100, label %land.lhs.true101, label %if.end175
|
||||
|
||||
land.lhs.true101: ; preds = %land.lhs.true97
|
||||
%82 = load i32, i32* %tx, align 4
|
||||
%83 = load i32, i32* %validXmin, align 4
|
||||
%cmp102 = icmp sge i32 %82, %83
|
||||
br i1 %cmp102, label %land.lhs.true103, label %if.end175
|
||||
|
||||
land.lhs.true103: ; preds = %land.lhs.true101
|
||||
%84 = load i32, i32* %tx, align 4
|
||||
%85 = load i32, i32* %validXmax, align 4
|
||||
%cmp104 = icmp sle i32 %84, %85
|
||||
br i1 %cmp104, label %land.lhs.true105, label %if.end175
|
||||
|
||||
land.lhs.true105: ; preds = %land.lhs.true103
|
||||
%86 = load i32, i32* %ty, align 4
|
||||
%87 = load i32, i32* %validYmin, align 4
|
||||
%cmp106 = icmp sge i32 %86, %87
|
||||
br i1 %cmp106, label %land.lhs.true107, label %if.end175
|
||||
|
||||
land.lhs.true107: ; preds = %land.lhs.true105
|
||||
%88 = load i32, i32* %ty, align 4
|
||||
%89 = load i32, i32* %validYmax, align 4
|
||||
%cmp108 = icmp sle i32 %88, %89
|
||||
br i1 %cmp108, label %if.then109, label %if.end175
|
||||
|
||||
if.then109: ; preds = %land.lhs.true107
|
||||
store i8 1, i8* %computed, align 1
|
||||
%90 = load i32, i32* %ty, align 4
|
||||
%idxprom110 = sext i32 %90 to i64
|
||||
%arrayidx111 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom110
|
||||
%91 = load i32, i32* %tx, align 4
|
||||
%idxprom112 = sext i32 %91 to i64
|
||||
%arrayidx113 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx111, i64 0, i64 %idxprom112
|
||||
%92 = load float, float* %arrayidx113, align 4
|
||||
%conv = fpext float %92 to double
|
||||
%93 = load float, float* %step_div_Cap, align 4
|
||||
%conv114 = fpext float %93 to double
|
||||
%94 = load i32, i32* %ty, align 4
|
||||
%idxprom115 = sext i32 %94 to i64
|
||||
%arrayidx116 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE13power_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom115
|
||||
%95 = load i32, i32* %tx, align 4
|
||||
%idxprom117 = sext i32 %95 to i64
|
||||
%arrayidx118 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx116, i64 0, i64 %idxprom117
|
||||
%96 = load float, float* %arrayidx118, align 4
|
||||
%conv119 = fpext float %96 to double
|
||||
%97 = load i32, i32* %S, align 4
|
||||
%idxprom120 = sext i32 %97 to i64
|
||||
%arrayidx121 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom120
|
||||
%98 = load i32, i32* %tx, align 4
|
||||
%idxprom122 = sext i32 %98 to i64
|
||||
%arrayidx123 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx121, i64 0, i64 %idxprom122
|
||||
%99 = load float, float* %arrayidx123, align 4
|
||||
%100 = load i32, i32* %N, align 4
|
||||
%idxprom124 = sext i32 %100 to i64
|
||||
%arrayidx125 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom124
|
||||
%101 = load i32, i32* %tx, align 4
|
||||
%idxprom126 = sext i32 %101 to i64
|
||||
%arrayidx127 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx125, i64 0, i64 %idxprom126
|
||||
%102 = load float, float* %arrayidx127, align 4
|
||||
%add128 = fadd contract float %99, %102
|
||||
%conv129 = fpext float %add128 to double
|
||||
%103 = load i32, i32* %ty, align 4
|
||||
%idxprom130 = sext i32 %103 to i64
|
||||
%arrayidx131 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom130
|
||||
%104 = load i32, i32* %tx, align 4
|
||||
%idxprom132 = sext i32 %104 to i64
|
||||
%arrayidx133 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx131, i64 0, i64 %idxprom132
|
||||
%105 = load float, float* %arrayidx133, align 4
|
||||
%conv134 = fpext float %105 to double
|
||||
%mul135 = fmul contract double 2.000000e+00, %conv134
|
||||
%sub136 = fsub contract double %conv129, %mul135
|
||||
%106 = load float, float* %Ry_1, align 4
|
||||
%conv137 = fpext float %106 to double
|
||||
%mul138 = fmul contract double %sub136, %conv137
|
||||
%add139 = fadd contract double %conv119, %mul138
|
||||
%107 = load i32, i32* %ty, align 4
|
||||
%idxprom140 = sext i32 %107 to i64
|
||||
%arrayidx141 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom140
|
||||
%108 = load i32, i32* %E, align 4
|
||||
%idxprom142 = sext i32 %108 to i64
|
||||
%arrayidx143 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx141, i64 0, i64 %idxprom142
|
||||
%109 = load float, float* %arrayidx143, align 4
|
||||
%110 = load i32, i32* %ty, align 4
|
||||
%idxprom144 = sext i32 %110 to i64
|
||||
%arrayidx145 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom144
|
||||
%111 = load i32, i32* %W, align 4
|
||||
%idxprom146 = sext i32 %111 to i64
|
||||
%arrayidx147 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx145, i64 0, i64 %idxprom146
|
||||
%112 = load float, float* %arrayidx147, align 4
|
||||
%add148 = fadd contract float %109, %112
|
||||
%conv149 = fpext float %add148 to double
|
||||
%113 = load i32, i32* %ty, align 4
|
||||
%idxprom150 = sext i32 %113 to i64
|
||||
%arrayidx151 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom150
|
||||
%114 = load i32, i32* %tx, align 4
|
||||
%idxprom152 = sext i32 %114 to i64
|
||||
%arrayidx153 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx151, i64 0, i64 %idxprom152
|
||||
%115 = load float, float* %arrayidx153, align 4
|
||||
%conv154 = fpext float %115 to double
|
||||
%mul155 = fmul contract double 2.000000e+00, %conv154
|
||||
%sub156 = fsub contract double %conv149, %mul155
|
||||
%116 = load float, float* %Rx_1, align 4
|
||||
%conv157 = fpext float %116 to double
|
||||
%mul158 = fmul contract double %sub156, %conv157
|
||||
%add159 = fadd contract double %add139, %mul158
|
||||
%117 = load float, float* %amb_temp, align 4
|
||||
%118 = load i32, i32* %ty, align 4
|
||||
%idxprom160 = sext i32 %118 to i64
|
||||
%arrayidx161 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom160
|
||||
%119 = load i32, i32* %tx, align 4
|
||||
%idxprom162 = sext i32 %119 to i64
|
||||
%arrayidx163 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx161, i64 0, i64 %idxprom162
|
||||
%120 = load float, float* %arrayidx163, align 4
|
||||
%sub164 = fsub contract float %117, %120
|
||||
%121 = load float, float* %Rz_1, align 4
|
||||
%mul165 = fmul contract float %sub164, %121
|
||||
%conv166 = fpext float %mul165 to double
|
||||
%add167 = fadd contract double %add159, %conv166
|
||||
%mul168 = fmul contract double %conv114, %add167
|
||||
%add169 = fadd contract double %conv, %mul168
|
||||
%conv170 = fptrunc double %add169 to float
|
||||
%122 = load i32, i32* %ty, align 4
|
||||
%idxprom171 = sext i32 %122 to i64
|
||||
%arrayidx172 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t to [16 x [16 x float]]*), i64 0, i64 %idxprom171
|
||||
%123 = load i32, i32* %tx, align 4
|
||||
%idxprom173 = sext i32 %123 to i64
|
||||
%arrayidx174 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx172, i64 0, i64 %idxprom173
|
||||
store float %conv170, float* %arrayidx174, align 4
|
||||
br label %if.end175
|
||||
|
||||
if.end175: ; preds = %if.then109, %land.lhs.true107, %land.lhs.true105, %land.lhs.true103, %land.lhs.true101, %land.lhs.true97, %land.lhs.true94, %land.lhs.true90, %for.body
|
||||
call void @llvm.nvvm.barrier0()
|
||||
%124 = load i32, i32* %i, align 4
|
||||
%125 = load i32, i32* %iteration.addr, align 4
|
||||
%sub176 = sub nsw i32 %125, 1
|
||||
%cmp177 = icmp eq i32 %124, %sub176
|
||||
br i1 %cmp177, label %if.then178, label %if.end179
|
||||
|
||||
if.then178: ; preds = %if.end175
|
||||
br label %for.end
|
||||
|
||||
if.end179: ; preds = %if.end175
|
||||
%126 = load i8, i8* %computed, align 1
|
||||
%tobool = trunc i8 %126 to i1
|
||||
br i1 %tobool, label %if.then180, label %if.end189
|
||||
|
||||
if.then180: ; preds = %if.end179
|
||||
%127 = load i32, i32* %ty, align 4
|
||||
%idxprom181 = sext i32 %127 to i64
|
||||
%arrayidx182 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t to [16 x [16 x float]]*), i64 0, i64 %idxprom181
|
||||
%128 = load i32, i32* %tx, align 4
|
||||
%idxprom183 = sext i32 %128 to i64
|
||||
%arrayidx184 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx182, i64 0, i64 %idxprom183
|
||||
%129 = load float, float* %arrayidx184, align 4
|
||||
%130 = load i32, i32* %ty, align 4
|
||||
%idxprom185 = sext i32 %130 to i64
|
||||
%arrayidx186 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom185
|
||||
%131 = load i32, i32* %tx, align 4
|
||||
%idxprom187 = sext i32 %131 to i64
|
||||
%arrayidx188 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx186, i64 0, i64 %idxprom187
|
||||
store float %129, float* %arrayidx188, align 4
|
||||
br label %if.end189
|
||||
|
||||
if.end189: ; preds = %if.then180, %if.end179
|
||||
call void @llvm.nvvm.barrier0()
|
||||
br label %for.inc
|
||||
|
||||
for.inc: ; preds = %if.end189
|
||||
%132 = load i32, i32* %i, align 4
|
||||
%inc = add nsw i32 %132, 1
|
||||
store i32 %inc, i32* %i, align 4
|
||||
br label %for.cond
|
||||
|
||||
for.end: ; preds = %if.then178, %for.cond
|
||||
%133 = load i8, i8* %computed, align 1
|
||||
%tobool190 = trunc i8 %133 to i1
|
||||
br i1 %tobool190, label %if.then191, label %if.end198
|
||||
|
||||
if.then191: ; preds = %for.end
|
||||
%134 = load i32, i32* %ty, align 4
|
||||
%idxprom192 = sext i32 %134 to i64
|
||||
%arrayidx193 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t to [16 x [16 x float]]*), i64 0, i64 %idxprom192
|
||||
%135 = load i32, i32* %tx, align 4
|
||||
%idxprom194 = sext i32 %135 to i64
|
||||
%arrayidx195 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx193, i64 0, i64 %idxprom194
|
||||
%136 = load float, float* %arrayidx195, align 4
|
||||
%137 = load float*, float** %temp_dst.addr, align 8
|
||||
%138 = load i32, i32* %index, align 4
|
||||
%idxprom196 = sext i32 %138 to i64
|
||||
%arrayidx197 = getelementptr inbounds float, float* %137, i64 %idxprom196
|
||||
store float %136, float* %arrayidx197, align 4
|
||||
br label %if.end198
|
||||
|
||||
if.end198: ; preds = %if.then191, %for.end
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: alwaysinline convergent nounwind
|
||||
; Itanium-mangled __cuda_builtin_blockIdx_t::__fetch_builtin_x(): takes no
; arguments, calls the llvm.nvvm.read.ptx.sreg.ctaid.x intrinsic and returns
; its i32 result unchanged.
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
|
||||
entry:
|
||||
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
|
||||
ret i32 %0
|
||||
}
|
||||
|
||||
; Function Attrs: alwaysinline convergent nounwind
|
||||
; Itanium-mangled __cuda_builtin_blockIdx_t::__fetch_builtin_y(): takes no
; arguments, calls the llvm.nvvm.read.ptx.sreg.ctaid.y intrinsic and returns
; its i32 result unchanged.
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
|
||||
entry:
|
||||
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
|
||||
ret i32 %0
|
||||
}
|
||||
|
||||
; Function Attrs: alwaysinline convergent nounwind
|
||||
; Itanium-mangled __cuda_builtin_threadIdx_t::__fetch_builtin_x(): takes no
; arguments, calls the llvm.nvvm.read.ptx.sreg.tid.x intrinsic and returns
; its i32 result unchanged.
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
|
||||
entry:
|
||||
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||||
ret i32 %0
|
||||
}
|
||||
|
||||
; Function Attrs: alwaysinline convergent nounwind
|
||||
; Itanium-mangled __cuda_builtin_threadIdx_t::__fetch_builtin_y(): takes no
; arguments, calls the llvm.nvvm.read.ptx.sreg.tid.y intrinsic and returns
; its i32 result unchanged.
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
|
||||
entry:
|
||||
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
|
||||
ret i32 %0
|
||||
}
|
||||
|
||||
; Function Attrs: convergent nounwind
|
||||
; External declarations of the NVVM intrinsics called by the functions in this
; module: barrier0 (void, attribute group #2) and the four i32 special-register
; readers ctaid.x / ctaid.y / tid.x / tid.y (attribute group #3).
declare void @llvm.nvvm.barrier0() #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #3
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #3
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #3
|
||||
|
||||
attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #2 = { convergent nounwind }
|
||||
attributes #3 = { nounwind readnone }
|
||||
|
||||
!llvm.module.flags = !{!0, !1, !2}
|
||||
!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
|
||||
!llvm.ident = !{!8}
|
||||
!nvvmir.version = !{!9}
|
||||
|
||||
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
|
||||
!1 = !{i32 1, !"wchar_size", i32 4}
|
||||
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
|
||||
!3 = !{void (i32, float*, float*, float*, i32, i32, i32, i32, float, float, float, float, float, float)* @_Z14calculate_tempiPfS_S_iiiiffffff, !"kernel", i32 1}
|
||||
!4 = !{null, !"align", i32 8}
|
||||
!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
|
||||
!6 = !{null, !"align", i32 16}
|
||||
!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
|
||||
!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
|
||||
!9 = !{i32 1, i32 4}
|
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,353 @@
|
|||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <time.h>
|
||||
|
||||
/* BLOCK_SIZE: edge length used for the square thread block
 * (dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE)) and for the square shared-memory
 * tiles in calculate_temp(). The first of RD_WG_SIZE_0_0, RD_WG_SIZE_0 and
 * RD_WG_SIZE that is defined (e.g. on the compiler command line) supplies the
 * value; when none is defined it falls back to 16. */
#ifdef RD_WG_SIZE_0_0
|
||||
#define BLOCK_SIZE RD_WG_SIZE_0_0
|
||||
#elif defined(RD_WG_SIZE_0)
|
||||
#define BLOCK_SIZE RD_WG_SIZE_0
|
||||
#elif defined(RD_WG_SIZE)
|
||||
#define BLOCK_SIZE RD_WG_SIZE
|
||||
#else
|
||||
#define BLOCK_SIZE 16
|
||||
#endif
|
||||
|
||||
/* Size, in chars, of the fixed text-line buffers (char str[STR_SIZE]) used by
 * the file I/O helpers in this file. */
#define STR_SIZE 256
|
||||
|
||||
/* maximum power density possible (say 300W for a 10mm x 10mm chip);
 * compute_tran_temp() divides it by (FACTOR_CHIP * t_chip * SPEC_HEAT_SI) to
 * obtain max_slope */
|
||||
#define MAX_PD (3.0e6)
|
||||
/* required precision in degrees; compute_tran_temp() derives the time step as
 * PRECISION / max_slope */
|
||||
#define PRECISION 0.001
|
||||
/* Factor in the Cap and max_slope formulas of compute_tran_temp().
 * NOTE(review): presumably the volumetric specific heat in SI units - confirm. */
#define SPEC_HEAT_SI 1.75e6
|
||||
/* Integer factor in the denominators of Rx, Ry and Rz in compute_tran_temp().
 * NOTE(review): presumably the thermal conductivity in SI units - confirm. */
#define K_SI 100
|
||||
/* capacitance fitting factor */
|
||||
#define FACTOR_CHIP 0.5
|
||||
|
||||
/* chip parameters: thickness (t_chip) and the two edge lengths, read by
 * compute_tran_temp(). NOTE(review): units are not stated here; metres assumed
 * from the 16mm-scale values - confirm. */
|
||||
float t_chip = 0.0005;
|
||||
float chip_height = 0.016;
|
||||
float chip_width = 0.016;
|
||||
/* ambient temperature, assuming no package at all.
 * NOTE(review): calculate_temp() declares its own local amb_temp = 80.0, so the
 * kernel does not read this global. */
|
||||
float amb_temp = 80.0;
|
||||
|
||||
void run(int argc, char **argv);
|
||||
|
||||
/* define timer macros */
|
||||
/* Timer hooks. pin_stats_reset() and pin_stats_pause() forward to
 * startCycle() / stopCycle(), which are not defined in the visible part of
 * this file (TODO confirm where they are provided). */
#define pin_stats_reset() startCycle()
#define pin_stats_pause(cycles) stopCycle(cycles)
/* Print a cycle counter as "timer: <n>\n". Uses %llu, the standard conversion
 * for unsigned long long; the former "%Lu" paired the long-double length
 * modifier with an integer conversion, which ISO C leaves undefined.
 * `cycles` is expected to be an unsigned long long. */
#define pin_stats_dump(cycles) printf("timer: %llu\n", cycles)
|
||||
|
||||
/* Report an error on stderr as "error: <s>\n".
 *
 * s: NUL-terminated message. It is const-qualified so that string literals and
 *    other read-only messages can be passed without a deprecated conversion.
 *
 * Despite its name this function only prints; it returns to the caller and
 * does not terminate the program. */
void fatal(const char *s) { fprintf(stderr, "error: %s\n", s); }
|
||||
|
||||
/* Write a grid to a text file, one cell per line.
 *
 * vect:      grid values, read as vect[i * grid_cols + j] (row-major).
 * grid_rows: number of rows to write.
 * grid_cols: number of columns to write.
 * file:      path of the output file; it is created or truncated.
 *
 * Each line has the form "<index>\t<value>\n", where <index> counts cells from
 * 0 in row-major order and <value> is formatted with %g.
 *
 * If the file cannot be opened, "The file was not opened" is printed to stdout
 * and the function returns without writing anything (previously execution
 * continued and wrote through a NULL FILE pointer). */
void writeoutput(float *vect, int grid_rows, int grid_cols, char *file) {
  FILE *fp = fopen(file, "w");
  if (fp == NULL) {
    printf("The file was not opened\n");
    return;
  }

  int index = 0;
  for (int i = 0; i < grid_rows; i++) {
    for (int j = 0; j < grid_cols; j++) {
      /* Format straight into the stream; no intermediate buffer is needed. */
      fprintf(fp, "%d\t%g\n", index, vect[i * grid_cols + j]);
      index++;
    }
  }

  fclose(fp);
}
|
||||
|
||||
void readinput(float *vect, int grid_rows, int grid_cols, char *file) {
|
||||
|
||||
int i, j;
|
||||
FILE *fp;
|
||||
char str[STR_SIZE];
|
||||
float val;
|
||||
|
||||
if ((fp = fopen(file, "r")) == 0)
|
||||
printf("The file was not opened\n");
|
||||
|
||||
for (i = 0; i <= grid_rows - 1; i++)
|
||||
for (j = 0; j <= grid_cols - 1; j++) {
|
||||
fgets(str, STR_SIZE, fp);
|
||||
if (feof(fp))
|
||||
fatal("not enough lines in file");
|
||||
// if ((sscanf(str, "%d%f", &index, &val) != 2) || (index !=
|
||||
// ((i-1)*(grid_cols-2)+j-1)))
|
||||
if ((sscanf(str, "%f", &val) != 1))
|
||||
fatal("invalid file format");
|
||||
vect[i * grid_cols + j] = val;
|
||||
}
|
||||
|
||||
fclose(fp);
|
||||
}
|
||||
|
||||
/* True when min <= x <= max (both bounds inclusive). x is evaluated twice. */
#define IN_RANGE(x, min, max) ((x) >= (min) && (x) <= (max))
/* Clamp the lvalue x into [min, max] in place; the expression yields the
 * clamped value. Every argument and the whole expansion are parenthesised so
 * the macro composes safely inside larger expressions. Arguments may be
 * evaluated more than once, so they must be free of side effects. */
#define CLAMP_RANGE(x, min, max)                                               \
  ((x) = ((x) < (min)) ? (min) : (((x) > (max)) ? (max) : (x)))
/* Smaller of a and b; whichever argument is selected is evaluated twice. */
#define MIN(a, b) ((a) <= (b) ? (a) : (b))
|
||||
|
||||
/*
 * Advance one tile of the temperature grid by `iteration` time steps.
 *
 * Each thread block stages a BLOCK_SIZE x BLOCK_SIZE tile of temp_src and
 * power in shared memory, applies the update `iteration` times on a region
 * that shrinks by one cell per side per step, and finally writes the values
 * produced by the last step to temp_dst.
 *
 * Parameters:
 *   iteration    number of time steps to fuse in this launch
 *   power        power input, indexed as power[grid_cols * y + x]
 *   temp_src     temperature input, same indexing
 *   temp_dst     temperature output, same indexing
 *   grid_cols    number of columns of the grid
 *   grid_rows    number of rows of the grid
 *   border_cols  column offset subtracted from each tile's origin
 *   border_rows  row offset subtracted from each tile's origin
 *   Cap, step    only the ratio step / Cap is used
 *   Rx, Ry, Rz   only their reciprocals are used
 *   time_elapsed not read by the kernel body
 *
 * Every __syncthreads() sits outside the per-thread conditionals, and the
 * early `break` depends only on `i` and `iteration`, so all threads of a block
 * reach the same barriers.
 *
 * NOTE(review): the 2.0 literals are double, so the two neighbour-difference
 * terms are evaluated in double precision before the result is narrowed back
 * to float. Left as is to keep results bit-identical.
 */
__global__ void calculate_temp(int iteration,   // number of iteration
                               float *power,    // power input
                               float *temp_src, // temperature input/output
                               float *temp_dst, // temperature input/output
                               int grid_cols,   // Col of grid
                               int grid_rows,   // Row of grid
                               int border_cols, // border offset
                               int border_rows, // border offset
                               float Cap,       // Capacitance
                               float Rx, float Ry, float Rz, float step,
                               float time_elapsed) {

  __shared__ float temp_on_cuda[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ float power_on_cuda[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ float temp_t[BLOCK_SIZE]
                         [BLOCK_SIZE]; // saving temporary temperature result

  float amb_temp = 80.0;
  float step_div_Cap;
  float Rx_1, Ry_1, Rz_1;

  int bx = blockIdx.x;
  int by = blockIdx.y;

  int tx = threadIdx.x;
  int ty = threadIdx.y;

  step_div_Cap = step / Cap;

  Rx_1 = 1 / Rx;
  Ry_1 = 1 / Ry;
  Rz_1 = 1 / Rz;

  // each block finally computes result for a small block
  // after N iterations.
  // it is the non-overlapping small blocks that cover
  // all the input data

  // calculate the small block size
  int small_block_rows = BLOCK_SIZE - iteration * 2; // EXPAND_RATE
  int small_block_cols = BLOCK_SIZE - iteration * 2; // EXPAND_RATE

  // calculate the boundary for the block according to
  // the boundary of its small block
  int blkY = small_block_rows * by - border_rows;
  int blkX = small_block_cols * bx - border_cols;
  int blkYmax = blkY + BLOCK_SIZE - 1;
  int blkXmax = blkX + BLOCK_SIZE - 1;

  // calculate the global thread coordinates
  int yidx = blkY + ty;
  int xidx = blkX + tx;

  // load data if it is within the valid input range
  int loadYidx = yidx, loadXidx = xidx;
  int index = grid_cols * loadYidx + loadXidx;

  if (IN_RANGE(loadYidx, 0, grid_rows - 1) &&
      IN_RANGE(loadXidx, 0, grid_cols - 1)) {
    temp_on_cuda[ty][tx] = temp_src[index]; // Load the temperature data from
                                            // global memory to shared memory
    power_on_cuda[ty][tx] =
        power[index]; // Load the power data from global memory to shared memory
  }
  __syncthreads();

  // effective range within this block that falls within
  // the valid range of the input data
  // used to rule out computation outside the boundary.
  int validYmin = (blkY < 0) ? -blkY : 0;
  int validYmax = (blkYmax > grid_rows - 1)
                      ? BLOCK_SIZE - 1 - (blkYmax - grid_rows + 1)
                      : BLOCK_SIZE - 1;
  int validXmin = (blkX < 0) ? -blkX : 0;
  int validXmax = (blkXmax > grid_cols - 1)
                      ? BLOCK_SIZE - 1 - (blkXmax - grid_cols + 1)
                      : BLOCK_SIZE - 1;

  // neighbour coordinates inside the tile, clamped to the valid range so that
  // cells on the grid edge reuse their own row/column as the missing neighbour
  int N = ty - 1;
  int S = ty + 1;
  int W = tx - 1;
  int E = tx + 1;

  N = (N < validYmin) ? validYmin : N;
  S = (S > validYmax) ? validYmax : S;
  W = (W < validXmin) ? validXmin : W;
  E = (E > validXmax) ? validXmax : E;

  // Initialised so that the final check is well defined even when the loop
  // body never runs (iteration <= 0); in that case nothing is written back.
  bool computed = false;
  for (int i = 0; i < iteration; i++) {
    computed = false;
    if (IN_RANGE(tx, i + 1, BLOCK_SIZE - i - 2) &&
        IN_RANGE(ty, i + 1, BLOCK_SIZE - i - 2) &&
        IN_RANGE(tx, validXmin, validXmax) &&
        IN_RANGE(ty, validYmin, validYmax)) {
      computed = true;
      temp_t[ty][tx] =
          temp_on_cuda[ty][tx] +
          step_div_Cap * (power_on_cuda[ty][tx] +
                          (temp_on_cuda[S][tx] + temp_on_cuda[N][tx] -
                           2.0 * temp_on_cuda[ty][tx]) *
                              Ry_1 +
                          (temp_on_cuda[ty][E] + temp_on_cuda[ty][W] -
                           2.0 * temp_on_cuda[ty][tx]) *
                              Rx_1 +
                          (amb_temp - temp_on_cuda[ty][tx]) * Rz_1);
    }
    __syncthreads();
    // the last step's result stays in temp_t and is written out below
    if (i == iteration - 1)
      break;
    if (computed) // Assign the computation range
      temp_on_cuda[ty][tx] = temp_t[ty][tx];
    __syncthreads();
  }

  // update the global memory
  // after the last iteration, only threads coordinated within the
  // small block perform the calculation and switch on ``computed''
  if (computed) {
    temp_dst[index] = temp_t[ty][tx];
  }
}
|
||||
|
||||
/*
|
||||
compute N time steps
|
||||
*/
|
||||
|
||||
int compute_tran_temp(float *MatrixPower, float *MatrixTemp[2], int col,
|
||||
int row, int total_iterations, int num_iterations,
|
||||
int blockCols, int blockRows, int borderCols,
|
||||
int borderRows) {
|
||||
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
|
||||
dim3 dimGrid(blockCols, blockRows);
|
||||
|
||||
float grid_height = chip_height / row;
|
||||
float grid_width = chip_width / col;
|
||||
|
||||
float Cap = FACTOR_CHIP * SPEC_HEAT_SI * t_chip * grid_width * grid_height;
|
||||
float Rx = grid_width / (2.0 * K_SI * t_chip * grid_height);
|
||||
float Ry = grid_height / (2.0 * K_SI * t_chip * grid_width);
|
||||
float Rz = t_chip / (K_SI * grid_height * grid_width);
|
||||
|
||||
float max_slope = MAX_PD / (FACTOR_CHIP * t_chip * SPEC_HEAT_SI);
|
||||
float step = PRECISION / max_slope;
|
||||
float t;
|
||||
float time_elapsed;
|
||||
time_elapsed = 0.001;
|
||||
|
||||
int src = 1, dst = 0;
|
||||
|
||||
for (t = 0; t < total_iterations; t += num_iterations) {
|
||||
int temp = src;
|
||||
src = dst;
|
||||
dst = temp;
|
||||
calculate_temp<<<dimGrid, dimBlock>>>(
|
||||
MIN(num_iterations, total_iterations - t), MatrixPower, MatrixTemp[src],
|
||||
MatrixTemp[dst], col, row, borderCols, borderRows, Cap, Rx, Ry, Rz,
|
||||
step, time_elapsed);
|
||||
cudaDeviceSynchronize();
|
||||
}
|
||||
return dst;
|
||||
}
|
||||
|
||||
/**
 * Prints the command-line synopsis and a description of every argument to
 * stderr, then terminates the process with exit status 1.
 *
 * @param argc unused; kept so existing call sites stay valid
 * @param argv argv[0] is printed as the program name
 */
void usage(int argc, char **argv) {
  (void)argc;

  fprintf(stderr,
          "Usage: %s <grid_rows/grid_cols> <pyramid_height> <sim_time> "
          "<temp_file> <power_file> <output_file>\n",
          argv[0]);
  fprintf(stderr, "\t<grid_rows/grid_cols> - number of rows/cols in the grid "
                  "(positive integer)\n");
  // Fixed typo in the help text ("heigh(" -> "height (").
  fprintf(stderr, "\t<pyramid_height> - pyramid height (positive integer)\n");
  fprintf(stderr, "\t<sim_time> - number of iterations\n");
  fprintf(stderr, "\t<temp_file> - name of the file containing the initial "
                  "temperature values of each cell\n");
  fprintf(stderr, "\t<power_file> - name of the file containing the dissipated "
                  "power values of each cell\n");
  fprintf(stderr, "\t<output_file> - name of the output file\n");

  exit(1);
}
|
||||
|
||||
/**
 * Program entry point: selects device 0, reports the block dimensions used
 * for the kernel and hands the command line over to run().
 *
 * @return EXIT_SUCCESS once run() has returned.
 */
int main(int argc, char **argv) {
  // The status returned by cudaSetDevice is not inspected.
  cudaSetDevice(0);

  const int block_edge = BLOCK_SIZE;
  printf("WG size of kernel = %d X %d\n", block_edge, block_edge);

  run(argc, argv);
  return EXIT_SUCCESS;
}
|
||||
|
||||
void run(int argc, char **argv) {
|
||||
int size;
|
||||
int grid_rows, grid_cols;
|
||||
float *FilesavingTemp, *FilesavingPower, *MatrixOut;
|
||||
char *tfile, *pfile, *ofile;
|
||||
|
||||
int total_iterations = 60;
|
||||
int pyramid_height = 1; // number of iterations
|
||||
|
||||
if (argc != 7)
|
||||
usage(argc, argv);
|
||||
if ((grid_rows = atoi(argv[1])) <= 0 || (grid_cols = atoi(argv[1])) <= 0 ||
|
||||
(pyramid_height = atoi(argv[2])) <= 0 ||
|
||||
(total_iterations = atoi(argv[3])) <= 0)
|
||||
usage(argc, argv);
|
||||
|
||||
tfile = argv[4];
|
||||
pfile = argv[5];
|
||||
ofile = argv[6];
|
||||
|
||||
size = grid_rows * grid_cols;
|
||||
|
||||
/* --------------- pyramid parameters --------------- */
|
||||
#define EXPAND_RATE \
|
||||
2 // add one iteration will extend the pyramid base by 2 per each borderline
|
||||
int borderCols = (pyramid_height)*EXPAND_RATE / 2;
|
||||
int borderRows = (pyramid_height)*EXPAND_RATE / 2;
|
||||
int smallBlockCol = BLOCK_SIZE - (pyramid_height)*EXPAND_RATE;
|
||||
int smallBlockRow = BLOCK_SIZE - (pyramid_height)*EXPAND_RATE;
|
||||
int blockCols =
|
||||
grid_cols / smallBlockCol + ((grid_cols % smallBlockCol == 0) ? 0 : 1);
|
||||
int blockRows =
|
||||
grid_rows / smallBlockRow + ((grid_rows % smallBlockRow == 0) ? 0 : 1);
|
||||
|
||||
FilesavingTemp = (float *)malloc(size * sizeof(float));
|
||||
FilesavingPower = (float *)malloc(size * sizeof(float));
|
||||
MatrixOut = (float *)calloc(size, sizeof(float));
|
||||
|
||||
if (!FilesavingPower || !FilesavingTemp || !MatrixOut)
|
||||
fatal("unable to allocate memory");
|
||||
|
||||
printf("pyramidHeight: %d\ngridSize: [%d, %d]\nborder:[%d, "
|
||||
"%d]\nblockGrid:[%d, %d]\ntargetBlock:[%d, %d]\n",
|
||||
pyramid_height, grid_cols, grid_rows, borderCols, borderRows,
|
||||
blockCols, blockRows, smallBlockCol, smallBlockRow);
|
||||
|
||||
readinput(FilesavingTemp, grid_rows, grid_cols, tfile);
|
||||
readinput(FilesavingPower, grid_rows, grid_cols, pfile);
|
||||
|
||||
float *MatrixTemp[2], *MatrixPower;
|
||||
cudaMalloc((void **)&MatrixTemp[0], sizeof(float) * size);
|
||||
cudaMalloc((void **)&MatrixTemp[1], sizeof(float) * size);
|
||||
cudaMemcpy(MatrixTemp[0], FilesavingTemp, sizeof(float) * size,
|
||||
cudaMemcpyHostToDevice);
|
||||
|
||||
cudaMalloc((void **)&MatrixPower, sizeof(float) * size);
|
||||
cudaMemcpy(MatrixPower, FilesavingPower, sizeof(float) * size,
|
||||
cudaMemcpyHostToDevice);
|
||||
printf("Start computing the transient temperature\n");
|
||||
int ret = compute_tran_temp(MatrixPower, MatrixTemp, grid_cols, grid_rows,
|
||||
total_iterations, pyramid_height, blockCols,
|
||||
blockRows, borderCols, borderRows);
|
||||
printf("Ending simulation\n");
|
||||
cudaMemcpy(MatrixOut, MatrixTemp[ret], sizeof(float) * size,
|
||||
cudaMemcpyDeviceToHost);
|
||||
|
||||
writeoutput(MatrixOut, grid_rows, grid_cols, ofile);
|
||||
|
||||
cudaFree(MatrixPower);
|
||||
cudaFree(MatrixTemp[0]);
|
||||
cudaFree(MatrixTemp[1]);
|
||||
free(MatrixOut);
|
||||
}
|
|
@ -0,0 +1,21 @@
|
|||
#!/bin/bash
# Builds the hotspot example from its LLVM IR, links it against the x86
# runtime, runs it and checks the beginning of the output file.
# All paths are relative, so run this from the directory containing the .ll
# files.
set -e

# Assemble the textual IR into bitcode.
llvm-as hotspot-cuda-nvptx64-nvidia-cuda-sm_61.ll
llvm-as hotspot-host-x86_64-unknown-linux-gnu.ll

# Translate the kernel and host bitcode.
../../build/compilation/kernelTranslator hotspot-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
../../build/compilation/hostTranslator hotspot-host-x86_64-unknown-linux-gnu.bc host.bc

# Compile both modules to position-independent object files.
llc --relocation-model=pic --filetype=obj kernel.bc
llc --relocation-model=pic --filetype=obj host.bc

g++ -Wall -L../../build/runtime -L../../build/runtime/threadPool \
  -o hotspot -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread

# Append the previous value only when it is non-empty: a trailing ':' would
# add an empty entry, which the dynamic loader treats as the current directory.
export LD_LIBRARY_PATH="../../build/runtime:../../build/runtime/threadPool${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"

./hotspot 512 2 2 ../../rodinia-data/hotspot/temp_512 ../../rodinia-data/hotspot/power_512 output.out

# -F matches the reference value literally; as a regex the '.' would match
# any character.
if head output.out | grep -qF "323.829"; then
  echo "Pass"
else
  echo "Error result"
  exit 1
fi
|
|
@ -0,0 +1,587 @@
|
|||
; ModuleID = '3D-cuda-nvptx64-nvidia-cuda-sm_61.bc'
|
||||
source_filename = "3D.cu"
|
||||
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
|
||||
target triple = "nvptx64-nvidia-cuda"
|
||||
|
||||
%struct.__cuda_builtin_blockDim_t = type { i8 }
|
||||
%struct.__cuda_builtin_blockIdx_t = type { i8 }
|
||||
%struct.__cuda_builtin_threadIdx_t = type { i8 }
|
||||
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
|
||||
|
||||
$_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv = comdat any
|
||||
|
||||
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
|
||||
|
||||
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
|
||||
|
||||
$_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv = comdat any
|
||||
|
||||
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv = comdat any
|
||||
|
||||
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv = comdat any
|
||||
|
||||
@blockDim = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockDim_t, align 1
|
||||
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
|
||||
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
|
||||
entry:
|
||||
%p.addr = alloca i8**, align 8
|
||||
%s.addr = alloca i64, align 8
|
||||
store i8** %p, i8*** %p.addr, align 8
|
||||
store i64 %s, i64* %s.addr, align 8
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
|
||||
entry:
|
||||
%p.addr = alloca %struct.cudaFuncAttributes*, align 8
|
||||
%c.addr = alloca i8*, align 8
|
||||
store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
|
||||
store i8* %c, i8** %c.addr, align 8
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
|
||||
entry:
|
||||
%value.addr = alloca i32*, align 8
|
||||
%attr.addr = alloca i32, align 4
|
||||
%device.addr = alloca i32, align 4
|
||||
store i32* %value, i32** %value.addr, align 8
|
||||
store i32 %attr, i32* %attr.addr, align 4
|
||||
store i32 %device, i32* %device.addr, align 4
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
|
||||
entry:
|
||||
%device.addr = alloca i32*, align 8
|
||||
store i32* %device, i32** %device.addr, align 8
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
|
||||
entry:
|
||||
%numBlocks.addr = alloca i32*, align 8
|
||||
%func.addr = alloca i8*, align 8
|
||||
%blockSize.addr = alloca i32, align 4
|
||||
%dynamicSmemSize.addr = alloca i64, align 8
|
||||
store i32* %numBlocks, i32** %numBlocks.addr, align 8
|
||||
store i8* %func, i8** %func.addr, align 8
|
||||
store i32 %blockSize, i32* %blockSize.addr, align 4
|
||||
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
|
||||
entry:
|
||||
%numBlocks.addr = alloca i32*, align 8
|
||||
%func.addr = alloca i8*, align 8
|
||||
%blockSize.addr = alloca i32, align 4
|
||||
%dynamicSmemSize.addr = alloca i64, align 8
|
||||
%flags.addr = alloca i32, align 4
|
||||
store i32* %numBlocks, i32** %numBlocks.addr, align 8
|
||||
store i8* %func, i8** %func.addr, align 8
|
||||
store i32 %blockSize, i32* %blockSize.addr, align 4
|
||||
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
|
||||
store i32 %flags, i32* %flags.addr, align 4
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
define dso_local void @_Z11hotspotOpt1PfS_S_fiiifffffff(float* %p, float* %tIn, float* %tOut, float %sdc, i32 %nx, i32 %ny, i32 %nz, float %ce, float %cw, float %cn, float %cs, float %ct, float %cb, float %cc) #0 {
|
||||
entry:
|
||||
%p.addr = alloca float*, align 8
|
||||
%tIn.addr = alloca float*, align 8
|
||||
%tOut.addr = alloca float*, align 8
|
||||
%sdc.addr = alloca float, align 4
|
||||
%nx.addr = alloca i32, align 4
|
||||
%ny.addr = alloca i32, align 4
|
||||
%nz.addr = alloca i32, align 4
|
||||
%ce.addr = alloca float, align 4
|
||||
%cw.addr = alloca float, align 4
|
||||
%cn.addr = alloca float, align 4
|
||||
%cs.addr = alloca float, align 4
|
||||
%ct.addr = alloca float, align 4
|
||||
%cb.addr = alloca float, align 4
|
||||
%cc.addr = alloca float, align 4
|
||||
%amb_temp = alloca float, align 4
|
||||
%i = alloca i32, align 4
|
||||
%j = alloca i32, align 4
|
||||
%c = alloca i32, align 4
|
||||
%xy = alloca i32, align 4
|
||||
%W = alloca i32, align 4
|
||||
%E = alloca i32, align 4
|
||||
%N = alloca i32, align 4
|
||||
%S = alloca i32, align 4
|
||||
%temp1 = alloca float, align 4
|
||||
%temp2 = alloca float, align 4
|
||||
%temp3 = alloca float, align 4
|
||||
%k = alloca i32, align 4
|
||||
store float* %p, float** %p.addr, align 8
|
||||
store float* %tIn, float** %tIn.addr, align 8
|
||||
store float* %tOut, float** %tOut.addr, align 8
|
||||
store float %sdc, float* %sdc.addr, align 4
|
||||
store i32 %nx, i32* %nx.addr, align 4
|
||||
store i32 %ny, i32* %ny.addr, align 4
|
||||
store i32 %nz, i32* %nz.addr, align 4
|
||||
store float %ce, float* %ce.addr, align 4
|
||||
store float %cw, float* %cw.addr, align 4
|
||||
store float %cn, float* %cn.addr, align 4
|
||||
store float %cs, float* %cs.addr, align 4
|
||||
store float %ct, float* %ct.addr, align 4
|
||||
store float %cb, float* %cb.addr, align 4
|
||||
store float %cc, float* %cc.addr, align 4
|
||||
store float 8.000000e+01, float* %amb_temp, align 4
|
||||
%call = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
|
||||
%call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
|
||||
%mul = mul i32 %call, %call1
|
||||
%call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
|
||||
%add = add i32 %mul, %call2
|
||||
store i32 %add, i32* %i, align 4
|
||||
%call3 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #3
|
||||
%call4 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #3
|
||||
%mul5 = mul i32 %call3, %call4
|
||||
%call6 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #3
|
||||
%add7 = add i32 %mul5, %call6
|
||||
store i32 %add7, i32* %j, align 4
|
||||
%0 = load i32, i32* %i, align 4
|
||||
%1 = load i32, i32* %j, align 4
|
||||
%2 = load i32, i32* %nx.addr, align 4
|
||||
%mul8 = mul nsw i32 %1, %2
|
||||
%add9 = add nsw i32 %0, %mul8
|
||||
store i32 %add9, i32* %c, align 4
|
||||
%3 = load i32, i32* %nx.addr, align 4
|
||||
%4 = load i32, i32* %ny.addr, align 4
|
||||
%mul10 = mul nsw i32 %3, %4
|
||||
store i32 %mul10, i32* %xy, align 4
|
||||
%5 = load i32, i32* %i, align 4
|
||||
%cmp = icmp eq i32 %5, 0
|
||||
br i1 %cmp, label %cond.true, label %cond.false
|
||||
|
||||
cond.true: ; preds = %entry
|
||||
%6 = load i32, i32* %c, align 4
|
||||
br label %cond.end
|
||||
|
||||
cond.false: ; preds = %entry
|
||||
%7 = load i32, i32* %c, align 4
|
||||
%sub = sub nsw i32 %7, 1
|
||||
br label %cond.end
|
||||
|
||||
cond.end: ; preds = %cond.false, %cond.true
|
||||
%cond = phi i32 [ %6, %cond.true ], [ %sub, %cond.false ]
|
||||
store i32 %cond, i32* %W, align 4
|
||||
%8 = load i32, i32* %i, align 4
|
||||
%9 = load i32, i32* %nx.addr, align 4
|
||||
%sub11 = sub nsw i32 %9, 1
|
||||
%cmp12 = icmp eq i32 %8, %sub11
|
||||
br i1 %cmp12, label %cond.true13, label %cond.false14
|
||||
|
||||
cond.true13: ; preds = %cond.end
|
||||
%10 = load i32, i32* %c, align 4
|
||||
br label %cond.end16
|
||||
|
||||
cond.false14: ; preds = %cond.end
|
||||
%11 = load i32, i32* %c, align 4
|
||||
%add15 = add nsw i32 %11, 1
|
||||
br label %cond.end16
|
||||
|
||||
cond.end16: ; preds = %cond.false14, %cond.true13
|
||||
%cond17 = phi i32 [ %10, %cond.true13 ], [ %add15, %cond.false14 ]
|
||||
store i32 %cond17, i32* %E, align 4
|
||||
%12 = load i32, i32* %j, align 4
|
||||
%cmp18 = icmp eq i32 %12, 0
|
||||
br i1 %cmp18, label %cond.true19, label %cond.false20
|
||||
|
||||
cond.true19: ; preds = %cond.end16
|
||||
%13 = load i32, i32* %c, align 4
|
||||
br label %cond.end22
|
||||
|
||||
cond.false20: ; preds = %cond.end16
|
||||
%14 = load i32, i32* %c, align 4
|
||||
%15 = load i32, i32* %nx.addr, align 4
|
||||
%sub21 = sub nsw i32 %14, %15
|
||||
br label %cond.end22
|
||||
|
||||
cond.end22: ; preds = %cond.false20, %cond.true19
|
||||
%cond23 = phi i32 [ %13, %cond.true19 ], [ %sub21, %cond.false20 ]
|
||||
store i32 %cond23, i32* %N, align 4
|
||||
%16 = load i32, i32* %j, align 4
|
||||
%17 = load i32, i32* %ny.addr, align 4
|
||||
%sub24 = sub nsw i32 %17, 1
|
||||
%cmp25 = icmp eq i32 %16, %sub24
|
||||
br i1 %cmp25, label %cond.true26, label %cond.false27
|
||||
|
||||
cond.true26: ; preds = %cond.end22
|
||||
%18 = load i32, i32* %c, align 4
|
||||
br label %cond.end29
|
||||
|
||||
cond.false27: ; preds = %cond.end22
|
||||
%19 = load i32, i32* %c, align 4
|
||||
%20 = load i32, i32* %nx.addr, align 4
|
||||
%add28 = add nsw i32 %19, %20
|
||||
br label %cond.end29
|
||||
|
||||
cond.end29: ; preds = %cond.false27, %cond.true26
|
||||
%cond30 = phi i32 [ %18, %cond.true26 ], [ %add28, %cond.false27 ]
|
||||
store i32 %cond30, i32* %S, align 4
|
||||
%21 = load float*, float** %tIn.addr, align 8
|
||||
%22 = load i32, i32* %c, align 4
|
||||
%idxprom = sext i32 %22 to i64
|
||||
%arrayidx = getelementptr inbounds float, float* %21, i64 %idxprom
|
||||
%23 = load float, float* %arrayidx, align 4
|
||||
store float %23, float* %temp2, align 4
|
||||
store float %23, float* %temp1, align 4
|
||||
%24 = load float*, float** %tIn.addr, align 8
|
||||
%25 = load i32, i32* %c, align 4
|
||||
%26 = load i32, i32* %xy, align 4
|
||||
%add31 = add nsw i32 %25, %26
|
||||
%idxprom32 = sext i32 %add31 to i64
|
||||
%arrayidx33 = getelementptr inbounds float, float* %24, i64 %idxprom32
|
||||
%27 = load float, float* %arrayidx33, align 4
|
||||
store float %27, float* %temp3, align 4
|
||||
%28 = load float, float* %cc.addr, align 4
|
||||
%29 = load float, float* %temp2, align 4
|
||||
%mul34 = fmul contract float %28, %29
|
||||
%30 = load float, float* %cw.addr, align 4
|
||||
%31 = load float*, float** %tIn.addr, align 8
|
||||
%32 = load i32, i32* %W, align 4
|
||||
%idxprom35 = sext i32 %32 to i64
|
||||
%arrayidx36 = getelementptr inbounds float, float* %31, i64 %idxprom35
|
||||
%33 = load float, float* %arrayidx36, align 4
|
||||
%mul37 = fmul contract float %30, %33
|
||||
%add38 = fadd contract float %mul34, %mul37
|
||||
%34 = load float, float* %ce.addr, align 4
|
||||
%35 = load float*, float** %tIn.addr, align 8
|
||||
%36 = load i32, i32* %E, align 4
|
||||
%idxprom39 = sext i32 %36 to i64
|
||||
%arrayidx40 = getelementptr inbounds float, float* %35, i64 %idxprom39
|
||||
%37 = load float, float* %arrayidx40, align 4
|
||||
%mul41 = fmul contract float %34, %37
|
||||
%add42 = fadd contract float %add38, %mul41
|
||||
%38 = load float, float* %cs.addr, align 4
|
||||
%39 = load float*, float** %tIn.addr, align 8
|
||||
%40 = load i32, i32* %S, align 4
|
||||
%idxprom43 = sext i32 %40 to i64
|
||||
%arrayidx44 = getelementptr inbounds float, float* %39, i64 %idxprom43
|
||||
%41 = load float, float* %arrayidx44, align 4
|
||||
%mul45 = fmul contract float %38, %41
|
||||
%add46 = fadd contract float %add42, %mul45
|
||||
%42 = load float, float* %cn.addr, align 4
|
||||
%43 = load float*, float** %tIn.addr, align 8
|
||||
%44 = load i32, i32* %N, align 4
|
||||
%idxprom47 = sext i32 %44 to i64
|
||||
%arrayidx48 = getelementptr inbounds float, float* %43, i64 %idxprom47
|
||||
%45 = load float, float* %arrayidx48, align 4
|
||||
%mul49 = fmul contract float %42, %45
|
||||
%add50 = fadd contract float %add46, %mul49
|
||||
%46 = load float, float* %cb.addr, align 4
|
||||
%47 = load float, float* %temp1, align 4
|
||||
%mul51 = fmul contract float %46, %47
|
||||
%add52 = fadd contract float %add50, %mul51
|
||||
%48 = load float, float* %ct.addr, align 4
|
||||
%49 = load float, float* %temp3, align 4
|
||||
%mul53 = fmul contract float %48, %49
|
||||
%add54 = fadd contract float %add52, %mul53
|
||||
%50 = load float, float* %sdc.addr, align 4
|
||||
%51 = load float*, float** %p.addr, align 8
|
||||
%52 = load i32, i32* %c, align 4
|
||||
%idxprom55 = sext i32 %52 to i64
|
||||
%arrayidx56 = getelementptr inbounds float, float* %51, i64 %idxprom55
|
||||
%53 = load float, float* %arrayidx56, align 4
|
||||
%mul57 = fmul contract float %50, %53
|
||||
%add58 = fadd contract float %add54, %mul57
|
||||
%54 = load float, float* %ct.addr, align 4
|
||||
%55 = load float, float* %amb_temp, align 4
|
||||
%mul59 = fmul contract float %54, %55
|
||||
%add60 = fadd contract float %add58, %mul59
|
||||
%56 = load float*, float** %tOut.addr, align 8
|
||||
%57 = load i32, i32* %c, align 4
|
||||
%idxprom61 = sext i32 %57 to i64
|
||||
%arrayidx62 = getelementptr inbounds float, float* %56, i64 %idxprom61
|
||||
store float %add60, float* %arrayidx62, align 4
|
||||
%58 = load i32, i32* %xy, align 4
|
||||
%59 = load i32, i32* %c, align 4
|
||||
%add63 = add nsw i32 %59, %58
|
||||
store i32 %add63, i32* %c, align 4
|
||||
%60 = load i32, i32* %xy, align 4
|
||||
%61 = load i32, i32* %W, align 4
|
||||
%add64 = add nsw i32 %61, %60
|
||||
store i32 %add64, i32* %W, align 4
|
||||
%62 = load i32, i32* %xy, align 4
|
||||
%63 = load i32, i32* %E, align 4
|
||||
%add65 = add nsw i32 %63, %62
|
||||
store i32 %add65, i32* %E, align 4
|
||||
%64 = load i32, i32* %xy, align 4
|
||||
%65 = load i32, i32* %N, align 4
|
||||
%add66 = add nsw i32 %65, %64
|
||||
store i32 %add66, i32* %N, align 4
|
||||
%66 = load i32, i32* %xy, align 4
|
||||
%67 = load i32, i32* %S, align 4
|
||||
%add67 = add nsw i32 %67, %66
|
||||
store i32 %add67, i32* %S, align 4
|
||||
store i32 1, i32* %k, align 4
|
||||
br label %for.cond
|
||||
|
||||
for.cond: ; preds = %for.inc, %cond.end29
|
||||
%68 = load i32, i32* %k, align 4
|
||||
%69 = load i32, i32* %nz.addr, align 4
|
||||
%sub68 = sub nsw i32 %69, 1
|
||||
%cmp69 = icmp slt i32 %68, %sub68
|
||||
br i1 %cmp69, label %for.body, label %for.end
|
||||
|
||||
for.body: ; preds = %for.cond
|
||||
%70 = load float, float* %temp2, align 4
|
||||
store float %70, float* %temp1, align 4
|
||||
%71 = load float, float* %temp3, align 4
|
||||
store float %71, float* %temp2, align 4
|
||||
%72 = load float*, float** %tIn.addr, align 8
|
||||
%73 = load i32, i32* %c, align 4
|
||||
%74 = load i32, i32* %xy, align 4
|
||||
%add70 = add nsw i32 %73, %74
|
||||
%idxprom71 = sext i32 %add70 to i64
|
||||
%arrayidx72 = getelementptr inbounds float, float* %72, i64 %idxprom71
|
||||
%75 = load float, float* %arrayidx72, align 4
|
||||
store float %75, float* %temp3, align 4
|
||||
%76 = load float, float* %cc.addr, align 4
|
||||
%77 = load float, float* %temp2, align 4
|
||||
%mul73 = fmul contract float %76, %77
|
||||
%78 = load float, float* %cw.addr, align 4
|
||||
%79 = load float*, float** %tIn.addr, align 8
|
||||
%80 = load i32, i32* %W, align 4
|
||||
%idxprom74 = sext i32 %80 to i64
|
||||
%arrayidx75 = getelementptr inbounds float, float* %79, i64 %idxprom74
|
||||
%81 = load float, float* %arrayidx75, align 4
|
||||
%mul76 = fmul contract float %78, %81
|
||||
%add77 = fadd contract float %mul73, %mul76
|
||||
%82 = load float, float* %ce.addr, align 4
|
||||
%83 = load float*, float** %tIn.addr, align 8
|
||||
%84 = load i32, i32* %E, align 4
|
||||
%idxprom78 = sext i32 %84 to i64
|
||||
%arrayidx79 = getelementptr inbounds float, float* %83, i64 %idxprom78
|
||||
%85 = load float, float* %arrayidx79, align 4
|
||||
%mul80 = fmul contract float %82, %85
|
||||
%add81 = fadd contract float %add77, %mul80
|
||||
%86 = load float, float* %cs.addr, align 4
|
||||
%87 = load float*, float** %tIn.addr, align 8
|
||||
%88 = load i32, i32* %S, align 4
|
||||
%idxprom82 = sext i32 %88 to i64
|
||||
%arrayidx83 = getelementptr inbounds float, float* %87, i64 %idxprom82
|
||||
%89 = load float, float* %arrayidx83, align 4
|
||||
%mul84 = fmul contract float %86, %89
|
||||
%add85 = fadd contract float %add81, %mul84
|
||||
%90 = load float, float* %cn.addr, align 4
|
||||
%91 = load float*, float** %tIn.addr, align 8
|
||||
%92 = load i32, i32* %N, align 4
|
||||
%idxprom86 = sext i32 %92 to i64
|
||||
%arrayidx87 = getelementptr inbounds float, float* %91, i64 %idxprom86
|
||||
%93 = load float, float* %arrayidx87, align 4
|
||||
%mul88 = fmul contract float %90, %93
|
||||
%add89 = fadd contract float %add85, %mul88
|
||||
%94 = load float, float* %cb.addr, align 4
|
||||
%95 = load float, float* %temp1, align 4
|
||||
%mul90 = fmul contract float %94, %95
|
||||
%add91 = fadd contract float %add89, %mul90
|
||||
%96 = load float, float* %ct.addr, align 4
|
||||
%97 = load float, float* %temp3, align 4
|
||||
%mul92 = fmul contract float %96, %97
|
||||
%add93 = fadd contract float %add91, %mul92
|
||||
%98 = load float, float* %sdc.addr, align 4
|
||||
%99 = load float*, float** %p.addr, align 8
|
||||
%100 = load i32, i32* %c, align 4
|
||||
%idxprom94 = sext i32 %100 to i64
|
||||
%arrayidx95 = getelementptr inbounds float, float* %99, i64 %idxprom94
|
||||
%101 = load float, float* %arrayidx95, align 4
|
||||
%mul96 = fmul contract float %98, %101
|
||||
%add97 = fadd contract float %add93, %mul96
|
||||
%102 = load float, float* %ct.addr, align 4
|
||||
%103 = load float, float* %amb_temp, align 4
|
||||
%mul98 = fmul contract float %102, %103
|
||||
%add99 = fadd contract float %add97, %mul98
|
||||
%104 = load float*, float** %tOut.addr, align 8
|
||||
%105 = load i32, i32* %c, align 4
|
||||
%idxprom100 = sext i32 %105 to i64
|
||||
%arrayidx101 = getelementptr inbounds float, float* %104, i64 %idxprom100
|
||||
store float %add99, float* %arrayidx101, align 4
|
||||
%106 = load i32, i32* %xy, align 4
|
||||
%107 = load i32, i32* %c, align 4
|
||||
%add102 = add nsw i32 %107, %106
|
||||
store i32 %add102, i32* %c, align 4
|
||||
%108 = load i32, i32* %xy, align 4
|
||||
%109 = load i32, i32* %W, align 4
|
||||
%add103 = add nsw i32 %109, %108
|
||||
store i32 %add103, i32* %W, align 4
|
||||
%110 = load i32, i32* %xy, align 4
|
||||
%111 = load i32, i32* %E, align 4
|
||||
%add104 = add nsw i32 %111, %110
|
||||
store i32 %add104, i32* %E, align 4
|
||||
%112 = load i32, i32* %xy, align 4
|
||||
%113 = load i32, i32* %N, align 4
|
||||
%add105 = add nsw i32 %113, %112
|
||||
store i32 %add105, i32* %N, align 4
|
||||
%114 = load i32, i32* %xy, align 4
|
||||
%115 = load i32, i32* %S, align 4
|
||||
%add106 = add nsw i32 %115, %114
|
||||
store i32 %add106, i32* %S, align 4
|
||||
br label %for.inc
|
||||
|
||||
for.inc: ; preds = %for.body
|
||||
%116 = load i32, i32* %k, align 4
|
||||
%inc = add nsw i32 %116, 1
|
||||
store i32 %inc, i32* %k, align 4
|
||||
br label %for.cond
|
||||
|
||||
for.end: ; preds = %for.cond
|
||||
%117 = load float, float* %temp2, align 4
|
||||
store float %117, float* %temp1, align 4
|
||||
%118 = load float, float* %temp3, align 4
|
||||
store float %118, float* %temp2, align 4
|
||||
%119 = load float, float* %cc.addr, align 4
|
||||
%120 = load float, float* %temp2, align 4
|
||||
%mul107 = fmul contract float %119, %120
|
||||
%121 = load float, float* %cw.addr, align 4
|
||||
%122 = load float*, float** %tIn.addr, align 8
|
||||
%123 = load i32, i32* %W, align 4
|
||||
%idxprom108 = sext i32 %123 to i64
|
||||
%arrayidx109 = getelementptr inbounds float, float* %122, i64 %idxprom108
|
||||
%124 = load float, float* %arrayidx109, align 4
|
||||
%mul110 = fmul contract float %121, %124
|
||||
%add111 = fadd contract float %mul107, %mul110
|
||||
%125 = load float, float* %ce.addr, align 4
|
||||
%126 = load float*, float** %tIn.addr, align 8
|
||||
%127 = load i32, i32* %E, align 4
|
||||
%idxprom112 = sext i32 %127 to i64
|
||||
%arrayidx113 = getelementptr inbounds float, float* %126, i64 %idxprom112
|
||||
%128 = load float, float* %arrayidx113, align 4
|
||||
%mul114 = fmul contract float %125, %128
|
||||
%add115 = fadd contract float %add111, %mul114
|
||||
%129 = load float, float* %cs.addr, align 4
|
||||
%130 = load float*, float** %tIn.addr, align 8
|
||||
%131 = load i32, i32* %S, align 4
|
||||
%idxprom116 = sext i32 %131 to i64
|
||||
%arrayidx117 = getelementptr inbounds float, float* %130, i64 %idxprom116
|
||||
%132 = load float, float* %arrayidx117, align 4
|
||||
%mul118 = fmul contract float %129, %132
|
||||
%add119 = fadd contract float %add115, %mul118
|
||||
%133 = load float, float* %cn.addr, align 4
|
||||
%134 = load float*, float** %tIn.addr, align 8
|
||||
%135 = load i32, i32* %N, align 4
|
||||
%idxprom120 = sext i32 %135 to i64
|
||||
%arrayidx121 = getelementptr inbounds float, float* %134, i64 %idxprom120
|
||||
%136 = load float, float* %arrayidx121, align 4
|
||||
%mul122 = fmul contract float %133, %136
|
||||
%add123 = fadd contract float %add119, %mul122
|
||||
%137 = load float, float* %cb.addr, align 4
|
||||
%138 = load float, float* %temp1, align 4
|
||||
%mul124 = fmul contract float %137, %138
|
||||
%add125 = fadd contract float %add123, %mul124
|
||||
%139 = load float, float* %ct.addr, align 4
|
||||
%140 = load float, float* %temp3, align 4
|
||||
%mul126 = fmul contract float %139, %140
|
||||
%add127 = fadd contract float %add125, %mul126
|
||||
%141 = load float, float* %sdc.addr, align 4
|
||||
%142 = load float*, float** %p.addr, align 8
|
||||
%143 = load i32, i32* %c, align 4
|
||||
%idxprom128 = sext i32 %143 to i64
|
||||
%arrayidx129 = getelementptr inbounds float, float* %142, i64 %idxprom128
|
||||
%144 = load float, float* %arrayidx129, align 4
|
||||
%mul130 = fmul contract float %141, %144
|
||||
%add131 = fadd contract float %add127, %mul130
|
||||
%145 = load float, float* %ct.addr, align 4
|
||||
%146 = load float, float* %amb_temp, align 4
|
||||
%mul132 = fmul contract float %145, %146
|
||||
%add133 = fadd contract float %add131, %mul132
|
||||
%147 = load float*, float** %tOut.addr, align 8
|
||||
%148 = load i32, i32* %c, align 4
|
||||
%idxprom134 = sext i32 %148 to i64
|
||||
%arrayidx135 = getelementptr inbounds float, float* %147, i64 %idxprom134
|
||||
store float %add133, float* %arrayidx135, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: alwaysinline convergent nounwind
|
||||
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #1 comdat align 2 {
|
||||
entry:
|
||||
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
||||
ret i32 %0
|
||||
}
|
||||
|
||||
; Function Attrs: alwaysinline convergent nounwind
|
||||
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
|
||||
entry:
|
||||
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
|
||||
ret i32 %0
|
||||
}
|
||||
|
||||
; Function Attrs: alwaysinline convergent nounwind
|
||||
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
|
||||
entry:
|
||||
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||||
ret i32 %0
|
||||
}
|
||||
|
||||
; Function Attrs: alwaysinline convergent nounwind
|
||||
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #1 comdat align 2 {
|
||||
entry:
|
||||
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
|
||||
ret i32 %0
|
||||
}
|
||||
|
||||
; Function Attrs: alwaysinline convergent nounwind
|
||||
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
|
||||
entry:
|
||||
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
|
||||
ret i32 %0
|
||||
}
|
||||
|
||||
; Function Attrs: alwaysinline convergent nounwind
|
||||
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
|
||||
entry:
|
||||
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
|
||||
ret i32 %0
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y() #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #2
|
||||
|
||||
attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #2 = { nounwind readnone }
|
||||
attributes #3 = { convergent nounwind }
|
||||
|
||||
!llvm.module.flags = !{!0, !1, !2}
|
||||
!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
|
||||
!llvm.ident = !{!8}
|
||||
!nvvmir.version = !{!9}
|
||||
|
||||
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
|
||||
!1 = !{i32 1, !"wchar_size", i32 4}
|
||||
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
|
||||
!3 = !{void (float*, float*, float*, float, i32, i32, i32, float, float, float, float, float, float, float)* @_Z11hotspotOpt1PfS_S_fiiifffffff, !"kernel", i32 1}
|
||||
!4 = !{null, !"align", i32 8}
|
||||
!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
|
||||
!6 = !{null, !"align", i32 16}
|
||||
!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
|
||||
!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
|
||||
!9 = !{i32 1, i32 4}
|
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,205 @@
|
|||
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <time.h>
|
||||
|
||||
#define BLOCK_SIZE 16
/* capacity of the line buffer used when parsing the input files */
#define STR_SIZE 256

/* block-shape constants; not referenced by the host code in this file
 * (presumably consumed by the included kernel source -- TODO confirm) */
#define block_x_ 128
#define block_y_ 2
#define block_z_ 1

/* MAX_PD and PRECISION together determine the simulation time step
 * (see the dt computation in main) */
#define MAX_PD (3.0e6)
/* required precision in degrees */
#define PRECISION 0.001

/* material constants used to derive the thermal capacitance and the
 * thermal resistances in main */
#define SPEC_HEAT_SI 1.75e6
#define K_SI 100
/* capacitance fitting factor */
#define FACTOR_CHIP 0.5
|
||||
|
||||
#include "opt1.cu"
|
||||
|
||||
/* chip parameters: thickness (divided among the layers in main) and the two
 * planar extents (divided among the grid rows/columns in main) */
float t_chip = 0.0005;
float chip_height = 0.016;
float chip_width = 0.016;

/* ambient temperature, assuming no package at all */
float amb_temp = 80.0;
|
||||
|
||||
/*
 * Print `s` on stderr as "Error: <s>" followed by a newline.
 *
 * Despite its name this function returns normally; it does not terminate
 * the program, so callers that cannot continue must stop on their own.
 */
void fatal(const char *s) {
  fprintf(stderr, "Error: %s\n", s);
}
|
||||
|
||||
void readinput(float *vect, int grid_rows, int grid_cols, int layers,
|
||||
char *file) {
|
||||
int i, j, k;
|
||||
FILE *fp;
|
||||
char str[STR_SIZE];
|
||||
float val;
|
||||
|
||||
if ((fp = fopen(file, "r")) == 0)
|
||||
fatal("The file was not opened");
|
||||
|
||||
for (i = 0; i <= grid_rows - 1; i++)
|
||||
for (j = 0; j <= grid_cols - 1; j++)
|
||||
for (k = 0; k <= layers - 1; k++) {
|
||||
if (fgets(str, STR_SIZE, fp) == NULL)
|
||||
fatal("Error reading file\n");
|
||||
if (feof(fp))
|
||||
fatal("not enough lines in file");
|
||||
if ((sscanf(str, "%f", &val) != 1))
|
||||
fatal("invalid file format");
|
||||
vect[i * grid_cols + j + k * grid_rows * grid_cols] = val;
|
||||
}
|
||||
|
||||
fclose(fp);
|
||||
}
|
||||
|
||||
/*
 * Write the grid held in `vect` to `file` as text, one "<index>\t<value>"
 * line per cell.
 *
 * Cells are visited row by row, column by column, with the layer index
 * varying fastest; <index> is a running counter of the lines written, and
 * the value for (row, col, layer) is read from
 *   vect[row * grid_cols + col + layer * grid_rows * grid_cols].
 *
 * If the file cannot be opened a message is printed on stdout and the
 * function returns without writing anything.
 */
void writeoutput(float *vect, int grid_rows, int grid_cols, int layers,
                 char *file) {
  FILE *fp = fopen(file, "w");
  if (fp == NULL) {
    printf("The file was not opened\n");
    return; /* nothing to write to; using fp below would crash */
  }

  int index = 0;
  for (int i = 0; i < grid_rows; i++)
    for (int j = 0; j < grid_cols; j++)
      for (int k = 0; k < layers; k++) {
        fprintf(fp, "%d\t%g\n", index,
                vect[i * grid_cols + j + k * grid_rows * grid_cols]);
        index++;
      }

  fclose(fp);
}
|
||||
|
||||
void computeTempCPU(float *pIn, float *tIn, float *tOut, int nx, int ny, int nz,
|
||||
float Cap, float Rx, float Ry, float Rz, float dt,
|
||||
int numiter) {
|
||||
float ce, cw, cn, cs, ct, cb, cc;
|
||||
float stepDivCap = dt / Cap;
|
||||
ce = cw = stepDivCap / Rx;
|
||||
cn = cs = stepDivCap / Ry;
|
||||
ct = cb = stepDivCap / Rz;
|
||||
|
||||
cc = 1.0 - (2.0 * ce + 2.0 * cn + 3.0 * ct);
|
||||
|
||||
int c, w, e, n, s, b, t;
|
||||
int x, y, z;
|
||||
int i = 0;
|
||||
do {
|
||||
for (z = 0; z < nz; z++)
|
||||
for (y = 0; y < ny; y++)
|
||||
for (x = 0; x < nx; x++) {
|
||||
c = x + y * nx + z * nx * ny;
|
||||
|
||||
w = (x == 0) ? c : c - 1;
|
||||
e = (x == nx - 1) ? c : c + 1;
|
||||
n = (y == 0) ? c : c - nx;
|
||||
s = (y == ny - 1) ? c : c + nx;
|
||||
b = (z == 0) ? c : c - nx * ny;
|
||||
t = (z == nz - 1) ? c : c + nx * ny;
|
||||
|
||||
tOut[c] = tIn[c] * cc + tIn[n] * cn + tIn[s] * cs + tIn[e] * ce +
|
||||
tIn[w] * cw + tIn[t] * ct + tIn[b] * cb +
|
||||
(dt / Cap) * pIn[c] + ct * amb_temp;
|
||||
}
|
||||
float *temp = tIn;
|
||||
tIn = tOut;
|
||||
tOut = temp;
|
||||
i++;
|
||||
} while (i < numiter);
|
||||
}
|
||||
|
||||
/*
 * Root-mean-square difference between the first `len` elements of `arr1`
 * and `arr2`: sqrt(sum((arr1[i] - arr2[i])^2) / len).
 * The sum is accumulated in single precision.
 */
float accuracy(float *arr1, float *arr2, int len) {
  float sumSquares = 0.0;

  for (int idx = 0; idx < len; ++idx) {
    const float delta = arr1[idx] - arr2[idx];
    sumSquares += delta * delta;
  }

  return (float)sqrt(sumSquares / len);
}
|
||||
|
||||
/*
 * Print the command-line synopsis on stderr and terminate the program with
 * exit status 1. `argv[0]` is used as the program name; `argc` is unused.
 */
void usage(int argc, char **argv) {
  (void)argc;

  fprintf(stderr,
          "Usage: %s <rows/cols> <layers> <iterations> <powerFile> <tempFile> "
          "<outputFile>\n",
          argv[0]);
  fprintf(
      stderr,
      "\t<rows/cols>  - number of rows/cols in the grid (positive integer)\n");
  fprintf(stderr,
          "\t<layers>  - number of layers in the grid (positive integer)\n");

  /* placeholder names below match the synopsis line above */
  fprintf(stderr, "\t<iterations> - number of iterations\n");
  fprintf(stderr, "\t<powerFile>  - name of the file containing the initial "
                  "power values of each cell\n");
  fprintf(stderr, "\t<tempFile>  - name of the file containing the initial "
                  "temperature values of each cell\n");
  fprintf(stderr, "\t<outputFile> - output file\n");
  exit(1);
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
cudaSetDevice(0);
|
||||
if (argc != 7) {
|
||||
usage(argc, argv);
|
||||
}
|
||||
|
||||
char *pfile, *tfile, *ofile;
|
||||
int iterations = atoi(argv[3]);
|
||||
|
||||
pfile = argv[4];
|
||||
tfile = argv[5];
|
||||
ofile = argv[6];
|
||||
int numCols = atoi(argv[1]);
|
||||
int numRows = atoi(argv[1]);
|
||||
int layers = atoi(argv[2]);
|
||||
|
||||
/* calculating parameters*/
|
||||
|
||||
float dx = chip_height / numRows;
|
||||
float dy = chip_width / numCols;
|
||||
float dz = t_chip / layers;
|
||||
|
||||
float Cap = FACTOR_CHIP * SPEC_HEAT_SI * t_chip * dx * dy;
|
||||
float Rx = dy / (2.0 * K_SI * t_chip * dx);
|
||||
float Ry = dx / (2.0 * K_SI * t_chip * dy);
|
||||
float Rz = dz / (K_SI * dx * dy);
|
||||
|
||||
float max_slope = MAX_PD / (FACTOR_CHIP * t_chip * SPEC_HEAT_SI);
|
||||
float dt = PRECISION / max_slope;
|
||||
|
||||
float *powerIn, *tempOut, *tempIn, *tempCopy;
|
||||
int size = numCols * numRows * layers;
|
||||
|
||||
powerIn = (float *)calloc(size, sizeof(float));
|
||||
tempCopy = (float *)malloc(size * sizeof(float));
|
||||
tempIn = (float *)calloc(size, sizeof(float));
|
||||
tempOut = (float *)calloc(size, sizeof(float));
|
||||
float *answer = (float *)calloc(size, sizeof(float));
|
||||
|
||||
readinput(powerIn, numRows, numCols, layers, pfile);
|
||||
readinput(tempIn, numRows, numCols, layers, tfile);
|
||||
|
||||
memcpy(tempCopy, tempIn, size * sizeof(float));
|
||||
|
||||
hotspot_opt1(powerIn, tempIn, tempOut, numCols, numRows, layers, Cap, Rx, Ry,
|
||||
Rz, dt, iterations);
|
||||
|
||||
computeTempCPU(powerIn, tempCopy, answer, numCols, numRows, layers, Cap, Rx,
|
||||
Ry, Rz, dt, iterations);
|
||||
|
||||
float acc = accuracy(tempOut, answer, numRows * numCols * layers);
|
||||
printf("Accuracy: %e\n", acc);
|
||||
writeoutput(tempOut, numRows, numCols, layers, ofile);
|
||||
free(tempIn);
|
||||
free(tempOut);
|
||||
free(powerIn);
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,22 @@
|
|||
#!/bin/bash
# Build the hotspot3D example from its LLVM IR, run it and check the output.
set -e

# Assemble the textual IR for the kernel and the host program into bitcode.
llvm-as 3D-cuda-nvptx64-nvidia-cuda-sm_61.ll
llvm-as 3D-host-x86_64-unknown-linux-gnu.ll

# Run the two translators on the assembled bitcode.
../../build/compilation/kernelTranslator 3D-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
../../build/compilation/hostTranslator 3D-host-x86_64-unknown-linux-gnu.bc host.bc

# Compile the translated bitcode to position-independent object files.
llc --relocation-model=pic --filetype=obj kernel.bc
llc --relocation-model=pic --filetype=obj host.bc

# Link against the runtime and thread-pool libraries from the build tree.
g++ -g -Wall -L../../build/runtime -L../../build/runtime/threadPool -o 3D \
  -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread

# Append the previous value only when it is non-empty: a trailing ':' would
# add an empty entry, which the dynamic loader treats as the current directory.
export LD_LIBRARY_PATH="../../build/runtime:../../build/runtime/threadPool${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
./3D 512 8 100 ../../rodinia-data/hotspot3D/power_512x8 ../../rodinia-data/hotspot3D/temp_512x8 output.out

# -F matches the expected value literally ('.' is not a regex wildcard).
if head output.out | grep -qF "334.017"; then
  echo "Pass"
else
  echo "Error result"
  exit 1
fi
|
|
@ -0,0 +1,24 @@
|
|||
#ifndef _COMPARISON_HELPERS_H_
#define _COMPARISON_HELPERS_H_

#include <stdio.h>
#include <stdlib.h>

/*
 * Compare the first `size` elements of `data1` and `data2`.
 *
 * Every differing position is reported on stdout. Element values are printed
 * after conversion to long long, so T must be convertible to long long (the
 * report is exact for integral element types).
 *
 * Returns 0 when all elements match. When at least one element differs, the
 * function prints a failure message and terminates the program with exit
 * status 1; it does not return in that case.
 */
template <typename T>
inline int compare_vectors(T *data1, T *data2, unsigned int size) {
  printf("Comparing vectors: \n");

  bool match = true;
  for (unsigned int i = 0; i < size; i++) {
    if (data1[i] != data2[i]) {
      match = false;
      printf("Diff: data1[%u]=%lld, data2[%u]=%lld.\n", i,
             static_cast<long long>(data1[i]), i,
             static_cast<long long>(data2[i]));
    }
  }

  if (!match) {
    printf("FAIL! vectors are NOT matching!\n");
    exit(1);
  }

  printf("PASS! vectors are matching!\n");
  return 0;
}

#endif
|
|
@ -0,0 +1,116 @@
|
|||
#include "stdafx.h"
|
||||
|
||||
#include "cpuencode.h"
|
||||
#include "print_helpers.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
#if 1
|
||||
|
||||
// The max. codeword length for each byte symbol is 32-bits
|
||||
|
||||
/*
 * Variable-length encode `num_elements` 32-bit words from `indata`.
 *
 * Each input word is split into four byte symbols, most significant byte
 * first. For every symbol the low codewordlens[symbol] bits of
 * codewords[symbol] are appended to the output bit stream, which is packed
 * MSB-first into consecutive 32-bit words of `outdata`.
 *
 * `*outsize` receives the number of bytes produced (bit count rounded up to
 * a whole byte). Codeword lengths are expected to be at most 32 bits.
 *
 * The first output word is zeroed on entry, and each further word is zeroed
 * when the previous one fills up. `outdata` must therefore have room for one
 * word beyond the last full word produced.
 */
extern "C" void cpu_vlc_encode(unsigned int *indata, unsigned int num_elements,
                               unsigned int *outdata, unsigned int *outsize,
                               unsigned int *codewords,
                               unsigned int *codewordlens) {
  unsigned int *bitstreamPt = outdata; /* word currently being filled */
  *bitstreamPt = 0x00000000U;
  unsigned int startbit = 0; /* bits already occupied in *bitstreamPt */
  unsigned int totalBytes = 0;

  for (unsigned int k = 0; k < num_elements; k++) {
    const unsigned int val32 = indata[k];

    for (unsigned int i = 0; i < 4; i++) {
      const unsigned char symbol = (unsigned char)(val32 >> (8 * (3 - i)));
      unsigned int cw32 = codewords[symbol];
      unsigned int numbits = codewordlens[symbol];

      while (numbits > 0) {
        const unsigned int room = 32 - startbit;
        const unsigned int writebits = (numbits < room) ? numbits : room;

        /* Keep only the `numbits` low bits that are still to be written.
         * A full 32-bit codeword needs no masking, and shifting by 32 would
         * be undefined behaviour. */
        if (numbits < 32)
          cw32 &= (1U << numbits) - 1U;

        unsigned int mask32;
        if (numbits == writebits) {
          /* everything fits: left-align behind the bits already present */
          mask32 = cw32 << (room - numbits);
        } else {
          /* only the top `writebits` bits fit; the rest goes to next word */
          mask32 = cw32 >> (numbits - writebits);
        }

        *bitstreamPt = (*bitstreamPt) | mask32;
        numbits = numbits - writebits;
        startbit = (startbit + writebits) % 32;
        if (startbit == 0) {
          bitstreamPt++;
          *bitstreamPt = 0x00000000;
          totalBytes += 4;
        }
      }
    }
  }

  totalBytes += (startbit / 8) +
                ((startbit % 8 == 0) ? 0 : 1); /* round up to whole bytes */
  *outsize = totalBytes;
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
/// ALTERNATIVE CODER
|
||||
/// ASSUMPTION: The max. length of 4 combined codewords can be 2x original data,
|
||||
/// i.e. 64 bits
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#else
|
||||
|
||||
/*
 * Alternative variable-length encoder: the codewords of the four byte
 * symbols of each input word (most significant byte first) are first
 * concatenated into one 64-bit value, which is then appended to the output
 * bit stream, packed MSB-first into consecutive 32-bit words of `outdata`.
 *
 * Assumes the four codewords of one input word total at most 64 bits.
 * `*outsize` receives the number of bytes produced (bit count rounded up to
 * a whole byte).
 *
 * The first output word is zeroed on entry, and each further word is zeroed
 * when the previous one fills up. `outdata` must therefore have room for one
 * word beyond the last full word produced.
 */
extern "C" void cpu_vlc_encode(unsigned int *indata, unsigned int num_elements,
                               unsigned int *outdata, unsigned int *outsize,
                               unsigned int *codewords,
                               unsigned int *codewordlens) {
  unsigned int *bitstreamPt = outdata; /* word currently being filled */
  *bitstreamPt = 0x00000000U;
  unsigned int startbit = 0; /* bits already occupied in *bitstreamPt */
  unsigned int totalBytes = 0;

  for (unsigned int k = 0; k < num_elements; k++) {
    unsigned long long cw64 = 0;
    const unsigned int val32 = indata[k];
    unsigned int numbits = 0;

    for (unsigned int i = 0; i < 4; i++) {
      const unsigned char symbol = (unsigned char)(val32 >> (8 * (3 - i)));
      cw64 = (cw64 << codewordlens[symbol]) | codewords[symbol];
      numbits += codewordlens[symbol];
    }

    while (numbits > 0) {
      const unsigned int room = 32 - startbit;
      const unsigned int writebits = (numbits < room) ? numbits : room;

      unsigned int mask32;
      if (numbits == writebits) {
        /* everything fits: left-align behind the bits already present */
        mask32 = (unsigned int)cw64 << (room - numbits);
      } else {
        const unsigned int remaining = numbits - writebits;
        mask32 = (unsigned int)(cw64 >> remaining);
        /* 64-bit mask: `remaining` can be 32 or more, which an int-typed
         * `1 << remaining` cannot represent */
        cw64 &= (1ULL << remaining) - 1ULL;
      }

      *bitstreamPt = (*bitstreamPt) | mask32;
      numbits = numbits - writebits;
      startbit = (startbit + writebits) % 32;
      if (startbit == 0) {
        bitstreamPt++;
        *bitstreamPt = 0x00000000;
        totalBytes += 4;
      }
    }
  }

  totalBytes += (startbit / 8) +
                ((startbit % 8 == 0) ? 0 : 1); /* round up to whole bytes */
  *outsize = totalBytes;
}
|
||||
#endif
|
|
@ -0,0 +1,8 @@
|
|||
#ifndef _CE_H_
#define _CE_H_

/*
 * CPU variable-length encoder.
 *
 *   indata        input words; each is encoded as four byte symbols
 *   num_elements  number of words in indata
 *   outdata       receives the packed bit stream as 32-bit words
 *   outsize       receives the size of the encoded stream in bytes
 *   codewords     codeword for each byte symbol (indexed by symbol value)
 *   codewordlens  codeword length in bits for each byte symbol
 */
extern "C" void cpu_vlc_encode(unsigned int *indata,
                               unsigned int num_elements,
                               unsigned int *outdata,
                               unsigned int *outsize,
                               unsigned int *codewords,
                               unsigned int *codewordlens);

#endif /* _CE_H_ */
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue