add codebase for TACO submission

This commit is contained in:
Han Ruobing 2022-05-04 08:59:38 -04:00
parent 897af29748
commit f8e72916c1
164 changed files with 65421 additions and 1082 deletions

View File

@ -39,3 +39,4 @@ set(GCC_COVERAGE_LINK_FLAGS
"-L${LLVM_LIB_PATH} ${LLVM_LINK_FLAG} -lz -lrt -ldl -ltinfo -lpthread -lm") "-L${LLVM_LIB_PATH} ${LLVM_LINK_FLAG} -lz -lrt -ldl -ltinfo -lpthread -lm")
add_subdirectory(compilation) add_subdirectory(compilation)
add_subdirectory(runtime)

View File

@ -1,6 +1,6 @@
# Contributing to CuPBoP # Contributing to COX
Thank you for your interest in contributing to CuPBoP! Thank you for your interest in contributing to COX!
We appreciate all contributions, including but not limited to: We appreciate all contributions, including but not limited to:
- Add documentation - Add documentation
@ -10,9 +10,9 @@ We appreciate all contributions, including but not limited to:
## How to contribute? ## How to contribute?
0. (Optional) Open an issue and discuss your idea before start 0. (Optional) Open an issue and discuss your idea before start
1. Fork the latest version CuPBoP 1. Fork the latest version COX
2. Commit to the forked repo 2. Commit to the forked repo
3. Create a Pull Request to CuPBoP main branch 3. Create a Pull Request to COX main branch
## Code style ## Code style
@ -21,15 +21,14 @@ To make sure your contribution is following the correct style,
we highly recommend you to install [pre-commit](https://pre-commit.com/) before development. we highly recommend you to install [pre-commit](https://pre-commit.com/) before development.
```bash ```bash
# Python3 environment is required # Python environment is required
pip install pre-commit pip install pre-commit
``` ```
Then, from the repository folder, execute the following instruction: Then, from the repository folder, execute the following instruction:
```bash ```bash
# execute in CuPBoP's root folder pre-commit install
pre-commit install
``` ```
With pre-commit plugin, each local commit will be automatically checked. With pre-commit plugin, each local commit will be automatically checked.

View File

@ -1,10 +1,10 @@
# CuPBoP: Cuda for Parallelized and Broad-range Processors # COX: CUDA on X86
## Introduction ## Introduction
CuPBoP (Cuda for parallelized and broad-range processors) is a framework This project consists of two parts: a series of LLVM passes that
aims to execute CUDA source code on non-NVIDIA devices, achieve a SPMD NVVM IR as input, and output the corresponding
including CPU, GPU and other architectures. MPMD+SIMD version of LLVM IR which can be execute on CPU devices.
## Install ## Install
@ -22,8 +22,8 @@ including CPU, GPU and other architectures.
1. Clone from github 1. Clone from github
```bash ```bash
git clone https://github.com/cupbop/CuPBoP git clone https://github.com/drcut/open_source_template
cd CuPBoP cd open_source_template
``` ```
2. Build the transformer for NVVM IR to LLVM IR for X86 2. Build the transformer for NVVM IR to LLVM IR for X86
@ -55,12 +55,8 @@ g++ ../compilation/examples/vecadd/host.cpp \
./vecadd_example ./vecadd_example
``` ```
## Contribution
We sincerely appreciate all kinds of contributions.
Please refer to [CONTRIBUTING](docs/CONTRIBUTING.md) for the contributing guideline.
## Author ## Author
* [Ruobing Han](https://drcut.github.io/) [Ruobing Han](https://drcut.github.io/) is a CS phd student in
* [Hyesoon Kim](https://www.cc.gatech.edu/~hyesoon/) Georgia Institute Technology, under the supervision
of Prof. [Hyesoon Kim](https://www.cc.gatech.edu/~hyesoon/).

View File

@ -1,25 +1,43 @@
#include "ReplaceKernelLaunch.h" #include "RemoveCudaBuiltin.h"
#include "ReplaceConstantMemory.h"
#include "ReplaceCudaBuiltin.h"
#include "ReplaceKernelArgs.h"
#include "tool.h" #include "tool.h"
#include "llvm/IR/Module.h" #include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h" #include "llvm/IR/Verifier.h"
#include <assert.h> #include <assert.h>
#include <fstream>
#include <iostream> #include <iostream>
#include <stdlib.h> #include <stdlib.h>
using namespace llvm; using namespace llvm;
std::string PATH = "kernel_meta.log";
int main(int argc, char **argv) { int main(int argc, char **argv) {
assert(argc == 3 && "incorrect number of arguments\n"); assert(argc == 3 && "incorrect number of arguments\n");
char *input_host_path = argv[1]; char *input_host_path = argv[1];
char *output_host_path = argv[2]; char *output_host_path = argv[2];
std::ifstream fin;
fin.open(PATH);
// load LLVM module(s) // load LLVM module(s)
llvm::Module *hostModule = LoadModuleFromFilr(input_host_path); llvm::Module *hostModule = LoadModuleFromFilr(input_host_path);
VerifyModule(hostModule); VerifyModule(hostModule);
// replace const memory
ReplaceConstantMemory(hostModule, fin);
// process host module // process host module
ReplaceKernelLaunch(hostModule); ReplaceCudaBuiltin(hostModule);
// remove unused CUDA builtin functions and variables
RemoveCudaBuiltin(hostModule);
// replace arguments in kernel_arg, from alloc to malloc
ReplaceKernelArg(hostModule);
VerifyModule(hostModule); VerifyModule(hostModule);
DumpModule(hostModule, output_host_path); DumpModule(hostModule, output_host_path);
fin.close();
return 0; return 0;
} }

View File

@ -1,11 +1,11 @@
#ifndef __NVVM2x86_REPLACE_KERNEL_LAUNCH__ #ifndef __NVVM2x86_REMOVE_CUDABUILTIN__
#define __NVVM2x86_REPLACE_KERNEL_LAUNCH__ #define __NVVM2x86_REMOVE_CUDABUILTIN__
#include "llvm/IR/Module.h" #include "llvm/IR/Module.h"
/* /*
* Change to i8* bitcast (i8* (i8*)* @_Z9vecPKiS0_Pii_wrapper to i8*) * Change to i8* bitcast (i8* (i8*)* @_Z9vecPKiS0_Pii_wrapper to i8*)
* Original: i8* bitcast (void (i32*, i32*, i32*, i32)* @_Z9vecPKiS0_Pii to i8*) * Original: i8* bitcast (void (i32*, i32*, i32*, i32)* @_Z9vecPKiS0_Pii to i8*)
*/ */
void ReplaceKernelLaunch(llvm::Module *M); void RemoveCudaBuiltin(llvm::Module *M);
#endif #endif

View File

@ -0,0 +1,12 @@
#ifndef __NVVM2x86_REPLACE_CONSTANT_MEMORY__
#define __NVVM2x86_REPLACE_CONSTANT_MEMORY__
#include "llvm/IR/Module.h"
#include <fstream>
/*
* From: @ff_variable = internal global [5 x float] undef, align 16
* To: @wrapper_global_ff_variable = common global [5 x float] zeroinitializer
*/
void ReplaceConstantMemory(llvm::Module *M, std::ifstream &fin);
#endif

View File

@ -0,0 +1,11 @@
#ifndef __NVVM2x86_REPLACE_CUDA_BUILTIN__
#define __NVVM2x86_REPLACE_CUDA_BUILTIN__
#include "llvm/IR/Module.h"
/*
* Change to i8* bitcast (i8* (i8*)* @_Z9vecPKiS0_Pii_wrapper to i8*)
* Original: i8* bitcast (void (i32*, i32*, i32*, i32)* @_Z9vecPKiS0_Pii to i8*)
*/
void ReplaceCudaBuiltin(llvm::Module *M);
#endif

View File

@ -0,0 +1,14 @@
#ifndef __NVVM2x86_REPLACE_KERNEL_ARGS__
#define __NVVM2x86_REPLACE_KERNEL_ARGS__
#include "llvm/IR/Module.h"
/*
* before:
* %m_cuda.addr = alloca float*, align 8
* after:
* %m_cuda.addr_tmp = call i8* @malloc(i64 8)
* %m_cuda.addr = bitcast i8* %m_cuda.addr_tmp to float**
*/
void ReplaceKernelArg(llvm::Module *M);
#endif

View File

@ -0,0 +1,7 @@
/**
* Generate a file for Cuda Kernel Function Attributes
*
*
*
*
*/

View File

@ -0,0 +1,6 @@
/*
Initialize the CUDA device as the first statement of the program if the user
has not already set it (cudaSetDevice).
*/

View File

@ -0,0 +1,59 @@
/**
* Remove Clang cuda builtin functions and variables
*/
#include "RemoveCudaBuiltin.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/ToolOutputFile.h"
#include <iostream>
#include <map>
#include <set>
using namespace llvm;
/*
 * Remove the CUDA registration scaffolding from the host module.
 *
 * Steps, in order:
 *   1. erase the `llvm.global_ctors` variable, if present;
 *   2. erase `__cuda_module_ctor`, `__cuda_module_dtor` and
 *      `__cuda_register_globals`, if present;
 *   3. erase every remaining function whose name is one of the CUDA
 *      registration builtins listed below.
 *
 * Each global has its references dropped before it is erased.
 */
void RemoveCudaBuiltin(llvm::Module *M) {
  GlobalVariable *ctor_list = M->getGlobalVariable("llvm.global_ctors");
  if (ctor_list != nullptr) {
    ctor_list->dropAllReferences();
    ctor_list->eraseFromParent();
  }

  // Shared teardown for a function: release its operands, then delete it.
  const auto erase_function = [](Function *fn) {
    fn->dropAllReferences();
    fn->eraseFromParent();
  };

  // The generated ctor/dtor/registration functions are removed first, in this
  // fixed order.
  const char *const generated_functions[] = {
      "__cuda_module_ctor", "__cuda_module_dtor", "__cuda_register_globals"};
  for (const char *name : generated_functions) {
    if (Function *fn = M->getFunction(name)) {
      erase_function(fn);
    }
  }

  // Names of the registration entry points that are swept afterwards.
  const std::set<std::string> builtin_names = {
      "__cuda_module_dtor",         "__cuda_register_globals",
      "__cudaRegisterFunction",     "__cudaRegisterVar",
      "__cudaRegisterFatBinary",    "__cuda_module_ctor",
      "__cudaRegisterFatBinaryEnd", "__cudaUnregisterFatBinary"};

  // Collect first, erase afterwards, so the module's function list is not
  // modified while it is being walked.
  std::set<llvm::Function *> need_remove;
  for (Function &fn : *M) {
    if (builtin_names.count(fn.getName().str()) != 0) {
      need_remove.insert(&fn);
    }
  }
  for (Function *fn : need_remove) {
    erase_function(fn);
  }
}

View File

@ -0,0 +1,93 @@
#include "ReplaceConstantMemory.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include <assert.h>
#include <fstream>
#include <iostream>
#include <map>
#include <set>
using namespace llvm;
/*
 * Parse one line of the constant -> global mapping table.
 *
 * The line is split at its first space: the text before the space is the
 * constant name, and the mapped global name starts three characters after the
 * space. The three skipped characters are presumably a fixed separator such
 * as "to " -- confirm against the code that writes the mapping file.
 *
 * Returns false (outputs untouched) when the line has no space, starts with a
 * space, or is too short to contain a global name. Without this check a
 * malformed line made std::string::substr throw std::out_of_range.
 */
static bool ParseConstantMapping(const std::string &line,
                                 std::string &constant_name,
                                 std::string &global_name) {
  const size_t separator_length = 3;
  const size_t space = line.find(' ');
  if (space == std::string::npos || space == 0) {
    return false;
  }
  const size_t global_begin = space + 1 + separator_length;
  if (global_begin >= line.length()) {
    return false;
  }
  constant_name = line.substr(0, space);
  global_name = line.substr(global_begin);
  return true;
}

/*
 * Replace constant-memory globals by zero-initialised common globals.
 *
 * `fin` is scanned for a line containing "ConstMemory2GlobalMemory"; the
 * lines that follow, up to one containing "END", each map a constant name to
 * a global name. For every global variable in `M` whose name is a key of that
 * mapping and whose value type is an array or a struct:
 *   - a new global with the mapped name, the same value type, common linkage
 *     and a zero initializer is created;
 *   - all uses of the old variable are redirected to the new one (through a
 *     pointer cast to the old variable's type);
 *   - the old variable is erased.
 *
 * Malformed mapping lines are reported on stderr and skipped. A matched
 * variable of any other value type triggers the assertion and is left in
 * place: erasing it without redirecting its uses would leave dangling
 * references when assertions are compiled out.
 */
void ReplaceConstantMemory(llvm::Module *M, std::ifstream &fin) {
  std::string line;

  // Skip forward to the section header.
  bool find_constant_memory = false;
  while (getline(fin, line)) {
    if (line.find("ConstMemory2GlobalMemory") != std::string::npos) {
      find_constant_memory = true;
      break;
    }
  }
  if (!find_constant_memory) {
    assert(0 && "Do not find constant to global mapping\n");
  }

  // Read the mapping table that follows the header.
  std::map<std::string, std::string> corresponding_global_memory;
  while (getline(fin, line)) {
    if (line.find("END") != std::string::npos) {
      break;
    }
    std::string constant_name;
    std::string global_name;
    if (!ParseConstantMapping(line, constant_name, global_name)) {
      std::cerr << "ReplaceConstantMemory: skipping malformed mapping line: "
                << line << std::endl;
      continue;
    }
    // insert() keeps the first mapping when a constant name repeats.
    corresponding_global_memory.insert(
        std::pair<std::string, std::string>(constant_name, global_name));
  }

  std::set<llvm::GlobalVariable *> need_remove_constant_memory;
  // find all constant memory and generate corresponding global memory
  for (auto I = M->global_begin(), E = M->global_end(); I != E; ++I) {
    llvm::GlobalVariable *constant_memory = &*I;
    auto mapping =
        corresponding_global_memory.find(constant_memory->getName().str());
    if (mapping == corresponding_global_memory.end()) {
      continue;
    }
    auto PT = dyn_cast<llvm::PointerType>(constant_memory->getType());
    if (!PT) {
      continue;
    }
    auto element_type = PT->getElementType();
    if (!element_type->isArrayTy() && !element_type->isStructTy()) {
      assert(0 && "The required Constant Memory Type is not supported\n");
      continue; // keep the variable: its uses have not been redirected
    }

    // generate the corresponding global memory variable
    llvm::GlobalVariable *global_memory = new llvm::GlobalVariable(
        *M, element_type, false, llvm::GlobalValue::CommonLinkage, nullptr,
        mapping->second, nullptr, llvm::GlobalValue::NotThreadLocal, 0);
    global_memory->setInitializer(
        llvm::ConstantAggregateZero::get(element_type));
    constant_memory->replaceAllUsesWith(
        llvm::ConstantExpr::getPointerCast(global_memory, PT));
    need_remove_constant_memory.insert(constant_memory);
  }

  // Erase only after the walk so the global list is not shrunk mid-iteration.
  for (auto i : need_remove_constant_memory) {
    i->dropAllReferences();
    i->eraseFromParent();
  }
  return;
}

View File

@ -0,0 +1,292 @@
#include "ReplaceCudaBuiltin.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/ToolOutputFile.h"
#include <iostream>
#include <map>
#include <set>
using namespace llvm;
/*
 * Insert a device synchronisation after every kernel launch, e.g.
 *
 *   call void @_Z13staticReversePii(i32* %55, i32 64)
 *   %57 = call i32 @cudaDeviceSynchronize()
 *
 * Pass 1 records the name of every function that directly calls a function
 * whose name starts with "cudaLaunchKernel".
 * Pass 2 inserts `call i32 @cudaDeviceSynchronize()` right after each direct
 * call to one of the recorded functions. A call that has no following
 * non-debug instruction is left unchanged.
 */
void InsertSyncAfterKernelLaunch(llvm::Module *M) {
  LLVMContext &ctx = M->getContext();

  // i32 cudaDeviceSynchronize(): no parameters, not variadic.
  llvm::FunctionType *sync_type =
      FunctionType::get(Type::getInt32Ty(ctx), false);
  llvm::FunctionCallee sync_callee =
      M->getOrInsertFunction("cudaDeviceSynchronize", sync_type);
  llvm::Function *sync_function =
      llvm::cast<llvm::Function>(sync_callee.getCallee());

  // Pass 1: which functions contain a cudaLaunchKernel* call?
  std::set<std::string> launch_function_name;
  for (Function &fn : *M) {
    const std::string fn_name = fn.getName().str();
    for (BasicBlock &block : fn) {
      for (Instruction &inst : block) {
        llvm::CallBase *call = llvm::dyn_cast<llvm::CallBase>(&inst);
        if (!call) {
          continue;
        }
        Function *callee = call->getCalledFunction();
        if (callee && callee->getName().startswith("cudaLaunchKernel")) {
          // fn is a kernel launch function
          launch_function_name.insert(fn_name);
        }
      }
    }
  }

  // Pass 2: follow each call to a launch function with a synchronise call.
  for (Function &fn : *M) {
    for (BasicBlock &block : fn) {
      for (Instruction &inst : block) {
        llvm::CallBase *call = llvm::dyn_cast<llvm::CallBase>(&inst);
        if (!call) {
          continue;
        }
        Function *callee = call->getCalledFunction();
        if (!callee ||
            launch_function_name.count(callee->getName().str()) == 0) {
          continue;
        }
        if (Instruction *next = call->getNextNonDebugInstruction()) {
          llvm::CallInst::Create(sync_function, "", next);
        }
      }
    }
  }
}
// Change to i8* bitcast (i8* (i8*)* @_Z9vecPKiS0_Pii_wrapper to i8*)
// Original: i8* bitcast (void (i32*, i32*, i32*, i32)* @_Z9vecPKiS0_Pii to i8*)
/*
 * Point every cudaLaunchKernel* call at a `<kernel>_wrapper` declaration.
 *
 * Phase 1 collects the names of the functions registered through
 * `__cudaRegisterFunction` inside `__cuda_register_globals`.
 *
 * Phase 2 walks every direct call in the module:
 *   - For a call to a function whose name starts with "cudaLaunchKernel",
 *     operand 0 (the kernel, wrapped in a pointer cast) is replaced by a
 *     bitcast of a `void(i8*)` declaration named `<kernel>_wrapper`. One
 *     declaration is created per kernel name and reused by later launches; a
 *     fresh bitcast instruction is emitted in front of every launch.
 *   - For a call to a function whose name was collected in phase 1, the
 *     callee is renamed to `<name>_host`.
 *
 * Operands that do not resolve to a Function after stripping pointer casts
 * are skipped; they were previously dereferenced unchecked.
 * Progress messages are printed to stdout.
 */
void ReplaceKernelLaunch(llvm::Module *M) {
  LLVMContext &context = M->getContext();
  auto VoidTy = llvm::Type::getVoidTy(context);
  auto I8 = llvm::Type::getInt8PtrTy(context);
  // kernel name -> wrapper declaration created for it
  std::map<std::string, Function *> kernels;
  std::set<std::string> cuda_register_kernel_names;

  /*
  When using << >>, clang generates cudaPushCallConfiguration with the same
  function definition as the kernel definition in the kernel bitcode

  define internal void @__cuda_register_globals(i8** %0) {
  entry:
    %1 = call i32 @__cudaRegisterFunction(i8** %0,
           i8* bitcast (void (float*, float*, i32, i32)* @_Z4Fan1PfS_ii to i8*),
           i8* getelementptr inbounds ([14 x i8], [14 x i8]* @0, i64 0, i64 0),
           i8* getelementptr inbounds ([14 x i8], [14 x i8]* @0, i64 0, i64 0),
           i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null)
    ...
    ret void
  }
  */
  if (Function *f_register_global =
          M->getFunction("__cuda_register_globals")) {
    for (BasicBlock &B : *f_register_global) {
      for (Instruction &inst : B) {
        llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&inst);
        if (!callInst) {
          continue;
        }
        Function *calledFunction = callInst->getCalledFunction();
        if (!calledFunction ||
            calledFunction->getName().str() != "__cudaRegisterFunction") {
          continue;
        }
        Value *callOperand = callInst->getArgOperand(1);
        // Only the "wrapped in a bitcast" form is handled, as before.
        if (dyn_cast<Function>(callOperand) != nullptr) {
          continue;
        }
        Function *functionOperand =
            dyn_cast<Function>(callOperand->stripPointerCasts());
        if (functionOperand == nullptr) {
          continue; // not a function behind the casts: nothing to record
        }
        cuda_register_kernel_names.insert(functionOperand->getName().str());
        std::cout << "Cuda Register Global Kernel: "
                  << functionOperand->getName().str() << std::endl;
      }
    }
  }

  bool host_changed = false;
  // Wrapper declarations are appended to the module while this loop runs;
  // they have no body, so visiting them is a no-op.
  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
    Function *F = &(*i);
    // Captured once per function: reflects a rename done while an earlier
    // function was processed, but not one done later.
    const std::string func_name = F->getName().str();
    for (BasicBlock &B : *F) {
      for (Instruction &inst : B) {
        llvm::CallBase *callInst = llvm::dyn_cast<llvm::CallBase>(&inst);
        if (!callInst) {
          continue;
        }
        Function *calledFunction = callInst->getCalledFunction();
        if (!calledFunction) {
          continue;
        }

        if (calledFunction->getName().startswith("cudaLaunchKernel")) {
          Value *callOperand = callInst->getArgOperand(0);
          // Only the "wrapped in a bitcast" form is handled, as before.
          if (dyn_cast<Function>(callOperand) != nullptr) {
            continue;
          }
          Function *functionOperand =
              dyn_cast<Function>(callOperand->stripPointerCasts());
          if (functionOperand == nullptr) {
            continue; // operand is not a function: leave the launch alone
          }
          /*
          Because of the TODO in the rename branch below, need to get the
          prior name before _host is add
          */
          const std::string oldName = functionOperand->getName().str();
          std::cout << " Parent (Caller) Function Name: " << func_name
                    << ", cudaLaunchKernel Function: " << oldName << ", args "
                    << functionOperand->arg_size() << std::endl;

          auto rep = kernels.find(oldName);
          if (rep != kernels.end()) {
            // Wrapper already declared: emit a new cast local to this call.
            BitCastInst *cast_inst =
                new BitCastInst(rep->second, I8, "", callInst);
            callInst->setArgOperand(0, cast_inst);
            continue;
          }

          std::vector<Type *> Params;
          Params.push_back(I8);
          FunctionType *FT = FunctionType::get(VoidTy, Params, false);

          // if parent function is __host and same as the cudaKernelLaunch
          // NOTE(review): the test looks for "_host" anywhere in the name but
          // the strip removes the last five characters -- confirm the marker
          // can only ever be a suffix.
          std::string newName = oldName + "_wrapper";
          if (func_name == oldName && host_changed &&
              oldName.find("_host") != std::string::npos) {
            newName = oldName.substr(0, oldName.length() - 5) + "_wrapper";
          }
          std::cout << "Change Kernel Name to: " << newName << std::endl;

          Function *wrapper =
              Function::Create(FT, Function::ExternalLinkage, newName, M);
          wrapper->setDSOLocal(true);
          wrapper->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
          BitCastInst *BC = new BitCastInst(wrapper, I8, "", callInst);
          callInst->setArgOperand(0, BC);
          kernels.insert({oldName, wrapper});
        } else if (cuda_register_kernel_names.count(
                       calledFunction->getName().str()) != 0) {
          // if the called function collides with kernel definiton
          // TODO: some reason changes all occurences of the function name
          // for both cudaKernelLaunch calls and regular function call
          host_changed = true;
          calledFunction->setName(calledFunction->getName().str() + "_host");
          std::cout << std::endl;
          std::cout << "Change Host Function Name To: "
                    << calledFunction->getName().str() << std::endl;
        }
      }
    }
  }
}
/*
 * Redirect every direct call to `cudaMemcpyToSymbol` to
 * `cudaMemcpyToSymbol_host`.
 *
 * For each such call a new call to
 *   i32 @cudaMemcpyToSymbol_host(i8*, i8*, i64, i64, i32)
 * is inserted in front of it with the first five operands forwarded
 * unchanged, uses of the old result are redirected to the new call, and the
 * old call is erased once the walk is finished.
 */
void ReplaceMemcpyToSymbol(llvm::Module *M) {
  LLVMContext &context = M->getContext();
  std::vector<llvm::Instruction *> replaced_calls;

  for (Module::iterator fn = M->begin(); fn != M->end(); ++fn) {
    for (auto block = fn->begin(); block != fn->end(); ++block) {
      for (auto it = block->begin(); it != block->end(); ++it) {
        auto old_call = dyn_cast<CallInst>(it);
        if (!old_call || !old_call->getCalledFunction()) {
          continue;
        }
        if (old_call->getCalledFunction()->getName().str() !=
            "cudaMemcpyToSymbol") {
          continue;
        }

        // i32 @cudaMemcpyToSymbol(i8* %1, i8* %2, i64 %3, i64 %4, i32 %5)
        llvm::Type *i8_ptr = llvm::Type::getInt8PtrTy(context);
        llvm::Type *i64 = llvm::Type::getInt64Ty(context);
        llvm::Type *i32 = llvm::Type::getInt32Ty(context);
        const std::vector<llvm::Type *> param_types = {i8_ptr, i8_ptr, i64,
                                                       i64, i32};
        llvm::FunctionType *host_type =
            FunctionType::get(i32, param_types, false);
        llvm::FunctionCallee host_callee =
            M->getOrInsertFunction("cudaMemcpyToSymbol_host", host_type);
        llvm::Function *host_function =
            llvm::cast<llvm::Function>(host_callee.getCallee());

        // Forward the five original operands as-is.
        std::vector<Value *> forwarded_args;
        for (unsigned idx = 0; idx < 5; ++idx) {
          forwarded_args.push_back(old_call->getArgOperand(idx));
        }

        auto new_call =
            llvm::CallInst::Create(host_function, forwarded_args, "", old_call);
        old_call->replaceAllUsesWith(new_call);
        replaced_calls.push_back(old_call);
      }
    }
  }

  // Deferred so that no instruction is erased while its block is iterated.
  for (auto dead_call : replaced_calls) {
    dead_call->eraseFromParent();
  }
}
// Entry point of the host-side CUDA rewrites. Runs, in this order, the three
// passes defined in this file on module M.
void ReplaceCudaBuiltin(llvm::Module *M) {
// add a cudaDeviceSynchronize call after each kernel launch
InsertSyncAfterKernelLaunch(M);
// retarget cudaLaunchKernel* calls to the `_wrapper` declarations
ReplaceKernelLaunch(M);
// redirect cudaMemcpyToSymbol calls to cudaMemcpyToSymbol_host
ReplaceMemcpyToSymbol(M);
}

View File

@ -0,0 +1,90 @@
#include "ReplaceKernelArgs.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/ToolOutputFile.h"
#include <iostream>
#include <map>
#include <set>
using namespace llvm;
/*
 * before:
 *   %m_cuda.addr = alloca float*, align 8
 * after:
 *   %tmp = call i8* @malloc(i64 <bytes>)
 *   %m_cuda.addr = bitcast i8* %tmp to float**
 *
 * <bytes> is the allocation size of the alloca'd type (times the element
 * count when that count is a constant), but never less than 256, which was
 * the fixed size used before. The malloc'ed block is never freed here.
 */
// TODO: this replacement is hard-coded: every alloca in a function that calls
// cudaLaunchKernel* is rewritten. Use use-analysis to find only the kernel
// arguments in the future.
void ReplaceKernelArg(llvm::Module *M) {
  LLVMContext &context = M->getContext();
  // Lower bound kept from the original fixed-size allocation.
  const uint64_t min_alloc_bytes = 256;

  // Functions that directly call a function named cudaLaunchKernel*.
  std::set<llvm::Function *> need_replace;
  for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
    Function *F = &(*i);
    for (BasicBlock &B : *F) {
      for (Instruction &inst : B) {
        llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&inst);
        if (!callInst) {
          continue;
        }
        Function *calledFunction = callInst->getCalledFunction();
        if (calledFunction &&
            calledFunction->getName().startswith("cudaLaunchKernel")) {
          need_replace.insert(F);
        }
      }
    }
  }

  // find/create C's malloc function: i8* malloc(i64)
  llvm::FunctionType *mallocFuncType =
      FunctionType::get(llvm::Type::getInt8PtrTy(context),
                        {llvm::Type::getInt64Ty(context)}, false);
  llvm::FunctionCallee _f = M->getOrInsertFunction("malloc", mallocFuncType);
  llvm::Function *func_malloc = llvm::cast<llvm::Function>(_f.getCallee());

  const auto &layout = M->getDataLayout();
  for (auto F : need_replace) {
    // Gather first: the rewrite below inserts and erases instructions.
    std::vector<llvm::AllocaInst *> allocas;
    for (BasicBlock &B : *F) {
      for (Instruction &inst : B) {
        if (llvm::AllocaInst *alloc = llvm::dyn_cast<llvm::AllocaInst>(&inst)) {
          allocas.push_back(alloc);
        }
      }
    }

    // just replace all alloc in that function
    for (llvm::AllocaInst *alloc : allocas) {
      // Size the heap block from the alloca so that objects larger than the
      // old fixed 256 bytes no longer overflow it.
      uint64_t bytes = layout.getTypeAllocSize(alloc->getAllocatedType());
      if (auto count = dyn_cast<ConstantInt>(alloc->getArraySize())) {
        bytes *= count->getZExtValue();
      }
      // NOTE(review): an alloca with a non-constant element count is sized
      // for a single element (subject to the minimum) -- confirm such
      // allocas cannot occur in the launch functions.
      if (bytes < min_alloc_bytes) {
        bytes = min_alloc_bytes;
      }

      auto c_malloc_inst = llvm::CallInst::Create(
          func_malloc,
          ConstantInt::get(llvm::Type::getInt64Ty(context), bytes), "", alloc);
      auto bit_cast = new BitCastInst(c_malloc_inst, alloc->getType(),
                                      alloc->getName().str(), alloc);
      alloc->replaceAllUsesWith(bit_cast);
      alloc->eraseFromParent();
    }
  }
}

View File

@ -1,94 +0,0 @@
#include "ReplaceKernelLaunch.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include <iostream>
#include <map>
#include <set>
using namespace llvm;
// Change to i8* bitcast (i8* (i8*)* @_Z9vecPKiS0_Pii_wrapper to i8*)
// Original: i8* bitcast (void (i32*, i32*, i32*, i32)* @_Z9vecPKiS0_Pii to i8*)
//
// For every direct call to a function whose name starts with
// "cudaLaunchKernel", replaces operand 0 (the kernel, wrapped in a pointer
// cast) by a bitcast of a newly declared `void(i8*)` function named
// `<kernel>_wrapper`. A message is printed to stdout for each launch found.
//
// NOTE(review): `kernels` caches the BitCastInst created for the first launch
// of a kernel and reuses that same instruction as the operand of later
// launches, wherever they are located -- confirm a later launch can never sit
// in a different function or in a position the cast does not dominate.
// NOTE(review): the result of dyn_cast<Function>(stripPointerCasts()) is used
// without a null check.
void ReplaceKernelLaunch(llvm::Module *M) {
LLVMContext &context = M->getContext();
auto VoidTy = llvm::Type::getVoidTy(context);
auto I8 = llvm::Type::getInt8PtrTy(context);
// kernel name -> bitcast instruction of its wrapper declaration
std::map<std::string, BitCastInst *> kernels;
// The locals from here to `done` are not read anywhere in this function.
LLVMContext *C = &M->getContext();
llvm::Type *Int32T = Type::getInt32Ty(*C);
llvm::Type *Int8T = Type::getInt8Ty(*C);
llvm::FunctionType *LauncherFuncT =
FunctionType::get(Type::getVoidTy(*C), NULL);
llvm::FunctionType *LaunchFun2 =
FunctionType::get(PointerType::get(PointerType::get(Int32T, 0), 0), NULL);
bool done = false;
// Walk every instruction of every function in the module.
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
Function *F = &(*i);
auto func_name = F->getName().str();
for (Function::iterator b = F->begin(); b != F->end(); ++b) {
BasicBlock *B = &(*b);
for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) {
Instruction *inst = &(*i);
if (llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(inst)) {
if (Function *calledFunction = callInst->getCalledFunction()) {
if (calledFunction->getName().startswith("cudaLaunchKernel")) {
Value *callOperand = callInst->getArgOperand(0);
Function *functionOperand =
dyn_cast<Function>(callInst->getArgOperand(0));
// call function is wrapped in a bitcast
if (functionOperand == NULL) {
std::vector<size_t> arg_sizes;
functionOperand =
dyn_cast<Function>(callOperand->stripPointerCasts());
FunctionType *ft = calledFunction->getFunctionType();
std::cout << " Parent (Caller) Function Name: " << func_name
<< ", cudaLaunchKernel Function: "
<< functionOperand->getName().str() << ", args "
<< functionOperand->arg_size() << std::endl;
// Reuse the cast recorded for an earlier launch of the same kernel.
auto rep = kernels.find(functionOperand->getName().str());
if (rep != kernels.end()) {
callInst->setArgOperand(0, rep->second);
continue;
}
// First launch of this kernel: declare `<kernel>_wrapper` as void(i8*).
std::vector<Type *> Params;
Params.push_back(I8);
FunctionType *FT = FunctionType::get(VoidTy, Params, false);
std::string newName =
functionOperand->getName().str() + "_wrapper";
Function *F =
Function::Create(FT, Function::ExternalLinkage, newName, M);
F->setDSOLocal(true);
// The cast is inserted immediately before the launch call.
BitCastInst *BC = new BitCastInst(F, I8, "", callInst);
callInst->setArgOperand(0, BC);
kernels.insert({functionOperand->getName().str(), BC});
}
}
}
}
}
}
}
}

View File

@ -8,46 +8,66 @@
#include "warp_func.h" #include "warp_func.h"
#include "llvm/IR/Module.h" #include "llvm/IR/Module.h"
#include <assert.h> #include <assert.h>
#include <fstream>
#include <iostream> #include <iostream>
#include <llvm/Support/raw_ostream.h>
#include <map> #include <map>
#include <set> #include <set>
#include <stdlib.h> #include <stdlib.h>
using namespace llvm; using namespace llvm;
std::string PATH = "kernel_meta.log";
int main(int argc, char **argv) { int main(int argc, char **argv) {
assert(argc == 9 && "incorrect number of arguments\n"); assert(argc == 3 && "incorrect number of arguments\n");
llvm::Module *program = LoadModuleFromFilr(argv[1]); llvm::Module *program = LoadModuleFromFilr(argv[1]);
// get size of grid and dim from input arguments
int *grid_dim = new int[3]; std::ofstream fout;
int *block_dim = new int[3]; fout.open(PATH);
grid_dim[0] = atoi(argv[3]);
grid_dim[1] = atoi(argv[4]);
grid_dim[2] = atoi(argv[5]);
block_dim[0] = atoi(argv[6]);
block_dim[1] = atoi(argv[7]);
block_dim[2] = atoi(argv[8]);
// inline, and create auxiliary global variables // inline, and create auxiliary global variables
init_block(program); init_block(program, fout);
// insert sync before each vote, and replace the // insert sync before each vote, and replace the
// original vote function to warp vote // original vote function to warp vote
handle_warp_vote(program); handle_warp_vote(program);
// replace warp shuffle // replace warp shuffle
// VerifyModule(program);
handle_warp_shfl(program); handle_warp_shfl(program);
// insert sync // insert sync
// VerifyModule(program);
insert_sync(program); insert_sync(program);
// split block by sync // split block by sync
// VerifyModule(program);
std::cout << "split\n" << std::flush;
split_block_by_sync(program); split_block_by_sync(program);
// add loop for intra&intera thread // add loop for intra&intera thread
insert_warp_loop(program);
// (TODO): replace this patch
replace_built_in_function(program, grid_dim, block_dim);
// VerifyModule(program); // VerifyModule(program);
std::cout << "insert\n" << std::flush;
insert_warp_loop(program);
// VerifyModule(program);
// (TODO): replace this patch
std::cout << "replace\n" << std::flush;
replace_built_in_function(program);
// VerifyModule(program);
std::cout << "generate\n" << std::flush;
generate_x86_format(program); generate_x86_format(program);
// VerifyModule(program);
// performance optimization // performance optimization
performance_optimization(program); performance_optimization(program);
VerifyModule(program);
DumpModule(program, argv[2]); DumpModule(program, argv[2]);
fout.close();
return 0; return 0;
} }

View File

@ -5,4 +5,6 @@
void generate_x86_format(llvm::Module *M); void generate_x86_format(llvm::Module *M);
void set_meta_data(llvm::Module *M);
#endif #endif

View File

@ -2,6 +2,6 @@
#define __NVVM2x86_INIT__ #define __NVVM2x86_INIT__
#include "llvm/IR/Module.h" #include "llvm/IR/Module.h"
#include <fstream>
void init_block(llvm::Module *M); void init_block(llvm::Module *M, std::ofstream &fout);
#endif #endif

View File

@ -1,9 +1,10 @@
#ifndef __NVVM2x86_MEMORY_HIERARCHY__ #ifndef __NVVM2x86_MEMORY_HIERARCHY__
#define __NVVM2x86_MEMORY_HIERARCHY__ #define __NVVM2x86_MEMORY_HIERARCHY__
#include "llvm/IR/Module.h" #include "llvm/IR/Module.h"
#include <fstream>
using namespace llvm; using namespace llvm;
void mem_share2global(llvm::Module *M); void mem_share2global(llvm::Module *M);
void mem_constant2global(llvm::Module *M, std::ofstream &fout);
#endif #endif

View File

@ -12,7 +12,7 @@ llvm::CallInst *CreateIntraWarpBarrier(llvm::Instruction *InsertBefore);
void VerifyModule(llvm::Module *); void VerifyModule(llvm::Module *);
void phi2alloc(llvm::Module *M); void phi2alloc(llvm::Module *M);
void remove_cuda_built_in(llvm::Module *M); void remove_cuda_built_in(llvm::Module *M);
void replace_built_in_function(llvm::Module *M, int *grid_dim, int *block_dim); void replace_built_in_function(llvm::Module *M);
void replace_asm_call(llvm::Module *M); void replace_asm_call(llvm::Module *M);
bool find_block_barrier_in_region(llvm::BasicBlock *start, bool find_block_barrier_in_region(llvm::BasicBlock *start,
llvm::BasicBlock *end); llvm::BasicBlock *end);
@ -21,4 +21,5 @@ bool has_warp_barrier(llvm::BasicBlock *B);
bool has_barrier(llvm::BasicBlock *B); bool has_barrier(llvm::BasicBlock *B);
bool has_block_barrier(llvm::BasicBlock *B); bool has_block_barrier(llvm::BasicBlock *B);
bool has_barrier(llvm::Function *F); bool has_barrier(llvm::Function *F);
void replace_dynamic_shared_memory(llvm::Module *M);
#endif #endif

View File

@ -18,6 +18,7 @@
#include "llvm/Target/TargetOptions.h" #include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h" #include "llvm/Transforms/Utils/ValueMapper.h"
#include <iostream>
using namespace llvm; using namespace llvm;
@ -40,6 +41,10 @@ void decode_input(llvm::Module *M) {
llvm::FunctionType *LauncherFuncT = FunctionType::get( llvm::FunctionType *LauncherFuncT = FunctionType::get(
Type::getVoidTy(*C), {PointerType::get(Int8T, 0)}, false); Type::getVoidTy(*C), {PointerType::get(Int8T, 0)}, false);
std::set<GlobalVariable *> dynmaic_memory;
std::map<GlobalVariable *, Value *> corres_dynamic_memory_load_address;
// generate Wrapper Function type // generate Wrapper Function type
// now we only support a single int32* // now we only support a single int32*
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
@ -64,6 +69,51 @@ void decode_input(llvm::Module *M) {
// convert to int** // convert to int**
input_arg = Builder.CreateBitOrPointerCast( input_arg = Builder.CreateBitOrPointerCast(
input_arg, PointerType::get(PointerType::get(Int32T, 0), 0)); input_arg, PointerType::get(PointerType::get(Int32T, 0), 0));
// dynamic memory load in the wrapper function
GlobalVariable *share_memory = M->getGlobalVariable("wrapper_global_data");
if (share_memory != NULL) {
dynmaic_memory.insert(share_memory);
llvm::GlobalVariable *global_mem = new llvm::GlobalVariable(
*M, Int32T, false, llvm::GlobalValue::ExternalLinkage, NULL,
"thread_memory_size", NULL, llvm::GlobalValue::GeneralDynamicTLSModel,
0, false);
Value *loadedValue = Builder.CreateLoad(global_mem);
llvm::FunctionType *LaunchFun2 = FunctionType::get(
PointerType::get(PointerType::get(Int32T, 0), 0), NULL);
FunctionCallee fc2 =
M->getOrInsertFunction("_wrapper_global_data", LaunchFun2);
Function *WorkGroup2 = dyn_cast<Function>(fc2.getCallee());
WorkGroup2->setLinkage(GlobalValue::WeakODRLinkage);
WorkGroup2->setVisibility(GlobalValue::HiddenVisibility);
Comdat *co = M->getOrInsertComdat("_wrapper_global_data");
co->setSelectionKind(Comdat::SelectionKind::Any);
WorkGroup2->setComdat(co);
BasicBlock *Block2 = BasicBlock::Create(M->getContext(), "", WorkGroup2);
llvm::IRBuilder<> Builder2(M->getContext());
Builder2.SetInsertPoint(Block2);
Builder2.CreateRet(share_memory);
auto PT = dyn_cast<PointerType>(share_memory->getType());
auto element_type = PT->getElementType();
// std::cout << element_type->getTypeID() << " Got global memor $$$$$$"
// << share_memory->getName().str() << std::endl;
AllocaInst *new_arr = Builder.CreateAlloca(Int8T, loadedValue, "new_arr");
// new_arr->setAlignment(llvm::MaybeAlign(16));
Value *new_ar = new_arr;
Value *gptr = Builder.CreateBitOrPointerCast(
share_memory, PointerType::get(PointerType::get(Int8T, 0), 0));
Builder.CreateStore(new_ar, gptr);
}
size_t idx = 0; size_t idx = 0;
for (Function::const_arg_iterator ii = F->arg_begin(), ee = F->arg_end(); for (Function::const_arg_iterator ii = F->arg_begin(), ee = F->arg_end();
ii != ee; ++ii) { ii != ee; ++ii) {
@ -95,6 +145,8 @@ void remove_barrier(llvm::Module *M) {
for (auto BB = F->begin(); BB != F->end(); ++BB) { for (auto BB = F->begin(); BB != F->end(); ++BB) {
for (auto BI = BB->begin(); BI != BB->end(); BI++) { for (auto BI = BB->begin(); BI != BB->end(); BI++) {
if (auto Call = dyn_cast<CallInst>(BI)) { if (auto Call = dyn_cast<CallInst>(BI)) {
if (Call->isInlineAsm())
continue;
auto func_name = Call->getCalledFunction()->getName().str(); auto func_name = Call->getCalledFunction()->getName().str();
if (func_name == "llvm.nvvm.bar.warp.sync" || if (func_name == "llvm.nvvm.bar.warp.sync" ||
func_name == "llvm.nvvm.barrier0" || func_name == "llvm.nvvm.barrier0" ||
@ -109,6 +161,11 @@ void remove_barrier(llvm::Module *M) {
} }
} }
// Erase the module-level globals "intra_warp_index" and "inter_warp_index".
//
// Module::getGlobalVariable returns nullptr when the module has no global with
// the requested name, so each lookup is checked before erasing instead of
// dereferencing the result unconditionally.
// NOTE(review): eraseFromParent expects the variable to have no remaining
// uses -- confirm the earlier passes have already dropped them.
void remove_useless_var(llvm::Module *M) {
  static const char *const useless_globals[] = {"intra_warp_index",
                                                "inter_warp_index"};
  for (const char *name : useless_globals) {
    if (llvm::GlobalVariable *var = M->getGlobalVariable(name))
      var->eraseFromParent();
  }
}
void generate_x86_format(llvm::Module *M) { void generate_x86_format(llvm::Module *M) {
// change metadata // change metadata
set_meta_data(M); set_meta_data(M);
@ -116,4 +173,6 @@ void generate_x86_format(llvm::Module *M) {
decode_input(M); decode_input(M);
// remove barrier // remove barrier
remove_barrier(M); remove_barrier(M);
// remove useless func/variable
remove_useless_var(M);
} }

View File

@ -27,6 +27,8 @@ void split_block_by_sync(llvm::Function *F) {
} }
llvm::CallInst *Call = llvm::dyn_cast<llvm::CallInst>(inst); llvm::CallInst *Call = llvm::dyn_cast<llvm::CallInst>(inst);
if (Call) { if (Call) {
if (Call->isInlineAsm())
continue;
auto func_name = Call->getCalledFunction()->getName().str(); auto func_name = Call->getCalledFunction()->getName().str();
if (func_name == "llvm.nvvm.barrier0" || if (func_name == "llvm.nvvm.barrier0" ||
func_name == "llvm.nvvm.bar.warp.sync" || func_name == "llvm.nvvm.bar.warp.sync" ||

View File

@ -1,6 +1,7 @@
#include "init.h" #include "init.h"
#include "memory_hierarchy.h" #include "memory_hierarchy.h"
#include "tool.h" #include "tool.h"
#include <fstream>
#include <iostream> #include <iostream>
#include <set> #include <set>
@ -23,7 +24,8 @@
using namespace llvm; using namespace llvm;
void inline_func_vote(llvm::Module *M) { bool inline_warp_level_func(llvm::Module *M) {
bool changed = false;
std::set<llvm::Function *> need_remove; std::set<llvm::Function *> need_remove;
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
@ -36,10 +38,13 @@ void inline_func_vote(llvm::Module *M) {
for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) { for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
if (CallInst *c = dyn_cast<CallInst>(BI++)) { if (CallInst *c = dyn_cast<CallInst>(BI++)) {
if (c->getCalledFunction()) { if (c->getCalledFunction()) {
if (c->getCalledFunction()->getName().str() == "_Z10__any_syncji") { auto func_name = c->getCalledFunction()->getName().str();
if (func_name == "_Z10__any_syncji" ||
func_name.find("shfl_down_sync") != std::string::npos) {
InlineFunctionInfo IFI; InlineFunctionInfo IFI;
InlineFunction(c, IFI); InlineFunction(c, IFI);
need_remove.insert(c->getCalledFunction()); need_remove.insert(c->getCalledFunction());
changed = true;
} }
} }
} }
@ -50,6 +55,56 @@ void inline_func_vote(llvm::Module *M) {
f->dropAllReferences(); f->dropAllReferences();
f->eraseFromParent(); f->eraseFromParent();
} }
return changed;
}
// Report whether F directly contains a call to a function whose name includes
// "llvm.nvvm.read.ptx.sreg.". Only call instructions with a statically known
// callee are examined; the bodies of F's callees are not searched.
bool find_sreg_inst(llvm::Function *F) {
  const std::string sreg_marker = "llvm.nvvm.read.ptx.sreg.";
  for (BasicBlock &block : *F) {
    for (Instruction &inst : block) {
      auto *call = dyn_cast<CallInst>(&inst);
      if (!call)
        continue;
      // Indirect calls have no called function and are ignored.
      Function *callee = call->getCalledFunction();
      if (!callee)
        continue;
      if (callee->getName().str().find(sreg_marker) != std::string::npos)
        return true;
    }
  }
  return false;
}
// Inline every direct call in M whose callee contains a read of a PTX special
// register (as reported by find_sreg_inst).
//
// Returns true only if at least one call site was actually inlined. The caller
// re-runs this function until it returns false, so reporting "changed" for a
// call that InlineFunction refused to inline would make that loop spin
// forever.
bool inline_func_with_tid(llvm::Module *M) {
  // Collect the call sites first: inlining rewrites the instruction lists we
  // would otherwise be iterating over. A vector keeps them in module order, so
  // the inlining order does not depend on pointer values.
  std::vector<CallInst *> need_inline;
  for (Function &F : *M) {
    for (BasicBlock &BB : F) {
      for (Instruction &I : BB) {
        auto *c = dyn_cast<CallInst>(&I);
        if (!c)
          continue;
        Function *callee = c->getCalledFunction();
        if (!callee || !find_sreg_inst(callee))
          continue;
        printf("inline: %s\n", callee->getName().str().c_str());
        need_inline.push_back(c);
      }
    }
  }

  bool changed = false;
  for (CallInst *c : need_inline) {
    InlineFunctionInfo IFI;
    // The result converts to true when the call site was inlined.
    if (InlineFunction(c, IFI))
      changed = true;
  }
  return changed;
}
void create_global_variable(llvm::Module *M) { void create_global_variable(llvm::Module *M) {
@ -70,21 +125,33 @@ void create_global_variable(llvm::Module *M) {
llvm::GlobalValue::GeneralDynamicTLSModel, 0, false); llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage, new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
NULL, "block_size", NULL, NULL, "block_size", NULL,
llvm::GlobalValue::NotThreadLocal, 0, false); llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage, new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
NULL, "block_size_x", NULL, NULL, "block_size_x", NULL,
llvm::GlobalValue::NotThreadLocal, 0, false); llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage, new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
NULL, "block_size_y", NULL, NULL, "block_size_y", NULL,
llvm::GlobalValue::NotThreadLocal, 0, false); llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage, new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
NULL, "block_size_z", NULL, NULL, "block_size_z", NULL,
llvm::GlobalValue::NotThreadLocal, 0, false); llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage, new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
NULL, "grid_size", NULL, NULL, "grid_size_x", NULL,
llvm::GlobalValue::NotThreadLocal, 0, false); llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage, new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
NULL, "block_index", NULL, NULL, "grid_size_y", NULL,
llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
NULL, "grid_size_z", NULL,
llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
NULL, "block_index_x", NULL,
llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
NULL, "block_index_y", NULL,
llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage,
NULL, "block_index_z", NULL,
llvm::GlobalValue::GeneralDynamicTLSModel, 0, false); llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
// TLS variable used for warp-level collective operators // TLS variable used for warp-level collective operators
new llvm::GlobalVariable( new llvm::GlobalVariable(
@ -224,24 +291,23 @@ bool lower_constant_expr(llvm::Module *M) {
auto load_from = load_inst->getOperand(0); auto load_from = load_inst->getOperand(0);
if (auto get_element_ptr = dyn_cast<llvm::ConstantExpr>(load_from)) { if (auto get_element_ptr = dyn_cast<llvm::ConstantExpr>(load_from)) {
modified = true; modified = true;
auto ReplInst = get_element_ptr->getAsInstruction();
ReplInst->insertBefore(load_inst);
std::vector<Instruction *> Users; std::vector<Instruction *> Users;
// Do not replace use during iteration of use. Do it in another loop
for (auto U : get_element_ptr->users()) { for (auto U : get_element_ptr->users()) {
if (auto InstUser = dyn_cast<Instruction>(U)) { if (auto InstUser = dyn_cast<Instruction>(U)) {
Users.push_back(InstUser); Users.push_back(InstUser);
} }
} }
for (auto &User : Users) for (auto &User : Users) {
auto ReplInst = get_element_ptr->getAsInstruction();
ReplInst->insertBefore(User);
User->replaceUsesOfWith(get_element_ptr, ReplInst); User->replaceUsesOfWith(get_element_ptr, ReplInst);
}
} }
} else if (auto store_inst = dyn_cast<llvm::StoreInst>(BI)) { } else if (auto store_inst = dyn_cast<llvm::StoreInst>(BI)) {
auto store_to = store_inst->getOperand(1); auto store_to = store_inst->getOperand(1);
if (auto addr_cast = dyn_cast<llvm::ConstantExpr>(store_to)) { if (auto addr_cast = dyn_cast<llvm::ConstantExpr>(store_to)) {
modified = true; modified = true;
auto ReplInst = addr_cast->getAsInstruction();
ReplInst->insertBefore(store_inst);
std::vector<Instruction *> Users; std::vector<Instruction *> Users;
// Do not replace use during iteration of use. Do it in another loop // Do not replace use during iteration of use. Do it in another loop
for (auto U : addr_cast->users()) { for (auto U : addr_cast->users()) {
@ -249,16 +315,19 @@ bool lower_constant_expr(llvm::Module *M) {
Users.push_back(InstUser); Users.push_back(InstUser);
} }
} }
for (auto &User : Users) for (auto &User : Users) {
auto ReplInst = addr_cast->getAsInstruction();
ReplInst->insertBefore(User);
User->replaceUsesOfWith(addr_cast, ReplInst); User->replaceUsesOfWith(addr_cast, ReplInst);
}
} }
} else if (auto get_element_ptr = } else if (auto get_element_ptr =
dyn_cast<llvm::GetElementPtrInst>(BI)) { dyn_cast<llvm::GetElementPtrInst>(BI)) {
auto get_from = get_element_ptr->getOperand(0); auto get_from = get_element_ptr->getOperand(0);
if (auto addr_cast = dyn_cast<llvm::ConstantExpr>(get_from)) { if (auto addr_cast = dyn_cast<llvm::ConstantExpr>(get_from)) {
modified = true; modified = true;
auto ReplInst = addr_cast->getAsInstruction(); // auto ReplInst = addr_cast->getAsInstruction();
ReplInst->insertBefore(get_element_ptr); // ReplInst->insertBefore(get_element_ptr);
std::vector<Instruction *> Users; std::vector<Instruction *> Users;
// Do not replace use during iteration of use. Do it in another loop // Do not replace use during iteration of use. Do it in another loop
for (auto U : addr_cast->users()) { for (auto U : addr_cast->users()) {
@ -266,8 +335,11 @@ bool lower_constant_expr(llvm::Module *M) {
Users.push_back(InstUser); Users.push_back(InstUser);
} }
} }
for (auto &User : Users) for (auto &User : Users) {
auto ReplInst = addr_cast->getAsInstruction();
ReplInst->insertBefore(User);
User->replaceUsesOfWith(addr_cast, ReplInst); User->replaceUsesOfWith(addr_cast, ReplInst);
}
} }
} }
} }
@ -276,11 +348,24 @@ bool lower_constant_expr(llvm::Module *M) {
return modified; return modified;
} }
void init_block(llvm::Module *M) { void replace_cuda_math_built_in(llvm::Module *M) {
// replace _ZL3expd, just delete its body
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
Function *F = &(*i);
auto func_name = F->getName().str();
if (func_name.find("_ZL3expd") != std::string::npos) {
F->deleteBody();
}
}
}
void init_block(llvm::Module *M, std::ofstream &fout) {
// using official llvm preprocess // using official llvm preprocess
llvm_preprocess(M); llvm_preprocess(M);
// remove useles Cuda function // remove useles Cuda function
remove_cuda_built_in(M); remove_cuda_built_in(M);
// replace CUDA math function, like expf
replace_cuda_math_built_in(M);
// lower ConstantExpression // lower ConstantExpression
bool modified; bool modified;
@ -289,14 +374,26 @@ void init_block(llvm::Module *M) {
} while (modified); } while (modified);
// remove useless metadata // remove useless metadata
remove_metadata(M); remove_metadata(M);
// inline vote function // inline warp-level function
inline_func_vote(M); while (1) {
if (!inline_warp_level_func(M))
break;
}
// TODO: remove the hardcode
while (1) {
if (!inline_func_with_tid(M))
break;
}
// create global variable for warp and vote // create global variable for warp and vote
create_global_variable(M); create_global_variable(M);
// replace phi with data load // replace phi with data load
phi2alloc(M); phi2alloc(M);
// replace share memory // replace share memory
mem_share2global(M); mem_share2global(M);
// replace constant memory
mem_constant2global(M, fout);
// replace asm Inline // replace asm Inline
replace_asm_call(M); replace_asm_call(M);
// replace dynamic shared memory
replace_dynamic_shared_memory(M);
} }

View File

@ -212,11 +212,22 @@ public:
changed = true; changed = true;
// we may create a new conditional barrier after insert // we may create a new conditional barrier after insert
if (!PDT->getPostDomTree().dominates(pred, &F.getEntryBlock())) if (!PDT->getPostDomTree().dominates(pred, &F.getEntryBlock())) {
conditionalBarriers.push_back(pred); // if the block postdominates all its predecessor
// then it is not a conditional barriers
bool post_dominate_all = true;
for (auto I = pred_begin(pred); I != pred_end(pred); I++) {
if (!PDT->getPostDomTree().dominates(pred, *I)) {
post_dominate_all = false;
break;
}
}
if (!post_dominate_all)
conditionalBarriers.push_back(pred);
}
// find any block which are not dominated by header // find any block which are not dominated by header
// but be posdiminated by merge point // but be postdominated by merge point
std::queue<llvm::BasicBlock *> if_body; std::queue<llvm::BasicBlock *> if_body;
std::set<llvm::BasicBlock *> visited_block; std::set<llvm::BasicBlock *> visited_block;
for (int i = 0; i < pred->getTerminator()->getNumSuccessors(); i++) { for (int i = 0; i < pred->getTerminator()->getNumSuccessors(); i++) {
@ -234,19 +245,26 @@ public:
PDT->getPostDomTree().dominates(merge_point, curr)) { PDT->getPostDomTree().dominates(merge_point, curr)) {
// we should insert barrier at the beginning and // we should insert barrier at the beginning and
// end of its predecessor // end of its predecessor
printf("insert [255]: %s\n", curr->getName().str().c_str());
if (has_warp_barrier(b)) { if (has_warp_barrier(b)) {
CreateIntraWarpBarrier(&(*curr->begin())); CreateIntraWarpBarrier(&(*curr->begin()));
for (BasicBlock *Pred : predecessors(curr)) { for (BasicBlock *Pred : predecessors(curr)) {
printf("insert [262]: %s\n", Pred->getName().str().c_str());
CreateIntraWarpBarrier(&(*Pred->getTerminator())); CreateIntraWarpBarrier(&(*Pred->getTerminator()));
} }
} else { } else {
CreateInterWarpBarrier(&(*curr->begin())); CreateInterWarpBarrier(&(*curr->begin()));
for (BasicBlock *Pred : predecessors(curr)) { for (BasicBlock *Pred : predecessors(curr)) {
printf("insert [268]: %s\n", Pred->getName().str().c_str());
CreateInterWarpBarrier(&(*Pred->getTerminator())); CreateInterWarpBarrier(&(*Pred->getTerminator()));
} }
} }
} }
for (int i = 0; i < curr->getTerminator()->getNumSuccessors(); i++) { for (int i = 0; i < curr->getTerminator()->getNumSuccessors(); i++) {
// avoid backedge
if (DT->dominates(curr->getTerminator()->getSuccessor(i), pred)) {
continue;
}
if_body.push(curr->getTerminator()->getSuccessor(i)); if_body.push(curr->getTerminator()->getSuccessor(i));
} }
} }
@ -266,6 +284,32 @@ public:
AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>();
} }
// Breadth-first search, starting at the successors of `start`, for the first
// block that post-dominates `start` according to PDT.
//
// `start` must end in a terminator with exactly two successors (asserted).
// If the search runs out of blocks, an assertion fires; with assertions
// compiled out the function returns NULL.
BasicBlock *find_merge_point(BasicBlock *start, PostDominatorTree &PDT) {
  assert(start->getTerminator()->getNumSuccessors() == 2);
  std::queue<llvm::BasicBlock *> worklist;
  std::set<llvm::BasicBlock *> seen;
  for (BasicBlock *succ : successors(start))
    worklist.push(succ);

  while (!worklist.empty()) {
    BasicBlock *candidate = worklist.front();
    worklist.pop();
    // insert() reports whether the block is new; a block can be queued more
    // than once before it is first processed, so skip repeats here.
    if (!seen.insert(candidate).second)
      continue;
    if (PDT.dominates(candidate, start))
      return candidate;
    for (BasicBlock *succ : successors(candidate)) {
      if (seen.count(succ) == 0)
        worklist.push(succ);
    }
  }
  assert(0 && "Do not find merge point\n");
  return NULL;
}
virtual bool runOnFunction(Function &F) { virtual bool runOnFunction(Function &F) {
if (!isKernelFunction(F.getParent(), &F)) if (!isKernelFunction(F.getParent(), &F))
return 0; return 0;
@ -280,18 +324,8 @@ public:
for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i) { for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i) {
BasicBlock *b = &*i; BasicBlock *b = &*i;
BasicBlock *merge_point = NULL;
if (b->getTerminator()->getNumSuccessors() == 2) { if (b->getTerminator()->getNumSuccessors() == 2) {
auto b1 = b->getTerminator()->getSuccessor(0); auto merge_point = find_merge_point(b, PDT->getPostDomTree());
auto b2 = b->getTerminator()->getSuccessor(1);
if (PDT->getPostDomTree().dominates(b1, b2)) {
merge_point = b1;
} else if (PDT->getPostDomTree().dominates(b2, b2)) {
merge_point = b2;
} else {
assert(0 && "find complex if-else branch\n");
}
std::cout << std::flush;
for (BasicBlock *Pred : predecessors(merge_point)) { for (BasicBlock *Pred : predecessors(merge_point)) {
if (!DT->dominates(b, Pred)) { if (!DT->dominates(b, Pred)) {
// we need to insert an extra block to be the merge point // we need to insert an extra block to be the merge point
@ -305,14 +339,8 @@ public:
auto M = F.getParent(); auto M = F.getParent();
for (auto head : if_head) { for (auto head : if_head) {
assert(head->getTerminator()->getNumSuccessors() == 2); assert(head->getTerminator()->getNumSuccessors() == 2);
BasicBlock *merge_point = NULL; BasicBlock *merge_point = find_merge_point(head, PDT->getPostDomTree());
auto s1 = head->getTerminator()->getSuccessor(0); assert(PDT->getPostDomTree().dominates(merge_point, head));
auto s2 = head->getTerminator()->getSuccessor(1);
if (PDT->getPostDomTree().dominates(s1, s2)) {
merge_point = s1;
} else {
merge_point = s2;
}
if (!find_barrier_in_region(head, merge_point)) { if (!find_barrier_in_region(head, merge_point)) {
printf("do not need to handle tri-income if: %s\n", printf("do not need to handle tri-income if: %s\n",
merge_point->getName().str().c_str()); merge_point->getName().str().c_str());
@ -368,6 +396,8 @@ public:
for (BasicBlock::iterator j = (*i)->begin(), e = (*i)->end(); j != e; for (BasicBlock::iterator j = (*i)->begin(), e = (*i)->end(); j != e;
++j) { ++j) {
if (auto Call = dyn_cast<CallInst>(j)) { if (auto Call = dyn_cast<CallInst>(j)) {
if (Call->isInlineAsm())
continue;
auto func_name = Call->getCalledFunction()->getName().str(); auto func_name = Call->getCalledFunction()->getName().str();
if (func_name == "llvm.nvvm.barrier0" || if (func_name == "llvm.nvvm.barrier0" ||
func_name == "llvm.nvvm.bar.warp.sync" || func_name == "llvm.nvvm.bar.warp.sync" ||
@ -383,7 +413,7 @@ public:
} }
if (!is_conditional_loop) if (!is_conditional_loop)
return 0; return 0;
// insert barrier at the beginning of header // insert barrier at the beginning of header (for_cond)
// and the end of pre header, so that we can get a // and the end of pre header, so that we can get a
// single block connected with latch // single block connected with latch
if (!is_warp) { if (!is_warp) {
@ -399,17 +429,40 @@ public:
} }
// as we assume all loops are rotated, we have to insert // as we assume all loops are rotated, we have to insert
// barrier before the condition jump of the loop exit // barrier before the condition jump of the for_cond
if (auto for_cond = L->getExitingBlock()) {
if (auto exit_block = L->getExitingBlock()) { assert(for_cond->getTerminator()->getNumSuccessors() == 2 &&
"has more than 2 successors of the for-cond\n");
auto conditional_br = auto conditional_br =
dyn_cast<llvm::BranchInst>(exit_block->getTerminator()); dyn_cast<llvm::BranchInst>(for_cond->getTerminator());
assert(conditional_br && conditional_br->isConditional()); assert(conditional_br && conditional_br->isConditional());
// insert barrier at the beginning of successor of exit // insert barrier before the condition jump of the loop cond
if (!is_warp) if (!is_warp)
CreateInterWarpBarrier(conditional_br); CreateInterWarpBarrier(conditional_br);
else else
CreateIntraWarpBarrier(conditional_br); CreateIntraWarpBarrier(conditional_br);
// insert barrier before the for_body
auto for_body = for_cond->getTerminator()->getSuccessor(0);
if (for_body == L->getExitBlock()) {
for_body = for_cond->getTerminator()->getSuccessor(1);
}
// insert at the beginning of for_body
if (!is_warp)
CreateInterWarpBarrier(&(*for_body->begin()));
else
CreateIntraWarpBarrier(&(*for_body->begin()));
// insert at the beginning and end in for_inc block
if (auto for_inc = L->getLoopLatch()) {
if (!is_warp) {
CreateInterWarpBarrier(&(*for_inc->begin()));
CreateInterWarpBarrier(for_inc->getTerminator());
} else {
CreateIntraWarpBarrier(&(*for_inc->begin()));
CreateIntraWarpBarrier(for_inc->getTerminator());
}
} else {
assert(0 && "has continue in a barrier loop\n");
}
} else { } else {
// handle break in for-loop // handle break in for-loop
printf("loop has multiply exists\n"); printf("loop has multiply exists\n");

View File

@ -67,9 +67,15 @@ std::map<std::string, llvm::Instruction *> contextArrays;
int tempInstructionIndex = 0; int tempInstructionIndex = 0;
int need_nested_loop; int need_nested_loop;
// support for multiple kernels in a single file
bool ShouldNotBeContextSaved(llvm::Instruction *instr) { bool ShouldNotBeContextSaved(llvm::Instruction *instr) {
if (isa<BranchInst>(instr)) if (isa<BranchInst>(instr))
return true; return true;
// if (isa<AddrSpaceCastInst>(instr))
// return true;
// if (isa<CastInst>(instr))
// return true;
llvm::Module *M = instr->getParent()->getParent()->getParent(); llvm::Module *M = instr->getParent()->getParent()->getParent();
llvm::LoadInst *load = dyn_cast<llvm::LoadInst>(instr); llvm::LoadInst *load = dyn_cast<llvm::LoadInst>(instr);
@ -111,6 +117,7 @@ llvm::Instruction *GetContextArray(llvm::Instruction *instruction,
return contextArrays[varName]; return contextArrays[varName];
BasicBlock &bb = instruction->getParent()->getParent()->getEntryBlock(); BasicBlock &bb = instruction->getParent()->getParent()->getEntryBlock();
IRBuilder<> builder(&*(bb.getFirstInsertionPt())); IRBuilder<> builder(&*(bb.getFirstInsertionPt()));
Function *FF = instruction->getParent()->getParent(); Function *FF = instruction->getParent()->getParent();
Module *M = instruction->getParent()->getParent()->getParent(); Module *M = instruction->getParent()->getParent()->getParent();
@ -127,6 +134,7 @@ llvm::Instruction *GetContextArray(llvm::Instruction *instruction,
Type *AllocType = elementType; Type *AllocType = elementType;
AllocaInst *InstCast = dyn_cast<AllocaInst>(instruction); AllocaInst *InstCast = dyn_cast<AllocaInst>(instruction);
/*
if (InstCast) { if (InstCast) {
unsigned Alignment = InstCast->getAlignment(); unsigned Alignment = InstCast->getAlignment();
@ -166,7 +174,7 @@ llvm::Instruction *GetContextArray(llvm::Instruction *instruction,
} }
} }
} }
*/
llvm::Value *ItemSize = nullptr; llvm::Value *ItemSize = nullptr;
llvm::AllocaInst *Alloca = nullptr; llvm::AllocaInst *Alloca = nullptr;
@ -354,13 +362,36 @@ void handle_local_variable_intra_warp(std::vector<ParallelRegion> PRs) {
auto F = PRs[0].start_block->getParent(); auto F = PRs[0].start_block->getParent();
for (auto bb = F->begin(); bb != F->end(); bb++) { for (auto bb = F->begin(); bb != F->end(); bb++) {
for (auto ii = bb->begin(); ii != bb->end(); ii++) { for (auto ii = bb->begin(); ii != bb->end(); ii++) {
if (isa<AllocaInst>(&(*ii))) if (isa<AllocaInst>(&(*ii))) {
instruction_to_fix.push_back(&(*ii)); auto alloc = dyn_cast<AllocaInst>(&(*ii));
} // Do not duplicate var used outside PRs
for (auto inst : instruction_to_fix) { bool used_in_non_PR = false;
AddContextSaveRestore(inst, intra_warp_loop); for (Instruction::use_iterator ui = alloc->use_begin(),
ue = alloc->use_end();
ui != ue; ++ui) {
llvm::Instruction *user = dyn_cast<Instruction>(ui->getUser());
auto user_block = user->getParent();
bool find_in_PR = false;
for (auto PR : PRs) {
if (PR.wrapped_block.find(user_block) != PR.wrapped_block.end()) {
find_in_PR = true;
break;
}
}
if (find_in_PR == false) {
used_in_non_PR = true;
break;
}
}
if (!used_in_non_PR) {
instruction_to_fix.push_back(alloc);
}
}
} }
} }
for (auto inst : instruction_to_fix) {
AddContextSaveRestore(inst, intra_warp_loop);
}
} }
for (auto parallel_regions : PRs) { for (auto parallel_regions : PRs) {
@ -380,10 +411,8 @@ void handle_local_variable_intra_warp(std::vector<ParallelRegion> PRs) {
for (llvm::BasicBlock::iterator instr = bb->begin(); instr != bb->end(); for (llvm::BasicBlock::iterator instr = bb->begin(); instr != bb->end();
++instr) { ++instr) {
llvm::Instruction *instruction = &*instr; llvm::Instruction *instruction = &*instr;
if (ShouldNotBeContextSaved(instruction)) if (ShouldNotBeContextSaved(instruction))
continue; continue;
for (Instruction::use_iterator ui = instruction->use_begin(), for (Instruction::use_iterator ui = instruction->use_begin(),
ue = instruction->use_end(); ue = instruction->use_end();
ui != ue; ++ui) { ui != ue; ++ui) {
@ -582,6 +611,8 @@ void remove_barrier(llvm::Function *F, bool intra_warp_loop) {
for (auto BB = F->begin(); BB != F->end(); ++BB) { for (auto BB = F->begin(); BB != F->end(); ++BB) {
for (auto BI = BB->begin(); BI != BB->end(); BI++) { for (auto BI = BB->begin(); BI != BB->end(); BI++) {
if (auto Call = dyn_cast<CallInst>(BI)) { if (auto Call = dyn_cast<CallInst>(BI)) {
if (Call->isInlineAsm())
continue;
auto func_name = Call->getCalledFunction()->getName().str(); auto func_name = Call->getCalledFunction()->getName().str();
if (func_name == "llvm.nvvm.bar.warp.sync") { if (func_name == "llvm.nvvm.bar.warp.sync") {
need_remove.push_back(Call); need_remove.push_back(Call);
@ -648,6 +679,8 @@ public:
bool has_barrier = 0; bool has_barrier = 0;
for (auto i = current->begin(), e = current->end(); i != e; ++i) { for (auto i = current->begin(), e = current->end(); i != e; ++i) {
if (llvm::CallInst *call_inst = llvm::dyn_cast<llvm::CallInst>(&(*i))) { if (llvm::CallInst *call_inst = llvm::dyn_cast<llvm::CallInst>(&(*i))) {
if (call_inst->isInlineAsm())
continue;
auto func_name = call_inst->getCalledFunction()->getName().str(); auto func_name = call_inst->getCalledFunction()->getName().str();
if (func_name == "llvm.nvvm.barrier0" || if (func_name == "llvm.nvvm.barrier0" ||
func_name == "llvm.nvvm.barrier.sync") func_name == "llvm.nvvm.barrier.sync")
@ -761,6 +794,8 @@ public:
for (Function::iterator s = F->begin(); s != F->end(); s++) { for (Function::iterator s = F->begin(); s != F->end(); s++) {
if (llvm::CallInst *call_inst = if (llvm::CallInst *call_inst =
llvm::dyn_cast<llvm::CallInst>(s->begin())) { llvm::dyn_cast<llvm::CallInst>(s->begin())) {
if (call_inst->isInlineAsm())
continue;
auto func_name = call_inst->getCalledFunction()->getName().str(); auto func_name = call_inst->getCalledFunction()->getName().str();
if (func_name == "llvm.nvvm.barrier0" || if (func_name == "llvm.nvvm.barrier0" ||
func_name == "llvm.nvvm.barrier.sync") { func_name == "llvm.nvvm.barrier.sync") {
@ -787,6 +822,12 @@ public:
if (!isKernelFunction(F.getParent(), &F)) if (!isKernelFunction(F.getParent(), &F))
return 0; return 0;
auto func_name = (&F)->getName().str();
// clear context array, temp variables for new kernel function
contextArrays.clear();
tempInstructionIds.clear();
tempInstructionIndex = 0;
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
PDT = &getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree(); PDT = &getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
@ -794,11 +835,11 @@ public:
auto parallel_regions = getParallelRegions(&F, intra_warp_loop); auto parallel_regions = getParallelRegions(&F, intra_warp_loop);
assert(!parallel_regions.empty() && "can not find any parallel regions\n"); assert(!parallel_regions.empty() && "can not find any parallel regions\n");
// print_parallel_region(parallel_regions); // print_parallel_region(parallel_regions);
add_warp_loop(parallel_regions, intra_warp_loop);
if (intra_warp_loop) { if (intra_warp_loop) {
handle_local_variable_intra_warp(parallel_regions); handle_local_variable_intra_warp(parallel_regions);
} }
add_warp_loop(parallel_regions, intra_warp_loop);
remove_barrier(&F, intra_warp_loop); remove_barrier(&F, intra_warp_loop);
return 1; return 1;
} }
@ -816,6 +857,8 @@ bool has_warp_barrier(llvm::Module *M) {
for (auto BB = F->begin(); BB != F->end(); ++BB) { for (auto BB = F->begin(); BB != F->end(); ++BB) {
for (auto BI = BB->begin(); BI != BB->end(); BI++) { for (auto BI = BB->begin(); BI != BB->end(); BI++) {
if (auto Call = dyn_cast<CallInst>(BI)) { if (auto Call = dyn_cast<CallInst>(BI)) {
if (Call->isInlineAsm())
continue;
auto func_name = Call->getCalledFunction()->getName().str(); auto func_name = Call->getCalledFunction()->getName().str();
if (func_name == "llvm.nvvm.bar.warp.sync") { if (func_name == "llvm.nvvm.bar.warp.sync") {
return true; return true;
@ -841,8 +884,8 @@ void insert_warp_loop(llvm::Module *M) {
// only need a single loop, with size=block_size // only need a single loop, with size=block_size
Passes.add(new InsertWarpLoopPass(intra_warp)); Passes.add(new InsertWarpLoopPass(intra_warp));
Passes.run(*M); Passes.run(*M);
// remove all barriers
for (auto F = M->begin(); F != M->end(); ++F)
remove_barrier(dyn_cast<llvm::Function>(F), false);
} }
// remove all barriers
for (auto F = M->begin(); F != M->end(); ++F)
remove_barrier(dyn_cast<llvm::Function>(F), false);
} }

View File

@ -9,6 +9,8 @@
#include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h" #include "llvm/Transforms/Utils/ValueMapper.h"
#include <assert.h> #include <assert.h>
#include <fstream>
#include <iostream>
#include <map> #include <map>
#include <set> #include <set>
#include <sstream> #include <sstream>
@ -36,15 +38,35 @@ void mem_share2global(llvm::Module *M) {
auto new_name = "wrapper_global_" + share_memory->getName().str(); auto new_name = "wrapper_global_" + share_memory->getName().str();
auto element_type = PT->getElementType(); auto element_type = PT->getElementType();
if (auto array_type = dyn_cast<ArrayType>(element_type)) { if (auto array_type = dyn_cast<ArrayType>(element_type)) {
llvm::GlobalVariable *global_memory = new llvm::GlobalVariable( if (share_memory->hasExternalLinkage() &&
*M, array_type, false, llvm::GlobalValue::ExternalLinkage, NULL, array_type->getArrayNumElements() == 0) {
new_name, NULL, llvm::GlobalValue::GeneralDynamicTLSModel, 1); // external shared memory of []
ConstantAggregateZero *const_array = // generate global type pointer
ConstantAggregateZero::get(array_type); PointerType *PointerTy =
global_memory->setInitializer(const_array); PointerType::get(array_type->getElementType(), 0);
corresponding_global_memory.insert( llvm::Constant *x1 = ConstantPointerNull::get(PointerTy);
std::pair<GlobalVariable *, GlobalVariable *>(share_memory, llvm::GlobalVariable *global_ptr = new llvm::GlobalVariable(
global_memory)); *M, PointerTy, false, llvm::GlobalValue::CommonLinkage, x1,
"wrapper_global_data", NULL,
llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
global_ptr->setDSOLocal(true);
corresponding_global_memory.insert(
std::pair<GlobalVariable *, GlobalVariable *>(share_memory,
global_ptr));
} else {
llvm::GlobalVariable *global_memory = new llvm::GlobalVariable(
*M, array_type, false, llvm::GlobalValue::ExternalLinkage,
NULL, new_name, NULL,
llvm::GlobalValue::GeneralDynamicTLSModel, 1);
ConstantAggregateZero *const_array =
ConstantAggregateZero::get(array_type);
global_memory->setInitializer(const_array);
corresponding_global_memory.insert(
std::pair<GlobalVariable *, GlobalVariable *>(share_memory,
global_memory));
}
} else if (auto int_type = dyn_cast<IntegerType>(element_type)) { } else if (auto int_type = dyn_cast<IntegerType>(element_type)) {
auto zero = llvm::ConstantInt::get(int_type, 0, true); auto zero = llvm::ConstantInt::get(int_type, 0, true);
llvm::GlobalVariable *global_memory = new llvm::GlobalVariable( llvm::GlobalVariable *global_memory = new llvm::GlobalVariable(
@ -54,6 +76,16 @@ void mem_share2global(llvm::Module *M) {
corresponding_global_memory.insert( corresponding_global_memory.insert(
std::pair<GlobalVariable *, GlobalVariable *>(share_memory, std::pair<GlobalVariable *, GlobalVariable *>(share_memory,
global_memory)); global_memory));
} else if (element_type->isFloatTy()) {
auto FP_type = llvm::Type::getFloatTy(*C);
auto zero = llvm::ConstantFP::get(FP_type, 0);
llvm::GlobalVariable *global_memory = new llvm::GlobalVariable(
*M, FP_type, false, llvm::GlobalValue::ExternalLinkage, zero,
new_name, NULL, llvm::GlobalValue::GeneralDynamicTLSModel, 0,
false);
corresponding_global_memory.insert(
std::pair<GlobalVariable *, GlobalVariable *>(share_memory,
global_memory));
} else { } else {
assert(0 && "The required Share Memory Type is not supported\n"); assert(0 && "The required Share Memory Type is not supported\n");
} }
@ -62,57 +94,11 @@ void mem_share2global(llvm::Module *M) {
} }
} }
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { for (auto k : corresponding_global_memory) {
Function *F = &(*i); auto share_addr = k.first;
for (Function::iterator i = F->begin(), e = F->end(); i != e; ++i) { auto global_addr = k.second;
BasicBlock *b = &*i; share_addr->replaceAllUsesWith(ConstantExpr::getPointerCast(
for (BasicBlock::iterator i = b->begin(), e = b->end(); i != e; ++i) { global_addr, cast<PointerType>(share_addr->getType())));
if (auto get_element_ptr = dyn_cast<llvm::GetElementPtrInst>(i)) {
auto read_array = get_element_ptr->getPointerOperand();
if (GlobalVariable *read_share_memory =
dyn_cast<llvm::GlobalVariable>(read_array)) {
// find a GetElementPtr which read share memory
if (corresponding_global_memory.find(read_share_memory) !=
corresponding_global_memory.end()) {
std::vector<Value *> Indices;
for (int i = 0; i < get_element_ptr->getNumIndices(); i++)
Indices.push_back(get_element_ptr->getOperand(i + 1));
auto new_GEP = GetElementPtrInst::Create(
NULL, // Pointee type
corresponding_global_memory.find(read_share_memory)
->second, // Alloca
Indices, // Indices
"", get_element_ptr);
// replace all get_element_ptr with new_GEP:
// we can not directly use:
// get_element_ptr->replaceAllUsesWith(new_GEP);
// as get_element_ptr and new_GEP have different return type
llvm::Type *original_type = get_element_ptr->getType();
auto FormatASC = CastInst::CreatePointerBitCastOrAddrSpaceCast(
new_GEP, original_type, "", get_element_ptr);
get_element_ptr->replaceAllUsesWith(FormatASC);
need_remove.insert(get_element_ptr);
}
}
} else if (auto addr_cast = dyn_cast<llvm::CastInst>(i)) {
auto read_array = addr_cast->getOperand(0);
if (GlobalVariable *read_share_memory =
dyn_cast<llvm::GlobalVariable>(read_array)) {
// find a GetElementPtr which read share memory
if (corresponding_global_memory.find(read_share_memory) !=
corresponding_global_memory.end()) {
llvm::Type *original_type = addr_cast->getType();
auto FormatASC = CastInst::CreatePointerBitCastOrAddrSpaceCast(
corresponding_global_memory.find(read_share_memory)->second,
original_type, "", addr_cast);
addr_cast->replaceAllUsesWith(FormatASC);
need_remove.insert(addr_cast);
}
}
}
}
}
} }
for (auto i : need_remove) { for (auto i : need_remove) {
@ -124,3 +110,83 @@ void mem_share2global(llvm::Module *M) {
i->eraseFromParent(); i->eraseFromParent();
} }
} }
/*
 * Replace every global that lives in address space 4 (used here for CUDA
 * constant memory) with a global in address space 0, and write the
 * old-name -> new-name mapping to `fout`.
 *
 * M:    module to rewrite in place.
 * fout: open stream that receives a "ConstMemory2GlobalMemory" header, one
 *       "<old> to <new>" line per replaced variable, and a closing "END".
 *
 * Supported value types are arrays and structs. Any other type trips the
 * assert; when asserts are compiled out the variable is left untouched
 * rather than erased while it still has users.
 */
void mem_constant2global(llvm::Module *M, std::ofstream &fout) {
  // original constant-memory global -> its address-space-0 replacement
  std::map<GlobalVariable *, GlobalVariable *> corresponding_global_memory;

  // find all constant memory and generate corresponding global memory
  for (auto I = M->global_begin(), E = M->global_end(); I != E; ++I) {
    GlobalVariable *constant_memory = dyn_cast<GlobalVariable>(I);
    if (!constant_memory)
      continue;
    auto PT = dyn_cast<PointerType>(I->getType());
    if (!PT || PT->getAddressSpace() != 4)
      continue; // not a constant-memory variable

    // generate the corresponding global memory variable
    auto new_name = "wrapper_global_" + constant_memory->getName().str();
    auto element_type = PT->getElementType();
    auto array_type = dyn_cast<ArrayType>(element_type);
    llvm::GlobalVariable *replacement = nullptr;

    if (array_type && constant_memory->hasExternalLinkage() &&
        array_type->getArrayNumElements() == 0) {
      // external constant array declared with unknown size ([]):
      // stand in a null-initialised pointer to the element type
      PointerType *PointerTy =
          PointerType::get(array_type->getElementType(), 0);
      llvm::Constant *x1 = ConstantPointerNull::get(PointerTy);
      replacement = new llvm::GlobalVariable(
          *M, PointerTy, false, llvm::GlobalValue::ExternalLinkage, x1,
          "wrapper_global_data", nullptr, llvm::GlobalValue::NotThreadLocal, 0,
          true);
    } else if (array_type || element_type->isStructTy()) {
      // sized array or struct: declare a same-typed external global without
      // an initializer
      replacement = new llvm::GlobalVariable(
          *M, element_type, false, llvm::GlobalValue::ExternalLinkage, nullptr,
          new_name, nullptr, llvm::GlobalValue::NotThreadLocal, 0);
    } else {
      assert(0 && "The required Constant Memory Type is not supported\n");
    }

    // Only variables that actually received a replacement are rewritten and
    // erased below.
    if (replacement)
      corresponding_global_memory.emplace(constant_memory, replacement);
  }

  fout << "ConstMemory2GlobalMemory\n";
  for (const auto &k : corresponding_global_memory) {
    GlobalVariable *const_addr = k.first;
    GlobalVariable *global_addr = k.second;
    // the cast keeps every user seeing the original (address space 4) type
    const_addr->replaceAllUsesWith(ConstantExpr::getPointerCast(
        global_addr, cast<PointerType>(const_addr->getType())));
    // this file will be used by host translator
    fout << const_addr->getName().str().c_str() << " to "
         << global_addr->getName().str().c_str() << '\n';
  }
  fout << "END\n";

  // the originals have no users left, so they can be dropped from the module
  for (const auto &k : corresponding_global_memory) {
    k.first->dropAllReferences();
    k.first->eraseFromParent();
  }
}

View File

@ -1,5 +1,6 @@
#include "tool.h" #include "tool.h"
#include "llvm/Bitcode/BitcodeWriter.h" #include "llvm/Bitcode/BitcodeWriter.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h" #include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h" #include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h" #include "llvm/IR/IRBuilder.h"
@ -187,7 +188,52 @@ void remove_cuda_built_in(llvm::Module *M) {
} }
} }
void replace_built_in_function(llvm::Module *M, int *grid_dim, int *block_dim) { // copied from POCL
/*
 * Turn every ConstantExpr that uses `Val` (directly or through other
 * constant expressions) into an instruction placed at the very beginning of
 * `Func`, then destroy the constant expression.
 */
static void breakConstantExpressions(llvm::Value *Val, llvm::Function *Func) {
  // Work on a copy of the user list: the rewrites below modify it.
  std::vector<llvm::Value *> Snapshot(Val->user_begin(), Val->user_end());
  for (llvm::Value *Candidate : Snapshot) {
    auto *Expr = llvm::dyn_cast<llvm::ConstantExpr>(Candidate);
    if (!Expr)
      continue;

    // Handle constant expressions that use this one first, so that by the
    // time it is replaced none of its users is itself a constant expression.
    breakConstantExpressions(Expr, Func);

    // Emit the instruction form ahead of the function's first instruction
    // and redirect all users to it.
    llvm::Instruction *Materialized = Expr->getAsInstruction();
    llvm::Instruction *EntryHead = &*Func->begin()->begin();
    Materialized->insertBefore(EntryHead);
    Expr->replaceAllUsesWith(Materialized);
    Expr->destroyConstant();
  }
}
/*
 * For each kernel function in `M`: break the constant expressions that use
 * the module's globals, then load the value stored in the global named
 * "dynamic_shared_memory" at the top of the kernel, bitcast it to the
 * global's own type, and redirect the remaining uses of the global to that
 * cast. Returns as soon as the module turns out to have no such global.
 */
void replace_dynamic_shared_memory(llvm::Module *M) {
  for (auto &Candidate : *M) {
    Function *F = &Candidate;
    if (!isKernelFunction(M, F))
      continue;

    // Constant-expression users cannot be redirected to an instruction, so
    // they are converted to instructions inside this kernel first.
    for (auto &Global : M->globals())
      breakConstantExpressions(&Global, F);

    auto DynShared = M->getGlobalVariable("dynamic_shared_memory");
    if (!DynShared)
      return;

    auto Loaded = new LoadInst(DynShared, "new_load");
    auto Recast = new BitCastInst(Loaded, DynShared->getType(), "new_bit_cast");
    // Resulting order at the function entry: load, bitcast, original code.
    Instruction *EntryHead = &*F->begin()->begin();
    Recast->insertBefore(EntryHead);
    Loaded->insertBefore(Recast);

    // Leave the two freshly created instructions pointing at the global.
    DynShared->replaceUsesWithIf(Recast, [&](Use &U) {
      auto *Instr = dyn_cast<Instruction>(U.getUser());
      return !(Instr == Recast || Instr == Loaded);
    });
  }
}
void replace_built_in_function(llvm::Module *M) {
LLVMContext &context = M->getContext(); LLVMContext &context = M->getContext();
auto I32 = llvm::Type::getInt32Ty(context); auto I32 = llvm::Type::getInt32Ty(context);
std::vector<llvm::Instruction *> need_remove; std::vector<llvm::Instruction *> need_remove;
@ -203,28 +249,60 @@ void replace_built_in_function(llvm::Module *M, int *grid_dim, int *block_dim) {
auto local_intra_warp_idx = auto local_intra_warp_idx =
builder.CreateAlloca(global_intra_warp_idx->getType()->getElementType(), builder.CreateAlloca(global_intra_warp_idx->getType()->getElementType(),
0, "local_intra_warp_idx"); 0, "local_intra_warp_idx");
global_intra_warp_idx->replaceAllUsesWith(local_intra_warp_idx); global_intra_warp_idx->replaceUsesWithIf(local_intra_warp_idx, [&](Use &U) {
auto *Instr = dyn_cast<Instruction>(U.getUser());
return Instr->getParent()->getParent()->getName().str() == func_name;
});
auto global_inter_warp_idx = auto global_inter_warp_idx =
F->getParent()->getGlobalVariable("inter_warp_index"); F->getParent()->getGlobalVariable("inter_warp_index");
auto local_inter_warp_idx = auto local_inter_warp_idx =
builder.CreateAlloca(global_inter_warp_idx->getType()->getElementType(), builder.CreateAlloca(global_inter_warp_idx->getType()->getElementType(),
0, "local_inter_warp_idx"); 0, "local_inter_warp_idx");
global_inter_warp_idx->replaceAllUsesWith(local_inter_warp_idx);
builder.CreateStore(ConstantInt::get(I32, 0), local_inter_warp_idx);
global_inter_warp_idx->replaceUsesWithIf(local_inter_warp_idx, [&](Use &U) {
auto *Instr = dyn_cast<Instruction>(U.getUser());
return Instr->getParent()->getParent()->getName().str() == func_name;
});
for (auto BB = F->begin(); BB != F->end(); ++BB) { for (auto BB = F->begin(); BB != F->end(); ++BB) {
for (auto BI = BB->begin(); BI != BB->end(); BI++) { for (auto BI = BB->begin(); BI != BB->end(); BI++) {
if (auto Load = dyn_cast<LoadInst>(BI)) { if (auto Load = dyn_cast<LoadInst>(BI)) {
auto load_from = Load->getOperand(0); auto load_from = Load->getOperand(0);
if (load_from == F->getParent()->getGlobalVariable("block_size")) {
Load->replaceAllUsesWith(ConstantInt::get(
I32, block_dim[0] * block_dim[1] * block_dim[2]));
need_remove.push_back(Load);
}
} else if (auto Call = dyn_cast<CallInst>(BI)) { } else if (auto Call = dyn_cast<CallInst>(BI)) {
if (Call->getCalledFunction()) { if (Call->getCalledFunction()) {
auto func_name = Call->getCalledFunction()->getName().str(); auto func_name = Call->getCalledFunction()->getName().str();
if (func_name == "llvm.nvvm.read.ptx.sreg.tid.x") { if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.x" ||
func_name ==
"_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv") {
auto block_size_addr = M->getGlobalVariable("block_size_x");
IRBuilder<> builder(context);
builder.SetInsertPoint(Call);
auto val = builder.CreateLoad(block_size_addr);
Call->replaceAllUsesWith(val);
need_remove.push_back(Call);
} else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.y") {
auto block_size_addr = M->getGlobalVariable("block_size_y");
IRBuilder<> builder(context);
builder.SetInsertPoint(Call);
auto val = builder.CreateLoad(block_size_addr);
Call->replaceAllUsesWith(val);
need_remove.push_back(Call);
} else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.z") {
auto block_size_addr = M->getGlobalVariable("block_size_z");
IRBuilder<> builder(context);
builder.SetInsertPoint(Call);
auto val = builder.CreateLoad(block_size_addr);
Call->replaceAllUsesWith(val);
need_remove.push_back(Call);
} else if (func_name == "llvm.nvvm.read.ptx.sreg.tid.x" ||
func_name == "_ZN26__cuda_builtin_threadIdx_t17__fetch_"
"builtin_xEv") {
// replace it by warp_id // replace it by warp_id
IRBuilder<> builder(context); IRBuilder<> builder(context);
builder.SetInsertPoint(Call); builder.SetInsertPoint(Call);
@ -234,12 +312,11 @@ void replace_built_in_function(llvm::Module *M, int *grid_dim, int *block_dim) {
thread_idx = builder.CreateBinOp( thread_idx = builder.CreateBinOp(
Instruction::Add, builder.CreateLoad(local_intra_warp_idx), Instruction::Add, builder.CreateLoad(local_intra_warp_idx),
thread_idx, "thread_idx"); thread_idx, "thread_idx");
if (block_dim[1] != 1 || block_dim[2] != 1) {
printf("block y: %d block z: %d\n", block_dim[1], block_dim[2]); thread_idx = builder.CreateBinOp(
thread_idx = builder.CreateBinOp( Instruction::SRem, thread_idx,
Instruction::SRem, thread_idx, builder.CreateLoad(M->getGlobalVariable("block_size_x")),
ConstantInt::get(I32, block_dim[0]), "thread_id_x"); "thread_id_x");
}
Call->replaceAllUsesWith(thread_idx); Call->replaceAllUsesWith(thread_idx);
need_remove.push_back(Call); need_remove.push_back(Call);
@ -257,63 +334,61 @@ void replace_built_in_function(llvm::Module *M, int *grid_dim, int *block_dim) {
// tidy = tid / block_dim.x // tidy = tid / block_dim.x
thread_idx = builder.CreateBinOp( thread_idx = builder.CreateBinOp(
Instruction::SDiv, thread_idx, Instruction::SDiv, thread_idx,
ConstantInt::get(I32, block_dim[0]), builder.CreateLoad(M->getGlobalVariable("block_size_x")),
// builder.CreateLoad(M->getGlobalVariable("block_size_x")),
"thread_id_y"); "thread_id_y");
Call->replaceAllUsesWith(thread_idx); Call->replaceAllUsesWith(thread_idx);
need_remove.push_back(Call); need_remove.push_back(Call);
} else if (func_name == "llvm.nvvm.read.ptx.sreg.tid.z") { } else if (func_name == "llvm.nvvm.read.ptx.sreg.tid.z") {
printf("[WARNING] We DO NOT support multi-dim block\n"); printf("[WARNING] We DO NOT support triple-dim block\n");
exit(1);
auto zero = ConstantInt::get(I32, 0); auto zero = ConstantInt::get(I32, 0);
Call->replaceAllUsesWith(zero); Call->replaceAllUsesWith(zero);
need_remove.push_back(Call); need_remove.push_back(Call);
} else if (func_name == "llvm.nvvm.read.ptx.sreg.ctaid.x") { } else if (func_name == "llvm.nvvm.read.ptx.sreg.ctaid.x" ||
auto block_index_addr = M->getGlobalVariable("block_index"); func_name == "_ZN25__cuda_builtin_blockIdx_t17__fetch_"
"builtin_xEv") {
auto block_index_addr = M->getGlobalVariable("block_index_x");
IRBuilder<> builder(context); IRBuilder<> builder(context);
builder.SetInsertPoint(Call); builder.SetInsertPoint(Call);
auto block_idx = builder.CreateLoad(block_index_addr); auto block_idx = builder.CreateLoad(block_index_addr);
Call->replaceAllUsesWith(block_idx); Call->replaceAllUsesWith(block_idx);
need_remove.push_back(Call); need_remove.push_back(Call);
} else if (func_name == "llvm.nvvm.read.ptx.sreg.ctaid.y" || } else if (func_name == "llvm.nvvm.read.ptx.sreg.ctaid.y") {
func_name == "llvm.nvvm.read.ptx.sreg.ctaid.z") { auto block_index_addr = M->getGlobalVariable("block_index_y");
printf("[WARNING We DO NOT support multi-dim grid\n");
auto zero = ConstantInt::get(I32, 0);
Call->replaceAllUsesWith(zero);
need_remove.push_back(Call);
} else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.x") {
auto block_size_addr = M->getGlobalVariable("block_size_x");
IRBuilder<> builder(context); IRBuilder<> builder(context);
builder.SetInsertPoint(Call); builder.SetInsertPoint(Call);
auto block_size = ConstantInt::get(I32, block_dim[0]); auto block_idx = builder.CreateLoad(block_index_addr);
Call->replaceAllUsesWith(block_size); Call->replaceAllUsesWith(block_idx);
need_remove.push_back(Call); need_remove.push_back(Call);
} else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.y") { } else if (func_name == "llvm.nvvm.read.ptx.sreg.ctaid.z") {
auto block_size_addr = M->getGlobalVariable("block_size_y"); auto block_index_addr = M->getGlobalVariable("block_index_z");
IRBuilder<> builder(context); IRBuilder<> builder(context);
builder.SetInsertPoint(Call); builder.SetInsertPoint(Call);
auto block_size = ConstantInt::get(I32, block_dim[1]); auto block_idx = builder.CreateLoad(block_index_addr);
Call->replaceAllUsesWith(block_size); Call->replaceAllUsesWith(block_idx);
need_remove.push_back(Call); need_remove.push_back(Call);
} else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.z") { } else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.x" ||
auto block_size_addr = M->getGlobalVariable("block_size_z"); func_name == "_ZN24__cuda_builtin_gridDim_t17__fetch_"
"builtin_xEv") {
auto grid_size_addr = M->getGlobalVariable("grid_size_x");
IRBuilder<> builder(context); IRBuilder<> builder(context);
builder.SetInsertPoint(Call); builder.SetInsertPoint(Call);
auto block_size = ConstantInt::get(I32, block_dim[2]); auto grid_size = builder.CreateLoad(grid_size_addr);
Call->replaceAllUsesWith(block_size);
need_remove.push_back(Call);
} else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.x") {
auto grid_size_addr = M->getGlobalVariable("grid_size");
IRBuilder<> builder(context);
builder.SetInsertPoint(Call);
auto grid_size = ConstantInt::get(I32, grid_dim[0]);
Call->replaceAllUsesWith(grid_size); Call->replaceAllUsesWith(grid_size);
need_remove.push_back(Call); need_remove.push_back(Call);
} else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.y" || } else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.y") {
func_name == "llvm.nvvm.read.ptx.sreg.nctaid.z") { auto grid_size_addr = M->getGlobalVariable("grid_size_y");
printf("[WARNING We DO NOT support multi-dim grid\n"); IRBuilder<> builder(context);
auto one = ConstantInt::get(I32, 1); builder.SetInsertPoint(Call);
Call->replaceAllUsesWith(one); auto grid_size = builder.CreateLoad(grid_size_addr);
Call->replaceAllUsesWith(grid_size);
need_remove.push_back(Call);
} else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.z") {
auto grid_size_addr = M->getGlobalVariable("grid_size_z");
IRBuilder<> builder(context);
builder.SetInsertPoint(Call);
auto grid_size = builder.CreateLoad(grid_size_addr);
Call->replaceAllUsesWith(grid_size);
need_remove.push_back(Call); need_remove.push_back(Call);
} }
} }
@ -334,6 +409,98 @@ void replace_built_in_function(llvm::Module *M, int *grid_dim, int *block_dim) {
} }
} }
} }
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
Function *F = &(*i);
for (auto BB = F->begin(); BB != F->end(); ++BB) {
for (auto BI = BB->begin(); BI != BB->end(); BI++) {
if (auto Call = dyn_cast<CallInst>(BI)) {
if (Call->getCalledFunction()) {
auto func_name = Call->getCalledFunction()->getName().str();
auto callFn = Call->getCalledFunction();
if (func_name == "vprintf") {
/*
* replace CUDA's printf to C's printf
* CUDA:
* %0 = tail call i32 @vprintf(i8* getelementptr inbounds ([19 x
* i8], [19 x i8]* @.str, i64 0, i64 0), i8* null)
* C: %call1 = call i32 (i8*, ...) @printf(i8* getelementptr
* inbounds ([45 x i8], [45 x i8]* @.str.1, i64 0, i64 0))
*/
// find/create C's printf function
std::vector<llvm::Type *> args;
args.push_back(llvm::Type::getInt8PtrTy(context));
llvm::FunctionType *printfType =
FunctionType::get(I32, args, true);
llvm::FunctionCallee _f =
M->getOrInsertFunction("printf", printfType);
llvm::Function *func_printf =
llvm::cast<llvm::Function>(_f.getCallee());
// construct argument(s)
std::vector<Value *> printf_args;
// first argument is same between CUDA and C
auto placeholder = Call->getArgOperand(0);
printf_args.push_back(placeholder);
// insert arguments
auto compressed_args = Call->getArgOperand(1);
if (auto BC = dyn_cast<BitCastInst>(compressed_args)) {
auto src_alloc = BC->getOperand(0);
auto SrcPointTy =
dyn_cast<PointerType>(BC->getOperand(0)->getType());
auto SrcTy = SrcPointTy->getElementType();
// reverse the bitcast
auto reverse_BC = new BitCastInst(BC, SrcPointTy, "", Call);
assert(SrcTy->isStructTy() == 1);
auto StructTy = dyn_cast<StructType>(SrcTy);
for (int i = 0; i < StructTy->getNumElements(); i++) {
std::vector<Value *> Indices;
Indices.push_back(ConstantInt::get(I32, 0));
Indices.push_back(ConstantInt::get(I32, i));
auto new_GEP = GetElementPtrInst::Create(NULL, // Pointee type
src_alloc, // Alloca
Indices, // Indices
"", Call);
auto new_load = new LoadInst(new_GEP, "", Call);
printf_args.push_back(new_load);
}
}
auto c_printf_inst =
llvm::CallInst::Create(func_printf, printf_args, "", Call);
// insert
Call->replaceAllUsesWith(c_printf_inst);
need_remove.push_back(Call);
} else if (func_name == "__nv_fast_log2f" ||
func_name == "__nv_log2f" ||
func_name == "__nv_fast_powf" ||
func_name == "__nv_powf" || func_name == "__nv_logf" ||
func_name == "__nv_expf" || func_name == "__nv_fabsf" ||
func_name == "__nv_log10f" ||
func_name == "__nv_fmodf" || func_name == "__nv_sqrt" ||
func_name == "__nv_sqrtf" || func_name == "__nv_exp" ||
func_name == "__nv_isnanf" ||
func_name == "__nv_isinff" || func_name == "__nv_powi" ||
func_name == "__nv_powif") {
Call->getCalledFunction()->deleteBody();
} else if (func_name == "llvm.nvvm.fma.rn.d") {
Call->getCalledFunction()->setName("__nvvm_fma_rn_d");
} else if (func_name == "llvm.nvvm.d2i.lo") {
Call->getCalledFunction()->setName("__nvvm_d2i_lo");
} else if (func_name == "llvm.nvvm.d2i.hi") {
Call->getCalledFunction()->setName("__nvvm_d2i_hi");
} else if (func_name == "llvm.nvvm.add.rn.d") {
Call->getCalledFunction()->setName("__nvvm_add_rn_d");
} else if (func_name == "llvm.nvvm.lohi.i2d") {
Call->getCalledFunction()->setName("__nvvm_lohi_i2d");
} else if (func_name == "llvm.nvvm.fabs.f") {
Call->getCalledFunction()->setName("__nvvm_fabs_f");
} else if (func_name == "llvm.nvvm.mul24.i") {
Call->getCalledFunction()->setName("__nvvm_mul24_i");
}
}
}
}
}
}
for (auto inst : need_remove) { for (auto inst : need_remove) {
inst->eraseFromParent(); inst->eraseFromParent();
@ -382,6 +549,8 @@ bool has_warp_barrier(llvm::BasicBlock *B) {
Instruction *inst = &(*i); Instruction *inst = &(*i);
llvm::CallInst *Call = llvm::dyn_cast<llvm::CallInst>(inst); llvm::CallInst *Call = llvm::dyn_cast<llvm::CallInst>(inst);
if (Call) { if (Call) {
if (Call->isInlineAsm())
continue;
auto func_name = Call->getCalledFunction()->getName().str(); auto func_name = Call->getCalledFunction()->getName().str();
if (func_name == "llvm.nvvm.bar.warp.sync") { if (func_name == "llvm.nvvm.bar.warp.sync") {
return true; return true;
@ -396,6 +565,8 @@ bool has_barrier(llvm::BasicBlock *B) {
Instruction *inst = &(*i); Instruction *inst = &(*i);
llvm::CallInst *Call = llvm::dyn_cast<llvm::CallInst>(inst); llvm::CallInst *Call = llvm::dyn_cast<llvm::CallInst>(inst);
if (Call) { if (Call) {
if (Call->isInlineAsm())
continue;
auto func_name = Call->getCalledFunction()->getName().str(); auto func_name = Call->getCalledFunction()->getName().str();
if (func_name == "llvm.nvvm.barrier0" || if (func_name == "llvm.nvvm.barrier0" ||
func_name == "llvm.nvvm.bar.warp.sync" || func_name == "llvm.nvvm.bar.warp.sync" ||
@ -412,6 +583,8 @@ bool has_block_barrier(llvm::BasicBlock *B) {
Instruction *inst = &(*i); Instruction *inst = &(*i);
llvm::CallInst *Call = llvm::dyn_cast<llvm::CallInst>(inst); llvm::CallInst *Call = llvm::dyn_cast<llvm::CallInst>(inst);
if (Call) { if (Call) {
if (Call->isInlineAsm())
continue;
auto func_name = Call->getCalledFunction()->getName().str(); auto func_name = Call->getCalledFunction()->getName().str();
if (func_name == "llvm.nvvm.barrier0" || if (func_name == "llvm.nvvm.barrier0" ||
func_name == "llvm.nvvm.barrier.sync") { func_name == "llvm.nvvm.barrier.sync") {
@ -478,3 +651,21 @@ bool find_barrier_in_region(llvm::BasicBlock *start, llvm::BasicBlock *end) {
} }
return 0; return 0;
} }
/*
Print IR to String Output for Debugging Purposes
*/
// void printModule(llvm::Module *M) {
// std::string str;
// llvm::raw_string_ostream ss(str);
// std::cout << "### Printing Module ###" << std::endl;
// for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
// Function *F = &(*i);
// auto func_name = F->getName().str();
// std::cout << func_name << std::endl;
// for (Function::iterator b = F->begin(); b != F->end(); ++b) {
// BasicBlock *B = &(*b);
// errs() << *B;
// }
// }
// }

View File

@ -44,6 +44,8 @@ void handle_warp_vote(llvm::Module *M) {
for (Function::iterator E = F->end(); I != E; ++I) { for (Function::iterator E = F->end(); I != E; ++I) {
for (BasicBlock::iterator BI = I->begin(); BI != I->end(); BI++) { for (BasicBlock::iterator BI = I->begin(); BI != I->end(); BI++) {
if (CallInst *vote_any_sync = dyn_cast<CallInst>(BI)) { if (CallInst *vote_any_sync = dyn_cast<CallInst>(BI)) {
if (vote_any_sync->isInlineAsm())
continue;
auto func_name = vote_any_sync->getCalledFunction()->getName(); auto func_name = vote_any_sync->getCalledFunction()->getName();
if (func_name == "llvm.nvvm.vote.any.sync" || if (func_name == "llvm.nvvm.vote.any.sync" ||
func_name == "llvm.nvvm.vote.all.sync") { func_name == "llvm.nvvm.vote.all.sync") {

View File

@ -1,82 +0,0 @@
#include <assert.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#define NUM_WARP 2
#define NUM_BLOCK 1
int block_size = 32 * NUM_WARP;
int block_size_x = block_size;
int block_size_y = 1;
int block_size_z = 1;
__thread int block_index = 0;
int grid_size = NUM_BLOCK;
extern "C" {
void *_Z7reduce0PiS_j_wrapper(void *);
__thread int warp_shfl[32];
}
/*
 * pthread entry point. `p` is a table of pointers; slot 3 points to the
 * block id, which is stored in the thread-local block_index before the
 * translated kernel wrapper is called with the same table.
 */
void *wrap(void *p) {
  int **packed = (int **)p;
  int *bid_cell = (int *)packed[3];
  block_index = *bid_cell;
  _Z7reduce0PiS_j_wrapper(p);
  return NULL;
}
/*
 * Build the argument table for one block. Every argument is copied into its
 * own heap cell and the returned 4-entry table stores the cells' addresses:
 *   [0] -> g_idata, [1] -> g_odata, [2] -> n, [3] -> bid
 * Nothing allocated here is freed by this function.
 */
void *gen_input(int bid, int *g_idata, int *g_odata, unsigned int n) {
  int **input_cell = new int *(g_idata);
  int **output_cell = new int *(g_odata);
  unsigned int *count_cell = new unsigned int(n);
  int *block_cell = new int(bid);

  int **slots = new int *[4];
  slots[0] = (int *)input_cell;
  slots[1] = (int *)output_cell;
  slots[2] = (int *)count_cell;
  slots[3] = block_cell;
  return (void *)slots;
}
/*
 * Test driver: fills the input with 0..size-1, runs one pthread per block
 * through wrap(), then checks that the first output element equals the sum
 * of the input and prints PASS.
 */
int main(int argc, char *argv[]) {
  const int size = block_size * NUM_BLOCK;
  int *g_idata = new int[size * 2];
  int *res = new int[size];
  for (int idx = 0; idx < size; ++idx)
    g_idata[idx] = idx;

  pthread_t threads[NUM_BLOCK];
  void *inp[NUM_BLOCK];
  // All argument tables are prepared before the first thread is started.
  for (long blk = 0; blk < NUM_BLOCK; ++blk)
    inp[blk] = gen_input(blk, g_idata, res, size);
  for (long blk = 0; blk < NUM_BLOCK; ++blk)
    pthread_create(&threads[blk], NULL, wrap, inp[blk]);
  for (long blk = 0; blk < NUM_BLOCK; ++blk)
    pthread_join(threads[blk], NULL);

  // Reference value computed on the host.
  int gold = 0;
  for (int idx = 0; idx < size; ++idx)
    gold += g_idata[idx];

  assert(*res == gold && "Incorrect res\n");
  printf("PASS\n");
  pthread_exit(NULL);
}

View File

@ -1,150 +0,0 @@
; ModuleID = 'kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc'
source_filename = "kernel.cu"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
@_ZZ7reduce0PiS_jE5sdata = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
; Function Attrs: nounwind
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) local_unnamed_addr #0 {
entry:
ret i32 999
}
; Function Attrs: nounwind
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) local_unnamed_addr #0 {
entry:
ret i32 999
}
; Function Attrs: nounwind
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) local_unnamed_addr #0 {
entry:
ret i32 999
}
; Function Attrs: nounwind
define weak dso_local i32 @cudaGetDevice(i32* %device) local_unnamed_addr #0 {
entry:
ret i32 999
}
; Function Attrs: nounwind
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) local_unnamed_addr #0 {
entry:
ret i32 999
}
; Function Attrs: nounwind
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) local_unnamed_addr #0 {
entry:
ret i32 999
}
; Function Attrs: convergent nounwind
define dso_local void @_Z7reduce0PiS_j(i32* nocapture readonly %g_idata, i32* nocapture %g_odata, i32 %n) local_unnamed_addr #1 {
entry:
%0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #4, !range !10
%1 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #4, !range !11
%2 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #4, !range !12
%mul = mul i32 %2, %1
%add = add i32 %mul, %0
%cmp = icmp ult i32 %add, %n
br i1 %cmp, label %cond.true, label %cond.end
cond.true: ; preds = %entry
%idxprom = zext i32 %add to i64
%arrayidx = getelementptr inbounds i32, i32* %g_idata, i64 %idxprom
%3 = load i32, i32* %arrayidx, align 4, !tbaa !13
br label %cond.end
cond.end: ; preds = %entry, %cond.true
%cond = phi i32 [ %3, %cond.true ], [ 0, %entry ]
%idxprom5 = zext i32 %0 to i64
%arrayidx635 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @_ZZ7reduce0PiS_jE5sdata, i64 0, i64 %idxprom5
%arrayidx6 = addrspacecast i32 addrspace(3)* %arrayidx635 to i32*
store i32 %cond, i32* %arrayidx6, align 4, !tbaa !13
tail call void @llvm.nvvm.barrier.sync(i32 0) #4
%cmp839 = icmp ugt i32 %2, 1
br i1 %cmp839, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %if.end, %cond.end
%cmp18 = icmp eq i32 %0, 0
br i1 %cmp18, label %if.then19, label %if.end23
for.body: ; preds = %cond.end, %if.end
%s.040 = phi i32 [ %mul9, %if.end ], [ 1, %cond.end ]
%mul9 = shl nuw nsw i32 %s.040, 1
%rem = urem i32 %0, %mul9
%cmp10 = icmp eq i32 %rem, 0
br i1 %cmp10, label %if.then, label %if.end
if.then: ; preds = %for.body
%add11 = add i32 %s.040, %0
%idxprom12 = zext i32 %add11 to i64
%arrayidx1336 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @_ZZ7reduce0PiS_jE5sdata, i64 0, i64 %idxprom12
%arrayidx13 = addrspacecast i32 addrspace(3)* %arrayidx1336 to i32*
%4 = load i32, i32* %arrayidx13, align 4, !tbaa !13
%5 = load i32, i32* %arrayidx6, align 4, !tbaa !13
%add16 = add nsw i32 %5, %4
store i32 %add16, i32* %arrayidx6, align 4, !tbaa !13
br label %if.end
if.end: ; preds = %if.then, %for.body
tail call void @llvm.nvvm.barrier.sync(i32 0) #4
%cmp8 = icmp ult i32 %mul9, %2
br i1 %cmp8, label %for.body, label %for.cond.cleanup
if.then19: ; preds = %for.cond.cleanup
%idxprom21 = zext i32 %1 to i64
%arrayidx22 = getelementptr inbounds i32, i32* %g_odata, i64 %idxprom21
%6 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* addrspacecast ([64 x i32] addrspace(3)* @_ZZ7reduce0PiS_jE5sdata to [64 x i32]*), i64 0, i64 0), align 4, !tbaa !13
store i32 %6, i32* %arrayidx22, align 4, !tbaa !13
br label %if.end23
if.end23: ; preds = %if.then19, %for.cond.cleanup
ret void
}
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2
; Function Attrs: convergent nounwind
declare void @llvm.nvvm.barrier.sync(i32) #3
attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind readnone }
attributes #3 = { convergent nounwind }
attributes #4 = { nounwind }
!llvm.module.flags = !{!0, !1, !2}
!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
!llvm.ident = !{!8}
!nvvmir.version = !{!9}
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
!3 = !{void (i32*, i32*, i32)* @_Z7reduce0PiS_j, !"kernel", i32 1}
!4 = !{null, !"align", i32 8}
!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
!6 = !{null, !"align", i32 16}
!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
!9 = !{i32 1, i32 4}
!10 = !{i32 0, i32 1024}
!11 = !{i32 0, i32 2147483647}
!12 = !{i32 1, i32 1025}
!13 = !{!14, !14, i64 0}
!14 = !{!"int", !15, i64 0}
!15 = !{!"omnipotent char", !16, i64 0}
!16 = !{!"Simple C++ TBAA"}

View File

@ -1,6 +0,0 @@
#!/bin/bash
# Build and run this test case: assemble the NVVM IR, translate the kernel,
# compile it to an object file, link it with the host driver and run it.
#
# Stop at the first failing step so that a stale kernel.bc / kernel.o / test
# binary left over from an earlier run is never executed after a broken build.
set -e

llvm-as kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
# NOTE(review): the six trailing numbers are presumably the grid (1 1 1) and
# block (64 1 1) dimensions -- confirm against kernelTranslator's usage.
../../../build/compilation/kernelTranslator kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc 1 1 1 64 1 1
llc --filetype=obj kernel.bc
g++ host.cpp kernel.o -lpthread -o test
./test

View File

@ -1,82 +0,0 @@
#include <assert.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
// Launch configuration for this test: NUM_BLOCK blocks, each made of
// NUM_WARP groups of 32 threads.
#define NUM_WARP 2
#define NUM_BLOCK 1
// Global launch dimensions. block_size_x/y/z and grid_size are not read in
// this file; presumably the translated kernel object links against them --
// TODO confirm.
int block_size = 32 * NUM_WARP;
int block_size_x = block_size;
int block_size_y = 1;
int block_size_z = 1;
// Thread-local block id; wrap() sets it before calling the kernel wrapper.
__thread int block_index = 0;
int grid_size = NUM_BLOCK;
extern "C" {
// Kernel entry point, defined outside this file (linked from kernel.o by
// run.sh). It receives the packed argument array built by gen_input().
void *_Z7reduce5PiS_j_wrapper(void *);
// Thread-local array of 32 ints with C linkage. Not used in this file;
// presumably scratch space for the translated warp shuffles -- TODO confirm.
__thread int warp_shfl[32];
}
// pthread entry point for one block: publishes the block id stored in slot 3
// of the packed argument array, then runs the kernel wrapper on that array.
void *wrap(void *p) {
  int **packed = static_cast<int **>(p);
  const int *bid_slot = packed[3];
  block_index = *bid_slot;
  _Z7reduce5PiS_j_wrapper(p);
  return NULL;
}
// Packs the kernel arguments for one block into a heap-allocated array of
// four pointers. Every slot points at a freshly allocated copy of one value:
//   [0] -> int*      (g_idata)
//   [1] -> int*      (g_odata)
//   [2] -> unsigned  (n)
//   [3] -> int       (bid)
// Nothing here is freed; ownership passes to the caller.
void *gen_input(int bid, int *g_idata, int *g_odata, unsigned int n) {
  int **in_slot = new int *(g_idata);
  int **out_slot = new int *(g_odata);
  unsigned int *n_slot = new unsigned int(n);
  int *bid_slot = new int(bid);

  int **packed = new int *[4];
  packed[0] = reinterpret_cast<int *>(in_slot);
  packed[1] = reinterpret_cast<int *>(out_slot);
  packed[2] = reinterpret_cast<int *>(n_slot);
  packed[3] = bid_slot;
  return packed;
}
// Host driver: fills the input with 0..size-1, runs one pthread per block
// through wrap(), and checks that the first output element equals the sum of
// the inputs.
int main(int argc, char *argv[]) {
  (void)argc;
  (void)argv;

  const int size = block_size * NUM_BLOCK;
  // The input buffer is allocated at twice `size`, but only the first `size`
  // entries are filled below. Value-initialise both buffers so the unused
  // tail and the result never hold indeterminate values.
  int *g_idata = new int[size * 2]();
  int *res = new int[size]();
  for (int i = 0; i < size; i++) {
    g_idata[i] = i;
  }

  pthread_t threads[NUM_BLOCK];
  void *inp[NUM_BLOCK];
  for (long t = 0; t < NUM_BLOCK; t++) {
    inp[t] = gen_input(t, g_idata, res, size);
  }
  for (long t = 0; t < NUM_BLOCK; t++) {
    // A failed create would leave threads[t] unset and make the join below
    // undefined, so stop right away.
    const int rc = pthread_create(&threads[t], NULL, wrap, inp[t]);
    if (rc != 0) {
      fprintf(stderr, "pthread_create failed for block %ld (error %d)\n", t,
              rc);
      exit(EXIT_FAILURE);
    }
  }
  for (long t = 0; t < NUM_BLOCK; t++)
    pthread_join(threads[t], NULL);

  // Reference result computed on the host.
  int gold = 0;
  for (int i = 0; i < size; i++) {
    gold += g_idata[i];
  }
  assert(*res == gold && "Incorrect res\n");
  printf("PASS\n");
  pthread_exit(NULL);
}

View File

@ -1,179 +0,0 @@
; ModuleID = 'kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc'
source_filename = "kernel.cu"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
@_ZZ7reduce5PiS_jE5sdata = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
; Weak definitions of six CUDA runtime API entry points. Each one ignores its
; arguments and returns the constant 999. They are declared `weak`, so a
; non-weak definition of the same symbol takes precedence at link time.
; Function Attrs: nounwind
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) local_unnamed_addr #0 {
entry:
ret i32 999
}
; Function Attrs: nounwind
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) local_unnamed_addr #0 {
entry:
ret i32 999
}
; Function Attrs: nounwind
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) local_unnamed_addr #0 {
entry:
ret i32 999
}
; Function Attrs: nounwind
define weak dso_local i32 @cudaGetDevice(i32* %device) local_unnamed_addr #0 {
entry:
ret i32 999
}
; Function Attrs: nounwind
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) local_unnamed_addr #0 {
entry:
ret i32 999
}
; Function Attrs: nounwind
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) local_unnamed_addr #0 {
entry:
ret i32 999
}
; Kernel reduce5(int *g_idata, int *g_odata, unsigned n).
; Each thread loads up to two input elements, stores their sum in the 64-entry
; addrspace(3) array sdata, and after the barriers the threads with linear
; rank < 32 combine the two halves of sdata and finish with five
; shfl.sync.down steps. The thread with rank 0 stores the total to
; g_odata[ctaid.x].
; Function Attrs: convergent nounwind
define dso_local void @_Z7reduce5PiS_j(i32* nocapture readonly %g_idata, i32* nocapture %g_odata, i32 %n) local_unnamed_addr #1 {
entry:
; %0 = tid.x, %1 = ctaid.x
%0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #5, !range !10
%1 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #5, !range !11
; %add = ctaid.x * 128 + tid.x (the shift by 7 is the multiply by 128)
%mul = shl i32 %1, 7
%add = add i32 %mul, %0
; First element: g_idata[%add] when %add < n (unsigned compare), otherwise 0.
%cmp = icmp ult i32 %add, %n
br i1 %cmp, label %cond.true, label %cond.end
cond.true: ; preds = %entry
%idxprom = zext i32 %add to i64
%arrayidx = getelementptr inbounds i32, i32* %g_idata, i64 %idxprom
%2 = load i32, i32* %arrayidx, align 4, !tbaa !12
br label %cond.end
cond.end: ; preds = %entry, %cond.true
%cond = phi i32 [ %2, %cond.true ], [ 0, %entry ]
; Second element: g_idata[%add + 64], added only when %add + 64 < n.
%add4 = add i32 %add, 64
%cmp5 = icmp ult i32 %add4, %n
br i1 %cmp5, label %if.then, label %if.end
if.then: ; preds = %cond.end
%idxprom7 = zext i32 %add4 to i64
%arrayidx8 = getelementptr inbounds i32, i32* %g_idata, i64 %idxprom7
%3 = load i32, i32* %arrayidx8, align 4, !tbaa !12
%add9 = add nsw i32 %3, %cond
br label %if.end
if.end: ; preds = %if.then, %cond.end
; sdata[tid.x] = per-thread partial sum
%mySum.0 = phi i32 [ %add9, %if.then ], [ %cond, %cond.end ]
%idxprom10 = zext i32 %0 to i64
%arrayidx1150 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @_ZZ7reduce5PiS_jE5sdata, i64 0, i64 %idxprom10
%arrayidx11 = addrspacecast i32 addrspace(3)* %arrayidx1150 to i32*
store i32 %mySum.0, i32* %arrayidx11, align 4, !tbaa !12
; Four consecutive barriers on barrier id 0.
; NOTE(review): presumably what remains of reduction steps removed at compile
; time in the CUDA source -- confirm against kernel.cu.
tail call void @llvm.nvvm.barrier.sync(i32 0) #5
tail call void @llvm.nvvm.barrier.sync(i32 0) #5
tail call void @llvm.nvvm.barrier.sync(i32 0) #5
tail call void @llvm.nvvm.barrier.sync(i32 0) #5
; Linear thread rank = (tid.z * ntid.y + tid.y) * ntid.x + tid.x
%4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.z() #5, !range !16
%5 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.y() #5, !range !17
%mul.i.i52 = mul nuw nsw i32 %5, %4
%6 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #5, !range !17
%7 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.y() #5, !range !10
%mul39.i.i53 = add nuw nsw i32 %7, %mul.i.i52
%add.i.i54 = mul nuw nsw i32 %mul39.i.i53, %6
%add8.i.i55 = add nuw nsw i32 %add.i.i54, %0
; Only threads with rank < 32 take part in the final stage.
%cmp14 = icmp ult i32 %add8.i.i55, 32
br i1 %cmp14, label %if.then15, label %if.end32
if.then15: ; preds = %if.end
; Fold in the upper half of sdata: value = own sum + sdata[tid.x + 32]
%add16 = add nuw nsw i32 %0, 32
%idxprom17 = zext i32 %add16 to i64
%arrayidx1851 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @_ZZ7reduce5PiS_jE5sdata, i64 0, i64 %idxprom17
%arrayidx18 = addrspacecast i32 addrspace(3)* %arrayidx1851 to i32*
%8 = load i32, i32* %arrayidx18, align 4, !tbaa !12
%add19 = add nsw i32 %8, %mySum.0
; Shuffle-down steps with offsets 16, 8, 4, 2, 1; the first argument -1 is the
; member mask and the last argument is 31 in every call.
%9 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add19, i32 16, i32 31) #5
%add23 = add nsw i32 %9, %add19
%10 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add23, i32 8, i32 31) #5
%add23.1 = add nsw i32 %10, %add23
%11 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add23.1, i32 4, i32 31) #5
%add23.2 = add nsw i32 %11, %add23.1
%12 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add23.2, i32 2, i32 31) #5
%add23.3 = add nsw i32 %12, %add23.2
%13 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add23.3, i32 1, i32 31) #5
; The last add is done only by rank 0, which then writes g_odata[ctaid.x].
%cmp27 = icmp eq i32 %add8.i.i55, 0
br i1 %cmp27, label %if.then28, label %if.end32
if.then28: ; preds = %if.then15
%add23.4 = add nsw i32 %13, %add23.3
%idxprom30 = zext i32 %1 to i64
%arrayidx31 = getelementptr inbounds i32, i32* %g_odata, i64 %idxprom30
store i32 %add23.4, i32* %arrayidx31, align 4, !tbaa !12
br label %if.end32
if.end32: ; preds = %if.end, %if.then28, %if.then15
ret void
}
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.z() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #2
; Function Attrs: convergent nounwind
declare void @llvm.nvvm.barrier.sync(i32) #3
; Function Attrs: convergent inaccessiblememonly nounwind
declare i32 @llvm.nvvm.shfl.sync.down.i32(i32, i32, i32, i32) #4
attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind readnone }
attributes #3 = { convergent nounwind }
attributes #4 = { convergent inaccessiblememonly nounwind }
attributes #5 = { nounwind }
!llvm.module.flags = !{!0, !1, !2}
!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
!llvm.ident = !{!8}
!nvvmir.version = !{!9}
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
!3 = !{void (i32*, i32*, i32)* @_Z7reduce5PiS_j, !"kernel", i32 1}
!4 = !{null, !"align", i32 8}
!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
!6 = !{null, !"align", i32 16}
!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
!9 = !{i32 1, i32 4}
!10 = !{i32 0, i32 1024}
!11 = !{i32 0, i32 2147483647}
!12 = !{!13, !13, i64 0}
!13 = !{!"int", !14, i64 0}
!14 = !{!"omnipotent char", !15, i64 0}
!15 = !{!"Simple C++ TBAA"}
!16 = !{i32 0, i32 64}
!17 = !{i32 1, i32 1025}

View File

@ -1,6 +0,0 @@
#!/bin/bash
# Build and run this test case: assemble the NVVM IR, translate the kernel,
# compile it to an object file, link it with the host driver and run it.
#
# Stop at the first failing step so that a stale kernel.bc / kernel.o / test
# binary left over from an earlier run is never executed after a broken build.
set -e

llvm-as kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
# NOTE(review): the six trailing numbers are presumably the grid (1 1 1) and
# block (64 1 1) dimensions -- confirm against kernelTranslator's usage.
../../../build/compilation/kernelTranslator kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc 1 1 1 64 1 1
llc --filetype=obj kernel.bc
g++ host.cpp kernel.o -lpthread -o test
./test

View File

@ -1,11 +0,0 @@
#!/bin/sh
# Run every test case: each sub-directory of the current directory is entered
# and its run.sh is executed with bash.
for file in ./*
do
    if test -d "$file"
    then
        echo "executing $file"
        # Run inside a subshell so the working directory is restored even when
        # cd fails; run.sh is skipped instead of running in the wrong place.
        (cd "$file" && bash run.sh)
    fi
done

View File

@ -1,84 +0,0 @@
#include <assert.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
// Launch configuration for this test: NUM_BLOCK blocks of block_size threads
// working on N elements.
#define NUM_BLOCK 1
int N = 32;
// Global launch dimensions. They are not read in this file; presumably the
// translated kernel object links against them -- TODO confirm.
int block_size = 32;
int block_size_x = block_size;
int block_size_y = 1;
int block_size_z = 1;
// Thread-local block id; wrap() sets it before calling the kernel wrapper.
__thread int block_index = 0;
int grid_size = NUM_BLOCK;
extern "C" {
// Kernel entry point, defined outside this file (linked from kernel.o by
// run.sh). It receives the packed argument array built by gen_input().
void *_Z9vectorAddPKfS0_Pfi_wrapper(void *);
}
// pthread entry point for one block: publishes the block id stored in slot 4
// of the packed argument array, then runs the kernel wrapper on that array.
void *wrap(void *p) {
  int **packed = static_cast<int **>(p);
  const int *bid_slot = packed[4];
  block_index = *bid_slot;
  _Z9vectorAddPKfS0_Pfi_wrapper(p);
  return NULL;
}
// Packs the kernel arguments for one block into a heap-allocated array of
// five pointers. Every slot points at a freshly allocated copy of one value:
//   [0] -> float* (A)    [1] -> float* (B)    [2] -> float* (C)
//   [3] -> int    (N)    [4] -> int    (bid)
// Nothing here is freed; ownership passes to the caller.
void *gen_input(int bid, float *A, float *B, float *C, int N) {
  float **a_slot = new float *(A);
  float **b_slot = new float *(B);
  float **c_slot = new float *(C);
  int *n_slot = new int(N);
  int *bid_slot = new int(bid);

  int **packed = new int *[5];
  packed[0] = reinterpret_cast<int *>(a_slot);
  packed[1] = reinterpret_cast<int *>(b_slot);
  packed[2] = reinterpret_cast<int *>(c_slot);
  packed[3] = n_slot;
  packed[4] = bid_slot;
  return packed;
}
// Host driver: builds A = 0..N-1, B = 1, C = 0, runs one pthread per block
// through wrap(), and checks that C[i] == A[i] + B[i] for every element.
int main() {
  float *A = new float[N];
  float *B = new float[N];
  float *C = new float[N];
  for (int i = 0; i < N; i++) {
    A[i] = i;
    B[i] = 1;
    C[i] = 0;
  }

  pthread_t threads[NUM_BLOCK];
  for (long t = 0; t < NUM_BLOCK; t++) {
    void *inp = gen_input(t, A, B, C, N);
    // A failed create would leave threads[t] unset and make the join below
    // undefined, so stop right away.
    const int rc = pthread_create(&threads[t], NULL, wrap, inp);
    if (rc != 0) {
      fprintf(stderr, "pthread_create failed for block %ld (error %d)\n", t,
              rc);
      exit(EXIT_FAILURE);
    }
  }
  for (long t = 0; t < NUM_BLOCK; t++)
    pthread_join(threads[t], NULL);

  for (int i = 0; i < N; i++) {
    assert(C[i] == (A[i] + B[i]));
  }
  printf("PASS\n");
  /* Last thing that main() should do */
  pthread_exit(NULL);
}

View File

@ -1,86 +0,0 @@
; ModuleID = 'kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc'
source_filename = "kernel.cu"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
; Weak definitions of six CUDA runtime API entry points. Each one ignores its
; arguments and returns the constant 999. They are declared `weak`, so a
; non-weak definition of the same symbol takes precedence at link time.
; Function Attrs: nounwind
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) local_unnamed_addr #0 {
entry:
ret i32 999
}
; Function Attrs: nounwind
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) local_unnamed_addr #0 {
entry:
ret i32 999
}
; Function Attrs: nounwind
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) local_unnamed_addr #0 {
entry:
ret i32 999
}
; Function Attrs: nounwind
define weak dso_local i32 @cudaGetDevice(i32* %device) local_unnamed_addr #0 {
entry:
ret i32 999
}
; Function Attrs: nounwind
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) local_unnamed_addr #0 {
entry:
ret i32 999
}
; Function Attrs: nounwind
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) local_unnamed_addr #0 {
entry:
ret i32 999
}
; Kernel vectorAdd(const float *A, const float *B, float *C, int numElements).
; Each thread computes C[tid.x] = A[tid.x] + B[tid.x].
; Note that in this IR the index is tid.x alone (no block index is read) and
; %numElements is never used, so there is no bounds check on the accesses.
; Function Attrs: nofree nounwind
define dso_local void @_Z9vectorAddPKfS0_Pfi(float* nocapture readonly %A, float* nocapture readonly %B, float* nocapture %C, i32 %numElements) local_unnamed_addr #1 {
entry:
; %0 = tid.x, zero-extended to a 64-bit element index
%0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3, !range !10
%idxprom8 = zext i32 %0 to i64
%arrayidx = getelementptr inbounds float, float* %A, i64 %idxprom8
%1 = load float, float* %arrayidx, align 4, !tbaa !11
%arrayidx2 = getelementptr inbounds float, float* %B, i64 %idxprom8
%2 = load float, float* %arrayidx2, align 4, !tbaa !11
; The add carries the `contract` fast-math flag.
%add = fadd contract float %1, %2
%arrayidx4 = getelementptr inbounds float, float* %C, i64 %idxprom8
store float %add, float* %arrayidx4, align 4, !tbaa !11
ret void
}
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nofree nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind readnone }
attributes #3 = { nounwind }
!llvm.module.flags = !{!0, !1, !2}
!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
!llvm.ident = !{!8}
!nvvmir.version = !{!9}
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
!3 = !{void (float*, float*, float*, i32)* @_Z9vectorAddPKfS0_Pfi, !"kernel", i32 1}
!4 = !{null, !"align", i32 8}
!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
!6 = !{null, !"align", i32 16}
!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
!9 = !{i32 1, i32 4}
!10 = !{i32 0, i32 1024}
!11 = !{!12, !12, i64 0}
!12 = !{!"float", !13, i64 0}
!13 = !{!"omnipotent char", !14, i64 0}
!14 = !{!"Simple C++ TBAA"}

View File

@ -1,6 +0,0 @@
#!/bin/bash
# Build and run this test case: assemble the NVVM IR, translate the kernel,
# compile it to an object file, link it with the host driver and run it.
#
# Stop at the first failing step so that a stale kernel.bc / kernel.o / test
# binary left over from an earlier run is never executed after a broken build.
set -e

llvm-as kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll
# NOTE(review): the six trailing numbers are presumably the grid (1 1 1) and
# block (32 1 1) dimensions -- confirm against kernelTranslator's usage.
../../../build/compilation/kernelTranslator kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc 1 1 1 32 1 1
llc --filetype=obj kernel.bc
g++ host.cpp kernel.o -lpthread -o test
./test

Binary file not shown.

Before

Width:  |  Height:  |  Size: 109 KiB

View File

@ -1,11 +0,0 @@
# The workflow of CuPBoP
The workflow of CuPBoP is as follows:
![The workflow of executing CUDA applications on CuPBoP.](figures/workflow.png)
First, CuPBoP uses Clang to compile the CUDA source code into NVVM IR,
which consists of two parts: a host part and a kernel part.
In the next step, CuPBoP-compilation parses and transforms these NVVM IRs
to make them suitable for execution on specific architectures.
The CuPBoP-runtime compiles the transformed Host IR and executes the generated programs,
which will compile the transformed Kernel IR and
upload the compiled kernel programs to specific architectures.

View File

@ -0,0 +1,454 @@
#include "backprop.h"
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
//#define OPEN
/* Absolute value of x. The argument is evaluated twice (once in the test and
   once in the selected branch), so do not pass expressions with side
   effects. */
#define ABS(x) (((x) > 0.0) ? (x) : (-(x)))
/* Copy `len` bytes from `from` to `to`, one byte at a time.
   The macro expands to a plain brace-enclosed block (not do { } while (0)),
   so `if (c) fastcopy(a, b, n); else ...` does not parse; use it only as a
   stand-alone statement. Its locals are named _to, _from, _i and _l. */
#define fastcopy(to, from, len) \
{ \
register char *_to, *_from; \
register int _i, _l; \
_to = (char *)(to); \
_from = (char *)(from); \
_l = (len); \
for (_i = 0; _i < _l; _i++) \
*_to++ = *_from++; \
}
/*** Scale rand() by BIGRND to get a float; the result lies in 0.0 .. 1.0
     as long as RAND_MAX does not exceed BIGRND. ***/
float drnd() {
  float numerator = (float)rand();
  float denominator = (float)BIGRND;
  return (numerator / denominator);
}
/*** Map drnd() from 0.0 .. 1.0 onto -1.0 .. 1.0 ***/
float dpn1() {
  double scaled = drnd() * 2.0;
  return (scaled - 1.0);
}
/*** The squashing function. Currently, it's a sigmoid. ***/
float squash(x)
float x;
{
float m;
// x = -x;
// m = 1 + x + x*x/2 + x*x*x/6 + x*x*x*x/24 + x*x*x*x*x/120;
// return(1.0 / (1.0 + m));
return (1.0 / (1.0 + exp(-x)));
}
/*** Allocate an uninitialised array of n floats.
     Returns NULL (after printing a message) when malloc fails. ***/
float *alloc_1d_dbl(n)
int n;
{
  unsigned nbytes = (unsigned)(n * sizeof(float));
  float *buf = (float *)malloc(nbytes);
  if (buf != NULL) {
    return (buf);
  }
  printf("ALLOC_1D_DBL: Couldn't allocate array of floats\n");
  return (NULL);
}
/*** Allocate an m x n table of floats as m separately allocated rows.
     Returns NULL when the row-pointer array or any row cannot be allocated;
     rows allocated before the failure are released, so the caller never
     receives a table containing NULL rows. ***/
float **alloc_2d_dbl(m, n)
int m, n;
{
  int i;
  float **rows;
  rows = (float **)malloc((unsigned)(m * sizeof(float *)));
  if (rows == NULL) {
    printf("ALLOC_2D_DBL: Couldn't allocate array of dbl ptrs\n");
    return (NULL);
  }
  for (i = 0; i < m; i++) {
    rows[i] = alloc_1d_dbl(n);
    if (rows[i] == NULL) {
      /* alloc_1d_dbl has already reported the failure; undo earlier rows. */
      while (--i >= 0) {
        free(rows[i]);
      }
      free(rows);
      return (NULL);
    }
  }
  return (rows);
}
/*** Fill w[0..m][0..n] (both bounds inclusive) with rand() / RAND_MAX.
     The return type is now spelled out as void: the function never returned
     a value, and implicit int is not valid since C99. ***/
void bpnn_randomize_weights(w, m, n)
float **w;
int m, n;
{
  int i, j;
  for (i = 0; i <= m; i++) {
    for (j = 0; j <= n; j++) {
      w[i][j] = (float)rand() / RAND_MAX;
    }
  }
}
/*** Set w[0..m] (inclusive) to the constant 0.1. Despite the name no random
     numbers are drawn. The return type is now spelled out as void: the
     function never returned a value, and implicit int is not valid since
     C99. ***/
void bpnn_randomize_row(w, m)
float *w;
int m;
{
  int i;
  for (i = 0; i <= m; i++) {
    w[i] = 0.1;
  }
}
/*** Set w[0..m][0..n] (both bounds inclusive) to 0.0.
     The return type is now spelled out as void: the function never returned
     a value, and implicit int is not valid since C99. ***/
void bpnn_zero_weights(w, m, n)
float **w;
int m, n;
{
  int i, j;
  for (i = 0; i <= m; i++) {
    for (j = 0; j <= n; j++) {
      w[i][j] = 0.0;
    }
  }
}
/*** Print the seed and seed the C library random number generator with it.
     `seed` is now declared explicitly; an undeclared old-style parameter
     relies on implicit int, which is not valid since C99. ***/
void bpnn_initialize(seed)
int seed;
{
  printf("Random number generator seed: %d\n", seed);
  srand(seed);
}
/*** Allocate a BPNN and all of its arrays without initialising any weights.
     Every per-layer array gets one slot more than the layer's unit count.
     Returns NULL only when the BPNN struct itself cannot be allocated; the
     results of the individual array allocations are stored unchecked. ***/
BPNN *bpnn_internal_create(n_in, n_hidden, n_out)
int n_in, n_hidden, n_out;
{
  const int in_len = n_in + 1;
  const int hid_len = n_hidden + 1;
  const int out_len = n_out + 1;
  BPNN *net = (BPNN *)malloc(sizeof(BPNN));

  if (!net) {
    printf("BPNN_CREATE: Couldn't allocate neural network\n");
    return (NULL);
  }

  net->input_n = n_in;
  net->hidden_n = n_hidden;
  net->output_n = n_out;

  /* unit activations, error terms and target vector */
  net->input_units = alloc_1d_dbl(in_len);
  net->hidden_units = alloc_1d_dbl(hid_len);
  net->output_units = alloc_1d_dbl(out_len);
  net->hidden_delta = alloc_1d_dbl(hid_len);
  net->output_delta = alloc_1d_dbl(out_len);
  net->target = alloc_1d_dbl(out_len);

  /* weight tables and their previous-change (momentum) twins */
  net->input_weights = alloc_2d_dbl(in_len, hid_len);
  net->hidden_weights = alloc_2d_dbl(hid_len, out_len);
  net->input_prev_weights = alloc_2d_dbl(in_len, hid_len);
  net->hidden_prev_weights = alloc_2d_dbl(hid_len, out_len);

  return (net);
}
/*** Release a network: the six 1-d arrays, then every row of the four
     weight tables followed by the tables themselves, and finally the BPNN
     struct. `net` must not be NULL. ***/
void bpnn_free(net) BPNN *net;
{
  const int last_in = net->input_n;
  const int last_hid = net->hidden_n;
  int row;

  free(net->input_units);
  free(net->hidden_units);
  free(net->output_units);
  free(net->hidden_delta);
  free(net->output_delta);
  free(net->target);

  /* input-layer tables have rows 0 .. input_n */
  row = 0;
  while (row <= last_in) {
    free(net->input_weights[row]);
    free(net->input_prev_weights[row]);
    row++;
  }
  free(net->input_weights);
  free(net->input_prev_weights);

  /* hidden-layer tables have rows 0 .. hidden_n */
  row = 0;
  while (row <= last_hid) {
    free(net->hidden_weights[row]);
    free(net->hidden_prev_weights[row]);
    row++;
  }
  free(net->hidden_weights);
  free(net->hidden_prev_weights);

  free(net);
}
/*** Creates a new fully-connected network from scratch,
     with the given numbers of input, hidden, and output units.
     Threshold units are automatically included. The hidden weights are
     randomly initialized; the input weights are random too unless INITZERO
     is defined, in which case they are zeroed. The previous-change tables
     are zeroed and the target row is filled by bpnn_randomize_row().
     Space is also allocated for temporary storage (momentum weights,
     error computations, etc).
     Returns NULL when the network cannot be allocated.
***/
BPNN *bpnn_create(n_in, n_hidden, n_out)
int n_in, n_hidden, n_out;
{
  BPNN *newnet;
  newnet = bpnn_internal_create(n_in, n_hidden, n_out);
  if (newnet == NULL) {
    /* Allocation failed; do not touch the members below. */
    return (NULL);
  }
#ifdef INITZERO
  bpnn_zero_weights(newnet->input_weights, n_in, n_hidden);
#else
  bpnn_randomize_weights(newnet->input_weights, n_in, n_hidden);
#endif
  bpnn_randomize_weights(newnet->hidden_weights, n_hidden, n_out);
  bpnn_zero_weights(newnet->input_prev_weights, n_in, n_hidden);
  bpnn_zero_weights(newnet->hidden_prev_weights, n_hidden, n_out);
  bpnn_randomize_row(newnet->target, n_out);
  return (newnet);
}
/*** Forward pass for one layer.
     l1   - activations of the first layer, indices 0..n1 (l1[0] is overwritten)
     l2   - activations of the second layer; entries 1..n2 are written
     conn - weights, read as conn[k][j] for k in 0..n1 and j in 1..n2
     For every j in 1..n2: l2[j] = squash(sum over k of conn[k][j] * l1[k]).
     l2[0] is not written here. ***/
void bpnn_layerforward(l1, l2, conn, n1, n2) float *l1, *l2, **conn;
int n1, n2;
{
float sum;
int j, k;
/*** Set up thresholding unit ***/
l1[0] = 1.0;
#ifdef OPEN
/* Only compiled when OPEN is defined (it is commented out at the top of the
   file). NOTE(review): `sum` is reset and consumed inside every iteration, so
   reduction(+: sum) looks questionable -- verify before enabling OPEN. */
omp_set_num_threads(NUM_THREAD);
#pragma omp parallel for shared(conn, n1, n2, l1) private(k, j) reduction(+: sum) schedule(static)
#endif
/*** For each unit in second layer ***/
for (j = 1; j <= n2; j++) {
/*** Compute weighted sum of its inputs ***/
sum = 0.0;
for (k = 0; k <= n1; k++) {
sum += conn[k][j] * l1[k];
}
l2[j] = squash(sum);
}
}
// extern "C"
/*** Error terms of the output layer.
     For j in 1..nj: delta[j] = o * (1 - o) * (t - o) with o = output[j] and
     t = target[j]. The sum of the absolute delta values is stored in *err. ***/
void bpnn_output_error(delta, target, output, nj, err)
float *delta, *target, *output, *err;
int nj;
{
  float total = 0.0;
  int idx = 1;

  while (idx <= nj) {
    float actual = output[idx];
    float wanted = target[idx];
    float d = actual * (1.0 - actual) * (wanted - actual);

    delta[idx] = d;
    /* accumulate |d| */
    total += (d > 0.0) ? d : -d;
    idx++;
  }
  *err = total;
}
/*** Error terms of the hidden layer.
     For j in 1..nh: delta_h[j] = h * (1 - h) * sum_k(delta_o[k] * who[j][k])
     with h = hidden[j] and k in 1..no. The sum of the absolute delta_h
     values is stored in *err. ***/
void bpnn_hidden_error(delta_h, nh, delta_o, no, who, hidden, err)
float *delta_h, *delta_o, *hidden, **who, *err;
int nh, no;
{
  float total = 0.0;
  int unit = 1;

  while (unit <= nh) {
    float act = hidden[unit];
    float back = 0.0;
    float d;
    int out;

    /* error propagated back from the output layer */
    for (out = 1; out <= no; out++) {
      back += delta_o[out] * who[unit][out];
    }
    d = act * (1.0 - act) * back;
    delta_h[unit] = d;
    /* accumulate |d| */
    total += (d > 0.0) ? d : -d;
    unit++;
  }
  *err = total;
}
/*** Weight update with momentum.
     delta  - error terms of the upper layer, read at indices 1..ndelta
     ly     - activations of the lower layer, indices 0..nly (ly[0] is
              overwritten with 1.0)
     w      - weights, updated in place as w[k][j]
     oldw   - previous weight changes, replaced by the new changes
     For every j in 1..ndelta and k in 0..nly:
       change = ETA * delta[j] * ly[k] + MOMENTUM * oldw[k][j]
     `ndelta` and `nly` are now declared explicitly; undeclared old-style
     parameters rely on implicit int, which is not valid since C99. ***/
void bpnn_adjust_weights(delta, ndelta, ly, nly, w, oldw)
float *delta, *ly, **w, **oldw;
int ndelta, nly;
{
  float new_dw;
  int k, j;
  ly[0] = 1.0;
#ifdef OPEN
  /* The firstprivate clause used to name `momentum`, which is not declared
     anywhere (the code uses the MOMENTUM macro), so it has been dropped. */
  omp_set_num_threads(NUM_THREAD);
#pragma omp parallel for shared(oldw, w, delta) private(j, k, new_dw) \
    firstprivate(ndelta, nly)
#endif
  for (j = 1; j <= ndelta; j++) {
    for (k = 0; k <= nly; k++) {
      new_dw = ((ETA * delta[j] * ly[k]) + (MOMENTUM * oldw[k][j]));
      w[k][j] += new_dw;
      oldw[k][j] = new_dw;
    }
  }
}
/*** Run the network forward: input layer -> hidden layer -> output layer. ***/
void bpnn_feedforward(net) BPNN *net;
{
  const int n_input = net->input_n;
  const int n_hidden = net->hidden_n;
  const int n_output = net->output_n;

  /* input -> hidden */
  bpnn_layerforward(net->input_units, net->hidden_units, net->input_weights,
                    n_input, n_hidden);
  /* hidden -> output */
  bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights,
                    n_hidden, n_output);
}
/*** One training step: forward pass, error computation for the output and
     hidden layers, then weight adjustment for both weight tables.
     The summed absolute output error is stored in *eo and the summed
     absolute hidden error in *eh. ***/
void bpnn_train(net, eo, eh) BPNN *net;
float *eo, *eh;
{
  const int n_input = net->input_n;
  const int n_hidden = net->hidden_n;
  const int n_output = net->output_n;
  float err_output, err_hidden;

  /* forward pass */
  bpnn_layerforward(net->input_units, net->hidden_units, net->input_weights,
                    n_input, n_hidden);
  bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights,
                    n_hidden, n_output);

  /* error terms, output layer first because the hidden layer depends on it */
  bpnn_output_error(net->output_delta, net->target, net->output_units,
                    n_output, &err_output);
  bpnn_hidden_error(net->hidden_delta, n_hidden, net->output_delta, n_output,
                    net->hidden_weights, net->hidden_units, &err_hidden);
  *eo = err_output;
  *eh = err_hidden;

  /* weight updates: hidden->output table, then input->hidden table */
  bpnn_adjust_weights(net->output_delta, n_output, net->hidden_units, n_hidden,
                      net->hidden_weights, net->hidden_prev_weights);
  bpnn_adjust_weights(net->hidden_delta, n_hidden, net->input_units, n_input,
                      net->input_weights, net->input_prev_weights);
}
/*** Write a network to `filename` in the binary layout bpnn_read() expects:
       int n1, int n2, int n3                 (input, hidden, output counts)
       (n1+1) x (n2+1) floats                 input weights, row by row
       (n2+1) x (n3+1) floats                 hidden weights, row by row
     Prints a message and returns without writing when the file cannot be
     created.

     Fixes over the previous version: the three sizes were written as a single
     byte each, and the weight fwrite calls passed a byte count as the element
     count, which read four times past the end of the staging buffer. ***/
void bpnn_save(net, filename) BPNN *net;
char *filename;
{
  int n1, n2, n3, i;
  FILE *pFile;

  pFile = fopen(filename, "wb");
  if (pFile == NULL) {
    printf("BPNN_SAVE: Cannot create '%s'\n", filename);
    return;
  }

  n1 = net->input_n;
  n2 = net->hidden_n;
  n3 = net->output_n;
  printf("Saving %dx%dx%d network to '%s'\n", n1, n2, n3, filename);

  fwrite(&n1, sizeof(int), 1, pFile);
  fwrite(&n2, sizeof(int), 1, pFile);
  fwrite(&n3, sizeof(int), 1, pFile);

  /* Each row is a contiguous float array, so it can be written directly. */
  for (i = 0; i <= n1; i++) {
    fwrite(net->input_weights[i], sizeof(float), (size_t)(n2 + 1), pFile);
  }
  for (i = 0; i <= n2; i++) {
    fwrite(net->hidden_weights[i], sizeof(float), (size_t)(n3 + 1), pFile);
  }

  if (ferror(pFile)) {
    printf("BPNN_SAVE: Error while writing '%s'\n", filename);
  }
  fclose(pFile);
  return;
}
/* Read `nrows` rows of `ncols` floats each from fp into rows[0..nrows-1].
   Returns 1 on success and 0 on a short read. */
static int bpnn_read_rows(FILE *fp, float **rows, int nrows, int ncols)
{
  int i;
  for (i = 0; i < nrows; i++) {
    if (fread(rows[i], sizeof(float), (size_t)ncols, fp) != (size_t)ncols) {
      return 0;
    }
  }
  return 1;
}

/*** Load a network from `filename`. Expected layout:
       int n1, int n2, int n3                 (input, hidden, output counts)
       (n1+1) x (n2+1) floats                 input weights, row by row
       (n2+1) x (n3+1) floats                 hidden weights, row by row
     The previous-change tables of the returned network are zeroed.
     Returns NULL when the file cannot be opened, is truncated, holds a
     negative size, or the network cannot be allocated.

     Uses stdio (already included by this file) rather than open/read/close,
     which were called without any declaration in scope, and checks every
     read instead of ignoring short reads. ***/
BPNN *bpnn_read(filename)
char *filename;
{
  BPNN *net;
  FILE *fp;
  int n1, n2, n3;

  fp = fopen(filename, "rb");
  if (fp == NULL) {
    return (NULL);
  }
  printf("Reading '%s'\n", filename);

  if (fread(&n1, sizeof(int), 1, fp) != 1 ||
      fread(&n2, sizeof(int), 1, fp) != 1 ||
      fread(&n3, sizeof(int), 1, fp) != 1 ||
      n1 < 0 || n2 < 0 || n3 < 0) {
    fclose(fp);
    return (NULL);
  }

  net = bpnn_internal_create(n1, n2, n3);
  if (net == NULL) {
    fclose(fp);
    return (NULL);
  }
  printf("'%s' contains a %dx%dx%d network\n", filename, n1, n2, n3);

  printf("Reading input weights...");
  if (!bpnn_read_rows(fp, net->input_weights, n1 + 1, n2 + 1)) {
    fclose(fp);
    bpnn_free(net);
    return (NULL);
  }

  printf("Done\nReading hidden weights...");
  if (!bpnn_read_rows(fp, net->hidden_weights, n2 + 1, n3 + 1)) {
    fclose(fp);
    bpnn_free(net);
    return (NULL);
  }

  fclose(fp);
  printf("Done\n");

  bpnn_zero_weights(net->input_prev_weights, n1, n2);
  bpnn_zero_weights(net->hidden_prev_weights, n2, n3);
  return (net);
}

View File

@ -0,0 +1,50 @@
/* Public interface of the back-propagation network code: tuning constants,
   the BPNN network structure and the user-level entry points. */
#ifndef _BACKPROP_H_
#define _BACKPROP_H_
/* Divisor used by drnd() in backprop.c to scale rand() into 0.0 .. 1.0. */
#define BIGRND 0x7fffffff
/* NOTE(review): GPU and THREADS are not referenced in backprop.c; presumably
   they are consumed by the CUDA part of the benchmark -- confirm. */
#define GPU
#define THREADS 256
#define WIDTH 16 // shared memory width
#define HEIGHT 16 // shared memory height
#define ETA 0.3 // eta value
#define MOMENTUM 0.3 // momentum value
#define NUM_THREAD 4 // OpenMP threads
/* A three-layer network. bpnn_internal_create() allocates every per-layer
   array with one slot more than the layer's unit count, and the weight
   tables as (units + 1) rows of (next-layer units + 1) floats. */
typedef struct {
int input_n; /* number of input units */
int hidden_n; /* number of hidden units */
int output_n; /* number of output units */
float *input_units; /* the input units */
float *hidden_units; /* the hidden units */
float *output_units; /* the output units */
float *hidden_delta; /* storage for hidden unit error */
float *output_delta; /* storage for output unit error */
float *target; /* storage for target vector */
float **input_weights; /* weights from input to hidden layer */
float **hidden_weights; /* weights from hidden to output layer */
/*** The next two are for momentum ***/
float **input_prev_weights; /* previous change on input to hidden wgt */
float **hidden_prev_weights; /* previous change on hidden to output wgt */
} BPNN;
/*** User-level functions ***/
/* These are old-style declarations without parameter lists, so in C the
   compiler does not check the arguments at call sites. Argument lists as
   defined in backprop.c:
     bpnn_initialize(seed)
     bpnn_create(n_in, n_hidden, n_out)
     bpnn_free(net)
     bpnn_train(net, eo, eh)
     bpnn_feedforward(net)
     bpnn_save(net, filename)
     bpnn_read(filename) */
void bpnn_initialize();
BPNN *bpnn_create();
void bpnn_free();
void bpnn_train();
void bpnn_feedforward();
void bpnn_save();
BPNN *bpnn_read();
#endif

View File

@ -0,0 +1,615 @@
; Device-side LLVM IR module. Per source_filename it was produced from
; backprop_cuda.cu; the triple below targets nvptx64-nvidia-cuda.
; ModuleID = 'backprop_cuda-cuda-nvptx64-nvidia-cuda-sm_61.bc'
source_filename = "backprop_cuda.cu"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
; One-byte placeholder types behind the blockIdx / threadIdx objects.
%struct.__cuda_builtin_blockIdx_t = type { i8 }
%struct.__cuda_builtin_threadIdx_t = type { i8 }
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
; COMDAT groups for the linkonce_odr builtin accessors defined further down.
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv = comdat any
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv = comdat any
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
; Function-local statics of bpnn_layerforward_CUDA placed in addrspace(3):
; input_node is 16 floats, weight_matrix is a 16 x 16 float tile.
@_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE10input_node = internal addrspace(3) global [16 x float] undef, align 4
@_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix = internal addrspace(3) global [16 x [16 x float]] undef, align 4
; Key string passed to __nvvm_reflect by __nv_fast_log2f / __nv_fast_powf.
@"$str" = private addrspace(1) constant [11 x i8] c"__CUDA_FTZ\00"
; Weak definition of cudaMalloc inside the device module: it only spills %p
; and %s to stack slots and always returns the constant 999
; (presumably cudaErrorUnknown -- confirm against the CUDA headers).
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
entry:
%p.addr = alloca i8**, align 8
%s.addr = alloca i64, align 8
store i8** %p, i8*** %p.addr, align 8
store i64 %s, i64* %s.addr, align 8
ret i32 999
}
; Weak definition of cudaFuncGetAttributes: spills both arguments to stack
; slots, never writes through %p, and always returns 999.
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
entry:
%p.addr = alloca %struct.cudaFuncAttributes*, align 8
%c.addr = alloca i8*, align 8
store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
store i8* %c, i8** %c.addr, align 8
ret i32 999
}
; Weak definition of cudaDeviceGetAttribute: spills its three arguments to
; stack slots, never writes through %value, and always returns 999.
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
entry:
%value.addr = alloca i32*, align 8
%attr.addr = alloca i32, align 4
%device.addr = alloca i32, align 4
store i32* %value, i32** %value.addr, align 8
store i32 %attr, i32* %attr.addr, align 4
store i32 %device, i32* %device.addr, align 4
ret i32 999
}
; Weak definition of cudaGetDevice: spills %device to a stack slot, never
; writes through it, and always returns 999.
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
entry:
%device.addr = alloca i32*, align 8
store i32* %device, i32** %device.addr, align 8
ret i32 999
}
; Weak definition of cudaOccupancyMaxActiveBlocksPerMultiprocessor: spills its
; four arguments to stack slots, never writes through %numBlocks, and always
; returns 999.
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
entry:
%numBlocks.addr = alloca i32*, align 8
%func.addr = alloca i8*, align 8
%blockSize.addr = alloca i32, align 4
%dynamicSmemSize.addr = alloca i64, align 8
store i32* %numBlocks, i32** %numBlocks.addr, align 8
store i8* %func, i8** %func.addr, align 8
store i32 %blockSize, i32* %blockSize.addr, align 4
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
ret i32 999
}
; Weak definition of cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags:
; spills its five arguments to stack slots, never writes through %numBlocks,
; and always returns 999.
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
entry:
%numBlocks.addr = alloca i32*, align 8
%func.addr = alloca i8*, align 8
%blockSize.addr = alloca i32, align 4
%dynamicSmemSize.addr = alloca i64, align 8
%flags.addr = alloca i32, align 4
store i32* %numBlocks, i32** %numBlocks.addr, align 8
store i8* %func, i8** %func.addr, align 8
store i32 %blockSize, i32* %blockSize.addr, align 4
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
store i32 %flags, i32* %flags.addr, align 4
ret i32 999
}
; Unoptimised (optnone) IR for the kernel bpnn_layerforward_CUDA(float*,
; float*, float*, float*, int, int). Every local lives in an alloca slot and
; is reloaded before each use. The parameters %output_hidden_cuda and %in are
; stored to their slots but never loaded again.
; Function Attrs: convergent noinline nounwind optnone
define dso_local void @_Z22bpnn_layerforward_CUDAPfS_S_S_ii(float* %input_cuda, float* %output_hidden_cuda, float* %input_hidden_cuda, float* %hidden_partial_sum, i32 %in, i32 %hid) #0 {
entry:
; Stack slots for the six parameters and the locals.
%input_cuda.addr = alloca float*, align 8
%output_hidden_cuda.addr = alloca float*, align 8
%input_hidden_cuda.addr = alloca float*, align 8
%hidden_partial_sum.addr = alloca float*, align 8
%in.addr = alloca i32, align 4
%hid.addr = alloca i32, align 4
%by = alloca i32, align 4
%tx = alloca i32, align 4
%ty = alloca i32, align 4
%index = alloca i32, align 4
%index_in = alloca i32, align 4
%i = alloca i32, align 4
%power_two = alloca i32, align 4
store float* %input_cuda, float** %input_cuda.addr, align 8
store float* %output_hidden_cuda, float** %output_hidden_cuda.addr, align 8
store float* %input_hidden_cuda, float** %input_hidden_cuda.addr, align 8
store float* %hidden_partial_sum, float** %hidden_partial_sum.addr, align 8
store i32 %in, i32* %in.addr, align 4
store i32 %hid, i32* %hid.addr, align 4
; by = blockIdx.y; tx = threadIdx.x; ty = threadIdx.y
%call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #2
store i32 %call, i32* %by, align 4
%call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
store i32 %call1, i32* %tx, align 4
%call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #2
store i32 %call2, i32* %ty, align 4
; index = (hid + 1) * 16 * by + (hid + 1) * ty + tx + 1 + (hid + 1)
%0 = load i32, i32* %hid.addr, align 4
%add = add nsw i32 %0, 1
%mul = mul nsw i32 %add, 16
%1 = load i32, i32* %by, align 4
%mul3 = mul nsw i32 %mul, %1
%2 = load i32, i32* %hid.addr, align 4
%add4 = add nsw i32 %2, 1
%3 = load i32, i32* %ty, align 4
%mul5 = mul nsw i32 %add4, %3
%add6 = add nsw i32 %mul3, %mul5
%4 = load i32, i32* %tx, align 4
%add7 = add nsw i32 %add6, %4
%add8 = add nsw i32 %add7, 1
%5 = load i32, i32* %hid.addr, align 4
%add9 = add nsw i32 %5, 1
%add10 = add nsw i32 %add8, %add9
store i32 %add10, i32* %index, align 4
; index_in = 16 * by + ty + 1
%6 = load i32, i32* %by, align 4
%mul11 = mul nsw i32 16, %6
%7 = load i32, i32* %ty, align 4
%add12 = add nsw i32 %mul11, %7
%add13 = add nsw i32 %add12, 1
store i32 %add13, i32* %index_in, align 4
; if (tx == 0)
%8 = load i32, i32* %tx, align 4
%cmp = icmp eq i32 %8, 0
br i1 %cmp, label %if.then, label %if.end
if.then: ; preds = %entry
; input_node[ty] = input_cuda[index_in]
%9 = load float*, float** %input_cuda.addr, align 8
%10 = load i32, i32* %index_in, align 4
%idxprom = sext i32 %10 to i64
%arrayidx = getelementptr inbounds float, float* %9, i64 %idxprom
%11 = load float, float* %arrayidx, align 4
%12 = load i32, i32* %ty, align 4
%idxprom14 = sext i32 %12 to i64
%arrayidx15 = getelementptr inbounds [16 x float], [16 x float]* addrspacecast ([16 x float] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE10input_node to [16 x float]*), i64 0, i64 %idxprom14
store float %11, float* %arrayidx15, align 4
br label %if.end
if.end: ; preds = %if.then, %entry
; barrier, then weight_matrix[ty][tx] = input_hidden_cuda[index]
call void @llvm.nvvm.barrier0()
%13 = load float*, float** %input_hidden_cuda.addr, align 8
%14 = load i32, i32* %index, align 4
%idxprom16 = sext i32 %14 to i64
%arrayidx17 = getelementptr inbounds float, float* %13, i64 %idxprom16
%15 = load float, float* %arrayidx17, align 4
%16 = load i32, i32* %ty, align 4
%idxprom18 = sext i32 %16 to i64
%arrayidx19 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom18
%17 = load i32, i32* %tx, align 4
%idxprom20 = sext i32 %17 to i64
%arrayidx21 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx19, i64 0, i64 %idxprom20
store float %15, float* %arrayidx21, align 4
; barrier, then weight_matrix[ty][tx] = weight_matrix[ty][tx] * input_node[ty]
call void @llvm.nvvm.barrier0()
%18 = load i32, i32* %ty, align 4
%idxprom22 = sext i32 %18 to i64
%arrayidx23 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom22
%19 = load i32, i32* %tx, align 4
%idxprom24 = sext i32 %19 to i64
%arrayidx25 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx23, i64 0, i64 %idxprom24
%20 = load float, float* %arrayidx25, align 4
%21 = load i32, i32* %ty, align 4
%idxprom26 = sext i32 %21 to i64
%arrayidx27 = getelementptr inbounds [16 x float], [16 x float]* addrspacecast ([16 x float] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE10input_node to [16 x float]*), i64 0, i64 %idxprom26
%22 = load float, float* %arrayidx27, align 4
%mul28 = fmul contract float %20, %22
%23 = load i32, i32* %ty, align 4
%idxprom29 = sext i32 %23 to i64
%arrayidx30 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom29
%24 = load i32, i32* %tx, align 4
%idxprom31 = sext i32 %24 to i64
%arrayidx32 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx30, i64 0, i64 %idxprom31
store float %mul28, float* %arrayidx32, align 4
; barrier, then start the reduction loop with i = 1
call void @llvm.nvvm.barrier0()
store i32 1, i32* %i, align 4
br label %for.cond
for.cond: ; preds = %for.inc, %if.end
; continue while (float)i <= __log2f(16.0f); the bound is a float comparison
%25 = load i32, i32* %i, align 4
%conv = sitofp i32 %25 to float
%call33 = call float @_ZL7__log2ff(float 1.600000e+01) #2
%cmp34 = fcmp ole float %conv, %call33
br i1 %cmp34, label %for.body, label %for.end
for.body: ; preds = %for.cond
; power_two = (int)__powf(2.0f, (float)i); branch on ty % power_two == 0
%26 = load i32, i32* %i, align 4
%conv35 = sitofp i32 %26 to float
%call36 = call float @_ZL6__powfff(float 2.000000e+00, float %conv35) #2
%conv37 = fptosi float %call36 to i32
store i32 %conv37, i32* %power_two, align 4
%27 = load i32, i32* %ty, align 4
%28 = load i32, i32* %power_two, align 4
%rem = srem i32 %27, %28
%cmp38 = icmp eq i32 %rem, 0
br i1 %cmp38, label %if.then39, label %if.end54
if.then39: ; preds = %for.body
; weight_matrix[ty][tx] += weight_matrix[ty + power_two / 2][tx]
%29 = load i32, i32* %ty, align 4
%idxprom40 = sext i32 %29 to i64
%arrayidx41 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom40
%30 = load i32, i32* %tx, align 4
%idxprom42 = sext i32 %30 to i64
%arrayidx43 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx41, i64 0, i64 %idxprom42
%31 = load float, float* %arrayidx43, align 4
%32 = load i32, i32* %ty, align 4
%33 = load i32, i32* %power_two, align 4
%div = sdiv i32 %33, 2
%add44 = add nsw i32 %32, %div
%idxprom45 = sext i32 %add44 to i64
%arrayidx46 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom45
%34 = load i32, i32* %tx, align 4
%idxprom47 = sext i32 %34 to i64
%arrayidx48 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx46, i64 0, i64 %idxprom47
%35 = load float, float* %arrayidx48, align 4
%add49 = fadd contract float %31, %35
%36 = load i32, i32* %ty, align 4
%idxprom50 = sext i32 %36 to i64
%arrayidx51 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom50
%37 = load i32, i32* %tx, align 4
%idxprom52 = sext i32 %37 to i64
%arrayidx53 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx51, i64 0, i64 %idxprom52
store float %add49, float* %arrayidx53, align 4
br label %if.end54
if.end54: ; preds = %if.then39, %for.body
; barrier executed on every iteration, whether or not the add ran
call void @llvm.nvvm.barrier0()
br label %for.inc
for.inc: ; preds = %if.end54
; i++
%38 = load i32, i32* %i, align 4
%inc = add nsw i32 %38, 1
store i32 %inc, i32* %i, align 4
br label %for.cond
for.end: ; preds = %for.cond
; input_hidden_cuda[index] = weight_matrix[ty][tx]; barrier; if (tx == 0)
%39 = load i32, i32* %ty, align 4
%idxprom55 = sext i32 %39 to i64
%arrayidx56 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom55
%40 = load i32, i32* %tx, align 4
%idxprom57 = sext i32 %40 to i64
%arrayidx58 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx56, i64 0, i64 %idxprom57
%41 = load float, float* %arrayidx58, align 4
%42 = load float*, float** %input_hidden_cuda.addr, align 8
%43 = load i32, i32* %index, align 4
%idxprom59 = sext i32 %43 to i64
%arrayidx60 = getelementptr inbounds float, float* %42, i64 %idxprom59
store float %41, float* %arrayidx60, align 4
call void @llvm.nvvm.barrier0()
%44 = load i32, i32* %tx, align 4
%cmp61 = icmp eq i32 %44, 0
br i1 %cmp61, label %if.then62, label %if.end71
if.then62: ; preds = %for.end
; hidden_partial_sum[by * hid + ty] = weight_matrix[tx][ty]
%45 = load i32, i32* %tx, align 4
%idxprom63 = sext i32 %45 to i64
%arrayidx64 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom63
%46 = load i32, i32* %ty, align 4
%idxprom65 = sext i32 %46 to i64
%arrayidx66 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx64, i64 0, i64 %idxprom65
%47 = load float, float* %arrayidx66, align 4
%48 = load float*, float** %hidden_partial_sum.addr, align 8
%49 = load i32, i32* %by, align 4
%50 = load i32, i32* %hid.addr, align 4
%mul67 = mul nsw i32 %49, %50
%51 = load i32, i32* %ty, align 4
%add68 = add nsw i32 %mul67, %51
%idxprom69 = sext i32 %add68 to i64
%arrayidx70 = getelementptr inbounds float, float* %48, i64 %idxprom69
store float %47, float* %arrayidx70, align 4
br label %if.end71
if.end71: ; preds = %if.then62, %for.end
ret void
}
; Accessor behind blockIdx.y: returns the value of the
; llvm.nvvm.read.ptx.sreg.ctaid.y intrinsic.
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
ret i32 %0
}
; Accessor behind threadIdx.x: returns the value of the
; llvm.nvvm.read.ptx.sreg.tid.x intrinsic.
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
ret i32 %0
}
; Accessor behind threadIdx.y: returns the value of the
; llvm.nvvm.read.ptx.sreg.tid.y intrinsic.
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
ret i32 %0
}
; Function Attrs: convergent nounwind
declare void @llvm.nvvm.barrier0() #2
; File-local wrapper for __log2f(float): round-trips the argument through a
; stack slot and forwards it to __nv_fast_log2f.
; Function Attrs: alwaysinline convergent nounwind
define internal float @_ZL7__log2ff(float %__a) #1 {
entry:
%__a.addr = alloca float, align 4
store float %__a, float* %__a.addr, align 4
%0 = load float, float* %__a.addr, align 4
%call = call float @__nv_fast_log2f(float %0) #2
ret float %call
}
; File-local wrapper for __powf(float, float): round-trips both arguments
; through stack slots and forwards them to __nv_fast_powf.
; Function Attrs: alwaysinline convergent nounwind
define internal float @_ZL6__powfff(float %__a, float %__b) #1 {
entry:
%__a.addr = alloca float, align 4
%__b.addr = alloca float, align 4
store float %__a, float* %__a.addr, align 4
store float %__b, float* %__b.addr, align 4
%0 = load float, float* %__a.addr, align 4
%1 = load float, float* %__b.addr, align 4
%call = call float @__nv_fast_powf(float %0, float %1) #2
ret float %call
}
; Unoptimised (optnone) IR for the kernel bpnn_adjust_weights_cuda(float*,
; int, float*, int, float*, float*). The update terms are computed in double
; (floats are fpext'ed, the 0.3 factors are double constants) and truncated
; back to float on store. %in is stored to its slot but never loaded again.
; Function Attrs: convergent noinline nounwind optnone
define dso_local void @_Z24bpnn_adjust_weights_cudaPfiS_iS_S_(float* %delta, i32 %hid, float* %ly, i32 %in, float* %w, float* %oldw) #0 {
entry:
; Stack slots for the six parameters and the locals.
%delta.addr = alloca float*, align 8
%hid.addr = alloca i32, align 4
%ly.addr = alloca float*, align 8
%in.addr = alloca i32, align 4
%w.addr = alloca float*, align 8
%oldw.addr = alloca float*, align 8
%by = alloca i32, align 4
%tx = alloca i32, align 4
%ty = alloca i32, align 4
%index = alloca i32, align 4
%index_y = alloca i32, align 4
%index_x = alloca i32, align 4
store float* %delta, float** %delta.addr, align 8
store i32 %hid, i32* %hid.addr, align 4
store float* %ly, float** %ly.addr, align 8
store i32 %in, i32* %in.addr, align 4
store float* %w, float** %w.addr, align 8
store float* %oldw, float** %oldw.addr, align 8
; by = blockIdx.y; tx = threadIdx.x; ty = threadIdx.y
%call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #2
store i32 %call, i32* %by, align 4
%call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
store i32 %call1, i32* %tx, align 4
%call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #2
store i32 %call2, i32* %ty, align 4
; index = (hid + 1) * 16 * by + (hid + 1) * ty + tx + 1 + (hid + 1)
%0 = load i32, i32* %hid.addr, align 4
%add = add nsw i32 %0, 1
%mul = mul nsw i32 %add, 16
%1 = load i32, i32* %by, align 4
%mul3 = mul nsw i32 %mul, %1
%2 = load i32, i32* %hid.addr, align 4
%add4 = add nsw i32 %2, 1
%3 = load i32, i32* %ty, align 4
%mul5 = mul nsw i32 %add4, %3
%add6 = add nsw i32 %mul3, %mul5
%4 = load i32, i32* %tx, align 4
%add7 = add nsw i32 %add6, %4
%add8 = add nsw i32 %add7, 1
%5 = load i32, i32* %hid.addr, align 4
%add9 = add nsw i32 %5, 1
%add10 = add nsw i32 %add8, %add9
store i32 %add10, i32* %index, align 4
; index_y = 16 * by + ty + 1
%6 = load i32, i32* %by, align 4
%mul11 = mul nsw i32 16, %6
%7 = load i32, i32* %ty, align 4
%add12 = add nsw i32 %mul11, %7
%add13 = add nsw i32 %add12, 1
store i32 %add13, i32* %index_y, align 4
; index_x = tx + 1
%8 = load i32, i32* %tx, align 4
%add14 = add nsw i32 %8, 1
store i32 %add14, i32* %index_x, align 4
; w[index] += 0.3 * delta[index_x] * ly[index_y] + 0.3 * oldw[index]
%9 = load float*, float** %delta.addr, align 8
%10 = load i32, i32* %index_x, align 4
%idxprom = sext i32 %10 to i64
%arrayidx = getelementptr inbounds float, float* %9, i64 %idxprom
%11 = load float, float* %arrayidx, align 4
%conv = fpext float %11 to double
%mul15 = fmul contract double 3.000000e-01, %conv
%12 = load float*, float** %ly.addr, align 8
%13 = load i32, i32* %index_y, align 4
%idxprom16 = sext i32 %13 to i64
%arrayidx17 = getelementptr inbounds float, float* %12, i64 %idxprom16
%14 = load float, float* %arrayidx17, align 4
%conv18 = fpext float %14 to double
%mul19 = fmul contract double %mul15, %conv18
%15 = load float*, float** %oldw.addr, align 8
%16 = load i32, i32* %index, align 4
%idxprom20 = sext i32 %16 to i64
%arrayidx21 = getelementptr inbounds float, float* %15, i64 %idxprom20
%17 = load float, float* %arrayidx21, align 4
%conv22 = fpext float %17 to double
%mul23 = fmul contract double 3.000000e-01, %conv22
%add24 = fadd contract double %mul19, %mul23
%18 = load float*, float** %w.addr, align 8
%19 = load i32, i32* %index, align 4
%idxprom25 = sext i32 %19 to i64
%arrayidx26 = getelementptr inbounds float, float* %18, i64 %idxprom25
%20 = load float, float* %arrayidx26, align 4
%conv27 = fpext float %20 to double
%add28 = fadd contract double %conv27, %add24
%conv29 = fptrunc double %add28 to float
store float %conv29, float* %arrayidx26, align 4
; oldw[index] = the same expression, recomputed from memory
%21 = load float*, float** %delta.addr, align 8
%22 = load i32, i32* %index_x, align 4
%idxprom30 = sext i32 %22 to i64
%arrayidx31 = getelementptr inbounds float, float* %21, i64 %idxprom30
%23 = load float, float* %arrayidx31, align 4
%conv32 = fpext float %23 to double
%mul33 = fmul contract double 3.000000e-01, %conv32
%24 = load float*, float** %ly.addr, align 8
%25 = load i32, i32* %index_y, align 4
%idxprom34 = sext i32 %25 to i64
%arrayidx35 = getelementptr inbounds float, float* %24, i64 %idxprom34
%26 = load float, float* %arrayidx35, align 4
%conv36 = fpext float %26 to double
%mul37 = fmul contract double %mul33, %conv36
%27 = load float*, float** %oldw.addr, align 8
%28 = load i32, i32* %index, align 4
%idxprom38 = sext i32 %28 to i64
%arrayidx39 = getelementptr inbounds float, float* %27, i64 %idxprom38
%29 = load float, float* %arrayidx39, align 4
%conv40 = fpext float %29 to double
%mul41 = fmul contract double 3.000000e-01, %conv40
%add42 = fadd contract double %mul37, %mul41
%conv43 = fptrunc double %add42 to float
%30 = load float*, float** %oldw.addr, align 8
%31 = load i32, i32* %index, align 4
%idxprom44 = sext i32 %31 to i64
%arrayidx45 = getelementptr inbounds float, float* %30, i64 %idxprom44
store float %conv43, float* %arrayidx45, align 4
; barrier, then if (ty == 0 && by == 0), split into two conditional branches
call void @llvm.nvvm.barrier0()
%32 = load i32, i32* %ty, align 4
%cmp = icmp eq i32 %32, 0
br i1 %cmp, label %land.lhs.true, label %if.end
land.lhs.true: ; preds = %entry
%33 = load i32, i32* %by, align 4
%cmp46 = icmp eq i32 %33, 0
br i1 %cmp46, label %if.then, label %if.end
if.then: ; preds = %land.lhs.true
; w[index_x] += 0.3 * delta[index_x] + 0.3 * oldw[index_x]
%34 = load float*, float** %delta.addr, align 8
%35 = load i32, i32* %index_x, align 4
%idxprom47 = sext i32 %35 to i64
%arrayidx48 = getelementptr inbounds float, float* %34, i64 %idxprom47
%36 = load float, float* %arrayidx48, align 4
%conv49 = fpext float %36 to double
%mul50 = fmul contract double 3.000000e-01, %conv49
%37 = load float*, float** %oldw.addr, align 8
%38 = load i32, i32* %index_x, align 4
%idxprom51 = sext i32 %38 to i64
%arrayidx52 = getelementptr inbounds float, float* %37, i64 %idxprom51
%39 = load float, float* %arrayidx52, align 4
%conv53 = fpext float %39 to double
%mul54 = fmul contract double 3.000000e-01, %conv53
%add55 = fadd contract double %mul50, %mul54
%40 = load float*, float** %w.addr, align 8
%41 = load i32, i32* %index_x, align 4
%idxprom56 = sext i32 %41 to i64
%arrayidx57 = getelementptr inbounds float, float* %40, i64 %idxprom56
%42 = load float, float* %arrayidx57, align 4
%conv58 = fpext float %42 to double
%add59 = fadd contract double %conv58, %add55
%conv60 = fptrunc double %add59 to float
store float %conv60, float* %arrayidx57, align 4
; oldw[index_x] = the same expression, recomputed from memory
%43 = load float*, float** %delta.addr, align 8
%44 = load i32, i32* %index_x, align 4
%idxprom61 = sext i32 %44 to i64
%arrayidx62 = getelementptr inbounds float, float* %43, i64 %idxprom61
%45 = load float, float* %arrayidx62, align 4
%conv63 = fpext float %45 to double
%mul64 = fmul contract double 3.000000e-01, %conv63
%46 = load float*, float** %oldw.addr, align 8
%47 = load i32, i32* %index_x, align 4
%idxprom65 = sext i32 %47 to i64
%arrayidx66 = getelementptr inbounds float, float* %46, i64 %idxprom65
%48 = load float, float* %arrayidx66, align 4
%conv67 = fpext float %48 to double
%mul68 = fmul contract double 3.000000e-01, %conv67
%add69 = fadd contract double %mul64, %mul68
%conv70 = fptrunc double %add69 to float
%49 = load float*, float** %oldw.addr, align 8
%50 = load i32, i32* %index_x, align 4
%idxprom71 = sext i32 %50 to i64
%arrayidx72 = getelementptr inbounds float, float* %49, i64 %idxprom71
store float %conv70, float* %arrayidx72, align 4
br label %if.end
if.end: ; preds = %if.then, %land.lhs.true, %entry
ret void
}
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #3
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #3
; Approximate log2: queries __nvvm_reflect("__CUDA_FTZ") and uses
; llvm.nvvm.lg2.approx.ftz.f when the result is non-zero, otherwise
; llvm.nvvm.lg2.approx.f; the phi merges the two results.
; Function Attrs: alwaysinline convergent inlinehint nounwind
define internal float @__nv_fast_log2f(float %a) #4 {
%call.i = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*))
%1 = icmp ne i32 %call.i, 0
br i1 %1, label %2, label %4
2: ; preds = %0
%3 = call float @llvm.nvvm.lg2.approx.ftz.f(float %a)
br label %__nvvm_builtin_log2f.exit
4: ; preds = %0
%5 = call float @llvm.nvvm.lg2.approx.f(float %a)
br label %__nvvm_builtin_log2f.exit
__nvvm_builtin_log2f.exit: ; preds = %4, %2
%retval.0.i = phi float [ %3, %2 ], [ %5, %4 ]
ret float %retval.0.i
}
; Function Attrs: convergent nounwind
declare i32 @__nvvm_reflect(i8*) #5
; Function Attrs: nounwind readnone
declare float @llvm.nvvm.lg2.approx.ftz.f(float) #3
; Function Attrs: nounwind readnone
declare float @llvm.nvvm.lg2.approx.f(float) #3
; Approximate pow(a, b), computed as ex2.approx(b * lg2.approx(a)). Each of
; the two steps picks its .ftz or non-.ftz intrinsic from a separate
; __nvvm_reflect("__CUDA_FTZ") query.
; Function Attrs: alwaysinline convergent inlinehint nounwind
define internal float @__nv_fast_powf(float %a, float %b) #4 {
%call.i.i = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*))
%1 = icmp ne i32 %call.i.i, 0
br i1 %1, label %2, label %4
2: ; preds = %0
%3 = call float @llvm.nvvm.lg2.approx.ftz.f(float %a)
br label %__nv_fast_log2f.exit
4: ; preds = %0
%5 = call float @llvm.nvvm.lg2.approx.f(float %a)
br label %__nv_fast_log2f.exit
__nv_fast_log2f.exit: ; preds = %4, %2
%retval.0.i.i = phi float [ %3, %2 ], [ %5, %4 ]
; %6 = b * log2(a), the exponent handed to ex2
%6 = fmul float %b, %retval.0.i.i
%call.i.i1 = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*))
%7 = icmp ne i32 %call.i.i1, 0
br i1 %7, label %8, label %10
8: ; preds = %__nv_fast_log2f.exit
%9 = call float @llvm.nvvm.ex2.approx.ftz.f(float %6)
br label %__nv_exp2f.exit
10: ; preds = %__nv_fast_log2f.exit
%11 = call float @llvm.nvvm.ex2.approx.f(float %6)
br label %__nv_exp2f.exit
__nv_exp2f.exit: ; preds = %10, %8
%retval.0.i.i2 = phi float [ %9, %8 ], [ %11, %10 ]
ret float %retval.0.i.i2
}
; Function Attrs: nounwind readnone
declare float @llvm.nvvm.ex2.approx.ftz.f(float) #3
; Function Attrs: nounwind readnone
declare float @llvm.nvvm.ex2.approx.f(float) #3
attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { convergent nounwind }
attributes #3 = { nounwind readnone }
attributes #4 = { alwaysinline convergent inlinehint nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #5 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
!llvm.module.flags = !{!0, !1, !2}
!nvvm.annotations = !{!3, !4, !5, !6, !5, !7, !7, !7, !7, !8, !8, !7}
!llvm.ident = !{!9}
!nvvmir.version = !{!10}
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
!3 = !{void (float*, float*, float*, float*, i32, i32)* @_Z22bpnn_layerforward_CUDAPfS_S_S_ii, !"kernel", i32 1}
!4 = !{void (float*, i32, float*, i32, float*, float*)* @_Z24bpnn_adjust_weights_cudaPfiS_iS_S_, !"kernel", i32 1}
!5 = !{null, !"align", i32 8}
!6 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
!7 = !{null, !"align", i32 16}
!8 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
!9 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
!10 = !{i32 1, i32 4}

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,195 @@
#include <cuda.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
// includes, kernels
#include "backprop.h"
#include "backprop_cuda_kernel.cu"
////////////////////////////////////////////////////////////////////////////////
// Host routines defined outside this file. They are declared with C linkage
// so that the unmangled symbols are the ones looked up at link time.
extern "C" {
void bpnn_layerforward(float *l1, float *l2, float **conn, int n1, int n2);
void bpnn_output_error(float *delta, float *target, float *output, int nj,
                       float *err);
void bpnn_hidden_error(float *delta_h, int nh, float *delta_o, int no,
                       float **who, float *hidden, float *err);
void bpnn_adjust_weights(float *delta, int ndelta, float *ly, int nly,
                         float **w, float **oldw);
int setup(int argc, char **argv);
float **alloc_2d_dbl(int m, int n);
float squash(float x);
}
// Returns the time reported by gettimeofday() as seconds, with the
// microsecond field folded in as the fractional part.
double gettime() {
  struct timeval now;
  gettimeofday(&now, NULL);
  const double whole_seconds = now.tv_sec;
  const double fraction = now.tv_usec * 1e-6;
  return whole_seconds + fraction;
}
// Not referenced anywhere in the visible part of this file.
unsigned int num_threads = 0;
// Grid size along y for both kernel launches; bpnn_train_cuda() derives it
// from the input-layer size before launching.
unsigned int num_blocks = 0;
////////////////////////////////////////////////////////////////////////////////
// Program entry point: selects CUDA device 0, then hands the command line to
// setup(), whose return value is ignored.
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
  cudaSetDevice(0);
  setup(argc, argv);
  return 0;
}
/**
 * Runs one forward/backward training pass over `net`.
 *
 * With GPU defined, the input->hidden forward pass and the input->hidden
 * weight adjustment run as CUDA kernels on flattened copies of the weight
 * matrices. With CPU defined, the host routines are used for those two steps.
 * The hidden->output layer is always processed by the host routines.
 *
 * @param net network to train; its unit, delta and hidden->output weight
 *            arrays are updated in place.
 * @param eo  if non-NULL, receives the value bpnn_output_error() stored
 *            through its `err` argument.
 * @param eh  if non-NULL, receives the value bpnn_hidden_error() stored
 *            through its `err` argument.
 */
extern "C" void bpnn_train_cuda(BPNN *net, float *eo, float *eh) {
  int in, hid, out;
  float out_err = 0.0f, hid_err = 0.0f;

  in = net->input_n;
  hid = net->hidden_n;
  out = net->output_n;

#ifdef GPU
  int m = 0;
  float *input_hidden_cuda;
  float *input_cuda;
  float *output_hidden_cuda;
  float *partial_sum;
  float *hidden_partial_sum;
  float *hidden_delta_cuda;
  float *input_prev_weights_cuda;
  float sum;
  float *input_weights_one_dim;
  float *input_weights_prev_one_dim;

  // One block per HEIGHT input rows, each block WIDTH x HEIGHT threads, which
  // matches the tile the kernels index with threadIdx.x / threadIdx.y.
  // Input rows beyond num_blocks * HEIGHT are not visited by the kernels.
  num_blocks = in / HEIGHT;
  dim3 grid(1, num_blocks);
  dim3 threads(WIDTH, HEIGHT);

  input_weights_one_dim = (float *)malloc((in + 1) * (hid + 1) * sizeof(float));
  input_weights_prev_one_dim =
      (float *)malloc((in + 1) * (hid + 1) * sizeof(float));
  partial_sum = (float *)malloc(num_blocks * WIDTH * sizeof(float));
  if (input_weights_one_dim == NULL || input_weights_prev_one_dim == NULL ||
      partial_sum == NULL) {
    fprintf(stderr, "bpnn_train_cuda: host allocation failed\n");
    exit(EXIT_FAILURE);
  }

  // this preprocessing stage is added to correct the bugs of wrong memcopy
  // using two-dimensional net->inputweights: the row-pointer matrices are
  // flattened row-major into (in + 1) x (hid + 1) contiguous buffers.
  for (int k = 0; k <= in; k++) {
    for (int j = 0; j <= hid; j++) {
      input_weights_one_dim[m] = net->input_weights[k][j];
      input_weights_prev_one_dim[m] = net->input_prev_weights[k][j];
      m++;
    }
  }

  cudaMalloc((void **)&input_cuda, (in + 1) * sizeof(float));
  cudaMalloc((void **)&output_hidden_cuda, (hid + 1) * sizeof(float));
  cudaMalloc((void **)&input_hidden_cuda, (in + 1) * (hid + 1) * sizeof(float));
  cudaMalloc((void **)&hidden_partial_sum, num_blocks * WIDTH * sizeof(float));
#endif

#ifdef CPU
  printf("Performing CPU computation\n");
  bpnn_layerforward(net->input_units, net->hidden_units, net->input_weights, in,
                    hid);
#endif

#ifdef GPU
  printf("Performing GPU computation\n");
  // printf("in= %d, hid = %d, numblocks = %d\n", in, hid, num_blocks);

  cudaMemcpy(input_cuda, net->input_units, (in + 1) * sizeof(float),
             cudaMemcpyHostToDevice);
  cudaMemcpy(input_hidden_cuda, input_weights_one_dim,
             (in + 1) * (hid + 1) * sizeof(float), cudaMemcpyHostToDevice);

  bpnn_layerforward_CUDA<<<grid, threads>>>(input_cuda, output_hidden_cuda,
                                            input_hidden_cuda,
                                            hidden_partial_sum, in, hid);

  // NOTE(review): cudaThreadSynchronize() is deprecated in the CUDA runtime
  // in favour of cudaDeviceSynchronize(); kept because it is the call this
  // code base is known to link against.
  cudaThreadSynchronize();

  cudaError_t error = cudaGetLastError();
  if (error != cudaSuccess) {
    printf("bpnn kernel error: %s\n", cudaGetErrorString(error));
    exit(EXIT_FAILURE);
  }

  cudaMemcpy(partial_sum, hidden_partial_sum,
             num_blocks * WIDTH * sizeof(float), cudaMemcpyDeviceToHost);

  // Finish the reduction on the host: add the per-block partial sums for each
  // hidden unit, add the row-0 weight, and apply the logistic function.
  // The partial-sum buffers hold num_blocks * WIDTH floats but are indexed
  // with stride hid, so this assumes hid <= WIDTH.
  for (int j = 1; j <= hid; j++) {
    sum = 0.0;
    for (unsigned int k = 0; k < num_blocks; k++) {
      sum += partial_sum[k * hid + j - 1];
    }
    sum += net->input_weights[0][j];
    net->hidden_units[j] = float(1.0 / (1.0 + exp(-sum)));
  }
#endif

  bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights,
                    hid, out);
  bpnn_output_error(net->output_delta, net->target, net->output_units, out,
                    &out_err);
  bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out,
                    net->hidden_weights, net->hidden_units, &hid_err);

  // Report both error values to the caller; previously they were computed
  // and then dropped, leaving the caller's variables untouched.
  if (eo != NULL) {
    *eo = out_err;
  }
  if (eh != NULL) {
    *eh = hid_err;
  }

  bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid,
                      net->hidden_weights, net->hidden_prev_weights);

#ifdef CPU
  bpnn_adjust_weights(net->hidden_delta, hid, net->input_units, in,
                      net->input_weights, net->input_prev_weights);
#endif

#ifdef GPU
  cudaMalloc((void **)&hidden_delta_cuda, (hid + 1) * sizeof(float));
  cudaMalloc((void **)&input_prev_weights_cuda,
             (in + 1) * (hid + 1) * sizeof(float));

  cudaMemcpy(hidden_delta_cuda, net->hidden_delta, (hid + 1) * sizeof(float),
             cudaMemcpyHostToDevice);
  cudaMemcpy(input_prev_weights_cuda, input_weights_prev_one_dim,
             (in + 1) * (hid + 1) * sizeof(float), cudaMemcpyHostToDevice);
  // The forward kernel overwrote the device weight buffer with its partial
  // products, so the original weights are uploaded again before adjusting.
  cudaMemcpy(input_hidden_cuda, input_weights_one_dim,
             (in + 1) * (hid + 1) * sizeof(float), cudaMemcpyHostToDevice);

  bpnn_adjust_weights_cuda<<<grid, threads>>>(hidden_delta_cuda, hid,
                                              input_cuda, in, input_hidden_cuda,
                                              input_prev_weights_cuda);

  cudaMemcpy(net->input_units, input_cuda, (in + 1) * sizeof(float),
             cudaMemcpyDeviceToHost);
  cudaMemcpy(input_weights_one_dim, input_hidden_cuda,
             (in + 1) * (hid + 1) * sizeof(float), cudaMemcpyDeviceToHost);

  // Dump every adjusted weight on one line of stdout.
  // NOTE(review): the adjusted weights are only printed; they are not copied
  // back into net->input_weights / net->input_prev_weights -- confirm that is
  // intended.
  for (int i = 0; i < (in + 1) * (hid + 1); i++) {
    printf("%f ", input_weights_one_dim[i]);
  }
  printf("\n");

  cudaFree(input_cuda);
  cudaFree(output_hidden_cuda);
  cudaFree(input_hidden_cuda);
  cudaFree(hidden_partial_sum);
  cudaFree(input_prev_weights_cuda);
  cudaFree(hidden_delta_cuda);

  free(partial_sum);
  free(input_weights_one_dim);
  free(input_weights_prev_one_dim);
#endif
}

View File

@ -0,0 +1,96 @@
#ifndef _BACKPROP_CUDA_KERNEL_H_
#define _BACKPROP_CUDA_KERNEL_H_
#include "backprop.h"
#include "cuda.h"
#include "math.h"
#include <stdio.h>
/**
 * Input->hidden forward pass for one tile of HEIGHT input rows.
 *
 * Each block (selected by blockIdx.y) loads HEIGHT input activations and a
 * HEIGHT x WIDTH tile of weights into shared memory, multiplies every weight
 * by its row's input, and tree-reduces the products over the rows. The
 * per-column sums of the tile are written to hidden_partial_sum.
 *
 * @param input_cuda          input activations; element HEIGHT*by + ty + 1 is
 *                            read by the threads with tx == 0.
 * @param output_hidden_cuda  unused by this kernel.
 * @param input_hidden_cuda   flattened weights with row stride hid + 1; row 0
 *                            and column 0 are skipped. Each visited element is
 *                            overwritten with the partially reduced product.
 * @param hidden_partial_sum  receives one value per (block, ty) at
 *                            by * hid + ty.
 * @param in                  unused by this kernel.
 * @param hid                 number of hidden units (row stride is hid + 1).
 */
__global__ void bpnn_layerforward_CUDA(float *input_cuda,
                                       float *output_hidden_cuda,
                                       float *input_hidden_cuda,
                                       float *hidden_partial_sum, int in,
                                       int hid) {
  int by = blockIdx.y;
  int tx = threadIdx.x;
  int ty = threadIdx.y;

  // Element (row HEIGHT*by + ty + 1, column tx + 1) of the flattened matrix.
  int index = (hid + 1) * HEIGHT * by + (hid + 1) * ty + tx + 1 + (hid + 1);
  int index_in = HEIGHT * by + ty + 1;

  __shared__ float input_node[HEIGHT];
  __shared__ float weight_matrix[HEIGHT][WIDTH];

  // One thread per row fetches that row's input activation.
  if (tx == 0)
    input_node[ty] = input_cuda[index_in];
  __syncthreads();

  weight_matrix[ty][tx] = input_hidden_cuda[index];
  __syncthreads();

  weight_matrix[ty][tx] = weight_matrix[ty][tx] * input_node[ty];
  __syncthreads();

  // Tree reduction over rows with strides 2, 4, ..., HEIGHT. The strides are
  // integers: the previous float form (i <= __log2f(HEIGHT), __powf(2, i)
  // truncated to int) relied on approximate intrinsics returning exact
  // integers, and any result just below one would skip or corrupt a step.
  for (int stride = 2; stride <= HEIGHT; stride *= 2) {
    if (ty % stride == 0)
      weight_matrix[ty][tx] =
          weight_matrix[ty][tx] + weight_matrix[ty + stride / 2][tx];
    __syncthreads();
  }

  input_hidden_cuda[index] = weight_matrix[ty][tx];
  __syncthreads();

  // tx == 0 here, so this reads row 0 of the tile -- the fully reduced sums --
  // one column (ty) per thread.
  if (tx == 0) {
    hidden_partial_sum[by * hid + ty] = weight_matrix[tx][ty];
  }
}
// Weight-update kernel: adds a learning-rate term (ETA) and a momentum term
// (MOMENTUM) to each weight and records the applied step in `oldw`.
//
// Thread (tx, ty) of block `by` updates the weight at
//   cell = (hid + 1) * row + col,  row = HEIGHT * by + ty + 1,  col = tx + 1.
// Threads with ty == 0 in block 0 additionally update the entries at
// positions 1..blockDim.x (index `col`), using delta alone.
__global__ void bpnn_adjust_weights_cuda(float *delta, int hid, float *ly,
                                         int in, float *w, float *oldw) {
  const int block_row = blockIdx.y;
  const int lane = threadIdx.x;
  const int local_row = threadIdx.y;

  const int col = lane + 1;
  const int row = HEIGHT * block_row + local_row + 1;
  const int cell = (hid + 1) * row + col;

  // The step is evaluated once for the weight and once more for the history
  // entry, both from the not-yet-updated oldw[cell].
  w[cell] += ((ETA * delta[col] * ly[row]) + (MOMENTUM * oldw[cell]));
  oldw[cell] = ((ETA * delta[col] * ly[row]) + (MOMENTUM * oldw[cell]));

  __syncthreads();

  if (local_row == 0 && block_row == 0) {
    w[col] += ((ETA * delta[col]) + (MOMENTUM * oldw[col]));
    oldw[col] = ((ETA * delta[col]) + (MOMENTUM * oldw[col]));
  }
}
#endif

View File

@ -0,0 +1,48 @@
#include "backprop.h"
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
extern char *strcpy();
extern void exit();
int layer_size = 0;
/*
 * Builds a network with `layer_size` inputs, fills its input layer via
 * load(), runs a single training pass through bpnn_train_cuda() and frees
 * the network again.
 *
 * Returns 0. (The function used to rely on implicit int and fell off the end
 * without returning a value.)
 */
int backprop_face(void) {
  BPNN *net;
  float out_err, hid_err;

  net = bpnn_create(layer_size, 16, 1); // (16, 1 can not be changed)
  printf("Input layer size : %d\n", layer_size);
  load(net);
  // entering the training kernel, only one iteration
  printf("Starting training kernel\n");
  bpnn_train_cuda(net, &out_err, &hid_err);
  bpnn_free(net);
  printf("Training done\n");
  return 0;
}
/*
 * Program entry helper: parses the input-layer size from argv[1], seeds the
 * network library and runs backprop_face(). Never returns; the process ends
 * through exit().
 *
 *   argc, argv  command line; exactly one argument (the layer size) is
 *               expected.
 *
 * Exits with EXIT_FAILURE on a usage error or an unusable size, 0 otherwise.
 */
int setup(int argc, char *argv[]) {
  int seed;

  if (argc != 2) {
    fprintf(stderr, "usage: backprop <num of input elements>\n");
    exit(EXIT_FAILURE);
  }
  layer_size = atoi(argv[1]);
  /* atoi() yields 0 for non-numeric text, and 0 or a negative multiple of 16
   * would pass a bare "% 16" test, so non-positive sizes are rejected too. */
  if (layer_size <= 0 || layer_size % 16 != 0) {
    fprintf(stderr,
            "The number of input points must be a positive multiple of 16\n");
    exit(EXIT_FAILURE);
  }
  seed = 7;
  bpnn_initialize(seed);
  backprop_face();
  exit(0);
}

View File

@ -0,0 +1,22 @@
#include "backprop.h"
#include <stdio.h>
#include <stdlib.h>
extern layer_size;
/*
 * Fills entries 1..layer_size of net->input_units with pseudo-random values
 * in [0, 1] taken from rand(). Entry 0 is left untouched.
 * NOTE(review): entry 0 is presumably reserved for a bias unit -- confirm
 * against bpnn_create().
 *
 * Returns 0. (The previous version relied on implicit int without returning
 * a value and computed an unused product from an uninitialized local.)
 */
int load(BPNN *net) {
  float *units = net->input_units;
  int i;

  for (i = 0; i < layer_size; i++) {
    units[i + 1] = (float)rand() / RAND_MAX;
  }
  return 0;
}

28
examples/backprop/run.sh Normal file
View File

@ -0,0 +1,28 @@
#!/bin/bash
set -e

# Compile the plain C translation units to LLVM bitcode.
for unit in backprop facetrain imagenet; do
  clang -c -emit-llvm "${unit}.c"
done

# Assemble the checked-in device/host IR and run both translators on it.
kernel_ir=backprop_cuda-cuda-nvptx64-nvidia-cuda-sm_61
host_ir=backprop_cuda-host-x86_64-unknown-linux-gnu
llvm-as "${kernel_ir}.ll"
llvm-as "${host_ir}.ll"
../../build/compilation/kernelTranslator "${kernel_ir}.bc" kernel.bc
../../build/compilation/hostTranslator "${host_ir}.bc" host.bc

# Lower every bitcode module to a position-independent object file.
for module in kernel host backprop facetrain imagenet; do
  llc --relocation-model=pic --filetype=obj "${module}.bc"
done

# Link against the runtime libraries and run the demo.
export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH
g++ -Wall -L../../build/runtime -L../../build/runtime/threadPool -o demo \
  -fPIC -no-pie host.o kernel.o backprop.o facetrain.o imagenet.o \
  -lc -lx86Runtime -lthreadPool -lpthread
./demo 1024 > res.log

# The run passes when the expected weight values appear in the output.
if ! grep -q -e "0.173289 0.259645 0.350836" res.log; then
  echo "Error result"
  exit 1
fi
echo "Pass"

View File

@ -0,0 +1,307 @@
; ModuleID = 'bfs-cuda-nvptx64-nvidia-cuda-sm_61.bc'
source_filename = "bfs.cu"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
%struct.__cuda_builtin_blockIdx_t = type { i8 }
%struct.__cuda_builtin_threadIdx_t = type { i8 }
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
%struct.Node = type { i32, i32 }
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
; Weak stub for cudaMalloc in the nvptx64 device module: it only spills both
; arguments to stack slots and returns the constant 999; nothing is allocated.
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
entry:
%p.addr = alloca i8**, align 8
%s.addr = alloca i64, align 8
store i8** %p, i8*** %p.addr, align 8
store i64 %s, i64* %s.addr, align 8
ret i32 999
}
; Weak stub for cudaFuncGetAttributes: spills its two arguments to stack slots
; and returns the constant 999 without writing through %p.
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
entry:
%p.addr = alloca %struct.cudaFuncAttributes*, align 8
%c.addr = alloca i8*, align 8
store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
store i8* %c, i8** %c.addr, align 8
ret i32 999
}
; Weak stub for cudaDeviceGetAttribute: spills its three arguments to stack
; slots and returns the constant 999 without writing through %value.
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
entry:
%value.addr = alloca i32*, align 8
%attr.addr = alloca i32, align 4
%device.addr = alloca i32, align 4
store i32* %value, i32** %value.addr, align 8
store i32 %attr, i32* %attr.addr, align 4
store i32 %device, i32* %device.addr, align 4
ret i32 999
}
; Weak stub for cudaGetDevice: spills the pointer argument to a stack slot and
; returns the constant 999 without writing through %device.
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
entry:
%device.addr = alloca i32*, align 8
store i32* %device, i32** %device.addr, align 8
ret i32 999
}
; Weak stub for cudaOccupancyMaxActiveBlocksPerMultiprocessor: spills its four
; arguments to stack slots and returns the constant 999 without writing
; through %numBlocks.
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
entry:
%numBlocks.addr = alloca i32*, align 8
%func.addr = alloca i8*, align 8
%blockSize.addr = alloca i32, align 4
%dynamicSmemSize.addr = alloca i64, align 8
store i32* %numBlocks, i32** %numBlocks.addr, align 8
store i8* %func, i8** %func.addr, align 8
store i32 %blockSize, i32* %blockSize.addr, align 4
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
ret i32 999
}
; Weak stub for cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags: spills
; its five arguments to stack slots and returns the constant 999 without
; writing through %numBlocks.
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
entry:
%numBlocks.addr = alloca i32*, align 8
%func.addr = alloca i8*, align 8
%blockSize.addr = alloca i32, align 4
%dynamicSmemSize.addr = alloca i64, align 8
%flags.addr = alloca i32, align 4
store i32* %numBlocks, i32** %numBlocks.addr, align 8
store i8* %func, i8** %func.addr, align 8
store i32 %blockSize, i32* %blockSize.addr, align 4
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
store i32 %flags, i32* %flags.addr, align 4
ret i32 999
}
; First BFS kernel (registered as a "kernel" in !nvvm.annotations). The code is
; unoptimized (optnone): every argument and local lives in an alloca and is
; reloaded before each use.
;
; For the thread's node index tid, if tid < no_of_nodes and g_graph_mask[tid]
; is set, the mask entry is cleared and the node's edge range is walked; each
; neighbour not yet marked in g_graph_visited gets g_cost[tid] + 1 as its cost
; and is flagged in g_updating_graph_mask.
; Function Attrs: convergent noinline nounwind optnone
define dso_local void @_Z6KernelP4NodePiPbS2_S2_S1_i(%struct.Node* %g_graph_nodes, i32* %g_graph_edges, i8* %g_graph_mask, i8* %g_updating_graph_mask, i8* %g_graph_visited, i32* %g_cost, i32 %no_of_nodes) #0 {
entry:
%g_graph_nodes.addr = alloca %struct.Node*, align 8
%g_graph_edges.addr = alloca i32*, align 8
%g_graph_mask.addr = alloca i8*, align 8
%g_updating_graph_mask.addr = alloca i8*, align 8
%g_graph_visited.addr = alloca i8*, align 8
%g_cost.addr = alloca i32*, align 8
%no_of_nodes.addr = alloca i32, align 4
%tid = alloca i32, align 4
%i = alloca i32, align 4
%id = alloca i32, align 4
store %struct.Node* %g_graph_nodes, %struct.Node** %g_graph_nodes.addr, align 8
store i32* %g_graph_edges, i32** %g_graph_edges.addr, align 8
store i8* %g_graph_mask, i8** %g_graph_mask.addr, align 8
store i8* %g_updating_graph_mask, i8** %g_updating_graph_mask.addr, align 8
store i8* %g_graph_visited, i8** %g_graph_visited.addr, align 8
store i32* %g_cost, i32** %g_cost.addr, align 8
store i32 %no_of_nodes, i32* %no_of_nodes.addr, align 4
; tid = ctaid.x * 512 + tid.x
%call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
%mul = mul i32 %call, 512
%call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
%add = add i32 %mul, %call1
store i32 %add, i32* %tid, align 4
; first half of the guard: tid < no_of_nodes (signed compare)
%0 = load i32, i32* %tid, align 4
%1 = load i32, i32* %no_of_nodes.addr, align 4
%cmp = icmp slt i32 %0, %1
br i1 %cmp, label %land.lhs.true, label %if.end26
; second half of the guard: g_graph_mask[tid] must be set
land.lhs.true: ; preds = %entry
%2 = load i8*, i8** %g_graph_mask.addr, align 8
%3 = load i32, i32* %tid, align 4
%idxprom = sext i32 %3 to i64
%arrayidx = getelementptr inbounds i8, i8* %2, i64 %idxprom
%4 = load i8, i8* %arrayidx, align 1
%tobool = trunc i8 %4 to i1
br i1 %tobool, label %if.then, label %if.end26
; clear g_graph_mask[tid]; i = g_graph_nodes[tid] field 0 (starting)
if.then: ; preds = %land.lhs.true
%5 = load i8*, i8** %g_graph_mask.addr, align 8
%6 = load i32, i32* %tid, align 4
%idxprom2 = sext i32 %6 to i64
%arrayidx3 = getelementptr inbounds i8, i8* %5, i64 %idxprom2
store i8 0, i8* %arrayidx3, align 1
%7 = load %struct.Node*, %struct.Node** %g_graph_nodes.addr, align 8
%8 = load i32, i32* %tid, align 4
%idxprom4 = sext i32 %8 to i64
%arrayidx5 = getelementptr inbounds %struct.Node, %struct.Node* %7, i64 %idxprom4
%starting = getelementptr inbounds %struct.Node, %struct.Node* %arrayidx5, i32 0, i32 0
%9 = load i32, i32* %starting, align 4
store i32 %9, i32* %i, align 4
br label %for.cond
; loop while i < field 1 (no_of_edges) + field 0 (starting); both fields are
; reloaded on every iteration
for.cond: ; preds = %for.inc, %if.then
%10 = load i32, i32* %i, align 4
%11 = load %struct.Node*, %struct.Node** %g_graph_nodes.addr, align 8
%12 = load i32, i32* %tid, align 4
%idxprom6 = sext i32 %12 to i64
%arrayidx7 = getelementptr inbounds %struct.Node, %struct.Node* %11, i64 %idxprom6
%no_of_edges = getelementptr inbounds %struct.Node, %struct.Node* %arrayidx7, i32 0, i32 1
%13 = load i32, i32* %no_of_edges, align 4
%14 = load %struct.Node*, %struct.Node** %g_graph_nodes.addr, align 8
%15 = load i32, i32* %tid, align 4
%idxprom8 = sext i32 %15 to i64
%arrayidx9 = getelementptr inbounds %struct.Node, %struct.Node* %14, i64 %idxprom8
%starting10 = getelementptr inbounds %struct.Node, %struct.Node* %arrayidx9, i32 0, i32 0
%16 = load i32, i32* %starting10, align 4
%add11 = add nsw i32 %13, %16
%cmp12 = icmp slt i32 %10, %add11
br i1 %cmp12, label %for.body, label %for.end
; id = g_graph_edges[i]; skip the update when g_graph_visited[id] is set
for.body: ; preds = %for.cond
%17 = load i32*, i32** %g_graph_edges.addr, align 8
%18 = load i32, i32* %i, align 4
%idxprom13 = sext i32 %18 to i64
%arrayidx14 = getelementptr inbounds i32, i32* %17, i64 %idxprom13
%19 = load i32, i32* %arrayidx14, align 4
store i32 %19, i32* %id, align 4
%20 = load i8*, i8** %g_graph_visited.addr, align 8
%21 = load i32, i32* %id, align 4
%idxprom15 = sext i32 %21 to i64
%arrayidx16 = getelementptr inbounds i8, i8* %20, i64 %idxprom15
%22 = load i8, i8* %arrayidx16, align 1
%tobool17 = trunc i8 %22 to i1
br i1 %tobool17, label %if.end, label %if.then18
; g_cost[id] = g_cost[tid] + 1; g_updating_graph_mask[id] = 1
if.then18: ; preds = %for.body
%23 = load i32*, i32** %g_cost.addr, align 8
%24 = load i32, i32* %tid, align 4
%idxprom19 = sext i32 %24 to i64
%arrayidx20 = getelementptr inbounds i32, i32* %23, i64 %idxprom19
%25 = load i32, i32* %arrayidx20, align 4
%add21 = add nsw i32 %25, 1
%26 = load i32*, i32** %g_cost.addr, align 8
%27 = load i32, i32* %id, align 4
%idxprom22 = sext i32 %27 to i64
%arrayidx23 = getelementptr inbounds i32, i32* %26, i64 %idxprom22
store i32 %add21, i32* %arrayidx23, align 4
%28 = load i8*, i8** %g_updating_graph_mask.addr, align 8
%29 = load i32, i32* %id, align 4
%idxprom24 = sext i32 %29 to i64
%arrayidx25 = getelementptr inbounds i8, i8* %28, i64 %idxprom24
store i8 1, i8* %arrayidx25, align 1
br label %if.end
if.end: ; preds = %if.then18, %for.body
br label %for.inc
; i = i + 1
for.inc: ; preds = %if.end
%30 = load i32, i32* %i, align 4
%inc = add nsw i32 %30, 1
store i32 %inc, i32* %i, align 4
br label %for.cond
for.end: ; preds = %for.cond
br label %if.end26
if.end26: ; preds = %for.end, %land.lhs.true, %entry
ret void
}
; Accessor behind blockIdx.x: returns the value of the
; llvm.nvvm.read.ptx.sreg.ctaid.x intrinsic unchanged.
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
ret i32 %0
}
; Accessor behind threadIdx.x: returns the value of the
; llvm.nvvm.read.ptx.sreg.tid.x intrinsic unchanged.
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
ret i32 %0
}
; Second BFS kernel (registered as a "kernel" in !nvvm.annotations), also
; unoptimized (optnone). For the thread's node index tid, if tid < no_of_nodes
; and g_updating_graph_mask[tid] is set, it sets g_graph_mask[tid] and
; g_graph_visited[tid], stores 1 through g_over, and clears
; g_updating_graph_mask[tid].
; Function Attrs: convergent noinline nounwind optnone
define dso_local void @_Z7Kernel2PbS_S_S_i(i8* %g_graph_mask, i8* %g_updating_graph_mask, i8* %g_graph_visited, i8* %g_over, i32 %no_of_nodes) #0 {
entry:
%g_graph_mask.addr = alloca i8*, align 8
%g_updating_graph_mask.addr = alloca i8*, align 8
%g_graph_visited.addr = alloca i8*, align 8
%g_over.addr = alloca i8*, align 8
%no_of_nodes.addr = alloca i32, align 4
%tid = alloca i32, align 4
store i8* %g_graph_mask, i8** %g_graph_mask.addr, align 8
store i8* %g_updating_graph_mask, i8** %g_updating_graph_mask.addr, align 8
store i8* %g_graph_visited, i8** %g_graph_visited.addr, align 8
store i8* %g_over, i8** %g_over.addr, align 8
store i32 %no_of_nodes, i32* %no_of_nodes.addr, align 4
; tid = ctaid.x * 512 + tid.x
%call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
%mul = mul i32 %call, 512
%call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
%add = add i32 %mul, %call1
store i32 %add, i32* %tid, align 4
; first half of the guard: tid < no_of_nodes (signed compare)
%0 = load i32, i32* %tid, align 4
%1 = load i32, i32* %no_of_nodes.addr, align 4
%cmp = icmp slt i32 %0, %1
br i1 %cmp, label %land.lhs.true, label %if.end
; second half of the guard: g_updating_graph_mask[tid] must be set
land.lhs.true: ; preds = %entry
%2 = load i8*, i8** %g_updating_graph_mask.addr, align 8
%3 = load i32, i32* %tid, align 4
%idxprom = sext i32 %3 to i64
%arrayidx = getelementptr inbounds i8, i8* %2, i64 %idxprom
%4 = load i8, i8* %arrayidx, align 1
%tobool = trunc i8 %4 to i1
br i1 %tobool, label %if.then, label %if.end
; set mask and visited for tid, raise *g_over, clear the updating flag
if.then: ; preds = %land.lhs.true
%5 = load i8*, i8** %g_graph_mask.addr, align 8
%6 = load i32, i32* %tid, align 4
%idxprom2 = sext i32 %6 to i64
%arrayidx3 = getelementptr inbounds i8, i8* %5, i64 %idxprom2
store i8 1, i8* %arrayidx3, align 1
%7 = load i8*, i8** %g_graph_visited.addr, align 8
%8 = load i32, i32* %tid, align 4
%idxprom4 = sext i32 %8 to i64
%arrayidx5 = getelementptr inbounds i8, i8* %7, i64 %idxprom4
store i8 1, i8* %arrayidx5, align 1
%9 = load i8*, i8** %g_over.addr, align 8
store i8 1, i8* %9, align 1
%10 = load i8*, i8** %g_updating_graph_mask.addr, align 8
%11 = load i32, i32* %tid, align 4
%idxprom6 = sext i32 %11 to i64
%arrayidx7 = getelementptr inbounds i8, i8* %10, i64 %idxprom6
store i8 0, i8* %arrayidx7, align 1
br label %if.end
if.end: ; preds = %if.then, %land.lhs.true, %entry
ret void
}
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind readnone }
attributes #3 = { convergent nounwind }
!llvm.module.flags = !{!0, !1, !2}
!nvvm.annotations = !{!3, !4, !5, !6, !5, !7, !7, !7, !7, !8, !8, !7}
!llvm.ident = !{!9}
!nvvmir.version = !{!10}
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
!3 = !{void (%struct.Node*, i32*, i8*, i8*, i8*, i32*, i32)* @_Z6KernelP4NodePiPbS2_S2_S1_i, !"kernel", i32 1}
!4 = !{void (i8*, i8*, i8*, i8*, i32)* @_Z7Kernel2PbS_S_S_i, !"kernel", i32 1}
!5 = !{null, !"align", i32 8}
!6 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
!7 = !{null, !"align", i32 16}
!8 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
!9 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
!10 = !{i32 1, i32 4}

File diff suppressed because one or more lines are too long

213
examples/bfs/bfs.cu Normal file
View File

@ -0,0 +1,213 @@
#include <cuda.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define MAX_THREADS_PER_BLOCK 512
int no_of_nodes;
int edge_list_size;
FILE *fp;
// Per-node adjacency record. The kernel in kernel.cu walks the edge array
// over the index range [starting, starting + no_of_edges) for each node.
struct Node {
// Index of this node's first entry in the edge array.
int starting;
// Number of consecutive edge entries that belong to this node.
int no_of_edges;
};
#include "kernel.cu"
#include "kernel2.cu"
void BFSGraph(int argc, char **argv);
////////////////////////////////////////////////////////////////////////////////
// Main Program
////////////////////////////////////////////////////////////////////////////////
// Entry point: selects CUDA device 0, resets the global graph counters and
// hands the command line to BFSGraph().
int main(int argc, char **argv) {
  cudaSetDevice(0);

  edge_list_size = 0;
  no_of_nodes = 0;

  BFSGraph(argc, argv);
  return 0;
}
// Prints the expected command line to stderr, using argv[0] as the program
// name. argc is accepted for symmetry with main() but not used.
void Usage(int argc, char **argv) {
  (void)argc;
  const char *program = argv[0];
  fprintf(stderr, "Usage: %s <input_file>\n", program);
}
////////////////////////////////////////////////////////////////////////////////
// Apply BFS on a Graph using CUDA
////////////////////////////////////////////////////////////////////////////////
void BFSGraph(int argc, char **argv) {
char *input_f;
if (argc != 2) {
Usage(argc, argv);
exit(0);
}
input_f = argv[1];
printf("Reading File\n");
// Read in Graph from a file
fp = fopen(input_f, "r");
if (!fp) {
printf("Error Reading graph file\n");
return;
}
int source = 0;
fscanf(fp, "%d", &no_of_nodes);
int num_of_blocks = 1;
int num_of_threads_per_block = no_of_nodes;
// Make execution Parameters according to the number of nodes
// Distribute threads across multiple Blocks if necessary
if (no_of_nodes > MAX_THREADS_PER_BLOCK) {
num_of_blocks = (int)ceil(no_of_nodes / (double)MAX_THREADS_PER_BLOCK);
num_of_threads_per_block = MAX_THREADS_PER_BLOCK;
}
// allocate host memory
Node *h_graph_nodes = (Node *)malloc(sizeof(Node) * no_of_nodes);
bool *h_graph_mask = (bool *)malloc(sizeof(bool) * no_of_nodes);
bool *h_updating_graph_mask = (bool *)malloc(sizeof(bool) * no_of_nodes);
bool *h_graph_visited = (bool *)malloc(sizeof(bool) * no_of_nodes);
int start, edgeno;
// initalize the memory
for (unsigned int i = 0; i < no_of_nodes; i++) {
fscanf(fp, "%d %d", &start, &edgeno);
h_graph_nodes[i].starting = start;
h_graph_nodes[i].no_of_edges = edgeno;
h_graph_mask[i] = false;
h_updating_graph_mask[i] = false;
h_graph_visited[i] = false;
}
// read the source node from the file
fscanf(fp, "%d", &source);
source = 0;
// set the source node as true in the mask
h_graph_mask[source] = true;
h_graph_visited[source] = true;
fscanf(fp, "%d", &edge_list_size);
int id, cost;
int *h_graph_edges = (int *)malloc(sizeof(int) * edge_list_size);
for (int i = 0; i < edge_list_size; i++) {
fscanf(fp, "%d", &id);
fscanf(fp, "%d", &cost);
h_graph_edges[i] = id;
}
if (fp)
fclose(fp);
printf("Read File\n");
// Copy the Node list to device memory
Node *d_graph_nodes;
cudaMalloc((void **)&d_graph_nodes, sizeof(Node) * no_of_nodes);
cudaMemcpy(d_graph_nodes, h_graph_nodes, sizeof(Node) * no_of_nodes,
cudaMemcpyHostToDevice);
// Copy the Edge List to device Memory
int *d_graph_edges;
cudaMalloc((void **)&d_graph_edges, sizeof(int) * edge_list_size);
cudaMemcpy(d_graph_edges, h_graph_edges, sizeof(int) * edge_list_size,
cudaMemcpyHostToDevice);
// Copy the Mask to device memory
bool *d_graph_mask;
cudaMalloc((void **)&d_graph_mask, sizeof(bool) * no_of_nodes);
cudaMemcpy(d_graph_mask, h_graph_mask, sizeof(bool) * no_of_nodes,
cudaMemcpyHostToDevice);
bool *d_updating_graph_mask;
cudaMalloc((void **)&d_updating_graph_mask, sizeof(bool) * no_of_nodes);
cudaMemcpy(d_updating_graph_mask, h_updating_graph_mask,
sizeof(bool) * no_of_nodes, cudaMemcpyHostToDevice);
// Copy the Visited nodes array to device memory
bool *d_graph_visited;
cudaMalloc((void **)&d_graph_visited, sizeof(bool) * no_of_nodes);
cudaMemcpy(d_graph_visited, h_graph_visited, sizeof(bool) * no_of_nodes,
cudaMemcpyHostToDevice);
// allocate mem for the result on host side
int *h_cost = (int *)malloc(sizeof(int) * no_of_nodes);
for (int i = 0; i < no_of_nodes; i++)
h_cost[i] = -1;
h_cost[source] = 0;
// allocate device memory for result
int *d_cost;
cudaMalloc((void **)&d_cost, sizeof(int) * no_of_nodes);
cudaMemcpy(d_cost, h_cost, sizeof(int) * no_of_nodes, cudaMemcpyHostToDevice);
// make a bool to check if the execution is over
bool *d_over;
cudaMalloc((void **)&d_over, sizeof(bool));
printf("Copied Everything to GPU memory\n");
// setup execution parameters
dim3 grid(num_of_blocks, 1, 1);
dim3 threads(num_of_threads_per_block, 1, 1);
int k = 0;
printf("Start traversing the tree\n");
bool stop;
// Call the Kernel untill all the elements of Frontier are not false
do {
// if no thread changes this value then the loop stops
stop = false;
cudaMemcpy(d_over, &stop, sizeof(bool), cudaMemcpyHostToDevice);
Kernel<<<grid, threads, 0>>>(d_graph_nodes, d_graph_edges, d_graph_mask,
d_updating_graph_mask, d_graph_visited, d_cost,
no_of_nodes);
cudaDeviceSynchronize();
// check if kernel execution generated and error
Kernel2<<<grid, threads, 0>>>(d_graph_mask, d_updating_graph_mask,
d_graph_visited, d_over, no_of_nodes);
cudaDeviceSynchronize();
// check if kernel execution generated and error
cudaMemcpy(&stop, d_over, sizeof(bool), cudaMemcpyDeviceToHost);
k++;
} while (stop);
printf("Kernel Executed %d times\n", k);
// copy result from device to host
cudaMemcpy(h_cost, d_cost, sizeof(int) * no_of_nodes, cudaMemcpyDeviceToHost);
// Store the result into a file
FILE *fpo = fopen("result.txt", "w");
for (int i = 0; i < no_of_nodes; i++)
fprintf(fpo, "%d) cost:%d\n", i, h_cost[i]);
fclose(fpo);
printf("Result stored in result.txt\n");
// cleanup memory
free(h_graph_nodes);
free(h_graph_edges);
free(h_graph_mask);
free(h_updating_graph_mask);
free(h_graph_visited);
free(h_cost);
cudaFree(d_graph_nodes);
cudaFree(d_graph_edges);
cudaFree(d_graph_mask);
cudaFree(d_updating_graph_mask);
cudaFree(d_graph_visited);
cudaFree(d_cost);
}

23
examples/bfs/kernel.cu Normal file
View File

@ -0,0 +1,23 @@
#ifndef _KERNEL_H_
#define _KERNEL_H_
// Expansion step of the BFS: a thread whose node is flagged in g_graph_mask
// clears that flag and visits the node's edge range. Every neighbour not yet
// marked in g_graph_visited receives this node's cost plus one and is flagged
// in g_updating_graph_mask.
__global__ void
Kernel( Node* g_graph_nodes, int* g_graph_edges, bool* g_graph_mask, bool* g_updating_graph_mask, bool *g_graph_visited, int* g_cost, int no_of_nodes)
{
	const int tid = blockIdx.x*MAX_THREADS_PER_BLOCK + threadIdx.x;

	// Out-of-range threads and nodes that are not flagged have nothing to do.
	if (tid >= no_of_nodes || !g_graph_mask[tid])
		return;

	g_graph_mask[tid] = false;

	// The node's range is re-read from memory on every iteration.
	int edge = g_graph_nodes[tid].starting;
	while (edge < (g_graph_nodes[tid].no_of_edges + g_graph_nodes[tid].starting))
	{
		const int neighbour = g_graph_edges[edge];
		if (!g_graph_visited[neighbour])
		{
			g_cost[neighbour] = g_cost[tid] + 1;
			g_updating_graph_mask[neighbour] = true;
		}
		edge++;
	}
}
#endif

18
examples/bfs/kernel2.cu Normal file
View File

@ -0,0 +1,18 @@
#ifndef _KERNEL2_H_
#define _KERNEL2_H_
// Commit step of the BFS: a thread whose node is flagged in
// g_updating_graph_mask marks the node in g_graph_mask and g_graph_visited,
// sets *g_over to true, and clears the updating flag.
__global__ void
Kernel2( bool* g_graph_mask, bool *g_updating_graph_mask, bool* g_graph_visited, bool *g_over, int no_of_nodes)
{
	const int tid = blockIdx.x*MAX_THREADS_PER_BLOCK + threadIdx.x;

	// Out-of-range threads and nodes without a pending update have nothing to do.
	if (tid >= no_of_nodes || !g_updating_graph_mask[tid])
		return;

	g_graph_mask[tid] = true;
	g_graph_visited[tid] = true;
	*g_over = true;
	g_updating_graph_mask[tid] = false;
}
#endif

21
examples/bfs/run.sh Normal file
View File

@ -0,0 +1,21 @@
#!/bin/bash
set -e

kernel_ir=bfs-cuda-nvptx64-nvidia-cuda-sm_61
host_ir=bfs-host-x86_64-unknown-linux-gnu
runtime_dir=../../build/runtime

# Assemble the checked-in device/host IR and run both translators on it.
llvm-as "${kernel_ir}.ll"
llvm-as "${host_ir}.ll"
../../build/compilation/kernelTranslator "${kernel_ir}.bc" kernel.bc
../../build/compilation/hostTranslator "${host_ir}.bc" host.bc

# Lower both translated modules to position-independent object files.
for module in kernel host; do
  llc --relocation-model=pic --filetype=obj "${module}.bc"
done

# Link against the runtime libraries and run on the 65536-node graph.
g++ -Wall -L"${runtime_dir}" -L"${runtime_dir}/threadPool" \
  -o bfs.out -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread
export LD_LIBRARY_PATH=${runtime_dir}:${runtime_dir}/threadPool:$LD_LIBRARY_PATH
./bfs.out ../../rodinia-data/bfs/graph65536.txt

# The run passes when the source node's cost line is present in the result.
if ! grep -q "0) cost:0" result.txt; then
  echo "Error result"
  exit 1
fi
echo "Pass"

343
examples/btree/common.h Normal file
View File

@ -0,0 +1,343 @@
// # ifdef __cplusplus
// extern "C" {
// # endif
// #ifndef LIST_H
// # define LIST_H
//===============================================================================================================================================================================================================200
// DEFINE/INCLUDE
//===============================================================================================================================================================================================================200
//======================================================================================================================================================150
// INCLUDE (for some reason these are not recognized when defined in main
// file before this one is included)
//======================================================================================================================================================150
#include <stdbool.h> // (in path known to compiler) needed by true/false, bool
#include <stdint.h> // (in path known to compiler) needed by uint32_t
#include <stdlib.h> // (in path known to compiler) needed by malloc
//======================================================================================================================================================150
// DEFINE
//======================================================================================================================================================150
#define fp float
#define Version "1.5"
#ifdef WINDOWS
#define bool char
#define false 0
#define true 1
#endif
/* #define DEFAULT_ORDER 256 */
#ifdef RD_WG_SIZE_0_0
#define DEFAULT_ORDER RD_WG_SIZE_0_0
#elif defined(RD_WG_SIZE_0)
#define DEFAULT_ORDER RD_WG_SIZE_0
#elif defined(RD_WG_SIZE)
#define DEFAULT_ORDER RD_WG_SIZE
#else
#define DEFAULT_ORDER 256
#endif
/* #ifdef RD_WG_SIZE_1_0 */
/* #define DEFAULT_ORDER_2 RD_WG_SIZE_1_0 */
/* #elif defined(RD_WG_SIZE_1) */
/* #define DEFAULT_ORDER_2 RD_WG_SIZE_1 */
/* #elif defined(RD_WG_SIZE) */
/* #define DEFAULT_ORDER_2 RD_WG_SIZE */
/* #else */
/* #define DEFAULT_ORDER_2 256 */
/* #endif */
/* #define DEFAULT_ORDER 508 */
/* Checked allocation: shadows malloc() with a GNU statement expression.
 * The inner malloc(size) is the real function, because a macro is not
 * re-expanded inside its own replacement. On a NULL result the macro prints
 * the calling file and line to stderr and exits with status -1; otherwise it
 * evaluates to the allocated pointer.
 * The body uses fprintf and stderr, which this header does not declare
 * itself (it includes only stdbool.h, stdint.h and stdlib.h), so the
 * including file has to provide <stdio.h>. */
#define malloc(size) \
({ \
void *_tmp; \
\
if (!(_tmp = malloc(size))) { \
fprintf(stderr, "Allocation failed at %s:%d!\n", __FILE__, __LINE__); \
exit(-1); \
} \
\
_tmp; \
})
//======================================================================================================================================================150
// STRUCTURES
//======================================================================================================================================================150
// struct list_item;
typedef struct list_item list_item_t;
typedef struct list_t {
list_item_t *head, *tail;
uint32_t length;
int32_t (*compare)(const void *key, const void *with);
void (*datum_delete)(void *);
} list_t;
typedef list_item_t *list_iterator_t;
typedef list_item_t *list_reverse_iterator_t;
/* Type representing the record
* to which a given key refers.
* In a real B+ tree system, the
* record would hold data (in a database)
* or a file (in an operating system)
* or some other information.
* Users can rewrite this part of the code
* to change the type and content
* of the value field.
*/
typedef struct record {
// Payload stored for a key; the only field of the record.
int value;
} record;
/* Type representing a node in the B+ tree.
* This type is general enough to serve for both
* the leaf and the internal node.
* The heart of the node is the array
* of keys and the array of corresponding
* pointers. The relation between keys
* and pointers differs between leaves and
* internal nodes. In a leaf, the index
* of each key equals the index of its corresponding
* pointer, with a maximum of order - 1 key-pointer
* pairs. The last pointer points to the
* leaf to the right (or NULL in the case
* of the rightmost leaf).
* In an internal node, the first pointer
* refers to lower nodes with keys less than
* the smallest key in the keys array. Then,
* with indices i starting at 0, the pointer
* at i + 1 points to the subtree with keys
* greater than or equal to the key in this
* node at index i.
* The num_keys field is used to keep
* track of the number of valid keys.
* In an internal node, the number of valid
* pointers is always num_keys + 1.
* In a leaf, the number of valid pointers
* to data is always num_keys. The
* last leaf pointer points to the next leaf.
*/
typedef struct node {
// Array of pointers paired with `keys`.
void **pointers;
// Array of keys held by this node.
int *keys;
// Link to another node of the same type.
// NOTE(review): presumably the parent in the tree -- confirm in the
// insertion code.
struct node *parent;
// True when this node is a leaf rather than an internal node.
bool is_leaf;
// Number of valid entries in `keys`.
int num_keys;
struct node *next; // Used for queue.
} node;
// Fixed-size node with its index and key arrays stored inline
// (DEFAULT_ORDER + 1 entries each), so it contains no pointers.
// NOTE(review): presumably the flattened form of `node` produced by
// transform_to_cuda() -- confirm against its definition.
typedef struct knode {
int location;
int indices[DEFAULT_ORDER + 1];
int keys[DEFAULT_ORDER + 1];
bool is_leaf;
int num_keys;
} knode;
struct list_item {
struct list_item *pred, *next;
void *datum;
};
//===============================================================================================================================================================================================================200
// PROTOTYPES
//===============================================================================================================================================================================================================200
//======================================================================================================================================================150
// Other
//======================================================================================================================================================150
void list_item_init(list_item_t *li, void *datum);
void list_item_delete(list_item_t *li, void (*datum_delete)(void *datum));
void list_insert_item_tail(list_t *l, list_item_t *i);
void list_insert_item_before(list_t *l, list_item_t *next, list_item_t *i);
void list_insert_item_after(list_t *l, list_item_t *pred, list_item_t *i);
void list_insert_item_sorted(list_t *l, list_item_t *i);
//======================================================================================================================================================150
// List operations
//======================================================================================================================================================150
// List-level API (declarations only).
// NOTE(review): the behaviour notes below are inferred from names and
// signatures only - confirm against the definitions.
// list_init takes two callbacks: `compare`, which receives two const void
// pointers (key, with) and returns an int32_t, and `datum_delete`, which
// receives a datum pointer.
void list_init(list_t *l, int32_t (*compare)(const void *key, const void *with),
void (*datum_delete)(void *datum));
void list_delete(list_t *l);
void list_reset(list_t *l);
// Insertion by datum pointer `v` (as opposed to the list_insert_item_*
// functions, which take a list_item_t).
void list_insert_head(list_t *l, void *v);
void list_insert_tail(list_t *l, void *v);
void list_insert_before(list_t *l, list_item_t *next, void *v);
void list_insert_after(list_t *l, list_item_t *pred, void *v);
void list_insert_sorted(list_t *l, void *v);
void list_insert_item_head(list_t *l, list_item_t *i);
// Removal.
void list_remove_item(list_t *l, list_item_t *i);
void list_remove_head(list_t *l);
void list_remove_tail(list_t *l);
// Lookup: the *_item variants return a list_item_t pointer, the others return
// a void pointer (presumably the stored datum).
list_item_t *list_find_item(list_t *l, void *datum);
list_item_t *list_get_head_item(list_t *l);
list_item_t *list_get_tail_item(list_t *l);
void *list_find(list_t *l, void *datum);
void *list_get_head(list_t *l);
void *list_get_tail(list_t *l);
// Size queries.
uint32_t list_get_length(list_t *l);
bool list_is_empty(list_t *l);
bool list_not_empty(list_t *l);
// Traversal: `visitor` is a callback receiving a void pointer.
void list_visit_items(list_t *l, void (*visitor)(void *v));
void *list_item_get_datum(list_item_t *li);
// Forward iterator over a list.
void list_iterator_init(list_t *l, list_iterator_t *li);
void list_iterator_delete(list_iterator_t *li);
void list_iterator_next(list_iterator_t *li);
void list_iterator_prev(list_iterator_t *li);
void *list_iterator_get_datum(list_iterator_t *li);
bool list_iterator_is_valid(list_iterator_t *li);
// Reverse iterator. Note that every function here takes a list_iterator_t
// except list_reverse_iterator_is_valid, which takes a
// list_reverse_iterator_t.
// NOTE(review): confirm list_reverse_iterator_t is declared (e.g. as an alias
// of list_iterator_t) before this point.
void list_reverse_iterator_init(list_t *l, list_iterator_t *li);
void list_reverse_iterator_delete(list_iterator_t *li);
void list_reverse_iterator_next(list_iterator_t *li);
void list_reverse_iterator_prev(list_iterator_t *li);
void *list_reverse_iterator_get_datum(list_iterator_t *li);
bool list_reverse_iterator_is_valid(list_reverse_iterator_t *li);
//======================================================================================================================================================150
// Output and utility
//======================================================================================================================================================150
// Allocation, tree output and lookup helpers (declarations only).
// NOTE(review): the behaviour notes below are inferred from names and
// signatures only - confirm against the definitions.
// kmalloc: takes a size as an int and returns an untyped pointer.
void *kmalloc(int size);
long transform_to_cuda(node *n,
bool verbose); // returns actual mem used in a long
void usage_1(void);
void usage_2(void);
// enqueue / dequeue take and return node pointers without an explicit queue
// argument - presumably they operate on file-level state; verify.
void enqueue(node *new_node);
node *dequeue(void);
int height(node *root);
int path_to_root(node *root, node *child);
void print_leaves(node *root);
void print_tree(node *root);
// Key lookup: find_leaf returns a node pointer, find returns a record pointer.
node *find_leaf(node *root, int key, bool verbose);
record *find(node *root, int key, bool verbose);
int cut(int length);
//======================================================================================================================================================150
// Insertion
//======================================================================================================================================================150
// Insertion API (declarations only). Every insert_* function returns a node
// pointer.
// NOTE(review): presumably the returned node is the (possibly new) tree root,
// and `insert` is the public entry point with the rest as internal steps -
// inferred from names only; confirm against the definitions.
// Constructors for a record holding an int value and for tree nodes / leaves.
record *make_record(int value);
node *make_node(void);
node *make_leaf(void);
int get_left_index(node *parent, node *left);
node *insert_into_leaf(node *leaf, int key, record *pointer);
node *insert_into_leaf_after_splitting(node *root, node *leaf, int key,
record *pointer);
node *insert_into_node(node *root, node *parent, int left_index, int key,
node *right);
node *insert_into_node_after_splitting(node *root, node *parent, int left_index,
int key, node *right);
node *insert_into_parent(node *root, node *left, int key, node *right);
node *insert_into_new_root(node *left, int key, node *right);
node *start_new_tree(int key, record *pointer);
node *insert(node *root, int key, int value);
//======================================================================================================================================================150
// Deletion
//======================================================================================================================================================150
// Deletion API (declarations only). All functions except get_neighbor_index
// return a node pointer.
// NOTE(review): presumably the returned node is the tree root after the
// operation, and `deleteVal` is the public entry point (removal by key) -
// inferred from names only; confirm against the definitions.
int get_neighbor_index(node *n);
node *adjust_root(node *root);
node *coalesce_nodes(node *root, node *n, node *neighbor, int neighbor_index,
int k_prime);
node *redistribute_nodes(node *root, node *n, node *neighbor,
int neighbor_index, int k_prime_index, int k_prime);
// delete_entry takes the key together with an untyped pointer.
node *delete_entry(node *root, node *n, int key, void *pointer);
node *deleteVal(node *root, int key);
//===============================================================================================================================================================================================================200
// HEADER
//===============================================================================================================================================================================================================200
// int main( int argc,
// char *argv []);
//===============================================================================================================================================================================================================200
// END
//===============================================================================================================================================================================================================200
// #endif
// # ifdef __cplusplus
// }
// # endif

View File

@ -0,0 +1,54 @@
//========================================================================================================================================================================================================200
// findK function
//========================================================================================================================================================================================================200
// findK: point lookup in the flattened tree stored in knodesD.
//
// blockIdx.x selects the query: its key (keysD), its traversal cursor
// (currKnodeD), its pending child index (offsetD) and its answer slot (ansD).
// threadIdx.x selects the key slot examined inside the current node.
//
// Parameters:
//   height       number of levels to descend before checking for the key
//   knodesD      node array; each node exposes keys[] and indices[]
//   knodes_elem  exclusive upper bound for a child index to be accepted
//   recordsD     record array indexed by indices[] of the final node
//   currKnodeD   per-block index of the node being examined (in/out)
//   offsetD      per-block index of the node to visit next (in/out)
//   keysD        per-block search key
//   ansD         per-block result; .value is written only when the key is found
//
// Every thread reads keys[thid + 1], so the launch must use a block size that
// is smaller than the length of keys[].
__global__ void
findK(	long height,
		knode *knodesD,
		long knodes_elem,
		record *recordsD,
		long *currKnodeD,
		long *offsetD,
		int *keysD,
		record *ansD)
{
	// private thread IDs
	const int thid = threadIdx.x;
	const int bid = blockIdx.x;

	// the kernel never writes keysD, so the search key can be read once
	const int key = keysD[bid];

	// process tree levels (long counter: height is a long)
	for (long level = 0; level < height; level++) {

		// node examined by the whole block at this level
		const knode *node = &knodesD[currKnodeD[bid]];

		// if value is between the two keys
		if (node->keys[thid] <= key && node->keys[thid + 1] > key) {
			// The child index is taken from the same node whose keys were just
			// tested (as findRangeK does). The previous code indexed knodesD
			// with offsetD[bid], the slot this branch overwrites, which is
			// only equivalent while offsetD[bid] == currKnodeD[bid].
			//
			// The bounds check avoids a crash caused by a bug in the original
			// code: values stored in knodes->indices by the host can lie
			// outside knodes[], and following one would address knodesD out
			// of bounds at the next level.
			if (node->indices[thid] < knodes_elem) {
				offsetD[bid] = node->indices[thid];
			}
		}
		__syncthreads();

		// set for next tree level; a single thread publishes the new cursor
		if (thid == 0) {
			currKnodeD[bid] = offsetD[bid];
		}
		__syncthreads();
	}

	// At this point, we have a candidate leaf node which may contain
	// the target record. Check each key to hopefully find the record.
	if (knodesD[currKnodeD[bid]].keys[thid] == key) {
		ansD[bid].value = recordsD[knodesD[currKnodeD[bid]].indices[thid]].value;
	}
}
//========================================================================================================================================================================================================200
// End
//========================================================================================================================================================================================================200

View File

@ -0,0 +1,70 @@
//========================================================================================================================================================================================================200
// findRangeK function
//========================================================================================================================================================================================================200
// findRangeK: range lookup. For the query selected by blockIdx.x the block
// walks two cursors down the flattened tree in knodesD - currKnodeD follows
// startD[bid] and lastKnodeD follows endD[bid] - and finally reports the index
// of the starting record (RecstartD) and the record count (ReclenD).
// threadIdx.x selects the key slot examined inside each node.
__global__ void
findRangeK(	long height,
			knode *knodesD,
			long knodes_elem,
			long *currKnodeD,
			long *offsetD,
			long *lastKnodeD,
			long *offset_2D,
			int *startD,
			int *endD,
			int *RecstartD,
			int *ReclenD)
{
	// private thread IDs
	const int slot = threadIdx.x;
	const int query = blockIdx.x;

	// range bounds of this query (never written by the kernel)
	const int lo_key = startD[query];
	const int hi_key = endD[query];

	// descend one tree level per iteration, for both ends of the range
	for (int level = 0; level < height; level++) {

		// lower bound: pick the child whose key interval contains lo_key
		const knode *lo_node = &knodesD[currKnodeD[query]];
		if (lo_node->keys[slot] <= lo_key && lo_node->keys[slot + 1] > lo_key) {
			// guard inserted to avoid a crash caused by a bug in the original code:
			// values saved into knodes->indices by the host can be out of bounds
			// of the knodes array that they address
			if (lo_node->indices[slot] < knodes_elem) {
				offsetD[query] = lo_node->indices[slot];
			}
		}

		// upper bound: same test against hi_key on the second cursor
		const knode *hi_node = &knodesD[lastKnodeD[query]];
		if (hi_node->keys[slot] <= hi_key && hi_node->keys[slot + 1] > hi_key) {
			// same out-of-bounds guard as above, for offset_2D
			if (hi_node->indices[slot] < knodes_elem) {
				offset_2D[query] = hi_node->indices[slot];
			}
		}
		__syncthreads();

		// set for next tree level; thread 0 publishes both cursors
		if (slot == 0) {
			currKnodeD[query] = offsetD[query];
			lastKnodeD[query] = offset_2D[query];
		}
		__syncthreads();
	}

	// Find the index of the starting record
	if (knodesD[currKnodeD[query]].keys[slot] == lo_key) {
		RecstartD[query] = knodesD[currKnodeD[query]].indices[slot];
	}
	// RecstartD[query] is read below, possibly by a different thread
	__syncthreads();

	// Find the index of the ending record
	if (knodesD[lastKnodeD[query]].keys[slot] == hi_key) {
		ReclenD[query] = knodesD[lastKnodeD[query]].indices[slot] - RecstartD[query] + 1;
	}
}
//========================================================================================================================================================================================================200
// End
//========================================================================================================================================================================================================200

View File

@ -0,0 +1,292 @@
#ifdef __cplusplus
extern "C" {
#endif
//========================================================================================================================================================================================================200
// DEFINE/INCLUDE
//========================================================================================================================================================================================================200
//======================================================================================================================================================150
// COMMON
//======================================================================================================================================================150
#include "../common.h" // (in main program directory) needed to recognized input variables
//======================================================================================================================================================150
// UTILITIES
//======================================================================================================================================================150
#include "../util/cuda/cuda.h" // (in path specified to compiler) needed by for device functions
#include "../util/timer/timer.h" // (in path specified to compiler) needed by timer
//======================================================================================================================================================150
// KERNEL
//======================================================================================================================================================150
#include "./kernel_gpu_cuda.cu" // (in current directory) GPU kernel, cannot include with header file because of complications with passing of constant memory variables
//======================================================================================================================================================150
// HEADER
//======================================================================================================================================================150
#include "./kernel_gpu_cuda_wrapper.h" // (in current directory)
//========================================================================================================================================================================================================200
// KERNEL_GPU_CUDA_WRAPPER FUNCTION
//========================================================================================================================================================================================================200
// Host-side driver for the findK kernel.
//
// Allocates device buffers, copies the inputs to the device, launches findK
// with `count` blocks of min(order, 1024) threads, copies the answers back
// into `ans`, frees the device buffers and prints a per-stage timing report.
//
// Parameters:
//   records, records_mem  host record array and its size in bytes
//   knodes, knodes_mem    host node array and its size in bytes
//   knodes_elem           forwarded to findK as the child-index bound
//   order                 requested threads per block (capped at 1024)
//   maxheight             forwarded to findK as the number of levels
//   count                 number of queries; length of currKnode, offset,
//                         keys and ans
//   currKnode, offset     per-query traversal state copied to the device
//   keys                  per-query search keys
//   ans                   per-query results (copied in, then overwritten with
//                         the device results)
//
// Errors are reported through checkCUDAError() after each CUDA call.
void
kernel_gpu_cuda_wrapper(record *records,
						long records_mem,
						knode *knodes,
						long knodes_elem,
						long knodes_mem,
						int order,
						long maxheight,
						int count,
						long *currKnode,
						long *offset,
						int *keys,
						record *ans)
{
	//======================================================================================================================================================150
	//	CPU VARIABLES
	//======================================================================================================================================================150

	// timer samples, one per stage boundary
	// NOTE(review): get_time() presumably returns microseconds - the report
	// below divides by 1000000 and labels the result "s"; confirm in timer.h.
	long long time0;
	long long time1;
	long long time2;
	long long time3;
	long long time4;
	long long time5;
	long long time6;

	time0 = get_time();

	//======================================================================================================================================================150
	//	GPU SETUP
	//======================================================================================================================================================150

	// initial driver overhead: synchronize once before anything is timed as work
	// NOTE(review): cudaThreadSynchronize() is deprecated in the CUDA runtime in
	// favour of cudaDeviceSynchronize(); kept here because only this name is
	// known to be available to this file.
	cudaThreadSynchronize();

	// execution parameters
	int numBlocks;
	numBlocks = count;									// max # of blocks can be 65,535
	int threadsPerBlock;
	threadsPerBlock = order < 1024 ? order : 1024;

	printf("# of blocks = %d, # of threads/block = %d (ensure that device can handle)\n", numBlocks, threadsPerBlock);

	time1 = get_time();

	//======================================================================================================================================================150
	//	GPU MEMORY (MALLOC)
	//======================================================================================================================================================150

	// device in: recordsD
	record *recordsD;
	cudaMalloc((void**)&recordsD, records_mem);
	checkCUDAError("cudaMalloc recordsD");

	// device in: knodesD (the label previously said "recordsD")
	knode *knodesD;
	cudaMalloc((void**)&knodesD, knodes_mem);
	checkCUDAError("cudaMalloc knodesD");

	// device in: currKnodeD
	long *currKnodeD;
	cudaMalloc((void**)&currKnodeD, count*sizeof(long));
	checkCUDAError("cudaMalloc currKnodeD");

	// device in: offsetD
	long *offsetD;
	cudaMalloc((void**)&offsetD, count*sizeof(long));
	checkCUDAError("cudaMalloc offsetD");

	// device in: keysD
	int *keysD;
	cudaMalloc((void**)&keysD, count*sizeof(int));
	checkCUDAError("cudaMalloc keysD");

	// device in/out: ansD
	record *ansD;
	cudaMalloc((void**)&ansD, count*sizeof(record));
	checkCUDAError("cudaMalloc ansD");

	time2 = get_time();

	//======================================================================================================================================================150
	//	GPU MEMORY COPY (IN)
	//======================================================================================================================================================150

	// recordsD (the label previously was the ambiguous "memD")
	cudaMemcpy(recordsD, records, records_mem, cudaMemcpyHostToDevice);
	checkCUDAError("cudaMemcpy recordsD");

	// knodesD (the label previously was the ambiguous "memD")
	cudaMemcpy(knodesD, knodes, knodes_mem, cudaMemcpyHostToDevice);
	checkCUDAError("cudaMemcpy knodesD");

	// currKnodeD
	cudaMemcpy(currKnodeD, currKnode, count*sizeof(long), cudaMemcpyHostToDevice);
	checkCUDAError("cudaMalloc cudaMemcpy currKnodeD");

	// offsetD
	cudaMemcpy(offsetD, offset, count*sizeof(long), cudaMemcpyHostToDevice);
	checkCUDAError("cudaMalloc cudaMemcpy offsetD");

	// keysD
	cudaMemcpy(keysD, keys, count*sizeof(int), cudaMemcpyHostToDevice);
	checkCUDAError("cudaMalloc cudaMemcpy keysD");

	// ansD is in/out: findK only writes entries whose key is found, so the
	// host's initial contents are uploaded first
	cudaMemcpy(ansD, ans, count*sizeof(record), cudaMemcpyHostToDevice);
	checkCUDAError("cudaMalloc cudaMemcpy ansD");

	time3 = get_time();

	//======================================================================================================================================================150
	//	findK kernel
	//======================================================================================================================================================150

	findK<<<numBlocks, threadsPerBlock>>>(	maxheight,
											knodesD,
											knodes_elem,
											recordsD,
											currKnodeD,
											offsetD,
											keysD,
											ansD);
	// wait for the kernel so that its run time is attributed to this stage
	cudaThreadSynchronize();
	checkCUDAError("findK");

	time4 = get_time();

	//======================================================================================================================================================150
	//	GPU MEMORY COPY (OUT)
	//======================================================================================================================================================150

	// ansD
	cudaMemcpy(ans, ansD, count*sizeof(record), cudaMemcpyDeviceToHost);
	checkCUDAError("cudaMemcpy ansD");

	time5 = get_time();

	//======================================================================================================================================================150
	//	GPU MEMORY DEALLOCATION
	//======================================================================================================================================================150

	cudaFree(recordsD);
	cudaFree(knodesD);
	cudaFree(currKnodeD);
	cudaFree(offsetD);
	cudaFree(keysD);
	cudaFree(ansD);

	time6 = get_time();

	//======================================================================================================================================================150
	//	DISPLAY TIMING
	//======================================================================================================================================================150

	// total elapsed time, the denominator of every percentage below
	const float total = (float) (time6-time0);

	// A literal percent sign must be written "%%"; the former "% :" was an
	// invalid conversion specification (undefined behaviour in printf).
	printf("Time spent in different stages of GPU_CUDA KERNEL:\n");

	printf("%15.12f s, %15.12f %% : GPU: SET DEVICE / DRIVER INIT\n",	(float) (time1-time0) / 1000000, (float) (time1-time0) / total * 100);
	printf("%15.12f s, %15.12f %% : GPU MEM: ALO\n", 					(float) (time2-time1) / 1000000, (float) (time2-time1) / total * 100);
	printf("%15.12f s, %15.12f %% : GPU MEM: COPY IN\n",					(float) (time3-time2) / 1000000, (float) (time3-time2) / total * 100);

	printf("%15.12f s, %15.12f %% : GPU: KERNEL\n",						(float) (time4-time3) / 1000000, (float) (time4-time3) / total * 100);

	printf("%15.12f s, %15.12f %% : GPU MEM: COPY OUT\n",				(float) (time5-time4) / 1000000, (float) (time5-time4) / total * 100);
	printf("%15.12f s, %15.12f %% : GPU MEM: FRE\n", 					(float) (time6-time5) / 1000000, (float) (time6-time5) / total * 100);

	printf("Total time:\n");
	printf("%.12f s\n", 												(float) (time6-time0) / 1000000);

	//========================================================================================================================================================================================================200
	//	End
	//========================================================================================================================================================================================================200
}
//========================================================================================================================================================================================================200
// END
//========================================================================================================================================================================================================200
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,23 @@
#ifdef __cplusplus
extern "C" {
#endif
//========================================================================================================================================================================================================200
// KERNEL_GPU_CUDA_WRAPPER HEADER
//========================================================================================================================================================================================================200
// Runs the findK lookup kernel on the GPU for `count` queries and prints a
// per-stage timing report.
//   records, records_mem  host record array and its size in bytes
//   knodes, knodes_mem    host node array and its size in bytes
//   knodes_elem           bound passed to the kernel for validating child indices
//   order                 threads per block (the wrapper caps it at 1024)
//   maxheight             number of tree levels the kernel descends
//   count                 number of blocks launched; also the element count of
//                         currKnode, offset, keys and ans
//   currKnode, offset     per-query traversal state copied to the device
//   keys                  per-query search keys
//   ans                   per-query results; copied to the device before the
//                         launch and overwritten with the device copy afterwards
void kernel_gpu_cuda_wrapper(record *records, long records_mem, knode *knodes,
long knodes_elem, long knodes_mem,
int order, long maxheight, int count,
long *currKnode, long *offset, int *keys,
record *ans);
//========================================================================================================================================================================================================200
// End
//========================================================================================================================================================================================================200
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,347 @@
#ifdef __cplusplus
extern "C" {
#endif
//========================================================================================================================================================================================================200
// INCLUDE
//========================================================================================================================================================================================================200
//======================================================================================================================================================150
// COMMON
//======================================================================================================================================================150
#include "../common.h" // (in the main program folder) needed to recognized input parameters
//======================================================================================================================================================150
// UTILITIES
//======================================================================================================================================================150
#include "../util/cuda/cuda.h" // (in library path specified to compiler) needed by for device functions
#include "../util/timer/timer.h" // (in library path specified to compiler) needed by timer
//======================================================================================================================================================150
// KERNEL
//======================================================================================================================================================150
#include "./kernel_gpu_cuda_2.cu" // (in the current directory) GPU kernel, cannot include with header file because of complications with passing of constant memory variables
//======================================================================================================================================================150
// HEADER
//======================================================================================================================================================150
#include "./kernel_gpu_cuda_wrapper_2.h" // (in the current directory)
//========================================================================================================================================================================================================200
// FUNCTION
//========================================================================================================================================================================================================200
void
kernel_gpu_cuda_wrapper_2( knode *knodes,
long knodes_elem,
long knodes_mem,
int order,
long maxheight,
int count,
long *currKnode,
long *offset,
long *lastKnode,
long *offset_2,
int *start,
int *end,
int *recstart,
int *reclength)
{
//======================================================================================================================================================150
// CPU VARIABLES
//======================================================================================================================================================150
// timer
long long time0;
long long time1;
long long time2;
long long time3;
long long time4;
long long time5;
long long time6;
time0 = get_time();
//======================================================================================================================================================150
// GPU SETUP
//======================================================================================================================================================150
//====================================================================================================100
// INITIAL DRIVER OVERHEAD
//====================================================================================================100
cudaThreadSynchronize();
//====================================================================================================100
// EXECUTION PARAMETERS
//====================================================================================================100
int numBlocks;
numBlocks = count;
int threadsPerBlock;
threadsPerBlock = order < 1024 ? order : 1024;
printf("# of blocks = %d, # of threads/block = %d (ensure that device can handle)\n", numBlocks, threadsPerBlock);
time1 = get_time();
//======================================================================================================================================================150
// GPU MEMORY MALLOC
//======================================================================================================================================================150
//====================================================================================================100
// DEVICE IN
//====================================================================================================100
//==================================================50
// knodesD
//==================================================50
knode *knodesD;
cudaMalloc((void**)&knodesD, knodes_mem);
checkCUDAError("cudaMalloc recordsD");
//==================================================50
// currKnodeD
//==================================================50
long *currKnodeD;
cudaMalloc((void**)&currKnodeD, count*sizeof(long));
checkCUDAError("cudaMalloc currKnodeD");
//==================================================50
// offsetD
//==================================================50
long *offsetD;
cudaMalloc((void**)&offsetD, count*sizeof(long));
checkCUDAError("cudaMalloc offsetD");
//==================================================50
// lastKnodeD
//==================================================50
long *lastKnodeD;
cudaMalloc((void**)&lastKnodeD, count*sizeof(long));
checkCUDAError("cudaMalloc lastKnodeD");
//==================================================50
// offset_2D
//==================================================50
long *offset_2D;
cudaMalloc((void**)&offset_2D, count*sizeof(long));
checkCUDAError("cudaMalloc offset_2D");
//==================================================50
// startD
//==================================================50
int *startD;
cudaMalloc((void**)&startD, count*sizeof(int));
checkCUDAError("cudaMalloc startD");
//==================================================50
// endD
//==================================================50
int *endD;
cudaMalloc((void**)&endD, count*sizeof(int));
checkCUDAError("cudaMalloc endD");
//====================================================================================================100
// DEVICE IN/OUT
//====================================================================================================100
//==================================================50
// ansDStart
//==================================================50
int *ansDStart;
cudaMalloc((void**)&ansDStart, count*sizeof(int));
checkCUDAError("cudaMalloc ansDStart");
//==================================================50
// ansDLength
//==================================================50
int *ansDLength;
cudaMalloc((void**)&ansDLength, count*sizeof(int));
checkCUDAError("cudaMalloc ansDLength");
time2 = get_time();
//======================================================================================================================================================150
// GPU MEMORY COPY
//======================================================================================================================================================150
//====================================================================================================100
// DEVICE IN
//====================================================================================================100
//==================================================50
// knodesD
//==================================================50
cudaMemcpy(knodesD, knodes, knodes_mem, cudaMemcpyHostToDevice);
checkCUDAError("cudaMalloc cudaMemcpy memD");
//==================================================50
// currKnodeD
//==================================================50
cudaMemcpy(currKnodeD, currKnode, count*sizeof(long), cudaMemcpyHostToDevice);
checkCUDAError("cudaMalloc cudaMemcpy currKnodeD");
//==================================================50
// offsetD
//==================================================50
cudaMemcpy(offsetD, offset, count*sizeof(long), cudaMemcpyHostToDevice);
checkCUDAError("cudaMalloc cudaMemcpy offsetD");
//==================================================50
// lastKnodeD
//==================================================50
cudaMemcpy(lastKnodeD, lastKnode, count*sizeof(long), cudaMemcpyHostToDevice);
checkCUDAError("cudaMalloc cudaMemcpy lastKnodeD");
//==================================================50
// offset_2D
//==================================================50
cudaMemcpy(offset_2D, offset_2, count*sizeof(long), cudaMemcpyHostToDevice);
checkCUDAError("cudaMalloc cudaMemcpy offset_2D");
//==================================================50
// startD
//==================================================50
cudaMemcpy(startD, start, count*sizeof(int), cudaMemcpyHostToDevice);
checkCUDAError("cudaMemcpy startD");
//==================================================50
// endD
//==================================================50
cudaMemcpy(endD, end, count*sizeof(int), cudaMemcpyHostToDevice);
checkCUDAError("cudaMemcpy endD");
//====================================================================================================100
// DEVICE IN/OUT
//====================================================================================================100
//==================================================50
// ansDStart
//==================================================50
cudaMemcpy(ansDStart, recstart, count*sizeof(int), cudaMemcpyHostToDevice);
checkCUDAError("cudaMemcpy ansDStart");
//==================================================50
// ansDLength
//==================================================50
cudaMemcpy(ansDLength, reclength, count*sizeof(int), cudaMemcpyHostToDevice);
checkCUDAError("cudaMemcpy ansDLength");
time3 = get_time();
//======================================================================================================================================================150
// KERNEL
//======================================================================================================================================================150
// [GPU] findRangeK kernel
findRangeK<<<numBlocks, threadsPerBlock>>>( maxheight,
knodesD,
knodes_elem,
currKnodeD,
offsetD,
lastKnodeD,
offset_2D,
startD,
endD,
ansDStart,
ansDLength);
cudaThreadSynchronize();
checkCUDAError("findRangeK");
time4 = get_time();
//======================================================================================================================================================150
// GPU MEMORY COPY (CONTD.)
//======================================================================================================================================================150
//====================================================================================================100
// DEVICE IN/OUT
//====================================================================================================100
//==================================================50
// ansDStart
//==================================================50
cudaMemcpy(recstart, ansDStart, count*sizeof(int), cudaMemcpyDeviceToHost);
checkCUDAError("cudaMemcpy ansDStart");
//==================================================50
// ansDLength
//==================================================50
cudaMemcpy(reclength, ansDLength, count*sizeof(int), cudaMemcpyDeviceToHost);
checkCUDAError("cudaMemcpy ansDLength");
time5 = get_time();
//======================================================================================================================================================150
// GPU MEMORY DEALLOCATION
//======================================================================================================================================================150
cudaFree(knodesD);
cudaFree(currKnodeD);
cudaFree(offsetD);
cudaFree(lastKnodeD);
cudaFree(offset_2D);
cudaFree(startD);
cudaFree(endD);
cudaFree(ansDStart);
cudaFree(ansDLength);
time6 = get_time();
//======================================================================================================================================================150
// DISPLAY TIMING
//======================================================================================================================================================150
printf("Time spent in different stages of GPU_CUDA KERNEL:\n");
printf("%15.12f s, %15.12f % : GPU: SET DEVICE / DRIVER INIT\n", (float) (time1-time0) / 1000000, (float) (time1-time0) / (float) (time6-time0) * 100);
printf("%15.12f s, %15.12f % : GPU MEM: ALO\n", (float) (time2-time1) / 1000000, (float) (time2-time1) / (float) (time6-time0) * 100);
printf("%15.12f s, %15.12f % : GPU MEM: COPY IN\n", (float) (time3-time2) / 1000000, (float) (time3-time2) / (float) (time6-time0) * 100);
printf("%15.12f s, %15.12f % : GPU: KERNEL\n", (float) (time4-time3) / 1000000, (float) (time4-time3) / (float) (time6-time0) * 100);
printf("%15.12f s, %15.12f % : GPU MEM: COPY OUT\n", (float) (time5-time4) / 1000000, (float) (time5-time4) / (float) (time6-time0) * 100);
printf("%15.12f s, %15.12f % : GPU MEM: FRE\n", (float) (time6-time5) / 1000000, (float) (time6-time5) / (float) (time6-time0) * 100);
printf("Total time:\n");
printf("%.12f s\n", (float) (time6-time0) / 1000000);
}
//========================================================================================================================================================================================================200
// END
//========================================================================================================================================================================================================200
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,23 @@
#ifdef __cplusplus
extern "C" {
#endif
//========================================================================================================================================================================================================200
// KERNEL_GPU_CUDA_WRAPPER HEADER
//========================================================================================================================================================================================================200
// Host-side wrapper around the findRangeK GPU kernel: allocates device buffers,
// copies the inputs in, launches the kernel, copies the results back, frees the
// buffers and prints per-stage timings.
// NOTE(review): the parameter roles below are read off the wrapper body in
// kernel_gpu_cuda_wrapper_2.cu -- confirm against the definition.
//   knodes, knodes_mem : host node array; knodes_mem is the byte count copied to the device.
//   knodes_elem        : forwarded unchanged to the kernel.
//   order              : use not visible in the wrapper body reviewed -- TODO confirm.
//   maxheight          : forwarded to the kernel as its first argument.
//   count              : element count of every per-query array below
//                        (each copy is count * sizeof(element)).
//   currKnode, offset, lastKnode, offset_2 : count longs each, copied host -> device.
//   start, end         : count ints each, copied host -> device.
//   recstart, reclength: count ints each, copied to the device before the launch and
//                        copied back to the host after it (in/out).
void kernel_gpu_cuda_wrapper_2(knode *knodes, long knodes_elem, long knodes_mem,
int order, long maxheight, int count,
long *currKnode, long *offset, long *lastKnode,
long *offset_2, int *start, int *end,
int *recstart, int *reclength);
//========================================================================================================================================================================================================200
// End
//========================================================================================================================================================================================================200
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,332 @@
; ModuleID = 'kernel_gpu_cuda_wrapper-cuda-nvptx64-nvidia-cuda-sm_61.bc'
; Device-side LLVM IR emitted by clang for kernel/kernel_gpu_cuda_wrapper.cu,
; targeting 64-bit NVPTX.
source_filename = "kernel/kernel_gpu_cuda_wrapper.cu"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
; Placeholder types for the CUDA threadIdx / blockIdx builtin objects.
%struct.__cuda_builtin_threadIdx_t = type { i8 }
%struct.__cuda_builtin_blockIdx_t = type { i8 }
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
; Tree node layout. The kernel in this module addresses field 1 as "indices"
; and field 2 as "keys" (both 257 x i32); the other fields are not accessed here.
%struct.knode = type { i32, [257 x i32], [257 x i32], i8, i32 }
; Record layout: a single i32, addressed as "value" by the kernel in this module.
%struct.record = type { i32 }
; COMDAT groups for the two linkonce_odr builtin-fetch helpers defined in this module.
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
; extern_weak symbols for the builtin objects; no definition in this module reads them directly.
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
; Weak device-side definition of cudaMalloc emitted into this NVPTX module.
; It only spills both arguments to stack slots and then unconditionally
; returns the constant 999; nothing is allocated here.
; NOTE(review): 999 presumably is cudaErrorUnknown from the CUDA headers -- confirm.
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
entry:
%p.addr = alloca i8**, align 8
%s.addr = alloca i64, align 8
store i8** %p, i8*** %p.addr, align 8
store i64 %s, i64* %s.addr, align 8
ret i32 999
}
; Weak device-side definition of cudaFuncGetAttributes: stores both arguments
; to stack slots, never writes through %p, and always returns 999.
; NOTE(review): 999 presumably is cudaErrorUnknown from the CUDA headers -- confirm.
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
entry:
%p.addr = alloca %struct.cudaFuncAttributes*, align 8
%c.addr = alloca i8*, align 8
store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
store i8* %c, i8** %c.addr, align 8
ret i32 999
}
; Weak device-side definition of cudaDeviceGetAttribute: stores its three
; arguments to stack slots, never writes through %value, and always returns 999.
; NOTE(review): 999 presumably is cudaErrorUnknown from the CUDA headers -- confirm.
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
entry:
%value.addr = alloca i32*, align 8
%attr.addr = alloca i32, align 4
%device.addr = alloca i32, align 4
store i32* %value, i32** %value.addr, align 8
store i32 %attr, i32* %attr.addr, align 4
store i32 %device, i32* %device.addr, align 4
ret i32 999
}
; Weak device-side definition of cudaGetDevice: stores the pointer argument to
; a stack slot, never writes through it, and always returns 999.
; NOTE(review): 999 presumably is cudaErrorUnknown from the CUDA headers -- confirm.
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
entry:
%device.addr = alloca i32*, align 8
store i32* %device, i32** %device.addr, align 8
ret i32 999
}
; Weak device-side definition of cudaOccupancyMaxActiveBlocksPerMultiprocessor:
; stores its four arguments to stack slots, never writes through %numBlocks,
; and always returns 999.
; NOTE(review): 999 presumably is cudaErrorUnknown from the CUDA headers -- confirm.
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
entry:
%numBlocks.addr = alloca i32*, align 8
%func.addr = alloca i8*, align 8
%blockSize.addr = alloca i32, align 4
%dynamicSmemSize.addr = alloca i64, align 8
store i32* %numBlocks, i32** %numBlocks.addr, align 8
store i8* %func, i8** %func.addr, align 8
store i32 %blockSize, i32* %blockSize.addr, align 4
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
ret i32 999
}
; Weak device-side definition of
; cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags: stores its five
; arguments to stack slots, never writes through %numBlocks, and always returns 999.
; NOTE(review): 999 presumably is cudaErrorUnknown from the CUDA headers -- confirm.
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
entry:
%numBlocks.addr = alloca i32*, align 8
%func.addr = alloca i8*, align 8
%blockSize.addr = alloca i32, align 4
%dynamicSmemSize.addr = alloca i64, align 8
%flags.addr = alloca i32, align 4
store i32* %numBlocks, i32** %numBlocks.addr, align 8
store i8* %func, i8** %func.addr, align 8
store i32 %blockSize, i32* %blockSize.addr, align 4
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
store i32 %flags, i32* %flags.addr, align 4
ret i32 999
}
; findK: GPU kernel (tagged "kernel" in !nvvm.annotations).
; With thid = threadIdx.x and bid = blockIdx.x, the body does, in C-like terms:
;   for (i = 0; i < height; i++) {
;     if (knodesD[currKnodeD[bid]].keys[thid]     <= keysD[bid] &&
;         knodesD[currKnodeD[bid]].keys[thid + 1] >  keysD[bid]) {
;       if (knodesD[offsetD[bid]].indices[thid] < knodes_elem)
;         offsetD[bid] = knodesD[offsetD[bid]].indices[thid];
;     }
;     barrier();
;     if (thid == 0) currKnodeD[bid] = offsetD[bid];
;     barrier();
;   }
;   if (knodesD[currKnodeD[bid]].keys[thid] == keysD[bid])
;     ansD[bid].value = recordsD[knodesD[currKnodeD[bid]].indices[thid]].value;
; The code is unoptimized (optnone): every parameter and local lives in an
; alloca and is reloaded before each use.
; Function Attrs: convergent noinline nounwind optnone
define dso_local void @findK(i64 %height, %struct.knode* %knodesD, i64 %knodes_elem, %struct.record* %recordsD, i64* %currKnodeD, i64* %offsetD, i32* %keysD, %struct.record* %ansD) #0 {
entry:
; Stack slots for the eight parameters and the locals thid, bid and i.
%height.addr = alloca i64, align 8
%knodesD.addr = alloca %struct.knode*, align 8
%knodes_elem.addr = alloca i64, align 8
%recordsD.addr = alloca %struct.record*, align 8
%currKnodeD.addr = alloca i64*, align 8
%offsetD.addr = alloca i64*, align 8
%keysD.addr = alloca i32*, align 8
%ansD.addr = alloca %struct.record*, align 8
%thid = alloca i32, align 4
%bid = alloca i32, align 4
%i = alloca i32, align 4
store i64 %height, i64* %height.addr, align 8
store %struct.knode* %knodesD, %struct.knode** %knodesD.addr, align 8
store i64 %knodes_elem, i64* %knodes_elem.addr, align 8
store %struct.record* %recordsD, %struct.record** %recordsD.addr, align 8
store i64* %currKnodeD, i64** %currKnodeD.addr, align 8
store i64* %offsetD, i64** %offsetD.addr, align 8
store i32* %keysD, i32** %keysD.addr, align 8
store %struct.record* %ansD, %struct.record** %ansD.addr, align 8
; thid = threadIdx.x, bid = blockIdx.x, i = 0
%call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
store i32 %call, i32* %thid, align 4
%call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
store i32 %call1, i32* %bid, align 4
store i32 0, i32* %i, align 4
br label %for.cond
for.cond: ; preds = %for.inc, %entry
; Loop test: (i64)i < height, signed compare.
%0 = load i32, i32* %i, align 4
%conv = sext i32 %0 to i64
%1 = load i64, i64* %height.addr, align 8
%cmp = icmp slt i64 %conv, %1
br i1 %cmp, label %for.body, label %for.end
for.body: ; preds = %for.cond
; First half of the range test: knodesD[currKnodeD[bid]].keys[thid] <= keysD[bid]
%2 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
%3 = load i64*, i64** %currKnodeD.addr, align 8
%4 = load i32, i32* %bid, align 4
%idxprom = sext i32 %4 to i64
%arrayidx = getelementptr inbounds i64, i64* %3, i64 %idxprom
%5 = load i64, i64* %arrayidx, align 8
%arrayidx2 = getelementptr inbounds %struct.knode, %struct.knode* %2, i64 %5
%keys = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx2, i32 0, i32 2
%6 = load i32, i32* %thid, align 4
%idxprom3 = sext i32 %6 to i64
%arrayidx4 = getelementptr inbounds [257 x i32], [257 x i32]* %keys, i64 0, i64 %idxprom3
%7 = load i32, i32* %arrayidx4, align 4
%8 = load i32*, i32** %keysD.addr, align 8
%9 = load i32, i32* %bid, align 4
%idxprom5 = sext i32 %9 to i64
%arrayidx6 = getelementptr inbounds i32, i32* %8, i64 %idxprom5
%10 = load i32, i32* %arrayidx6, align 4
%cmp7 = icmp sle i32 %7, %10
br i1 %cmp7, label %land.lhs.true, label %if.end34
land.lhs.true: ; preds = %for.body
; Second half of the range test: knodesD[currKnodeD[bid]].keys[thid + 1] > keysD[bid]
%11 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
%12 = load i64*, i64** %currKnodeD.addr, align 8
%13 = load i32, i32* %bid, align 4
%idxprom8 = sext i32 %13 to i64
%arrayidx9 = getelementptr inbounds i64, i64* %12, i64 %idxprom8
%14 = load i64, i64* %arrayidx9, align 8
%arrayidx10 = getelementptr inbounds %struct.knode, %struct.knode* %11, i64 %14
%keys11 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx10, i32 0, i32 2
%15 = load i32, i32* %thid, align 4
%add = add nsw i32 %15, 1
%idxprom12 = sext i32 %add to i64
%arrayidx13 = getelementptr inbounds [257 x i32], [257 x i32]* %keys11, i64 0, i64 %idxprom12
%16 = load i32, i32* %arrayidx13, align 4
%17 = load i32*, i32** %keysD.addr, align 8
%18 = load i32, i32* %bid, align 4
%idxprom14 = sext i32 %18 to i64
%arrayidx15 = getelementptr inbounds i32, i32* %17, i64 %idxprom14
%19 = load i32, i32* %arrayidx15, align 4
%cmp16 = icmp sgt i32 %16, %19
br i1 %cmp16, label %if.then, label %if.end34
if.then: ; preds = %land.lhs.true
; Bounds guard: knodesD[offsetD[bid]].indices[thid] < knodes_elem.
; Note the node is selected through offsetD here, not currKnodeD.
%20 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
%21 = load i64*, i64** %offsetD.addr, align 8
%22 = load i32, i32* %bid, align 4
%idxprom17 = sext i32 %22 to i64
%arrayidx18 = getelementptr inbounds i64, i64* %21, i64 %idxprom17
%23 = load i64, i64* %arrayidx18, align 8
%arrayidx19 = getelementptr inbounds %struct.knode, %struct.knode* %20, i64 %23
%indices = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx19, i32 0, i32 1
%24 = load i32, i32* %thid, align 4
%idxprom20 = sext i32 %24 to i64
%arrayidx21 = getelementptr inbounds [257 x i32], [257 x i32]* %indices, i64 0, i64 %idxprom20
%25 = load i32, i32* %arrayidx21, align 4
%conv22 = sext i32 %25 to i64
%26 = load i64, i64* %knodes_elem.addr, align 8
%cmp23 = icmp slt i64 %conv22, %26
br i1 %cmp23, label %if.then24, label %if.end
if.then24: ; preds = %if.then
; offsetD[bid] = (i64) knodesD[offsetD[bid]].indices[thid]
%27 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
%28 = load i64*, i64** %offsetD.addr, align 8
%29 = load i32, i32* %bid, align 4
%idxprom25 = sext i32 %29 to i64
%arrayidx26 = getelementptr inbounds i64, i64* %28, i64 %idxprom25
%30 = load i64, i64* %arrayidx26, align 8
%arrayidx27 = getelementptr inbounds %struct.knode, %struct.knode* %27, i64 %30
%indices28 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx27, i32 0, i32 1
%31 = load i32, i32* %thid, align 4
%idxprom29 = sext i32 %31 to i64
%arrayidx30 = getelementptr inbounds [257 x i32], [257 x i32]* %indices28, i64 0, i64 %idxprom29
%32 = load i32, i32* %arrayidx30, align 4
%conv31 = sext i32 %32 to i64
%33 = load i64*, i64** %offsetD.addr, align 8
%34 = load i32, i32* %bid, align 4
%idxprom32 = sext i32 %34 to i64
%arrayidx33 = getelementptr inbounds i64, i64* %33, i64 %idxprom32
store i64 %conv31, i64* %arrayidx33, align 8
br label %if.end
if.end: ; preds = %if.then24, %if.then
br label %if.end34
if.end34: ; preds = %if.end, %land.lhs.true, %for.body
; First barrier of the iteration, reached on every path through the loop body;
; afterwards only the thread with thid == 0 takes the update branch.
call void @llvm.nvvm.barrier0()
%35 = load i32, i32* %thid, align 4
%cmp35 = icmp eq i32 %35, 0
br i1 %cmp35, label %if.then36, label %if.end41
if.then36: ; preds = %if.end34
; currKnodeD[bid] = offsetD[bid]
%36 = load i64*, i64** %offsetD.addr, align 8
%37 = load i32, i32* %bid, align 4
%idxprom37 = sext i32 %37 to i64
%arrayidx38 = getelementptr inbounds i64, i64* %36, i64 %idxprom37
%38 = load i64, i64* %arrayidx38, align 8
%39 = load i64*, i64** %currKnodeD.addr, align 8
%40 = load i32, i32* %bid, align 4
%idxprom39 = sext i32 %40 to i64
%arrayidx40 = getelementptr inbounds i64, i64* %39, i64 %idxprom39
store i64 %38, i64* %arrayidx40, align 8
br label %if.end41
if.end41: ; preds = %if.then36, %if.end34
; Second barrier of the iteration, placed after the currKnodeD update.
call void @llvm.nvvm.barrier0()
br label %for.inc
for.inc: ; preds = %if.end41
; i++
%41 = load i32, i32* %i, align 4
%inc = add nsw i32 %41, 1
store i32 %inc, i32* %i, align 4
br label %for.cond
for.end: ; preds = %for.cond
; After the loop: exact-match test knodesD[currKnodeD[bid]].keys[thid] == keysD[bid]
%42 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
%43 = load i64*, i64** %currKnodeD.addr, align 8
%44 = load i32, i32* %bid, align 4
%idxprom42 = sext i32 %44 to i64
%arrayidx43 = getelementptr inbounds i64, i64* %43, i64 %idxprom42
%45 = load i64, i64* %arrayidx43, align 8
%arrayidx44 = getelementptr inbounds %struct.knode, %struct.knode* %42, i64 %45
%keys45 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx44, i32 0, i32 2
%46 = load i32, i32* %thid, align 4
%idxprom46 = sext i32 %46 to i64
%arrayidx47 = getelementptr inbounds [257 x i32], [257 x i32]* %keys45, i64 0, i64 %idxprom46
%47 = load i32, i32* %arrayidx47, align 4
%48 = load i32*, i32** %keysD.addr, align 8
%49 = load i32, i32* %bid, align 4
%idxprom48 = sext i32 %49 to i64
%arrayidx49 = getelementptr inbounds i32, i32* %48, i64 %idxprom48
%50 = load i32, i32* %arrayidx49, align 4
%cmp50 = icmp eq i32 %47, %50
br i1 %cmp50, label %if.then51, label %if.end63
if.then51: ; preds = %for.end
; ansD[bid].value = recordsD[knodesD[currKnodeD[bid]].indices[thid]].value
%51 = load %struct.record*, %struct.record** %recordsD.addr, align 8
%52 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
%53 = load i64*, i64** %currKnodeD.addr, align 8
%54 = load i32, i32* %bid, align 4
%idxprom52 = sext i32 %54 to i64
%arrayidx53 = getelementptr inbounds i64, i64* %53, i64 %idxprom52
%55 = load i64, i64* %arrayidx53, align 8
%arrayidx54 = getelementptr inbounds %struct.knode, %struct.knode* %52, i64 %55
%indices55 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx54, i32 0, i32 1
%56 = load i32, i32* %thid, align 4
%idxprom56 = sext i32 %56 to i64
%arrayidx57 = getelementptr inbounds [257 x i32], [257 x i32]* %indices55, i64 0, i64 %idxprom56
%57 = load i32, i32* %arrayidx57, align 4
%idxprom58 = sext i32 %57 to i64
%arrayidx59 = getelementptr inbounds %struct.record, %struct.record* %51, i64 %idxprom58
%value = getelementptr inbounds %struct.record, %struct.record* %arrayidx59, i32 0, i32 0
%58 = load i32, i32* %value, align 4
%59 = load %struct.record*, %struct.record** %ansD.addr, align 8
%60 = load i32, i32* %bid, align 4
%idxprom60 = sext i32 %60 to i64
%arrayidx61 = getelementptr inbounds %struct.record, %struct.record* %59, i64 %idxprom60
%value62 = getelementptr inbounds %struct.record, %struct.record* %arrayidx61, i32 0, i32 0
store i32 %58, i32* %value62, align 4
br label %if.end63
if.end63: ; preds = %if.then51, %for.end
ret void
}
; Accessor behind threadIdx.x: returns the value of the
; llvm.nvvm.read.ptx.sreg.tid.x intrinsic unchanged.
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
ret i32 %0
}
; Accessor behind blockIdx.x: returns the value of the
; llvm.nvvm.read.ptx.sreg.ctaid.x intrinsic unchanged.
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
ret i32 %0
}
; Intrinsics called by the definitions in this module: the barrier used inside
; findK, and the thread-index / block-index register reads used by the two
; builtin-fetch helpers.
; Function Attrs: convergent nounwind
declare void @llvm.nvvm.barrier0() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #3
; Attribute groups: #0 = unoptimized (optnone/noinline) definitions, #1 = the
; alwaysinline builtin-fetch helpers, #2 = barrier declaration and the helper
; call sites, #3 = the side-effect-free register reads. #0 and #1 target sm_61.
attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { convergent nounwind }
attributes #3 = { nounwind readnone }
; Module-level metadata. !3 is the annotation that marks @findK as a kernel;
; !4 through !7 are "align" annotations whose target operand is null.
!llvm.module.flags = !{!0, !1, !2}
!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
!llvm.ident = !{!8}
!nvvmir.version = !{!9}
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
!3 = !{void (i64, %struct.knode*, i64, %struct.record*, i64*, i64*, i32*, %struct.record*)* @findK, !"kernel", i32 1}
!4 = !{null, !"align", i32 8}
!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
!6 = !{null, !"align", i32 16}
!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
!9 = !{i32 1, i32 4}

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,475 @@
; ModuleID = 'kernel_gpu_cuda_wrapper_2-cuda-nvptx64-nvidia-cuda-sm_61.bc'
; Device-side LLVM IR emitted by clang for kernel/kernel_gpu_cuda_wrapper_2.cu,
; targeting 64-bit NVPTX.
source_filename = "kernel/kernel_gpu_cuda_wrapper_2.cu"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
; Placeholder types for the CUDA threadIdx / blockIdx builtin objects.
%struct.__cuda_builtin_threadIdx_t = type { i8 }
%struct.__cuda_builtin_blockIdx_t = type { i8 }
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
; Tree node layout. The kernel in this module addresses field 1 as "indices"
; and field 2 as "keys" (both 257 x i32).
%struct.knode = type { i32, [257 x i32], [257 x i32], i8, i32 }
; COMDAT groups for the two builtin-fetch helpers referenced by the kernel.
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
; extern_weak symbols for the builtin objects.
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
; Weak device-side definition of cudaMalloc emitted into this NVPTX module.
; It only spills both arguments to stack slots and then unconditionally
; returns the constant 999; nothing is allocated here.
; NOTE(review): 999 presumably is cudaErrorUnknown from the CUDA headers -- confirm.
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
entry:
%p.addr = alloca i8**, align 8
%s.addr = alloca i64, align 8
store i8** %p, i8*** %p.addr, align 8
store i64 %s, i64* %s.addr, align 8
ret i32 999
}
; Weak device-side definition of cudaFuncGetAttributes: stores both arguments
; to stack slots, never writes through %p, and always returns 999.
; NOTE(review): 999 presumably is cudaErrorUnknown from the CUDA headers -- confirm.
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
entry:
%p.addr = alloca %struct.cudaFuncAttributes*, align 8
%c.addr = alloca i8*, align 8
store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
store i8* %c, i8** %c.addr, align 8
ret i32 999
}
; Weak device-side definition of cudaDeviceGetAttribute: stores its three
; arguments to stack slots, never writes through %value, and always returns 999.
; NOTE(review): 999 presumably is cudaErrorUnknown from the CUDA headers -- confirm.
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
entry:
%value.addr = alloca i32*, align 8
%attr.addr = alloca i32, align 4
%device.addr = alloca i32, align 4
store i32* %value, i32** %value.addr, align 8
store i32 %attr, i32* %attr.addr, align 4
store i32 %device, i32* %device.addr, align 4
ret i32 999
}
; Weak device-side definition of cudaGetDevice: stores the pointer argument to
; a stack slot, never writes through it, and always returns 999.
; NOTE(review): 999 presumably is cudaErrorUnknown from the CUDA headers -- confirm.
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
entry:
%device.addr = alloca i32*, align 8
store i32* %device, i32** %device.addr, align 8
ret i32 999
}
; Weak device-side definition of cudaOccupancyMaxActiveBlocksPerMultiprocessor:
; stores its four arguments to stack slots, never writes through %numBlocks,
; and always returns 999.
; NOTE(review): 999 presumably is cudaErrorUnknown from the CUDA headers -- confirm.
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
entry:
%numBlocks.addr = alloca i32*, align 8
%func.addr = alloca i8*, align 8
%blockSize.addr = alloca i32, align 4
%dynamicSmemSize.addr = alloca i64, align 8
store i32* %numBlocks, i32** %numBlocks.addr, align 8
store i8* %func, i8** %func.addr, align 8
store i32 %blockSize, i32* %blockSize.addr, align 4
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
ret i32 999
}
; Weak device-side definition of
; cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags: stores its five
; arguments to stack slots, never writes through %numBlocks, and always returns 999.
; NOTE(review): 999 presumably is cudaErrorUnknown from the CUDA headers -- confirm.
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
entry:
%numBlocks.addr = alloca i32*, align 8
%func.addr = alloca i8*, align 8
%blockSize.addr = alloca i32, align 4
%dynamicSmemSize.addr = alloca i64, align 8
%flags.addr = alloca i32, align 4
store i32* %numBlocks, i32** %numBlocks.addr, align 8
store i8* %func, i8** %func.addr, align 8
store i32 %blockSize, i32* %blockSize.addr, align 4
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
store i32 %flags, i32* %flags.addr, align 4
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define dso_local void @findRangeK(i64 %height, %struct.knode* %knodesD, i64 %knodes_elem, i64* %currKnodeD, i64* %offsetD, i64* %lastKnodeD, i64* %offset_2D, i32* %startD, i32* %endD, i32* %RecstartD, i32* %ReclenD) #0 {
entry:
%height.addr = alloca i64, align 8
%knodesD.addr = alloca %struct.knode*, align 8
%knodes_elem.addr = alloca i64, align 8
%currKnodeD.addr = alloca i64*, align 8
%offsetD.addr = alloca i64*, align 8
%lastKnodeD.addr = alloca i64*, align 8
%offset_2D.addr = alloca i64*, align 8
%startD.addr = alloca i32*, align 8
%endD.addr = alloca i32*, align 8
%RecstartD.addr = alloca i32*, align 8
%ReclenD.addr = alloca i32*, align 8
%thid = alloca i32, align 4
%bid = alloca i32, align 4
%i = alloca i32, align 4
store i64 %height, i64* %height.addr, align 8
store %struct.knode* %knodesD, %struct.knode** %knodesD.addr, align 8
store i64 %knodes_elem, i64* %knodes_elem.addr, align 8
store i64* %currKnodeD, i64** %currKnodeD.addr, align 8
store i64* %offsetD, i64** %offsetD.addr, align 8
store i64* %lastKnodeD, i64** %lastKnodeD.addr, align 8
store i64* %offset_2D, i64** %offset_2D.addr, align 8
store i32* %startD, i32** %startD.addr, align 8
store i32* %endD, i32** %endD.addr, align 8
store i32* %RecstartD, i32** %RecstartD.addr, align 8
store i32* %ReclenD, i32** %ReclenD.addr, align 8
%call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
store i32 %call, i32* %thid, align 4
%call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
store i32 %call1, i32* %bid, align 4
store i32 0, i32* %i, align 4
br label %for.cond
for.cond: ; preds = %for.inc, %entry
%0 = load i32, i32* %i, align 4
%conv = sext i32 %0 to i64
%1 = load i64, i64* %height.addr, align 8
%cmp = icmp slt i64 %conv, %1
br i1 %cmp, label %for.body, label %for.end
for.body: ; preds = %for.cond
%2 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
%3 = load i64*, i64** %currKnodeD.addr, align 8
%4 = load i32, i32* %bid, align 4
%idxprom = sext i32 %4 to i64
%arrayidx = getelementptr inbounds i64, i64* %3, i64 %idxprom
%5 = load i64, i64* %arrayidx, align 8
%arrayidx2 = getelementptr inbounds %struct.knode, %struct.knode* %2, i64 %5
%keys = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx2, i32 0, i32 2
%6 = load i32, i32* %thid, align 4
%idxprom3 = sext i32 %6 to i64
%arrayidx4 = getelementptr inbounds [257 x i32], [257 x i32]* %keys, i64 0, i64 %idxprom3
%7 = load i32, i32* %arrayidx4, align 4
%8 = load i32*, i32** %startD.addr, align 8
%9 = load i32, i32* %bid, align 4
%idxprom5 = sext i32 %9 to i64
%arrayidx6 = getelementptr inbounds i32, i32* %8, i64 %idxprom5
%10 = load i32, i32* %arrayidx6, align 4
%cmp7 = icmp sle i32 %7, %10
br i1 %cmp7, label %land.lhs.true, label %if.end34
land.lhs.true: ; preds = %for.body
%11 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
%12 = load i64*, i64** %currKnodeD.addr, align 8
%13 = load i32, i32* %bid, align 4
%idxprom8 = sext i32 %13 to i64
%arrayidx9 = getelementptr inbounds i64, i64* %12, i64 %idxprom8
%14 = load i64, i64* %arrayidx9, align 8
%arrayidx10 = getelementptr inbounds %struct.knode, %struct.knode* %11, i64 %14
%keys11 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx10, i32 0, i32 2
%15 = load i32, i32* %thid, align 4
%add = add nsw i32 %15, 1
%idxprom12 = sext i32 %add to i64
%arrayidx13 = getelementptr inbounds [257 x i32], [257 x i32]* %keys11, i64 0, i64 %idxprom12
%16 = load i32, i32* %arrayidx13, align 4
%17 = load i32*, i32** %startD.addr, align 8
%18 = load i32, i32* %bid, align 4
%idxprom14 = sext i32 %18 to i64
%arrayidx15 = getelementptr inbounds i32, i32* %17, i64 %idxprom14
%19 = load i32, i32* %arrayidx15, align 4
%cmp16 = icmp sgt i32 %16, %19
br i1 %cmp16, label %if.then, label %if.end34
if.then: ; preds = %land.lhs.true
%20 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
%21 = load i64*, i64** %currKnodeD.addr, align 8
%22 = load i32, i32* %bid, align 4
%idxprom17 = sext i32 %22 to i64
%arrayidx18 = getelementptr inbounds i64, i64* %21, i64 %idxprom17
%23 = load i64, i64* %arrayidx18, align 8
%arrayidx19 = getelementptr inbounds %struct.knode, %struct.knode* %20, i64 %23
%indices = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx19, i32 0, i32 1
%24 = load i32, i32* %thid, align 4
%idxprom20 = sext i32 %24 to i64
%arrayidx21 = getelementptr inbounds [257 x i32], [257 x i32]* %indices, i64 0, i64 %idxprom20
%25 = load i32, i32* %arrayidx21, align 4
%conv22 = sext i32 %25 to i64
%26 = load i64, i64* %knodes_elem.addr, align 8
%cmp23 = icmp slt i64 %conv22, %26
br i1 %cmp23, label %if.then24, label %if.end
if.then24: ; preds = %if.then
%27 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
%28 = load i64*, i64** %currKnodeD.addr, align 8
%29 = load i32, i32* %bid, align 4
%idxprom25 = sext i32 %29 to i64
%arrayidx26 = getelementptr inbounds i64, i64* %28, i64 %idxprom25
%30 = load i64, i64* %arrayidx26, align 8
%arrayidx27 = getelementptr inbounds %struct.knode, %struct.knode* %27, i64 %30
%indices28 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx27, i32 0, i32 1
%31 = load i32, i32* %thid, align 4
%idxprom29 = sext i32 %31 to i64
%arrayidx30 = getelementptr inbounds [257 x i32], [257 x i32]* %indices28, i64 0, i64 %idxprom29
%32 = load i32, i32* %arrayidx30, align 4
%conv31 = sext i32 %32 to i64
%33 = load i64*, i64** %offsetD.addr, align 8
%34 = load i32, i32* %bid, align 4
%idxprom32 = sext i32 %34 to i64
%arrayidx33 = getelementptr inbounds i64, i64* %33, i64 %idxprom32
store i64 %conv31, i64* %arrayidx33, align 8
br label %if.end
if.end: ; preds = %if.then24, %if.then
br label %if.end34
if.end34: ; preds = %if.end, %land.lhs.true, %for.body
%35 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
%36 = load i64*, i64** %lastKnodeD.addr, align 8
%37 = load i32, i32* %bid, align 4
%idxprom35 = sext i32 %37 to i64
%arrayidx36 = getelementptr inbounds i64, i64* %36, i64 %idxprom35
%38 = load i64, i64* %arrayidx36, align 8
%arrayidx37 = getelementptr inbounds %struct.knode, %struct.knode* %35, i64 %38
%keys38 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx37, i32 0, i32 2
%39 = load i32, i32* %thid, align 4
%idxprom39 = sext i32 %39 to i64
%arrayidx40 = getelementptr inbounds [257 x i32], [257 x i32]* %keys38, i64 0, i64 %idxprom39
%40 = load i32, i32* %arrayidx40, align 4
%41 = load i32*, i32** %endD.addr, align 8
%42 = load i32, i32* %bid, align 4
%idxprom41 = sext i32 %42 to i64
%arrayidx42 = getelementptr inbounds i32, i32* %41, i64 %idxprom41
%43 = load i32, i32* %arrayidx42, align 4
%cmp43 = icmp sle i32 %40, %43
br i1 %cmp43, label %land.lhs.true44, label %if.end75
land.lhs.true44: ; preds = %if.end34
%44 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
%45 = load i64*, i64** %lastKnodeD.addr, align 8
%46 = load i32, i32* %bid, align 4
%idxprom45 = sext i32 %46 to i64
%arrayidx46 = getelementptr inbounds i64, i64* %45, i64 %idxprom45
%47 = load i64, i64* %arrayidx46, align 8
%arrayidx47 = getelementptr inbounds %struct.knode, %struct.knode* %44, i64 %47
%keys48 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx47, i32 0, i32 2
%48 = load i32, i32* %thid, align 4
%add49 = add nsw i32 %48, 1
%idxprom50 = sext i32 %add49 to i64
%arrayidx51 = getelementptr inbounds [257 x i32], [257 x i32]* %keys48, i64 0, i64 %idxprom50
%49 = load i32, i32* %arrayidx51, align 4
%50 = load i32*, i32** %endD.addr, align 8
%51 = load i32, i32* %bid, align 4
%idxprom52 = sext i32 %51 to i64
%arrayidx53 = getelementptr inbounds i32, i32* %50, i64 %idxprom52
%52 = load i32, i32* %arrayidx53, align 4
%cmp54 = icmp sgt i32 %49, %52
br i1 %cmp54, label %if.then55, label %if.end75
if.then55: ; preds = %land.lhs.true44
%53 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
%54 = load i64*, i64** %lastKnodeD.addr, align 8
%55 = load i32, i32* %bid, align 4
%idxprom56 = sext i32 %55 to i64
%arrayidx57 = getelementptr inbounds i64, i64* %54, i64 %idxprom56
%56 = load i64, i64* %arrayidx57, align 8
%arrayidx58 = getelementptr inbounds %struct.knode, %struct.knode* %53, i64 %56
%indices59 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx58, i32 0, i32 1
%57 = load i32, i32* %thid, align 4
%idxprom60 = sext i32 %57 to i64
%arrayidx61 = getelementptr inbounds [257 x i32], [257 x i32]* %indices59, i64 0, i64 %idxprom60
%58 = load i32, i32* %arrayidx61, align 4
%conv62 = sext i32 %58 to i64
%59 = load i64, i64* %knodes_elem.addr, align 8
%cmp63 = icmp slt i64 %conv62, %59
br i1 %cmp63, label %if.then64, label %if.end74
if.then64: ; preds = %if.then55
%60 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
%61 = load i64*, i64** %lastKnodeD.addr, align 8
%62 = load i32, i32* %bid, align 4
%idxprom65 = sext i32 %62 to i64
%arrayidx66 = getelementptr inbounds i64, i64* %61, i64 %idxprom65
%63 = load i64, i64* %arrayidx66, align 8
%arrayidx67 = getelementptr inbounds %struct.knode, %struct.knode* %60, i64 %63
%indices68 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx67, i32 0, i32 1
%64 = load i32, i32* %thid, align 4
%idxprom69 = sext i32 %64 to i64
%arrayidx70 = getelementptr inbounds [257 x i32], [257 x i32]* %indices68, i64 0, i64 %idxprom69
%65 = load i32, i32* %arrayidx70, align 4
%conv71 = sext i32 %65 to i64
%66 = load i64*, i64** %offset_2D.addr, align 8
%67 = load i32, i32* %bid, align 4
%idxprom72 = sext i32 %67 to i64
%arrayidx73 = getelementptr inbounds i64, i64* %66, i64 %idxprom72
store i64 %conv71, i64* %arrayidx73, align 8
br label %if.end74
if.end74: ; preds = %if.then64, %if.then55
br label %if.end75
if.end75: ; preds = %if.end74, %land.lhs.true44, %if.end34
call void @llvm.nvvm.barrier0()
%68 = load i32, i32* %thid, align 4
%cmp76 = icmp eq i32 %68, 0
br i1 %cmp76, label %if.then77, label %if.end86
if.then77: ; preds = %if.end75
%69 = load i64*, i64** %offsetD.addr, align 8
%70 = load i32, i32* %bid, align 4
%idxprom78 = sext i32 %70 to i64
%arrayidx79 = getelementptr inbounds i64, i64* %69, i64 %idxprom78
%71 = load i64, i64* %arrayidx79, align 8
%72 = load i64*, i64** %currKnodeD.addr, align 8
%73 = load i32, i32* %bid, align 4
%idxprom80 = sext i32 %73 to i64
%arrayidx81 = getelementptr inbounds i64, i64* %72, i64 %idxprom80
store i64 %71, i64* %arrayidx81, align 8
%74 = load i64*, i64** %offset_2D.addr, align 8
%75 = load i32, i32* %bid, align 4
%idxprom82 = sext i32 %75 to i64
%arrayidx83 = getelementptr inbounds i64, i64* %74, i64 %idxprom82
%76 = load i64, i64* %arrayidx83, align 8
%77 = load i64*, i64** %lastKnodeD.addr, align 8
%78 = load i32, i32* %bid, align 4
%idxprom84 = sext i32 %78 to i64
%arrayidx85 = getelementptr inbounds i64, i64* %77, i64 %idxprom84
store i64 %76, i64* %arrayidx85, align 8
br label %if.end86
if.end86: ; preds = %if.then77, %if.end75
call void @llvm.nvvm.barrier0()
br label %for.inc
for.inc: ; preds = %if.end86
%79 = load i32, i32* %i, align 4
%inc = add nsw i32 %79, 1
store i32 %inc, i32* %i, align 4
br label %for.cond
for.end: ; preds = %for.cond
%80 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
%81 = load i64*, i64** %currKnodeD.addr, align 8
%82 = load i32, i32* %bid, align 4
%idxprom87 = sext i32 %82 to i64
%arrayidx88 = getelementptr inbounds i64, i64* %81, i64 %idxprom87
%83 = load i64, i64* %arrayidx88, align 8
%arrayidx89 = getelementptr inbounds %struct.knode, %struct.knode* %80, i64 %83
%keys90 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx89, i32 0, i32 2
%84 = load i32, i32* %thid, align 4
%idxprom91 = sext i32 %84 to i64
%arrayidx92 = getelementptr inbounds [257 x i32], [257 x i32]* %keys90, i64 0, i64 %idxprom91
%85 = load i32, i32* %arrayidx92, align 4
%86 = load i32*, i32** %startD.addr, align 8
%87 = load i32, i32* %bid, align 4
%idxprom93 = sext i32 %87 to i64
%arrayidx94 = getelementptr inbounds i32, i32* %86, i64 %idxprom93
%88 = load i32, i32* %arrayidx94, align 4
%cmp95 = icmp eq i32 %85, %88
br i1 %cmp95, label %if.then96, label %if.end105
if.then96: ; preds = %for.end
%89 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
%90 = load i64*, i64** %currKnodeD.addr, align 8
%91 = load i32, i32* %bid, align 4
%idxprom97 = sext i32 %91 to i64
%arrayidx98 = getelementptr inbounds i64, i64* %90, i64 %idxprom97
%92 = load i64, i64* %arrayidx98, align 8
%arrayidx99 = getelementptr inbounds %struct.knode, %struct.knode* %89, i64 %92
%indices100 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx99, i32 0, i32 1
%93 = load i32, i32* %thid, align 4
%idxprom101 = sext i32 %93 to i64
%arrayidx102 = getelementptr inbounds [257 x i32], [257 x i32]* %indices100, i64 0, i64 %idxprom101
%94 = load i32, i32* %arrayidx102, align 4
%95 = load i32*, i32** %RecstartD.addr, align 8
%96 = load i32, i32* %bid, align 4
%idxprom103 = sext i32 %96 to i64
%arrayidx104 = getelementptr inbounds i32, i32* %95, i64 %idxprom103
store i32 %94, i32* %arrayidx104, align 4
br label %if.end105
if.end105: ; preds = %if.then96, %for.end
call void @llvm.nvvm.barrier0()
%97 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
%98 = load i64*, i64** %lastKnodeD.addr, align 8
%99 = load i32, i32* %bid, align 4
%idxprom106 = sext i32 %99 to i64
%arrayidx107 = getelementptr inbounds i64, i64* %98, i64 %idxprom106
%100 = load i64, i64* %arrayidx107, align 8
%arrayidx108 = getelementptr inbounds %struct.knode, %struct.knode* %97, i64 %100
%keys109 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx108, i32 0, i32 2
%101 = load i32, i32* %thid, align 4
%idxprom110 = sext i32 %101 to i64
%arrayidx111 = getelementptr inbounds [257 x i32], [257 x i32]* %keys109, i64 0, i64 %idxprom110
%102 = load i32, i32* %arrayidx111, align 4
%103 = load i32*, i32** %endD.addr, align 8
%104 = load i32, i32* %bid, align 4
%idxprom112 = sext i32 %104 to i64
%arrayidx113 = getelementptr inbounds i32, i32* %103, i64 %idxprom112
%105 = load i32, i32* %arrayidx113, align 4
%cmp114 = icmp eq i32 %102, %105
br i1 %cmp114, label %if.then115, label %if.end127
if.then115: ; preds = %if.end105
%106 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
%107 = load i64*, i64** %lastKnodeD.addr, align 8
%108 = load i32, i32* %bid, align 4
%idxprom116 = sext i32 %108 to i64
%arrayidx117 = getelementptr inbounds i64, i64* %107, i64 %idxprom116
%109 = load i64, i64* %arrayidx117, align 8
%arrayidx118 = getelementptr inbounds %struct.knode, %struct.knode* %106, i64 %109
%indices119 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx118, i32 0, i32 1
%110 = load i32, i32* %thid, align 4
%idxprom120 = sext i32 %110 to i64
%arrayidx121 = getelementptr inbounds [257 x i32], [257 x i32]* %indices119, i64 0, i64 %idxprom120
%111 = load i32, i32* %arrayidx121, align 4
%112 = load i32*, i32** %RecstartD.addr, align 8
%113 = load i32, i32* %bid, align 4
%idxprom122 = sext i32 %113 to i64
%arrayidx123 = getelementptr inbounds i32, i32* %112, i64 %idxprom122
%114 = load i32, i32* %arrayidx123, align 4
%sub = sub nsw i32 %111, %114
%add124 = add nsw i32 %sub, 1
%115 = load i32*, i32** %ReclenD.addr, align 8
%116 = load i32, i32* %bid, align 4
%idxprom125 = sext i32 %116 to i64
%arrayidx126 = getelementptr inbounds i32, i32* %115, i64 %idxprom125
store i32 %add124, i32* %arrayidx126, align 4
br label %if.end127
if.end127: ; preds = %if.then115, %if.end105
ret void
}
; Function Attrs: alwaysinline convergent nounwind
; Mangled name of __cuda_builtin_threadIdx_t::__fetch_builtin_x().
; Returns the i32 produced by the llvm.nvvm.read.ptx.sreg.tid.x intrinsic
; (the PTX tid.x special register) without modifying it.
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
ret i32 %0
}
; Function Attrs: alwaysinline convergent nounwind
; Mangled name of __cuda_builtin_blockIdx_t::__fetch_builtin_x().
; Returns the i32 produced by the llvm.nvvm.read.ptx.sreg.ctaid.x intrinsic
; (the PTX ctaid.x special register) without modifying it.
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
ret i32 %0
}
; Function Attrs: convergent nounwind
declare void @llvm.nvvm.barrier0() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #3
attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { convergent nounwind }
attributes #3 = { nounwind readnone }
!llvm.module.flags = !{!0, !1, !2}
!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
!llvm.ident = !{!8}
!nvvmir.version = !{!9}
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
!3 = !{void (i64, %struct.knode*, i64, i64*, i64*, i64*, i64*, i32*, i32*, i32*, i32*)* @findRangeK, !"kernel", i32 1}
!4 = !{null, !"align", i32 8}
!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
!6 = !{null, !"align", i32 16}
!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
!9 = !{i32 1, i32 4}

File diff suppressed because one or more lines are too long

2192
examples/btree/main.c Normal file

File diff suppressed because it is too large Load Diff

40
examples/btree/run.sh Executable file
View File

@ -0,0 +1,40 @@
#!/bin/bash
# Builds and runs the b+tree example, then checks its output.
# All paths are relative, so the script must be started from the directory
# that contains it (examples/btree).
# Abort on the first failing command.
set -e
# Step 1: compile the host-side C utilities to LLVM bitcode (timer.bc, num.bc).
clang -c -emit-llvm util/timer/timer.c
clang -c -emit-llvm util/num/num.c
# The CUDA front-end invocations below are disabled; the *.ll files assembled
# further down are therefore expected to already exist in this directory.
# NOTE(review): cuda.bc is consumed by llc below, but the only command that
# would produce it (util/cuda/cuda.cu) is commented out - confirm cuda.bc is
# provided some other way, otherwise `set -e` stops the script at that step.
#clang -c -emit-llvm util/cuda/cuda.cu --cuda-gpu-arch=sm_61
#clang -c -emit-llvm kernel/kernel_gpu_cuda_wrapper.cu --cuda-gpu-arch=sm_61
#clang++ kernel/kernel_gpu_cuda_wrapper.cu --cuda-path=/usr/local/cuda-10.1 --cuda-gpu-arch=sm_61 -L/usr/local/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v
#clang++ kernel/kernel_gpu_cuda_wrapper_2.cu --cuda-path=/usr/local/cuda-10.1 --cuda-gpu-arch=sm_61 -L/usr/local/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v
clang -c -emit-llvm main.c
# Step 2: assemble the textual device/host IR of both kernel wrappers to bitcode.
llvm-as kernel_gpu_cuda_wrapper-cuda-nvptx64-nvidia-cuda-sm_61.ll
llvm-as kernel_gpu_cuda_wrapper_2-cuda-nvptx64-nvidia-cuda-sm_61.ll
llvm-as kernel_gpu_cuda_wrapper-host-x86_64-unknown-linux-gnu.ll
llvm-as kernel_gpu_cuda_wrapper_2-host-x86_64-unknown-linux-gnu.ll
# Step 3: run the project's translators on the device bitcode (-> kernelN.bc)
# and on the host bitcode (-> hostN.bc).
../../build/compilation/kernelTranslator kernel_gpu_cuda_wrapper-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel1.bc
../../build/compilation/kernelTranslator kernel_gpu_cuda_wrapper_2-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel2.bc
../../build/compilation/hostTranslator kernel_gpu_cuda_wrapper-host-x86_64-unknown-linux-gnu.bc host1.bc
../../build/compilation/hostTranslator kernel_gpu_cuda_wrapper_2-host-x86_64-unknown-linux-gnu.bc host2.bc
# Step 4: lower every bitcode file to a position-independent object file.
llc --relocation-model=pic --filetype=obj main.bc
llc --relocation-model=pic --filetype=obj cuda.bc
llc --relocation-model=pic --filetype=obj num.bc
llc --relocation-model=pic --filetype=obj timer.bc
llc --relocation-model=pic --filetype=obj kernel1.bc
llc --relocation-model=pic --filetype=obj kernel2.bc
llc --relocation-model=pic --filetype=obj host1.bc
llc --relocation-model=pic --filetype=obj host2.bc
# Step 5: link against the runtime libraries built under ../../build/runtime;
# LD_LIBRARY_PATH lets the resulting binary find them when it is executed.
export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH
g++ -Wall -L../../build/runtime -L../../build/runtime/threadPool -o b+tree.out \
-fPIC -no-pie main.o host1.o host2.o kernel1.o kernel2.o cuda.o num.o timer.o \
-lc -lx86Runtime -lthreadPool -lpthread
# Step 6: run the benchmark on the Rodinia data set.
./b+tree.out file ../../rodinia-data/b+tree/mil.txt \
command ../../rodinia-data/b+tree/command.txt
# Step 7: pass only if output.txt contains the expected result line.
# NOTE(review): output.txt is presumably written by b+tree.out - confirm, since
# a stale file from an earlier run would also satisfy this check.
if grep -q "0 840187 6001" output.txt; then
echo "Pass"
else
echo "Error result"
exit 1
fi

View File

@ -0,0 +1,75 @@
#ifdef __cplusplus
extern "C" {
#endif
//===============================================================================================================================================================================================================200
// SET_DEVICE CODE
//===============================================================================================================================================================================================================200
//======================================================================================================================================================150
// INCLUDE/DEFINE
//======================================================================================================================================================150
#include "cuda.h" // (in library path specified to compiler)
//======================================================================================================================================================150
// FUNCTIONS
//======================================================================================================================================================150
//====================================================================================================100
// SET DEVICE
//====================================================================================================100
// Selects the CUDA device to run on.
//
// Queries the number of devices. When more than one is present, the device
// reporting the largest multiProcessorCount is made current with
// cudaSetDevice (the lowest index wins ties; device 0 is used if no query
// succeeds). With zero or one device the current device is left unchanged.
// Failed queries are skipped so that no uninitialized value is ever read.
void setdevice(void){

	// variables
	int num_devices = 0;	// stays 0 if the count query fails
	int device;

	// work
	if (cudaGetDeviceCount(&num_devices) != cudaSuccess) {
		// the count is unreliable, so keep whatever device is current
		return;
	}

	if (num_devices > 1) {

		// variables
		int max_multiprocessors = 0;
		int max_device = 0;
		cudaDeviceProp properties;

		// keep the device with the highest multiprocessor count
		for (device = 0; device < num_devices; device++) {
			if (cudaGetDeviceProperties(&properties, device) != cudaSuccess) {
				// properties was not filled in for this device; ignore it
				continue;
			}
			if (max_multiprocessors < properties.multiProcessorCount) {
				max_multiprocessors = properties.multiProcessorCount;
				max_device = device;
			}
		}
		cudaSetDevice(max_device);

	}

}
//====================================================================================================100
// GET LAST ERROR
//====================================================================================================100
// Terminates the program if the CUDA runtime has a pending error.
//
// msg: caller-supplied context that is printed in front of the CUDA error
//      string.
// When cudaGetLastError() reports anything other than cudaSuccess, the
// message is written to stdout (not stderr), all open output streams are
// flushed and the process exits with EXIT_FAILURE. Otherwise it returns
// without side effects.
void checkCUDAError(const char *msg)
{
	const cudaError_t status = cudaGetLastError();

	// nothing pending: carry on
	if (status == cudaSuccess) {
		return;
	}

	printf("Cuda error: %s: %s.\n", msg, cudaGetErrorString(status));
	fflush(NULL);
	exit(EXIT_FAILURE);
}
//===============================================================================================================================================================================================================200
// END
//===============================================================================================================================================================================================================200
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,37 @@
#ifdef __cplusplus
extern "C" {
#endif
//===============================================================================================================================================================================================================200
// SET_DEVICE HEADER
//===============================================================================================================================================================================================================200
//======================================================================================================================================================150
// INCLUDE/DEFINE
//======================================================================================================================================================150
#include <stdio.h> // (in library path known to compiler) needed by printf
//======================================================================================================================================================150
// FUNCTION PROTOTYPES
//======================================================================================================================================================150
//====================================================================================================100
// SET DEVICE
//====================================================================================================100
// Makes the CUDA device with the most multiprocessors current when more than
// one device is present; does nothing otherwise.
void setdevice(void);
//====================================================================================================100
// GET LAST ERROR
//====================================================================================================100
// If the CUDA runtime reports a pending error, prints "Cuda error: <msg>:
// <error string>." and exits with EXIT_FAILURE; otherwise returns normally.
void checkCUDAError(const char *msg);
//===============================================================================================================================================================================================================200
// END SET_DEVICE HEADER
//===============================================================================================================================================================================================================200
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,55 @@
#ifdef __cplusplus
extern "C" {
#endif
//===============================================================================================================================================================================================================200
// DESCRIPTION
//===============================================================================================================================================================================================================200
// Returns: 0 if string does not represent integer
// 1 if string represents integer
//===============================================================================================================================================================================================================200
// NUM CODE
//===============================================================================================================================================================================================================200
//======================================================================================================================================================150
// ISINTEGER FUNCTION
//======================================================================================================================================================150
// Tests whether a string consists solely of decimal digits.
//
// str: NUL-terminated string to examine; a null pointer is treated as
//      "not an integer".
// Returns 1 when str is non-empty and every character lies in '0'..'9',
// otherwise 0. Signs, whitespace and decimal points are all rejected.
int isInteger(char *str) {

	//====================================================================================================100
	//	make sure it's neither missing nor empty
	//====================================================================================================100

	if (!str || *str == '\0') {
		return 0;
	}

	//====================================================================================================100
	//	if any character is not a digit, return false
	//====================================================================================================100

	for (; *str != '\0'; str++) {
		// digit characters only (would need to allow '.' to accept floats)
		if (*str < '0' || *str > '9') {
			return 0;
		}
	}

	//====================================================================================================100
	//	every character passed the check
	//====================================================================================================100

	return 1;
}
//===============================================================================================================================================================================================================200
// END NUM CODE
//===============================================================================================================================================================================================================200
#ifdef __cplusplus
}
#endif

21
examples/btree/util/num/num.h Executable file
View File

@ -0,0 +1,21 @@
#ifdef __cplusplus
extern "C" {
#endif
//===============================================================================================================================================================================================================200
// FILE HEADER
//===============================================================================================================================================================================================================200
//======================================================================================================================================================150
// ISINTEGER FUNCTION PROTOTYPE
//======================================================================================================================================================150
// Returns 1 if str is non-empty and contains only the characters '0'..'9',
// otherwise 0.
int isInteger(char *str);
//===============================================================================================================================================================================================================200
// END FILE HEADER
//===============================================================================================================================================================================================================200
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,36 @@
#ifdef __cplusplus
extern "C" {
#endif
//===============================================================================================================================================================================================================200
// TIMER CODE
//===============================================================================================================================================================================================================200
//======================================================================================================================================================150
// INCLUDE/DEFINE
//======================================================================================================================================================150
#include <stdlib.h>
#include <sys/time.h> // struct timeval, gettimeofday
//======================================================================================================================================================150
// FUNCTIONS
//======================================================================================================================================================150
//====================================================================================================100
// DISPLAY TIME
//====================================================================================================100
// Returns the current system time in microseconds, as reported by
// gettimeofday(): seconds * 1,000,000 + microseconds.
long long get_time() {
	struct timeval tv;
	gettimeofday(&tv, NULL);
	// Widen before multiplying: where tv_sec is narrower than 64 bits the
	// product would otherwise overflow before being converted to long long.
	return ((long long)tv.tv_sec * 1000000LL) + (long long)tv.tv_usec;
}
//===============================================================================================================================================================================================================200
// END TIMER CODE
//===============================================================================================================================================================================================================200
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,21 @@
#ifdef __cplusplus
extern "C" {
#endif
//===============================================================================================================================================================================================================200
// TIMER HEADER
//===============================================================================================================================================================================================================200
//======================================================================================================================================================150
// FUNCTION PROTOTYPES
//======================================================================================================================================================150
// Returns the current system time in microseconds.
long long get_time();
//===============================================================================================================================================================================================================200
// END TIMER HEADER
//===============================================================================================================================================================================================================200
#ifdef __cplusplus
}
#endif

662
examples/cfd/euler3d.cu Executable file
View File

@ -0,0 +1,662 @@
#include <fstream>
#include <helper_cuda.h>
#include <helper_timer.h>
#include <iostream>
/*
* Options
*
*/
#define GAMMA 1.4f
#define iterations 2
// #ifndef block_length
// #define block_length 192
// #endif
#define NDIM 3
#define NNB 4
#define RK 3 // 3rd order RK
#define ff_mach 1.2f
#define deg_angle_of_attack 0.0f
/*
* not options
*/
#ifdef RD_WG_SIZE_0_0
#define BLOCK_SIZE_0 RD_WG_SIZE_0_0
#elif defined(RD_WG_SIZE_0)
#define BLOCK_SIZE_0 RD_WG_SIZE_0
#elif defined(RD_WG_SIZE)
#define BLOCK_SIZE_0 RD_WG_SIZE
#else
#define BLOCK_SIZE_0 192
#endif
#ifdef RD_WG_SIZE_1_0
#define BLOCK_SIZE_1 RD_WG_SIZE_1_0
#elif defined(RD_WG_SIZE_1)
#define BLOCK_SIZE_1 RD_WG_SIZE_1
#elif defined(RD_WG_SIZE)
#define BLOCK_SIZE_1 RD_WG_SIZE
#else
#define BLOCK_SIZE_1 192
#endif
// Threads per block used by compute_step_factor's kernel launch.
// Resolution order: RD_WG_SIZE_2_0, then RD_WG_SIZE_2, then RD_WG_SIZE,
// falling back to 192 when none of them is defined.
#ifdef RD_WG_SIZE_2_0
#define BLOCK_SIZE_2 RD_WG_SIZE_2_0
#elif defined(RD_WG_SIZE_2)
#define BLOCK_SIZE_2 RD_WG_SIZE_2
#elif defined(RD_WG_SIZE)
#define BLOCK_SIZE_2 RD_WG_SIZE
#else
#define BLOCK_SIZE_2 192
#endif
#ifdef RD_WG_SIZE_3_0
#define BLOCK_SIZE_3 RD_WG_SIZE_3_0
#elif defined(RD_WG_SIZE_3)
#define BLOCK_SIZE_3 RD_WG_SIZE_3
#elif defined(RD_WG_SIZE)
#define BLOCK_SIZE_3 RD_WG_SIZE
#else
#define BLOCK_SIZE_3 192
#endif
#ifdef RD_WG_SIZE_4_0
#define BLOCK_SIZE_4 RD_WG_SIZE_4_0
#elif defined(RD_WG_SIZE_4)
#define BLOCK_SIZE_4 RD_WG_SIZE_4
#elif defined(RD_WG_SIZE)
#define BLOCK_SIZE_4 RD_WG_SIZE
#else
#define BLOCK_SIZE_4 192
#endif
// #if block_length > 128
// #warning "the kernels may fail to launch on some systems if the block length
// is too large" #endif
#define VAR_DENSITY 0
#define VAR_MOMENTUM 1
#define VAR_DENSITY_ENERGY (VAR_MOMENTUM + NDIM)
#define NVAR (VAR_DENSITY_ENERGY + 1)
/*
* Generic functions
*/
// Requests sizeof(T) * N bytes from cudaMalloc and returns the resulting
// pointer. The call is wrapped in checkCudaErrors, which is supplied by
// helper_cuda.h (not shown here) - presumably it reports the failure and
// aborts; confirm against that header.
template <typename T> T *alloc(int N) {
T *t;
checkCudaErrors(cudaMalloc((void **)&t, sizeof(T) * N));
return t;
}
// Releases `array` with cudaFree; the result is passed to checkCudaErrors.
template <typename T> void dealloc(T *array) {
checkCudaErrors(cudaFree((void *)array));
}
// Copies N elements of type T from src to dst with cudaMemcpy using
// cudaMemcpyDeviceToDevice, so both pointers are expected to be device
// pointers. The result is passed to checkCudaErrors.
template <typename T> void copy(T *dst, T *src, int N) {
checkCudaErrors(cudaMemcpy((void *)dst, (void *)src, N * sizeof(T),
cudaMemcpyDeviceToDevice));
}
// Copies N elements of type T from src to dst with cudaMemcpy using
// cudaMemcpyHostToDevice (src on the host, dst on the device). The result is
// passed to checkCudaErrors.
template <typename T> void upload(T *dst, T *src, int N) {
checkCudaErrors(cudaMemcpy((void *)dst, (void *)src, N * sizeof(T),
cudaMemcpyHostToDevice));
}
// Copies N elements of type T from src to dst with cudaMemcpy using
// cudaMemcpyDeviceToHost (src on the device, dst on the host). The result is
// passed to checkCudaErrors.
template <typename T> void download(T *dst, T *src, int N) {
checkCudaErrors(cudaMemcpy((void *)dst, (void *)src, N * sizeof(T),
cudaMemcpyDeviceToHost));
}
// Writes the solver state to three text files in the current directory.
//
// variables: device array holding NVAR quantities, each stored as a run of
//            nelr floats (element e of quantity k is at e + k * nelr).
// nel:       number of elements written per quantity.
// nelr:      stride between consecutive quantities in `variables`.
//
// All nelr * NVAR floats are first copied to the host with download(). The
// files "density", "momentum" and "density_energy" each start with a
// "nel nelr" header line followed by one line per element; a momentum line
// holds NDIM values, each followed by a single space.
void dump(float *variables, int nel, int nelr) {
  const int total = nelr * NVAR;
  float *host = new float[total];
  download(host, variables, total);

  {
    std::ofstream out("density");
    out << nel << " " << nelr << std::endl;
    const float *density = host + VAR_DENSITY * nelr;
    for (int e = 0; e < nel; ++e) {
      out << density[e] << std::endl;
    }
  }

  {
    std::ofstream out("momentum");
    out << nel << " " << nelr << std::endl;
    for (int e = 0; e < nel; ++e) {
      for (int d = 0; d < NDIM; ++d) {
        out << host[e + (VAR_MOMENTUM + d) * nelr] << " ";
      }
      out << std::endl;
    }
  }

  {
    std::ofstream out("density_energy");
    out << nel << " " << nelr << std::endl;
    const float *energy = host + VAR_DENSITY_ENERGY * nelr;
    for (int e = 0; e < nel; ++e) {
      out << energy[e] << std::endl;
    }
  }

  delete[] host;
}
/*
* Element-based Cell-centered FVM solver functions
*/
__constant__ float ff_variable[NVAR];
__constant__ float3 ff_flux_contribution_momentum_x[1];
__constant__ float3 ff_flux_contribution_momentum_y[1];
__constant__ float3 ff_flux_contribution_momentum_z[1];
__constant__ float3 ff_flux_contribution_density_energy[1];
// Kernel: sets every quantity of one element to the matching ff_variable
// entry. Each thread handles the element whose index is its global x index
// and writes NVAR values, one per quantity, at a stride of nelr.
// There is no bounds guard on the element index; the launch in
// initialize_variables uses nelr / BLOCK_SIZE_1 blocks, so no thread index
// reaches nelr.
__global__ void cuda_initialize_variables(int nelr, float *variables) {
  const int elem = (blockDim.x * blockIdx.x + threadIdx.x);
  for (int var = 0; var != NVAR; ++var) {
    variables[elem + var * nelr] = ff_variable[var];
  }
}
// Host wrapper: launches cuda_initialize_variables with nelr / BLOCK_SIZE_1
// blocks of BLOCK_SIZE_1 threads (integer division, so a remainder of nelr is
// not covered), then checks the launch with getLastCudaError.
void initialize_variables(int nelr, float *variables) {
  const dim3 grid(nelr / BLOCK_SIZE_1);
  const dim3 block(BLOCK_SIZE_1);
  cuda_initialize_variables<<<grid, block>>>(nelr, variables);
  getLastCudaError("initialize_variables failed");
}
// Fills the four flux-contribution vectors from the given state.
//
// Inputs read:  momentum, density_energy, pressure, velocity.
// Outputs set:  fc_momentum_x, fc_momentum_y, fc_momentum_z,
//               fc_density_energy (all three components of each).
// `density` is accepted but not used in the body.
__device__ __host__ inline void compute_flux_contribution(
float &density, float3 &momentum, float &density_energy, float &pressure,
float3 &velocity, float3 &fc_momentum_x, float3 &fc_momentum_y,
float3 &fc_momentum_z, float3 &fc_density_energy) {
// x row: velocity.x * momentum, with pressure added on the diagonal
fc_momentum_x.x = velocity.x * momentum.x + pressure;
fc_momentum_x.y = velocity.x * momentum.y;
fc_momentum_x.z = velocity.x * momentum.z;
// y row: the off-diagonal entry is copied from the x row rather than recomputed
fc_momentum_y.x = fc_momentum_x.y;
fc_momentum_y.y = velocity.y * momentum.y + pressure;
fc_momentum_y.z = velocity.y * momentum.z;
// z row: both off-diagonal entries are copied from the rows above
fc_momentum_z.x = fc_momentum_x.z;
fc_momentum_z.y = fc_momentum_y.z;
fc_momentum_z.z = velocity.z * momentum.z + pressure;
// energy flux: velocity scaled by (density_energy + pressure)
float de_p = density_energy + pressure;
fc_density_energy.x = velocity.x * de_p;
fc_density_energy.y = velocity.y * de_p;
fc_density_energy.z = velocity.z * de_p;
}
// Sets each component of `velocity` to the matching component of `momentum`
// divided by `density`. No check is made for a zero density.
__device__ inline void compute_velocity(float &density, float3 &momentum,
float3 &velocity) {
velocity.x = momentum.x / density;
velocity.y = momentum.y / density;
velocity.z = momentum.z / density;
}
// Returns the squared magnitude of `velocity` (x*x + y*y + z*z).
__device__ inline float compute_speed_sqd(float3 &velocity) {
return velocity.x * velocity.x + velocity.y * velocity.y +
velocity.z * velocity.z;
}
// Returns (GAMMA - 1) * (density_energy - 0.5 * density * speed_sqd), where
// speed_sqd is the squared speed as produced by compute_speed_sqd.
__device__ inline float compute_pressure(float &density, float &density_energy,
float &speed_sqd) {
return (float(GAMMA) - float(1.0f)) *
(density_energy - float(0.5f) * density * speed_sqd);
}
// Returns sqrtf(GAMMA * pressure / density). No check is made for a zero
// density or a negative pressure.
__device__ inline float compute_speed_of_sound(float &density,
float &pressure) {
return sqrtf(float(GAMMA) * pressure / density);
}
// Kernel: computes the step factor of one element.
//
// nelr:         stride between consecutive quantities in `variables`.
// variables:    state array; quantity k of element i is at i + k * nelr.
// areas:        per-element value whose square root enters the denominator.
// step_factors: output, one value per element.
//
// Each thread handles the element whose index is its global x index. There is
// no bounds guard on that index; the launch in compute_step_factor uses
// nelr / BLOCK_SIZE_2 blocks, so no thread index reaches nelr.
__global__ void cuda_compute_step_factor(int nelr, float *variables,
float *areas, float *step_factors) {
const int i = (blockDim.x * blockIdx.x + threadIdx.x);
// load this element's density, momentum vector and density energy
float density = variables[i + VAR_DENSITY * nelr];
float3 momentum;
momentum.x = variables[i + (VAR_MOMENTUM + 0) * nelr];
momentum.y = variables[i + (VAR_MOMENTUM + 1) * nelr];
momentum.z = variables[i + (VAR_MOMENTUM + 2) * nelr];
float density_energy = variables[i + VAR_DENSITY_ENERGY * nelr];
// derive velocity, squared speed, pressure and speed of sound from them
float3 velocity;
compute_velocity(density, momentum, velocity);
float speed_sqd = compute_speed_sqd(velocity);
float pressure = compute_pressure(density, density_energy, speed_sqd);
float speed_of_sound = compute_speed_of_sound(density, pressure);
// dt = float(0.5f) * sqrtf(areas[i]) / (||v|| + c).... but when we do time
// stepping, this later would need to be divided by the area, so we just do it
// all at once
step_factors[i] =
float(0.5f) / (sqrtf(areas[i]) * (sqrtf(speed_sqd) + speed_of_sound));
}
void compute_step_factor(int nelr, float *variables, float *areas,
float *step_factors) {
dim3 Dg(nelr / BLOCK_SIZE_2), Db(BLOCK_SIZE_2);
cuda_compute_step_factor<<<Dg, Db>>>(nelr, variables, areas, step_factors);
getLastCudaError("compute_step_factor failed");
}
/*
*
*
*/
// Kernel: one thread per element i. Accumulates the flux of every variable
// over the NNB faces of element i and stores the result in `fluxes`.
//
// Data layout visible from the indexing below:
//   variables / fluxes : value of variable v for element e at [e + v * nelr]
//   elements_surrounding_elements : neighbor j of element e at [e + j * nelr]
//   normals : component k of face j of element e at [e + (j + k * NNB) * nelr]
//
// Neighbor index meaning (see the branches in the loop):
//   nb >= 0  : regular neighbor element
//   nb == -1 : wing boundary
//   nb == -2 : far field boundary (uses the ff_* device symbols)
//   any other negative value contributes nothing.
//
// There is no bounds check on i, so the launch must not create more threads
// than there are elements in the arrays.
__global__ void cuda_compute_flux(int nelr, int *elements_surrounding_elements,
float *normals, float *variables,
float *fluxes) {
const float smoothing_coefficient = float(0.2f);
const int i = (blockDim.x * blockIdx.x + threadIdx.x);
int j, nb;
float3 normal;
float normal_len;
float factor;
// State of element i and the quantities derived from it.
float density_i = variables[i + VAR_DENSITY * nelr];
float3 momentum_i;
momentum_i.x = variables[i + (VAR_MOMENTUM + 0) * nelr];
momentum_i.y = variables[i + (VAR_MOMENTUM + 1) * nelr];
momentum_i.z = variables[i + (VAR_MOMENTUM + 2) * nelr];
float density_energy_i = variables[i + VAR_DENSITY_ENERGY * nelr];
float3 velocity_i;
compute_velocity(density_i, momentum_i, velocity_i);
float speed_sqd_i = compute_speed_sqd(velocity_i);
float speed_i = sqrtf(speed_sqd_i);
float pressure_i = compute_pressure(density_i, density_energy_i, speed_sqd_i);
float speed_of_sound_i = compute_speed_of_sound(density_i, pressure_i);
float3 flux_contribution_i_momentum_x, flux_contribution_i_momentum_y,
flux_contribution_i_momentum_z;
float3 flux_contribution_i_density_energy;
compute_flux_contribution(
density_i, momentum_i, density_energy_i, pressure_i, velocity_i,
flux_contribution_i_momentum_x, flux_contribution_i_momentum_y,
flux_contribution_i_momentum_z, flux_contribution_i_density_energy);
// Flux accumulators for element i, all starting at zero.
float flux_i_density = float(0.0f);
float3 flux_i_momentum;
flux_i_momentum.x = float(0.0f);
flux_i_momentum.y = float(0.0f);
flux_i_momentum.z = float(0.0f);
float flux_i_density_energy = float(0.0f);
// Scratch state for the neighbor currently being processed.
float3 velocity_nb;
float density_nb, density_energy_nb;
float3 momentum_nb;
float3 flux_contribution_nb_momentum_x, flux_contribution_nb_momentum_y,
flux_contribution_nb_momentum_z;
float3 flux_contribution_nb_density_energy;
float speed_sqd_nb, speed_of_sound_nb, pressure_nb;
#pragma unroll
for (j = 0; j < NNB; j++) {
nb = elements_surrounding_elements[i + j * nelr];
normal.x = normals[i + (j + 0 * NNB) * nelr];
normal.y = normals[i + (j + 1 * NNB) * nelr];
normal.z = normals[i + (j + 2 * NNB) * nelr];
normal_len =
sqrtf(normal.x * normal.x + normal.y * normal.y + normal.z * normal.z);
if (nb >= 0) // a legitimate neighbor
{
// Load the neighbor's state and derive the same quantities as for i.
density_nb = variables[nb + VAR_DENSITY * nelr];
momentum_nb.x = variables[nb + (VAR_MOMENTUM + 0) * nelr];
momentum_nb.y = variables[nb + (VAR_MOMENTUM + 1) * nelr];
momentum_nb.z = variables[nb + (VAR_MOMENTUM + 2) * nelr];
density_energy_nb = variables[nb + VAR_DENSITY_ENERGY * nelr];
compute_velocity(density_nb, momentum_nb, velocity_nb);
speed_sqd_nb = compute_speed_sqd(velocity_nb);
pressure_nb =
compute_pressure(density_nb, density_energy_nb, speed_sqd_nb);
speed_of_sound_nb = compute_speed_of_sound(density_nb, pressure_nb);
compute_flux_contribution(
density_nb, momentum_nb, density_energy_nb, pressure_nb, velocity_nb,
flux_contribution_nb_momentum_x, flux_contribution_nb_momentum_y,
flux_contribution_nb_momentum_z, flux_contribution_nb_density_energy);
// artificial viscosity
// (proportional to the face normal length and to the difference between
// the state of i and the state of the neighbor)
factor = -normal_len * smoothing_coefficient * float(0.5f) *
(speed_i + sqrtf(speed_sqd_nb) + speed_of_sound_i +
speed_of_sound_nb);
flux_i_density += factor * (density_i - density_nb);
flux_i_density_energy += factor * (density_energy_i - density_energy_nb);
flux_i_momentum.x += factor * (momentum_i.x - momentum_nb.x);
flux_i_momentum.y += factor * (momentum_i.y - momentum_nb.y);
flux_i_momentum.z += factor * (momentum_i.z - momentum_nb.z);
// accumulate cell-centered fluxes
// (0.5 * normal component times the sum of both cells' contributions,
// once per normal component x, y, z)
factor = float(0.5f) * normal.x;
flux_i_density += factor * (momentum_nb.x + momentum_i.x);
flux_i_density_energy += factor * (flux_contribution_nb_density_energy.x +
flux_contribution_i_density_energy.x);
flux_i_momentum.x += factor * (flux_contribution_nb_momentum_x.x +
flux_contribution_i_momentum_x.x);
flux_i_momentum.y += factor * (flux_contribution_nb_momentum_y.x +
flux_contribution_i_momentum_y.x);
flux_i_momentum.z += factor * (flux_contribution_nb_momentum_z.x +
flux_contribution_i_momentum_z.x);
factor = float(0.5f) * normal.y;
flux_i_density += factor * (momentum_nb.y + momentum_i.y);
flux_i_density_energy += factor * (flux_contribution_nb_density_energy.y +
flux_contribution_i_density_energy.y);
flux_i_momentum.x += factor * (flux_contribution_nb_momentum_x.y +
flux_contribution_i_momentum_x.y);
flux_i_momentum.y += factor * (flux_contribution_nb_momentum_y.y +
flux_contribution_i_momentum_y.y);
flux_i_momentum.z += factor * (flux_contribution_nb_momentum_z.y +
flux_contribution_i_momentum_z.y);
factor = float(0.5f) * normal.z;
flux_i_density += factor * (momentum_nb.z + momentum_i.z);
flux_i_density_energy += factor * (flux_contribution_nb_density_energy.z +
flux_contribution_i_density_energy.z);
flux_i_momentum.x += factor * (flux_contribution_nb_momentum_x.z +
flux_contribution_i_momentum_x.z);
flux_i_momentum.y += factor * (flux_contribution_nb_momentum_y.z +
flux_contribution_i_momentum_y.z);
flux_i_momentum.z += factor * (flux_contribution_nb_momentum_z.z +
flux_contribution_i_momentum_z.z);
} else if (nb == -1) // a wing boundary
{
// Only the pressure of element i acts on the momentum flux here.
flux_i_momentum.x += normal.x * pressure_i;
flux_i_momentum.y += normal.y * pressure_i;
flux_i_momentum.z += normal.z * pressure_i;
} else if (nb == -2) // a far field boundary
{
// Same averaging as for a regular neighbor, but the "neighbor" state is
// the far field state held in the ff_* device symbols; no artificial
// viscosity term is added in this branch.
factor = float(0.5f) * normal.x;
flux_i_density += factor * (ff_variable[VAR_MOMENTUM + 0] + momentum_i.x);
flux_i_density_energy +=
factor * (ff_flux_contribution_density_energy[0].x +
flux_contribution_i_density_energy.x);
flux_i_momentum.x += factor * (ff_flux_contribution_momentum_x[0].x +
flux_contribution_i_momentum_x.x);
flux_i_momentum.y += factor * (ff_flux_contribution_momentum_y[0].x +
flux_contribution_i_momentum_y.x);
flux_i_momentum.z += factor * (ff_flux_contribution_momentum_z[0].x +
flux_contribution_i_momentum_z.x);
factor = float(0.5f) * normal.y;
flux_i_density += factor * (ff_variable[VAR_MOMENTUM + 1] + momentum_i.y);
flux_i_density_energy +=
factor * (ff_flux_contribution_density_energy[0].y +
flux_contribution_i_density_energy.y);
flux_i_momentum.x += factor * (ff_flux_contribution_momentum_x[0].y +
flux_contribution_i_momentum_x.y);
flux_i_momentum.y += factor * (ff_flux_contribution_momentum_y[0].y +
flux_contribution_i_momentum_y.y);
flux_i_momentum.z += factor * (ff_flux_contribution_momentum_z[0].y +
flux_contribution_i_momentum_z.y);
factor = float(0.5f) * normal.z;
flux_i_density += factor * (ff_variable[VAR_MOMENTUM + 2] + momentum_i.z);
flux_i_density_energy +=
factor * (ff_flux_contribution_density_energy[0].z +
flux_contribution_i_density_energy.z);
flux_i_momentum.x += factor * (ff_flux_contribution_momentum_x[0].z +
flux_contribution_i_momentum_x.z);
flux_i_momentum.y += factor * (ff_flux_contribution_momentum_y[0].z +
flux_contribution_i_momentum_y.z);
flux_i_momentum.z += factor * (ff_flux_contribution_momentum_z[0].z +
flux_contribution_i_momentum_z.z);
}
}
// Store the accumulated fluxes of element i.
fluxes[i + VAR_DENSITY * nelr] = flux_i_density;
fluxes[i + (VAR_MOMENTUM + 0) * nelr] = flux_i_momentum.x;
fluxes[i + (VAR_MOMENTUM + 1) * nelr] = flux_i_momentum.y;
fluxes[i + (VAR_MOMENTUM + 2) * nelr] = flux_i_momentum.z;
fluxes[i + VAR_DENSITY_ENERGY * nelr] = flux_i_density_energy;
}
void compute_flux(int nelr, int *elements_surrounding_elements, float *normals,
float *variables, float *fluxes) {
dim3 Dg(nelr / BLOCK_SIZE_3), Db(BLOCK_SIZE_3);
cuda_compute_flux<<<Dg, Db>>>(nelr, elements_surrounding_elements, normals,
variables, fluxes);
getLastCudaError("compute_flux failed");
}
// Kernel: one thread per element. For every variable of the element computes
//   variables = old_variables + factor * fluxes
// with factor = step_factors[element] / (RK + 1 - j), where j is the stage
// index passed by the caller.
//
// There is no bounds check on the element index, so the launch must not
// create more threads than there are elements in the arrays.
__global__ void cuda_time_step(int j, int nelr, float *old_variables,
                               float *variables, float *step_factors,
                               float *fluxes) {
  const int elem = blockDim.x * blockIdx.x + threadIdx.x;
  const float factor = step_factors[elem] / float(RK + 1 - j);

  // Flat offsets of this element's entries; the same offset is valid in
  // old_variables, variables and fluxes.
  const int at_density = elem + VAR_DENSITY * nelr;
  const int at_density_energy = elem + VAR_DENSITY_ENERGY * nelr;
  const int at_momentum_x = elem + (VAR_MOMENTUM + 0) * nelr;
  const int at_momentum_y = elem + (VAR_MOMENTUM + 1) * nelr;
  const int at_momentum_z = elem + (VAR_MOMENTUM + 2) * nelr;

  variables[at_density] =
      old_variables[at_density] + factor * fluxes[at_density];
  variables[at_density_energy] =
      old_variables[at_density_energy] + factor * fluxes[at_density_energy];
  variables[at_momentum_x] =
      old_variables[at_momentum_x] + factor * fluxes[at_momentum_x];
  variables[at_momentum_y] =
      old_variables[at_momentum_y] + factor * fluxes[at_momentum_y];
  variables[at_momentum_z] =
      old_variables[at_momentum_z] + factor * fluxes[at_momentum_z];
}
void time_step(int j, int nelr, float *old_variables, float *variables,
float *step_factors, float *fluxes) {
dim3 Dg(nelr / BLOCK_SIZE_4), Db(BLOCK_SIZE_4);
cuda_time_step<<<Dg, Db>>>(j, nelr, old_variables, variables, step_factors,
fluxes);
getLastCudaError("update failed");
}
/*
* Main function
*/
// Program entry point.
//
// Usage: <program> <data file>
//
// Steps:
//   1. compute the far field state on the host and copy it into the ff_*
//      device symbols;
//   2. read the domain geometry (areas, neighbor indices, face normals) from
//      the data file, pad it to a multiple of BLOCK_SIZE_0 elements and
//      upload it;
//   3. run `iterations` outer iterations, each made of one step-factor
//      computation followed by RK (compute_flux, time_step) stages;
//   4. print the time per iteration, dump the solution and release the
//      buffers.
//
// Returns 0 on success and when no data file is given (after printing a
// hint), 1 when the element count cannot be read from the data file.
int main(int argc, char **argv) {
  printf("WG size of kernel:initialize = %d, WG size of "
         "kernel:compute_step_factor = %d, WG size of kernel:compute_flux = "
         "%d, WG size of kernel:time_step = %d\n",
         BLOCK_SIZE_1, BLOCK_SIZE_2, BLOCK_SIZE_3, BLOCK_SIZE_4);
  if (argc < 2) {
    std::cout << "specify data file name" << std::endl;
    return 0;
  }
  const char *data_file_name = argv[1];

  checkCudaErrors(cudaSetDevice(0));

  // set far field conditions and load them into the ff_* symbols on the gpu
  {
    float h_ff_variable[NVAR];
    const float angle_of_attack =
        float(3.1415926535897931 / 180.0f) * float(deg_angle_of_attack);

    h_ff_variable[VAR_DENSITY] = float(1.4);

    float ff_pressure = float(1.0f);
    float ff_speed_of_sound =
        sqrt(GAMMA * ff_pressure / h_ff_variable[VAR_DENSITY]);
    float ff_speed = float(ff_mach) * ff_speed_of_sound;

    float3 ff_velocity;
    ff_velocity.x = ff_speed * float(cos((float)angle_of_attack));
    ff_velocity.y = ff_speed * float(sin((float)angle_of_attack));
    ff_velocity.z = 0.0f;

    h_ff_variable[VAR_MOMENTUM + 0] =
        h_ff_variable[VAR_DENSITY] * ff_velocity.x;
    h_ff_variable[VAR_MOMENTUM + 1] =
        h_ff_variable[VAR_DENSITY] * ff_velocity.y;
    h_ff_variable[VAR_MOMENTUM + 2] =
        h_ff_variable[VAR_DENSITY] * ff_velocity.z;

    h_ff_variable[VAR_DENSITY_ENERGY] =
        h_ff_variable[VAR_DENSITY] * (float(0.5f) * (ff_speed * ff_speed)) +
        (ff_pressure / float(GAMMA - 1.0f));

    float3 h_ff_momentum;
    h_ff_momentum.x = *(h_ff_variable + VAR_MOMENTUM + 0);
    h_ff_momentum.y = *(h_ff_variable + VAR_MOMENTUM + 1);
    h_ff_momentum.z = *(h_ff_variable + VAR_MOMENTUM + 2);
    float3 h_ff_flux_contribution_momentum_x;
    float3 h_ff_flux_contribution_momentum_y;
    float3 h_ff_flux_contribution_momentum_z;
    float3 h_ff_flux_contribution_density_energy;
    compute_flux_contribution(h_ff_variable[VAR_DENSITY], h_ff_momentum,
                              h_ff_variable[VAR_DENSITY_ENERGY], ff_pressure,
                              ff_velocity, h_ff_flux_contribution_momentum_x,
                              h_ff_flux_contribution_momentum_y,
                              h_ff_flux_contribution_momentum_z,
                              h_ff_flux_contribution_density_energy);

    // copy far field conditions to the gpu
    checkCudaErrors(
        cudaMemcpyToSymbol(ff_variable, h_ff_variable, NVAR * sizeof(float)));
    checkCudaErrors(cudaMemcpyToSymbol(ff_flux_contribution_momentum_x,
                                       &h_ff_flux_contribution_momentum_x,
                                       sizeof(float3)));
    checkCudaErrors(cudaMemcpyToSymbol(ff_flux_contribution_momentum_y,
                                       &h_ff_flux_contribution_momentum_y,
                                       sizeof(float3)));
    checkCudaErrors(cudaMemcpyToSymbol(ff_flux_contribution_momentum_z,
                                       &h_ff_flux_contribution_momentum_z,
                                       sizeof(float3)));
    checkCudaErrors(cudaMemcpyToSymbol(ff_flux_contribution_density_energy,
                                       &h_ff_flux_contribution_density_energy,
                                       sizeof(float3)));
  }

  int nel;
  int nelr;

  // read in domain geometry
  float *areas;
  int *elements_surrounding_elements;
  float *normals;
  {
    std::ifstream file(data_file_name);
    // Without a valid element count every size below would be meaningless
    // (and `nel - 1` would be a negative index), so stop here.
    if (!(file >> nel) || nel <= 0) {
      std::cout << "cannot read element count from " << data_file_name
                << std::endl;
      return 1;
    }
    // nelr = nel rounded up to the next multiple of BLOCK_SIZE_0.
    nelr =
        BLOCK_SIZE_0 * ((nel / BLOCK_SIZE_0) + std::min(1, nel % BLOCK_SIZE_0));

    float *h_areas = new float[nelr];
    int *h_elements_surrounding_elements = new int[nelr * NNB];
    float *h_normals = new float[nelr * NDIM * NNB];

    // read in data
    for (int i = 0; i < nel; i++) {
      file >> h_areas[i];
      for (int j = 0; j < NNB; j++) {
        file >> h_elements_surrounding_elements[i + j * nelr];
        // Every negative index is first collapsed to -1; the decrement below
        // then maps it to -2 and maps a positive 1-based index k to k - 1.
        if (h_elements_surrounding_elements[i + j * nelr] < 0)
          h_elements_surrounding_elements[i + j * nelr] = -1;
        h_elements_surrounding_elements[i + j * nelr]--; // it's coming in with
                                                         // Fortran numbering

        // Normals are stored with their sign flipped.
        for (int k = 0; k < NDIM; k++) {
          file >> h_normals[i + (j + k * NNB) * nelr];
          h_normals[i + (j + k * NNB) * nelr] =
              -h_normals[i + (j + k * NNB) * nelr];
        }
      }
    }

    // fill in remaining data: the padding elements [nel, nelr) are copies of
    // the last real element
    int last = nel - 1;
    for (int i = nel; i < nelr; i++) {
      h_areas[i] = h_areas[last];
      for (int j = 0; j < NNB; j++) {
        // duplicate the last element
        h_elements_surrounding_elements[i + j * nelr] =
            h_elements_surrounding_elements[last + j * nelr];
        // The destination must be element i: writing to `last` again would
        // leave the normals of the padding elements uninitialized.
        for (int k = 0; k < NDIM; k++)
          h_normals[i + (j + k * NNB) * nelr] =
              h_normals[last + (j + k * NNB) * nelr];
      }
    }

    areas = alloc<float>(nelr);
    upload<float>(areas, h_areas, nelr);

    elements_surrounding_elements = alloc<int>(nelr * NNB);
    upload<int>(elements_surrounding_elements, h_elements_surrounding_elements,
                nelr * NNB);

    normals = alloc<float>(nelr * NDIM * NNB);
    upload<float>(normals, h_normals, nelr * NDIM * NNB);

    delete[] h_areas;
    delete[] h_elements_surrounding_elements;
    delete[] h_normals;
  }

  // Create arrays and set initial conditions
  float *variables = alloc<float>(nelr * NVAR);
  initialize_variables(nelr, variables);

  float *old_variables = alloc<float>(nelr * NVAR);
  float *fluxes = alloc<float>(nelr * NVAR);
  float *step_factors = alloc<float>(nelr);

  // make sure all memory is properly allocated and initialized before we
  // start timing
  initialize_variables(nelr, old_variables);
  initialize_variables(nelr, fluxes);
  cudaMemset((void *)step_factors, 0, sizeof(float) * nelr);
  // make sure CUDA isn't still doing something before we start timing
  cudaThreadSynchronize();

  // these need to be computed the first time in order to compute time step
  std::cout << "Starting..." << std::endl;

  StopWatchInterface *timer = 0;
  // unsigned int timer = 0;
  // CUT_SAFE_CALL( cutCreateTimer( &timer));
  // CUT_SAFE_CALL( cutStartTimer( timer));
  sdkCreateTimer(&timer);
  sdkStartTimer(&timer);

  // Begin iterations
  for (int i = 0; i < iterations; i++) {
    copy<float>(old_variables, variables, nelr * NVAR);

    // the step factor is recomputed at the start of every iteration
    compute_step_factor(nelr, variables, areas, step_factors);
    getLastCudaError("compute_step_factor failed");

    for (int j = 0; j < RK; j++) {
      compute_flux(nelr, elements_surrounding_elements, normals, variables,
                   fluxes);
      getLastCudaError("compute_flux failed");
      time_step(j, nelr, old_variables, variables, step_factors, fluxes);
      getLastCudaError("time_step failed");
    }
  }

  cudaThreadSynchronize();
  // CUT_SAFE_CALL( cutStopTimer(timer) );
  sdkStopTimer(&timer);

  std::cout << (sdkGetAverageTimerValue(&timer) / 1000.0) / iterations
            << " seconds per iteration" << std::endl;

  std::cout << "Saving solution..." << std::endl;
  dump(variables, nel, nelr);
  std::cout << "Saved solution..." << std::endl;

  std::cout << "Cleaning up..." << std::endl;
  dealloc<float>(areas);
  dealloc<int>(elements_surrounding_elements);
  dealloc<float>(normals);

  dealloc<float>(variables);
  dealloc<float>(old_variables);
  dealloc<float>(fluxes);
  dealloc<float>(step_factors);

  std::cout << "Done..." << std::endl;

  return 0;
}

15
examples/cfd/run.sh Normal file
View File

@ -0,0 +1,15 @@
# # #!/bin/bash
# Build-and-run recipe for the cfd (euler3d) example.
# NOTE(review): every path below (CUDA 10.1 install, /home/robinhan/... build
# tree, ../rodinia-data) is specific to the author's machine and has to be
# adapted before this script can run anywhere else.
#
# Step 1: compile euler3d.cu with clang++; -save-temps keeps the intermediate
# files, including the device and host bitcode consumed by the next steps.
clang++ euler3d.cu -I/usr/local/cuda-10.1/samples/common/inc --cuda-path=/usr/local/cuda-10.1 --cuda-gpu-arch=sm_50 -L/usr/local/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v
# Step 2: run the project's kernelTranslator on the device bitcode (-> kernel.bc)
# and its hostTranslator on the host bitcode (-> host.bc).
/home/robinhan/repo/open_source_template/build/compilation/kernelTranslator euler3d-cuda-nvptx64-nvidia-cuda-sm_50.bc kernel.bc
/home/robinhan/repo/open_source_template/build/compilation/hostTranslator euler3d-host-x86_64-unknown-linux-gnu.bc host.bc
# Step 3: lower both translated bitcode files to position-independent objects.
llc --relocation-model=pic --filetype=obj kernel.bc
llc --relocation-model=pic --filetype=obj host.bc
# Step 4: link the two objects against the project's x86Runtime and threadPool
# libraries.
g++ -Wall -L/home/robinhan/repo/open_source_template/build/runtime -L/home/robinhan/repo/open_source_template/build/runtime/threadPool -o a.out -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread
# Step 5: run the resulting binary on the fvcorr.domn.097K data file.
./a.out ../rodinia-data/cfd/fvcorr.domn.097K
# Leftover invocations from other examples, kept commented out.
# ./demo 1024
# # # ./demo -f ../../data/matrix3.txt
# # # run -f ../../data/gaussian/matrix3.txt

View File

@ -0,0 +1,396 @@
; ModuleID = 'gaussian-cuda-nvptx64-nvidia-cuda-sm_61.bc'
source_filename = "gaussian.cu"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
%struct.__cuda_builtin_threadIdx_t = type { i8 }
%struct.__cuda_builtin_blockIdx_t = type { i8 }
%struct.__cuda_builtin_blockDim_t = type { i8 }
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
$_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv = comdat any
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv = comdat any
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv = comdat any
$_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv = comdat any
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
@blockDim = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockDim_t, align 1
; Function Attrs: convergent noinline nounwind optnone
; Weak device-side stub for cudaMalloc: spills both arguments to stack slots
; and unconditionally returns the constant 999; nothing is allocated.
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
entry:
%p.addr = alloca i8**, align 8
%s.addr = alloca i64, align 8
store i8** %p, i8*** %p.addr, align 8
store i64 %s, i64* %s.addr, align 8
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
; Weak device-side stub for cudaFuncGetAttributes: spills both arguments to
; stack slots and unconditionally returns 999; %p is never written through.
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
entry:
%p.addr = alloca %struct.cudaFuncAttributes*, align 8
%c.addr = alloca i8*, align 8
store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
store i8* %c, i8** %c.addr, align 8
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
; Weak device-side stub for cudaDeviceGetAttribute: spills its three arguments
; to stack slots and unconditionally returns 999; %value is never written
; through.
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
entry:
%value.addr = alloca i32*, align 8
%attr.addr = alloca i32, align 4
%device.addr = alloca i32, align 4
store i32* %value, i32** %value.addr, align 8
store i32 %attr, i32* %attr.addr, align 4
store i32 %device, i32* %device.addr, align 4
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
; Weak device-side stub for cudaGetDevice: spills its argument to a stack slot
; and unconditionally returns 999; %device is never written through.
define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
entry:
%device.addr = alloca i32*, align 8
store i32* %device, i32** %device.addr, align 8
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
; Weak device-side stub for cudaOccupancyMaxActiveBlocksPerMultiprocessor:
; spills its four arguments to stack slots and unconditionally returns 999;
; %numBlocks is never written through.
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
entry:
%numBlocks.addr = alloca i32*, align 8
%func.addr = alloca i8*, align 8
%blockSize.addr = alloca i32, align 4
%dynamicSmemSize.addr = alloca i64, align 8
store i32* %numBlocks, i32** %numBlocks.addr, align 8
store i8* %func, i8** %func.addr, align 8
store i32 %blockSize, i32* %blockSize.addr, align 4
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
; Weak device-side stub for
; cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags: spills its five
; arguments to stack slots and unconditionally returns 999; %numBlocks is
; never written through.
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
entry:
%numBlocks.addr = alloca i32*, align 8
%func.addr = alloca i8*, align 8
%blockSize.addr = alloca i32, align 4
%dynamicSmemSize.addr = alloca i64, align 8
%flags.addr = alloca i32, align 4
store i32* %numBlocks, i32** %numBlocks.addr, align 8
store i8* %func, i8** %func.addr, align 8
store i32 %blockSize, i32* %blockSize.addr, align 4
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
store i32 %flags, i32* %flags.addr, align 4
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
; Kernel Fan1(m_cuda, a_cuda, Size, t), listed as a kernel in
; !nvvm.annotations.
; With idx = threadIdx.x + blockIdx.x * blockDim.x:
;   - returns immediately when idx >= Size - 1 - t (unsigned comparison);
;   - otherwise stores
;       m_cuda[Size * (idx + t + 1) + t] =
;           a_cuda[Size * (idx + t + 1) + t] / a_cuda[Size * t + t]
define dso_local void @_Z4Fan1PfS_ii(float* %m_cuda, float* %a_cuda, i32 %Size, i32 %t) #0 {
entry:
%m_cuda.addr = alloca float*, align 8
%a_cuda.addr = alloca float*, align 8
%Size.addr = alloca i32, align 4
%t.addr = alloca i32, align 4
store float* %m_cuda, float** %m_cuda.addr, align 8
store float* %a_cuda, float** %a_cuda.addr, align 8
store i32 %Size, i32* %Size.addr, align 4
store i32 %t, i32* %t.addr, align 4
; Bounds guard: idx >= Size - 1 - t -> return.
%call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
%call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
%call2 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
%mul = mul i32 %call1, %call2
%add = add i32 %call, %mul
%0 = load i32, i32* %Size.addr, align 4
%sub = sub nsw i32 %0, 1
%1 = load i32, i32* %t.addr, align 4
%sub3 = sub nsw i32 %sub, %1
%cmp = icmp uge i32 %add, %sub3
br i1 %cmp, label %if.then, label %if.end
if.then: ; preds = %entry
br label %return
if.end: ; preds = %entry
; Numerator: a_cuda[Size * (idx + t + 1) + t].
%2 = load float*, float** %a_cuda.addr, align 8
%3 = load i32, i32* %Size.addr, align 4
%call4 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
%call5 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
%mul6 = mul i32 %call4, %call5
%call7 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
%add8 = add i32 %mul6, %call7
%4 = load i32, i32* %t.addr, align 4
%add9 = add i32 %add8, %4
%add10 = add i32 %add9, 1
%mul11 = mul i32 %3, %add10
%idx.ext = zext i32 %mul11 to i64
%add.ptr = getelementptr inbounds float, float* %2, i64 %idx.ext
%5 = load i32, i32* %t.addr, align 4
%idx.ext12 = sext i32 %5 to i64
%add.ptr13 = getelementptr inbounds float, float* %add.ptr, i64 %idx.ext12
%6 = load float, float* %add.ptr13, align 4
; Denominator: a_cuda[Size * t + t].
%7 = load float*, float** %a_cuda.addr, align 8
%8 = load i32, i32* %Size.addr, align 4
%9 = load i32, i32* %t.addr, align 4
%mul14 = mul nsw i32 %8, %9
%idx.ext15 = sext i32 %mul14 to i64
%add.ptr16 = getelementptr inbounds float, float* %7, i64 %idx.ext15
%10 = load i32, i32* %t.addr, align 4
%idx.ext17 = sext i32 %10 to i64
%add.ptr18 = getelementptr inbounds float, float* %add.ptr16, i64 %idx.ext17
%11 = load float, float* %add.ptr18, align 4
%div = fdiv float %6, %11
; Destination: m_cuda[Size * (idx + t + 1) + t].
%12 = load float*, float** %m_cuda.addr, align 8
%13 = load i32, i32* %Size.addr, align 4
%call19 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
%call20 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
%mul21 = mul i32 %call19, %call20
%call22 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
%add23 = add i32 %mul21, %call22
%14 = load i32, i32* %t.addr, align 4
%add24 = add i32 %add23, %14
%add25 = add i32 %add24, 1
%mul26 = mul i32 %13, %add25
%idx.ext27 = zext i32 %mul26 to i64
%add.ptr28 = getelementptr inbounds float, float* %12, i64 %idx.ext27
%15 = load i32, i32* %t.addr, align 4
%idx.ext29 = sext i32 %15 to i64
%add.ptr30 = getelementptr inbounds float, float* %add.ptr28, i64 %idx.ext29
store float %div, float* %add.ptr30, align 4
br label %return
return: ; preds = %if.end, %if.then
ret void
}
; Function Attrs: alwaysinline convergent nounwind
; threadIdx.x getter: returns the value of the llvm.nvvm.read.ptx.sreg.tid.x
; intrinsic.
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
ret i32 %0
}
; Function Attrs: alwaysinline convergent nounwind
; blockIdx.x getter: returns the value of the llvm.nvvm.read.ptx.sreg.ctaid.x
; intrinsic.
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
ret i32 %0
}
; Function Attrs: alwaysinline convergent nounwind
; blockDim.x getter: returns the value of the llvm.nvvm.read.ptx.sreg.ntid.x
; intrinsic.
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
ret i32 %0
}
; Function Attrs: convergent noinline nounwind optnone
; Kernel Fan2(m_cuda, a_cuda, b_cuda, Size, j1, t), listed as a kernel in
; !nvvm.annotations. %j1 is stored to its stack slot but never read.
; With xidx = blockIdx.x * blockDim.x + threadIdx.x and
;      yidx = blockIdx.y * blockDim.y + threadIdx.y:
;   - returns when xidx >= Size - 1 - t or yidx >= Size - t (both unsigned);
;   - a_cuda[Size * (xidx + 1 + t) + (yidx + t)] -=
;         m_cuda[Size * (xidx + 1 + t) + t] * a_cuda[Size * t + (yidx + t)]
;   - only when yidx == 0:
;     b_cuda[xidx + 1 + t] -=
;         m_cuda[Size * (xidx + 1 + t) + (yidx + t)] * b_cuda[t]
define dso_local void @_Z4Fan2PfS_S_iii(float* %m_cuda, float* %a_cuda, float* %b_cuda, i32 %Size, i32 %j1, i32 %t) #0 {
entry:
%m_cuda.addr = alloca float*, align 8
%a_cuda.addr = alloca float*, align 8
%b_cuda.addr = alloca float*, align 8
%Size.addr = alloca i32, align 4
%j1.addr = alloca i32, align 4
%t.addr = alloca i32, align 4
%xidx = alloca i32, align 4
%yidx = alloca i32, align 4
store float* %m_cuda, float** %m_cuda.addr, align 8
store float* %a_cuda, float** %a_cuda.addr, align 8
store float* %b_cuda, float** %b_cuda.addr, align 8
store i32 %Size, i32* %Size.addr, align 4
store i32 %j1, i32* %j1.addr, align 4
store i32 %t, i32* %t.addr, align 4
; First guard: x index >= Size - 1 - t -> return.
%call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
%call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
%call2 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
%mul = mul i32 %call1, %call2
%add = add i32 %call, %mul
%0 = load i32, i32* %Size.addr, align 4
%sub = sub nsw i32 %0, 1
%1 = load i32, i32* %t.addr, align 4
%sub3 = sub nsw i32 %sub, %1
%cmp = icmp uge i32 %add, %sub3
br i1 %cmp, label %if.then, label %if.end
if.then: ; preds = %entry
br label %if.end58
if.end: ; preds = %entry
; Second guard: y index >= Size - t -> return.
%call4 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #3
%call5 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #3
%call6 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #3
%mul7 = mul i32 %call5, %call6
%add8 = add i32 %call4, %mul7
%2 = load i32, i32* %Size.addr, align 4
%3 = load i32, i32* %t.addr, align 4
%sub9 = sub nsw i32 %2, %3
%cmp10 = icmp uge i32 %add8, %sub9
br i1 %cmp10, label %if.then11, label %if.end12
if.then11: ; preds = %if.end
br label %if.end58
if.end12: ; preds = %if.end
; Materialize xidx and yidx in their stack slots.
%call13 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
%call14 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
%mul15 = mul i32 %call13, %call14
%call16 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
%add17 = add i32 %mul15, %call16
store i32 %add17, i32* %xidx, align 4
%call18 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #3
%call19 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #3
%mul20 = mul i32 %call18, %call19
%call21 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #3
%add22 = add i32 %mul20, %call21
store i32 %add22, i32* %yidx, align 4
; Load m_cuda[Size * (xidx + 1 + t) + t].
%4 = load float*, float** %m_cuda.addr, align 8
%5 = load i32, i32* %Size.addr, align 4
%6 = load i32, i32* %xidx, align 4
%add23 = add nsw i32 %6, 1
%7 = load i32, i32* %t.addr, align 4
%add24 = add nsw i32 %add23, %7
%mul25 = mul nsw i32 %5, %add24
%8 = load i32, i32* %t.addr, align 4
%add26 = add nsw i32 %mul25, %8
%idxprom = sext i32 %add26 to i64
%arrayidx = getelementptr inbounds float, float* %4, i64 %idxprom
%9 = load float, float* %arrayidx, align 4
; Load a_cuda[Size * t + (yidx + t)] and form the product.
%10 = load float*, float** %a_cuda.addr, align 8
%11 = load i32, i32* %Size.addr, align 4
%12 = load i32, i32* %t.addr, align 4
%mul27 = mul nsw i32 %11, %12
%13 = load i32, i32* %yidx, align 4
%14 = load i32, i32* %t.addr, align 4
%add28 = add nsw i32 %13, %14
%add29 = add nsw i32 %mul27, %add28
%idxprom30 = sext i32 %add29 to i64
%arrayidx31 = getelementptr inbounds float, float* %10, i64 %idxprom30
%15 = load float, float* %arrayidx31, align 4
%mul32 = fmul contract float %9, %15
; Subtract the product from a_cuda[Size * (xidx + 1 + t) + (yidx + t)] in place.
%16 = load float*, float** %a_cuda.addr, align 8
%17 = load i32, i32* %Size.addr, align 4
%18 = load i32, i32* %xidx, align 4
%add33 = add nsw i32 %18, 1
%19 = load i32, i32* %t.addr, align 4
%add34 = add nsw i32 %add33, %19
%mul35 = mul nsw i32 %17, %add34
%20 = load i32, i32* %yidx, align 4
%21 = load i32, i32* %t.addr, align 4
%add36 = add nsw i32 %20, %21
%add37 = add nsw i32 %mul35, %add36
%idxprom38 = sext i32 %add37 to i64
%arrayidx39 = getelementptr inbounds float, float* %16, i64 %idxprom38
%22 = load float, float* %arrayidx39, align 4
%sub40 = fsub contract float %22, %mul32
store float %sub40, float* %arrayidx39, align 4
; Only the yidx == 0 threads also update b_cuda.
%23 = load i32, i32* %yidx, align 4
%cmp41 = icmp eq i32 %23, 0
br i1 %cmp41, label %if.then42, label %if.end58
if.then42: ; preds = %if.end12
; Load m_cuda[Size * (xidx + 1 + t) + (yidx + t)].
%24 = load float*, float** %m_cuda.addr, align 8
%25 = load i32, i32* %Size.addr, align 4
%26 = load i32, i32* %xidx, align 4
%add43 = add nsw i32 %26, 1
%27 = load i32, i32* %t.addr, align 4
%add44 = add nsw i32 %add43, %27
%mul45 = mul nsw i32 %25, %add44
%28 = load i32, i32* %yidx, align 4
%29 = load i32, i32* %t.addr, align 4
%add46 = add nsw i32 %28, %29
%add47 = add nsw i32 %mul45, %add46
%idxprom48 = sext i32 %add47 to i64
%arrayidx49 = getelementptr inbounds float, float* %24, i64 %idxprom48
%30 = load float, float* %arrayidx49, align 4
; Multiply by b_cuda[t] and subtract from b_cuda[xidx + 1 + t] in place.
%31 = load float*, float** %b_cuda.addr, align 8
%32 = load i32, i32* %t.addr, align 4
%idxprom50 = sext i32 %32 to i64
%arrayidx51 = getelementptr inbounds float, float* %31, i64 %idxprom50
%33 = load float, float* %arrayidx51, align 4
%mul52 = fmul contract float %30, %33
%34 = load float*, float** %b_cuda.addr, align 8
%35 = load i32, i32* %xidx, align 4
%add53 = add nsw i32 %35, 1
%36 = load i32, i32* %t.addr, align 4
%add54 = add nsw i32 %add53, %36
%idxprom55 = sext i32 %add54 to i64
%arrayidx56 = getelementptr inbounds float, float* %34, i64 %idxprom55
%37 = load float, float* %arrayidx56, align 4
%sub57 = fsub contract float %37, %mul52
store float %sub57, float* %arrayidx56, align 4
br label %if.end58
if.end58: ; preds = %if.then, %if.then11, %if.then42, %if.end12
ret void
}
; Function Attrs: alwaysinline convergent nounwind
; threadIdx.y getter: returns the value of the llvm.nvvm.read.ptx.sreg.tid.y
; intrinsic.
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
ret i32 %0
}
; Function Attrs: alwaysinline convergent nounwind
; blockIdx.y getter: returns the value of the llvm.nvvm.read.ptx.sreg.ctaid.y
; intrinsic.
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
ret i32 %0
}
; Function Attrs: alwaysinline convergent nounwind
; blockDim.y getter: returns the value of the llvm.nvvm.read.ptx.sreg.ntid.y
; intrinsic.
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
ret i32 %0
}
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y() #2
attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind readnone }
attributes #3 = { convergent nounwind }
!llvm.module.flags = !{!0, !1, !2}
!nvvm.annotations = !{!3, !4, !5, !6, !5, !7, !7, !7, !7, !8, !8, !7}
!llvm.ident = !{!9}
!nvvmir.version = !{!10}
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
!3 = !{void (float*, float*, i32, i32)* @_Z4Fan1PfS_ii, !"kernel", i32 1}
!4 = !{void (float*, float*, float*, i32, i32, i32)* @_Z4Fan2PfS_S_iii, !"kernel", i32 1}
!5 = !{null, !"align", i32 8}
!6 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
!7 = !{null, !"align", i32 16}
!8 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
!9 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
!10 = !{i32 1, i32 4}

File diff suppressed because one or more lines are too long

522
examples/gauss/gaussian.cu Normal file
View File

@ -0,0 +1,522 @@
/*-----------------------------------------------------------
** gaussian.cu -- The program is to solve a linear system Ax = b
** by using Gaussian Elimination. The algorithm on page 101
** ("Foundations of Parallel Programming") is used.
** The sequential version is gaussian.c. This parallel
** implementation converts three independent for() loops
** into three Fans. Use the data file ge_3.dat to verify
** the correctness of the output.
**
** Written by Andreas Kura, 02/15/95
** Modified by Chong-wei Xu, 04/20/95
** Modified by Chris Gregg for CUDA, 07/20/2009
**-----------------------------------------------------------
*/
#include "cuda_runtime.h"
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#ifdef TIMING
#include "timing.h"
#endif
// 1D defines. MAXBLOCKSIZE is the thread-block size ForwardSub() uses for the
// Fan1 launch. The most specific RD_WG_SIZE_* override that is defined wins;
// with no override it defaults to 512.
#ifdef RD_WG_SIZE_0_0
#define MAXBLOCKSIZE RD_WG_SIZE_0_0
#elif defined(RD_WG_SIZE_0)
#define MAXBLOCKSIZE RD_WG_SIZE_0
#elif defined(RD_WG_SIZE)
#define MAXBLOCKSIZE RD_WG_SIZE
#else
#define MAXBLOCKSIZE 512
#endif
// 2D defines. Go from specific to general
// BLOCK_SIZE_XY is the edge length of the square thread block ForwardSub()
// uses for the Fan2 launch; with no override it defaults to 1.
#ifdef RD_WG_SIZE_1_0
#define BLOCK_SIZE_XY RD_WG_SIZE_1_0
#elif defined(RD_WG_SIZE_1)
#define BLOCK_SIZE_XY RD_WG_SIZE_1
#elif defined(RD_WG_SIZE)
#define BLOCK_SIZE_XY RD_WG_SIZE
#else
#define BLOCK_SIZE_XY 1
#endif
// Optional instrumentation state, only compiled with -DTIMING. In this file
// only tv, tv_kernel_start and kernel_time are touched (in ForwardSub() and
// main()); the remaining entries are declared but not used here.
#ifdef TIMING
struct timeval tv;
struct timeval tv_total_start, tv_total_end;
struct timeval tv_h2d_start, tv_h2d_end;
struct timeval tv_d2h_start, tv_d2h_end;
struct timeval tv_kernel_start, tv_kernel_end;
struct timeval tv_mem_alloc_start, tv_mem_alloc_end;
struct timeval tv_close_start, tv_close_end;
float init_time = 0, mem_alloc_time = 0, h2d_time = 0, kernel_time = 0,
d2h_time = 0, close_time = 0, total_time = 0;
#endif
// Problem state shared by all host functions below.
// Size: dimension n of the n x n system (also used as the row stride).
int Size;
// a: Size x Size coefficient matrix (row-major), b: right-hand side vector,
// finalVec: solution vector allocated by BackSub().
float *a, *b, *finalVec;
// m: Size x Size multiplier matrix, zeroed by InitPerRun() and filled by Fan1.
float *m;
// Input file handle opened by InitProblemOnce() and read by InitMat()/InitAry().
FILE *fp;
void InitProblemOnce(char *filename);
void InitPerRun();
void ForwardSub();
void BackSub();
__global__ void Fan1(float *m, float *a, int Size, int t);
__global__ void Fan2(float *m, float *a, float *b, int Size, int j1, int t);
void InitMat(float *ary, int nrow, int ncol);
void InitAry(float *ary, int ary_size);
void PrintMat(float *ary, int nrow, int ncolumn);
void PrintAry(float *ary, int ary_size);
void PrintDeviceProperties();
void checkCUDAError(const char *msg);
// Wall-clock time of the kernel loop in microseconds, set by ForwardSub().
unsigned int totalKernelTime = 0;
// create the coefficient matrix, Ke Wang 2013/08/12 11:51:06
// (the matching right-hand side is filled in by the caller)
/*------------------------------------------------------
 ** create_matrix() -- Fill the size x size row-major
 ** matrix m with m[i][j] = 10 * exp(-0.01 * |i - j|).
 **
 ** m    : output buffer holding at least size * size floats
 ** size : matrix dimension; nothing is written if size <= 0
 **
 ** The 2*size-1 distinct coefficients are kept in a heap
 ** buffer: the previous variable-length array is not
 ** standard C++ and could overflow the stack for large
 ** sizes. The program exits if that buffer cannot be
 ** allocated.
 **------------------------------------------------------
 */
void create_matrix(float *m, int size) {
  if (m == NULL || size <= 0)
    return;
  int i, j;
  float lamda = -0.01;
  float *coe = (float *)malloc((2 * (size_t)size - 1) * sizeof(float));
  if (coe == NULL) {
    fprintf(stderr, "create_matrix: out of memory for size %d\n", size);
    exit(EXIT_FAILURE);
  }
  float coe_i = 0.0;
  // coe[size - 1 + k] == coe[size - 1 - k] == 10 * exp(lamda * k)
  for (i = 0; i < size; i++) {
    coe_i = 10 * exp(lamda * i);
    j = size - 1 + i;
    coe[j] = coe_i;
    j = size - 1 - i;
    coe[j] = coe_i;
  }
  for (i = 0; i < size; i++) {
    for (j = 0; j < size; j++) {
      m[i * size + j] = coe[size - 1 - i + j];
    }
  }
  free(coe);
}
/*------------------------------------------------------
 ** main() -- Parse the command line, load or generate the
 ** linear system, run the forward elimination on the
 ** device and the back substitution on the host, then
 ** report the timings.
 **
 ** Options:
 **   -f filename  read the system from a data file
 **   -s size      generate a size x size system internally
 **   -q           quiet: do not print matrices and results
 **------------------------------------------------------
 */
int main(int argc, char *argv[]) {
  printf("WG size of kernel 1 = %d, WG size of kernel 2= %d X %d\n",
         MAXBLOCKSIZE, BLOCK_SIZE_XY, BLOCK_SIZE_XY);
  int verbose = 1;
  int i, j;
  char flag;
  if (argc < 2) {
    printf("Usage: gaussian -f filename / -s size [-q]\n\n");
    printf("-q (quiet) suppresses printing the matrix and result values.\n");
    printf("-f (filename) path of input file\n");
    printf(
        "-s (size) size of matrix. Create matrix and rhs in this program \n");
    printf(
        "The first line of the file contains the dimension of the matrix, n.");
    printf("The second line of the file is a newline.\n");
    printf("The next n lines contain n tab separated values for the matrix.");
    printf("The next line of the file is a newline.\n");
    printf("The next line of the file is a 1xn vector with tab separated "
           "values.\n");
    printf("The next line of the file is a newline. (optional)\n");
    printf("The final line of the file is the pre-computed solution. "
           "(optional)\n");
    printf("Example: matrix4.txt:\n");
    printf("4\n");
    printf("\n");
    printf("-0.6 -0.5 0.7 0.3\n");
    printf("-0.3 -0.9 0.3 0.7\n");
    printf("-0.4 -0.5 -0.3 -0.8\n");
    printf("0.0 -0.1 0.2 0.9\n");
    printf("\n");
    printf("-0.85 -0.68 0.24 -0.53\n");
    printf("\n");
    printf("0.7 0.0 -0.4 -0.5\n");
    exit(0);
  }
  cudaSetDevice(0);
  PrintDeviceProperties();
  // char filename[100];
  // sprintf(filename,"matrices/matrix%d.txt",size);
  for (i = 1; i < argc; i++) {
    if (argv[i][0] == '-') { // flag
      flag = argv[i][1];
      switch (flag) {
      case 's': // size: generate the system internally
        // the option needs a value; without this check argv[argc] (NULL)
        // would be handed to atoi()
        if (i + 1 >= argc) {
          fprintf(stderr, "gaussian: option -s requires a size\n");
          exit(EXIT_FAILURE);
        }
        i++;
        Size = atoi(argv[i]);
        if (Size <= 0) {
          fprintf(stderr, "gaussian: invalid matrix size '%s'\n", argv[i]);
          exit(EXIT_FAILURE);
        }
        printf("Create matrix internally in parse, size = %d \n", Size);
        a = (float *)malloc(Size * Size * sizeof(float));
        b = (float *)malloc(Size * sizeof(float));
        m = (float *)malloc(Size * Size * sizeof(float));
        if (a == NULL || b == NULL || m == NULL) {
          fprintf(stderr, "gaussian: out of memory for size %d\n", Size);
          exit(EXIT_FAILURE);
        }
        create_matrix(a, Size);
        for (j = 0; j < Size; j++)
          b[j] = 1.0;
        break;
      case 'f': // filename: read the system from a file
        if (i + 1 >= argc) {
          fprintf(stderr, "gaussian: option -f requires a file name\n");
          exit(EXIT_FAILURE);
        }
        i++;
        printf("Read file from %s \n", argv[i]);
        InitProblemOnce(argv[i]);
        break;
      case 'q': // quiet
        // was "verbose = 1", which made -q a no-op
        verbose = 0;
        break;
      }
    }
  }
  // neither -s nor -f produced a usable system (e.g. only -q was given)
  if (a == NULL || b == NULL || m == NULL) {
    fprintf(stderr, "gaussian: no input given; use -f filename or -s size\n");
    exit(EXIT_FAILURE);
  }
  // InitProblemOnce(filename);
  InitPerRun();
  // begin timing
  struct timeval time_start;
  gettimeofday(&time_start, NULL);
  // run kernels
  ForwardSub();
  // end timing
  struct timeval time_end;
  gettimeofday(&time_end, NULL);
  unsigned int time_total = (time_end.tv_sec * 1000000 + time_end.tv_usec) -
                            (time_start.tv_sec * 1000000 + time_start.tv_usec);
  if (verbose) {
    printf("Matrix m is: \n");
    PrintMat(m, Size, Size);
    printf("Matrix a is: \n");
    PrintMat(a, Size, Size);
    printf("Array b is: \n");
    PrintAry(b, Size);
  }
  BackSub();
  if (verbose) {
    printf("The final solution is: \n");
    PrintAry(finalVec, Size);
  }
  printf("\nTime total (including memory transfers)\t%f sec\n",
         time_total * 1e-6);
  printf("Time for CUDA kernels:\t%f sec\n", totalKernelTime * 1e-6);
  /*printf("%d,%d\n",size,time_total);
  fprintf(stderr,"%d,%d\n",size,time_total);*/
  free(m);
  free(a);
  free(b);
  // allocated by BackSub(); previously never released
  free(finalVec);
#ifdef TIMING
  printf("Exec: %f\n", kernel_time);
#endif
}
/*------------------------------------------------------
 ** PrintDeviceProperties() -- Print the number of devices
 ** reported by cudaGetDeviceCount() and, for each device
 ** whose properties can be queried, a table of its
 ** properties. For a device whose query fails, the error
 ** string of the last runtime error is printed instead.
 **
 ** The two memory sizes are size_t values, so they are
 ** printed with %zu (as the other size_t fields already
 ** were) rather than %lu.
 **-----------------------------------------------------
 */
void PrintDeviceProperties() {
  cudaDeviceProp deviceProp;
  int nDevCount = 0;
  cudaGetDeviceCount(&nDevCount);
  printf("Total Device found: %d", nDevCount);
  for (int nDeviceIdx = 0; nDeviceIdx < nDevCount; ++nDeviceIdx) {
    memset(&deviceProp, 0, sizeof(deviceProp));
    if (cudaSuccess == cudaGetDeviceProperties(&deviceProp, nDeviceIdx)) {
      printf("\nDevice Name \t\t - %s ", deviceProp.name);
      printf("\n**************************************");
      printf("\nTotal Global Memory\t\t\t - %zu KB",
             deviceProp.totalGlobalMem / 1024);
      printf("\nShared memory available per block \t - %zu KB",
             deviceProp.sharedMemPerBlock / 1024);
      printf("\nNumber of registers per thread block \t - %d",
             deviceProp.regsPerBlock);
      printf("\nWarp size in threads \t\t\t - %d", deviceProp.warpSize);
      printf("\nMemory Pitch \t\t\t\t - %zu bytes", deviceProp.memPitch);
      printf("\nMaximum threads per block \t\t - %d",
             deviceProp.maxThreadsPerBlock);
      printf("\nMaximum Thread Dimension (block) \t - %d %d %d",
             deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1],
             deviceProp.maxThreadsDim[2]);
      printf("\nMaximum Thread Dimension (grid) \t - %d %d %d",
             deviceProp.maxGridSize[0], deviceProp.maxGridSize[1],
             deviceProp.maxGridSize[2]);
      printf("\nTotal constant memory \t\t\t - %zu bytes",
             deviceProp.totalConstMem);
      printf("\nCUDA ver \t\t\t\t - %d.%d", deviceProp.major, deviceProp.minor);
      printf("\nClock rate \t\t\t\t - %d KHz", deviceProp.clockRate);
      printf("\nTexture Alignment \t\t\t - %zu bytes",
             deviceProp.textureAlignment);
      printf("\nDevice Overlap \t\t\t\t - %s",
             deviceProp.deviceOverlap ? "Allowed" : "Not Allowed");
      printf("\nNumber of Multi processors \t\t - %d\n\n",
             deviceProp.multiProcessorCount);
    } else
      printf("\n%s", cudaGetErrorString(cudaGetLastError()));
  }
}
/*------------------------------------------------------
 ** InitProblemOnce() -- Load the linear system from the
 ** data file named by the user.
 **
 ** Reads the dimension into Size, then allocates the
 ** dynamic arrays a (Size x Size), b (Size) and m
 ** (Size x Size) and fills a and b from the file via
 ** InitMat() / InitAry(). m is allocated but left
 ** uninitialized (InitPerRun() zeroes it).
 **
 ** The program exits with an error message if the file
 ** cannot be opened, does not start with a positive
 ** dimension, or memory runs out. The file is closed
 ** before returning.
 **------------------------------------------------------
 */
void InitProblemOnce(char *filename) {
  printf("The file name is: %s\n", filename);
  fp = fopen(filename, "r");
  if (fp == NULL) {
    fprintf(stderr, "Cannot open input file %s\n", filename);
    exit(EXIT_FAILURE);
  }
  if (fscanf(fp, "%d", &Size) != 1 || Size <= 0) {
    fprintf(stderr, "Invalid matrix dimension in input file %s\n", filename);
    fclose(fp);
    fp = NULL;
    exit(EXIT_FAILURE);
  }
  a = (float *)malloc(Size * Size * sizeof(float));
  b = (float *)malloc(Size * sizeof(float));
  m = (float *)malloc(Size * Size * sizeof(float));
  if (a == NULL || b == NULL || m == NULL) {
    fprintf(stderr, "Out of memory for a %d x %d system\n", Size, Size);
    fclose(fp);
    fp = NULL;
    exit(EXIT_FAILURE);
  }
  InitMat(a, Size, Size);
  printf("The input matrix a is:\n");
  PrintMat(a, Size, Size);
  InitAry(b, Size);
  printf("The input array b is:\n");
  PrintAry(b, Size);
  // nothing reads from the file after this point; release the handle
  fclose(fp);
  fp = NULL;
}
/*------------------------------------------------------
 ** InitPerRun() -- Reset every entry of the Size x Size
 ** multiplier matrix m to zero.
 **------------------------------------------------------
 */
void InitPerRun() {
  const int cell_count = Size * Size;
  int k = 0;
  while (k < cell_count) {
    m[k] = 0.0;
    ++k;
  }
}
/*-------------------------------------------------------
 ** Fan1() -- Compute column t of the multiplier matrix.
 ** The thread with global x index g handles row
 ** g + t + 1 and stores
 **   m[row][t] = a[row][t] / a[t][t].
 ** Threads with g >= Size - 1 - t have no row to process
 ** and return immediately. t is the elimination step
 ** chosen by ForwardSub().
 **-------------------------------------------------------
 */
__global__ void Fan1(float *m_cuda, float *a_cuda, int Size, int t) {
  const unsigned int gid = threadIdx.x + blockIdx.x * blockDim.x;
  if (gid >= Size - 1 - t)
    return;
  // offset of the first element of the row owned by this thread
  const unsigned int row_offset = Size * (gid + t + 1);
  float *m_row = m_cuda + row_offset;
  const float *a_row = a_cuda + row_offset;
  const float *pivot_row = a_cuda + Size * t;
  m_row[t] = a_row[t] / pivot_row[t];
}
/*-------------------------------------------------------
 ** Fan2() -- Apply elimination step t to matrix a and
 ** vector b. The thread with global index (x, y) updates
 **   a[x+1+t][y+t] -= m[x+1+t][t] * a[t][y+t]
 ** and the threads with y == 0 additionally update
 **   b[x+1+t] -= m[x+1+t][t] * b[t].
 ** Threads outside x < Size-1-t, y < Size-t do nothing.
 ** The parameter j1 is not used by the kernel.
 **-------------------------------------------------------
 */
__global__ void Fan2(float *m_cuda, float *a_cuda, float *b_cuda, int Size,
                     int j1, int t) {
  const unsigned int gx = threadIdx.x + blockIdx.x * blockDim.x;
  const unsigned int gy = threadIdx.y + blockIdx.y * blockDim.y;
  if (gx >= Size - 1 - t || gy >= Size - t)
    return;
  const int xidx = gx;
  const int yidx = gy;
  const int row = xidx + 1 + t; // row being eliminated
  const int col = yidx + t;     // column being updated
  a_cuda[Size * row + col] -= m_cuda[Size * row + t] * a_cuda[Size * t + col];
  if (yidx == 0) {
    // here col == t, so this reads the multiplier m[row][t]
    b_cuda[row] -= m_cuda[Size * row + col] * b_cuda[t];
  }
}
/*------------------------------------------------------
 ** ForwardSub() -- Forward elimination of Gaussian
 ** elimination, executed on the device.
 **
 ** Copies m, a and b to device buffers, runs Fan1 and
 ** Fan2 once per elimination step t = 0 .. Size-2 (with
 ** a device synchronization after each launch), copies
 ** the three arrays back to the host and frees the device
 ** buffers. The wall-clock time of the kernel loop is
 ** stored in totalKernelTime (microseconds).
 **
 ** Errors from the allocations, copies and launches are
 ** reported through checkCUDAError(), which terminates
 ** the program.
 **------------------------------------------------------
 */
void ForwardSub() {
  int t;
  float *m_cuda, *a_cuda, *b_cuda;
  // allocate memory on GPU
  cudaMalloc((void **)&m_cuda, Size * Size * sizeof(float));
  cudaMalloc((void **)&a_cuda, Size * Size * sizeof(float));
  cudaMalloc((void **)&b_cuda, Size * sizeof(float));
  checkCUDAError("cudaMalloc");
  // copy memory to GPU
  cudaMemcpy(m_cuda, m, Size * Size * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(a_cuda, a, Size * Size * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(b_cuda, b, Size * sizeof(float), cudaMemcpyHostToDevice);
  checkCUDAError("cudaMemcpy host to device");
  // 1D launch configuration for Fan1: ceil(Size / block_size) blocks
  int block_size, grid_size;
  block_size = MAXBLOCKSIZE;
  grid_size = (Size / block_size) + (!(Size % block_size) ? 0 : 1);
  printf("1d grid size: %d\n", grid_size);
  dim3 dimBlock(block_size);
  dim3 dimGrid(grid_size);
  // 2D launch configuration for Fan2: ceil(Size / blockSize2d) blocks per
  // axis. The closing parenthesis used to sit after the whole ternary, which
  // produced the same value only by coincidence.
  int blockSize2d, gridSize2d;
  blockSize2d = BLOCK_SIZE_XY;
  gridSize2d = (Size / blockSize2d) + (!(Size % blockSize2d) ? 0 : 1);
  dim3 dimBlockXY(blockSize2d, blockSize2d);
  printf("BlockXY: %d \n", blockSize2d);
  dim3 dimGridXY(gridSize2d, gridSize2d);
#ifdef TIMING
  gettimeofday(&tv_kernel_start, NULL);
#endif
  printf("first grid size: %d second: %d\n", grid_size, gridSize2d);
  // begin timing kernels
  struct timeval time_start;
  gettimeofday(&time_start, NULL);
  for (t = 0; t < (Size - 1); t++) {
    Fan1<<<dimGrid, dimBlock>>>(m_cuda, a_cuda, Size, t);
    cudaDeviceSynchronize();
    checkCUDAError("Fan1");
    Fan2<<<dimGridXY, dimBlockXY>>>(m_cuda, a_cuda, b_cuda, Size, Size - t, t);
    cudaDeviceSynchronize();
    checkCUDAError("Fan2");
  }
  // end timing kernels
  struct timeval time_end;
  gettimeofday(&time_end, NULL);
  totalKernelTime = (time_end.tv_sec * 1000000 + time_end.tv_usec) -
                    (time_start.tv_sec * 1000000 + time_start.tv_usec);
#ifdef TIMING
  tvsub(&time_end, &tv_kernel_start, &tv);
  kernel_time += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
#endif
  // copy memory back to CPU
  cudaMemcpy(m, m_cuda, Size * Size * sizeof(float), cudaMemcpyDeviceToHost);
  cudaMemcpy(a, a_cuda, Size * Size * sizeof(float), cudaMemcpyDeviceToHost);
  cudaMemcpy(b, b_cuda, Size * sizeof(float), cudaMemcpyDeviceToHost);
  checkCUDAError("cudaMemcpy device to host");
  cudaFree(m_cuda);
  cudaFree(a_cuda);
  cudaFree(b_cuda);
}
/*------------------------------------------------------
 ** BackSub() -- Backward substitution on the host.
 ** Allocates finalVec (Size floats) and solves the upper
 ** triangular system left in a and b, from the last row
 ** up to the first.
 **------------------------------------------------------
 */
void BackSub() {
  // the solution vector is owned by the global finalVec
  finalVec = (float *)malloc(Size * sizeof(float));
  // walk the rows bottom-up; all entries right of the diagonal are known
  for (int row = Size - 1; row >= 0; --row) {
    const float *a_row = a + Size * row;
    finalVec[row] = b[row];
    for (int col = Size - 1; col > row; --col) {
      finalVec[row] -= a_row[col] * finalVec[col];
    }
    finalVec[row] = finalVec[row] / a_row[row];
  }
}
/*------------------------------------------------------
 ** InitMat() -- Read nrow * ncol floats from the open
 ** data file fp into ary, row by row. Rows are stored
 ** with a stride of the global Size, not ncol.
 **------------------------------------------------------
 */
void InitMat(float *ary, int nrow, int ncol) {
  for (int r = 0; r < nrow; ++r) {
    float *row_start = ary + Size * r;
    for (int c = 0; c < ncol; ++c) {
      fscanf(fp, "%f", row_start + c);
    }
  }
}
/*------------------------------------------------------
 ** PrintMat() -- Print the contents of the matrix.
 **
 ** Matrix printing is disabled by default: the function
 ** used to start with an unconditional "return;", which
 ** left the whole body unreachable. The default behavior
 ** (print nothing) is unchanged, but the switch is now
 ** explicit: build with -DGAUSSIAN_PRINT_MATRICES to
 ** print nrow rows of ncol values each. Rows are read
 ** with a stride of the global Size.
 **------------------------------------------------------
 */
void PrintMat(float *ary, int nrow, int ncol) {
#ifdef GAUSSIAN_PRINT_MATRICES
  int i, j;
  for (i = 0; i < nrow; i++) {
    for (j = 0; j < ncol; j++) {
      printf("%8.2f ", *(ary + Size * i + j));
    }
    printf("\n");
  }
  printf("\n");
#else
  // printing disabled: keep the parameters "used" to avoid warnings
  (void)ary;
  (void)nrow;
  (void)ncol;
#endif
}
/*------------------------------------------------------
 ** InitAry() -- Fill the vector ary with ary_size floats
 ** read from the open data file fp.
 **------------------------------------------------------
 */
void InitAry(float *ary, int ary_size) {
  int k = 0;
  while (k < ary_size) {
    fscanf(fp, "%f", ary + k);
    ++k;
  }
}
/*------------------------------------------------------
 ** PrintAry() -- Write the ary_size entries of the vector
 ** to stdout with two decimals, each followed by a space,
 ** then two newlines.
 **------------------------------------------------------
 */
void PrintAry(float *ary, int ary_size) {
  int k = 0;
  while (k < ary_size) {
    printf("%.2f ", ary[k]);
    ++k;
  }
  printf("\n\n");
}
void checkCUDAError(const char *msg) {
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err) {
fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
}

23
examples/gauss/run.sh Executable file
View File

@ -0,0 +1,23 @@
#!/bin/bash
# Build the gaussian example from its LLVM IR, run it on the 4x4 reference
# input and check that the expected solution vector appears in the output.
set -e
# assemble the device and host IR, then translate both for the CPU runtime
llvm-as gaussian-cuda-nvptx64-nvidia-cuda-sm_61.ll
llvm-as gaussian-host-x86_64-unknown-linux-gnu.ll
../../build/compilation/kernelTranslator gaussian-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
../../build/compilation/hostTranslator gaussian-host-x86_64-unknown-linux-gnu.bc host.bc
llc --relocation-model=pic --filetype=obj kernel.bc
llc --relocation-model=pic --filetype=obj host.bc
g++ -Wall -L../../build/runtime \
    -L../../build/runtime/threadPool \
    -o gaussian -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread
# only append the previous value when it is set, so no empty entry is added
export LD_LIBRARY_PATH="../../build/runtime:../../build/runtime/threadPool${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
# overwrite the log: appending would let output of an earlier, passing run
# satisfy the check below even if this run produced a wrong result
./gaussian -f ../../rodinia-data/gaussian/matrix4.txt > res.log
if grep -q "0.70 0.00 -0.40 -0.50" res.log; then
    echo "Pass"
else
    echo "Error result"
    exit 1
fi

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,317 @@
#ifdef __cplusplus
extern "C" {
#endif
/*
* avilib.h
*
* Copyright (C) Thomas Östreich - June 2001
* multiple audio track support Copyright (C) 2002 Thomas Östreich
*
* Original code:
* Copyright (C) 1999 Rainer Johanni <Rainer@Johanni.de>
*
* This file is part of transcode, a linux video stream processing tool
*
* transcode is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* transcode is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Make; see the file COPYING. If not, write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*
*/
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
// #include <windows.h>
#include <errno.h>
#include <inttypes.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>
#ifndef AVILIB_H
#define AVILIB_H
#define AVI_MAX_TRACKS 8
/* One entry of the video index (see avi_t.video_index).
   NOTE(review): presumably key = keyframe flag, pos = file position and
   len = length in bytes of the frame -- confirm against avilib.c. */
typedef struct {
unsigned long key;
unsigned long pos;
unsigned long len;
} video_index_entry;
/* One entry of a track's audio index (see track_t.audio_index).
   NOTE(review): presumably pos = file position, len = chunk length and
   tot = running total of audio bytes -- confirm against avilib.c. */
typedef struct {
unsigned long pos;
unsigned long len;
unsigned long tot;
} audio_index_entry;
/* Per-track audio state; avi_t holds AVI_MAX_TRACKS of these. */
typedef struct track_s {
long a_fmt; /* Audio format, see #defines below */
long a_chans; /* Audio channels, 0 for no audio */
long a_rate; /* Rate in Hz */
long a_bits; /* bits per audio sample */
long mp3rate; /* mp3 bitrate kbs*/
long audio_strn; /* Audio stream number */
long audio_bytes; /* Total number of bytes of audio data */
long audio_chunks; /* Chunks of audio data in the file */
char audio_tag[4]; /* Tag of audio data */
long audio_posc; /* Audio position: chunk */
long audio_posb; /* Audio position: byte within chunk */
/* NOTE(review): the two offsets below carried identical comments; by analogy
   with v_codech_off / v_codecf_off in avi_t they presumably refer to the
   strh and strf records respectively -- confirm. */
long a_codech_off; /* absolute offset of audio codec information */
long a_codecf_off; /* absolute offset of audio codec information */
audio_index_entry *audio_index; /* index entries of this track */
} track_t;
/* Handle describing one open AVI file: video properties, the audio tracks
   and the index tables. */
typedef struct {
long fdes; /* File descriptor of AVI file */
/* NOTE(review): the original comment here said "0 for reading, 1 for
   writing", but AVI_MODE_WRITE is defined as 0 and AVI_MODE_READ as 1 in
   this header -- confirm which convention avilib.c actually stores. */
long mode; /* access mode of the file */
long width; /* Width of a video frame */
long height; /* Height of a video frame */
double fps; /* Frames per second */
char compressor[8]; /* Type of compressor, 4 bytes + padding for 0 byte */
char compressor2[8]; /* Type of compressor, 4 bytes + padding for 0 byte */
long video_strn; /* Video stream number */
long video_frames; /* Number of video frames */
char video_tag[4]; /* Tag of video data */
long video_pos; /* Number of next frame to be read
(if index present) */
unsigned long max_len; /* maximum video chunk present */
track_t track[AVI_MAX_TRACKS]; // up to AVI_MAX_TRACKS audio tracks supported
unsigned long pos; /* position in file */
long n_idx; /* number of index entries actually filled */
long max_idx; /* number of index entries actually allocated */
long v_codech_off; /* absolute offset of video codec (strh) info */
long v_codecf_off; /* absolute offset of video codec (strf) info */
unsigned char (*idx)[16]; /* index entries (AVI idx1 tag) */
video_index_entry *video_index; /* per-frame video index entries */
unsigned long last_pos; /* Position of last frame written */
unsigned long last_len; /* Length of last frame written */
int must_use_index; /* Flag if frames are duplicated */
unsigned long movi_start; /* NOTE(review): presumably file offset of the movi list -- confirm */
int anum; // total number of audio tracks
int aptr; // current audio working track
} avi_t;
/* File access modes. */
#define AVI_MODE_WRITE 0
#define AVI_MODE_READ 1
/* The error codes delivered by avi_open_input_file */
#define AVI_ERR_SIZELIM \
1 /* The write of the data would exceed \
the maximum size of the AVI file. \
This is more a warning than an \
error since the file may be closed safely */
#define AVI_ERR_OPEN \
2 /* Error opening the AVI file - wrong path \
name or file not readable/writable \
*/
#define AVI_ERR_READ 3 /* Error reading from AVI File */
#define AVI_ERR_WRITE \
4 /* Error writing to AVI File, \
disk full ??? */
#define AVI_ERR_WRITE_INDEX \
5 /* Could not write index to AVI file \
during close, file may still be \
usable */
#define AVI_ERR_CLOSE \
6 /* Could not write header to AVI file \
or not truncate the file during \
close, file is most probably corrupted */
#define AVI_ERR_NOT_PERM \
7 /* Operation not permitted: \
trying to read from a file open \
for writing or vice versa */
#define AVI_ERR_NO_MEM 8 /* malloc failed */
#define AVI_ERR_NO_AVI 9 /* Not an AVI file */
#define AVI_ERR_NO_HDRL \
10 /* AVI file has no header list, \
corrupted ??? */
#define AVI_ERR_NO_MOVI \
11 /* AVI file has no MOVI list, \
corrupted ??? */
#define AVI_ERR_NO_VIDS 12 /* AVI file contains no video data */
#define AVI_ERR_NO_IDX \
13 /* The file has been opened with \
getIndex==0, but an operation has \
been performed that needs an index */
/* Possible Audio formats */
/* Only defined here when WAVE_FORMAT_PCM is not already defined. */
#ifndef WAVE_FORMAT_PCM
#define WAVE_FORMAT_UNKNOWN (0x0000)
#define WAVE_FORMAT_PCM (0x0001)
#define WAVE_FORMAT_ADPCM (0x0002)
#define WAVE_FORMAT_IBM_CVSD (0x0005)
#define WAVE_FORMAT_ALAW (0x0006)
#define WAVE_FORMAT_MULAW (0x0007)
#define WAVE_FORMAT_OKI_ADPCM (0x0010)
#define WAVE_FORMAT_DVI_ADPCM (0x0011)
#define WAVE_FORMAT_DIGISTD (0x0015)
#define WAVE_FORMAT_DIGIFIX (0x0016)
#define WAVE_FORMAT_YAMAHA_ADPCM (0x0020)
#define WAVE_FORMAT_DSP_TRUESPEECH (0x0022)
#define WAVE_FORMAT_GSM610 (0x0031)
#define IBM_FORMAT_MULAW (0x0101)
#define IBM_FORMAT_ALAW (0x0102)
#define IBM_FORMAT_ADPCM (0x0103)
#endif
avi_t *AVI_open_output_file(char *filename);
void AVI_set_video(avi_t *AVI, int width, int height, double fps,
char *compressor);
void AVI_set_audio(avi_t *AVI, int channels, long rate, int bits, int format,
long mp3rate);
int AVI_write_frame(avi_t *AVI, char *data, long bytes, int keyframe);
int AVI_dup_frame(avi_t *AVI);
int AVI_write_audio(avi_t *AVI, char *data, long bytes);
int AVI_append_audio(avi_t *AVI, char *data, long bytes);
long AVI_bytes_remain(avi_t *AVI);
int AVI_close(avi_t *AVI);
long AVI_bytes_written(avi_t *AVI);
avi_t *AVI_open_input_file(char *filename, int getIndex);
avi_t *AVI_open_fd(int fd, int getIndex);
int avi_parse_input_file(avi_t *AVI, int getIndex);
long AVI_audio_mp3rate(avi_t *AVI);
long AVI_video_frames(avi_t *AVI);
int AVI_video_width(avi_t *AVI);
int AVI_video_height(avi_t *AVI);
double AVI_frame_rate(avi_t *AVI);
char *AVI_video_compressor(avi_t *AVI);
int AVI_audio_channels(avi_t *AVI);
int AVI_audio_bits(avi_t *AVI);
int AVI_audio_format(avi_t *AVI);
long AVI_audio_rate(avi_t *AVI);
long AVI_audio_bytes(avi_t *AVI);
long AVI_audio_chunks(avi_t *AVI);
long AVI_max_video_chunk(avi_t *AVI);
long AVI_frame_size(avi_t *AVI, long frame);
long AVI_audio_size(avi_t *AVI, long frame);
int AVI_seek_start(avi_t *AVI);
int AVI_set_video_position(avi_t *AVI, long frame);
long AVI_get_video_position(avi_t *AVI, long frame);
long AVI_read_frame(avi_t *AVI, char *vidbuf, int *keyframe);
int AVI_set_audio_position(avi_t *AVI, long byte);
int AVI_set_audio_bitrate(avi_t *AVI, long bitrate);
long AVI_read_audio(avi_t *AVI, char *audbuf, long bytes);
long AVI_audio_codech_offset(avi_t *AVI);
long AVI_audio_codecf_offset(avi_t *AVI);
long AVI_video_codech_offset(avi_t *AVI);
long AVI_video_codecf_offset(avi_t *AVI);
int AVI_read_data(avi_t *AVI, char *vidbuf, long max_vidbuf, char *audbuf,
long max_audbuf, long *len);
void AVI_print_error(char *str);
char *AVI_strerror();
char *AVI_syserror();
int AVI_scan(char *name);
int AVI_dump(char *name, int mode);
char *AVI_codec2str(short cc);
int AVI_file_check(char *import_file);
void AVI_info(avi_t *avifile);
uint64_t AVI_max_size();
int avi_update_header(avi_t *AVI);
int AVI_set_audio_track(avi_t *AVI, int track);
int AVI_get_audio_track(avi_t *AVI);
int AVI_audio_tracks(avi_t *AVI);
/* RIFF header part of a WAVE header (see struct wave_header).
   NOTE(review): len is an unsigned long, which is 8 bytes on LP64 targets;
   confirm the code does not rely on this struct matching a 32-bit on-disk
   length field. */
struct riff_struct {
unsigned char id[4]; /* RIFF */
unsigned long len;
unsigned char wave_id[4]; /* WAVE */
};
/* Generic chunk header: a four-byte id followed by a length. Used for the
   format and data chunks of struct wave_header. */
struct chunk_struct {
unsigned char id[4];
unsigned long len;
};
/* Common audio format fields of a WAVE header (see struct wave_header). */
struct common_struct {
unsigned short wFormatTag;
unsigned short wChannels;
unsigned long dwSamplesPerSec;
unsigned long dwAvgBytesPerSec;
unsigned short wBlockAlign;
unsigned short wBitsPerSample; /* Only for PCM */
};
/* Complete WAVE header: RIFF header, format chunk header, the common format
   fields, and the data chunk header, in that order. */
struct wave_header {
struct riff_struct riff;
struct chunk_struct format;
struct common_struct common;
struct chunk_struct data;
};
/* Fields of an AVI stream header record.
   NOTE(review): every field is a long, which is 8 bytes on LP64 targets;
   confirm the code does not read or write this struct directly from a file
   that uses 32-bit fields. */
struct AVIStreamHeader {
long fccType;
long fccHandler;
long dwFlags;
long dwPriority;
long dwInitialFrames;
long dwScale;
long dwRate;
long dwStart;
long dwLength;
long dwSuggestedBufferSize;
long dwQuality;
long dwSampleSize;
};
#endif
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,130 @@
// #ifdef __cplusplus
// extern "C" {
// #endif
//===============================================================================================================================================================================================================
// DEFINE / INCLUDE
//===============================================================================================================================================================================================================
#include "avimod.h"
//===============================================================================================================================================================================================================
// FUNCTIONS
//===============================================================================================================================================================================================================
// Flips the specified image vertically and crops it to the specified
// dimensions.
//
// image     : height x width bytes, row-major
// cropped   : 1 selects the hard-coded crop rectangle (currently the single
//             pixel at top = bottom = left = right = 0); any other value
//             keeps the whole frame
// scaled    : non-zero scales all values to the range [0.0, 1.0]; otherwise
//             values stay in [0, 255]
// converted : 1 returns the result in column-major order; any other value
//             returns it in row-major order
//
// Pixels are interpreted as unsigned bytes. The previous code patched
// negative (signed char) values by adding 256 *after* scaling, which pushed
// scaled values outside [0.0, 1.0].
//
// Returns a newly malloc'ed buffer that the caller must free(), or NULL if
// the arguments are invalid or memory runs out. The previous code leaked one
// buffer and returned an already freed one whenever converted != 1.
fp *chop_flip_image(char *image, int height, int width, int cropped, int scaled,
                    int converted) {
  if (image == NULL || height <= 0 || width <= 0)
    return NULL;
  // fixed dimensions for cropping or not cropping, square vertices starting
  // from initial point in top left corner going down and right
  int top;
  int bottom;
  int left;
  int right;
  if (cropped == 1) {
    top = 0;
    bottom = 0;
    left = 0;
    right = 0;
  } else {
    top = 0;
    bottom = height - 1;
    left = 0;
    right = width - 1;
  }
  // dimensions of new cropped image
  int height_new = bottom - top + 1;
  int width_new = right - left + 1;
  size_t pixel_count = (size_t)height_new * (size_t)width_new;
  // counters
  int i, j;
  // allocate memory for cropped/flipped frame
  fp *result = (fp *)malloc(pixel_count * sizeof(fp));
  if (result == NULL)
    return NULL;
  // crop/flip and (optionally) scale frame
  fp scale = 1.0 / 255.0;
  for (i = 0; i < height_new; i++) {  // rows
    for (j = 0; j < width_new; j++) { // columns
      // source row is mirrored vertically; read the byte as unsigned
      fp pixel = (fp)(unsigned char)
          image[((height - 1 - (i + top)) * width) + (j + left)];
      if (scaled) {
        pixel = pixel * scale;
      }
      result[i * width_new + j] = pixel;
    }
  }
  // row-major result requested: hand the buffer to the caller as is
  if (converted != 1) {
    return result;
  }
  // convert storage method (from row-major to column-major)
  fp *result_converted = (fp *)malloc(pixel_count * sizeof(fp));
  if (result_converted == NULL) {
    free(result);
    return NULL;
  }
  for (i = 0; i < width_new; i++) {    // columns of the source
    for (j = 0; j < height_new; j++) { // rows of the source
      result_converted[i * height_new + j] = result[j * width_new + i];
    }
  }
  free(result);
  // return
  return result_converted;
}
// Reads frame number frame_num from the given AVI file and returns it as a
// newly allocated array produced by chop_flip_image().
// cropped, scaled and converted are passed through to chop_flip_image()
// unchanged. If AVI_read_frame() reports -1 the error is printed and the
// program exits with status -1.
// NOTE(review): the original comment states the test video has 600 frames,
// so frame_num = 600 causes an error -- not verifiable from this file.
fp *get_frame(avi_t *cell_file, int frame_num, int cropped, int scaled,
              int converted) {
  const int frame_width = AVI_video_width(cell_file);
  const int frame_height = AVI_video_height(cell_file);
  // position the reader on the requested frame
  AVI_set_video_position(cell_file, frame_num);
  // raw frame bytes as stored in the AVI
  char *raw_frame = (char *)malloc(frame_width * frame_height * sizeof(char));
  int keyframe_flag; // filled by AVI_read_frame, not used here
  const int read_status = AVI_read_frame(cell_file, raw_frame, &keyframe_flag);
  if (read_status == -1) {
    AVI_print_error((char *)"Error with AVI_read_frame");
    exit(-1);
  }
  // the frame is stored upside-down, so flip it (and crop/scale/convert)
  fp *processed = chop_flip_image(raw_frame, frame_height, frame_width, cropped,
                                  scaled, converted);
  free(raw_frame);
  return processed;
}
// #ifdef __cplusplus
// }
// #endif

View File

@ -0,0 +1,24 @@
#ifdef __cplusplus
extern "C" {
#endif
//===============================================================================================================================================================================================================
// DEFINE / INCLUDE
//===============================================================================================================================================================================================================
#define fp float
#include "avilib.h"
//===============================================================================================================================================================================================================
// FUNCTION PROTOTYPES
//===============================================================================================================================================================================================================
fp *chop_flip_image(char *image, int height, int width, int cropped, int scaled,
int converted);
fp *get_frame(avi_t *cell_file, int frame_num, int cropped, int scaled,
int converted);
#ifdef __cplusplus
}
#endif

396
examples/heartwall/define.c Normal file
View File

@ -0,0 +1,396 @@
//===============================================================================================================================================================================================================
//===============================================================================================================================================================================================================
// DEFINE / INCLUDE
//===============================================================================================================================================================================================================
//===============================================================================================================================================================================================================
// Floating-point type used throughout the code.
#define fp float
/* #define NUMBER_THREADS 512 */
// NUMBER_THREADS: the most specific RD_WG_SIZE_* override that is defined
// wins; with no override it defaults to 256.
// NOTE(review): presumably the number of threads per block -- confirm at the
// kernel launch site.
#ifdef RD_WG_SIZE_0_0
#define NUMBER_THREADS RD_WG_SIZE_0_0
#elif defined(RD_WG_SIZE_0)
#define NUMBER_THREADS RD_WG_SIZE_0
#elif defined(RD_WG_SIZE)
#define NUMBER_THREADS RD_WG_SIZE
#else
#define NUMBER_THREADS 256
#endif
// Point counts; ALL_POINTS equals ENDO_POINTS + EPI_POINTS (20 + 31).
#define ENDO_POINTS 20
#define EPI_POINTS 31
#define ALL_POINTS 51
//===============================================================================================================================================================================================================
//===============================================================================================================================================================================================================
// PARAMS_COMMON_CHANGE STRUCT
//===============================================================================================================================================================================================================
//===============================================================================================================================================================================================================
// Parameters that change from frame to frame: a pointer to the current frame
// data and the number of that frame.
// NOTE(review): the d_ prefix suggests d_frame points to device memory --
// confirm where it is allocated.
typedef struct params_common_change {
//======================================================================================================================================================
// FRAME
//======================================================================================================================================================
fp *d_frame;
int frame_no;
} params_common_change;
//===============================================================================================================================================================================================================
//===============================================================================================================================================================================================================
// PARAMS_COMMON STRUCTURE
//===============================================================================================================================================================================================================
//===============================================================================================================================================================================================================
// Parameters shared by all tracked points: tuning constants, video frame
// geometry, the endo/epi point tables, and the dimensions of every
// intermediate array. Throughout, the suffixes follow one pattern as set up in
// main(): *_rows / *_cols are dimensions, *_elem = rows * cols, and *_mem is
// the size in bytes (sizeof(element) * elem).
typedef struct params_common {
//======================================================================================================================================================
// HARDCODED INPUTS FROM MATLAB
//======================================================================================================================================================
//====================================================================================================
// CONSTANTS
//====================================================================================================
// Set to 40 in main(); the area taken around a point is (2 * sSize + 1) square.
int sSize;
// Set to 25 in main(); a template is (tSize + 1 + tSize) square.
int tSize;
// Set to 10 in main(). NOTE(review): meaning not visible here; presumably the
// maximum point displacement between frames -- confirm against kernel.cu.
int maxMove;
// Set to 0.87 in main(). NOTE(review): usage not visible here -- confirm.
fp alpha;
//====================================================================================================
// FRAME
//====================================================================================================
// Number of frames in the video (AVI_video_frames).
int no_frames;
// Frame height (AVI_video_height).
int frame_rows;
// Frame width (AVI_video_width).
int frame_cols;
// frame_rows * frame_cols.
int frame_elem;
// sizeof(fp) * frame_elem.
int frame_mem;
//====================================================================================================
// ENDO POINTS
//====================================================================================================
// Number of endo points (ENDO_POINTS).
int endoPoints;
// sizeof(int) * endoPoints.
int endo_mem;
// Host arrays (malloc, endo_mem bytes) with the hard-coded initial row/column
// of each endo point.
int *endoRow;
int *endoCol;
// Host arrays (malloc, endo_mem * no_frames bytes) for per-frame point
// locations.
int *tEndoRowLoc;
int *tEndoColLoc;
// cudaMalloc'ed counterparts of the four host arrays above; d_endoRow and
// d_endoCol are filled from the host arrays with cudaMemcpy in main().
int *d_endoRow;
int *d_endoCol;
int *d_tEndoRowLoc;
int *d_tEndoColLoc;
// cudaMalloc'ed template storage, in_mem * endoPoints bytes.
fp *d_endoT;
//====================================================================================================
// EPI POINTS
//====================================================================================================
// Number of epi points (EPI_POINTS).
int epiPoints;
// sizeof(int) * epiPoints.
int epi_mem;
// Host arrays (malloc, epi_mem bytes) with the hard-coded initial row/column
// of each epi point.
int *epiRow;
int *epiCol;
// Host arrays (malloc, epi_mem * no_frames bytes) for per-frame point
// locations.
int *tEpiRowLoc;
int *tEpiColLoc;
// cudaMalloc'ed counterparts of the four host arrays above; d_epiRow and
// d_epiCol are filled from the host arrays with cudaMemcpy in main().
int *d_epiRow;
int *d_epiCol;
int *d_tEpiRowLoc;
int *d_tEpiColLoc;
// cudaMalloc'ed template storage, in_mem * epiPoints bytes.
fp *d_epiT;
//====================================================================================================
// ALL POINTS
//====================================================================================================
// Total number of points (ALL_POINTS); main() uses indices
// [0, endoPoints) for endo and [endoPoints, allPoints) for epi.
int allPoints;
//======================================================================================================================================================
// RIGHT TEMPLATE FROM TEMPLATE ARRAY
//======================================================================================================================================================
// Template dimensions: in_rows = in_cols = tSize + 1 + tSize.
int in_rows;
int in_cols;
int in_elem;
int in_mem;
//======================================================================================================================================================
// AREA AROUND POINT FROM FRAME
//======================================================================================================================================================
// Dimensions of the area taken around a point: 2 * sSize + 1 in each direction.
int in2_rows;
int in2_cols;
int in2_elem;
int in2_mem;
//======================================================================================================================================================
// CONVOLUTION
//======================================================================================================================================================
// Convolution output dimensions: in_rows + in2_rows - 1 by
// in_cols + in2_cols - 1.
int conv_rows;
int conv_cols;
int conv_elem;
int conv_mem;
// Both offsets are initialised to 0 in main().
int ioffset;
int joffset;
// NOTE(review): every field from here to the end of the structure is
// initialised past the portion of main() visible in this part of the file;
// the section titles below are the original author's, and the fields
// presumably follow the same rows/cols/elem/mem convention -- confirm there.
//======================================================================================================================================================
// CUMULATIVE SUM 1
//======================================================================================================================================================
//====================================================================================================
// PAD ARRAY, VERTICAL CUMULATIVE SUM
//====================================================================================================
int in2_pad_add_rows;
int in2_pad_add_cols;
int in2_pad_cumv_rows;
int in2_pad_cumv_cols;
int in2_pad_cumv_elem;
int in2_pad_cumv_mem;
//====================================================================================================
// SELECTION
//====================================================================================================
int in2_pad_cumv_sel_rows;
int in2_pad_cumv_sel_cols;
int in2_pad_cumv_sel_elem;
int in2_pad_cumv_sel_mem;
// Presumably the low/high row and column bounds of the selected window.
int in2_pad_cumv_sel_rowlow;
int in2_pad_cumv_sel_rowhig;
int in2_pad_cumv_sel_collow;
int in2_pad_cumv_sel_colhig;
//====================================================================================================
// SELECTION 2, SUBTRACTION, HORIZONTAL CUMULATIVE SUM
//====================================================================================================
int in2_pad_cumv_sel2_rowlow;
int in2_pad_cumv_sel2_rowhig;
int in2_pad_cumv_sel2_collow;
int in2_pad_cumv_sel2_colhig;
int in2_sub_cumh_rows;
int in2_sub_cumh_cols;
int in2_sub_cumh_elem;
int in2_sub_cumh_mem;
//====================================================================================================
// SELECTION
//====================================================================================================
int in2_sub_cumh_sel_rows;
int in2_sub_cumh_sel_cols;
int in2_sub_cumh_sel_elem;
int in2_sub_cumh_sel_mem;
int in2_sub_cumh_sel_rowlow;
int in2_sub_cumh_sel_rowhig;
int in2_sub_cumh_sel_collow;
int in2_sub_cumh_sel_colhig;
//====================================================================================================
// SELECTION 2, SUBTRACTION
//====================================================================================================
int in2_sub_cumh_sel2_rowlow;
int in2_sub_cumh_sel2_rowhig;
int in2_sub_cumh_sel2_collow;
int in2_sub_cumh_sel2_colhig;
int in2_sub2_rows;
int in2_sub2_cols;
int in2_sub2_elem;
int in2_sub2_mem;
//======================================================================================================================================================
// CUMULATIVE SUM 2
//======================================================================================================================================================
//====================================================================================================
// MULTIPLICATION
//====================================================================================================
int in2_sqr_rows;
int in2_sqr_cols;
int in2_sqr_elem;
int in2_sqr_mem;
//====================================================================================================
// SELECTION 2, SUBTRACTION
//====================================================================================================
int in2_sqr_sub2_rows;
int in2_sqr_sub2_cols;
int in2_sqr_sub2_elem;
int in2_sqr_sub2_mem;
//======================================================================================================================================================
// FINAL
//======================================================================================================================================================
int in_sqr_rows;
int in_sqr_cols;
int in_sqr_elem;
int in_sqr_mem;
//======================================================================================================================================================
// TEMPLATE MASK CREATE
//======================================================================================================================================================
int tMask_rows;
int tMask_cols;
int tMask_elem;
int tMask_mem;
//======================================================================================================================================================
// POINT MASK INITIALIZE
//======================================================================================================================================================
int mask_rows;
int mask_cols;
int mask_elem;
int mask_mem;
//======================================================================================================================================================
// MASK CONVOLUTION
//======================================================================================================================================================
int mask_conv_rows;
int mask_conv_cols;
int mask_conv_elem;
int mask_conv_mem;
int mask_conv_ioffset;
int mask_conv_joffset;
} params_common;
//===============================================================================================================================================================================================================
//===============================================================================================================================================================================================================
// PARAMS_UNIQUE STRUCTURE
//===============================================================================================================================================================================================================
//===============================================================================================================================================================================================================
// Per-point parameters: one instance exists for every tracked point (main()
// keeps them in an array of ALL_POINTS entries). It holds the point's links
// into the shared endo/epi tables plus its own working buffers.
typedef struct params_unique {
//======================================================================================================================================================
// POINT NUMBER
//======================================================================================================================================================
// Aliases of the group-wide arrays in params_common: main() points these at
// the d_endo* arrays for endo points and at the d_epi* arrays for epi points.
int *d_Row;
int *d_Col;
int *d_tRowLoc;
int *d_tColLoc;
fp *d_T;
//======================================================================================================================================================
// POINT NUMBER
//======================================================================================================================================================
// Index of the point within its own group: i for endo points,
// i - endoPoints for epi points.
int point_no;
//======================================================================================================================================================
// RIGHT TEMPLATE FROM TEMPLATE ARRAY
//======================================================================================================================================================
// Element offset of this point's template: point_no * common.in_elem.
// NOTE(review): presumably an offset into d_T -- confirm in kernel.cu.
int in_pointer;
//======================================================================================================================================================
// AREA AROUND POINT FROM FRAME
//======================================================================================================================================================
// cudaMalloc'ed in main(), common.in2_mem bytes.
fp *d_in2;
//======================================================================================================================================================
// CONVOLUTION
//======================================================================================================================================================
// cudaMalloc'ed in main(), common.conv_mem bytes.
fp *d_conv;
// NOTE(review): this and all following buffers are allocated past the portion
// of main() visible in this part of the file; the section titles are the
// original author's. Sizes presumably come from the matching *_mem fields of
// params_common -- confirm there.
fp *d_in_mod;
//======================================================================================================================================================
// CUMULATIVE SUM
//======================================================================================================================================================
//====================================================================================================
// PAD ARRAY, VERTICAL CUMULATIVE SUM
//====================================================================================================
fp *d_in2_pad_cumv;
//====================================================================================================
// SELECTION
//====================================================================================================
fp *d_in2_pad_cumv_sel;
//====================================================================================================
// SELECTION 2, SUBTRACTION, HORIZONTAL CUMULATIVE SUM
//====================================================================================================
fp *d_in2_sub_cumh;
//====================================================================================================
// SELECTION
//====================================================================================================
fp *d_in2_sub_cumh_sel;
//====================================================================================================
// SELECTION 2, SUBTRACTION
//====================================================================================================
fp *d_in2_sub2;
//======================================================================================================================================================
// CUMULATIVE SUM 2
//======================================================================================================================================================
//====================================================================================================
// MULTIPLICATION
//====================================================================================================
fp *d_in2_sqr;
//====================================================================================================
// SELECTION 2, SUBTRACTION
//====================================================================================================
fp *d_in2_sqr_sub2;
//======================================================================================================================================================
// FINAL
//======================================================================================================================================================
fp *d_in_sqr;
//======================================================================================================================================================
// TEMPLATE MASK
//======================================================================================================================================================
fp *d_tMask;
//======================================================================================================================================================
// POINT MASK INITIALIZE
//======================================================================================================================================================
fp *d_mask;
//======================================================================================================================================================
// MASK CONVOLUTION
//======================================================================================================================================================
fp *d_mask_conv;
} params_unique;
//===============================================================================================================================================================================================================
//===============================================================================================================================================================================================================
// END OF STRUCTURE
//===============================================================================================================================================================================================================
//===============================================================================================================================================================================================================

1239
examples/heartwall/kernel.cu Executable file

File diff suppressed because it is too large Load Diff

795
examples/heartwall/main.cu Normal file
View File

@ -0,0 +1,795 @@
//===============================================================================================================================================================================================================
//===============================================================================================================================================================================================================
// DEFINE / INCLUDE
//===============================================================================================================================================================================================================
//===============================================================================================================================================================================================================
//======================================================================================================================================================
// LIBRARIES
//======================================================================================================================================================
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include <avilib.h>
#include <avimod.h>
#include <cuda.h>
//======================================================================================================================================================
// STRUCTURES, GLOBAL STRUCTURE VARIABLES
//======================================================================================================================================================
#include "define.c"
// Each parameter structure exists twice: a host-side instance that main()
// fills in, and a __constant__ instance with a d_ prefix.
// NOTE(review): the code that copies the host instances into the __constant__
// ones is not in this part of the file -- confirm where that happens. These
// declarations precede the inclusion of kernel.cu, which presumably refers to
// the d_ variables by name, so keep them above that include.
params_common_change common_change;
__constant__ params_common_change d_common_change;
params_common common;
__constant__ params_common d_common;
params_unique unique[ALL_POINTS]; // size cannot be determined dynamically, so
// the arrays are fixed at ALL_POINTS entries (main() uses one per point)
__constant__ params_unique d_unique[ALL_POINTS];
//======================================================================================================================================================
// KERNEL CODE
//======================================================================================================================================================
#include "kernel.cu"
// WRITE DATA FUNCTION
//===============================================================================================================================================================================================================200
// Writes values[frame + p * stride] for p in [0, count) to fid, each value
// followed by a tab. No newline is emitted; the caller controls line breaks.
static void write_point_row(FILE *fid, const int *values, int count, int frame,
                            int stride) {
  for (int p = 0; p < count; p++) {
    fprintf(fid, "%d\t", values[frame + p * stride]);
  }
}

/**
 * Dumps per-frame point values for two point groups to a text file.
 *
 * The file starts with a four-line header (total frames, processed frames,
 * endo count, epi count). For every processed frame it then contains an
 * "--endo--" section with one line from input_a and one from input_b, followed
 * by an "--epi--" section with one line from input_2a and one from input_2b.
 * Values on a line are tab-terminated.
 *
 * All four input arrays are indexed as [frame + point * frameNo], so each must
 * hold at least (points * frameNo) ints for its group.
 *
 * @param filename         Path of the file to create or truncate.
 * @param frameNo          Total number of frames; also the per-point stride of
 *                         the input arrays.
 * @param frames_processed Number of leading frames to write.
 * @param endoPoints       Number of points in input_a / input_b.
 * @param input_a          First value array of the endo group.
 * @param input_b          Second value array of the endo group.
 * @param epiPoints        Number of points in input_2a / input_2b.
 * @param input_2a         First value array of the epi group.
 * @param input_2b         Second value array of the epi group.
 *
 * If the file cannot be opened a message is printed to stdout and nothing is
 * written.
 */
void write_data(char *filename, int frameNo, int frames_processed,
                int endoPoints, int *input_a, int *input_b, int epiPoints,
                int *input_2a, int *input_2b) {
  // Open (create or truncate) the output file for writing.
  FILE *fid = fopen(filename, "w+");
  if (fid == NULL) {
    printf("The file was not opened for writing\n");
    return;
  }

  // Header. The last line deliberately has no trailing newline: every frame
  // section below begins with one.
  fprintf(fid, "Total AVI Frames: %d\n", frameNo);
  fprintf(fid, "Frames Processed: %d\n", frames_processed);
  fprintf(fid, "endoPoints: %d\n", endoPoints);
  fprintf(fid, "epiPoints: %d", epiPoints);

  for (int frame = 0; frame < frames_processed; frame++) {
    fprintf(fid, "\n---Frame %d---", frame);

    fprintf(fid, "\n--endo--\n");
    write_point_row(fid, input_a, endoPoints, frame, frameNo);
    fprintf(fid, "\n");
    write_point_row(fid, input_b, endoPoints, frame, frameNo);

    fprintf(fid, "\n--epi--\n");
    write_point_row(fid, input_2a, epiPoints, frame, frameNo);
    fprintf(fid, "\n");
    write_point_row(fid, input_2b, epiPoints, frame, frameNo);
  }

  fclose(fid);
}
//===============================================================================================================================================================================================================
//===============================================================================================================================================================================================================
// MAIN FUNCTION
//===============================================================================================================================================================================================================
//===============================================================================================================================================================================================================
int main(int argc, char *argv[]) {
cudaSetDevice(0);
printf("WG size of kernel = %d \n", NUMBER_THREADS);
//======================================================================================================================================================
// VARIABLES
//======================================================================================================================================================
// CUDA kernel execution parameters
dim3 threads;
dim3 blocks;
// counter
int i;
int frames_processed;
// frames
char *video_file_name;
avi_t *frames;
fp *frame;
//======================================================================================================================================================
// FRAME
//======================================================================================================================================================
if (argc != 3) {
printf("ERROR: usage: heartwall <inputfile> <num of frames>\n");
exit(1);
}
// open movie file
video_file_name = argv[1];
frames = (avi_t *)AVI_open_input_file(video_file_name, 1); // added casting
if (frames == NULL) {
AVI_print_error((char *)"Error with AVI_open_input_file");
return -1;
}
// common
common.no_frames = AVI_video_frames(frames);
common.frame_rows = AVI_video_height(frames);
common.frame_cols = AVI_video_width(frames);
common.frame_elem = common.frame_rows * common.frame_cols;
common.frame_mem = sizeof(fp) * common.frame_elem;
// pointers
cudaMalloc((void **)&common_change.d_frame, common.frame_mem);
//======================================================================================================================================================
// CHECK INPUT ARGUMENTS
//======================================================================================================================================================
frames_processed = atoi(argv[2]);
if (frames_processed < 0 || frames_processed > common.no_frames) {
printf("ERROR: %d is an incorrect number of frames specified, select in "
"the range of 0-%d\n",
frames_processed, common.no_frames);
return 0;
}
//======================================================================================================================================================
// HARDCODED INPUTS FROM MATLAB
//======================================================================================================================================================
//====================================================================================================
// CONSTANTS
//====================================================================================================
common.sSize = 40;
common.tSize = 25;
common.maxMove = 10;
common.alpha = 0.87;
//====================================================================================================
// ENDO POINTS
//====================================================================================================
common.endoPoints = ENDO_POINTS;
common.endo_mem = sizeof(int) * common.endoPoints;
common.endoRow = (int *)malloc(common.endo_mem);
common.endoRow[0] = 369;
common.endoRow[1] = 400;
common.endoRow[2] = 429;
common.endoRow[3] = 452;
common.endoRow[4] = 476;
common.endoRow[5] = 486;
common.endoRow[6] = 479;
common.endoRow[7] = 458;
common.endoRow[8] = 433;
common.endoRow[9] = 404;
common.endoRow[10] = 374;
common.endoRow[11] = 346;
common.endoRow[12] = 318;
common.endoRow[13] = 294;
common.endoRow[14] = 277;
common.endoRow[15] = 269;
common.endoRow[16] = 275;
common.endoRow[17] = 287;
common.endoRow[18] = 311;
common.endoRow[19] = 339;
cudaMalloc((void **)&common.d_endoRow, common.endo_mem);
cudaMemcpy(common.d_endoRow, common.endoRow, common.endo_mem,
cudaMemcpyHostToDevice);
common.endoCol = (int *)malloc(common.endo_mem);
common.endoCol[0] = 408;
common.endoCol[1] = 406;
common.endoCol[2] = 397;
common.endoCol[3] = 383;
common.endoCol[4] = 354;
common.endoCol[5] = 322;
common.endoCol[6] = 294;
common.endoCol[7] = 270;
common.endoCol[8] = 250;
common.endoCol[9] = 237;
common.endoCol[10] = 235;
common.endoCol[11] = 241;
common.endoCol[12] = 254;
common.endoCol[13] = 273;
common.endoCol[14] = 300;
common.endoCol[15] = 328;
common.endoCol[16] = 356;
common.endoCol[17] = 383;
common.endoCol[18] = 401;
common.endoCol[19] = 411;
cudaMalloc((void **)&common.d_endoCol, common.endo_mem);
cudaMemcpy(common.d_endoCol, common.endoCol, common.endo_mem,
cudaMemcpyHostToDevice);
common.tEndoRowLoc = (int *)malloc(common.endo_mem * common.no_frames);
cudaMalloc((void **)&common.d_tEndoRowLoc,
common.endo_mem * common.no_frames);
common.tEndoColLoc = (int *)malloc(common.endo_mem * common.no_frames);
cudaMalloc((void **)&common.d_tEndoColLoc,
common.endo_mem * common.no_frames);
//====================================================================================================
// EPI POINTS
//====================================================================================================
common.epiPoints = EPI_POINTS;
common.epi_mem = sizeof(int) * common.epiPoints;
common.epiRow = (int *)malloc(common.epi_mem);
common.epiRow[0] = 390;
common.epiRow[1] = 419;
common.epiRow[2] = 448;
common.epiRow[3] = 474;
common.epiRow[4] = 501;
common.epiRow[5] = 519;
common.epiRow[6] = 535;
common.epiRow[7] = 542;
common.epiRow[8] = 543;
common.epiRow[9] = 538;
common.epiRow[10] = 528;
common.epiRow[11] = 511;
common.epiRow[12] = 491;
common.epiRow[13] = 466;
common.epiRow[14] = 438;
common.epiRow[15] = 406;
common.epiRow[16] = 376;
common.epiRow[17] = 347;
common.epiRow[18] = 318;
common.epiRow[19] = 291;
common.epiRow[20] = 275;
common.epiRow[21] = 259;
common.epiRow[22] = 256;
common.epiRow[23] = 252;
common.epiRow[24] = 252;
common.epiRow[25] = 257;
common.epiRow[26] = 266;
common.epiRow[27] = 283;
common.epiRow[28] = 305;
common.epiRow[29] = 331;
common.epiRow[30] = 360;
cudaMalloc((void **)&common.d_epiRow, common.epi_mem);
cudaMemcpy(common.d_epiRow, common.epiRow, common.epi_mem,
cudaMemcpyHostToDevice);
common.epiCol = (int *)malloc(common.epi_mem);
common.epiCol[0] = 457;
common.epiCol[1] = 454;
common.epiCol[2] = 446;
common.epiCol[3] = 431;
common.epiCol[4] = 411;
common.epiCol[5] = 388;
common.epiCol[6] = 361;
common.epiCol[7] = 331;
common.epiCol[8] = 301;
common.epiCol[9] = 273;
common.epiCol[10] = 243;
common.epiCol[11] = 218;
common.epiCol[12] = 196;
common.epiCol[13] = 178;
common.epiCol[14] = 166;
common.epiCol[15] = 157;
common.epiCol[16] = 155;
common.epiCol[17] = 165;
common.epiCol[18] = 177;
common.epiCol[19] = 197;
common.epiCol[20] = 218;
common.epiCol[21] = 248;
common.epiCol[22] = 276;
common.epiCol[23] = 304;
common.epiCol[24] = 333;
common.epiCol[25] = 361;
common.epiCol[26] = 391;
common.epiCol[27] = 415;
common.epiCol[28] = 434;
common.epiCol[29] = 448;
common.epiCol[30] = 455;
cudaMalloc((void **)&common.d_epiCol, common.epi_mem);
cudaMemcpy(common.d_epiCol, common.epiCol, common.epi_mem,
cudaMemcpyHostToDevice);
common.tEpiRowLoc = (int *)malloc(common.epi_mem * common.no_frames);
cudaMalloc((void **)&common.d_tEpiRowLoc, common.epi_mem * common.no_frames);
common.tEpiColLoc = (int *)malloc(common.epi_mem * common.no_frames);
cudaMalloc((void **)&common.d_tEpiColLoc, common.epi_mem * common.no_frames);
//====================================================================================================
// ALL POINTS
//====================================================================================================
common.allPoints = ALL_POINTS;
//======================================================================================================================================================
// TEMPLATE SIZES
//======================================================================================================================================================
// common
common.in_rows = common.tSize + 1 + common.tSize;
common.in_cols = common.in_rows;
common.in_elem = common.in_rows * common.in_cols;
common.in_mem = sizeof(fp) * common.in_elem;
//======================================================================================================================================================
// CREATE ARRAY OF TEMPLATES FOR ALL POINTS
//======================================================================================================================================================
// common
cudaMalloc((void **)&common.d_endoT, common.in_mem * common.endoPoints);
cudaMalloc((void **)&common.d_epiT, common.in_mem * common.epiPoints);
//======================================================================================================================================================
// SPECIFIC TO ENDO OR EPI TO BE SET HERE
//======================================================================================================================================================
for (i = 0; i < common.endoPoints; i++) {
unique[i].point_no = i;
unique[i].d_Row = common.d_endoRow;
unique[i].d_Col = common.d_endoCol;
unique[i].d_tRowLoc = common.d_tEndoRowLoc;
unique[i].d_tColLoc = common.d_tEndoColLoc;
unique[i].d_T = common.d_endoT;
}
for (i = common.endoPoints; i < common.allPoints; i++) {
unique[i].point_no = i - common.endoPoints;
unique[i].d_Row = common.d_epiRow;
unique[i].d_Col = common.d_epiCol;
unique[i].d_tRowLoc = common.d_tEpiRowLoc;
unique[i].d_tColLoc = common.d_tEpiColLoc;
unique[i].d_T = common.d_epiT;
}
//======================================================================================================================================================
// RIGHT TEMPLATE FROM TEMPLATE ARRAY
//======================================================================================================================================================
// pointers
for (i = 0; i < common.allPoints; i++) {
unique[i].in_pointer = unique[i].point_no * common.in_elem;
}
//======================================================================================================================================================
// AREA AROUND POINT FROM FRAME
//======================================================================================================================================================
// common
common.in2_rows = 2 * common.sSize + 1;
common.in2_cols = 2 * common.sSize + 1;
common.in2_elem = common.in2_rows * common.in2_cols;
common.in2_mem = sizeof(float) * common.in2_elem;
// pointers
for (i = 0; i < common.allPoints; i++) {
cudaMalloc((void **)&unique[i].d_in2, common.in2_mem);
}
//======================================================================================================================================================
// CONVOLUTION
//======================================================================================================================================================
// common
common.conv_rows =
common.in_rows + common.in2_rows - 1; // number of rows in I
common.conv_cols =
common.in_cols + common.in2_cols - 1; // number of columns in I
common.conv_elem = common.conv_rows * common.conv_cols; // number of elements
common.conv_mem = sizeof(float) * common.conv_elem;
common.ioffset = 0;
common.joffset = 0;
// pointers
for (i = 0; i < common.allPoints; i++) {
cudaMalloc((void **)&unique[i].d_conv, common.conv_mem);
}
//======================================================================================================================================================
// CUMULATIVE SUM
//======================================================================================================================================================
//====================================================================================================
// PADDING OF ARRAY, VERTICAL CUMULATIVE SUM
//====================================================================================================
// common
common.in2_pad_add_rows = common.in_rows;
common.in2_pad_add_cols = common.in_cols;
common.in2_pad_cumv_rows = common.in2_rows + 2 * common.in2_pad_add_rows;
common.in2_pad_cumv_cols = common.in2_cols + 2 * common.in2_pad_add_cols;
common.in2_pad_cumv_elem =
common.in2_pad_cumv_rows * common.in2_pad_cumv_cols;
common.in2_pad_cumv_mem = sizeof(float) * common.in2_pad_cumv_elem;
// pointers
for (i = 0; i < common.allPoints; i++) {
cudaMalloc((void **)&unique[i].d_in2_pad_cumv, common.in2_pad_cumv_mem);
}
//====================================================================================================
// SELECTION
//====================================================================================================
// common
common.in2_pad_cumv_sel_rowlow = 1 + common.in_rows; // (1 to n+1)
common.in2_pad_cumv_sel_rowhig = common.in2_pad_cumv_rows - 1;
common.in2_pad_cumv_sel_collow = 1;
common.in2_pad_cumv_sel_colhig = common.in2_pad_cumv_cols;
common.in2_pad_cumv_sel_rows =
common.in2_pad_cumv_sel_rowhig - common.in2_pad_cumv_sel_rowlow + 1;
common.in2_pad_cumv_sel_cols =
common.in2_pad_cumv_sel_colhig - common.in2_pad_cumv_sel_collow + 1;
common.in2_pad_cumv_sel_elem =
common.in2_pad_cumv_sel_rows * common.in2_pad_cumv_sel_cols;
common.in2_pad_cumv_sel_mem = sizeof(float) * common.in2_pad_cumv_sel_elem;
// pointers
for (i = 0; i < common.allPoints; i++) {
cudaMalloc((void **)&unique[i].d_in2_pad_cumv_sel,
common.in2_pad_cumv_sel_mem);
}
//====================================================================================================
// SELECTION 2, SUBTRACTION, HORIZONTAL CUMULATIVE SUM
//====================================================================================================
// common
common.in2_pad_cumv_sel2_rowlow = 1;
common.in2_pad_cumv_sel2_rowhig =
common.in2_pad_cumv_rows - common.in_rows - 1;
common.in2_pad_cumv_sel2_collow = 1;
common.in2_pad_cumv_sel2_colhig = common.in2_pad_cumv_cols;
common.in2_sub_cumh_rows =
common.in2_pad_cumv_sel2_rowhig - common.in2_pad_cumv_sel2_rowlow + 1;
common.in2_sub_cumh_cols =
common.in2_pad_cumv_sel2_colhig - common.in2_pad_cumv_sel2_collow + 1;
common.in2_sub_cumh_elem =
common.in2_sub_cumh_rows * common.in2_sub_cumh_cols;
common.in2_sub_cumh_mem = sizeof(float) * common.in2_sub_cumh_elem;
// pointers
for (i = 0; i < common.allPoints; i++) {
cudaMalloc((void **)&unique[i].d_in2_sub_cumh, common.in2_sub_cumh_mem);
}
//====================================================================================================
// SELECTION
//====================================================================================================
// common
common.in2_sub_cumh_sel_rowlow = 1;
common.in2_sub_cumh_sel_rowhig = common.in2_sub_cumh_rows;
common.in2_sub_cumh_sel_collow = 1 + common.in_cols;
common.in2_sub_cumh_sel_colhig = common.in2_sub_cumh_cols - 1;
common.in2_sub_cumh_sel_rows =
common.in2_sub_cumh_sel_rowhig - common.in2_sub_cumh_sel_rowlow + 1;
common.in2_sub_cumh_sel_cols =
common.in2_sub_cumh_sel_colhig - common.in2_sub_cumh_sel_collow + 1;
common.in2_sub_cumh_sel_elem =
common.in2_sub_cumh_sel_rows * common.in2_sub_cumh_sel_cols;
common.in2_sub_cumh_sel_mem = sizeof(float) * common.in2_sub_cumh_sel_elem;
// pointers
for (i = 0; i < common.allPoints; i++) {
cudaMalloc((void **)&unique[i].d_in2_sub_cumh_sel,
common.in2_sub_cumh_sel_mem);
}
//====================================================================================================
// SELECTION 2, SUBTRACTION
//====================================================================================================
// common
common.in2_sub_cumh_sel2_rowlow = 1;
common.in2_sub_cumh_sel2_rowhig = common.in2_sub_cumh_rows;
common.in2_sub_cumh_sel2_collow = 1;
common.in2_sub_cumh_sel2_colhig =
common.in2_sub_cumh_cols - common.in_cols - 1;
common.in2_sub2_rows =
common.in2_sub_cumh_sel2_rowhig - common.in2_sub_cumh_sel2_rowlow + 1;
common.in2_sub2_cols =
common.in2_sub_cumh_sel2_colhig - common.in2_sub_cumh_sel2_collow + 1;
common.in2_sub2_elem = common.in2_sub2_rows * common.in2_sub2_cols;
common.in2_sub2_mem = sizeof(float) * common.in2_sub2_elem;
// pointers
for (i = 0; i < common.allPoints; i++) {
cudaMalloc((void **)&unique[i].d_in2_sub2, common.in2_sub2_mem);
}
//======================================================================================================================================================
// CUMULATIVE SUM 2
//======================================================================================================================================================
//====================================================================================================
// MULTIPLICATION
//====================================================================================================
// common
common.in2_sqr_rows = common.in2_rows;
common.in2_sqr_cols = common.in2_cols;
common.in2_sqr_elem = common.in2_elem;
common.in2_sqr_mem = common.in2_mem;
// pointers
for (i = 0; i < common.allPoints; i++) {
cudaMalloc((void **)&unique[i].d_in2_sqr, common.in2_sqr_mem);
}
//====================================================================================================
// SELECTION 2, SUBTRACTION
//====================================================================================================
// common
common.in2_sqr_sub2_rows = common.in2_sub2_rows;
common.in2_sqr_sub2_cols = common.in2_sub2_cols;
common.in2_sqr_sub2_elem = common.in2_sub2_elem;
common.in2_sqr_sub2_mem = common.in2_sub2_mem;
// pointers
for (i = 0; i < common.allPoints; i++) {
cudaMalloc((void **)&unique[i].d_in2_sqr_sub2, common.in2_sqr_sub2_mem);
}
//======================================================================================================================================================
// FINAL
//======================================================================================================================================================
// common
common.in_sqr_rows = common.in_rows;
common.in_sqr_cols = common.in_cols;
common.in_sqr_elem = common.in_elem;
common.in_sqr_mem = common.in_mem;
// pointers
for (i = 0; i < common.allPoints; i++) {
cudaMalloc((void **)&unique[i].d_in_sqr, common.in_sqr_mem);
}
//======================================================================================================================================================
// TEMPLATE MASK CREATE
//======================================================================================================================================================
// common
common.tMask_rows = common.in_rows + (common.sSize + 1 + common.sSize) - 1;
common.tMask_cols = common.tMask_rows;
common.tMask_elem = common.tMask_rows * common.tMask_cols;
common.tMask_mem = sizeof(float) * common.tMask_elem;
// pointers
for (i = 0; i < common.allPoints; i++) {
cudaMalloc((void **)&unique[i].d_tMask, common.tMask_mem);
}
//======================================================================================================================================================
// POINT MASK INITIALIZE
//======================================================================================================================================================
// common
common.mask_rows = common.maxMove;
common.mask_cols = common.mask_rows;
common.mask_elem = common.mask_rows * common.mask_cols;
common.mask_mem = sizeof(float) * common.mask_elem;
//======================================================================================================================================================
// MASK CONVOLUTION
//======================================================================================================================================================
// common
common.mask_conv_rows = common.tMask_rows; // number of rows in I
common.mask_conv_cols = common.tMask_cols; // number of columns in I
common.mask_conv_elem =
common.mask_conv_rows * common.mask_conv_cols; // number of elements
common.mask_conv_mem = sizeof(float) * common.mask_conv_elem;
common.mask_conv_ioffset = (common.mask_rows - 1) / 2;
if ((common.mask_rows - 1) % 2 > 0.5) {
common.mask_conv_ioffset = common.mask_conv_ioffset + 1;
}
common.mask_conv_joffset = (common.mask_cols - 1) / 2;
if ((common.mask_cols - 1) % 2 > 0.5) {
common.mask_conv_joffset = common.mask_conv_joffset + 1;
}
// pointers
for (i = 0; i < common.allPoints; i++) {
cudaMalloc((void **)&unique[i].d_mask_conv, common.mask_conv_mem);
}
//======================================================================================================================================================
// KERNEL
//======================================================================================================================================================
//====================================================================================================
// THREAD BLOCK
//====================================================================================================
// All kernels operations within kernel use same max size of threads. Size of
// block size is set to the size appropriate for max size operation (on padded
// matrix). Other use subsets of that.
threads.x = NUMBER_THREADS; // define the number of threads in the block
threads.y = 1;
blocks.x = common.allPoints; // define the number of blocks in the grid
blocks.y = 1;
//====================================================================================================
// COPY ARGUMENTS
//====================================================================================================
cudaMemcpyToSymbol(d_common, &common, sizeof(params_common));
cudaMemcpyToSymbol(d_unique, &unique, sizeof(params_unique) * ALL_POINTS);
//====================================================================================================
// PRINT FRAME PROGRESS START
//====================================================================================================
printf("frame progress: ");
fflush(NULL);
//====================================================================================================
// LAUNCH
//====================================================================================================
for (common_change.frame_no = 0; common_change.frame_no < frames_processed;
common_change.frame_no++) {
printf("get frame\n");
// Extract a cropped version of the first frame from the video file
frame = get_frame(
frames, // pointer to video file
common_change.frame_no, // number of frame that needs to be returned
0, // cropped?
0, // scaled?
1); // converted
printf("memcpy\n");
// copy frame to GPU memory
cudaMemcpy(common_change.d_frame, frame, common.frame_mem,
cudaMemcpyHostToDevice);
printf("toSymbol\n");
cudaMemcpyToSymbol(d_common_change, &common_change,
sizeof(params_common_change));
// launch GPU kernel
printf("launch\n");
kernel<<<1, 32>>>();
cudaDeviceSynchronize();
printf("return\n");
// free frame after each loop iteration, since AVI library allocates memory
// for every frame fetched
printf("free\n");
free(frame);
// print frame progress
printf("%d ", common_change.frame_no);
fflush(NULL);
}
//====================================================================================================
// PRINT FRAME PROGRESS END
//====================================================================================================
printf("\n");
fflush(NULL);
//====================================================================================================
// OUTPUT
//====================================================================================================
cudaMemcpy(common.tEndoRowLoc, common.d_tEndoRowLoc,
common.endo_mem * common.no_frames, cudaMemcpyDeviceToHost);
cudaMemcpy(common.tEndoColLoc, common.d_tEndoColLoc,
common.endo_mem * common.no_frames, cudaMemcpyDeviceToHost);
cudaMemcpy(common.tEpiRowLoc, common.d_tEpiRowLoc,
common.epi_mem * common.no_frames, cudaMemcpyDeviceToHost);
cudaMemcpy(common.tEpiColLoc, common.d_tEpiColLoc,
common.epi_mem * common.no_frames, cudaMemcpyDeviceToHost);
#ifdef OUTPUT
//==================================================50
// DUMP DATA TO FILE
//==================================================50
write_data("result.txt", common.no_frames, frames_processed,
common.endoPoints, common.tEndoRowLoc, common.tEndoColLoc,
common.epiPoints, common.tEpiRowLoc, common.tEpiColLoc);
//==================================================50
// End
//==================================================50
#endif
//======================================================================================================================================================
// DEALLOCATION
//======================================================================================================================================================
//====================================================================================================
// COMMON
//====================================================================================================
// frame
cudaFree(common_change.d_frame);
// endo points
free(common.endoRow);
free(common.endoCol);
free(common.tEndoRowLoc);
free(common.tEndoColLoc);
cudaFree(common.d_endoRow);
cudaFree(common.d_endoCol);
cudaFree(common.d_tEndoRowLoc);
cudaFree(common.d_tEndoColLoc);
cudaFree(common.d_endoT);
// epi points
free(common.epiRow);
free(common.epiCol);
free(common.tEpiRowLoc);
free(common.tEpiColLoc);
cudaFree(common.d_epiRow);
cudaFree(common.d_epiCol);
cudaFree(common.d_tEpiRowLoc);
cudaFree(common.d_tEpiColLoc);
cudaFree(common.d_epiT);
//====================================================================================================
// POINTERS
//====================================================================================================
for (i = 0; i < common.allPoints; i++) {
cudaFree(unique[i].d_in2);
cudaFree(unique[i].d_conv);
cudaFree(unique[i].d_in2_pad_cumv);
cudaFree(unique[i].d_in2_pad_cumv_sel);
cudaFree(unique[i].d_in2_sub_cumh);
cudaFree(unique[i].d_in2_sub_cumh_sel);
cudaFree(unique[i].d_in2_sub2);
cudaFree(unique[i].d_in2_sqr);
cudaFree(unique[i].d_in2_sqr_sub2);
cudaFree(unique[i].d_in_sqr);
cudaFree(unique[i].d_tMask);
cudaFree(unique[i].d_mask_conv);
}
}
//===============================================================================================================================================================================================================
//===============================================================================================================================================================================================================
// MAIN FUNCTION
//===============================================================================================================================================================================================================
//===============================================================================================================================================================================================================

17
examples/heartwall/run.sh Normal file
View File

@ -0,0 +1,17 @@
#!/bin/bash
# Build the heartwall example through the CUDA -> x86 translation flow and run it.
#
# Usage: ./run.sh [INPUT_VIDEO [NUM_FRAMES]]   (run from this script's directory)
#
# Optional environment overrides (defaults reproduce the original hard-coded setup):
#   COX_ROOT   root of the repository checkout that contains the build/ directory
#   CUDA_PATH  CUDA toolkit installation handed to clang++
#   GPU_ARCH   value for --cuda-gpu-arch; it is also part of the device bitcode file name
#
# The script stops at the first failing step instead of carrying on with stale or
# missing intermediates.
set -euo pipefail

COX_ROOT="${COX_ROOT:-/home/robinhan/repo/open_source_template}"
CUDA_PATH="${CUDA_PATH:-/usr/local/cuda-10.1}"
GPU_ARCH="${GPU_ARCH:-sm_61}"
INPUT_VIDEO="${1:-${COX_ROOT}/runtime/examples/rodinia-data/heartwall/test.avi}"
NUM_FRAMES="${2:-20}"

# Bitcode files that clang++ leaves behind because of -save-temps.
KERNEL_BC="main-cuda-nvptx64-nvidia-cuda-${GPU_ARCH}.bc"
HOST_BC="main-host-x86_64-unknown-linux-gnu.bc"

# 1. Build the AVI helper objects (avilib.o / avimod.o) linked in step 5.
make -C AVI

# 2. Emit device and host LLVM bitcode. Only the two .bc files are consumed by the
#    following steps, so a failure in clang++'s own later stages is tolerated as long
#    as both files were produced. Old copies are removed first so the check below
#    cannot be satisfied by leftovers from a previous run.
rm -f "${KERNEL_BC}" "${HOST_BC}"
clang++ -DOUTPUT main.cu -I./AVI --cuda-path="${CUDA_PATH}" --cuda-gpu-arch="${GPU_ARCH}" -L"${CUDA_PATH}/lib64" -lcudart_static -ldl -lrt -pthread -save-temps -v || true
for bc in "${KERNEL_BC}" "${HOST_BC}"; do
  if [[ ! -f "${bc}" ]]; then
    echo "error: clang++ did not produce ${bc}" >&2
    exit 1
  fi
done

# 3. Run the kernel and host translators on the bitcode.
"${COX_ROOT}/build/compilation/kernelTranslator" "${KERNEL_BC}" kernel.bc
"${COX_ROOT}/build/compilation/hostTranslator" "${HOST_BC}" host.bc

# 4. Lower the translated bitcode to position-independent object files.
llc --relocation-model=pic --filetype=obj kernel.bc
llc --relocation-model=pic --filetype=obj host.bc

# 5. Link against the x86 runtime and thread pool libraries.
g++ -Wall -L"${COX_ROOT}/build/runtime" -L"${COX_ROOT}/build/runtime/threadPool" -o heartwall -fPIC -no-pie host.o kernel.o ./AVI/avilib.o ./AVI/avimod.o -lc -lx86Runtime -lthreadPool -lpthread

# 6. Run the benchmark on the input video for the requested number of frames.
./heartwall "${INPUT_VIDEO}" "${NUM_FRAMES}"

View File

@ -0,0 +1,5 @@
////////////////////////////////////////////////////////////////////////////////
// setdevice: select CUDA device 0 through cudaSetDevice.
// The status returned by cudaSetDevice is not inspected.
////////////////////////////////////////////////////////////////////////////////
void setdevice(void) {
  const int device_id = 0;
  cudaSetDevice(device_id);
}

View File

@ -0,0 +1,719 @@
; ModuleID = 'hotspot-cuda-nvptx64-nvidia-cuda-sm_61.bc'
; LLVM IR module produced from hotspot.cu for the nvptx64-nvidia-cuda target
; (see source_filename and the target triple below).
source_filename = "hotspot.cu"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
; Single-i8 struct types used as the value types of the @blockIdx / @threadIdx
; globals declared further down.
%struct.__cuda_builtin_blockIdx_t = type { i8 }
%struct.__cuda_builtin_threadIdx_t = type { i8 }
; Struct type pointed to by the first parameter of the cudaFuncGetAttributes stub.
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
; COMDAT names matching the blockIdx/threadIdx x and y accessor functions that
; the calculate_temp kernel calls.
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv = comdat any
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv = comdat any
; Three 16x16 float arrays with internal linkage in address space 3. The mangled
; names identify them as function-local statics of calculate_temp
; (temp_on_cuda, power_on_cuda, temp_t).
; NOTE(review): address space 3 is presumably the NVPTX shared-memory space,
; i.e. these are the kernel's __shared__ tiles -- confirm against the NVPTX guide.
@_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda = internal addrspace(3) global [16 x [16 x float]] undef, align 4
@_ZZ14calculate_tempiPfS_S_iiiiffffffE13power_on_cuda = internal addrspace(3) global [16 x [16 x float]] undef, align 4
@_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t = internal addrspace(3) global [16 x [16 x float]] undef, align 4
; extern_weak declarations (no initializer here) of the block/thread index objects.
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
; Weak definition of cudaMalloc(i8** %p, i64 %s) inside the device module.
; It only spills both arguments to stack slots and returns the constant 999;
; no allocation is performed and %p is never written through.
; NOTE(review): 999 presumably corresponds to cudaErrorUnknown -- confirm
; against the CUDA runtime headers.
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
entry:
%p.addr = alloca i8**, align 8
%s.addr = alloca i64, align 8
store i8** %p, i8*** %p.addr, align 8
store i64 %s, i64* %s.addr, align 8
ret i32 999
}
; Weak definition of cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c)
; inside the device module. It spills both arguments to stack slots and returns
; the constant 999; the attributes struct behind %p is left untouched.
; NOTE(review): 999 presumably corresponds to cudaErrorUnknown -- confirm
; against the CUDA runtime headers.
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
entry:
%p.addr = alloca %struct.cudaFuncAttributes*, align 8
%c.addr = alloca i8*, align 8
store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
store i8* %c, i8** %c.addr, align 8
ret i32 999
}
; Weak definition of cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device)
; inside the device module. It spills the three arguments to stack slots and
; returns the constant 999; nothing is stored through %value.
; NOTE(review): 999 presumably corresponds to cudaErrorUnknown -- confirm
; against the CUDA runtime headers.
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
entry:
%value.addr = alloca i32*, align 8
%attr.addr = alloca i32, align 4
%device.addr = alloca i32, align 4
store i32* %value, i32** %value.addr, align 8
store i32 %attr, i32* %attr.addr, align 4
store i32 %device, i32* %device.addr, align 4
ret i32 999
}
; Weak definition of cudaGetDevice(i32* %device) inside the device module.
; It spills the pointer argument to a stack slot and returns the constant 999;
; nothing is stored through %device.
; NOTE(review): 999 presumably corresponds to cudaErrorUnknown -- confirm
; against the CUDA runtime headers.
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
entry:
%device.addr = alloca i32*, align 8
store i32* %device, i32** %device.addr, align 8
ret i32 999
}
; Weak definition of cudaOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks,
; func, blockSize, dynamicSmemSize) inside the device module. It spills the four
; arguments to stack slots and returns the constant 999; nothing is stored
; through %numBlocks.
; NOTE(review): 999 presumably corresponds to cudaErrorUnknown -- confirm
; against the CUDA runtime headers.
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
entry:
%numBlocks.addr = alloca i32*, align 8
%func.addr = alloca i8*, align 8
%blockSize.addr = alloca i32, align 4
%dynamicSmemSize.addr = alloca i64, align 8
store i32* %numBlocks, i32** %numBlocks.addr, align 8
store i8* %func, i8** %func.addr, align 8
store i32 %blockSize, i32* %blockSize.addr, align 4
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
ret i32 999
}
; Weak definition of cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
; (numBlocks, func, blockSize, dynamicSmemSize, flags) inside the device module.
; It spills the five arguments to stack slots and returns the constant 999;
; nothing is stored through %numBlocks.
; NOTE(review): 999 presumably corresponds to cudaErrorUnknown -- confirm
; against the CUDA runtime headers.
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
entry:
%numBlocks.addr = alloca i32*, align 8
%func.addr = alloca i8*, align 8
%blockSize.addr = alloca i32, align 4
%dynamicSmemSize.addr = alloca i64, align 8
%flags.addr = alloca i32, align 4
store i32* %numBlocks, i32** %numBlocks.addr, align 8
store i8* %func, i8** %func.addr, align 8
store i32 %blockSize, i32* %blockSize.addr, align 4
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
store i32 %flags, i32* %flags.addr, align 4
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define dso_local void @_Z14calculate_tempiPfS_S_iiiiffffff(i32 %iteration, float* %power, float* %temp_src, float* %temp_dst, i32 %grid_cols, i32 %grid_rows, i32 %border_cols, i32 %border_rows, float %Cap, float %Rx, float %Ry, float %Rz, float %step, float %time_elapsed) #0 {
entry:
%iteration.addr = alloca i32, align 4
%power.addr = alloca float*, align 8
%temp_src.addr = alloca float*, align 8
%temp_dst.addr = alloca float*, align 8
%grid_cols.addr = alloca i32, align 4
%grid_rows.addr = alloca i32, align 4
%border_cols.addr = alloca i32, align 4
%border_rows.addr = alloca i32, align 4
%Cap.addr = alloca float, align 4
%Rx.addr = alloca float, align 4
%Ry.addr = alloca float, align 4
%Rz.addr = alloca float, align 4
%step.addr = alloca float, align 4
%time_elapsed.addr = alloca float, align 4
%amb_temp = alloca float, align 4
%step_div_Cap = alloca float, align 4
%Rx_1 = alloca float, align 4
%Ry_1 = alloca float, align 4
%Rz_1 = alloca float, align 4
%bx = alloca i32, align 4
%by = alloca i32, align 4
%tx = alloca i32, align 4
%ty = alloca i32, align 4
%small_block_rows = alloca i32, align 4
%small_block_cols = alloca i32, align 4
%blkY = alloca i32, align 4
%blkX = alloca i32, align 4
%blkYmax = alloca i32, align 4
%blkXmax = alloca i32, align 4
%yidx = alloca i32, align 4
%xidx = alloca i32, align 4
%loadYidx = alloca i32, align 4
%loadXidx = alloca i32, align 4
%index = alloca i32, align 4
%validYmin = alloca i32, align 4
%validYmax = alloca i32, align 4
%validXmin = alloca i32, align 4
%validXmax = alloca i32, align 4
%N = alloca i32, align 4
%S = alloca i32, align 4
%W = alloca i32, align 4
%E = alloca i32, align 4
%computed = alloca i8, align 1
%i = alloca i32, align 4
store i32 %iteration, i32* %iteration.addr, align 4
store float* %power, float** %power.addr, align 8
store float* %temp_src, float** %temp_src.addr, align 8
store float* %temp_dst, float** %temp_dst.addr, align 8
store i32 %grid_cols, i32* %grid_cols.addr, align 4
store i32 %grid_rows, i32* %grid_rows.addr, align 4
store i32 %border_cols, i32* %border_cols.addr, align 4
store i32 %border_rows, i32* %border_rows.addr, align 4
store float %Cap, float* %Cap.addr, align 4
store float %Rx, float* %Rx.addr, align 4
store float %Ry, float* %Ry.addr, align 4
store float %Rz, float* %Rz.addr, align 4
store float %step, float* %step.addr, align 4
store float %time_elapsed, float* %time_elapsed.addr, align 4
store float 8.000000e+01, float* %amb_temp, align 4
%call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
store i32 %call, i32* %bx, align 4
%call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #2
store i32 %call1, i32* %by, align 4
%call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
store i32 %call2, i32* %tx, align 4
%call3 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #2
store i32 %call3, i32* %ty, align 4
%0 = load float, float* %step.addr, align 4
%1 = load float, float* %Cap.addr, align 4
%div = fdiv float %0, %1
store float %div, float* %step_div_Cap, align 4
%2 = load float, float* %Rx.addr, align 4
%div4 = fdiv float 1.000000e+00, %2
store float %div4, float* %Rx_1, align 4
%3 = load float, float* %Ry.addr, align 4
%div5 = fdiv float 1.000000e+00, %3
store float %div5, float* %Ry_1, align 4
%4 = load float, float* %Rz.addr, align 4
%div6 = fdiv float 1.000000e+00, %4
store float %div6, float* %Rz_1, align 4
%5 = load i32, i32* %iteration.addr, align 4
%mul = mul nsw i32 %5, 2
%sub = sub nsw i32 16, %mul
store i32 %sub, i32* %small_block_rows, align 4
%6 = load i32, i32* %iteration.addr, align 4
%mul7 = mul nsw i32 %6, 2
%sub8 = sub nsw i32 16, %mul7
store i32 %sub8, i32* %small_block_cols, align 4
%7 = load i32, i32* %small_block_rows, align 4
%8 = load i32, i32* %by, align 4
%mul9 = mul nsw i32 %7, %8
%9 = load i32, i32* %border_rows.addr, align 4
%sub10 = sub nsw i32 %mul9, %9
store i32 %sub10, i32* %blkY, align 4
%10 = load i32, i32* %small_block_cols, align 4
%11 = load i32, i32* %bx, align 4
%mul11 = mul nsw i32 %10, %11
%12 = load i32, i32* %border_cols.addr, align 4
%sub12 = sub nsw i32 %mul11, %12
store i32 %sub12, i32* %blkX, align 4
%13 = load i32, i32* %blkY, align 4
%add = add nsw i32 %13, 16
%sub13 = sub nsw i32 %add, 1
store i32 %sub13, i32* %blkYmax, align 4
%14 = load i32, i32* %blkX, align 4
%add14 = add nsw i32 %14, 16
%sub15 = sub nsw i32 %add14, 1
store i32 %sub15, i32* %blkXmax, align 4
%15 = load i32, i32* %blkY, align 4
%16 = load i32, i32* %ty, align 4
%add16 = add nsw i32 %15, %16
store i32 %add16, i32* %yidx, align 4
%17 = load i32, i32* %blkX, align 4
%18 = load i32, i32* %tx, align 4
%add17 = add nsw i32 %17, %18
store i32 %add17, i32* %xidx, align 4
%19 = load i32, i32* %yidx, align 4
store i32 %19, i32* %loadYidx, align 4
%20 = load i32, i32* %xidx, align 4
store i32 %20, i32* %loadXidx, align 4
%21 = load i32, i32* %grid_cols.addr, align 4
%22 = load i32, i32* %loadYidx, align 4
%mul18 = mul nsw i32 %21, %22
%23 = load i32, i32* %loadXidx, align 4
%add19 = add nsw i32 %mul18, %23
store i32 %add19, i32* %index, align 4
%24 = load i32, i32* %loadYidx, align 4
%cmp = icmp sge i32 %24, 0
br i1 %cmp, label %land.lhs.true, label %if.end
land.lhs.true: ; preds = %entry
%25 = load i32, i32* %loadYidx, align 4
%26 = load i32, i32* %grid_rows.addr, align 4
%sub20 = sub nsw i32 %26, 1
%cmp21 = icmp sle i32 %25, %sub20
br i1 %cmp21, label %land.lhs.true22, label %if.end
land.lhs.true22: ; preds = %land.lhs.true
%27 = load i32, i32* %loadXidx, align 4
%cmp23 = icmp sge i32 %27, 0
br i1 %cmp23, label %land.lhs.true24, label %if.end
land.lhs.true24: ; preds = %land.lhs.true22
%28 = load i32, i32* %loadXidx, align 4
%29 = load i32, i32* %grid_cols.addr, align 4
%sub25 = sub nsw i32 %29, 1
%cmp26 = icmp sle i32 %28, %sub25
br i1 %cmp26, label %if.then, label %if.end
if.then: ; preds = %land.lhs.true24
%30 = load float*, float** %temp_src.addr, align 8
%31 = load i32, i32* %index, align 4
%idxprom = sext i32 %31 to i64
%arrayidx = getelementptr inbounds float, float* %30, i64 %idxprom
%32 = load float, float* %arrayidx, align 4
%33 = load i32, i32* %ty, align 4
%idxprom27 = sext i32 %33 to i64
%arrayidx28 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom27
%34 = load i32, i32* %tx, align 4
%idxprom29 = sext i32 %34 to i64
%arrayidx30 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx28, i64 0, i64 %idxprom29
store float %32, float* %arrayidx30, align 4
%35 = load float*, float** %power.addr, align 8
%36 = load i32, i32* %index, align 4
%idxprom31 = sext i32 %36 to i64
%arrayidx32 = getelementptr inbounds float, float* %35, i64 %idxprom31
%37 = load float, float* %arrayidx32, align 4
%38 = load i32, i32* %ty, align 4
%idxprom33 = sext i32 %38 to i64
%arrayidx34 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE13power_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom33
%39 = load i32, i32* %tx, align 4
%idxprom35 = sext i32 %39 to i64
%arrayidx36 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx34, i64 0, i64 %idxprom35
store float %37, float* %arrayidx36, align 4
br label %if.end
if.end: ; preds = %if.then, %land.lhs.true24, %land.lhs.true22, %land.lhs.true, %entry
call void @llvm.nvvm.barrier0()
%40 = load i32, i32* %blkY, align 4
%cmp37 = icmp slt i32 %40, 0
br i1 %cmp37, label %cond.true, label %cond.false
cond.true: ; preds = %if.end
%41 = load i32, i32* %blkY, align 4
%sub38 = sub nsw i32 0, %41
br label %cond.end
cond.false: ; preds = %if.end
br label %cond.end
cond.end: ; preds = %cond.false, %cond.true
%cond = phi i32 [ %sub38, %cond.true ], [ 0, %cond.false ]
store i32 %cond, i32* %validYmin, align 4
%42 = load i32, i32* %blkYmax, align 4
%43 = load i32, i32* %grid_rows.addr, align 4
%sub39 = sub nsw i32 %43, 1
%cmp40 = icmp sgt i32 %42, %sub39
br i1 %cmp40, label %cond.true41, label %cond.false45
cond.true41: ; preds = %cond.end
%44 = load i32, i32* %blkYmax, align 4
%45 = load i32, i32* %grid_rows.addr, align 4
%sub42 = sub nsw i32 %44, %45
%add43 = add nsw i32 %sub42, 1
%sub44 = sub nsw i32 15, %add43
br label %cond.end46
cond.false45: ; preds = %cond.end
br label %cond.end46
cond.end46: ; preds = %cond.false45, %cond.true41
%cond47 = phi i32 [ %sub44, %cond.true41 ], [ 15, %cond.false45 ]
store i32 %cond47, i32* %validYmax, align 4
%46 = load i32, i32* %blkX, align 4
%cmp48 = icmp slt i32 %46, 0
br i1 %cmp48, label %cond.true49, label %cond.false51
cond.true49: ; preds = %cond.end46
%47 = load i32, i32* %blkX, align 4
%sub50 = sub nsw i32 0, %47
br label %cond.end52
cond.false51: ; preds = %cond.end46
br label %cond.end52
cond.end52: ; preds = %cond.false51, %cond.true49
%cond53 = phi i32 [ %sub50, %cond.true49 ], [ 0, %cond.false51 ]
store i32 %cond53, i32* %validXmin, align 4
%48 = load i32, i32* %blkXmax, align 4
%49 = load i32, i32* %grid_cols.addr, align 4
%sub54 = sub nsw i32 %49, 1
%cmp55 = icmp sgt i32 %48, %sub54
br i1 %cmp55, label %cond.true56, label %cond.false60
cond.true56: ; preds = %cond.end52
%50 = load i32, i32* %blkXmax, align 4
%51 = load i32, i32* %grid_cols.addr, align 4
%sub57 = sub nsw i32 %50, %51
%add58 = add nsw i32 %sub57, 1
%sub59 = sub nsw i32 15, %add58
br label %cond.end61
cond.false60: ; preds = %cond.end52
br label %cond.end61
cond.end61: ; preds = %cond.false60, %cond.true56
%cond62 = phi i32 [ %sub59, %cond.true56 ], [ 15, %cond.false60 ]
store i32 %cond62, i32* %validXmax, align 4
%52 = load i32, i32* %ty, align 4
%sub63 = sub nsw i32 %52, 1
store i32 %sub63, i32* %N, align 4
%53 = load i32, i32* %ty, align 4
%add64 = add nsw i32 %53, 1
store i32 %add64, i32* %S, align 4
%54 = load i32, i32* %tx, align 4
%sub65 = sub nsw i32 %54, 1
store i32 %sub65, i32* %W, align 4
%55 = load i32, i32* %tx, align 4
%add66 = add nsw i32 %55, 1
store i32 %add66, i32* %E, align 4
%56 = load i32, i32* %N, align 4
%57 = load i32, i32* %validYmin, align 4
%cmp67 = icmp slt i32 %56, %57
br i1 %cmp67, label %cond.true68, label %cond.false69
cond.true68: ; preds = %cond.end61
%58 = load i32, i32* %validYmin, align 4
br label %cond.end70
cond.false69: ; preds = %cond.end61
%59 = load i32, i32* %N, align 4
br label %cond.end70
cond.end70: ; preds = %cond.false69, %cond.true68
%cond71 = phi i32 [ %58, %cond.true68 ], [ %59, %cond.false69 ]
store i32 %cond71, i32* %N, align 4
%60 = load i32, i32* %S, align 4
%61 = load i32, i32* %validYmax, align 4
%cmp72 = icmp sgt i32 %60, %61
br i1 %cmp72, label %cond.true73, label %cond.false74
cond.true73: ; preds = %cond.end70
%62 = load i32, i32* %validYmax, align 4
br label %cond.end75
cond.false74: ; preds = %cond.end70
%63 = load i32, i32* %S, align 4
br label %cond.end75
cond.end75: ; preds = %cond.false74, %cond.true73
%cond76 = phi i32 [ %62, %cond.true73 ], [ %63, %cond.false74 ]
store i32 %cond76, i32* %S, align 4
%64 = load i32, i32* %W, align 4
%65 = load i32, i32* %validXmin, align 4
%cmp77 = icmp slt i32 %64, %65
br i1 %cmp77, label %cond.true78, label %cond.false79
cond.true78: ; preds = %cond.end75
%66 = load i32, i32* %validXmin, align 4
br label %cond.end80
cond.false79: ; preds = %cond.end75
%67 = load i32, i32* %W, align 4
br label %cond.end80
cond.end80: ; preds = %cond.false79, %cond.true78
%cond81 = phi i32 [ %66, %cond.true78 ], [ %67, %cond.false79 ]
store i32 %cond81, i32* %W, align 4
%68 = load i32, i32* %E, align 4
%69 = load i32, i32* %validXmax, align 4
%cmp82 = icmp sgt i32 %68, %69
br i1 %cmp82, label %cond.true83, label %cond.false84
cond.true83: ; preds = %cond.end80
%70 = load i32, i32* %validXmax, align 4
br label %cond.end85
cond.false84: ; preds = %cond.end80
%71 = load i32, i32* %E, align 4
br label %cond.end85
cond.end85: ; preds = %cond.false84, %cond.true83
%cond86 = phi i32 [ %70, %cond.true83 ], [ %71, %cond.false84 ]
store i32 %cond86, i32* %E, align 4
store i32 0, i32* %i, align 4
br label %for.cond
for.cond: ; preds = %for.inc, %cond.end85
%72 = load i32, i32* %i, align 4
%73 = load i32, i32* %iteration.addr, align 4
%cmp87 = icmp slt i32 %72, %73
br i1 %cmp87, label %for.body, label %for.end
for.body: ; preds = %for.cond
store i8 0, i8* %computed, align 1
%74 = load i32, i32* %tx, align 4
%75 = load i32, i32* %i, align 4
%add88 = add nsw i32 %75, 1
%cmp89 = icmp sge i32 %74, %add88
br i1 %cmp89, label %land.lhs.true90, label %if.end175
land.lhs.true90: ; preds = %for.body
%76 = load i32, i32* %tx, align 4
%77 = load i32, i32* %i, align 4
%sub91 = sub nsw i32 16, %77
%sub92 = sub nsw i32 %sub91, 2
%cmp93 = icmp sle i32 %76, %sub92
br i1 %cmp93, label %land.lhs.true94, label %if.end175
land.lhs.true94: ; preds = %land.lhs.true90
%78 = load i32, i32* %ty, align 4
%79 = load i32, i32* %i, align 4
%add95 = add nsw i32 %79, 1
%cmp96 = icmp sge i32 %78, %add95
br i1 %cmp96, label %land.lhs.true97, label %if.end175
land.lhs.true97: ; preds = %land.lhs.true94
%80 = load i32, i32* %ty, align 4
%81 = load i32, i32* %i, align 4
%sub98 = sub nsw i32 16, %81
%sub99 = sub nsw i32 %sub98, 2
%cmp100 = icmp sle i32 %80, %sub99
br i1 %cmp100, label %land.lhs.true101, label %if.end175
land.lhs.true101: ; preds = %land.lhs.true97
%82 = load i32, i32* %tx, align 4
%83 = load i32, i32* %validXmin, align 4
%cmp102 = icmp sge i32 %82, %83
br i1 %cmp102, label %land.lhs.true103, label %if.end175
land.lhs.true103: ; preds = %land.lhs.true101
%84 = load i32, i32* %tx, align 4
%85 = load i32, i32* %validXmax, align 4
%cmp104 = icmp sle i32 %84, %85
br i1 %cmp104, label %land.lhs.true105, label %if.end175
land.lhs.true105: ; preds = %land.lhs.true103
%86 = load i32, i32* %ty, align 4
%87 = load i32, i32* %validYmin, align 4
%cmp106 = icmp sge i32 %86, %87
br i1 %cmp106, label %land.lhs.true107, label %if.end175
land.lhs.true107: ; preds = %land.lhs.true105
%88 = load i32, i32* %ty, align 4
%89 = load i32, i32* %validYmax, align 4
%cmp108 = icmp sle i32 %88, %89
br i1 %cmp108, label %if.then109, label %if.end175
if.then109: ; preds = %land.lhs.true107
store i8 1, i8* %computed, align 1
%90 = load i32, i32* %ty, align 4
%idxprom110 = sext i32 %90 to i64
%arrayidx111 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom110
%91 = load i32, i32* %tx, align 4
%idxprom112 = sext i32 %91 to i64
%arrayidx113 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx111, i64 0, i64 %idxprom112
%92 = load float, float* %arrayidx113, align 4
%conv = fpext float %92 to double
%93 = load float, float* %step_div_Cap, align 4
%conv114 = fpext float %93 to double
%94 = load i32, i32* %ty, align 4
%idxprom115 = sext i32 %94 to i64
%arrayidx116 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE13power_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom115
%95 = load i32, i32* %tx, align 4
%idxprom117 = sext i32 %95 to i64
%arrayidx118 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx116, i64 0, i64 %idxprom117
%96 = load float, float* %arrayidx118, align 4
%conv119 = fpext float %96 to double
%97 = load i32, i32* %S, align 4
%idxprom120 = sext i32 %97 to i64
%arrayidx121 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom120
%98 = load i32, i32* %tx, align 4
%idxprom122 = sext i32 %98 to i64
%arrayidx123 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx121, i64 0, i64 %idxprom122
%99 = load float, float* %arrayidx123, align 4
%100 = load i32, i32* %N, align 4
%idxprom124 = sext i32 %100 to i64
%arrayidx125 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom124
%101 = load i32, i32* %tx, align 4
%idxprom126 = sext i32 %101 to i64
%arrayidx127 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx125, i64 0, i64 %idxprom126
%102 = load float, float* %arrayidx127, align 4
%add128 = fadd contract float %99, %102
%conv129 = fpext float %add128 to double
%103 = load i32, i32* %ty, align 4
%idxprom130 = sext i32 %103 to i64
%arrayidx131 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom130
%104 = load i32, i32* %tx, align 4
%idxprom132 = sext i32 %104 to i64
%arrayidx133 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx131, i64 0, i64 %idxprom132
%105 = load float, float* %arrayidx133, align 4
%conv134 = fpext float %105 to double
%mul135 = fmul contract double 2.000000e+00, %conv134
%sub136 = fsub contract double %conv129, %mul135
%106 = load float, float* %Ry_1, align 4
%conv137 = fpext float %106 to double
%mul138 = fmul contract double %sub136, %conv137
%add139 = fadd contract double %conv119, %mul138
%107 = load i32, i32* %ty, align 4
%idxprom140 = sext i32 %107 to i64
%arrayidx141 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom140
%108 = load i32, i32* %E, align 4
%idxprom142 = sext i32 %108 to i64
%arrayidx143 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx141, i64 0, i64 %idxprom142
%109 = load float, float* %arrayidx143, align 4
%110 = load i32, i32* %ty, align 4
%idxprom144 = sext i32 %110 to i64
%arrayidx145 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom144
%111 = load i32, i32* %W, align 4
%idxprom146 = sext i32 %111 to i64
%arrayidx147 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx145, i64 0, i64 %idxprom146
%112 = load float, float* %arrayidx147, align 4
%add148 = fadd contract float %109, %112
%conv149 = fpext float %add148 to double
%113 = load i32, i32* %ty, align 4
%idxprom150 = sext i32 %113 to i64
%arrayidx151 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom150
%114 = load i32, i32* %tx, align 4
%idxprom152 = sext i32 %114 to i64
%arrayidx153 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx151, i64 0, i64 %idxprom152
%115 = load float, float* %arrayidx153, align 4
%conv154 = fpext float %115 to double
%mul155 = fmul contract double 2.000000e+00, %conv154
%sub156 = fsub contract double %conv149, %mul155
%116 = load float, float* %Rx_1, align 4
%conv157 = fpext float %116 to double
%mul158 = fmul contract double %sub156, %conv157
%add159 = fadd contract double %add139, %mul158
%117 = load float, float* %amb_temp, align 4
%118 = load i32, i32* %ty, align 4
%idxprom160 = sext i32 %118 to i64
%arrayidx161 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom160
%119 = load i32, i32* %tx, align 4
%idxprom162 = sext i32 %119 to i64
%arrayidx163 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx161, i64 0, i64 %idxprom162
%120 = load float, float* %arrayidx163, align 4
%sub164 = fsub contract float %117, %120
%121 = load float, float* %Rz_1, align 4
%mul165 = fmul contract float %sub164, %121
%conv166 = fpext float %mul165 to double
%add167 = fadd contract double %add159, %conv166
%mul168 = fmul contract double %conv114, %add167
%add169 = fadd contract double %conv, %mul168
%conv170 = fptrunc double %add169 to float
%122 = load i32, i32* %ty, align 4
%idxprom171 = sext i32 %122 to i64
%arrayidx172 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t to [16 x [16 x float]]*), i64 0, i64 %idxprom171
%123 = load i32, i32* %tx, align 4
%idxprom173 = sext i32 %123 to i64
%arrayidx174 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx172, i64 0, i64 %idxprom173
store float %conv170, float* %arrayidx174, align 4
br label %if.end175
if.end175: ; preds = %if.then109, %land.lhs.true107, %land.lhs.true105, %land.lhs.true103, %land.lhs.true101, %land.lhs.true97, %land.lhs.true94, %land.lhs.true90, %for.body
call void @llvm.nvvm.barrier0()
%124 = load i32, i32* %i, align 4
%125 = load i32, i32* %iteration.addr, align 4
%sub176 = sub nsw i32 %125, 1
%cmp177 = icmp eq i32 %124, %sub176
br i1 %cmp177, label %if.then178, label %if.end179
if.then178: ; preds = %if.end175
br label %for.end
if.end179: ; preds = %if.end175
%126 = load i8, i8* %computed, align 1
%tobool = trunc i8 %126 to i1
br i1 %tobool, label %if.then180, label %if.end189
if.then180: ; preds = %if.end179
%127 = load i32, i32* %ty, align 4
%idxprom181 = sext i32 %127 to i64
%arrayidx182 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t to [16 x [16 x float]]*), i64 0, i64 %idxprom181
%128 = load i32, i32* %tx, align 4
%idxprom183 = sext i32 %128 to i64
%arrayidx184 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx182, i64 0, i64 %idxprom183
%129 = load float, float* %arrayidx184, align 4
%130 = load i32, i32* %ty, align 4
%idxprom185 = sext i32 %130 to i64
%arrayidx186 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom185
%131 = load i32, i32* %tx, align 4
%idxprom187 = sext i32 %131 to i64
%arrayidx188 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx186, i64 0, i64 %idxprom187
store float %129, float* %arrayidx188, align 4
br label %if.end189
if.end189: ; preds = %if.then180, %if.end179
call void @llvm.nvvm.barrier0()
br label %for.inc
for.inc: ; preds = %if.end189
%132 = load i32, i32* %i, align 4
%inc = add nsw i32 %132, 1
store i32 %inc, i32* %i, align 4
br label %for.cond
for.end: ; preds = %if.then178, %for.cond
%133 = load i8, i8* %computed, align 1
%tobool190 = trunc i8 %133 to i1
br i1 %tobool190, label %if.then191, label %if.end198
if.then191: ; preds = %for.end
%134 = load i32, i32* %ty, align 4
%idxprom192 = sext i32 %134 to i64
%arrayidx193 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t to [16 x [16 x float]]*), i64 0, i64 %idxprom192
%135 = load i32, i32* %tx, align 4
%idxprom194 = sext i32 %135 to i64
%arrayidx195 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx193, i64 0, i64 %idxprom194
%136 = load float, float* %arrayidx195, align 4
%137 = load float*, float** %temp_dst.addr, align 8
%138 = load i32, i32* %index, align 4
%idxprom196 = sext i32 %138 to i64
%arrayidx197 = getelementptr inbounds float, float* %137, i64 %idxprom196
store float %136, float* %arrayidx197, align 4
br label %if.end198
if.end198: ; preds = %if.then191, %for.end
ret void
}
; __cuda_builtin_blockIdx_t::__fetch_builtin_x()
; Returns the i32 produced by the llvm.nvvm.read.ptx.sreg.ctaid.x intrinsic.
; Takes no arguments and touches no memory.
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
ret i32 %0
}
; __cuda_builtin_blockIdx_t::__fetch_builtin_y()
; Returns the i32 produced by the llvm.nvvm.read.ptx.sreg.ctaid.y intrinsic.
; Takes no arguments and touches no memory.
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
ret i32 %0
}
; __cuda_builtin_threadIdx_t::__fetch_builtin_x()
; Returns the i32 produced by the llvm.nvvm.read.ptx.sreg.tid.x intrinsic.
; Takes no arguments and touches no memory.
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
ret i32 %0
}
; __cuda_builtin_threadIdx_t::__fetch_builtin_y()
; Returns the i32 produced by the llvm.nvvm.read.ptx.sreg.tid.y intrinsic.
; Takes no arguments and touches no memory.
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
ret i32 %0
}
; Function Attrs: convergent nounwind
declare void @llvm.nvvm.barrier0() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #3
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #3
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #3
attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { convergent nounwind }
attributes #3 = { nounwind readnone }
!llvm.module.flags = !{!0, !1, !2}
!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
!llvm.ident = !{!8}
!nvvmir.version = !{!9}
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
!3 = !{void (i32, float*, float*, float*, i32, i32, i32, i32, float, float, float, float, float, float)* @_Z14calculate_tempiPfS_S_iiiiffffff, !"kernel", i32 1}
!4 = !{null, !"align", i32 8}
!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
!6 = !{null, !"align", i32 16}
!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
!9 = !{i32 1, i32 4}

File diff suppressed because one or more lines are too long

353
examples/hotspot/hotspot.cu Normal file
View File

@ -0,0 +1,353 @@
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#ifdef RD_WG_SIZE_0_0
#define BLOCK_SIZE RD_WG_SIZE_0_0
#elif defined(RD_WG_SIZE_0)
#define BLOCK_SIZE RD_WG_SIZE_0
#elif defined(RD_WG_SIZE)
#define BLOCK_SIZE RD_WG_SIZE
#else
#define BLOCK_SIZE 16
#endif
#define STR_SIZE 256
/* maximum power density possible (say 300W for a 10mm x 10mm chip) */
#define MAX_PD (3.0e6)
/* required precision in degrees */
#define PRECISION 0.001
#define SPEC_HEAT_SI 1.75e6
#define K_SI 100
/* capacitance fitting factor */
#define FACTOR_CHIP 0.5
/* chip parameters */
float t_chip = 0.0005;
float chip_height = 0.016;
float chip_width = 0.016;
/* ambient temperature, assuming no package at all */
float amb_temp = 80.0;
void run(int argc, char **argv);
/* define timer macros */
#define pin_stats_reset() startCycle()
#define pin_stats_pause(cycles) stopCycle(cycles)
#define pin_stats_dump(cycles) printf("timer: %Lu\n", cycles)
/*
 * Report an error on stderr as "error: <s>\n".
 *
 * Despite its name this helper does NOT terminate the program: it only
 * prints. Callers that cannot continue must stop on their own afterwards.
 *
 * s: NUL-terminated message text. Declared const because every caller passes
 *    a string literal, and binding a literal to a plain `char *` is not valid
 *    in standard C++11 and later.
 */
void fatal(const char *s) { fprintf(stderr, "error: %s\n", s); }
/*
 * Dump a row-major grid to a text file, one cell per line.
 *
 * Each line has the form "<index>\t<value>\n", where <index> is the running
 * cell number (0 .. grid_rows*grid_cols-1) and <value> is printed with %g.
 *
 * vect:      grid_rows * grid_cols floats, read as vect[i * grid_cols + j]
 * grid_rows: number of rows to write
 * grid_cols: number of columns to write
 * file:      path of the output file; it is created or truncated
 *
 * If the file cannot be opened, a message is printed on stdout and nothing is
 * written. (Previously execution fell through and used the null FILE*.)
 */
void writeoutput(float *vect, int grid_rows, int grid_cols, const char *file) {
  FILE *fp = fopen(file, "w");
  if (fp == NULL) {
    printf("The file was not opened\n");
    return;
  }

  int index = 0;
  for (int i = 0; i < grid_rows; i++) {
    for (int j = 0; j < grid_cols; j++) {
      // Same bytes as the former sprintf-into-buffer + fputs pair.
      fprintf(fp, "%d\t%g\n", index, vect[i * grid_cols + j]);
      index++;
    }
  }
  fclose(fp);
}
/*
 * Fill a row-major grid from a text file holding one float per line.
 *
 * vect:      destination for grid_rows * grid_cols floats, written as
 *            vect[i * grid_cols + j]
 * grid_rows: number of rows to read
 * grid_cols: number of columns to read
 * file:      path of the input file
 *
 * Error handling:
 *  - If the file cannot be opened, a message is printed on stdout and vect is
 *    left untouched (previously the null FILE* was passed on to fgets).
 *  - If the file ends early, fatal() reports it and reading stops; the
 *    remaining cells of vect are left untouched.
 *  - If a line does not parse as a float, fatal() reports it and the cell
 *    receives the previously parsed value (0 if none has been parsed yet).
 * fatal() only prints, so none of these cases terminates the program.
 */
void readinput(float *vect, int grid_rows, int grid_cols, const char *file) {
  FILE *fp = fopen(file, "r");
  if (fp == NULL) {
    printf("The file was not opened\n");
    return;
  }

  char str[STR_SIZE];
  float val = 0.0f; // defined value even if the very first line is malformed

  for (int i = 0; i < grid_rows; i++) {
    for (int j = 0; j < grid_cols; j++) {
      // Test the fgets result instead of feof(): a complete final line that
      // lacks a trailing newline sets EOF while still returning valid data.
      if (fgets(str, STR_SIZE, fp) == NULL) {
        fatal("not enough lines in file");
        fclose(fp);
        return;
      }
      // Each line carries a single value; no per-line index is parsed.
      if (sscanf(str, "%f", &val) != 1)
        fatal("invalid file format");
      vect[i * grid_cols + j] = val;
    }
  }
  fclose(fp);
}
// IN_RANGE: true when min <= x <= max (both bounds inclusive).
// Note: x is evaluated twice, so avoid arguments with side effects.
#define IN_RANGE(x, min, max) ((x) >= (min) && (x) <= (max))
// CLAMP_RANGE: assigns to x the value of x limited to [min, max]; x must be an
// lvalue. The result operands are not parenthesized and every argument may be
// evaluated more than once, so pass simple expressions only.
#define CLAMP_RANGE(x, min, max) x = (x < (min)) ? min : ((x > (max)) ? max : x)
// MIN: the smaller of a and b (a when they compare equal). The selected
// argument is evaluated twice.
#define MIN(a, b) ((a) <= (b) ? (a) : (b))
/*
 * Advance one BLOCK_SIZE x BLOCK_SIZE tile of the temperature grid by
 * `iteration` time steps.
 *
 * Each thread block loads its tile of temp_src and power into shared memory,
 * then repeatedly applies the stencil update on a region that shrinks by one
 * cell per side on every step. After the last step, only the threads that
 * computed in that step write their result to temp_dst.
 *
 * iteration:    number of time steps fused into this launch
 * power:        per-cell power input, indexed as grid_cols * y + x (read only)
 * temp_src:     per-cell input temperature, same indexing (read only here)
 * temp_dst:     per-cell output temperature, same indexing (written only here)
 * grid_cols/grid_rows:     grid extent used for bounds checks and indexing
 * border_cols/border_rows: offset subtracted from the tile origin
 * Cap, Rx, Ry, Rz, step:   physical coefficients; used as step / Cap and
 *                          1 / Rx, 1 / Ry, 1 / Rz
 * time_elapsed: not used by the kernel body
 *
 * Notes:
 *  - The kernel contains __syncthreads() calls inside the loop; the statement
 *    order around them is intentional and must be preserved.
 *  - The `2.0` literals are double, so the Rx/Ry stencil terms are evaluated
 *    in double precision. They are deliberately left unchanged to keep
 *    results bit-for-bit identical.
 *  - `computed` is now initialized, so the final test is well defined even
 *    when the loop body never runs (iteration <= 0).
 */
__global__ void calculate_temp(int iteration,   // number of iteration
                               float *power,    // power input
                               float *temp_src, // temperature input/output
                               float *temp_dst, // temperature input/output
                               int grid_cols,   // Col of grid
                               int grid_rows,   // Row of grid
                               int border_cols, // border offset
                               int border_rows, // border offset
                               float Cap,       // Capacitance
                               float Rx, float Ry, float Rz, float step,
                               float time_elapsed) {
  __shared__ float temp_on_cuda[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ float power_on_cuda[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ float temp_t[BLOCK_SIZE]
                         [BLOCK_SIZE]; // saving temporary temperature result

  // Local ambient temperature; shadows the file-scope variable of the same
  // name, which is host data.
  float amb_temp = 80.0;
  float step_div_Cap;
  float Rx_1, Ry_1, Rz_1;

  int bx = blockIdx.x;
  int by = blockIdx.y;

  int tx = threadIdx.x;
  int ty = threadIdx.y;

  step_div_Cap = step / Cap;

  Rx_1 = 1 / Rx;
  Ry_1 = 1 / Ry;
  Rz_1 = 1 / Rz;

  // each block finally computes result for a small block
  // after N iterations.
  // it is the non-overlapping small blocks that cover
  // all the input data

  // calculate the small block size
  int small_block_rows = BLOCK_SIZE - iteration * 2; // EXPAND_RATE
  int small_block_cols = BLOCK_SIZE - iteration * 2; // EXPAND_RATE

  // calculate the boundary for the block according to
  // the boundary of its small block
  int blkY = small_block_rows * by - border_rows;
  int blkX = small_block_cols * bx - border_cols;
  int blkYmax = blkY + BLOCK_SIZE - 1;
  int blkXmax = blkX + BLOCK_SIZE - 1;

  // calculate the global thread coordination
  int yidx = blkY + ty;
  int xidx = blkX + tx;

  // load data if it is within the valid input range
  int loadYidx = yidx, loadXidx = xidx;
  int index = grid_cols * loadYidx + loadXidx;

  if (IN_RANGE(loadYidx, 0, grid_rows - 1) &&
      IN_RANGE(loadXidx, 0, grid_cols - 1)) {
    // Load the temperature data from global memory to shared memory
    temp_on_cuda[ty][tx] = temp_src[index];
    // Load the power data from global memory to shared memory
    power_on_cuda[ty][tx] = power[index];
  }
  __syncthreads();

  // effective range within this block that falls within
  // the valid range of the input data
  // used to rule out computation outside the boundary.
  int validYmin = (blkY < 0) ? -blkY : 0;
  int validYmax = (blkYmax > grid_rows - 1)
                      ? BLOCK_SIZE - 1 - (blkYmax - grid_rows + 1)
                      : BLOCK_SIZE - 1;
  int validXmin = (blkX < 0) ? -blkX : 0;
  int validXmax = (blkXmax > grid_cols - 1)
                      ? BLOCK_SIZE - 1 - (blkXmax - grid_cols + 1)
                      : BLOCK_SIZE - 1;

  // Neighbour indices inside the tile, clamped to the valid range so that
  // cells on the grid edge reuse their own row/column.
  int N = ty - 1;
  int S = ty + 1;
  int W = tx - 1;
  int E = tx + 1;

  N = (N < validYmin) ? validYmin : N;
  S = (S > validYmax) ? validYmax : S;
  W = (W < validXmin) ? validXmin : W;
  E = (E > validXmax) ? validXmax : E;

  // Initialized so the test after the loop never reads an indeterminate
  // value when the loop body does not execute.
  bool computed = false;
  for (int i = 0; i < iteration; i++) {
    computed = false;
    if (IN_RANGE(tx, i + 1, BLOCK_SIZE - i - 2) &&
        IN_RANGE(ty, i + 1, BLOCK_SIZE - i - 2) &&
        IN_RANGE(tx, validXmin, validXmax) &&
        IN_RANGE(ty, validYmin, validYmax)) {
      computed = true;
      temp_t[ty][tx] =
          temp_on_cuda[ty][tx] +
          step_div_Cap * (power_on_cuda[ty][tx] +
                          (temp_on_cuda[S][tx] + temp_on_cuda[N][tx] -
                           2.0 * temp_on_cuda[ty][tx]) *
                              Ry_1 +
                          (temp_on_cuda[ty][E] + temp_on_cuda[ty][W] -
                           2.0 * temp_on_cuda[ty][tx]) *
                              Rx_1 +
                          (amb_temp - temp_on_cuda[ty][tx]) * Rz_1);
    }
    __syncthreads();
    // The last step's result stays in temp_t and is written out below.
    if (i == iteration - 1)
      break;
    if (computed) // Assign the computation range
      temp_on_cuda[ty][tx] = temp_t[ty][tx];
    __syncthreads();
  }

  // update the global memory
  // after the last iteration, only threads coordinated within the
  // small block perform the calculation and switch on ``computed''
  if (computed) {
    temp_dst[index] = temp_t[ty][tx];
  }
}
/*
 * Run the transient simulation for total_iterations time steps, launching
 * calculate_temp once per group of at most num_iterations steps.
 *
 * MatrixPower:      device buffer with the power values
 * MatrixTemp:       two device buffers used alternately as source and
 *                   destination; the initial temperatures must be in
 *                   MatrixTemp[0], which is the source of the first launch
 * col, row:         grid dimensions
 * total_iterations: total number of time steps to simulate
 * num_iterations:   time steps fused into one launch; must be positive or
 *                   the loop never advances
 * blockCols/blockRows:   launch grid dimensions
 * borderCols/borderRows: border offsets forwarded to the kernel
 *
 * Returns the index (0 or 1) of the MatrixTemp buffer written by the last
 * launch, or 0 if no launch was made.
 */
int compute_tran_temp(float *MatrixPower, float *MatrixTemp[2], int col,
                      int row, int total_iterations, int num_iterations,
                      int blockCols, int blockRows, int borderCols,
                      int borderRows) {
  dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
  dim3 dimGrid(blockCols, blockRows);

  float grid_height = chip_height / row;
  float grid_width = chip_width / col;

  float Cap = FACTOR_CHIP * SPEC_HEAT_SI * t_chip * grid_width * grid_height;
  float Rx = grid_width / (2.0 * K_SI * t_chip * grid_height);
  float Ry = grid_height / (2.0 * K_SI * t_chip * grid_width);
  float Rz = t_chip / (K_SI * grid_height * grid_width);

  float max_slope = MAX_PD / (FACTOR_CHIP * t_chip * SPEC_HEAT_SI);
  float step = PRECISION / max_slope;
  float time_elapsed = 0.001;

  int src = 1, dst = 0;

  // The step counter is an int. A float counter stops changing once it
  // reaches 2^24 and would also turn the MIN(...) argument below into a
  // float that is silently converted back to int.
  for (int t = 0; t < total_iterations; t += num_iterations) {
    // Swap roles: the previous destination becomes the new source.
    int temp = src;
    src = dst;
    dst = temp;
    calculate_temp<<<dimGrid, dimBlock>>>(
        MIN(num_iterations, total_iterations - t), MatrixPower, MatrixTemp[src],
        MatrixTemp[dst], col, row, borderCols, borderRows, Cap, Rx, Ry, Rz,
        step, time_elapsed);
    cudaDeviceSynchronize();
  }
  return dst;
}
/*
 * Print the command-line synopsis and a description of every argument on
 * stderr, then terminate the process with exit status 1.
 *
 * argc: unused
 * argv: argv[0] is printed as the program name
 */
void usage(int argc, char **argv) {
  // One entry per positional argument, in the order they are expected.
  static const char *const argument_help[] = {
      "\t<grid_rows/grid_cols> - number of rows/cols in the grid "
      "(positive integer)\n",
      "\t<pyramid_height> - pyramid heigh(positive integer)\n",
      "\t<sim_time> - number of iterations\n",
      "\t<temp_file> - name of the file containing the initial "
      "temperature values of each cell\n",
      "\t<power_file> - name of the file containing the dissipated "
      "power values of each cell\n",
      "\t<output_file> - name of the output file\n",
  };
  const size_t help_count = sizeof(argument_help) / sizeof(argument_help[0]);

  fprintf(stderr,
          "Usage: %s <grid_rows/grid_cols> <pyramid_height> <sim_time> "
          "<temp_file> <power_file> <output_file>\n",
          argv[0]);
  for (size_t k = 0; k < help_count; ++k)
    fputs(argument_help[k], stderr);

  exit(1);
}
/*
 * Program entry point: selects CUDA device 0, reports the thread-block
 * edge length the binary was built with, and hands the command line to
 * run(). Always returns EXIT_SUCCESS once run() comes back.
 */
int main(int argc, char **argv) {
  const int wg_edge = BLOCK_SIZE; // compile-time block edge length

  cudaSetDevice(0);
  printf("WG size of kernel = %d X %d\n", wg_edge, wg_edge);

  run(argc, argv);

  return EXIT_SUCCESS;
}
/*
 * Parse the command line, load the temperature and power grids, run the
 * simulation on the device and write the resulting grid to the output file.
 *
 * Expected arguments (argc must be 7):
 *   argv[1] grid size (used for both rows and columns), positive integer
 *   argv[2] pyramid height (time steps fused per kernel launch), positive
 *   argv[3] total number of time steps, positive
 *   argv[4] temperature input file
 *   argv[5] power input file
 *   argv[6] output file
 *
 * Invalid arguments end the program through usage(). A pyramid height that
 * leaves no interior cells per block, or a failed host allocation, is
 * reported through fatal() and ends the program with exit status 1.
 */
void run(int argc, char **argv) {
  int size;
  int grid_rows, grid_cols;
  float *FilesavingTemp, *FilesavingPower, *MatrixOut;
  char *tfile, *pfile, *ofile;

  int total_iterations = 60;
  int pyramid_height = 1; // number of iterations

  if (argc != 7)
    usage(argc, argv);
  if ((grid_rows = atoi(argv[1])) <= 0 || (grid_cols = atoi(argv[1])) <= 0 ||
      (pyramid_height = atoi(argv[2])) <= 0 ||
      (total_iterations = atoi(argv[3])) <= 0)
    usage(argc, argv);

  tfile = argv[4];
  pfile = argv[5];
  ofile = argv[6];

  size = grid_rows * grid_cols;

  /* --------------- pyramid parameters --------------- */
#define EXPAND_RATE \
  2 // add one iteration will extend the pyramid base by 2 per each borderline
  int borderCols = (pyramid_height)*EXPAND_RATE / 2;
  int borderRows = (pyramid_height)*EXPAND_RATE / 2;
  int smallBlockCol = BLOCK_SIZE - (pyramid_height)*EXPAND_RATE;
  int smallBlockRow = BLOCK_SIZE - (pyramid_height)*EXPAND_RATE;

  // Every block must keep at least one interior cell. Otherwise the divisions
  // below divide by zero (or yield a negative block count).
  if (smallBlockCol <= 0 || smallBlockRow <= 0) {
    fatal("pyramid_height is too large for the block size");
    exit(1);
  }

  // Number of blocks needed to cover the grid, rounding up.
  int blockCols =
      grid_cols / smallBlockCol + ((grid_cols % smallBlockCol == 0) ? 0 : 1);
  int blockRows =
      grid_rows / smallBlockRow + ((grid_rows % smallBlockRow == 0) ? 0 : 1);

  FilesavingTemp = (float *)malloc(size * sizeof(float));
  FilesavingPower = (float *)malloc(size * sizeof(float));
  MatrixOut = (float *)calloc(size, sizeof(float));

  if (!FilesavingPower || !FilesavingTemp || !MatrixOut) {
    // fatal() only prints; stop here instead of using null buffers.
    fatal("unable to allocate memory");
    free(FilesavingTemp);
    free(FilesavingPower);
    free(MatrixOut);
    exit(1);
  }

  printf("pyramidHeight: %d\ngridSize: [%d, %d]\nborder:[%d, "
         "%d]\nblockGrid:[%d, %d]\ntargetBlock:[%d, %d]\n",
         pyramid_height, grid_cols, grid_rows, borderCols, borderRows,
         blockCols, blockRows, smallBlockCol, smallBlockRow);

  readinput(FilesavingTemp, grid_rows, grid_cols, tfile);
  readinput(FilesavingPower, grid_rows, grid_cols, pfile);

  // Device buffers: two temperature grids (used alternately as source and
  // destination) and one power grid. The initial temperatures go to buffer 0.
  float *MatrixTemp[2], *MatrixPower;
  cudaMalloc((void **)&MatrixTemp[0], sizeof(float) * size);
  cudaMalloc((void **)&MatrixTemp[1], sizeof(float) * size);
  cudaMemcpy(MatrixTemp[0], FilesavingTemp, sizeof(float) * size,
             cudaMemcpyHostToDevice);

  cudaMalloc((void **)&MatrixPower, sizeof(float) * size);
  cudaMemcpy(MatrixPower, FilesavingPower, sizeof(float) * size,
             cudaMemcpyHostToDevice);

  printf("Start computing the transient temperature\n");
  // ret is the index of the temperature buffer holding the final result.
  int ret = compute_tran_temp(MatrixPower, MatrixTemp, grid_cols, grid_rows,
                              total_iterations, pyramid_height, blockCols,
                              blockRows, borderCols, borderRows);
  printf("Ending simulation\n");

  cudaMemcpy(MatrixOut, MatrixTemp[ret], sizeof(float) * size,
             cudaMemcpyDeviceToHost);

  writeoutput(MatrixOut, grid_rows, grid_cols, ofile);

  cudaFree(MatrixPower);
  cudaFree(MatrixTemp[0]);
  cudaFree(MatrixTemp[1]);

  // Release all host buffers (the two input buffers used to be leaked).
  free(FilesavingTemp);
  free(FilesavingPower);
  free(MatrixOut);
}

21
examples/hotspot/run.sh Normal file
View File

@ -0,0 +1,21 @@
#!/bin/bash
# Build the hotspot example from its pre-generated LLVM IR, run it on the
# 512x512 input, and check the output for the expected value.
# Must be run from this directory: all paths below are relative to it.
set -e

# Assemble the textual IR (.ll) of the device and host modules into bitcode.
llvm-as hotspot-cuda-nvptx64-nvidia-cuda-sm_61.ll
llvm-as hotspot-host-x86_64-unknown-linux-gnu.ll

# Run the project's translators over the device and host bitcode.
../../build/compilation/kernelTranslator hotspot-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
../../build/compilation/hostTranslator hotspot-host-x86_64-unknown-linux-gnu.bc host.bc

# Compile both translated modules to position-independent object files.
llc --relocation-model=pic --filetype=obj kernel.bc
llc --relocation-model=pic --filetype=obj host.bc

# Link against the runtime and thread-pool libraries from the build tree.
g++ -Wall -L../../build/runtime -L../../build/runtime/threadPool \
  -o hotspot -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread

export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH
./hotspot 512 2 2 ../../rodinia-data/hotspot/temp_512 ../../rodinia-data/hotspot/power_512 output.out

# -F matches the expected value literally; as a regular expression the '.'
# would accept any character in that position (e.g. "323x829").
if head output.out | grep -qF "323.829"; then
  echo "Pass"
else
  echo "Error result"
  exit 1
fi

View File

@ -0,0 +1,587 @@
; Device-side LLVM IR generated from 3D.cu for the nvptx64-nvidia-cuda target.
; ModuleID = '3D-cuda-nvptx64-nvidia-cuda-sm_61.bc'
source_filename = "3D.cu"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
; Opaque one-byte placeholder types backing the blockDim/blockIdx/threadIdx
; globals, plus the struct type taken by the cudaFuncGetAttributes stub.
%struct.__cuda_builtin_blockDim_t = type { i8 }
%struct.__cuda_builtin_blockIdx_t = type { i8 }
%struct.__cuda_builtin_threadIdx_t = type { i8 }
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
; COMDAT groups for the linkonce_odr builtin accessor functions defined below.
$_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv = comdat any
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
$_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv = comdat any
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv = comdat any
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv = comdat any
; Externally provided (extern_weak) builtin variables in address space 1.
@blockDim = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockDim_t, align 1
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
; Weak device-side stub for cudaMalloc: stores its two arguments into local
; stack slots and unconditionally returns the constant 999.
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
entry:
%p.addr = alloca i8**, align 8
%s.addr = alloca i64, align 8
store i8** %p, i8*** %p.addr, align 8
store i64 %s, i64* %s.addr, align 8
ret i32 999
}
; Weak device-side stub for cudaFuncGetAttributes: stores its arguments into
; local stack slots and unconditionally returns the constant 999.
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
entry:
%p.addr = alloca %struct.cudaFuncAttributes*, align 8
%c.addr = alloca i8*, align 8
store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
store i8* %c, i8** %c.addr, align 8
ret i32 999
}
; Weak device-side stub for cudaDeviceGetAttribute: stores its arguments into
; local stack slots and unconditionally returns the constant 999.
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
entry:
%value.addr = alloca i32*, align 8
%attr.addr = alloca i32, align 4
%device.addr = alloca i32, align 4
store i32* %value, i32** %value.addr, align 8
store i32 %attr, i32* %attr.addr, align 4
store i32 %device, i32* %device.addr, align 4
ret i32 999
}
; Weak device-side stub for cudaGetDevice: stores its argument into a local
; stack slot and unconditionally returns the constant 999.
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
entry:
%device.addr = alloca i32*, align 8
store i32* %device, i32** %device.addr, align 8
ret i32 999
}
; Weak device-side stub for cudaOccupancyMaxActiveBlocksPerMultiprocessor:
; stores its arguments into local stack slots and unconditionally returns the
; constant 999.
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
entry:
%numBlocks.addr = alloca i32*, align 8
%func.addr = alloca i8*, align 8
%blockSize.addr = alloca i32, align 4
%dynamicSmemSize.addr = alloca i64, align 8
store i32* %numBlocks, i32** %numBlocks.addr, align 8
store i8* %func, i8** %func.addr, align 8
store i32 %blockSize, i32* %blockSize.addr, align 4
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
ret i32 999
}
; Weak device-side stub for
; cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags: stores its arguments
; into local stack slots and unconditionally returns the constant 999.
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
entry:
%numBlocks.addr = alloca i32*, align 8
%func.addr = alloca i8*, align 8
%blockSize.addr = alloca i32, align 4
%dynamicSmemSize.addr = alloca i64, align 8
%flags.addr = alloca i32, align 4
store i32* %numBlocks, i32** %numBlocks.addr, align 8
store i8* %func, i8** %func.addr, align 8
store i32 %blockSize, i32* %blockSize.addr, align 4
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
store i32 %flags, i32* %flags.addr, align 4
ret i32 999
}
; Kernel hotspotOpt1(p, tIn, tOut, sdc, nx, ny, nz, ce, cw, cn, cs, ct, cb, cc)
; (registered as a kernel in !nvvm.annotations). Unoptimized (optnone) code:
; every source variable lives in an alloca and is reloaded before each use.
; Each thread computes one (i, j) column and walks it along z, writing
;   tOut[c] = cc*temp2 + cw*tIn[W] + ce*tIn[E] + cs*tIn[S] + cn*tIn[N]
;             + cb*temp1 + ct*temp3 + sdc*p[c] + ct*amb_temp
; for the first layer, for k = 1 .. nz-2 in the loop, and once more after it.
; No bounds check on i/j against nx/ny is present in this function.
; Function Attrs: convergent noinline nounwind optnone
define dso_local void @_Z11hotspotOpt1PfS_S_fiiifffffff(float* %p, float* %tIn, float* %tOut, float %sdc, i32 %nx, i32 %ny, i32 %nz, float %ce, float %cw, float %cn, float %cs, float %ct, float %cb, float %cc) #0 {
entry:
; Stack slots for the parameters and local variables.
%p.addr = alloca float*, align 8
%tIn.addr = alloca float*, align 8
%tOut.addr = alloca float*, align 8
%sdc.addr = alloca float, align 4
%nx.addr = alloca i32, align 4
%ny.addr = alloca i32, align 4
%nz.addr = alloca i32, align 4
%ce.addr = alloca float, align 4
%cw.addr = alloca float, align 4
%cn.addr = alloca float, align 4
%cs.addr = alloca float, align 4
%ct.addr = alloca float, align 4
%cb.addr = alloca float, align 4
%cc.addr = alloca float, align 4
%amb_temp = alloca float, align 4
%i = alloca i32, align 4
%j = alloca i32, align 4
%c = alloca i32, align 4
%xy = alloca i32, align 4
%W = alloca i32, align 4
%E = alloca i32, align 4
%N = alloca i32, align 4
%S = alloca i32, align 4
%temp1 = alloca float, align 4
%temp2 = alloca float, align 4
%temp3 = alloca float, align 4
%k = alloca i32, align 4
; Spill the incoming arguments to their slots.
store float* %p, float** %p.addr, align 8
store float* %tIn, float** %tIn.addr, align 8
store float* %tOut, float** %tOut.addr, align 8
store float %sdc, float* %sdc.addr, align 4
store i32 %nx, i32* %nx.addr, align 4
store i32 %ny, i32* %ny.addr, align 4
store i32 %nz, i32* %nz.addr, align 4
store float %ce, float* %ce.addr, align 4
store float %cw, float* %cw.addr, align 4
store float %cn, float* %cn.addr, align 4
store float %cs, float* %cs.addr, align 4
store float %ct, float* %ct.addr, align 4
store float %cb, float* %cb.addr, align 4
store float %cc, float* %cc.addr, align 4
; amb_temp = 80.0
store float 8.000000e+01, float* %amb_temp, align 4
; i = blockDim.x * blockIdx.x + threadIdx.x
%call = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
%call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
%mul = mul i32 %call, %call1
%call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
%add = add i32 %mul, %call2
store i32 %add, i32* %i, align 4
; j = blockDim.y * blockIdx.y + threadIdx.y
%call3 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #3
%call4 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #3
%mul5 = mul i32 %call3, %call4
%call6 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #3
%add7 = add i32 %mul5, %call6
store i32 %add7, i32* %j, align 4
; c = i + j * nx
%0 = load i32, i32* %i, align 4
%1 = load i32, i32* %j, align 4
%2 = load i32, i32* %nx.addr, align 4
%mul8 = mul nsw i32 %1, %2
%add9 = add nsw i32 %0, %mul8
store i32 %add9, i32* %c, align 4
; xy = nx * ny (element stride between consecutive z layers)
%3 = load i32, i32* %nx.addr, align 4
%4 = load i32, i32* %ny.addr, align 4
%mul10 = mul nsw i32 %3, %4
store i32 %mul10, i32* %xy, align 4
; W = (i == 0) ? c : c - 1
%5 = load i32, i32* %i, align 4
%cmp = icmp eq i32 %5, 0
br i1 %cmp, label %cond.true, label %cond.false
cond.true: ; preds = %entry
%6 = load i32, i32* %c, align 4
br label %cond.end
cond.false: ; preds = %entry
%7 = load i32, i32* %c, align 4
%sub = sub nsw i32 %7, 1
br label %cond.end
cond.end: ; preds = %cond.false, %cond.true
%cond = phi i32 [ %6, %cond.true ], [ %sub, %cond.false ]
store i32 %cond, i32* %W, align 4
; E = (i == nx - 1) ? c : c + 1
%8 = load i32, i32* %i, align 4
%9 = load i32, i32* %nx.addr, align 4
%sub11 = sub nsw i32 %9, 1
%cmp12 = icmp eq i32 %8, %sub11
br i1 %cmp12, label %cond.true13, label %cond.false14
cond.true13: ; preds = %cond.end
%10 = load i32, i32* %c, align 4
br label %cond.end16
cond.false14: ; preds = %cond.end
%11 = load i32, i32* %c, align 4
%add15 = add nsw i32 %11, 1
br label %cond.end16
cond.end16: ; preds = %cond.false14, %cond.true13
%cond17 = phi i32 [ %10, %cond.true13 ], [ %add15, %cond.false14 ]
store i32 %cond17, i32* %E, align 4
; N = (j == 0) ? c : c - nx
%12 = load i32, i32* %j, align 4
%cmp18 = icmp eq i32 %12, 0
br i1 %cmp18, label %cond.true19, label %cond.false20
cond.true19: ; preds = %cond.end16
%13 = load i32, i32* %c, align 4
br label %cond.end22
cond.false20: ; preds = %cond.end16
%14 = load i32, i32* %c, align 4
%15 = load i32, i32* %nx.addr, align 4
%sub21 = sub nsw i32 %14, %15
br label %cond.end22
cond.end22: ; preds = %cond.false20, %cond.true19
%cond23 = phi i32 [ %13, %cond.true19 ], [ %sub21, %cond.false20 ]
store i32 %cond23, i32* %N, align 4
; S = (j == ny - 1) ? c : c + nx
%16 = load i32, i32* %j, align 4
%17 = load i32, i32* %ny.addr, align 4
%sub24 = sub nsw i32 %17, 1
%cmp25 = icmp eq i32 %16, %sub24
br i1 %cmp25, label %cond.true26, label %cond.false27
cond.true26: ; preds = %cond.end22
%18 = load i32, i32* %c, align 4
br label %cond.end29
cond.false27: ; preds = %cond.end22
%19 = load i32, i32* %c, align 4
%20 = load i32, i32* %nx.addr, align 4
%add28 = add nsw i32 %19, %20
br label %cond.end29
cond.end29: ; preds = %cond.false27, %cond.true26
%cond30 = phi i32 [ %18, %cond.true26 ], [ %add28, %cond.false27 ]
store i32 %cond30, i32* %S, align 4
; temp1 = temp2 = tIn[c]
%21 = load float*, float** %tIn.addr, align 8
%22 = load i32, i32* %c, align 4
%idxprom = sext i32 %22 to i64
%arrayidx = getelementptr inbounds float, float* %21, i64 %idxprom
%23 = load float, float* %arrayidx, align 4
store float %23, float* %temp2, align 4
store float %23, float* %temp1, align 4
; temp3 = tIn[c + xy]
%24 = load float*, float** %tIn.addr, align 8
%25 = load i32, i32* %c, align 4
%26 = load i32, i32* %xy, align 4
%add31 = add nsw i32 %25, %26
%idxprom32 = sext i32 %add31 to i64
%arrayidx33 = getelementptr inbounds float, float* %24, i64 %idxprom32
%27 = load float, float* %arrayidx33, align 4
store float %27, float* %temp3, align 4
; First layer: accumulate the stencil sum term by term (see header formula).
%28 = load float, float* %cc.addr, align 4
%29 = load float, float* %temp2, align 4
%mul34 = fmul contract float %28, %29
%30 = load float, float* %cw.addr, align 4
%31 = load float*, float** %tIn.addr, align 8
%32 = load i32, i32* %W, align 4
%idxprom35 = sext i32 %32 to i64
%arrayidx36 = getelementptr inbounds float, float* %31, i64 %idxprom35
%33 = load float, float* %arrayidx36, align 4
%mul37 = fmul contract float %30, %33
%add38 = fadd contract float %mul34, %mul37
%34 = load float, float* %ce.addr, align 4
%35 = load float*, float** %tIn.addr, align 8
%36 = load i32, i32* %E, align 4
%idxprom39 = sext i32 %36 to i64
%arrayidx40 = getelementptr inbounds float, float* %35, i64 %idxprom39
%37 = load float, float* %arrayidx40, align 4
%mul41 = fmul contract float %34, %37
%add42 = fadd contract float %add38, %mul41
%38 = load float, float* %cs.addr, align 4
%39 = load float*, float** %tIn.addr, align 8
%40 = load i32, i32* %S, align 4
%idxprom43 = sext i32 %40 to i64
%arrayidx44 = getelementptr inbounds float, float* %39, i64 %idxprom43
%41 = load float, float* %arrayidx44, align 4
%mul45 = fmul contract float %38, %41
%add46 = fadd contract float %add42, %mul45
%42 = load float, float* %cn.addr, align 4
%43 = load float*, float** %tIn.addr, align 8
%44 = load i32, i32* %N, align 4
%idxprom47 = sext i32 %44 to i64
%arrayidx48 = getelementptr inbounds float, float* %43, i64 %idxprom47
%45 = load float, float* %arrayidx48, align 4
%mul49 = fmul contract float %42, %45
%add50 = fadd contract float %add46, %mul49
%46 = load float, float* %cb.addr, align 4
%47 = load float, float* %temp1, align 4
%mul51 = fmul contract float %46, %47
%add52 = fadd contract float %add50, %mul51
%48 = load float, float* %ct.addr, align 4
%49 = load float, float* %temp3, align 4
%mul53 = fmul contract float %48, %49
%add54 = fadd contract float %add52, %mul53
%50 = load float, float* %sdc.addr, align 4
%51 = load float*, float** %p.addr, align 8
%52 = load i32, i32* %c, align 4
%idxprom55 = sext i32 %52 to i64
%arrayidx56 = getelementptr inbounds float, float* %51, i64 %idxprom55
%53 = load float, float* %arrayidx56, align 4
%mul57 = fmul contract float %50, %53
%add58 = fadd contract float %add54, %mul57
%54 = load float, float* %ct.addr, align 4
%55 = load float, float* %amb_temp, align 4
%mul59 = fmul contract float %54, %55
%add60 = fadd contract float %add58, %mul59
; tOut[c] = accumulated sum
%56 = load float*, float** %tOut.addr, align 8
%57 = load i32, i32* %c, align 4
%idxprom61 = sext i32 %57 to i64
%arrayidx62 = getelementptr inbounds float, float* %56, i64 %idxprom61
store float %add60, float* %arrayidx62, align 4
; Advance c, W, E, N and S by xy (move to the next layer).
%58 = load i32, i32* %xy, align 4
%59 = load i32, i32* %c, align 4
%add63 = add nsw i32 %59, %58
store i32 %add63, i32* %c, align 4
%60 = load i32, i32* %xy, align 4
%61 = load i32, i32* %W, align 4
%add64 = add nsw i32 %61, %60
store i32 %add64, i32* %W, align 4
%62 = load i32, i32* %xy, align 4
%63 = load i32, i32* %E, align 4
%add65 = add nsw i32 %63, %62
store i32 %add65, i32* %E, align 4
%64 = load i32, i32* %xy, align 4
%65 = load i32, i32* %N, align 4
%add66 = add nsw i32 %65, %64
store i32 %add66, i32* %N, align 4
%66 = load i32, i32* %xy, align 4
%67 = load i32, i32* %S, align 4
%add67 = add nsw i32 %67, %66
store i32 %add67, i32* %S, align 4
; for (k = 1; k < nz - 1; ++k)
store i32 1, i32* %k, align 4
br label %for.cond
for.cond: ; preds = %for.inc, %cond.end29
%68 = load i32, i32* %k, align 4
%69 = load i32, i32* %nz.addr, align 4
%sub68 = sub nsw i32 %69, 1
%cmp69 = icmp slt i32 %68, %sub68
br i1 %cmp69, label %for.body, label %for.end
for.body: ; preds = %for.cond
; Slide the three-value window: temp1 = temp2; temp2 = temp3; temp3 = tIn[c + xy]
%70 = load float, float* %temp2, align 4
store float %70, float* %temp1, align 4
%71 = load float, float* %temp3, align 4
store float %71, float* %temp2, align 4
%72 = load float*, float** %tIn.addr, align 8
%73 = load i32, i32* %c, align 4
%74 = load i32, i32* %xy, align 4
%add70 = add nsw i32 %73, %74
%idxprom71 = sext i32 %add70 to i64
%arrayidx72 = getelementptr inbounds float, float* %72, i64 %idxprom71
%75 = load float, float* %arrayidx72, align 4
store float %75, float* %temp3, align 4
; Same stencil sum as for the first layer, for the current layer.
%76 = load float, float* %cc.addr, align 4
%77 = load float, float* %temp2, align 4
%mul73 = fmul contract float %76, %77
%78 = load float, float* %cw.addr, align 4
%79 = load float*, float** %tIn.addr, align 8
%80 = load i32, i32* %W, align 4
%idxprom74 = sext i32 %80 to i64
%arrayidx75 = getelementptr inbounds float, float* %79, i64 %idxprom74
%81 = load float, float* %arrayidx75, align 4
%mul76 = fmul contract float %78, %81
%add77 = fadd contract float %mul73, %mul76
%82 = load float, float* %ce.addr, align 4
%83 = load float*, float** %tIn.addr, align 8
%84 = load i32, i32* %E, align 4
%idxprom78 = sext i32 %84 to i64
%arrayidx79 = getelementptr inbounds float, float* %83, i64 %idxprom78
%85 = load float, float* %arrayidx79, align 4
%mul80 = fmul contract float %82, %85
%add81 = fadd contract float %add77, %mul80
%86 = load float, float* %cs.addr, align 4
%87 = load float*, float** %tIn.addr, align 8
%88 = load i32, i32* %S, align 4
%idxprom82 = sext i32 %88 to i64
%arrayidx83 = getelementptr inbounds float, float* %87, i64 %idxprom82
%89 = load float, float* %arrayidx83, align 4
%mul84 = fmul contract float %86, %89
%add85 = fadd contract float %add81, %mul84
%90 = load float, float* %cn.addr, align 4
%91 = load float*, float** %tIn.addr, align 8
%92 = load i32, i32* %N, align 4
%idxprom86 = sext i32 %92 to i64
%arrayidx87 = getelementptr inbounds float, float* %91, i64 %idxprom86
%93 = load float, float* %arrayidx87, align 4
%mul88 = fmul contract float %90, %93
%add89 = fadd contract float %add85, %mul88
%94 = load float, float* %cb.addr, align 4
%95 = load float, float* %temp1, align 4
%mul90 = fmul contract float %94, %95
%add91 = fadd contract float %add89, %mul90
%96 = load float, float* %ct.addr, align 4
%97 = load float, float* %temp3, align 4
%mul92 = fmul contract float %96, %97
%add93 = fadd contract float %add91, %mul92
%98 = load float, float* %sdc.addr, align 4
%99 = load float*, float** %p.addr, align 8
%100 = load i32, i32* %c, align 4
%idxprom94 = sext i32 %100 to i64
%arrayidx95 = getelementptr inbounds float, float* %99, i64 %idxprom94
%101 = load float, float* %arrayidx95, align 4
%mul96 = fmul contract float %98, %101
%add97 = fadd contract float %add93, %mul96
%102 = load float, float* %ct.addr, align 4
%103 = load float, float* %amb_temp, align 4
%mul98 = fmul contract float %102, %103
%add99 = fadd contract float %add97, %mul98
; tOut[c] = accumulated sum
%104 = load float*, float** %tOut.addr, align 8
%105 = load i32, i32* %c, align 4
%idxprom100 = sext i32 %105 to i64
%arrayidx101 = getelementptr inbounds float, float* %104, i64 %idxprom100
store float %add99, float* %arrayidx101, align 4
; Advance c, W, E, N and S by xy.
%106 = load i32, i32* %xy, align 4
%107 = load i32, i32* %c, align 4
%add102 = add nsw i32 %107, %106
store i32 %add102, i32* %c, align 4
%108 = load i32, i32* %xy, align 4
%109 = load i32, i32* %W, align 4
%add103 = add nsw i32 %109, %108
store i32 %add103, i32* %W, align 4
%110 = load i32, i32* %xy, align 4
%111 = load i32, i32* %E, align 4
%add104 = add nsw i32 %111, %110
store i32 %add104, i32* %E, align 4
%112 = load i32, i32* %xy, align 4
%113 = load i32, i32* %N, align 4
%add105 = add nsw i32 %113, %112
store i32 %add105, i32* %N, align 4
%114 = load i32, i32* %xy, align 4
%115 = load i32, i32* %S, align 4
%add106 = add nsw i32 %115, %114
store i32 %add106, i32* %S, align 4
br label %for.inc
for.inc: ; preds = %for.body
; ++k
%116 = load i32, i32* %k, align 4
%inc = add nsw i32 %116, 1
store i32 %inc, i32* %k, align 4
br label %for.cond
for.end: ; preds = %for.cond
; After the loop: temp1 = temp2; temp2 = temp3. temp3 is NOT reloaded here,
; so the ct term below reuses the same value as temp2.
%117 = load float, float* %temp2, align 4
store float %117, float* %temp1, align 4
%118 = load float, float* %temp3, align 4
store float %118, float* %temp2, align 4
%119 = load float, float* %cc.addr, align 4
%120 = load float, float* %temp2, align 4
%mul107 = fmul contract float %119, %120
%121 = load float, float* %cw.addr, align 4
%122 = load float*, float** %tIn.addr, align 8
%123 = load i32, i32* %W, align 4
%idxprom108 = sext i32 %123 to i64
%arrayidx109 = getelementptr inbounds float, float* %122, i64 %idxprom108
%124 = load float, float* %arrayidx109, align 4
%mul110 = fmul contract float %121, %124
%add111 = fadd contract float %mul107, %mul110
%125 = load float, float* %ce.addr, align 4
%126 = load float*, float** %tIn.addr, align 8
%127 = load i32, i32* %E, align 4
%idxprom112 = sext i32 %127 to i64
%arrayidx113 = getelementptr inbounds float, float* %126, i64 %idxprom112
%128 = load float, float* %arrayidx113, align 4
%mul114 = fmul contract float %125, %128
%add115 = fadd contract float %add111, %mul114
%129 = load float, float* %cs.addr, align 4
%130 = load float*, float** %tIn.addr, align 8
%131 = load i32, i32* %S, align 4
%idxprom116 = sext i32 %131 to i64
%arrayidx117 = getelementptr inbounds float, float* %130, i64 %idxprom116
%132 = load float, float* %arrayidx117, align 4
%mul118 = fmul contract float %129, %132
%add119 = fadd contract float %add115, %mul118
%133 = load float, float* %cn.addr, align 4
%134 = load float*, float** %tIn.addr, align 8
%135 = load i32, i32* %N, align 4
%idxprom120 = sext i32 %135 to i64
%arrayidx121 = getelementptr inbounds float, float* %134, i64 %idxprom120
%136 = load float, float* %arrayidx121, align 4
%mul122 = fmul contract float %133, %136
%add123 = fadd contract float %add119, %mul122
%137 = load float, float* %cb.addr, align 4
%138 = load float, float* %temp1, align 4
%mul124 = fmul contract float %137, %138
%add125 = fadd contract float %add123, %mul124
%139 = load float, float* %ct.addr, align 4
%140 = load float, float* %temp3, align 4
%mul126 = fmul contract float %139, %140
%add127 = fadd contract float %add125, %mul126
%141 = load float, float* %sdc.addr, align 4
%142 = load float*, float** %p.addr, align 8
%143 = load i32, i32* %c, align 4
%idxprom128 = sext i32 %143 to i64
%arrayidx129 = getelementptr inbounds float, float* %142, i64 %idxprom128
%144 = load float, float* %arrayidx129, align 4
%mul130 = fmul contract float %141, %144
%add131 = fadd contract float %add127, %mul130
%145 = load float, float* %ct.addr, align 4
%146 = load float, float* %amb_temp, align 4
%mul132 = fmul contract float %145, %146
%add133 = fadd contract float %add131, %mul132
; tOut[c] = accumulated sum for the last processed layer
%147 = load float*, float** %tOut.addr, align 8
%148 = load i32, i32* %c, align 4
%idxprom134 = sext i32 %148 to i64
%arrayidx135 = getelementptr inbounds float, float* %147, i64 %idxprom134
store float %add133, float* %arrayidx135, align 4
ret void
}
; Accessor __cuda_builtin_blockDim_t::__fetch_builtin_x(): returns the value of
; the ntid.x PTX special register.
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
ret i32 %0
}
; Accessor __cuda_builtin_blockIdx_t::__fetch_builtin_x(): returns the value of
; the ctaid.x PTX special register.
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
ret i32 %0
}
; Accessor __cuda_builtin_threadIdx_t::__fetch_builtin_x(): returns the value
; of the tid.x PTX special register.
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
ret i32 %0
}
; Accessor __cuda_builtin_blockDim_t::__fetch_builtin_y(): returns the value of
; the ntid.y PTX special register.
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
ret i32 %0
}
; Accessor __cuda_builtin_blockIdx_t::__fetch_builtin_y(): returns the value of
; the ctaid.y PTX special register.
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
ret i32 %0
}
; Accessor __cuda_builtin_threadIdx_t::__fetch_builtin_y(): returns the value
; of the tid.y PTX special register.
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
ret i32 %0
}
; Declarations of the NVVM intrinsics that read the PTX special registers used
; by the accessor functions above (ntid = block dimension, ctaid = block index,
; tid = thread index; x and y components only).
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #2
; Attribute groups: #0 = the optnone/noinline stubs and the kernel,
; #1 = the alwaysinline builtin accessors, #2 = the sreg intrinsics,
; #3 = call sites of the accessors inside the kernel.
attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind readnone }
attributes #3 = { convergent nounwind }
; Module-level named metadata.
!llvm.module.flags = !{!0, !1, !2}
!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
!llvm.ident = !{!8}
!nvvmir.version = !{!9}
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
; !3 marks hotspotOpt1 as a kernel entry point.
!3 = !{void (float*, float*, float*, float, i32, i32, i32, float, float, float, float, float, float, float)* @_Z11hotspotOpt1PfS_S_fiiifffffff, !"kernel", i32 1}
!4 = !{null, !"align", i32 8}
!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
!6 = !{null, !"align", i32 16}
!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
!9 = !{i32 1, i32 4}

File diff suppressed because one or more lines are too long

205
examples/hotspot3D/3D.cu Normal file
View File

@ -0,0 +1,205 @@
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>
/* BLOCK_SIZE and block_{x,y,z}_ are not referenced by the host code in this
 * file; presumably they configure the kernel launch in opt1.cu -- TODO confirm.
 */
#define BLOCK_SIZE 16
/* size of the line buffer used when reading and writing data files */
#define STR_SIZE 256
#define block_x_ 128
#define block_y_ 2
#define block_z_ 1
/* used in main() to derive max_slope and hence the time step dt */
#define MAX_PD (3.0e6)
/* required precision in degrees */
#define PRECISION 0.001
#define SPEC_HEAT_SI 1.75e6
#define K_SI 100
/* capacitance fitting factor */
#define FACTOR_CHIP 0.5
/* kernel and its host-side driver are textually included here */
#include "opt1.cu"
/* chip parameters */
float t_chip = 0.0005;
float chip_height = 0.016;
float chip_width = 0.016;
/* ambient temperature, assuming no package at all */
float amb_temp = 80.0;
/* Print `s` to stderr, prefixed with "Error: " and followed by a newline.
 * Despite its name this function does not terminate the program: control
 * returns to the caller, which keeps running. */
void fatal(const char *s) {
  fprintf(stderr, "Error: %s\n", s);
}
/* Read grid_rows * grid_cols * layers floating-point values, one per line,
 * from the text file `file` into `vect`.
 *
 * Values are consumed in (row, column, layer) order and stored at
 * vect[row * grid_cols + col + layer * grid_rows * grid_cols].
 *
 * Errors are reported through fatal(), which only logs. If the file cannot be
 * opened, or a line cannot be read, reading stops and the remaining entries
 * of `vect` are left untouched. A malformed line is reported and the previous
 * value (initially 0) is stored for that cell. */
void readinput(float *vect, int grid_rows, int grid_cols, int layers,
               char *file) {
  FILE *fp = fopen(file, "r");
  if (fp == NULL) {
    /* fatal() returns, so bail out here instead of reading a null stream. */
    fatal("The file was not opened");
    return;
  }

  char str[STR_SIZE];
  float val = 0.0f;
  for (int i = 0; i <= grid_rows - 1; i++)
    for (int j = 0; j <= grid_cols - 1; j++)
      for (int k = 0; k <= layers - 1; k++) {
        if (fgets(str, STR_SIZE, fp) == NULL) {
          /* Nothing was read: parsing `str` would use stale data. */
          fatal("Error reading file\n");
          fclose(fp);
          return;
        }
        /* Warning only: this also triggers when the final line of a complete
         * file has no trailing newline, so the line is still parsed. */
        if (feof(fp))
          fatal("not enough lines in file");
        if ((sscanf(str, "%f", &val) != 1))
          fatal("invalid file format");
        vect[i * grid_cols + j + k * grid_rows * grid_cols] = val;
      }
  fclose(fp);
}
/* Write the grid `vect` to the text file `file`, one "<index>\t<value>" line
 * per cell.
 *
 * Cells are emitted in (row, column, layer) order, reading
 * vect[row * grid_cols + col + layer * grid_rows * grid_cols]; <index> is a
 * running counter starting at 0. If the file cannot be opened, a message is
 * printed to stdout and nothing is written. */
void writeoutput(float *vect, int grid_rows, int grid_cols, int layers,
                 char *file) {
  FILE *fp = fopen(file, "w");
  if (fp == NULL) {
    printf("The file was not opened\n");
    return; /* writing to / closing a null stream would be undefined */
  }

  int index = 0;
  for (int i = 0; i < grid_rows; i++)
    for (int j = 0; j < grid_cols; j++)
      for (int k = 0; k < layers; k++) {
        fprintf(fp, "%d\t%g\n", index,
                vect[i * grid_cols + j + k * grid_rows * grid_cols]);
        index++;
      }
  fclose(fp);
}
/* CPU reference implementation of the 3D stencil update.
 *
 * For every cell c of the nx * ny * nz grid it computes
 *   tOut[c] = cc*tIn[c] + cn*tIn[n] + cs*tIn[s] + ce*tIn[e] + cw*tIn[w]
 *             + ct*tIn[t] + cb*tIn[b] + (dt/Cap)*pIn[c] + ct*amb_temp
 * where a neighbour index falls back to c itself on the grid boundary.
 *
 * The loop body always runs at least once, and repeats while fewer than
 * `numiter` passes have been done. The tIn/tOut pointers are swapped locally
 * after each pass, so the most recent result is in the caller's tOut buffer
 * after an odd number of passes and in the caller's tIn buffer after an even
 * number. */
void computeTempCPU(float *pIn, float *tIn, float *tOut, int nx, int ny, int nz,
                    float Cap, float Rx, float Ry, float Rz, float dt,
                    int numiter) {
  const float stepDivCap = dt / Cap;
  float ce, cw, cn, cs, ct, cb;
  ce = cw = stepDivCap / Rx;
  cn = cs = stepDivCap / Ry;
  ct = cb = stepDivCap / Rz;
  const float cc = 1.0 - (2.0 * ce + 2.0 * cn + 3.0 * ct);

  int pass = 0;
  do {
    for (int z = 0; z < nz; z++) {
      for (int y = 0; y < ny; y++) {
        for (int x = 0; x < nx; x++) {
          const int c = x + y * nx + z * nx * ny;
          /* Neighbour indices, clamped to the cell itself at the borders. */
          const int w = (x == 0) ? c : c - 1;
          const int e = (x == nx - 1) ? c : c + 1;
          const int n = (y == 0) ? c : c - nx;
          const int s = (y == ny - 1) ? c : c + nx;
          const int b = (z == 0) ? c : c - nx * ny;
          const int t = (z == nz - 1) ? c : c + nx * ny;
          tOut[c] = tIn[c] * cc + tIn[n] * cn + tIn[s] * cs + tIn[e] * ce +
                    tIn[w] * cw + tIn[t] * ct + tIn[b] * cb +
                    (dt / Cap) * pIn[c] + ct * amb_temp;
        }
      }
    }
    /* The output of this pass becomes the input of the next one. */
    float *previous = tIn;
    tIn = tOut;
    tOut = previous;
    pass++;
  } while (pass < numiter);
}
/* Root-mean-square difference between the first `len` elements of arr1 and
 * arr2: sqrt(sum((arr1[i] - arr2[i])^2) / len). */
float accuracy(float *arr1, float *arr2, int len) {
  float sumSquares = 0.0;
  for (int idx = 0; idx < len; ++idx) {
    const float diff = arr1[idx] - arr2[idx];
    sumSquares += diff * diff;
  }
  return (float)sqrt(sumSquares / len);
}
/* Print the command-line synopsis and a description of every argument to
 * stderr, then terminate the process with exit status 1.
 * `argv[0]` is used as the program name; `argc` is unused. */
void usage(int argc, char **argv) {
  fprintf(stderr,
          "Usage: %s <rows/cols> <layers> <iterations> <powerFile> <tempFile> "
          "<outputFile>\n",
          argv[0]);
  fprintf(
      stderr,
      "\t<rows/cols> - number of rows/cols in the grid (positive integer)\n");
  fprintf(stderr,
          "\t<layers> - number of layers in the grid (positive integer)\n");
  /* Placeholder names below match the synopsis line exactly. */
  fprintf(stderr, "\t<iterations> - number of iterations\n");
  fprintf(stderr, "\t<powerFile> - name of the file containing the initial "
                  "power values of each cell\n");
  fprintf(stderr, "\t<tempFile> - name of the file containing the initial "
                  "temperature values of each cell\n");
  fprintf(stderr, "\t<outputFile> - output file\n");
  exit(1);
}
int main(int argc, char **argv) {
cudaSetDevice(0);
if (argc != 7) {
usage(argc, argv);
}
char *pfile, *tfile, *ofile;
int iterations = atoi(argv[3]);
pfile = argv[4];
tfile = argv[5];
ofile = argv[6];
int numCols = atoi(argv[1]);
int numRows = atoi(argv[1]);
int layers = atoi(argv[2]);
/* calculating parameters*/
float dx = chip_height / numRows;
float dy = chip_width / numCols;
float dz = t_chip / layers;
float Cap = FACTOR_CHIP * SPEC_HEAT_SI * t_chip * dx * dy;
float Rx = dy / (2.0 * K_SI * t_chip * dx);
float Ry = dx / (2.0 * K_SI * t_chip * dy);
float Rz = dz / (K_SI * dx * dy);
float max_slope = MAX_PD / (FACTOR_CHIP * t_chip * SPEC_HEAT_SI);
float dt = PRECISION / max_slope;
float *powerIn, *tempOut, *tempIn, *tempCopy;
int size = numCols * numRows * layers;
powerIn = (float *)calloc(size, sizeof(float));
tempCopy = (float *)malloc(size * sizeof(float));
tempIn = (float *)calloc(size, sizeof(float));
tempOut = (float *)calloc(size, sizeof(float));
float *answer = (float *)calloc(size, sizeof(float));
readinput(powerIn, numRows, numCols, layers, pfile);
readinput(tempIn, numRows, numCols, layers, tfile);
memcpy(tempCopy, tempIn, size * sizeof(float));
hotspot_opt1(powerIn, tempIn, tempOut, numCols, numRows, layers, Cap, Rx, Ry,
Rz, dt, iterations);
computeTempCPU(powerIn, tempCopy, answer, numCols, numRows, layers, Cap, Rx,
Ry, Rz, dt, iterations);
float acc = accuracy(tempOut, answer, numRows * numCols * layers);
printf("Accuracy: %e\n", acc);
writeoutput(tempOut, numRows, numCols, layers, ofile);
free(tempIn);
free(tempOut);
free(powerIn);
return 0;
}

22
examples/hotspot3D/run.sh Normal file
View File

@ -0,0 +1,22 @@
#!/bin/bash
# Build the hotspot3D example from its pre-generated LLVM IR, run it on the
# Rodinia input data, and check the output for a known reference value.
# Must be executed from the example directory (all paths are relative).
set -e
# Assemble the textual IR (.ll) into bitcode (.bc).
llvm-as 3D-cuda-nvptx64-nvidia-cuda-sm_61.ll
llvm-as 3D-host-x86_64-unknown-linux-gnu.ll
# Run the kernel/host translators on the bitcode.
../../build/compilation/kernelTranslator 3D-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
../../build/compilation/hostTranslator 3D-host-x86_64-unknown-linux-gnu.bc host.bc
# Lower both translated modules to position-independent object files.
llc --relocation-model=pic --filetype=obj kernel.bc
llc --relocation-model=pic --filetype=obj host.bc
# Link against the x86 runtime and its thread pool.
g++ -g -Wall -L../../build/runtime -L../../build/runtime/threadPool -o 3D \
  -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread
export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH
# Arguments: <rows/cols> <layers> <iterations> <powerFile> <tempFile> <outputFile>
./3D 512 8 100 ../../rodinia-data/hotspot3D/power_512x8 ../../rodinia-data/hotspot3D/temp_512x8 output.out
# The run passes when the reference value shows up in the first lines of output.
if head output.out | grep -q "334.017"; then
  echo "Pass"
else
  echo "Error result"
  exit 1
fi

View File

@ -0,0 +1,24 @@
#ifndef _COMPARISON_HELPERS_H_
#define _COMPARISON_HELPERS_H_
#include <stdio.h>
/// Compare the first `size` elements of `data1` and `data2`.
///
/// Every mismatching position is printed. If all elements match, prints a
/// PASS message and returns 0. Otherwise prints a FAIL message and terminates
/// the process with exit status 1 (the function does not return in that case).
///
/// NOTE(review): element values are printed with "%d", which is only correct
/// when T is int-compatible -- confirm against the callers' element types.
template <typename T>
inline int compare_vectors(T *data1, T *data2, unsigned int size) {
  printf("Comparing vectors: \n");
  bool match = true;
  for (unsigned int i = 0; i < size; i++)
    if (data1[i] != data2[i]) {
      match = false;
      // The second value comes from data2; the index is unsigned, hence %u.
      printf("Diff: data1[%u]=%d, data2[%u]=%d.\n", i, data1[i], i, data2[i]);
    }
  if (match) {
    printf("PASS! vectors are matching!\n");
    return 0;
  } else {
    printf("FAIL! vectors are NOT matching!\n");
    exit(1);
    return -1; // not reached; kept so every path has a return statement
  }
}
#endif

View File

@ -0,0 +1,116 @@
#include "stdafx.h"
#include "cpuencode.h"
#include "print_helpers.h"
using namespace std;
#if 1
// Variable-length encode `num_elements` 32-bit words on the CPU.
//
// Each input word is split into four byte symbols, most significant byte
// first. For every symbol, the low codewordlens[symbol] bits of
// codewords[symbol] are appended MSB-first to a bitstream packed into the
// 32-bit words of `outdata`. On return, *outsize holds the number of bytes
// used, rounded up to a whole byte.
//
// The max. codeword length for each byte symbol is 32 bits.
//
// NOTE: whenever an output word is filled completely, the following word is
// zeroed, so `outdata` must have room for one word past the last full one.
extern "C" void cpu_vlc_encode(unsigned int *indata, unsigned int num_elements,
                               unsigned int *outdata, unsigned int *outsize,
                               unsigned int *codewords,
                               unsigned int *codewordlens) {
  unsigned int *bitstreamPt = outdata; /* word currently being filled */
  *bitstreamPt = 0x00000000U;
  unsigned int startbit = 0;   /* number of bits already used in *bitstreamPt */
  unsigned int totalBytes = 0; /* bytes in completely filled words */

  for (unsigned int k = 0; k < num_elements; k++) {
    const unsigned int val32 = indata[k];
    for (unsigned int i = 0; i < 4; i++) {
      const unsigned char symbol = (unsigned char)(val32 >> (8 * (3 - i)));
      const unsigned int cw32 = codewords[symbol];
      unsigned int numbits = codewordlens[symbol];
      while (numbits > 0) {
        const unsigned int room = 32 - startbit;
        const unsigned int writebits = (numbits < room) ? numbits : room;
        unsigned int mask32;
        if (numbits == writebits) {
          // Everything that is left fits: keep only the low `numbits` bits and
          // left-align them behind the bits already written. A plain
          // (1 << numbits) is undefined for a full 32-bit codeword, so that
          // case gets an explicit all-ones mask.
          const unsigned int keep =
              (numbits >= 32) ? 0xFFFFFFFFU : ((1U << numbits) - 1U);
          mask32 = (cw32 & keep) << (32 - startbit - numbits);
        } else {
          // Only the top `writebits` bits fit; shift out the rest for now.
          mask32 = cw32 >> (numbits - writebits);
        }
        *bitstreamPt = (*bitstreamPt) | mask32;
        numbits = numbits - writebits;
        startbit = (startbit + writebits) % 32;
        if (startbit == 0) {
          // Current word is full: move on and clear the next one.
          bitstreamPt++;
          *bitstreamPt = 0x00000000;
          totalBytes += 4;
        }
      }
    }
  }
  totalBytes += (startbit / 8) +
                ((startbit % 8 == 0) ? 0 : 1); // return aligned to 8-bits
  *outsize = totalBytes;
}
//////////////////////////////////////////////////////////////////////
/// ALTERNATIVE CODER
/// ASSUMPTION: The max. length of 4 combined codewords can be 2x original data,
/// i.e. g 64 bits
///////////////////////////////////////////////////////////////////////
#else
// Alternative encoder (compiled only when the `#if 1` above is switched off).
//
// The four codewords of one input word are first concatenated into a single
// 64-bit value (first symbol in the most significant position) and then
// written to the bitstream in at most 32-bit pieces. The combined length of
// the four codewords must therefore not exceed 64 bits.
//
// On return, *outsize holds the number of bytes used, rounded up to a whole
// byte. As in the primary encoder, the word following a completely filled
// output word is zeroed, so `outdata` needs one spare word.
extern "C" void cpu_vlc_encode(unsigned int *indata, unsigned int num_elements,
                               unsigned int *outdata, unsigned int *outsize,
                               unsigned int *codewords,
                               unsigned int *codewordlens) {
  unsigned int *bitstreamPt = outdata; /* word currently being filled */
  // assume memset is done.
  *bitstreamPt = 0x00000000U;
  unsigned int startbit = 0;   /* number of bits already used in *bitstreamPt */
  unsigned int totalBytes = 0; /* bytes in completely filled words */

  for (unsigned int k = 0; k < num_elements; k++) {
    unsigned long long cw64 = 0;
    const unsigned int val32 = indata[k];
    unsigned int numbits = 0;
    // Concatenate the codewords of the four byte symbols, MSB symbol first.
    for (unsigned int i = 0; i < 4; i++) {
      const unsigned char symbol = (unsigned char)(val32 >> (8 * (3 - i)));
      cw64 = (cw64 << codewordlens[symbol]) | codewords[symbol];
      numbits += codewordlens[symbol];
    }
    while (numbits > 0) {
      const unsigned int room = 32 - startbit;
      const unsigned int writebits = (numbits < room) ? numbits : room;
      unsigned int mask32;
      if (numbits == writebits) {
        // The remaining bits fit: left-align them behind the written bits.
        const unsigned int temp32 = (unsigned int)cw64;
        mask32 = temp32 << (32 - startbit - numbits);
      } else {
        // Emit the top `writebits` bits and keep the rest for the next round.
        // The mask must be built in 64 bits: the number of leftover bits can
        // reach 32 or more, which is undefined for an int shift.
        const unsigned int leftover = numbits - writebits;
        mask32 = (unsigned int)(cw64 >> leftover);
        cw64 = cw64 & ((1ULL << leftover) - 1ULL);
      }
      *bitstreamPt = (*bitstreamPt) | mask32;
      numbits = numbits - writebits;
      startbit = (startbit + writebits) % 32;
      if (startbit == 0) {
        // Current word is full: move on and clear the next one.
        bitstreamPt++;
        *bitstreamPt = 0x00000000;
        totalBytes += 4;
      }
    }
  }
  totalBytes += (startbit / 8) +
                ((startbit % 8 == 0) ? 0 : 1); // return aligned to 8-bits
  *outsize = totalBytes;
}
#endif

View File

@ -0,0 +1,8 @@
#ifndef _CE_H_
#define _CE_H_
// CPU variable-length encoder, declared with C linkage.
// Arguments, as named here: input words and their count, output buffer,
// out-parameter receiving the output size, and the per-symbol codeword and
// codeword-length tables. See the definition for the exact bit layout.
extern "C" void cpu_vlc_encode(unsigned int *indata, unsigned int num_elements,
                               unsigned int *outdata, unsigned int *outsize,
                               unsigned int *codewords,
                               unsigned int *codewordlens);
#endif

Some files were not shown because too many files have changed in this diff Show More