diff --git a/CMakeLists.txt b/CMakeLists.txt index 7b08bef..f1a9d56 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,3 +39,4 @@ set(GCC_COVERAGE_LINK_FLAGS "-L${LLVM_LIB_PATH} ${LLVM_LINK_FLAG} -lz -lrt -ldl -ltinfo -lpthread -lm") add_subdirectory(compilation) +add_subdirectory(runtime) diff --git a/docs/CONTRIBUTING.md b/CONTRIBUTING.md similarity index 72% rename from docs/CONTRIBUTING.md rename to CONTRIBUTING.md index 0284fbc..90d3f91 100644 --- a/docs/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,6 +1,6 @@ -# Contributing to CuPBoP +# Contributing to COX -Thank you for your interest in contributing to CuPBoP! +Thank you for your interest in contributing to COX! We appreciate all contributions, including but not limited to: - Add documentation @@ -10,9 +10,9 @@ We appreciate all contributions, including but not limited to: ## How to contribute? 0. (Optional) Open an issue and discuss your idea before start -1. Fork the latest version CuPBoP +1. Fork the latest version COX 2. Commit to the forked repo -3. Create a Pull Request to CuPBoP main branch +3. Create a Pull Request to COX main branch ## Code style @@ -21,15 +21,14 @@ To make sure your contribution is following the correct style, we highly recommend you to install [pre-commit](https://pre-commit.com/) before development. ```bash -# Python3 environment is required +# Python environment is required pip install pre-commit ``` Then, from the repository folder, execute the following instruction: ```bash -# execute in CuPBoP's root folder -pre-commit install + pre-commit install ``` With pre-commit plugin, each local commit will be automatically checked. 
diff --git a/README.md b/README.md index 013d9f7..34461f1 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,10 @@ -# CuPBoP: Cuda for Parallelized and Broad-range Processors +# COX: CUDA on X86 ## Introduction -CuPBoP (Cuda for parallelized and broad-range processors) is a framework -aims to execute CUDA source code on non-NVIDIA devices, -including CPU, GPU and other architectures. +This project consists of two parts: a series of LLVM passes that +achieve a SPMD NVVM IR as input, and output the corresponding +MPMD+SIMD version of LLVM IR which can be execute on CPU devices. ## Install @@ -22,8 +22,8 @@ including CPU, GPU and other architectures. 1. Clone from github ```bash - git clone https://github.com/cupbop/CuPBoP - cd CuPBoP + git clone https://github.com/drcut/open_source_template + cd open_source_template ``` 2. Build the transformer for NVVM IR to LLVM IR for X86 @@ -55,12 +55,8 @@ g++ ../compilation/examples/vecadd/host.cpp \ ./vecadd_example ``` -## Contribution - -We sincerely appreciate all kinds of contributions. -Please refer to [CONTRIBUTING](docs/CONTRIBUTING.md) for the contributing guideline. - ## Author -* [Ruobing Han](https://drcut.github.io/) -* [Hyesoon Kim](https://www.cc.gatech.edu/~hyesoon/) +[Ruobing Han](https://drcut.github.io/) is a CS phd student in +Georgia Institute Technology, under the supervision +of Prof. [Hyesoon Kim](https://www.cc.gatech.edu/~hyesoon/). 
diff --git a/compilation/HostTranslation.cpp b/compilation/HostTranslation.cpp index 9695b56..7d25df1 100644 --- a/compilation/HostTranslation.cpp +++ b/compilation/HostTranslation.cpp @@ -1,25 +1,43 @@ -#include "ReplaceKernelLaunch.h" +#include "RemoveCudaBuiltin.h" +#include "ReplaceConstantMemory.h" +#include "ReplaceCudaBuiltin.h" +#include "ReplaceKernelArgs.h" #include "tool.h" #include "llvm/IR/Module.h" #include "llvm/IR/Verifier.h" #include +#include #include #include using namespace llvm; +std::string PATH = "kernel_meta.log"; + int main(int argc, char **argv) { assert(argc == 3 && "incorrect number of arguments\n"); char *input_host_path = argv[1]; char *output_host_path = argv[2]; + std::ifstream fin; + fin.open(PATH); + // load LLVM module(s) llvm::Module *hostModule = LoadModuleFromFilr(input_host_path); VerifyModule(hostModule); + // replace const memory + ReplaceConstantMemory(hostModule, fin); // process host module - ReplaceKernelLaunch(hostModule); + ReplaceCudaBuiltin(hostModule); + // remove builtin unuse functions and variables + RemoveCudaBuiltin(hostModule); + // replace arguments in kernel_arg, from alloc to malloc + ReplaceKernelArg(hostModule); + VerifyModule(hostModule); DumpModule(hostModule, output_host_path); + + fin.close(); return 0; } diff --git a/compilation/HostTranslation/include/ReplaceKernelLaunch.h b/compilation/HostTranslation/include/RemoveCudaBuiltin.h similarity index 60% rename from compilation/HostTranslation/include/ReplaceKernelLaunch.h rename to compilation/HostTranslation/include/RemoveCudaBuiltin.h index 769489c..09bb5da 100644 --- a/compilation/HostTranslation/include/ReplaceKernelLaunch.h +++ b/compilation/HostTranslation/include/RemoveCudaBuiltin.h @@ -1,11 +1,11 @@ -#ifndef __NVVM2x86_REPLACE_KERNEL_LAUNCH__ -#define __NVVM2x86_REPLACE_KERNEL_LAUNCH__ +#ifndef __NVVM2x86_REMOVE_CUDABUILTIN__ +#define __NVVM2x86_REMOVE_CUDABUILTIN__ #include "llvm/IR/Module.h" /* * Change to i8* bitcast (i8* (i8*)* 
@_Z9vecPKiS0_Pii_wrapper to i8*) * Original: i8* bitcast (void (i32*, i32*, i32*, i32)* @_Z9vecPKiS0_Pii to i8*) */ -void ReplaceKernelLaunch(llvm::Module *M); +void RemoveCudaBuiltin(llvm::Module *M); #endif diff --git a/compilation/HostTranslation/include/ReplaceConstantMemory.h b/compilation/HostTranslation/include/ReplaceConstantMemory.h new file mode 100644 index 0000000..0a43533 --- /dev/null +++ b/compilation/HostTranslation/include/ReplaceConstantMemory.h @@ -0,0 +1,12 @@ +#ifndef __NVVM2x86_REPLACE_CONSTANT_MEMORY__ +#define __NVVM2x86_REPLACE_CONSTANT_MEMORY__ + +#include "llvm/IR/Module.h" +#include +/* + * From: @ff_variable = internal global [5 x float] undef, align 16 + * To: @wrapper_global_ff_variable = common global [5 x float] zeroinitializer + */ +void ReplaceConstantMemory(llvm::Module *M, std::ifstream &fin); + +#endif diff --git a/compilation/HostTranslation/include/ReplaceCudaBuiltin.h b/compilation/HostTranslation/include/ReplaceCudaBuiltin.h new file mode 100644 index 0000000..943a0f5 --- /dev/null +++ b/compilation/HostTranslation/include/ReplaceCudaBuiltin.h @@ -0,0 +1,11 @@ +#ifndef __NVVM2x86_REPLACE_CUDA_BUILTIN__ +#define __NVVM2x86_REPLACE_CUDA_BUILTIN__ + +#include "llvm/IR/Module.h" +/* + * Change to i8* bitcast (i8* (i8*)* @_Z9vecPKiS0_Pii_wrapper to i8*) + * Original: i8* bitcast (void (i32*, i32*, i32*, i32)* @_Z9vecPKiS0_Pii to i8*) + */ +void ReplaceCudaBuiltin(llvm::Module *M); + +#endif diff --git a/compilation/HostTranslation/include/ReplaceKernelArgs.h b/compilation/HostTranslation/include/ReplaceKernelArgs.h new file mode 100644 index 0000000..d9fcb6b --- /dev/null +++ b/compilation/HostTranslation/include/ReplaceKernelArgs.h @@ -0,0 +1,14 @@ +#ifndef __NVVM2x86_REPLACE_KERNEL_ARGS__ +#define __NVVM2x86_REPLACE_KERNEL_ARGS__ + +#include "llvm/IR/Module.h" +/* + * before: + * %m_cuda.addr = alloca float*, align 8 + * after: + * %m_cuda.addr_tmp = call i8* @malloc(i64 8) + * %m_cuda.addr = bitcast i8* %m_cuda.addr_tmp to 
float** + */ +void ReplaceKernelArg(llvm::Module *M); + +#endif diff --git a/compilation/HostTranslation/lib/GenerateHostStub.cpp b/compilation/HostTranslation/lib/GenerateHostStub.cpp new file mode 100644 index 0000000..dc13bb1 --- /dev/null +++ b/compilation/HostTranslation/lib/GenerateHostStub.cpp @@ -0,0 +1,7 @@ +/** + * Generate a file for Cuda Kernel Function Attributes + * + * + * + * + */ diff --git a/compilation/HostTranslation/lib/InitializeDevice.cpp b/compilation/HostTranslation/lib/InitializeDevice.cpp new file mode 100644 index 0000000..bf1a435 --- /dev/null +++ b/compilation/HostTranslation/lib/InitializeDevice.cpp @@ -0,0 +1,6 @@ +/* + + Initialize the cudaDevice as first statements if not set by the User + (cudaSetDevice) + +*/ diff --git a/compilation/HostTranslation/lib/RemoveCudaBuiltin.cpp b/compilation/HostTranslation/lib/RemoveCudaBuiltin.cpp new file mode 100644 index 0000000..bcd747c --- /dev/null +++ b/compilation/HostTranslation/lib/RemoveCudaBuiltin.cpp @@ -0,0 +1,59 @@ +/** + * Remove Clang cuda builtin functions and variables + */ +#include "RemoveCudaBuiltin.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/ToolOutputFile.h" +#include +#include +#include + +using namespace llvm; + +void RemoveCudaBuiltin(llvm::Module *M) { + + std::set need_remove; + + if (GlobalVariable *gv = M->getGlobalVariable("llvm.global_ctors")) { + gv->dropAllReferences(); + gv->eraseFromParent(); + } + Function *c_tor = NULL; + if (c_tor = M->getFunction("__cuda_module_ctor")) { + c_tor->dropAllReferences(); + c_tor->eraseFromParent(); + } + if (c_tor = M->getFunction("__cuda_module_dtor")) { + c_tor->dropAllReferences(); + c_tor->eraseFromParent(); + } + if (c_tor = M->getFunction("__cuda_register_globals")) { + + c_tor->dropAllReferences(); + c_tor->eraseFromParent(); + } + for 
(Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { + Function *F = &(*i); + auto func_name = F->getName().str(); + + if (func_name == "__cuda_module_dtor" || + func_name == "__cuda_register_globals" || + func_name == "__cudaRegisterFunction" || + func_name == "__cudaRegisterVar" || + func_name == "__cudaRegisterFatBinary" || + func_name == "__cuda_module_ctor" || + func_name == "__cudaRegisterFatBinaryEnd" || + func_name == "__cudaUnregisterFatBinary") { + need_remove.insert(F); + } + } + for (auto f : need_remove) { + f->dropAllReferences(); + f->eraseFromParent(); + } +} diff --git a/compilation/HostTranslation/lib/ReplaceConstantMemory.cpp b/compilation/HostTranslation/lib/ReplaceConstantMemory.cpp new file mode 100644 index 0000000..48b0a0f --- /dev/null +++ b/compilation/HostTranslation/lib/ReplaceConstantMemory.cpp @@ -0,0 +1,93 @@ +#include "ReplaceConstantMemory.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include +#include +#include +#include +#include + +using namespace llvm; + +void ReplaceConstantMemory(llvm::Module *M, std::ifstream &fin) { + std::string s; + bool find_constant_memory = false; + while (getline(fin, s)) { + if (s.find("ConstMemory2GlobalMemory") != std::string::npos) { + find_constant_memory = true; + break; + } + } + if (!find_constant_memory) { + assert(0 && "Do not find constant to global mapping\n"); + } + + std::map corresponding_global_memory; + while (getline(fin, s)) { + if (s.find("END") != std::string::npos) { + break; + } + // get constant name + size_t pos = 0; + pos = s.find(' '); + std::string constant_name = s.substr(0, pos); + s.erase(0, pos + 1); + // get mapped global name + std::string global_name = s.substr(3, s.length() - 1); + corresponding_global_memory.insert( + std::pair(constant_name, global_name)); + } + + std::set need_remove_constant_memory; + // find all constant memory and 
generate corresponding global memory + for (auto I = M->global_begin(), E = M->global_end(); I != E; ++I) { + if (auto constant_memory = dyn_cast(I)) { + if (corresponding_global_memory.find(constant_memory->getName().str()) != + corresponding_global_memory.end()) { + auto global_name = + corresponding_global_memory.find(constant_memory->getName().str()) + ->second; + // create a new global variable + if (auto PT = dyn_cast(I->getType())) { + need_remove_constant_memory.insert(constant_memory); + // generate the corresponding global memory variable + auto element_type = PT->getElementType(); + if (auto array_type = dyn_cast(element_type)) { + llvm::GlobalVariable *global_memory = new llvm::GlobalVariable( + *M, array_type, false, llvm::GlobalValue::CommonLinkage, NULL, + global_name, NULL, llvm::GlobalValue::NotThreadLocal, 0); + + llvm::ConstantAggregateZero *const_array = + llvm::ConstantAggregateZero::get(array_type); + global_memory->setInitializer(const_array); + constant_memory->replaceAllUsesWith( + llvm::ConstantExpr::getPointerCast( + global_memory, + cast(constant_memory->getType()))); + } else if (element_type->isStructTy()) { + llvm::GlobalVariable *global_memory = new llvm::GlobalVariable( + *M, element_type, false, llvm::GlobalValue::CommonLinkage, NULL, + global_name, NULL, llvm::GlobalValue::NotThreadLocal, 0); + llvm::ConstantAggregateZero *const_array = + llvm::ConstantAggregateZero::get(element_type); + global_memory->setInitializer(const_array); + constant_memory->replaceAllUsesWith( + llvm::ConstantExpr::getPointerCast( + global_memory, + cast(constant_memory->getType()))); + } else { + assert(0 && "The required Constant Memory Type is not supported\n"); + } + } + } + } + } + for (auto i : need_remove_constant_memory) { + i->dropAllReferences(); + i->eraseFromParent(); + } + return; +} diff --git a/compilation/HostTranslation/lib/ReplaceCudaBuiltin.cpp b/compilation/HostTranslation/lib/ReplaceCudaBuiltin.cpp new file mode 100644 index 
0000000..01a34b6 --- /dev/null +++ b/compilation/HostTranslation/lib/ReplaceCudaBuiltin.cpp @@ -0,0 +1,292 @@ +#include "ReplaceCudaBuiltin.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/ToolOutputFile.h" +#include +#include +#include + +using namespace llvm; + +/* +insert sync after cudaKernel launch + call void @_Z13staticReversePii(i32* %55, i32 64) + %57 = call i32 @cudaDeviceSynchronize() +*/ +void InsertSyncAfterKernelLaunch(llvm::Module *M) { + LLVMContext *C = &M->getContext(); + + llvm::Type *Int32T = Type::getInt32Ty(*C); + llvm::FunctionType *LauncherFuncT = FunctionType::get(Int32T, NULL); + llvm::FunctionCallee _f = + M->getOrInsertFunction("cudaDeviceSynchronize", LauncherFuncT); + llvm::Function *func_launch = llvm::cast(_f.getCallee()); + std::set launch_function_name; + for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { + Function *F = &(*i); + auto func_name = F->getName().str(); + + for (Function::iterator b = F->begin(); b != F->end(); ++b) { + BasicBlock *B = &(*b); + + for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) { + Instruction *inst = &(*i); + if (llvm::CallBase *callInst = llvm::dyn_cast(inst)) { + if (Function *calledFunction = callInst->getCalledFunction()) { + if (calledFunction->getName().startswith("cudaLaunchKernel")) { + // F is a kernel launch function + launch_function_name.insert(func_name); + } + } + } + } + } + } + for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { + Function *F = &(*i); + for (Function::iterator b = F->begin(); b != F->end(); ++b) { + BasicBlock *B = &(*b); + + for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) { + Instruction *inst = &(*i); + if (llvm::CallBase *callInst = llvm::dyn_cast(inst)) { + if (Function *calledFunction = callInst->getCalledFunction()) { + if 
(launch_function_name.find(calledFunction->getName().str()) != + launch_function_name.end()) { + // insert a sync after launch + if (callInst->getNextNonDebugInstruction()) { + llvm::CallInst::Create(func_launch, "", + callInst->getNextNonDebugInstruction()); + } + } + } + } + } + } + } +} + +// Change to i8* bitcast (i8* (i8*)* @_Z9vecPKiS0_Pii_wrapper to i8*) +// Original: i8* bitcast (void (i32*, i32*, i32*, i32)* @_Z9vecPKiS0_Pii to i8*) +void ReplaceKernelLaunch(llvm::Module *M) { + LLVMContext &context = M->getContext(); + auto VoidTy = llvm::Type::getVoidTy(context); + auto I8 = llvm::Type::getInt8PtrTy(context); + std::map kernels; + + std::set need_remove; + LLVMContext *C = &M->getContext(); + + llvm::Type *Int32T = Type::getInt32Ty(*C); + llvm::Type *Int8T = Type::getInt8Ty(*C); + + llvm::FunctionType *LauncherFuncT = + FunctionType::get(Type::getVoidTy(*C), NULL); + + llvm::FunctionType *LaunchFun2 = + FunctionType::get(PointerType::get(PointerType::get(Int32T, 0), 0), NULL); + + bool done = false; + + std::set cuda_register_kernel_names; + + std::string str; + llvm::raw_string_ostream ss(str); + + /* + + When using << >>, clang generates cudaPushCallConfiguration with the same + function definition as the kernel definition in the kernel bitcode + + define internal void @__cuda_register_globals(i8** %0) { + entry: + %1 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void (float*, + float*, i32, i32)* @_Z4Fan1PfS_ii to i8*), i8* getelementptr inbounds ([14 x + i8], [14 x i8]* @0, i64 0, i64 0), i8* getelementptr inbounds ([14 x i8], [14 + x i8]* @0, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* + null) %2 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void + (float*, float*, float*, i32, i32, i32)* @_Z4Fan2PfS_S_iii to i8*), i8* + getelementptr inbounds ([17 x i8], [17 x i8]* @1, i64 0, i64 0), i8* + getelementptr inbounds ([17 x i8], [17 x i8]* @1, i64 0, i64 0), i32 -1, i8* + null, i8* null, i8* null, i8* null, 
i32* null) ret void + } + + */ + Function *f_register_global = M->getFunction("__cuda_register_globals"); + if (f_register_global) { + for (Function::iterator b = f_register_global->begin(); + b != f_register_global->end(); ++b) { + BasicBlock *B = &(*b); + for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) { + Instruction *inst = &(*i); + if (llvm::CallInst *callInst = llvm::dyn_cast(inst)) { + if (Function *calledFunction = callInst->getCalledFunction()) { + if (calledFunction->getName().str() == "__cudaRegisterFunction") { + Value *callOperand = callInst->getArgOperand(1); + + Function *functionOperand = + dyn_cast(callInst->getArgOperand(1)); + + // call function is wrapped in a bitcast + if (functionOperand == NULL) { + + std::vector arg_sizes; + functionOperand = + dyn_cast(callOperand->stripPointerCasts()); + + cuda_register_kernel_names.insert( + functionOperand->getName().str()); + std::cout << "Cuda Register Global Kernel: " + << functionOperand->getName().str() << std::endl; + } + } + } + } + } + } + } + bool host_changed = false; + for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { + Function *F = &(*i); + auto func_name = F->getName().str(); + + for (Function::iterator b = F->begin(); b != F->end(); ++b) { + BasicBlock *B = &(*b); + + for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) { + Instruction *inst = &(*i); + + if (llvm::CallBase *callInst = llvm::dyn_cast(inst)) { + if (Function *calledFunction = callInst->getCalledFunction()) { + + if (calledFunction->getName().startswith("cudaLaunchKernel")) { + + Value *callOperand = callInst->getArgOperand(0); + + Function *functionOperand = + dyn_cast(callInst->getArgOperand(0)); + + // call function is wrapped in a bitcast + if (functionOperand == NULL) { + + std::vector arg_sizes; + functionOperand = + dyn_cast(callOperand->stripPointerCasts()); + + FunctionType *ft = calledFunction->getFunctionType(); + std::cout << " Parent (Caller) Function Name: " << func_name + 
<< ", cudaLaunchKernel Function: " + << functionOperand->getName().str() << ", args " + << functionOperand->arg_size() << std::endl; + auto rep = kernels.find(functionOperand->getName().str()); + if (rep != kernels.end()) { + Function *FC = rep->second; + BitCastInst *B = new BitCastInst(FC, I8, "", callInst); + callInst->setArgOperand(0, B); + + continue; + } + + std::vector Params; + Params.push_back(I8); + FunctionType *FT = FunctionType::get(VoidTy, Params, false); + + /* + Because of the TODO in the 2nd if statement, need to get the + prior name before _host is add + */ + std::string oldName = functionOperand->getName().str(); + + // if parent function is __host and same as the cudaKernelLaunch + std::string newName = oldName + "_wrapper"; + if (func_name == oldName && host_changed && + oldName.find("_host") != std::string::npos) { + newName = + oldName.substr(0, oldName.length() - 5) + "_wrapper"; + } + std::cout << "Change Kernel Name to: " << newName << std::endl; + + Function *F = + Function::Create(FT, Function::ExternalLinkage, newName, M); + F->setDSOLocal(true); + F->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global); + + BitCastInst *BC = new BitCastInst(F, I8, "", callInst); + callInst->setArgOperand(0, BC); + kernels.insert({functionOperand->getName().str(), F}); + } + } else if (cuda_register_kernel_names.find( + calledFunction->getName()) != + cuda_register_kernel_names.end()) { + // if the called function collides with kernel definiton + // TODO: some reason changes all occurences of the function name + // for both cudaKernelLaunch calls and regular function call + // errs() << *inst; + host_changed = true; + calledFunction->setName(calledFunction->getName() + "_host"); + std::cout << std::endl; + std::cout << "Change Host Function Name To: " + << calledFunction->getName().str() << std::endl; + } + } + } + } + } + } +} + +void ReplaceMemcpyToSymbol(llvm::Module *M) { + LLVMContext &context = M->getContext(); + auto I32 = 
llvm::Type::getInt32Ty(context); + std::vector need_remove; + for (Module::iterator F = M->begin(); F != M->end(); ++F) { + for (auto BB = F->begin(); BB != F->end(); ++BB) { + for (auto BI = BB->begin(); BI != BB->end(); BI++) { + if (auto Call = dyn_cast(BI)) { + if (Call->getCalledFunction()) { + auto func_name = Call->getCalledFunction()->getName().str(); + if (func_name == "cudaMemcpyToSymbol") { + std::vector args; + // i32 @cudaMemcpyToSymbol(i8* %1, i8* %2, i64 %3, i64 %4, i32 %5) + args.push_back(llvm::Type::getInt8PtrTy(context)); + args.push_back(llvm::Type::getInt8PtrTy(context)); + args.push_back(llvm::Type::getInt64Ty(context)); + args.push_back(llvm::Type::getInt64Ty(context)); + args.push_back(llvm::Type::getInt32Ty(context)); + llvm::FunctionType *func_Type = + FunctionType::get(I32, args, false); + + llvm::FunctionCallee _f = + M->getOrInsertFunction("cudaMemcpyToSymbol_host", func_Type); + llvm::Function *func = llvm::cast(_f.getCallee()); + // construct argument(s) + std::vector func_args; + func_args.push_back(Call->getArgOperand(0)); + func_args.push_back(Call->getArgOperand(1)); + func_args.push_back(Call->getArgOperand(2)); + func_args.push_back(Call->getArgOperand(3)); + func_args.push_back(Call->getArgOperand(4)); + + auto c_inst = llvm::CallInst::Create(func, func_args, "", Call); + // insert + Call->replaceAllUsesWith(c_inst); + need_remove.push_back(Call); + } + } + } + } + } + } + for (auto inst : need_remove) { + inst->eraseFromParent(); + } +} +void ReplaceCudaBuiltin(llvm::Module *M) { + InsertSyncAfterKernelLaunch(M); + ReplaceKernelLaunch(M); + ReplaceMemcpyToSymbol(M); +} diff --git a/compilation/HostTranslation/lib/ReplaceKernelArgs.cpp b/compilation/HostTranslation/lib/ReplaceKernelArgs.cpp new file mode 100644 index 0000000..501a783 --- /dev/null +++ b/compilation/HostTranslation/lib/ReplaceKernelArgs.cpp @@ -0,0 +1,90 @@ +#include "ReplaceKernelArgs.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include 
"llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/ToolOutputFile.h" +#include +#include +#include + +using namespace llvm; + +/* + * before: + * %m_cuda.addr = alloca float*, align 8 + * after: + * %m_cuda.addr_tmp = call i8* @malloc(i64 8) + * %m_cuda.addr = bitcast i8* %m_cuda.addr_tmp to float** + */ +// TODO: we use hard-code to implement this replacement, +// to use use-analysis to find the arguments in the future +void ReplaceKernelArg(llvm::Module *M) { + LLVMContext &context = M->getContext(); + auto VoidTy = llvm::Type::getVoidTy(context); + auto I8 = llvm::Type::getInt8PtrTy(context); + std::map kernels; + + std::set need_replace; + LLVMContext *C = &M->getContext(); + + for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { + Function *F = &(*i); + for (Function::iterator b = F->begin(); b != F->end(); ++b) { + BasicBlock *B = &(*b); + for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) { + Instruction *inst = &(*i); + if (llvm::CallInst *callInst = llvm::dyn_cast(inst)) { + if (Function *calledFunction = callInst->getCalledFunction()) { + if (calledFunction->getName().startswith("cudaLaunchKernel")) { + need_replace.insert(F); + } + } + } + } + } + } + + // find/create C's malloc function + std::vector args; + args.push_back(llvm::Type::getInt8PtrTy(context)); + llvm::FunctionType *mallocFuncType = + FunctionType::get(llvm::Type::getInt8PtrTy(context), + {llvm::Type::getInt64Ty(context)}, false); + + llvm::FunctionCallee _f = M->getOrInsertFunction("malloc", mallocFuncType); + llvm::Function *func_malloc = llvm::cast(_f.getCallee()); + + for (auto F : need_replace) { + std::set args_set; + int arg_cnt = 0; + for (Function::const_arg_iterator ii = F->arg_begin(), ee = F->arg_end(); + ii != ee; ++ii) { + args_set.insert(&(*ii)); + arg_cnt++; + } + std::vector need_remove; + for (Function::iterator b = F->begin(); b != F->end(); ++b) { + 
BasicBlock *B = &(*b); + for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) { + Instruction *inst = &(*i); + if (llvm::AllocaInst *alloc = llvm::dyn_cast(inst)) { + // just replace all alloc in that function + auto c_malloc_inst = llvm::CallInst::Create( + func_malloc, + ConstantInt::get(llvm::Type::getInt64Ty(context), 256), "", + alloc); + auto bit_cast = new BitCastInst(c_malloc_inst, alloc->getType(), + alloc->getName().str(), alloc); + alloc->replaceAllUsesWith(bit_cast); + need_remove.push_back(alloc); + } + } + } + for (auto inst : need_remove) { + inst->eraseFromParent(); + } + } +} diff --git a/compilation/HostTranslation/lib/ReplaceKernelLaunch.cpp b/compilation/HostTranslation/lib/ReplaceKernelLaunch.cpp deleted file mode 100644 index 67525d7..0000000 --- a/compilation/HostTranslation/lib/ReplaceKernelLaunch.cpp +++ /dev/null @@ -1,94 +0,0 @@ -#include "ReplaceKernelLaunch.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/GlobalValue.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" -#include -#include -#include - -using namespace llvm; - -// Change to i8* bitcast (i8* (i8*)* @_Z9vecPKiS0_Pii_wrapper to i8*) -// Original: i8* bitcast (void (i32*, i32*, i32*, i32)* @_Z9vecPKiS0_Pii to i8*) -void ReplaceKernelLaunch(llvm::Module *M) { - LLVMContext &context = M->getContext(); - auto VoidTy = llvm::Type::getVoidTy(context); - auto I8 = llvm::Type::getInt8PtrTy(context); - std::map kernels; - - LLVMContext *C = &M->getContext(); - - llvm::Type *Int32T = Type::getInt32Ty(*C); - llvm::Type *Int8T = Type::getInt8Ty(*C); - - llvm::FunctionType *LauncherFuncT = - FunctionType::get(Type::getVoidTy(*C), NULL); - - llvm::FunctionType *LaunchFun2 = - FunctionType::get(PointerType::get(PointerType::get(Int32T, 0), 0), NULL); - - bool done = false; - - for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { - Function *F = &(*i); - auto func_name = 
F->getName().str(); - - for (Function::iterator b = F->begin(); b != F->end(); ++b) { - BasicBlock *B = &(*b); - - for (BasicBlock::iterator i = B->begin(); i != B->end(); ++i) { - Instruction *inst = &(*i); - - if (llvm::CallInst *callInst = llvm::dyn_cast(inst)) { - if (Function *calledFunction = callInst->getCalledFunction()) { - - if (calledFunction->getName().startswith("cudaLaunchKernel")) { - - Value *callOperand = callInst->getArgOperand(0); - - Function *functionOperand = - dyn_cast(callInst->getArgOperand(0)); - - // call function is wrapped in a bitcast - if (functionOperand == NULL) { - - std::vector arg_sizes; - functionOperand = - dyn_cast(callOperand->stripPointerCasts()); - - FunctionType *ft = calledFunction->getFunctionType(); - std::cout << " Parent (Caller) Function Name: " << func_name - << ", cudaLaunchKernel Function: " - << functionOperand->getName().str() << ", args " - << functionOperand->arg_size() << std::endl; - auto rep = kernels.find(functionOperand->getName().str()); - if (rep != kernels.end()) { - - callInst->setArgOperand(0, rep->second); - continue; - } - - std::vector Params; - Params.push_back(I8); - FunctionType *FT = FunctionType::get(VoidTy, Params, false); - std::string newName = - functionOperand->getName().str() + "_wrapper"; - - Function *F = - Function::Create(FT, Function::ExternalLinkage, newName, M); - F->setDSOLocal(true); - - BitCastInst *BC = new BitCastInst(F, I8, "", callInst); - callInst->setArgOperand(0, BC); - kernels.insert({functionOperand->getName().str(), BC}); - } - } - } - } - } - } - } -} diff --git a/compilation/KernelTranslation.cpp b/compilation/KernelTranslation.cpp index 1d24c7b..77cc494 100644 --- a/compilation/KernelTranslation.cpp +++ b/compilation/KernelTranslation.cpp @@ -8,46 +8,66 @@ #include "warp_func.h" #include "llvm/IR/Module.h" #include +#include #include +#include #include #include #include using namespace llvm; +std::string PATH = "kernel_meta.log"; + int main(int argc, char **argv) 
{ - assert(argc == 9 && "incorrect number of arguments\n"); + assert(argc == 3 && "incorrect number of arguments\n"); llvm::Module *program = LoadModuleFromFilr(argv[1]); - // get size of grid and dim from input arguments - int *grid_dim = new int[3]; - int *block_dim = new int[3]; - grid_dim[0] = atoi(argv[3]); - grid_dim[1] = atoi(argv[4]); - grid_dim[2] = atoi(argv[5]); - block_dim[0] = atoi(argv[6]); - block_dim[1] = atoi(argv[7]); - block_dim[2] = atoi(argv[8]); + + std::ofstream fout; + fout.open(PATH); // inline, and create auxiliary global variables - init_block(program); + init_block(program, fout); // insert sync before each vote, and replace the // original vote function to warp vote handle_warp_vote(program); + // replace warp shuffle + // VerifyModule(program); handle_warp_shfl(program); // insert sync + // VerifyModule(program); insert_sync(program); // split block by sync + // VerifyModule(program); + std::cout << "split\n" << std::flush; split_block_by_sync(program); // add loop for intra&intera thread - insert_warp_loop(program); - // (TODO): replace this patch - replace_built_in_function(program, grid_dim, block_dim); + // VerifyModule(program); + std::cout << "insert\n" << std::flush; + insert_warp_loop(program); + + // VerifyModule(program); + + // (TODO): replace this patch + std::cout << "replace\n" << std::flush; + replace_built_in_function(program); + + // VerifyModule(program); + std::cout << "generate\n" << std::flush; generate_x86_format(program); + + // VerifyModule(program); + // performance optimization performance_optimization(program); + VerifyModule(program); + DumpModule(program, argv[2]); + + fout.close(); + return 0; } diff --git a/compilation/KernelTranslation/include/generate_x86_format.h b/compilation/KernelTranslation/include/generate_x86_format.h index dff3694..747b1b1 100644 --- a/compilation/KernelTranslation/include/generate_x86_format.h +++ b/compilation/KernelTranslation/include/generate_x86_format.h @@ -5,4 +5,6 @@ 
void generate_x86_format(llvm::Module *M); +void set_meta_data(llvm::Module *M); + #endif diff --git a/compilation/KernelTranslation/include/init.h b/compilation/KernelTranslation/include/init.h index 10f5186..d0811e4 100644 --- a/compilation/KernelTranslation/include/init.h +++ b/compilation/KernelTranslation/include/init.h @@ -2,6 +2,6 @@ #define __NVVM2x86_INIT__ #include "llvm/IR/Module.h" - -void init_block(llvm::Module *M); +#include +void init_block(llvm::Module *M, std::ofstream &fout); #endif diff --git a/compilation/KernelTranslation/include/memory_hierarchy.h b/compilation/KernelTranslation/include/memory_hierarchy.h index 1f8495c..acc9783 100644 --- a/compilation/KernelTranslation/include/memory_hierarchy.h +++ b/compilation/KernelTranslation/include/memory_hierarchy.h @@ -1,9 +1,10 @@ #ifndef __NVVM2x86_MEMORY_HIERARCHY__ #define __NVVM2x86_MEMORY_HIERARCHY__ #include "llvm/IR/Module.h" - +#include using namespace llvm; void mem_share2global(llvm::Module *M); +void mem_constant2global(llvm::Module *M, std::ofstream &fout); #endif diff --git a/compilation/KernelTranslation/include/tool.h b/compilation/KernelTranslation/include/tool.h index cb1963f..e1b1e90 100644 --- a/compilation/KernelTranslation/include/tool.h +++ b/compilation/KernelTranslation/include/tool.h @@ -12,7 +12,7 @@ llvm::CallInst *CreateIntraWarpBarrier(llvm::Instruction *InsertBefore); void VerifyModule(llvm::Module *); void phi2alloc(llvm::Module *M); void remove_cuda_built_in(llvm::Module *M); -void replace_built_in_function(llvm::Module *M, int *grid_dim, int *block_dim); +void replace_built_in_function(llvm::Module *M); void replace_asm_call(llvm::Module *M); bool find_block_barrier_in_region(llvm::BasicBlock *start, llvm::BasicBlock *end); @@ -21,4 +21,5 @@ bool has_warp_barrier(llvm::BasicBlock *B); bool has_barrier(llvm::BasicBlock *B); bool has_block_barrier(llvm::BasicBlock *B); bool has_barrier(llvm::Function *F); +void replace_dynamic_shared_memory(llvm::Module *M); #endif 
diff --git a/compilation/KernelTranslation/lib/generate_x86_format.cpp b/compilation/KernelTranslation/lib/generate_x86_format.cpp index dbfacbc..b2594cf 100644 --- a/compilation/KernelTranslation/lib/generate_x86_format.cpp +++ b/compilation/KernelTranslation/lib/generate_x86_format.cpp @@ -18,6 +18,7 @@ #include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/ValueMapper.h" +#include using namespace llvm; @@ -40,6 +41,10 @@ void decode_input(llvm::Module *M) { llvm::FunctionType *LauncherFuncT = FunctionType::get( Type::getVoidTy(*C), {PointerType::get(Int8T, 0)}, false); + std::set dynmaic_memory; + + std::map corres_dynamic_memory_load_address; + // generate Wrapper Function type // now we only support a single int32* for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { @@ -64,6 +69,51 @@ void decode_input(llvm::Module *M) { // convert to int** input_arg = Builder.CreateBitOrPointerCast( input_arg, PointerType::get(PointerType::get(Int32T, 0), 0)); + + // dynamic memory load in the wrapper function + GlobalVariable *share_memory = M->getGlobalVariable("wrapper_global_data"); + if (share_memory != NULL) { + dynmaic_memory.insert(share_memory); + llvm::GlobalVariable *global_mem = new llvm::GlobalVariable( + *M, Int32T, false, llvm::GlobalValue::ExternalLinkage, NULL, + "thread_memory_size", NULL, llvm::GlobalValue::GeneralDynamicTLSModel, + 0, false); + Value *loadedValue = Builder.CreateLoad(global_mem); + + llvm::FunctionType *LaunchFun2 = FunctionType::get( + PointerType::get(PointerType::get(Int32T, 0), 0), NULL); + + FunctionCallee fc2 = + M->getOrInsertFunction("_wrapper_global_data", LaunchFun2); + + Function *WorkGroup2 = dyn_cast(fc2.getCallee()); + + WorkGroup2->setLinkage(GlobalValue::WeakODRLinkage); + WorkGroup2->setVisibility(GlobalValue::HiddenVisibility); + Comdat *co = M->getOrInsertComdat("_wrapper_global_data"); + co->setSelectionKind(Comdat::SelectionKind::Any); + 
WorkGroup2->setComdat(co); + + BasicBlock *Block2 = BasicBlock::Create(M->getContext(), "", WorkGroup2); + + llvm::IRBuilder<> Builder2(M->getContext()); + Builder2.SetInsertPoint(Block2); + Builder2.CreateRet(share_memory); + + auto PT = dyn_cast(share_memory->getType()); + auto element_type = PT->getElementType(); + // std::cout << element_type->getTypeID() << " Got global memor $$$$$$" + // << share_memory->getName().str() << std::endl; + + AllocaInst *new_arr = Builder.CreateAlloca(Int8T, loadedValue, "new_arr"); + // new_arr->setAlignment(llvm::MaybeAlign(16)); + Value *new_ar = new_arr; + Value *gptr = Builder.CreateBitOrPointerCast( + share_memory, PointerType::get(PointerType::get(Int8T, 0), 0)); + + Builder.CreateStore(new_ar, gptr); + } + size_t idx = 0; for (Function::const_arg_iterator ii = F->arg_begin(), ee = F->arg_end(); ii != ee; ++ii) { @@ -95,6 +145,8 @@ void remove_barrier(llvm::Module *M) { for (auto BB = F->begin(); BB != F->end(); ++BB) { for (auto BI = BB->begin(); BI != BB->end(); BI++) { if (auto Call = dyn_cast(BI)) { + if (Call->isInlineAsm()) + continue; auto func_name = Call->getCalledFunction()->getName().str(); if (func_name == "llvm.nvvm.bar.warp.sync" || func_name == "llvm.nvvm.barrier0" || @@ -109,6 +161,11 @@ void remove_barrier(llvm::Module *M) { } } +void remove_useless_var(llvm::Module *M) { + M->getGlobalVariable("intra_warp_index")->eraseFromParent(); + M->getGlobalVariable("inter_warp_index")->eraseFromParent(); +} + void generate_x86_format(llvm::Module *M) { // change metadata set_meta_data(M); @@ -116,4 +173,6 @@ void generate_x86_format(llvm::Module *M) { decode_input(M); // remove barrier remove_barrier(M); + // remove useless func/variable + remove_useless_var(M); } diff --git a/compilation/KernelTranslation/lib/handle_sync.cpp b/compilation/KernelTranslation/lib/handle_sync.cpp index e0fb19b..565d636 100644 --- a/compilation/KernelTranslation/lib/handle_sync.cpp +++ b/compilation/KernelTranslation/lib/handle_sync.cpp 
@@ -27,6 +27,8 @@ void split_block_by_sync(llvm::Function *F) { } llvm::CallInst *Call = llvm::dyn_cast(inst); if (Call) { + if (Call->isInlineAsm()) + continue; auto func_name = Call->getCalledFunction()->getName().str(); if (func_name == "llvm.nvvm.barrier0" || func_name == "llvm.nvvm.bar.warp.sync" || diff --git a/compilation/KernelTranslation/lib/init.cpp b/compilation/KernelTranslation/lib/init.cpp index 8007470..2c31392 100644 --- a/compilation/KernelTranslation/lib/init.cpp +++ b/compilation/KernelTranslation/lib/init.cpp @@ -1,6 +1,7 @@ #include "init.h" #include "memory_hierarchy.h" #include "tool.h" +#include #include #include @@ -23,7 +24,8 @@ using namespace llvm; -void inline_func_vote(llvm::Module *M) { +bool inline_warp_level_func(llvm::Module *M) { + bool changed = false; std::set need_remove; for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { @@ -36,10 +38,13 @@ void inline_func_vote(llvm::Module *M) { for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) { if (CallInst *c = dyn_cast(BI++)) { if (c->getCalledFunction()) { - if (c->getCalledFunction()->getName().str() == "_Z10__any_syncji") { + auto func_name = c->getCalledFunction()->getName().str(); + if (func_name == "_Z10__any_syncji" || + func_name.find("shfl_down_sync") != std::string::npos) { InlineFunctionInfo IFI; InlineFunction(c, IFI); need_remove.insert(c->getCalledFunction()); + changed = true; } } } @@ -50,6 +55,56 @@ void inline_func_vote(llvm::Module *M) { f->dropAllReferences(); f->eraseFromParent(); } + return changed; +} + +bool find_sreg_inst(llvm::Function *F) { + Function::iterator I = F->begin(); + for (Function::iterator E = F->end(); I != E; ++I) { + for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) { + if (CallInst *c = dyn_cast(BI++)) { + if (c->getCalledFunction()) { + auto func_name = c->getCalledFunction()->getName().str(); + if (func_name.find("llvm.nvvm.read.ptx.sreg.") != std::string::npos) { + return true; + } + } + 
} + } + } + return false; +} +bool inline_func_with_tid(llvm::Module *M) { + bool changed = false; + std::set need_remove; + std::set need_inline; + for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { + Function *F = &(*i); + auto func_name = F->getName().str(); + Function::iterator I = F->begin(); + for (Function::iterator E = F->end(); I != E; ++I) { + for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) { + if (CallInst *c = dyn_cast(BI++)) { + if (c->getCalledFunction()) { + if (find_sreg_inst(c->getCalledFunction())) { + printf("inline: %s\n", + c->getCalledFunction()->getName().str().c_str()); + need_inline.insert(c); + need_remove.insert(c->getCalledFunction()); + } + } + } + } + } + } + if (!need_inline.empty()) { + changed = true; + } + for (auto c : need_inline) { + InlineFunctionInfo IFI; + InlineFunction(c, IFI); + } + return changed; } void create_global_variable(llvm::Module *M) { @@ -70,21 +125,33 @@ void create_global_variable(llvm::Module *M) { llvm::GlobalValue::GeneralDynamicTLSModel, 0, false); new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage, NULL, "block_size", NULL, - llvm::GlobalValue::NotThreadLocal, 0, false); + llvm::GlobalValue::GeneralDynamicTLSModel, 0, false); new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage, NULL, "block_size_x", NULL, - llvm::GlobalValue::NotThreadLocal, 0, false); + llvm::GlobalValue::GeneralDynamicTLSModel, 0, false); new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage, NULL, "block_size_y", NULL, - llvm::GlobalValue::NotThreadLocal, 0, false); + llvm::GlobalValue::GeneralDynamicTLSModel, 0, false); new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage, NULL, "block_size_z", NULL, - llvm::GlobalValue::NotThreadLocal, 0, false); + llvm::GlobalValue::GeneralDynamicTLSModel, 0, false); new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage, - NULL, "grid_size", NULL, 
- llvm::GlobalValue::NotThreadLocal, 0, false); + NULL, "grid_size_x", NULL, + llvm::GlobalValue::GeneralDynamicTLSModel, 0, false); new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage, - NULL, "block_index", NULL, + NULL, "grid_size_y", NULL, + llvm::GlobalValue::GeneralDynamicTLSModel, 0, false); + new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage, + NULL, "grid_size_z", NULL, + llvm::GlobalValue::GeneralDynamicTLSModel, 0, false); + new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage, + NULL, "block_index_x", NULL, + llvm::GlobalValue::GeneralDynamicTLSModel, 0, false); + new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage, + NULL, "block_index_y", NULL, + llvm::GlobalValue::GeneralDynamicTLSModel, 0, false); + new llvm::GlobalVariable(*M, I32, false, llvm::GlobalValue::ExternalLinkage, + NULL, "block_index_z", NULL, llvm::GlobalValue::GeneralDynamicTLSModel, 0, false); // TLS variable used for warp-level collective operators new llvm::GlobalVariable( @@ -224,24 +291,23 @@ bool lower_constant_expr(llvm::Module *M) { auto load_from = load_inst->getOperand(0); if (auto get_element_ptr = dyn_cast(load_from)) { modified = true; - auto ReplInst = get_element_ptr->getAsInstruction(); - ReplInst->insertBefore(load_inst); std::vector Users; - // Do not replace use during iteration of use. 
Do it in another loop for (auto U : get_element_ptr->users()) { if (auto InstUser = dyn_cast(U)) { Users.push_back(InstUser); } } - for (auto &User : Users) + for (auto &User : Users) { + auto ReplInst = get_element_ptr->getAsInstruction(); + ReplInst->insertBefore(User); User->replaceUsesOfWith(get_element_ptr, ReplInst); + } } } else if (auto store_inst = dyn_cast(BI)) { auto store_to = store_inst->getOperand(1); if (auto addr_cast = dyn_cast(store_to)) { modified = true; - auto ReplInst = addr_cast->getAsInstruction(); - ReplInst->insertBefore(store_inst); + std::vector Users; // Do not replace use during iteration of use. Do it in another loop for (auto U : addr_cast->users()) { @@ -249,16 +315,19 @@ bool lower_constant_expr(llvm::Module *M) { Users.push_back(InstUser); } } - for (auto &User : Users) + for (auto &User : Users) { + auto ReplInst = addr_cast->getAsInstruction(); + ReplInst->insertBefore(User); User->replaceUsesOfWith(addr_cast, ReplInst); + } } } else if (auto get_element_ptr = dyn_cast(BI)) { auto get_from = get_element_ptr->getOperand(0); if (auto addr_cast = dyn_cast(get_from)) { modified = true; - auto ReplInst = addr_cast->getAsInstruction(); - ReplInst->insertBefore(get_element_ptr); + // auto ReplInst = addr_cast->getAsInstruction(); + // ReplInst->insertBefore(get_element_ptr); std::vector Users; // Do not replace use during iteration of use. 
Do it in another loop for (auto U : addr_cast->users()) { @@ -266,8 +335,11 @@ bool lower_constant_expr(llvm::Module *M) { Users.push_back(InstUser); } } - for (auto &User : Users) + for (auto &User : Users) { + auto ReplInst = addr_cast->getAsInstruction(); + ReplInst->insertBefore(User); User->replaceUsesOfWith(addr_cast, ReplInst); + } } } } @@ -276,11 +348,24 @@ bool lower_constant_expr(llvm::Module *M) { return modified; } -void init_block(llvm::Module *M) { +void replace_cuda_math_built_in(llvm::Module *M) { + // replace _ZL3expd, just delete its body + for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { + Function *F = &(*i); + auto func_name = F->getName().str(); + if (func_name.find("_ZL3expd") != std::string::npos) { + F->deleteBody(); + } + } +} + +void init_block(llvm::Module *M, std::ofstream &fout) { // using official llvm preprocess llvm_preprocess(M); // remove useles Cuda function remove_cuda_built_in(M); + // replace CUDA math function, like expf + replace_cuda_math_built_in(M); // lower ConstantExpression bool modified; @@ -289,14 +374,26 @@ void init_block(llvm::Module *M) { } while (modified); // remove useless metadata remove_metadata(M); - // inline vote function - inline_func_vote(M); + // inline warp-level function + while (1) { + if (!inline_warp_level_func(M)) + break; + } + // TODO: remove the hardcode + while (1) { + if (!inline_func_with_tid(M)) + break; + } // create global variable for warp and vote create_global_variable(M); // replace phi with data load phi2alloc(M); // replace share memory mem_share2global(M); + // replace share memory + mem_constant2global(M, fout); // replace asm Inline replace_asm_call(M); + // replace dynamic shared memory + replace_dynamic_shared_memory(M); } diff --git a/compilation/KernelTranslation/lib/insert_sync.cpp b/compilation/KernelTranslation/lib/insert_sync.cpp index dfe0676..f7483fe 100644 --- a/compilation/KernelTranslation/lib/insert_sync.cpp +++ 
b/compilation/KernelTranslation/lib/insert_sync.cpp @@ -212,11 +212,22 @@ public: changed = true; // we may create a new conditional barrier after insert - if (!PDT->getPostDomTree().dominates(pred, &F.getEntryBlock())) - conditionalBarriers.push_back(pred); + if (!PDT->getPostDomTree().dominates(pred, &F.getEntryBlock())) { + // if the block postdominates all its predecessor + // then it is not a conditional barriers + bool post_dominate_all = true; + for (auto I = pred_begin(pred); I != pred_end(pred); I++) { + if (!PDT->getPostDomTree().dominates(pred, *I)) { + post_dominate_all = false; + break; + } + } + if (!post_dominate_all) + conditionalBarriers.push_back(pred); + } // find any block which are not dominated by header - // but be posdiminated by merge point + // but be postdominated by merge point std::queue if_body; std::set visited_block; for (int i = 0; i < pred->getTerminator()->getNumSuccessors(); i++) { @@ -234,19 +245,26 @@ public: PDT->getPostDomTree().dominates(merge_point, curr)) { // we should insert barrier at the beginning and // end of its predecessor + printf("insert [255]: %s\n", curr->getName().str().c_str()); if (has_warp_barrier(b)) { CreateIntraWarpBarrier(&(*curr->begin())); for (BasicBlock *Pred : predecessors(curr)) { + printf("insert [262]: %s\n", Pred->getName().str().c_str()); CreateIntraWarpBarrier(&(*Pred->getTerminator())); } } else { CreateInterWarpBarrier(&(*curr->begin())); for (BasicBlock *Pred : predecessors(curr)) { + printf("insert [268]: %s\n", Pred->getName().str().c_str()); CreateInterWarpBarrier(&(*Pred->getTerminator())); } } } for (int i = 0; i < curr->getTerminator()->getNumSuccessors(); i++) { + // avoid backedge + if (DT->dominates(curr->getTerminator()->getSuccessor(i), pred)) { + continue; + } if_body.push(curr->getTerminator()->getSuccessor(i)); } } @@ -266,6 +284,32 @@ public: AU.addRequired(); } + BasicBlock *find_merge_point(BasicBlock *start, PostDominatorTree &PDT) { + 
assert(start->getTerminator()->getNumSuccessors() == 2); + std::set visit; + std::queue pending_blocks; + for (int i = 0; i < start->getTerminator()->getNumSuccessors(); i++) { + pending_blocks.push(start->getTerminator()->getSuccessor(i)); + } + while (!pending_blocks.empty()) { + BasicBlock *current = pending_blocks.front(); + pending_blocks.pop(); + + if (visit.find(current) != visit.end()) + continue; + + visit.insert(current); + if (PDT.dominates(current, start)) + return current; + for (int i = 0; i < current->getTerminator()->getNumSuccessors(); i++) { + auto succ = current->getTerminator()->getSuccessor(i); + if (visit.find(succ) == visit.end()) + pending_blocks.push(succ); + } + } + assert(0 && "Do not find merge point\n"); + return NULL; + } virtual bool runOnFunction(Function &F) { if (!isKernelFunction(F.getParent(), &F)) return 0; @@ -280,18 +324,8 @@ public: for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i) { BasicBlock *b = &*i; - BasicBlock *merge_point = NULL; if (b->getTerminator()->getNumSuccessors() == 2) { - auto b1 = b->getTerminator()->getSuccessor(0); - auto b2 = b->getTerminator()->getSuccessor(1); - if (PDT->getPostDomTree().dominates(b1, b2)) { - merge_point = b1; - } else if (PDT->getPostDomTree().dominates(b2, b2)) { - merge_point = b2; - } else { - assert(0 && "find complex if-else branch\n"); - } - std::cout << std::flush; + auto merge_point = find_merge_point(b, PDT->getPostDomTree()); for (BasicBlock *Pred : predecessors(merge_point)) { if (!DT->dominates(b, Pred)) { // we need to insert an extra block to be the merge point @@ -305,14 +339,8 @@ public: auto M = F.getParent(); for (auto head : if_head) { assert(head->getTerminator()->getNumSuccessors() == 2); - BasicBlock *merge_point = NULL; - auto s1 = head->getTerminator()->getSuccessor(0); - auto s2 = head->getTerminator()->getSuccessor(1); - if (PDT->getPostDomTree().dominates(s1, s2)) { - merge_point = s1; - } else { - merge_point = s2; - } + BasicBlock 
*merge_point = find_merge_point(head, PDT->getPostDomTree()); + assert(PDT->getPostDomTree().dominates(merge_point, head)); if (!find_barrier_in_region(head, merge_point)) { printf("do not need to handle tri-income if: %s\n", merge_point->getName().str().c_str()); @@ -368,6 +396,8 @@ public: for (BasicBlock::iterator j = (*i)->begin(), e = (*i)->end(); j != e; ++j) { if (auto Call = dyn_cast(j)) { + if (Call->isInlineAsm()) + continue; auto func_name = Call->getCalledFunction()->getName().str(); if (func_name == "llvm.nvvm.barrier0" || func_name == "llvm.nvvm.bar.warp.sync" || @@ -383,7 +413,7 @@ public: } if (!is_conditional_loop) return 0; - // insert barrier at the beginning of header + // insert barrier at the beginning of header (for_cond) // and the end of pre header, so that we can get a // single block connected with latch if (!is_warp) { @@ -399,17 +429,40 @@ public: } // as we assume all loops are rotated, we have to insert - // barrier before the condition jump of the loop exit - - if (auto exit_block = L->getExitingBlock()) { + // barrier before the condition jump of the for_cond + if (auto for_cond = L->getExitingBlock()) { + assert(for_cond->getTerminator()->getNumSuccessors() == 2 && + "has more than 2 successors of the for-cond\n"); auto conditional_br = - dyn_cast(exit_block->getTerminator()); + dyn_cast(for_cond->getTerminator()); assert(conditional_br && conditional_br->isConditional()); - // insert barrier at the beginning of successor of exit + // insert barrier before the condition jump of the loop cond if (!is_warp) CreateInterWarpBarrier(conditional_br); else CreateIntraWarpBarrier(conditional_br); + // insert barrier before the for_body + auto for_body = for_cond->getTerminator()->getSuccessor(0); + if (for_body == L->getExitBlock()) { + for_body = for_cond->getTerminator()->getSuccessor(1); + } + // insert at the beginning of for_body + if (!is_warp) + CreateInterWarpBarrier(&(*for_body->begin())); + else + 
CreateIntraWarpBarrier(&(*for_body->begin())); + // insert at the beginning and end in for_inc block + if (auto for_inc = L->getLoopLatch()) { + if (!is_warp) { + CreateInterWarpBarrier(&(*for_inc->begin())); + CreateInterWarpBarrier(for_inc->getTerminator()); + } else { + CreateIntraWarpBarrier(&(*for_inc->begin())); + CreateIntraWarpBarrier(for_inc->getTerminator()); + } + } else { + assert(0 && "has continue in a barrier loop\n"); + } } else { // handle break in for-loop printf("loop has multiply exists\n"); diff --git a/compilation/KernelTranslation/lib/insert_warp_loop.cpp b/compilation/KernelTranslation/lib/insert_warp_loop.cpp index f4023f0..7855778 100644 --- a/compilation/KernelTranslation/lib/insert_warp_loop.cpp +++ b/compilation/KernelTranslation/lib/insert_warp_loop.cpp @@ -67,9 +67,15 @@ std::map contextArrays; int tempInstructionIndex = 0; int need_nested_loop; +// adding multiple kenerl in file support + bool ShouldNotBeContextSaved(llvm::Instruction *instr) { if (isa(instr)) return true; + // if (isa(instr)) + // return true; + // if (isa(instr)) + // return true; llvm::Module *M = instr->getParent()->getParent()->getParent(); llvm::LoadInst *load = dyn_cast(instr); @@ -111,6 +117,7 @@ llvm::Instruction *GetContextArray(llvm::Instruction *instruction, return contextArrays[varName]; BasicBlock &bb = instruction->getParent()->getParent()->getEntryBlock(); + IRBuilder<> builder(&*(bb.getFirstInsertionPt())); Function *FF = instruction->getParent()->getParent(); Module *M = instruction->getParent()->getParent()->getParent(); @@ -127,6 +134,7 @@ llvm::Instruction *GetContextArray(llvm::Instruction *instruction, Type *AllocType = elementType; AllocaInst *InstCast = dyn_cast(instruction); + /* if (InstCast) { unsigned Alignment = InstCast->getAlignment(); @@ -166,7 +174,7 @@ llvm::Instruction *GetContextArray(llvm::Instruction *instruction, } } } - + */ llvm::Value *ItemSize = nullptr; llvm::AllocaInst *Alloca = nullptr; @@ -354,13 +362,36 @@ void 
handle_local_variable_intra_warp(std::vector PRs) { auto F = PRs[0].start_block->getParent(); for (auto bb = F->begin(); bb != F->end(); bb++) { for (auto ii = bb->begin(); ii != bb->end(); ii++) { - if (isa(&(*ii))) - instruction_to_fix.push_back(&(*ii)); - } - for (auto inst : instruction_to_fix) { - AddContextSaveRestore(inst, intra_warp_loop); + if (isa(&(*ii))) { + auto alloc = dyn_cast(&(*ii)); + // Do not duplicate var used outside PRs + bool used_in_non_PR = false; + for (Instruction::use_iterator ui = alloc->use_begin(), + ue = alloc->use_end(); + ui != ue; ++ui) { + llvm::Instruction *user = dyn_cast(ui->getUser()); + auto user_block = user->getParent(); + bool find_in_PR = false; + for (auto PR : PRs) { + if (PR.wrapped_block.find(user_block) != PR.wrapped_block.end()) { + find_in_PR = true; + break; + } + } + if (find_in_PR == false) { + used_in_non_PR = true; + break; + } + } + if (!used_in_non_PR) { + instruction_to_fix.push_back(alloc); + } + } } } + for (auto inst : instruction_to_fix) { + AddContextSaveRestore(inst, intra_warp_loop); + } } for (auto parallel_regions : PRs) { @@ -380,10 +411,8 @@ void handle_local_variable_intra_warp(std::vector PRs) { for (llvm::BasicBlock::iterator instr = bb->begin(); instr != bb->end(); ++instr) { llvm::Instruction *instruction = &*instr; - if (ShouldNotBeContextSaved(instruction)) continue; - for (Instruction::use_iterator ui = instruction->use_begin(), ue = instruction->use_end(); ui != ue; ++ui) { @@ -582,6 +611,8 @@ void remove_barrier(llvm::Function *F, bool intra_warp_loop) { for (auto BB = F->begin(); BB != F->end(); ++BB) { for (auto BI = BB->begin(); BI != BB->end(); BI++) { if (auto Call = dyn_cast(BI)) { + if (Call->isInlineAsm()) + continue; auto func_name = Call->getCalledFunction()->getName().str(); if (func_name == "llvm.nvvm.bar.warp.sync") { need_remove.push_back(Call); @@ -648,6 +679,8 @@ public: bool has_barrier = 0; for (auto i = current->begin(), e = current->end(); i != e; ++i) { if 
(llvm::CallInst *call_inst = llvm::dyn_cast(&(*i))) { + if (call_inst->isInlineAsm()) + continue; auto func_name = call_inst->getCalledFunction()->getName().str(); if (func_name == "llvm.nvvm.barrier0" || func_name == "llvm.nvvm.barrier.sync") @@ -761,6 +794,8 @@ public: for (Function::iterator s = F->begin(); s != F->end(); s++) { if (llvm::CallInst *call_inst = llvm::dyn_cast(s->begin())) { + if (call_inst->isInlineAsm()) + continue; auto func_name = call_inst->getCalledFunction()->getName().str(); if (func_name == "llvm.nvvm.barrier0" || func_name == "llvm.nvvm.barrier.sync") { @@ -787,6 +822,12 @@ public: if (!isKernelFunction(F.getParent(), &F)) return 0; + auto func_name = (&F)->getName().str(); + // clear context array, temp variables for new kernel function + contextArrays.clear(); + tempInstructionIds.clear(); + tempInstructionIndex = 0; + DT = &getAnalysis().getDomTree(); PDT = &getAnalysis().getPostDomTree(); @@ -794,11 +835,11 @@ public: auto parallel_regions = getParallelRegions(&F, intra_warp_loop); assert(!parallel_regions.empty() && "can not find any parallel regions\n"); // print_parallel_region(parallel_regions); - add_warp_loop(parallel_regions, intra_warp_loop); if (intra_warp_loop) { handle_local_variable_intra_warp(parallel_regions); } + add_warp_loop(parallel_regions, intra_warp_loop); remove_barrier(&F, intra_warp_loop); return 1; } @@ -816,6 +857,8 @@ bool has_warp_barrier(llvm::Module *M) { for (auto BB = F->begin(); BB != F->end(); ++BB) { for (auto BI = BB->begin(); BI != BB->end(); BI++) { if (auto Call = dyn_cast(BI)) { + if (Call->isInlineAsm()) + continue; auto func_name = Call->getCalledFunction()->getName().str(); if (func_name == "llvm.nvvm.bar.warp.sync") { return true; @@ -841,8 +884,8 @@ void insert_warp_loop(llvm::Module *M) { // only need a single loop, with size=block_size Passes.add(new InsertWarpLoopPass(intra_warp)); Passes.run(*M); - // remove all barriers - for (auto F = M->begin(); F != M->end(); ++F) - 
remove_barrier(dyn_cast(F), false); } + // remove all barriers + for (auto F = M->begin(); F != M->end(); ++F) + remove_barrier(dyn_cast(F), false); } diff --git a/compilation/KernelTranslation/lib/memory_hierarchy.cpp b/compilation/KernelTranslation/lib/memory_hierarchy.cpp index 9152500..96eacd4 100644 --- a/compilation/KernelTranslation/lib/memory_hierarchy.cpp +++ b/compilation/KernelTranslation/lib/memory_hierarchy.cpp @@ -9,6 +9,8 @@ #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/ValueMapper.h" #include +#include +#include #include #include #include @@ -36,15 +38,35 @@ void mem_share2global(llvm::Module *M) { auto new_name = "wrapper_global_" + share_memory->getName().str(); auto element_type = PT->getElementType(); if (auto array_type = dyn_cast(element_type)) { - llvm::GlobalVariable *global_memory = new llvm::GlobalVariable( - *M, array_type, false, llvm::GlobalValue::ExternalLinkage, NULL, - new_name, NULL, llvm::GlobalValue::GeneralDynamicTLSModel, 1); - ConstantAggregateZero *const_array = - ConstantAggregateZero::get(array_type); - global_memory->setInitializer(const_array); - corresponding_global_memory.insert( - std::pair(share_memory, - global_memory)); + if (share_memory->hasExternalLinkage() && + array_type->getArrayNumElements() == 0) { + // external shared memory of [] + // generate global type pointer + PointerType *PointerTy = + PointerType::get(array_type->getElementType(), 0); + llvm::Constant *x1 = ConstantPointerNull::get(PointerTy); + llvm::GlobalVariable *global_ptr = new llvm::GlobalVariable( + *M, PointerTy, false, llvm::GlobalValue::CommonLinkage, x1, + "wrapper_global_data", NULL, + llvm::GlobalValue::GeneralDynamicTLSModel, 0, false); + + global_ptr->setDSOLocal(true); + + corresponding_global_memory.insert( + std::pair(share_memory, + global_ptr)); + } else { + llvm::GlobalVariable *global_memory = new llvm::GlobalVariable( + *M, array_type, false, llvm::GlobalValue::ExternalLinkage, + NULL, new_name, 
NULL, + llvm::GlobalValue::GeneralDynamicTLSModel, 1); + ConstantAggregateZero *const_array = + ConstantAggregateZero::get(array_type); + global_memory->setInitializer(const_array); + corresponding_global_memory.insert( + std::pair(share_memory, + global_memory)); + } } else if (auto int_type = dyn_cast(element_type)) { auto zero = llvm::ConstantInt::get(int_type, 0, true); llvm::GlobalVariable *global_memory = new llvm::GlobalVariable( @@ -54,6 +76,16 @@ void mem_share2global(llvm::Module *M) { corresponding_global_memory.insert( std::pair(share_memory, global_memory)); + } else if (element_type->isFloatTy()) { + auto FP_type = llvm::Type::getFloatTy(*C); + auto zero = llvm::ConstantFP::get(FP_type, 0); + llvm::GlobalVariable *global_memory = new llvm::GlobalVariable( + *M, FP_type, false, llvm::GlobalValue::ExternalLinkage, zero, + new_name, NULL, llvm::GlobalValue::GeneralDynamicTLSModel, 0, + false); + corresponding_global_memory.insert( + std::pair(share_memory, + global_memory)); } else { assert(0 && "The required Share Memory Type is not supported\n"); } @@ -62,57 +94,11 @@ void mem_share2global(llvm::Module *M) { } } - for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { - Function *F = &(*i); - for (Function::iterator i = F->begin(), e = F->end(); i != e; ++i) { - BasicBlock *b = &*i; - for (BasicBlock::iterator i = b->begin(), e = b->end(); i != e; ++i) { - if (auto get_element_ptr = dyn_cast(i)) { - auto read_array = get_element_ptr->getPointerOperand(); - if (GlobalVariable *read_share_memory = - dyn_cast(read_array)) { - // find a GetElementPtr which read share memory - if (corresponding_global_memory.find(read_share_memory) != - corresponding_global_memory.end()) { - std::vector Indices; - for (int i = 0; i < get_element_ptr->getNumIndices(); i++) - Indices.push_back(get_element_ptr->getOperand(i + 1)); - - auto new_GEP = GetElementPtrInst::Create( - NULL, // Pointee type - corresponding_global_memory.find(read_share_memory) - ->second, 
// Alloca - Indices, // Indices - "", get_element_ptr); - // replace all get_element_ptr with new_GEP: - // we can not directly use: - // get_element_ptr->replaceAllUsesWith(new_GEP); - // as get_element_ptr and new_GEP have different return type - llvm::Type *original_type = get_element_ptr->getType(); - auto FormatASC = CastInst::CreatePointerBitCastOrAddrSpaceCast( - new_GEP, original_type, "", get_element_ptr); - get_element_ptr->replaceAllUsesWith(FormatASC); - need_remove.insert(get_element_ptr); - } - } - } else if (auto addr_cast = dyn_cast(i)) { - auto read_array = addr_cast->getOperand(0); - if (GlobalVariable *read_share_memory = - dyn_cast(read_array)) { - // find a GetElementPtr which read share memory - if (corresponding_global_memory.find(read_share_memory) != - corresponding_global_memory.end()) { - llvm::Type *original_type = addr_cast->getType(); - auto FormatASC = CastInst::CreatePointerBitCastOrAddrSpaceCast( - corresponding_global_memory.find(read_share_memory)->second, - original_type, "", addr_cast); - addr_cast->replaceAllUsesWith(FormatASC); - need_remove.insert(addr_cast); - } - } - } - } - } + for (auto k : corresponding_global_memory) { + auto share_addr = k.first; + auto global_addr = k.second; + share_addr->replaceAllUsesWith(ConstantExpr::getPointerCast( + global_addr, cast(share_addr->getType()))); } for (auto i : need_remove) { @@ -124,3 +110,83 @@ void mem_share2global(llvm::Module *M) { i->eraseFromParent(); } } + +void mem_constant2global(llvm::Module *M, std::ofstream &fout) { + LLVMContext *C = &M->getContext(); + llvm::Type *Int32T = Type::getInt32Ty(*C); + llvm::Type *Int64T = Type::getInt64Ty(*C); + llvm::Type *Int8T = Type::getInt8Ty(*C); + + std::map corresponding_global_memory; + std::set need_remove; + std::set need_remove_constant_memory; + + // find all constant memory and generate corresponding global memory + for (auto I = M->global_begin(), E = M->global_end(); I != E; ++I) { + if (GlobalVariable *constant_memory = 
dyn_cast(I)) { + if (auto PT = dyn_cast(I->getType())) { + unsigned AS = PT->getAddressSpace(); + if (AS == 4) { // find a share memory + need_remove_constant_memory.insert(constant_memory); + // generate the corresponding global memory variable + auto new_name = "wrapper_global_" + constant_memory->getName().str(); + auto element_type = PT->getElementType(); + if (auto array_type = dyn_cast(element_type)) { + if (constant_memory->hasExternalLinkage() && + array_type->getArrayNumElements() == 0) { + // external shared memory of [] + // generate global type pointer + PointerType *PointerTy = + PointerType::get(array_type->getElementType(), 0); + llvm::Constant *x1 = ConstantPointerNull::get(PointerTy); + llvm::GlobalVariable *global_ptr = new llvm::GlobalVariable( + *M, PointerTy, false, llvm::GlobalValue::ExternalLinkage, x1, + "wrapper_global_data", NULL, + llvm::GlobalValue::NotThreadLocal, 0, true); + + corresponding_global_memory.insert( + std::pair(constant_memory, + global_ptr)); + } else { + llvm::GlobalVariable *global_memory = new llvm::GlobalVariable( + *M, array_type, false, llvm::GlobalValue::ExternalLinkage, + NULL, new_name, NULL, llvm::GlobalValue::NotThreadLocal, 0); + corresponding_global_memory.insert( + std::pair(constant_memory, + global_memory)); + } + } else if (element_type->isStructTy()) { + llvm::GlobalVariable *global_memory = new llvm::GlobalVariable( + *M, element_type, false, llvm::GlobalValue::ExternalLinkage, + NULL, new_name, NULL, llvm::GlobalValue::NotThreadLocal, 0); + corresponding_global_memory.insert( + std::pair(constant_memory, + global_memory)); + } else { + assert(0 && "The required Constant Memory Type is not supported\n"); + } + } + } + } + } + fout << "ConstMemory2GlobalMemory\n"; + for (auto k : corresponding_global_memory) { + auto const_addr = k.first; + auto global_addr = k.second; + const_addr->replaceAllUsesWith(ConstantExpr::getPointerCast( + global_addr, cast(const_addr->getType()))); + // this file will be used 
by host translator + fout << const_addr->getName().str().c_str() << " to " + << global_addr->getName().str().c_str() << std::endl; + } + fout << "END\n"; + + for (auto i : need_remove) { + i->dropAllReferences(); + i->eraseFromParent(); + } + for (auto i : need_remove_constant_memory) { + i->dropAllReferences(); + i->eraseFromParent(); + } +} diff --git a/compilation/KernelTranslation/lib/tool.cpp b/compilation/KernelTranslation/lib/tool.cpp index c3c379e..c1652c5 100644 --- a/compilation/KernelTranslation/lib/tool.cpp +++ b/compilation/KernelTranslation/lib/tool.cpp @@ -1,5 +1,6 @@ #include "tool.h" #include "llvm/Bitcode/BitcodeWriter.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/IRBuilder.h" @@ -187,7 +188,52 @@ void remove_cuda_built_in(llvm::Module *M) { } } -void replace_built_in_function(llvm::Module *M, int *grid_dim, int *block_dim) { +// copied from POCL +static void breakConstantExpressions(llvm::Value *Val, llvm::Function *Func) { + std::vector Users(Val->user_begin(), Val->user_end()); + for (auto *U : Users) { + if (auto *CE = llvm::dyn_cast(U)) { + // First, make sure no users of this constant expression are themselves + // constant expressions. + breakConstantExpressions(U, Func); + // Convert this constant expression to an instruction. 
+ llvm::Instruction *I = CE->getAsInstruction(); + I->insertBefore(&*Func->begin()->begin()); + CE->replaceAllUsesWith(I); + CE->destroyConstant(); + } + } +} + +void replace_dynamic_shared_memory(llvm::Module *M) { + for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { + Function *F = &(*i); + if (!isKernelFunction(M, F)) + continue; + for (Module::global_iterator i = M->global_begin(), e = M->global_end(); + i != e; ++i) { + breakConstantExpressions(&*i, F); + } + auto dynamic_shared_memory_addr = + M->getGlobalVariable("dynamic_shared_memory"); + if (!dynamic_shared_memory_addr) { + return; + } + auto load_shared_memory = + new LoadInst(dynamic_shared_memory_addr, "new_load"); + auto new_bit_cast = + new BitCastInst(load_shared_memory, + dynamic_shared_memory_addr->getType(), "new_bit_cast"); + new_bit_cast->insertBefore(&*F->begin()->begin()); + load_shared_memory->insertBefore(new_bit_cast); + dynamic_shared_memory_addr->replaceUsesWithIf(new_bit_cast, [&](Use &U) { + auto *Instr = dyn_cast(U.getUser()); + return Instr != new_bit_cast && Instr != load_shared_memory; + }); + } +} + +void replace_built_in_function(llvm::Module *M) { LLVMContext &context = M->getContext(); auto I32 = llvm::Type::getInt32Ty(context); std::vector need_remove; @@ -203,28 +249,60 @@ void replace_built_in_function(llvm::Module *M, int *grid_dim, int *block_dim) { auto local_intra_warp_idx = builder.CreateAlloca(global_intra_warp_idx->getType()->getElementType(), 0, "local_intra_warp_idx"); - global_intra_warp_idx->replaceAllUsesWith(local_intra_warp_idx); + global_intra_warp_idx->replaceUsesWithIf(local_intra_warp_idx, [&](Use &U) { + auto *Instr = dyn_cast(U.getUser()); + return Instr->getParent()->getParent()->getName().str() == func_name; + }); + auto global_inter_warp_idx = F->getParent()->getGlobalVariable("inter_warp_index"); + auto local_inter_warp_idx = builder.CreateAlloca(global_inter_warp_idx->getType()->getElementType(), 0, "local_inter_warp_idx"); - 
global_inter_warp_idx->replaceAllUsesWith(local_inter_warp_idx); + + builder.CreateStore(ConstantInt::get(I32, 0), local_inter_warp_idx); + + global_inter_warp_idx->replaceUsesWithIf(local_inter_warp_idx, [&](Use &U) { + auto *Instr = dyn_cast(U.getUser()); + return Instr->getParent()->getParent()->getName().str() == func_name; + }); for (auto BB = F->begin(); BB != F->end(); ++BB) { for (auto BI = BB->begin(); BI != BB->end(); BI++) { if (auto Load = dyn_cast(BI)) { auto load_from = Load->getOperand(0); - if (load_from == F->getParent()->getGlobalVariable("block_size")) { - Load->replaceAllUsesWith(ConstantInt::get( - I32, block_dim[0] * block_dim[1] * block_dim[2])); - need_remove.push_back(Load); - } } else if (auto Call = dyn_cast(BI)) { if (Call->getCalledFunction()) { auto func_name = Call->getCalledFunction()->getName().str(); - if (func_name == "llvm.nvvm.read.ptx.sreg.tid.x") { + if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.x" || + func_name == + "_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv") { + auto block_size_addr = M->getGlobalVariable("block_size_x"); + IRBuilder<> builder(context); + builder.SetInsertPoint(Call); + auto val = builder.CreateLoad(block_size_addr); + Call->replaceAllUsesWith(val); + need_remove.push_back(Call); + } else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.y") { + auto block_size_addr = M->getGlobalVariable("block_size_y"); + IRBuilder<> builder(context); + builder.SetInsertPoint(Call); + auto val = builder.CreateLoad(block_size_addr); + Call->replaceAllUsesWith(val); + need_remove.push_back(Call); + } else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.z") { + auto block_size_addr = M->getGlobalVariable("block_size_z"); + IRBuilder<> builder(context); + builder.SetInsertPoint(Call); + auto val = builder.CreateLoad(block_size_addr); + Call->replaceAllUsesWith(val); + need_remove.push_back(Call); + } else if (func_name == "llvm.nvvm.read.ptx.sreg.tid.x" || + func_name == "_ZN26__cuda_builtin_threadIdx_t17__fetch_" + 
"builtin_xEv") { // replace it by warp_id + IRBuilder<> builder(context); builder.SetInsertPoint(Call); @@ -234,12 +312,11 @@ void replace_built_in_function(llvm::Module *M, int *grid_dim, int *block_dim) { thread_idx = builder.CreateBinOp( Instruction::Add, builder.CreateLoad(local_intra_warp_idx), thread_idx, "thread_idx"); - if (block_dim[1] != 1 || block_dim[2] != 1) { - printf("block y: %d block z: %d\n", block_dim[1], block_dim[2]); - thread_idx = builder.CreateBinOp( - Instruction::SRem, thread_idx, - ConstantInt::get(I32, block_dim[0]), "thread_id_x"); - } + + thread_idx = builder.CreateBinOp( + Instruction::SRem, thread_idx, + builder.CreateLoad(M->getGlobalVariable("block_size_x")), + "thread_id_x"); Call->replaceAllUsesWith(thread_idx); need_remove.push_back(Call); @@ -257,63 +334,61 @@ void replace_built_in_function(llvm::Module *M, int *grid_dim, int *block_dim) { // tidy = tid / block_dim.x thread_idx = builder.CreateBinOp( Instruction::SDiv, thread_idx, - ConstantInt::get(I32, block_dim[0]), - // builder.CreateLoad(M->getGlobalVariable("block_size_x")), + builder.CreateLoad(M->getGlobalVariable("block_size_x")), "thread_id_y"); - Call->replaceAllUsesWith(thread_idx); need_remove.push_back(Call); } else if (func_name == "llvm.nvvm.read.ptx.sreg.tid.z") { - printf("[WARNING] We DO NOT support multi-dim block\n"); + printf("[WARNING] We DO NOT support triple-dim block\n"); + exit(1); auto zero = ConstantInt::get(I32, 0); Call->replaceAllUsesWith(zero); need_remove.push_back(Call); - } else if (func_name == "llvm.nvvm.read.ptx.sreg.ctaid.x") { - auto block_index_addr = M->getGlobalVariable("block_index"); + } else if (func_name == "llvm.nvvm.read.ptx.sreg.ctaid.x" || + func_name == "_ZN25__cuda_builtin_blockIdx_t17__fetch_" + "builtin_xEv") { + auto block_index_addr = M->getGlobalVariable("block_index_x"); IRBuilder<> builder(context); builder.SetInsertPoint(Call); auto block_idx = builder.CreateLoad(block_index_addr); 
Call->replaceAllUsesWith(block_idx); need_remove.push_back(Call); - } else if (func_name == "llvm.nvvm.read.ptx.sreg.ctaid.y" || - func_name == "llvm.nvvm.read.ptx.sreg.ctaid.z") { - printf("[WARNING We DO NOT support multi-dim grid\n"); - auto zero = ConstantInt::get(I32, 0); - Call->replaceAllUsesWith(zero); - need_remove.push_back(Call); - } else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.x") { - auto block_size_addr = M->getGlobalVariable("block_size_x"); + } else if (func_name == "llvm.nvvm.read.ptx.sreg.ctaid.y") { + auto block_index_addr = M->getGlobalVariable("block_index_y"); IRBuilder<> builder(context); builder.SetInsertPoint(Call); - auto block_size = ConstantInt::get(I32, block_dim[0]); - Call->replaceAllUsesWith(block_size); + auto block_idx = builder.CreateLoad(block_index_addr); + Call->replaceAllUsesWith(block_idx); need_remove.push_back(Call); - } else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.y") { - auto block_size_addr = M->getGlobalVariable("block_size_y"); + } else if (func_name == "llvm.nvvm.read.ptx.sreg.ctaid.z") { + auto block_index_addr = M->getGlobalVariable("block_index_z"); IRBuilder<> builder(context); builder.SetInsertPoint(Call); - auto block_size = ConstantInt::get(I32, block_dim[1]); - Call->replaceAllUsesWith(block_size); + auto block_idx = builder.CreateLoad(block_index_addr); + Call->replaceAllUsesWith(block_idx); need_remove.push_back(Call); - } else if (func_name == "llvm.nvvm.read.ptx.sreg.ntid.z") { - auto block_size_addr = M->getGlobalVariable("block_size_z"); + } else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.x" || + func_name == "_ZN24__cuda_builtin_gridDim_t17__fetch_" + "builtin_xEv") { + auto grid_size_addr = M->getGlobalVariable("grid_size_x"); IRBuilder<> builder(context); builder.SetInsertPoint(Call); - auto block_size = ConstantInt::get(I32, block_dim[2]); - Call->replaceAllUsesWith(block_size); - need_remove.push_back(Call); - } else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.x") { - auto 
grid_size_addr = M->getGlobalVariable("grid_size"); - IRBuilder<> builder(context); - builder.SetInsertPoint(Call); - auto grid_size = ConstantInt::get(I32, grid_dim[0]); + auto grid_size = builder.CreateLoad(grid_size_addr); Call->replaceAllUsesWith(grid_size); need_remove.push_back(Call); - } else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.y" || - func_name == "llvm.nvvm.read.ptx.sreg.nctaid.z") { - printf("[WARNING We DO NOT support multi-dim grid\n"); - auto one = ConstantInt::get(I32, 1); - Call->replaceAllUsesWith(one); + } else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.y") { + auto grid_size_addr = M->getGlobalVariable("grid_size_y"); + IRBuilder<> builder(context); + builder.SetInsertPoint(Call); + auto grid_size = builder.CreateLoad(grid_size_addr); + Call->replaceAllUsesWith(grid_size); + need_remove.push_back(Call); + } else if (func_name == "llvm.nvvm.read.ptx.sreg.nctaid.z") { + auto grid_size_addr = M->getGlobalVariable("grid_size_z"); + IRBuilder<> builder(context); + builder.SetInsertPoint(Call); + auto grid_size = builder.CreateLoad(grid_size_addr); + Call->replaceAllUsesWith(grid_size); need_remove.push_back(Call); } } @@ -334,6 +409,98 @@ void replace_built_in_function(llvm::Module *M, int *grid_dim, int *block_dim) { } } } + for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { + Function *F = &(*i); + for (auto BB = F->begin(); BB != F->end(); ++BB) { + for (auto BI = BB->begin(); BI != BB->end(); BI++) { + if (auto Call = dyn_cast(BI)) { + if (Call->getCalledFunction()) { + auto func_name = Call->getCalledFunction()->getName().str(); + auto callFn = Call->getCalledFunction(); + if (func_name == "vprintf") { + /* + * replace CUDA's printf to C's printf + * CUDA: + * %0 = tail call i32 @vprintf(i8* getelementptr inbounds ([19 x + * i8], [19 x i8]* @.str, i64 0, i64 0), i8* null) + * C: %call1 = call i32 (i8*, ...) 
@printf(i8* getelementptr + * inbounds ([45 x i8], [45 x i8]* @.str.1, i64 0, i64 0)) + */ + // find/create C's printf function + std::vector args; + args.push_back(llvm::Type::getInt8PtrTy(context)); + llvm::FunctionType *printfType = + FunctionType::get(I32, args, true); + + llvm::FunctionCallee _f = + M->getOrInsertFunction("printf", printfType); + llvm::Function *func_printf = + llvm::cast(_f.getCallee()); + // construct argument(s) + std::vector printf_args; + // first argument is same between CUDA and C + auto placeholder = Call->getArgOperand(0); + printf_args.push_back(placeholder); + // insert arguments + auto compressed_args = Call->getArgOperand(1); + if (auto BC = dyn_cast(compressed_args)) { + auto src_alloc = BC->getOperand(0); + auto SrcPointTy = + dyn_cast(BC->getOperand(0)->getType()); + auto SrcTy = SrcPointTy->getElementType(); + // reverse the bitcast + auto reverse_BC = new BitCastInst(BC, SrcPointTy, "", Call); + assert(SrcTy->isStructTy() == 1); + auto StructTy = dyn_cast(SrcTy); + for (int i = 0; i < StructTy->getNumElements(); i++) { + std::vector Indices; + Indices.push_back(ConstantInt::get(I32, 0)); + Indices.push_back(ConstantInt::get(I32, i)); + auto new_GEP = GetElementPtrInst::Create(NULL, // Pointee type + src_alloc, // Alloca + Indices, // Indices + "", Call); + auto new_load = new LoadInst(new_GEP, "", Call); + printf_args.push_back(new_load); + } + } + auto c_printf_inst = + llvm::CallInst::Create(func_printf, printf_args, "", Call); + // insert + Call->replaceAllUsesWith(c_printf_inst); + need_remove.push_back(Call); + } else if (func_name == "__nv_fast_log2f" || + func_name == "__nv_log2f" || + func_name == "__nv_fast_powf" || + func_name == "__nv_powf" || func_name == "__nv_logf" || + func_name == "__nv_expf" || func_name == "__nv_fabsf" || + func_name == "__nv_log10f" || + func_name == "__nv_fmodf" || func_name == "__nv_sqrt" || + func_name == "__nv_sqrtf" || func_name == "__nv_exp" || + func_name == "__nv_isnanf" || + 
func_name == "__nv_isinff" || func_name == "__nv_powi" || + func_name == "__nv_powif") { + Call->getCalledFunction()->deleteBody(); + } else if (func_name == "llvm.nvvm.fma.rn.d") { + Call->getCalledFunction()->setName("__nvvm_fma_rn_d"); + } else if (func_name == "llvm.nvvm.d2i.lo") { + Call->getCalledFunction()->setName("__nvvm_d2i_lo"); + } else if (func_name == "llvm.nvvm.d2i.hi") { + Call->getCalledFunction()->setName("__nvvm_d2i_hi"); + } else if (func_name == "llvm.nvvm.add.rn.d") { + Call->getCalledFunction()->setName("__nvvm_add_rn_d"); + } else if (func_name == "llvm.nvvm.lohi.i2d") { + Call->getCalledFunction()->setName("__nvvm_lohi_i2d"); + } else if (func_name == "llvm.nvvm.fabs.f") { + Call->getCalledFunction()->setName("__nvvm_fabs_f"); + } else if (func_name == "llvm.nvvm.mul24.i") { + Call->getCalledFunction()->setName("__nvvm_mul24_i"); + } + } + } + } + } + } for (auto inst : need_remove) { inst->eraseFromParent(); @@ -382,6 +549,8 @@ bool has_warp_barrier(llvm::BasicBlock *B) { Instruction *inst = &(*i); llvm::CallInst *Call = llvm::dyn_cast(inst); if (Call) { + if (Call->isInlineAsm()) + continue; auto func_name = Call->getCalledFunction()->getName().str(); if (func_name == "llvm.nvvm.bar.warp.sync") { return true; @@ -396,6 +565,8 @@ bool has_barrier(llvm::BasicBlock *B) { Instruction *inst = &(*i); llvm::CallInst *Call = llvm::dyn_cast(inst); if (Call) { + if (Call->isInlineAsm()) + continue; auto func_name = Call->getCalledFunction()->getName().str(); if (func_name == "llvm.nvvm.barrier0" || func_name == "llvm.nvvm.bar.warp.sync" || @@ -412,6 +583,8 @@ bool has_block_barrier(llvm::BasicBlock *B) { Instruction *inst = &(*i); llvm::CallInst *Call = llvm::dyn_cast(inst); if (Call) { + if (Call->isInlineAsm()) + continue; auto func_name = Call->getCalledFunction()->getName().str(); if (func_name == "llvm.nvvm.barrier0" || func_name == "llvm.nvvm.barrier.sync") { @@ -478,3 +651,21 @@ bool find_barrier_in_region(llvm::BasicBlock *start, 
llvm::BasicBlock *end) { } return 0; } + +/* + Print IR to String Output for Debugging Purposes +*/ +// void printModule(llvm::Module *M) { +// std::string str; +// llvm::raw_string_ostream ss(str); +// std::cout << "### Printing Module ###" << std::endl; +// for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { +// Function *F = &(*i); +// auto func_name = F->getName().str(); +// std::cout << func_name << std::endl; +// for (Function::iterator b = F->begin(); b != F->end(); ++b) { +// BasicBlock *B = &(*b); +// errs() << *B; +// } +// } +// } diff --git a/compilation/KernelTranslation/lib/warp_func.cpp b/compilation/KernelTranslation/lib/warp_func.cpp index 9708d74..a25979a 100644 --- a/compilation/KernelTranslation/lib/warp_func.cpp +++ b/compilation/KernelTranslation/lib/warp_func.cpp @@ -44,6 +44,8 @@ void handle_warp_vote(llvm::Module *M) { for (Function::iterator E = F->end(); I != E; ++I) { for (BasicBlock::iterator BI = I->begin(); BI != I->end(); BI++) { if (CallInst *vote_any_sync = dyn_cast(BI)) { + if (vote_any_sync->isInlineAsm()) + continue; auto func_name = vote_any_sync->getCalledFunction()->getName(); if (func_name == "llvm.nvvm.vote.any.sync" || func_name == "llvm.nvvm.vote.all.sync") { diff --git a/compilation/examples/reduce/host.cpp b/compilation/examples/reduce/host.cpp deleted file mode 100644 index 297ff17..0000000 --- a/compilation/examples/reduce/host.cpp +++ /dev/null @@ -1,82 +0,0 @@ -#include -#include -#include -#include -#include - -#define NUM_WARP 2 -#define NUM_BLOCK 1 - -int block_size = 32 * NUM_WARP; -int block_size_x = block_size; -int block_size_y = 1; -int block_size_z = 1; -__thread int block_index = 0; -int grid_size = NUM_BLOCK; - -extern "C" { -void *_Z7reduce0PiS_j_wrapper(void *); -__thread int warp_shfl[32]; -} - -void *wrap(void *p) { - int **res = (int **)p; - block_index = (*(int *)res[3]); - _Z7reduce0PiS_j_wrapper(p); - return NULL; -} - -void *gen_input(int bid, int *g_idata, int *g_odata, unsigned 
int n) { - int **ret = new int *[4]; - - int **p0 = new int *; - *p0 = g_idata; - ret[0] = (int *)(p0); - - int **p1 = new int *; - *p1 = g_odata; - ret[1] = (int *)(p1); - - unsigned int *p2 = new unsigned int; - *p2 = n; - ret[2] = (int *)p2; - - int *p3 = new int; - *p3 = bid; - ret[3] = (int *)p3; - - return (void *)ret; -} - -int main(int argc, char *argv[]) { - int *g_idata; - - int size = block_size * NUM_BLOCK; - g_idata = new int[size * 2]; - int *res = new int[size]; - - for (int i = 0; i < size; i++) { - g_idata[i] = i; - } - - pthread_t threads[NUM_BLOCK]; - - void *inp[NUM_BLOCK]; - for (long t = 0; t < NUM_BLOCK; t++) { - inp[t] = gen_input(t, g_idata, res, size); - } - - for (long t = 0; t < NUM_BLOCK; t++) { - pthread_create(&threads[t], NULL, wrap, inp[t]); - } - for (long t = 0; t < NUM_BLOCK; t++) - pthread_join(threads[t], NULL); - int gold = 0; - for (int i = 0; i < size; i++) { - gold += g_idata[i]; - } - assert(*res == gold && "Incorrect res\n"); - printf("PASS\n"); - - pthread_exit(NULL); -} diff --git a/compilation/examples/reduce/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll b/compilation/examples/reduce/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll deleted file mode 100644 index 50b112d..0000000 --- a/compilation/examples/reduce/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll +++ /dev/null @@ -1,150 +0,0 @@ -; ModuleID = 'kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc' -source_filename = "kernel.cu" -target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" -target triple = "nvptx64-nvidia-cuda" - -%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 } - -@_ZZ7reduce0PiS_jE5sdata = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4 - -; Function Attrs: nounwind -define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) local_unnamed_addr #0 { -entry: - ret i32 999 -} - -; Function Attrs: nounwind -define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) local_unnamed_addr #0 { 
-entry: - ret i32 999 -} - -; Function Attrs: nounwind -define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) local_unnamed_addr #0 { -entry: - ret i32 999 -} - -; Function Attrs: nounwind -define weak dso_local i32 @cudaGetDevice(i32* %device) local_unnamed_addr #0 { -entry: - ret i32 999 -} - -; Function Attrs: nounwind -define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) local_unnamed_addr #0 { -entry: - ret i32 999 -} - -; Function Attrs: nounwind -define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) local_unnamed_addr #0 { -entry: - ret i32 999 -} - -; Function Attrs: convergent nounwind -define dso_local void @_Z7reduce0PiS_j(i32* nocapture readonly %g_idata, i32* nocapture %g_odata, i32 %n) local_unnamed_addr #1 { -entry: - %0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #4, !range !10 - %1 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #4, !range !11 - %2 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #4, !range !12 - %mul = mul i32 %2, %1 - %add = add i32 %mul, %0 - %cmp = icmp ult i32 %add, %n - br i1 %cmp, label %cond.true, label %cond.end - -cond.true: ; preds = %entry - %idxprom = zext i32 %add to i64 - %arrayidx = getelementptr inbounds i32, i32* %g_idata, i64 %idxprom - %3 = load i32, i32* %arrayidx, align 4, !tbaa !13 - br label %cond.end - -cond.end: ; preds = %entry, %cond.true - %cond = phi i32 [ %3, %cond.true ], [ 0, %entry ] - %idxprom5 = zext i32 %0 to i64 - %arrayidx635 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @_ZZ7reduce0PiS_jE5sdata, i64 0, i64 %idxprom5 - %arrayidx6 = addrspacecast i32 addrspace(3)* %arrayidx635 to i32* - store i32 %cond, i32* %arrayidx6, align 4, !tbaa !13 - tail call void @llvm.nvvm.barrier.sync(i32 0) #4 - %cmp839 = icmp ugt i32 %2, 1 - br i1 %cmp839, label 
%for.body, label %for.cond.cleanup - -for.cond.cleanup: ; preds = %if.end, %cond.end - %cmp18 = icmp eq i32 %0, 0 - br i1 %cmp18, label %if.then19, label %if.end23 - -for.body: ; preds = %cond.end, %if.end - %s.040 = phi i32 [ %mul9, %if.end ], [ 1, %cond.end ] - %mul9 = shl nuw nsw i32 %s.040, 1 - %rem = urem i32 %0, %mul9 - %cmp10 = icmp eq i32 %rem, 0 - br i1 %cmp10, label %if.then, label %if.end - -if.then: ; preds = %for.body - %add11 = add i32 %s.040, %0 - %idxprom12 = zext i32 %add11 to i64 - %arrayidx1336 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @_ZZ7reduce0PiS_jE5sdata, i64 0, i64 %idxprom12 - %arrayidx13 = addrspacecast i32 addrspace(3)* %arrayidx1336 to i32* - %4 = load i32, i32* %arrayidx13, align 4, !tbaa !13 - %5 = load i32, i32* %arrayidx6, align 4, !tbaa !13 - %add16 = add nsw i32 %5, %4 - store i32 %add16, i32* %arrayidx6, align 4, !tbaa !13 - br label %if.end - -if.end: ; preds = %if.then, %for.body - tail call void @llvm.nvvm.barrier.sync(i32 0) #4 - %cmp8 = icmp ult i32 %mul9, %2 - br i1 %cmp8, label %for.body, label %for.cond.cleanup - -if.then19: ; preds = %for.cond.cleanup - %idxprom21 = zext i32 %1 to i64 - %arrayidx22 = getelementptr inbounds i32, i32* %g_odata, i64 %idxprom21 - %6 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* addrspacecast ([64 x i32] addrspace(3)* @_ZZ7reduce0PiS_jE5sdata to [64 x i32]*), i64 0, i64 0), align 4, !tbaa !13 - store i32 %6, i32* %arrayidx22, align 4, !tbaa !13 - br label %if.end23 - -if.end23: ; preds = %if.then19, %for.cond.cleanup - ret void -} - -; Function Attrs: nounwind readnone -declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 - -; Function Attrs: nounwind readnone -declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 - -; Function Attrs: nounwind readnone -declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2 - -; Function Attrs: convergent nounwind -declare void @llvm.nvvm.barrier.sync(i32) #3 - -attributes #0 = { nounwind 
"correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { nounwind readnone } -attributes #3 = { convergent nounwind } -attributes #4 = { nounwind } - -!llvm.module.flags = !{!0, !1, !2} -!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6} -!llvm.ident = !{!8} -!nvvmir.version = !{!9} - -!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]} -!1 = !{i32 1, !"wchar_size", i32 4} -!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0} -!3 = !{void (i32*, i32*, i32)* @_Z7reduce0PiS_j, !"kernel", i32 1} -!4 = !{null, !"align", i32 8} -!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} -!6 = !{null, !"align", i32 16} -!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} -!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"} -!9 = !{i32 1, i32 4} -!10 = !{i32 0, i32 1024} -!11 = !{i32 0, i32 2147483647} -!12 = !{i32 1, i32 1025} -!13 = !{!14, !14, i64 0} -!14 = !{!"int", !15, i64 0} -!15 = !{!"omnipotent char", !16, i64 0} -!16 = !{!"Simple C++ TBAA"} diff --git a/compilation/examples/reduce/run.sh 
b/compilation/examples/reduce/run.sh deleted file mode 100644 index 93cd3fd..0000000 --- a/compilation/examples/reduce/run.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash -llvm-as kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll -../../../build/compilation/kernelTranslator kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc 1 1 1 64 1 1 -llc --filetype=obj kernel.bc -g++ host.cpp kernel.o -lpthread -o test -./test diff --git a/compilation/examples/reduce_shuffle/host.cpp b/compilation/examples/reduce_shuffle/host.cpp deleted file mode 100644 index 41c5ae8..0000000 --- a/compilation/examples/reduce_shuffle/host.cpp +++ /dev/null @@ -1,82 +0,0 @@ -#include -#include -#include -#include -#include - -#define NUM_WARP 2 -#define NUM_BLOCK 1 - -int block_size = 32 * NUM_WARP; -int block_size_x = block_size; -int block_size_y = 1; -int block_size_z = 1; -__thread int block_index = 0; -int grid_size = NUM_BLOCK; - -extern "C" { -void *_Z7reduce5PiS_j_wrapper(void *); -__thread int warp_shfl[32]; -} - -void *wrap(void *p) { - int **res = (int **)p; - block_index = (*(int *)res[3]); - _Z7reduce5PiS_j_wrapper(p); - return NULL; -} - -void *gen_input(int bid, int *g_idata, int *g_odata, unsigned int n) { - int **ret = new int *[4]; - - int **p0 = new int *; - *p0 = g_idata; - ret[0] = (int *)(p0); - - int **p1 = new int *; - *p1 = g_odata; - ret[1] = (int *)(p1); - - unsigned int *p2 = new unsigned int; - *p2 = n; - ret[2] = (int *)p2; - - int *p3 = new int; - *p3 = bid; - ret[3] = (int *)p3; - - return (void *)ret; -} - -int main(int argc, char *argv[]) { - int *g_idata; - - int size = block_size * NUM_BLOCK; - g_idata = new int[size * 2]; - int *res = new int[size]; - - for (int i = 0; i < size; i++) { - g_idata[i] = i; - } - - pthread_t threads[NUM_BLOCK]; - - void *inp[NUM_BLOCK]; - for (long t = 0; t < NUM_BLOCK; t++) { - inp[t] = gen_input(t, g_idata, res, size); - } - - for (long t = 0; t < NUM_BLOCK; t++) { - pthread_create(&threads[t], NULL, wrap, inp[t]); - } - for (long t = 0; 
t < NUM_BLOCK; t++) - pthread_join(threads[t], NULL); - int gold = 0; - for (int i = 0; i < size; i++) { - gold += g_idata[i]; - } - assert(*res == gold && "Incorrect res\n"); - printf("PASS\n"); - - pthread_exit(NULL); -} diff --git a/compilation/examples/reduce_shuffle/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll b/compilation/examples/reduce_shuffle/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll deleted file mode 100644 index 7d056fd..0000000 --- a/compilation/examples/reduce_shuffle/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll +++ /dev/null @@ -1,179 +0,0 @@ -; ModuleID = 'kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc' -source_filename = "kernel.cu" -target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" -target triple = "nvptx64-nvidia-cuda" - -%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 } - -@_ZZ7reduce5PiS_jE5sdata = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4 - -; Function Attrs: nounwind -define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) local_unnamed_addr #0 { -entry: - ret i32 999 -} - -; Function Attrs: nounwind -define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) local_unnamed_addr #0 { -entry: - ret i32 999 -} - -; Function Attrs: nounwind -define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) local_unnamed_addr #0 { -entry: - ret i32 999 -} - -; Function Attrs: nounwind -define weak dso_local i32 @cudaGetDevice(i32* %device) local_unnamed_addr #0 { -entry: - ret i32 999 -} - -; Function Attrs: nounwind -define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) local_unnamed_addr #0 { -entry: - ret i32 999 -} - -; Function Attrs: nounwind -define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) local_unnamed_addr #0 { -entry: - ret i32 999 -} 
- -; Function Attrs: convergent nounwind -define dso_local void @_Z7reduce5PiS_j(i32* nocapture readonly %g_idata, i32* nocapture %g_odata, i32 %n) local_unnamed_addr #1 { -entry: - %0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #5, !range !10 - %1 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #5, !range !11 - %mul = shl i32 %1, 7 - %add = add i32 %mul, %0 - %cmp = icmp ult i32 %add, %n - br i1 %cmp, label %cond.true, label %cond.end - -cond.true: ; preds = %entry - %idxprom = zext i32 %add to i64 - %arrayidx = getelementptr inbounds i32, i32* %g_idata, i64 %idxprom - %2 = load i32, i32* %arrayidx, align 4, !tbaa !12 - br label %cond.end - -cond.end: ; preds = %entry, %cond.true - %cond = phi i32 [ %2, %cond.true ], [ 0, %entry ] - %add4 = add i32 %add, 64 - %cmp5 = icmp ult i32 %add4, %n - br i1 %cmp5, label %if.then, label %if.end - -if.then: ; preds = %cond.end - %idxprom7 = zext i32 %add4 to i64 - %arrayidx8 = getelementptr inbounds i32, i32* %g_idata, i64 %idxprom7 - %3 = load i32, i32* %arrayidx8, align 4, !tbaa !12 - %add9 = add nsw i32 %3, %cond - br label %if.end - -if.end: ; preds = %if.then, %cond.end - %mySum.0 = phi i32 [ %add9, %if.then ], [ %cond, %cond.end ] - %idxprom10 = zext i32 %0 to i64 - %arrayidx1150 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @_ZZ7reduce5PiS_jE5sdata, i64 0, i64 %idxprom10 - %arrayidx11 = addrspacecast i32 addrspace(3)* %arrayidx1150 to i32* - store i32 %mySum.0, i32* %arrayidx11, align 4, !tbaa !12 - tail call void @llvm.nvvm.barrier.sync(i32 0) #5 - tail call void @llvm.nvvm.barrier.sync(i32 0) #5 - tail call void @llvm.nvvm.barrier.sync(i32 0) #5 - tail call void @llvm.nvvm.barrier.sync(i32 0) #5 - %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.z() #5, !range !16 - %5 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.y() #5, !range !17 - %mul.i.i52 = mul nuw nsw i32 %5, %4 - %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #5, !range !17 - %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.y() 
#5, !range !10 - %mul39.i.i53 = add nuw nsw i32 %7, %mul.i.i52 - %add.i.i54 = mul nuw nsw i32 %mul39.i.i53, %6 - %add8.i.i55 = add nuw nsw i32 %add.i.i54, %0 - %cmp14 = icmp ult i32 %add8.i.i55, 32 - br i1 %cmp14, label %if.then15, label %if.end32 - -if.then15: ; preds = %if.end - %add16 = add nuw nsw i32 %0, 32 - %idxprom17 = zext i32 %add16 to i64 - %arrayidx1851 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @_ZZ7reduce5PiS_jE5sdata, i64 0, i64 %idxprom17 - %arrayidx18 = addrspacecast i32 addrspace(3)* %arrayidx1851 to i32* - %8 = load i32, i32* %arrayidx18, align 4, !tbaa !12 - %add19 = add nsw i32 %8, %mySum.0 - %9 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add19, i32 16, i32 31) #5 - %add23 = add nsw i32 %9, %add19 - %10 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add23, i32 8, i32 31) #5 - %add23.1 = add nsw i32 %10, %add23 - %11 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add23.1, i32 4, i32 31) #5 - %add23.2 = add nsw i32 %11, %add23.1 - %12 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add23.2, i32 2, i32 31) #5 - %add23.3 = add nsw i32 %12, %add23.2 - %13 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %add23.3, i32 1, i32 31) #5 - %cmp27 = icmp eq i32 %add8.i.i55, 0 - br i1 %cmp27, label %if.then28, label %if.end32 - -if.then28: ; preds = %if.then15 - %add23.4 = add nsw i32 %13, %add23.3 - %idxprom30 = zext i32 %1 to i64 - %arrayidx31 = getelementptr inbounds i32, i32* %g_odata, i64 %idxprom30 - store i32 %add23.4, i32* %arrayidx31, align 4, !tbaa !12 - br label %if.end32 - -if.end32: ; preds = %if.end, %if.then28, %if.then15 - ret void -} - -; Function Attrs: nounwind readnone -declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 - -; Function Attrs: nounwind readnone -declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 - -; Function Attrs: nounwind readnone -declare i32 @llvm.nvvm.read.ptx.sreg.tid.z() #2 - -; Function Attrs: nounwind readnone -declare i32 
@llvm.nvvm.read.ptx.sreg.ntid.y() #2 - -; Function Attrs: nounwind readnone -declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2 - -; Function Attrs: nounwind readnone -declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #2 - -; Function Attrs: convergent nounwind -declare void @llvm.nvvm.barrier.sync(i32) #3 - -; Function Attrs: convergent inaccessiblememonly nounwind -declare i32 @llvm.nvvm.shfl.sync.down.i32(i32, i32, i32, i32) #4 - -attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { nounwind readnone } -attributes #3 = { convergent nounwind } -attributes #4 = { convergent inaccessiblememonly nounwind } -attributes #5 = { nounwind } - -!llvm.module.flags = !{!0, !1, !2} -!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6} -!llvm.ident = !{!8} -!nvvmir.version = !{!9} - -!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]} -!1 = !{i32 1, !"wchar_size", i32 4} -!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0} -!3 = !{void (i32*, i32*, i32)* @_Z7reduce5PiS_j, !"kernel", i32 1} -!4 = !{null, !"align", i32 8} -!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 
131080} -!6 = !{null, !"align", i32 16} -!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} -!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"} -!9 = !{i32 1, i32 4} -!10 = !{i32 0, i32 1024} -!11 = !{i32 0, i32 2147483647} -!12 = !{!13, !13, i64 0} -!13 = !{!"int", !14, i64 0} -!14 = !{!"omnipotent char", !15, i64 0} -!15 = !{!"Simple C++ TBAA"} -!16 = !{i32 0, i32 64} -!17 = !{i32 1, i32 1025} diff --git a/compilation/examples/reduce_shuffle/run.sh b/compilation/examples/reduce_shuffle/run.sh deleted file mode 100644 index 93cd3fd..0000000 --- a/compilation/examples/reduce_shuffle/run.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash -llvm-as kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll -../../../build/compilation/kernelTranslator kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc 1 1 1 64 1 1 -llc --filetype=obj kernel.bc -g++ host.cpp kernel.o -lpthread -o test -./test diff --git a/compilation/examples/run_example.sh b/compilation/examples/run_example.sh deleted file mode 100644 index 8600e28..0000000 --- a/compilation/examples/run_example.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!bin/sh -for file in ./* -do - if test -d $file - then - echo executing $file - cd $file - bash run.sh - cd .. 
- fi -done diff --git a/compilation/examples/vecadd/host.cpp b/compilation/examples/vecadd/host.cpp deleted file mode 100644 index 153d7d5..0000000 --- a/compilation/examples/vecadd/host.cpp +++ /dev/null @@ -1,84 +0,0 @@ -#include -#include -#include -#include -#include - -#define NUM_BLOCK 1 -int N = 32; - -int block_size = 32; -int block_size_x = block_size; -int block_size_y = 1; -int block_size_z = 1; -__thread int block_index = 0; -int grid_size = NUM_BLOCK; - -extern "C" { -void *_Z9vectorAddPKfS0_Pfi_wrapper(void *); -} - -void *wrap(void *p) { - int **res = (int **)p; - block_index = (*(int *)res[4]); - _Z9vectorAddPKfS0_Pfi_wrapper(p); - return NULL; -} - -void *gen_input(int bid, float *A, float *B, float *C, int N) { - int **ret = new int *[5]; - - float **p0 = new float *; - *p0 = A; - ret[0] = (int *)(p0); - - float **p1 = new float *; - *p1 = B; - ret[1] = (int *)(p1); - - float **p2 = new float *; - *p2 = C; - ret[2] = (int *)(p2); - - int *p3 = new int; - *p3 = N; - ret[3] = (int *)p3; - - int *p4 = new int; - *p4 = bid; - ret[4] = (int *)p4; - - return (void *)ret; -} - -int main() { - float *A, *B, *C; - - A = new float[N]; - B = new float[N]; - C = new float[N]; - - for (int i = 0; i < N; i++) { - A[i] = i; - B[i] = 1; - C[i] = 0; - } - - pthread_t threads[NUM_BLOCK]; - - int rc; - for (long t = 0; t < NUM_BLOCK; t++) { - void *inp = gen_input(t, A, B, C, N); - rc = pthread_create(&threads[t], NULL, wrap, inp); - } - clock_t t1 = clock(); - /* Last thing that main() should do */ - for (long t = 0; t < NUM_BLOCK; t++) - pthread_join(threads[t], NULL); - - for (int i = 0; i < N; i++) { - assert(C[i] == (A[i] + B[i])); - } - printf("PASS\n"); - pthread_exit(NULL); -} diff --git a/compilation/examples/vecadd/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll b/compilation/examples/vecadd/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll deleted file mode 100644 index 57d6b64..0000000 --- a/compilation/examples/vecadd/kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll +++ 
/dev/null @@ -1,86 +0,0 @@ -; ModuleID = 'kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc' -source_filename = "kernel.cu" -target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" -target triple = "nvptx64-nvidia-cuda" - -%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 } - -; Function Attrs: nounwind -define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) local_unnamed_addr #0 { -entry: - ret i32 999 -} - -; Function Attrs: nounwind -define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) local_unnamed_addr #0 { -entry: - ret i32 999 -} - -; Function Attrs: nounwind -define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) local_unnamed_addr #0 { -entry: - ret i32 999 -} - -; Function Attrs: nounwind -define weak dso_local i32 @cudaGetDevice(i32* %device) local_unnamed_addr #0 { -entry: - ret i32 999 -} - -; Function Attrs: nounwind -define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) local_unnamed_addr #0 { -entry: - ret i32 999 -} - -; Function Attrs: nounwind -define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) local_unnamed_addr #0 { -entry: - ret i32 999 -} - -; Function Attrs: nofree nounwind -define dso_local void @_Z9vectorAddPKfS0_Pfi(float* nocapture readonly %A, float* nocapture readonly %B, float* nocapture %C, i32 %numElements) local_unnamed_addr #1 { -entry: - %0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3, !range !10 - %idxprom8 = zext i32 %0 to i64 - %arrayidx = getelementptr inbounds float, float* %A, i64 %idxprom8 - %1 = load float, float* %arrayidx, align 4, !tbaa !11 - %arrayidx2 = getelementptr inbounds float, float* %B, i64 %idxprom8 - %2 = load float, float* %arrayidx2, align 4, !tbaa !11 - %add = fadd contract float %1, %2 - %arrayidx4 = 
getelementptr inbounds float, float* %C, i64 %idxprom8 - store float %add, float* %arrayidx4, align 4, !tbaa !11 - ret void -} - -; Function Attrs: nounwind readnone -declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 - -attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nofree nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { nounwind readnone } -attributes #3 = { nounwind } - -!llvm.module.flags = !{!0, !1, !2} -!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6} -!llvm.ident = !{!8} -!nvvmir.version = !{!9} - -!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]} -!1 = !{i32 1, !"wchar_size", i32 4} -!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0} -!3 = !{void (float*, float*, float*, i32)* @_Z9vectorAddPKfS0_Pfi, !"kernel", i32 1} -!4 = !{null, !"align", i32 8} -!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} -!6 = !{null, !"align", i32 16} -!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} -!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"} -!9 = !{i32 1, i32 4} -!10 = !{i32 0, i32 1024} -!11 = !{!12, !12, i64 
0} -!12 = !{!"float", !13, i64 0} -!13 = !{!"omnipotent char", !14, i64 0} -!14 = !{!"Simple C++ TBAA"} diff --git a/compilation/examples/vecadd/run.sh b/compilation/examples/vecadd/run.sh deleted file mode 100644 index 973a99c..0000000 --- a/compilation/examples/vecadd/run.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash -llvm-as kernel-cuda-nvptx64-nvidia-cuda-sm_61.ll -../../../build/compilation/kernelTranslator kernel-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc 1 1 1 32 1 1 -llc --filetype=obj kernel.bc -g++ host.cpp kernel.o -lpthread -o test -./test diff --git a/docs/figures/workflow.png b/docs/figures/workflow.png deleted file mode 100644 index 3019f92..0000000 Binary files a/docs/figures/workflow.png and /dev/null differ diff --git a/docs/workflow.md b/docs/workflow.md deleted file mode 100644 index cef8328..0000000 --- a/docs/workflow.md +++ /dev/null @@ -1,11 +0,0 @@ -# The workflow of CuPBoP - -The workflow of CuPBoP is described as following: -![The workflow of executing CUDA applications on CuPBoP.](figures/workflow.png) -First, CuPBoP uses Clang to compile the CUDA source code into NVVM IR, -which consists of two parts: Host part and Kernel Part. -In the next step, CuPBoP-compilation parses and transforms these NVVM IRs -to make it suitable for executing on specific architectures. -The CuPBoP-runtime compiles the transformed Host IR and executes the generated programs, -which will compile the transformed Kernel IR and -upload the compiled kernel programs to specific architectures. diff --git a/examples/backprop/backprop.c b/examples/backprop/backprop.c new file mode 100644 index 0000000..e6f8f5f --- /dev/null +++ b/examples/backprop/backprop.c @@ -0,0 +1,454 @@ +#include "backprop.h" +#include +#include +#include + +//#define OPEN + +#define ABS(x) (((x) > 0.0) ? 
(x) : (-(x))) + +#define fastcopy(to, from, len) \ + { \ + register char *_to, *_from; \ + register int _i, _l; \ + _to = (char *)(to); \ + _from = (char *)(from); \ + _l = (len); \ + for (_i = 0; _i < _l; _i++) \ + *_to++ = *_from++; \ + } + +/*** Return random number between 0.0 and 1.0 ***/ +float drnd() { return ((float)rand() / (float)BIGRND); } + +/*** Return random number between -1.0 and 1.0 ***/ +float dpn1() { return ((drnd() * 2.0) - 1.0); } + +/*** The squashing function. Currently, it's a sigmoid. ***/ + +float squash(x) +float x; +{ + float m; + // x = -x; + // m = 1 + x + x*x/2 + x*x*x/6 + x*x*x*x/24 + x*x*x*x*x/120; + // return(1.0 / (1.0 + m)); + return (1.0 / (1.0 + exp(-x))); +} + +/*** Allocate 1d array of floats ***/ + +float *alloc_1d_dbl(n) +int n; +{ + float *new; + + new = (float *)malloc((unsigned)(n * sizeof(float))); + if (new == NULL) { + printf("ALLOC_1D_DBL: Couldn't allocate array of floats\n"); + return (NULL); + } + return (new); +} + +/*** Allocate 2d array of floats ***/ + +float **alloc_2d_dbl(m, n) +int m, n; +{ + int i; + float **new; + + new = (float **)malloc((unsigned)(m * sizeof(float *))); + if (new == NULL) { + printf("ALLOC_2D_DBL: Couldn't allocate array of dbl ptrs\n"); + return (NULL); + } + + for (i = 0; i < m; i++) { + new[i] = alloc_1d_dbl(n); + } + + return (new); +} + +bpnn_randomize_weights(w, m, n) float **w; +int m, n; +{ + int i, j; + + for (i = 0; i <= m; i++) { + for (j = 0; j <= n; j++) { + w[i][j] = (float)rand() / RAND_MAX; + // w[i][j] = dpn1(); + } + } +} + +bpnn_randomize_row(w, m) float *w; +int m; +{ + int i; + for (i = 0; i <= m; i++) { + // w[i] = (float) rand()/RAND_MAX; + w[i] = 0.1; + } +} + +bpnn_zero_weights(w, m, n) float **w; +int m, n; +{ + int i, j; + + for (i = 0; i <= m; i++) { + for (j = 0; j <= n; j++) { + w[i][j] = 0.0; + } + } +} + +void bpnn_initialize(seed) { + printf("Random number generator seed: %d\n", seed); + srand(seed); +} + +BPNN *bpnn_internal_create(n_in, n_hidden, 
n_out) +int n_in, n_hidden, n_out; +{ + BPNN *newnet; + + newnet = (BPNN *)malloc(sizeof(BPNN)); + if (newnet == NULL) { + printf("BPNN_CREATE: Couldn't allocate neural network\n"); + return (NULL); + } + + newnet->input_n = n_in; + newnet->hidden_n = n_hidden; + newnet->output_n = n_out; + newnet->input_units = alloc_1d_dbl(n_in + 1); + newnet->hidden_units = alloc_1d_dbl(n_hidden + 1); + newnet->output_units = alloc_1d_dbl(n_out + 1); + + newnet->hidden_delta = alloc_1d_dbl(n_hidden + 1); + newnet->output_delta = alloc_1d_dbl(n_out + 1); + newnet->target = alloc_1d_dbl(n_out + 1); + + newnet->input_weights = alloc_2d_dbl(n_in + 1, n_hidden + 1); + newnet->hidden_weights = alloc_2d_dbl(n_hidden + 1, n_out + 1); + + newnet->input_prev_weights = alloc_2d_dbl(n_in + 1, n_hidden + 1); + newnet->hidden_prev_weights = alloc_2d_dbl(n_hidden + 1, n_out + 1); + + return (newnet); +} + +void bpnn_free(net) BPNN *net; +{ + int n1, n2, i; + + n1 = net->input_n; + n2 = net->hidden_n; + + free((char *)net->input_units); + free((char *)net->hidden_units); + free((char *)net->output_units); + + free((char *)net->hidden_delta); + free((char *)net->output_delta); + free((char *)net->target); + + for (i = 0; i <= n1; i++) { + free((char *)net->input_weights[i]); + free((char *)net->input_prev_weights[i]); + } + free((char *)net->input_weights); + free((char *)net->input_prev_weights); + + for (i = 0; i <= n2; i++) { + free((char *)net->hidden_weights[i]); + free((char *)net->hidden_prev_weights[i]); + } + free((char *)net->hidden_weights); + free((char *)net->hidden_prev_weights); + + free((char *)net); +} + +/*** Creates a new fully-connected network from scratch, + with the given numbers of input, hidden, and output units. + Threshold units are automatically included. All weights are + randomly initialized. + Space is also allocated for temporary storage (momentum weights, + error computations, etc). 
+***/ + +BPNN *bpnn_create(n_in, n_hidden, n_out) +int n_in, n_hidden, n_out; +{ + + BPNN *newnet; + + newnet = bpnn_internal_create(n_in, n_hidden, n_out); + +#ifdef INITZERO + bpnn_zero_weights(newnet->input_weights, n_in, n_hidden); +#else + bpnn_randomize_weights(newnet->input_weights, n_in, n_hidden); +#endif + bpnn_randomize_weights(newnet->hidden_weights, n_hidden, n_out); + bpnn_zero_weights(newnet->input_prev_weights, n_in, n_hidden); + bpnn_zero_weights(newnet->hidden_prev_weights, n_hidden, n_out); + bpnn_randomize_row(newnet->target, n_out); + return (newnet); +} + +void bpnn_layerforward(l1, l2, conn, n1, n2) float *l1, *l2, **conn; +int n1, n2; +{ + float sum; + int j, k; + + /*** Set up thresholding unit ***/ + l1[0] = 1.0; +#ifdef OPEN + omp_set_num_threads(NUM_THREAD); +#pragma omp parallel for shared(conn, n1, n2, l1) private(k, j) reduction(+: sum) schedule(static) +#endif + /*** For each unit in second layer ***/ + for (j = 1; j <= n2; j++) { + + /*** Compute weighted sum of its inputs ***/ + sum = 0.0; + for (k = 0; k <= n1; k++) { + sum += conn[k][j] * l1[k]; + } + l2[j] = squash(sum); + } +} + +// extern "C" +void bpnn_output_error(delta, target, output, nj, err) float *delta, *target, + *output, *err; +int nj; +{ + int j; + float o, t, errsum; + errsum = 0.0; + for (j = 1; j <= nj; j++) { + o = output[j]; + t = target[j]; + delta[j] = o * (1.0 - o) * (t - o); + errsum += ABS(delta[j]); + } + *err = errsum; +} + +void bpnn_hidden_error(delta_h, nh, delta_o, no, who, hidden, + err) float *delta_h, + *delta_o, *hidden, **who, *err; +int nh, no; +{ + int j, k; + float h, sum, errsum; + + errsum = 0.0; + for (j = 1; j <= nh; j++) { + h = hidden[j]; + sum = 0.0; + for (k = 1; k <= no; k++) { + sum += delta_o[k] * who[j][k]; + } + delta_h[j] = h * (1.0 - h) * sum; + errsum += ABS(delta_h[j]); + } + *err = errsum; +} + +void bpnn_adjust_weights(delta, ndelta, ly, nly, w, oldw) float *delta, *ly, + **w, **oldw; +{ + float new_dw; + int k, j; + ly[0] 
= 1.0; + // eta = 0.3; + // momentum = 0.3; + +#ifdef OPEN + omp_set_num_threads(NUM_THREAD); +#pragma omp parallel for shared(oldw, w, delta) private(j, k, new_dw) \ + firstprivate(ndelta, nly, momentum) +#endif + for (j = 1; j <= ndelta; j++) { + for (k = 0; k <= nly; k++) { + new_dw = ((ETA * delta[j] * ly[k]) + (MOMENTUM * oldw[k][j])); + w[k][j] += new_dw; + oldw[k][j] = new_dw; + } + } +} + +void bpnn_feedforward(net) BPNN *net; +{ + int in, hid, out; + + in = net->input_n; + hid = net->hidden_n; + out = net->output_n; + + /*** Feed forward input activations. ***/ + bpnn_layerforward(net->input_units, net->hidden_units, net->input_weights, in, + hid); + bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights, + hid, out); +} + +void bpnn_train(net, eo, eh) BPNN *net; +float *eo, *eh; +{ + int in, hid, out; + float out_err, hid_err; + + in = net->input_n; + hid = net->hidden_n; + out = net->output_n; + + /*** Feed forward input activations. ***/ + bpnn_layerforward(net->input_units, net->hidden_units, net->input_weights, in, + hid); + bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights, + hid, out); + + /*** Compute error on output and hidden units. ***/ + bpnn_output_error(net->output_delta, net->target, net->output_units, out, + &out_err); + bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out, + net->hidden_weights, net->hidden_units, &hid_err); + *eo = out_err; + *eh = hid_err; + + /*** Adjust input and hidden weights. 
***/ + bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid, + net->hidden_weights, net->hidden_prev_weights); + bpnn_adjust_weights(net->hidden_delta, hid, net->input_units, in, + net->input_weights, net->input_prev_weights); +} + +void bpnn_save(net, filename) BPNN *net; +char *filename; +{ + int n1, n2, n3, i, j, memcnt; + float dvalue, **w; + char *mem; + /// add// + FILE *pFile; + pFile = fopen(filename, "w+"); + /////// + /* + if ((fd = creat(filename, 0644)) == -1) { + printf("BPNN_SAVE: Cannot create '%s'\n", filename); + return; + } + */ + + n1 = net->input_n; + n2 = net->hidden_n; + n3 = net->output_n; + printf("Saving %dx%dx%d network to '%s'\n", n1, n2, n3, filename); + // fflush(stdout); + + // write(fd, (char *) &n1, sizeof(int)); + // write(fd, (char *) &n2, sizeof(int)); + // write(fd, (char *) &n3, sizeof(int)); + + fwrite((char *)&n1, sizeof(char), sizeof(char), pFile); + fwrite((char *)&n2, sizeof(char), sizeof(char), pFile); + fwrite((char *)&n3, sizeof(char), sizeof(char), pFile); + + memcnt = 0; + w = net->input_weights; + mem = (char *)malloc((unsigned)((n1 + 1) * (n2 + 1) * sizeof(float))); + for (i = 0; i <= n1; i++) { + for (j = 0; j <= n2; j++) { + dvalue = w[i][j]; + fastcopy(&mem[memcnt], &dvalue, sizeof(float)); + memcnt += sizeof(float); + } + } + // write(fd, mem, (n1+1) * (n2+1) * sizeof(float)); + fwrite(mem, (unsigned)(sizeof(float)), + (unsigned)((n1 + 1) * (n2 + 1) * sizeof(float)), pFile); + free(mem); + + memcnt = 0; + w = net->hidden_weights; + mem = (char *)malloc((unsigned)((n2 + 1) * (n3 + 1) * sizeof(float))); + for (i = 0; i <= n2; i++) { + for (j = 0; j <= n3; j++) { + dvalue = w[i][j]; + fastcopy(&mem[memcnt], &dvalue, sizeof(float)); + memcnt += sizeof(float); + } + } + // write(fd, mem, (n2+1) * (n3+1) * sizeof(float)); + fwrite(mem, sizeof(float), (unsigned)((n2 + 1) * (n3 + 1) * sizeof(float)), + pFile); + free(mem); + + fclose(pFile); + return; +} + +BPNN *bpnn_read(filename) +char *filename; +{ + 
char *mem; + BPNN *new; + int fd, n1, n2, n3, i, j, memcnt; + + if ((fd = open(filename, 0, 0644)) == -1) { + return (NULL); + } + + printf("Reading '%s'\n", filename); // fflush(stdout); + + read(fd, (char *)&n1, sizeof(int)); + read(fd, (char *)&n2, sizeof(int)); + read(fd, (char *)&n3, sizeof(int)); + new = bpnn_internal_create(n1, n2, n3); + + printf("'%s' contains a %dx%dx%d network\n", filename, n1, n2, n3); + printf("Reading input weights..."); // fflush(stdout); + + memcnt = 0; + mem = (char *)malloc((unsigned)((n1 + 1) * (n2 + 1) * sizeof(float))); + read(fd, mem, (n1 + 1) * (n2 + 1) * sizeof(float)); + for (i = 0; i <= n1; i++) { + for (j = 0; j <= n2; j++) { + fastcopy(&(new->input_weights[i][j]), &mem[memcnt], sizeof(float)); + memcnt += sizeof(float); + } + } + free(mem); + + printf("Done\nReading hidden weights..."); // fflush(stdout); + + memcnt = 0; + mem = (char *)malloc((unsigned)((n2 + 1) * (n3 + 1) * sizeof(float))); + read(fd, mem, (n2 + 1) * (n3 + 1) * sizeof(float)); + for (i = 0; i <= n2; i++) { + for (j = 0; j <= n3; j++) { + fastcopy(&(new->hidden_weights[i][j]), &mem[memcnt], sizeof(float)); + memcnt += sizeof(float); + } + } + free(mem); + close(fd); + + printf("Done\n"); // fflush(stdout); + + bpnn_zero_weights(new->input_prev_weights, n1, n2); + bpnn_zero_weights(new->hidden_prev_weights, n2, n3); + + return (new); +} diff --git a/examples/backprop/backprop.h b/examples/backprop/backprop.h new file mode 100644 index 0000000..a6a753c --- /dev/null +++ b/examples/backprop/backprop.h @@ -0,0 +1,50 @@ +#ifndef _BACKPROP_H_ +#define _BACKPROP_H_ + +#define BIGRND 0x7fffffff + +#define GPU +#define THREADS 256 +#define WIDTH 16 // shared memory width +#define HEIGHT 16 // shared memory height + +#define ETA 0.3 // eta value +#define MOMENTUM 0.3 // momentum value +#define NUM_THREAD 4 // OpenMP threads + +typedef struct { + int input_n; /* number of input units */ + int hidden_n; /* number of hidden units */ + int output_n; /* number of 
output units */ + + float *input_units; /* the input units */ + float *hidden_units; /* the hidden units */ + float *output_units; /* the output units */ + + float *hidden_delta; /* storage for hidden unit error */ + float *output_delta; /* storage for output unit error */ + + float *target; /* storage for target vector */ + + float **input_weights; /* weights from input to hidden layer */ + float **hidden_weights; /* weights from hidden to output layer */ + + /*** The next two are for momentum ***/ + float **input_prev_weights; /* previous change on input to hidden wgt */ + float **hidden_prev_weights; /* previous change on hidden to output wgt */ +} BPNN; + +/*** User-level functions ***/ + +void bpnn_initialize(); + +BPNN *bpnn_create(); +void bpnn_free(); + +void bpnn_train(); +void bpnn_feedforward(); + +void bpnn_save(); +BPNN *bpnn_read(); + +#endif diff --git a/examples/backprop/backprop_cuda-cuda-nvptx64-nvidia-cuda-sm_61.ll b/examples/backprop/backprop_cuda-cuda-nvptx64-nvidia-cuda-sm_61.ll new file mode 100644 index 0000000..321377b --- /dev/null +++ b/examples/backprop/backprop_cuda-cuda-nvptx64-nvidia-cuda-sm_61.ll @@ -0,0 +1,615 @@ +; ModuleID = 'backprop_cuda-cuda-nvptx64-nvidia-cuda-sm_61.bc' +source_filename = "backprop_cuda.cu" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +%struct.__cuda_builtin_blockIdx_t = type { i8 } +%struct.__cuda_builtin_threadIdx_t = type { i8 } +%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 } + +$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv = comdat any + +$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any + +$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv = comdat any + +@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1 +@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1 
+@_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE10input_node = internal addrspace(3) global [16 x float] undef, align 4 +@_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix = internal addrspace(3) global [16 x [16 x float]] undef, align 4 +@"$str" = private addrspace(1) constant [11 x i8] c"__CUDA_FTZ\00" + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 { +entry: + %p.addr = alloca i8**, align 8 + %s.addr = alloca i64, align 8 + store i8** %p, i8*** %p.addr, align 8 + store i64 %s, i64* %s.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 { +entry: + %p.addr = alloca %struct.cudaFuncAttributes*, align 8 + %c.addr = alloca i8*, align 8 + store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8 + store i8* %c, i8** %c.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 { +entry: + %value.addr = alloca i32*, align 8 + %attr.addr = alloca i32, align 4 + %device.addr = alloca i32, align 4 + store i32* %value, i32** %value.addr, align 8 + store i32 %attr, i32* %attr.addr, align 4 + store i32 %device, i32* %device.addr, align 4 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaGetDevice(i32* %device) #0 { +entry: + %device.addr = alloca i32*, align 8 + store i32* %device, i32** %device.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 { +entry: + %numBlocks.addr = alloca i32*, align 8 + %func.addr = alloca i8*, align 8 + %blockSize.addr = alloca i32, align 4 + %dynamicSmemSize.addr = alloca 
i64, align 8 + store i32* %numBlocks, i32** %numBlocks.addr, align 8 + store i8* %func, i8** %func.addr, align 8 + store i32 %blockSize, i32* %blockSize.addr, align 4 + store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 { +entry: + %numBlocks.addr = alloca i32*, align 8 + %func.addr = alloca i8*, align 8 + %blockSize.addr = alloca i32, align 4 + %dynamicSmemSize.addr = alloca i64, align 8 + %flags.addr = alloca i32, align 4 + store i32* %numBlocks, i32** %numBlocks.addr, align 8 + store i8* %func, i8** %func.addr, align 8 + store i32 %blockSize, i32* %blockSize.addr, align 4 + store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8 + store i32 %flags, i32* %flags.addr, align 4 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define dso_local void @_Z22bpnn_layerforward_CUDAPfS_S_S_ii(float* %input_cuda, float* %output_hidden_cuda, float* %input_hidden_cuda, float* %hidden_partial_sum, i32 %in, i32 %hid) #0 { +entry: + %input_cuda.addr = alloca float*, align 8 + %output_hidden_cuda.addr = alloca float*, align 8 + %input_hidden_cuda.addr = alloca float*, align 8 + %hidden_partial_sum.addr = alloca float*, align 8 + %in.addr = alloca i32, align 4 + %hid.addr = alloca i32, align 4 + %by = alloca i32, align 4 + %tx = alloca i32, align 4 + %ty = alloca i32, align 4 + %index = alloca i32, align 4 + %index_in = alloca i32, align 4 + %i = alloca i32, align 4 + %power_two = alloca i32, align 4 + store float* %input_cuda, float** %input_cuda.addr, align 8 + store float* %output_hidden_cuda, float** %output_hidden_cuda.addr, align 8 + store float* %input_hidden_cuda, float** %input_hidden_cuda.addr, align 8 + store float* %hidden_partial_sum, float** %hidden_partial_sum.addr, align 8 + store i32 
%in, i32* %in.addr, align 4 + store i32 %hid, i32* %hid.addr, align 4 + %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #2 + store i32 %call, i32* %by, align 4 + %call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + store i32 %call1, i32* %tx, align 4 + %call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #2 + store i32 %call2, i32* %ty, align 4 + %0 = load i32, i32* %hid.addr, align 4 + %add = add nsw i32 %0, 1 + %mul = mul nsw i32 %add, 16 + %1 = load i32, i32* %by, align 4 + %mul3 = mul nsw i32 %mul, %1 + %2 = load i32, i32* %hid.addr, align 4 + %add4 = add nsw i32 %2, 1 + %3 = load i32, i32* %ty, align 4 + %mul5 = mul nsw i32 %add4, %3 + %add6 = add nsw i32 %mul3, %mul5 + %4 = load i32, i32* %tx, align 4 + %add7 = add nsw i32 %add6, %4 + %add8 = add nsw i32 %add7, 1 + %5 = load i32, i32* %hid.addr, align 4 + %add9 = add nsw i32 %5, 1 + %add10 = add nsw i32 %add8, %add9 + store i32 %add10, i32* %index, align 4 + %6 = load i32, i32* %by, align 4 + %mul11 = mul nsw i32 16, %6 + %7 = load i32, i32* %ty, align 4 + %add12 = add nsw i32 %mul11, %7 + %add13 = add nsw i32 %add12, 1 + store i32 %add13, i32* %index_in, align 4 + %8 = load i32, i32* %tx, align 4 + %cmp = icmp eq i32 %8, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %9 = load float*, float** %input_cuda.addr, align 8 + %10 = load i32, i32* %index_in, align 4 + %idxprom = sext i32 %10 to i64 + %arrayidx = getelementptr inbounds float, float* %9, i64 %idxprom + %11 = load float, float* %arrayidx, align 4 + %12 = load i32, i32* %ty, align 4 + %idxprom14 = sext i32 %12 to i64 + %arrayidx15 = getelementptr inbounds [16 x float], [16 x float]* addrspacecast ([16 x float] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE10input_node to [16 x float]*), i64 0, i64 %idxprom14 + store float %11, float* %arrayidx15, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + call void @llvm.nvvm.barrier0() + 
%13 = load float*, float** %input_hidden_cuda.addr, align 8 + %14 = load i32, i32* %index, align 4 + %idxprom16 = sext i32 %14 to i64 + %arrayidx17 = getelementptr inbounds float, float* %13, i64 %idxprom16 + %15 = load float, float* %arrayidx17, align 4 + %16 = load i32, i32* %ty, align 4 + %idxprom18 = sext i32 %16 to i64 + %arrayidx19 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom18 + %17 = load i32, i32* %tx, align 4 + %idxprom20 = sext i32 %17 to i64 + %arrayidx21 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx19, i64 0, i64 %idxprom20 + store float %15, float* %arrayidx21, align 4 + call void @llvm.nvvm.barrier0() + %18 = load i32, i32* %ty, align 4 + %idxprom22 = sext i32 %18 to i64 + %arrayidx23 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom22 + %19 = load i32, i32* %tx, align 4 + %idxprom24 = sext i32 %19 to i64 + %arrayidx25 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx23, i64 0, i64 %idxprom24 + %20 = load float, float* %arrayidx25, align 4 + %21 = load i32, i32* %ty, align 4 + %idxprom26 = sext i32 %21 to i64 + %arrayidx27 = getelementptr inbounds [16 x float], [16 x float]* addrspacecast ([16 x float] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE10input_node to [16 x float]*), i64 0, i64 %idxprom26 + %22 = load float, float* %arrayidx27, align 4 + %mul28 = fmul contract float %20, %22 + %23 = load i32, i32* %ty, align 4 + %idxprom29 = sext i32 %23 to i64 + %arrayidx30 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 
%idxprom29 + %24 = load i32, i32* %tx, align 4 + %idxprom31 = sext i32 %24 to i64 + %arrayidx32 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx30, i64 0, i64 %idxprom31 + store float %mul28, float* %arrayidx32, align 4 + call void @llvm.nvvm.barrier0() + store i32 1, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %if.end + %25 = load i32, i32* %i, align 4 + %conv = sitofp i32 %25 to float + %call33 = call float @_ZL7__log2ff(float 1.600000e+01) #2 + %cmp34 = fcmp ole float %conv, %call33 + br i1 %cmp34, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %26 = load i32, i32* %i, align 4 + %conv35 = sitofp i32 %26 to float + %call36 = call float @_ZL6__powfff(float 2.000000e+00, float %conv35) #2 + %conv37 = fptosi float %call36 to i32 + store i32 %conv37, i32* %power_two, align 4 + %27 = load i32, i32* %ty, align 4 + %28 = load i32, i32* %power_two, align 4 + %rem = srem i32 %27, %28 + %cmp38 = icmp eq i32 %rem, 0 + br i1 %cmp38, label %if.then39, label %if.end54 + +if.then39: ; preds = %for.body + %29 = load i32, i32* %ty, align 4 + %idxprom40 = sext i32 %29 to i64 + %arrayidx41 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom40 + %30 = load i32, i32* %tx, align 4 + %idxprom42 = sext i32 %30 to i64 + %arrayidx43 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx41, i64 0, i64 %idxprom42 + %31 = load float, float* %arrayidx43, align 4 + %32 = load i32, i32* %ty, align 4 + %33 = load i32, i32* %power_two, align 4 + %div = sdiv i32 %33, 2 + %add44 = add nsw i32 %32, %div + %idxprom45 = sext i32 %add44 to i64 + %arrayidx46 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom45 
+ %34 = load i32, i32* %tx, align 4 + %idxprom47 = sext i32 %34 to i64 + %arrayidx48 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx46, i64 0, i64 %idxprom47 + %35 = load float, float* %arrayidx48, align 4 + %add49 = fadd contract float %31, %35 + %36 = load i32, i32* %ty, align 4 + %idxprom50 = sext i32 %36 to i64 + %arrayidx51 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom50 + %37 = load i32, i32* %tx, align 4 + %idxprom52 = sext i32 %37 to i64 + %arrayidx53 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx51, i64 0, i64 %idxprom52 + store float %add49, float* %arrayidx53, align 4 + br label %if.end54 + +if.end54: ; preds = %if.then39, %for.body + call void @llvm.nvvm.barrier0() + br label %for.inc + +for.inc: ; preds = %if.end54 + %38 = load i32, i32* %i, align 4 + %inc = add nsw i32 %38, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %39 = load i32, i32* %ty, align 4 + %idxprom55 = sext i32 %39 to i64 + %arrayidx56 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom55 + %40 = load i32, i32* %tx, align 4 + %idxprom57 = sext i32 %40 to i64 + %arrayidx58 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx56, i64 0, i64 %idxprom57 + %41 = load float, float* %arrayidx58, align 4 + %42 = load float*, float** %input_hidden_cuda.addr, align 8 + %43 = load i32, i32* %index, align 4 + %idxprom59 = sext i32 %43 to i64 + %arrayidx60 = getelementptr inbounds float, float* %42, i64 %idxprom59 + store float %41, float* %arrayidx60, align 4 + call void @llvm.nvvm.barrier0() + %44 = load i32, i32* %tx, align 4 + %cmp61 = icmp eq i32 %44, 0 + br i1 %cmp61, label 
%if.then62, label %if.end71 + +if.then62: ; preds = %for.end + %45 = load i32, i32* %tx, align 4 + %idxprom63 = sext i32 %45 to i64 + %arrayidx64 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom63 + %46 = load i32, i32* %ty, align 4 + %idxprom65 = sext i32 %46 to i64 + %arrayidx66 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx64, i64 0, i64 %idxprom65 + %47 = load float, float* %arrayidx66, align 4 + %48 = load float*, float** %hidden_partial_sum.addr, align 8 + %49 = load i32, i32* %by, align 4 + %50 = load i32, i32* %hid.addr, align 4 + %mul67 = mul nsw i32 %49, %50 + %51 = load i32, i32* %ty, align 4 + %add68 = add nsw i32 %mul67, %51 + %idxprom69 = sext i32 %add68 to i64 + %arrayidx70 = getelementptr inbounds float, float* %48, i64 %idxprom69 + store float %47, float* %arrayidx70, align 4 + br label %if.end71 + +if.end71: ; preds = %if.then62, %for.end + ret void +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() + ret i32 %0 +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + ret i32 %0 +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y() + ret i32 %0 +} + +; Function Attrs: convergent nounwind +declare void @llvm.nvvm.barrier0() #2 + +; Function Attrs: alwaysinline convergent nounwind +define internal float @_ZL7__log2ff(float %__a) #1 { +entry: + %__a.addr = alloca 
float, align 4 + store float %__a, float* %__a.addr, align 4 + %0 = load float, float* %__a.addr, align 4 + %call = call float @__nv_fast_log2f(float %0) #2 + ret float %call +} + +; Function Attrs: alwaysinline convergent nounwind +define internal float @_ZL6__powfff(float %__a, float %__b) #1 { +entry: + %__a.addr = alloca float, align 4 + %__b.addr = alloca float, align 4 + store float %__a, float* %__a.addr, align 4 + store float %__b, float* %__b.addr, align 4 + %0 = load float, float* %__a.addr, align 4 + %1 = load float, float* %__b.addr, align 4 + %call = call float @__nv_fast_powf(float %0, float %1) #2 + ret float %call +} + +; Function Attrs: convergent noinline nounwind optnone +define dso_local void @_Z24bpnn_adjust_weights_cudaPfiS_iS_S_(float* %delta, i32 %hid, float* %ly, i32 %in, float* %w, float* %oldw) #0 { +entry: + %delta.addr = alloca float*, align 8 + %hid.addr = alloca i32, align 4 + %ly.addr = alloca float*, align 8 + %in.addr = alloca i32, align 4 + %w.addr = alloca float*, align 8 + %oldw.addr = alloca float*, align 8 + %by = alloca i32, align 4 + %tx = alloca i32, align 4 + %ty = alloca i32, align 4 + %index = alloca i32, align 4 + %index_y = alloca i32, align 4 + %index_x = alloca i32, align 4 + store float* %delta, float** %delta.addr, align 8 + store i32 %hid, i32* %hid.addr, align 4 + store float* %ly, float** %ly.addr, align 8 + store i32 %in, i32* %in.addr, align 4 + store float* %w, float** %w.addr, align 8 + store float* %oldw, float** %oldw.addr, align 8 + %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #2 + store i32 %call, i32* %by, align 4 + %call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + store i32 %call1, i32* %tx, align 4 + %call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #2 + store i32 %call2, i32* %ty, align 4 + %0 = load i32, i32* %hid.addr, align 4 + %add = add nsw i32 %0, 1 + %mul = mul nsw i32 %add, 16 + %1 = load i32, i32* %by, align 4 + 
%mul3 = mul nsw i32 %mul, %1 + %2 = load i32, i32* %hid.addr, align 4 + %add4 = add nsw i32 %2, 1 + %3 = load i32, i32* %ty, align 4 + %mul5 = mul nsw i32 %add4, %3 + %add6 = add nsw i32 %mul3, %mul5 + %4 = load i32, i32* %tx, align 4 + %add7 = add nsw i32 %add6, %4 + %add8 = add nsw i32 %add7, 1 + %5 = load i32, i32* %hid.addr, align 4 + %add9 = add nsw i32 %5, 1 + %add10 = add nsw i32 %add8, %add9 + store i32 %add10, i32* %index, align 4 + %6 = load i32, i32* %by, align 4 + %mul11 = mul nsw i32 16, %6 + %7 = load i32, i32* %ty, align 4 + %add12 = add nsw i32 %mul11, %7 + %add13 = add nsw i32 %add12, 1 + store i32 %add13, i32* %index_y, align 4 + %8 = load i32, i32* %tx, align 4 + %add14 = add nsw i32 %8, 1 + store i32 %add14, i32* %index_x, align 4 + %9 = load float*, float** %delta.addr, align 8 + %10 = load i32, i32* %index_x, align 4 + %idxprom = sext i32 %10 to i64 + %arrayidx = getelementptr inbounds float, float* %9, i64 %idxprom + %11 = load float, float* %arrayidx, align 4 + %conv = fpext float %11 to double + %mul15 = fmul contract double 3.000000e-01, %conv + %12 = load float*, float** %ly.addr, align 8 + %13 = load i32, i32* %index_y, align 4 + %idxprom16 = sext i32 %13 to i64 + %arrayidx17 = getelementptr inbounds float, float* %12, i64 %idxprom16 + %14 = load float, float* %arrayidx17, align 4 + %conv18 = fpext float %14 to double + %mul19 = fmul contract double %mul15, %conv18 + %15 = load float*, float** %oldw.addr, align 8 + %16 = load i32, i32* %index, align 4 + %idxprom20 = sext i32 %16 to i64 + %arrayidx21 = getelementptr inbounds float, float* %15, i64 %idxprom20 + %17 = load float, float* %arrayidx21, align 4 + %conv22 = fpext float %17 to double + %mul23 = fmul contract double 3.000000e-01, %conv22 + %add24 = fadd contract double %mul19, %mul23 + %18 = load float*, float** %w.addr, align 8 + %19 = load i32, i32* %index, align 4 + %idxprom25 = sext i32 %19 to i64 + %arrayidx26 = getelementptr inbounds float, float* %18, i64 %idxprom25 + %20 = 
load float, float* %arrayidx26, align 4 + %conv27 = fpext float %20 to double + %add28 = fadd contract double %conv27, %add24 + %conv29 = fptrunc double %add28 to float + store float %conv29, float* %arrayidx26, align 4 + %21 = load float*, float** %delta.addr, align 8 + %22 = load i32, i32* %index_x, align 4 + %idxprom30 = sext i32 %22 to i64 + %arrayidx31 = getelementptr inbounds float, float* %21, i64 %idxprom30 + %23 = load float, float* %arrayidx31, align 4 + %conv32 = fpext float %23 to double + %mul33 = fmul contract double 3.000000e-01, %conv32 + %24 = load float*, float** %ly.addr, align 8 + %25 = load i32, i32* %index_y, align 4 + %idxprom34 = sext i32 %25 to i64 + %arrayidx35 = getelementptr inbounds float, float* %24, i64 %idxprom34 + %26 = load float, float* %arrayidx35, align 4 + %conv36 = fpext float %26 to double + %mul37 = fmul contract double %mul33, %conv36 + %27 = load float*, float** %oldw.addr, align 8 + %28 = load i32, i32* %index, align 4 + %idxprom38 = sext i32 %28 to i64 + %arrayidx39 = getelementptr inbounds float, float* %27, i64 %idxprom38 + %29 = load float, float* %arrayidx39, align 4 + %conv40 = fpext float %29 to double + %mul41 = fmul contract double 3.000000e-01, %conv40 + %add42 = fadd contract double %mul37, %mul41 + %conv43 = fptrunc double %add42 to float + %30 = load float*, float** %oldw.addr, align 8 + %31 = load i32, i32* %index, align 4 + %idxprom44 = sext i32 %31 to i64 + %arrayidx45 = getelementptr inbounds float, float* %30, i64 %idxprom44 + store float %conv43, float* %arrayidx45, align 4 + call void @llvm.nvvm.barrier0() + %32 = load i32, i32* %ty, align 4 + %cmp = icmp eq i32 %32, 0 + br i1 %cmp, label %land.lhs.true, label %if.end + +land.lhs.true: ; preds = %entry + %33 = load i32, i32* %by, align 4 + %cmp46 = icmp eq i32 %33, 0 + br i1 %cmp46, label %if.then, label %if.end + +if.then: ; preds = %land.lhs.true + %34 = load float*, float** %delta.addr, align 8 + %35 = load i32, i32* %index_x, align 4 + %idxprom47 = 
sext i32 %35 to i64 + %arrayidx48 = getelementptr inbounds float, float* %34, i64 %idxprom47 + %36 = load float, float* %arrayidx48, align 4 + %conv49 = fpext float %36 to double + %mul50 = fmul contract double 3.000000e-01, %conv49 + %37 = load float*, float** %oldw.addr, align 8 + %38 = load i32, i32* %index_x, align 4 + %idxprom51 = sext i32 %38 to i64 + %arrayidx52 = getelementptr inbounds float, float* %37, i64 %idxprom51 + %39 = load float, float* %arrayidx52, align 4 + %conv53 = fpext float %39 to double + %mul54 = fmul contract double 3.000000e-01, %conv53 + %add55 = fadd contract double %mul50, %mul54 + %40 = load float*, float** %w.addr, align 8 + %41 = load i32, i32* %index_x, align 4 + %idxprom56 = sext i32 %41 to i64 + %arrayidx57 = getelementptr inbounds float, float* %40, i64 %idxprom56 + %42 = load float, float* %arrayidx57, align 4 + %conv58 = fpext float %42 to double + %add59 = fadd contract double %conv58, %add55 + %conv60 = fptrunc double %add59 to float + store float %conv60, float* %arrayidx57, align 4 + %43 = load float*, float** %delta.addr, align 8 + %44 = load i32, i32* %index_x, align 4 + %idxprom61 = sext i32 %44 to i64 + %arrayidx62 = getelementptr inbounds float, float* %43, i64 %idxprom61 + %45 = load float, float* %arrayidx62, align 4 + %conv63 = fpext float %45 to double + %mul64 = fmul contract double 3.000000e-01, %conv63 + %46 = load float*, float** %oldw.addr, align 8 + %47 = load i32, i32* %index_x, align 4 + %idxprom65 = sext i32 %47 to i64 + %arrayidx66 = getelementptr inbounds float, float* %46, i64 %idxprom65 + %48 = load float, float* %arrayidx66, align 4 + %conv67 = fpext float %48 to double + %mul68 = fmul contract double 3.000000e-01, %conv67 + %add69 = fadd contract double %mul64, %mul68 + %conv70 = fptrunc double %add69 to float + %49 = load float*, float** %oldw.addr, align 8 + %50 = load i32, i32* %index_x, align 4 + %idxprom71 = sext i32 %50 to i64 + %arrayidx72 = getelementptr inbounds float, float* %49, i64 
%idxprom71 + store float %conv70, float* %arrayidx72, align 4 + br label %if.end + +if.end: ; preds = %if.then, %land.lhs.true, %entry + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #3 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #3 + +; Function Attrs: alwaysinline convergent inlinehint nounwind +define internal float @__nv_fast_log2f(float %a) #4 { + %call.i = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*)) + %1 = icmp ne i32 %call.i, 0 + br i1 %1, label %2, label %4 + +2: ; preds = %0 + %3 = call float @llvm.nvvm.lg2.approx.ftz.f(float %a) + br label %__nvvm_builtin_log2f.exit + +4: ; preds = %0 + %5 = call float @llvm.nvvm.lg2.approx.f(float %a) + br label %__nvvm_builtin_log2f.exit + +__nvvm_builtin_log2f.exit: ; preds = %4, %2 + %retval.0.i = phi float [ %3, %2 ], [ %5, %4 ] + ret float %retval.0.i +} + +; Function Attrs: convergent nounwind +declare i32 @__nvvm_reflect(i8*) #5 + +; Function Attrs: nounwind readnone +declare float @llvm.nvvm.lg2.approx.ftz.f(float) #3 + +; Function Attrs: nounwind readnone +declare float @llvm.nvvm.lg2.approx.f(float) #3 + +; Function Attrs: alwaysinline convergent inlinehint nounwind +define internal float @__nv_fast_powf(float %a, float %b) #4 { + %call.i.i = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*)) + %1 = icmp ne i32 %call.i.i, 0 + br i1 %1, label %2, label %4 + +2: ; preds = %0 + %3 = call float @llvm.nvvm.lg2.approx.ftz.f(float %a) + br label %__nv_fast_log2f.exit + +4: ; preds = %0 + %5 = call float @llvm.nvvm.lg2.approx.f(float %a) + br label %__nv_fast_log2f.exit + +__nv_fast_log2f.exit: ; preds = %4, %2 + %retval.0.i.i = phi 
float [ %3, %2 ], [ %5, %4 ] + %6 = fmul float %b, %retval.0.i.i + %call.i.i1 = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*)) + %7 = icmp ne i32 %call.i.i1, 0 + br i1 %7, label %8, label %10 + +8: ; preds = %__nv_fast_log2f.exit + %9 = call float @llvm.nvvm.ex2.approx.ftz.f(float %6) + br label %__nv_exp2f.exit + +10: ; preds = %__nv_fast_log2f.exit + %11 = call float @llvm.nvvm.ex2.approx.f(float %6) + br label %__nv_exp2f.exit + +__nv_exp2f.exit: ; preds = %10, %8 + %retval.0.i.i2 = phi float [ %9, %8 ], [ %11, %10 ] + ret float %retval.0.i.i2 +} + +; Function Attrs: nounwind readnone +declare float @llvm.nvvm.ex2.approx.ftz.f(float) #3 + +; Function Attrs: nounwind readnone +declare float @llvm.nvvm.ex2.approx.f(float) #3 + +attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nounwind } +attributes #3 = { nounwind readnone } +attributes #4 = { alwaysinline convergent inlinehint nounwind 
"correctly-rounded-divide-sqrt-fp-math"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #5 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.module.flags = !{!0, !1, !2} +!nvvm.annotations = !{!3, !4, !5, !6, !5, !7, !7, !7, !7, !8, !8, !7} +!llvm.ident = !{!9} +!nvvmir.version = !{!10} + +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0} +!3 = !{void (float*, float*, float*, float*, i32, i32)* @_Z22bpnn_layerforward_CUDAPfS_S_S_ii, !"kernel", i32 1} +!4 = !{void (float*, i32, float*, i32, float*, float*)* @_Z24bpnn_adjust_weights_cudaPfiS_iS_S_, !"kernel", i32 1} +!5 = !{null, !"align", i32 8} +!6 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!7 = !{null, !"align", i32 16} +!8 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!9 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"} +!10 = !{i32 1, i32 4} diff --git a/examples/backprop/backprop_cuda-host-x86_64-unknown-linux-gnu.ll b/examples/backprop/backprop_cuda-host-x86_64-unknown-linux-gnu.ll new file mode 100644 index 0000000..6c7daea --- /dev/null +++ b/examples/backprop/backprop_cuda-host-x86_64-unknown-linux-gnu.ll @@ -0,0 +1,894 @@ +; ModuleID = 'backprop_cuda-host-x86_64-unknown-linux-gnu.bc' +source_filename = "backprop_cuda.cu" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" 
+target triple = "x86_64-unknown-linux-gnu" + +%struct.dim3 = type { i32, i32, i32 } +%struct.CUstream_st = type opaque +%struct.timeval = type { i64, i64 } +%struct.timezone = type { i32, i32 } +%struct.BPNN = type { i32, i32, i32, float*, float*, float*, float*, float*, float*, float**, float**, float**, float** } + +$_ZN4dim3C2Ejjj = comdat any + +$_ZSt3expf = comdat any + +@num_threads = dso_local global i32 0, align 4 +@num_blocks = dso_local global i32 0, align 4 +@.str = private unnamed_addr constant [28 x i8] c"Performing GPU computation\0A\00", align 1 +@.str.1 = private unnamed_addr constant [23 x i8] c"bpnn kernel error: %s\0A\00", align 1 +@.str.2 = private unnamed_addr constant [4 x i8] c"%f \00", align 1 +@.str.3 = private unnamed_addr constant [2 x i8] c"\0A\00", align 1 +@0 = private unnamed_addr constant [37 x i8] c"_Z22bpnn_layerforward_CUDAPfS_S_S_ii\00", align 1 +@1 = private unnamed_addr constant [39 x i8] c"_Z24bpnn_adjust_weights_cudaPfiS_iS_S_\00", align 1 +@2 = private constant [26889 x i8] 
c"P\EDU\BA\01\00\10\00\F8h\00\00\00\00\00\00\02\00\01\01@\00\00\00xY\00\00\00\00\00\00\00\00\00\00\00\00\00\00\07\00\01\00=\00\00\00\00\00\00\00\00\00\00\00\11\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\7FELF\02\01\013\07\00\00\00\00\00\00\00\02\00\BE\00e\00\00\00\00\00\00\00\00\00\00\00\D0X\00\00\00\00\00\00\10U\00\00\00\00\00\00=\05=\00@\008\00\03\00@\00\0F\00\01\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00.text._Z24bpnn_adjust_weights_cudaPfiS_iS_S_\00.nv.info._Z24bpnn_adjust_weights_cudaPfiS_iS_S_\00.nv.shared._Z24bpnn_adjust_weights_cudaPfiS_iS_S_\00.nv.global\00.nv.global.init\00.nv.constant2._Z24bpnn_adjust_weights_cudaPfiS_iS_S_\00.nv.constant0._Z24bpnn_adjust_weights_cudaPfiS_iS_S_\00.text._Z22bpnn_layerforward_CUDAPfS_S_S_ii\00.nv.info._Z22bpnn_layerforward_CUDAPfS_S_S_ii\00.nv.shared._Z22bpnn_layerforward_CUDAPfS_S_S_ii\00.nv.constant0._Z22bpnn_layerforward_CUDAPfS_S_S_ii\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00_Z24bpnn_adjust_weights_cudaPfiS_iS_S_\00.text._Z24bpnn_adjust_weights_cudaPfiS_iS_S_\00.nv.info._Z24bpnn_adjust_weights_cudaPfiS_iS_S_\00.nv.shared._Z24bpnn_adjust_weights_cudaPfiS_iS_S_\00.nv.global\00blockIdx\00threadIdx\00.nv.global.init\00$str\00.nv.constant2._Z24bpnn_adjust_weights_cudaPfiS_iS_S_\00__ocg_const\00.nv.constant0._Z24bpnn_adjust_weights_cudaPfiS_iS_S_\00_param\00_Z22bpnn_layerforward_CUDAPfS_S_S_ii\00.text._Z22bpnn_layerforward_CUDAPfS_S_S_ii\00.nv.info._Z22bpnn_layerforward_CUDAPfS_S_S_ii\00.nv.shared._Z22bpnn_layerforward_CUDAPfS_S_S_ii\00$___ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE10input_node__186\00$___ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix__188\00.nv.constant0._Z22bpnn_layerforward_CUDAPfS_S_S_ii\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00Y\00\00\00\03\00\0A\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\E8\00\00\00\03\00\0D\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\F3\00\00\00\01\00\0D\00
\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\FC\00\00\00\01\00\0D\00\01\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\06\01\00\00\03\00\0C\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\16\01\00\00\01\00\0C\00\00\00\00\00\00\00\00\00\0B\00\00\00\00\00\00\00\1B\01\00\00\03\00\07\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\\\01\00\00\03\00\08\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\BD\01\00\00\03\00\0B\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\16\02\00\00\03\00\0E\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\BF\02\00\00\03\00\09\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\002\00\00\00\12\10\0A\00\00\00\00\00\00\00\00\00\80\1D\00\00\00\00\00\00\98\01\00\00\12\10\0B\00\00\00\00\00\00\00\00\00\80,\00\00\00\00\00\00\04/\08\00\0D\00\00\00\10\00\00\00\04#\08\00\0D\00\00\00\00\00\00\00\04\12\08\00\0D\00\00\00X\00\00\00\04\11\08\00\0D\00\00\00X\00\00\00\04/\08\00\0C\00\00\00\10\00\00\00\04#\08\00\0C\00\00\00\00\00\00\00\04\12\08\00\0C\00\00\00H\00\00\00\04\11\08\00\0C\00\00\00H\00\00\00\010\00\00\01*\00\00\04\0A\08\00\08\00\00\00@\010\00\03\190\00\04\17\0C\00\00\00\00\00\05\00(\00\00\F0!\00\04\17\0C\00\00\00\00\00\04\00 \00\00\F0!\00\04\17\0C\00\00\00\00\00\03\00\18\00\00\F0\11\00\04\17\0C\00\00\00\00\00\02\00\10\00\00\F0!\00\04\17\0C\00\00\00\00\00\01\00\08\00\00\F0\11\00\04\17\0C\00\00\00\00\00\00\00\00\00\00\F0!\00\03\1B\FF\00\04\1D\04\00X\05\00\00\04\1C\04\00p\1D\00\00\04\1E\04\000\00\00\00\010\00\00\01*\00\00\04\0A\08\00\0B\00\00\00@\01(\00\03\19(\00\04\17\0C\00\00\00\00\00\05\00$\00\00\F0\11\00\04\17\0C\00\00\00\00\00\04\00 
\00\00\F0\11\00\04\17\0C\00\00\00\00\00\03\00\18\00\00\F0!\00\04\17\0C\00\00\00\00\00\02\00\10\00\00\F0!\00\04\17\0C\00\00\00\00\00\01\00\08\00\00\F0!\00\04\17\0C\00\00\00\00\00\00\00\00\00\00\F0!\00\03\1B\FF\00\04\1D\04\00H\05\00\00\04\1C\04\008,\00\00\04\1E\04\00@\00\00\00333333\D3?\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\0
0\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\EF\1F\E0\FD\03!\00\D3rd<3>;\0A\0Amov.u2\00\1B,e\00b;\0Acvta\8D\00\04%\00\13,\\\00\22ld\C8\00\02\18\00nrd2, [\CE\00\1E])\00\1F1)\00\01a0];\0Ast#\00\81[%SP+0],,\00\0A\16\00\128\16\00\222;\B6\00\01\D8\00a1, 999(\00\02g\00\00\EF\00\18[\9E\01\03M\00\AF1;\0Aret;\0A\0A}\D6\01\1A\FE\02FuncGetAttributes\E1\01\0D#\00\0E\EC\01\0F+\00\06\0F\F7\01\1B\1F1\F7\01Q\1F1\F7\01!\0E\D9\00\0F\02\02\0F\0E8\01\0F\0D\02\8DhDevice\B4\00\0E\0E\02\0E$\00\0F\0F\02\00/32,\00\0B\1F1,\00\18\1F2<\02\13\1F2<\02\1F\1D4<\02\1F2<\02\0C\1F2<\02\13\01_\00\04;\02\0F\D9\00\07\1D]4\00\1F14\00\06\0Fp\02\10\0E\9A\01\0Fq\02\12(32q\02\0B\15\00!12\16\00\09\86\02\1F3\86\02\15\1F3\86\02#2Get\CB\00\0E}\02\05\1B\00\04\DA\00\0F\1C\02\13?3[8W\04.\0F\1B\02\0D\1F3W\04\19\04\B3\01\0D\D0\00\0F\AA\01\06\0F\05\04W\F0\04OccupancyMaxActiveB\A5\08\FE\03sPerMultiprocessor\9F\01\0F;\00\16\0EB\06\0FC\00%\0EJ\04\0FC\00\1E\1F2\86\00/\1F3\88\02\13O4[32\89\02\1C\1D3\89\02\1F4\89\02\0C\1F4\89\02\19\133\89\02\0F\F1\00\1E\0F\BC\04\00\0FK\00$/2]w\07\00\0FL\00$\0F\1F\05\01\0F\98\00%\0F\A7\07\1D\097\05\186M\05\04,\00\2224-\00\183\CF\03\1F2\CF\03\15\1F2\CF\03L\9FWithFlags\D8\03(\05D\00\0E\E1\03\0FL\00'\0F\EA\030\0CL\00\1F2\98\008\1F3\98\008\1F4H\04\13O5[40\EC\08.\0FH\04\0D\1F5\EC\08\1C\0F\F9\00+\1F]\9C\040\0D\9A\01\0F\A5\040\0D:\02\0F\AE\041\0D\DB\02\0F\B7\041\0D|\03\0F\C0\04I\08-\00\1F3$\0A6\F0\1Evisible .entry 
_Z22bpnn_layerforward_CUDAPfS_\02\00&ii\AD\04\00\A7\00\0F2\00\11\0E\9B\04\0F:\00\1C\1F1:\00&\1F2:\00&\07e\04\00a\01\0F:\00\18\1F4:\00&\1F5\8D\04\13?6[8^\0B\16\95pred %p<5\8E\04\10f/\02\\f<20>\B1\04\1D7\B2\04\108%\00`\0A\09.shaH\00\03\93\00\124\93\00\1FZ\D9\00\10\FF\02E10input_node[64]O\00-\F0\033weight_matrix[102T\00\0FU\05\08\1F6U\05\1C\0F\AF\01\19\0F\EE\04\00\0FB\00\1B\0F\85\05\01\1F4C\00\1B\1F3\C8\05\01\0FC\00\1B\0Fb\05\01\0F\0B\01\1C\0FP\05\01\0F\0C\01\1C#0]\AA\01#tor\15\04I\00\115\04\05\04f\0B\0A\1C\00\116\1C\00\1F5;\00\05\147?\05\0F;\00\00\118\1C\00\1F7;\00\05\149\A7\05\0F;\00\00!10\1D\00\1F9<\00\05$11\FA\05\0F=\00\01\122\1D\00\0B\EC\05\03\02\06?d12\04\06\03*10\18\00\03\05\06*d8\17\00\134v\06\1A63\06\1F4m\10\02\1F5I\06\03\8B%ctaid.y-\00\02\A1\00\09L\0B\9A4, %tid.x+\00\126\85\00\184+\00\135+\00\0BV\00\126\DF\00\115@\02\02*\00%6,\9E\00q;\0Aadd.s\17\00\227,\1C\00\171+\00%8,\9C\00\83;\0Amul.lo.\00$9,3\00\00!\01#hl\E6\04\02\BA\01G9, 4F\00\00\AF\01\04\8D\00\0BG\00%12H\00(11\8E\00513,O\00(12M\00%4,\05\01\091\00&5,7\00\194\1A\00%6, \00\197\19\00#7,\1F\00\0B\B8\01\136\CE\01\187x\00\09\0C\01\07\F2\00#9,\1E\00\08\F3\00(20\F3\00\06\A6\00521,4\00)20\1A\00#2, \00\0B\8D\00\137E\02\08\05\01(23\05\01rsetp.neI\003p1,!\00\F2\0C0;\0A@%p1 bra LBB6_2;\0Abra.uni\10\0021;\0A\08\00\11:Z\00\03?\03%3,1\03\01r\00\02\9F\00\15dx\01$72\00\01\031\00$5, \00\132\D1\00\03\19\00$6,Q\00\01'\00\01N\00\02\1F\07\01\11\02,rdc\00\1875\01\08c\00$8, \00\172\8E\06 rdI\01\0F)\07\1F\037\04\02'\07\05\92\05\02t\04)19\C4\00\02\B0\01\05\1D\00\02\0D\04\11f\9B\01\00\1D\00\00\9A\01+f1W\01\132W\01\B02:\0Abar.sync\8C\01\06c\01\00\EB\01\04d\04\08\00\01\06\D6\01\1A8\00\01424, \00\0Ac\01425,Q\00\01'\00\07c\01\222,\A4\00\1A5c\00\196\8B\03\06J\01?27,$\08#\0FM\01\03\02\B1\01*27\C7\00$9,\84\00\196\C7\00830,6\00\189\B2\00)31\EB\02\06\15\01432, \00\0A\15\01733,U\00)32\B4\01!33\B4\01,2;\9C\01\04o\00\1F4\84\02\04435, 
\00\0B\BD\00\186\BD\00)35N\00\1F7\BD\00\05\03.\00\1D7\BD\00'9,U\00'38\D2\01\223,\C2\00\1A9\BA\01/40\04\036\124\EE\02)40\AC\00\134i\01\0D\E1\03843,6\00\172\AC\00\124\AC\00#43\F1\05#rn\19\00\225,\CA\00:%f4\98\01\119\98\01\1A5\98\01\07V\12\1E4\19\05\021\07+24\80\03\133\80\03'3:1\05%5,5\00\01\FB\00\03\92\00\02\A9\01\11fn\01)25q\00\00\17\06\9A098907648z\00\03\C0\12\176\F1\00\04\E9\01\00\E8\01\A3lg2.approx\1D\00\00\ED\01#f7\A9\052gtu\17\003p2,{\00!f8\AB\05\162\AB\05\1B8\D4\00\134\D4\00\174\D4\00/33\D4\00\0A\02%\09\193P\08\00\C4\01\AB1073741824\C3\08\02\7F\08\08\97\01\04E\198f11\BD\05\04\BF\04(4]\16\00\04\8E\00\1F8\02\01\01!14G\00\1B2\0E\02\01<\06\01;\00\00#\00L;\0Aex8\00\01\1F\01#15\E0\00 zi\DD\00\02\1C\00\22r3=\00\0B\8A\09\138\87\0F\08\F2\08\193\F6\04\06\17\00%7,4\00\00\B6\0F\13m\B7\03\02\9D\03\02\83\03,37`\07#4,#\00\02`\07\164\B5\01\1B6\B5\01\135\B5\01\185\B5\01\189\8B\00\00\D5\004s64/\04\126y\00\199\E0\03/69\9A\059\127\16\05)69\E3\03471,\80\00\0A\DD\04772,6\00(71\DD\04/73\DD\04\04\127\AF\06\1D7\AF\06875,U\00\08\AF\06\02\FC\07\00#\00\08\A2\01(40\A2\014shr\18\00#1,\1E\00\1936\0A$42\18\00\00$\00\022\00\03\1A\00#3, \00\0A1\00#4,\95\01\00#\00\0D\91\01\117\04\02*44\DF\00$7,\1C\00\0B-\01\198-\01\1A7\1D\00(9,$\00\09\FC\00\138\FC\00\139,\0A\06 \03#9,\1C\01+%f}\08\2275\9A\03\1B9u\02\136u\02\1A6~\08\09$\00\137$\00\177\99\02\194\22\05\07\22\01\02\03\01\1F5\7F\05\05/46\7F\05\04'8:,\02\1F4W\07\05\00Y\00\03 \00\1A6\D7\06\1F6\F7\029\124\B4\01\1A4\B1\01848,\1D\00\09\BB\07/49\DE\02\04450, \00\0A\DE\02751,U\00'50\E2\01\129\E1\01851]\08\0A\1F5\08\0A\04\1F5\08\0A\05454, \00\0B{\00$5,Q\00\01'\00\09*\02\125*\02\1D9\F3\08\03\E9\06\1F7L\0C\07#3,!\00\02\EC\04\163\EC\04\1C1T\02\139\F7\01\189\F7\01/56E\01\05$7, 
\00\0A\F7\01/58\F7\019\125u\03)58.\01860,\1D\00\08\E6\0D/28\9F\05\09\02[\0D\198\CA\00\126t\01\1D6\87\0A763,m\00\186\1E\09\2210\10\02)63\10\02564,e\10\08\94\00\189z\0E\06\BE\06\180\C8\0F\09V\0F\00\14\0B\028\00\00'\00\08\CE\03\025\0B\153\C9\00\0C\E2\00\01H\02*32\E2\00$6,\1C\00\0B\E2\00$7,\B7\00\01'\00\09v\02\2267\A0\04\0C(\02$10v\0E/0:*\17\0A\114\02\02radjust_\EB\01 s_A\18`PfiS_i\06\02\0D,\17\0F4\00\10\0F.\17\00/32<\00\1B/1,x\00'\1F2x\00(\1F3x\00(\1F4<\00(\0F8\17\14O7[728\17\1D\1C38\17,178\17\162\12\00\10fF\00Nfd<2E \1F5&'\0D\1F7F \19\03%\16\0F#\01\18\0F'\16\01\0FE\00\1D.4]3\17\0FD\00\1D\0F\B0\16\01\0FE\00\1E\0F\C9%\01\0FD\00\1D\0F\B3\16\01\0FE\00\17\0F\B5\16\F4\1A0\B4\16\0E\DA&\0F\E1\16\01\1A0-\00\03\E1\16\1F2\DF\16\1D\0F\B3\16\10/48\B3\16\16/52\B3\16\16/56\B3\16\06\1F8\B2\16\16/48\B2\162/56\B2\168/52\B2\16G\1F0\B2\16\08/48\B2\16\19/56\B2\16-/64\B2\16\08\0B\05\01$24\AF\16\0F\84\17\03\03\D4\14\0C\9D\16\1F0\9C\16\03/68\9C\16;\00\1F\093f64\1A\00#d1\EF\15\04\C4\10\02\8F\06#2,\1C\00h0d3FD3\01\00\09\BC\00\1F7Y\17\04\1F8\F6\16\05$9, 
\00\0B\F6\15\150\0F\17\03\B2\16\0D\F6\15\1E0\BD\00\133\F8\14\07\88\16\1B1p\0A\04%\16\1F2\B6\0B\04%23o\16\0B\92\00(4,.\17\1E3\B6\14.24\92\00\01\14\12\09$\01(25\12\0D\08a\00\03e\15\1E5a\00\03k\14\1F2\B0\01\00\135h\147fma\B0\01#6,}\00\0E\B0\01\01\1A\00\1C51\00&7,\E7\01\02#\00\196p\13\02'\00\01\DA\14*d7\22\18\146\D6\14\07\E4\00\1F7\C4\02\02/28\FD\17\05\03u\06\0D\07\02\046\17\1876\17\14f\D0\14\01U\15\0F\07\02\00\01\8C\14\1C6\C4\02#9,\1C\00\0F\C4\02\0B/31\C4\02\03/326\17\05\03\8C\17\1D2\BD\00\154\BE\17\01'\00\0CK\15\00\22\00\0F\81\03\00\130J\15\07\93\00\1F5\C5\02\03\1F3{\0E\05%37b\17\0B\93\00(8,\E9\17\177\93\00\03\A1\11/38\93\00\00\131\C1\15\0AQ\01\01'\06\02\1E\00\0F\17\04\01\0Aa\02\02\01\15\03\85\01\02\E4\00\01\F4\07\0Dd\02\03\22\00\1A3;\17\1F8\E0\0F\0B\195\B9\1C\0F,\1C\00\1D5,\1C97_3\B7\0D\137,\1C87_1U\17\196a\06\09Z\00\02\1C\06\146Z\00\1F2Z\00\07\132Z\00\182\86\1C/39%\03\02/40%\03\04\05\BA\18\0D\A4\18'2,W\19/41\86\0F\00/42\D6\01\00\02\B0\16\09\B7\1B(43j\02\08d\00)4,\0F\19\08d\00\03N\06\1F4\CD\02\01\175[\17\04d\00\0F\01\05\05\02\87\13\05/\13\0E\90\17\00#\00\0Fd\00\00\146x\17\0Br\02$7,\83\00\0F\06\05\04-164\00$8,\1B\01\0F4\00\05\1F7\B4\02\00\03\DD\02\0B\BF\14\124\1A\05)13\FE\00\0F\1B\05\03/48\F6\01\05\03\D6\01\0D\1B\05)50\D9\13\08\1B\05\1310\06\1F5\1C\05\00\03o\15\09s\08/51\F6\01\04\04m\13\1F1d\00\00\135d\00\0FZ\02\00$20\D2\18\0BF\08$1,\1F\00\0F1\04\0F\01{\09\02\B0\00\0F\8B\01\04/21\8B\01\01\02\FB\06*22 \14$2]I\19\09\99\03\133\99\03\B03:\0Aret;\0A\0A}\0A\00\00\00\00\00\00\00\00\00", section ".nv_fatbin", align 8 +@__cuda_fatbin_wrapper = internal constant { i32, i32, i8*, i8* } { i32 1180844977, i32 1, i8* getelementptr inbounds ([26889 x i8], [26889 x i8]* @2, i64 0, i64 0), i8* null }, section ".nvFatBinSegment", align 8 +@__cuda_gpubin_handle = internal global i8** null, align 8 +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* bitcast (void (i8*)* @__cuda_module_ctor to void ()*), i8* null }] + +; 
Function Attrs: noinline optnone uwtable +define dso_local void @_Z22bpnn_layerforward_CUDAPfS_S_S_ii(float* %input_cuda, float* %output_hidden_cuda, float* %input_hidden_cuda, float* %hidden_partial_sum, i32 %in, i32 %hid) #0 { +entry: + %input_cuda.addr = alloca float*, align 8 + %output_hidden_cuda.addr = alloca float*, align 8 + %input_hidden_cuda.addr = alloca float*, align 8 + %hidden_partial_sum.addr = alloca float*, align 8 + %in.addr = alloca i32, align 4 + %hid.addr = alloca i32, align 4 + %grid_dim = alloca %struct.dim3, align 8 + %block_dim = alloca %struct.dim3, align 8 + %shmem_size = alloca i64, align 8 + %stream = alloca i8*, align 8 + %grid_dim.coerce = alloca { i64, i32 }, align 8 + %block_dim.coerce = alloca { i64, i32 }, align 8 + store float* %input_cuda, float** %input_cuda.addr, align 8 + store float* %output_hidden_cuda, float** %output_hidden_cuda.addr, align 8 + store float* %input_hidden_cuda, float** %input_hidden_cuda.addr, align 8 + store float* %hidden_partial_sum, float** %hidden_partial_sum.addr, align 8 + store i32 %in, i32* %in.addr, align 4 + store i32 %hid, i32* %hid.addr, align 4 + %kernel_args = alloca i8*, i64 6, align 16 + %0 = bitcast float** %input_cuda.addr to i8* + %1 = getelementptr i8*, i8** %kernel_args, i32 0 + store i8* %0, i8** %1 + %2 = bitcast float** %output_hidden_cuda.addr to i8* + %3 = getelementptr i8*, i8** %kernel_args, i32 1 + store i8* %2, i8** %3 + %4 = bitcast float** %input_hidden_cuda.addr to i8* + %5 = getelementptr i8*, i8** %kernel_args, i32 2 + store i8* %4, i8** %5 + %6 = bitcast float** %hidden_partial_sum.addr to i8* + %7 = getelementptr i8*, i8** %kernel_args, i32 3 + store i8* %6, i8** %7 + %8 = bitcast i32* %in.addr to i8* + %9 = getelementptr i8*, i8** %kernel_args, i32 4 + store i8* %8, i8** %9 + %10 = bitcast i32* %hid.addr to i8* + %11 = getelementptr i8*, i8** %kernel_args, i32 5 + store i8* %10, i8** %11 + %12 = call i32 @__cudaPopCallConfiguration(%struct.dim3* %grid_dim, 
%struct.dim3* %block_dim, i64* %shmem_size, i8** %stream) + %13 = load i64, i64* %shmem_size, align 8 + %14 = load i8*, i8** %stream, align 8 + %15 = bitcast { i64, i32 }* %grid_dim.coerce to i8* + %16 = bitcast %struct.dim3* %grid_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %15, i8* align 8 %16, i64 12, i1 false) + %17 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 0 + %18 = load i64, i64* %17, align 8 + %19 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 1 + %20 = load i32, i32* %19, align 8 + %21 = bitcast { i64, i32 }* %block_dim.coerce to i8* + %22 = bitcast %struct.dim3* %block_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %21, i8* align 8 %22, i64 12, i1 false) + %23 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 0 + %24 = load i64, i64* %23, align 8 + %25 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 1 + %26 = load i32, i32* %25, align 8 + %27 = bitcast i8* %14 to %struct.CUstream_st* + %call = call i32 @cudaLaunchKernel(i8* bitcast (void (float*, float*, float*, float*, i32, i32)* @_Z22bpnn_layerforward_CUDAPfS_S_S_ii to i8*), i64 %18, i32 %20, i64 %24, i32 %26, i8** %kernel_args, i64 %13, %struct.CUstream_st* %27) + br label %setup.end + +setup.end: ; preds = %entry + ret void +} + +declare dso_local i32 @__cudaPopCallConfiguration(%struct.dim3*, %struct.dim3*, i64*, i8**) + +declare dso_local i32 @cudaLaunchKernel(i8*, i64, i32, i64, i32, i8**, i64, %struct.CUstream_st*) + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #1 + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z24bpnn_adjust_weights_cudaPfiS_iS_S_(float* %delta, i32 %hid, float* %ly, i32 %in, float* %w, float* %oldw) #0 { +entry: + %delta.addr = alloca float*, align 8 + 
%hid.addr = alloca i32, align 4 + %ly.addr = alloca float*, align 8 + %in.addr = alloca i32, align 4 + %w.addr = alloca float*, align 8 + %oldw.addr = alloca float*, align 8 + %grid_dim = alloca %struct.dim3, align 8 + %block_dim = alloca %struct.dim3, align 8 + %shmem_size = alloca i64, align 8 + %stream = alloca i8*, align 8 + %grid_dim.coerce = alloca { i64, i32 }, align 8 + %block_dim.coerce = alloca { i64, i32 }, align 8 + store float* %delta, float** %delta.addr, align 8 + store i32 %hid, i32* %hid.addr, align 4 + store float* %ly, float** %ly.addr, align 8 + store i32 %in, i32* %in.addr, align 4 + store float* %w, float** %w.addr, align 8 + store float* %oldw, float** %oldw.addr, align 8 + %kernel_args = alloca i8*, i64 6, align 16 + %0 = bitcast float** %delta.addr to i8* + %1 = getelementptr i8*, i8** %kernel_args, i32 0 + store i8* %0, i8** %1 + %2 = bitcast i32* %hid.addr to i8* + %3 = getelementptr i8*, i8** %kernel_args, i32 1 + store i8* %2, i8** %3 + %4 = bitcast float** %ly.addr to i8* + %5 = getelementptr i8*, i8** %kernel_args, i32 2 + store i8* %4, i8** %5 + %6 = bitcast i32* %in.addr to i8* + %7 = getelementptr i8*, i8** %kernel_args, i32 3 + store i8* %6, i8** %7 + %8 = bitcast float** %w.addr to i8* + %9 = getelementptr i8*, i8** %kernel_args, i32 4 + store i8* %8, i8** %9 + %10 = bitcast float** %oldw.addr to i8* + %11 = getelementptr i8*, i8** %kernel_args, i32 5 + store i8* %10, i8** %11 + %12 = call i32 @__cudaPopCallConfiguration(%struct.dim3* %grid_dim, %struct.dim3* %block_dim, i64* %shmem_size, i8** %stream) + %13 = load i64, i64* %shmem_size, align 8 + %14 = load i8*, i8** %stream, align 8 + %15 = bitcast { i64, i32 }* %grid_dim.coerce to i8* + %16 = bitcast %struct.dim3* %grid_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %15, i8* align 8 %16, i64 12, i1 false) + %17 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 0 + %18 = load i64, i64* %17, align 8 + %19 = getelementptr inbounds { 
i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 1 + %20 = load i32, i32* %19, align 8 + %21 = bitcast { i64, i32 }* %block_dim.coerce to i8* + %22 = bitcast %struct.dim3* %block_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %21, i8* align 8 %22, i64 12, i1 false) + %23 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 0 + %24 = load i64, i64* %23, align 8 + %25 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 1 + %26 = load i32, i32* %25, align 8 + %27 = bitcast i8* %14 to %struct.CUstream_st* + %call = call i32 @cudaLaunchKernel(i8* bitcast (void (float*, i32, float*, i32, float*, float*)* @_Z24bpnn_adjust_weights_cudaPfiS_iS_S_ to i8*), i64 %18, i32 %20, i64 %24, i32 %26, i8** %kernel_args, i64 %13, %struct.CUstream_st* %27) + br label %setup.end + +setup.end: ; preds = %entry + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local double @_Z7gettimev() #2 { +entry: + %t = alloca %struct.timeval, align 8 + %call = call i32 @gettimeofday(%struct.timeval* %t, %struct.timezone* null) #7 + %tv_sec = getelementptr inbounds %struct.timeval, %struct.timeval* %t, i32 0, i32 0 + %0 = load i64, i64* %tv_sec, align 8 + %conv = sitofp i64 %0 to double + %tv_usec = getelementptr inbounds %struct.timeval, %struct.timeval* %t, i32 0, i32 1 + %1 = load i64, i64* %tv_usec, align 8 + %conv1 = sitofp i64 %1 to double + %mul = fmul contract double %conv1, 0x3EB0C6F7A0B5ED8D + %add = fadd contract double %conv, %mul + ret double %add +} + +; Function Attrs: nounwind +declare dso_local i32 @gettimeofday(%struct.timeval*, %struct.timezone*) #3 + +; Function Attrs: noinline norecurse optnone uwtable +define dso_local i32 @main(i32 %argc, i8** %argv) #4 { +entry: + %argc.addr = alloca i32, align 4 + %argv.addr = alloca i8**, align 8 + store i32 %argc, i32* %argc.addr, align 4 + store i8** %argv, i8*** %argv.addr, align 8 + %call = call i32 @cudaSetDevice(i32 0) + %0 = 
load i32, i32* %argc.addr, align 4 + %1 = load i8**, i8*** %argv.addr, align 8 + %call1 = call i32 @setup(i32 %0, i8** %1) + ret i32 0 +} + +declare dso_local i32 @cudaSetDevice(i32) #5 + +declare dso_local i32 @setup(i32, i8**) #5 + +; Function Attrs: noinline optnone uwtable +define dso_local void @bpnn_train_cuda(%struct.BPNN* %net, float* %eo, float* %eh) #0 { +entry: + %net.addr = alloca %struct.BPNN*, align 8 + %eo.addr = alloca float*, align 8 + %eh.addr = alloca float*, align 8 + %in = alloca i32, align 4 + %hid = alloca i32, align 4 + %out = alloca i32, align 4 + %out_err = alloca float, align 4 + %hid_err = alloca float, align 4 + %m = alloca i32, align 4 + %input_hidden_cuda = alloca float*, align 8 + %input_cuda = alloca float*, align 8 + %output_hidden_cuda = alloca float*, align 8 + %partial_sum = alloca float*, align 8 + %hidden_partial_sum = alloca float*, align 8 + %hidden_delta_cuda = alloca float*, align 8 + %input_prev_weights_cuda = alloca float*, align 8 + %sum = alloca float, align 4 + %input_weights_one_dim = alloca float*, align 8 + %input_weights_prev_one_dim = alloca float*, align 8 + %grid = alloca %struct.dim3, align 4 + %threads = alloca %struct.dim3, align 4 + %k = alloca i32, align 4 + %j = alloca i32, align 4 + %agg.tmp = alloca %struct.dim3, align 4 + %agg.tmp59 = alloca %struct.dim3, align 4 + %agg.tmp.coerce = alloca { i64, i32 }, align 4 + %agg.tmp59.coerce = alloca { i64, i32 }, align 4 + %error = alloca i32, align 4 + %j70 = alloca i32, align 4 + %k74 = alloca i32, align 4 + %agg.tmp136 = alloca %struct.dim3, align 4 + %agg.tmp137 = alloca %struct.dim3, align 4 + %agg.tmp136.coerce = alloca { i64, i32 }, align 4 + %agg.tmp137.coerce = alloca { i64, i32 }, align 4 + %i = alloca i32, align 4 + store %struct.BPNN* %net, %struct.BPNN** %net.addr, align 8 + store float* %eo, float** %eo.addr, align 8 + store float* %eh, float** %eh.addr, align 8 + %0 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8 + %input_n = getelementptr 
inbounds %struct.BPNN, %struct.BPNN* %0, i32 0, i32 0 + %1 = load i32, i32* %input_n, align 8 + store i32 %1, i32* %in, align 4 + %2 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8 + %hidden_n = getelementptr inbounds %struct.BPNN, %struct.BPNN* %2, i32 0, i32 1 + %3 = load i32, i32* %hidden_n, align 4 + store i32 %3, i32* %hid, align 4 + %4 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8 + %output_n = getelementptr inbounds %struct.BPNN, %struct.BPNN* %4, i32 0, i32 2 + %5 = load i32, i32* %output_n, align 8 + store i32 %5, i32* %out, align 4 + store i32 0, i32* %m, align 4 + %6 = load i32, i32* %in, align 4 + %div = sdiv i32 %6, 16 + store i32 %div, i32* @num_blocks, align 4 + %7 = load i32, i32* @num_blocks, align 4 + call void @_ZN4dim3C2Ejjj(%struct.dim3* %grid, i32 1, i32 %7, i32 1) + call void @_ZN4dim3C2Ejjj(%struct.dim3* %threads, i32 16, i32 16, i32 1) + %8 = load i32, i32* %in, align 4 + %add = add nsw i32 %8, 1 + %9 = load i32, i32* %hid, align 4 + %add1 = add nsw i32 %9, 1 + %mul = mul nsw i32 %add, %add1 + %conv = sext i32 %mul to i64 + %mul2 = mul i64 %conv, 4 + %call = call noalias i8* @malloc(i64 %mul2) #7 + %10 = bitcast i8* %call to float* + store float* %10, float** %input_weights_one_dim, align 8 + %11 = load i32, i32* %in, align 4 + %add3 = add nsw i32 %11, 1 + %12 = load i32, i32* %hid, align 4 + %add4 = add nsw i32 %12, 1 + %mul5 = mul nsw i32 %add3, %add4 + %conv6 = sext i32 %mul5 to i64 + %mul7 = mul i64 %conv6, 4 + %call8 = call noalias i8* @malloc(i64 %mul7) #7 + %13 = bitcast i8* %call8 to float* + store float* %13, float** %input_weights_prev_one_dim, align 8 + %14 = load i32, i32* @num_blocks, align 4 + %mul9 = mul i32 %14, 16 + %conv10 = zext i32 %mul9 to i64 + %mul11 = mul i64 %conv10, 4 + %call12 = call noalias i8* @malloc(i64 %mul11) #7 + %15 = bitcast i8* %call12 to float* + store float* %15, float** %partial_sum, align 8 + store i32 0, i32* %k, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc27, 
%entry + %16 = load i32, i32* %k, align 4 + %17 = load i32, i32* %in, align 4 + %cmp = icmp sle i32 %16, %17 + br i1 %cmp, label %for.body, label %for.end29 + +for.body: ; preds = %for.cond + store i32 0, i32* %j, align 4 + br label %for.cond13 + +for.cond13: ; preds = %for.inc, %for.body + %18 = load i32, i32* %j, align 4 + %19 = load i32, i32* %hid, align 4 + %cmp14 = icmp sle i32 %18, %19 + br i1 %cmp14, label %for.body15, label %for.end + +for.body15: ; preds = %for.cond13 + %20 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8 + %input_weights = getelementptr inbounds %struct.BPNN, %struct.BPNN* %20, i32 0, i32 9 + %21 = load float**, float*** %input_weights, align 8 + %22 = load i32, i32* %k, align 4 + %idxprom = sext i32 %22 to i64 + %arrayidx = getelementptr inbounds float*, float** %21, i64 %idxprom + %23 = load float*, float** %arrayidx, align 8 + %24 = load i32, i32* %j, align 4 + %idxprom16 = sext i32 %24 to i64 + %arrayidx17 = getelementptr inbounds float, float* %23, i64 %idxprom16 + %25 = load float, float* %arrayidx17, align 4 + %26 = load float*, float** %input_weights_one_dim, align 8 + %27 = load i32, i32* %m, align 4 + %idxprom18 = sext i32 %27 to i64 + %arrayidx19 = getelementptr inbounds float, float* %26, i64 %idxprom18 + store float %25, float* %arrayidx19, align 4 + %28 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8 + %input_prev_weights = getelementptr inbounds %struct.BPNN, %struct.BPNN* %28, i32 0, i32 11 + %29 = load float**, float*** %input_prev_weights, align 8 + %30 = load i32, i32* %k, align 4 + %idxprom20 = sext i32 %30 to i64 + %arrayidx21 = getelementptr inbounds float*, float** %29, i64 %idxprom20 + %31 = load float*, float** %arrayidx21, align 8 + %32 = load i32, i32* %j, align 4 + %idxprom22 = sext i32 %32 to i64 + %arrayidx23 = getelementptr inbounds float, float* %31, i64 %idxprom22 + %33 = load float, float* %arrayidx23, align 4 + %34 = load float*, float** %input_weights_prev_one_dim, align 8 + %35 = load 
i32, i32* %m, align 4 + %idxprom24 = sext i32 %35 to i64 + %arrayidx25 = getelementptr inbounds float, float* %34, i64 %idxprom24 + store float %33, float* %arrayidx25, align 4 + %36 = load i32, i32* %m, align 4 + %inc = add nsw i32 %36, 1 + store i32 %inc, i32* %m, align 4 + br label %for.inc + +for.inc: ; preds = %for.body15 + %37 = load i32, i32* %j, align 4 + %inc26 = add nsw i32 %37, 1 + store i32 %inc26, i32* %j, align 4 + br label %for.cond13 + +for.end: ; preds = %for.cond13 + br label %for.inc27 + +for.inc27: ; preds = %for.end + %38 = load i32, i32* %k, align 4 + %inc28 = add nsw i32 %38, 1 + store i32 %inc28, i32* %k, align 4 + br label %for.cond + +for.end29: ; preds = %for.cond + %39 = bitcast float** %input_cuda to i8** + %40 = load i32, i32* %in, align 4 + %add30 = add nsw i32 %40, 1 + %conv31 = sext i32 %add30 to i64 + %mul32 = mul i64 %conv31, 4 + %call33 = call i32 @cudaMalloc(i8** %39, i64 %mul32) + %41 = bitcast float** %output_hidden_cuda to i8** + %42 = load i32, i32* %hid, align 4 + %add34 = add nsw i32 %42, 1 + %conv35 = sext i32 %add34 to i64 + %mul36 = mul i64 %conv35, 4 + %call37 = call i32 @cudaMalloc(i8** %41, i64 %mul36) + %43 = bitcast float** %input_hidden_cuda to i8** + %44 = load i32, i32* %in, align 4 + %add38 = add nsw i32 %44, 1 + %45 = load i32, i32* %hid, align 4 + %add39 = add nsw i32 %45, 1 + %mul40 = mul nsw i32 %add38, %add39 + %conv41 = sext i32 %mul40 to i64 + %mul42 = mul i64 %conv41, 4 + %call43 = call i32 @cudaMalloc(i8** %43, i64 %mul42) + %46 = bitcast float** %hidden_partial_sum to i8** + %47 = load i32, i32* @num_blocks, align 4 + %mul44 = mul i32 %47, 16 + %conv45 = zext i32 %mul44 to i64 + %mul46 = mul i64 %conv45, 4 + %call47 = call i32 @cudaMalloc(i8** %46, i64 %mul46) + %call48 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([28 x i8], [28 x i8]* @.str, i64 0, i64 0)) + %48 = load float*, float** %input_cuda, align 8 + %49 = bitcast float* %48 to i8* + %50 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8 + %input_units = getelementptr inbounds %struct.BPNN, %struct.BPNN* %50, i32 0, i32 3 + %51 = load float*, float** %input_units, align 8 + %52 = bitcast float* %51 to i8* + %53 = load i32, i32* %in, align 4 + %add49 = add nsw i32 %53, 1 + %conv50 = sext i32 %add49 to i64 + %mul51 = mul i64 %conv50, 4 + %call52 = call i32 @cudaMemcpy(i8* %49, i8* %52, i64 %mul51, i32 1) + %54 = load float*, float** %input_hidden_cuda, align 8 + %55 = bitcast float* %54 to i8* + %56 = load float*, float** %input_weights_one_dim, align 8 + %57 = bitcast float* %56 to i8* + %58 = load i32, i32* %in, align 4 + %add53 = add nsw i32 %58, 1 + %59 = load i32, i32* %hid, align 4 + %add54 = add nsw i32 %59, 1 + %mul55 = mul nsw i32 %add53, %add54 + %conv56 = sext i32 %mul55 to i64 + %mul57 = mul i64 %conv56, 4 + %call58 = call i32 @cudaMemcpy(i8* %55, i8* %57, i64 %mul57, i32 1) + %60 = bitcast %struct.dim3* %agg.tmp to i8* + %61 = bitcast %struct.dim3* %grid to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %60, i8* align 4 %61, i64 12, i1 false) + %62 = bitcast %struct.dim3* %agg.tmp59 to i8* + %63 = bitcast %struct.dim3* %threads to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %62, i8* align 4 %63, i64 12, i1 false) + %64 = bitcast { i64, i32 }* %agg.tmp.coerce to i8* + %65 = bitcast %struct.dim3* %agg.tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %64, i8* align 4 %65, i64 12, i1 false) + %66 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 0 + %67 = load i64, i64* %66, align 4 + %68 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 1 + %69 = load i32, i32* %68, align 4 + %70 = bitcast { i64, i32 }* %agg.tmp59.coerce to i8* + %71 = bitcast %struct.dim3* %agg.tmp59 to i8* + 
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %70, i8* align 4 %71, i64 12, i1 false) + %72 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp59.coerce, i32 0, i32 0 + %73 = load i64, i64* %72, align 4 + %74 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp59.coerce, i32 0, i32 1 + %75 = load i32, i32* %74, align 4 + %call60 = call i32 @__cudaPushCallConfiguration(i64 %67, i32 %69, i64 %73, i32 %75, i64 0, i8* null) + %tobool = icmp ne i32 %call60, 0 + br i1 %tobool, label %kcall.end, label %kcall.configok + +kcall.configok: ; preds = %for.end29 + %76 = load float*, float** %input_cuda, align 8 + %77 = load float*, float** %output_hidden_cuda, align 8 + %78 = load float*, float** %input_hidden_cuda, align 8 + %79 = load float*, float** %hidden_partial_sum, align 8 + %80 = load i32, i32* %in, align 4 + %81 = load i32, i32* %hid, align 4 + call void @_Z22bpnn_layerforward_CUDAPfS_S_S_ii(float* %76, float* %77, float* %78, float* %79, i32 %80, i32 %81) + br label %kcall.end + +kcall.end: ; preds = %kcall.configok, %for.end29 + %call61 = call i32 @cudaThreadSynchronize() + %call62 = call i32 @cudaGetLastError() + store i32 %call62, i32* %error, align 4 + %82 = load i32, i32* %error, align 4 + %cmp63 = icmp ne i32 %82, 0 + br i1 %cmp63, label %if.then, label %if.end + +if.then: ; preds = %kcall.end + %83 = load i32, i32* %error, align 4 + %call64 = call i8* @cudaGetErrorString(i32 %83) + %call65 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str.1, i64 0, i64 0), i8* %call64) + call void @exit(i32 1) #8 + unreachable + +if.end: ; preds = %kcall.end + %84 = load float*, float** %partial_sum, align 8 + %85 = bitcast float* %84 to i8* + %86 = load float*, float** %hidden_partial_sum, align 8 + %87 = bitcast float* %86 to i8* + %88 = load i32, i32* @num_blocks, align 4 + %mul66 = mul i32 %88, 16 + %conv67 = zext i32 %mul66 to i64 + %mul68 = mul i64 %conv67, 4 + %call69 = call i32 @cudaMemcpy(i8* %85, i8* %87, i64 %mul68, i32 2) + store i32 1, i32* %j70, align 4 + br label %for.cond71 + +for.cond71: ; preds = %for.inc98, %if.end + %89 = load i32, i32* %j70, align 4 + %90 = load i32, i32* %hid, align 4 + %cmp72 = icmp sle i32 %89, %90 + br i1 %cmp72, label %for.body73, label %for.end100 + +for.body73: ; preds = %for.cond71 + store float 0.000000e+00, float* %sum, align 4 + store i32 0, i32* %k74, align 4 + br label %for.cond75 + +for.cond75: ; preds = %for.inc83, %for.body73 + %91 = load i32, i32* %k74, align 4 + %92 = load i32, i32* @num_blocks, align 4 + %cmp76 = icmp ult i32 %91, %92 + br i1 %cmp76, label %for.body77, label %for.end85 + +for.body77: ; preds = %for.cond75 + %93 = load float*, float** %partial_sum, align 8 + %94 = load i32, i32* %k74, align 4 + %95 = load i32, i32* %hid, align 4 + %mul78 = mul nsw i32 %94, %95 + %96 = load i32, i32* %j70, align 4 + %add79 = add nsw i32 %mul78, %96 + %sub = sub nsw i32 %add79, 1 + %idxprom80 = sext i32 %sub to i64 + %arrayidx81 = getelementptr inbounds float, float* %93, i64 %idxprom80 + %97 = load float, float* %arrayidx81, align 4 + %98 = load float, float* %sum, align 4 + %add82 = fadd contract float %98, %97 + store float %add82, float* %sum, align 4 + br label %for.inc83 + +for.inc83: ; preds = %for.body77 + %99 = load i32, i32* %k74, align 4 + %inc84 = add nsw i32 %99, 1 + store i32 %inc84, i32* %k74, align 4 + br label %for.cond75 + +for.end85: ; preds = %for.cond75 + %100 = load 
%struct.BPNN*, %struct.BPNN** %net.addr, align 8 + %input_weights86 = getelementptr inbounds %struct.BPNN, %struct.BPNN* %100, i32 0, i32 9 + %101 = load float**, float*** %input_weights86, align 8 + %arrayidx87 = getelementptr inbounds float*, float** %101, i64 0 + %102 = load float*, float** %arrayidx87, align 8 + %103 = load i32, i32* %j70, align 4 + %idxprom88 = sext i32 %103 to i64 + %arrayidx89 = getelementptr inbounds float, float* %102, i64 %idxprom88 + %104 = load float, float* %arrayidx89, align 4 + %105 = load float, float* %sum, align 4 + %add90 = fadd contract float %105, %104 + store float %add90, float* %sum, align 4 + %106 = load float, float* %sum, align 4 + %fneg = fneg float %106 + %call91 = call float @_ZSt3expf(float %fneg) + %conv92 = fpext float %call91 to double + %add93 = fadd contract double 1.000000e+00, %conv92 + %div94 = fdiv double 1.000000e+00, %add93 + %conv95 = fptrunc double %div94 to float + %107 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8 + %hidden_units = getelementptr inbounds %struct.BPNN, %struct.BPNN* %107, i32 0, i32 4 + %108 = load float*, float** %hidden_units, align 8 + %109 = load i32, i32* %j70, align 4 + %idxprom96 = sext i32 %109 to i64 + %arrayidx97 = getelementptr inbounds float, float* %108, i64 %idxprom96 + store float %conv95, float* %arrayidx97, align 4 + br label %for.inc98 + +for.inc98: ; preds = %for.end85 + %110 = load i32, i32* %j70, align 4 + %inc99 = add nsw i32 %110, 1 + store i32 %inc99, i32* %j70, align 4 + br label %for.cond71 + +for.end100: ; preds = %for.cond71 + %111 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8 + %hidden_units101 = getelementptr inbounds %struct.BPNN, %struct.BPNN* %111, i32 0, i32 4 + %112 = load float*, float** %hidden_units101, align 8 + %113 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8 + %output_units = getelementptr inbounds %struct.BPNN, %struct.BPNN* %113, i32 0, i32 5 + %114 = load float*, float** %output_units, align 8 + %115 = load 
%struct.BPNN*, %struct.BPNN** %net.addr, align 8 + %hidden_weights = getelementptr inbounds %struct.BPNN, %struct.BPNN* %115, i32 0, i32 10 + %116 = load float**, float*** %hidden_weights, align 8 + %117 = load i32, i32* %hid, align 4 + %118 = load i32, i32* %out, align 4 + call void @bpnn_layerforward(float* %112, float* %114, float** %116, i32 %117, i32 %118) + %119 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8 + %output_delta = getelementptr inbounds %struct.BPNN, %struct.BPNN* %119, i32 0, i32 7 + %120 = load float*, float** %output_delta, align 8 + %121 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8 + %target = getelementptr inbounds %struct.BPNN, %struct.BPNN* %121, i32 0, i32 8 + %122 = load float*, float** %target, align 8 + %123 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8 + %output_units102 = getelementptr inbounds %struct.BPNN, %struct.BPNN* %123, i32 0, i32 5 + %124 = load float*, float** %output_units102, align 8 + %125 = load i32, i32* %out, align 4 + call void @bpnn_output_error(float* %120, float* %122, float* %124, i32 %125, float* %out_err) + %126 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8 + %hidden_delta = getelementptr inbounds %struct.BPNN, %struct.BPNN* %126, i32 0, i32 6 + %127 = load float*, float** %hidden_delta, align 8 + %128 = load i32, i32* %hid, align 4 + %129 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8 + %output_delta103 = getelementptr inbounds %struct.BPNN, %struct.BPNN* %129, i32 0, i32 7 + %130 = load float*, float** %output_delta103, align 8 + %131 = load i32, i32* %out, align 4 + %132 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8 + %hidden_weights104 = getelementptr inbounds %struct.BPNN, %struct.BPNN* %132, i32 0, i32 10 + %133 = load float**, float*** %hidden_weights104, align 8 + %134 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8 + %hidden_units105 = getelementptr inbounds %struct.BPNN, %struct.BPNN* %134, i32 0, i32 4 + %135 = load float*, float** 
%hidden_units105, align 8 + call void @bpnn_hidden_error(float* %127, i32 %128, float* %130, i32 %131, float** %133, float* %135, float* %hid_err) + %136 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8 + %output_delta106 = getelementptr inbounds %struct.BPNN, %struct.BPNN* %136, i32 0, i32 7 + %137 = load float*, float** %output_delta106, align 8 + %138 = load i32, i32* %out, align 4 + %139 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8 + %hidden_units107 = getelementptr inbounds %struct.BPNN, %struct.BPNN* %139, i32 0, i32 4 + %140 = load float*, float** %hidden_units107, align 8 + %141 = load i32, i32* %hid, align 4 + %142 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8 + %hidden_weights108 = getelementptr inbounds %struct.BPNN, %struct.BPNN* %142, i32 0, i32 10 + %143 = load float**, float*** %hidden_weights108, align 8 + %144 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8 + %hidden_prev_weights = getelementptr inbounds %struct.BPNN, %struct.BPNN* %144, i32 0, i32 12 + %145 = load float**, float*** %hidden_prev_weights, align 8 + call void @bpnn_adjust_weights(float* %137, i32 %138, float* %140, i32 %141, float** %143, float** %145) + %146 = bitcast float** %hidden_delta_cuda to i8** + %147 = load i32, i32* %hid, align 4 + %add109 = add nsw i32 %147, 1 + %conv110 = sext i32 %add109 to i64 + %mul111 = mul i64 %conv110, 4 + %call112 = call i32 @cudaMalloc(i8** %146, i64 %mul111) + %148 = bitcast float** %input_prev_weights_cuda to i8** + %149 = load i32, i32* %in, align 4 + %add113 = add nsw i32 %149, 1 + %150 = load i32, i32* %hid, align 4 + %add114 = add nsw i32 %150, 1 + %mul115 = mul nsw i32 %add113, %add114 + %conv116 = sext i32 %mul115 to i64 + %mul117 = mul i64 %conv116, 4 + %call118 = call i32 @cudaMalloc(i8** %148, i64 %mul117) + %151 = load float*, float** %hidden_delta_cuda, align 8 + %152 = bitcast float* %151 to i8* + %153 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8 + %hidden_delta119 = getelementptr 
inbounds %struct.BPNN, %struct.BPNN* %153, i32 0, i32 6 + %154 = load float*, float** %hidden_delta119, align 8 + %155 = bitcast float* %154 to i8* + %156 = load i32, i32* %hid, align 4 + %add120 = add nsw i32 %156, 1 + %conv121 = sext i32 %add120 to i64 + %mul122 = mul i64 %conv121, 4 + %call123 = call i32 @cudaMemcpy(i8* %152, i8* %155, i64 %mul122, i32 1) + %157 = load float*, float** %input_prev_weights_cuda, align 8 + %158 = bitcast float* %157 to i8* + %159 = load float*, float** %input_weights_prev_one_dim, align 8 + %160 = bitcast float* %159 to i8* + %161 = load i32, i32* %in, align 4 + %add124 = add nsw i32 %161, 1 + %162 = load i32, i32* %hid, align 4 + %add125 = add nsw i32 %162, 1 + %mul126 = mul nsw i32 %add124, %add125 + %conv127 = sext i32 %mul126 to i64 + %mul128 = mul i64 %conv127, 4 + %call129 = call i32 @cudaMemcpy(i8* %158, i8* %160, i64 %mul128, i32 1) + %163 = load float*, float** %input_hidden_cuda, align 8 + %164 = bitcast float* %163 to i8* + %165 = load float*, float** %input_weights_one_dim, align 8 + %166 = bitcast float* %165 to i8* + %167 = load i32, i32* %in, align 4 + %add130 = add nsw i32 %167, 1 + %168 = load i32, i32* %hid, align 4 + %add131 = add nsw i32 %168, 1 + %mul132 = mul nsw i32 %add130, %add131 + %conv133 = sext i32 %mul132 to i64 + %mul134 = mul i64 %conv133, 4 + %call135 = call i32 @cudaMemcpy(i8* %164, i8* %166, i64 %mul134, i32 1) + %169 = bitcast %struct.dim3* %agg.tmp136 to i8* + %170 = bitcast %struct.dim3* %grid to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %169, i8* align 4 %170, i64 12, i1 false) + %171 = bitcast %struct.dim3* %agg.tmp137 to i8* + %172 = bitcast %struct.dim3* %threads to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %171, i8* align 4 %172, i64 12, i1 false) + %173 = bitcast { i64, i32 }* %agg.tmp136.coerce to i8* + %174 = bitcast %struct.dim3* %agg.tmp136 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %173, i8* align 4 %174, i64 12, i1 false) + %175 = 
getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp136.coerce, i32 0, i32 0 + %176 = load i64, i64* %175, align 4 + %177 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp136.coerce, i32 0, i32 1 + %178 = load i32, i32* %177, align 4 + %179 = bitcast { i64, i32 }* %agg.tmp137.coerce to i8* + %180 = bitcast %struct.dim3* %agg.tmp137 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %179, i8* align 4 %180, i64 12, i1 false) + %181 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp137.coerce, i32 0, i32 0 + %182 = load i64, i64* %181, align 4 + %183 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp137.coerce, i32 0, i32 1 + %184 = load i32, i32* %183, align 4 + %call138 = call i32 @__cudaPushCallConfiguration(i64 %176, i32 %178, i64 %182, i32 %184, i64 0, i8* null) + %tobool139 = icmp ne i32 %call138, 0 + br i1 %tobool139, label %kcall.end141, label %kcall.configok140 + +kcall.configok140: ; preds = %for.end100 + %185 = load float*, float** %hidden_delta_cuda, align 8 + %186 = load i32, i32* %hid, align 4 + %187 = load float*, float** %input_cuda, align 8 + %188 = load i32, i32* %in, align 4 + %189 = load float*, float** %input_hidden_cuda, align 8 + %190 = load float*, float** %input_prev_weights_cuda, align 8 + call void @_Z24bpnn_adjust_weights_cudaPfiS_iS_S_(float* %185, i32 %186, float* %187, i32 %188, float* %189, float* %190) + br label %kcall.end141 + +kcall.end141: ; preds = %kcall.configok140, %for.end100 + %191 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8 + %input_units142 = getelementptr inbounds %struct.BPNN, %struct.BPNN* %191, i32 0, i32 3 + %192 = load float*, float** %input_units142, align 8 + %193 = bitcast float* %192 to i8* + %194 = load float*, float** %input_cuda, align 8 + %195 = bitcast float* %194 to i8* + %196 = load i32, i32* %in, align 4 + %add143 = add nsw i32 %196, 1 + %conv144 = sext i32 %add143 to i64 + %mul145 = mul i64 %conv144, 4 + %call146 = call i32 @cudaMemcpy(i8* %193, 
i8* %195, i64 %mul145, i32 2) + %197 = load float*, float** %input_weights_one_dim, align 8 + %198 = bitcast float* %197 to i8* + %199 = load float*, float** %input_hidden_cuda, align 8 + %200 = bitcast float* %199 to i8* + %201 = load i32, i32* %in, align 4 + %add147 = add nsw i32 %201, 1 + %202 = load i32, i32* %hid, align 4 + %add148 = add nsw i32 %202, 1 + %mul149 = mul nsw i32 %add147, %add148 + %conv150 = sext i32 %mul149 to i64 + %mul151 = mul i64 %conv150, 4 + %call152 = call i32 @cudaMemcpy(i8* %198, i8* %200, i64 %mul151, i32 2) + store i32 0, i32* %i, align 4 + br label %for.cond153 + +for.cond153: ; preds = %for.inc163, %kcall.end141 + %203 = load i32, i32* %i, align 4 + %204 = load i32, i32* %in, align 4 + %add154 = add nsw i32 %204, 1 + %205 = load i32, i32* %hid, align 4 + %add155 = add nsw i32 %205, 1 + %mul156 = mul nsw i32 %add154, %add155 + %cmp157 = icmp slt i32 %203, %mul156 + br i1 %cmp157, label %for.body158, label %for.end165 + +for.body158: ; preds = %for.cond153 + %206 = load float*, float** %input_weights_one_dim, align 8 + %207 = load i32, i32* %i, align 4 + %idxprom159 = sext i32 %207 to i64 + %arrayidx160 = getelementptr inbounds float, float* %206, i64 %idxprom159 + %208 = load float, float* %arrayidx160, align 4 + %conv161 = fpext float %208 to double + %call162 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str.2, i64 0, i64 0), double %conv161) + br label %for.inc163 + +for.inc163: ; preds = %for.body158 + %209 = load i32, i32* %i, align 4 + %inc164 = add nsw i32 %209, 1 + store i32 %inc164, i32* %i, align 4 + br label %for.cond153 + +for.end165: ; preds = %for.cond153 + %call166 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.3, i64 0, i64 0)) + %210 = load float*, float** %input_cuda, align 8 + %211 = bitcast float* %210 to i8* + %call167 = call i32 @cudaFree(i8* %211) + %212 = load float*, float** %output_hidden_cuda, align 8 + %213 = bitcast float* %212 to i8* + %call168 = call i32 @cudaFree(i8* %213) + %214 = load float*, float** %input_hidden_cuda, align 8 + %215 = bitcast float* %214 to i8* + %call169 = call i32 @cudaFree(i8* %215) + %216 = load float*, float** %hidden_partial_sum, align 8 + %217 = bitcast float* %216 to i8* + %call170 = call i32 @cudaFree(i8* %217) + %218 = load float*, float** %input_prev_weights_cuda, align 8 + %219 = bitcast float* %218 to i8* + %call171 = call i32 @cudaFree(i8* %219) + %220 = load float*, float** %hidden_delta_cuda, align 8 + %221 = bitcast float* %220 to i8* + %call172 = call i32 @cudaFree(i8* %221) + %222 = load float*, float** %partial_sum, align 8 + %223 = bitcast float* %222 to i8* + call void @free(i8* %223) #7 + %224 = load float*, float** %input_weights_one_dim, align 8 + %225 = bitcast float* %224 to i8* + call void @free(i8* %225) #7 + %226 = load float*, float** %input_weights_prev_one_dim, align 8 + %227 = bitcast float* %226 to i8* + call void @free(i8* %227) #7 + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN4dim3C2Ejjj(%struct.dim3* %this, i32 %vx, i32 %vy, i32 %vz) unnamed_addr #2 comdat align 2 { +entry: + %this.addr = alloca %struct.dim3*, align 8 + %vx.addr = alloca i32, align 4 + %vy.addr = alloca i32, align 4 + %vz.addr = alloca i32, align 4 + store %struct.dim3* %this, %struct.dim3** %this.addr, align 8 + store i32 %vx, i32* %vx.addr, align 4 + store i32 %vy, i32* %vy.addr, align 4 + store i32 %vz, i32* %vz.addr, align 4 + %this1 = load %struct.dim3*, %struct.dim3** %this.addr, align 8 + %x = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 0 + %0 = load i32, i32* %vx.addr, align 
4 + store i32 %0, i32* %x, align 4 + %y = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 1 + %1 = load i32, i32* %vy.addr, align 4 + store i32 %1, i32* %y, align 4 + %z = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 2 + %2 = load i32, i32* %vz.addr, align 4 + store i32 %2, i32* %z, align 4 + ret void +} + +; Function Attrs: nounwind +declare dso_local noalias i8* @malloc(i64) #3 + +declare dso_local i32 @cudaMalloc(i8**, i64) #5 + +declare dso_local i32 @printf(i8*, ...) #5 + +declare dso_local i32 @cudaMemcpy(i8*, i8*, i64, i32) #5 + +declare dso_local i32 @__cudaPushCallConfiguration(i64, i32, i64, i32, i64, i8*) #5 + +declare dso_local i32 @cudaThreadSynchronize() #5 + +declare dso_local i32 @cudaGetLastError() #5 + +declare dso_local i8* @cudaGetErrorString(i32) #5 + +; Function Attrs: noreturn nounwind +declare dso_local void @exit(i32) #6 + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local float @_ZSt3expf(float %__x) #2 comdat { +entry: + %__x.addr = alloca float, align 4 + store float %__x, float* %__x.addr, align 4 + %0 = load float, float* %__x.addr, align 4 + %call = call float @expf(float %0) #7 + ret float %call +} + +declare dso_local void @bpnn_layerforward(float*, float*, float**, i32, i32) #5 + +declare dso_local void @bpnn_output_error(float*, float*, float*, i32, float*) #5 + +declare dso_local void @bpnn_hidden_error(float*, i32, float*, i32, float**, float*, float*) #5 + +declare dso_local void @bpnn_adjust_weights(float*, i32, float*, i32, float**, float**) #5 + +declare dso_local i32 @cudaFree(i8*) #5 + +; Function Attrs: nounwind +declare dso_local void @free(i8*) #3 + +; Function Attrs: nounwind +declare dso_local float @expf(float) #3 + +define internal void @__cuda_register_globals(i8** %0) { +entry: + %1 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void (float*, float*, float*, float*, i32, i32)* @_Z22bpnn_layerforward_CUDAPfS_S_S_ii to i8*), 
i8* getelementptr inbounds ([37 x i8], [37 x i8]* @0, i64 0, i64 0), i8* getelementptr inbounds ([37 x i8], [37 x i8]* @0, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null) + %2 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void (float*, i32, float*, i32, float*, float*)* @_Z24bpnn_adjust_weights_cudaPfiS_iS_S_ to i8*), i8* getelementptr inbounds ([39 x i8], [39 x i8]* @1, i64 0, i64 0), i8* getelementptr inbounds ([39 x i8], [39 x i8]* @1, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null) + ret void +} + +declare dso_local i32 @__cudaRegisterFunction(i8**, i8*, i8*, i8*, i32, i8*, i8*, i8*, i8*, i32*) + +declare dso_local i32 @__cudaRegisterVar(i8**, i8*, i8*, i8*, i32, i32, i32, i32) + +declare dso_local i8** @__cudaRegisterFatBinary(i8*) + +define internal void @__cuda_module_ctor(i8* %0) { +entry: + %1 = call i8** @__cudaRegisterFatBinary(i8* bitcast ({ i32, i32, i8*, i8* }* @__cuda_fatbin_wrapper to i8*)) + store i8** %1, i8*** @__cuda_gpubin_handle, align 8 + call void @__cuda_register_globals(i8** %1) + call void @__cudaRegisterFatBinaryEnd(i8** %1) + %2 = call i32 @atexit(void (i8*)* @__cuda_module_dtor) + ret void +} + +declare dso_local void @__cudaRegisterFatBinaryEnd(i8**) + +declare dso_local void @__cudaUnregisterFatBinary(i8**) + +define internal void @__cuda_module_dtor(i8* %0) { +entry: + %1 = load i8**, i8*** @__cuda_gpubin_handle, align 8 + call void @__cudaUnregisterFatBinary(i8** %1) + ret void +} + +declare dso_local i32 @atexit(void (i8*)*) + +attributes #0 = { noinline optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" 
"unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind willreturn } +attributes #2 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { noinline norecurse optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #5 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #6 = { noreturn 
nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #7 = { nounwind } +attributes #8 = { noreturn nounwind } + +!llvm.module.flags = !{!0, !1} +!llvm.ident = !{!2} + +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"} diff --git a/examples/backprop/backprop_cuda.cu b/examples/backprop/backprop_cuda.cu new file mode 100644 index 0000000..9641fe6 --- /dev/null +++ b/examples/backprop/backprop_cuda.cu @@ -0,0 +1,195 @@ +#include +#include +#include +#include +#include +#include + +// includes, kernels +#include "backprop.h" +#include "backprop_cuda_kernel.cu" + +//////////////////////////////////////////////////////////////////////////////// + +extern "C" void bpnn_layerforward(float *l1, float *l2, float **conn, int n1, + int n2); + +extern "C" void bpnn_output_error(float *delta, float *target, float *output, + int nj, float *err); + +extern "C" void bpnn_hidden_error(float *delta_h, int nh, float *delta_o, + int no, float **who, float *hidden, + float *err); + +extern "C" void bpnn_adjust_weights(float *delta, int ndelta, float *ly, + int nly, float **w, float **oldw); + +extern "C" int setup(int argc, char **argv); + +extern "C" float **alloc_2d_dbl(int m, int n); + +extern "C" float squash(float x); + +double gettime() { + struct timeval t; + gettimeofday(&t, NULL); + return t.tv_sec + t.tv_usec * 1e-6; +} + +unsigned int num_threads = 0; +unsigned int num_blocks = 0; + 
+//////////////////////////////////////////////////////////////////////////////// +// Program main +//////////////////////////////////////////////////////////////////////////////// +int main(int argc, char **argv) { + cudaSetDevice(0); + setup(argc, argv); +} + +extern "C" void bpnn_train_cuda(BPNN *net, float *eo, float *eh) { + int in, hid, out; + float out_err, hid_err; + + in = net->input_n; + hid = net->hidden_n; + out = net->output_n; + +#ifdef GPU + int m = 0; + float *input_hidden_cuda; + float *input_cuda; + float *output_hidden_cuda; + float *partial_sum; + float *hidden_partial_sum; + float *hidden_delta_cuda; + float *input_prev_weights_cuda; + float sum; + float *input_weights_one_dim; + float *input_weights_prev_one_dim; + num_blocks = in / 16; + dim3 grid(1, num_blocks); + dim3 threads(16, 16); + + input_weights_one_dim = (float *)malloc((in + 1) * (hid + 1) * sizeof(float)); + input_weights_prev_one_dim = + (float *)malloc((in + 1) * (hid + 1) * sizeof(float)); + partial_sum = (float *)malloc(num_blocks * WIDTH * sizeof(float)); + + // this preprocessing stage is added to correct the bugs of wrong memcopy + // using two-dimensional net->inputweights + for (int k = 0; k <= in; k++) { + for (int j = 0; j <= hid; j++) { + input_weights_one_dim[m] = net->input_weights[k][j]; + input_weights_prev_one_dim[m] = net->input_prev_weights[k][j]; + m++; + } + } + + cudaMalloc((void **)&input_cuda, (in + 1) * sizeof(float)); + cudaMalloc((void **)&output_hidden_cuda, (hid + 1) * sizeof(float)); + cudaMalloc((void **)&input_hidden_cuda, (in + 1) * (hid + 1) * sizeof(float)); + cudaMalloc((void **)&hidden_partial_sum, num_blocks * WIDTH * sizeof(float)); + +#endif + +#ifdef CPU + + printf("Performing CPU computation\n"); + bpnn_layerforward(net->input_units, net->hidden_units, net->input_weights, in, + hid); + +#endif + +#ifdef GPU + + printf("Performing GPU computation\n"); + + // printf("in= %d, hid = %d, numblocks = %d\n", in, hid, num_blocks); + + 
cudaMemcpy(input_cuda, net->input_units, (in + 1) * sizeof(float), + cudaMemcpyHostToDevice); + cudaMemcpy(input_hidden_cuda, input_weights_one_dim, + (in + 1) * (hid + 1) * sizeof(float), cudaMemcpyHostToDevice); + + bpnn_layerforward_CUDA<<>>(input_cuda, output_hidden_cuda, + input_hidden_cuda, + hidden_partial_sum, in, hid); + + cudaThreadSynchronize(); + + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) { + printf("bpnn kernel error: %s\n", cudaGetErrorString(error)); + exit(EXIT_FAILURE); + } + + cudaMemcpy(partial_sum, hidden_partial_sum, + num_blocks * WIDTH * sizeof(float), cudaMemcpyDeviceToHost); + + for (int j = 1; j <= hid; j++) { + sum = 0.0; + for (int k = 0; k < num_blocks; k++) { + sum += partial_sum[k * hid + j - 1]; + } + sum += net->input_weights[0][j]; + net->hidden_units[j] = float(1.0 / (1.0 + exp(-sum))); + } +#endif + + bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights, + hid, out); + bpnn_output_error(net->output_delta, net->target, net->output_units, out, + &out_err); + bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out, + net->hidden_weights, net->hidden_units, &hid_err); + bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid, + net->hidden_weights, net->hidden_prev_weights); + +#ifdef CPU + + bpnn_adjust_weights(net->hidden_delta, hid, net->input_units, in, + net->input_weights, net->input_prev_weights); + +#endif + +#ifdef GPU + + cudaMalloc((void **)&hidden_delta_cuda, (hid + 1) * sizeof(float)); + cudaMalloc((void **)&input_prev_weights_cuda, + (in + 1) * (hid + 1) * sizeof(float)); + + cudaMemcpy(hidden_delta_cuda, net->hidden_delta, (hid + 1) * sizeof(float), + cudaMemcpyHostToDevice); + cudaMemcpy(input_prev_weights_cuda, input_weights_prev_one_dim, + (in + 1) * (hid + 1) * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(input_hidden_cuda, input_weights_one_dim, + (in + 1) * (hid + 1) * sizeof(float), cudaMemcpyHostToDevice); + + 
bpnn_adjust_weights_cuda<<>>(hidden_delta_cuda, hid, + input_cuda, in, input_hidden_cuda, + input_prev_weights_cuda); + + cudaMemcpy(net->input_units, input_cuda, (in + 1) * sizeof(float), + cudaMemcpyDeviceToHost); + cudaMemcpy(input_weights_one_dim, input_hidden_cuda, + (in + 1) * (hid + 1) * sizeof(float), cudaMemcpyDeviceToHost); + + for (int i = 0; i < (in + 1) * (hid + 1); i++) { + printf("%f ", input_weights_one_dim[i]); + } + printf("\n"); + + cudaFree(input_cuda); + cudaFree(output_hidden_cuda); + cudaFree(input_hidden_cuda); + cudaFree(hidden_partial_sum); + cudaFree(input_prev_weights_cuda); + cudaFree(hidden_delta_cuda); + + free(partial_sum); + free(input_weights_one_dim); + free(input_weights_prev_one_dim); + +#endif +} diff --git a/examples/backprop/backprop_cuda_kernel.cu b/examples/backprop/backprop_cuda_kernel.cu new file mode 100644 index 0000000..96b7d9b --- /dev/null +++ b/examples/backprop/backprop_cuda_kernel.cu @@ -0,0 +1,96 @@ +#ifndef _BACKPROP_CUDA_KERNEL_H_ +#define _BACKPROP_CUDA_KERNEL_H_ + +#include "backprop.h" +#include "cuda.h" +#include "math.h" +#include + +__global__ void bpnn_layerforward_CUDA(float *input_cuda, + float *output_hidden_cuda, + float *input_hidden_cuda, + float *hidden_partial_sum, int in, + int hid) { + int by = blockIdx.y; + int tx = threadIdx.x; + int ty = threadIdx.y; + + int index = (hid + 1) * HEIGHT * by + (hid + 1) * ty + tx + 1 + (hid + 1); + + int index_in = HEIGHT * by + ty + 1; + + __shared__ float input_node[HEIGHT]; + __shared__ float weight_matrix[HEIGHT][WIDTH]; + + if (tx == 0) + input_node[ty] = input_cuda[index_in]; + + __syncthreads(); + + weight_matrix[ty][tx] = input_hidden_cuda[index]; + + __syncthreads(); + + weight_matrix[ty][tx] = weight_matrix[ty][tx] * input_node[ty]; + + __syncthreads(); + + for (int i = 1; i <= __log2f(HEIGHT); i++) { + + int power_two = __powf(2, i); + + if (ty % power_two == 0) + weight_matrix[ty][tx] = + weight_matrix[ty][tx] + weight_matrix[ty + power_two / 
2][tx]; + + __syncthreads(); + } + + //__syncthreads(); + + input_hidden_cuda[index] = weight_matrix[ty][tx]; + + /* + for ( unsigned int i = 2 ; i <= HEIGHT ; i *= 2){ + + unsigned int power_two = i - 1; + if( (ty & power_two) == 0 ) { + weight_matrix[ty][tx] = weight_matrix[ty][tx] + + weight_matrix[ty + power_two/2][tx]; + } + } + */ + + __syncthreads(); + + if (tx == 0) { + hidden_partial_sum[by * hid + ty] = weight_matrix[tx][ty]; + } +} + +__global__ void bpnn_adjust_weights_cuda(float *delta, int hid, float *ly, + int in, float *w, float *oldw) { + + int by = blockIdx.y; + + int tx = threadIdx.x; + int ty = threadIdx.y; + + int index = (hid + 1) * HEIGHT * by + (hid + 1) * ty + tx + 1 + (hid + 1); + int index_y = HEIGHT * by + ty + 1; + int index_x = tx + 1; + // eta = 0.3; + // momentum = 0.3; + + w[index] += ((ETA * delta[index_x] * ly[index_y]) + (MOMENTUM * oldw[index])); + oldw[index] = + ((ETA * delta[index_x] * ly[index_y]) + (MOMENTUM * oldw[index])); + + __syncthreads(); + + if (ty == 0 && by == 0) { + w[index_x] += ((ETA * delta[index_x]) + (MOMENTUM * oldw[index_x])); + oldw[index_x] = ((ETA * delta[index_x]) + (MOMENTUM * oldw[index_x])); + } +} +#endif diff --git a/examples/backprop/facetrain.c b/examples/backprop/facetrain.c new file mode 100644 index 0000000..4f9aaab --- /dev/null +++ b/examples/backprop/facetrain.c @@ -0,0 +1,48 @@ +#include "backprop.h" +#include +#include +#include + +extern char *strcpy(); +extern void exit(); + +int layer_size = 0; + +backprop_face() { + BPNN *net; + int i; + float out_err, hid_err; + net = bpnn_create(layer_size, 16, 1); // (16, 1 can not be changed) + + printf("Input layer size : %d\n", layer_size); + load(net); + // entering the training kernel, only one iteration + printf("Starting training kernel\n"); + bpnn_train_cuda(net, &out_err, &hid_err); + bpnn_free(net); + printf("Training done\n"); +} + +int setup(argc, argv) +int argc; +char *argv[]; +{ + + int seed; + + if (argc != 2) { + fprintf(stderr, 
"usage: backprop \n"); + exit(0); + } + layer_size = atoi(argv[1]); + if (layer_size % 16 != 0) { + fprintf(stderr, "The number of input points must be divided by 16\n"); + exit(0); + } + + seed = 7; + bpnn_initialize(seed); + backprop_face(); + + exit(0); +} diff --git a/examples/backprop/imagenet.c b/examples/backprop/imagenet.c new file mode 100644 index 0000000..807df38 --- /dev/null +++ b/examples/backprop/imagenet.c @@ -0,0 +1,22 @@ +#include "backprop.h" +#include +#include + +extern layer_size; + +load(net) BPNN *net; +{ + float *units; + int nr, nc, imgsize, i, j, k; + + nr = layer_size; + + imgsize = nr * nc; + units = net->input_units; + + k = 1; + for (i = 0; i < nr; i++) { + units[k] = (float)rand() / RAND_MAX; + k++; + } +} diff --git a/examples/backprop/run.sh b/examples/backprop/run.sh new file mode 100644 index 0000000..18083f5 --- /dev/null +++ b/examples/backprop/run.sh @@ -0,0 +1,28 @@ +#!/bin/bash +set -e +clang -c -emit-llvm backprop.c +clang -c -emit-llvm facetrain.c +clang -c -emit-llvm imagenet.c + +llvm-as backprop_cuda-cuda-nvptx64-nvidia-cuda-sm_61.ll +llvm-as backprop_cuda-host-x86_64-unknown-linux-gnu.ll +../../build/compilation/kernelTranslator backprop_cuda-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc +../../build/compilation/hostTranslator backprop_cuda-host-x86_64-unknown-linux-gnu.bc host.bc + +llc --relocation-model=pic --filetype=obj kernel.bc +llc --relocation-model=pic --filetype=obj host.bc +llc --relocation-model=pic --filetype=obj backprop.bc +llc --relocation-model=pic --filetype=obj facetrain.bc +llc --relocation-model=pic --filetype=obj imagenet.bc +export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH +g++ -Wall -L../../build/runtime -L../../build/runtime/threadPool -o demo \ + -fPIC -no-pie host.o kernel.o backprop.o facetrain.o imagenet.o \ + -lc -lx86Runtime -lthreadPool -lpthread + +./demo 1024 > res.log +if grep -q -e "0.173289 0.259645 0.350836" res.log; then + echo "Pass" +else 
+ echo "Error result" + exit 1 +fi diff --git a/examples/bfs/bfs-cuda-nvptx64-nvidia-cuda-sm_61.ll b/examples/bfs/bfs-cuda-nvptx64-nvidia-cuda-sm_61.ll new file mode 100644 index 0000000..5592d33 --- /dev/null +++ b/examples/bfs/bfs-cuda-nvptx64-nvidia-cuda-sm_61.ll @@ -0,0 +1,307 @@ +; ModuleID = 'bfs-cuda-nvptx64-nvidia-cuda-sm_61.bc' +source_filename = "bfs.cu" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +%struct.__cuda_builtin_blockIdx_t = type { i8 } +%struct.__cuda_builtin_threadIdx_t = type { i8 } +%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 } +%struct.Node = type { i32, i32 } + +$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any + +$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any + +@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1 +@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1 + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 { +entry: + %p.addr = alloca i8**, align 8 + %s.addr = alloca i64, align 8 + store i8** %p, i8*** %p.addr, align 8 + store i64 %s, i64* %s.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 { +entry: + %p.addr = alloca %struct.cudaFuncAttributes*, align 8 + %c.addr = alloca i8*, align 8 + store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8 + store i8* %c, i8** %c.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 { +entry: + %value.addr = alloca i32*, align 8 + %attr.addr = alloca i32, align 4 + %device.addr = alloca i32, align 4 + store i32* %value, 
i32** %value.addr, align 8 + store i32 %attr, i32* %attr.addr, align 4 + store i32 %device, i32* %device.addr, align 4 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaGetDevice(i32* %device) #0 { +entry: + %device.addr = alloca i32*, align 8 + store i32* %device, i32** %device.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 { +entry: + %numBlocks.addr = alloca i32*, align 8 + %func.addr = alloca i8*, align 8 + %blockSize.addr = alloca i32, align 4 + %dynamicSmemSize.addr = alloca i64, align 8 + store i32* %numBlocks, i32** %numBlocks.addr, align 8 + store i8* %func, i8** %func.addr, align 8 + store i32 %blockSize, i32* %blockSize.addr, align 4 + store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 { +entry: + %numBlocks.addr = alloca i32*, align 8 + %func.addr = alloca i8*, align 8 + %blockSize.addr = alloca i32, align 4 + %dynamicSmemSize.addr = alloca i64, align 8 + %flags.addr = alloca i32, align 4 + store i32* %numBlocks, i32** %numBlocks.addr, align 8 + store i8* %func, i8** %func.addr, align 8 + store i32 %blockSize, i32* %blockSize.addr, align 4 + store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8 + store i32 %flags, i32* %flags.addr, align 4 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define dso_local void @_Z6KernelP4NodePiPbS2_S2_S1_i(%struct.Node* %g_graph_nodes, i32* %g_graph_edges, i8* %g_graph_mask, i8* %g_updating_graph_mask, i8* %g_graph_visited, i32* %g_cost, i32 %no_of_nodes) #0 { +entry: + %g_graph_nodes.addr = alloca 
%struct.Node*, align 8 + %g_graph_edges.addr = alloca i32*, align 8 + %g_graph_mask.addr = alloca i8*, align 8 + %g_updating_graph_mask.addr = alloca i8*, align 8 + %g_graph_visited.addr = alloca i8*, align 8 + %g_cost.addr = alloca i32*, align 8 + %no_of_nodes.addr = alloca i32, align 4 + %tid = alloca i32, align 4 + %i = alloca i32, align 4 + %id = alloca i32, align 4 + store %struct.Node* %g_graph_nodes, %struct.Node** %g_graph_nodes.addr, align 8 + store i32* %g_graph_edges, i32** %g_graph_edges.addr, align 8 + store i8* %g_graph_mask, i8** %g_graph_mask.addr, align 8 + store i8* %g_updating_graph_mask, i8** %g_updating_graph_mask.addr, align 8 + store i8* %g_graph_visited, i8** %g_graph_visited.addr, align 8 + store i32* %g_cost, i32** %g_cost.addr, align 8 + store i32 %no_of_nodes, i32* %no_of_nodes.addr, align 4 + %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3 + %mul = mul i32 %call, 512 + %call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3 + %add = add i32 %mul, %call1 + store i32 %add, i32* %tid, align 4 + %0 = load i32, i32* %tid, align 4 + %1 = load i32, i32* %no_of_nodes.addr, align 4 + %cmp = icmp slt i32 %0, %1 + br i1 %cmp, label %land.lhs.true, label %if.end26 + +land.lhs.true: ; preds = %entry + %2 = load i8*, i8** %g_graph_mask.addr, align 8 + %3 = load i32, i32* %tid, align 4 + %idxprom = sext i32 %3 to i64 + %arrayidx = getelementptr inbounds i8, i8* %2, i64 %idxprom + %4 = load i8, i8* %arrayidx, align 1 + %tobool = trunc i8 %4 to i1 + br i1 %tobool, label %if.then, label %if.end26 + +if.then: ; preds = %land.lhs.true + %5 = load i8*, i8** %g_graph_mask.addr, align 8 + %6 = load i32, i32* %tid, align 4 + %idxprom2 = sext i32 %6 to i64 + %arrayidx3 = getelementptr inbounds i8, i8* %5, i64 %idxprom2 + store i8 0, i8* %arrayidx3, align 1 + %7 = load %struct.Node*, %struct.Node** %g_graph_nodes.addr, align 8 + %8 = load i32, i32* %tid, align 4 + %idxprom4 = sext i32 %8 to i64 + %arrayidx5 = 
getelementptr inbounds %struct.Node, %struct.Node* %7, i64 %idxprom4 + %starting = getelementptr inbounds %struct.Node, %struct.Node* %arrayidx5, i32 0, i32 0 + %9 = load i32, i32* %starting, align 4 + store i32 %9, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %if.then + %10 = load i32, i32* %i, align 4 + %11 = load %struct.Node*, %struct.Node** %g_graph_nodes.addr, align 8 + %12 = load i32, i32* %tid, align 4 + %idxprom6 = sext i32 %12 to i64 + %arrayidx7 = getelementptr inbounds %struct.Node, %struct.Node* %11, i64 %idxprom6 + %no_of_edges = getelementptr inbounds %struct.Node, %struct.Node* %arrayidx7, i32 0, i32 1 + %13 = load i32, i32* %no_of_edges, align 4 + %14 = load %struct.Node*, %struct.Node** %g_graph_nodes.addr, align 8 + %15 = load i32, i32* %tid, align 4 + %idxprom8 = sext i32 %15 to i64 + %arrayidx9 = getelementptr inbounds %struct.Node, %struct.Node* %14, i64 %idxprom8 + %starting10 = getelementptr inbounds %struct.Node, %struct.Node* %arrayidx9, i32 0, i32 0 + %16 = load i32, i32* %starting10, align 4 + %add11 = add nsw i32 %13, %16 + %cmp12 = icmp slt i32 %10, %add11 + br i1 %cmp12, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %17 = load i32*, i32** %g_graph_edges.addr, align 8 + %18 = load i32, i32* %i, align 4 + %idxprom13 = sext i32 %18 to i64 + %arrayidx14 = getelementptr inbounds i32, i32* %17, i64 %idxprom13 + %19 = load i32, i32* %arrayidx14, align 4 + store i32 %19, i32* %id, align 4 + %20 = load i8*, i8** %g_graph_visited.addr, align 8 + %21 = load i32, i32* %id, align 4 + %idxprom15 = sext i32 %21 to i64 + %arrayidx16 = getelementptr inbounds i8, i8* %20, i64 %idxprom15 + %22 = load i8, i8* %arrayidx16, align 1 + %tobool17 = trunc i8 %22 to i1 + br i1 %tobool17, label %if.end, label %if.then18 + +if.then18: ; preds = %for.body + %23 = load i32*, i32** %g_cost.addr, align 8 + %24 = load i32, i32* %tid, align 4 + %idxprom19 = sext i32 %24 to i64 + %arrayidx20 = getelementptr inbounds i32, 
i32* %23, i64 %idxprom19 + %25 = load i32, i32* %arrayidx20, align 4 + %add21 = add nsw i32 %25, 1 + %26 = load i32*, i32** %g_cost.addr, align 8 + %27 = load i32, i32* %id, align 4 + %idxprom22 = sext i32 %27 to i64 + %arrayidx23 = getelementptr inbounds i32, i32* %26, i64 %idxprom22 + store i32 %add21, i32* %arrayidx23, align 4 + %28 = load i8*, i8** %g_updating_graph_mask.addr, align 8 + %29 = load i32, i32* %id, align 4 + %idxprom24 = sext i32 %29 to i64 + %arrayidx25 = getelementptr inbounds i8, i8* %28, i64 %idxprom24 + store i8 1, i8* %arrayidx25, align 1 + br label %if.end + +if.end: ; preds = %if.then18, %for.body + br label %for.inc + +for.inc: ; preds = %if.end + %30 = load i32, i32* %i, align 4 + %inc = add nsw i32 %30, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + br label %if.end26 + +if.end26: ; preds = %for.end, %land.lhs.true, %entry + ret void +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() + ret i32 %0 +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + ret i32 %0 +} + +; Function Attrs: convergent noinline nounwind optnone +define dso_local void @_Z7Kernel2PbS_S_S_i(i8* %g_graph_mask, i8* %g_updating_graph_mask, i8* %g_graph_visited, i8* %g_over, i32 %no_of_nodes) #0 { +entry: + %g_graph_mask.addr = alloca i8*, align 8 + %g_updating_graph_mask.addr = alloca i8*, align 8 + %g_graph_visited.addr = alloca i8*, align 8 + %g_over.addr = alloca i8*, align 8 + %no_of_nodes.addr = alloca i32, align 4 + %tid = alloca i32, align 4 + store i8* %g_graph_mask, i8** %g_graph_mask.addr, align 8 + store i8* %g_updating_graph_mask, i8** %g_updating_graph_mask.addr, align 
8 + store i8* %g_graph_visited, i8** %g_graph_visited.addr, align 8 + store i8* %g_over, i8** %g_over.addr, align 8 + store i32 %no_of_nodes, i32* %no_of_nodes.addr, align 4 + %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3 + %mul = mul i32 %call, 512 + %call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3 + %add = add i32 %mul, %call1 + store i32 %add, i32* %tid, align 4 + %0 = load i32, i32* %tid, align 4 + %1 = load i32, i32* %no_of_nodes.addr, align 4 + %cmp = icmp slt i32 %0, %1 + br i1 %cmp, label %land.lhs.true, label %if.end + +land.lhs.true: ; preds = %entry + %2 = load i8*, i8** %g_updating_graph_mask.addr, align 8 + %3 = load i32, i32* %tid, align 4 + %idxprom = sext i32 %3 to i64 + %arrayidx = getelementptr inbounds i8, i8* %2, i64 %idxprom + %4 = load i8, i8* %arrayidx, align 1 + %tobool = trunc i8 %4 to i1 + br i1 %tobool, label %if.then, label %if.end + +if.then: ; preds = %land.lhs.true + %5 = load i8*, i8** %g_graph_mask.addr, align 8 + %6 = load i32, i32* %tid, align 4 + %idxprom2 = sext i32 %6 to i64 + %arrayidx3 = getelementptr inbounds i8, i8* %5, i64 %idxprom2 + store i8 1, i8* %arrayidx3, align 1 + %7 = load i8*, i8** %g_graph_visited.addr, align 8 + %8 = load i32, i32* %tid, align 4 + %idxprom4 = sext i32 %8 to i64 + %arrayidx5 = getelementptr inbounds i8, i8* %7, i64 %idxprom4 + store i8 1, i8* %arrayidx5, align 1 + %9 = load i8*, i8** %g_over.addr, align 8 + store i8 1, i8* %9, align 1 + %10 = load i8*, i8** %g_updating_graph_mask.addr, align 8 + %11 = load i32, i32* %tid, align 4 + %idxprom6 = sext i32 %11 to i64 + %arrayidx7 = getelementptr inbounds i8, i8* %10, i64 %idxprom6 + store i8 0, i8* %arrayidx7, align 1 + br label %if.end + +if.end: ; preds = %if.then, %land.lhs.true, %entry + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 + +attributes 
#0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind readnone } +attributes #3 = { convergent nounwind } + +!llvm.module.flags = !{!0, !1, !2} +!nvvm.annotations = !{!3, !4, !5, !6, !5, !7, !7, !7, !7, !8, !8, !7} +!llvm.ident = !{!9} +!nvvmir.version = !{!10} + +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0} +!3 = !{void (%struct.Node*, i32*, i8*, i8*, i8*, i32*, i32)* @_Z6KernelP4NodePiPbS2_S2_S1_i, !"kernel", i32 1} +!4 = !{void (i8*, i8*, i8*, i8*, i32)* @_Z7Kernel2PbS_S_S_i, !"kernel", i32 1} +!5 = !{null, !"align", i32 8} +!6 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!7 = !{null, !"align", i32 16} +!8 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!9 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"} +!10 = !{i32 1, i32 4} diff --git a/examples/bfs/bfs-host-x86_64-unknown-linux-gnu.ll 
b/examples/bfs/bfs-host-x86_64-unknown-linux-gnu.ll new file mode 100644 index 0000000..bbb02fc --- /dev/null +++ b/examples/bfs/bfs-host-x86_64-unknown-linux-gnu.ll @@ -0,0 +1,825 @@ +; ModuleID = 'bfs-host-x86_64-unknown-linux-gnu.bc' +source_filename = "bfs.cu" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] } +%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 } +%struct.Node = type { i32, i32 } +%struct.dim3 = type { i32, i32, i32 } +%struct.CUstream_st = type opaque + +$_ZN4dim3C2Ejjj = comdat any + +@no_of_nodes = dso_local global i32 0, align 4 +@edge_list_size = dso_local global i32 0, align 4 +@fp = dso_local global %struct._IO_FILE* null, align 8 +@stderr = external dso_local global %struct._IO_FILE*, align 8 +@.str = private unnamed_addr constant [24 x i8] c"Usage: %s \0A\00", align 1 +@.str.1 = private unnamed_addr constant [14 x i8] c"Reading File\0A\00", align 1 +@.str.2 = private unnamed_addr constant [2 x i8] c"r\00", align 1 +@.str.3 = private unnamed_addr constant [26 x i8] c"Error Reading graph file\0A\00", align 1 +@.str.4 = private unnamed_addr constant [3 x i8] c"%d\00", align 1 +@.str.5 = private unnamed_addr constant [6 x i8] c"%d %d\00", align 1 +@.str.6 = private unnamed_addr constant [11 x i8] c"Read File\0A\00", align 1 +@.str.7 = private unnamed_addr constant [33 x i8] c"Copied Everything to GPU memory\0A\00", align 1 +@.str.8 = private unnamed_addr constant [27 x i8] c"Start traversing the tree\0A\00", align 1 +@.str.9 = private unnamed_addr constant [26 x i8] c"Kernel Executed %d times\0A\00", align 1 +@.str.10 = private unnamed_addr constant [11 x i8] c"result.txt\00", align 1 +@.str.11 = private unnamed_addr 
constant [2 x i8] c"w\00", align 1 +@.str.12 = private unnamed_addr constant [13 x i8] c"%d) cost:%d\0A\00", align 1 +@.str.13 = private unnamed_addr constant [29 x i8] c"Result stored in result.txt\0A\00", align 1 +@0 = private unnamed_addr constant [30 x i8] c"_Z6KernelP4NodePiPbS2_S2_S1_i\00", align 1 +@1 = private unnamed_addr constant [20 x i8] c"_Z7Kernel2PbS_S_S_i\00", align 1 +@2 = private constant [15329 x i8] c"P\EDU\BA\01\00\10\00\D0;\00\00\00\00\00\00\02\00\01\01@\00\00\00H2\00\00\00\00\00\00\00\00\00\00\00\00\00\00\07\00\01\00=\00\00\00\00\00\00\00\00\00\00\00\11\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\7FELF\02\01\013\07\00\00\00\00\00\00\00\02\00\BE\00e\00\00\00\00\00\00\00\00\00\00\00\A01\00\00\00\00\00\00\A0.\00\00\00\00\00\00=\05=\00@\008\00\03\00@\00\0C\00\01\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00.text._Z7Kernel2PbS_S_S_i\00.nv.info._Z7Kernel2PbS_S_S_i\00.nv.shared._Z7Kernel2PbS_S_S_i\00.nv.global\00.nv.constant0._Z7Kernel2PbS_S_S_i\00.text._Z6KernelP4NodePiPbS2_S2_S1_i\00.nv.info._Z6KernelP4NodePiPbS2_S2_S1_i\00.nv.shared._Z6KernelP4NodePiPbS2_S2_S1_i\00.nv.constant0._Z6KernelP4NodePiPbS2_S2_S1_i\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00_Z7Kernel2PbS_S_S_i\00.text._Z7Kernel2PbS_S_S_i\00.nv.info._Z7Kernel2PbS_S_S_i\00.nv.shared._Z7Kernel2PbS_S_S_i\00.nv.global\00blockIdx\00threadIdx\00.nv.constant0._Z7Kernel2PbS_S_S_i\00_param\00_Z6KernelP4NodePiPbS2_S2_S1_i\00.text._Z6KernelP4NodePiPbS2_S2_S1_i\00.nv.info._Z6KernelP4NodePiPbS2_S2_S1_i\00.nv.shared._Z6KernelP4NodePiPbS2_S2_S1_i\00.nv.constant0._Z6KernelP4NodePiPbS2_S2_S1_i\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00F\00\00\00\03\00\09\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\9C\00\00\00\03\00\0B\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\A7\00\00\00\01\00\0B\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\B0\00\00\00\01\00\0B\00\01\00\00\00\00\00\00\00\01\00\00\00\
00\00\00\00\BA\00\00\00\03\00\07\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\01\00\00\03\00\0A\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00u\01\00\00\03\00\08\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\002\00\00\00\12\10\09\00\00\00\00\00\00\00\00\00\80\0D\00\00\00\00\00\00\E3\00\00\00\12\10\0A\00\00\00\00\00\00\00\00\00\80\18\00\00\00\00\00\00\04/\08\00\09\00\00\00\13\00\00\00\04#\08\00\09\00\00\00\00\00\00\00\04\12\08\00\09\00\00\00@\00\00\00\04\11\08\00\09\00\00\00@\00\00\00\04/\08\00\08\00\00\00\0F\00\00\00\04#\08\00\08\00\00\00\00\00\00\00\04\12\08\00\08\00\00\00(\00\00\00\04\11\08\00\08\00\00\00(\00\00\00\010\00\00\01*\00\00\04\0A\08\00\05\00\00\00@\01$\00\03\19$\00\04\17\0C\00\00\00\00\00\04\00 \00\00\F0\11\00\04\17\0C\00\00\00\00\00\03\00\18\00\00\F0!\00\04\17\0C\00\00\00\00\00\02\00\10\00\00\F0!\00\04\17\0C\00\00\00\00\00\01\00\08\00\00\F0!\00\04\17\0C\00\00\00\00\00\00\00\00\00\00\F0!\00\03\1B\FF\00\04\1D\04\00\C8\04\00\00\04\1C\04\00H\0D\00\00\04\1E\04\000\00\00\00\010\00\00\01*\00\00\04\0A\08\00\07\00\00\00@\014\00\03\194\00\04\17\0C\00\00\00\00\00\06\000\00\00\F0\11\00\04\17\0C\00\00\00\00\00\05\00(\00\00\F0!\00\04\17\0C\00\00\00\00\00\04\00 
\00\00\F0!\00\04\17\0C\00\00\00\00\00\03\00\18\00\00\F0!\00\04\17\0C\00\00\00\00\00\02\00\10\00\00\F0!\00\04\17\0C\00\00\00\00\00\01\00\08\00\00\F0!\00\04\17\0C\00\00\00\00\00\00\00\00\00\00\F0!\00\03\1B\FF\00\04\1D\04\00\A8\06\00\00\04\1C\04\008\18\00\00\04\1E\04\00p\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\
00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\EF\1F\E0\FD\03!\00\D3rd<3>;\0A\0Amov.u2\00\1B,e\00b;\0Acvta\8D\00\04%\00\13,\\\00\22ld\C8\00\02\18\00nrd2, [\CE\00\1E])\00\1F1)\00\01a0];\0Ast#\00\81[%SP+0],,\00\0A\16\00\128\16\00\222;\B6\00\01\D8\00a1, 999(\00\02g\00\00\EF\00\18[\9E\01\03M\00\AF1;\0Aret;\0A\0A}\D6\01\1A\FE\02FuncGetAttributes\E1\01\0D#\00\0E\EC\01\0F+\00\06\0F\F7\01\1B\1F1\F7\01Q\1F1\F7\01!\0E\D9\00\0F\02\02\0F\0E8\01\0F\0D\02\8DhDevice\B4\00\0E\0E\02\0E$\00\0F\0F\02\00/32,\00\0B\1F1,\00\18\1F2<\02\13\1F2<\02\1F\1D4<\02\1F2<\02\0C\1F2<\02\13\01_\00\04;\02\0F\D9\00\07\1D]4\00\1F14\00\06\0Fp\02\10\0E\9A\01\0Fq\02\12(32q\02\0B\15\00!12\16\00\09\86\02\1F3\86\02\15\1F3\86\02#2Get\CB\00\0E}\02\05\1B\00\04\DA\00\0F\1C\02\13?3[8W\04.\0F\1B\02\0D\1F3W\04\19\04\B3\01\0D\D0\00\0F\AA\01\06\0F\05\04W\F0\04OccupancyMaxActiveBV\08\FE\03sPerMultiprocessor\9F\01\0F;\00\16\0EB\06\0FC\00%\0EJ\04\0FC\00\1E\1F2\86\00/\1F3\88\02\13O4[32\89\02\1C\1D3\89\02\1F4\89\02\0C\1F4\89\02\19\133\89\02\0F\F1\00\1E\0F\BC\04\00\0FK\00$/2]w\07\00\0FL\00$\0F\1F\05\01\0F\98\00%\0F\A7\07\1D\097\05\186M\05\04,\00\2224-\00\183\CF\03\1F2\CF\03\15\1F2\CF\03L\9FWithFlags\D8\03(\05D\00\0E\E1\03\0FL\00'\0F\EA\030\0CL\00\1F2\98\008\1F3\98\008\1F4H\04\13O5[40\EC\08.\0FH\04\0D\1F5\EC\08\1C\0F\F9\00+\1F]\9C\040\0D\9A\01\0F\A5\040\0D:\02\0F\AE\041\0D\DB\02\0F\B7\041\0D|\03\0F\C0\04I\08-\00\1F3$\0A6\F0\16visible .entry 
_Z6KernelP4NodePiPbS2_\03\0061_i\A6\04\00\A0\00\0F+\00\0A\0E\8D\04\0F3\00\15\1F13\00\1F\1F23\00\1F\1F33\00\1F\1F43\00\1F\1753\00/323\00\12\1F6\8F\04\13O6[64\8F\04\15\96pred %p<5\90\04\9B16 %rs<7>\B3\04-18\B4\04/50\B5\04\0C\1F6\B5\04\18\00b\03\0F\06\01\12\0F\9E\03\00\1F6<\00\14\1F5<\00\00\1F5<\00\14\0F\14\05\01\1F4<\00\14\1F3P\05\01\0F<\00\14\0F\E3\04\02\0F<\00\14\0F\CA\04\01\0Fh\01\15#0]\BD\01#to\BB\14\04B\00\117w\04\04\DC\01\0A\1C\00\118\1C\00\1F7;\00\05\119\1F\00\1F5;\00\02!10\1D\00\1F9<\00\05!11 \00\1F4=\00\03\122O\05\1F1>\00\06\143-\05\0F>\00\01\124>\00\1F3>\00\06\145\98\05\0F>\00\01\126>\00\1F5>\00\06\147\EC\05\0F>\00\01\023\01/17 \06\03\1F8!\06\02*16\17\00\03\22\06?d14$\06\03*12\18\00\03%\06:d10\18\00\134w\00\1A8T\06\154\8E\10\09*\0B\F4\00%ctaid.x;\0Ashl.bk\06\02F\0B\08,\00\00_\01\12t*\00Qadd.s\15\00$5,/\00\1A4n\00\125\9D\00\115\BC\02\02A\00%6,\1B\00\07\16\00%7,\9F\00\92;\0Asetp.ge]\002p1,6\00\F2\0E%r7;\0A@%p1 bra LBB6_9;\0Abra.uni\10\0021;\0A\08\00\11:Z\00\03\96\01%9,Z\01\01r\00\02\B4\008d20\8A\00\01\CD\00\03\93\03$1,8\00\01'\00\02\A7\00\108L\05\00r\03\01\22\00\002\00$ndc\05#2,\1D\00\131\BD\00\22eq\1B\003p2, \00\8F1;\0A@!%p2\BD\00\07\132\BD\00\182\BD\00/22\BD\00\04\1F3\BD\00\05$4,8\00\01'\00\03\BC\01\02\A8\0033, &\02\128\C8\00\02T\028s3;s\00$5,\B8\02\09r\00\09\B9\01\01&\02\030\00$7, \00\1A3\8B\00$8,P\00\01'\00\07\EF\01\138H\01+8]0\02\02\FD\02\1B8\1B\01\133\1B\01&3:C\00%9,3\00\09\BE\00\1F9\BE\00\02/30\BE\00\04431, \00\0A\BE\00432,P\00\01'\00\07\BE\00\2210\BF\00X32+4]\18\00\141\18\00\18]\1E\03\02\D1\04\02\14\05,11\DF\02\223,\CD\00\00(\00\01\E0\02\163#\02\0C\08\01\134\08\01\184#\02\143\0B\02\1A8\F2\00\184!\01\08\F2\00$5, \00\1A2\F2\00$6,P\00\01'\00\08\DA\00\133\DA\00\1B6\B1\01\136\12\13\09B\02537,\9C\04\09\93\00%8,6\00\0Az\00$9,8\00\01'\00\07r\03\134y\00\1A9r\03#5,\1D\00\0Dr\03#4, \00\111N\01\164N\01\1B6N\01\135N\01\185N\01\144)\02\1A4A\02/41A\02\04442, \00\0AO\01443,Q\00\01'\00\08O\01\03\D6\00*43)\02#5,\1D\00\191{\00\1847\01\08{\00$5, 
\00\0B{\00\196{\00\175\B4\01\00\1D\00\02\B3\01(5;\F7\00%7,g\06\09|\00\0F\B3\01\05449,8\00\01'\00\09h\04\126\98\01\05h\04\2249h\04\0C\82\01\136\82\01*6:\18\00\137\18\00\177\F0\03(16\D0\02\075\01\01\82\00\161s\00\0BL\04/17M\04\04\1B8u\00\139\18\00/9:9\0D\09\127n\09P2PbS_\02\00\0D/\0D\0D!\00\0E%\0D\0F)\00\0B\1F1)\00\15\1F2)\00\15\1C3\A1\0C\0E)\00\0F&\11\1A\1E7&\11\0F\97\0C\0E\1D3\97\0C\1C5\97\0C\0E\96\0C/26\96\0C\0C\1F7\96\0C\1E\0E\FB\00\0F\14\0C\0D\0EV\01\0F\0A\0C\0D\0E\B1\01\0F\00\0C\0D\0E\0C\02\0F\F6\0B\0D\0Eg\02\0F\EC\0B\0D\1F5t\0B\08\196\CD\0B\0F'\0C\04\1F3'\0C'\1F2'\0C)\1F1'\0C\0D\0Fm\0B\01\1F2m\0B\03\1F0m\0B\03\1F8l\0B\03\1E6\90\11\0F<\0BW/36<\0B\06/36<\0B\01/32<\0B\1797_3\17\05\137<\0B\1B7<\0B\0F\\\08\03(14\89\00\07\16\06\03]\0D%13\C3\0C\0C;\0B/15;\0B.\0E\BC\00\132\BC\00\09;\0B/16\0A\0A\02/17\BC\00\05(8,\03\0E\1C7:\0B\09\D2\06.18:\0B\0Fj\0C\0F/36j\0C\0D\07\9C\0B\1F1\9C\0B\01\182\A7\07\07-\00\1F2-\00\01\0F\BD\01\03/24\8E\00\05$5,7\00\01'\00\09\01\01\1C4;\0C\125r\00\1B4\8B\01\133\8B\01\B03:\0Aret;\0A\0A}\0A\00\00\00", section ".nv_fatbin", align 8 +@__cuda_fatbin_wrapper = internal constant { i32, i32, i8*, i8* } { i32 1180844977, i32 1, i8* getelementptr inbounds ([15329 x i8], [15329 x i8]* @2, i64 0, i64 0), i8* null }, section ".nvFatBinSegment", align 8 +@__cuda_gpubin_handle = internal global i8** null, align 8 +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* bitcast (void (i8*)* @__cuda_module_ctor to void ()*), i8* null }] + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z6KernelP4NodePiPbS2_S2_S1_i(%struct.Node* %g_graph_nodes, i32* %g_graph_edges, i8* %g_graph_mask, i8* %g_updating_graph_mask, i8* %g_graph_visited, i32* %g_cost, i32 %no_of_nodes) #0 { +entry: + %g_graph_nodes.addr = alloca %struct.Node*, align 8 + %g_graph_edges.addr = alloca i32*, align 8 + %g_graph_mask.addr = alloca i8*, align 8 + %g_updating_graph_mask.addr = alloca i8*, align 8 + 
%g_graph_visited.addr = alloca i8*, align 8 + %g_cost.addr = alloca i32*, align 8 + %no_of_nodes.addr = alloca i32, align 4 + %grid_dim = alloca %struct.dim3, align 8 + %block_dim = alloca %struct.dim3, align 8 + %shmem_size = alloca i64, align 8 + %stream = alloca i8*, align 8 + %grid_dim.coerce = alloca { i64, i32 }, align 8 + %block_dim.coerce = alloca { i64, i32 }, align 8 + store %struct.Node* %g_graph_nodes, %struct.Node** %g_graph_nodes.addr, align 8 + store i32* %g_graph_edges, i32** %g_graph_edges.addr, align 8 + store i8* %g_graph_mask, i8** %g_graph_mask.addr, align 8 + store i8* %g_updating_graph_mask, i8** %g_updating_graph_mask.addr, align 8 + store i8* %g_graph_visited, i8** %g_graph_visited.addr, align 8 + store i32* %g_cost, i32** %g_cost.addr, align 8 + store i32 %no_of_nodes, i32* %no_of_nodes.addr, align 4 + %kernel_args = alloca i8*, i64 7, align 16 + %0 = bitcast %struct.Node** %g_graph_nodes.addr to i8* + %1 = getelementptr i8*, i8** %kernel_args, i32 0 + store i8* %0, i8** %1 + %2 = bitcast i32** %g_graph_edges.addr to i8* + %3 = getelementptr i8*, i8** %kernel_args, i32 1 + store i8* %2, i8** %3 + %4 = bitcast i8** %g_graph_mask.addr to i8* + %5 = getelementptr i8*, i8** %kernel_args, i32 2 + store i8* %4, i8** %5 + %6 = bitcast i8** %g_updating_graph_mask.addr to i8* + %7 = getelementptr i8*, i8** %kernel_args, i32 3 + store i8* %6, i8** %7 + %8 = bitcast i8** %g_graph_visited.addr to i8* + %9 = getelementptr i8*, i8** %kernel_args, i32 4 + store i8* %8, i8** %9 + %10 = bitcast i32** %g_cost.addr to i8* + %11 = getelementptr i8*, i8** %kernel_args, i32 5 + store i8* %10, i8** %11 + %12 = bitcast i32* %no_of_nodes.addr to i8* + %13 = getelementptr i8*, i8** %kernel_args, i32 6 + store i8* %12, i8** %13 + %14 = call i32 @__cudaPopCallConfiguration(%struct.dim3* %grid_dim, %struct.dim3* %block_dim, i64* %shmem_size, i8** %stream) + %15 = load i64, i64* %shmem_size, align 8 + %16 = load i8*, i8** %stream, align 8 + %17 = bitcast { i64, i32 }* 
%grid_dim.coerce to i8* + %18 = bitcast %struct.dim3* %grid_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %17, i8* align 8 %18, i64 12, i1 false) + %19 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 0 + %20 = load i64, i64* %19, align 8 + %21 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 1 + %22 = load i32, i32* %21, align 8 + %23 = bitcast { i64, i32 }* %block_dim.coerce to i8* + %24 = bitcast %struct.dim3* %block_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %23, i8* align 8 %24, i64 12, i1 false) + %25 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 0 + %26 = load i64, i64* %25, align 8 + %27 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 1 + %28 = load i32, i32* %27, align 8 + %29 = bitcast i8* %16 to %struct.CUstream_st* + %call = call i32 @cudaLaunchKernel(i8* bitcast (void (%struct.Node*, i32*, i8*, i8*, i8*, i32*, i32)* @_Z6KernelP4NodePiPbS2_S2_S1_i to i8*), i64 %20, i32 %22, i64 %26, i32 %28, i8** %kernel_args, i64 %15, %struct.CUstream_st* %29) + br label %setup.end + +setup.end: ; preds = %entry + ret void +} + +declare dso_local i32 @__cudaPopCallConfiguration(%struct.dim3*, %struct.dim3*, i64*, i8**) + +declare dso_local i32 @cudaLaunchKernel(i8*, i64, i32, i64, i32, i8**, i64, %struct.CUstream_st*) + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #1 + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z7Kernel2PbS_S_S_i(i8* %g_graph_mask, i8* %g_updating_graph_mask, i8* %g_graph_visited, i8* %g_over, i32 %no_of_nodes) #0 { +entry: + %g_graph_mask.addr = alloca i8*, align 8 + %g_updating_graph_mask.addr = alloca i8*, align 8 + %g_graph_visited.addr = alloca i8*, align 8 + %g_over.addr = alloca i8*, align 8 + %no_of_nodes.addr = alloca i32, 
align 4 + %grid_dim = alloca %struct.dim3, align 8 + %block_dim = alloca %struct.dim3, align 8 + %shmem_size = alloca i64, align 8 + %stream = alloca i8*, align 8 + %grid_dim.coerce = alloca { i64, i32 }, align 8 + %block_dim.coerce = alloca { i64, i32 }, align 8 + store i8* %g_graph_mask, i8** %g_graph_mask.addr, align 8 + store i8* %g_updating_graph_mask, i8** %g_updating_graph_mask.addr, align 8 + store i8* %g_graph_visited, i8** %g_graph_visited.addr, align 8 + store i8* %g_over, i8** %g_over.addr, align 8 + store i32 %no_of_nodes, i32* %no_of_nodes.addr, align 4 + %kernel_args = alloca i8*, i64 5, align 16 + %0 = bitcast i8** %g_graph_mask.addr to i8* + %1 = getelementptr i8*, i8** %kernel_args, i32 0 + store i8* %0, i8** %1 + %2 = bitcast i8** %g_updating_graph_mask.addr to i8* + %3 = getelementptr i8*, i8** %kernel_args, i32 1 + store i8* %2, i8** %3 + %4 = bitcast i8** %g_graph_visited.addr to i8* + %5 = getelementptr i8*, i8** %kernel_args, i32 2 + store i8* %4, i8** %5 + %6 = bitcast i8** %g_over.addr to i8* + %7 = getelementptr i8*, i8** %kernel_args, i32 3 + store i8* %6, i8** %7 + %8 = bitcast i32* %no_of_nodes.addr to i8* + %9 = getelementptr i8*, i8** %kernel_args, i32 4 + store i8* %8, i8** %9 + %10 = call i32 @__cudaPopCallConfiguration(%struct.dim3* %grid_dim, %struct.dim3* %block_dim, i64* %shmem_size, i8** %stream) + %11 = load i64, i64* %shmem_size, align 8 + %12 = load i8*, i8** %stream, align 8 + %13 = bitcast { i64, i32 }* %grid_dim.coerce to i8* + %14 = bitcast %struct.dim3* %grid_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %13, i8* align 8 %14, i64 12, i1 false) + %15 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 0 + %16 = load i64, i64* %15, align 8 + %17 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 1 + %18 = load i32, i32* %17, align 8 + %19 = bitcast { i64, i32 }* %block_dim.coerce to i8* + %20 = bitcast %struct.dim3* %block_dim to i8* + call void 
@llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %19, i8* align 8 %20, i64 12, i1 false) + %21 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 0 + %22 = load i64, i64* %21, align 8 + %23 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 1 + %24 = load i32, i32* %23, align 8 + %25 = bitcast i8* %12 to %struct.CUstream_st* + %call = call i32 @cudaLaunchKernel(i8* bitcast (void (i8*, i8*, i8*, i8*, i32)* @_Z7Kernel2PbS_S_S_i to i8*), i64 %16, i32 %18, i64 %22, i32 %24, i8** %kernel_args, i64 %11, %struct.CUstream_st* %25) + br label %setup.end + +setup.end: ; preds = %entry + ret void +} + +; Function Attrs: noinline norecurse optnone uwtable +define dso_local i32 @main(i32 %argc, i8** %argv) #2 { +entry: + %argc.addr = alloca i32, align 4 + %argv.addr = alloca i8**, align 8 + store i32 %argc, i32* %argc.addr, align 4 + store i8** %argv, i8*** %argv.addr, align 8 + %call = call i32 @cudaSetDevice(i32 0) + store i32 0, i32* @no_of_nodes, align 4 + store i32 0, i32* @edge_list_size, align 4 + %0 = load i32, i32* %argc.addr, align 4 + %1 = load i8**, i8*** %argv.addr, align 8 + call void @_Z8BFSGraphiPPc(i32 %0, i8** %1) + ret i32 0 +} + +declare dso_local i32 @cudaSetDevice(i32) #3 + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z8BFSGraphiPPc(i32 %argc, i8** %argv) #0 { +entry: + %argc.addr = alloca i32, align 4 + %argv.addr = alloca i8**, align 8 + %input_f = alloca i8*, align 8 + %source = alloca i32, align 4 + %num_of_blocks = alloca i32, align 4 + %num_of_threads_per_block = alloca i32, align 4 + %h_graph_nodes = alloca %struct.Node*, align 8 + %h_graph_mask = alloca i8*, align 8 + %h_updating_graph_mask = alloca i8*, align 8 + %h_graph_visited = alloca i8*, align 8 + %start = alloca i32, align 4 + %edgeno = alloca i32, align 4 + %i = alloca i32, align 4 + %id = alloca i32, align 4 + %cost = alloca i32, align 4 + %h_graph_edges = alloca i32*, align 8 + %i41 = alloca i32, align 4 + 
%d_graph_nodes = alloca %struct.Node*, align 8 + %d_graph_edges = alloca i32*, align 8 + %d_graph_mask = alloca i8*, align 8 + %d_updating_graph_mask = alloca i8*, align 8 + %d_graph_visited = alloca i8*, align 8 + %h_cost = alloca i32*, align 8 + %i90 = alloca i32, align 4 + %d_cost = alloca i32*, align 8 + %d_over = alloca i8*, align 8 + %grid = alloca %struct.dim3, align 4 + %threads = alloca %struct.dim3, align 4 + %k = alloca i32, align 4 + %stop = alloca i8, align 1 + %agg.tmp = alloca %struct.dim3, align 4 + %agg.tmp111 = alloca %struct.dim3, align 4 + %agg.tmp.coerce = alloca { i64, i32 }, align 4 + %agg.tmp111.coerce = alloca { i64, i32 }, align 4 + %agg.tmp115 = alloca %struct.dim3, align 4 + %agg.tmp116 = alloca %struct.dim3, align 4 + %agg.tmp115.coerce = alloca { i64, i32 }, align 4 + %agg.tmp116.coerce = alloca { i64, i32 }, align 4 + %fpo = alloca %struct._IO_FILE*, align 8 + %i130 = alloca i32, align 4 + store i32 %argc, i32* %argc.addr, align 4 + store i8** %argv, i8*** %argv.addr, align 8 + %0 = load i32, i32* %argc.addr, align 4 + %cmp = icmp ne i32 %0, 2 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %1 = load i32, i32* %argc.addr, align 4 + %2 = load i8**, i8*** %argv.addr, align 8 + call void @_Z5UsageiPPc(i32 %1, i8** %2) + call void @exit(i32 0) #8 + unreachable + +if.end: ; preds = %entry + %3 = load i8**, i8*** %argv.addr, align 8 + %arrayidx = getelementptr inbounds i8*, i8** %3, i64 1 + %4 = load i8*, i8** %arrayidx, align 8 + store i8* %4, i8** %input_f, align 8 + %call = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([14 x i8], [14 x i8]* @.str.1, i64 0, i64 0)) + %5 = load i8*, i8** %input_f, align 8 + %call1 = call %struct._IO_FILE* @fopen(i8* %5, i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.2, i64 0, i64 0)) + store %struct._IO_FILE* %call1, %struct._IO_FILE** @fp, align 8 + %6 = load %struct._IO_FILE*, %struct._IO_FILE** @fp, align 8 + %tobool = icmp ne %struct._IO_FILE* %6, null + br i1 %tobool, label %if.end4, label %if.then2 + +if.then2: ; preds = %if.end + %call3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([26 x i8], [26 x i8]* @.str.3, i64 0, i64 0)) + br label %return + +if.end4: ; preds = %if.end + store i32 0, i32* %source, align 4 + %7 = load %struct._IO_FILE*, %struct._IO_FILE** @fp, align 8 + %call5 = call i32 (%struct._IO_FILE*, i8*, ...) @fscanf(%struct._IO_FILE* %7, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.4, i64 0, i64 0), i32* @no_of_nodes) + store i32 1, i32* %num_of_blocks, align 4 + %8 = load i32, i32* @no_of_nodes, align 4 + store i32 %8, i32* %num_of_threads_per_block, align 4 + %9 = load i32, i32* @no_of_nodes, align 4 + %cmp6 = icmp sgt i32 %9, 512 + br i1 %cmp6, label %if.then7, label %if.end9 + +if.then7: ; preds = %if.end4 + %10 = load i32, i32* @no_of_nodes, align 4 + %conv = sitofp i32 %10 to double + %div = fdiv double %conv, 5.120000e+02 + %11 = call double @llvm.ceil.f64(double %div) + %conv8 = fptosi double %11 to i32 + store i32 %conv8, i32* %num_of_blocks, align 4 + store i32 512, i32* %num_of_threads_per_block, align 4 + br label %if.end9 + +if.end9: ; preds = %if.then7, %if.end4 + %12 = load i32, i32* @no_of_nodes, align 4 + %conv10 = sext i32 %12 to i64 + %mul = mul i64 8, %conv10 + %call11 = call noalias i8* @malloc(i64 %mul) #9 + %13 = bitcast i8* %call11 to %struct.Node* + store %struct.Node* %13, %struct.Node** %h_graph_nodes, align 8 + %14 = load i32, i32* @no_of_nodes, align 4 + %conv12 = sext i32 %14 to i64 + %mul13 = mul i64 1, %conv12 + %call14 = call 
noalias i8* @malloc(i64 %mul13) #9 + store i8* %call14, i8** %h_graph_mask, align 8 + %15 = load i32, i32* @no_of_nodes, align 4 + %conv15 = sext i32 %15 to i64 + %mul16 = mul i64 1, %conv15 + %call17 = call noalias i8* @malloc(i64 %mul16) #9 + store i8* %call17, i8** %h_updating_graph_mask, align 8 + %16 = load i32, i32* @no_of_nodes, align 4 + %conv18 = sext i32 %16 to i64 + %mul19 = mul i64 1, %conv18 + %call20 = call noalias i8* @malloc(i64 %mul19) #9 + store i8* %call20, i8** %h_graph_visited, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %if.end9 + %17 = load i32, i32* %i, align 4 + %18 = load i32, i32* @no_of_nodes, align 4 + %cmp21 = icmp ult i32 %17, %18 + br i1 %cmp21, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %19 = load %struct._IO_FILE*, %struct._IO_FILE** @fp, align 8 + %call22 = call i32 (%struct._IO_FILE*, i8*, ...) @fscanf(%struct._IO_FILE* %19, i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str.5, i64 0, i64 0), i32* %start, i32* %edgeno) + %20 = load i32, i32* %start, align 4 + %21 = load %struct.Node*, %struct.Node** %h_graph_nodes, align 8 + %22 = load i32, i32* %i, align 4 + %idxprom = zext i32 %22 to i64 + %arrayidx23 = getelementptr inbounds %struct.Node, %struct.Node* %21, i64 %idxprom + %starting = getelementptr inbounds %struct.Node, %struct.Node* %arrayidx23, i32 0, i32 0 + store i32 %20, i32* %starting, align 4 + %23 = load i32, i32* %edgeno, align 4 + %24 = load %struct.Node*, %struct.Node** %h_graph_nodes, align 8 + %25 = load i32, i32* %i, align 4 + %idxprom24 = zext i32 %25 to i64 + %arrayidx25 = getelementptr inbounds %struct.Node, %struct.Node* %24, i64 %idxprom24 + %no_of_edges = getelementptr inbounds %struct.Node, %struct.Node* %arrayidx25, i32 0, i32 1 + store i32 %23, i32* %no_of_edges, align 4 + %26 = load i8*, i8** %h_graph_mask, align 8 + %27 = load i32, i32* %i, align 4 + %idxprom26 = zext i32 %27 to i64 + %arrayidx27 = getelementptr inbounds i8, 
i8* %26, i64 %idxprom26 + store i8 0, i8* %arrayidx27, align 1 + %28 = load i8*, i8** %h_updating_graph_mask, align 8 + %29 = load i32, i32* %i, align 4 + %idxprom28 = zext i32 %29 to i64 + %arrayidx29 = getelementptr inbounds i8, i8* %28, i64 %idxprom28 + store i8 0, i8* %arrayidx29, align 1 + %30 = load i8*, i8** %h_graph_visited, align 8 + %31 = load i32, i32* %i, align 4 + %idxprom30 = zext i32 %31 to i64 + %arrayidx31 = getelementptr inbounds i8, i8* %30, i64 %idxprom30 + store i8 0, i8* %arrayidx31, align 1 + br label %for.inc + +for.inc: ; preds = %for.body + %32 = load i32, i32* %i, align 4 + %inc = add i32 %32, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %33 = load %struct._IO_FILE*, %struct._IO_FILE** @fp, align 8 + %call32 = call i32 (%struct._IO_FILE*, i8*, ...) @fscanf(%struct._IO_FILE* %33, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.4, i64 0, i64 0), i32* %source) + store i32 0, i32* %source, align 4 + %34 = load i8*, i8** %h_graph_mask, align 8 + %35 = load i32, i32* %source, align 4 + %idxprom33 = sext i32 %35 to i64 + %arrayidx34 = getelementptr inbounds i8, i8* %34, i64 %idxprom33 + store i8 1, i8* %arrayidx34, align 1 + %36 = load i8*, i8** %h_graph_visited, align 8 + %37 = load i32, i32* %source, align 4 + %idxprom35 = sext i32 %37 to i64 + %arrayidx36 = getelementptr inbounds i8, i8* %36, i64 %idxprom35 + store i8 1, i8* %arrayidx36, align 1 + %38 = load %struct._IO_FILE*, %struct._IO_FILE** @fp, align 8 + %call37 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fscanf(%struct._IO_FILE* %38, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.4, i64 0, i64 0), i32* @edge_list_size) + %39 = load i32, i32* @edge_list_size, align 4 + %conv38 = sext i32 %39 to i64 + %mul39 = mul i64 4, %conv38 + %call40 = call noalias i8* @malloc(i64 %mul39) #9 + %40 = bitcast i8* %call40 to i32* + store i32* %40, i32** %h_graph_edges, align 8 + store i32 0, i32* %i41, align 4 + br label %for.cond42 + +for.cond42: ; preds = %for.inc49, %for.end + %41 = load i32, i32* %i41, align 4 + %42 = load i32, i32* @edge_list_size, align 4 + %cmp43 = icmp slt i32 %41, %42 + br i1 %cmp43, label %for.body44, label %for.end51 + +for.body44: ; preds = %for.cond42 + %43 = load %struct._IO_FILE*, %struct._IO_FILE** @fp, align 8 + %call45 = call i32 (%struct._IO_FILE*, i8*, ...) @fscanf(%struct._IO_FILE* %43, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.4, i64 0, i64 0), i32* %id) + %44 = load %struct._IO_FILE*, %struct._IO_FILE** @fp, align 8 + %call46 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fscanf(%struct._IO_FILE* %44, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.4, i64 0, i64 0), i32* %cost) + %45 = load i32, i32* %id, align 4 + %46 = load i32*, i32** %h_graph_edges, align 8 + %47 = load i32, i32* %i41, align 4 + %idxprom47 = sext i32 %47 to i64 + %arrayidx48 = getelementptr inbounds i32, i32* %46, i64 %idxprom47 + store i32 %45, i32* %arrayidx48, align 4 + br label %for.inc49 + +for.inc49: ; preds = %for.body44 + %48 = load i32, i32* %i41, align 4 + %inc50 = add nsw i32 %48, 1 + store i32 %inc50, i32* %i41, align 4 + br label %for.cond42 + +for.end51: ; preds = %for.cond42 + %49 = load %struct._IO_FILE*, %struct._IO_FILE** @fp, align 8 + %tobool52 = icmp ne %struct._IO_FILE* %49, null + br i1 %tobool52, label %if.then53, label %if.end55 + +if.then53: ; preds = %for.end51 + %50 = load %struct._IO_FILE*, %struct._IO_FILE** @fp, align 8 + %call54 = call i32 @fclose(%struct._IO_FILE* %50) + br label %if.end55 + +if.end55: ; preds = %if.then53, %for.end51 + %call56 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str.6, i64 0, i64 0)) + %51 = bitcast %struct.Node** %d_graph_nodes to i8** + %52 = load i32, i32* @no_of_nodes, align 4 + %conv57 = sext i32 %52 to i64 + %mul58 = mul i64 8, %conv57 + %call59 = call i32 @cudaMalloc(i8** %51, i64 %mul58) + %53 = load %struct.Node*, %struct.Node** %d_graph_nodes, align 8 + %54 = bitcast %struct.Node* %53 to i8* + %55 = load %struct.Node*, %struct.Node** %h_graph_nodes, align 8 + %56 = bitcast %struct.Node* %55 to i8* + %57 = load i32, i32* @no_of_nodes, align 4 + %conv60 = sext i32 %57 to i64 + %mul61 = mul i64 8, %conv60 + %call62 = call i32 @cudaMemcpy(i8* %54, i8* %56, i64 %mul61, i32 1) + %58 = bitcast i32** %d_graph_edges to i8** + %59 = load i32, i32* @edge_list_size, align 4 + %conv63 = sext i32 %59 to i64 + %mul64 = mul i64 4, %conv63 + %call65 = call i32 @cudaMalloc(i8** %58, i64 %mul64) + %60 = load i32*, i32** %d_graph_edges, align 8 + %61 = bitcast i32* %60 to i8* + %62 = load i32*, i32** %h_graph_edges, align 8 + %63 = bitcast i32* %62 to i8* + %64 = load i32, i32* @edge_list_size, align 4 + %conv66 = sext i32 %64 to i64 + %mul67 = mul i64 4, %conv66 + %call68 = call i32 @cudaMemcpy(i8* %61, i8* %63, i64 %mul67, i32 1) + %65 = load i32, i32* @no_of_nodes, align 4 + %conv69 = sext i32 %65 to i64 + %mul70 = mul i64 1, %conv69 + %call71 = call i32 @cudaMalloc(i8** %d_graph_mask, i64 %mul70) + %66 = load i8*, i8** %d_graph_mask, align 8 + %67 = load i8*, i8** %h_graph_mask, align 8 + %68 = load i32, i32* @no_of_nodes, align 4 + %conv72 = sext i32 %68 to i64 + %mul73 = mul i64 1, %conv72 + %call74 = call i32 @cudaMemcpy(i8* %66, i8* %67, i64 %mul73, i32 1) + %69 = load i32, i32* @no_of_nodes, align 4 + %conv75 = sext i32 %69 to i64 + %mul76 = mul i64 1, %conv75 + %call77 = call i32 @cudaMalloc(i8** %d_updating_graph_mask, i64 %mul76) + %70 = load i8*, i8** %d_updating_graph_mask, align 8 + %71 = load i8*, i8** %h_updating_graph_mask, align 8 + %72 = load i32, i32* 
@no_of_nodes, align 4 + %conv78 = sext i32 %72 to i64 + %mul79 = mul i64 1, %conv78 + %call80 = call i32 @cudaMemcpy(i8* %70, i8* %71, i64 %mul79, i32 1) + %73 = load i32, i32* @no_of_nodes, align 4 + %conv81 = sext i32 %73 to i64 + %mul82 = mul i64 1, %conv81 + %call83 = call i32 @cudaMalloc(i8** %d_graph_visited, i64 %mul82) + %74 = load i8*, i8** %d_graph_visited, align 8 + %75 = load i8*, i8** %h_graph_visited, align 8 + %76 = load i32, i32* @no_of_nodes, align 4 + %conv84 = sext i32 %76 to i64 + %mul85 = mul i64 1, %conv84 + %call86 = call i32 @cudaMemcpy(i8* %74, i8* %75, i64 %mul85, i32 1) + %77 = load i32, i32* @no_of_nodes, align 4 + %conv87 = sext i32 %77 to i64 + %mul88 = mul i64 4, %conv87 + %call89 = call noalias i8* @malloc(i64 %mul88) #9 + %78 = bitcast i8* %call89 to i32* + store i32* %78, i32** %h_cost, align 8 + store i32 0, i32* %i90, align 4 + br label %for.cond91 + +for.cond91: ; preds = %for.inc96, %if.end55 + %79 = load i32, i32* %i90, align 4 + %80 = load i32, i32* @no_of_nodes, align 4 + %cmp92 = icmp slt i32 %79, %80 + br i1 %cmp92, label %for.body93, label %for.end98 + +for.body93: ; preds = %for.cond91 + %81 = load i32*, i32** %h_cost, align 8 + %82 = load i32, i32* %i90, align 4 + %idxprom94 = sext i32 %82 to i64 + %arrayidx95 = getelementptr inbounds i32, i32* %81, i64 %idxprom94 + store i32 -1, i32* %arrayidx95, align 4 + br label %for.inc96 + +for.inc96: ; preds = %for.body93 + %83 = load i32, i32* %i90, align 4 + %inc97 = add nsw i32 %83, 1 + store i32 %inc97, i32* %i90, align 4 + br label %for.cond91 + +for.end98: ; preds = %for.cond91 + %84 = load i32*, i32** %h_cost, align 8 + %85 = load i32, i32* %source, align 4 + %idxprom99 = sext i32 %85 to i64 + %arrayidx100 = getelementptr inbounds i32, i32* %84, i64 %idxprom99 + store i32 0, i32* %arrayidx100, align 4 + %86 = bitcast i32** %d_cost to i8** + %87 = load i32, i32* @no_of_nodes, align 4 + %conv101 = sext i32 %87 to i64 + %mul102 = mul i64 4, %conv101 + %call103 = call i32 
@cudaMalloc(i8** %86, i64 %mul102) + %88 = load i32*, i32** %d_cost, align 8 + %89 = bitcast i32* %88 to i8* + %90 = load i32*, i32** %h_cost, align 8 + %91 = bitcast i32* %90 to i8* + %92 = load i32, i32* @no_of_nodes, align 4 + %conv104 = sext i32 %92 to i64 + %mul105 = mul i64 4, %conv104 + %call106 = call i32 @cudaMemcpy(i8* %89, i8* %91, i64 %mul105, i32 1) + %call107 = call i32 @cudaMalloc(i8** %d_over, i64 1) + %call108 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([33 x i8], [33 x i8]* @.str.7, i64 0, i64 0)) + %93 = load i32, i32* %num_of_blocks, align 4 + call void @_ZN4dim3C2Ejjj(%struct.dim3* %grid, i32 %93, i32 1, i32 1) + %94 = load i32, i32* %num_of_threads_per_block, align 4 + call void @_ZN4dim3C2Ejjj(%struct.dim3* %threads, i32 %94, i32 1, i32 1) + store i32 0, i32* %k, align 4 + %call109 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([27 x i8], [27 x i8]* @.str.8, i64 0, i64 0)) + br label %do.body + +do.body: ; preds = %do.cond, %for.end98 + store i8 0, i8* %stop, align 1 + %95 = load i8*, i8** %d_over, align 8 + %call110 = call i32 @cudaMemcpy(i8* %95, i8* %stop, i64 1, i32 1) + %96 = bitcast %struct.dim3* %agg.tmp to i8* + %97 = bitcast %struct.dim3* %grid to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %96, i8* align 4 %97, i64 12, i1 false) + %98 = bitcast %struct.dim3* %agg.tmp111 to i8* + %99 = bitcast %struct.dim3* %threads to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %98, i8* align 4 %99, i64 12, i1 false) + %100 = bitcast { i64, i32 }* %agg.tmp.coerce to i8* + %101 = bitcast %struct.dim3* %agg.tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %100, i8* align 4 %101, i64 12, i1 false) + %102 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 0 + %103 = load i64, i64* %102, align 4 + %104 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 1 + %105 = load i32, i32* %104, align 4 + %106 = bitcast { i64, i32 }* 
%agg.tmp111.coerce to i8* + %107 = bitcast %struct.dim3* %agg.tmp111 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %106, i8* align 4 %107, i64 12, i1 false) + %108 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp111.coerce, i32 0, i32 0 + %109 = load i64, i64* %108, align 4 + %110 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp111.coerce, i32 0, i32 1 + %111 = load i32, i32* %110, align 4 + %call112 = call i32 @__cudaPushCallConfiguration(i64 %103, i32 %105, i64 %109, i32 %111, i64 0, i8* null) + %tobool113 = icmp ne i32 %call112, 0 + br i1 %tobool113, label %kcall.end, label %kcall.configok + +kcall.configok: ; preds = %do.body + %112 = load %struct.Node*, %struct.Node** %d_graph_nodes, align 8 + %113 = load i32*, i32** %d_graph_edges, align 8 + %114 = load i8*, i8** %d_graph_mask, align 8 + %115 = load i8*, i8** %d_updating_graph_mask, align 8 + %116 = load i8*, i8** %d_graph_visited, align 8 + %117 = load i32*, i32** %d_cost, align 8 + %118 = load i32, i32* @no_of_nodes, align 4 + call void @_Z6KernelP4NodePiPbS2_S2_S1_i(%struct.Node* %112, i32* %113, i8* %114, i8* %115, i8* %116, i32* %117, i32 %118) + br label %kcall.end + +kcall.end: ; preds = %kcall.configok, %do.body + %call114 = call i32 @cudaDeviceSynchronize() + %119 = bitcast %struct.dim3* %agg.tmp115 to i8* + %120 = bitcast %struct.dim3* %grid to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %119, i8* align 4 %120, i64 12, i1 false) + %121 = bitcast %struct.dim3* %agg.tmp116 to i8* + %122 = bitcast %struct.dim3* %threads to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %121, i8* align 4 %122, i64 12, i1 false) + %123 = bitcast { i64, i32 }* %agg.tmp115.coerce to i8* + %124 = bitcast %struct.dim3* %agg.tmp115 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %123, i8* align 4 %124, i64 12, i1 false) + %125 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp115.coerce, i32 0, i32 0 + %126 = load i64, i64* %125, align 4 + %127 = 
getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp115.coerce, i32 0, i32 1 + %128 = load i32, i32* %127, align 4 + %129 = bitcast { i64, i32 }* %agg.tmp116.coerce to i8* + %130 = bitcast %struct.dim3* %agg.tmp116 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %129, i8* align 4 %130, i64 12, i1 false) + %131 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp116.coerce, i32 0, i32 0 + %132 = load i64, i64* %131, align 4 + %133 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp116.coerce, i32 0, i32 1 + %134 = load i32, i32* %133, align 4 + %call117 = call i32 @__cudaPushCallConfiguration(i64 %126, i32 %128, i64 %132, i32 %134, i64 0, i8* null) + %tobool118 = icmp ne i32 %call117, 0 + br i1 %tobool118, label %kcall.end120, label %kcall.configok119 + +kcall.configok119: ; preds = %kcall.end + %135 = load i8*, i8** %d_graph_mask, align 8 + %136 = load i8*, i8** %d_updating_graph_mask, align 8 + %137 = load i8*, i8** %d_graph_visited, align 8 + %138 = load i8*, i8** %d_over, align 8 + %139 = load i32, i32* @no_of_nodes, align 4 + call void @_Z7Kernel2PbS_S_S_i(i8* %135, i8* %136, i8* %137, i8* %138, i32 %139) + br label %kcall.end120 + +kcall.end120: ; preds = %kcall.configok119, %kcall.end + %call121 = call i32 @cudaDeviceSynchronize() + %140 = load i8*, i8** %d_over, align 8 + %call122 = call i32 @cudaMemcpy(i8* %stop, i8* %140, i64 1, i32 2) + %141 = load i32, i32* %k, align 4 + %inc123 = add nsw i32 %141, 1 + store i32 %inc123, i32* %k, align 4 + br label %do.cond + +do.cond: ; preds = %kcall.end120 + %142 = load i8, i8* %stop, align 1 + %tobool124 = trunc i8 %142 to i1 + br i1 %tobool124, label %do.body, label %do.end + +do.end: ; preds = %do.cond + %143 = load i32, i32* %k, align 4 + %call125 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([26 x i8], [26 x i8]* @.str.9, i64 0, i64 0), i32 %143) + %144 = load i32*, i32** %h_cost, align 8 + %145 = bitcast i32* %144 to i8* + %146 = load i32*, i32** %d_cost, align 8 + %147 = bitcast i32* %146 to i8* + %148 = load i32, i32* @no_of_nodes, align 4 + %conv126 = sext i32 %148 to i64 + %mul127 = mul i64 4, %conv126 + %call128 = call i32 @cudaMemcpy(i8* %145, i8* %147, i64 %mul127, i32 2) + %call129 = call %struct._IO_FILE* @fopen(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str.10, i64 0, i64 0), i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.11, i64 0, i64 0)) + store %struct._IO_FILE* %call129, %struct._IO_FILE** %fpo, align 8 + store i32 0, i32* %i130, align 4 + br label %for.cond131 + +for.cond131: ; preds = %for.inc137, %do.end + %149 = load i32, i32* %i130, align 4 + %150 = load i32, i32* @no_of_nodes, align 4 + %cmp132 = icmp slt i32 %149, %150 + br i1 %cmp132, label %for.body133, label %for.end139 + +for.body133: ; preds = %for.cond131 + %151 = load %struct._IO_FILE*, %struct._IO_FILE** %fpo, align 8 + %152 = load i32, i32* %i130, align 4 + %153 = load i32*, i32** %h_cost, align 8 + %154 = load i32, i32* %i130, align 4 + %idxprom134 = sext i32 %154 to i64 + %arrayidx135 = getelementptr inbounds i32, i32* %153, i64 %idxprom134 + %155 = load i32, i32* %arrayidx135, align 4 + %call136 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %151, i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str.12, i64 0, i64 0), i32 %152, i32 %155) + br label %for.inc137 + +for.inc137: ; preds = %for.body133 + %156 = load i32, i32* %i130, align 4 + %inc138 = add nsw i32 %156, 1 + store i32 %inc138, i32* %i130, align 4 + br label %for.cond131 + +for.end139: ; preds = %for.cond131 + %157 = load %struct._IO_FILE*, %struct._IO_FILE** %fpo, align 8 + %call140 = call i32 @fclose(%struct._IO_FILE* %157) + %call141 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([29 x i8], [29 x i8]* @.str.13, i64 0, i64 0)) + %158 = load %struct.Node*, %struct.Node** %h_graph_nodes, align 8 + %159 = bitcast %struct.Node* %158 to i8* + call void @free(i8* %159) #9 + %160 = load i32*, i32** %h_graph_edges, align 8 + %161 = bitcast i32* %160 to i8* + call void @free(i8* %161) #9 + %162 = load i8*, i8** %h_graph_mask, align 8 + call void @free(i8* %162) #9 + %163 = load i8*, i8** %h_updating_graph_mask, align 8 + call void @free(i8* %163) #9 + %164 = load i8*, i8** %h_graph_visited, align 8 + call void @free(i8* %164) #9 + %165 = load i32*, i32** %h_cost, align 8 + %166 = bitcast i32* %165 to i8* + call void @free(i8* %166) #9 + %167 = load %struct.Node*, %struct.Node** %d_graph_nodes, align 8 + %168 = bitcast %struct.Node* %167 to i8* + %call142 = call i32 @cudaFree(i8* %168) + %169 = load i32*, i32** %d_graph_edges, align 8 + %170 = bitcast i32* %169 to i8* + %call143 = call i32 @cudaFree(i8* %170) + %171 = load i8*, i8** %d_graph_mask, align 8 + %call144 = call i32 @cudaFree(i8* %171) + %172 = load i8*, i8** %d_updating_graph_mask, align 8 + %call145 = call i32 @cudaFree(i8* %172) + %173 = load i8*, i8** %d_graph_visited, align 8 + %call146 = call i32 @cudaFree(i8* %173) + %174 = load i32*, i32** %d_cost, align 8 + %175 = bitcast i32* %174 to i8* + %call147 = call i32 @cudaFree(i8* %175) + br label %return + +return: ; preds = %for.end139, %if.then2 + ret void +} + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z5UsageiPPc(i32 %argc, i8** %argv) #0 { +entry: + %argc.addr = alloca i32, align 4 + %argv.addr = alloca i8**, align 8 + store i32 %argc, i32* %argc.addr, align 4 + store i8** %argv, i8*** %argv.addr, align 8 + %0 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %1 = load i8**, i8*** %argv.addr, align 8 + %arrayidx = getelementptr inbounds i8*, i8** %1, i64 0 + %2 = load i8*, i8** %arrayidx, align 8 + %call = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %0, i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i8* %2) + ret void +} + +declare dso_local i32 @fprintf(%struct._IO_FILE*, i8*, ...) #3 + +; Function Attrs: noreturn nounwind +declare dso_local void @exit(i32) #4 + +declare dso_local i32 @printf(i8*, ...) #3 + +declare dso_local %struct._IO_FILE* @fopen(i8*, i8*) #3 + +declare dso_local i32 @fscanf(%struct._IO_FILE*, i8*, ...) #3 + +; Function Attrs: nounwind readnone speculatable willreturn +declare double @llvm.ceil.f64(double) #5 + +; Function Attrs: nounwind +declare dso_local noalias i8* @malloc(i64) #6 + +declare dso_local i32 @fclose(%struct._IO_FILE*) #3 + +declare dso_local i32 @cudaMalloc(i8**, i64) #3 + +declare dso_local i32 @cudaMemcpy(i8*, i8*, i64, i32) #3 + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN4dim3C2Ejjj(%struct.dim3* %this, i32 %vx, i32 %vy, i32 %vz) unnamed_addr #7 comdat align 2 { +entry: + %this.addr = alloca %struct.dim3*, align 8 + %vx.addr = alloca i32, align 4 + %vy.addr = alloca i32, align 4 + %vz.addr = alloca i32, align 4 + store %struct.dim3* %this, %struct.dim3** %this.addr, align 8 + store i32 %vx, i32* %vx.addr, align 4 + store i32 %vy, i32* %vy.addr, align 4 + store i32 %vz, i32* %vz.addr, align 4 + %this1 = load %struct.dim3*, %struct.dim3** %this.addr, align 8 + %x = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 0 + %0 = load i32, i32* %vx.addr, align 4 + store i32 %0, i32* %x, align 4 + %y = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 1 + %1 = load i32, i32* %vy.addr, align 4 + store i32 %1, i32* %y, align 4 + %z = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 2 + %2 = load i32, i32* %vz.addr, align 4 + store i32 %2, i32* %z, align 4 + ret void +} + +declare dso_local i32 @__cudaPushCallConfiguration(i64, i32, i64, i32, i64, i8*) #3 + +declare dso_local i32 @cudaDeviceSynchronize() #3 + +; Function 
Attrs: nounwind +declare dso_local void @free(i8*) #6 + +declare dso_local i32 @cudaFree(i8*) #3 + +define internal void @__cuda_register_globals(i8** %0) { +entry: + %1 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void (%struct.Node*, i32*, i8*, i8*, i8*, i32*, i32)* @_Z6KernelP4NodePiPbS2_S2_S1_i to i8*), i8* getelementptr inbounds ([30 x i8], [30 x i8]* @0, i64 0, i64 0), i8* getelementptr inbounds ([30 x i8], [30 x i8]* @0, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null) + %2 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void (i8*, i8*, i8*, i8*, i32)* @_Z7Kernel2PbS_S_S_i to i8*), i8* getelementptr inbounds ([20 x i8], [20 x i8]* @1, i64 0, i64 0), i8* getelementptr inbounds ([20 x i8], [20 x i8]* @1, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null) + ret void +} + +declare dso_local i32 @__cudaRegisterFunction(i8**, i8*, i8*, i8*, i32, i8*, i8*, i8*, i8*, i32*) + +declare dso_local i32 @__cudaRegisterVar(i8**, i8*, i8*, i8*, i32, i32, i32, i32) + +declare dso_local i8** @__cudaRegisterFatBinary(i8*) + +define internal void @__cuda_module_ctor(i8* %0) { +entry: + %1 = call i8** @__cudaRegisterFatBinary(i8* bitcast ({ i32, i32, i8*, i8* }* @__cuda_fatbin_wrapper to i8*)) + store i8** %1, i8*** @__cuda_gpubin_handle, align 8 + call void @__cuda_register_globals(i8** %1) + call void @__cudaRegisterFatBinaryEnd(i8** %1) + %2 = call i32 @atexit(void (i8*)* @__cuda_module_dtor) + ret void +} + +declare dso_local void @__cudaRegisterFatBinaryEnd(i8**) + +declare dso_local void @__cudaUnregisterFatBinary(i8**) + +define internal void @__cuda_module_dtor(i8* %0) { +entry: + %1 = load i8**, i8*** @__cuda_gpubin_handle, align 8 + call void @__cudaUnregisterFatBinary(i8** %1) + ret void +} + +declare dso_local i32 @atexit(void (i8*)*) + +attributes #0 = { noinline optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" 
"less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind willreturn } +attributes #2 = { noinline norecurse optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #5 = { nounwind readnone speculatable willreturn } +attributes #6 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" 
"no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #7 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #8 = { noreturn nounwind } +attributes #9 = { nounwind } + +!llvm.module.flags = !{!0, !1} +!llvm.ident = !{!2} + +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"} diff --git a/examples/bfs/bfs.cu b/examples/bfs/bfs.cu new file mode 100644 index 0000000..252341d --- /dev/null +++ b/examples/bfs/bfs.cu @@ -0,0 +1,213 @@ +#include +#include +#include +#include +#include + +#define MAX_THREADS_PER_BLOCK 512 + +int no_of_nodes; +int edge_list_size; +FILE *fp; + +// Structure to hold a node information +struct Node { + int starting; + int no_of_edges; +}; + +#include "kernel.cu" +#include "kernel2.cu" + +void BFSGraph(int argc, char **argv); + +//////////////////////////////////////////////////////////////////////////////// +// Main Program +//////////////////////////////////////////////////////////////////////////////// +int main(int argc, char **argv) { + cudaSetDevice(0); + no_of_nodes = 0; + edge_list_size = 0; + BFSGraph(argc, argv); +} + +void Usage(int argc, char **argv) { + + fprintf(stderr, "Usage: %s \n", argv[0]); +} 
+////////////////////////////////////////////////////////////////////////////////
+// Apply BFS on a Graph using CUDA
+////////////////////////////////////////////////////////////////////////////////
+
+// Parses the graph from the already-open global `fp`: the node count, one
+// (starting, no_of_edges) pair per node, the source node, the edge count and
+// one (destination, cost) pair per edge. Stores the two counts in the globals
+// no_of_nodes / edge_list_size and hands back malloc'ed arrays through
+// out_nodes / out_edges (the caller frees them).
+//
+// Returns false, leaving both outputs NULL, when the file is truncated or
+// malformed or when an allocation fails. Does not close `fp`.
+static bool LoadGraph(Node **out_nodes, int **out_edges) {
+  *out_nodes = NULL;
+  *out_edges = NULL;
+
+  // An empty or non-numeric file leaves no_of_nodes at 0; reject it here so
+  // the caller never writes element 0 of a zero-sized array.
+  if (fscanf(fp, "%d", &no_of_nodes) != 1 || no_of_nodes <= 0)
+    return false;
+
+  Node *nodes = (Node *)malloc(sizeof(Node) * no_of_nodes);
+  if (!nodes)
+    return false;
+  for (int i = 0; i < no_of_nodes; i++) {
+    int start, edgeno;
+    if (fscanf(fp, "%d %d", &start, &edgeno) != 2) {
+      free(nodes);
+      return false;
+    }
+    nodes[i].starting = start;
+    nodes[i].no_of_edges = edgeno;
+  }
+
+  // The file names a source node, but BFSGraph always starts at node 0, so
+  // the value is consumed only to advance the stream.
+  int file_source;
+  if (fscanf(fp, "%d", &file_source) != 1 ||
+      fscanf(fp, "%d", &edge_list_size) != 1 || edge_list_size < 0) {
+    free(nodes);
+    return false;
+  }
+
+  // malloc(0) may legitimately return NULL, so only a non-empty edge list
+  // with a NULL result counts as a failure.
+  int *edges = (int *)malloc(sizeof(int) * edge_list_size);
+  if (!edges && edge_list_size > 0) {
+    free(nodes);
+    return false;
+  }
+  for (int i = 0; i < edge_list_size; i++) {
+    int id, cost; // the per-edge cost is present in the file but unused here
+    if (fscanf(fp, "%d", &id) != 1 || fscanf(fp, "%d", &cost) != 1) {
+      free(edges);
+      free(nodes);
+      return false;
+    }
+    edges[i] = id;
+  }
+
+  *out_nodes = nodes;
+  *out_edges = edges;
+  return true;
+}
+
+// Loads the graph file named by argv[1], repeatedly launches Kernel and
+// Kernel2 (defined in kernel.cu / kernel2.cu) until the device-side flag stays
+// false, then writes the resulting cost array to result.txt.
+//
+// argc/argv: the program's command line. Anything other than exactly one
+// argument prints the usage text and exits.
+//
+// Side effects: sets the globals fp, no_of_nodes and edge_list_size, prints
+// progress messages to stdout, and creates/overwrites result.txt in the
+// current directory. Returns early, after printing an error, when the input
+// file cannot be opened or parsed or when a host allocation fails.
+void BFSGraph(int argc, char **argv) {
+  if (argc != 2) {
+    Usage(argc, argv);
+    exit(0);
+  }
+
+  char *input_f = argv[1];
+  printf("Reading File\n");
+  // Read in Graph from a file
+  fp = fopen(input_f, "r");
+  if (!fp) {
+    printf("Error Reading graph file\n");
+    return;
+  }
+
+  Node *h_graph_nodes;
+  int *h_graph_edges;
+  const bool loaded = LoadGraph(&h_graph_nodes, &h_graph_edges);
+  fclose(fp);
+  fp = NULL; // do not leave a dangling FILE* in the global
+  if (!loaded) {
+    printf("Error Reading graph file\n");
+    return;
+  }
+  printf("Read File\n");
+
+  // The traversal always starts from node 0, whatever the file says.
+  const int source = 0;
+
+  // Make execution parameters according to the number of nodes; distribute
+  // threads across multiple blocks if necessary.
+  int num_of_blocks = 1;
+  int num_of_threads_per_block = no_of_nodes;
+  if (no_of_nodes > MAX_THREADS_PER_BLOCK) {
+    num_of_blocks = (int)ceil(no_of_nodes / (double)MAX_THREADS_PER_BLOCK);
+    num_of_threads_per_block = MAX_THREADS_PER_BLOCK;
+  }
+
+  // Allocate and initialise the remaining host arrays.
+  bool *h_graph_mask = (bool *)malloc(sizeof(bool) * no_of_nodes);
+  bool *h_updating_graph_mask = (bool *)malloc(sizeof(bool) * no_of_nodes);
+  bool *h_graph_visited = (bool *)malloc(sizeof(bool) * no_of_nodes);
+  int *h_cost = (int *)malloc(sizeof(int) * no_of_nodes);
+  if (!h_graph_mask || !h_updating_graph_mask || !h_graph_visited || !h_cost) {
+    fprintf(stderr, "Error allocating host memory\n");
+    // free(NULL) is a no-op, so every pointer can be released unconditionally.
+    free(h_graph_nodes);
+    free(h_graph_edges);
+    free(h_graph_mask);
+    free(h_updating_graph_mask);
+    free(h_graph_visited);
+    free(h_cost);
+    return;
+  }
+  for (int i = 0; i < no_of_nodes; i++) {
+    h_graph_mask[i] = false;
+    h_updating_graph_mask[i] = false;
+    h_graph_visited[i] = false;
+    h_cost[i] = -1;
+  }
+
+  // Mark the source node in the mask and give it cost 0.
+  h_graph_mask[source] = true;
+  h_graph_visited[source] = true;
+  h_cost[source] = 0;
+
+  // Copy the Node list to device memory
+  Node *d_graph_nodes;
+  cudaMalloc((void **)&d_graph_nodes, sizeof(Node) * no_of_nodes);
+  cudaMemcpy(d_graph_nodes, h_graph_nodes, sizeof(Node) * no_of_nodes,
+             cudaMemcpyHostToDevice);
+
+  // Copy the Edge List to device memory
+  int *d_graph_edges;
+  cudaMalloc((void **)&d_graph_edges, sizeof(int) * edge_list_size);
+  cudaMemcpy(d_graph_edges, h_graph_edges, sizeof(int) * edge_list_size,
+             cudaMemcpyHostToDevice);
+
+  // Copy the masks to device memory
+  bool *d_graph_mask;
+  cudaMalloc((void **)&d_graph_mask, sizeof(bool) * no_of_nodes);
+  cudaMemcpy(d_graph_mask, h_graph_mask, sizeof(bool) * no_of_nodes,
+             cudaMemcpyHostToDevice);
+
+  bool *d_updating_graph_mask;
+  cudaMalloc((void **)&d_updating_graph_mask, sizeof(bool) * no_of_nodes);
+  cudaMemcpy(d_updating_graph_mask, h_updating_graph_mask,
+             sizeof(bool) * no_of_nodes, cudaMemcpyHostToDevice);
+
+  // Copy the visited-nodes array to device memory
+  bool *d_graph_visited;
+  cudaMalloc((void **)&d_graph_visited, sizeof(bool) * no_of_nodes);
+  cudaMemcpy(d_graph_visited, h_graph_visited, sizeof(bool) * no_of_nodes,
+             cudaMemcpyHostToDevice);
+
+  // Device copy of the result array
+  int *d_cost;
+  cudaMalloc((void **)&d_cost, sizeof(int) * no_of_nodes);
+  cudaMemcpy(d_cost, h_cost, sizeof(int) * no_of_nodes, cudaMemcpyHostToDevice);
+
+  // Device-side bool used to decide whether another pass is needed
+  bool *d_over;
+  cudaMalloc((void **)&d_over, sizeof(bool));
+
+  printf("Copied Everything to GPU memory\n");
+
+  // setup execution parameters
+  dim3 grid(num_of_blocks, 1, 1);
+  dim3 threads(num_of_threads_per_block, 1, 1);
+
+  int k = 0;
+  printf("Start traversing the tree\n");
+  bool stop;
+  // Relaunch both kernels until a pass leaves the device-side flag false.
+  do {
+    // if no thread changes this value then the loop stops
+    stop = false;
+    cudaMemcpy(d_over, &stop, sizeof(bool), cudaMemcpyHostToDevice);
+
+    Kernel<<<grid, threads, 0>>>(d_graph_nodes, d_graph_edges, d_graph_mask,
+                                 d_updating_graph_mask, d_graph_visited, d_cost,
+                                 no_of_nodes);
+    cudaDeviceSynchronize();
+
+    Kernel2<<<grid, threads, 0>>>(d_graph_mask, d_updating_graph_mask,
+                                  d_graph_visited, d_over, no_of_nodes);
+    cudaDeviceSynchronize();
+
+    cudaMemcpy(&stop, d_over, sizeof(bool), cudaMemcpyDeviceToHost);
+
+    k++;
+  } while (stop);
+
+  printf("Kernel Executed %d times\n", k);
+
+  // copy result from device to host
+  cudaMemcpy(h_cost, d_cost, sizeof(int) * no_of_nodes, cudaMemcpyDeviceToHost);
+
+  // Store the result into a file
+  FILE *fpo = fopen("result.txt", "w");
+  if (fpo) {
+    for (int i = 0; i < no_of_nodes; i++)
+      fprintf(fpo, "%d) cost:%d\n", i, h_cost[i]);
+    fclose(fpo);
+    printf("Result stored in result.txt\n");
+  } else {
+    // Still fall through to the cleanup below.
+    fprintf(stderr, "Error opening result.txt for writing\n");
+  }
+
+  // cleanup memory
+  free(h_graph_nodes);
+  free(h_graph_edges);
+  free(h_graph_mask);
+  free(h_updating_graph_mask);
+  free(h_graph_visited);
+  free(h_cost);
+
+  cudaFree(d_graph_nodes);
+  cudaFree(d_graph_edges);
+  cudaFree(d_graph_mask);
+  cudaFree(d_updating_graph_mask);
+  cudaFree(d_graph_visited);
+  cudaFree(d_cost);
+  cudaFree(d_over);
+}
diff --git a/examples/bfs/kernel.cu b/examples/bfs/kernel.cu
new file mode 100644
index 0000000..7cf0df4
--- /dev/null
+++ b/examples/bfs/kernel.cu
@@ -0,0 +1,23 @@
+#ifndef _KERNEL_H_
+#define _KERNEL_H_
+
+__global__ void
+Kernel( Node* g_graph_nodes, int* g_graph_edges, bool* g_graph_mask, bool* g_updating_graph_mask, bool *g_graph_visited, int* g_cost, int no_of_nodes)
+{
+ int tid = blockIdx.x*MAX_THREADS_PER_BLOCK + threadIdx.x;
+ if( tid // (in path known to compiler) needed by true/false, bool
+#include // (in path known to compiler) needed by uint32_t
+#include // (in path known to compiler) needed by malloc
+
+//======================================================================================================================================================150
+// DEFINE
+//======================================================================================================================================================150
+
+// NOTE: from here on, every occurrence of the identifier `fp` in a
+// translation unit that includes this header is replaced by `float`, so `fp`
+// cannot be used as an ordinary variable name there.
+#define fp float
+
+// Version string literal.
+#define Version "1.5"
+
+// When WINDOWS is defined, bool/false/true are supplied as plain macros
+// (char / 0 / 1).
+#ifdef WINDOWS
+#define bool char
+#define false 0
+#define true 1
+#endif
+
+/* #define DEFAULT_ORDER 256 */
+
+// DEFAULT_ORDER is taken from the first of RD_WG_SIZE_0_0, RD_WG_SIZE_0 and
+// RD_WG_SIZE that is defined, and falls back to 256 otherwise. It sizes the
+// fixed arrays inside `knode` below (DEFAULT_ORDER + 1 entries each).
+#ifdef RD_WG_SIZE_0_0
+#define DEFAULT_ORDER RD_WG_SIZE_0_0
+#elif defined(RD_WG_SIZE_0)
+#define DEFAULT_ORDER RD_WG_SIZE_0
+#elif defined(RD_WG_SIZE)
+#define DEFAULT_ORDER RD_WG_SIZE
+#else
+#define DEFAULT_ORDER 256
+#endif
+
+/* #ifdef RD_WG_SIZE_1_0 */
+/* #define DEFAULT_ORDER_2 RD_WG_SIZE_1_0 */
+/* #elif defined(RD_WG_SIZE_1) */
+/* #define DEFAULT_ORDER_2 RD_WG_SIZE_1 */
+/* #elif defined(RD_WG_SIZE) */
+/* #define DEFAULT_ORDER_2 RD_WG_SIZE */
+/* #else */
+/* #define DEFAULT_ORDER_2 256 */
+/* #endif */
+
+/* #define DEFAULT_ORDER 508 */
+
+// Checked allocation: shadows malloc() with a statement expression
+// (`({ ... })`, a GNU C extension) whose value is the allocated pointer.
+// If the underlying allocation returns NULL it prints the failing file and
+// line to stderr and calls exit(-1), so expansion sites never see NULL.
+// The inner malloc(size) is the real function: a macro's own name is not
+// re-expanded inside its replacement list. Expansion sites need fprintf,
+// stderr and exit to be declared.
+#define malloc(size)                                                           \
+  ({                                                                           \
+    void *_tmp;                                                                \
+                                                                               \
+    if (!(_tmp = malloc(size))) {                                              \
+      fprintf(stderr, "Allocation failed at %s:%d!\n", __FILE__, __LINE__);    \
+      exit(-1);                                                                \
+    }                                                                          \
+                                                                               \
+    _tmp;                                                                      \
+  })
+
+//======================================================================================================================================================150
+// STRUCTURES
+//======================================================================================================================================================150
+
+// struct list_item;
+// Forward typedef; the struct itself is defined further down in this section.
+typedef struct list_item list_item_t;
+
+// List header: pointers to the first and last item, the item count, and two
+// callbacks: `compare` (takes a key and an element, returns int32_t) and
+// `datum_delete` (takes a datum pointer).
+typedef struct list_t {
+  list_item_t *head, *tail;
+  uint32_t length;
+  int32_t (*compare)(const void *key, const void *with);
+  void (*datum_delete)(void *);
+} list_t;
+
+// Both iterator types are plain pointers to a list item.
+typedef list_item_t *list_iterator_t;
+typedef list_item_t *list_reverse_iterator_t;
+
+/* Type representing the record
+ * to which a given key refers.
+ * In a real B+ tree system, the
+ * record would hold data (in a database)
+ * or a file (in an operating system)
+ * or some other information.
+ * Users can rewrite this part of the code
+ * to change the type and content
+ * of the value field.
+ */
+typedef struct record {
+  int value;
+} record;
+
+/* Type representing a node in the B+ tree.
+ * This type is general enough to serve for both
+ * the leaf and the internal node.
+ * The heart of the node is the array
+ * of keys and the array of corresponding
+ * pointers. The relation between keys
+ * and pointers differs between leaves and
+ * internal nodes. In a leaf, the index
+ * of each key equals the index of its corresponding
+ * pointer, with a maximum of order - 1 key-pointer
+ * pairs. The last pointer points to the
+ * leaf to the right (or NULL in the case
+ * of the rightmost leaf).
+ * In an internal node, the first pointer
+ * refers to lower nodes with keys less than
+ * the smallest key in the keys array. Then,
+ * with indices i starting at 0, the pointer
+ * at i + 1 points to the subtree with keys
+ * greater than or equal to the key in this
+ * node at index i.
+ * The num_keys field is used to keep
+ * track of the number of valid keys.
+ * In an internal node, the number of valid
+ * pointers is always num_keys + 1.
+ * In a leaf, the number of valid pointers
+ * to data is always num_keys. The
+ * last leaf pointer points to the next leaf.
+ */
+typedef struct node {
+  void **pointers;
+  int *keys;
+  struct node *parent;
+  bool is_leaf;
+  int num_keys;
+  struct node *next; // Used for queue.
+} node;
+
+// Node variant that holds no pointers: a `location` index plus fixed-size
+// `indices` and `keys` arrays of DEFAULT_ORDER + 1 entries each.
+// NOTE(review): presumably the flattened form of `node` produced by
+// transform_to_cuda(), with `indices` standing in for child pointers;
+// confirm against the implementation.
+typedef struct knode {
+  int location;
+  int indices[DEFAULT_ORDER + 1];
+  int keys[DEFAULT_ORDER + 1];
+  bool is_leaf;
+  int num_keys;
+} knode;
+
+// One list element: links to the predecessor and successor items plus an
+// opaque pointer to the stored datum.
+struct list_item {
+  struct list_item *pred, *next;
+  void *datum;
+};
+
+//===============================================================================================================================================================================================================200
+// PROTOTYPES
+//===============================================================================================================================================================================================================200
+
+//======================================================================================================================================================150
+// Other
+//======================================================================================================================================================150
+
+// Item-level list helpers. Only the declarations are visible in this header;
+// the notes below restate what the signatures show.
+
+// Takes the item to set up and the datum pointer to associate with it.
+void list_item_init(list_item_t *li, void *datum);
+
+// Takes the item and a callback that receives a datum pointer.
+void list_item_delete(list_item_t *li, void (*datum_delete)(void *datum));
+
+// Insertion of an existing item `i` into list `l`: at the tail, relative to a
+// neighbouring item (`next` / `pred`), or at a sorted position.
+// NOTE(review): the sorted variant presumably uses l->compare; confirm.
+void list_insert_item_tail(list_t *l, list_item_t *i);
+
+void list_insert_item_before(list_t *l, list_item_t *next, list_item_t *i);
+
+void list_insert_item_after(list_t *l, list_item_t *pred, list_item_t *i);
+
+void list_insert_item_sorted(list_t *l, list_item_t *i);
+
+//======================================================================================================================================================150
+// List (section originally labelled "???")
+//======================================================================================================================================================150 + +void list_init(list_t *l, int32_t (*compare)(const void *key, const void *with), + void (*datum_delete)(void *datum)); + +void list_delete(list_t *l); + +void list_reset(list_t *l); + +void list_insert_head(list_t *l, void *v); + +void list_insert_tail(list_t *l, void *v); + +void list_insert_before(list_t *l, list_item_t *next, void *v); + +void list_insert_after(list_t *l, list_item_t *pred, void *v); + +void list_insert_sorted(list_t *l, void *v); + +void list_insert_item_head(list_t *l, list_item_t *i); + +void list_remove_item(list_t *l, list_item_t *i); + +void list_remove_head(list_t *l); + +void list_remove_tail(list_t *l); + +list_item_t *list_find_item(list_t *l, void *datum); + +list_item_t *list_get_head_item(list_t *l); + +list_item_t *list_get_tail_item(list_t *l); + +void *list_find(list_t *l, void *datum); + +void *list_get_head(list_t *l); + +void *list_get_tail(list_t *l); + +uint32_t list_get_length(list_t *l); + +bool list_is_empty(list_t *l); + +bool list_not_empty(list_t *l); + +void list_visit_items(list_t *l, void (*visitor)(void *v)); + +void *list_item_get_datum(list_item_t *li); + +void list_iterator_init(list_t *l, list_iterator_t *li); + +void list_iterator_delete(list_iterator_t *li); + +void list_iterator_next(list_iterator_t *li); + +void list_iterator_prev(list_iterator_t *li); + +void *list_iterator_get_datum(list_iterator_t *li); + +bool list_iterator_is_valid(list_iterator_t *li); + +void list_reverse_iterator_init(list_t *l, list_iterator_t *li); + +void list_reverse_iterator_delete(list_iterator_t *li); + +void list_reverse_iterator_next(list_iterator_t *li); + +void list_reverse_iterator_prev(list_iterator_t *li); + +void *list_reverse_iterator_get_datum(list_iterator_t *li); + +bool list_reverse_iterator_is_valid(list_reverse_iterator_t *li); + 
+//======================================================================================================================================================150 +// Output and utility +//======================================================================================================================================================150 + +void *kmalloc(int size); + +long transform_to_cuda(node *n, + bool verbose); // returns actual mem used in a long + +void usage_1(void); + +void usage_2(void); + +void enqueue(node *new_node); + +node *dequeue(void); + +int height(node *root); + +int path_to_root(node *root, node *child); + +void print_leaves(node *root); + +void print_tree(node *root); + +node *find_leaf(node *root, int key, bool verbose); + +record *find(node *root, int key, bool verbose); + +int cut(int length); + +//======================================================================================================================================================150 +// Insertion +//======================================================================================================================================================150 + +record *make_record(int value); + +node *make_node(void); + +node *make_leaf(void); + +int get_left_index(node *parent, node *left); + +node *insert_into_leaf(node *leaf, int key, record *pointer); + +node *insert_into_leaf_after_splitting(node *root, node *leaf, int key, + record *pointer); + +node *insert_into_node(node *root, node *parent, int left_index, int key, + node *right); + +node *insert_into_node_after_splitting(node *root, node *parent, int left_index, + int key, node *right); + +node *insert_into_parent(node *root, node *left, int key, node *right); + +node *insert_into_new_root(node *left, int key, node *right); + +node *start_new_tree(int key, record *pointer); + +node *insert(node *root, int key, int value); + 
+//======================================================================================================================================================150 +// Deletion +//======================================================================================================================================================150 + +int get_neighbor_index(node *n); + +node *adjust_root(node *root); + +node *coalesce_nodes(node *root, node *n, node *neighbor, int neighbor_index, + int k_prime); + +node *redistribute_nodes(node *root, node *n, node *neighbor, + int neighbor_index, int k_prime_index, int k_prime); + +node *delete_entry(node *root, node *n, int key, void *pointer); + +node *deleteVal(node *root, int key); + +//===============================================================================================================================================================================================================200 +// HEADER +//===============================================================================================================================================================================================================200 + +// int main( int argc, +// char *argv []); + +//===============================================================================================================================================================================================================200 +// END +//===============================================================================================================================================================================================================200 + +// #endif + +// # ifdef __cplusplus +// } +// # endif diff --git a/examples/btree/kernel/kernel_gpu_cuda.cu b/examples/btree/kernel/kernel_gpu_cuda.cu new file mode 100755 index 0000000..57170c8 --- /dev/null +++ b/examples/btree/kernel/kernel_gpu_cuda.cu @@ -0,0 +1,54 @@ 
+// findK: point lookup in the B+ tree stored as an array of knode (knodesD); block blockIdx.x resolves one key.
+// On every level each thread tests one key slot of the block's current node and, on a hit, selects the child to
+// visit next; after the last level the thread whose slot equals the key copies that record's value to ansD.
+
+__global__ void
+findK(long height,       // number of tree levels to descend
+      knode *knodesD,    // tree nodes, addressed by index
+      long knodes_elem,  // exclusive upper bound a child index must satisfy to be followed
+      record *recordsD,  // records addressed by the indices of the last node visited
+
+      long *currKnodeD,  // per block: index of the node examined on the current level (updated in place)
+      long *offsetD,     // per block: index of the node selected for the next level (updated in place)
+      int *keysD,        // per block: key to look up
+      record *ansD)      // per block: .value is written only when the key is found
+{
+
+    // one thread per key slot, one block per query
+    const int slot = threadIdx.x;
+    const int query = blockIdx.x;
+
+    // process tree levels, one per iteration
+    int level;
+    for (level = 0; level < height; level++) {
+
+        // this slot owns the key interval [keys[slot], keys[slot + 1]) of the current node
+        if (knodesD[currKnodeD[query]].keys[slot] <= keysD[query] && knodesD[currKnodeD[query]].keys[slot + 1] > keysD[query]) {
+            // Read the child index from the node whose keys were just tested (currKnodeD), exactly as findRangeK does;
+            // offsetD[query] names that node only while it still equals currKnodeD[query].
+            // The bound check avoids a crash: child indices >= knodes_elem would address outside knodesD on the next level.
+            if (knodesD[currKnodeD[query]].indices[slot] < knodes_elem) {
+                offsetD[query] = knodesD[currKnodeD[query]].indices[slot];
+            }
+        }
+        __syncthreads();
+
+        // thread 0 publishes the selected child as the node for the next level
+        if (slot == 0) {
+            currKnodeD[query] = offsetD[query];
+        }
+        __syncthreads();
+
+    }
+
+    // currKnodeD[query] now names the candidate leaf, which may contain the target record;
+    // the slot holding exactly the searched key (if any) copies the value of the record its index points to.
+    if (knodesD[currKnodeD[query]].keys[slot] == keysD[query]) {
+        ansD[query].value = recordsD[knodesD[currKnodeD[query]].indices[slot]].value;
+    }
+
+}
+
+// NOTE(review): keys[slot + 1] is read by every thread, so the launch presumably keeps blockDim.x at or below the
+// DEFAULT_ORDER that sizes knode::keys (DEFAULT_ORDER + 1 entries) - confirm against the host wrapper.
+// End of findK.
diff --git a/examples/btree/kernel/kernel_gpu_cuda_2.cu b/examples/btree/kernel/kernel_gpu_cuda_2.cu
new file mode 100755
index 0000000..1bb8d7b
--- /dev/null
+++ b/examples/btree/kernel/kernel_gpu_cuda_2.cu
@@ -0,0 +1,70 @@
+// findRangeK: range lookup; block blockIdx.x descends the tree for two keys at once, startD (via currKnodeD/offsetD)
+// and endD (via lastKnodeD/offset_2D), then writes the index stored with the start key to RecstartD and
+// (index stored with the end key) - RecstartD + 1 to ReclenD.
+
+__global__ void
+findRangeK(long height,       // number of tree levels to descend
+
+           knode *knodesD,    // tree nodes, addressed by index
+           long knodes_elem,  // exclusive upper bound a child index must satisfy to be followed
+
+           long *currKnodeD,  // per block: node examined for the start key on the current level (updated in place)
+           long *offsetD,     // per block: node selected for the start key on the next level (updated in place)
+           long *lastKnodeD,  // per block: node examined for the end key on the current level (updated in place)
+           long *offset_2D,   // per block: node selected for the end key on the next level (updated in place)
+           int *startD,       // per block: first key of the range
+           int *endD,         // per block: last key of the range
+           int *RecstartD,    // per block: written only when startD is found in the final node
+           int *ReclenD)      // per block: written only when endD is found in the final node
+{
+
+    // one thread per key slot, one block per query
+    const int slot = threadIdx.x;
+    const int query = blockIdx.x;
+
+    // descend one tree level per iteration, tracking the start key and the end key side by side
+    int level;
+    for (level = 0; level < height; level++) {
+
+        if (knodesD[currKnodeD[query]].keys[slot] <= startD[query] && knodesD[currKnodeD[query]].keys[slot + 1] > startD[query]) {
+            // this conditional statement was inserted to avoid a crash due to a bug in the original code:
+            // the "offset[bid]" value computed below later addresses knodes, and values saved into knodes->indices
+            // by the main function can lie outside the bounds of knodes, which causes a segmentation fault
+            if (knodesD[currKnodeD[query]].indices[slot] < knodes_elem) {
+                offsetD[query] = knodesD[currKnodeD[query]].indices[slot];
+            }
+        }
+        if (knodesD[lastKnodeD[query]].keys[slot] <= endD[query] && knodesD[lastKnodeD[query]].keys[slot + 1] > endD[query]) {
+            // same guard as above for the end key:
+            // the "offset_2[bid]" value computed below later addresses knodes, so indices that are
+            // out of bounds for knodes are not followed
+            if (knodesD[lastKnodeD[query]].indices[slot] < knodes_elem) {
+                offset_2D[query] = knodesD[lastKnodeD[query]].indices[slot];
+            }
+        }
+        __syncthreads();
+
+        // thread 0 publishes both selected children as the nodes for the next level
+        if (slot == 0) {
+            currKnodeD[query] = offsetD[query];
+            lastKnodeD[query] = offset_2D[query];
+        }
+        __syncthreads();
+    }
+
+    // Find the index of the starting record
+    if (knodesD[currKnodeD[query]].keys[slot] == startD[query]) {
+        RecstartD[query] = knodesD[currKnodeD[query]].indices[slot];
+    }
+    __syncthreads();
+
+    // Find the index of the ending record; RecstartD[query] is read as it stands after the barrier above
+    if (knodesD[lastKnodeD[query]].keys[slot] == endD[query]) {
+        ReclenD[query] = knodesD[lastKnodeD[query]].indices[slot] - RecstartD[query] + 1;
+    }
+
+}
+
+// NOTE(review): RecstartD[query] / ReclenD[query] keep their incoming values when the start / end key is not
+// present in the final node, and ReclenD is then computed from whatever RecstartD[query] already held.
+// End of findRangeK.
diff --git a/examples/btree/kernel/kernel_gpu_cuda_wrapper.cu b/examples/btree/kernel/kernel_gpu_cuda_wrapper.cu
new file mode 100755
index 0000000..361f9bb
--- /dev/null
+++ b/examples/btree/kernel/kernel_gpu_cuda_wrapper.cu
@@ -0,0 +1,292 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//======================================================================
+// DEFINE/INCLUDE
+//======================================================================
+
+//======================================================================
+// COMMON
+//======================================================================
+
+#include "../common.h" // (in main program directory) needed to recognize input variables
+
+//======================================================================
+// UTILITIES
+//======================================================================
+
+#include "../util/cuda/cuda.h" // (in path specified to compiler) needed for device functions
+#include "../util/timer/timer.h" // (in path specified to compiler) needed by timer
+
+//======================================================================
+// KERNEL
+//======================================================================
+
+#include "./kernel_gpu_cuda.cu" // (in current directory) GPU kernel, cannot include with header file because of complications with passing of constant memory variables
+
+//======================================================================
+// HEADER
+//======================================================================
+
+#include "./kernel_gpu_cuda_wrapper.h" // (in current directory)
+
+// kernel_gpu_cuda_wrapper: host-side driver for the findK kernel. Allocates device buffers for the records, the
+// tree nodes and the per-query arrays, copies the inputs in, launches count blocks of min(order, 1024) threads,
+// copies ans back to the host, frees the device buffers and prints the time spent in each stage.
+
+void
+kernel_gpu_cuda_wrapper(record *records,    // host records (records_mem bytes are copied to the device)
+                        long records_mem,   // size of records in bytes
+                        knode *knodes,      // host tree nodes (knodes_mem bytes are copied to the device)
+                        long knodes_elem,   // passed through to findK as its child-index bound
+                        long knodes_mem,    // size of knodes in bytes
+
+                        int order,          // threads per block, capped at 1024
+                        long maxheight,     // passed to findK as the number of tree levels
+                        int count,          // number of queries; also the number of blocks launched
+
+                        long *currKnode,    // host array of count entries, copied to the device
+                        long *offset,       // host array of count entries, copied to the device
+                        int *keys,          // host array of count keys to look up
+                        record *ans)        // host array of count entries: copied in, then overwritten with the device results
+{
+
+    // Timestamps from get_time() are taken at every stage boundary. The report at the end divides
+    // their differences by 1000000 to print seconds, so get_time() presumably counts
+    // microseconds - TODO confirm in util/timer.
+
+    // timer
+    long long time0;
+    long long time1;
+    long long time2;
+    long long time3;
+    long long time4;
+    long long time5;
+    long long time6;
+
+    time0 = get_time();
+
+    //======================================================================
+    // GPU SETUP
+    //======================================================================
+
+    // Synchronize once up front; the time up to time1 is reported below as
+    // "SET DEVICE / DRIVER INIT". cudaThreadSynchronize() is deprecated in the CUDA runtime in
+    // favour of cudaDeviceSynchronize(); the call is left unchanged here.
+
+    cudaThreadSynchronize();
+
+    // Launch geometry: one block per query and min(order, 1024) threads per block.
+    // NOTE(review): count is not checked against the device's grid-size limit and order is
+    // not checked against the size of knode::keys - confirm the callers guarantee both.
+
+    int numBlocks;
+    numBlocks = count; // max # of blocks can be 65,535
+    int threadsPerBlock;
+    threadsPerBlock = order < 1024 ? order : 1024;
+
+    printf("# of blocks = %d, # of threads/block = %d (ensure that device can handle)\n", numBlocks, threadsPerBlock);
+
+    time1 = get_time();
+
+    //======================================================================
+    // GPU MEMORY (MALLOC)
+    //======================================================================
+
+    //==================================================
+    // DEVICE IN
+    //==================================================
+
+    //==============================
+    // recordsD
+    //==============================
+
+    record *recordsD;
+    cudaMalloc((void**)&recordsD, records_mem);
+    checkCUDAError("cudaMalloc recordsD");
+
+    //==============================
+    // knodesD (the error label below now names this buffer instead of recordsD)
+    //==============================
+
+    knode *knodesD;
+    cudaMalloc((void**)&knodesD, knodes_mem);
+    checkCUDAError("cudaMalloc knodesD");
+
+    //==============================
+    // currKnodeD
+    //==============================
+
+    long *currKnodeD;
+    cudaMalloc((void**)&currKnodeD, count*sizeof(long));
+    checkCUDAError("cudaMalloc currKnodeD");
+
+    //==============================
+    // offsetD
+    //==============================
+
+    long *offsetD;
+    cudaMalloc((void**)&offsetD, count*sizeof(long));
+    checkCUDAError("cudaMalloc offsetD");
+
+    //==============================
+    // keysD
+    //==============================
+
+    int *keysD;
+    cudaMalloc((void**)&keysD, count*sizeof(int));
+    checkCUDAError("cudaMalloc keysD");
+
+    //==================================================
+    // DEVICE IN/OUT
+    //==================================================
+
+    //==============================
+    // ansD
+    //==============================
+
+    record *ansD;
+    cudaMalloc((void**)&ansD, count*sizeof(record));
+    checkCUDAError("cudaMalloc ansD");
+
+    time2 = get_time();
+
+    //======================================================================
+    // GPU MEMORY COPY
+    //======================================================================
+
+    //==================================================
+    // COPY IN (host to device)
+    //==================================================
+
+    //==============================
+    // recordsD (the error label below now names the buffer; it used to say "memD")
+    //==============================
+
+    cudaMemcpy(recordsD, records, records_mem, cudaMemcpyHostToDevice);
+    checkCUDAError("cudaMemcpy recordsD");
+
+    //==============================
+    // knodesD (the error label below now names the buffer; it used to say "memD")
+    //==============================
+
+    cudaMemcpy(knodesD, knodes, knodes_mem, cudaMemcpyHostToDevice);
+    checkCUDAError("cudaMemcpy knodesD");
+
+    //==============================
+    // currKnodeD
+    //==============================
+
+    cudaMemcpy(currKnodeD, currKnode, count*sizeof(long), cudaMemcpyHostToDevice);
+    checkCUDAError("cudaMalloc cudaMemcpy currKnodeD");
+
+    //==============================
+    // offsetD
+    //==============================
+
+    cudaMemcpy(offsetD, offset, count*sizeof(long), cudaMemcpyHostToDevice);
+    checkCUDAError("cudaMalloc cudaMemcpy offsetD");
+
+    //==============================
+    // keysD
+    //==============================
+
+    cudaMemcpy(keysD, keys, count*sizeof(int), cudaMemcpyHostToDevice);
+    checkCUDAError("cudaMalloc cudaMemcpy keysD");
+
+    //==================================================
+    // DEVICE IN/OUT
+    //==================================================
+
+    //==============================
+    // ansD: copied in as well, because findK only writes an entry when its key is found
+    //==============================
+
+    cudaMemcpy(ansD, ans, count*sizeof(record), cudaMemcpyHostToDevice);
+    checkCUDAError("cudaMalloc cudaMemcpy ansD");
+
+    time3 = get_time();
+
+    // Launch findK on numBlocks blocks of threadsPerBlock threads. A kernel launch needs the
+    // execution configuration <<<grid, block>>>; an empty "<<>>" is not valid CUDA.
+    // The synchronize after the launch makes time4 cover the kernel's execution.
+
+    findK<<<numBlocks, threadsPerBlock>>>(  maxheight,
+
+                                            knodesD,
+                                            knodes_elem,
+
+                                            recordsD,
+
+                                            currKnodeD,
+                                            offsetD,
+                                            keysD,
+                                            ansD);
+    cudaThreadSynchronize();
+    checkCUDAError("findK");
+
+    time4 = get_time();
+
+    //======================================================================
+    // GPU MEMORY COPY (CONTD.)
+    //======================================================================
+
+    //==================================================
+    // COPY OUT (device to host)
+    //==================================================
+
+    //==============================
+    // ansD
+    //==============================
+
+    cudaMemcpy(ans, ansD, count*sizeof(record), cudaMemcpyDeviceToHost);
+    checkCUDAError("cudaMemcpy ansD");
+
+    time5 = get_time();
+
+    //======================================================================
+    // GPU MEMORY DEALLOCATION
+    //======================================================================
+
+    cudaFree(recordsD);
+    cudaFree(knodesD);
+
+    cudaFree(currKnodeD);
+    cudaFree(offsetD);
+    cudaFree(keysD);
+    cudaFree(ansD);
+
+    time6 = get_time();
+
+    // Report, for every stage, the elapsed seconds and its share of the total run.
+    // A literal percent sign in a printf format must be written "%%"; the earlier "% :" was not a
+    // valid conversion specification.
+
+    printf("Time spent in different stages of GPU_CUDA KERNEL:\n");
+
+    printf("%15.12f s, %15.12f %% : GPU: SET DEVICE / DRIVER INIT\n", (float) (time1-time0) / 1000000, (float) (time1-time0) / (float) (time6-time0) * 100);
+    printf("%15.12f s, %15.12f %% : GPU MEM: ALO\n", (float) (time2-time1) / 1000000, (float) (time2-time1) / (float) (time6-time0) * 100);
+    printf("%15.12f s, %15.12f %% : GPU MEM: COPY IN\n", (float) (time3-time2) / 1000000, (float) (time3-time2) / (float) (time6-time0) * 100);
+
+    printf("%15.12f s, %15.12f %% : GPU: KERNEL\n", (float) (time4-time3) / 1000000, (float) (time4-time3) / (float) (time6-time0) * 100);
+
+    printf("%15.12f s, %15.12f %% : GPU MEM: COPY OUT\n", (float) (time5-time4) / 1000000, (float) (time5-time4) / (float) (time6-time0) * 100);
+    printf("%15.12f s, %15.12f %% : GPU MEM: FRE\n", (float) (time6-time5) / 1000000, (float) (time6-time5) / (float) (time6-time0) * 100);
+
+    printf("Total time:\n");
+    printf("%.12f s\n", (float) (time6-time0) / 1000000);
+
+//======================================================================
+// End of kernel_gpu_cuda_wrapper
+//======================================================================
+
+}
+
+//======================================================================
+// END
+//======================================================================
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/btree/kernel/kernel_gpu_cuda_wrapper.h b/examples/btree/kernel/kernel_gpu_cuda_wrapper.h
new file mode 100644
index 0000000..b27c428
--- /dev/null
+++ b/examples/btree/kernel/kernel_gpu_cuda_wrapper.h
@@ -0,0 +1,23 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//========================================================================================================================================================================================================200
+// KERNEL_GPU_CUDA_WRAPPER HEADER
+//========================================================================================================================================================================================================200 + +void kernel_gpu_cuda_wrapper(record *records, long records_mem, knode *knodes, + long knodes_elem, long knodes_mem, + + int order, long maxheight, int count, + + long *currKnode, long *offset, int *keys, + record *ans); + +//========================================================================================================================================================================================================200 +// End +//========================================================================================================================================================================================================200 + +#ifdef __cplusplus +} +#endif diff --git a/examples/btree/kernel/kernel_gpu_cuda_wrapper_2.cu b/examples/btree/kernel/kernel_gpu_cuda_wrapper_2.cu new file mode 100755 index 0000000..baa6f11 --- /dev/null +++ b/examples/btree/kernel/kernel_gpu_cuda_wrapper_2.cu @@ -0,0 +1,347 @@ +#ifdef __cplusplus +extern "C" { +#endif + +//========================================================================================================================================================================================================200 +// INCLUDE +//========================================================================================================================================================================================================200 + +//======================================================================================================================================================150 +// COMMON +//======================================================================================================================================================150 + +#include "../common.h" // (in the main program folder) needed 
to recognized input parameters + +//======================================================================================================================================================150 +// UTILITIES +//======================================================================================================================================================150 + +#include "../util/cuda/cuda.h" // (in library path specified to compiler) needed by for device functions +#include "../util/timer/timer.h" // (in library path specified to compiler) needed by timer + +//======================================================================================================================================================150 +// KERNEL +//======================================================================================================================================================150 + +#include "./kernel_gpu_cuda_2.cu" // (in the current directory) GPU kernel, cannot include with header file because of complications with passing of constant memory variables + +//======================================================================================================================================================150 +// HEADER +//======================================================================================================================================================150 + +#include "./kernel_gpu_cuda_wrapper_2.h" // (in the current directory) + +//========================================================================================================================================================================================================200 +// FUNCTION +//========================================================================================================================================================================================================200 + +void +kernel_gpu_cuda_wrapper_2( knode *knodes, + long knodes_elem, + long knodes_mem, + + int 
order, + long maxheight, + int count, + + long *currKnode, + long *offset, + long *lastKnode, + long *offset_2, + int *start, + int *end, + int *recstart, + int *reclength) +{ + + //======================================================================================================================================================150 + // CPU VARIABLES + //======================================================================================================================================================150 + + // timer + long long time0; + long long time1; + long long time2; + long long time3; + long long time4; + long long time5; + long long time6; + + time0 = get_time(); + + //======================================================================================================================================================150 + // GPU SETUP + //======================================================================================================================================================150 + + //====================================================================================================100 + // INITIAL DRIVER OVERHEAD + //====================================================================================================100 + + cudaThreadSynchronize(); + + //====================================================================================================100 + // EXECUTION PARAMETERS + //====================================================================================================100 + + int numBlocks; + numBlocks = count; + int threadsPerBlock; + threadsPerBlock = order < 1024 ? 
order : 1024; + + printf("# of blocks = %d, # of threads/block = %d (ensure that device can handle)\n", numBlocks, threadsPerBlock); + + time1 = get_time(); + + //======================================================================================================================================================150 + // GPU MEMORY MALLOC + //======================================================================================================================================================150 + + //====================================================================================================100 + // DEVICE IN + //====================================================================================================100 + + //==================================================50 + // knodesD + //==================================================50 + + knode *knodesD; + cudaMalloc((void**)&knodesD, knodes_mem); + checkCUDAError("cudaMalloc recordsD"); + + //==================================================50 + // currKnodeD + //==================================================50 + + long *currKnodeD; + cudaMalloc((void**)&currKnodeD, count*sizeof(long)); + checkCUDAError("cudaMalloc currKnodeD"); + + //==================================================50 + // offsetD + //==================================================50 + + long *offsetD; + cudaMalloc((void**)&offsetD, count*sizeof(long)); + checkCUDAError("cudaMalloc offsetD"); + + //==================================================50 + // lastKnodeD + //==================================================50 + + long *lastKnodeD; + cudaMalloc((void**)&lastKnodeD, count*sizeof(long)); + checkCUDAError("cudaMalloc lastKnodeD"); + + //==================================================50 + // offset_2D + //==================================================50 + + long *offset_2D; + cudaMalloc((void**)&offset_2D, count*sizeof(long)); + checkCUDAError("cudaMalloc offset_2D"); + + 
//==================================================50 + // startD + //==================================================50 + + int *startD; + cudaMalloc((void**)&startD, count*sizeof(int)); + checkCUDAError("cudaMalloc startD"); + + //==================================================50 + // endD + //==================================================50 + + int *endD; + cudaMalloc((void**)&endD, count*sizeof(int)); + checkCUDAError("cudaMalloc endD"); + + //====================================================================================================100 + // DEVICE IN/OUT + //====================================================================================================100 + + //==================================================50 + // ansDStart + //==================================================50 + + int *ansDStart; + cudaMalloc((void**)&ansDStart, count*sizeof(int)); + checkCUDAError("cudaMalloc ansDStart"); + + //==================================================50 + // ansDLength + //==================================================50 + + int *ansDLength; + cudaMalloc((void**)&ansDLength, count*sizeof(int)); + checkCUDAError("cudaMalloc ansDLength"); + + time2 = get_time(); + + //======================================================================================================================================================150 + // GPU MEMORY COPY + //======================================================================================================================================================150 + + //====================================================================================================100 + // DEVICE IN + //====================================================================================================100 + + //==================================================50 + // knodesD + //==================================================50 + + cudaMemcpy(knodesD, knodes, knodes_mem, cudaMemcpyHostToDevice); + 
checkCUDAError("cudaMalloc cudaMemcpy memD"); + + //==================================================50 + // currKnodeD + //==================================================50 + + cudaMemcpy(currKnodeD, currKnode, count*sizeof(long), cudaMemcpyHostToDevice); + checkCUDAError("cudaMalloc cudaMemcpy currKnodeD"); + + //==================================================50 + // offsetD + //==================================================50 + + cudaMemcpy(offsetD, offset, count*sizeof(long), cudaMemcpyHostToDevice); + checkCUDAError("cudaMalloc cudaMemcpy offsetD"); + + //==================================================50 + // lastKnodeD + //==================================================50 + + cudaMemcpy(lastKnodeD, lastKnode, count*sizeof(long), cudaMemcpyHostToDevice); + checkCUDAError("cudaMalloc cudaMemcpy lastKnodeD"); + + //==================================================50 + // offset_2D + //==================================================50 + + cudaMemcpy(offset_2D, offset_2, count*sizeof(long), cudaMemcpyHostToDevice); + checkCUDAError("cudaMalloc cudaMemcpy offset_2D"); + + //==================================================50 + // startD + //==================================================50 + + cudaMemcpy(startD, start, count*sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy startD"); + + //==================================================50 + // endD + //==================================================50 + + cudaMemcpy(endD, end, count*sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy endD"); + + //====================================================================================================100 + // DEVICE IN/OUT + //====================================================================================================100 + + //==================================================50 + // ansDStart + //==================================================50 + + cudaMemcpy(ansDStart, recstart, 
count*sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy ansDStart"); + + //==================================================50 + // ansDLength + //==================================================50 + + cudaMemcpy(ansDLength, reclength, count*sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy ansDLength"); + + time3 = get_time(); + + //======================================================================================================================================================150 + // KERNEL + //======================================================================================================================================================150 + + // [GPU] findRangeK kernel + findRangeK<<>>( maxheight, + knodesD, + knodes_elem, + + currKnodeD, + offsetD, + lastKnodeD, + offset_2D, + startD, + endD, + ansDStart, + ansDLength); + cudaThreadSynchronize(); + checkCUDAError("findRangeK"); + + time4 = get_time(); + + //======================================================================================================================================================150 + // GPU MEMORY COPY (CONTD.) 
+ //======================================================================================================================================================150 + + //====================================================================================================100 + // DEVICE IN/OUT + //====================================================================================================100 + + //==================================================50 + // ansDStart + //==================================================50 + + cudaMemcpy(recstart, ansDStart, count*sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy ansDStart"); + + //==================================================50 + // ansDLength + //==================================================50 + + cudaMemcpy(reclength, ansDLength, count*sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy ansDLength"); + + time5 = get_time(); + + //======================================================================================================================================================150 + // GPU MEMORY DEALLOCATION + //======================================================================================================================================================150 + + cudaFree(knodesD); + + cudaFree(currKnodeD); + cudaFree(offsetD); + cudaFree(lastKnodeD); + cudaFree(offset_2D); + cudaFree(startD); + cudaFree(endD); + cudaFree(ansDStart); + cudaFree(ansDLength); + + time6 = get_time(); + + //======================================================================================================================================================150 + // DISPLAY TIMING + //======================================================================================================================================================150 + + printf("Time spent in different stages of GPU_CUDA KERNEL:\n"); + + printf("%15.12f s, %15.12f % : GPU: SET DEVICE / DRIVER INIT\n", (float) (time1-time0) 
/ 1000000, (float) (time1-time0) / (float) (time6-time0) * 100); + printf("%15.12f s, %15.12f % : GPU MEM: ALO\n", (float) (time2-time1) / 1000000, (float) (time2-time1) / (float) (time6-time0) * 100); + printf("%15.12f s, %15.12f % : GPU MEM: COPY IN\n", (float) (time3-time2) / 1000000, (float) (time3-time2) / (float) (time6-time0) * 100); + + printf("%15.12f s, %15.12f % : GPU: KERNEL\n", (float) (time4-time3) / 1000000, (float) (time4-time3) / (float) (time6-time0) * 100); + + printf("%15.12f s, %15.12f % : GPU MEM: COPY OUT\n", (float) (time5-time4) / 1000000, (float) (time5-time4) / (float) (time6-time0) * 100); + printf("%15.12f s, %15.12f % : GPU MEM: FRE\n", (float) (time6-time5) / 1000000, (float) (time6-time5) / (float) (time6-time0) * 100); + + printf("Total time:\n"); + printf("%.12f s\n", (float) (time6-time0) / 1000000); + +} + +//========================================================================================================================================================================================================200 +// END +//========================================================================================================================================================================================================200 + +#ifdef __cplusplus +} +#endif diff --git a/examples/btree/kernel/kernel_gpu_cuda_wrapper_2.h b/examples/btree/kernel/kernel_gpu_cuda_wrapper_2.h new file mode 100644 index 0000000..43b07ae --- /dev/null +++ b/examples/btree/kernel/kernel_gpu_cuda_wrapper_2.h @@ -0,0 +1,23 @@ +#ifdef __cplusplus +extern "C" { +#endif + +//========================================================================================================================================================================================================200 +// KERNEL_GPU_CUDA_WRAPPER HEADER 
+//========================================================================================================================================================================================================200
+// Host-side wrapper around the findRangeK kernel: uploads knodes (knodes_mem bytes) and the per-entry arrays, launches, then downloads recstart/reclength and prints stage timings.
+void kernel_gpu_cuda_wrapper_2(knode *knodes, long knodes_elem, long knodes_mem,
+// order caps threads per block at min(order, 1024); maxheight and knodes_elem are forwarded to the kernel; count is used as the block count and as every array length.
+                               int order, long maxheight, int count,
+// Host arrays read as count elements each; recstart/reclength are copied to the device before the launch and overwritten with its results afterwards.
+                               long *currKnode, long *offset, long *lastKnode,
+                               long *offset_2, int *start, int *end,
+                               int *recstart, int *reclength);
+// NOTE(review): behavior above is described from the wrapper definition added alongside this header; confirm that definition is this symbol's implementation.
+//========================================================================================================================================================================================================200
+// End
+//========================================================================================================================================================================================================200
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/btree/kernel_gpu_cuda_wrapper-cuda-nvptx64-nvidia-cuda-sm_61.ll b/examples/btree/kernel_gpu_cuda_wrapper-cuda-nvptx64-nvidia-cuda-sm_61.ll
new file mode 100644
index 0000000..7979fd9
--- /dev/null
+++ b/examples/btree/kernel_gpu_cuda_wrapper-cuda-nvptx64-nvidia-cuda-sm_61.ll
@@ -0,0 +1,332 @@
+; ModuleID = 'kernel_gpu_cuda_wrapper-cuda-nvptx64-nvidia-cuda-sm_61.bc'
+source_filename = "kernel/kernel_gpu_cuda_wrapper.cu"
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+%struct.__cuda_builtin_threadIdx_t = type { i8 }
+%struct.__cuda_builtin_blockIdx_t = type { i8 }
+%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
+%struct.knode = type { i32, [257 x i32], [257 x i32], i8, i32 }
+%struct.record = type { i32 }
+
+$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
+
+$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
+
+@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
+@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1 + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 { +entry: + %p.addr = alloca i8**, align 8 + %s.addr = alloca i64, align 8 + store i8** %p, i8*** %p.addr, align 8 + store i64 %s, i64* %s.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 { +entry: + %p.addr = alloca %struct.cudaFuncAttributes*, align 8 + %c.addr = alloca i8*, align 8 + store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8 + store i8* %c, i8** %c.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 { +entry: + %value.addr = alloca i32*, align 8 + %attr.addr = alloca i32, align 4 + %device.addr = alloca i32, align 4 + store i32* %value, i32** %value.addr, align 8 + store i32 %attr, i32* %attr.addr, align 4 + store i32 %device, i32* %device.addr, align 4 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaGetDevice(i32* %device) #0 { +entry: + %device.addr = alloca i32*, align 8 + store i32* %device, i32** %device.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 { +entry: + %numBlocks.addr = alloca i32*, align 8 + %func.addr = alloca i8*, align 8 + %blockSize.addr = alloca i32, align 4 + %dynamicSmemSize.addr = alloca i64, align 8 + store i32* %numBlocks, i32** %numBlocks.addr, align 8 + store i8* %func, i8** %func.addr, align 8 + store i32 %blockSize, i32* %blockSize.addr, align 4 + store i64 %dynamicSmemSize, i64* 
%dynamicSmemSize.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 { +entry: + %numBlocks.addr = alloca i32*, align 8 + %func.addr = alloca i8*, align 8 + %blockSize.addr = alloca i32, align 4 + %dynamicSmemSize.addr = alloca i64, align 8 + %flags.addr = alloca i32, align 4 + store i32* %numBlocks, i32** %numBlocks.addr, align 8 + store i8* %func, i8** %func.addr, align 8 + store i32 %blockSize, i32* %blockSize.addr, align 4 + store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8 + store i32 %flags, i32* %flags.addr, align 4 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define dso_local void @findK(i64 %height, %struct.knode* %knodesD, i64 %knodes_elem, %struct.record* %recordsD, i64* %currKnodeD, i64* %offsetD, i32* %keysD, %struct.record* %ansD) #0 { +entry: + %height.addr = alloca i64, align 8 + %knodesD.addr = alloca %struct.knode*, align 8 + %knodes_elem.addr = alloca i64, align 8 + %recordsD.addr = alloca %struct.record*, align 8 + %currKnodeD.addr = alloca i64*, align 8 + %offsetD.addr = alloca i64*, align 8 + %keysD.addr = alloca i32*, align 8 + %ansD.addr = alloca %struct.record*, align 8 + %thid = alloca i32, align 4 + %bid = alloca i32, align 4 + %i = alloca i32, align 4 + store i64 %height, i64* %height.addr, align 8 + store %struct.knode* %knodesD, %struct.knode** %knodesD.addr, align 8 + store i64 %knodes_elem, i64* %knodes_elem.addr, align 8 + store %struct.record* %recordsD, %struct.record** %recordsD.addr, align 8 + store i64* %currKnodeD, i64** %currKnodeD.addr, align 8 + store i64* %offsetD, i64** %offsetD.addr, align 8 + store i32* %keysD, i32** %keysD.addr, align 8 + store %struct.record* %ansD, %struct.record** %ansD.addr, align 8 + %call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() 
#2 + store i32 %call, i32* %thid, align 4 + %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2 + store i32 %call1, i32* %bid, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %conv = sext i32 %0 to i64 + %1 = load i64, i64* %height.addr, align 8 + %cmp = icmp slt i64 %conv, %1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %2 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8 + %3 = load i64*, i64** %currKnodeD.addr, align 8 + %4 = load i32, i32* %bid, align 4 + %idxprom = sext i32 %4 to i64 + %arrayidx = getelementptr inbounds i64, i64* %3, i64 %idxprom + %5 = load i64, i64* %arrayidx, align 8 + %arrayidx2 = getelementptr inbounds %struct.knode, %struct.knode* %2, i64 %5 + %keys = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx2, i32 0, i32 2 + %6 = load i32, i32* %thid, align 4 + %idxprom3 = sext i32 %6 to i64 + %arrayidx4 = getelementptr inbounds [257 x i32], [257 x i32]* %keys, i64 0, i64 %idxprom3 + %7 = load i32, i32* %arrayidx4, align 4 + %8 = load i32*, i32** %keysD.addr, align 8 + %9 = load i32, i32* %bid, align 4 + %idxprom5 = sext i32 %9 to i64 + %arrayidx6 = getelementptr inbounds i32, i32* %8, i64 %idxprom5 + %10 = load i32, i32* %arrayidx6, align 4 + %cmp7 = icmp sle i32 %7, %10 + br i1 %cmp7, label %land.lhs.true, label %if.end34 + +land.lhs.true: ; preds = %for.body + %11 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8 + %12 = load i64*, i64** %currKnodeD.addr, align 8 + %13 = load i32, i32* %bid, align 4 + %idxprom8 = sext i32 %13 to i64 + %arrayidx9 = getelementptr inbounds i64, i64* %12, i64 %idxprom8 + %14 = load i64, i64* %arrayidx9, align 8 + %arrayidx10 = getelementptr inbounds %struct.knode, %struct.knode* %11, i64 %14 + %keys11 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx10, i32 0, i32 2 + %15 = load i32, i32* %thid, align 4 + %add = add 
nsw i32 %15, 1 + %idxprom12 = sext i32 %add to i64 + %arrayidx13 = getelementptr inbounds [257 x i32], [257 x i32]* %keys11, i64 0, i64 %idxprom12 + %16 = load i32, i32* %arrayidx13, align 4 + %17 = load i32*, i32** %keysD.addr, align 8 + %18 = load i32, i32* %bid, align 4 + %idxprom14 = sext i32 %18 to i64 + %arrayidx15 = getelementptr inbounds i32, i32* %17, i64 %idxprom14 + %19 = load i32, i32* %arrayidx15, align 4 + %cmp16 = icmp sgt i32 %16, %19 + br i1 %cmp16, label %if.then, label %if.end34 + +if.then: ; preds = %land.lhs.true + %20 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8 + %21 = load i64*, i64** %offsetD.addr, align 8 + %22 = load i32, i32* %bid, align 4 + %idxprom17 = sext i32 %22 to i64 + %arrayidx18 = getelementptr inbounds i64, i64* %21, i64 %idxprom17 + %23 = load i64, i64* %arrayidx18, align 8 + %arrayidx19 = getelementptr inbounds %struct.knode, %struct.knode* %20, i64 %23 + %indices = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx19, i32 0, i32 1 + %24 = load i32, i32* %thid, align 4 + %idxprom20 = sext i32 %24 to i64 + %arrayidx21 = getelementptr inbounds [257 x i32], [257 x i32]* %indices, i64 0, i64 %idxprom20 + %25 = load i32, i32* %arrayidx21, align 4 + %conv22 = sext i32 %25 to i64 + %26 = load i64, i64* %knodes_elem.addr, align 8 + %cmp23 = icmp slt i64 %conv22, %26 + br i1 %cmp23, label %if.then24, label %if.end + +if.then24: ; preds = %if.then + %27 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8 + %28 = load i64*, i64** %offsetD.addr, align 8 + %29 = load i32, i32* %bid, align 4 + %idxprom25 = sext i32 %29 to i64 + %arrayidx26 = getelementptr inbounds i64, i64* %28, i64 %idxprom25 + %30 = load i64, i64* %arrayidx26, align 8 + %arrayidx27 = getelementptr inbounds %struct.knode, %struct.knode* %27, i64 %30 + %indices28 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx27, i32 0, i32 1 + %31 = load i32, i32* %thid, align 4 + %idxprom29 = sext i32 %31 to i64 + %arrayidx30 = 
getelementptr inbounds [257 x i32], [257 x i32]* %indices28, i64 0, i64 %idxprom29 + %32 = load i32, i32* %arrayidx30, align 4 + %conv31 = sext i32 %32 to i64 + %33 = load i64*, i64** %offsetD.addr, align 8 + %34 = load i32, i32* %bid, align 4 + %idxprom32 = sext i32 %34 to i64 + %arrayidx33 = getelementptr inbounds i64, i64* %33, i64 %idxprom32 + store i64 %conv31, i64* %arrayidx33, align 8 + br label %if.end + +if.end: ; preds = %if.then24, %if.then + br label %if.end34 + +if.end34: ; preds = %if.end, %land.lhs.true, %for.body + call void @llvm.nvvm.barrier0() + %35 = load i32, i32* %thid, align 4 + %cmp35 = icmp eq i32 %35, 0 + br i1 %cmp35, label %if.then36, label %if.end41 + +if.then36: ; preds = %if.end34 + %36 = load i64*, i64** %offsetD.addr, align 8 + %37 = load i32, i32* %bid, align 4 + %idxprom37 = sext i32 %37 to i64 + %arrayidx38 = getelementptr inbounds i64, i64* %36, i64 %idxprom37 + %38 = load i64, i64* %arrayidx38, align 8 + %39 = load i64*, i64** %currKnodeD.addr, align 8 + %40 = load i32, i32* %bid, align 4 + %idxprom39 = sext i32 %40 to i64 + %arrayidx40 = getelementptr inbounds i64, i64* %39, i64 %idxprom39 + store i64 %38, i64* %arrayidx40, align 8 + br label %if.end41 + +if.end41: ; preds = %if.then36, %if.end34 + call void @llvm.nvvm.barrier0() + br label %for.inc + +for.inc: ; preds = %if.end41 + %41 = load i32, i32* %i, align 4 + %inc = add nsw i32 %41, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %42 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8 + %43 = load i64*, i64** %currKnodeD.addr, align 8 + %44 = load i32, i32* %bid, align 4 + %idxprom42 = sext i32 %44 to i64 + %arrayidx43 = getelementptr inbounds i64, i64* %43, i64 %idxprom42 + %45 = load i64, i64* %arrayidx43, align 8 + %arrayidx44 = getelementptr inbounds %struct.knode, %struct.knode* %42, i64 %45 + %keys45 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx44, i32 0, i32 2 + %46 = load i32, i32* %thid, 
align 4 + %idxprom46 = sext i32 %46 to i64 + %arrayidx47 = getelementptr inbounds [257 x i32], [257 x i32]* %keys45, i64 0, i64 %idxprom46 + %47 = load i32, i32* %arrayidx47, align 4 + %48 = load i32*, i32** %keysD.addr, align 8 + %49 = load i32, i32* %bid, align 4 + %idxprom48 = sext i32 %49 to i64 + %arrayidx49 = getelementptr inbounds i32, i32* %48, i64 %idxprom48 + %50 = load i32, i32* %arrayidx49, align 4 + %cmp50 = icmp eq i32 %47, %50 + br i1 %cmp50, label %if.then51, label %if.end63 + +if.then51: ; preds = %for.end + %51 = load %struct.record*, %struct.record** %recordsD.addr, align 8 + %52 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8 + %53 = load i64*, i64** %currKnodeD.addr, align 8 + %54 = load i32, i32* %bid, align 4 + %idxprom52 = sext i32 %54 to i64 + %arrayidx53 = getelementptr inbounds i64, i64* %53, i64 %idxprom52 + %55 = load i64, i64* %arrayidx53, align 8 + %arrayidx54 = getelementptr inbounds %struct.knode, %struct.knode* %52, i64 %55 + %indices55 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx54, i32 0, i32 1 + %56 = load i32, i32* %thid, align 4 + %idxprom56 = sext i32 %56 to i64 + %arrayidx57 = getelementptr inbounds [257 x i32], [257 x i32]* %indices55, i64 0, i64 %idxprom56 + %57 = load i32, i32* %arrayidx57, align 4 + %idxprom58 = sext i32 %57 to i64 + %arrayidx59 = getelementptr inbounds %struct.record, %struct.record* %51, i64 %idxprom58 + %value = getelementptr inbounds %struct.record, %struct.record* %arrayidx59, i32 0, i32 0 + %58 = load i32, i32* %value, align 4 + %59 = load %struct.record*, %struct.record** %ansD.addr, align 8 + %60 = load i32, i32* %bid, align 4 + %idxprom60 = sext i32 %60 to i64 + %arrayidx61 = getelementptr inbounds %struct.record, %struct.record* %59, i64 %idxprom60 + %value62 = getelementptr inbounds %struct.record, %struct.record* %arrayidx61, i32 0, i32 0 + store i32 %58, i32* %value62, align 4 + br label %if.end63 + +if.end63: ; preds = %if.then51, %for.end + ret void +} + 
+; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + ret i32 %0 +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() + ret i32 %0 +} + +; Function Attrs: convergent nounwind +declare void @llvm.nvvm.barrier0() #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #3 + +attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nounwind } +attributes #3 = { nounwind readnone } + +!llvm.module.flags = !{!0, !1, !2} +!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6} +!llvm.ident = !{!8} +!nvvmir.version = !{!9} + +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, 
i32 1]} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0} +!3 = !{void (i64, %struct.knode*, i64, %struct.record*, i64*, i64*, i32*, %struct.record*)* @findK, !"kernel", i32 1} +!4 = !{null, !"align", i32 8} +!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!6 = !{null, !"align", i32 16} +!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"} +!9 = !{i32 1, i32 4} diff --git a/examples/btree/kernel_gpu_cuda_wrapper-host-x86_64-unknown-linux-gnu.ll b/examples/btree/kernel_gpu_cuda_wrapper-host-x86_64-unknown-linux-gnu.ll new file mode 100644 index 0000000..37c05f6 --- /dev/null +++ b/examples/btree/kernel_gpu_cuda_wrapper-host-x86_64-unknown-linux-gnu.ll @@ -0,0 +1,553 @@ +; ModuleID = 'kernel_gpu_cuda_wrapper-host-x86_64-unknown-linux-gnu.bc' +source_filename = "kernel/kernel_gpu_cuda_wrapper.cu" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.knode = type { i32, [257 x i32], [257 x i32], i8, i32 } +%struct.record = type { i32 } +%struct.dim3 = type { i32, i32, i32 } +%struct.CUstream_st = type opaque + +$_ZN4dim3C2Ejjj = comdat any + +@.str = private unnamed_addr constant [75 x i8] c"# of blocks = %d, # of threads/block = %d (ensure that device can handle)\0A\00", align 1 +@.str.1 = private unnamed_addr constant [21 x i8] c"cudaMalloc recordsD\00", align 1 +@.str.2 = private unnamed_addr constant [23 x i8] c"cudaMalloc currKnodeD\00", align 1 +@.str.3 = private unnamed_addr constant [20 x i8] c"cudaMalloc offsetD\00", align 1 +@.str.4 = private unnamed_addr constant [18 x i8] c"cudaMalloc keysD\00", align 1 +@.str.5 = private unnamed_addr constant [16 x i8] c"cudaMalloc ansD\00", align 1 +@.str.6 = private unnamed_addr constant [27 x i8] c"cudaMalloc cudaMemcpy memD\00", align 
1 +@.str.7 = private unnamed_addr constant [33 x i8] c"cudaMalloc cudaMemcpy currKnodeD\00", align 1 +@.str.8 = private unnamed_addr constant [30 x i8] c"cudaMalloc cudaMemcpy offsetD\00", align 1 +@.str.9 = private unnamed_addr constant [28 x i8] c"cudaMalloc cudaMemcpy keysD\00", align 1 +@.str.10 = private unnamed_addr constant [27 x i8] c"cudaMalloc cudaMemcpy ansD\00", align 1 +@.str.11 = private unnamed_addr constant [6 x i8] c"findK\00", align 1 +@.str.12 = private unnamed_addr constant [16 x i8] c"cudaMemcpy ansD\00", align 1 +@.str.13 = private unnamed_addr constant [52 x i8] c"Time spent in different stages of GPU_CUDA KERNEL:\0A\00", align 1 +@.str.14 = private unnamed_addr constant [54 x i8] c"%15.12f s, %15.12f % : GPU: SET DEVICE / DRIVER INIT\0A\00", align 1 +@.str.15 = private unnamed_addr constant [37 x i8] c"%15.12f s, %15.12f % : GPU MEM: ALO\0A\00", align 1 +@.str.16 = private unnamed_addr constant [41 x i8] c"%15.12f s, %15.12f % : GPU MEM: COPY IN\0A\00", align 1 +@.str.17 = private unnamed_addr constant [36 x i8] c"%15.12f s, %15.12f % : GPU: KERNEL\0A\00", align 1 +@.str.18 = private unnamed_addr constant [42 x i8] c"%15.12f s, %15.12f % : GPU MEM: COPY OUT\0A\00", align 1 +@.str.19 = private unnamed_addr constant [37 x i8] c"%15.12f s, %15.12f % : GPU MEM: FRE\0A\00", align 1 +@.str.20 = private unnamed_addr constant [13 x i8] c"Total time:\0A\00", align 1 +@.str.21 = private unnamed_addr constant [9 x i8] c"%.12f s\0A\00", align 1 +@0 = private constant [15913 x i8] 
c"P\EDU\BA\01\00\10\00\18>\00\00\00\00\00\00\02\00\01\01@\00\00\00\A83\00\00\00\00\00\00\00\00\00\00\00\00\00\00\07\00\01\00=\00\00\00\00\00\00\00\00\00\00\00\11\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\7FELF\02\01\013\07\00\00\00\00\00\00\00\02\00\BE\00e\00\00\00\00\00\00\00\00\00\00\00\003\00\00\00\00\00\00\C00\00\00\00\00\00\00=\05=\00@\008\00\03\00@\00\09\00\01\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00.text.findK\00.nv.info.findK\00.nv.shared.findK\00.nv.global\00.nv.constant0.findK\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00findK\00.text.findK\00.nv.info.findK\00.nv.shared.findK\00.nv.global\00threadIdx\00blockIdx\00.nv.constant0.findK\00_param\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\008\00\00\00\03\00\07\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00d\00\00\00\03\00\08\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00o\00\00\00\01\00\08\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00y\00\00\00\01\00\08\00\01\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\82\00\00\00\03\00\06\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\002\00\00\00\12\10\07\00\00\00\00\00\00\00\00\00@,\00\00\00\00\00\00\04/\08\00\06\00\00\00\16\00\00\00\04#\08\00\06\00\00\00\00\00\00\00\04\12\08\00\06\00\00\00P\00\00\00\04\11\08\00\06\00\00\00P\00\00\00\010\00\00\01*\00\00\04\0A\08\00\05\00\00\00@\01@\00\03\19@\00\04\17\0C\00\00\00\00\00\07\008\00\00\F0!\00\04\17\0C\00\00\00\00\00\06\000\00\00\F0!\00\04\17\0C\00\00\00\00\00\05\00(\00\00\F0!\00\04\17\0C\00\00\00\00\00\04\00 
\00\00\F0!\00\04\17\0C\00\00\00\00\00\03\00\18\00\00\F0!\00\04\17\0C\00\00\00\00\00\02\00\10\00\00\F0!\00\04\17\0C\00\00\00\00\00\01\00\08\00\00\F0!\00\04\17\0C\00\00\00\00\00\00\00\00\00\00\F0!\00\03\1B\FF\00\04\1D\04\00\F0\07\00\00\04\1C\04\00\F8+\00\00\04\1E\04\00p\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\EF\1F\E0\FD\03\00\00\00\00\00\00p\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\008\02\00\00\00\00\00\00\B4\00\00\00\00\00\00\00\03\00\00\00\07\00\00\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00i\00\00\00\01\00\00\00\02\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\EC\02\00\00\00\00\00\00\80\01\00\00\00\00\00\00\00\00\00\00\07\00\00\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\002\00\00\00\01\00\00\00\06\00\10\00\00\00\00\00\00\00\00\00\00\00\00\00\80\04\00\00\00\00\00\00@,\00\00\00\00\00\00\03\00\
00\00\06\00\00\16 \00\00\00\00\00\00\00\00\00\00\00\00\00\00\00^\00\00\00\08\00\00\00\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\C00\00\00\00\00\00\00\02\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\06\00\00\00\05\00\00\00\003\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\A8\00\00\00\00\00\00\00\A8\00\00\00\00\00\00\00\08\00\00\00\00\00\00\00\01\00\00\00\05\00\00\00\EC\02\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\C0-\00\00\00\00\00\00\C0-\00\00\00\00\00\00\08\00\00\00\00\00\00\00\01\00\00\00\06\00\00\00\C00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\02\00\00\00\00\00\00\00\08\00\00\00\00\00\00\00\01\00\01\01H\00\00\00\E8\09\00\00\00\00\00\00\E6\09\00\00@\00\00\00\04\00\06\00=\00\00\00\00\00\00\00\00\00\00\00\11 \00\00\00\00\00\00\00\00\00\00\00\00\00\00\FB#\00\00\00\00\00\00\00\00\00\00\00\00\00\00\F0 \0A\0A\0A\0A.version 6.4\0A.target sm_61\0A.address_size 64.\00\FF\12global .align 1 .b8 threadIdx[1];#\00\03Tblock\22\00\F0\0B\0A.weak .func (.param .b32 \12\00\F5\07_retval0) cudaMalloc(\0A&\00'64\18\00\11_\16\00?_0, \00\0B\A61\0A)\0A{\0A.loc\98\00\118\98\00!__\15\00\A0_depot0[16\C1\002regI\00;%SP\0F\00\15L\10\00\8932 %r<2>!\00\D3rd<3>;\0A\0Amov.u2\00\1B,e\00b;\0Acvta\8D\00\04%\00\13,\\\00\22ld\C8\00\02\18\00nrd2, [\CE\00\1E])\00\1F1)\00\01a0];\0Ast#\00\81[%SP+0],,\00\0A\16\00\128\16\00\222;\B6\00\01\D8\00a1, 
999(\00\02g\00\00\EF\00\18[\9E\01\03M\00\AF1;\0Aret;\0A\0A}\D6\01\1A\FE\02FuncGetAttributes\E1\01\0D#\00\0E\EC\01\0F+\00\06\0F\F7\01\1B\1F1\F7\01Q\1F1\F7\01!\0E\D9\00\0F\02\02\0F\0E8\01\0F\0D\02\8DhDevice\B4\00\0E\0E\02\0E$\00\0F\0F\02\00/32,\00\0B\1F1,\00\18\1F2<\02\13\1F2<\02\1F\1D4<\02\1F2<\02\0C\1F2<\02\13\01_\00\04;\02\0F\D9\00\07\1D]4\00\1F14\00\06\0Fp\02\10\0E\9A\01\0Fq\02\12(32q\02\0B\15\00!12\16\00\09\86\02\1F3\86\02\15\1F3\86\02#2Get\CB\00\0E}\02\05\1B\00\04\DA\00\0F\1C\02\13?3[8W\04.\0F\1B\02\0D\1F3W\04\19\04\B3\01\0D\D0\00\0F\AA\01\06\0F\05\04W\F0\04OccupancyMaxActiveB3\08\FE\03sPerMultiprocessor\9F\01\0F;\00\16\0EB\06\0FC\00%\0EJ\04\0FC\00\1E\1F2\86\00/\1F3\88\02\13O4[32\89\02\1C\1D3\89\02\1F4\89\02\0C\1F4\89\02\19\133\89\02\0F\F1\00\1E\0F\BC\04\00\0FK\00$/2]w\07\00\0FL\00$\0F\1F\05\01\0F\98\00%\0F\A7\07\1D\097\05\186M\05\04,\00\2224-\00\183\CF\03\1F2\CF\03\15\1F2\CF\03L\9FWithFlags\D8\03(\05D\00\0E\E1\03\0FL\00'\0F\EA\030\0CL\00\1F2\98\008\1F3\98\008\1F4H\04\13O5[40\EC\08.\0FH\04\0D\1F5\EC\08\1C\0F\F9\00+\1F]\9C\040\0D\9A\01\0F\A5\040\0D:\02\0F\AE\041\0D\DB\02\0F\B7\041\0D|\03\0F\C0\04I\08-\00\1F3$\0A6\F6\05visible .entry findK\8E\04\00\88\00\01\13\00\0E]\04\0C\1B\00\1F1\1B\00\07\1F2\1B\00\07\1F3\1B\00\07\1F4\1B\00\07\1F5\1B\00\07\1F6\1B\00\07\1F7\EA\03\13?6[8\EA\03\16\ABpred %p<7>\FC\03-16\FD\03?115\FF\03\0C\1F6G\08\19I8, [\DE\00\0F\D1\02\00\1B7$\00\1F6$\00\00\1B6$\00\1F5$\00\00\1B5$\00\0F;\04\01\1B4$\00\1F3_\04\04\09\19\02\0F\DA\03\04\09X\02\0F\A9\03\04\09\97\02\13]:\01#to\81\13\04*\00\119>\03\138\1F\00\0A\1C\00!10\1D\00\1F9<\00\05!11 \00\1F7=\00\03\122\DB\03\1F1>\00\06\113!\00\1F6>\00\03\124>\00\1F3>\00\06\115!\00\1F5>\00\03\126>\00\1F5>\00\06\117!\00\1F4>\00\03\128>\00\1F7>\00\06\149\A0\04\0F>\00\00\1225\01/19\EA\04\19\1A0\17\00)16\D5\04\0C\EC\04*18\18\00\03\ED\04:d16\18\00\144u\00\1B4\18\00\03w\00*12\18\00\135w\00+10\DC\0Dj%tid.xb\05$64\8F\05\098\0AO%cta-\00\00\1F8\8F\05\02\1A0&\00$72\7F\05\F2\01bra.uni 
LBB6_1;\0A\08\00\10:\E8\02\11s=\00Ed21,5\00\01\00\03\04\7F\01$2,q\01\B0;\0Asetp.ge.s\1C\004p1,9\00\01(\00\A3;\0A@%p1 brag\00\1B1x\00\132x\00'2:`\00455,\BB\01\08w\00556,\8C\01\17;\A7\00557,\02\01T;\0Ashl\EE\04458, \00\833;\0Aadd.s\19\00$9,Q\00\01'\00\08f\00 60N\00\00$\00\94];\0Amul.lo7\00461,\22\00I2068S\00462,\BB\00\01*\00\08\A1\00563,\D0\01\09\A1\00464, \00\1A2N\00'5,U\00\2264\A1\00\01\D3\01\127\9F\00j65+103\98\01\146!\01\1B4\09\01%67\09\01\0Bh\00$8,9\00\01'\00\07h\00\138h\00\158\E4\01\12t\D0\002p3,\87\001%r8\E0\01\163\E0\01\1B7\DF\01\133\DF\01\183\DF\01/69\DF\01\02/70\DF\01\03/71\DF\01\04472, \00\0A\DF\01473,Q\00\01'\00\09f\00\124\D8\00-73\DF\01475,\22\00\0D\DF\01476,\BB\00\01*\00\07)\01\189\DD\01$ad\B8\00\02\D9\0529, \9D\05\00I\00\05\D2\00\01=\01*10\CE\00\02\8B\01\1D7\A4\01779,\82\00'78{\00#11\CD\00\1E9\0D\02/80\0D\02\04%817\01\0Ai\00482,9\00\01'\00\08i\00\122i\00$82\0E\02#le\0E\02#4,\89\00\00&\00\01\10\02\1F4\10\02\07\134\10\02\184\10\02/83\10\02\02584,c\05\08\10\02/85\10\02\04486, \00\0A\10\02487,Q\00\01'\00\09f\00\03\E8\02-87\10\02\138\B8\07\1F8\10\02\00490,\BB\00\01*\00\08\A1\00/91\EF\03\04\129a\01\1D9z\01793,U\00)92N\00\03\B1\02I93+4 \01595,\CA\06\0C\87\05$5,;\00\01)\00\01\97\01\165\97\01\1B6\97\01\135\97\01\185\97\01/96\97\01\02/97\97\01\03/98\97\01\04499, \00\0A\97\01D100,R\00\01(\00\08g\00#10\DC\02=100\9A\01E102,%\00\0D\9C\01D103,\C1\00\02-\00\08\A8\00?104\9F\01\04E105,\22\00\1B2S\00(6,[\00:105T\00\147\AD\0086+4\80\0D\02\C0\00\13,%\00\0Bw\01\136w\01*6:\18\00\137\18\00\D87:\0Abar.sync 
0\8E\03\0AP\06\00\FA\01\14n\8F\03#6,!\00\110\F5\01\166\F5\01\1B9~\00\138f\00)8:w\01\1F8\DF\01\03?109\E0\01\04\131\C3\0A\01\22\00\0B\E2\01\03\BE\0A#10\EF\09)10\E4\01\04W\04J111]\19\00\183\0C\06\09R\00%4,\22\00\04R\00\08n\01\141\C0\09+12\F0\00\139\F0\00\1A9V\01\0A\07\09\140\08\09'0:p\01\184\08\09\07\E3\05#5,\1E\00\1F1e\09\02/15f\09\05\181t\01\1F2\00\05\03/24\10\07\03\1F2\00\05\05\122\A4\02\1D2\00\05427,Q\00\01'\00\09f\00\03\00\05\1E2\00\05\132\83\0B\0F\00\05\01430,\BB\00\01*\00\08\A1\00\1F3\00\05\05\123\B6\03\1D3\00\05733,U\00'32\EC\02\03\FE\04.33\E2\06/34\E2\06\04%35\09\01\0Bh\00$6,9\00\01'\00\07h\00\135h\00\1D6R\03\222,\87\00\22%rH\05\172\CF\0A\0DG\0B\04\D0\0A\191\D1\0A\1434\05\1A2\C3\05/38\FA\01\02/39\FA\01\03/40\FA\01\04441, \00\0A\FA\01442,Q\00\01'\00\08f\00\2243\F3\00-42\FA\01444,\22\00\0D\FA\01445,\BB\00\01*\00\09\A1\00\1F6\FA\01\04\134\0C\0E\1C6\92\01748,U\00)47N\00\139\A1\00+8+O\00450,!\00\0AO\00451,p\01\01'\00\07\E1\01\136\D7\0B\0Aj\04552,\98\0D\0Ac\00\153S\01\0Bc\00$4,9\00\01'\00\07\FA\03\00\1D\00\01\F9\03\0C`\07$13\17\02\B03:\0Aret;\0A\0A}\0A\00\00\00", section ".nv_fatbin", align 8 +@__cuda_fatbin_wrapper = internal constant { i32, i32, i8*, i8* } { i32 1180844977, i32 1, i8* getelementptr inbounds ([15913 x i8], [15913 x i8]* @0, i64 0, i64 0), i8* null }, section ".nvFatBinSegment", align 8 +@__cuda_gpubin_handle = internal global i8** null, align 8 +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* bitcast (void (i8*)* @__cuda_module_ctor to void ()*), i8* null }] + +; Function Attrs: noinline optnone uwtable +define dso_local void @findK(i64 %height, %struct.knode* %knodesD, i64 %knodes_elem, %struct.record* %recordsD, i64* %currKnodeD, i64* %offsetD, i32* %keysD, %struct.record* %ansD) #0 { +entry: + %height.addr = alloca i64, align 8 + %knodesD.addr = alloca %struct.knode*, align 8 + %knodes_elem.addr = alloca i64, align 8 + %recordsD.addr = alloca %struct.record*, align 8 + 
%currKnodeD.addr = alloca i64*, align 8 + %offsetD.addr = alloca i64*, align 8 + %keysD.addr = alloca i32*, align 8 + %ansD.addr = alloca %struct.record*, align 8 + %grid_dim = alloca %struct.dim3, align 8 + %block_dim = alloca %struct.dim3, align 8 + %shmem_size = alloca i64, align 8 + %stream = alloca i8*, align 8 + %grid_dim.coerce = alloca { i64, i32 }, align 8 + %block_dim.coerce = alloca { i64, i32 }, align 8 + store i64 %height, i64* %height.addr, align 8 + store %struct.knode* %knodesD, %struct.knode** %knodesD.addr, align 8 + store i64 %knodes_elem, i64* %knodes_elem.addr, align 8 + store %struct.record* %recordsD, %struct.record** %recordsD.addr, align 8 + store i64* %currKnodeD, i64** %currKnodeD.addr, align 8 + store i64* %offsetD, i64** %offsetD.addr, align 8 + store i32* %keysD, i32** %keysD.addr, align 8 + store %struct.record* %ansD, %struct.record** %ansD.addr, align 8 + %kernel_args = alloca i8*, i64 8, align 16 + %0 = bitcast i64* %height.addr to i8* + %1 = getelementptr i8*, i8** %kernel_args, i32 0 + store i8* %0, i8** %1 + %2 = bitcast %struct.knode** %knodesD.addr to i8* + %3 = getelementptr i8*, i8** %kernel_args, i32 1 + store i8* %2, i8** %3 + %4 = bitcast i64* %knodes_elem.addr to i8* + %5 = getelementptr i8*, i8** %kernel_args, i32 2 + store i8* %4, i8** %5 + %6 = bitcast %struct.record** %recordsD.addr to i8* + %7 = getelementptr i8*, i8** %kernel_args, i32 3 + store i8* %6, i8** %7 + %8 = bitcast i64** %currKnodeD.addr to i8* + %9 = getelementptr i8*, i8** %kernel_args, i32 4 + store i8* %8, i8** %9 + %10 = bitcast i64** %offsetD.addr to i8* + %11 = getelementptr i8*, i8** %kernel_args, i32 5 + store i8* %10, i8** %11 + %12 = bitcast i32** %keysD.addr to i8* + %13 = getelementptr i8*, i8** %kernel_args, i32 6 + store i8* %12, i8** %13 + %14 = bitcast %struct.record** %ansD.addr to i8* + %15 = getelementptr i8*, i8** %kernel_args, i32 7 + store i8* %14, i8** %15 + %16 = call i32 @__cudaPopCallConfiguration(%struct.dim3* %grid_dim, 
%struct.dim3* %block_dim, i64* %shmem_size, i8** %stream) + %17 = load i64, i64* %shmem_size, align 8 + %18 = load i8*, i8** %stream, align 8 + %19 = bitcast { i64, i32 }* %grid_dim.coerce to i8* + %20 = bitcast %struct.dim3* %grid_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %19, i8* align 8 %20, i64 12, i1 false) + %21 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 0 + %22 = load i64, i64* %21, align 8 + %23 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 1 + %24 = load i32, i32* %23, align 8 + %25 = bitcast { i64, i32 }* %block_dim.coerce to i8* + %26 = bitcast %struct.dim3* %block_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %25, i8* align 8 %26, i64 12, i1 false) + %27 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 0 + %28 = load i64, i64* %27, align 8 + %29 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 1 + %30 = load i32, i32* %29, align 8 + %31 = bitcast i8* %18 to %struct.CUstream_st* + %call = call i32 @cudaLaunchKernel(i8* bitcast (void (i64, %struct.knode*, i64, %struct.record*, i64*, i64*, i32*, %struct.record*)* @findK to i8*), i64 %22, i32 %24, i64 %28, i32 %30, i8** %kernel_args, i64 %17, %struct.CUstream_st* %31) + br label %setup.end + +setup.end: ; preds = %entry + ret void +} + +declare dso_local i32 @__cudaPopCallConfiguration(%struct.dim3*, %struct.dim3*, i64*, i8**) + +declare dso_local i32 @cudaLaunchKernel(i8*, i64, i32, i64, i32, i8**, i64, %struct.CUstream_st*) + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #1 + +; Function Attrs: noinline optnone uwtable +define dso_local void @kernel_gpu_cuda_wrapper(%struct.record* %records, i64 %records_mem, %struct.knode* %knodes, i64 %knodes_elem, i64 %knodes_mem, i32 %order, i64 %maxheight, i32 %count, 
i64* %currKnode, i64* %offset, i32* %keys, %struct.record* %ans) #0 { +entry: + %records.addr = alloca %struct.record*, align 8 + %records_mem.addr = alloca i64, align 8 + %knodes.addr = alloca %struct.knode*, align 8 + %knodes_elem.addr = alloca i64, align 8 + %knodes_mem.addr = alloca i64, align 8 + %order.addr = alloca i32, align 4 + %maxheight.addr = alloca i64, align 8 + %count.addr = alloca i32, align 4 + %currKnode.addr = alloca i64*, align 8 + %offset.addr = alloca i64*, align 8 + %keys.addr = alloca i32*, align 8 + %ans.addr = alloca %struct.record*, align 8 + %time0 = alloca i64, align 8 + %time1 = alloca i64, align 8 + %time2 = alloca i64, align 8 + %time3 = alloca i64, align 8 + %time4 = alloca i64, align 8 + %time5 = alloca i64, align 8 + %time6 = alloca i64, align 8 + %numBlocks = alloca i32, align 4 + %threadsPerBlock = alloca i32, align 4 + %recordsD = alloca %struct.record*, align 8 + %knodesD = alloca %struct.knode*, align 8 + %currKnodeD = alloca i64*, align 8 + %offsetD = alloca i64*, align 8 + %keysD = alloca i32*, align 8 + %ansD = alloca %struct.record*, align 8 + %agg.tmp = alloca %struct.dim3, align 4 + %agg.tmp32 = alloca %struct.dim3, align 4 + %agg.tmp.coerce = alloca { i64, i32 }, align 4 + %agg.tmp32.coerce = alloca { i64, i32 }, align 4 + store %struct.record* %records, %struct.record** %records.addr, align 8 + store i64 %records_mem, i64* %records_mem.addr, align 8 + store %struct.knode* %knodes, %struct.knode** %knodes.addr, align 8 + store i64 %knodes_elem, i64* %knodes_elem.addr, align 8 + store i64 %knodes_mem, i64* %knodes_mem.addr, align 8 + store i32 %order, i32* %order.addr, align 4 + store i64 %maxheight, i64* %maxheight.addr, align 8 + store i32 %count, i32* %count.addr, align 4 + store i64* %currKnode, i64** %currKnode.addr, align 8 + store i64* %offset, i64** %offset.addr, align 8 + store i32* %keys, i32** %keys.addr, align 8 + store %struct.record* %ans, %struct.record** %ans.addr, align 8 + %call = call i64 @get_time() 
+ store i64 %call, i64* %time0, align 8 + %call1 = call i32 @cudaThreadSynchronize() + %0 = load i32, i32* %count.addr, align 4 + store i32 %0, i32* %numBlocks, align 4 + %1 = load i32, i32* %order.addr, align 4 + %cmp = icmp slt i32 %1, 1024 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + %2 = load i32, i32* %order.addr, align 4 + br label %cond.end + +cond.false: ; preds = %entry + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ %2, %cond.true ], [ 1024, %cond.false ] + store i32 %cond, i32* %threadsPerBlock, align 4 + %3 = load i32, i32* %numBlocks, align 4 + %4 = load i32, i32* %threadsPerBlock, align 4 + %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([75 x i8], [75 x i8]* @.str, i64 0, i64 0), i32 %3, i32 %4) + %call3 = call i64 @get_time() + store i64 %call3, i64* %time1, align 8 + %5 = bitcast %struct.record** %recordsD to i8** + %6 = load i64, i64* %records_mem.addr, align 8 + %call4 = call i32 @cudaMalloc(i8** %5, i64 %6) + call void @checkCUDAError(i8* getelementptr inbounds ([21 x i8], [21 x i8]* @.str.1, i64 0, i64 0)) + %7 = bitcast %struct.knode** %knodesD to i8** + %8 = load i64, i64* %knodes_mem.addr, align 8 + %call5 = call i32 @cudaMalloc(i8** %7, i64 %8) + call void @checkCUDAError(i8* getelementptr inbounds ([21 x i8], [21 x i8]* @.str.1, i64 0, i64 0)) + %9 = bitcast i64** %currKnodeD to i8** + %10 = load i32, i32* %count.addr, align 4 + %conv = sext i32 %10 to i64 + %mul = mul i64 %conv, 8 + %call6 = call i32 @cudaMalloc(i8** %9, i64 %mul) + call void @checkCUDAError(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str.2, i64 0, i64 0)) + %11 = bitcast i64** %offsetD to i8** + %12 = load i32, i32* %count.addr, align 4 + %conv7 = sext i32 %12 to i64 + %mul8 = mul i64 %conv7, 8 + %call9 = call i32 @cudaMalloc(i8** %11, i64 %mul8) + call void @checkCUDAError(i8* getelementptr inbounds ([20 x i8], [20 x i8]* @.str.3, i64 0, i64 0)) + %13 = bitcast i32** 
%keysD to i8** + %14 = load i32, i32* %count.addr, align 4 + %conv10 = sext i32 %14 to i64 + %mul11 = mul i64 %conv10, 4 + %call12 = call i32 @cudaMalloc(i8** %13, i64 %mul11) + call void @checkCUDAError(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str.4, i64 0, i64 0)) + %15 = bitcast %struct.record** %ansD to i8** + %16 = load i32, i32* %count.addr, align 4 + %conv13 = sext i32 %16 to i64 + %mul14 = mul i64 %conv13, 4 + %call15 = call i32 @cudaMalloc(i8** %15, i64 %mul14) + call void @checkCUDAError(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.5, i64 0, i64 0)) + %call16 = call i64 @get_time() + store i64 %call16, i64* %time2, align 8 + %17 = load %struct.record*, %struct.record** %recordsD, align 8 + %18 = bitcast %struct.record* %17 to i8* + %19 = load %struct.record*, %struct.record** %records.addr, align 8 + %20 = bitcast %struct.record* %19 to i8* + %21 = load i64, i64* %records_mem.addr, align 8 + %call17 = call i32 @cudaMemcpy(i8* %18, i8* %20, i64 %21, i32 1) + call void @checkCUDAError(i8* getelementptr inbounds ([27 x i8], [27 x i8]* @.str.6, i64 0, i64 0)) + %22 = load %struct.knode*, %struct.knode** %knodesD, align 8 + %23 = bitcast %struct.knode* %22 to i8* + %24 = load %struct.knode*, %struct.knode** %knodes.addr, align 8 + %25 = bitcast %struct.knode* %24 to i8* + %26 = load i64, i64* %knodes_mem.addr, align 8 + %call18 = call i32 @cudaMemcpy(i8* %23, i8* %25, i64 %26, i32 1) + call void @checkCUDAError(i8* getelementptr inbounds ([27 x i8], [27 x i8]* @.str.6, i64 0, i64 0)) + %27 = load i64*, i64** %currKnodeD, align 8 + %28 = bitcast i64* %27 to i8* + %29 = load i64*, i64** %currKnode.addr, align 8 + %30 = bitcast i64* %29 to i8* + %31 = load i32, i32* %count.addr, align 4 + %conv19 = sext i32 %31 to i64 + %mul20 = mul i64 %conv19, 8 + %call21 = call i32 @cudaMemcpy(i8* %28, i8* %30, i64 %mul20, i32 1) + call void @checkCUDAError(i8* getelementptr inbounds ([33 x i8], [33 x i8]* @.str.7, i64 0, i64 0)) + %32 = load i64*, i64** 
%offsetD, align 8 + %33 = bitcast i64* %32 to i8* + %34 = load i64*, i64** %offset.addr, align 8 + %35 = bitcast i64* %34 to i8* + %36 = load i32, i32* %count.addr, align 4 + %conv22 = sext i32 %36 to i64 + %mul23 = mul i64 %conv22, 8 + %call24 = call i32 @cudaMemcpy(i8* %33, i8* %35, i64 %mul23, i32 1) + call void @checkCUDAError(i8* getelementptr inbounds ([30 x i8], [30 x i8]* @.str.8, i64 0, i64 0)) + %37 = load i32*, i32** %keysD, align 8 + %38 = bitcast i32* %37 to i8* + %39 = load i32*, i32** %keys.addr, align 8 + %40 = bitcast i32* %39 to i8* + %41 = load i32, i32* %count.addr, align 4 + %conv25 = sext i32 %41 to i64 + %mul26 = mul i64 %conv25, 4 + %call27 = call i32 @cudaMemcpy(i8* %38, i8* %40, i64 %mul26, i32 1) + call void @checkCUDAError(i8* getelementptr inbounds ([28 x i8], [28 x i8]* @.str.9, i64 0, i64 0)) + %42 = load %struct.record*, %struct.record** %ansD, align 8 + %43 = bitcast %struct.record* %42 to i8* + %44 = load %struct.record*, %struct.record** %ans.addr, align 8 + %45 = bitcast %struct.record* %44 to i8* + %46 = load i32, i32* %count.addr, align 4 + %conv28 = sext i32 %46 to i64 + %mul29 = mul i64 %conv28, 4 + %call30 = call i32 @cudaMemcpy(i8* %43, i8* %45, i64 %mul29, i32 1) + call void @checkCUDAError(i8* getelementptr inbounds ([27 x i8], [27 x i8]* @.str.10, i64 0, i64 0)) + %call31 = call i64 @get_time() + store i64 %call31, i64* %time3, align 8 + %47 = load i32, i32* %numBlocks, align 4 + call void @_ZN4dim3C2Ejjj(%struct.dim3* %agg.tmp, i32 %47, i32 1, i32 1) + %48 = load i32, i32* %threadsPerBlock, align 4 + call void @_ZN4dim3C2Ejjj(%struct.dim3* %agg.tmp32, i32 %48, i32 1, i32 1) + %49 = bitcast { i64, i32 }* %agg.tmp.coerce to i8* + %50 = bitcast %struct.dim3* %agg.tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %49, i8* align 4 %50, i64 12, i1 false) + %51 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 0 + %52 = load i64, i64* %51, align 4 + %53 = getelementptr inbounds { i64, 
i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 1 + %54 = load i32, i32* %53, align 4 + %55 = bitcast { i64, i32 }* %agg.tmp32.coerce to i8* + %56 = bitcast %struct.dim3* %agg.tmp32 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %55, i8* align 4 %56, i64 12, i1 false) + %57 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp32.coerce, i32 0, i32 0 + %58 = load i64, i64* %57, align 4 + %59 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp32.coerce, i32 0, i32 1 + %60 = load i32, i32* %59, align 4 + %call33 = call i32 @__cudaPushCallConfiguration(i64 %52, i32 %54, i64 %58, i32 %60, i64 0, i8* null) + %tobool = icmp ne i32 %call33, 0 + br i1 %tobool, label %kcall.end, label %kcall.configok + +kcall.configok: ; preds = %cond.end + %61 = load i64, i64* %maxheight.addr, align 8 + %62 = load %struct.knode*, %struct.knode** %knodesD, align 8 + %63 = load i64, i64* %knodes_elem.addr, align 8 + %64 = load %struct.record*, %struct.record** %recordsD, align 8 + %65 = load i64*, i64** %currKnodeD, align 8 + %66 = load i64*, i64** %offsetD, align 8 + %67 = load i32*, i32** %keysD, align 8 + %68 = load %struct.record*, %struct.record** %ansD, align 8 + call void @findK(i64 %61, %struct.knode* %62, i64 %63, %struct.record* %64, i64* %65, i64* %66, i32* %67, %struct.record* %68) + br label %kcall.end + +kcall.end: ; preds = %kcall.configok, %cond.end + %call34 = call i32 @cudaThreadSynchronize() + call void @checkCUDAError(i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str.11, i64 0, i64 0)) + %call35 = call i64 @get_time() + store i64 %call35, i64* %time4, align 8 + %69 = load %struct.record*, %struct.record** %ans.addr, align 8 + %70 = bitcast %struct.record* %69 to i8* + %71 = load %struct.record*, %struct.record** %ansD, align 8 + %72 = bitcast %struct.record* %71 to i8* + %73 = load i32, i32* %count.addr, align 4 + %conv36 = sext i32 %73 to i64 + %mul37 = mul i64 %conv36, 4 + %call38 = call i32 @cudaMemcpy(i8* %70, i8* %72, i64 %mul37, i32 
2) + call void @checkCUDAError(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.12, i64 0, i64 0)) + %call39 = call i64 @get_time() + store i64 %call39, i64* %time5, align 8 + %74 = load %struct.record*, %struct.record** %recordsD, align 8 + %75 = bitcast %struct.record* %74 to i8* + %call40 = call i32 @cudaFree(i8* %75) + %76 = load %struct.knode*, %struct.knode** %knodesD, align 8 + %77 = bitcast %struct.knode* %76 to i8* + %call41 = call i32 @cudaFree(i8* %77) + %78 = load i64*, i64** %currKnodeD, align 8 + %79 = bitcast i64* %78 to i8* + %call42 = call i32 @cudaFree(i8* %79) + %80 = load i64*, i64** %offsetD, align 8 + %81 = bitcast i64* %80 to i8* + %call43 = call i32 @cudaFree(i8* %81) + %82 = load i32*, i32** %keysD, align 8 + %83 = bitcast i32* %82 to i8* + %call44 = call i32 @cudaFree(i8* %83) + %84 = load %struct.record*, %struct.record** %ansD, align 8 + %85 = bitcast %struct.record* %84 to i8* + %call45 = call i32 @cudaFree(i8* %85) + %call46 = call i64 @get_time() + store i64 %call46, i64* %time6, align 8 + %call47 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([52 x i8], [52 x i8]* @.str.13, i64 0, i64 0)) + %86 = load i64, i64* %time1, align 8 + %87 = load i64, i64* %time0, align 8 + %sub = sub nsw i64 %86, %87 + %conv48 = sitofp i64 %sub to float + %div = fdiv float %conv48, 1.000000e+06 + %conv49 = fpext float %div to double + %88 = load i64, i64* %time1, align 8 + %89 = load i64, i64* %time0, align 8 + %sub50 = sub nsw i64 %88, %89 + %conv51 = sitofp i64 %sub50 to float + %90 = load i64, i64* %time6, align 8 + %91 = load i64, i64* %time0, align 8 + %sub52 = sub nsw i64 %90, %91 + %conv53 = sitofp i64 %sub52 to float + %div54 = fdiv float %conv51, %conv53 + %mul55 = fmul contract float %div54, 1.000000e+02 + %conv56 = fpext float %mul55 to double + %call57 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([54 x i8], [54 x i8]* @.str.14, i64 0, i64 0), double %conv49, double %conv56) + %92 = load i64, i64* %time2, align 8 + %93 = load i64, i64* %time1, align 8 + %sub58 = sub nsw i64 %92, %93 + %conv59 = sitofp i64 %sub58 to float + %div60 = fdiv float %conv59, 1.000000e+06 + %conv61 = fpext float %div60 to double + %94 = load i64, i64* %time2, align 8 + %95 = load i64, i64* %time1, align 8 + %sub62 = sub nsw i64 %94, %95 + %conv63 = sitofp i64 %sub62 to float + %96 = load i64, i64* %time6, align 8 + %97 = load i64, i64* %time0, align 8 + %sub64 = sub nsw i64 %96, %97 + %conv65 = sitofp i64 %sub64 to float + %div66 = fdiv float %conv63, %conv65 + %mul67 = fmul contract float %div66, 1.000000e+02 + %conv68 = fpext float %mul67 to double + %call69 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([37 x i8], [37 x i8]* @.str.15, i64 0, i64 0), double %conv61, double %conv68) + %98 = load i64, i64* %time3, align 8 + %99 = load i64, i64* %time2, align 8 + %sub70 = sub nsw i64 %98, %99 + %conv71 = sitofp i64 %sub70 to float + %div72 = fdiv float %conv71, 1.000000e+06 + %conv73 = fpext float %div72 to double + %100 = load i64, i64* %time3, align 8 + %101 = load i64, i64* %time2, align 8 + %sub74 = sub nsw i64 %100, %101 + %conv75 = sitofp i64 %sub74 to float + %102 = load i64, i64* %time6, align 8 + %103 = load i64, i64* %time0, align 8 + %sub76 = sub nsw i64 %102, %103 + %conv77 = sitofp i64 %sub76 to float + %div78 = fdiv float %conv75, %conv77 + %mul79 = fmul contract float %div78, 1.000000e+02 + %conv80 = fpext float %mul79 to double + %call81 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([41 x i8], [41 x i8]* @.str.16, i64 0, i64 0), double %conv73, double %conv80) + %104 = load i64, i64* %time4, align 8 + %105 = load i64, i64* %time3, align 8 + %sub82 = sub nsw i64 %104, %105 + %conv83 = sitofp i64 %sub82 to float + %div84 = fdiv float %conv83, 1.000000e+06 + %conv85 = fpext float %div84 to double + %106 = load i64, i64* %time4, align 8 + %107 = load i64, i64* %time3, align 8 + %sub86 = sub nsw i64 %106, %107 + %conv87 = sitofp i64 %sub86 to float + %108 = load i64, i64* %time6, align 8 + %109 = load i64, i64* %time0, align 8 + %sub88 = sub nsw i64 %108, %109 + %conv89 = sitofp i64 %sub88 to float + %div90 = fdiv float %conv87, %conv89 + %mul91 = fmul contract float %div90, 1.000000e+02 + %conv92 = fpext float %mul91 to double + %call93 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.str.17, i64 0, i64 0), double %conv85, double %conv92) + %110 = load i64, i64* %time5, align 8 + %111 = load i64, i64* %time4, align 8 + %sub94 = sub nsw i64 %110, %111 + %conv95 = sitofp i64 %sub94 to float + %div96 = fdiv float %conv95, 1.000000e+06 + %conv97 = fpext float %div96 to double + %112 = load i64, i64* %time5, align 8 + %113 = load i64, i64* %time4, align 8 + %sub98 = sub nsw i64 %112, %113 + %conv99 = sitofp i64 %sub98 to float + %114 = load i64, i64* %time6, align 8 + %115 = load i64, i64* %time0, align 8 + %sub100 = sub nsw i64 %114, %115 + %conv101 = sitofp i64 %sub100 to float + %div102 = fdiv float %conv99, %conv101 + %mul103 = fmul contract float %div102, 1.000000e+02 + %conv104 = fpext float %mul103 to double + %call105 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str.18, i64 0, i64 0), double %conv97, double %conv104) + %116 = load i64, i64* %time6, align 8 + %117 = load i64, i64* %time5, align 8 + %sub106 = sub nsw i64 %116, %117 + %conv107 = sitofp i64 %sub106 to float + %div108 = fdiv float %conv107, 1.000000e+06 + %conv109 = fpext float %div108 to double + %118 = load i64, i64* %time6, align 8 + %119 = load i64, i64* %time5, align 8 + %sub110 = sub nsw i64 %118, %119 + %conv111 = sitofp i64 %sub110 to float + %120 = load i64, i64* %time6, align 8 + %121 = load i64, i64* %time0, align 8 + %sub112 = sub nsw i64 %120, %121 + %conv113 = sitofp i64 %sub112 to float + %div114 = fdiv float %conv111, %conv113 + %mul115 = fmul contract float %div114, 1.000000e+02 + %conv116 = fpext float %mul115 to double + %call117 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([37 x i8], [37 x i8]* @.str.19, i64 0, i64 0), double %conv109, double %conv116) + %call118 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str.20, i64 0, i64 0)) + %122 = load i64, i64* %time6, align 8 + %123 = load i64, i64* %time0, align 8 + %sub119 = sub nsw i64 %122, %123 + %conv120 = sitofp i64 %sub119 to float + %div121 = fdiv float %conv120, 1.000000e+06 + %conv122 = fpext float %div121 to double + %call123 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str.21, i64 0, i64 0), double %conv122) + ret void +} + +declare dso_local i64 @get_time() #2 + +declare dso_local i32 @cudaThreadSynchronize() #2 + +declare dso_local i32 @printf(i8*, ...) 
#2 + +declare dso_local i32 @cudaMalloc(i8**, i64) #2 + +declare dso_local void @checkCUDAError(i8*) #2 + +declare dso_local i32 @cudaMemcpy(i8*, i8*, i64, i32) #2 + +declare dso_local i32 @__cudaPushCallConfiguration(i64, i32, i64, i32, i64, i8*) #2 + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN4dim3C2Ejjj(%struct.dim3* %this, i32 %vx, i32 %vy, i32 %vz) unnamed_addr #3 comdat align 2 { +entry: + %this.addr = alloca %struct.dim3*, align 8 + %vx.addr = alloca i32, align 4 + %vy.addr = alloca i32, align 4 + %vz.addr = alloca i32, align 4 + store %struct.dim3* %this, %struct.dim3** %this.addr, align 8 + store i32 %vx, i32* %vx.addr, align 4 + store i32 %vy, i32* %vy.addr, align 4 + store i32 %vz, i32* %vz.addr, align 4 + %this1 = load %struct.dim3*, %struct.dim3** %this.addr, align 8 + %x = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 0 + %0 = load i32, i32* %vx.addr, align 4 + store i32 %0, i32* %x, align 4 + %y = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 1 + %1 = load i32, i32* %vy.addr, align 4 + store i32 %1, i32* %y, align 4 + %z = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 2 + %2 = load i32, i32* %vz.addr, align 4 + store i32 %2, i32* %z, align 4 + ret void +} + +declare dso_local i32 @cudaFree(i8*) #2 + +define internal void @__cuda_register_globals(i8** %0) { +entry: + %1 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void (i64, %struct.knode*, i64, %struct.record*, i64*, i64*, i32*, %struct.record*)* @findK to i8*), i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str.11, i64 0, i64 0), i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str.11, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null) + ret void +} + +declare dso_local i32 @__cudaRegisterFunction(i8**, i8*, i8*, i8*, i32, i8*, i8*, i8*, i8*, i32*) + +declare dso_local i32 @__cudaRegisterVar(i8**, i8*, i8*, i8*, i32, i32, i32, i32) + 
+declare dso_local i8** @__cudaRegisterFatBinary(i8*) + +define internal void @__cuda_module_ctor(i8* %0) { +entry: + %1 = call i8** @__cudaRegisterFatBinary(i8* bitcast ({ i32, i32, i8*, i8* }* @__cuda_fatbin_wrapper to i8*)) + store i8** %1, i8*** @__cuda_gpubin_handle, align 8 + call void @__cuda_register_globals(i8** %1) + call void @__cudaRegisterFatBinaryEnd(i8** %1) + %2 = call i32 @atexit(void (i8*)* @__cuda_module_dtor) + ret void +} + +declare dso_local void @__cudaRegisterFatBinaryEnd(i8**) + +declare dso_local void @__cudaUnregisterFatBinary(i8**) + +define internal void @__cuda_module_dtor(i8* %0) { +entry: + %1 = load i8**, i8*** @__cuda_gpubin_handle, align 8 + call void @__cudaUnregisterFatBinary(i8** %1) + ret void +} + +declare dso_local i32 @atexit(void (i8*)*) + +attributes #0 = { noinline optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind willreturn } +attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" 
"no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.module.flags = !{!0, !1} +!llvm.ident = !{!2} + +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"} diff --git a/examples/btree/kernel_gpu_cuda_wrapper_2-cuda-nvptx64-nvidia-cuda-sm_61.ll b/examples/btree/kernel_gpu_cuda_wrapper_2-cuda-nvptx64-nvidia-cuda-sm_61.ll new file mode 100644 index 0000000..4247f06 --- /dev/null +++ b/examples/btree/kernel_gpu_cuda_wrapper_2-cuda-nvptx64-nvidia-cuda-sm_61.ll @@ -0,0 +1,475 @@ +; ModuleID = 'kernel_gpu_cuda_wrapper_2-cuda-nvptx64-nvidia-cuda-sm_61.bc' +source_filename = "kernel/kernel_gpu_cuda_wrapper_2.cu" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +%struct.__cuda_builtin_threadIdx_t = type { i8 } +%struct.__cuda_builtin_blockIdx_t = type { i8 } +%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 } +%struct.knode = type { i32, [257 x i32], [257 x i32], i8, i32 } + +$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any + +$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any + +@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1 +@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1 + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 { +entry: + %p.addr = alloca i8**, align 8 + %s.addr = alloca i64, align 8 + store i8** %p, i8*** %p.addr, align 8 + store i64 %s, i64* %s.addr, align 8 + ret i32 999 +} + +; Function 
Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 { +entry: + %p.addr = alloca %struct.cudaFuncAttributes*, align 8 + %c.addr = alloca i8*, align 8 + store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8 + store i8* %c, i8** %c.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 { +entry: + %value.addr = alloca i32*, align 8 + %attr.addr = alloca i32, align 4 + %device.addr = alloca i32, align 4 + store i32* %value, i32** %value.addr, align 8 + store i32 %attr, i32* %attr.addr, align 4 + store i32 %device, i32* %device.addr, align 4 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaGetDevice(i32* %device) #0 { +entry: + %device.addr = alloca i32*, align 8 + store i32* %device, i32** %device.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 { +entry: + %numBlocks.addr = alloca i32*, align 8 + %func.addr = alloca i8*, align 8 + %blockSize.addr = alloca i32, align 4 + %dynamicSmemSize.addr = alloca i64, align 8 + store i32* %numBlocks, i32** %numBlocks.addr, align 8 + store i8* %func, i8** %func.addr, align 8 + store i32 %blockSize, i32* %blockSize.addr, align 4 + store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 { +entry: + %numBlocks.addr = alloca i32*, align 8 + %func.addr = alloca i8*, align 8 + %blockSize.addr = alloca i32, align 4 + 
%dynamicSmemSize.addr = alloca i64, align 8 + %flags.addr = alloca i32, align 4 + store i32* %numBlocks, i32** %numBlocks.addr, align 8 + store i8* %func, i8** %func.addr, align 8 + store i32 %blockSize, i32* %blockSize.addr, align 4 + store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8 + store i32 %flags, i32* %flags.addr, align 4 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define dso_local void @findRangeK(i64 %height, %struct.knode* %knodesD, i64 %knodes_elem, i64* %currKnodeD, i64* %offsetD, i64* %lastKnodeD, i64* %offset_2D, i32* %startD, i32* %endD, i32* %RecstartD, i32* %ReclenD) #0 { +entry: + %height.addr = alloca i64, align 8 + %knodesD.addr = alloca %struct.knode*, align 8 + %knodes_elem.addr = alloca i64, align 8 + %currKnodeD.addr = alloca i64*, align 8 + %offsetD.addr = alloca i64*, align 8 + %lastKnodeD.addr = alloca i64*, align 8 + %offset_2D.addr = alloca i64*, align 8 + %startD.addr = alloca i32*, align 8 + %endD.addr = alloca i32*, align 8 + %RecstartD.addr = alloca i32*, align 8 + %ReclenD.addr = alloca i32*, align 8 + %thid = alloca i32, align 4 + %bid = alloca i32, align 4 + %i = alloca i32, align 4 + store i64 %height, i64* %height.addr, align 8 + store %struct.knode* %knodesD, %struct.knode** %knodesD.addr, align 8 + store i64 %knodes_elem, i64* %knodes_elem.addr, align 8 + store i64* %currKnodeD, i64** %currKnodeD.addr, align 8 + store i64* %offsetD, i64** %offsetD.addr, align 8 + store i64* %lastKnodeD, i64** %lastKnodeD.addr, align 8 + store i64* %offset_2D, i64** %offset_2D.addr, align 8 + store i32* %startD, i32** %startD.addr, align 8 + store i32* %endD, i32** %endD.addr, align 8 + store i32* %RecstartD, i32** %RecstartD.addr, align 8 + store i32* %ReclenD, i32** %ReclenD.addr, align 8 + %call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + store i32 %call, i32* %thid, align 4 + %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2 + store i32 
%call1, i32* %bid, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %conv = sext i32 %0 to i64 + %1 = load i64, i64* %height.addr, align 8 + %cmp = icmp slt i64 %conv, %1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %2 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8 + %3 = load i64*, i64** %currKnodeD.addr, align 8 + %4 = load i32, i32* %bid, align 4 + %idxprom = sext i32 %4 to i64 + %arrayidx = getelementptr inbounds i64, i64* %3, i64 %idxprom + %5 = load i64, i64* %arrayidx, align 8 + %arrayidx2 = getelementptr inbounds %struct.knode, %struct.knode* %2, i64 %5 + %keys = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx2, i32 0, i32 2 + %6 = load i32, i32* %thid, align 4 + %idxprom3 = sext i32 %6 to i64 + %arrayidx4 = getelementptr inbounds [257 x i32], [257 x i32]* %keys, i64 0, i64 %idxprom3 + %7 = load i32, i32* %arrayidx4, align 4 + %8 = load i32*, i32** %startD.addr, align 8 + %9 = load i32, i32* %bid, align 4 + %idxprom5 = sext i32 %9 to i64 + %arrayidx6 = getelementptr inbounds i32, i32* %8, i64 %idxprom5 + %10 = load i32, i32* %arrayidx6, align 4 + %cmp7 = icmp sle i32 %7, %10 + br i1 %cmp7, label %land.lhs.true, label %if.end34 + +land.lhs.true: ; preds = %for.body + %11 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8 + %12 = load i64*, i64** %currKnodeD.addr, align 8 + %13 = load i32, i32* %bid, align 4 + %idxprom8 = sext i32 %13 to i64 + %arrayidx9 = getelementptr inbounds i64, i64* %12, i64 %idxprom8 + %14 = load i64, i64* %arrayidx9, align 8 + %arrayidx10 = getelementptr inbounds %struct.knode, %struct.knode* %11, i64 %14 + %keys11 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx10, i32 0, i32 2 + %15 = load i32, i32* %thid, align 4 + %add = add nsw i32 %15, 1 + %idxprom12 = sext i32 %add to i64 + %arrayidx13 = getelementptr inbounds [257 x i32], [257 x i32]* %keys11, i64 
0, i64 %idxprom12 + %16 = load i32, i32* %arrayidx13, align 4 + %17 = load i32*, i32** %startD.addr, align 8 + %18 = load i32, i32* %bid, align 4 + %idxprom14 = sext i32 %18 to i64 + %arrayidx15 = getelementptr inbounds i32, i32* %17, i64 %idxprom14 + %19 = load i32, i32* %arrayidx15, align 4 + %cmp16 = icmp sgt i32 %16, %19 + br i1 %cmp16, label %if.then, label %if.end34 + +if.then: ; preds = %land.lhs.true + %20 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8 + %21 = load i64*, i64** %currKnodeD.addr, align 8 + %22 = load i32, i32* %bid, align 4 + %idxprom17 = sext i32 %22 to i64 + %arrayidx18 = getelementptr inbounds i64, i64* %21, i64 %idxprom17 + %23 = load i64, i64* %arrayidx18, align 8 + %arrayidx19 = getelementptr inbounds %struct.knode, %struct.knode* %20, i64 %23 + %indices = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx19, i32 0, i32 1 + %24 = load i32, i32* %thid, align 4 + %idxprom20 = sext i32 %24 to i64 + %arrayidx21 = getelementptr inbounds [257 x i32], [257 x i32]* %indices, i64 0, i64 %idxprom20 + %25 = load i32, i32* %arrayidx21, align 4 + %conv22 = sext i32 %25 to i64 + %26 = load i64, i64* %knodes_elem.addr, align 8 + %cmp23 = icmp slt i64 %conv22, %26 + br i1 %cmp23, label %if.then24, label %if.end + +if.then24: ; preds = %if.then + %27 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8 + %28 = load i64*, i64** %currKnodeD.addr, align 8 + %29 = load i32, i32* %bid, align 4 + %idxprom25 = sext i32 %29 to i64 + %arrayidx26 = getelementptr inbounds i64, i64* %28, i64 %idxprom25 + %30 = load i64, i64* %arrayidx26, align 8 + %arrayidx27 = getelementptr inbounds %struct.knode, %struct.knode* %27, i64 %30 + %indices28 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx27, i32 0, i32 1 + %31 = load i32, i32* %thid, align 4 + %idxprom29 = sext i32 %31 to i64 + %arrayidx30 = getelementptr inbounds [257 x i32], [257 x i32]* %indices28, i64 0, i64 %idxprom29 + %32 = load i32, i32* %arrayidx30, align 4 
+ %conv31 = sext i32 %32 to i64 + %33 = load i64*, i64** %offsetD.addr, align 8 + %34 = load i32, i32* %bid, align 4 + %idxprom32 = sext i32 %34 to i64 + %arrayidx33 = getelementptr inbounds i64, i64* %33, i64 %idxprom32 + store i64 %conv31, i64* %arrayidx33, align 8 + br label %if.end + +if.end: ; preds = %if.then24, %if.then + br label %if.end34 + +if.end34: ; preds = %if.end, %land.lhs.true, %for.body + %35 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8 + %36 = load i64*, i64** %lastKnodeD.addr, align 8 + %37 = load i32, i32* %bid, align 4 + %idxprom35 = sext i32 %37 to i64 + %arrayidx36 = getelementptr inbounds i64, i64* %36, i64 %idxprom35 + %38 = load i64, i64* %arrayidx36, align 8 + %arrayidx37 = getelementptr inbounds %struct.knode, %struct.knode* %35, i64 %38 + %keys38 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx37, i32 0, i32 2 + %39 = load i32, i32* %thid, align 4 + %idxprom39 = sext i32 %39 to i64 + %arrayidx40 = getelementptr inbounds [257 x i32], [257 x i32]* %keys38, i64 0, i64 %idxprom39 + %40 = load i32, i32* %arrayidx40, align 4 + %41 = load i32*, i32** %endD.addr, align 8 + %42 = load i32, i32* %bid, align 4 + %idxprom41 = sext i32 %42 to i64 + %arrayidx42 = getelementptr inbounds i32, i32* %41, i64 %idxprom41 + %43 = load i32, i32* %arrayidx42, align 4 + %cmp43 = icmp sle i32 %40, %43 + br i1 %cmp43, label %land.lhs.true44, label %if.end75 + +land.lhs.true44: ; preds = %if.end34 + %44 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8 + %45 = load i64*, i64** %lastKnodeD.addr, align 8 + %46 = load i32, i32* %bid, align 4 + %idxprom45 = sext i32 %46 to i64 + %arrayidx46 = getelementptr inbounds i64, i64* %45, i64 %idxprom45 + %47 = load i64, i64* %arrayidx46, align 8 + %arrayidx47 = getelementptr inbounds %struct.knode, %struct.knode* %44, i64 %47 + %keys48 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx47, i32 0, i32 2 + %48 = load i32, i32* %thid, align 4 + %add49 = add nsw i32 %48, 
1 + %idxprom50 = sext i32 %add49 to i64 + %arrayidx51 = getelementptr inbounds [257 x i32], [257 x i32]* %keys48, i64 0, i64 %idxprom50 + %49 = load i32, i32* %arrayidx51, align 4 + %50 = load i32*, i32** %endD.addr, align 8 + %51 = load i32, i32* %bid, align 4 + %idxprom52 = sext i32 %51 to i64 + %arrayidx53 = getelementptr inbounds i32, i32* %50, i64 %idxprom52 + %52 = load i32, i32* %arrayidx53, align 4 + %cmp54 = icmp sgt i32 %49, %52 + br i1 %cmp54, label %if.then55, label %if.end75 + +if.then55: ; preds = %land.lhs.true44 + %53 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8 + %54 = load i64*, i64** %lastKnodeD.addr, align 8 + %55 = load i32, i32* %bid, align 4 + %idxprom56 = sext i32 %55 to i64 + %arrayidx57 = getelementptr inbounds i64, i64* %54, i64 %idxprom56 + %56 = load i64, i64* %arrayidx57, align 8 + %arrayidx58 = getelementptr inbounds %struct.knode, %struct.knode* %53, i64 %56 + %indices59 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx58, i32 0, i32 1 + %57 = load i32, i32* %thid, align 4 + %idxprom60 = sext i32 %57 to i64 + %arrayidx61 = getelementptr inbounds [257 x i32], [257 x i32]* %indices59, i64 0, i64 %idxprom60 + %58 = load i32, i32* %arrayidx61, align 4 + %conv62 = sext i32 %58 to i64 + %59 = load i64, i64* %knodes_elem.addr, align 8 + %cmp63 = icmp slt i64 %conv62, %59 + br i1 %cmp63, label %if.then64, label %if.end74 + +if.then64: ; preds = %if.then55 + %60 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8 + %61 = load i64*, i64** %lastKnodeD.addr, align 8 + %62 = load i32, i32* %bid, align 4 + %idxprom65 = sext i32 %62 to i64 + %arrayidx66 = getelementptr inbounds i64, i64* %61, i64 %idxprom65 + %63 = load i64, i64* %arrayidx66, align 8 + %arrayidx67 = getelementptr inbounds %struct.knode, %struct.knode* %60, i64 %63 + %indices68 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx67, i32 0, i32 1 + %64 = load i32, i32* %thid, align 4 + %idxprom69 = sext i32 %64 to i64 + 
%arrayidx70 = getelementptr inbounds [257 x i32], [257 x i32]* %indices68, i64 0, i64 %idxprom69 + %65 = load i32, i32* %arrayidx70, align 4 + %conv71 = sext i32 %65 to i64 + %66 = load i64*, i64** %offset_2D.addr, align 8 + %67 = load i32, i32* %bid, align 4 + %idxprom72 = sext i32 %67 to i64 + %arrayidx73 = getelementptr inbounds i64, i64* %66, i64 %idxprom72 + store i64 %conv71, i64* %arrayidx73, align 8 + br label %if.end74 + +if.end74: ; preds = %if.then64, %if.then55 + br label %if.end75 + +if.end75: ; preds = %if.end74, %land.lhs.true44, %if.end34 + call void @llvm.nvvm.barrier0() + %68 = load i32, i32* %thid, align 4 + %cmp76 = icmp eq i32 %68, 0 + br i1 %cmp76, label %if.then77, label %if.end86 + +if.then77: ; preds = %if.end75 + %69 = load i64*, i64** %offsetD.addr, align 8 + %70 = load i32, i32* %bid, align 4 + %idxprom78 = sext i32 %70 to i64 + %arrayidx79 = getelementptr inbounds i64, i64* %69, i64 %idxprom78 + %71 = load i64, i64* %arrayidx79, align 8 + %72 = load i64*, i64** %currKnodeD.addr, align 8 + %73 = load i32, i32* %bid, align 4 + %idxprom80 = sext i32 %73 to i64 + %arrayidx81 = getelementptr inbounds i64, i64* %72, i64 %idxprom80 + store i64 %71, i64* %arrayidx81, align 8 + %74 = load i64*, i64** %offset_2D.addr, align 8 + %75 = load i32, i32* %bid, align 4 + %idxprom82 = sext i32 %75 to i64 + %arrayidx83 = getelementptr inbounds i64, i64* %74, i64 %idxprom82 + %76 = load i64, i64* %arrayidx83, align 8 + %77 = load i64*, i64** %lastKnodeD.addr, align 8 + %78 = load i32, i32* %bid, align 4 + %idxprom84 = sext i32 %78 to i64 + %arrayidx85 = getelementptr inbounds i64, i64* %77, i64 %idxprom84 + store i64 %76, i64* %arrayidx85, align 8 + br label %if.end86 + +if.end86: ; preds = %if.then77, %if.end75 + call void @llvm.nvvm.barrier0() + br label %for.inc + +for.inc: ; preds = %if.end86 + %79 = load i32, i32* %i, align 4 + %inc = add nsw i32 %79, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %80 = 
load %struct.knode*, %struct.knode** %knodesD.addr, align 8 + %81 = load i64*, i64** %currKnodeD.addr, align 8 + %82 = load i32, i32* %bid, align 4 + %idxprom87 = sext i32 %82 to i64 + %arrayidx88 = getelementptr inbounds i64, i64* %81, i64 %idxprom87 + %83 = load i64, i64* %arrayidx88, align 8 + %arrayidx89 = getelementptr inbounds %struct.knode, %struct.knode* %80, i64 %83 + %keys90 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx89, i32 0, i32 2 + %84 = load i32, i32* %thid, align 4 + %idxprom91 = sext i32 %84 to i64 + %arrayidx92 = getelementptr inbounds [257 x i32], [257 x i32]* %keys90, i64 0, i64 %idxprom91 + %85 = load i32, i32* %arrayidx92, align 4 + %86 = load i32*, i32** %startD.addr, align 8 + %87 = load i32, i32* %bid, align 4 + %idxprom93 = sext i32 %87 to i64 + %arrayidx94 = getelementptr inbounds i32, i32* %86, i64 %idxprom93 + %88 = load i32, i32* %arrayidx94, align 4 + %cmp95 = icmp eq i32 %85, %88 + br i1 %cmp95, label %if.then96, label %if.end105 + +if.then96: ; preds = %for.end + %89 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8 + %90 = load i64*, i64** %currKnodeD.addr, align 8 + %91 = load i32, i32* %bid, align 4 + %idxprom97 = sext i32 %91 to i64 + %arrayidx98 = getelementptr inbounds i64, i64* %90, i64 %idxprom97 + %92 = load i64, i64* %arrayidx98, align 8 + %arrayidx99 = getelementptr inbounds %struct.knode, %struct.knode* %89, i64 %92 + %indices100 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx99, i32 0, i32 1 + %93 = load i32, i32* %thid, align 4 + %idxprom101 = sext i32 %93 to i64 + %arrayidx102 = getelementptr inbounds [257 x i32], [257 x i32]* %indices100, i64 0, i64 %idxprom101 + %94 = load i32, i32* %arrayidx102, align 4 + %95 = load i32*, i32** %RecstartD.addr, align 8 + %96 = load i32, i32* %bid, align 4 + %idxprom103 = sext i32 %96 to i64 + %arrayidx104 = getelementptr inbounds i32, i32* %95, i64 %idxprom103 + store i32 %94, i32* %arrayidx104, align 4 + br label %if.end105 + 
+if.end105: ; preds = %if.then96, %for.end + call void @llvm.nvvm.barrier0() + %97 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8 + %98 = load i64*, i64** %lastKnodeD.addr, align 8 + %99 = load i32, i32* %bid, align 4 + %idxprom106 = sext i32 %99 to i64 + %arrayidx107 = getelementptr inbounds i64, i64* %98, i64 %idxprom106 + %100 = load i64, i64* %arrayidx107, align 8 + %arrayidx108 = getelementptr inbounds %struct.knode, %struct.knode* %97, i64 %100 + %keys109 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx108, i32 0, i32 2 + %101 = load i32, i32* %thid, align 4 + %idxprom110 = sext i32 %101 to i64 + %arrayidx111 = getelementptr inbounds [257 x i32], [257 x i32]* %keys109, i64 0, i64 %idxprom110 + %102 = load i32, i32* %arrayidx111, align 4 + %103 = load i32*, i32** %endD.addr, align 8 + %104 = load i32, i32* %bid, align 4 + %idxprom112 = sext i32 %104 to i64 + %arrayidx113 = getelementptr inbounds i32, i32* %103, i64 %idxprom112 + %105 = load i32, i32* %arrayidx113, align 4 + %cmp114 = icmp eq i32 %102, %105 + br i1 %cmp114, label %if.then115, label %if.end127 + +if.then115: ; preds = %if.end105 + %106 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8 + %107 = load i64*, i64** %lastKnodeD.addr, align 8 + %108 = load i32, i32* %bid, align 4 + %idxprom116 = sext i32 %108 to i64 + %arrayidx117 = getelementptr inbounds i64, i64* %107, i64 %idxprom116 + %109 = load i64, i64* %arrayidx117, align 8 + %arrayidx118 = getelementptr inbounds %struct.knode, %struct.knode* %106, i64 %109 + %indices119 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx118, i32 0, i32 1 + %110 = load i32, i32* %thid, align 4 + %idxprom120 = sext i32 %110 to i64 + %arrayidx121 = getelementptr inbounds [257 x i32], [257 x i32]* %indices119, i64 0, i64 %idxprom120 + %111 = load i32, i32* %arrayidx121, align 4 + %112 = load i32*, i32** %RecstartD.addr, align 8 + %113 = load i32, i32* %bid, align 4 + %idxprom122 = sext i32 %113 to i64 + 
%arrayidx123 = getelementptr inbounds i32, i32* %112, i64 %idxprom122 + %114 = load i32, i32* %arrayidx123, align 4 + %sub = sub nsw i32 %111, %114 + %add124 = add nsw i32 %sub, 1 + %115 = load i32*, i32** %ReclenD.addr, align 8 + %116 = load i32, i32* %bid, align 4 + %idxprom125 = sext i32 %116 to i64 + %arrayidx126 = getelementptr inbounds i32, i32* %115, i64 %idxprom125 + store i32 %add124, i32* %arrayidx126, align 4 + br label %if.end127 + +if.end127: ; preds = %if.then115, %if.end105 + ret void +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + ret i32 %0 +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() + ret i32 %0 +} + +; Function Attrs: convergent nounwind +declare void @llvm.nvvm.barrier0() #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #3 + +attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" 
"no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nounwind } +attributes #3 = { nounwind readnone } + +!llvm.module.flags = !{!0, !1, !2} +!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6} +!llvm.ident = !{!8} +!nvvmir.version = !{!9} + +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0} +!3 = !{void (i64, %struct.knode*, i64, i64*, i64*, i64*, i64*, i32*, i32*, i32*, i32*)* @findRangeK, !"kernel", i32 1} +!4 = !{null, !"align", i32 8} +!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!6 = !{null, !"align", i32 16} +!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"} +!9 = !{i32 1, i32 4} diff --git a/examples/btree/kernel_gpu_cuda_wrapper_2-host-x86_64-unknown-linux-gnu.ll b/examples/btree/kernel_gpu_cuda_wrapper_2-host-x86_64-unknown-linux-gnu.ll new file mode 100644 index 0000000..a10890a --- /dev/null +++ b/examples/btree/kernel_gpu_cuda_wrapper_2-host-x86_64-unknown-linux-gnu.ll @@ -0,0 +1,651 @@ +; ModuleID = 'kernel_gpu_cuda_wrapper_2-host-x86_64-unknown-linux-gnu.bc' +source_filename = "kernel/kernel_gpu_cuda_wrapper_2.cu" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.knode = type { i32, [257 x i32], [257 x i32], i8, i32 } +%struct.dim3 = type { i32, i32, i32 } +%struct.CUstream_st = type opaque + +$_ZN4dim3C2Ejjj = comdat any + +@.str = private unnamed_addr constant [75 x i8] c"# of blocks = %d, # of threads/block = %d (ensure that device can handle)\0A\00", align 1 +@.str.1 = private 
unnamed_addr constant [21 x i8] c"cudaMalloc recordsD\00", align 1 +@.str.2 = private unnamed_addr constant [23 x i8] c"cudaMalloc currKnodeD\00", align 1 +@.str.3 = private unnamed_addr constant [20 x i8] c"cudaMalloc offsetD\00", align 1 +@.str.4 = private unnamed_addr constant [23 x i8] c"cudaMalloc lastKnodeD\00", align 1 +@.str.5 = private unnamed_addr constant [22 x i8] c"cudaMalloc offset_2D\00", align 1 +@.str.6 = private unnamed_addr constant [18 x i8] c"cudaMalloc startD\00", align 1 +@.str.7 = private unnamed_addr constant [16 x i8] c"cudaMalloc endD\00", align 1 +@.str.8 = private unnamed_addr constant [21 x i8] c"cudaMalloc ansDStart\00", align 1 +@.str.9 = private unnamed_addr constant [22 x i8] c"cudaMalloc ansDLength\00", align 1 +@.str.10 = private unnamed_addr constant [27 x i8] c"cudaMalloc cudaMemcpy memD\00", align 1 +@.str.11 = private unnamed_addr constant [33 x i8] c"cudaMalloc cudaMemcpy currKnodeD\00", align 1 +@.str.12 = private unnamed_addr constant [30 x i8] c"cudaMalloc cudaMemcpy offsetD\00", align 1 +@.str.13 = private unnamed_addr constant [33 x i8] c"cudaMalloc cudaMemcpy lastKnodeD\00", align 1 +@.str.14 = private unnamed_addr constant [32 x i8] c"cudaMalloc cudaMemcpy offset_2D\00", align 1 +@.str.15 = private unnamed_addr constant [18 x i8] c"cudaMemcpy startD\00", align 1 +@.str.16 = private unnamed_addr constant [16 x i8] c"cudaMemcpy endD\00", align 1 +@.str.17 = private unnamed_addr constant [21 x i8] c"cudaMemcpy ansDStart\00", align 1 +@.str.18 = private unnamed_addr constant [22 x i8] c"cudaMemcpy ansDLength\00", align 1 +@.str.19 = private unnamed_addr constant [11 x i8] c"findRangeK\00", align 1 +@.str.20 = private unnamed_addr constant [52 x i8] c"Time spent in different stages of GPU_CUDA KERNEL:\0A\00", align 1 +@.str.21 = private unnamed_addr constant [54 x i8] c"%15.12f s, %15.12f % : GPU: SET DEVICE / DRIVER INIT\0A\00", align 1 +@.str.22 = private unnamed_addr constant [37 x i8] c"%15.12f s, %15.12f % : GPU MEM: 
ALO\0A\00", align 1 +@.str.23 = private unnamed_addr constant [41 x i8] c"%15.12f s, %15.12f % : GPU MEM: COPY IN\0A\00", align 1 +@.str.24 = private unnamed_addr constant [36 x i8] c"%15.12f s, %15.12f % : GPU: KERNEL\0A\00", align 1 +@.str.25 = private unnamed_addr constant [42 x i8] c"%15.12f s, %15.12f % : GPU MEM: COPY OUT\0A\00", align 1 +@.str.26 = private unnamed_addr constant [37 x i8] c"%15.12f s, %15.12f % : GPU MEM: FRE\0A\00", align 1 +@.str.27 = private unnamed_addr constant [13 x i8] c"Total time:\0A\00", align 1 +@.str.28 = private unnamed_addr constant [9 x i8] c"%.12f s\0A\00", align 1 +@0 = private constant [26033 x i8] c"P\EDU\BA\01\00\10\00\A0e\00\00\00\00\00\00\02\00\01\01@\00\00\00\C8V\00\00\00\00\00\00\00\00\00\00\00\00\00\00\07\00\01\00=\00\00\00\00\00\00\00\00\00\00\00\11\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\7FELF\02\01\013\07\00\00\00\00\00\00\00\02\00\BE\00e\00\00\00\00\00\00\00\00\00\00\00 V\00\00\00\00\00\00\E0S\00\00\00\00\00\00=\05=\00@\008\00\03\00@\00\09\00\01\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00.text.findRangeK\00.nv.info.findRangeK\00.nv.shared.findRangeK\00.nv.global\00.nv.constant0.findRangeK\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00findRangeK\00.text.findRangeK\00.nv.info.findRangeK\00.nv.shared.findRangeK\00.nv.global\00threadIdx\00blockIdx\00.nv.constant0.findRangeK\00_param\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00=\00\00\00\03\00\07\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00x\00\00\00\03\00\08\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\83\00\00\00\01\00\08\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\8D\00\00\00\01\00\08\00\01\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\96\00\00\00\03\00\06\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\002\00\00\00\12\10\07\00\00\00\00\00\00\00\00\00\00O\00\00\00\00\00\00\04/\08\00\06\00\00\00\1C\00\00\00\04#\08\00\06\00\00\00\00\00\00\00\04\12\08\0
0\06\00\00\00h\00\00\00\04\11\08\00\06\00\00\00h\00\00\00\010\00\00\01*\00\00\04\0A\08\00\05\00\00\00@\01X\00\03\19X\00\04\17\0C\00\00\00\00\00\0A\00P\00\00\F0!\00\04\17\0C\00\00\00\00\00\09\00H\00\00\F0!\00\04\17\0C\00\00\00\00\00\08\00@\00\00\F0!\00\04\17\0C\00\00\00\00\00\07\008\00\00\F0!\00\04\17\0C\00\00\00\00\00\06\000\00\00\F0!\00\04\17\0C\00\00\00\00\00\05\00(\00\00\F0!\00\04\17\0C\00\00\00\00\00\04\00 \00\00\F0!\00\04\17\0C\00\00\00\00\00\03\00\18\00\00\F0!\00\04\17\0C\00\00\00\00\00\02\00\10\00\00\F0!\00\04\17\0C\00\00\00\00\00\01\00\08\00\00\F0!\00\04\17\0C\00\00\00\00\00\00\00\00\00\00\F0!\00\03\1B\FF\00\04\1D\04\00\B8\0A\00\00\04\1C\04\00\D0N\00\00\04\1E\04\00p\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\EF\1F\E0\FD\03!\00\D3rd<3>;\0A\0Amov.u2\00\1B,e\00b;\0Acvta\8D\00\04%\00
\13,\\\00\22ld\C8\00\02\18\00nrd2, [\CE\00\1E])\00\1F1)\00\01a0];\0Ast#\00\81[%SP+0],,\00\0A\16\00\128\16\00\222;\B6\00\01\D8\00a1, 999(\00\02g\00\00\EF\00\18[\9E\01\03M\00\AF1;\0Aret;\0A\0A}\D6\01\1A\FE\02FuncGetAttributes\E1\01\0D#\00\0E\EC\01\0F+\00\06\0F\F7\01\1B\1F1\F7\01Q\1F1\F7\01!\0E\D9\00\0F\02\02\0F\0E8\01\0F\0D\02\8DhDevice\B4\00\0E\0E\02\0E$\00\0F\0F\02\00/32,\00\0B\1F1,\00\18\1F2<\02\13\1F2<\02\1F\1D4<\02\1F2<\02\0C\1F2<\02\13\01_\00\04;\02\0F\D9\00\07\1D]4\00\1F14\00\06\0Fp\02\10\0E\9A\01\0Fq\02\12(32q\02\0B\15\00!12\16\00\09\86\02\1F3\86\02\15\1F3\86\02#2Get\CB\00\0E}\02\05\1B\00\04\DA\00\0F\1C\02\13?3[8W\04.\0F\1B\02\0D\1F3W\04\19\04\B3\01\0D\D0\00\0F\AA\01\06\0F\05\04W\F0\04OccupancyMaxActiveB3\08\FE\03sPerMultiprocessor\9F\01\0F;\00\16\0EB\06\0FC\00%\0EJ\04\0FC\00\1E\1F2\86\00/\1F3\88\02\13O4[32\89\02\1C\1D3\89\02\1F4\89\02\0C\1F4\89\02\19\133\89\02\0F\F1\00\1E\0F\BC\04\00\0FK\00$/2]w\07\00\0FL\00$\0F\1F\05\01\0F\98\00%\0F\A7\07\1D\097\05\186M\05\04,\00\2224-\00\183\CF\03\1F2\CF\03\15\1F2\CF\03L\9FWithFlags\D8\03(\05D\00\0E\E1\03\0FL\00'\0F\EA\030\0CL\00\1F2\98\008\1F3\98\008\1F4H\04\13O5[40\EC\08.\0FH\04\0D\1F5\EC\08\1C\0F\F9\00+\1F]\9C\040\0D\9A\01\0F\A5\040\0D:\02\0F\AE\041\0D\DB\02\0F\B7\041\0D|\03\0F\C0\04I\08-\00\1F3$\0A6\F6\0Avisible .entry findRangeK\93\04\00\8D\00\06\18\00\0Eg\04\0F \00\02\1F1 \00\0C\1F2 \00\0C\1F3 \00\0C\1F4 \00\0C\1F5 \00\0C\1F6 \00\0C\1F7 \00\0C\1F8 \00\0C\1F9 \01\0D\0FH\0B\14_6[104y\04\15\BCpred 
%p<11>]\0B\1E8^\0B\1F1\8F\04\0D\1F6`\0B\1A\00>\03\0F\E7\00\00\0Fh\03\01\1F0+\00\01\1F9*\00\00\1F9)\00\01\1F8)\00\00\1F8)\00\01\1F7)\00\00\1F7)\00\01\1F6)\00\00\1F6)\00\01\1F5)\00\00\1E5)\00\0F]\05\03\1F4)\00\01\1F3\86\05\04\0E1\03\0F\06\05\04\0Es\01\0F\DA\04\04\0E\C3\03\13]\E0\01#to\B7\14\05/\00\122\B8\04\04\B8\11\0B\1E\00\123\1E\00\1F2?\00\06\124!\00\1F0?\00\03\125\1E\00\1F4?\00\06\116!\00\1F9>\00\03\127>\00\1F6>\00\06\118!\00\1F8>\00\03\129>\00\1F8>\00\05!20!\00\1F7>\00\02\2221\B9\05\1F0>\00\06\028\01\1F6>\00\03\027\01\1F27\01\06\1227\01\1F5>\00\03\026\01\1F26\01\06\1226\01\1F4>\00\03\026\01\1F26\01\06$28\94\06\0F>\00\01\026\01/28\DE\06\19\1A9\17\00)16\C9\06\0C\E0\06*27\18\00\03\E1\06:d25\18\00\134u\00\1B2H\00\144w\00\0B\8E\00\135w\00\1B1\8F\00\136x\00\1B1x\00\137x\00\1B1x\00\148\ED\00\1B3\18\10k%tid.x\D7\11\05\D8\11\09t\0CN%cta-\00\1F9\CB\07\03\1B0&\00\146\BB\07\F2\01bra.uni LBB6_1;\0A\08\00\10:\F3\03\11s=\00Ed30,5\00\01\0B\04\04W\04$1,\B9\01\B0;\0Asetp.ge.s\1C\004p1,9\00\01(\00\A3;\0A@%p1 brag\00\1B6x\00\132x\00'2:`\00\149x\00\198w\00591,\EC\01\17;\A7\00592,\02\01T;\0Ashl\9A\06493, \00\833;\0Aadd.s\19\00$4,Q\00\01'\00\09f\00\00\87\05\01$\00\95];\0Amul.lo7\00$6,\22\00J2068S\00$7,\BB\00\01*\00\09\A1\00%8,\D0\01\0A\A1\00$9, 
\00\192N\00H100,V\00\129\A2\00\02,\02\123\A1\00\89100+1032$\01E101,\B1\02\09l\005102\0E\01\0Cm\00%3,<\00\02*\00\08o\00\144o\00\153\EE\01\12t\DA\003p4,\90\00\00'\00\01\EC\01\164\EC\01\1B7\EB\01\133\EB\01)3:\B0\00\1F4\EC\01\02?105\ED\01\03?106\EE\01\04E107,\22\00\1B3\E2\00%8,V\00\02+\00\0Am\00\149\E4\00\1D8\F5\01\101e\08\04%\00\0D\F7\01\101\AE\08\04\C8\00\02.\00\08<\01\185\F9\01$ad\C6\00\02\99\06\00\1E\00\02\19\07\00N\00\05\E1\00\024\07*16\DD\00\04,\07\1E1\BF\01\04&\07\04\8B\00(13\83\00\137\DB\00/14.\02\00/15.\02\05&16M\01\0Cp\00%7,=\00\02+\00\08p\00\148p\00\147/\02#le/\02#5,\91\00\00'\00\01/\02\1F5/\02\07\134/\02\194/\02/18/\02\03/19/\02\04/20/\02\05\03\85\07\01\22\00\0B/\02\03\7F\07\141\E6\07)21m\00\142\82\03.22/\02\03}\07\01%\00\0E/\02\03~\07#11\91\02*24\AE\00\1F6*\04\04\131y\07.12\90\01\03s\07\05\\\00*27T\00\04\DD\02J28+46\01\05\E1\05-16\CA\05%6,>\00\02+\00\01\B1\01\166\B1\01\0C\CB\05\135\B1\01\195\B1\01/31\B1\01\03/32\B1\01\04/33\B1\01\05534,\22\00\0B\B1\01535,V\00\02+\00\0Am\00\136\03\01.35\B1\01537,%\00\0E\B1\01538,\C8\00\02.\00\0A\AE\00\1F9\B1\01\05540,\22\00\0B\B1\01941,\\\00)40T\00#42\AE\00,41\B1\01543,\D7\08\0BT\00%4,\22\00\04\02\01\05x\08\01 
\00\03x\08+42\B8\01\136\B8\01*6:\18\00\137\18\00\197\D0\01/45\D0\01\03546,Y\09\0A\CE\00\1F7\D0\01\05\144\FF\00\1D7\D0\01549,V\00\02+\00\09m\00$50\22\01\1E9\D0\01551,%\00\0E\D0\01552,\C8\00\02.\00\09\AE\00/53\D0\01\05554,\22\00\0B\D0\01955,\\\00(54\11\05\04\7F\03/55\81\05\00556,H\0A\0Bp\00\167\1E\01\0Cp\00%8,=\00\02+\00\07p\00$20p\00\1D8\B0\07#7,\91\00\00'\00\01\D0\03\177\9C\09\0C\19\02\138\01\02\198\01\02/59\01\02\03/60\01\02\04/61\01\02\05562,\22\00\0B\01\02563,V\00\02+\00\0Am\00\04\95\08.63\01\02565,%\00\0E\01\02566,\C8\00\02.\00\08=\01\1F1\B1\07\03\02\11\06\00\1E\00\0F\B1\07\00\116W\01+22\DD\00\03\A5\01\1E6\C0\01969,\8B\00(68\83\00\04]\06/690\02\00/700\02\05&71M\01\0Bp\00572,=\00\02+\00\08p\00\04K\01-72\B1\07#8,\91\00\00'\00\010\02\1F80\02\08\1390\02\1990\02/730\02\03/740\02\04/750\02\05576,\22\00\0B0\02577,V\00\02+\00\0Am\00\04\96\08.770\02579,%\00\0E0\02580,\C8\00\02.\00\09\AE\00/811\04\05\138v\01\1E8\91\01983,\\\00*82T\00\04\93\01,83\01\06/85\B2\07\07%9,>\00\02+\00\01\B2\01\179\B2\01\1C1\F6\0D\140\F7\0D\190\B4\01/86\B4\01\03/87\B4\01\04/88\B4\01\05\148\96\00\1D8\B4\01\03,\0D#18\BC\01)89m\00#91\06\01.90\B4\01\00\C6\0C\04%\00\0E\B4\01\03\F9\0D#181\02\1A9`\01/94\B4\01\05\00\DB\0D\04\22\00\0B\B4\01\03\FD\0D\05\\\00*95T\00\04h\0B,96\B4\01\05\FF\0D;48]T\00%9,\22\00\04\02\01\08\B5\07$99u\10\0C\1E\0B\151\B1\0F\1B1\B7\07$12\1A\00\D82:\0Abar.sync 0\F2\03\09\96\0C\01D\02\14n\F2\03\02\ED\0C\00\22\00\02 \06'10?\02\1C4\85\00\04\D3\0D\191\D4\0D8200\A6\08\06'\02/20\0B\06\05\132\9D\0E-20\0B\06\132\9E\0E\132\13\0F\142\9E\0E\04m\00\03-\03\112\A0\0E\08\19\00\09B\0E\09R\00%6,\22\00\04R\00\07w\01#20\EC\11:204R\00\187\C9\01\08\D8\00\1F8\D8\00\06%9,\22\00\0B\D8\00\03X\0E\132\B7\0E)20\FF\023211\D8\00\04\FA\16\05\19\00\182\85\03\09R\00%3,\22\00\04R\00\08\D8\00$13\DC\12\0D\09\04\04n\0D*145\02\0A&\00\04\E3\0B715:O\02\186&\12\074\07#7,\1E\00\1F1\83\12\02/27\84\12\05\186N\02/32\8C\04\02/33@\0C\03/34\8B\01\04\03#\0C\00 
\00\0A\89\01\00\F8\0B\03Q\00\01'\00\09f\00\03\D6\03-36\83\04\03\BD\0B\00\22\00\0D\81\04\00\F4\0B\03\BB\00\01*\00\08\A1\00/40}\04\04\03\16\0C\00 \00\0A{\04\00\10\0C\06U\00\184\BD\07\03\FC\02.42+\08/43\DB\0F\04%44\09\01\0Bh\00$5,9\00\01'\00\07h\00\03\14\13-451\04\222,\87\00\22%ro\06\1720\04\1C8\E1\01\04S\0C\191T\0C\1F4n\06\03/47\E2\01\03\1F4m\03\05\035\0C\1D4k\03\00\0A\0C\03Q\00\01'\00\08f\00\135g\03\1E5e\06\03\CF\0B\1F5c\06\01\00\06\0C\03\BB\00\01*\00\08\A1\00\1F5_\06\05\03(\0C\1D5z\01\03\C9\0B\03U\00\185\92\0A\03\BA\0E\1C5V\06\00\1F\0C\04i\16\0Ae\00\158\06\01\0Be\00$9,9\00\01'\00\07\95\03\00\1D\00\01\94\03\0C\D4\0F$18\B2\01\1E8M\06\03\DF\1A\0F\AD\15\03/61+\08\03\1F6\AD\15\05\03\F2\0B\1D6\AD\15\00\C7\0B\03Q\00\01'\00\09f\00\03\99\02\1E6\AD\15\03]\0B\1F6\AD\15\01\02\B2\0B\13d]\0C\196\AD\15\1F6\AD\15\05\03\B6\0B\1D6Y\01\03W\0B\04U\00\08\AC\15\03?\04\1F7\AA\15\00/71\CA\0B\04%72\09\01\0Bh\00$3,9\00\01'\00\07h\00\03\DF\0A-73\A0\03\223,\87\00\22%rr\13\163\A0\03\1D2\E0\05\04\C2\0B\191\C3\0B\1F7\A3\15\03/75\E2\01\03\1F7\A1\15\05\03\A4\0B\1D7\9F\15\00y\0B\03Q\00\01'\00\09f\00\03?\0F\1E7\9A\15\03>\0B\1F7\98\15\01\00u\0B\03\BB\00\01*\00\08\A1\00/82\E2\01\04\03\97\0B\1D8z\01\03S\0B\03U\00\188`\15\03\9F\00+84\A0\03/85\A0\03\04%86\06\01\0Be\00$7,9\00\01'\00\08D\10\03\D4\0F\118T\15$ubG\16\221,\82\00\00\22\00\09`\16#2,\1F\00\09\B9\14\05\A6\0B\1B8:\08\03\A5\0B%88{\00\07\1B\04\128\1B\04\0D\EF\0D\142=\0C\C020:\0Aret;\0A\0A}\0A\00\00\00\00\00\00", section ".nv_fatbin", align 8 +@__cuda_fatbin_wrapper = internal constant { i32, i32, i8*, i8* } { i32 1180844977, i32 1, i8* getelementptr inbounds ([26033 x i8], [26033 x i8]* @0, i64 0, i64 0), i8* null }, section ".nvFatBinSegment", align 8 +@__cuda_gpubin_handle = internal global i8** null, align 8 +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* bitcast (void (i8*)* @__cuda_module_ctor to void ()*), i8* null }] + +; Function Attrs: noinline optnone uwtable +define 
dso_local void @findRangeK(i64 %height, %struct.knode* %knodesD, i64 %knodes_elem, i64* %currKnodeD, i64* %offsetD, i64* %lastKnodeD, i64* %offset_2D, i32* %startD, i32* %endD, i32* %RecstartD, i32* %ReclenD) #0 { +entry: + %height.addr = alloca i64, align 8 + %knodesD.addr = alloca %struct.knode*, align 8 + %knodes_elem.addr = alloca i64, align 8 + %currKnodeD.addr = alloca i64*, align 8 + %offsetD.addr = alloca i64*, align 8 + %lastKnodeD.addr = alloca i64*, align 8 + %offset_2D.addr = alloca i64*, align 8 + %startD.addr = alloca i32*, align 8 + %endD.addr = alloca i32*, align 8 + %RecstartD.addr = alloca i32*, align 8 + %ReclenD.addr = alloca i32*, align 8 + %grid_dim = alloca %struct.dim3, align 8 + %block_dim = alloca %struct.dim3, align 8 + %shmem_size = alloca i64, align 8 + %stream = alloca i8*, align 8 + %grid_dim.coerce = alloca { i64, i32 }, align 8 + %block_dim.coerce = alloca { i64, i32 }, align 8 + store i64 %height, i64* %height.addr, align 8 + store %struct.knode* %knodesD, %struct.knode** %knodesD.addr, align 8 + store i64 %knodes_elem, i64* %knodes_elem.addr, align 8 + store i64* %currKnodeD, i64** %currKnodeD.addr, align 8 + store i64* %offsetD, i64** %offsetD.addr, align 8 + store i64* %lastKnodeD, i64** %lastKnodeD.addr, align 8 + store i64* %offset_2D, i64** %offset_2D.addr, align 8 + store i32* %startD, i32** %startD.addr, align 8 + store i32* %endD, i32** %endD.addr, align 8 + store i32* %RecstartD, i32** %RecstartD.addr, align 8 + store i32* %ReclenD, i32** %ReclenD.addr, align 8 + %kernel_args = alloca i8*, i64 11, align 16 + %0 = bitcast i64* %height.addr to i8* + %1 = getelementptr i8*, i8** %kernel_args, i32 0 + store i8* %0, i8** %1 + %2 = bitcast %struct.knode** %knodesD.addr to i8* + %3 = getelementptr i8*, i8** %kernel_args, i32 1 + store i8* %2, i8** %3 + %4 = bitcast i64* %knodes_elem.addr to i8* + %5 = getelementptr i8*, i8** %kernel_args, i32 2 + store i8* %4, i8** %5 + %6 = bitcast i64** %currKnodeD.addr to i8* + %7 = 
getelementptr i8*, i8** %kernel_args, i32 3 + store i8* %6, i8** %7 + %8 = bitcast i64** %offsetD.addr to i8* + %9 = getelementptr i8*, i8** %kernel_args, i32 4 + store i8* %8, i8** %9 + %10 = bitcast i64** %lastKnodeD.addr to i8* + %11 = getelementptr i8*, i8** %kernel_args, i32 5 + store i8* %10, i8** %11 + %12 = bitcast i64** %offset_2D.addr to i8* + %13 = getelementptr i8*, i8** %kernel_args, i32 6 + store i8* %12, i8** %13 + %14 = bitcast i32** %startD.addr to i8* + %15 = getelementptr i8*, i8** %kernel_args, i32 7 + store i8* %14, i8** %15 + %16 = bitcast i32** %endD.addr to i8* + %17 = getelementptr i8*, i8** %kernel_args, i32 8 + store i8* %16, i8** %17 + %18 = bitcast i32** %RecstartD.addr to i8* + %19 = getelementptr i8*, i8** %kernel_args, i32 9 + store i8* %18, i8** %19 + %20 = bitcast i32** %ReclenD.addr to i8* + %21 = getelementptr i8*, i8** %kernel_args, i32 10 + store i8* %20, i8** %21 + %22 = call i32 @__cudaPopCallConfiguration(%struct.dim3* %grid_dim, %struct.dim3* %block_dim, i64* %shmem_size, i8** %stream) + %23 = load i64, i64* %shmem_size, align 8 + %24 = load i8*, i8** %stream, align 8 + %25 = bitcast { i64, i32 }* %grid_dim.coerce to i8* + %26 = bitcast %struct.dim3* %grid_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %25, i8* align 8 %26, i64 12, i1 false) + %27 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 0 + %28 = load i64, i64* %27, align 8 + %29 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 1 + %30 = load i32, i32* %29, align 8 + %31 = bitcast { i64, i32 }* %block_dim.coerce to i8* + %32 = bitcast %struct.dim3* %block_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %31, i8* align 8 %32, i64 12, i1 false) + %33 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 0 + %34 = load i64, i64* %33, align 8 + %35 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 1 + %36 = load 
i32, i32* %35, align 8 + %37 = bitcast i8* %24 to %struct.CUstream_st* + %call = call i32 @cudaLaunchKernel(i8* bitcast (void (i64, %struct.knode*, i64, i64*, i64*, i64*, i64*, i32*, i32*, i32*, i32*)* @findRangeK to i8*), i64 %28, i32 %30, i64 %34, i32 %36, i8** %kernel_args, i64 %23, %struct.CUstream_st* %37) + br label %setup.end + +setup.end: ; preds = %entry + ret void +} + +declare dso_local i32 @__cudaPopCallConfiguration(%struct.dim3*, %struct.dim3*, i64*, i8**) + +declare dso_local i32 @cudaLaunchKernel(i8*, i64, i32, i64, i32, i8**, i64, %struct.CUstream_st*) + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #1 + +; Function Attrs: noinline optnone uwtable +define dso_local void @kernel_gpu_cuda_wrapper_2(%struct.knode* %knodes, i64 %knodes_elem, i64 %knodes_mem, i32 %order, i64 %maxheight, i32 %count, i64* %currKnode, i64* %offset, i64* %lastKnode, i64* %offset_2, i32* %start, i32* %end, i32* %recstart, i32* %reclength) #0 { +entry: + %knodes.addr = alloca %struct.knode*, align 8 + %knodes_elem.addr = alloca i64, align 8 + %knodes_mem.addr = alloca i64, align 8 + %order.addr = alloca i32, align 4 + %maxheight.addr = alloca i64, align 8 + %count.addr = alloca i32, align 4 + %currKnode.addr = alloca i64*, align 8 + %offset.addr = alloca i64*, align 8 + %lastKnode.addr = alloca i64*, align 8 + %offset_2.addr = alloca i64*, align 8 + %start.addr = alloca i32*, align 8 + %end.addr = alloca i32*, align 8 + %recstart.addr = alloca i32*, align 8 + %reclength.addr = alloca i32*, align 8 + %time0 = alloca i64, align 8 + %time1 = alloca i64, align 8 + %time2 = alloca i64, align 8 + %time3 = alloca i64, align 8 + %time4 = alloca i64, align 8 + %time5 = alloca i64, align 8 + %time6 = alloca i64, align 8 + %numBlocks = alloca i32, align 4 + %threadsPerBlock = alloca i32, align 4 + %knodesD = alloca %struct.knode*, align 8 + %currKnodeD = alloca 
i64*, align 8 + %offsetD = alloca i64*, align 8 + %lastKnodeD = alloca i64*, align 8 + %offset_2D = alloca i64*, align 8 + %startD = alloca i32*, align 8 + %endD = alloca i32*, align 8 + %ansDStart = alloca i32*, align 8 + %ansDLength = alloca i32*, align 8 + %agg.tmp = alloca %struct.dim3, align 4 + %agg.tmp54 = alloca %struct.dim3, align 4 + %agg.tmp.coerce = alloca { i64, i32 }, align 4 + %agg.tmp54.coerce = alloca { i64, i32 }, align 4 + store %struct.knode* %knodes, %struct.knode** %knodes.addr, align 8 + store i64 %knodes_elem, i64* %knodes_elem.addr, align 8 + store i64 %knodes_mem, i64* %knodes_mem.addr, align 8 + store i32 %order, i32* %order.addr, align 4 + store i64 %maxheight, i64* %maxheight.addr, align 8 + store i32 %count, i32* %count.addr, align 4 + store i64* %currKnode, i64** %currKnode.addr, align 8 + store i64* %offset, i64** %offset.addr, align 8 + store i64* %lastKnode, i64** %lastKnode.addr, align 8 + store i64* %offset_2, i64** %offset_2.addr, align 8 + store i32* %start, i32** %start.addr, align 8 + store i32* %end, i32** %end.addr, align 8 + store i32* %recstart, i32** %recstart.addr, align 8 + store i32* %reclength, i32** %reclength.addr, align 8 + %call = call i64 @get_time() + store i64 %call, i64* %time0, align 8 + %call1 = call i32 @cudaThreadSynchronize() + %0 = load i32, i32* %count.addr, align 4 + store i32 %0, i32* %numBlocks, align 4 + %1 = load i32, i32* %order.addr, align 4 + %cmp = icmp slt i32 %1, 1024 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + %2 = load i32, i32* %order.addr, align 4 + br label %cond.end + +cond.false: ; preds = %entry + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ %2, %cond.true ], [ 1024, %cond.false ] + store i32 %cond, i32* %threadsPerBlock, align 4 + %3 = load i32, i32* %numBlocks, align 4 + %4 = load i32, i32* %threadsPerBlock, align 4 + %call2 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([75 x i8], [75 x i8]* @.str, i64 0, i64 0), i32 %3, i32 %4) + %call3 = call i64 @get_time() + store i64 %call3, i64* %time1, align 8 + %5 = bitcast %struct.knode** %knodesD to i8** + %6 = load i64, i64* %knodes_mem.addr, align 8 + %call4 = call i32 @cudaMalloc(i8** %5, i64 %6) + call void @checkCUDAError(i8* getelementptr inbounds ([21 x i8], [21 x i8]* @.str.1, i64 0, i64 0)) + %7 = bitcast i64** %currKnodeD to i8** + %8 = load i32, i32* %count.addr, align 4 + %conv = sext i32 %8 to i64 + %mul = mul i64 %conv, 8 + %call5 = call i32 @cudaMalloc(i8** %7, i64 %mul) + call void @checkCUDAError(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str.2, i64 0, i64 0)) + %9 = bitcast i64** %offsetD to i8** + %10 = load i32, i32* %count.addr, align 4 + %conv6 = sext i32 %10 to i64 + %mul7 = mul i64 %conv6, 8 + %call8 = call i32 @cudaMalloc(i8** %9, i64 %mul7) + call void @checkCUDAError(i8* getelementptr inbounds ([20 x i8], [20 x i8]* @.str.3, i64 0, i64 0)) + %11 = bitcast i64** %lastKnodeD to i8** + %12 = load i32, i32* %count.addr, align 4 + %conv9 = sext i32 %12 to i64 + %mul10 = mul i64 %conv9, 8 + %call11 = call i32 @cudaMalloc(i8** %11, i64 %mul10) + call void @checkCUDAError(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str.4, i64 0, i64 0)) + %13 = bitcast i64** %offset_2D to i8** + %14 = load i32, i32* %count.addr, align 4 + %conv12 = sext i32 %14 to i64 + %mul13 = mul i64 %conv12, 8 + %call14 = call i32 @cudaMalloc(i8** %13, i64 %mul13) + call void @checkCUDAError(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.5, i64 0, i64 0)) + %15 = bitcast i32** %startD to i8** + %16 = load i32, i32* %count.addr, align 4 + %conv15 = sext i32 %16 to i64 + %mul16 = mul i64 %conv15, 4 + %call17 = call i32 @cudaMalloc(i8** %15, i64 %mul16) + call void @checkCUDAError(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str.6, i64 0, i64 0)) + %17 = bitcast i32** %endD to i8** + %18 = load i32, i32* %count.addr, align 4 + 
%conv18 = sext i32 %18 to i64 + %mul19 = mul i64 %conv18, 4 + %call20 = call i32 @cudaMalloc(i8** %17, i64 %mul19) + call void @checkCUDAError(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.7, i64 0, i64 0)) + %19 = bitcast i32** %ansDStart to i8** + %20 = load i32, i32* %count.addr, align 4 + %conv21 = sext i32 %20 to i64 + %mul22 = mul i64 %conv21, 4 + %call23 = call i32 @cudaMalloc(i8** %19, i64 %mul22) + call void @checkCUDAError(i8* getelementptr inbounds ([21 x i8], [21 x i8]* @.str.8, i64 0, i64 0)) + %21 = bitcast i32** %ansDLength to i8** + %22 = load i32, i32* %count.addr, align 4 + %conv24 = sext i32 %22 to i64 + %mul25 = mul i64 %conv24, 4 + %call26 = call i32 @cudaMalloc(i8** %21, i64 %mul25) + call void @checkCUDAError(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.9, i64 0, i64 0)) + %call27 = call i64 @get_time() + store i64 %call27, i64* %time2, align 8 + %23 = load %struct.knode*, %struct.knode** %knodesD, align 8 + %24 = bitcast %struct.knode* %23 to i8* + %25 = load %struct.knode*, %struct.knode** %knodes.addr, align 8 + %26 = bitcast %struct.knode* %25 to i8* + %27 = load i64, i64* %knodes_mem.addr, align 8 + %call28 = call i32 @cudaMemcpy(i8* %24, i8* %26, i64 %27, i32 1) + call void @checkCUDAError(i8* getelementptr inbounds ([27 x i8], [27 x i8]* @.str.10, i64 0, i64 0)) + %28 = load i64*, i64** %currKnodeD, align 8 + %29 = bitcast i64* %28 to i8* + %30 = load i64*, i64** %currKnode.addr, align 8 + %31 = bitcast i64* %30 to i8* + %32 = load i32, i32* %count.addr, align 4 + %conv29 = sext i32 %32 to i64 + %mul30 = mul i64 %conv29, 8 + %call31 = call i32 @cudaMemcpy(i8* %29, i8* %31, i64 %mul30, i32 1) + call void @checkCUDAError(i8* getelementptr inbounds ([33 x i8], [33 x i8]* @.str.11, i64 0, i64 0)) + %33 = load i64*, i64** %offsetD, align 8 + %34 = bitcast i64* %33 to i8* + %35 = load i64*, i64** %offset.addr, align 8 + %36 = bitcast i64* %35 to i8* + %37 = load i32, i32* %count.addr, align 4 + %conv32 = sext i32 %37 to 
i64 + %mul33 = mul i64 %conv32, 8 + %call34 = call i32 @cudaMemcpy(i8* %34, i8* %36, i64 %mul33, i32 1) + call void @checkCUDAError(i8* getelementptr inbounds ([30 x i8], [30 x i8]* @.str.12, i64 0, i64 0)) + %38 = load i64*, i64** %lastKnodeD, align 8 + %39 = bitcast i64* %38 to i8* + %40 = load i64*, i64** %lastKnode.addr, align 8 + %41 = bitcast i64* %40 to i8* + %42 = load i32, i32* %count.addr, align 4 + %conv35 = sext i32 %42 to i64 + %mul36 = mul i64 %conv35, 8 + %call37 = call i32 @cudaMemcpy(i8* %39, i8* %41, i64 %mul36, i32 1) + call void @checkCUDAError(i8* getelementptr inbounds ([33 x i8], [33 x i8]* @.str.13, i64 0, i64 0)) + %43 = load i64*, i64** %offset_2D, align 8 + %44 = bitcast i64* %43 to i8* + %45 = load i64*, i64** %offset_2.addr, align 8 + %46 = bitcast i64* %45 to i8* + %47 = load i32, i32* %count.addr, align 4 + %conv38 = sext i32 %47 to i64 + %mul39 = mul i64 %conv38, 8 + %call40 = call i32 @cudaMemcpy(i8* %44, i8* %46, i64 %mul39, i32 1) + call void @checkCUDAError(i8* getelementptr inbounds ([32 x i8], [32 x i8]* @.str.14, i64 0, i64 0)) + %48 = load i32*, i32** %startD, align 8 + %49 = bitcast i32* %48 to i8* + %50 = load i32*, i32** %start.addr, align 8 + %51 = bitcast i32* %50 to i8* + %52 = load i32, i32* %count.addr, align 4 + %conv41 = sext i32 %52 to i64 + %mul42 = mul i64 %conv41, 4 + %call43 = call i32 @cudaMemcpy(i8* %49, i8* %51, i64 %mul42, i32 1) + call void @checkCUDAError(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str.15, i64 0, i64 0)) + %53 = load i32*, i32** %endD, align 8 + %54 = bitcast i32* %53 to i8* + %55 = load i32*, i32** %end.addr, align 8 + %56 = bitcast i32* %55 to i8* + %57 = load i32, i32* %count.addr, align 4 + %conv44 = sext i32 %57 to i64 + %mul45 = mul i64 %conv44, 4 + %call46 = call i32 @cudaMemcpy(i8* %54, i8* %56, i64 %mul45, i32 1) + call void @checkCUDAError(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.16, i64 0, i64 0)) + %58 = load i32*, i32** %ansDStart, align 8 + %59 = 
bitcast i32* %58 to i8* + %60 = load i32*, i32** %recstart.addr, align 8 + %61 = bitcast i32* %60 to i8* + %62 = load i32, i32* %count.addr, align 4 + %conv47 = sext i32 %62 to i64 + %mul48 = mul i64 %conv47, 4 + %call49 = call i32 @cudaMemcpy(i8* %59, i8* %61, i64 %mul48, i32 1) + call void @checkCUDAError(i8* getelementptr inbounds ([21 x i8], [21 x i8]* @.str.17, i64 0, i64 0)) + %63 = load i32*, i32** %ansDLength, align 8 + %64 = bitcast i32* %63 to i8* + %65 = load i32*, i32** %reclength.addr, align 8 + %66 = bitcast i32* %65 to i8* + %67 = load i32, i32* %count.addr, align 4 + %conv50 = sext i32 %67 to i64 + %mul51 = mul i64 %conv50, 4 + %call52 = call i32 @cudaMemcpy(i8* %64, i8* %66, i64 %mul51, i32 1) + call void @checkCUDAError(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.18, i64 0, i64 0)) + %call53 = call i64 @get_time() + store i64 %call53, i64* %time3, align 8 + %68 = load i32, i32* %numBlocks, align 4 + call void @_ZN4dim3C2Ejjj(%struct.dim3* %agg.tmp, i32 %68, i32 1, i32 1) + %69 = load i32, i32* %threadsPerBlock, align 4 + call void @_ZN4dim3C2Ejjj(%struct.dim3* %agg.tmp54, i32 %69, i32 1, i32 1) + %70 = bitcast { i64, i32 }* %agg.tmp.coerce to i8* + %71 = bitcast %struct.dim3* %agg.tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %70, i8* align 4 %71, i64 12, i1 false) + %72 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 0 + %73 = load i64, i64* %72, align 4 + %74 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 1 + %75 = load i32, i32* %74, align 4 + %76 = bitcast { i64, i32 }* %agg.tmp54.coerce to i8* + %77 = bitcast %struct.dim3* %agg.tmp54 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %76, i8* align 4 %77, i64 12, i1 false) + %78 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp54.coerce, i32 0, i32 0 + %79 = load i64, i64* %78, align 4 + %80 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp54.coerce, i32 0, i32 1 + %81 
= load i32, i32* %80, align 4 + %call55 = call i32 @__cudaPushCallConfiguration(i64 %73, i32 %75, i64 %79, i32 %81, i64 0, i8* null) + %tobool = icmp ne i32 %call55, 0 + br i1 %tobool, label %kcall.end, label %kcall.configok + +kcall.configok: ; preds = %cond.end + %82 = load i64, i64* %maxheight.addr, align 8 + %83 = load %struct.knode*, %struct.knode** %knodesD, align 8 + %84 = load i64, i64* %knodes_elem.addr, align 8 + %85 = load i64*, i64** %currKnodeD, align 8 + %86 = load i64*, i64** %offsetD, align 8 + %87 = load i64*, i64** %lastKnodeD, align 8 + %88 = load i64*, i64** %offset_2D, align 8 + %89 = load i32*, i32** %startD, align 8 + %90 = load i32*, i32** %endD, align 8 + %91 = load i32*, i32** %ansDStart, align 8 + %92 = load i32*, i32** %ansDLength, align 8 + call void @findRangeK(i64 %82, %struct.knode* %83, i64 %84, i64* %85, i64* %86, i64* %87, i64* %88, i32* %89, i32* %90, i32* %91, i32* %92) + br label %kcall.end + +kcall.end: ; preds = %kcall.configok, %cond.end + %call56 = call i32 @cudaThreadSynchronize() + call void @checkCUDAError(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str.19, i64 0, i64 0)) + %call57 = call i64 @get_time() + store i64 %call57, i64* %time4, align 8 + %93 = load i32*, i32** %recstart.addr, align 8 + %94 = bitcast i32* %93 to i8* + %95 = load i32*, i32** %ansDStart, align 8 + %96 = bitcast i32* %95 to i8* + %97 = load i32, i32* %count.addr, align 4 + %conv58 = sext i32 %97 to i64 + %mul59 = mul i64 %conv58, 4 + %call60 = call i32 @cudaMemcpy(i8* %94, i8* %96, i64 %mul59, i32 2) + call void @checkCUDAError(i8* getelementptr inbounds ([21 x i8], [21 x i8]* @.str.17, i64 0, i64 0)) + %98 = load i32*, i32** %reclength.addr, align 8 + %99 = bitcast i32* %98 to i8* + %100 = load i32*, i32** %ansDLength, align 8 + %101 = bitcast i32* %100 to i8* + %102 = load i32, i32* %count.addr, align 4 + %conv61 = sext i32 %102 to i64 + %mul62 = mul i64 %conv61, 4 + %call63 = call i32 @cudaMemcpy(i8* %99, i8* %101, i64 %mul62, i32 2) + 
call void @checkCUDAError(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.18, i64 0, i64 0)) + %call64 = call i64 @get_time() + store i64 %call64, i64* %time5, align 8 + %103 = load %struct.knode*, %struct.knode** %knodesD, align 8 + %104 = bitcast %struct.knode* %103 to i8* + %call65 = call i32 @cudaFree(i8* %104) + %105 = load i64*, i64** %currKnodeD, align 8 + %106 = bitcast i64* %105 to i8* + %call66 = call i32 @cudaFree(i8* %106) + %107 = load i64*, i64** %offsetD, align 8 + %108 = bitcast i64* %107 to i8* + %call67 = call i32 @cudaFree(i8* %108) + %109 = load i64*, i64** %lastKnodeD, align 8 + %110 = bitcast i64* %109 to i8* + %call68 = call i32 @cudaFree(i8* %110) + %111 = load i64*, i64** %offset_2D, align 8 + %112 = bitcast i64* %111 to i8* + %call69 = call i32 @cudaFree(i8* %112) + %113 = load i32*, i32** %startD, align 8 + %114 = bitcast i32* %113 to i8* + %call70 = call i32 @cudaFree(i8* %114) + %115 = load i32*, i32** %endD, align 8 + %116 = bitcast i32* %115 to i8* + %call71 = call i32 @cudaFree(i8* %116) + %117 = load i32*, i32** %ansDStart, align 8 + %118 = bitcast i32* %117 to i8* + %call72 = call i32 @cudaFree(i8* %118) + %119 = load i32*, i32** %ansDLength, align 8 + %120 = bitcast i32* %119 to i8* + %call73 = call i32 @cudaFree(i8* %120) + %call74 = call i64 @get_time() + store i64 %call74, i64* %time6, align 8 + %call75 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([52 x i8], [52 x i8]* @.str.20, i64 0, i64 0)) + %121 = load i64, i64* %time1, align 8 + %122 = load i64, i64* %time0, align 8 + %sub = sub nsw i64 %121, %122 + %conv76 = sitofp i64 %sub to float + %div = fdiv float %conv76, 1.000000e+06 + %conv77 = fpext float %div to double + %123 = load i64, i64* %time1, align 8 + %124 = load i64, i64* %time0, align 8 + %sub78 = sub nsw i64 %123, %124 + %conv79 = sitofp i64 %sub78 to float + %125 = load i64, i64* %time6, align 8 + %126 = load i64, i64* %time0, align 8 + %sub80 = sub nsw i64 %125, %126 + %conv81 = sitofp i64 %sub80 to float + %div82 = fdiv float %conv79, %conv81 + %mul83 = fmul contract float %div82, 1.000000e+02 + %conv84 = fpext float %mul83 to double + %call85 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([54 x i8], [54 x i8]* @.str.21, i64 0, i64 0), double %conv77, double %conv84) + %127 = load i64, i64* %time2, align 8 + %128 = load i64, i64* %time1, align 8 + %sub86 = sub nsw i64 %127, %128 + %conv87 = sitofp i64 %sub86 to float + %div88 = fdiv float %conv87, 1.000000e+06 + %conv89 = fpext float %div88 to double + %129 = load i64, i64* %time2, align 8 + %130 = load i64, i64* %time1, align 8 + %sub90 = sub nsw i64 %129, %130 + %conv91 = sitofp i64 %sub90 to float + %131 = load i64, i64* %time6, align 8 + %132 = load i64, i64* %time0, align 8 + %sub92 = sub nsw i64 %131, %132 + %conv93 = sitofp i64 %sub92 to float + %div94 = fdiv float %conv91, %conv93 + %mul95 = fmul contract float %div94, 1.000000e+02 + %conv96 = fpext float %mul95 to double + %call97 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([37 x i8], [37 x i8]* @.str.22, i64 0, i64 0), double %conv89, double %conv96) + %133 = load i64, i64* %time3, align 8 + %134 = load i64, i64* %time2, align 8 + %sub98 = sub nsw i64 %133, %134 + %conv99 = sitofp i64 %sub98 to float + %div100 = fdiv float %conv99, 1.000000e+06 + %conv101 = fpext float %div100 to double + %135 = load i64, i64* %time3, align 8 + %136 = load i64, i64* %time2, align 8 + %sub102 = sub nsw i64 %135, %136 + %conv103 = sitofp i64 %sub102 to float + %137 = load i64, i64* %time6, align 8 + %138 = load i64, i64* %time0, align 8 + %sub104 = sub nsw i64 %137, %138 + %conv105 = sitofp i64 %sub104 to float + %div106 = fdiv float %conv103, %conv105 + %mul107 = fmul contract float %div106, 1.000000e+02 + %conv108 = fpext float %mul107 to double + %call109 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([41 x i8], [41 x i8]* @.str.23, i64 0, i64 0), double %conv101, double %conv108) + %139 = load i64, i64* %time4, align 8 + %140 = load i64, i64* %time3, align 8 + %sub110 = sub nsw i64 %139, %140 + %conv111 = sitofp i64 %sub110 to float + %div112 = fdiv float %conv111, 1.000000e+06 + %conv113 = fpext float %div112 to double + %141 = load i64, i64* %time4, align 8 + %142 = load i64, i64* %time3, align 8 + %sub114 = sub nsw i64 %141, %142 + %conv115 = sitofp i64 %sub114 to float + %143 = load i64, i64* %time6, align 8 + %144 = load i64, i64* %time0, align 8 + %sub116 = sub nsw i64 %143, %144 + %conv117 = sitofp i64 %sub116 to float + %div118 = fdiv float %conv115, %conv117 + %mul119 = fmul contract float %div118, 1.000000e+02 + %conv120 = fpext float %mul119 to double + %call121 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.str.24, i64 0, i64 0), double %conv113, double %conv120) + %145 = load i64, i64* %time5, align 8 + %146 = load i64, i64* %time4, align 8 + %sub122 = sub nsw i64 %145, %146 + %conv123 = sitofp i64 %sub122 to float + %div124 = fdiv float %conv123, 1.000000e+06 + %conv125 = fpext float %div124 to double + %147 = load i64, i64* %time5, align 8 + %148 = load i64, i64* %time4, align 8 + %sub126 = sub nsw i64 %147, %148 + %conv127 = sitofp i64 %sub126 to float + %149 = load i64, i64* %time6, align 8 + %150 = load i64, i64* %time0, align 8 + %sub128 = sub nsw i64 %149, %150 + %conv129 = sitofp i64 %sub128 to float + %div130 = fdiv float %conv127, %conv129 + %mul131 = fmul contract float %div130, 1.000000e+02 + %conv132 = fpext float %mul131 to double + %call133 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str.25, i64 0, i64 0), double %conv125, double %conv132) + %151 = load i64, i64* %time6, align 8 + %152 = load i64, i64* %time5, align 8 + %sub134 = sub nsw i64 %151, %152 + %conv135 = sitofp i64 %sub134 to float + %div136 = fdiv float %conv135, 1.000000e+06 + %conv137 = fpext float %div136 to double + %153 = load i64, i64* %time6, align 8 + %154 = load i64, i64* %time5, align 8 + %sub138 = sub nsw i64 %153, %154 + %conv139 = sitofp i64 %sub138 to float + %155 = load i64, i64* %time6, align 8 + %156 = load i64, i64* %time0, align 8 + %sub140 = sub nsw i64 %155, %156 + %conv141 = sitofp i64 %sub140 to float + %div142 = fdiv float %conv139, %conv141 + %mul143 = fmul contract float %div142, 1.000000e+02 + %conv144 = fpext float %mul143 to double + %call145 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([37 x i8], [37 x i8]* @.str.26, i64 0, i64 0), double %conv137, double %conv144) + %call146 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str.27, i64 0, i64 0)) + %157 = load i64, i64* %time6, align 8 + %158 = load i64, i64* %time0, align 8 + %sub147 = sub nsw i64 %157, %158 + %conv148 = sitofp i64 %sub147 to float + %div149 = fdiv float %conv148, 1.000000e+06 + %conv150 = fpext float %div149 to double + %call151 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str.28, i64 0, i64 0), double %conv150) + ret void +} + +declare dso_local i64 @get_time() #2 + +declare dso_local i32 @cudaThreadSynchronize() #2 + +declare dso_local i32 @printf(i8*, ...) #2 + +declare dso_local i32 @cudaMalloc(i8**, i64) #2 + +declare dso_local void @checkCUDAError(i8*) #2 + +declare dso_local i32 @cudaMemcpy(i8*, i8*, i64, i32) #2 + +declare dso_local i32 @__cudaPushCallConfiguration(i64, i32, i64, i32, i64, i8*) #2 + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN4dim3C2Ejjj(%struct.dim3* %this, i32 %vx, i32 %vy, i32 %vz) unnamed_addr #3 comdat align 2 { +entry: + %this.addr = alloca %struct.dim3*, align 8 + %vx.addr = alloca i32, align 4 + %vy.addr = alloca i32, align 4 + %vz.addr = alloca i32, align 4 + store %struct.dim3* %this, %struct.dim3** %this.addr, align 8 + store i32 %vx, i32* %vx.addr, align 4 + store i32 %vy, i32* %vy.addr, align 4 + store i32 %vz, i32* %vz.addr, align 4 + %this1 = load %struct.dim3*, %struct.dim3** %this.addr, align 8 + %x = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 0 + %0 = load i32, i32* %vx.addr, align 4 + store i32 %0, i32* %x, align 4 + %y = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 1 + %1 = load i32, i32* %vy.addr, align 4 + store i32 %1, i32* %y, align 4 + %z = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 2 + %2 = load i32, i32* %vz.addr, align 4 + store i32 %2, i32* %z, align 4 + ret void +} + +declare dso_local i32 @cudaFree(i8*) #2 + +define internal void 
@__cuda_register_globals(i8** %0) { +entry: + %1 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void (i64, %struct.knode*, i64, i64*, i64*, i64*, i64*, i32*, i32*, i32*, i32*)* @findRangeK to i8*), i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str.19, i64 0, i64 0), i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str.19, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null) + ret void +} + +declare dso_local i32 @__cudaRegisterFunction(i8**, i8*, i8*, i8*, i32, i8*, i8*, i8*, i8*, i32*) + +declare dso_local i32 @__cudaRegisterVar(i8**, i8*, i8*, i8*, i32, i32, i32, i32) + +declare dso_local i8** @__cudaRegisterFatBinary(i8*) + +define internal void @__cuda_module_ctor(i8* %0) { +entry: + %1 = call i8** @__cudaRegisterFatBinary(i8* bitcast ({ i32, i32, i8*, i8* }* @__cuda_fatbin_wrapper to i8*)) + store i8** %1, i8*** @__cuda_gpubin_handle, align 8 + call void @__cuda_register_globals(i8** %1) + call void @__cudaRegisterFatBinaryEnd(i8** %1) + %2 = call i32 @atexit(void (i8*)* @__cuda_module_dtor) + ret void +} + +declare dso_local void @__cudaRegisterFatBinaryEnd(i8**) + +declare dso_local void @__cudaUnregisterFatBinary(i8**) + +define internal void @__cuda_module_dtor(i8* %0) { +entry: + %1 = load i8**, i8*** @__cuda_gpubin_handle, align 8 + call void @__cudaUnregisterFatBinary(i8** %1) + ret void +} + +declare dso_local i32 @atexit(void (i8*)*) + +attributes #0 = { noinline optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind willreturn } +attributes #2 = { 
"correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.module.flags = !{!0, !1} +!llvm.ident = !{!2} + +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"} diff --git a/examples/btree/main.c b/examples/btree/main.c new file mode 100644 index 0000000..8ddf6d4 --- /dev/null +++ b/examples/btree/main.c @@ -0,0 +1,2192 @@ +// # ifdef __cplusplus +// extern "C" { +// # endif + +//========================================================================================================================================================================================================200 +//======================================================================================================================================================150 +//====================================================================================================100 +//==================================================50 + 
+//========================================================================================================================================================================================================200 +// INFORMATION +//========================================================================================================================================================================================================200 + +//======================================================================================================================================================150 +// UPDATE +//======================================================================================================================================================150 + +// 2009; Amittai Aviram; entire code written in C; +// 2010; Jordan Fix and Andrew Wilkes; code converted to CUDA; +// 2011.10; Lukasz G. Szafaryn; code converted to portable form, to C, OpenMP, +// CUDA, PGI versions; 2011.12; Lukasz G. Szafaryn; Split different versions for +// Rodinia. 2011.12; Lukasz G. Szafaryn; code converted to OpenCL; 2012.10; Ke +// Wang; Change it to non-interactive mode. Use command option read command from +// file. And also add output for easy verification among different platforms and +// devices.Merged into Rodinia main distribution 2.2. 
+//======================================================================================================================================================150 +// DESCRIPTION +//======================================================================================================================================================150 + +// Description + +//======================================================================================================================================================150 +// USE +//======================================================================================================================================================150 + +// EXAMPLE: +// ./b+tree file ./input/mil.txt command ./command.txt +// ...then enter any of the following commands after the prompt > : +// f -- Find the value under key +// p -- Print the path from the root to key k and its associated value +// t -- Print the B+ tree +// l -- Print the keys of the leaves (bottom row of the tree) +// v -- Toggle output of pointer addresses ("verbose") in tree and leaves. +// k -- Run bundled queries on the CPU and GPU (B+Tree) (Selects random +// values for each search) j -- Run a range search of bundled +// queries on the CPU and GPU (B+Tree) with the range of each search of size +// x -- Run a single search for value z on the GPU and CPU +// y -- Run a single range search for range a-b on the GPU and CPU +// q -- Quit. (Or use Ctl-D.) 
+ +//======================================================================================================================================================150 +// END +//======================================================================================================================================================150 + +//========================================================================================================================================================================================================200 +// DEFINE/INCLUDE +//========================================================================================================================================================================================================200 + +//======================================================================================================================================================150 +// LIBRARIES +//======================================================================================================================================================150 + +#include // (in directory known to compiler) needed by INT_MIN, INT_MAX +#include // (in directory known to compiler) needed by printf, stderr +// #include // +// (in directory known to compiler) needed by ??? 
+#include // (in directory known to compiler) needed by log, pow +#include // (in directory known to compiler) needed by memset + +//======================================================================================================================================================150 +// COMMON +//======================================================================================================================================================150 + +#include "./common.h" // (in directory provided here) + +//======================================================================================================================================================150 +// DEFINE +//======================================================================================================================================================150 + +//======================================================================================================================================================150 +// UTILITIES +//======================================================================================================================================================150 + +#include "./util/num/num.h" // (in directory provided here) +#include "./util/timer/timer.h" // (in directory provided here) + +//======================================================================================================================================================150 +// KERNEL HEADERS +//======================================================================================================================================================150 + +#include "./kernel/kernel_gpu_cuda_wrapper.h" // (in directory provided here) +#include "./kernel/kernel_gpu_cuda_wrapper_2.h" // (in directory provided here) + +//======================================================================================================================================================150 +// HEADER 
+//======================================================================================================================================================150 + +#include "./main.h" // (in directory provided here) + +//======================================================================================================================================================150 +// END +//======================================================================================================================================================150 + +//========================================================================================================================================================================================================200 +// VARIABLES +//========================================================================================================================================================================================================200 + +// general variables +knode *knodes; +record *krecords; +char *mem; +long freeptr; +long malloc_size; +long size; +long maxheight; + +/* The order determines the maximum and minimum + * number of entries (keys and pointers) in any + * node. Every node has at most order - 1 keys and + * at least (roughly speaking) half that number. + * Every leaf has as many pointers to data as keys, + * and every internal node has one more pointer + * to a subtree than the number of keys. + * This global variable is initialized to the + * default value. + */ +int order = DEFAULT_ORDER; + +/* The queue is used to print the tree in + * level order, starting from the root + * printing each entire rank on a separate + * line, finishing with the leaves. + */ +node *queue = NULL; + +/* The user can toggle on and off the "verbose" + * property, which causes the pointer addresses + * to be printed out in hexadecimal notation + * next to their corresponding keys. 
+ */ +bool verbose_output = false; + +//========================================================================================================================================================================================================200 +// FUNCTIONS +//========================================================================================================================================================================================================200 + +//======================================================================================================================================================150 +// Components +//======================================================================================================================================================150 + +void list_init(list_t *l, int32_t (*compare)(const void *key, const void *with), + void (*datum_delete)(void *)) { + l->head = l->tail = NULL; + l->length = 0; + l->compare = compare; + l->datum_delete = datum_delete; +} + +void list_delete(list_t *l) { + + list_item_t *li, *del; + + for (li = l->head; li;) { + + del = li; + li = li->next; + list_item_delete(del, l->datum_delete); + } + + l->head = l->tail = NULL; + l->length = 0; +} + +void list_reset(list_t *l) { list_delete(l); } + +void list_insert_item_head(list_t *l, list_item_t *i) { + if (l->head) { + i->next = l->head; + l->head->pred = i; + l->head = i; + l->head->pred = NULL; + } else { + l->head = l->tail = i; + i->next = i->pred = NULL; + } + l->length++; +} + +void list_insert_item_tail(list_t *l, list_item_t *i) { + if (l->head) { + l->tail->next = i; + i->pred = l->tail; + i->next = NULL; + l->tail = i; + } else { + l->head = l->tail = i; + i->next = i->pred = NULL; + } + l->length++; +} + +void list_insert_item_before(list_t *l, list_item_t *next, list_item_t *i) { + /* Assume next is actually in the list! */ + /* If it's not, we may lose the list. 
*/ + if (l->head == next) { + i->next = next; + i->pred = NULL; + l->head = i; + next->pred = i; + } else { + i->next = next; + i->pred = next->pred; + next->pred->next = i; + next->pred = i; + } + l->length++; +} + +void list_insert_item_after(list_t *l, list_item_t *pred, list_item_t *i) { + /* Assume pred is actually in the list! */ + /* If it's not, we may lose the list. */ + if (l->tail == pred) { + i->pred = pred; + i->next = NULL; + l->tail = i; + pred->next = i; + } else { + i->pred = pred; + i->next = pred->next; + pred->next->pred = i; + pred->next = i; + } + l->length++; +} + +void list_insert_item_sorted(list_t *l, list_item_t *i) { + list_item_t *itr; + + if (l->head) { + for (itr = l->head; itr && l->compare(list_item_get_datum(i), + list_item_get_datum(itr)) < 0; + itr = itr->next) + ; + if (itr) { + i->next = itr; + i->pred = itr->pred; + itr->pred = i; + i->pred->next = i; + } else { + l->tail->next = i; + i->pred = l->tail; + i->next = NULL; + l->tail = i; + } + } else { + l->head = l->tail = i; + i->pred = i->next = NULL; + } + l->length++; +} + +void list_insert_head(list_t *l, void *v) { + list_item_t *i; + i = (list_item_t *)malloc(sizeof(*i)); + list_item_init(i, v); + if (l->head) { + i->next = l->head; + l->head->pred = i; + l->head = i; + l->head->pred = NULL; + } else { + l->head = l->tail = i; + i->next = i->pred = NULL; + } + l->length++; +} + +void list_insert_tail(list_t *l, void *v) { + list_item_t *i; + + i = (list_item_t *)malloc(sizeof(*i)); + list_item_init(i, v); + if (l->head) { + l->tail->next = i; + i->pred = l->tail; + i->next = NULL; + l->tail = i; + } else { + l->head = l->tail = i; + i->next = i->pred = NULL; + } + l->length++; +} + +void list_insert_before(list_t *l, list_item_t *next, void *v) { + list_item_t *i; + + i = (list_item_t *)malloc(sizeof(*i)); + list_item_init(i, v); + + /* Assume next is actually in the list! */ + /* If it's not, we may lose the list. 
*/ + if (l->head == next) { + i->next = next; + i->pred = NULL; + l->head = i; + next->pred = i; + } else { + i->next = next; + i->pred = next->pred; + next->pred->next = i; + next->pred = i; + } + l->length++; +} + +void list_insert_after(list_t *l, list_item_t *pred, void *v) { + list_item_t *i; + + i = (list_item_t *)malloc(sizeof(*i)); + list_item_init(i, v); + + /* Assume pred is actually in the list! */ + /* If it's not, we may lose the list. */ + if (l->tail == pred) { + i->pred = pred; + i->next = NULL; + l->tail = i; + pred->next = i; + } else { + i->pred = pred; + i->next = pred->next; + pred->next->pred = i; + pred->next = i; + } + l->length++; +} + +void list_insert_sorted(list_t *l, void *v) { + list_item_t *itr; + list_item_t *i; + + i = (list_item_t *)malloc(sizeof(*i)); + list_item_init(i, v); + + if (l->head) { + for (itr = l->head; itr && l->compare(list_item_get_datum(i), + list_item_get_datum(itr)) < 0; + itr = itr->next) + ; + if (itr) { + i->next = itr; + i->pred = itr->pred; + itr->pred = i; + i->pred->next = i; + } else { + l->tail->next = i; + i->pred = l->tail; + i->next = NULL; + l->tail = i; + } + } else { + l->head = l->tail = i; + i->pred = i->next = NULL; + } + l->length++; +} + +void list_remove_item(list_t *l, list_item_t *i) { + if (i == l->head) { + l->head = l->head->next; + if (l->head) + l->head->pred = NULL; + else + l->tail = NULL; + } else if (i == l->tail) { + l->tail = l->tail->pred; + l->tail->next = NULL; + } else { + i->pred->next = i->next; + i->next->pred = i->pred; + } + l->length--; + list_item_delete(i, l->datum_delete); +} + +void list_remove_head(list_t *l) { list_remove_item(l, l->head); } + +void list_remove_tail(list_t *l) { list_remove_item(l, l->tail); } + +list_item_t *list_find_item(list_t *l, void *datum) { + list_item_t *li; + + for (li = l->head; li && l->compare(datum, list_item_get_datum(li)); + li = li->next) + ; + + return li; +} + +list_item_t *list_get_head_item(list_t *l) { return l->head; } + 
+list_item_t *list_get_tail_item(list_t *l) { return l->tail; } + +void *list_find(list_t *l, void *datum) { + list_item_t *li; + + for (li = l->head; li && l->compare(datum, list_item_get_datum(li)); + li = li->next) + ; + + return li ? li->datum : NULL; +} + +void *list_get_head(list_t *l) { return l->head ? l->head->datum : NULL; } + +void *list_get_tail(list_t *l) { return l->tail ? l->tail->datum : NULL; } + +uint32_t list_get_length(list_t *l) { return l->length; } + +bool list_is_empty(list_t *l) { return (l->length == 0); } + +bool list_not_empty(list_t *l) { return (l->length != 0); } + +void list_visit_items(list_t *l, void (*visitor)(void *v)) { + list_item_t *li; + + for (li = l->head; li; li = li->next) + visitor(list_item_get_datum(li)); +} + +void list_item_init(list_item_t *li, void *datum) { + li->pred = li->next = NULL; + li->datum = datum; +} + +void list_item_delete(list_item_t *li, void (*datum_delete)(void *datum)) { + if (datum_delete) { + datum_delete(li->datum); + } + + free(li); +} + +void *list_item_get_datum(list_item_t *li) { return li->datum; } + +void list_iterator_init(list_t *l, list_iterator_t *li) { + *li = l ? l->head : NULL; +} + +void list_iterator_delete(list_iterator_t *li) { *li = NULL; } + +void list_iterator_next(list_iterator_t *li) { + if (*li) + *li = (*li)->next; +} + +void list_iterator_prev(list_iterator_t *li) { + if (*li) + *li = (*li)->pred; +} + +void *list_iterator_get_datum(list_iterator_t *li) { + return *li ? (*li)->datum : NULL; +} + +bool list_iterator_is_valid(list_iterator_t *li) { return (*li != NULL); } + +void list_reverse_iterator_init(list_t *l, list_reverse_iterator_t *li) { + *li = l ? 
l->tail : NULL; +} + +void list_reverse_iterator_delete(list_reverse_iterator_t *li) { *li = NULL; } + +void list_reverse_iterator_next(list_reverse_iterator_t *li) { + if (*li) + *li = (*li)->pred; +} + +void list_reverse_iterator_prev(list_reverse_iterator_t *li) { + if (*li) + *li = (*li)->next; +} + +void *list_reverse_iterator_get_datum(list_reverse_iterator_t *li) { + return *li ? (*li)->datum : NULL; +} + +bool list_reverse_iterator_is_valid(list_reverse_iterator_t *li) { + return (li != NULL); +} + +//======================================================================================================================================================150 +// OUTPUT AND UTILITIES +//======================================================================================================================================================150 + +/* */ +void *kmalloc(int size) { + + // printf("size: %d, current offset: %p\n",size,freeptr); + void *r = (void *)freeptr; + freeptr += size; + if (freeptr > malloc_size + (long)mem) { + printf("Memory Overflow\n"); + exit(1); + } + return r; +} + +// transforms the current B+ Tree into a single, contiguous block of memory to +// be used on the GPU +long transform_to_cuda(node *root, bool verbose) { + + struct timeval one, two; + double time; + gettimeofday(&one, NULL); + long max_nodes = (long)(pow(order, log(size) / log(order / 2.0) - 1) + 1); + malloc_size = size * sizeof(record) + max_nodes * sizeof(knode); + mem = (char *)malloc(malloc_size); + if (mem == NULL) { + printf("Initial malloc error\n"); + exit(1); + } + freeptr = (long)mem; + + krecords = (record *)kmalloc(size * sizeof(record)); + // printf("%d records\n", size); + knodes = (knode *)kmalloc(max_nodes * sizeof(knode)); + // printf("%d knodes\n", max_nodes); + + queue = NULL; + enqueue(root); + node *n; + knode *k; + int i; + long nodeindex = 0; + long recordindex = 0; + long queueindex = 0; + knodes[0].location = nodeindex++; + + while (queue != NULL) { + n = 
dequeue(); + k = &knodes[queueindex]; + k->location = queueindex++; + k->is_leaf = n->is_leaf; + k->num_keys = n->num_keys + 2; + // start at 1 because 0 is set to INT_MIN + k->keys[0] = INT_MIN; + k->keys[k->num_keys - 1] = INT_MAX; + for (i = k->num_keys; i < order; i++) + k->keys[i] = INT_MAX; + if (!k->is_leaf) { + k->indices[0] = nodeindex++; + // if(k->indices[0]>3953){ + // printf("ERROR: %d\n", k->indices[0]); + // } + for (i = 1; i < k->num_keys - 1; i++) { + k->keys[i] = n->keys[i - 1]; + enqueue((node *)n->pointers[i - 1]); + k->indices[i] = nodeindex++; + // if(k->indices[i]>3953){ + // printf("ERROR 1: %d\n", k->indices[i]); + // } + // knodes[nodeindex].location = nodeindex++; + } + // for final point of n + enqueue((node *)n->pointers[i - 1]); + } else { + k->indices[0] = 0; + for (i = 1; i < k->num_keys - 1; i++) { + k->keys[i] = n->keys[i - 1]; + krecords[recordindex].value = ((record *)n->pointers[i - 1])->value; + k->indices[i] = recordindex++; + // if(k->indices[i]>3953){ + // printf("ERROR 2: %d\n", k->indices[i]); + // } + } + } + + k->indices[k->num_keys - 1] = queueindex; + // if(k->indices[k->num_keys-1]>3953){ + // printf("ERROR 3: %d\n", k->indices[k->num_keys-1]); + // } + + if (verbose) { + printf("Successfully created knode with index %d\n", k->location); + printf("Is Leaf: %d, Num Keys: %d\n", k->is_leaf, k->num_keys); + printf("Pointers: "); + for (i = 0; i < k->num_keys; i++) + printf("%d | ", k->indices[i]); + printf("\nKeys: "); + for (i = 0; i < k->num_keys; i++) + printf("%d | ", k->keys[i]); + printf("\n\n"); + } + } + long mem_used = size * sizeof(record) + (nodeindex) * sizeof(knode); + if (verbose) { + for (i = 0; i < size; i++) + printf("%d ", krecords[i].value); + printf("\nNumber of records = %d, sizeof(record)=%d, total=%d\n", size, + sizeof(record), size * sizeof(record)); + printf("Number of knodes = %d, sizeof(knode)=%d, total=%d\n", nodeindex, + sizeof(knode), (nodeindex) * sizeof(knode)); + printf("\nDone 
Transformation. Mem used: %d\n", mem_used); + } + gettimeofday(&two, NULL); + double oneD = one.tv_sec + (double)one.tv_usec * .000001; + double twoD = two.tv_sec + (double)two.tv_usec * .000001; + time = twoD - oneD; + printf("Tree transformation took %f\n", time); + + return mem_used; +} + +/* */ +list_t *findRange(node *root, int start, int end) { + + int i; + node *c = find_leaf(root, start, false); + + if (c == NULL) + return NULL; + + list_t *retList = (list_t *)malloc(sizeof(list_t)); + list_init(retList, NULL, NULL); + + int counter = 0; + bool cont = true; + while (cont && c != 0) { + cont = false; + for (i = 0; i < c->num_keys; i++) { + if (c->keys[i] >= start && c->keys[i] <= end) { + // list_insert_tail(retList,(record *)c->pointers[i]); + counter++; + cont = true; + } else { + cont = false; + break; + } + } + c = (node *)c->pointers[order - 1]; + } + return retList; +} + +/* First message to the user. */ +void usage_1(void) { + + printf("B+ Tree of Order %d.\n", order); + printf("\tAmittai Aviram -- amittai.aviram@yale.edu Version %s\n", Version); + printf("\tfollowing Silberschatz, Korth, Sidarshan, Database Concepts, 5th " + "ed.\n\n"); + printf("To build a B+ tree of a different order, start again and enter the " + "order\n"); + printf("as an integer argument: bpt . "); + printf("3 <= order <=20\n"); + printf("To start with input from a file of newline-delimited integers, start " + "again and enter\n"); + printf("the order followed by the filename: bpt .\n"); +} + +/* Second message to the user. */ +void usage_2(void) { + + printf("Enter any of the following commands after the prompt > :\n"); + printf("\ti -- Insert (an integer) as both key and value).\n"); + printf("\tf -- Find the value under key .\n"); + printf("\tp -- Print the path from the root to key k and its associated " + "value.\n"); + printf("\td -- Delete key and its associated value.\n"); + printf("\tx -- Destroy the whole tree. 
Start again with an empty tree of " + "the same order.\n"); + printf("\tt -- Print the B+ tree.\n"); + printf("\tl -- Print the keys of the leaves (bottom row of the tree).\n"); + printf("\tv -- Toggle output of pointer addresses (\"verbose\") in tree and " + "leaves.\n"); + printf("\tq -- Quit. (Or use Ctl-D.)\n"); + printf("\t? -- Print this help message.\n"); +} + +/* Helper function for printing the tree out. See print_tree. */ +void enqueue(node *new_node) { + node *c; + if (queue == NULL) { + queue = new_node; + queue->next = NULL; + } else { + c = queue; + while (c->next != NULL) { + c = c->next; + } + c->next = new_node; + new_node->next = NULL; + } +} + +/* Helper function for printing the tree out. See print_tree. */ +node *dequeue(void) { + node *n = queue; + queue = queue->next; + n->next = NULL; + return n; +} + +/* Prints the bottom row of keys of the tree (with their respective pointers, if + * the verbose_output flag is set. */ +void print_leaves(node *root) { + int i; + node *c = root; + if (root == NULL) { + printf("Empty tree.\n"); + return; + } + while (!c->is_leaf) + c = (node *)c->pointers[0]; + while (true) { + for (i = 0; i < c->num_keys; i++) { + if (verbose_output) + // printf("%x ", (unsigned int)c->pointers[i]); + printf("%d ", c->keys[i]); + } + if (verbose_output) + // printf("%x ", (unsigned int)c->pointers[order - 1]); + if (c->pointers[order - 1] != NULL) { + printf(" | "); + c = (node *)c->pointers[order - 1]; + } else + break; + } + printf("\n"); +} + +/* Utility function to give the height of the tree, which length in number of + * edges of the path from the root to any leaf. */ +int height(node *root) { + int h = 0; + node *c = root; + while (!c->is_leaf) { + c = (node *)c->pointers[0]; + h++; + } + return h; +} + +/* Utility function to give the length in edges of the path from any node to the + * root. 
*/ +int path_to_root(node *root, node *child) { + int length = 0; + node *c = child; + while (c != root) { + c = c->parent; + length++; + } + return length; +} + +/* Prints the B+ tree in the command line in level (rank) order, with the keys + * in each node and the '|' symbol to separate nodes. With the verbose_output + * flag set. the values of the pointers corresponding to the keys also appear + * next to their respective keys, in hexadecimal notation. */ +void print_tree(node *root) { + + node *n = NULL; + int i = 0; + int rank = 0; + int new_rank = 0; + + if (root == NULL) { + printf("Empty tree.\n"); + return; + } + queue = NULL; + enqueue(root); + while (queue != NULL) { + n = dequeue(); + if (n->parent != NULL && n == n->parent->pointers[0]) { + new_rank = path_to_root(root, n); + if (new_rank != rank) { + rank = new_rank; + printf("\n"); + } + } + if (verbose_output) + printf("(%x)", n); + for (i = 0; i < n->num_keys; i++) { + if (verbose_output) + printf("%x ", n->pointers[i]); + printf("%d ", n->keys[i]); + } + if (!n->is_leaf) + for (i = 0; i <= n->num_keys; i++) + enqueue((node *)n->pointers[i]); + if (verbose_output) { + if (n->is_leaf) + printf("%x ", n->pointers[order - 1]); + else + printf("%x ", n->pointers[n->num_keys]); + } + printf("| "); + } + printf("\n"); +} + +/* Traces the path from the root to a leaf, searching by key. Displays + * information about the path if the verbose flag is set. Returns the leaf + * containing the given key. 
*/ +node *find_leaf(node *root, int key, bool verbose) { + + int i = 0; + node *c = root; + if (c == NULL) { + if (verbose) + printf("Empty tree.\n"); + return c; + } + while (!c->is_leaf) { + if (verbose) { + printf("["); + for (i = 0; i < c->num_keys - 1; i++) + printf("%d ", c->keys[i]); + printf("%d] ", c->keys[i]); + } + i = 0; + while (i < c->num_keys) { + if (key >= c->keys[i]) + i++; + else + break; + } + if (verbose) + printf("%d ->\n", i); + c = (node *)c->pointers[i]; + } + if (verbose) { + printf("Leaf ["); + for (i = 0; i < c->num_keys - 1; i++) + printf("%d ", c->keys[i]); + printf("%d] ->\n", c->keys[i]); + } + return c; +} + +/* Finds and returns the record to which a key refers. */ +record *find(node *root, int key, bool verbose) { + + int i = 0; + node *c = find_leaf(root, key, verbose); + if (c == NULL) + return NULL; + for (i = 0; i < c->num_keys; i++) + if (c->keys[i] == key) + break; + if (i == c->num_keys) + return NULL; + else + return (record *)c->pointers[i]; +} + +/* Finds the appropriate place to split a node that is too big into two. */ +int cut(int length) { + if (length % 2 == 0) + return length / 2; + else + return length / 2 + 1; +} + +//======================================================================================================================================================150 +// INSERTION +//======================================================================================================================================================150 + +/* Creates a new record to hold the value to which a key refers. */ +record *make_record(int value) { + record *new_record = (record *)malloc(sizeof(record)); + if (new_record == NULL) { + perror("Record creation."); + exit(EXIT_FAILURE); + } else { + new_record->value = value; + } + return new_record; +} + +/* Creates a new general node, which can be adapted to serve as either a leaf or + * an internal node. 
*/ +node *make_node(void) { + node *new_node; + new_node = (node *)malloc(sizeof(node)); + if (new_node == NULL) { + perror("Node creation."); + exit(EXIT_FAILURE); + } + new_node->keys = (int *)malloc((order - 1) * sizeof(int)); + if (new_node->keys == NULL) { + perror("New node keys array."); + exit(EXIT_FAILURE); + } + new_node->pointers = (void **)malloc(order * sizeof(void *)); + if (new_node->pointers == NULL) { + perror("New node pointers array."); + exit(EXIT_FAILURE); + } + new_node->is_leaf = false; + new_node->num_keys = 0; + new_node->parent = NULL; + new_node->next = NULL; + return new_node; +} + +/* Creates a new leaf by creating a node and then adapting it appropriately. */ +node *make_leaf(void) { + node *leaf = make_node(); + leaf->is_leaf = true; + return leaf; +} + +/* Helper function used in insert_into_parent to find the index of the parent's + * pointer to the node to the left of the key to be inserted. */ +int get_left_index(node *parent, node *left) { + + int left_index = 0; + while (left_index <= parent->num_keys && parent->pointers[left_index] != left) + left_index++; + return left_index; +} + +/* Inserts a new pointer to a record and its corresponding key into a leaf. + * Returns the altered leaf. */ +node *insert_into_leaf(node *leaf, int key, record *pointer) { + + int i, insertion_point; + + insertion_point = 0; + while (insertion_point < leaf->num_keys && leaf->keys[insertion_point] < key) + insertion_point++; + + for (i = leaf->num_keys; i > insertion_point; i--) { + leaf->keys[i] = leaf->keys[i - 1]; + leaf->pointers[i] = leaf->pointers[i - 1]; + } + leaf->keys[insertion_point] = key; + leaf->pointers[insertion_point] = pointer; + leaf->num_keys++; + return leaf; +} + +/* Inserts a new key and pointer to a new record into a leaf so as to exceed the + * tree's order, causing the leaf to be split in half. 
*/
+node *insert_into_leaf_after_splitting(node *root, node *leaf, int key,
+                                       record *pointer) {
+
+  /* Parameters:
+   *   root    - current root, passed through to insert_into_parent()
+   *   leaf    - the leaf receiving the entry
+   *   key     - key to insert
+   *   pointer - record stored under key
+   * Returns whatever insert_into_parent() returns for the new split.
+   */
+
+  node *new_leaf;
+  int *temp_keys;
+  void **temp_pointers;
+  int insertion_index, split, new_key, i, j;
+
+  new_leaf = make_leaf();
+
+  /* Scratch arrays with room for the leaf's entries plus the new one. */
+  temp_keys = (int *)malloc(order * sizeof(int));
+  if (temp_keys == NULL) {
+    perror("Temporary keys array.");
+    exit(EXIT_FAILURE);
+  }
+
+  temp_pointers = (void **)malloc(order * sizeof(void *));
+  if (temp_pointers == NULL) {
+    perror("Temporary pointers array.");
+    exit(EXIT_FAILURE);
+  }
+
+  /* The bounds test must come first: a node's key array has only
+   * order - 1 slots, so reading keys[insertion_index] before checking the
+   * index would read one element past the end when the new key is the
+   * largest.
+   */
+  insertion_index = 0;
+  while (insertion_index < order - 1 && leaf->keys[insertion_index] < key)
+    insertion_index++;
+
+  /* Copy the existing entries, leaving a hole at insertion_index. */
+  for (i = 0, j = 0; i < leaf->num_keys; i++, j++) {
+    if (j == insertion_index)
+      j++;
+    temp_keys[j] = leaf->keys[i];
+    temp_pointers[j] = leaf->pointers[i];
+  }
+
+  temp_keys[insertion_index] = key;
+  temp_pointers[insertion_index] = pointer;
+
+  leaf->num_keys = 0;
+
+  split = cut(order - 1);
+
+  /* First `split` entries stay in the old leaf ... */
+  for (i = 0; i < split; i++) {
+    leaf->pointers[i] = temp_pointers[i];
+    leaf->keys[i] = temp_keys[i];
+    leaf->num_keys++;
+  }
+
+  /* ... the rest move to the new leaf. */
+  for (i = split, j = 0; i < order; i++, j++) {
+    new_leaf->pointers[j] = temp_pointers[i];
+    new_leaf->keys[j] = temp_keys[i];
+    new_leaf->num_keys++;
+  }
+
+  free(temp_pointers);
+  free(temp_keys);
+
+  /* Slot order - 1 links a leaf to its right sibling: splice the new leaf
+   * in directly after the old one.
+   */
+  new_leaf->pointers[order - 1] = leaf->pointers[order - 1];
+  leaf->pointers[order - 1] = new_leaf;
+
+  /* Clear the unused record slots of both leaves. */
+  for (i = leaf->num_keys; i < order - 1; i++)
+    leaf->pointers[i] = NULL;
+  for (i = new_leaf->num_keys; i < order - 1; i++)
+    new_leaf->pointers[i] = NULL;
+
+  new_leaf->parent = leaf->parent;
+  new_key = new_leaf->keys[0];
+
+  /* The first key of the new leaf becomes the separator in the parent. */
+  return insert_into_parent(root, leaf, new_key, new_leaf);
+}
+
+/* Inserts a new key and pointer to a node into a node into which these can fit
+ * without violating the B+ tree properties.
*/
+node *insert_into_node(node *root, node *n, int left_index, int key,
+                       node *right) {
+
+  int i;
+
+  /* Shift the keys from left_index onwards, and the pointers after
+   * left_index, one slot to the right to make room for key and right.
+   */
+  for (i = n->num_keys; i > left_index; i--) {
+    n->pointers[i + 1] = n->pointers[i];
+    n->keys[i] = n->keys[i - 1];
+  }
+  n->pointers[left_index + 1] = right;
+  n->keys[left_index] = key;
+  n->num_keys++;
+  return root;
+}
+
+/* Inserts a new key and pointer to a node into a node, causing the node's size
+ * to exceed the order, and causing the node to split into two.
+ *
+ * left_index is the index in old_node of the pointer to the left of the new
+ * key; right is the pointer that goes to the right of it. The key at the split
+ * position (k_prime) is not kept in either half: it is handed to
+ * insert_into_parent(), whose result is returned. */
+node *insert_into_node_after_splitting(node *root, node *old_node,
+                                       int left_index, int key, node *right) {
+
+  int i, j, split, k_prime;
+  node *new_node, *child;
+  int *temp_keys;
+  node **temp_pointers;
+
+  /* First create a temporary set of keys and pointers
+   * to hold everything in order, including
+   * the new key and pointer, inserted in their
+   * correct places.
+   * Then create a new node and copy half of the
+   * keys and pointers to the old node and
+   * the other half to the new.
+   * The temporaries hold order keys and order + 1 pointers.
+   */
+
+  temp_pointers = (node **)malloc((order + 1) * sizeof(node *));
+  if (temp_pointers == NULL) {
+    perror("Temporary pointers array for splitting nodes.");
+    exit(EXIT_FAILURE);
+  }
+  temp_keys = (int *)malloc(order * sizeof(int));
+  if (temp_keys == NULL) {
+    perror("Temporary keys array for splitting nodes.");
+    exit(EXIT_FAILURE);
+  }
+
+  /* Copy the existing pointers, leaving slot left_index + 1 free. */
+  for (i = 0, j = 0; i < old_node->num_keys + 1; i++, j++) {
+    if (j == left_index + 1)
+      j++;
+    temp_pointers[j] = (node *)old_node->pointers[i];
+  }
+
+  /* Copy the existing keys, leaving slot left_index free. */
+  for (i = 0, j = 0; i < old_node->num_keys; i++, j++) {
+    if (j == left_index)
+      j++;
+    temp_keys[j] = old_node->keys[i];
+  }
+
+  temp_pointers[left_index + 1] = right;
+  temp_keys[left_index] = key;
+
+  /* Create the new node and copy
+   * half the keys and pointers to the
+   * old and half to the new.
+   * old_node keeps split - 1 keys and split pointers; temp_keys[split - 1]
+   * is set aside as k_prime; everything after it goes to new_node.
+   */
+  split = cut(order);
+  new_node = make_node();
+  old_node->num_keys = 0;
+  for (i = 0; i < split - 1; i++) {
+    old_node->pointers[i] = temp_pointers[i];
+    old_node->keys[i] = temp_keys[i];
+    old_node->num_keys++;
+  }
+  old_node->pointers[i] = temp_pointers[i];
+  k_prime = temp_keys[split - 1];
+  for (++i, j = 0; i < order; i++, j++) {
+    new_node->pointers[j] = temp_pointers[i];
+    new_node->keys[j] = temp_keys[i];
+    new_node->num_keys++;
+  }
+  new_node->pointers[j] = temp_pointers[i];
+  free(temp_pointers);
+  free(temp_keys);
+  new_node->parent = old_node->parent;
+  /* The children that moved must now point up to new_node. */
+  for (i = 0; i <= new_node->num_keys; i++) {
+    child = (node *)new_node->pointers[i];
+    child->parent = new_node;
+  }
+
+  /* Insert a new key into the parent of the two
+   * nodes resulting from the split, with
+   * the old node to the left and the new to the right.
+   */
+
+  return insert_into_parent(root, old_node, k_prime, new_node);
+}
+
+/* Inserts a new node (leaf or internal node) into the B+ tree. Returns the root
+ * of the tree after insertion.
+ *
+ * left and right are the two halves produced by a split and key is the
+ * separator between them. Depending on the state of left's parent this
+ * creates a new root, inserts into the parent directly, or splits the
+ * parent. */
+node *insert_into_parent(node *root, node *left, int key, node *right) {
+
+  int left_index;
+  node *parent;
+
+  parent = left->parent;
+
+  /* Case: new root. */
+
+  if (parent == NULL)
+    return insert_into_new_root(left, key, right);
+
+  /* Case: leaf or node. (Remainder of
+   * function body.)
+   */
+
+  /* Find the parent's pointer to the left
+   * node.
+   */
+
+  left_index = get_left_index(parent, left);
+
+  /* Simple case: the new key fits into the node.
+   */
+
+  if (parent->num_keys < order - 1)
+    return insert_into_node(root, parent, left_index, key, right);
+
+  /* Harder case: split a node in order
+   * to preserve the B+ tree properties.
+   */
+
+  return insert_into_node_after_splitting(root, parent, left_index, key, right);
+}
+
+/* Creates a new root for two subtrees and inserts the appropriate key into the
+ * new root.
*/ +node *insert_into_new_root(node *left, int key, node *right) { + + node *root = make_node(); + root->keys[0] = key; + root->pointers[0] = left; + root->pointers[1] = right; + root->num_keys++; + root->parent = NULL; + left->parent = root; + right->parent = root; + return root; +} + +/* First insertion: start a new tree. */ +node *start_new_tree(int key, record *pointer) { + + node *root = make_leaf(); + root->keys[0] = key; + root->pointers[0] = pointer; + root->pointers[order - 1] = NULL; + root->parent = NULL; + root->num_keys++; + return root; +} + +/* Master insertion function. Inserts a key and an associated value into the B+ + * tree, causing the tree to be adjusted however necessary to maintain the B+ + * tree properties. */ +node *insert(node *root, int key, int value) { + + record *pointer; + node *leaf; + + /* The current implementation ignores duplicates. */ + if (find(root, key, false) != NULL) + return root; + + /* Create a new record for the value. */ + pointer = make_record(value); + + /* Case: the tree does not exist yet. Start a new tree. */ + if (root == NULL) + return start_new_tree(key, pointer); + + /* Case: the tree already exists. (Rest of function body.) */ + leaf = find_leaf(root, key, false); + + /* Case: leaf has room for key and pointer. */ + if (leaf->num_keys < order - 1) { + leaf = insert_into_leaf(leaf, key, pointer); + return root; + } + + /* Case: leaf must be split. */ + return insert_into_leaf_after_splitting(root, leaf, key, pointer); +} + +//======================================================================================================================================================150 +// DELETION +//======================================================================================================================================================150 + +/* Utility function for deletion. Retrieves the index of a node's nearest + * neighbor (sibling) to the left if one exists. 
If not (the node is the
+ * leftmost child), returns -1 to signify this special case. */
+int get_neighbor_index(node *n) {
+
+  int i;
+
+  /* Return the index of the key to the left
+   * of the pointer in the parent pointing
+   * to n.
+   * If n is the leftmost child, this means
+   * return -1.
+   */
+  for (i = 0; i <= n->parent->num_keys; i++)
+    if (n->parent->pointers[i] == n)
+      return i - 1;
+
+  // Error state.
+  printf("Search for nonexistent pointer to node in parent.\n");
+  // printf("Node: %#x\n", (unsigned int)n);
+  exit(EXIT_FAILURE);
+}
+
+/* Removes key and pointer from node n, closing the gaps they leave and
+ * clearing the slots that are no longer used. Returns n.
+ * Both key and pointer are expected to be present in n: the searches below
+ * have no bounds check. Only the value of pointer is compared; it is never
+ * dereferenced here. */
+node *remove_entry_from_node(node *n, int key, node *pointer) {
+
+  int i, num_pointers;
+
+  // Remove the key and shift other keys accordingly.
+  i = 0;
+  while (n->keys[i] != key)
+    i++;
+  for (++i; i < n->num_keys; i++)
+    n->keys[i - 1] = n->keys[i];
+
+  // Remove the pointer and shift other pointers accordingly.
+  // First determine number of pointers.
+  num_pointers = n->is_leaf ? n->num_keys : n->num_keys + 1;
+  i = 0;
+  while (n->pointers[i] != pointer)
+    i++;
+  for (++i; i < num_pointers; i++)
+    n->pointers[i - 1] = n->pointers[i];
+
+  // One key fewer.
+  n->num_keys--;
+
+  // Set the other pointers to NULL for tidiness.
+  // A leaf uses the last pointer to point to the next leaf.
+  if (n->is_leaf)
+    for (i = n->num_keys; i < order - 1; i++)
+      n->pointers[i] = NULL;
+  else
+    for (i = n->num_keys + 1; i < order; i++)
+      n->pointers[i] = NULL;
+
+  return n;
+}
+
+/* Repairs the root after an entry has been removed from it. Returns the root
+ * to use from now on: the same node if it still holds keys, its only child if
+ * it is an emptied internal node, or NULL if it is an emptied leaf. An
+ * emptied root is freed. */
+node *adjust_root(node *root) {
+
+  node *new_root;
+
+  /* Case: nonempty root.
+   * Key and pointer have already been deleted,
+   * so nothing to be done.
+   */
+
+  if (root->num_keys > 0)
+    return root;
+
+  /* Case: empty root.
+   */
+
+  // If it has a child, promote
+  // the first (only) child
+  // as the new root.
+
+  if (!root->is_leaf) {
+    new_root = (node *)root->pointers[0];
+    new_root->parent = NULL;
+  }
+
+  // If it is a leaf (has no children),
+  // then the whole tree is empty.
+
+  else
+    new_root = NULL;
+
+  free(root->keys);
+  free(root->pointers);
+  free(root);
+
+  return new_root;
+}
+
+/* Coalesces a node that has become too small after deletion with a neighboring
+ * node that can accept the additional entries without exceeding the maximum. */
+node *coalesce_nodes(node *root, node *n, node *neighbor, int neighbor_index,
+                     int k_prime) {
+
+  int i, j, neighbor_insertion_index, n_start, n_end, new_k_prime;
+  node *tmp;
+  bool split;
+
+  /* Swap neighbor with node if node is on the
+   * extreme left and neighbor is to its right.
+   */
+
+  if (neighbor_index == -1) {
+    tmp = n;
+    n = neighbor;
+    neighbor = tmp;
+  }
+
+  /* Starting point in the neighbor for copying
+   * keys and pointers from n.
+   * Recall that n and neighbor have swapped places
+   * in the special case of n being a leftmost child.
+   */
+
+  neighbor_insertion_index = neighbor->num_keys;
+
+  /*
+   * Nonleaf nodes may sometimes need to remain split,
+   * if the insertion of k_prime would cause the resulting
+   * single coalesced node to exceed the limit order - 1.
+   * The variable split is always false for leaf nodes
+   * and only sometimes set to true for nonleaf nodes.
+   */
+
+  split = false;
+
+  /* Case: nonleaf node.
+   * Append k_prime and the following pointer.
+   * If there is room in the neighbor, append
+   * all pointers and keys from the neighbor.
+   * Otherwise, append only cut(order) - 2 keys and
+   * cut(order) - 1 pointers.
+   */
+
+  if (!n->is_leaf) {
+
+    /* Append k_prime.
+     */
+
+    neighbor->keys[neighbor_insertion_index] = k_prime;
+    neighbor->num_keys++;
+
+    /* Case (default): there is room for all of n's keys and pointers
+     * in the neighbor after appending k_prime.
+     */
+
+    n_end = n->num_keys;
+
+    /* Case (special): k cannot fit with all the other keys and pointers
+     * into one coalesced node.
+     */
+    n_start = 0; // Only used in this special case.
+    if (n->num_keys + neighbor->num_keys >= order) {
+      split = true;
+      n_end = cut(order) - 2;
+    }
+
+    for (i = neighbor_insertion_index + 1, j = 0; j < n_end; i++, j++) {
+      neighbor->keys[i] = n->keys[j];
+      neighbor->pointers[i] = n->pointers[j];
+      neighbor->num_keys++;
+      n->num_keys--;
+      n_start++;
+    }
+
+    /* The number of pointers is always
+     * one more than the number of keys.
+     */
+
+    neighbor->pointers[i] = n->pointers[j];
+
+    /* If the nodes are still split, remove the first key from
+     * n.
+     */
+    if (split) {
+      new_k_prime = n->keys[n_start];
+      for (i = 0, j = n_start + 1; i < n->num_keys; i++, j++) {
+        n->keys[i] = n->keys[j];
+        n->pointers[i] = n->pointers[j];
+      }
+      n->pointers[i] = n->pointers[j];
+      n->num_keys--;
+    }
+
+    /* All children must now point up to the same parent.
+     */
+
+    for (i = 0; i < neighbor->num_keys + 1; i++) {
+      tmp = (node *)neighbor->pointers[i];
+      tmp->parent = neighbor;
+    }
+  }
+
+  /* In a leaf, append the keys and pointers of
+   * n to the neighbor.
+   * Set the neighbor's last pointer to point to
+   * what had been n's right neighbor.
+   */
+
+  else {
+    for (i = neighbor_insertion_index, j = 0; j < n->num_keys; i++, j++) {
+      neighbor->keys[i] = n->keys[j];
+      neighbor->pointers[i] = n->pointers[j];
+      neighbor->num_keys++;
+    }
+    neighbor->pointers[order - 1] = n->pointers[order - 1];
+  }
+
+  /* Fully merged: drop n from its parent and release it. Still split:
+   * only the separator key in the parent has to be replaced.
+   */
+  if (!split) {
+    root = delete_entry(root, n->parent, k_prime, n);
+    free(n->keys);
+    free(n->pointers);
+    free(n);
+  } else
+    for (i = 0; i < n->parent->num_keys; i++)
+      if (n->parent->pointers[i + 1] == n) {
+        n->parent->keys[i] = new_k_prime;
+        break;
+      }
+
+  return root;
+}
+
+/* Redistributes entries between two nodes when one has become too small after
+ * deletion but its neighbor is too big to append the small node's entries
+ * without exceeding the maximum */
+node *redistribute_nodes(node *root, node *n, node *neighbor,
+                         int neighbor_index, int k_prime_index, int k_prime) {
+
+  int i;
+  node *tmp;
+
+  /* Case: n has a neighbor to the left.
+   * Pull the neighbor's last key-pointer pair over
+   * from the neighbor's right end to n's left end.
+   */
+
+  if (neighbor_index != -1) {
+    if (!n->is_leaf)
+      n->pointers[n->num_keys + 1] = n->pointers[n->num_keys];
+    for (i = n->num_keys; i > 0; i--) {
+      n->keys[i] = n->keys[i - 1];
+      n->pointers[i] = n->pointers[i - 1];
+    }
+    if (!n->is_leaf) {
+      n->pointers[0] = neighbor->pointers[neighbor->num_keys];
+      tmp = (node *)n->pointers[0];
+      tmp->parent = n;
+      neighbor->pointers[neighbor->num_keys] = NULL;
+      n->keys[0] = k_prime;
+      n->parent->keys[k_prime_index] = neighbor->keys[neighbor->num_keys - 1];
+    } else {
+      n->pointers[0] = neighbor->pointers[neighbor->num_keys - 1];
+      neighbor->pointers[neighbor->num_keys - 1] = NULL;
+      n->keys[0] = neighbor->keys[neighbor->num_keys - 1];
+      n->parent->keys[k_prime_index] = n->keys[0];
+    }
+  }
+
+  /* Case: n is the leftmost child.
+   * Take a key-pointer pair from the neighbor to the right.
+   * Move the neighbor's leftmost key-pointer pair
+   * to n's rightmost position.
+   */
+
+  else {
+    if (n->is_leaf) {
+      n->keys[n->num_keys] = neighbor->keys[0];
+      n->pointers[n->num_keys] = neighbor->pointers[0];
+      n->parent->keys[k_prime_index] = neighbor->keys[1];
+    } else {
+      n->keys[n->num_keys] = k_prime;
+      n->pointers[n->num_keys + 1] = neighbor->pointers[0];
+      tmp = (node *)n->pointers[n->num_keys + 1];
+      tmp->parent = n;
+      n->parent->keys[k_prime_index] = neighbor->keys[0];
+    }
+    /* Shift the neighbor's remaining entries one slot to the left. The
+     * loop stops at num_keys - 1: going one step further would read
+     * keys[num_keys], and for an internal node pointers[num_keys + 1],
+     * both past the end of the arrays when the neighbor is full.
+     */
+    for (i = 0; i < neighbor->num_keys - 1; i++) {
+      neighbor->keys[i] = neighbor->keys[i + 1];
+      neighbor->pointers[i] = neighbor->pointers[i + 1];
+    }
+    /* An internal node has one more pointer than keys to move down. */
+    if (!n->is_leaf)
+      neighbor->pointers[i] = neighbor->pointers[i + 1];
+  }
+
+  /* n now has one more key and one more pointer;
+   * the neighbor has one fewer of each.
+   */
+
+  n->num_keys++;
+  neighbor->num_keys--;
+
+  return root;
+}
+
+/* Deletes an entry from the B+ tree. Removes the record and its key and pointer
+ * from the leaf, and then makes all appropriate changes to preserve the B+ tree
+ * properties. */
+node *delete_entry(node *root, node *n, int key, void *pointer) {
+
+  int min_keys;
+  node *neighbor;
+  int neighbor_index;
+  int k_prime_index, k_prime;
+  int capacity;
+
+  // Remove key and pointer from node.
+
+  n = remove_entry_from_node(n, key, (node *)pointer);
+
+  /* Case: deletion from the root.
+   */
+
+  if (n == root)
+    return adjust_root(root);
+
+  /* Case: deletion from a node below the root.
+   * (Rest of function body.)
+   */
+
+  /* Determine minimum allowable size of node,
+   * to be preserved after deletion.
+   */
+
+  min_keys = n->is_leaf ? cut(order - 1) : cut(order) - 1;
+
+  /* Case: node stays at or above minimum.
+   * (The simple case.)
+   */
+
+  if (n->num_keys >= min_keys)
+    return root;
+
+  /* Case: node falls below minimum.
+   * Either coalescence or redistribution
+   * is needed.
+   */
+
+  /* Find the appropriate neighbor node with which
+   * to coalesce.
+   * Also find the key (k_prime) in the parent
+   * between the pointer to node n and the pointer
+   * to the neighbor.
+   */
+
+  neighbor_index = get_neighbor_index(n);
+  k_prime_index = neighbor_index == -1 ? 0 : neighbor_index;
+  k_prime = n->parent->keys[k_prime_index];
+  neighbor = neighbor_index == -1 ? (node *)n->parent->pointers[1]
+                                  : (node *)n->parent->pointers[neighbor_index];
+
+  capacity = n->is_leaf ? order : order - 1;
+
+  /* Coalescence. */
+
+  if (neighbor->num_keys + n->num_keys < capacity)
+    return coalesce_nodes(root, n, neighbor, neighbor_index, k_prime);
+
+  /* Redistribution. */
+
+  else
+    return redistribute_nodes(root, n, neighbor, neighbor_index, k_prime_index,
+                              k_prime);
+}
+
+/* Master deletion function. Removes key and its record from the tree, if
+ * present, and returns the root of the resulting tree. */
+node *deleteVal(node *root, int key) {
+
+  node *key_leaf;
+  record *key_record;
+
+  key_record = find(root, key, false);
+  key_leaf = find_leaf(root, key, false);
+  if (key_record != NULL && key_leaf != NULL) {
+    /* delete_entry() locates the entry by comparing against key_record,
+     * so the record may only be released once it has returned.
+     */
+    root = delete_entry(root, key_leaf, key, key_record);
+    free(key_record);
+  }
+  return root;
+}
+
+/* Recursively frees the subtree rooted at root: the records referenced by
+ * each leaf, then every node together with its key and pointer arrays.
+ * root must not be NULL. */
+void destroy_tree_nodes(node *root) {
+  int i;
+  if (root->is_leaf)
+    for (i = 0; i < root->num_keys; i++)
+      free(root->pointers[i]);
+  else
+    for (i = 0; i < root->num_keys + 1; i++)
+      destroy_tree_nodes((node *)root->pointers[i]);
+  free(root->pointers);
+  free(root->keys);
+  free(root);
+}
+
+/* Frees the whole tree and returns NULL, the root of the now empty tree.
+ * Destroying an already empty tree (root == NULL) is a no-op. */
+node *destroy_tree(node *root) {
+  if (root != NULL)
+    destroy_tree_nodes(root);
+  return NULL;
+}
+
+//======================================================================================================================================================150
+// END
+//======================================================================================================================================================150
+
+//========================================================================================================================================================================================================200
+// MAIN FUNCTION
+//========================================================================================================================================================================================================200 + +int main(int argc, char **argv) { + + printf("WG size of kernel 1 & 2 = %d \n", DEFAULT_ORDER); + + // ------------------------------------------------------------60 + // figure out and display whether 32-bit or 64-bit architecture + // ------------------------------------------------------------60 + + // if(sizeof(int *)==8){ + // printf("64 bit machine\n"); + // } + // else if(sizeof(int *)==4){ + // printf("32 bit machine\n"); + // } + + // ------------------------------------------------------------60 + // set GPU + // ------------------------------------------------------------60 + + int device = 0; + cudaSetDevice(device); + printf("Selecting device %d\n", device); + + // ------------------------------------------------------------60 + // read inputs + // ------------------------------------------------------------60 + + // assing default values + int cur_arg; + int arch_arg; + arch_arg = 0; + int cores_arg; + cores_arg = 1; + char *input_file = NULL; + char *command_file = NULL; + char *output = "output.txt"; + FILE *pFile; + + // go through arguments + for (cur_arg = 1; cur_arg < argc; cur_arg++) { + // check if -file + if (strcmp(argv[cur_arg], "file") == 0) { + // check if value provided + if (argc >= cur_arg + 1) { + input_file = argv[cur_arg + 1]; + cur_arg = cur_arg + 1; + // value is not a number + } + // value not provided + else { + printf("ERROR: Missing value to -file parameter\n"); + return -1; + } + } else if (strcmp(argv[cur_arg], "command") == 0) { + // check if value provided + if (argc >= cur_arg + 1) { + command_file = argv[cur_arg + 1]; + cur_arg = cur_arg + 1; + // value is not a number + } + // value not provided + else { + printf("ERROR: Missing value to command parameter\n"); + return -1; + } + } + } + // Print configuration + if 
((input_file == NULL) || (command_file == NULL)) + printf("Usage: ./b+tree file input_file command command_list\n"); + + // For debug + printf("Input File: %s \n", input_file); + printf("Command File: %s \n", command_file); + + FILE *commandFile; + long lSize; + char *commandBuffer; + size_t result; + + commandFile = fopen(command_file, "rb"); + if (commandFile == NULL) { + fputs("Command File error", stderr); + exit(1); + } + + // obtain file size: + fseek(commandFile, 0, SEEK_END); + lSize = ftell(commandFile); + rewind(commandFile); + + // allocate memory to contain the whole file: + commandBuffer = (char *)malloc(sizeof(char) * lSize); + if (commandBuffer == NULL) { + fputs("Command Buffer memory error", stderr); + exit(2); + } + + // copy the file into the buffer: + result = fread(commandBuffer, 1, lSize, commandFile); + if (result != lSize) { + fputs("Command file reading error", stderr); + exit(3); + } + + /* the whole file is now loaded in the memory buffer. */ + + // terminate + fclose(commandFile); + + // For Debug + char *sPointer = commandBuffer; + printf("Command Buffer: \n"); + printf("%s", commandBuffer); + // + + pFile = fopen(output, "w+"); + if (pFile == NULL) + fputs("Fail to open %s !\n", output); + fprintf(pFile, "******starting******\n"); + fclose(pFile); + + // ------------------------------------------------------------60 + // general variables + // ------------------------------------------------------------60 + + FILE *file_pointer; + node *root; + root = NULL; + record *r; + int input; + char instruction; + order = DEFAULT_ORDER; + verbose_output = false; + + // usage_1(); + // usage_2(); + + // ------------------------------------------------------------60 + // get input from file, if file provided + // ------------------------------------------------------------60 + + if (input_file != NULL) { + + printf("Getting input from file %s...\n", input_file); + + // open input file + file_pointer = fopen(input_file, "r"); + if (file_pointer == 
NULL) { + perror("Failure to open input file."); + exit(EXIT_FAILURE); + } + + // get # of numbers in the file + fscanf(file_pointer, "%d\n", &input); + size = input; + + // save all numbers + while (!feof(file_pointer)) { + fscanf(file_pointer, "%d\n", &input); + root = insert(root, input, input); + } + + // close file + fclose(file_pointer); + // print_tree(root); + // printf("Height of tree = %d\n", height(root)); + + } else { + printf("ERROR: Argument -file missing\n"); + return 0; + } + + // ------------------------------------------------------------60 + // get tree statistics + // ------------------------------------------------------------60 + + printf("Transforming data to a GPU suitable structure...\n"); + long mem_used = transform_to_cuda(root, 0); + maxheight = height(root); + long rootLoc = (long)knodes - (long)mem; + + // ------------------------------------------------------------60 + // process commands + // ------------------------------------------------------------60 + char *commandPointer = commandBuffer; + + printf("Waiting for command\n"); + printf("> "); + while (sscanf(commandPointer, "%c", &instruction) != EOF) { + commandPointer++; + switch (instruction) { + // ----------------------------------------40 + // Insert + // ----------------------------------------40 + + case 'i': { + scanf("%d", &input); + while (getchar() != (int)'\n') + ; + root = insert(root, input, input); + print_tree(root); + break; + } + + // ----------------------------------------40 + // n/a + // ----------------------------------------40 + + case 'f': { + } + + // ----------------------------------------40 + // find + // ----------------------------------------40 + + case 'p': { + scanf("%d", &input); + while (getchar() != (int)'\n') + ; + r = find(root, input, instruction == 'p'); + if (r == NULL) + printf("Record not found under key %d.\n", input); + else + printf("Record found: %d\n", r->value); + break; + } + + // ----------------------------------------40 + // 
delete value + // ----------------------------------------40 + + case 'd': { + scanf("%d", &input); + while (getchar() != (int)'\n') + ; + root = (node *)deleteVal(root, input); + print_tree(root); + break; + } + + // ----------------------------------------40 + // destroy tree + // ----------------------------------------40 + + case 'x': { + while (getchar() != (int)'\n') + ; + root = destroy_tree(root); + print_tree(root); + break; + } + + // ----------------------------------------40 + // print leaves + // ----------------------------------------40 + + case 'l': { + while (getchar() != (int)'\n') + ; + print_leaves(root); + break; + } + + // ----------------------------------------40 + // print tree + // ----------------------------------------40 + + case 't': { + while (getchar() != (int)'\n') + ; + print_tree(root); + break; + } + + // ----------------------------------------40 + // toggle verbose output + // ----------------------------------------40 + + case 'v': { + while (getchar() != (int)'\n') + ; + verbose_output = !verbose_output; + break; + } + + // ----------------------------------------40 + // quit + // ----------------------------------------40 + + case 'q': { + while (getchar() != (int)'\n') + ; + return EXIT_SUCCESS; + } + + // ----------------------------------------40 + // [GPU] find K (initK, findK) + // ----------------------------------------40 + + case 'k': { + + // get # of queries from user + int count; + sscanf(commandPointer, "%d", &count); + while (*commandPointer != 32 && commandPointer != '\n') + commandPointer++; + + printf("\n ******command: k count=%d \n", count); + if (count > 65535) { + printf("ERROR: Number of requested querries should be 65,535 at most. 
" + "(limited by # of CUDA blocks)\n"); + exit(0); + } + + // INPUT: records CPU allocation (setting pointer in mem variable) + record *records = (record *)mem; + long records_elem = (long)rootLoc / sizeof(record); + long records_mem = (long)rootLoc; + printf("records_elem=%d, records_unit_mem=%d, records_mem=%d\n", + (int)records_elem, (int)sizeof(record), (int)records_mem); + + // INPUT: knodes CPU allocation (setting pointer in mem variable) + knode *knodes = (knode *)((long)mem + (long)rootLoc); + long knodes_elem = ((long)(mem_used) - (long)rootLoc) / sizeof(knode); + long knodes_mem = (long)(mem_used) - (long)rootLoc; + printf("knodes_elem=%d, knodes_unit_mem=%d, knodes_mem=%d\n", + (int)knodes_elem, (int)sizeof(knode), (int)knodes_mem); + + // INPUT: currKnode CPU allocation + long *currKnode; + currKnode = (long *)malloc(count * sizeof(long)); + // INPUT: offset CPU initialization + memset(currKnode, 0, count * sizeof(long)); + + // INPUT: offset CPU allocation + long *offset; + offset = (long *)malloc(count * sizeof(long)); + // INPUT: offset CPU initialization + memset(offset, 0, count * sizeof(long)); + + // INPUT: keys CPU allocation + int *keys; + keys = (int *)malloc(count * sizeof(int)); + // INPUT: keys CPU initialization + int i; + for (i = 0; i < count; i++) { + keys[i] = (rand() / (float)RAND_MAX) * size; + } + + // OUTPUT: ans CPU allocation + record *ans = (record *)malloc(sizeof(record) * count); + // OUTPUT: ans CPU initialization + for (i = 0; i < count; i++) { + ans[i].value = -1; + } + + // CUDA kernel + kernel_gpu_cuda_wrapper(records, records_mem, knodes, knodes_elem, + knodes_mem, + + order, maxheight, count, + + currKnode, offset, keys, ans); + + /* printf("ans: \n"); */ + /* for(i = 0; i < count; i++){ */ + /* printf("%d ",ans[i].value); */ + /* } */ + + /* printf(" \n"); */ + + pFile = fopen(output, "aw+"); + if (pFile == NULL) { + fputs("Fail to open %s !\n", output); + } + + fprintf(pFile, "\n ******command: k count=%d \n", count); 
+ for (i = 0; i < count; i++) { + fprintf(pFile, "%d %d\n", i, ans[i].value); + } + fprintf(pFile, " \n"); + fclose(pFile); + + // free memory + free(currKnode); + free(offset); + free(keys); + free(ans); + + // break out of case + break; + } + + // ----------------------------------------40 + // find range + // ----------------------------------------40 + + case 'r': { + int start, end; + scanf("%d", &start); + scanf("%d", &end); + if (start > end) { + input = start; + start = end; + end = input; + } + printf("For range %d to %d, ", start, end); + list_t *ansList; + ansList = findRange(root, start, end); + printf("%d records found\n", list_get_length(ansList)); + // list_iterator_t iter; + free(ansList); + break; + } + + // ----------------------------------------40 + // [GPU] find Range K (initK, findRangeK) + // ----------------------------------------40 + + case 'j': { + + // get # of queries from user + int count; + sscanf(commandPointer, "%d", &count); + while (*commandPointer != 32 && commandPointer != '\n') + commandPointer++; + + int rSize; + sscanf(commandPointer, "%d", &rSize); + while (*commandPointer != 32 && commandPointer != '\n') + commandPointer++; + + printf("\n******command: j count=%d, rSize=%d \n", count, rSize); + if (rSize > size || rSize < 0) { + printf("Search range size is larger than data set size %d.\n", + (int)size); + exit(0); + } + + // INPUT: knodes CPU allocation (setting pointer in mem variable) + knode *knodes = (knode *)((long)mem + (long)rootLoc); + long knodes_elem = ((long)(mem_used) - (long)rootLoc) / sizeof(knode); + long knodes_mem = (long)(mem_used) - (long)rootLoc; + printf("knodes_elem=%d, knodes_unit_mem=%d, knodes_mem=%d\n", + (int)knodes_elem, (int)sizeof(knode), (int)knodes_mem); + + // INPUT: currKnode CPU allocation + long *currKnode; + currKnode = (long *)malloc(count * sizeof(long)); + // INPUT: offset CPU initialization + memset(currKnode, 0, count * sizeof(long)); + + // INPUT: offset CPU allocation + long 
*offset; + offset = (long *)malloc(count * sizeof(long)); + // INPUT: offset CPU initialization + memset(offset, 0, count * sizeof(long)); + + // INPUT: lastKnode CPU allocation + long *lastKnode; + lastKnode = (long *)malloc(count * sizeof(long)); + // INPUT: offset CPU initialization + memset(lastKnode, 0, count * sizeof(long)); + + // INPUT: offset_2 CPU allocation + long *offset_2; + offset_2 = (long *)malloc(count * sizeof(long)); + // INPUT: offset CPU initialization + memset(offset_2, 0, count * sizeof(long)); + + // INPUT: start, end CPU allocation + int *start; + start = (int *)malloc(count * sizeof(int)); + int *end; + end = (int *)malloc(count * sizeof(int)); + // INPUT: start, end CPU initialization + int i; + for (i = 0; i < count; i++) { + start[i] = (rand() / (float)RAND_MAX) * size; + end[i] = start[i] + rSize; + if (end[i] >= size) { + start[i] = start[i] - (end[i] - size); + end[i] = size - 1; + } + } + + // INPUT: recstart, reclenght CPU allocation + int *recstart; + recstart = (int *)malloc(count * sizeof(int)); + int *reclength; + reclength = (int *)malloc(count * sizeof(int)); + // OUTPUT: ans CPU initialization + for (i = 0; i < count; i++) { + recstart[i] = 0; + reclength[i] = 0; + } + + // CUDA kernel + kernel_gpu_cuda_wrapper_2(knodes, knodes_elem, knodes_mem, + + order, maxheight, count, + + currKnode, offset, lastKnode, offset_2, start, + end, recstart, reclength); + + pFile = fopen(output, "aw+"); + if (pFile == NULL) { + fputs("Fail to open %s !\n", output); + } + + fprintf(pFile, "\n******command: j count=%d, rSize=%d \n", count, rSize); + for (i = 0; i < count; i++) { + fprintf(pFile, "%d %d %d\n", i, recstart[i], reclength[i]); + } + fprintf(pFile, " \n"); + fclose(pFile); + + // free memory + free(currKnode); + free(offset); + free(lastKnode); + free(offset_2); + free(start); + free(end); + free(recstart); + free(reclength); + + // break out of case + break; + } + + // ----------------------------------------40 + // default + // 
#!/bin/bash
# Build the b+tree example through the kernel/host translators, link it against
# the x86 runtime, run it on the rodinia data set and verify one known result.
#
# All paths are relative to the current directory. The *.ll inputs and cuda.bc
# are not produced by any active command in this script, so they must already
# exist when it runs.
set -e

# Plain C sources -> LLVM bitcode.
for src in util/timer/timer.c util/num/num.c main.c; do
  clang -c -emit-llvm "$src"
done

# Disabled steps, kept for reference:
#clang -c -emit-llvm util/cuda/cuda.cu --cuda-gpu-arch=sm_61
#clang -c -emit-llvm kernel/kernel_gpu_cuda_wrapper.cu --cuda-gpu-arch=sm_61
#clang++ kernel/kernel_gpu_cuda_wrapper.cu --cuda-path=/usr/local/cuda-10.1 --cuda-gpu-arch=sm_61 -L/usr/local/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v
#clang++ kernel/kernel_gpu_cuda_wrapper_2.cu --cuda-path=/usr/local/cuda-10.1 --cuda-gpu-arch=sm_61 -L/usr/local/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v

# Textual IR of the two wrappers, device side and host side.
kernel_ir=(
  kernel_gpu_cuda_wrapper-cuda-nvptx64-nvidia-cuda-sm_61
  kernel_gpu_cuda_wrapper_2-cuda-nvptx64-nvidia-cuda-sm_61
)
host_ir=(
  kernel_gpu_cuda_wrapper-host-x86_64-unknown-linux-gnu
  kernel_gpu_cuda_wrapper_2-host-x86_64-unknown-linux-gnu
)

# Assemble the textual IR into bitcode.
for ir in "${kernel_ir[@]}" "${host_ir[@]}"; do
  llvm-as "$ir.ll"
done

# Run the translators on the device and host bitcode.
translators=../../build/compilation
"$translators/kernelTranslator" "${kernel_ir[0]}.bc" kernel1.bc
"$translators/kernelTranslator" "${kernel_ir[1]}.bc" kernel2.bc
"$translators/hostTranslator" "${host_ir[0]}.bc" host1.bc
"$translators/hostTranslator" "${host_ir[1]}.bc" host2.bc

# Bitcode -> position-independent object files.
for unit in main cuda num timer kernel1 kernel2 host1 host2; do
  llc --relocation-model=pic --filetype=obj "$unit.bc"
done

# Link against the runtime and thread-pool libraries.
export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH
g++ -Wall -L../../build/runtime -L../../build/runtime/threadPool -o b+tree.out \
  -fPIC -no-pie main.o host1.o host2.o kernel1.o kernel2.o cuda.o num.o timer.o \
  -lc -lx86Runtime -lthreadPool -lpthread

# Run the benchmark.
./b+tree.out file ../../rodinia-data/b+tree/mil.txt \
  command ../../rodinia-data/b+tree/command.txt

# The run is accepted only if the expected record appears in output.txt.
grep -q "0 840187 6001" output.txt || {
  echo "Error result"
  exit 1
}
echo "Pass"
+//======================================================================================================================================================150 +// INCLUDE/DEFINE +//======================================================================================================================================================150 + +#include "cuda.h" // (in library path specified to compiler) + +//======================================================================================================================================================150 +// FUNCTIONS +//======================================================================================================================================================150 + +//====================================================================================================100 +// SET DEVICE +//====================================================================================================100 + +void setdevice(void){ + + // variables + int num_devices; + int device; + + // work + cudaGetDeviceCount(&num_devices); + if (num_devices > 1) { + + // variables + int max_multiprocessors; + int max_device; + cudaDeviceProp properties; + + // initialize variables + max_multiprocessors = 0; + max_device = 0; + + for (device = 0; device < num_devices; device++) { + cudaGetDeviceProperties(&properties, device); + if (max_multiprocessors < properties.multiProcessorCount) { + max_multiprocessors = properties.multiProcessorCount; + max_device = device; + } + } + cudaSetDevice(max_device); + } + +} + +//====================================================================================================100 +// GET LAST ERROR +//====================================================================================================100 + +void checkCUDAError(const char *msg) +{ + cudaError_t err = cudaGetLastError(); + if( cudaSuccess != err) { + // fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) ); + printf("Cuda 
error: %s: %s.\n", msg, cudaGetErrorString( err) ); + fflush(NULL); + exit(EXIT_FAILURE); + } +} + +//===============================================================================================================================================================================================================200 +// END +//===============================================================================================================================================================================================================200 + +#ifdef __cplusplus +} +#endif diff --git a/examples/btree/util/cuda/cuda.h b/examples/btree/util/cuda/cuda.h new file mode 100644 index 0000000..b5ce6dc --- /dev/null +++ b/examples/btree/util/cuda/cuda.h @@ -0,0 +1,37 @@ +#ifdef __cplusplus +extern "C" { +#endif + +//===============================================================================================================================================================================================================200 +// SET_DEVICE HEADER +//===============================================================================================================================================================================================================200 + +//======================================================================================================================================================150 +// INCLUDE/DEFINE +//======================================================================================================================================================150 + +#include // (in library path known to compiler) needed by printf + +//======================================================================================================================================================150 +// FUNCTION PROTOTYPES +//======================================================================================================================================================150 + 
+//====================================================================================================100 +// SET DEVICE +//====================================================================================================100 + +void setdevice(void); + +//====================================================================================================100 +// GET LAST ERROR +//====================================================================================================100 + +void checkCUDAError(const char *msg); + +//===============================================================================================================================================================================================================200 +// END SET_DEVICE HEADER +//===============================================================================================================================================================================================================200 + +#ifdef __cplusplus +} +#endif diff --git a/examples/btree/util/num/num.c b/examples/btree/util/num/num.c new file mode 100644 index 0000000..3b3a452 --- /dev/null +++ b/examples/btree/util/num/num.c @@ -0,0 +1,55 @@ +#ifdef __cplusplus +extern "C" { +#endif + +//===============================================================================================================================================================================================================200 +// DESCRIPTION +//===============================================================================================================================================================================================================200 + +// Returns: 0 if string does not represent integer +// 1 if string represents integer + +//===============================================================================================================================================================================================================200 
+// NUM CODE +//===============================================================================================================================================================================================================200 + +//======================================================================================================================================================150 +// ISINTEGER FUNCTION +//======================================================================================================================================================150 + +int isInteger(char *str) { + + //====================================================================================================100 + // make sure it's not empty + //====================================================================================================100 + + if (*str == '\0') { + return 0; + } + + //====================================================================================================100 + // if any digit is not a number, return false + //====================================================================================================100 + + for (; *str != '\0'; str++) { + if (*str < 48 || + *str > + 57) { // digit characters (need to include . 
if checking for float) + return 0; + } + } + + //====================================================================================================100 + // it got past all my checks so I think it's a number + //====================================================================================================100 + + return 1; +} + +//===============================================================================================================================================================================================================200 +// END NUM CODE +//===============================================================================================================================================================================================================200 + +#ifdef __cplusplus +} +#endif diff --git a/examples/btree/util/num/num.h b/examples/btree/util/num/num.h new file mode 100755 index 0000000..27a5e42 --- /dev/null +++ b/examples/btree/util/num/num.h @@ -0,0 +1,21 @@ +#ifdef __cplusplus +extern "C" { +#endif + +//===============================================================================================================================================================================================================200 +// FILE HEADER +//===============================================================================================================================================================================================================200 + +//======================================================================================================================================================150 +// ISINTEGER FUNCTION PROTOTYPE +//======================================================================================================================================================150 + +int isInteger(char *str); + 
+//===============================================================================================================================================================================================================200 +// END FILE HEADER +//===============================================================================================================================================================================================================200 + +#ifdef __cplusplus +} +#endif diff --git a/examples/btree/util/timer/timer.c b/examples/btree/util/timer/timer.c new file mode 100644 index 0000000..b6aace4 --- /dev/null +++ b/examples/btree/util/timer/timer.c @@ -0,0 +1,36 @@ +#ifdef __cplusplus +extern "C" { +#endif + +//===============================================================================================================================================================================================================200 +// TIMER CODE +//===============================================================================================================================================================================================================200 + +//======================================================================================================================================================150 +// INCLUDE/DEFINE +//======================================================================================================================================================150 + +#include + +//======================================================================================================================================================150 +// FUNCTIONS +//======================================================================================================================================================150 + +//====================================================================================================100 +// DISPLAY TIME 
/**
 * Returns the current system time in microseconds.
 *
 * The value is built from the seconds and microseconds reported by
 * gettimeofday(). The seconds are widened to long long before scaling so the
 * multiplication cannot overflow where tv_sec is narrower than 64 bits.
 */
long long get_time() {
  struct timeval tv;
  gettimeofday(&tv, NULL);
  return ((long long)tv.tv_sec * 1000000LL) + (long long)tv.tv_usec;
}
+//===============================================================================================================================================================================================================200 +// END TIMER HEADER +//===============================================================================================================================================================================================================200 + +#ifdef __cplusplus +} +#endif diff --git a/examples/cfd/euler3d.cu b/examples/cfd/euler3d.cu new file mode 100755 index 0000000..ddaa774 --- /dev/null +++ b/examples/cfd/euler3d.cu @@ -0,0 +1,662 @@ +#include +#include +#include +#include + +/* + * Options + * + */ +#define GAMMA 1.4f +#define iterations 2 +// #ifndef block_length +// #define block_length 192 +// #endif + +#define NDIM 3 +#define NNB 4 + +#define RK 3 // 3rd order RK +#define ff_mach 1.2f +#define deg_angle_of_attack 0.0f + +/* + * not options + */ + +#ifdef RD_WG_SIZE_0_0 +#define BLOCK_SIZE_0 RD_WG_SIZE_0_0 +#elif defined(RD_WG_SIZE_0) +#define BLOCK_SIZE_0 RD_WG_SIZE_0 +#elif defined(RD_WG_SIZE) +#define BLOCK_SIZE_0 RD_WG_SIZE +#else +#define BLOCK_SIZE_0 192 +#endif + +#ifdef RD_WG_SIZE_1_0 +#define BLOCK_SIZE_1 RD_WG_SIZE_1_0 +#elif defined(RD_WG_SIZE_1) +#define BLOCK_SIZE_1 RD_WG_SIZE_1 +#elif defined(RD_WG_SIZE) +#define BLOCK_SIZE_1 RD_WG_SIZE +#else +#define BLOCK_SIZE_1 192 +#endif + +#ifdef RD_WG_SIZE_2_0 +#define BLOCK_SIZE_2 RD_WG_SIZE_2_0 +#elif defined(RD_WG_SIZE_1) +#define BLOCK_SIZE_2 RD_WG_SIZE_2 +#elif defined(RD_WG_SIZE) +#define BLOCK_SIZE_2 RD_WG_SIZE +#else +#define BLOCK_SIZE_2 192 +#endif + +#ifdef RD_WG_SIZE_3_0 +#define BLOCK_SIZE_3 RD_WG_SIZE_3_0 +#elif defined(RD_WG_SIZE_3) +#define BLOCK_SIZE_3 RD_WG_SIZE_3 +#elif defined(RD_WG_SIZE) +#define BLOCK_SIZE_3 RD_WG_SIZE +#else +#define BLOCK_SIZE_3 192 +#endif + +#ifdef RD_WG_SIZE_4_0 +#define BLOCK_SIZE_4 RD_WG_SIZE_4_0 +#elif defined(RD_WG_SIZE_4) +#define BLOCK_SIZE_4 
RD_WG_SIZE_4 +#elif defined(RD_WG_SIZE) +#define BLOCK_SIZE_4 RD_WG_SIZE +#else +#define BLOCK_SIZE_4 192 +#endif + +// #if block_length > 128 +// #warning "the kernels may fail too launch on some systems if the block length +// is too large" #endif + +#define VAR_DENSITY 0 +#define VAR_MOMENTUM 1 +#define VAR_DENSITY_ENERGY (VAR_MOMENTUM + NDIM) +#define NVAR (VAR_DENSITY_ENERGY + 1) + +/* + * Generic functions + */ +template T *alloc(int N) { + T *t; + checkCudaErrors(cudaMalloc((void **)&t, sizeof(T) * N)); + return t; +} + +template void dealloc(T *array) { + checkCudaErrors(cudaFree((void *)array)); +} + +template void copy(T *dst, T *src, int N) { + checkCudaErrors(cudaMemcpy((void *)dst, (void *)src, N * sizeof(T), + cudaMemcpyDeviceToDevice)); +} + +template void upload(T *dst, T *src, int N) { + checkCudaErrors(cudaMemcpy((void *)dst, (void *)src, N * sizeof(T), + cudaMemcpyHostToDevice)); +} + +template void download(T *dst, T *src, int N) { + checkCudaErrors(cudaMemcpy((void *)dst, (void *)src, N * sizeof(T), + cudaMemcpyDeviceToHost)); +} + +void dump(float *variables, int nel, int nelr) { + float *h_variables = new float[nelr * NVAR]; + download(h_variables, variables, nelr * NVAR); + + { + std::ofstream file("density"); + file << nel << " " << nelr << std::endl; + for (int i = 0; i < nel; i++) + file << h_variables[i + VAR_DENSITY * nelr] << std::endl; + } + + { + std::ofstream file("momentum"); + file << nel << " " << nelr << std::endl; + for (int i = 0; i < nel; i++) { + for (int j = 0; j != NDIM; j++) + file << h_variables[i + (VAR_MOMENTUM + j) * nelr] << " "; + file << std::endl; + } + } + + { + std::ofstream file("density_energy"); + file << nel << " " << nelr << std::endl; + for (int i = 0; i < nel; i++) + file << h_variables[i + VAR_DENSITY_ENERGY * nelr] << std::endl; + } + delete[] h_variables; +} + +/* + * Element-based Cell-centered FVM solver functions + */ +__constant__ float ff_variable[NVAR]; +__constant__ float3 
ff_flux_contribution_momentum_x[1]; +__constant__ float3 ff_flux_contribution_momentum_y[1]; +__constant__ float3 ff_flux_contribution_momentum_z[1]; +__constant__ float3 ff_flux_contribution_density_energy[1]; + +__global__ void cuda_initialize_variables(int nelr, float *variables) { + const int i = (blockDim.x * blockIdx.x + threadIdx.x); + for (int j = 0; j < NVAR; j++) + variables[i + j * nelr] = ff_variable[j]; +} +void initialize_variables(int nelr, float *variables) { + dim3 Dg(nelr / BLOCK_SIZE_1), Db(BLOCK_SIZE_1); + cuda_initialize_variables<<>>(nelr, variables); + getLastCudaError("initialize_variables failed"); +} + +__device__ __host__ inline void compute_flux_contribution( + float &density, float3 &momentum, float &density_energy, float &pressure, + float3 &velocity, float3 &fc_momentum_x, float3 &fc_momentum_y, + float3 &fc_momentum_z, float3 &fc_density_energy) { + fc_momentum_x.x = velocity.x * momentum.x + pressure; + fc_momentum_x.y = velocity.x * momentum.y; + fc_momentum_x.z = velocity.x * momentum.z; + + fc_momentum_y.x = fc_momentum_x.y; + fc_momentum_y.y = velocity.y * momentum.y + pressure; + fc_momentum_y.z = velocity.y * momentum.z; + + fc_momentum_z.x = fc_momentum_x.z; + fc_momentum_z.y = fc_momentum_y.z; + fc_momentum_z.z = velocity.z * momentum.z + pressure; + + float de_p = density_energy + pressure; + fc_density_energy.x = velocity.x * de_p; + fc_density_energy.y = velocity.y * de_p; + fc_density_energy.z = velocity.z * de_p; +} + +__device__ inline void compute_velocity(float &density, float3 &momentum, + float3 &velocity) { + velocity.x = momentum.x / density; + velocity.y = momentum.y / density; + velocity.z = momentum.z / density; +} + +__device__ inline float compute_speed_sqd(float3 &velocity) { + return velocity.x * velocity.x + velocity.y * velocity.y + + velocity.z * velocity.z; +} + +__device__ inline float compute_pressure(float &density, float &density_energy, + float &speed_sqd) { + return (float(GAMMA) - float(1.0f)) * 
+ (density_energy - float(0.5f) * density * speed_sqd); +} + +__device__ inline float compute_speed_of_sound(float &density, + float &pressure) { + return sqrtf(float(GAMMA) * pressure / density); +} + +__global__ void cuda_compute_step_factor(int nelr, float *variables, + float *areas, float *step_factors) { + const int i = (blockDim.x * blockIdx.x + threadIdx.x); + + float density = variables[i + VAR_DENSITY * nelr]; + float3 momentum; + momentum.x = variables[i + (VAR_MOMENTUM + 0) * nelr]; + momentum.y = variables[i + (VAR_MOMENTUM + 1) * nelr]; + momentum.z = variables[i + (VAR_MOMENTUM + 2) * nelr]; + + float density_energy = variables[i + VAR_DENSITY_ENERGY * nelr]; + + float3 velocity; + compute_velocity(density, momentum, velocity); + float speed_sqd = compute_speed_sqd(velocity); + float pressure = compute_pressure(density, density_energy, speed_sqd); + float speed_of_sound = compute_speed_of_sound(density, pressure); + + // dt = float(0.5f) * sqrtf(areas[i]) / (||v|| + c).... 
but when we do time + // stepping, this later would need to be divided by the area, so we just do it + // all at once + step_factors[i] = + float(0.5f) / (sqrtf(areas[i]) * (sqrtf(speed_sqd) + speed_of_sound)); +} +void compute_step_factor(int nelr, float *variables, float *areas, + float *step_factors) { + dim3 Dg(nelr / BLOCK_SIZE_2), Db(BLOCK_SIZE_2); + cuda_compute_step_factor<<>>(nelr, variables, areas, step_factors); + getLastCudaError("compute_step_factor failed"); +} + +/* + * + * + */ +__global__ void cuda_compute_flux(int nelr, int *elements_surrounding_elements, + float *normals, float *variables, + float *fluxes) { + const float smoothing_coefficient = float(0.2f); + const int i = (blockDim.x * blockIdx.x + threadIdx.x); + + int j, nb; + float3 normal; + float normal_len; + float factor; + + float density_i = variables[i + VAR_DENSITY * nelr]; + float3 momentum_i; + momentum_i.x = variables[i + (VAR_MOMENTUM + 0) * nelr]; + momentum_i.y = variables[i + (VAR_MOMENTUM + 1) * nelr]; + momentum_i.z = variables[i + (VAR_MOMENTUM + 2) * nelr]; + + float density_energy_i = variables[i + VAR_DENSITY_ENERGY * nelr]; + + float3 velocity_i; + compute_velocity(density_i, momentum_i, velocity_i); + float speed_sqd_i = compute_speed_sqd(velocity_i); + float speed_i = sqrtf(speed_sqd_i); + float pressure_i = compute_pressure(density_i, density_energy_i, speed_sqd_i); + float speed_of_sound_i = compute_speed_of_sound(density_i, pressure_i); + float3 flux_contribution_i_momentum_x, flux_contribution_i_momentum_y, + flux_contribution_i_momentum_z; + float3 flux_contribution_i_density_energy; + compute_flux_contribution( + density_i, momentum_i, density_energy_i, pressure_i, velocity_i, + flux_contribution_i_momentum_x, flux_contribution_i_momentum_y, + flux_contribution_i_momentum_z, flux_contribution_i_density_energy); + + float flux_i_density = float(0.0f); + float3 flux_i_momentum; + flux_i_momentum.x = float(0.0f); + flux_i_momentum.y = float(0.0f); + 
flux_i_momentum.z = float(0.0f); + float flux_i_density_energy = float(0.0f); + + float3 velocity_nb; + float density_nb, density_energy_nb; + float3 momentum_nb; + float3 flux_contribution_nb_momentum_x, flux_contribution_nb_momentum_y, + flux_contribution_nb_momentum_z; + float3 flux_contribution_nb_density_energy; + float speed_sqd_nb, speed_of_sound_nb, pressure_nb; + +#pragma unroll + for (j = 0; j < NNB; j++) { + nb = elements_surrounding_elements[i + j * nelr]; + normal.x = normals[i + (j + 0 * NNB) * nelr]; + normal.y = normals[i + (j + 1 * NNB) * nelr]; + normal.z = normals[i + (j + 2 * NNB) * nelr]; + normal_len = + sqrtf(normal.x * normal.x + normal.y * normal.y + normal.z * normal.z); + + if (nb >= 0) // a legitimate neighbor + { + density_nb = variables[nb + VAR_DENSITY * nelr]; + momentum_nb.x = variables[nb + (VAR_MOMENTUM + 0) * nelr]; + momentum_nb.y = variables[nb + (VAR_MOMENTUM + 1) * nelr]; + momentum_nb.z = variables[nb + (VAR_MOMENTUM + 2) * nelr]; + density_energy_nb = variables[nb + VAR_DENSITY_ENERGY * nelr]; + compute_velocity(density_nb, momentum_nb, velocity_nb); + speed_sqd_nb = compute_speed_sqd(velocity_nb); + pressure_nb = + compute_pressure(density_nb, density_energy_nb, speed_sqd_nb); + speed_of_sound_nb = compute_speed_of_sound(density_nb, pressure_nb); + compute_flux_contribution( + density_nb, momentum_nb, density_energy_nb, pressure_nb, velocity_nb, + flux_contribution_nb_momentum_x, flux_contribution_nb_momentum_y, + flux_contribution_nb_momentum_z, flux_contribution_nb_density_energy); + + // artificial viscosity + factor = -normal_len * smoothing_coefficient * float(0.5f) * + (speed_i + sqrtf(speed_sqd_nb) + speed_of_sound_i + + speed_of_sound_nb); + flux_i_density += factor * (density_i - density_nb); + flux_i_density_energy += factor * (density_energy_i - density_energy_nb); + flux_i_momentum.x += factor * (momentum_i.x - momentum_nb.x); + flux_i_momentum.y += factor * (momentum_i.y - momentum_nb.y); + flux_i_momentum.z 
+= factor * (momentum_i.z - momentum_nb.z); + + // accumulate cell-centered fluxes + factor = float(0.5f) * normal.x; + flux_i_density += factor * (momentum_nb.x + momentum_i.x); + flux_i_density_energy += factor * (flux_contribution_nb_density_energy.x + + flux_contribution_i_density_energy.x); + flux_i_momentum.x += factor * (flux_contribution_nb_momentum_x.x + + flux_contribution_i_momentum_x.x); + flux_i_momentum.y += factor * (flux_contribution_nb_momentum_y.x + + flux_contribution_i_momentum_y.x); + flux_i_momentum.z += factor * (flux_contribution_nb_momentum_z.x + + flux_contribution_i_momentum_z.x); + + factor = float(0.5f) * normal.y; + flux_i_density += factor * (momentum_nb.y + momentum_i.y); + flux_i_density_energy += factor * (flux_contribution_nb_density_energy.y + + flux_contribution_i_density_energy.y); + flux_i_momentum.x += factor * (flux_contribution_nb_momentum_x.y + + flux_contribution_i_momentum_x.y); + flux_i_momentum.y += factor * (flux_contribution_nb_momentum_y.y + + flux_contribution_i_momentum_y.y); + flux_i_momentum.z += factor * (flux_contribution_nb_momentum_z.y + + flux_contribution_i_momentum_z.y); + + factor = float(0.5f) * normal.z; + flux_i_density += factor * (momentum_nb.z + momentum_i.z); + flux_i_density_energy += factor * (flux_contribution_nb_density_energy.z + + flux_contribution_i_density_energy.z); + flux_i_momentum.x += factor * (flux_contribution_nb_momentum_x.z + + flux_contribution_i_momentum_x.z); + flux_i_momentum.y += factor * (flux_contribution_nb_momentum_y.z + + flux_contribution_i_momentum_y.z); + flux_i_momentum.z += factor * (flux_contribution_nb_momentum_z.z + + flux_contribution_i_momentum_z.z); + } else if (nb == -1) // a wing boundary + { + flux_i_momentum.x += normal.x * pressure_i; + flux_i_momentum.y += normal.y * pressure_i; + flux_i_momentum.z += normal.z * pressure_i; + } else if (nb == -2) // a far field boundary + { + factor = float(0.5f) * normal.x; + flux_i_density += factor * 
(ff_variable[VAR_MOMENTUM + 0] + momentum_i.x); + flux_i_density_energy += + factor * (ff_flux_contribution_density_energy[0].x + + flux_contribution_i_density_energy.x); + flux_i_momentum.x += factor * (ff_flux_contribution_momentum_x[0].x + + flux_contribution_i_momentum_x.x); + flux_i_momentum.y += factor * (ff_flux_contribution_momentum_y[0].x + + flux_contribution_i_momentum_y.x); + flux_i_momentum.z += factor * (ff_flux_contribution_momentum_z[0].x + + flux_contribution_i_momentum_z.x); + + factor = float(0.5f) * normal.y; + flux_i_density += factor * (ff_variable[VAR_MOMENTUM + 1] + momentum_i.y); + flux_i_density_energy += + factor * (ff_flux_contribution_density_energy[0].y + + flux_contribution_i_density_energy.y); + flux_i_momentum.x += factor * (ff_flux_contribution_momentum_x[0].y + + flux_contribution_i_momentum_x.y); + flux_i_momentum.y += factor * (ff_flux_contribution_momentum_y[0].y + + flux_contribution_i_momentum_y.y); + flux_i_momentum.z += factor * (ff_flux_contribution_momentum_z[0].y + + flux_contribution_i_momentum_z.y); + + factor = float(0.5f) * normal.z; + flux_i_density += factor * (ff_variable[VAR_MOMENTUM + 2] + momentum_i.z); + flux_i_density_energy += + factor * (ff_flux_contribution_density_energy[0].z + + flux_contribution_i_density_energy.z); + flux_i_momentum.x += factor * (ff_flux_contribution_momentum_x[0].z + + flux_contribution_i_momentum_x.z); + flux_i_momentum.y += factor * (ff_flux_contribution_momentum_y[0].z + + flux_contribution_i_momentum_y.z); + flux_i_momentum.z += factor * (ff_flux_contribution_momentum_z[0].z + + flux_contribution_i_momentum_z.z); + } + } + + fluxes[i + VAR_DENSITY * nelr] = flux_i_density; + fluxes[i + (VAR_MOMENTUM + 0) * nelr] = flux_i_momentum.x; + fluxes[i + (VAR_MOMENTUM + 1) * nelr] = flux_i_momentum.y; + fluxes[i + (VAR_MOMENTUM + 2) * nelr] = flux_i_momentum.z; + fluxes[i + VAR_DENSITY_ENERGY * nelr] = flux_i_density_energy; +} +void compute_flux(int nelr, int 
*elements_surrounding_elements, float *normals, + float *variables, float *fluxes) { + dim3 Dg(nelr / BLOCK_SIZE_3), Db(BLOCK_SIZE_3); + cuda_compute_flux<<>>(nelr, elements_surrounding_elements, normals, + variables, fluxes); + getLastCudaError("compute_flux failed"); +} + +__global__ void cuda_time_step(int j, int nelr, float *old_variables, + float *variables, float *step_factors, + float *fluxes) { + const int i = (blockDim.x * blockIdx.x + threadIdx.x); + + float factor = step_factors[i] / float(RK + 1 - j); + + variables[i + VAR_DENSITY * nelr] = old_variables[i + VAR_DENSITY * nelr] + + factor * fluxes[i + VAR_DENSITY * nelr]; + variables[i + VAR_DENSITY_ENERGY * nelr] = + old_variables[i + VAR_DENSITY_ENERGY * nelr] + + factor * fluxes[i + VAR_DENSITY_ENERGY * nelr]; + variables[i + (VAR_MOMENTUM + 0) * nelr] = + old_variables[i + (VAR_MOMENTUM + 0) * nelr] + + factor * fluxes[i + (VAR_MOMENTUM + 0) * nelr]; + variables[i + (VAR_MOMENTUM + 1) * nelr] = + old_variables[i + (VAR_MOMENTUM + 1) * nelr] + + factor * fluxes[i + (VAR_MOMENTUM + 1) * nelr]; + variables[i + (VAR_MOMENTUM + 2) * nelr] = + old_variables[i + (VAR_MOMENTUM + 2) * nelr] + + factor * fluxes[i + (VAR_MOMENTUM + 2) * nelr]; +} +void time_step(int j, int nelr, float *old_variables, float *variables, + float *step_factors, float *fluxes) { + dim3 Dg(nelr / BLOCK_SIZE_4), Db(BLOCK_SIZE_4); + cuda_time_step<<>>(j, nelr, old_variables, variables, step_factors, + fluxes); + getLastCudaError("update failed"); +} + +/* + * Main function + */ +int main(int argc, char **argv) { + printf("WG size of kernel:initialize = %d, WG size of " + "kernel:compute_step_factor = %d, WG size of kernel:compute_flux = " + "%d, WG size of kernel:time_step = %d\n", + BLOCK_SIZE_1, BLOCK_SIZE_2, BLOCK_SIZE_3, BLOCK_SIZE_4); + + if (argc < 2) { + std::cout << "specify data file name" << std::endl; + return 0; + } + const char *data_file_name = argv[1]; + + cudaDeviceProp prop; + int dev; + + 
checkCudaErrors(cudaSetDevice(0)); + + // set far field conditions and load them into constant memory on the gpu + { + float h_ff_variable[NVAR]; + const float angle_of_attack = + float(3.1415926535897931 / 180.0f) * float(deg_angle_of_attack); + + h_ff_variable[VAR_DENSITY] = float(1.4); + + float ff_pressure = float(1.0f); + float ff_speed_of_sound = + sqrt(GAMMA * ff_pressure / h_ff_variable[VAR_DENSITY]); + float ff_speed = float(ff_mach) * ff_speed_of_sound; + + float3 ff_velocity; + ff_velocity.x = ff_speed * float(cos((float)angle_of_attack)); + ff_velocity.y = ff_speed * float(sin((float)angle_of_attack)); + ff_velocity.z = 0.0f; + + h_ff_variable[VAR_MOMENTUM + 0] = + h_ff_variable[VAR_DENSITY] * ff_velocity.x; + h_ff_variable[VAR_MOMENTUM + 1] = + h_ff_variable[VAR_DENSITY] * ff_velocity.y; + h_ff_variable[VAR_MOMENTUM + 2] = + h_ff_variable[VAR_DENSITY] * ff_velocity.z; + + h_ff_variable[VAR_DENSITY_ENERGY] = + h_ff_variable[VAR_DENSITY] * (float(0.5f) * (ff_speed * ff_speed)) + + (ff_pressure / float(GAMMA - 1.0f)); + + float3 h_ff_momentum; + h_ff_momentum.x = *(h_ff_variable + VAR_MOMENTUM + 0); + h_ff_momentum.y = *(h_ff_variable + VAR_MOMENTUM + 1); + h_ff_momentum.z = *(h_ff_variable + VAR_MOMENTUM + 2); + float3 h_ff_flux_contribution_momentum_x; + float3 h_ff_flux_contribution_momentum_y; + float3 h_ff_flux_contribution_momentum_z; + float3 h_ff_flux_contribution_density_energy; + compute_flux_contribution(h_ff_variable[VAR_DENSITY], h_ff_momentum, + h_ff_variable[VAR_DENSITY_ENERGY], ff_pressure, + ff_velocity, h_ff_flux_contribution_momentum_x, + h_ff_flux_contribution_momentum_y, + h_ff_flux_contribution_momentum_z, + h_ff_flux_contribution_density_energy); + + // copy far field conditions to the gpu + checkCudaErrors( + cudaMemcpyToSymbol(ff_variable, h_ff_variable, NVAR * sizeof(float))); + checkCudaErrors(cudaMemcpyToSymbol(ff_flux_contribution_momentum_x, + &h_ff_flux_contribution_momentum_x, + sizeof(float3))); + 
checkCudaErrors(cudaMemcpyToSymbol(ff_flux_contribution_momentum_y, + &h_ff_flux_contribution_momentum_y, + sizeof(float3))); + checkCudaErrors(cudaMemcpyToSymbol(ff_flux_contribution_momentum_z, + &h_ff_flux_contribution_momentum_z, + sizeof(float3))); + + checkCudaErrors(cudaMemcpyToSymbol(ff_flux_contribution_density_energy, + &h_ff_flux_contribution_density_energy, + sizeof(float3))); + } + int nel; + int nelr; + + // read in domain geometry + float *areas; + int *elements_surrounding_elements; + float *normals; + { + std::ifstream file(data_file_name); + + file >> nel; + nelr = + BLOCK_SIZE_0 * ((nel / BLOCK_SIZE_0) + std::min(1, nel % BLOCK_SIZE_0)); + + float *h_areas = new float[nelr]; + int *h_elements_surrounding_elements = new int[nelr * NNB]; + float *h_normals = new float[nelr * NDIM * NNB]; + + // read in data + for (int i = 0; i < nel; i++) { + file >> h_areas[i]; + for (int j = 0; j < NNB; j++) { + file >> h_elements_surrounding_elements[i + j * nelr]; + if (h_elements_surrounding_elements[i + j * nelr] < 0) + h_elements_surrounding_elements[i + j * nelr] = -1; + h_elements_surrounding_elements[i + j * nelr]--; // it's coming in with + // Fortran numbering + + for (int k = 0; k < NDIM; k++) { + file >> h_normals[i + (j + k * NNB) * nelr]; + h_normals[i + (j + k * NNB) * nelr] = + -h_normals[i + (j + k * NNB) * nelr]; + } + } + } + + // fill in remaining data + int last = nel - 1; + for (int i = nel; i < nelr; i++) { + h_areas[i] = h_areas[last]; + for (int j = 0; j < NNB; j++) { + // duplicate the last element + h_elements_surrounding_elements[i + j * nelr] = + h_elements_surrounding_elements[last + j * nelr]; + for (int k = 0; k < NDIM; k++) + h_normals[last + (j + k * NNB) * nelr] = + h_normals[last + (j + k * NNB) * nelr]; + } + } + + areas = alloc(nelr); + upload(areas, h_areas, nelr); + + elements_surrounding_elements = alloc(nelr * NNB); + upload(elements_surrounding_elements, h_elements_surrounding_elements, + nelr * NNB); + + normals = 
alloc(nelr * NDIM * NNB); + upload(normals, h_normals, nelr * NDIM * NNB); + + delete[] h_areas; + delete[] h_elements_surrounding_elements; + delete[] h_normals; + } + + // Create arrays and set initial conditions + float *variables = alloc(nelr * NVAR); + initialize_variables(nelr, variables); + + float *old_variables = alloc(nelr * NVAR); + float *fluxes = alloc(nelr * NVAR); + float *step_factors = alloc(nelr); + + // make sure all memory is floatly allocated before we start timing + initialize_variables(nelr, old_variables); + initialize_variables(nelr, fluxes); + cudaMemset((void *)step_factors, 0, sizeof(float) * nelr); + // make sure CUDA isn't still doing something before we start timing + cudaThreadSynchronize(); + + // these need to be computed the first time in order to compute time step + std::cout << "Starting..." << std::endl; + + StopWatchInterface *timer = 0; + // unsigned int timer = 0; + + // CUT_SAFE_CALL( cutCreateTimer( &timer)); + // CUT_SAFE_CALL( cutStartTimer( timer)); + sdkCreateTimer(&timer); + sdkStartTimer(&timer); + // Begin iterations + for (int i = 0; i < iterations; i++) { + copy(old_variables, variables, nelr * NVAR); + + // for the first iteration we compute the time step + compute_step_factor(nelr, variables, areas, step_factors); + getLastCudaError("compute_step_factor failed"); + + for (int j = 0; j < RK; j++) { + compute_flux(nelr, elements_surrounding_elements, normals, variables, + fluxes); + getLastCudaError("compute_flux failed"); + time_step(j, nelr, old_variables, variables, step_factors, fluxes); + getLastCudaError("time_step failed"); + } + } + + cudaThreadSynchronize(); + // CUT_SAFE_CALL( cutStopTimer(timer) ); + sdkStopTimer(&timer); + + std::cout << (sdkGetAverageTimerValue(&timer) / 1000.0) / iterations + << " seconds per iteration" << std::endl; + + std::cout << "Saving solution..." << std::endl; + dump(variables, nel, nelr); + std::cout << "Saved solution..." << std::endl; + + std::cout << "Cleaning up..." 
<< std::endl; + dealloc(areas); + dealloc(elements_surrounding_elements); + dealloc(normals); + + dealloc(variables); + dealloc(old_variables); + dealloc(fluxes); + dealloc(step_factors); + + std::cout << "Done..." << std::endl; + + return 0; +} diff --git a/examples/cfd/run.sh b/examples/cfd/run.sh new file mode 100644 index 0000000..bc5d506 --- /dev/null +++ b/examples/cfd/run.sh @@ -0,0 +1,15 @@ +# # #!/bin/bash +clang++ euler3d.cu -I/usr/local/cuda-10.1/samples/common/inc --cuda-path=/usr/local/cuda-10.1 --cuda-gpu-arch=sm_50 -L/usr/local/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v + +/home/robinhan/repo/open_source_template/build/compilation/kernelTranslator euler3d-cuda-nvptx64-nvidia-cuda-sm_50.bc kernel.bc +/home/robinhan/repo/open_source_template/build/compilation/hostTranslator euler3d-host-x86_64-unknown-linux-gnu.bc host.bc + +llc --relocation-model=pic --filetype=obj kernel.bc +llc --relocation-model=pic --filetype=obj host.bc + +g++ -Wall -L/home/robinhan/repo/open_source_template/build/runtime -L/home/robinhan/repo/open_source_template/build/runtime/threadPool -o a.out -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread + +./a.out ../rodinia-data/cfd/fvcorr.domn.097K +# ./demo 1024 +# # # ./demo -f ../../data/matrix3.txt +# # # run -f ../../data/gaussian/matrix3.txt diff --git a/examples/gauss/gaussian-cuda-nvptx64-nvidia-cuda-sm_61.ll b/examples/gauss/gaussian-cuda-nvptx64-nvidia-cuda-sm_61.ll new file mode 100644 index 0000000..e0f12f5 --- /dev/null +++ b/examples/gauss/gaussian-cuda-nvptx64-nvidia-cuda-sm_61.ll @@ -0,0 +1,396 @@ +; ModuleID = 'gaussian-cuda-nvptx64-nvidia-cuda-sm_61.bc' +source_filename = "gaussian.cu" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +%struct.__cuda_builtin_threadIdx_t = type { i8 } +%struct.__cuda_builtin_blockIdx_t = type { i8 } +%struct.__cuda_builtin_blockDim_t = type { i8 } +%struct.cudaFuncAttributes = type { 
i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 } + +$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any + +$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any + +$_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv = comdat any + +$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv = comdat any + +$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv = comdat any + +$_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv = comdat any + +@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1 +@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1 +@blockDim = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockDim_t, align 1 + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 { +entry: + %p.addr = alloca i8**, align 8 + %s.addr = alloca i64, align 8 + store i8** %p, i8*** %p.addr, align 8 + store i64 %s, i64* %s.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 { +entry: + %p.addr = alloca %struct.cudaFuncAttributes*, align 8 + %c.addr = alloca i8*, align 8 + store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8 + store i8* %c, i8** %c.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 { +entry: + %value.addr = alloca i32*, align 8 + %attr.addr = alloca i32, align 4 + %device.addr = alloca i32, align 4 + store i32* %value, i32** %value.addr, align 8 + store i32 %attr, i32* %attr.addr, align 4 + store i32 %device, i32* %device.addr, align 4 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaGetDevice(i32* %device) #0 { +entry: + 
%device.addr = alloca i32*, align 8 + store i32* %device, i32** %device.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 { +entry: + %numBlocks.addr = alloca i32*, align 8 + %func.addr = alloca i8*, align 8 + %blockSize.addr = alloca i32, align 4 + %dynamicSmemSize.addr = alloca i64, align 8 + store i32* %numBlocks, i32** %numBlocks.addr, align 8 + store i8* %func, i8** %func.addr, align 8 + store i32 %blockSize, i32* %blockSize.addr, align 4 + store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 { +entry: + %numBlocks.addr = alloca i32*, align 8 + %func.addr = alloca i8*, align 8 + %blockSize.addr = alloca i32, align 4 + %dynamicSmemSize.addr = alloca i64, align 8 + %flags.addr = alloca i32, align 4 + store i32* %numBlocks, i32** %numBlocks.addr, align 8 + store i8* %func, i8** %func.addr, align 8 + store i32 %blockSize, i32* %blockSize.addr, align 4 + store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8 + store i32 %flags, i32* %flags.addr, align 4 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define dso_local void @_Z4Fan1PfS_ii(float* %m_cuda, float* %a_cuda, i32 %Size, i32 %t) #0 { +entry: + %m_cuda.addr = alloca float*, align 8 + %a_cuda.addr = alloca float*, align 8 + %Size.addr = alloca i32, align 4 + %t.addr = alloca i32, align 4 + store float* %m_cuda, float** %m_cuda.addr, align 8 + store float* %a_cuda, float** %a_cuda.addr, align 8 + store i32 %Size, i32* %Size.addr, align 4 + store i32 %t, i32* %t.addr, align 4 + %call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() 
#3 + %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3 + %call2 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3 + %mul = mul i32 %call1, %call2 + %add = add i32 %call, %mul + %0 = load i32, i32* %Size.addr, align 4 + %sub = sub nsw i32 %0, 1 + %1 = load i32, i32* %t.addr, align 4 + %sub3 = sub nsw i32 %sub, %1 + %cmp = icmp uge i32 %add, %sub3 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + br label %return + +if.end: ; preds = %entry + %2 = load float*, float** %a_cuda.addr, align 8 + %3 = load i32, i32* %Size.addr, align 4 + %call4 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3 + %call5 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3 + %mul6 = mul i32 %call4, %call5 + %call7 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3 + %add8 = add i32 %mul6, %call7 + %4 = load i32, i32* %t.addr, align 4 + %add9 = add i32 %add8, %4 + %add10 = add i32 %add9, 1 + %mul11 = mul i32 %3, %add10 + %idx.ext = zext i32 %mul11 to i64 + %add.ptr = getelementptr inbounds float, float* %2, i64 %idx.ext + %5 = load i32, i32* %t.addr, align 4 + %idx.ext12 = sext i32 %5 to i64 + %add.ptr13 = getelementptr inbounds float, float* %add.ptr, i64 %idx.ext12 + %6 = load float, float* %add.ptr13, align 4 + %7 = load float*, float** %a_cuda.addr, align 8 + %8 = load i32, i32* %Size.addr, align 4 + %9 = load i32, i32* %t.addr, align 4 + %mul14 = mul nsw i32 %8, %9 + %idx.ext15 = sext i32 %mul14 to i64 + %add.ptr16 = getelementptr inbounds float, float* %7, i64 %idx.ext15 + %10 = load i32, i32* %t.addr, align 4 + %idx.ext17 = sext i32 %10 to i64 + %add.ptr18 = getelementptr inbounds float, float* %add.ptr16, i64 %idx.ext17 + %11 = load float, float* %add.ptr18, align 4 + %div = fdiv float %6, %11 + %12 = load float*, float** %m_cuda.addr, align 8 + %13 = load i32, i32* %Size.addr, align 4 + %call19 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() 
#3 + %call20 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3 + %mul21 = mul i32 %call19, %call20 + %call22 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3 + %add23 = add i32 %mul21, %call22 + %14 = load i32, i32* %t.addr, align 4 + %add24 = add i32 %add23, %14 + %add25 = add i32 %add24, 1 + %mul26 = mul i32 %13, %add25 + %idx.ext27 = zext i32 %mul26 to i64 + %add.ptr28 = getelementptr inbounds float, float* %12, i64 %idx.ext27 + %15 = load i32, i32* %t.addr, align 4 + %idx.ext29 = sext i32 %15 to i64 + %add.ptr30 = getelementptr inbounds float, float* %add.ptr28, i64 %idx.ext29 + store float %div, float* %add.ptr30, align 4 + br label %return + +return: ; preds = %if.end, %if.then + ret void +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + ret i32 %0 +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() + ret i32 %0 +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + ret i32 %0 +} + +; Function Attrs: convergent noinline nounwind optnone +define dso_local void @_Z4Fan2PfS_S_iii(float* %m_cuda, float* %a_cuda, float* %b_cuda, i32 %Size, i32 %j1, i32 %t) #0 { +entry: + %m_cuda.addr = alloca float*, align 8 + %a_cuda.addr = alloca float*, align 8 + %b_cuda.addr = alloca float*, align 8 + %Size.addr = alloca i32, align 4 + %j1.addr = alloca i32, align 4 + %t.addr = alloca i32, align 4 + %xidx = alloca i32, align 4 + %yidx = alloca i32, align 4 + store float* %m_cuda, float** %m_cuda.addr, align 8 + store float* 
%a_cuda, float** %a_cuda.addr, align 8 + store float* %b_cuda, float** %b_cuda.addr, align 8 + store i32 %Size, i32* %Size.addr, align 4 + store i32 %j1, i32* %j1.addr, align 4 + store i32 %t, i32* %t.addr, align 4 + %call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3 + %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3 + %call2 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3 + %mul = mul i32 %call1, %call2 + %add = add i32 %call, %mul + %0 = load i32, i32* %Size.addr, align 4 + %sub = sub nsw i32 %0, 1 + %1 = load i32, i32* %t.addr, align 4 + %sub3 = sub nsw i32 %sub, %1 + %cmp = icmp uge i32 %add, %sub3 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + br label %if.end58 + +if.end: ; preds = %entry + %call4 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #3 + %call5 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #3 + %call6 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #3 + %mul7 = mul i32 %call5, %call6 + %add8 = add i32 %call4, %mul7 + %2 = load i32, i32* %Size.addr, align 4 + %3 = load i32, i32* %t.addr, align 4 + %sub9 = sub nsw i32 %2, %3 + %cmp10 = icmp uge i32 %add8, %sub9 + br i1 %cmp10, label %if.then11, label %if.end12 + +if.then11: ; preds = %if.end + br label %if.end58 + +if.end12: ; preds = %if.end + %call13 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3 + %call14 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3 + %mul15 = mul i32 %call13, %call14 + %call16 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3 + %add17 = add i32 %mul15, %call16 + store i32 %add17, i32* %xidx, align 4 + %call18 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #3 + %call19 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #3 + %mul20 = mul i32 %call18, %call19 + %call21 = call i32 
@_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #3 + %add22 = add i32 %mul20, %call21 + store i32 %add22, i32* %yidx, align 4 + %4 = load float*, float** %m_cuda.addr, align 8 + %5 = load i32, i32* %Size.addr, align 4 + %6 = load i32, i32* %xidx, align 4 + %add23 = add nsw i32 %6, 1 + %7 = load i32, i32* %t.addr, align 4 + %add24 = add nsw i32 %add23, %7 + %mul25 = mul nsw i32 %5, %add24 + %8 = load i32, i32* %t.addr, align 4 + %add26 = add nsw i32 %mul25, %8 + %idxprom = sext i32 %add26 to i64 + %arrayidx = getelementptr inbounds float, float* %4, i64 %idxprom + %9 = load float, float* %arrayidx, align 4 + %10 = load float*, float** %a_cuda.addr, align 8 + %11 = load i32, i32* %Size.addr, align 4 + %12 = load i32, i32* %t.addr, align 4 + %mul27 = mul nsw i32 %11, %12 + %13 = load i32, i32* %yidx, align 4 + %14 = load i32, i32* %t.addr, align 4 + %add28 = add nsw i32 %13, %14 + %add29 = add nsw i32 %mul27, %add28 + %idxprom30 = sext i32 %add29 to i64 + %arrayidx31 = getelementptr inbounds float, float* %10, i64 %idxprom30 + %15 = load float, float* %arrayidx31, align 4 + %mul32 = fmul contract float %9, %15 + %16 = load float*, float** %a_cuda.addr, align 8 + %17 = load i32, i32* %Size.addr, align 4 + %18 = load i32, i32* %xidx, align 4 + %add33 = add nsw i32 %18, 1 + %19 = load i32, i32* %t.addr, align 4 + %add34 = add nsw i32 %add33, %19 + %mul35 = mul nsw i32 %17, %add34 + %20 = load i32, i32* %yidx, align 4 + %21 = load i32, i32* %t.addr, align 4 + %add36 = add nsw i32 %20, %21 + %add37 = add nsw i32 %mul35, %add36 + %idxprom38 = sext i32 %add37 to i64 + %arrayidx39 = getelementptr inbounds float, float* %16, i64 %idxprom38 + %22 = load float, float* %arrayidx39, align 4 + %sub40 = fsub contract float %22, %mul32 + store float %sub40, float* %arrayidx39, align 4 + %23 = load i32, i32* %yidx, align 4 + %cmp41 = icmp eq i32 %23, 0 + br i1 %cmp41, label %if.then42, label %if.end58 + +if.then42: ; preds = %if.end12 + %24 = load float*, float** %m_cuda.addr, 
align 8 + %25 = load i32, i32* %Size.addr, align 4 + %26 = load i32, i32* %xidx, align 4 + %add43 = add nsw i32 %26, 1 + %27 = load i32, i32* %t.addr, align 4 + %add44 = add nsw i32 %add43, %27 + %mul45 = mul nsw i32 %25, %add44 + %28 = load i32, i32* %yidx, align 4 + %29 = load i32, i32* %t.addr, align 4 + %add46 = add nsw i32 %28, %29 + %add47 = add nsw i32 %mul45, %add46 + %idxprom48 = sext i32 %add47 to i64 + %arrayidx49 = getelementptr inbounds float, float* %24, i64 %idxprom48 + %30 = load float, float* %arrayidx49, align 4 + %31 = load float*, float** %b_cuda.addr, align 8 + %32 = load i32, i32* %t.addr, align 4 + %idxprom50 = sext i32 %32 to i64 + %arrayidx51 = getelementptr inbounds float, float* %31, i64 %idxprom50 + %33 = load float, float* %arrayidx51, align 4 + %mul52 = fmul contract float %30, %33 + %34 = load float*, float** %b_cuda.addr, align 8 + %35 = load i32, i32* %xidx, align 4 + %add53 = add nsw i32 %35, 1 + %36 = load i32, i32* %t.addr, align 4 + %add54 = add nsw i32 %add53, %36 + %idxprom55 = sext i32 %add54 to i64 + %arrayidx56 = getelementptr inbounds float, float* %34, i64 %idxprom55 + %37 = load float, float* %arrayidx56, align 4 + %sub57 = fsub contract float %37, %mul52 + store float %sub57, float* %arrayidx56, align 4 + br label %if.end58 + +if.end58: ; preds = %if.then, %if.then11, %if.then42, %if.end12 + ret void +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y() + ret i32 %0 +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() + ret i32 %0 +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #1 
comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y() + ret i32 %0 +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y() #2 + +attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind readnone } +attributes #3 = { convergent nounwind } + +!llvm.module.flags = !{!0, !1, !2} +!nvvm.annotations = !{!3, !4, !5, !6, !5, !7, !7, !7, !7, !8, !8, !7} +!llvm.ident = !{!9} +!nvvmir.version = !{!10} + +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0} +!3 = !{void 
(float*, float*, i32, i32)* @_Z4Fan1PfS_ii, !"kernel", i32 1} +!4 = !{void (float*, float*, float*, i32, i32, i32)* @_Z4Fan2PfS_S_iii, !"kernel", i32 1} +!5 = !{null, !"align", i32 8} +!6 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!7 = !{null, !"align", i32 16} +!8 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!9 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"} +!10 = !{i32 1, i32 4} diff --git a/examples/gauss/gaussian-host-x86_64-unknown-linux-gnu.ll b/examples/gauss/gaussian-host-x86_64-unknown-linux-gnu.ll new file mode 100644 index 0000000..a3dae03 --- /dev/null +++ b/examples/gauss/gaussian-host-x86_64-unknown-linux-gnu.ll @@ -0,0 +1,1551 @@ +; ModuleID = 'gaussian-host-x86_64-unknown-linux-gnu.bc' +source_filename = "gaussian.cu" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] } +%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 } +%struct.timeval = type { i64, i64 } +%struct.timezone = type { i32, i32 } +%struct.cudaDeviceProp = type { [256 x i8], %struct.CUuuid_st, [8 x i8], i32, i64, i64, i32, i32, i64, i32, [3 x i32], [3 x i32], i32, i64, i32, i32, i64, i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, [2 x i32], [2 x i32], [3 x i32], [2 x i32], [3 x i32], [3 x i32], i32, [2 x i32], [3 x i32], [2 x i32], i32, [2 x i32], [3 x i32], [2 x i32], [3 x i32], i32, [2 x i32], i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i64, i32, i32 } +%struct.CUuuid_st = type { [16 x i8] } +%struct.dim3 = type { i32, i32, i32 } 
+%struct.CUstream_st = type opaque + +$_ZSt3expf = comdat any + +$_ZN4dim3C2Ejjj = comdat any + +@Size = dso_local global i32 0, align 4 +@a = dso_local global float* null, align 8 +@b = dso_local global float* null, align 8 +@finalVec = dso_local global float* null, align 8 +@m = dso_local global float* null, align 8 +@fp = dso_local global %struct._IO_FILE* null, align 8 +@totalKernelTime = dso_local global i32 0, align 4 +@.str = private unnamed_addr constant [56 x i8] c"WG size of kernel 1 = %d, WG size of kernel 2= %d X %d\0A\00", align 1 +@.str.1 = private unnamed_addr constant [45 x i8] c"Usage: gaussian -f filename / -s size [-q]\0A\0A\00", align 1 +@.str.2 = private unnamed_addr constant [62 x i8] c"-q (quiet) suppresses printing the matrix and result values.\0A\00", align 1 +@.str.3 = private unnamed_addr constant [34 x i8] c"-f (filename) path of input file\0A\00", align 1 +@.str.4 = private unnamed_addr constant [66 x i8] c"-s (size) size of matrix. Create matrix and rhs in this program \0A\00", align 1 +@.str.5 = private unnamed_addr constant [68 x i8] c"The first line of the file contains the dimension of the matrix, n.\00", align 1 +@.str.6 = private unnamed_addr constant [43 x i8] c"The second line of the file is a newline.\0A\00", align 1 +@.str.7 = private unnamed_addr constant [64 x i8] c"The next n lines contain n tab separated values for the matrix.\00", align 1 +@.str.8 = private unnamed_addr constant [41 x i8] c"The next line of the file is a newline.\0A\00", align 1 +@.str.9 = private unnamed_addr constant [70 x i8] c"The next line of the file is a 1xn vector with tab separated values.\0A\00", align 1 +@.str.10 = private unnamed_addr constant [52 x i8] c"The next line of the file is a newline. (optional)\0A\00", align 1 +@.str.11 = private unnamed_addr constant [69 x i8] c"The final line of the file is the pre-computed solution. 
(optional)\0A\00", align 1 +@.str.12 = private unnamed_addr constant [23 x i8] c"Example: matrix4.txt:\0A\00", align 1 +@.str.13 = private unnamed_addr constant [3 x i8] c"4\0A\00", align 1 +@.str.14 = private unnamed_addr constant [2 x i8] c"\0A\00", align 1 +@.str.15 = private unnamed_addr constant [19 x i8] c"-0.6\09-0.5\090.7\090.3\0A\00", align 1 +@.str.16 = private unnamed_addr constant [19 x i8] c"-0.3\09-0.9\090.3\090.7\0A\00", align 1 +@.str.17 = private unnamed_addr constant [21 x i8] c"-0.4\09-0.5\09-0.3\09-0.8\0A\00", align 1 +@.str.18 = private unnamed_addr constant [18 x i8] c"0.0\09-0.1\090.2\090.9\0A\00", align 1 +@.str.19 = private unnamed_addr constant [24 x i8] c"-0.85\09-0.68\090.24\09-0.53\0A\00", align 1 +@.str.20 = private unnamed_addr constant [19 x i8] c"0.7\090.0\09-0.4\09-0.5\0A\00", align 1 +@.str.21 = private unnamed_addr constant [47 x i8] c"Create matrix internally in parse, size = %d \0A\00", align 1 +@.str.22 = private unnamed_addr constant [20 x i8] c"Read file from %s \0A\00", align 1 +@.str.23 = private unnamed_addr constant [15 x i8] c"Matrix m is: \0A\00", align 1 +@.str.24 = private unnamed_addr constant [15 x i8] c"Matrix a is: \0A\00", align 1 +@.str.25 = private unnamed_addr constant [14 x i8] c"Array b is: \0A\00", align 1 +@.str.26 = private unnamed_addr constant [25 x i8] c"The final solution is: \0A\00", align 1 +@.str.27 = private unnamed_addr constant [49 x i8] c"\0ATime total (including memory transfers)\09%f sec\0A\00", align 1 +@.str.28 = private unnamed_addr constant [31 x i8] c"Time for CUDA kernels:\09%f sec\0A\00", align 1 +@.str.29 = private unnamed_addr constant [23 x i8] c"Total Device found: %d\00", align 1 +@.str.30 = private unnamed_addr constant [22 x i8] c"\0ADevice Name \09\09 - %s \00", align 1 +@.str.31 = private unnamed_addr constant [40 x i8] c"\0A**************************************\00", align 1 +@.str.32 = private unnamed_addr constant [33 x i8] c"\0ATotal Global Memory\09\09\09 - %lu KB\00", 
align 1 +@.str.33 = private unnamed_addr constant [46 x i8] c"\0AShared memory available per block \09 - %lu KB\00", align 1 +@.str.34 = private unnamed_addr constant [45 x i8] c"\0ANumber of registers per thread block \09 - %d\00", align 1 +@.str.35 = private unnamed_addr constant [31 x i8] c"\0AWarp size in threads \09\09\09 - %d\00", align 1 +@.str.36 = private unnamed_addr constant [31 x i8] c"\0AMemory Pitch \09\09\09\09 - %zu bytes\00", align 1 +@.str.37 = private unnamed_addr constant [35 x i8] c"\0AMaximum threads per block \09\09 - %d\00", align 1 +@.str.38 = private unnamed_addr constant [47 x i8] c"\0AMaximum Thread Dimension (block) \09 - %d %d %d\00", align 1 +@.str.39 = private unnamed_addr constant [46 x i8] c"\0AMaximum Thread Dimension (grid) \09 - %d %d %d\00", align 1 +@.str.40 = private unnamed_addr constant [39 x i8] c"\0ATotal constant memory \09\09\09 - %zu bytes\00", align 1 +@.str.41 = private unnamed_addr constant [23 x i8] c"\0ACUDA ver \09\09\09\09 - %d.%d\00", align 1 +@.str.42 = private unnamed_addr constant [26 x i8] c"\0AClock rate \09\09\09\09 - %d KHz\00", align 1 +@.str.43 = private unnamed_addr constant [35 x i8] c"\0ATexture Alignment \09\09\09 - %zu bytes\00", align 1 +@.str.44 = private unnamed_addr constant [26 x i8] c"\0ADevice Overlap \09\09\09\09 - %s\00", align 1 +@.str.45 = private unnamed_addr constant [8 x i8] c"Allowed\00", align 1 +@.str.46 = private unnamed_addr constant [12 x i8] c"Not Allowed\00", align 1 +@.str.47 = private unnamed_addr constant [38 x i8] c"\0ANumber of Multi processors \09\09 - %d\0A\0A\00", align 1 +@.str.48 = private unnamed_addr constant [4 x i8] c"\0A%s\00", align 1 +@.str.49 = private unnamed_addr constant [22 x i8] c"The file name is: %s\0A\00", align 1 +@.str.50 = private unnamed_addr constant [2 x i8] c"r\00", align 1 +@.str.51 = private unnamed_addr constant [3 x i8] c"%d\00", align 1 +@.str.52 = private unnamed_addr constant [24 x i8] c"The input matrix a is:\0A\00", align 1 +@.str.53 
= private unnamed_addr constant [23 x i8] c"The input array b is:\0A\00", align 1 +@.str.54 = private unnamed_addr constant [18 x i8] c"1d grid size: %d\0A\00", align 1 +@.str.55 = private unnamed_addr constant [14 x i8] c"BlockXY: %d \0A\00", align 1 +@.str.56 = private unnamed_addr constant [32 x i8] c"first grid size: %d second: %d\0A\00", align 1 +@.str.57 = private unnamed_addr constant [5 x i8] c"Fan2\00", align 1 +@.str.58 = private unnamed_addr constant [3 x i8] c"%f\00", align 1 +@.str.59 = private unnamed_addr constant [6 x i8] c"%.2f \00", align 1 +@.str.60 = private unnamed_addr constant [3 x i8] c"\0A\0A\00", align 1 +@stderr = external dso_local global %struct._IO_FILE*, align 8 +@.str.61 = private unnamed_addr constant [21 x i8] c"Cuda error: %s: %s.\0A\00", align 1 +@0 = private unnamed_addr constant [14 x i8] c"_Z4Fan1PfS_ii\00", align 1 +@1 = private unnamed_addr constant [17 x i8] c"_Z4Fan2PfS_S_iii\00", align 1 +@2 = private constant [16065 x i8] c"P\EDU\BA\01\00\10\00\B0>\00\00\00\00\00\00\02\00\01\01@\00\00\00h4\00\00\00\00\00\00\00\00\00\00\00\00\00\00\07\00\01\00=\00\00\00\00\00\00\00\00\00\00\00\11\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\7FELF\02\01\013\07\00\00\00\00\00\00\00\02\00\BE\00e\00\00\00\00\00\00\00\00\00\00\00\C03\00\00\00\00\00\00\C00\00\00\00\00\00\00=\05=\00@\008\00\03\00@\00\0C\00\01\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00.text._Z4Fan2PfS_S_iii\00.nv.info._Z4Fan2PfS_S_iii\00.nv.shared._Z4Fan2PfS_S_iii\00.nv.global\00.nv.constant0._Z4Fan2PfS_S_iii\00.text._Z4Fan1PfS_ii\00.nv.info._Z4Fan1PfS_ii\00.nv.shared._Z4Fan1PfS_ii\00.nv.constant0._Z4Fan1PfS_ii\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00_Z4Fan2PfS_S_iii\00.text._Z4Fan2PfS_S_iii\00.nv.info._Z4Fan2PfS_S_iii\00.nv.shared._Z4Fan2PfS_S_iii\00.nv.global\00threadIdx\00blockIdx\00blockDim\00.nv.constant0._Z4Fan2PfS_S_iii\00_param\00_Z4Fan1PfS_ii\00.text._Z4Fan1PfS_ii\00.nv.info._Z4Fan1PfS_ii\00.nv.shared._
Z4Fan1PfS_ii\00$_Z4Fan1PfS_ii$__cuda_sm3x_div_rn_noftz_f32\00$_Z4Fan1PfS_ii$__cuda_sm3x_div_rn_noftz_f32_slowpath\00.nv.constant0._Z4Fan1PfS_ii\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00C\00\00\00\03\00\09\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\90\00\00\00\03\00\0B\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\9B\00\00\00\01\00\0B\00\01\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\A5\00\00\00\01\00\0B\00\02\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\AE\00\00\00\01\00\0B\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\B7\00\00\00\03\00\07\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\EB\00\00\00\03\00\0A\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00/\01\00\00\22\00\0A\00\D8\09\00\00\00\00\00\00`\01\00\00\00\00\00\00[\01\00\00\22\00\0A\008\0B\00\00\00\00\00\00H\08\00\00\00\00\00\00\90\01\00\00\03\00\08\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\002\00\00\00\12\10\09\00\00\00\00\00\00\00\00\00\80\14\00\00\00\00\00\00\DD\00\00\00\12\10\0A\00\00\00\00\00\00\00\00\00\80\13\00\00\00\00\00\00\04/\08\00\0C\00\00\00\13\00\00\00\04#\08\00\09\00\00\00\00\00\00\00\04\12\08\00\09\00\00\00\00\00\00\00\04\11\08\00\09\00\00\00\00\00\00\00\04#\08\00\08\00\00\00\00\00\00\00\04\12\08\00\08\00\00\00\00\00\00\00\04\11\08\00\08\00\00\00\00\00\00\00\04#\08\00\0C\00\00\00\00\00\00\00\04\12\08\00\0C\00\00\00\18\00\00\00\04\11\08\00\0C\00\00\00\18\00\00\00\04/\08\00\0B\00\00\00\0F\00\00\00\04#\08\00\0B\00\00\00\00\00\00\00\04\12\08\00\0B\00\00\000\00\00\00\04\11\08\00\0B\00\00\000\00\00\00\010\00\00\01*\00\00\04\0A\08\00\06\00\00\00@\01$\00\03\19$\00\04\17\0C\00\00\00\00\00\05\00 
\00\00\F0\11\00\04\17\0C\00\00\00\00\00\04\00\1C\00\00\F0\11\00\04\17\0C\00\00\00\00\00\03\00\18\00\00\F0\11\00\04\17\0C\00\00\00\00\00\02\00\10\00\00\F0!\00\04\17\0C\00\00\00\00\00\01\00\08\00\00\F0!\00\04\17\0C\00\00\00\00\00\00\00\00\00\00\F0!\00\03\1B\FF\00\04\1D\10\00\F8\04\00\00h\06\00\00\90\07\00\008\08\00\00\04\1C\04\00p\14\00\00\04\1E\04\00 \00\00\00\010\00\00\01*\00\00\04\0A\08\00\0A\00\00\00@\01\18\00\03\19\18\00\04\17\0C\00\00\00\00\00\03\00\14\00\00\F0\11\00\04\17\0C\00\00\00\00\00\02\00\10\00\00\F0\11\00\04\17\0C\00\00\00\00\00\01\00\08\00\00\F0!\00\04\17\0C\00\00\00\00\00\00\00\00\00\00\F0!\00\03\1B\FF\00\04\1D\08\00\98\03\00\00\C8\05\00\00\04\1C\04\00\D0\09\00\00\04\1E\04\00p\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\
00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\EF\1F\E0\FD\03!\00\D3rd<3>;\0A\0Amov.u2\00\1B,e\00b;\0Acvta\8D\00\04%\00\13,\\\00\22ld\C8\00\02\18\00nrd2, [\CE\00\1E])\00\1F1)\00\01a0];\0Ast#\00\81[%SP+0],,\00\0A\16\00\128\16\00\222;\B6\00\01\D8\00a1, 
999(\00\02g\00\00\EF\00\18[\9E\01\03M\00\AF1;\0Aret;\0A\0A}\D6\01\1A\FE\02FuncGetAttributes\E1\01\0D#\00\0E\EC\01\0F+\00\06\0F\F7\01\1B\1F1\F7\01Q\1F1\F7\01!\0E\D9\00\0F\02\02\0F\0E8\01\0F\0D\02\8DhDevice\B4\00\0E\0E\02\0E$\00\0F\0F\02\00/32,\00\0B\1F1,\00\18\1F2<\02\13\1F2<\02\1F\1D4<\02\1F2<\02\0C\1F2<\02\13\01_\00\04;\02\0F\D9\00\07\1D]4\00\1F14\00\06\0Fp\02\10\0E\9A\01\0Fq\02\12(32q\02\0B\15\00!12\16\00\09\86\02\1F3\86\02\15\1F3\86\02#2Get\CB\00\0E}\02\05\1B\00\04\DA\00\0F\1C\02\13?3[8W\04.\0F\1B\02\0D\1F3W\04\19\04\B3\01\0D\D0\00\0F\AA\01\06\0F\05\04W\F0\04OccupancyMaxActiveB3\08\FE\03sPerMultiprocessor\9F\01\0F;\00\16\0EB\06\0FC\00%\0EJ\04\0FC\00\1E\1F2\86\00/\1F3\88\02\13O4[32\89\02\1C\1D3\89\02\1F4\89\02\0C\1F4\89\02\19\133\89\02\0F\F1\00\1E\0F\BC\04\00\0FK\00$/2]w\07\00\0FL\00$\0F\1F\05\01\0F\98\00%\0F\A7\07\1D\097\05\186M\05\04,\00\2224-\00\183\CF\03\1F2\CF\03\15\1F2\CF\03L\9FWithFlags\D8\03(\05D\00\0E\E1\03\0FL\00'\0F\EA\030\0CL\00\1F2\98\008\1F3\98\008\1F4H\04\13O5[40\EC\08.\0FH\04\0D\1F5\EC\08\1C\0F\F9\00+\1F]\9C\040\0D\9A\01\0F\A5\040\0D:\02\0F\AE\041\0D\DB\02\0F\B7\041\0D|\03\0F\C0\04I\08-\00\1F3$\0A6\F6\0Dvisible .entry _Z4Fan1PfS_ii\96\04\00\90\00\09\1B\00\0Em\04\0F#\00\05\07D\04\00\A8\00\0F#\00\01\1F2#\00\08\0F\EE\07\1BO6[24\A6\03\15wpred 
%px\0A\10fH\01\18f\B8\03\03\9A\0A\0E\12\08/21\CB\03\0C\1F6\CB\03\1C\0E\F5\00\0FM\03\06\0EC\01\0F$\03\07\0E\92\01\0F\FB\02\07\0F\E1\01\02\13]\C8\00#to\DB\12\07+\04\02\C5\02\01\9E\0D\0A\1C\00\144\B4\02\0F;\00\03\145\16\03\0F;\00\00\116\1C\00\1F5H\03\02\1F6H\03\02\1F4H\03\09\04\16\00/201\03\02h%tid.x\15\00\00\BB\00\0A\1F2;\0A\02\03~\00\179~\00$3,@\0AS;\0Aneg\16\00\114\B7\0AV;\0Afma\CC\0A$5,\1A\00\132l\0A\0E\81\0A\175s\01(43s\01\01j\04\22ne\C1\003p3,!\00\02g\04\163g\04\1B6g\04\135O\04\185\B3\0D/21)\03\01/44)\03\02/45)\03\02/46)\03\03347,5\00\00$\00\09\1A\00\02u\01\1E7)\03349,\80\00\00&\00\07e\00/50w\02\03351,\1E\00\0Ce\00552,Q\00-51\F9\01\02\EE\0C\1A5\F9\01\02\F6\04\01\1C\00\0B\F9\01$4,/\01\01'\00\07\F9\01\136\F9\01\194Z\03(25\F7\0E\0C|\00\146\AF\00\08|\00$7,\1C\00\0B|\00$8,R\00\01'\00\07|\00\137|\00\1F8d\00\00\02J\01\0A0\0E\02I\05\01\1C\00\0Ad\00(31d\00'30d\00\128d\00)31\D9\02\00^\00+f6\D9\02\01y\04\02\1B\00\00\CD\00)f8\DA\02!31\DA\02\00h\08\0F\E8\06\02\B06:\0Aret;\0A\0A}\0A\00\00\00\00\00\00", section ".nv_fatbin", align 8 +@__cuda_fatbin_wrapper = internal constant { i32, i32, i8*, i8* } { i32 1180844977, i32 1, i8* getelementptr inbounds ([16065 x i8], [16065 x i8]* @2, i64 0, i64 0), i8* null }, section ".nvFatBinSegment", align 8 +@__cuda_gpubin_handle = internal global i8** null, align 8 +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* bitcast (void (i8*)* @__cuda_module_ctor to void ()*), i8* null }] + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z13create_matrixPfi(float* %m, i32 %size) #0 { +entry: + %m.addr = alloca float*, align 8 + %size.addr = alloca i32, align 4 + %i = alloca i32, align 4 + %j = alloca i32, align 4 + %lamda = alloca float, align 4 + %saved_stack = alloca i8*, align 8 + %__vla_expr0 = alloca i64, align 8 + %coe_i = alloca float, align 4 + store float* %m, float** %m.addr, align 8 + store i32 %size, i32* %size.addr, align 4 + store float 
0xBF847AE140000000, float* %lamda, align 4 + %0 = load i32, i32* %size.addr, align 4 + %mul = mul nsw i32 2, %0 + %sub = sub nsw i32 %mul, 1 + %1 = zext i32 %sub to i64 + %2 = call i8* @llvm.stacksave() + store i8* %2, i8** %saved_stack, align 8 + %vla = alloca float, i64 %1, align 16 + store i64 %1, i64* %__vla_expr0, align 8 + store float 0.000000e+00, float* %coe_i, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %3 = load i32, i32* %i, align 4 + %4 = load i32, i32* %size.addr, align 4 + %cmp = icmp slt i32 %3, %4 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %5 = load float, float* %lamda, align 4 + %6 = load i32, i32* %i, align 4 + %conv = sitofp i32 %6 to float + %mul1 = fmul contract float %5, %conv + %call = call float @_ZSt3expf(float %mul1) + %mul2 = fmul contract float 1.000000e+01, %call + store float %mul2, float* %coe_i, align 4 + %7 = load i32, i32* %size.addr, align 4 + %sub3 = sub nsw i32 %7, 1 + %8 = load i32, i32* %i, align 4 + %add = add nsw i32 %sub3, %8 + store i32 %add, i32* %j, align 4 + %9 = load float, float* %coe_i, align 4 + %10 = load i32, i32* %j, align 4 + %idxprom = sext i32 %10 to i64 + %arrayidx = getelementptr inbounds float, float* %vla, i64 %idxprom + store float %9, float* %arrayidx, align 4 + %11 = load i32, i32* %size.addr, align 4 + %sub4 = sub nsw i32 %11, 1 + %12 = load i32, i32* %i, align 4 + %sub5 = sub nsw i32 %sub4, %12 + store i32 %sub5, i32* %j, align 4 + %13 = load float, float* %coe_i, align 4 + %14 = load i32, i32* %j, align 4 + %idxprom6 = sext i32 %14 to i64 + %arrayidx7 = getelementptr inbounds float, float* %vla, i64 %idxprom6 + store float %13, float* %arrayidx7, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %15 = load i32, i32* %i, align 4 + %inc = add nsw i32 %15, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + store i32 0, i32* %i, align 4 + br label 
%for.cond8 + +for.cond8: ; preds = %for.inc26, %for.end + %16 = load i32, i32* %i, align 4 + %17 = load i32, i32* %size.addr, align 4 + %cmp9 = icmp slt i32 %16, %17 + br i1 %cmp9, label %for.body10, label %for.end28 + +for.body10: ; preds = %for.cond8 + store i32 0, i32* %j, align 4 + br label %for.cond11 + +for.cond11: ; preds = %for.inc23, %for.body10 + %18 = load i32, i32* %j, align 4 + %19 = load i32, i32* %size.addr, align 4 + %cmp12 = icmp slt i32 %18, %19 + br i1 %cmp12, label %for.body13, label %for.end25 + +for.body13: ; preds = %for.cond11 + %20 = load i32, i32* %size.addr, align 4 + %sub14 = sub nsw i32 %20, 1 + %21 = load i32, i32* %i, align 4 + %sub15 = sub nsw i32 %sub14, %21 + %22 = load i32, i32* %j, align 4 + %add16 = add nsw i32 %sub15, %22 + %idxprom17 = sext i32 %add16 to i64 + %arrayidx18 = getelementptr inbounds float, float* %vla, i64 %idxprom17 + %23 = load float, float* %arrayidx18, align 4 + %24 = load float*, float** %m.addr, align 8 + %25 = load i32, i32* %i, align 4 + %26 = load i32, i32* %size.addr, align 4 + %mul19 = mul nsw i32 %25, %26 + %27 = load i32, i32* %j, align 4 + %add20 = add nsw i32 %mul19, %27 + %idxprom21 = sext i32 %add20 to i64 + %arrayidx22 = getelementptr inbounds float, float* %24, i64 %idxprom21 + store float %23, float* %arrayidx22, align 4 + br label %for.inc23 + +for.inc23: ; preds = %for.body13 + %28 = load i32, i32* %j, align 4 + %inc24 = add nsw i32 %28, 1 + store i32 %inc24, i32* %j, align 4 + br label %for.cond11 + +for.end25: ; preds = %for.cond11 + br label %for.inc26 + +for.inc26: ; preds = %for.end25 + %29 = load i32, i32* %i, align 4 + %inc27 = add nsw i32 %29, 1 + store i32 %inc27, i32* %i, align 4 + br label %for.cond8 + +for.end28: ; preds = %for.cond8 + %30 = load i8*, i8** %saved_stack, align 8 + call void @llvm.stackrestore(i8* %30) + ret void +} + +; Function Attrs: nounwind +declare i8* @llvm.stacksave() #1 + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local 
float @_ZSt3expf(float %__x) #2 comdat { +entry: + %__x.addr = alloca float, align 4 + store float %__x, float* %__x.addr, align 4 + %0 = load float, float* %__x.addr, align 4 + %call = call float @expf(float %0) #1 + ret float %call +} + +; Function Attrs: nounwind +declare void @llvm.stackrestore(i8*) #1 + +; Function Attrs: noinline norecurse optnone uwtable +define dso_local i32 @main(i32 %argc, i8** %argv) #3 { +entry: + %retval = alloca i32, align 4 + %argc.addr = alloca i32, align 4 + %argv.addr = alloca i8**, align 8 + %verbose = alloca i32, align 4 + %i = alloca i32, align 4 + %j = alloca i32, align 4 + %flag = alloca i8, align 1 + %time_start = alloca %struct.timeval, align 8 + %time_end = alloca %struct.timeval, align 8 + %time_total = alloca i32, align 4 + store i32 0, i32* %retval, align 4 + store i32 %argc, i32* %argc.addr, align 4 + store i8** %argv, i8*** %argv.addr, align 8 + %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([56 x i8], [56 x i8]* @.str, i64 0, i64 0), i32 512, i32 1, i32 1) + store i32 1, i32* %verbose, align 4 + %0 = load i32, i32* %argc.addr, align 4 + %cmp = icmp slt i32 %0, 2 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %call1 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([45 x i8], [45 x i8]* @.str.1, i64 0, i64 0)) + %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([62 x i8], [62 x i8]* @.str.2, i64 0, i64 0)) + %call3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([34 x i8], [34 x i8]* @.str.3, i64 0, i64 0)) + %call4 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([66 x i8], [66 x i8]* @.str.4, i64 0, i64 0)) + %call5 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([68 x i8], [68 x i8]* @.str.5, i64 0, i64 0)) + %call6 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([43 x i8], [43 x i8]* @.str.6, i64 0, i64 0)) + %call7 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([64 x i8], [64 x i8]* @.str.7, i64 0, i64 0)) + %call8 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([41 x i8], [41 x i8]* @.str.8, i64 0, i64 0)) + %call9 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([70 x i8], [70 x i8]* @.str.9, i64 0, i64 0)) + %call10 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([52 x i8], [52 x i8]* @.str.10, i64 0, i64 0)) + %call11 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([69 x i8], [69 x i8]* @.str.11, i64 0, i64 0)) + %call12 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str.12, i64 0, i64 0)) + %call13 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.13, i64 0, i64 0)) + %call14 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.14, i64 0, i64 0)) + %call15 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([19 x i8], [19 x i8]* @.str.15, i64 0, i64 0)) + %call16 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([19 x i8], [19 x i8]* @.str.16, i64 0, i64 0)) + %call17 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([21 x i8], [21 x i8]* @.str.17, i64 0, i64 0)) + %call18 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str.18, i64 0, i64 0)) + %call19 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.14, i64 0, i64 0)) + %call20 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.19, i64 0, i64 0)) + %call21 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.14, i64 0, i64 0)) + %call22 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([19 x i8], [19 x i8]* @.str.20, i64 0, i64 0)) + call void @exit(i32 0) #9 + unreachable + +if.end: ; preds = %entry + %call23 = call i32 @cudaSetDevice(i32 0) + call void @_Z21PrintDevicePropertiesv() + store i32 1, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc61, %if.end + %1 = load i32, i32* %i, align 4 + %2 = load i32, i32* %argc.addr, align 4 + %cmp24 = icmp slt i32 %1, %2 + br i1 %cmp24, label %for.body, label %for.end63 + +for.body: ; preds = %for.cond + %3 = load i8**, i8*** %argv.addr, align 8 + %4 = load i32, i32* %i, align 4 + %idxprom = sext i32 %4 to i64 + %arrayidx = getelementptr inbounds i8*, i8** %3, i64 %idxprom + %5 = load i8*, i8** %arrayidx, align 8 + %arrayidx25 = getelementptr inbounds i8, i8* %5, i64 0 + %6 = load i8, i8* %arrayidx25, align 1 + %conv = sext i8 %6 to i32 + %cmp26 = icmp eq i32 %conv, 45 + br i1 %cmp26, label %if.then27, label %if.end60 + +if.then27: ; preds = %for.body + %7 = load i8**, i8*** %argv.addr, align 8 + %8 = load i32, i32* %i, align 4 + %idxprom28 = sext i32 %8 to i64 + %arrayidx29 = getelementptr inbounds i8*, i8** %7, i64 %idxprom28 + %9 = load i8*, i8** %arrayidx29, align 8 + %arrayidx30 = getelementptr inbounds i8, i8* %9, i64 1 + %10 = load i8, i8* %arrayidx30, align 1 + store i8 %10, i8* %flag, align 1 + %11 = load i8, i8* %flag, align 1 + %conv31 = sext i8 %11 to i32 + switch i32 %conv31, label %sw.epilog [ + i32 115, label %sw.bb + i32 102, label %sw.bb52 + i32 113, label %sw.bb59 + ] + +sw.bb: ; preds = %if.then27 + %12 = load i32, i32* %i, align 4 + %inc = add nsw i32 %12, 1 + store i32 %inc, i32* %i, align 4 + %13 = load i8**, i8*** %argv.addr, align 8 + %14 = load i32, i32* %i, align 4 + %idxprom32 = sext i32 %14 to i64 + %arrayidx33 = getelementptr inbounds i8*, i8** %13, i64 %idxprom32 + %15 = load i8*, i8** %arrayidx33, align 8 + %call34 = call i32 @atoi(i8* %15) #10 + store i32 %call34, i32* @Size, align 4 + %16 = load i32, i32* @Size, 
align 4 + %call35 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([47 x i8], [47 x i8]* @.str.21, i64 0, i64 0), i32 %16) + %17 = load i32, i32* @Size, align 4 + %18 = load i32, i32* @Size, align 4 + %mul = mul nsw i32 %17, %18 + %conv36 = sext i32 %mul to i64 + %mul37 = mul i64 %conv36, 4 + %call38 = call noalias i8* @malloc(i64 %mul37) #1 + %19 = bitcast i8* %call38 to float* + store float* %19, float** @a, align 8 + %20 = load float*, float** @a, align 8 + %21 = load i32, i32* @Size, align 4 + call void @_Z13create_matrixPfi(float* %20, i32 %21) + %22 = load i32, i32* @Size, align 4 + %conv39 = sext i32 %22 to i64 + %mul40 = mul i64 %conv39, 4 + %call41 = call noalias i8* @malloc(i64 %mul40) #1 + %23 = bitcast i8* %call41 to float* + store float* %23, float** @b, align 8 + store i32 0, i32* %j, align 4 + br label %for.cond42 + +for.cond42: ; preds = %for.inc, %sw.bb + %24 = load i32, i32* %j, align 4 + %25 = load i32, i32* @Size, align 4 + %cmp43 = icmp slt i32 %24, %25 + br i1 %cmp43, label %for.body44, label %for.end + +for.body44: ; preds = %for.cond42 + %26 = load float*, float** @b, align 8 + %27 = load i32, i32* %j, align 4 + %idxprom45 = sext i32 %27 to i64 + %arrayidx46 = getelementptr inbounds float, float* %26, i64 %idxprom45 + store float 1.000000e+00, float* %arrayidx46, align 4 + br label %for.inc + +for.inc: ; preds = %for.body44 + %28 = load i32, i32* %j, align 4 + %inc47 = add nsw i32 %28, 1 + store i32 %inc47, i32* %j, align 4 + br label %for.cond42 + +for.end: ; preds = %for.cond42 + %29 = load i32, i32* @Size, align 4 + %30 = load i32, i32* @Size, align 4 + %mul48 = mul nsw i32 %29, %30 + %conv49 = sext i32 %mul48 to i64 + %mul50 = mul i64 %conv49, 4 + %call51 = call noalias i8* @malloc(i64 %mul50) #1 + %31 = bitcast i8* %call51 to float* + store float* %31, float** @m, align 8 + br label %sw.epilog + +sw.bb52: ; preds = %if.then27 + %32 = load i32, i32* %i, align 4 + %inc53 = add nsw i32 %32, 1 + store i32 %inc53, i32* %i, align 4 + 
%33 = load i8**, i8*** %argv.addr, align 8 + %34 = load i32, i32* %i, align 4 + %idxprom54 = sext i32 %34 to i64 + %arrayidx55 = getelementptr inbounds i8*, i8** %33, i64 %idxprom54 + %35 = load i8*, i8** %arrayidx55, align 8 + %call56 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([20 x i8], [20 x i8]* @.str.22, i64 0, i64 0), i8* %35) + %36 = load i8**, i8*** %argv.addr, align 8 + %37 = load i32, i32* %i, align 4 + %idxprom57 = sext i32 %37 to i64 + %arrayidx58 = getelementptr inbounds i8*, i8** %36, i64 %idxprom57 + %38 = load i8*, i8** %arrayidx58, align 8 + call void @_Z15InitProblemOncePc(i8* %38) + br label %sw.epilog + +sw.bb59: ; preds = %if.then27 + store i32 1, i32* %verbose, align 4 + br label %sw.epilog + +sw.epilog: ; preds = %if.then27, %sw.bb59, %sw.bb52, %for.end + br label %if.end60 + +if.end60: ; preds = %sw.epilog, %for.body + br label %for.inc61 + +for.inc61: ; preds = %if.end60 + %39 = load i32, i32* %i, align 4 + %inc62 = add nsw i32 %39, 1 + store i32 %inc62, i32* %i, align 4 + br label %for.cond + +for.end63: ; preds = %for.cond + call void @_Z10InitPerRunv() + %call64 = call i32 @gettimeofday(%struct.timeval* %time_start, %struct.timezone* null) #1 + call void @_Z10ForwardSubv() + %call65 = call i32 @gettimeofday(%struct.timeval* %time_end, %struct.timezone* null) #1 + %tv_sec = getelementptr inbounds %struct.timeval, %struct.timeval* %time_end, i32 0, i32 0 + %40 = load i64, i64* %tv_sec, align 8 + %mul66 = mul nsw i64 %40, 1000000 + %tv_usec = getelementptr inbounds %struct.timeval, %struct.timeval* %time_end, i32 0, i32 1 + %41 = load i64, i64* %tv_usec, align 8 + %add = add nsw i64 %mul66, %41 + %tv_sec67 = getelementptr inbounds %struct.timeval, %struct.timeval* %time_start, i32 0, i32 0 + %42 = load i64, i64* %tv_sec67, align 8 + %mul68 = mul nsw i64 %42, 1000000 + %tv_usec69 = getelementptr inbounds %struct.timeval, %struct.timeval* %time_start, i32 0, i32 1 + %43 = load i64, i64* %tv_usec69, align 8 + %add70 = add nsw 
i64 %mul68, %43 + %sub = sub nsw i64 %add, %add70 + %conv71 = trunc i64 %sub to i32 + store i32 %conv71, i32* %time_total, align 4 + %44 = load i32, i32* %verbose, align 4 + %tobool = icmp ne i32 %44, 0 + br i1 %tobool, label %if.then72, label %if.end76 + +if.then72: ; preds = %for.end63 + %call73 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.23, i64 0, i64 0)) + %45 = load float*, float** @m, align 8 + %46 = load i32, i32* @Size, align 4 + %47 = load i32, i32* @Size, align 4 + call void @_Z8PrintMatPfii(float* %45, i32 %46, i32 %47) + %call74 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.24, i64 0, i64 0)) + %48 = load float*, float** @a, align 8 + %49 = load i32, i32* @Size, align 4 + %50 = load i32, i32* @Size, align 4 + call void @_Z8PrintMatPfii(float* %48, i32 %49, i32 %50) + %call75 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([14 x i8], [14 x i8]* @.str.25, i64 0, i64 0)) + %51 = load float*, float** @b, align 8 + %52 = load i32, i32* @Size, align 4 + call void @_Z8PrintAryPfi(float* %51, i32 %52) + br label %if.end76 + +if.end76: ; preds = %if.then72, %for.end63 + call void @_Z7BackSubv() + %53 = load i32, i32* %verbose, align 4 + %tobool77 = icmp ne i32 %53, 0 + br i1 %tobool77, label %if.then78, label %if.end80 + +if.then78: ; preds = %if.end76 + %call79 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.26, i64 0, i64 0)) + %54 = load float*, float** @finalVec, align 8 + %55 = load i32, i32* @Size, align 4 + call void @_Z8PrintAryPfi(float* %54, i32 %55) + br label %if.end80 + +if.end80: ; preds = %if.then78, %if.end76 + %56 = load i32, i32* %time_total, align 4 + %conv81 = uitofp i32 %56 to double + %mul82 = fmul contract double %conv81, 0x3EB0C6F7A0B5ED8D + %call83 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([49 x i8], [49 x i8]* @.str.27, i64 0, i64 0), double %mul82) + %57 = load i32, i32* @totalKernelTime, align 4 + %conv84 = uitofp i32 %57 to double + %mul85 = fmul contract double %conv84, 0x3EB0C6F7A0B5ED8D + %call86 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([31 x i8], [31 x i8]* @.str.28, i64 0, i64 0), double %mul85) + %58 = load float*, float** @m, align 8 + %59 = bitcast float* %58 to i8* + call void @free(i8* %59) #1 + %60 = load float*, float** @a, align 8 + %61 = bitcast float* %60 to i8* + call void @free(i8* %61) #1 + %62 = load float*, float** @b, align 8 + %63 = bitcast float* %62 to i8* + call void @free(i8* %63) #1 + %64 = load i32, i32* %retval, align 4 + ret i32 %64 +} + +declare dso_local i32 @printf(i8*, ...) #4 + +; Function Attrs: noreturn nounwind +declare dso_local void @exit(i32) #5 + +declare dso_local i32 @cudaSetDevice(i32) #4 + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z21PrintDevicePropertiesv() #0 { +entry: + %deviceProp = alloca %struct.cudaDeviceProp, align 8 + %nDevCount = alloca i32, align 4 + %nDeviceIdx = alloca i32, align 4 + store i32 0, i32* %nDevCount, align 4 + %call = call i32 @cudaGetDeviceCount(i32* %nDevCount) + %0 = load i32, i32* %nDevCount, align 4 + %call1 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str.29, i64 0, i64 0), i32 %0) + store i32 0, i32* %nDeviceIdx, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %1 = load i32, i32* %nDeviceIdx, align 4 + %2 = load i32, i32* %nDevCount, align 4 + %cmp = icmp slt i32 %1, %2 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %3 = bitcast %struct.cudaDeviceProp* %deviceProp to i8* + call void @llvm.memset.p0i8.i64(i8* align 8 %3, i8 0, i64 712, i1 false) + %4 = load i32, i32* %nDeviceIdx, align 4 + %call2 = call i32 @cudaGetDeviceProperties(%struct.cudaDeviceProp* %deviceProp, i32 %4) + %cmp3 = icmp eq i32 0, %call2 + br i1 %cmp3, label %if.then, label %if.else + +if.then: ; preds = %for.body + %name = getelementptr inbounds %struct.cudaDeviceProp, %struct.cudaDeviceProp* %deviceProp, i32 0, i32 0 + %arraydecay = getelementptr inbounds [256 x i8], [256 x i8]* %name, i64 0, i64 0 + %call4 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.30, i64 0, i64 0), i8* %arraydecay) + %call5 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([40 x i8], [40 x i8]* @.str.31, i64 0, i64 0)) + %totalGlobalMem = getelementptr inbounds %struct.cudaDeviceProp, %struct.cudaDeviceProp* %deviceProp, i32 0, i32 4 + %5 = load i64, i64* %totalGlobalMem, align 8 + %div = udiv i64 %5, 1024 + %call6 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([33 x i8], [33 x i8]* @.str.32, i64 0, i64 0), i64 %div) + %sharedMemPerBlock = getelementptr inbounds %struct.cudaDeviceProp, %struct.cudaDeviceProp* %deviceProp, i32 0, i32 5 + %6 = load i64, i64* %sharedMemPerBlock, align 8 + %div7 = udiv i64 %6, 1024 + %call8 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([46 x i8], [46 x i8]* @.str.33, i64 0, i64 0), i64 %div7) + %regsPerBlock = getelementptr inbounds %struct.cudaDeviceProp, %struct.cudaDeviceProp* %deviceProp, i32 0, i32 6 + %7 = load i32, i32* %regsPerBlock, align 8 + %call9 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([45 x i8], [45 x i8]* @.str.34, i64 0, i64 0), i32 %7) + %warpSize = getelementptr inbounds %struct.cudaDeviceProp, %struct.cudaDeviceProp* %deviceProp, i32 0, i32 7 + %8 = load i32, i32* %warpSize, align 4 + %call10 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([31 x i8], [31 x i8]* @.str.35, i64 0, i64 0), i32 %8) + %memPitch = getelementptr inbounds %struct.cudaDeviceProp, %struct.cudaDeviceProp* %deviceProp, i32 0, i32 8 + %9 = load i64, i64* %memPitch, align 8 + %call11 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([31 x i8], [31 x i8]* @.str.36, i64 0, i64 0), i64 %9) + %maxThreadsPerBlock = getelementptr inbounds %struct.cudaDeviceProp, %struct.cudaDeviceProp* %deviceProp, i32 0, i32 9 + %10 = load i32, i32* %maxThreadsPerBlock, align 8 + %call12 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([35 x i8], [35 x i8]* @.str.37, i64 0, i64 0), i32 %10) + %maxThreadsDim = getelementptr inbounds %struct.cudaDeviceProp, %struct.cudaDeviceProp* %deviceProp, i32 0, i32 10 + %arrayidx = getelementptr inbounds [3 x i32], [3 x i32]* %maxThreadsDim, i64 0, i64 0 + %11 = load i32, i32* %arrayidx, align 4 + %maxThreadsDim13 = getelementptr inbounds %struct.cudaDeviceProp, %struct.cudaDeviceProp* %deviceProp, i32 0, i32 10 + %arrayidx14 = getelementptr inbounds [3 x i32], [3 x i32]* %maxThreadsDim13, i64 0, i64 1 + %12 = load i32, i32* %arrayidx14, align 4 + %maxThreadsDim15 = getelementptr inbounds %struct.cudaDeviceProp, %struct.cudaDeviceProp* %deviceProp, i32 0, i32 10 + %arrayidx16 = getelementptr inbounds [3 x i32], [3 x i32]* %maxThreadsDim15, i64 0, i64 2 + %13 = load i32, i32* %arrayidx16, align 4 + %call17 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([47 x i8], [47 x i8]* @.str.38, i64 0, i64 0), i32 %11, i32 %12, i32 %13) + %maxGridSize = getelementptr inbounds %struct.cudaDeviceProp, %struct.cudaDeviceProp* %deviceProp, i32 0, i32 11 + %arrayidx18 = getelementptr inbounds [3 x i32], [3 x i32]* %maxGridSize, i64 0, i64 0 + %14 = load i32, i32* %arrayidx18, align 8 + %maxGridSize19 = getelementptr inbounds %struct.cudaDeviceProp, %struct.cudaDeviceProp* %deviceProp, i32 0, i32 11 + %arrayidx20 = getelementptr inbounds [3 x i32], [3 x i32]* %maxGridSize19, i64 0, i64 1 + %15 = load i32, i32* %arrayidx20, align 4 + %maxGridSize21 = getelementptr inbounds %struct.cudaDeviceProp, %struct.cudaDeviceProp* %deviceProp, i32 0, i32 11 + %arrayidx22 = getelementptr inbounds [3 x i32], [3 x i32]* %maxGridSize21, i64 0, i64 2 + %16 = load i32, i32* %arrayidx22, align 8 + %call23 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([46 x i8], [46 x i8]* @.str.39, i64 0, i64 0), i32 %14, i32 %15, i32 %16) + %totalConstMem = getelementptr inbounds %struct.cudaDeviceProp, %struct.cudaDeviceProp* %deviceProp, i32 0, i32 13 + %17 = load i64, i64* %totalConstMem, align 8 + %call24 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([39 x i8], [39 x i8]* @.str.40, i64 0, i64 0), i64 %17) + %major = getelementptr inbounds %struct.cudaDeviceProp, %struct.cudaDeviceProp* %deviceProp, i32 0, i32 14 + %18 = load i32, i32* %major, align 8 + %minor = getelementptr inbounds %struct.cudaDeviceProp, %struct.cudaDeviceProp* %deviceProp, i32 0, i32 15 + %19 = load i32, i32* %minor, align 4 + %call25 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str.41, i64 0, i64 0), i32 %18, i32 %19) + %clockRate = getelementptr inbounds %struct.cudaDeviceProp, %struct.cudaDeviceProp* %deviceProp, i32 0, i32 12 + %20 = load i32, i32* %clockRate, align 4 + %call26 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([26 x i8], [26 x i8]* @.str.42, i64 0, i64 0), i32 %20) + %textureAlignment = getelementptr inbounds %struct.cudaDeviceProp, %struct.cudaDeviceProp* %deviceProp, i32 0, i32 16 + %21 = load i64, i64* %textureAlignment, align 8 + %call27 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([35 x i8], [35 x i8]* @.str.43, i64 0, i64 0), i64 %21) + %deviceOverlap = getelementptr inbounds %struct.cudaDeviceProp, %struct.cudaDeviceProp* %deviceProp, i32 0, i32 18 + %22 = load i32, i32* %deviceOverlap, align 8 + %tobool = icmp ne i32 %22, 0 + %23 = zext i1 %tobool to i64 + %cond = select i1 %tobool, i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str.45, i64 0, i64 0), i8* getelementptr inbounds ([12 x i8], [12 x i8]* @.str.46, i64 0, i64 0) + %call28 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([26 x i8], [26 x i8]* @.str.44, i64 0, i64 0), i8* %cond) + %multiProcessorCount = getelementptr inbounds %struct.cudaDeviceProp, %struct.cudaDeviceProp* %deviceProp, i32 0, i32 19 + %24 = load i32, i32* %multiProcessorCount, align 4 + %call29 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([38 x i8], [38 x i8]* @.str.47, i64 0, i64 0), i32 %24) + br label %if.end + +if.else: ; preds = %for.body + %call30 = call i32 @cudaGetLastError() + %call31 = call i8* @cudaGetErrorString(i32 %call30) + %call32 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str.48, i64 0, i64 0), i8* %call31) + br label %if.end + +if.end: ; preds = %if.else, %if.then + br label %for.inc + +for.inc: ; preds = %if.end + %25 = load i32, i32* %nDeviceIdx, align 4 + %inc = add nsw i32 %25, 1 + store i32 %inc, i32* %nDeviceIdx, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind readonly +declare dso_local i32 @atoi(i8*) #6 + +; Function Attrs: nounwind +declare dso_local noalias i8* @malloc(i64) #7 + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z15InitProblemOncePc(i8* %filename) #0 { +entry: + %filename.addr = alloca i8*, align 8 + store i8* %filename, i8** %filename.addr, align 8 + %0 = load i8*, i8** %filename.addr, align 8 + %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.49, i64 0, i64 0), i8* %0) + %1 = load i8*, i8** %filename.addr, align 8 + %call1 = call %struct._IO_FILE* @fopen(i8* %1, i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.50, i64 0, i64 0)) + store %struct._IO_FILE* %call1, %struct._IO_FILE** @fp, align 8 + %2 = load %struct._IO_FILE*, %struct._IO_FILE** @fp, align 8 + %call2 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fscanf(%struct._IO_FILE* %2, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.51, i64 0, i64 0), i32* @Size) + %3 = load i32, i32* @Size, align 4 + %4 = load i32, i32* @Size, align 4 + %mul = mul nsw i32 %3, %4 + %conv = sext i32 %mul to i64 + %mul3 = mul i64 %conv, 4 + %call4 = call noalias i8* @malloc(i64 %mul3) #1 + %5 = bitcast i8* %call4 to float* + store float* %5, float** @a, align 8 + %6 = load float*, float** @a, align 8 + %7 = load i32, i32* @Size, align 4 + %8 = load i32, i32* @Size, align 4 + call void @_Z7InitMatPfii(float* %6, i32 %7, i32 %8) + %call5 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.52, i64 0, i64 0)) + %9 = load float*, float** @a, align 8 + %10 = load i32, i32* @Size, align 4 + %11 = load i32, i32* @Size, align 4 + call void @_Z8PrintMatPfii(float* %9, i32 %10, i32 %11) + %12 = load i32, i32* @Size, align 4 + %conv6 = sext i32 %12 to i64 + %mul7 = mul i64 %conv6, 4 + %call8 = call noalias i8* @malloc(i64 %mul7) #1 + %13 = bitcast i8* %call8 to float* + store float* %13, float** @b, align 8 + %14 = load float*, float** @b, align 8 + %15 = load i32, i32* @Size, align 4 + call void @_Z7InitAryPfi(float* %14, i32 %15) + %call9 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str.53, i64 0, i64 0)) + %16 = load float*, float** @b, align 8 + %17 = load i32, i32* @Size, align 4 + call void @_Z8PrintAryPfi(float* %16, i32 %17) + %18 = load i32, i32* @Size, align 4 + %19 = load i32, i32* @Size, align 4 + %mul10 = mul nsw i32 %18, %19 + %conv11 = sext i32 %mul10 to i64 + %mul12 = mul i64 %conv11, 4 + %call13 = call noalias i8* @malloc(i64 %mul12) #1 + %20 = bitcast i8* %call13 to float* + store float* %20, float** @m, align 8 + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local void @_Z10InitPerRunv() #2 { +entry: + %i = alloca i32, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %1 = load i32, i32* @Size, align 4 + %2 = load i32, i32* @Size, align 4 + %mul = mul nsw i32 %1, %2 + %cmp = icmp slt i32 %0, %mul + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %3 = load float*, float** @m, align 8 + %4 = load i32, i32* %i, align 4 + %idx.ext = sext i32 %4 to i64 + %add.ptr = getelementptr inbounds float, float* %3, i64 %idx.ext + store float 0.000000e+00, float* %add.ptr, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %5 = load i32, i32* %i, align 4 + %inc = add nsw i32 %5, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +declare dso_local i32 @gettimeofday(%struct.timeval*, %struct.timezone*) #7 + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z10ForwardSubv() #0 { +entry: + %t = alloca i32, align 4 + %m_cuda = alloca float*, align 8 + %a_cuda = alloca float*, align 8 + %b_cuda = alloca float*, align 8 + %A = alloca i32, align 4 + %B = alloca i32, align 4 + %C = alloca i32, align 4 + %D = alloca i32, align 4 + %E = alloca i32, align 4 + %F = alloca i32, align 4 + %block_size = alloca i32, align 4 + %grid_size = 
alloca i32, align 4 + %dimBlock = alloca %struct.dim3, align 4 + %dimGrid = alloca %struct.dim3, align 4 + %blockSize2d = alloca i32, align 4 + %gridSize2d = alloca i32, align 4 + %dimBlockXY = alloca %struct.dim3, align 4 + %dimGridXY = alloca %struct.dim3, align 4 + %time_start = alloca %struct.timeval, align 8 + %agg.tmp = alloca %struct.dim3, align 4 + %agg.tmp32 = alloca %struct.dim3, align 4 + %agg.tmp.coerce = alloca { i64, i32 }, align 4 + %agg.tmp32.coerce = alloca { i64, i32 }, align 4 + %agg.tmp36 = alloca %struct.dim3, align 4 + %agg.tmp37 = alloca %struct.dim3, align 4 + %agg.tmp36.coerce = alloca { i64, i32 }, align 4 + %agg.tmp37.coerce = alloca { i64, i32 }, align 4 + %time_end = alloca %struct.timeval, align 8 + store i32 1, i32* %A, align 4 + store i32 2, i32* %B, align 4 + store i32 3, i32* %C, align 4 + store i32 4, i32* %D, align 4 + store i32 5, i32* %E, align 4 + store i32 6, i32* %F, align 4 + %0 = bitcast float** %m_cuda to i8** + %1 = load i32, i32* @Size, align 4 + %2 = load i32, i32* @Size, align 4 + %mul = mul nsw i32 %1, %2 + %conv = sext i32 %mul to i64 + %mul1 = mul i64 %conv, 4 + %call = call i32 @cudaMalloc(i8** %0, i64 %mul1) + %3 = bitcast float** %a_cuda to i8** + %4 = load i32, i32* @Size, align 4 + %5 = load i32, i32* @Size, align 4 + %mul2 = mul nsw i32 %4, %5 + %conv3 = sext i32 %mul2 to i64 + %mul4 = mul i64 %conv3, 4 + %call5 = call i32 @cudaMalloc(i8** %3, i64 %mul4) + %6 = bitcast float** %b_cuda to i8** + %7 = load i32, i32* @Size, align 4 + %conv6 = sext i32 %7 to i64 + %mul7 = mul i64 %conv6, 4 + %call8 = call i32 @cudaMalloc(i8** %6, i64 %mul7) + %8 = load float*, float** %m_cuda, align 8 + %9 = bitcast float* %8 to i8* + %10 = load float*, float** @m, align 8 + %11 = bitcast float* %10 to i8* + %12 = load i32, i32* @Size, align 4 + %13 = load i32, i32* @Size, align 4 + %mul9 = mul nsw i32 %12, %13 + %conv10 = sext i32 %mul9 to i64 + %mul11 = mul i64 %conv10, 4 + %call12 = call i32 @cudaMemcpy(i8* %9, i8* %11, i64 
%mul11, i32 1) + %14 = load float*, float** %a_cuda, align 8 + %15 = bitcast float* %14 to i8* + %16 = load float*, float** @a, align 8 + %17 = bitcast float* %16 to i8* + %18 = load i32, i32* @Size, align 4 + %19 = load i32, i32* @Size, align 4 + %mul13 = mul nsw i32 %18, %19 + %conv14 = sext i32 %mul13 to i64 + %mul15 = mul i64 %conv14, 4 + %call16 = call i32 @cudaMemcpy(i8* %15, i8* %17, i64 %mul15, i32 1) + %20 = load float*, float** %b_cuda, align 8 + %21 = bitcast float* %20 to i8* + %22 = load float*, float** @b, align 8 + %23 = bitcast float* %22 to i8* + %24 = load i32, i32* @Size, align 4 + %conv17 = sext i32 %24 to i64 + %mul18 = mul i64 %conv17, 4 + %call19 = call i32 @cudaMemcpy(i8* %21, i8* %23, i64 %mul18, i32 1) + store i32 512, i32* %block_size, align 4 + %25 = load i32, i32* @Size, align 4 + %26 = load i32, i32* %block_size, align 4 + %div = sdiv i32 %25, %26 + %27 = load i32, i32* @Size, align 4 + %28 = load i32, i32* %block_size, align 4 + %rem = srem i32 %27, %28 + %tobool = icmp ne i32 %rem, 0 + %lnot = xor i1 %tobool, true + %29 = zext i1 %lnot to i64 + %cond = select i1 %lnot, i32 0, i32 1 + %add = add nsw i32 %div, %cond + store i32 %add, i32* %grid_size, align 4 + %30 = load i32, i32* %grid_size, align 4 + %call20 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str.54, i64 0, i64 0), i32 %30) + %31 = load i32, i32* %block_size, align 4 + call void @_ZN4dim3C2Ejjj(%struct.dim3* %dimBlock, i32 %31, i32 1, i32 1) + %32 = load i32, i32* %grid_size, align 4 + call void @_ZN4dim3C2Ejjj(%struct.dim3* %dimGrid, i32 %32, i32 1, i32 1) + store i32 1, i32* %blockSize2d, align 4 + %33 = load i32, i32* @Size, align 4 + %34 = load i32, i32* %blockSize2d, align 4 + %div21 = sdiv i32 %33, %34 + %35 = load i32, i32* @Size, align 4 + %36 = load i32, i32* %blockSize2d, align 4 + %rem22 = srem i32 %35, %36 + %tobool23 = icmp ne i32 %rem22, 0 + %37 = zext i1 %tobool23 to i64 + %cond24 = select i1 %tobool23, i32 0, i32 1 + %tobool25 = icmp ne i32 %cond24, 0 + %lnot26 = xor i1 %tobool25, true + %conv27 = zext i1 %lnot26 to i32 + %add28 = add nsw i32 %div21, %conv27 + store i32 %add28, i32* %gridSize2d, align 4 + %38 = load i32, i32* %blockSize2d, align 4 + %39 = load i32, i32* %blockSize2d, align 4 + call void @_ZN4dim3C2Ejjj(%struct.dim3* %dimBlockXY, i32 %38, i32 %39, i32 1) + %40 = load i32, i32* %blockSize2d, align 4 + %call29 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([14 x i8], [14 x i8]* @.str.55, i64 0, i64 0), i32 %40) + %41 = load i32, i32* %gridSize2d, align 4 + %42 = load i32, i32* %gridSize2d, align 4 + call void @_ZN4dim3C2Ejjj(%struct.dim3* %dimGridXY, i32 %41, i32 %42, i32 1) + %43 = load i32, i32* %grid_size, align 4 + %44 = load i32, i32* %gridSize2d, align 4 + %call30 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([32 x i8], [32 x i8]* @.str.56, i64 0, i64 0), i32 %43, i32 %44) + %call31 = call i32 @gettimeofday(%struct.timeval* %time_start, %struct.timezone* null) #1 + store i32 0, i32* %t, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %45 = load i32, i32* %t, align 4 + %46 = load i32, i32* @Size, align 4 + %sub = sub nsw i32 %46, 1 + %cmp = icmp slt i32 %45, %sub + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %47 = bitcast %struct.dim3* %agg.tmp to i8* + %48 = bitcast %struct.dim3* %dimGrid to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %47, i8* align 4 %48, i64 12, i1 false) + %49 = bitcast %struct.dim3* %agg.tmp32 to i8* + %50 = bitcast %struct.dim3* %dimBlock to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %49, i8* align 4 %50, i64 12, i1 false) + %51 = bitcast { i64, i32 }* %agg.tmp.coerce to i8* + %52 = bitcast %struct.dim3* %agg.tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %51, i8* align 4 %52, i64 12, i1 false) + %53 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 0 + %54 = load i64, i64* %53, align 4 + %55 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 1 + %56 = load i32, i32* %55, align 4 + %57 = bitcast { i64, i32 }* %agg.tmp32.coerce to i8* + %58 = bitcast %struct.dim3* %agg.tmp32 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %57, i8* align 4 %58, i64 12, i1 false) + %59 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp32.coerce, i32 0, i32 0 + %60 = load i64, i64* %59, align 4 + %61 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp32.coerce, i32 0, i32 1 + %62 = load i32, i32* %61, align 4 + %call33 = call i32 @__cudaPushCallConfiguration(i64 %54, i32 %56, i64 %60, i32 %62, i64 0, i8* null) + %tobool34 = icmp ne i32 %call33, 0 + br i1 %tobool34, label %kcall.end, label %kcall.configok + +kcall.configok: ; preds = %for.body + %63 
= load float*, float** %m_cuda, align 8 + %64 = load float*, float** %a_cuda, align 8 + %65 = load i32, i32* @Size, align 4 + %66 = load i32, i32* %t, align 4 + call void @_Z4Fan1PfS_ii(float* %63, float* %64, i32 %65, i32 %66) + br label %kcall.end + +kcall.end: ; preds = %kcall.configok, %for.body + %call35 = call i32 @cudaDeviceSynchronize() + %67 = bitcast %struct.dim3* %agg.tmp36 to i8* + %68 = bitcast %struct.dim3* %dimGridXY to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %67, i8* align 4 %68, i64 12, i1 false) + %69 = bitcast %struct.dim3* %agg.tmp37 to i8* + %70 = bitcast %struct.dim3* %dimBlockXY to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %69, i8* align 4 %70, i64 12, i1 false) + %71 = bitcast { i64, i32 }* %agg.tmp36.coerce to i8* + %72 = bitcast %struct.dim3* %agg.tmp36 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %71, i8* align 4 %72, i64 12, i1 false) + %73 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp36.coerce, i32 0, i32 0 + %74 = load i64, i64* %73, align 4 + %75 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp36.coerce, i32 0, i32 1 + %76 = load i32, i32* %75, align 4 + %77 = bitcast { i64, i32 }* %agg.tmp37.coerce to i8* + %78 = bitcast %struct.dim3* %agg.tmp37 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %77, i8* align 4 %78, i64 12, i1 false) + %79 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp37.coerce, i32 0, i32 0 + %80 = load i64, i64* %79, align 4 + %81 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp37.coerce, i32 0, i32 1 + %82 = load i32, i32* %81, align 4 + %call38 = call i32 @__cudaPushCallConfiguration(i64 %74, i32 %76, i64 %80, i32 %82, i64 0, i8* null) + %tobool39 = icmp ne i32 %call38, 0 + br i1 %tobool39, label %kcall.end42, label %kcall.configok40 + +kcall.configok40: ; preds = %kcall.end + %83 = load float*, float** %m_cuda, align 8 + %84 = load float*, float** %a_cuda, align 8 + %85 = load float*, float** %b_cuda, align 
8 + %86 = load i32, i32* @Size, align 4 + %87 = load i32, i32* @Size, align 4 + %88 = load i32, i32* %t, align 4 + %sub41 = sub nsw i32 %87, %88 + %89 = load i32, i32* %t, align 4 + call void @_Z4Fan2PfS_S_iii(float* %83, float* %84, float* %85, i32 %86, i32 %sub41, i32 %89) + br label %kcall.end42 + +kcall.end42: ; preds = %kcall.configok40, %kcall.end + %call43 = call i32 @cudaDeviceSynchronize() + call void @_Z14checkCUDAErrorPKc(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str.57, i64 0, i64 0)) + br label %for.inc + +for.inc: ; preds = %kcall.end42 + %90 = load i32, i32* %t, align 4 + %inc = add nsw i32 %90, 1 + store i32 %inc, i32* %t, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %call44 = call i32 @gettimeofday(%struct.timeval* %time_end, %struct.timezone* null) #1 + %tv_sec = getelementptr inbounds %struct.timeval, %struct.timeval* %time_end, i32 0, i32 0 + %91 = load i64, i64* %tv_sec, align 8 + %mul45 = mul nsw i64 %91, 1000000 + %tv_usec = getelementptr inbounds %struct.timeval, %struct.timeval* %time_end, i32 0, i32 1 + %92 = load i64, i64* %tv_usec, align 8 + %add46 = add nsw i64 %mul45, %92 + %tv_sec47 = getelementptr inbounds %struct.timeval, %struct.timeval* %time_start, i32 0, i32 0 + %93 = load i64, i64* %tv_sec47, align 8 + %mul48 = mul nsw i64 %93, 1000000 + %tv_usec49 = getelementptr inbounds %struct.timeval, %struct.timeval* %time_start, i32 0, i32 1 + %94 = load i64, i64* %tv_usec49, align 8 + %add50 = add nsw i64 %mul48, %94 + %sub51 = sub nsw i64 %add46, %add50 + %conv52 = trunc i64 %sub51 to i32 + store i32 %conv52, i32* @totalKernelTime, align 4 + %95 = load float*, float** @m, align 8 + %96 = bitcast float* %95 to i8* + %97 = load float*, float** %m_cuda, align 8 + %98 = bitcast float* %97 to i8* + %99 = load i32, i32* @Size, align 4 + %100 = load i32, i32* @Size, align 4 + %mul53 = mul nsw i32 %99, %100 + %conv54 = sext i32 %mul53 to i64 + %mul55 = mul i64 %conv54, 4 + %call56 = call i32 @cudaMemcpy(i8* %96, i8* 
%98, i64 %mul55, i32 2) + %101 = load float*, float** @a, align 8 + %102 = bitcast float* %101 to i8* + %103 = load float*, float** %a_cuda, align 8 + %104 = bitcast float* %103 to i8* + %105 = load i32, i32* @Size, align 4 + %106 = load i32, i32* @Size, align 4 + %mul57 = mul nsw i32 %105, %106 + %conv58 = sext i32 %mul57 to i64 + %mul59 = mul i64 %conv58, 4 + %call60 = call i32 @cudaMemcpy(i8* %102, i8* %104, i64 %mul59, i32 2) + %107 = load float*, float** @b, align 8 + %108 = bitcast float* %107 to i8* + %109 = load float*, float** %b_cuda, align 8 + %110 = bitcast float* %109 to i8* + %111 = load i32, i32* @Size, align 4 + %conv61 = sext i32 %111 to i64 + %mul62 = mul i64 %conv61, 4 + %call63 = call i32 @cudaMemcpy(i8* %108, i8* %110, i64 %mul62, i32 2) + %112 = load float*, float** %m_cuda, align 8 + %113 = bitcast float* %112 to i8* + %call64 = call i32 @cudaFree(i8* %113) + %114 = load float*, float** %a_cuda, align 8 + %115 = bitcast float* %114 to i8* + %call65 = call i32 @cudaFree(i8* %115) + %116 = load float*, float** %b_cuda, align 8 + %117 = bitcast float* %116 to i8* + %call66 = call i32 @cudaFree(i8* %117) + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local void @_Z8PrintMatPfii(float* %ary, i32 %nrow, i32 %ncol) #2 { +entry: + %ary.addr = alloca float*, align 8 + %nrow.addr = alloca i32, align 4 + %ncol.addr = alloca i32, align 4 + %i = alloca i32, align 4 + %j = alloca i32, align 4 + store float* %ary, float** %ary.addr, align 8 + store i32 %nrow, i32* %nrow.addr, align 4 + store i32 %ncol, i32* %ncol.addr, align 4 + ret void +} + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z8PrintAryPfi(float* %ary, i32 %ary_size) #0 { +entry: + %ary.addr = alloca float*, align 8 + %ary_size.addr = alloca i32, align 4 + %i = alloca i32, align 4 + store float* %ary, float** %ary.addr, align 8 + store i32 %ary_size, i32* %ary_size.addr, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + 
+for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %1 = load i32, i32* %ary_size.addr, align 4 + %cmp = icmp slt i32 %0, %1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %2 = load float*, float** %ary.addr, align 8 + %3 = load i32, i32* %i, align 4 + %idxprom = sext i32 %3 to i64 + %arrayidx = getelementptr inbounds float, float* %2, i64 %idxprom + %4 = load float, float* %arrayidx, align 4 + %conv = fpext float %4 to double + %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str.59, i64 0, i64 0), double %conv) + br label %for.inc + +for.inc: ; preds = %for.body + %5 = load i32, i32* %i, align 4 + %inc = add nsw i32 %5, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %call1 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.60, i64 0, i64 0)) + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local void @_Z7BackSubv() #2 { +entry: + %i = alloca i32, align 4 + %j = alloca i32, align 4 + %0 = load i32, i32* @Size, align 4 + %conv = sext i32 %0 to i64 + %mul = mul i64 %conv, 4 + %call = call noalias i8* @malloc(i64 %mul) #1 + %1 = bitcast i8* %call to float* + store float* %1, float** @finalVec, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc43, %entry + %2 = load i32, i32* %i, align 4 + %3 = load i32, i32* @Size, align 4 + %cmp = icmp slt i32 %2, %3 + br i1 %cmp, label %for.body, label %for.end45 + +for.body: ; preds = %for.cond + %4 = load float*, float** @b, align 8 + %5 = load i32, i32* @Size, align 4 + %6 = load i32, i32* %i, align 4 + %sub = sub nsw i32 %5, %6 + %sub1 = sub nsw i32 %sub, 1 + %idxprom = sext i32 %sub1 to i64 + %arrayidx = getelementptr inbounds float, float* %4, i64 %idxprom + %7 = load float, float* %arrayidx, align 4 + %8 = load float*, float** @finalVec, align 8 + %9 = load i32, i32* @Size, align 4 + 
%10 = load i32, i32* %i, align 4 + %sub2 = sub nsw i32 %9, %10 + %sub3 = sub nsw i32 %sub2, 1 + %idxprom4 = sext i32 %sub3 to i64 + %arrayidx5 = getelementptr inbounds float, float* %8, i64 %idxprom4 + store float %7, float* %arrayidx5, align 4 + store i32 0, i32* %j, align 4 + br label %for.cond6 + +for.cond6: ; preds = %for.inc, %for.body + %11 = load i32, i32* %j, align 4 + %12 = load i32, i32* %i, align 4 + %cmp7 = icmp slt i32 %11, %12 + br i1 %cmp7, label %for.body8, label %for.end + +for.body8: ; preds = %for.cond6 + %13 = load float*, float** @a, align 8 + %14 = load i32, i32* @Size, align 4 + %15 = load i32, i32* @Size, align 4 + %16 = load i32, i32* %i, align 4 + %sub9 = sub nsw i32 %15, %16 + %sub10 = sub nsw i32 %sub9, 1 + %mul11 = mul nsw i32 %14, %sub10 + %idx.ext = sext i32 %mul11 to i64 + %add.ptr = getelementptr inbounds float, float* %13, i64 %idx.ext + %17 = load i32, i32* @Size, align 4 + %18 = load i32, i32* %j, align 4 + %sub12 = sub nsw i32 %17, %18 + %sub13 = sub nsw i32 %sub12, 1 + %idx.ext14 = sext i32 %sub13 to i64 + %add.ptr15 = getelementptr inbounds float, float* %add.ptr, i64 %idx.ext14 + %19 = load float, float* %add.ptr15, align 4 + %20 = load float*, float** @finalVec, align 8 + %21 = load i32, i32* @Size, align 4 + %22 = load i32, i32* %j, align 4 + %sub16 = sub nsw i32 %21, %22 + %sub17 = sub nsw i32 %sub16, 1 + %idxprom18 = sext i32 %sub17 to i64 + %arrayidx19 = getelementptr inbounds float, float* %20, i64 %idxprom18 + %23 = load float, float* %arrayidx19, align 4 + %mul20 = fmul contract float %19, %23 + %24 = load float*, float** @finalVec, align 8 + %25 = load i32, i32* @Size, align 4 + %26 = load i32, i32* %i, align 4 + %sub21 = sub nsw i32 %25, %26 + %sub22 = sub nsw i32 %sub21, 1 + %idxprom23 = sext i32 %sub22 to i64 + %arrayidx24 = getelementptr inbounds float, float* %24, i64 %idxprom23 + %27 = load float, float* %arrayidx24, align 4 + %sub25 = fsub contract float %27, %mul20 + store float %sub25, float* %arrayidx24, 
align 4 + br label %for.inc + +for.inc: ; preds = %for.body8 + %28 = load i32, i32* %j, align 4 + %inc = add nsw i32 %28, 1 + store i32 %inc, i32* %j, align 4 + br label %for.cond6 + +for.end: ; preds = %for.cond6 + %29 = load float*, float** @finalVec, align 8 + %30 = load i32, i32* @Size, align 4 + %31 = load i32, i32* %i, align 4 + %sub26 = sub nsw i32 %30, %31 + %sub27 = sub nsw i32 %sub26, 1 + %idxprom28 = sext i32 %sub27 to i64 + %arrayidx29 = getelementptr inbounds float, float* %29, i64 %idxprom28 + %32 = load float, float* %arrayidx29, align 4 + %33 = load float*, float** @a, align 8 + %34 = load i32, i32* @Size, align 4 + %35 = load i32, i32* @Size, align 4 + %36 = load i32, i32* %i, align 4 + %sub30 = sub nsw i32 %35, %36 + %sub31 = sub nsw i32 %sub30, 1 + %mul32 = mul nsw i32 %34, %sub31 + %idx.ext33 = sext i32 %mul32 to i64 + %add.ptr34 = getelementptr inbounds float, float* %33, i64 %idx.ext33 + %37 = load i32, i32* @Size, align 4 + %38 = load i32, i32* %i, align 4 + %sub35 = sub nsw i32 %37, %38 + %sub36 = sub nsw i32 %sub35, 1 + %idx.ext37 = sext i32 %sub36 to i64 + %add.ptr38 = getelementptr inbounds float, float* %add.ptr34, i64 %idx.ext37 + %39 = load float, float* %add.ptr38, align 4 + %div = fdiv float %32, %39 + %40 = load float*, float** @finalVec, align 8 + %41 = load i32, i32* @Size, align 4 + %42 = load i32, i32* %i, align 4 + %sub39 = sub nsw i32 %41, %42 + %sub40 = sub nsw i32 %sub39, 1 + %idxprom41 = sext i32 %sub40 to i64 + %arrayidx42 = getelementptr inbounds float, float* %40, i64 %idxprom41 + store float %div, float* %arrayidx42, align 4 + br label %for.inc43 + +for.inc43: ; preds = %for.end + %43 = load i32, i32* %i, align 4 + %inc44 = add nsw i32 %43, 1 + store i32 %inc44, i32* %i, align 4 + br label %for.cond + +for.end45: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +declare dso_local void @free(i8*) #7 + +declare dso_local i32 @cudaGetDeviceCount(i32*) #4 + +; Function Attrs: argmemonly nounwind willreturn 
+declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg) #8 + +declare dso_local i32 @cudaGetDeviceProperties(%struct.cudaDeviceProp*, i32) #4 + +declare dso_local i8* @cudaGetErrorString(i32) #4 + +declare dso_local i32 @cudaGetLastError() #4 + +declare dso_local %struct._IO_FILE* @fopen(i8*, i8*) #4 + +declare dso_local i32 @fscanf(%struct._IO_FILE*, i8*, ...) #4 + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z7InitMatPfii(float* %ary, i32 %nrow, i32 %ncol) #0 { +entry: + %ary.addr = alloca float*, align 8 + %nrow.addr = alloca i32, align 4 + %ncol.addr = alloca i32, align 4 + %i = alloca i32, align 4 + %j = alloca i32, align 4 + store float* %ary, float** %ary.addr, align 8 + store i32 %nrow, i32* %nrow.addr, align 4 + store i32 %ncol, i32* %ncol.addr, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc6, %entry + %0 = load i32, i32* %i, align 4 + %1 = load i32, i32* %nrow.addr, align 4 + %cmp = icmp slt i32 %0, %1 + br i1 %cmp, label %for.body, label %for.end8 + +for.body: ; preds = %for.cond + store i32 0, i32* %j, align 4 + br label %for.cond1 + +for.cond1: ; preds = %for.inc, %for.body + %2 = load i32, i32* %j, align 4 + %3 = load i32, i32* %ncol.addr, align 4 + %cmp2 = icmp slt i32 %2, %3 + br i1 %cmp2, label %for.body3, label %for.end + +for.body3: ; preds = %for.cond1 + %4 = load %struct._IO_FILE*, %struct._IO_FILE** @fp, align 8 + %5 = load float*, float** %ary.addr, align 8 + %6 = load i32, i32* @Size, align 4 + %7 = load i32, i32* %i, align 4 + %mul = mul nsw i32 %6, %7 + %idx.ext = sext i32 %mul to i64 + %add.ptr = getelementptr inbounds float, float* %5, i64 %idx.ext + %8 = load i32, i32* %j, align 4 + %idx.ext4 = sext i32 %8 to i64 + %add.ptr5 = getelementptr inbounds float, float* %add.ptr, i64 %idx.ext4 + %call = call i32 (%struct._IO_FILE*, i8*, ...) 
@fscanf(%struct._IO_FILE* %4, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.58, i64 0, i64 0), float* %add.ptr5) + br label %for.inc + +for.inc: ; preds = %for.body3 + %9 = load i32, i32* %j, align 4 + %inc = add nsw i32 %9, 1 + store i32 %inc, i32* %j, align 4 + br label %for.cond1 + +for.end: ; preds = %for.cond1 + br label %for.inc6 + +for.inc6: ; preds = %for.end + %10 = load i32, i32* %i, align 4 + %inc7 = add nsw i32 %10, 1 + store i32 %inc7, i32* %i, align 4 + br label %for.cond + +for.end8: ; preds = %for.cond + ret void +} + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z7InitAryPfi(float* %ary, i32 %ary_size) #0 { +entry: + %ary.addr = alloca float*, align 8 + %ary_size.addr = alloca i32, align 4 + %i = alloca i32, align 4 + store float* %ary, float** %ary.addr, align 8 + store i32 %ary_size, i32* %ary_size.addr, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %1 = load i32, i32* %ary_size.addr, align 4 + %cmp = icmp slt i32 %0, %1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %2 = load %struct._IO_FILE*, %struct._IO_FILE** @fp, align 8 + %3 = load float*, float** %ary.addr, align 8 + %4 = load i32, i32* %i, align 4 + %idxprom = sext i32 %4 to i64 + %arrayidx = getelementptr inbounds float, float* %3, i64 %idxprom + %call = call i32 (%struct._IO_FILE*, i8*, ...) 
@fscanf(%struct._IO_FILE* %2, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.58, i64 0, i64 0), float* %arrayidx) + br label %for.inc + +for.inc: ; preds = %for.body + %5 = load i32, i32* %i, align 4 + %inc = add nsw i32 %5, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z4Fan1PfS_ii(float* %m_cuda, float* %a_cuda, i32 %Size, i32 %t) #0 { +entry: + %m_cuda.addr = alloca float*, align 8 + %a_cuda.addr = alloca float*, align 8 + %Size.addr = alloca i32, align 4 + %t.addr = alloca i32, align 4 + %grid_dim = alloca %struct.dim3, align 8 + %block_dim = alloca %struct.dim3, align 8 + %shmem_size = alloca i64, align 8 + %stream = alloca i8*, align 8 + %grid_dim.coerce = alloca { i64, i32 }, align 8 + %block_dim.coerce = alloca { i64, i32 }, align 8 + store float* %m_cuda, float** %m_cuda.addr, align 8 + store float* %a_cuda, float** %a_cuda.addr, align 8 + store i32 %Size, i32* %Size.addr, align 4 + store i32 %t, i32* %t.addr, align 4 + %kernel_args = alloca i8*, i64 4, align 16 + %0 = bitcast float** %m_cuda.addr to i8* + %1 = getelementptr i8*, i8** %kernel_args, i32 0 + store i8* %0, i8** %1 + %2 = bitcast float** %a_cuda.addr to i8* + %3 = getelementptr i8*, i8** %kernel_args, i32 1 + store i8* %2, i8** %3 + %4 = bitcast i32* %Size.addr to i8* + %5 = getelementptr i8*, i8** %kernel_args, i32 2 + store i8* %4, i8** %5 + %6 = bitcast i32* %t.addr to i8* + %7 = getelementptr i8*, i8** %kernel_args, i32 3 + store i8* %6, i8** %7 + %8 = call i32 @__cudaPopCallConfiguration(%struct.dim3* %grid_dim, %struct.dim3* %block_dim, i64* %shmem_size, i8** %stream) + %9 = load i64, i64* %shmem_size, align 8 + %10 = load i8*, i8** %stream, align 8 + %11 = bitcast { i64, i32 }* %grid_dim.coerce to i8* + %12 = bitcast %struct.dim3* %grid_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %11, i8* align 8 %12, i64 12, i1 false) + %13 = 
getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 0 + %14 = load i64, i64* %13, align 8 + %15 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 1 + %16 = load i32, i32* %15, align 8 + %17 = bitcast { i64, i32 }* %block_dim.coerce to i8* + %18 = bitcast %struct.dim3* %block_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %17, i8* align 8 %18, i64 12, i1 false) + %19 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 0 + %20 = load i64, i64* %19, align 8 + %21 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 1 + %22 = load i32, i32* %21, align 8 + %23 = bitcast i8* %10 to %struct.CUstream_st* + %call = call i32 @cudaLaunchKernel(i8* bitcast (void (float*, float*, i32, i32)* @_Z4Fan1PfS_ii to i8*), i64 %14, i32 %16, i64 %20, i32 %22, i8** %kernel_args, i64 %9, %struct.CUstream_st* %23) + br label %setup.end + +setup.end: ; preds = %entry + ret void +} + +declare dso_local i32 @__cudaPopCallConfiguration(%struct.dim3*, %struct.dim3*, i64*, i8**) + +declare dso_local i32 @cudaLaunchKernel(i8*, i64, i32, i64, i32, i8**, i64, %struct.CUstream_st*) + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #8 + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z4Fan2PfS_S_iii(float* %m_cuda, float* %a_cuda, float* %b_cuda, i32 %Size, i32 %j1, i32 %t) #0 { +entry: + %m_cuda.addr = alloca float*, align 8 + %a_cuda.addr = alloca float*, align 8 + %b_cuda.addr = alloca float*, align 8 + %Size.addr = alloca i32, align 4 + %j1.addr = alloca i32, align 4 + %t.addr = alloca i32, align 4 + %grid_dim = alloca %struct.dim3, align 8 + %block_dim = alloca %struct.dim3, align 8 + %shmem_size = alloca i64, align 8 + %stream = alloca i8*, align 8 + %grid_dim.coerce = alloca { i64, i32 }, align 8 + %block_dim.coerce = 
alloca { i64, i32 }, align 8 + store float* %m_cuda, float** %m_cuda.addr, align 8 + store float* %a_cuda, float** %a_cuda.addr, align 8 + store float* %b_cuda, float** %b_cuda.addr, align 8 + store i32 %Size, i32* %Size.addr, align 4 + store i32 %j1, i32* %j1.addr, align 4 + store i32 %t, i32* %t.addr, align 4 + %kernel_args = alloca i8*, i64 6, align 16 + %0 = bitcast float** %m_cuda.addr to i8* + %1 = getelementptr i8*, i8** %kernel_args, i32 0 + store i8* %0, i8** %1 + %2 = bitcast float** %a_cuda.addr to i8* + %3 = getelementptr i8*, i8** %kernel_args, i32 1 + store i8* %2, i8** %3 + %4 = bitcast float** %b_cuda.addr to i8* + %5 = getelementptr i8*, i8** %kernel_args, i32 2 + store i8* %4, i8** %5 + %6 = bitcast i32* %Size.addr to i8* + %7 = getelementptr i8*, i8** %kernel_args, i32 3 + store i8* %6, i8** %7 + %8 = bitcast i32* %j1.addr to i8* + %9 = getelementptr i8*, i8** %kernel_args, i32 4 + store i8* %8, i8** %9 + %10 = bitcast i32* %t.addr to i8* + %11 = getelementptr i8*, i8** %kernel_args, i32 5 + store i8* %10, i8** %11 + %12 = call i32 @__cudaPopCallConfiguration(%struct.dim3* %grid_dim, %struct.dim3* %block_dim, i64* %shmem_size, i8** %stream) + %13 = load i64, i64* %shmem_size, align 8 + %14 = load i8*, i8** %stream, align 8 + %15 = bitcast { i64, i32 }* %grid_dim.coerce to i8* + %16 = bitcast %struct.dim3* %grid_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %15, i8* align 8 %16, i64 12, i1 false) + %17 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 0 + %18 = load i64, i64* %17, align 8 + %19 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 1 + %20 = load i32, i32* %19, align 8 + %21 = bitcast { i64, i32 }* %block_dim.coerce to i8* + %22 = bitcast %struct.dim3* %block_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %21, i8* align 8 %22, i64 12, i1 false) + %23 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 0 + %24 = 
load i64, i64* %23, align 8 + %25 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 1 + %26 = load i32, i32* %25, align 8 + %27 = bitcast i8* %14 to %struct.CUstream_st* + %call = call i32 @cudaLaunchKernel(i8* bitcast (void (float*, float*, float*, i32, i32, i32)* @_Z4Fan2PfS_S_iii to i8*), i64 %18, i32 %20, i64 %24, i32 %26, i8** %kernel_args, i64 %13, %struct.CUstream_st* %27) + br label %setup.end + +setup.end: ; preds = %entry + ret void +} + +declare dso_local i32 @cudaMalloc(i8**, i64) #4 + +declare dso_local i32 @cudaMemcpy(i8*, i8*, i64, i32) #4 + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN4dim3C2Ejjj(%struct.dim3* %this, i32 %vx, i32 %vy, i32 %vz) unnamed_addr #2 comdat align 2 { +entry: + %this.addr = alloca %struct.dim3*, align 8 + %vx.addr = alloca i32, align 4 + %vy.addr = alloca i32, align 4 + %vz.addr = alloca i32, align 4 + store %struct.dim3* %this, %struct.dim3** %this.addr, align 8 + store i32 %vx, i32* %vx.addr, align 4 + store i32 %vy, i32* %vy.addr, align 4 + store i32 %vz, i32* %vz.addr, align 4 + %this1 = load %struct.dim3*, %struct.dim3** %this.addr, align 8 + %x = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 0 + %0 = load i32, i32* %vx.addr, align 4 + store i32 %0, i32* %x, align 4 + %y = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 1 + %1 = load i32, i32* %vy.addr, align 4 + store i32 %1, i32* %y, align 4 + %z = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 2 + %2 = load i32, i32* %vz.addr, align 4 + store i32 %2, i32* %z, align 4 + ret void +} + +declare dso_local i32 @__cudaPushCallConfiguration(i64, i32, i64, i32, i64, i8*) #4 + +declare dso_local i32 @cudaDeviceSynchronize() #4 + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z14checkCUDAErrorPKc(i8* %msg) #0 { +entry: + %msg.addr = alloca i8*, align 8 + %err = alloca i32, align 4 + store i8* %msg, i8** 
%msg.addr, align 8 + %call = call i32 @cudaGetLastError() + store i32 %call, i32* %err, align 4 + %0 = load i32, i32* %err, align 4 + %cmp = icmp ne i32 0, %0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %1 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %2 = load i8*, i8** %msg.addr, align 8 + %3 = load i32, i32* %err, align 4 + %call1 = call i8* @cudaGetErrorString(i32 %3) + %call2 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([21 x i8], [21 x i8]* @.str.61, i64 0, i64 0), i8* %2, i8* %call1) + call void @exit(i32 1) #9 + unreachable + +if.end: ; preds = %entry + ret void +} + +declare dso_local i32 @cudaFree(i8*) #4 + +declare dso_local i32 @fprintf(%struct._IO_FILE*, i8*, ...) #4 + +; Function Attrs: nounwind +declare dso_local float @expf(float) #7 + +define internal void @__cuda_register_globals(i8** %0) { +entry: + %1 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void (float*, float*, i32, i32)* @_Z4Fan1PfS_ii to i8*), i8* getelementptr inbounds ([14 x i8], [14 x i8]* @0, i64 0, i64 0), i8* getelementptr inbounds ([14 x i8], [14 x i8]* @0, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null) + %2 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void (float*, float*, float*, i32, i32, i32)* @_Z4Fan2PfS_S_iii to i8*), i8* getelementptr inbounds ([17 x i8], [17 x i8]* @1, i64 0, i64 0), i8* getelementptr inbounds ([17 x i8], [17 x i8]* @1, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null) + ret void +} + +declare dso_local i32 @__cudaRegisterFunction(i8**, i8*, i8*, i8*, i32, i8*, i8*, i8*, i8*, i32*) + +declare dso_local i32 @__cudaRegisterVar(i8**, i8*, i8*, i8*, i32, i32, i32, i32) + +declare dso_local i8** @__cudaRegisterFatBinary(i8*) + +define internal void @__cuda_module_ctor(i8* %0) { +entry: + %1 = call i8** @__cudaRegisterFatBinary(i8* bitcast ({ i32, i32, i8*, i8* }* @__cuda_fatbin_wrapper 
to i8*)) + store i8** %1, i8*** @__cuda_gpubin_handle, align 8 + call void @__cuda_register_globals(i8** %1) + call void @__cudaRegisterFatBinaryEnd(i8** %1) + %2 = call i32 @atexit(void (i8*)* @__cuda_module_dtor) + ret void +} + +declare dso_local void @__cudaRegisterFatBinaryEnd(i8**) + +declare dso_local void @__cudaUnregisterFatBinary(i8**) + +define internal void @__cuda_module_dtor(i8* %0) { +entry: + %1 = load i8**, i8*** @__cuda_gpubin_handle, align 8 + call void @__cudaUnregisterFatBinary(i8** %1) + ret void +} + +declare dso_local i32 @atexit(void (i8*)*) + +attributes #0 = { noinline optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind } +attributes #2 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { noinline norecurse optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" 
"target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #5 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #6 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #7 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #8 = { argmemonly nounwind willreturn } +attributes #9 = { noreturn nounwind } +attributes #10 = { nounwind readonly } + 
+!llvm.module.flags = !{!0, !1} +!llvm.ident = !{!2} + +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"} diff --git a/examples/gauss/gaussian.cu b/examples/gauss/gaussian.cu new file mode 100644 index 0000000..637d900 --- /dev/null +++ b/examples/gauss/gaussian.cu @@ -0,0 +1,522 @@ +/*----------------------------------------------------------- + ** gaussian.cu -- The program is to solve a linear system Ax = b + ** by using Gaussian Elimination. The algorithm on page 101 + ** ("Foundations of Parallel Programming") is used. + ** The sequential version is gaussian.c. This parallel + ** implementation converts three independent for() loops + ** into three Fans. Use the data file ge_3.dat to verify + ** the correction of the output. + ** + ** Written by Andreas Kura, 02/15/95 + ** Modified by Chong-wei Xu, 04/20/95 + ** Modified by Chris Gregg for CUDA, 07/20/2009 + **----------------------------------------------------------- + */ +#include "cuda_runtime.h" +#include +#include +#include +#include +#include + +#ifdef TIMING +#include "timing.h" +#endif + +#ifdef RD_WG_SIZE_0_0 +#define MAXBLOCKSIZE RD_WG_SIZE_0_0 +#elif defined(RD_WG_SIZE_0) +#define MAXBLOCKSIZE RD_WG_SIZE_0 +#elif defined(RD_WG_SIZE) +#define MAXBLOCKSIZE RD_WG_SIZE +#else +#define MAXBLOCKSIZE 512 +#endif + +// 2D defines. 
// Go from specific to general
#ifdef RD_WG_SIZE_1_0
#define BLOCK_SIZE_XY RD_WG_SIZE_1_0
#elif defined(RD_WG_SIZE_1)
#define BLOCK_SIZE_XY RD_WG_SIZE_1
#elif defined(RD_WG_SIZE)
#define BLOCK_SIZE_XY RD_WG_SIZE
#else
#define BLOCK_SIZE_XY 1
#endif

#ifdef TIMING
struct timeval tv;
struct timeval tv_total_start, tv_total_end;
struct timeval tv_h2d_start, tv_h2d_end;
struct timeval tv_d2h_start, tv_d2h_end;
struct timeval tv_kernel_start, tv_kernel_end;
struct timeval tv_mem_alloc_start, tv_mem_alloc_end;
struct timeval tv_close_start, tv_close_end;
float init_time = 0, mem_alloc_time = 0, h2d_time = 0, kernel_time = 0,
      d2h_time = 0, close_time = 0, total_time = 0;
#endif

int Size;
float *a, *b, *finalVec;
float *m;

FILE *fp;

void InitProblemOnce(char *filename);
void InitPerRun();
void ForwardSub();
void BackSub();
__global__ void Fan1(float *m, float *a, int Size, int t);
__global__ void Fan2(float *m, float *a, float *b, int Size, int j1, int t);
void InitMat(float *ary, int nrow, int ncol);
void InitAry(float *ary, int ary_size);
void PrintMat(float *ary, int nrow, int ncolumn);
void PrintAry(float *ary, int ary_size);
void PrintDeviceProperties();
void checkCUDAError(const char *msg);

unsigned int totalKernelTime = 0;

// create both matrix and right hand side, Ke Wang 2013/08/12 11:51:06
//
// Fills the size x size row-major matrix m with
//   m[i][j] = 10 * exp(-0.01 * |i - j|)
// by first tabulating the 2*size-1 distinct values in coe (index size-1 is
// the diagonal) and then copying the matching window into each row.
// Only the matrix is produced here; the right hand side is set by the caller.
void create_matrix(float *m, int size) {
  int i, j;
  float lamda = -0.01;
  float coe_i = 0.0;
  float *coe;

  if (size <= 0)
    return; // nothing to fill, and 2*size-1 would not be a valid length

  // Heap buffer instead of a variable-length array: VLAs are not standard
  // C++ and the table would otherwise live on the stack.
  coe = (float *)malloc((2 * (size_t)size - 1) * sizeof(float));
  if (coe == NULL) {
    fprintf(stderr, "create_matrix: out of memory (size = %d)\n", size);
    exit(EXIT_FAILURE);
  }

  for (i = 0; i < size; i++) {
    coe_i = 10 * exp(lamda * i);
    coe[size - 1 + i] = coe_i;
    coe[size - 1 - i] = coe_i;
  }

  for (i = 0; i < size; i++) {
    for (j = 0; j < size; j++) {
      // size_t index: i * size overflows int for large matrices
      m[(size_t)i * size + j] = coe[size - 1 - i + j];
    }
  }

  free(coe);
}

// Prints the command line help shown when the program is started without
// arguments.
static void PrintUsage() {
  printf("Usage: gaussian -f filename / -s size [-q]\n\n");
  printf("-q (quiet) suppresses printing the matrix and result values.\n");
  printf("-f (filename) path of input file\n");
  printf(
      "-s (size) size of matrix. Create matrix and rhs in this program \n");
  printf(
      "The first line of the file contains the dimension of the matrix, n.");
  printf("The second line of the file is a newline.\n");
  printf("The next n lines contain n tab separated values for the matrix.");
  printf("The next line of the file is a newline.\n");
  printf("The next line of the file is a 1xn vector with tab separated "
         "values.\n");
  printf("The next line of the file is a newline. (optional)\n");
  printf("The final line of the file is the pre-computed solution. "
         "(optional)\n");
  printf("Example: matrix4.txt:\n");
  printf("4\n");
  printf("\n");
  printf("-0.6 -0.5 0.7 0.3\n");
  printf("-0.3 -0.9 0.3 0.7\n");
  printf("-0.4 -0.5 -0.3 -0.8\n");
  printf("0.0 -0.1 0.2 0.9\n");
  printf("\n");
  printf("-0.85 -0.68 0.24 -0.53\n");
  printf("\n");
  printf("0.7 0.0 -0.4 -0.5\n");
}

// Program entry point.
//
// Parses -s <size> / -f <file> / -q, runs the forward elimination on the
// device (ForwardSub), the back substitution on the host (BackSub), and
// prints the timings. Unless -q is given, the intermediate matrices and the
// solution vector are passed to the print helpers as well.
int main(int argc, char *argv[]) {
  printf("WG size of kernel 1 = %d, WG size of kernel 2= %d X %d\n",
         MAXBLOCKSIZE, BLOCK_SIZE_XY, BLOCK_SIZE_XY);
  int verbose = 1;
  int i, j;
  char flag;
  if (argc < 2) {
    PrintUsage();
    exit(0);
  }

  cudaSetDevice(0);

  PrintDeviceProperties();

  for (i = 1; i < argc; i++) {
    if (argv[i][0] != '-')
      continue;
    flag = argv[i][1];
    switch (flag) {
    case 's': // build a Size x Size system internally
      if (++i >= argc) {
        // the option value is mandatory; argv[argc] is NULL
        fprintf(stderr, "Option -s requires a matrix size\n");
        exit(EXIT_FAILURE);
      }
      Size = atoi(argv[i]);
      printf("Create matrix internally in parse, size = %d \n", Size);
      if (Size <= 0) {
        fprintf(stderr, "Invalid matrix size: %s\n", argv[i]);
        exit(EXIT_FAILURE);
      }

      // widen before multiplying: Size * Size overflows int past 46340
      a = (float *)malloc((size_t)Size * Size * sizeof(float));
      b = (float *)malloc((size_t)Size * sizeof(float));
      m = (float *)malloc((size_t)Size * Size * sizeof(float));
      if (a == NULL || b == NULL || m == NULL) {
        fprintf(stderr, "Out of memory for a %d x %d system\n", Size, Size);
        exit(EXIT_FAILURE);
      }

      create_matrix(a, Size);
      for (j = 0; j < Size; j++)
        b[j] = 1.0;
      break;
    case 'f': // read the system from a data file
      if (++i >= argc) {
        fprintf(stderr, "Option -f requires a file name\n");
        exit(EXIT_FAILURE);
      }
      printf("Read file from %s \n", argv[i]);
      InitProblemOnce(argv[i]);
      break;
    case 'q': // quiet: skip printing matrices and the solution
      verbose = 0;
      break;
    }
  }

  // Neither -s nor -f was given (or loading failed): nothing to solve.
  if (Size <= 0 || a == NULL || b == NULL || m == NULL) {
    fprintf(stderr, "No linear system loaded: use -f filename or -s size\n");
    exit(EXIT_FAILURE);
  }

  InitPerRun();
  // begin timing
  struct timeval time_start;
  gettimeofday(&time_start, NULL);

  // run kernels
  ForwardSub();

  // end timing
  struct timeval time_end;
  gettimeofday(&time_end, NULL);
  unsigned int time_total = (time_end.tv_sec * 1000000 + time_end.tv_usec) -
                            (time_start.tv_sec * 1000000 + time_start.tv_usec);

  if (verbose) {
    printf("Matrix m is: \n");
    PrintMat(m, Size, Size);

    printf("Matrix a is: \n");
    PrintMat(a, Size, Size);

    printf("Array b is: \n");
    PrintAry(b, Size);
  }
  BackSub();
  if (verbose) {
    printf("The final solution is: \n");
    PrintAry(finalVec, Size);
  }
  printf("\nTime total (including memory transfers)\t%f sec\n",
         time_total * 1e-6);
  printf("Time for CUDA kernels:\t%f sec\n", totalKernelTime * 1e-6);

  free(m);
  free(a);
  free(b);
  free(finalVec); // allocated by BackSub()

#ifdef TIMING
  printf("Exec: %f\n", kernel_time);
#endif
  return 0;
}
/*------------------------------------------------------
 ** PrintDeviceProperties -- Print the properties of every
 ** device reported by cudaGetDeviceCount(), or the runtime's
 ** error string when a device cannot be queried.
 **-----------------------------------------------------
 */
void PrintDeviceProperties() {
  cudaDeviceProp deviceProp;
  int nDevCount = 0;

  cudaGetDeviceCount(&nDevCount);
  printf("Total Device found: %d", nDevCount);
  for (int nDeviceIdx = 0; nDeviceIdx < nDevCount; ++nDeviceIdx) {
    memset(&deviceProp, 0, sizeof(deviceProp));
    if (cudaSuccess == cudaGetDeviceProperties(&deviceProp, nDeviceIdx)) {
      printf("\nDevice Name \t\t - %s ", deviceProp.name);
      printf("\n**************************************");
      printf("\nTotal Global Memory\t\t\t - %lu KB",
             deviceProp.totalGlobalMem / 1024);
      printf("\nShared memory available per block \t - %lu KB",
             deviceProp.sharedMemPerBlock / 1024);
      printf("\nNumber of registers per thread block \t - %d",
             deviceProp.regsPerBlock);
      printf("\nWarp size in threads \t\t\t - %d", deviceProp.warpSize);
      printf("\nMemory Pitch \t\t\t\t - %zu bytes", deviceProp.memPitch);
      printf("\nMaximum threads per block \t\t - %d",
             deviceProp.maxThreadsPerBlock);
      printf("\nMaximum Thread Dimension (block) \t - %d %d %d",
             deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1],
             deviceProp.maxThreadsDim[2]);
      printf("\nMaximum Thread Dimension (grid) \t - %d %d %d",
             deviceProp.maxGridSize[0], deviceProp.maxGridSize[1],
             deviceProp.maxGridSize[2]);
      printf("\nTotal constant memory \t\t\t - %zu bytes",
             deviceProp.totalConstMem);
      printf("\nCUDA ver \t\t\t\t - %d.%d", deviceProp.major, deviceProp.minor);
      printf("\nClock rate \t\t\t\t - %d KHz", deviceProp.clockRate);
      printf("\nTexture Alignment \t\t\t - %zu bytes",
             deviceProp.textureAlignment);
      printf("\nDevice Overlap \t\t\t\t - %s",
             deviceProp.deviceOverlap ? "Allowed" : "Not Allowed");
      printf("\nNumber of Multi processors \t\t - %d\n\n",
             deviceProp.multiProcessorCount);
    } else
      printf("\n%s", cudaGetErrorString(cudaGetLastError()));
  }
}

/*------------------------------------------------------
 ** InitProblemOnce -- Initialize all of matrices and
 ** vectors by opening a data file specified by the user.
 **
 ** We used dynamic array *a, *b, and *m to allocate
 ** the memory storages.
+ **------------------------------------------------------ + */ +void InitProblemOnce(char *filename) { + // char *filename = argv[1]; + + // printf("Enter the data file name: "); + // scanf("%s", filename); + printf("The file name is: %s\n", filename); + + fp = fopen(filename, "r"); + + fscanf(fp, "%d", &Size); + + a = (float *)malloc(Size * Size * sizeof(float)); + + InitMat(a, Size, Size); + printf("The input matrix a is:\n"); + PrintMat(a, Size, Size); + b = (float *)malloc(Size * sizeof(float)); + + InitAry(b, Size); + printf("The input array b is:\n"); + PrintAry(b, Size); + + m = (float *)malloc(Size * Size * sizeof(float)); +} + +/*------------------------------------------------------ + ** InitPerRun() -- Initialize the contents of the + ** multipier matrix **m + **------------------------------------------------------ + */ +void InitPerRun() { + int i; + for (i = 0; i < Size * Size; i++) + *(m + i) = 0.0; +} + +/*------------------------------------------------------- + ** Fan1() -- Calculate multiplier matrix + ** Pay attention to the index. Index i give the range + ** which starts from 0 to range-1. The real values of + ** the index should be adjust and related with the value + ** of t which is defined on the ForwardSub(). 
+ **------------------------------------------------------- + */ +__global__ void Fan1(float *m_cuda, float *a_cuda, int Size, int t) { + // if(threadIdx.x + blockIdx.x * blockDim.x >= Size-1-t) { + // printf("blockIDx.x: %d, threadIdx.x: %d, Size: %d, t:%d, + // Size-1-t: %d\n",blockIdx.x,threadIdx.x,Size,t,Size-1-t); + // } + + if (threadIdx.x + blockIdx.x * blockDim.x >= Size - 1 - t) + return; + *(m_cuda + Size * (blockDim.x * blockIdx.x + threadIdx.x + t + 1) + t) = + *(a_cuda + Size * (blockDim.x * blockIdx.x + threadIdx.x + t + 1) + t) / + *(a_cuda + Size * t + t); +} + +/*------------------------------------------------------- + ** Fan2() -- Modify the matrix A into LUD + **------------------------------------------------------- + */ + +__global__ void Fan2(float *m_cuda, float *a_cuda, float *b_cuda, int Size, + int j1, int t) { + if (threadIdx.x + blockIdx.x * blockDim.x >= Size - 1 - t) + return; + if (threadIdx.y + blockIdx.y * blockDim.y >= Size - t) + return; + + int xidx = blockIdx.x * blockDim.x + threadIdx.x; + int yidx = blockIdx.y * blockDim.y + threadIdx.y; + // printf("blockIdx.x: %d, threadIdx.x: %d, blockIdx.y: %d, threadIdx.y: %d, + // blockDim.x: %d, blockDim.y: + // %d\n",blockIdx.x,threadIdx.x,blockIdx.y,threadIdx.y,blockDim.x,blockDim.y); + + a_cuda[Size * (xidx + 1 + t) + (yidx + t)] -= + m_cuda[Size * (xidx + 1 + t) + t] * a_cuda[Size * t + (yidx + t)]; + // a_cuda[xidx+1+t][yidx+t] -= m_cuda[xidx+1+t][t] * a_cuda[t][yidx+t]; + if (yidx == 0) { + // printf("blockIdx.x:%d,threadIdx.x:%d,blockIdx.y:%d,threadIdx.y:%d,blockDim.x:%d,blockDim.y:%d\n",blockIdx.x,threadIdx.x,blockIdx.y,threadIdx.y,blockDim.x,blockDim.y); + // printf("xidx:%d,yidx:%d\n",xidx,yidx); + b_cuda[xidx + 1 + t] -= + m_cuda[Size * (xidx + 1 + t) + (yidx + t)] * b_cuda[t]; + } +} + +/*------------------------------------------------------ + ** ForwardSub() -- Forward substitution of Gaussian + ** elimination. 
+ **------------------------------------------------------ + */ +void ForwardSub() { + int t; + float *m_cuda, *a_cuda, *b_cuda; + + int A = 1; + int B = 2; + int C = 3; + int D = 4; + int E = 5; + int F = 6; + // printf("blockIDx.x: %d, threadIdx.x: %d, Size: %d, t: %d, Size-1-t: %d\n", + // A, B, C, D, E); printf("blockIdx.x: %d, threadIdx.x: %d, blockIdx.y: %d, + // threadIdx.y: %d, blockDim.x: %d, blockDim.y: %d\n", A , B, C, D, E, F); + + // allocate memory on GPU + cudaMalloc((void **)&m_cuda, Size * Size * sizeof(float)); + + cudaMalloc((void **)&a_cuda, Size * Size * sizeof(float)); + + cudaMalloc((void **)&b_cuda, Size * sizeof(float)); + + // copy memory to GPU + cudaMemcpy(m_cuda, m, Size * Size * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(a_cuda, a, Size * Size * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(b_cuda, b, Size * sizeof(float), cudaMemcpyHostToDevice); + + int block_size, grid_size; + + block_size = MAXBLOCKSIZE; + grid_size = (Size / block_size) + (!(Size % block_size) ? 0 : 1); + printf("1d grid size: %d\n", grid_size); + + dim3 dimBlock(block_size); + dim3 dimGrid(grid_size); + // dim3 dimGrid( (N/dimBlock.x) + (!(N%dimBlock.x)?0:1) ); + + int blockSize2d, gridSize2d; + blockSize2d = BLOCK_SIZE_XY; + gridSize2d = (Size / blockSize2d) + (!(Size % blockSize2d ? 
0 : 1)); + + dim3 dimBlockXY(blockSize2d, blockSize2d); + + printf("BlockXY: %d \n", blockSize2d); + dim3 dimGridXY(gridSize2d, gridSize2d); + +#ifdef TIMING + gettimeofday(&tv_kernel_start, NULL); +#endif + printf("first grid size: %d second: %d\n", grid_size, gridSize2d); + // begin timing kernels + struct timeval time_start; + gettimeofday(&time_start, NULL); + for (t = 0; t < (Size - 1); t++) { + Fan1<<>>(m_cuda, a_cuda, Size, t); + cudaDeviceSynchronize(); + Fan2<<>>(m_cuda, a_cuda, b_cuda, Size, Size - t, t); + cudaDeviceSynchronize(); + checkCUDAError("Fan2"); + } + // end timing kernels + struct timeval time_end; + gettimeofday(&time_end, NULL); + totalKernelTime = (time_end.tv_sec * 1000000 + time_end.tv_usec) - + (time_start.tv_sec * 1000000 + time_start.tv_usec); + +#ifdef TIMING + tvsub(&time_end, &tv_kernel_start, &tv); + kernel_time += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0; +#endif + + // copy memory back to CPU + cudaMemcpy(m, m_cuda, Size * Size * sizeof(float), cudaMemcpyDeviceToHost); + cudaMemcpy(a, a_cuda, Size * Size * sizeof(float), cudaMemcpyDeviceToHost); + cudaMemcpy(b, b_cuda, Size * sizeof(float), cudaMemcpyDeviceToHost); + cudaFree(m_cuda); + cudaFree(a_cuda); + cudaFree(b_cuda); +} + +/*------------------------------------------------------ + ** BackSub() -- Backward substitution + **------------------------------------------------------ + */ + +void BackSub() { + // create a new vector to hold the final answer + finalVec = (float *)malloc(Size * sizeof(float)); + // solve "bottom up" + int i, j; + for (i = 0; i < Size; i++) { + finalVec[Size - i - 1] = b[Size - i - 1]; + for (j = 0; j < i; j++) { + finalVec[Size - i - 1] -= *(a + Size * (Size - i - 1) + (Size - j - 1)) * + finalVec[Size - j - 1]; + } + finalVec[Size - i - 1] = + finalVec[Size - i - 1] / *(a + Size * (Size - i - 1) + (Size - i - 1)); + } +} + +void InitMat(float *ary, int nrow, int ncol) { + int i, j; + + for (i = 0; i < nrow; i++) { + for (j = 0; j < ncol; 
j++) { + fscanf(fp, "%f", ary + Size * i + j); + } + } +} + +/*------------------------------------------------------ + ** PrintMat() -- Print the contents of the matrix + **------------------------------------------------------ + */ +void PrintMat(float *ary, int nrow, int ncol) { + return; + int i, j; + + for (i = 0; i < nrow; i++) { + for (j = 0; j < ncol; j++) { + printf("%8.2f ", *(ary + Size * i + j)); + } + printf("\n"); + } + printf("\n"); +} + +/*------------------------------------------------------ + ** InitAry() -- Initialize the array (vector) by reading + ** data from the data file + **------------------------------------------------------ + */ +void InitAry(float *ary, int ary_size) { + int i; + + for (i = 0; i < ary_size; i++) { + fscanf(fp, "%f", &ary[i]); + } +} + +/*------------------------------------------------------ + ** PrintAry() -- Print the contents of the array (vector) + **------------------------------------------------------ + */ +void PrintAry(float *ary, int ary_size) { + int i; + for (i = 0; i < ary_size; i++) { + printf("%.2f ", ary[i]); + } + printf("\n\n"); +} +void checkCUDAError(const char *msg) { + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err) { + fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } +} diff --git a/examples/gauss/run.sh b/examples/gauss/run.sh new file mode 100755 index 0000000..e689c70 --- /dev/null +++ b/examples/gauss/run.sh @@ -0,0 +1,23 @@ +#!/bin/bash +set -e +llvm-as gaussian-cuda-nvptx64-nvidia-cuda-sm_61.ll +llvm-as gaussian-host-x86_64-unknown-linux-gnu.ll +../../build/compilation/kernelTranslator gaussian-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc +../../build/compilation/hostTranslator gaussian-host-x86_64-unknown-linux-gnu.bc host.bc + +llc --relocation-model=pic --filetype=obj kernel.bc +llc --relocation-model=pic --filetype=obj host.bc + +g++ -Wall -L../../build/runtime \ + -L../../build/runtime/threadPool \ + -o gaussian -fPIC 
-no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread + +export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH +./gaussian -f ../../rodinia-data/gaussian/matrix4.txt >> res.log + +if grep -q "0.70 0.00 -0.40 -0.50" res.log; then + echo "Pass" +else + echo "Error result" + exit 1 +fi diff --git a/examples/heartwall/AVI/avilib.c b/examples/heartwall/AVI/avilib.c new file mode 100644 index 0000000..11f4008 --- /dev/null +++ b/examples/heartwall/AVI/avilib.c @@ -0,0 +1,1829 @@ +#ifdef __cplusplus +extern "C" { +#endif + +/* + * avilib.c + * + * Copyright (C) Thomas Östreich - June 2001 + * multiple audio track support Copyright (C) 2002 Thomas Östreich + * + * Original code: + * Copyright (C) 1999 Rainer Johanni + * + * This file is part of transcode, a linux video stream processing tool + * + * transcode is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * transcode is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Make; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include "avilib.h" +//#include + +#define INFO_LIST + +/* The following variable indicates the kind of error */ + +long AVI_errno; + +#define MAX_INFO_STRLEN 64 +static char id_str[MAX_INFO_STRLEN]; + +#define FRAME_RATE_SCALE 1000000 + +#ifndef PACKAGE +#define PACKAGE "my" +#define VERSION "0.00" +#endif + +#ifndef O_BINARY +/* win32 wants a binary flag to open(); this sets it to null + on platforms that don't have it. 
*/ +#define O_BINARY 0 +#endif + +/******************************************************************* + * * + * Utilities for writing an AVI File * + * * + *******************************************************************/ + +static size_t avi_read(int fd, char *buf, size_t len) { + size_t n = 0; + size_t r = 0; + + while (r < len) { + n = read(fd, buf + r, len - r); + + if (n <= 0) + return r; + r += n; + } + + return r; +} + +static size_t avi_write(int fd, char *buf, size_t len) { + size_t n = 0; + size_t r = 0; + + while (r < len) { + n = write(fd, buf + r, len - r); + if (n < 0) + return n; + + r += n; + } + return r; +} + +/* HEADERBYTES: The number of bytes to reserve for the header */ + +#define HEADERBYTES 2048 + +/* AVI_MAX_LEN: The maximum length of an AVI file, we stay a bit below + the 2GB limit (Remember: 2*10^9 is smaller than 2 GB) */ + +#define AVI_MAX_LEN (UINT_MAX - (1 << 20) * 16 - HEADERBYTES) + +#define PAD_EVEN(x) (((x) + 1) & ~1) + +/* Copy n into dst as a 4 byte, little endian number. + Should also work on big endian machines */ + +static void long2str(unsigned char *dst, int n) { + dst[0] = (n)&0xff; + dst[1] = (n >> 8) & 0xff; + dst[2] = (n >> 16) & 0xff; + dst[3] = (n >> 24) & 0xff; +} + +/* Convert a string of 4 or 2 bytes to a number, + also working on big endian machines */ + +static unsigned long str2ulong(unsigned char *str) { + return (str[0] | (str[1] << 8) | (str[2] << 16) | (str[3] << 24)); +} +static unsigned long str2ushort(unsigned char *str) { + return (str[0] | (str[1] << 8)); +} + +/* Calculate audio sample size from number of bits and number of channels. + This may have to be adjusted for eg. 
12 bits and stereo */ + +static int avi_sampsize(avi_t *AVI, int j) { + int s; + s = ((AVI->track[j].a_bits + 7) / 8) * AVI->track[j].a_chans; + // if(s==0) s=1; /* avoid possible zero divisions */ + if (s < 4) + s = 4; /* avoid possible zero divisions */ + return s; +} + +/* Add a chunk (=tag and data) to the AVI file, + returns -1 on write error, 0 on success */ + +static int avi_add_chunk(avi_t *AVI, unsigned char *tag, unsigned char *data, + int length) { + unsigned char c[8]; + + /* Copy tag and length int c, so that we need only 1 write system call + for these two values */ + + memcpy(c, tag, 4); + long2str(c + 4, length); + + /* Output tag, length and data, restore previous position + if the write fails */ + + length = PAD_EVEN(length); + + if (avi_write(AVI->fdes, (char *)c, 8) != 8 || + avi_write(AVI->fdes, (char *)data, length) != length) { + lseek(AVI->fdes, AVI->pos, SEEK_SET); + AVI_errno = AVI_ERR_WRITE; + return -1; + } + + /* Update file position */ + + AVI->pos += 8 + length; + + // fprintf(stderr, "pos=%lu %s\n", AVI->pos, tag); + + return 0; +} + +static int avi_add_index_entry(avi_t *AVI, unsigned char *tag, long flags, + unsigned long pos, unsigned long len) { + void *ptr; + + if (AVI->n_idx >= AVI->max_idx) { + ptr = realloc((void *)AVI->idx, (AVI->max_idx + 4096) * 16); + + if (ptr == 0) { + AVI_errno = AVI_ERR_NO_MEM; + return -1; + } + AVI->max_idx += 4096; + AVI->idx = (unsigned char((*)[16]))ptr; + } + + /* Add index entry */ + + // fprintf(stderr, "INDEX %s %ld %lu %lu\n", tag, flags, pos, len); + + memcpy(AVI->idx[AVI->n_idx], tag, 4); + long2str(AVI->idx[AVI->n_idx] + 4, flags); + long2str(AVI->idx[AVI->n_idx] + 8, pos); + long2str(AVI->idx[AVI->n_idx] + 12, len); + + /* Update counter */ + + AVI->n_idx++; + + if (len > AVI->max_len) + AVI->max_len = len; + + return 0; +} + +/* + AVI_open_output_file: Open an AVI File and write a bunch + of zero bytes as space for the header. 
+ + returns a pointer to avi_t on success, a zero pointer on error +*/ + +avi_t *AVI_open_output_file(char *filename) { + avi_t *AVI; + int i; + + int mask = 0; + + unsigned char AVI_header[HEADERBYTES]; + + /* Allocate the avi_t struct and zero it */ + + AVI = (avi_t *)malloc(sizeof(avi_t)); + if (AVI == 0) { + AVI_errno = AVI_ERR_NO_MEM; + return 0; + } + memset((void *)AVI, 0, sizeof(avi_t)); + + /* Since Linux needs a long time when deleting big files, + we do not truncate the file when we open it. + Instead it is truncated when the AVI file is closed */ + + /* mask = umask (0); + umask (mask);*/ + + AVI->fdes = open(filename, O_RDWR | O_CREAT | O_BINARY, 0644 & ~mask); + if (AVI->fdes < 0) { + AVI_errno = AVI_ERR_OPEN; + free(AVI); + return 0; + } + + /* Write out HEADERBYTES bytes, the header will go here + when we are finished with writing */ + + for (i = 0; i < HEADERBYTES; i++) + AVI_header[i] = 0; + i = avi_write(AVI->fdes, (char *)AVI_header, HEADERBYTES); + if (i != HEADERBYTES) { + close(AVI->fdes); + AVI_errno = AVI_ERR_WRITE; + free(AVI); + return 0; + } + + AVI->pos = HEADERBYTES; + AVI->mode = AVI_MODE_WRITE; /* open for writing */ + + // init + AVI->anum = 0; + AVI->aptr = 0; + + return AVI; +} + +void AVI_set_video(avi_t *AVI, int width, int height, double fps, + char *compressor) { + /* may only be called if file is open for writing */ + + if (AVI->mode == AVI_MODE_READ) + return; + + AVI->width = width; + AVI->height = height; + AVI->fps = fps; + + if (strncmp(compressor, "RGB", 3) == 0) { + memset(AVI->compressor, 0, 4); + } else { + memcpy(AVI->compressor, compressor, 4); + } + + AVI->compressor[4] = 0; + + avi_update_header(AVI); +} + +void AVI_set_audio(avi_t *AVI, int channels, long rate, int bits, int format, + long mp3rate) { + /* may only be called if file is open for writing */ + + if (AVI->mode == AVI_MODE_READ) + return; + + // inc audio tracks + AVI->aptr = AVI->anum; + ++AVI->anum; + + if (AVI->anum > AVI_MAX_TRACKS) { + 
fprintf(stderr, "error - only %d audio tracks supported\n", AVI_MAX_TRACKS); + exit(1); + } + + AVI->track[AVI->aptr].a_chans = channels; + AVI->track[AVI->aptr].a_rate = rate; + AVI->track[AVI->aptr].a_bits = bits; + AVI->track[AVI->aptr].a_fmt = format; + AVI->track[AVI->aptr].mp3rate = mp3rate; + + avi_update_header(AVI); +} + +#define OUT4CC(s) \ + if (nhb <= HEADERBYTES - 4) \ + memcpy(AVI_header + nhb, s, 4); \ + nhb += 4 + +#define OUTLONG(n) \ + if (nhb <= HEADERBYTES - 4) \ + long2str(AVI_header + nhb, n); \ + nhb += 4 + +#define OUTSHRT(n) \ + if (nhb <= HEADERBYTES - 2) { \ + AVI_header[nhb] = (n)&0xff; \ + AVI_header[nhb + 1] = (n >> 8) & 0xff; \ + } \ + nhb += 2 + +// ThOe write preliminary AVI file header: 0 frames, max vid/aud size +int avi_update_header(avi_t *AVI) { + int njunk, sampsize, hasIndex, ms_per_frame, frate, flag; + int movi_len, hdrl_start, strl_start, j; + unsigned char AVI_header[HEADERBYTES]; + long nhb; + + // assume max size + movi_len = AVI_MAX_LEN - HEADERBYTES + 4; + + // assume index will be written + hasIndex = 1; + + if (AVI->fps < 0.001) { + frate = 0; + ms_per_frame = 0; + } else { + frate = (int)(FRAME_RATE_SCALE * AVI->fps + 0.5); + ms_per_frame = (int)(1000000 / AVI->fps + 0.5); + } + + /* Prepare the file header */ + + nhb = 0; + + /* The RIFF header */ + + OUT4CC("RIFF"); + OUTLONG(movi_len); // assume max size + OUT4CC("AVI "); + + /* Start the header list */ + + OUT4CC("LIST"); + OUTLONG(0); /* Length of list in bytes, don't know yet */ + hdrl_start = nhb; /* Store start position */ + OUT4CC("hdrl"); + + /* The main AVI header */ + + /* The Flags in AVI File header */ + +#define AVIF_HASINDEX 0x00000010 /* Index at end of file */ +#define AVIF_MUSTUSEINDEX 0x00000020 +#define AVIF_ISINTERLEAVED 0x00000100 +#define AVIF_TRUSTCKTYPE 0x00000800 /* Use CKType to find key frames */ +#define AVIF_WASCAPTUREFILE 0x00010000 +#define AVIF_COPYRIGHTED 0x00020000 + + OUT4CC("avih"); + OUTLONG(56); /* # of bytes to follow */ + 
OUTLONG(ms_per_frame); /* Microseconds per frame */ + // ThOe ->0 + // OUTLONG(10000000); /* MaxBytesPerSec, I hope this will never + // be used */ + OUTLONG(0); + OUTLONG(0); /* PaddingGranularity (whatever that might be) */ + /* Other sources call it 'reserved' */ + flag = AVIF_ISINTERLEAVED; + if (hasIndex) + flag |= AVIF_HASINDEX; + if (hasIndex && AVI->must_use_index) + flag |= AVIF_MUSTUSEINDEX; + OUTLONG(flag); /* Flags */ + OUTLONG(0); // no frames yet + OUTLONG(0); /* InitialFrames */ + + OUTLONG(AVI->anum + 1); + + OUTLONG(0); /* SuggestedBufferSize */ + OUTLONG(AVI->width); /* Width */ + OUTLONG(AVI->height); /* Height */ + /* MS calls the following 'reserved': */ + OUTLONG(0); /* TimeScale: Unit used to measure time */ + OUTLONG(0); /* DataRate: Data rate of playback */ + OUTLONG(0); /* StartTime: Starting time of AVI data */ + OUTLONG(0); /* DataLength: Size of AVI data chunk */ + + /* Start the video stream list ---------------------------------- */ + + OUT4CC("LIST"); + OUTLONG(0); /* Length of list in bytes, don't know yet */ + strl_start = nhb; /* Store start position */ + OUT4CC("strl"); + + /* The video stream header */ + + OUT4CC("strh"); + OUTLONG(56); /* # of bytes to follow */ + OUT4CC("vids"); /* Type */ + OUT4CC(AVI->compressor); /* Handler */ + OUTLONG(0); /* Flags */ + OUTLONG(0); /* Reserved, MS says: wPriority, wLanguage */ + OUTLONG(0); /* InitialFrames */ + OUTLONG(FRAME_RATE_SCALE); /* Scale */ + OUTLONG(frate); /* Rate: Rate/Scale == samples/second */ + OUTLONG(0); /* Start */ + OUTLONG(0); // no frames yet + OUTLONG(0); /* SuggestedBufferSize */ + OUTLONG(-1); /* Quality */ + OUTLONG(0); /* SampleSize */ + OUTLONG(0); /* Frame */ + OUTLONG(0); /* Frame */ + // OUTLONG(0); /* Frame */ + // OUTLONG(0); /* Frame */ + + /* The video stream format */ + + OUT4CC("strf"); + OUTLONG(40); /* # of bytes to follow */ + OUTLONG(40); /* Size */ + OUTLONG(AVI->width); /* Width */ + OUTLONG(AVI->height); /* Height */ + OUTSHRT(1); + OUTSHRT(24); 
/* Planes, Count */ + OUT4CC(AVI->compressor); /* Compression */ + // ThOe (*3) + OUTLONG(AVI->width * AVI->height * 3); /* SizeImage (in bytes?) */ + OUTLONG(0); /* XPelsPerMeter */ + OUTLONG(0); /* YPelsPerMeter */ + OUTLONG(0); /* ClrUsed: Number of colors used */ + OUTLONG(0); /* ClrImportant: Number of colors important */ + + /* Finish stream list, i.e. put number of bytes in the list to proper pos */ + + long2str(AVI_header + strl_start - 4, nhb - strl_start); + + /* Start the audio stream list ---------------------------------- */ + + for (j = 0; j < AVI->anum; ++j) { + + sampsize = avi_sampsize(AVI, j); + + OUT4CC("LIST"); + OUTLONG(0); /* Length of list in bytes, don't know yet */ + strl_start = nhb; /* Store start position */ + OUT4CC("strl"); + + /* The audio stream header */ + + OUT4CC("strh"); + OUTLONG(56); /* # of bytes to follow */ + OUT4CC("auds"); + + // ----------- + // ThOe + OUTLONG(0); /* Format (Optionally) */ + // ----------- + + OUTLONG(0); /* Flags */ + OUTLONG(0); /* Reserved, MS says: wPriority, wLanguage */ + OUTLONG(0); /* InitialFrames */ + + // ThOe /4 + OUTLONG(sampsize / 4); /* Scale */ + OUTLONG(1000 * AVI->track[j].mp3rate / 8); + OUTLONG(0); /* Start */ + OUTLONG(4 * AVI->track[j].audio_bytes / sampsize); /* Length */ + OUTLONG(0); /* SuggestedBufferSize */ + OUTLONG(-1); /* Quality */ + + // ThOe /4 + OUTLONG(sampsize / 4); /* SampleSize */ + + OUTLONG(0); /* Frame */ + OUTLONG(0); /* Frame */ + // OUTLONG(0); /* Frame */ + // OUTLONG(0); /* Frame */ + + /* The audio stream format */ + + OUT4CC("strf"); + OUTLONG(16); /* # of bytes to follow */ + OUTSHRT(AVI->track[j].a_fmt); /* Format */ + OUTSHRT(AVI->track[j].a_chans); /* Number of channels */ + OUTLONG(AVI->track[j].a_rate); /* SamplesPerSec */ + // ThOe + OUTLONG(1000 * AVI->track[j].mp3rate / 8); + // ThOe (/4) + + OUTSHRT(sampsize / 4); /* BlockAlign */ + + OUTSHRT(AVI->track[j].a_bits); /* BitsPerSample */ + + /* Finish stream list, i.e. 
put number of bytes in the list to proper pos */ + + long2str(AVI_header + strl_start - 4, nhb - strl_start); + } + + /* Finish header list */ + + long2str(AVI_header + hdrl_start - 4, nhb - hdrl_start); + + /* Calculate the needed amount of junk bytes, output junk */ + + njunk = HEADERBYTES - nhb - 8 - 12; + + /* Safety first: if njunk <= 0, somebody has played with + HEADERBYTES without knowing what (s)he did. + This is a fatal error */ + + if (njunk <= 0) { + fprintf(stderr, "AVI_close_output_file: # of header bytes too small\n"); + exit(1); + } + + OUT4CC("JUNK"); + OUTLONG(njunk); + memset(AVI_header + nhb, 0, njunk); + + // 11/14/01 added id string + + if (njunk > strlen(id_str) + 8) { + sprintf(id_str, "%s-%s", PACKAGE, VERSION); + memcpy(AVI_header + nhb, id_str, strlen(id_str)); + } + + nhb += njunk; + + /* Start the movi list */ + + OUT4CC("LIST"); + OUTLONG(movi_len); /* Length of list in bytes */ + OUT4CC("movi"); + + /* Output the header, truncate the file to the number of bytes + actually written, report an error if someting goes wrong */ + + if (lseek(AVI->fdes, 0, SEEK_SET) < 0 || + avi_write(AVI->fdes, (char *)AVI_header, HEADERBYTES) != HEADERBYTES || + lseek(AVI->fdes, AVI->pos, SEEK_SET) < 0) { + AVI_errno = AVI_ERR_CLOSE; + return -1; + } + + return 0; +} + +/* + Write the header of an AVI file and close it. + returns 0 on success, -1 on write error. +*/ + +static int avi_close_output_file(avi_t *AVI) { + + int ret, njunk, sampsize, hasIndex, ms_per_frame, frate, idxerror, flag; + unsigned long movi_len; + int hdrl_start, strl_start, j; + unsigned char AVI_header[HEADERBYTES]; + long nhb; + +#ifdef INFO_LIST + long info_len; +// time_t calptr; +#endif + + /* Calculate length of movi list */ + + movi_len = AVI->pos - HEADERBYTES + 4; + + /* Try to ouput the index entries. This may fail e.g. if no space + is left on device. 
We will report this as an error, but we still + try to write the header correctly (so that the file still may be + readable in the most cases */ + + idxerror = 0; + // fprintf(stderr, "pos=%lu, index_len=%ld \n", AVI->pos, + // AVI->n_idx*16); + ret = avi_add_chunk(AVI, (unsigned char *)"idx1", + (unsigned char *)((void *)AVI->idx), AVI->n_idx * 16); + hasIndex = (ret == 0); + // fprintf(stderr, "pos=%lu, index_len=%d\n", AVI->pos, hasIndex); + + if (ret) { + idxerror = 1; + AVI_errno = AVI_ERR_WRITE_INDEX; + } + + /* Calculate Microseconds per frame */ + + if (AVI->fps < 0.001) { + frate = 0; + ms_per_frame = 0; + } else { + frate = (int)(FRAME_RATE_SCALE * AVI->fps + 0.5); + ms_per_frame = (int)(1000000 / AVI->fps + 0.5); + } + + /* Prepare the file header */ + + nhb = 0; + + /* The RIFF header */ + + OUT4CC("RIFF"); + OUTLONG(AVI->pos - 8); /* # of bytes to follow */ + OUT4CC("AVI "); + + /* Start the header list */ + + OUT4CC("LIST"); + OUTLONG(0); /* Length of list in bytes, don't know yet */ + hdrl_start = nhb; /* Store start position */ + OUT4CC("hdrl"); + + /* The main AVI header */ + + /* The Flags in AVI File header */ + +#define AVIF_HASINDEX 0x00000010 /* Index at end of file */ +#define AVIF_MUSTUSEINDEX 0x00000020 +#define AVIF_ISINTERLEAVED 0x00000100 +#define AVIF_TRUSTCKTYPE 0x00000800 /* Use CKType to find key frames */ +#define AVIF_WASCAPTUREFILE 0x00010000 +#define AVIF_COPYRIGHTED 0x00020000 + + OUT4CC("avih"); + OUTLONG(56); /* # of bytes to follow */ + OUTLONG(ms_per_frame); /* Microseconds per frame */ + // ThOe ->0 + // OUTLONG(10000000); /* MaxBytesPerSec, I hope this will never + // be used */ + OUTLONG(0); + OUTLONG(0); /* PaddingGranularity (whatever that might be) */ + /* Other sources call it 'reserved' */ + flag = AVIF_ISINTERLEAVED; + if (hasIndex) + flag |= AVIF_HASINDEX; + if (hasIndex && AVI->must_use_index) + flag |= AVIF_MUSTUSEINDEX; + OUTLONG(flag); /* Flags */ + OUTLONG(AVI->video_frames); /* TotalFrames */ + OUTLONG(0); /* 
InitialFrames */ + + OUTLONG(AVI->anum + 1); + // if (AVI->track[0].audio_bytes) + // { OUTLONG(2); } /* Streams */ + // else + // { OUTLONG(1); } /* Streams */ + + OUTLONG(0); /* SuggestedBufferSize */ + OUTLONG(AVI->width); /* Width */ + OUTLONG(AVI->height); /* Height */ + /* MS calls the following 'reserved': */ + OUTLONG(0); /* TimeScale: Unit used to measure time */ + OUTLONG(0); /* DataRate: Data rate of playback */ + OUTLONG(0); /* StartTime: Starting time of AVI data */ + OUTLONG(0); /* DataLength: Size of AVI data chunk */ + + /* Start the video stream list ---------------------------------- */ + + OUT4CC("LIST"); + OUTLONG(0); /* Length of list in bytes, don't know yet */ + strl_start = nhb; /* Store start position */ + OUT4CC("strl"); + + /* The video stream header */ + + OUT4CC("strh"); + OUTLONG(56); /* # of bytes to follow */ + OUT4CC("vids"); /* Type */ + OUT4CC(AVI->compressor); /* Handler */ + OUTLONG(0); /* Flags */ + OUTLONG(0); /* Reserved, MS says: wPriority, wLanguage */ + OUTLONG(0); /* InitialFrames */ + OUTLONG(FRAME_RATE_SCALE); /* Scale */ + OUTLONG(frate); /* Rate: Rate/Scale == samples/second */ + OUTLONG(0); /* Start */ + OUTLONG(AVI->video_frames); /* Length */ + OUTLONG(0); /* SuggestedBufferSize */ + OUTLONG(-1); /* Quality */ + OUTLONG(0); /* SampleSize */ + OUTLONG(0); /* Frame */ + OUTLONG(0); /* Frame */ + // OUTLONG(0); /* Frame */ + // OUTLONG(0); /* Frame */ + + /* The video stream format */ + + OUT4CC("strf"); + OUTLONG(40); /* # of bytes to follow */ + OUTLONG(40); /* Size */ + OUTLONG(AVI->width); /* Width */ + OUTLONG(AVI->height); /* Height */ + OUTSHRT(1); + OUTSHRT(24); /* Planes, Count */ + OUT4CC(AVI->compressor); /* Compression */ + // ThOe (*3) + OUTLONG(AVI->width * AVI->height * 3); /* SizeImage (in bytes?) 
*/ + OUTLONG(0); /* XPelsPerMeter */ + OUTLONG(0); /* YPelsPerMeter */ + OUTLONG(0); /* ClrUsed: Number of colors used */ + OUTLONG(0); /* ClrImportant: Number of colors important */ + + /* Finish stream list, i.e. put number of bytes in the list to proper pos */ + + long2str(AVI_header + strl_start - 4, nhb - strl_start); + + /* Start the audio stream list ---------------------------------- */ + + for (j = 0; j < AVI->anum; ++j) { + + // if (AVI->track[j].a_chans && AVI->track[j].audio_bytes) + { + + sampsize = avi_sampsize(AVI, j); + + OUT4CC("LIST"); + OUTLONG(0); /* Length of list in bytes, don't know yet */ + strl_start = nhb; /* Store start position */ + OUT4CC("strl"); + + /* The audio stream header */ + + OUT4CC("strh"); + OUTLONG(56); /* # of bytes to follow */ + OUT4CC("auds"); + + // ----------- + // ThOe + OUTLONG(0); /* Format (Optionally) */ + // ----------- + + OUTLONG(0); /* Flags */ + OUTLONG(0); /* Reserved, MS says: wPriority, wLanguage */ + OUTLONG(0); /* InitialFrames */ + + // ThOe /4 + OUTLONG(sampsize / 4); /* Scale */ + OUTLONG(1000 * AVI->track[j].mp3rate / 8); + OUTLONG(0); /* Start */ + OUTLONG(4 * AVI->track[j].audio_bytes / sampsize); /* Length */ + OUTLONG(0); /* SuggestedBufferSize */ + OUTLONG(-1); /* Quality */ + + // ThOe /4 + OUTLONG(sampsize / 4); /* SampleSize */ + + OUTLONG(0); /* Frame */ + OUTLONG(0); /* Frame */ + // OUTLONG(0); /* Frame */ + // OUTLONG(0); /* Frame */ + + /* The audio stream format */ + + OUT4CC("strf"); + OUTLONG(16); /* # of bytes to follow */ + OUTSHRT(AVI->track[j].a_fmt); /* Format */ + OUTSHRT(AVI->track[j].a_chans); /* Number of channels */ + OUTLONG(AVI->track[j].a_rate); /* SamplesPerSec */ + // ThOe + OUTLONG(1000 * AVI->track[j].mp3rate / 8); + // ThOe (/4) + + OUTSHRT(sampsize / 4); /* BlockAlign */ + + OUTSHRT(AVI->track[j].a_bits); /* BitsPerSample */ + + /* Finish stream list, i.e. 
put number of bytes in the list to proper pos + */ + } + long2str(AVI_header + strl_start - 4, nhb - strl_start); + } + + /* Finish header list */ + + long2str(AVI_header + hdrl_start - 4, nhb - hdrl_start); + + // add INFO list --- (0.6.0pre4) + +#ifdef INFO_LIST + OUT4CC("LIST"); + + // FIXME + info_len = MAX_INFO_STRLEN + 12; + OUTLONG(info_len); + OUT4CC("INFO"); + + // OUT4CC ("INAM"); + // OUTLONG(MAX_INFO_STRLEN); + + // sprintf(id_str, "\t"); + // memset(AVI_header+nhb, 0, MAX_INFO_STRLEN); + // memcpy(AVI_header+nhb, id_str, strlen(id_str)); + // nhb += MAX_INFO_STRLEN; + + OUT4CC("ISFT"); + OUTLONG(MAX_INFO_STRLEN); + + sprintf(id_str, "%s-%s", PACKAGE, VERSION); + memset(AVI_header + nhb, 0, MAX_INFO_STRLEN); + memcpy(AVI_header + nhb, id_str, strlen(id_str)); + nhb += MAX_INFO_STRLEN; + +// OUT4CC ("ICMT"); +// OUTLONG(MAX_INFO_STRLEN); + +// calptr=time(NULL); +// sprintf(id_str, "\t%s %s", ctime(&calptr), ""); +// memset(AVI_header+nhb, 0, MAX_INFO_STRLEN); +// memcpy(AVI_header+nhb, id_str, 25); +// nhb += MAX_INFO_STRLEN; +#endif + + // ---------------------------- + + /* Calculate the needed amount of junk bytes, output junk */ + + njunk = HEADERBYTES - nhb - 8 - 12; + + /* Safety first: if njunk <= 0, somebody has played with + HEADERBYTES without knowing what (s)he did. 
+ This is a fatal error */ + + if (njunk <= 0) { + fprintf(stderr, "AVI_close_output_file: # of header bytes too small\n"); + exit(1); + } + + OUT4CC("JUNK"); + OUTLONG(njunk); + memset(AVI_header + nhb, 0, njunk); + + nhb += njunk; + + /* Start the movi list */ + + OUT4CC("LIST"); + OUTLONG(movi_len); /* Length of list in bytes */ + OUT4CC("movi"); + + /* Output the header, truncate the file to the number of bytes + actually written, report an error if someting goes wrong */ + + if (lseek(AVI->fdes, 0, SEEK_SET) < 0 || + avi_write(AVI->fdes, (char *)AVI_header, HEADERBYTES) != HEADERBYTES + //|| ftruncate(AVI->fdes,AVI->pos)<0 + ) { + AVI_errno = AVI_ERR_CLOSE; + return -1; + } + + if (idxerror) + return -1; + + return 0; +} + +/* + AVI_write_data: + Add video or audio data to the file; + + Return values: + 0 No error; + -1 Error, AVI_errno is set appropriatly; + +*/ + +static int avi_write_data(avi_t *AVI, char *data, unsigned long length, + int audio, int keyframe) { + int n; + + unsigned char astr[5]; + + /* Check for maximum file length */ + + if ((AVI->pos + 8 + length + 8 + (AVI->n_idx + 1) * 16) > AVI_MAX_LEN) { + AVI_errno = AVI_ERR_SIZELIM; + return -1; + } + + /* Add index entry */ + + // set tag for current audio track + sprintf((char *)astr, "0%1dwb", AVI->aptr + 1); + + if (audio) + n = avi_add_index_entry(AVI, astr, 0x00, AVI->pos, length); + else + n = avi_add_index_entry(AVI, (unsigned char *)"00db", + ((keyframe) ? 
0x10 : 0x0), AVI->pos, length); + + if (n) + return -1; + + /* Output tag and data */ + + if (audio) + n = avi_add_chunk(AVI, (unsigned char *)astr, (unsigned char *)data, + length); + else + n = avi_add_chunk(AVI, (unsigned char *)"00db", (unsigned char *)data, + length); + + if (n) + return -1; + + return 0; +} + +int AVI_write_frame(avi_t *AVI, char *data, long bytes, int keyframe) { + unsigned long pos; + + if (AVI->mode == AVI_MODE_READ) { + AVI_errno = AVI_ERR_NOT_PERM; + return -1; + } + + pos = AVI->pos; + + if (avi_write_data(AVI, data, bytes, 0, keyframe)) + return -1; + + AVI->last_pos = pos; + AVI->last_len = bytes; + AVI->video_frames++; + return 0; +} + +int AVI_dup_frame(avi_t *AVI) { + if (AVI->mode == AVI_MODE_READ) { + AVI_errno = AVI_ERR_NOT_PERM; + return -1; + } + + if (AVI->last_pos == 0) + return 0; /* No previous real frame */ + if (avi_add_index_entry(AVI, (unsigned char *)"00db", 0x10, AVI->last_pos, + AVI->last_len)) + return -1; + AVI->video_frames++; + AVI->must_use_index = 1; + return 0; +} + +int AVI_write_audio(avi_t *AVI, char *data, long bytes) { + if (AVI->mode == AVI_MODE_READ) { + AVI_errno = AVI_ERR_NOT_PERM; + return -1; + } + + if (avi_write_data(AVI, data, bytes, 1, 0)) + return -1; + AVI->track[AVI->aptr].audio_bytes += bytes; + return 0; +} + +int AVI_append_audio(avi_t *AVI, char *data, long bytes) { + + long i, length, pos; + unsigned char c[4]; + + if (AVI->mode == AVI_MODE_READ) { + AVI_errno = AVI_ERR_NOT_PERM; + return -1; + } + + // update last index entry: + + --AVI->n_idx; + length = str2ulong(AVI->idx[AVI->n_idx] + 12); + pos = str2ulong(AVI->idx[AVI->n_idx] + 8); + + // update; + long2str(AVI->idx[AVI->n_idx] + 12, length + bytes); + + ++AVI->n_idx; + + AVI->track[AVI->aptr].audio_bytes += bytes; + + // update chunk header + lseek(AVI->fdes, pos + 4, SEEK_SET); + long2str(c, length + bytes); + avi_write(AVI->fdes, (char *)c, 4); + + lseek(AVI->fdes, pos + 8 + length, SEEK_SET); + + i = PAD_EVEN(length + bytes); 
+ + bytes = i - length; + avi_write(AVI->fdes, data, bytes); + AVI->pos = pos + 8 + i; + + return 0; +} + +long AVI_bytes_remain(avi_t *AVI) { + if (AVI->mode == AVI_MODE_READ) + return 0; + + return (AVI_MAX_LEN - (AVI->pos + 8 + 16 * AVI->n_idx)); +} + +long AVI_bytes_written(avi_t *AVI) { + if (AVI->mode == AVI_MODE_READ) + return 0; + + return (AVI->pos + 8 + 16 * AVI->n_idx); +} + +int AVI_set_audio_track(avi_t *AVI, int track) { + + if (track < 0 || track + 1 > AVI->anum) + return (-1); + + // this info is not written to file anyway + AVI->aptr = track; + return 0; +} + +int AVI_get_audio_track(avi_t *AVI) { return (AVI->aptr); } + +/******************************************************************* + * * + * Utilities for reading video and audio from an AVI File * + * * + *******************************************************************/ + +int AVI_close(avi_t *AVI) { + int ret; + + /* If the file was open for writing, the header and index still have + to be written */ + + if (AVI->mode == AVI_MODE_WRITE) + ret = avi_close_output_file(AVI); + else + ret = 0; + + /* Even if there happened an error, we first clean up */ + + close(AVI->fdes); + if (AVI->idx) + free(AVI->idx); + if (AVI->video_index) + free(AVI->video_index); + // FIXME + // if(AVI->audio_index) free(AVI->audio_index); + free(AVI); + + return ret; +} + +#define ERR_EXIT(x) \ + { \ + AVI_close(AVI); \ + AVI_errno = x; \ + return 0; \ + } + +avi_t *AVI_open_input_file(char *filename, int getIndex) { + avi_t *AVI = NULL; + + /* Create avi_t structure */ + + AVI = (avi_t *)malloc(sizeof(avi_t)); + if (AVI == NULL) { + AVI_errno = AVI_ERR_NO_MEM; + return 0; + } + memset((void *)AVI, 0, sizeof(avi_t)); + + AVI->mode = AVI_MODE_READ; /* open for reading */ + + /* Open the file */ + + AVI->fdes = open(filename, O_RDONLY | O_BINARY); + if (AVI->fdes < 0) { + AVI_errno = AVI_ERR_OPEN; + free(AVI); + return 0; + } + + avi_parse_input_file(AVI, getIndex); + + AVI->aptr = 0; // reset + + return AVI; +} + 
+avi_t *AVI_open_fd(int fd, int getIndex) { + avi_t *AVI = NULL; + + /* Create avi_t structure */ + + AVI = (avi_t *)malloc(sizeof(avi_t)); + if (AVI == NULL) { + AVI_errno = AVI_ERR_NO_MEM; + return 0; + } + memset((void *)AVI, 0, sizeof(avi_t)); + + AVI->mode = AVI_MODE_READ; /* open for reading */ + + // file alread open + AVI->fdes = fd; + + avi_parse_input_file(AVI, getIndex); + + AVI->aptr = 0; // reset + + return AVI; +} + +int avi_parse_input_file(avi_t *AVI, int getIndex) { + long i, n, rate, scale, idx_type; + unsigned char *hdrl_data; + long header_offset = 0, hdrl_len = 0; + long nvi, nai[AVI_MAX_TRACKS], ioff; + long tot[AVI_MAX_TRACKS]; + int j; + int lasttag = 0; + int vids_strh_seen = 0; + int vids_strf_seen = 0; + int auds_strh_seen = 0; + // int auds_strf_seen = 0; + int num_stream = 0; + char data[256]; + + /* Read first 12 bytes and check that this is an AVI file */ + + if (avi_read(AVI->fdes, data, 12) != 12) + ERR_EXIT(AVI_ERR_READ) + + if (strncasecmp(data, "RIFF", 4) != 0 || + strncasecmp(data + 8, "AVI ", 4) != 0) + ERR_EXIT(AVI_ERR_NO_AVI) + + /* Go through the AVI file and extract the header list, + the start position of the 'movi' list and an optionally + present idx1 tag */ + + hdrl_data = 0; + + while (1) { + if (avi_read(AVI->fdes, data, 8) != 8) + break; /* We assume it's EOF */ + + n = str2ulong((unsigned char *)data + 4); + n = PAD_EVEN(n); + + if (strncasecmp(data, "LIST", 4) == 0) { + if (avi_read(AVI->fdes, data, 4) != 4) + ERR_EXIT(AVI_ERR_READ) + n -= 4; + if (strncasecmp(data, "hdrl", 4) == 0) { + hdrl_len = n; + hdrl_data = (unsigned char *)malloc(n); + if (hdrl_data == 0) + ERR_EXIT(AVI_ERR_NO_MEM); + + // offset of header + + header_offset = lseek(AVI->fdes, 0, SEEK_CUR); + + if (avi_read(AVI->fdes, (char *)hdrl_data, n) != n) + ERR_EXIT(AVI_ERR_READ) + } else if (strncasecmp(data, "movi", 4) == 0) { + AVI->movi_start = lseek(AVI->fdes, 0, SEEK_CUR); + lseek(AVI->fdes, n, SEEK_CUR); + } else + lseek(AVI->fdes, n, 
SEEK_CUR); + } else if (strncasecmp(data, "idx1", 4) == 0) { + /* n must be a multiple of 16, but the reading does not + break if this is not the case */ + + AVI->n_idx = AVI->max_idx = n / 16; + AVI->idx = (unsigned char((*)[16]))malloc(n); + if (AVI->idx == 0) + ERR_EXIT(AVI_ERR_NO_MEM) + if (avi_read(AVI->fdes, (char *)AVI->idx, n) != n) + ERR_EXIT(AVI_ERR_READ) + } else + lseek(AVI->fdes, n, SEEK_CUR); + } + + if (!hdrl_data) + ERR_EXIT(AVI_ERR_NO_HDRL) + if (!AVI->movi_start) + ERR_EXIT(AVI_ERR_NO_MOVI) + + /* Interpret the header list */ + + for (i = 0; i < hdrl_len;) { + /* List tags are completly ignored */ + + if (strncasecmp((char *)hdrl_data + i, "LIST", 4) == 0) { + i += 12; + continue; + } + + n = str2ulong(hdrl_data + i + 4); + n = PAD_EVEN(n); + + /* Interpret the tag and its args */ + + if (strncasecmp((char *)hdrl_data + i, "strh", 4) == 0) { + i += 8; + if (strncasecmp((char *)hdrl_data + i, "vids", 4) == 0 && + !vids_strh_seen) { + memcpy(AVI->compressor, hdrl_data + i + 4, 4); + AVI->compressor[4] = 0; + + // ThOe + AVI->v_codech_off = header_offset + i + 4; + + scale = str2ulong((unsigned char *)hdrl_data + i + 20); + rate = str2ulong(hdrl_data + i + 24); + if (scale != 0) + AVI->fps = (double)rate / (double)scale; + AVI->video_frames = str2ulong(hdrl_data + i + 32); + AVI->video_strn = num_stream; + AVI->max_len = 0; + vids_strh_seen = 1; + lasttag = 1; /* vids */ + } else if (strncasecmp((char *)hdrl_data + i, "auds", 4) == 0 && + !auds_strh_seen) { + + // inc audio tracks + AVI->aptr = AVI->anum; + ++AVI->anum; + + if (AVI->anum > AVI_MAX_TRACKS) { + fprintf(stderr, "error - only %d audio tracks supported\n", + AVI_MAX_TRACKS); + return (-1); + } + + AVI->track[AVI->aptr].audio_bytes = + str2ulong(hdrl_data + i + 32) * avi_sampsize(AVI, 0); + AVI->track[AVI->aptr].audio_strn = num_stream; + // auds_strh_seen = 1; + lasttag = 2; /* auds */ + + // ThOe + AVI->track[AVI->aptr].a_codech_off = header_offset + i; + + } else + lasttag = 0; + 
num_stream++; + } else if (strncasecmp((char *)hdrl_data + i, "strf", 4) == 0) { + i += 8; + if (lasttag == 1) { + AVI->width = str2ulong(hdrl_data + i + 4); + AVI->height = str2ulong(hdrl_data + i + 8); + vids_strf_seen = 1; + // ThOe + AVI->v_codecf_off = header_offset + i + 16; + + memcpy(AVI->compressor2, hdrl_data + i + 16, 4); + AVI->compressor2[4] = 0; + + } else if (lasttag == 2) { + AVI->track[AVI->aptr].a_fmt = str2ushort(hdrl_data + i); + + // ThOe + AVI->track[AVI->aptr].a_codecf_off = header_offset + i; + + AVI->track[AVI->aptr].a_chans = str2ushort(hdrl_data + i + 2); + AVI->track[AVI->aptr].a_rate = str2ulong(hdrl_data + i + 4); + // ThOe: read mp3bitrate + AVI->track[AVI->aptr].mp3rate = 8 * str2ulong(hdrl_data + i + 8) / 1000; + //: ThOe + AVI->track[AVI->aptr].a_bits = str2ushort(hdrl_data + i + 14); + // auds_strf_seen = 1; + } + lasttag = 0; + } else { + i += 8; + lasttag = 0; + } + + i += n; + } + + free(hdrl_data); + + if (!vids_strh_seen || !vids_strf_seen) + ERR_EXIT(AVI_ERR_NO_VIDS) + + AVI->video_tag[0] = AVI->video_strn / 10 + '0'; + AVI->video_tag[1] = AVI->video_strn % 10 + '0'; + AVI->video_tag[2] = 'd'; + AVI->video_tag[3] = 'b'; + + /* Audio tag is set to "99wb" if no audio present */ + if (!AVI->track[0].a_chans) + AVI->track[0].audio_strn = 99; + + for (j = 0; j < AVI->anum; ++j) { + AVI->track[j].audio_tag[0] = (j + 1) / 10 + '0'; + AVI->track[j].audio_tag[1] = (j + 1) % 10 + '0'; + AVI->track[j].audio_tag[2] = 'w'; + AVI->track[j].audio_tag[3] = 'b'; + } + + lseek(AVI->fdes, AVI->movi_start, SEEK_SET); + + /* get index if wanted */ + + if (!getIndex) + return (0); + + /* if the file has an idx1, check if this is relative + to the start of the file or to the start of the movi list */ + + idx_type = 0; + + if (AVI->idx) { + long pos, len; + + /* Search the first videoframe in the idx1 and look where + it is in the file */ + + for (i = 0; i < AVI->n_idx; i++) + if (strncasecmp((char *)AVI->idx[i], (char *)AVI->video_tag, 3) == 0) + 
break; + if (i >= AVI->n_idx) + ERR_EXIT(AVI_ERR_NO_VIDS) + + pos = str2ulong(AVI->idx[i] + 8); + len = str2ulong(AVI->idx[i] + 12); + + lseek(AVI->fdes, pos, SEEK_SET); + if (avi_read(AVI->fdes, data, 8) != 8) + ERR_EXIT(AVI_ERR_READ) + if (strncasecmp((char *)data, (char *)AVI->idx[i], 4) == 0 && + str2ulong((unsigned char *)data + 4) == len) { + idx_type = 1; /* Index from start of file */ + } else { + lseek(AVI->fdes, pos + AVI->movi_start - 4, SEEK_SET); + if (avi_read(AVI->fdes, data, 8) != 8) + ERR_EXIT(AVI_ERR_READ) + if (strncasecmp((char *)data, (char *)AVI->idx[i], 4) == 0 && + str2ulong((unsigned char *)data + 4) == len) { + idx_type = 2; /* Index from start of movi list */ + } + } + /* idx_type remains 0 if neither of the two tests above succeeds */ + } + + if (idx_type == 0) { + /* we must search through the file to get the index */ + + lseek(AVI->fdes, AVI->movi_start, SEEK_SET); + + AVI->n_idx = 0; + + while (1) { + if (avi_read(AVI->fdes, data, 8) != 8) + break; + n = str2ulong((unsigned char *)data + 4); + + /* The movi list may contain sub-lists, ignore them */ + + if (strncasecmp(data, "LIST", 4) == 0) { + lseek(AVI->fdes, 4, SEEK_CUR); + continue; + } + + /* Check if we got a tag ##db, ##dc or ##wb */ + + if (((data[2] == 'd' || data[2] == 'D') && + (data[3] == 'b' || data[3] == 'B' || data[3] == 'c' || + data[3] == 'C')) || + ((data[2] == 'w' || data[2] == 'W') && + (data[3] == 'b' || data[3] == 'B'))) { + avi_add_index_entry(AVI, (unsigned char *)data, 0, + lseek(AVI->fdes, 0, SEEK_CUR) - 8, n); + } + + lseek(AVI->fdes, PAD_EVEN(n), SEEK_CUR); + } + idx_type = 1; + } + + /* Now generate the video index and audio index arrays */ + + nvi = 0; + for (j = 0; j < AVI->anum; ++j) + nai[j] = 0; + + for (i = 0; i < AVI->n_idx; i++) { + + if (strncasecmp((char *)AVI->idx[i], (char *)AVI->video_tag, 3) == 0) + nvi++; + + for (j = 0; j < AVI->anum; ++j) + if (strncasecmp((char *)AVI->idx[i], AVI->track[j].audio_tag, 4) == 0) + nai[j]++; + } + + 
AVI->video_frames = nvi; + for (j = 0; j < AVI->anum; ++j) + AVI->track[j].audio_chunks = nai[j]; + + // fprintf(stderr, "chunks = %ld %d %s\n", AVI->track[0].audio_chunks, + // AVI->anum, AVI->track[0].audio_tag); + + if (AVI->video_frames == 0) + ERR_EXIT(AVI_ERR_NO_VIDS); + AVI->video_index = + (video_index_entry *)malloc(nvi * sizeof(video_index_entry)); + if (AVI->video_index == 0) + ERR_EXIT(AVI_ERR_NO_MEM); + + for (j = 0; j < AVI->anum; ++j) { + if (AVI->track[j].audio_chunks) { + AVI->track[j].audio_index = + (audio_index_entry *)malloc(nai[j] * sizeof(audio_index_entry)); + if (AVI->track[j].audio_index == 0) + ERR_EXIT(AVI_ERR_NO_MEM); + } + } + + nvi = 0; + for (j = 0; j < AVI->anum; ++j) + nai[j] = tot[j] = 0; + + ioff = idx_type == 1 ? 8 : AVI->movi_start + 4; + + for (i = 0; i < AVI->n_idx; i++) { + + // video + if (strncasecmp((char *)AVI->idx[i], (char *)AVI->video_tag, 3) == 0) { + AVI->video_index[nvi].key = str2ulong(AVI->idx[i] + 4); + AVI->video_index[nvi].pos = str2ulong(AVI->idx[i] + 8) + ioff; + AVI->video_index[nvi].len = str2ulong(AVI->idx[i] + 12); + nvi++; + } + + // audio + for (j = 0; j < AVI->anum; ++j) { + + if (strncasecmp((char *)AVI->idx[i], AVI->track[j].audio_tag, 4) == 0) { + AVI->track[j].audio_index[nai[j]].pos = + str2ulong(AVI->idx[i] + 8) + ioff; + AVI->track[j].audio_index[nai[j]].len = str2ulong(AVI->idx[i] + 12); + AVI->track[j].audio_index[nai[j]].tot = tot[j]; + tot[j] += AVI->track[j].audio_index[nai[j]].len; + nai[j]++; + } + } + } + + for (j = 0; j < AVI->anum; ++j) + AVI->track[j].audio_bytes = tot[j]; + + /* Reposition the file */ + + lseek(AVI->fdes, AVI->movi_start, SEEK_SET); + AVI->video_pos = 0; + + return (0); +} + +long AVI_video_frames(avi_t *AVI) { return AVI->video_frames; } +int AVI_video_width(avi_t *AVI) { return AVI->width; } +int AVI_video_height(avi_t *AVI) { return AVI->height; } +double AVI_frame_rate(avi_t *AVI) { return AVI->fps; } +char *AVI_video_compressor(avi_t *AVI) { return 
AVI->compressor2; } + +long AVI_max_video_chunk(avi_t *AVI) { return AVI->max_len; } + +int AVI_audio_tracks(avi_t *AVI) { return (AVI->anum); } + +int AVI_audio_channels(avi_t *AVI) { return AVI->track[AVI->aptr].a_chans; } + +long AVI_audio_mp3rate(avi_t *AVI) { return AVI->track[AVI->aptr].mp3rate; } + +int AVI_audio_bits(avi_t *AVI) { return AVI->track[AVI->aptr].a_bits; } + +int AVI_audio_format(avi_t *AVI) { return AVI->track[AVI->aptr].a_fmt; } + +long AVI_audio_rate(avi_t *AVI) { return AVI->track[AVI->aptr].a_rate; } + +long AVI_audio_bytes(avi_t *AVI) { return AVI->track[AVI->aptr].audio_bytes; } + +long AVI_audio_chunks(avi_t *AVI) { return AVI->track[AVI->aptr].audio_chunks; } + +long AVI_audio_codech_offset(avi_t *AVI) { + return AVI->track[AVI->aptr].a_codech_off; +} + +long AVI_audio_codecf_offset(avi_t *AVI) { + return AVI->track[AVI->aptr].a_codecf_off; +} + +long AVI_video_codech_offset(avi_t *AVI) { return AVI->v_codech_off; } + +long AVI_video_codecf_offset(avi_t *AVI) { return AVI->v_codecf_off; } + +long AVI_frame_size(avi_t *AVI, long frame) { + if (AVI->mode == AVI_MODE_WRITE) { + AVI_errno = AVI_ERR_NOT_PERM; + return -1; + } + if (!AVI->video_index) { + AVI_errno = AVI_ERR_NO_IDX; + return -1; + } + + if (frame < 0 || frame >= AVI->video_frames) + return 0; + return (AVI->video_index[frame].len); +} + +long AVI_audio_size(avi_t *AVI, long frame) { + if (AVI->mode == AVI_MODE_WRITE) { + AVI_errno = AVI_ERR_NOT_PERM; + return -1; + } + if (!AVI->track[AVI->aptr].audio_index) { + AVI_errno = AVI_ERR_NO_IDX; + return -1; + } + + if (frame < 0 || frame >= AVI->track[AVI->aptr].audio_chunks) + return 0; + return (AVI->track[AVI->aptr].audio_index[frame].len); +} + +long AVI_get_video_position(avi_t *AVI, long frame) { + if (AVI->mode == AVI_MODE_WRITE) { + AVI_errno = AVI_ERR_NOT_PERM; + return -1; + } + if (!AVI->video_index) { + AVI_errno = AVI_ERR_NO_IDX; + return -1; + } + + if (frame < 0 || frame >= AVI->video_frames) + return 0; + return 
(AVI->video_index[frame].pos); +} + +int AVI_seek_start(avi_t *AVI) { + if (AVI->mode == AVI_MODE_WRITE) { + AVI_errno = AVI_ERR_NOT_PERM; + return -1; + } + + lseek(AVI->fdes, AVI->movi_start, SEEK_SET); + AVI->video_pos = 0; + return 0; +} + +int AVI_set_video_position(avi_t *AVI, long frame) { + if (AVI->mode == AVI_MODE_WRITE) { + AVI_errno = AVI_ERR_NOT_PERM; + return -1; + } + if (!AVI->video_index) { + AVI_errno = AVI_ERR_NO_IDX; + return -1; + } + + if (frame < 0) + frame = 0; + AVI->video_pos = frame; + return 0; +} + +int AVI_set_audio_bitrate(avi_t *AVI, long bitrate) { + if (AVI->mode == AVI_MODE_READ) { + AVI_errno = AVI_ERR_NOT_PERM; + return -1; + } + + AVI->track[AVI->aptr].mp3rate = bitrate; + return 0; +} + +long AVI_read_frame(avi_t *AVI, char *vidbuf, int *keyframe) { + long n; + if (AVI->mode == AVI_MODE_WRITE) { + AVI_errno = AVI_ERR_NOT_PERM; + return -1; + } + if (!AVI->video_index) { + AVI_errno = AVI_ERR_NO_IDX; + return -1; + } + if (AVI->video_pos < 0 || AVI->video_pos >= AVI->video_frames) + return -1; + n = AVI->video_index[AVI->video_pos].len; + *keyframe = (AVI->video_index[AVI->video_pos].key == 0x10) ? 
1 : 0; + lseek(AVI->fdes, AVI->video_index[AVI->video_pos].pos, SEEK_SET); + if (avi_read(AVI->fdes, vidbuf, n) != n) { + AVI_errno = AVI_ERR_READ; + return -1; + } + AVI->video_pos++; + return n; +} + +int AVI_set_audio_position(avi_t *AVI, long byte) { + long n0, n1, n; + + if (AVI->mode == AVI_MODE_WRITE) { + AVI_errno = AVI_ERR_NOT_PERM; + return -1; + } + if (!AVI->track[AVI->aptr].audio_index) { + AVI_errno = AVI_ERR_NO_IDX; + return -1; + } + + if (byte < 0) + byte = 0; + + /* Binary search in the audio chunks */ + + n0 = 0; + n1 = AVI->track[AVI->aptr].audio_chunks; + + while (n0 < n1 - 1) { + n = (n0 + n1) / 2; + if (AVI->track[AVI->aptr].audio_index[n].tot > byte) + n1 = n; + else + n0 = n; + } + + AVI->track[AVI->aptr].audio_posc = n0; + AVI->track[AVI->aptr].audio_posb = + byte - AVI->track[AVI->aptr].audio_index[n0].tot; + + return 0; +} + +long AVI_read_audio(avi_t *AVI, char *audbuf, long bytes) { + long nr, pos, left, todo; + + if (AVI->mode == AVI_MODE_WRITE) { + AVI_errno = AVI_ERR_NOT_PERM; + return -1; + } + if (!AVI->track[AVI->aptr].audio_index) { + AVI_errno = AVI_ERR_NO_IDX; + return -1; + } + + nr = 0; /* total number of bytes read */ + + while (bytes > 0) { + left = AVI->track[AVI->aptr] + .audio_index[AVI->track[AVI->aptr].audio_posc] + .len - + AVI->track[AVI->aptr].audio_posb; + if (left == 0) { + if (AVI->track[AVI->aptr].audio_posc >= + AVI->track[AVI->aptr].audio_chunks - 1) + return nr; + AVI->track[AVI->aptr].audio_posc++; + AVI->track[AVI->aptr].audio_posb = 0; + continue; + } + if (bytes < left) + todo = bytes; + else + todo = left; + pos = AVI->track[AVI->aptr] + .audio_index[AVI->track[AVI->aptr].audio_posc] + .pos + + AVI->track[AVI->aptr].audio_posb; + lseek(AVI->fdes, pos, SEEK_SET); + if (avi_read(AVI->fdes, audbuf + nr, todo) != todo) { + AVI_errno = AVI_ERR_READ; + return -1; + } + bytes -= todo; + nr += todo; + AVI->track[AVI->aptr].audio_posb += todo; + } + + return nr; +} + +/* AVI_read_data: Special routine for 
reading the next audio or video chunk
+   without having an index of the file.
+
+   Starting at the current file position, 8-byte chunk headers (4-byte tag +
+   4-byte length) are read one after another:
+     - a "LIST" tag is stepped into by skipping 4 more bytes;
+     - a tag whose first 3 characters match AVI->video_tag is a video chunk;
+     - a tag whose 4 characters match the current track's audio_tag is an
+       audio chunk;
+     - any other chunk is skipped.
+   Tag comparisons are case-insensitive. *len is written only for video and
+   audio chunks and holds the length as returned by PAD_EVEN. When the
+   caller's buffer is too small the payload is skipped, not read. */
+
+int AVI_read_data(avi_t *AVI, char *vidbuf, long max_vidbuf, char *audbuf,
+                  long max_audbuf, long *len) {
+
+  /*
+   * Return codes:
+   *
+   * 1 = video data read
+   * 2 = audio data read
+   * 0 = nothing read: file is open for writing, a read came up short
+   *     (EOF or error), or skipping an unrelated chunk failed
+   * -1 = video buffer too small
+   * -2 = audio buffer too small
+   */
+
+  /* NOTE(review): n is an int but is assigned from PAD_EVEN(str2ulong(...));
+     a very large length field could presumably wrap negative and slip past
+     the buffer-size checks below - confirm against the helper definitions. */
+  int n;
+  char data[8];
+
+  if (AVI->mode == AVI_MODE_WRITE)
+    return 0;
+
+  while (1) {
+    /* Read tag and length */
+
+    if (avi_read(AVI->fdes, data, 8) != 8)
+      return 0;
+
+    /* if we got a list tag, ignore it */
+
+    if (strncasecmp(data, "LIST", 4) == 0) {
+      lseek(AVI->fdes, 4, SEEK_CUR);
+      continue;
+    }
+
+    /* length field is bytes 4..7 of the header */
+    n = PAD_EVEN(str2ulong((unsigned char *)data + 4));
+
+    if (strncasecmp(data, AVI->video_tag, 3) == 0) {
+      *len = n;
+      /* the frame counter advances even when the frame is skipped below */
+      AVI->video_pos++;
+      if (n > max_vidbuf) {
+        lseek(AVI->fdes, n, SEEK_CUR);
+        return -1;
+      }
+      if (avi_read(AVI->fdes, vidbuf, n) != n)
+        return 0;
+      return 1;
+    } else if (strncasecmp(data, AVI->track[AVI->aptr].audio_tag, 4) == 0) {
+      *len = n;
+      if (n > max_audbuf) {
+        lseek(AVI->fdes, n, SEEK_CUR);
+        return -2;
+      }
+      if (avi_read(AVI->fdes, audbuf, n) != n)
+        return 0;
+      return 2;
+      break; /* unreachable: follows the return above */
+    } else if (lseek(AVI->fdes, n, SEEK_CUR) < 0)
+      return 0;
+  }
+}
+
+/* AVI_print_error: Print most recent error (similar to perror) */
+
+/* avi_errors: message table indexed by AVI_errno code; the last entry is the
+   fallback used for codes outside the table. */
+char *(avi_errors[]) = {
+    /* 0 */ (char *)"avilib - No Error",
+    /* 1 */ (char *)"avilib - AVI file size limit reached",
+    /* 2 */ (char *)"avilib - Error opening AVI file",
+    /* 3 */ (char *)"avilib - Error reading from AVI file",
+    /* 4 */ (char *)"avilib - Error writing to AVI file",
+    /* 5 */ (char *)"avilib - Error writing index (file may still be useable)",
+    /* 6 */ (char *)"avilib - Error closing AVI file",
+    /* 7 */ (char *)"avilib - Operation (read/write) not permitted",
+    /* 8 */ (char *)"avilib - Out of memory (malloc failed)",
+    /* 9 */ (char *)"avilib - Not an AVI file",
+    /* 10 */ (char *)"avilib - AVI file has no header list (corrupted?)",
+    /* 11
*/ (char *)"avilib - AVI file has no MOVI list (corrupted?)",
+    /* 12 */ (char *)"avilib - AVI file has no video data",
+    /* 13 */ (char *)"avilib - operation needs an index",
+    /* 14 */ (char *)"avilib - Unkown Error"};
+static int num_avi_errors = sizeof(avi_errors) / sizeof(char *);
+
+/* Scratch buffer returned by AVI_strerror; overwritten by each call. */
+static char error_string[4096];
+
+/* AVI_print_error: write "<str>: <avilib message>" for the current AVI_errno
+   to stderr. For the I/O related codes (open/read/write/write-index/close) a
+   second line with the system error text is printed through perror.
+   Codes outside the message table are reported with the table's last entry. */
+void AVI_print_error(char *str) {
+  /* Take a snapshot of errno before any stdio call: fprintf is allowed to
+     modify errno, which would make perror below report the wrong reason. */
+  int saved_errno = errno;
+  int aerrno;
+
+  aerrno = (AVI_errno >= 0 && AVI_errno < num_avi_errors) ? AVI_errno
+                                                          : num_avi_errors - 1;
+
+  fprintf(stderr, "%s: %s\n", str, avi_errors[aerrno]);
+
+  /* for the following errors, perror should report a more detailed reason: */
+
+  if (AVI_errno == AVI_ERR_OPEN || AVI_errno == AVI_ERR_READ ||
+      AVI_errno == AVI_ERR_WRITE || AVI_errno == AVI_ERR_WRITE_INDEX ||
+      AVI_errno == AVI_ERR_CLOSE) {
+    errno = saved_errno;
+    perror("REASON");
+  }
+}
+
+/* AVI_strerror: return the message for the current AVI_errno. For the I/O
+   related codes the system error text for errno is appended and the result
+   lives in a static buffer that the next call overwrites (so the function is
+   not reentrant); otherwise a pointer into the message table is returned.
+   The caller must not free or modify the returned string. */
+char *AVI_strerror() {
+  int aerrno;
+
+  aerrno = (AVI_errno >= 0 && AVI_errno < num_avi_errors) ? AVI_errno
+                                                          : num_avi_errors - 1;
+
+  if (AVI_errno == AVI_ERR_OPEN || AVI_errno == AVI_ERR_READ ||
+      AVI_errno == AVI_ERR_WRITE || AVI_errno == AVI_ERR_WRITE_INDEX ||
+      AVI_errno == AVI_ERR_CLOSE) {
+    /* bounded write: never runs past error_string, truncates instead */
+    snprintf(error_string, sizeof(error_string), "%s - %s", avi_errors[aerrno],
+             strerror(errno));
+    return error_string;
+  }
+
+  return avi_errors[aerrno];
+}
+
+/* AVI_max_size: the AVI_MAX_LEN limit as a 64-bit value. */
+uint64_t AVI_max_size() { return ((uint64_t)AVI_MAX_LEN); }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/heartwall/AVI/avilib.h b/examples/heartwall/AVI/avilib.h
new file mode 100644
index 0000000..57d2a97
--- /dev/null
+++ b/examples/heartwall/AVI/avilib.h
@@ -0,0 +1,317 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * avilib.h
+ *
+ * Copyright (C) Thomas Östreich - June 2001
+ * multiple audio track support Copyright (C) 2002 Thomas Östreich
+ *
+ * Original code:
+ * Copyright (C) 1999 Rainer Johanni
+ *
+ * This file is part of transcode, a linux video stream processing tool
+ *
+ * transcode is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * transcode is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Make; see the file COPYING. If not, write to
+ * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+/* NOTE(review): the header names are missing from the #include lines below in
+   this copy of the patch (presumably stripped as <...> markup) - restore them
+   from the original avilib.h before applying. */
+#include
+#include
+#include
+#include
+#include
+// #include
+#include
+#include
+#include
+#include
+#include
+
+#ifndef AVILIB_H
+#define AVILIB_H
+
+/* Size of the audio track table inside avi_t */
+#define AVI_MAX_TRACKS 8
+
+/* One entry of the video index: one frame */
+typedef struct {
+  unsigned long key; /* compared against 0x10 by AVI_read_frame (keyframe) */
+  unsigned long pos; /* file offset the frame is read from */
+  unsigned long len; /* number of bytes read for the frame */
+} video_index_entry;
+
+/* One entry of an audio index: one audio chunk */
+typedef struct {
+  unsigned long pos; /* file offset the chunk is read from */
+  unsigned long len; /* number of bytes in the chunk */
+  unsigned long tot; /* cumulative offset used to locate a byte position */
+} audio_index_entry;
+
+/* Per audio track state */
+typedef struct track_s {
+
+  long a_fmt;   /* Audio format, see #defines below */
+  long a_chans; /* Audio channels, 0 for no audio */
+  long a_rate;  /* Rate in Hz */
+  long a_bits;  /* bits per audio sample */
+  long mp3rate; /* mp3 bitrate kbs */
+
+  long audio_strn;   /* Audio stream number */
+  long audio_bytes;  /* Total number of bytes of audio data */
+  long audio_chunks; /* Chunks of audio data in the file */
+
+  char audio_tag[4]; /* Tag of audio data */
+  long audio_posc;   /* Audio position: chunk */
+  long audio_posb;   /* Audio position: byte within chunk */
+
+  /* absolute offsets of the audio codec information; presumably the strh and
+     strf headers, by analogy with v_codech_off / v_codecf_off - confirm */
+  long a_codech_off;
+  long a_codecf_off;
+
+  audio_index_entry *audio_index;
+
+} track_t;
+
+/* State of one open AVI file */
+typedef struct {
+
+  long fdes; /* File descriptor of AVI file */
+  long mode; /* AVI_MODE_WRITE (0) or AVI_MODE_READ (1), see below */
+
+  long width;          /* Width of a video frame */
+  long height;         /* Height of a video frame */
+  double fps;          /* Frames per second */
+  char compressor[8];  /* Type of compressor, 4 bytes + padding for 0 byte */
+  char compressor2[8]; /* Type of compressor, 4 bytes + padding for 0 byte */
+  long video_strn;     /* Video stream number */
+  long video_frames;   /* Number of video frames */
+  char video_tag[4];   /* Tag of video data */
+  long video_pos;      /* Number of next frame to be read
+                          (if index present) */
+
+  unsigned long max_len; /* maximum video chunk present */
+
+  track_t track[AVI_MAX_TRACKS]; // up to AVI_MAX_TRACKS audio tracks supported
+
+  unsigned long pos; /* position in file */
+  long n_idx;        /* number of index entries actually filled */
+  long max_idx;      /* number of index entries actually allocated */
+
+  long v_codech_off; /* absolute offset of video codec (strh) info */
+  long v_codecf_off; /* absolute offset of video codec (strf) info */
+
+  unsigned char (*idx)[16]; /* index entries (AVI idx1 tag) */
+  video_index_entry *video_index;
+
+  unsigned long last_pos;   /* Position of last frame written */
+  unsigned long last_len;   /* Length of last frame written */
+  int must_use_index;       /* Flag if frames are duplicated */
+  unsigned long movi_start; /* offset AVI_seek_start seeks to */
+
+  int anum; // total number of audio tracks
+  int aptr; // current audio working track
+
+} avi_t;
+
+/* Values of avi_t.mode */
+#define AVI_MODE_WRITE 0
+#define AVI_MODE_READ 1
+
+/* The error codes delivered by avi_open_input_file */
+
+#define AVI_ERR_SIZELIM \
+  1 /* The write of the data would exceed \
+       the maximum size of the AVI file. \
+       This is more a warning than an \
+       error since the file may be closed safely */
+
+#define AVI_ERR_OPEN \
+  2 /* Error opening the AVI file - wrong path \
+       name or file not readable/writable \
+     */
+
+#define AVI_ERR_READ 3 /* Error reading from AVI File */
+
+#define AVI_ERR_WRITE \
+  4 /* Error writing to AVI File, \
+       disk full ??? */
+
+#define AVI_ERR_WRITE_INDEX \
+  5 /* Could not write index to AVI file \
+       during close, file may still be \
+       usable */
+
+#define AVI_ERR_CLOSE \
+  6 /* Could not write header to AVI file \
+       or not truncate the file during \
+       close, file is most probably corrupted */
+
+#define AVI_ERR_NOT_PERM \
+  7 /* Operation not permitted: \
+       trying to read from a file open \
+       for writing or vice versa */
+
+#define AVI_ERR_NO_MEM 8 /* malloc failed */
+
+#define AVI_ERR_NO_AVI 9 /* Not an AVI file */
+
+#define AVI_ERR_NO_HDRL \
+  10 /* AVI file has no header list, \
+        corrupted ??? */
+
+#define AVI_ERR_NO_MOVI \
+  11 /* AVI file has no MOVI list, \
+        corrupted ??? */
+
+#define AVI_ERR_NO_VIDS 12 /* AVI file contains no video data */
+
+#define AVI_ERR_NO_IDX \
+  13 /* The file has been opened with \
+        getIndex==0, but an operation has \
+        been performed that needs an index */
+
+/* Possible Audio formats */
+
+#ifndef WAVE_FORMAT_PCM
+#define WAVE_FORMAT_UNKNOWN (0x0000)
+#define WAVE_FORMAT_PCM (0x0001)
+#define WAVE_FORMAT_ADPCM (0x0002)
+#define WAVE_FORMAT_IBM_CVSD (0x0005)
+#define WAVE_FORMAT_ALAW (0x0006)
+#define WAVE_FORMAT_MULAW (0x0007)
+#define WAVE_FORMAT_OKI_ADPCM (0x0010)
+#define WAVE_FORMAT_DVI_ADPCM (0x0011)
+#define WAVE_FORMAT_DIGISTD (0x0015)
+#define WAVE_FORMAT_DIGIFIX (0x0016)
+#define WAVE_FORMAT_YAMAHA_ADPCM (0x0020)
+#define WAVE_FORMAT_DSP_TRUESPEECH (0x0022)
+#define WAVE_FORMAT_GSM610 (0x0031)
+#define IBM_FORMAT_MULAW (0x0101)
+#define IBM_FORMAT_ALAW (0x0102)
+#define IBM_FORMAT_ADPCM (0x0103)
+#endif
+
+/* Writing side */
+avi_t *AVI_open_output_file(char *filename);
+void AVI_set_video(avi_t *AVI, int width, int height, double fps,
+                   char *compressor);
+void AVI_set_audio(avi_t *AVI, int channels, long rate, int bits, int format,
+                   long mp3rate);
+int AVI_write_frame(avi_t *AVI, char *data, long bytes, int keyframe);
+int AVI_dup_frame(avi_t *AVI);
+int AVI_write_audio(avi_t *AVI, char *data, long bytes);
+int AVI_append_audio(avi_t *AVI, char *data, long bytes);
+long AVI_bytes_remain(avi_t *AVI);
+int AVI_close(avi_t *AVI);
+long AVI_bytes_written(avi_t *AVI);
+
+/* Reading side: open and stream properties */
+avi_t *AVI_open_input_file(char *filename, int getIndex);
+avi_t *AVI_open_fd(int fd, int getIndex);
+int avi_parse_input_file(avi_t *AVI, int getIndex);
+long AVI_audio_mp3rate(avi_t *AVI);
+long AVI_video_frames(avi_t *AVI);
+int AVI_video_width(avi_t *AVI);
+int AVI_video_height(avi_t *AVI);
+double AVI_frame_rate(avi_t *AVI);
+char *AVI_video_compressor(avi_t *AVI);
+
+int AVI_audio_channels(avi_t *AVI);
+int AVI_audio_bits(avi_t *AVI);
+int AVI_audio_format(avi_t *AVI);
+long AVI_audio_rate(avi_t *AVI);
+long AVI_audio_bytes(avi_t *AVI);
+long AVI_audio_chunks(avi_t *AVI);
+
+long AVI_max_video_chunk(avi_t *AVI);
+
+/* Reading side: positioning and frame access */
+long AVI_frame_size(avi_t *AVI, long frame);
+long AVI_audio_size(avi_t *AVI, long frame);
+int AVI_seek_start(avi_t *AVI);
+int AVI_set_video_position(avi_t *AVI, long frame);
+long AVI_get_video_position(avi_t *AVI, long frame);
+/* copies the whole stored frame into vidbuf; returns its length or -1 */
+long AVI_read_frame(avi_t *AVI, char *vidbuf, int *keyframe);
+
+int AVI_set_audio_position(avi_t *AVI, long byte);
+int AVI_set_audio_bitrate(avi_t *AVI, long bitrate);
+
+long AVI_read_audio(avi_t *AVI, char *audbuf, long bytes);
+
+long AVI_audio_codech_offset(avi_t *AVI);
+long AVI_audio_codecf_offset(avi_t *AVI);
+long AVI_video_codech_offset(avi_t *AVI);
+long AVI_video_codecf_offset(avi_t *AVI);
+
+/* index-less sequential read: returns 1 (video), 2 (audio), 0 (nothing read),
+   -1 / -2 (video / audio buffer too small) */
+int AVI_read_data(avi_t *AVI, char *vidbuf, long max_vidbuf, char *audbuf,
+                  long max_audbuf, long *len);
+
+/* Error reporting */
+void AVI_print_error(char *str);
+char *AVI_strerror();
+char *AVI_syserror();
+
+int AVI_scan(char *name);
+int AVI_dump(char *name, int mode);
+
+char *AVI_codec2str(short cc);
+int AVI_file_check(char *import_file);
+
+void AVI_info(avi_t *avifile);
+uint64_t AVI_max_size();
+int avi_update_header(avi_t *AVI);
+
+int AVI_set_audio_track(avi_t *AVI, int track);
+int AVI_get_audio_track(avi_t *AVI);
+int AVI_audio_tracks(avi_t *AVI);
+
+/* NOTE(review): the structs below use unsigned long / long members; where
+   long is 8 bytes they presumably no longer match the 32-bit on-disk
+   RIFF/AVI field sizes - confirm they are never read or written directly. */
+struct riff_struct {
+  unsigned char id[4]; /* RIFF */
+  unsigned long len;
+  unsigned char wave_id[4]; /* WAVE */
+};
+
+struct chunk_struct {
+  unsigned char id[4];
+  unsigned long len;
+};
+
+struct common_struct {
+  unsigned short wFormatTag;
+  unsigned short wChannels;
+  unsigned long dwSamplesPerSec;
+  unsigned long dwAvgBytesPerSec;
+  unsigned short wBlockAlign;
+  unsigned short wBitsPerSample; /* Only for PCM */
+};
+
+struct wave_header {
+  struct riff_struct riff;
+  struct chunk_struct format;
+  struct common_struct common;
+  struct chunk_struct data;
+};
+
+struct AVIStreamHeader {
+  long fccType;
+  long fccHandler;
+  long dwFlags;
+  long dwPriority;
+  long dwInitialFrames;
+  long dwScale;
+  long dwRate;
+  long dwStart;
+  long dwLength;
+  long dwSuggestedBufferSize;
+  long dwQuality;
+  long dwSampleSize;
+};
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/heartwall/AVI/avimod.c b/examples/heartwall/AVI/avimod.c
new file mode 100644
index 0000000..64d1edb
--- /dev/null
+++ b/examples/heartwall/AVI/avimod.c
@@ -0,0 +1,130 @@
+// #ifdef __cplusplus
+// extern "C" {
+// #endif
+
+//==============================================================================
+// DEFINE / INCLUDE
+//==============================================================================
+#include "avimod.h"
+
+//==============================================================================
+// FUNCTIONS
+//==============================================================================
+
+// Flips the
specified image and crops it to the specified dimensions
+// If scaled == true, all values are scaled to the range [0.0, 1.0]
+//
+// image     : height * width 8-bit pixels, row-major; rows are read from the
+//             last one to the first, which performs the vertical flip
+// cropped   : 1 selects the hard-coded crop rectangle, anything else keeps the
+//             whole frame
+//             NOTE(review): that rectangle is currently (0,0)-(0,0), i.e. a
+//             1x1 result - confirm whether real crop bounds were intended
+// scaled    : non-zero maps pixel values 0..255 to 0.0..1.0
+// converted : 1 returns the data column-major (transposed), anything else
+//             returns it row-major
+//
+// Returns a newly malloc'ed array of height_new * width_new values that the
+// caller must release with free(), or NULL when the input is empty or an
+// allocation fails.
+fp *chop_flip_image(char *image, int height, int width, int cropped, int scaled,
+                    int converted) {
+
+  // fixed dimensions for cropping or not cropping, square vertices starting
+  // from initial point in top left corner going down and right
+  int top = 0;
+  int left = 0;
+  int bottom = (cropped == 1) ? 0 : height - 1;
+  int right = (cropped == 1) ? 0 : width - 1;
+
+  // dimensions of new cropped image
+  int height_new = bottom - top + 1;
+  int width_new = right - left + 1;
+  if (image == NULL || height_new <= 0 || width_new <= 0) {
+    return NULL;
+  }
+
+  // counters
+  int i, j;
+
+  // allocate memory for cropped/flipped frame (size computed in size_t so the
+  // product cannot overflow int)
+  size_t count = (size_t)height_new * (size_t)width_new;
+  fp *result = (fp *)malloc(count * sizeof(fp));
+  if (result == NULL) {
+    return NULL;
+  }
+
+  // crop/flip and scale frame. Pixels are read as unsigned 8-bit values, so
+  // bytes above 127 stay in 128..255 whether or not plain char is signed, and
+  // scaling afterwards keeps every result inside [0.0, 1.0].
+  const fp scale = 1.0 / 255.0;
+  for (i = 0; i < height_new; i++) {  // rows
+    for (j = 0; j < width_new; j++) { // columns
+      unsigned char pixel =
+          (unsigned char)image[((height - 1 - (i + top)) * width) + (j + left)];
+      result[i * width_new + j] = scaled ? (fp)pixel * scale : (fp)pixel;
+    }
+  }
+
+  // row-major output requested: hand the buffer over as it is. (It must not be
+  // freed here - the caller owns it from now on.)
+  if (converted != 1) {
+    return result;
+  }
+
+  // convert storage method (from row-major to column-major)
+  fp *result_converted = (fp *)malloc(count * sizeof(fp));
+  if (result_converted == NULL) {
+    free(result);
+    return NULL;
+  }
+  for (i = 0; i < width_new; i++) {    // columns of the source
+    for (j = 0; j < height_new; j++) { // rows of the source
+      result_converted[i * height_new + j] = result[j * width_new + i];
+    }
+  }
+  free(result);
+
+  // return
+  return result_converted;
+}
+
+// Returns the specified frame from the specified video file
+//
If cropped == true, the frame is cropped to pre-determined dimensions
+// (hardcoded to the boundaries of the blood vessel in the test video)
+// If scaled == true, all values are scaled to the range [0.0, 1.0]
+// NOTE(review): chop_flip_image's crop rectangle is a single pixel, not a
+// vessel-sized window - confirm which of the two is intended.
+//
+// cell_file : AVI file opened for reading with an index
+// frame_num : frame to fetch; negative values select frame 0
+// cropped, scaled, converted : passed through to chop_flip_image
+//
+// Returns the array produced by chop_flip_image; the caller releases it with
+// free(). Terminates the process with exit(-1) when the frame buffer cannot
+// be allocated or the frame cannot be read.
+fp *get_frame(avi_t *cell_file, int frame_num, int cropped, int scaled,
+              int converted) {
+
+  // variable
+  int dummy; // receives the keyframe flag from AVI_read_frame; unused here
+  int width = AVI_video_width(cell_file);
+  int height = AVI_video_height(cell_file);
+  long status; // AVI_read_frame returns long
+
+  // AVI_set_video_position clamps negative frame numbers to 0; apply the same
+  // clamp here so the size query below refers to the frame that is read
+  if (frame_num < 0) {
+    frame_num = 0;
+  }
+
+  // There are 600 frames in this file (i.e. frame_num = 600 causes an error)
+  AVI_set_video_position(cell_file, frame_num);
+
+  // AVI_read_frame copies the frame's whole stored length into the buffer, so
+  // the buffer has to hold the larger of width * height and that length.
+  // AVI_frame_size (declared in avilib.h) is expected to report the stored
+  // length; non-positive results leave the width * height size in place.
+  size_t buf_bytes = (size_t)width * (size_t)height;
+  long frame_bytes = AVI_frame_size(cell_file, frame_num);
+  if (frame_bytes > 0 && (size_t)frame_bytes > buf_bytes) {
+    buf_bytes = (size_t)frame_bytes;
+  }
+
+  // Read in the frame from the AVI (buffer is zero-filled so a frame shorter
+  // than width * height never exposes uninitialised bytes)
+  char *image_buf = (char *)calloc(buf_bytes, sizeof(char));
+  if (image_buf == NULL) {
+    AVI_print_error((char *)"Error allocating frame buffer");
+    exit(-1);
+  }
+  status = AVI_read_frame(cell_file, image_buf, &dummy);
+  if (status == -1) {
+    AVI_print_error((char *)"Error with AVI_read_frame");
+    exit(-1);
+  }
+
+  // The image is read in upside-down, so we need to flip it
+  fp *image_chopped;
+  image_chopped =
+      chop_flip_image(image_buf, height, width, cropped, scaled, converted);
+
+  // free image buffer
+  free(image_buf);
+
+  // return
+  return image_chopped;
+}
+
+// #ifdef __cplusplus
+// }
+// #endif
diff --git a/examples/heartwall/AVI/avimod.h b/examples/heartwall/AVI/avimod.h
new file mode 100644
index 0000000..f912014
--- /dev/null
+++ b/examples/heartwall/AVI/avimod.h
@@ -0,0 +1,24 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//===============================================================================================================================================================================================================
+// DEFINE / INCLUDE
+//===============================================================================================================================================================================================================
+#define fp float
+
+#include "avilib.h"
+
+//==============================================================================
+// FUNCTION PROTOTYPES
+//==============================================================================
+
+// Flips a height x width 8-bit image vertically and converts it to fp values.
+// cropped == 1 selects the built-in crop rectangle, a non-zero scaled applies
+// the 1/255 scale factor, converted == 1 returns column-major data.
+// The result is heap-allocated.
+fp *chop_flip_image(char *image, int height, int width, int cropped, int scaled,
+                    int converted);
+
+// Reads frame frame_num from an open AVI file and returns it processed by
+// chop_flip_image with the same cropped / scaled / converted flags.
+fp *get_frame(avi_t *cell_file, int frame_num, int cropped, int scaled,
+              int converted);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/heartwall/define.c b/examples/heartwall/define.c
new file mode 100644
index 0000000..6603910
--- /dev/null
+++ b/examples/heartwall/define.c
@@ -0,0 +1,396 @@
+//==============================================================================
+//==============================================================================
+// DEFINE / INCLUDE
+//==============================================================================
+//==============================================================================
+
+#define fp float
+
+// NUMBER_THREADS is taken from the first of RD_WG_SIZE_0_0, RD_WG_SIZE_0 and
+// RD_WG_SIZE that is defined, and falls back to 256.
+/* #define NUMBER_THREADS 512 */
+#ifdef RD_WG_SIZE_0_0
+#define NUMBER_THREADS RD_WG_SIZE_0_0
+#elif defined(RD_WG_SIZE_0)
+#define NUMBER_THREADS RD_WG_SIZE_0
+#elif defined(RD_WG_SIZE)
+#define NUMBER_THREADS RD_WG_SIZE
+#else
+#define NUMBER_THREADS 256
+#endif
+
+// Number of tracked points; ALL_POINTS (51) is ENDO_POINTS + EPI_POINTS
+// (20 + 31).
+#define ENDO_POINTS 20
+#define EPI_POINTS 31
+#define ALL_POINTS 51
+
+//==============================================================================
+//==============================================================================
+// PARAMS_COMMON_CHANGE STRUCT
+//==============================================================================
+//==============================================================================
+
+// Holds the frame pointer and frame number, kept apart from params_common.
+// NOTE(review): the d_ prefix presumably marks device (GPU) pointers - confirm
+// against the code that allocates these.
+typedef struct params_common_change {
+
+  //============================================================================
+  // FRAME
+  //============================================================================
+
+  fp *d_frame;
+  int frame_no;
+
+} params_common_change;
+
+//==============================================================================
+//==============================================================================
+// PARAMS_COMMON STRUCTURE
+//==============================================================================
+//==============================================================================
+
+// Sizes, offsets and point lists, grouped by processing stage in the section
+// banners below.
+// NOTE(review): the recurring *_rows / *_cols / *_elem / *_mem quadruples
+// presumably hold an array's row count, column count, element count and size
+// in bytes - confirm where they are initialised.
+typedef struct params_common {
+
+  //============================================================================
+  // HARDCODED INPUTS FROM MATLAB
+  //============================================================================
+
+  //==================================================
+  // CONSTANTS
+  //==================================================
+
+  int sSize;
+  int tSize;
+  int maxMove;
+  fp alpha;
+
+  //==================================================
+  // FRAME
+  //==================================================
+
+  int no_frames;
+  int frame_rows;
+  int frame_cols;
+  int frame_elem;
+  int frame_mem;
+
+  //==================================================
+  // ENDO POINTS
+  //==================================================
+
+  int endoPoints;
+  int endo_mem;
+
+  int *endoRow;
+  int *endoCol;
+  int *tEndoRowLoc;
+  int *tEndoColLoc;
+
+  int *d_endoRow;
+  int *d_endoCol;
+  int *d_tEndoRowLoc;
+  int *d_tEndoColLoc;
+
+  fp *d_endoT;
+
+  //==================================================
+  // EPI POINTS
+  //==================================================
+  int epiPoints;
+  int epi_mem;
+
+  int *epiRow;
+  int *epiCol;
+  int *tEpiRowLoc;
+  int *tEpiColLoc;
+
+  int *d_epiRow;
+  int *d_epiCol;
+  int *d_tEpiRowLoc;
+  int *d_tEpiColLoc;
+
+  fp *d_epiT;
+
+  //==================================================
+  // ALL POINTS
+  //==================================================
+
+  int allPoints;
+
+  //============================================================================
+  // RIGHT TEMPLATE FROM TEMPLATE ARRAY
+  //============================================================================
+
+  int in_rows;
+  int in_cols;
+  int in_elem;
+  int in_mem;
+
+  //============================================================================
+  // AREA AROUND POINT FROM FRAME
+  //============================================================================
+
+  int in2_rows;
+  int in2_cols;
+  int in2_elem;
+  int in2_mem;
+
+  //============================================================================
+  // CONVOLUTION
+  //============================================================================
+
+  int conv_rows;
+  int conv_cols;
+  int conv_elem;
+  int conv_mem;
+  int ioffset;
+  int joffset;
+
+  //============================================================================
+  // CUMULATIVE SUM 1
+  //============================================================================
+
+  //==================================================
+  // PAD ARRAY, VERTICAL CUMULATIVE SUM
+  //==================================================
+
+  int in2_pad_add_rows;
+  int in2_pad_add_cols;
+  int in2_pad_cumv_rows;
+  int in2_pad_cumv_cols;
+  int in2_pad_cumv_elem;
+  int in2_pad_cumv_mem;
+
+  //==================================================
+  // SELECTION
+  //==================================================
+
+  int in2_pad_cumv_sel_rows;
+  int in2_pad_cumv_sel_cols;
+  int in2_pad_cumv_sel_elem;
+  int in2_pad_cumv_sel_mem;
+  int in2_pad_cumv_sel_rowlow;
+  int in2_pad_cumv_sel_rowhig;
+  int in2_pad_cumv_sel_collow;
+  int in2_pad_cumv_sel_colhig;
+
+  //==================================================
+  // SELECTION 2, SUBTRACTION, HORIZONTAL CUMULATIVE SUM
+  //==================================================
+
+  int in2_pad_cumv_sel2_rowlow;
+  int in2_pad_cumv_sel2_rowhig;
+  int in2_pad_cumv_sel2_collow;
+  int in2_pad_cumv_sel2_colhig;
+  int in2_sub_cumh_rows;
+  int in2_sub_cumh_cols;
+  int in2_sub_cumh_elem;
+  int in2_sub_cumh_mem;
+
+  //==================================================
+  // SELECTION
+  //==================================================
+
+  int in2_sub_cumh_sel_rows;
+  int in2_sub_cumh_sel_cols;
+  int in2_sub_cumh_sel_elem;
+  int in2_sub_cumh_sel_mem;
+  int in2_sub_cumh_sel_rowlow;
+  int in2_sub_cumh_sel_rowhig;
+  int in2_sub_cumh_sel_collow;
+  int in2_sub_cumh_sel_colhig;
+
+  //==================================================
+  // SELECTION 2, SUBTRACTION
+  //==================================================
+
+  int in2_sub_cumh_sel2_rowlow;
+  int in2_sub_cumh_sel2_rowhig;
+  int in2_sub_cumh_sel2_collow;
+  int in2_sub_cumh_sel2_colhig;
+  int in2_sub2_rows;
+  int in2_sub2_cols;
+  int in2_sub2_elem;
+  int in2_sub2_mem;
+
+  //============================================================================
+  // CUMULATIVE SUM 2
+  //============================================================================
+
+  //==================================================
+  // MULTIPLICATION
+  //==================================================
+
+  int in2_sqr_rows;
+  int in2_sqr_cols;
+  int in2_sqr_elem;
+  int in2_sqr_mem;
+
+  //==================================================
+  // SELECTION 2, SUBTRACTION
+  //==================================================
+
+  int in2_sqr_sub2_rows;
+  int in2_sqr_sub2_cols;
+  int in2_sqr_sub2_elem;
+  int in2_sqr_sub2_mem;
+
+  //============================================================================
+  // FINAL
+  //============================================================================
+
+  int in_sqr_rows;
+  int in_sqr_cols;
+  int in_sqr_elem;
+  int in_sqr_mem;
+
+  //============================================================================
+  // TEMPLATE MASK CREATE
+  //============================================================================
+
+  int tMask_rows;
+  int tMask_cols;
+  int tMask_elem;
+  int tMask_mem;
+
+  //============================================================================
+  // POINT MASK INITIALIZE
+  //============================================================================
+
+  int mask_rows;
+  int mask_cols;
+  int mask_elem;
+  int mask_mem;
+
+  //============================================================================
+  // MASK CONVOLUTION
+  //============================================================================
+
+  int mask_conv_rows;
+  int mask_conv_cols;
+  int mask_conv_elem;
+  int mask_conv_mem;
+  int mask_conv_ioffset;
+  int mask_conv_joffset;
+
+} params_common;
+
+//==============================================================================
+//==============================================================================
+// PARAMS_UNIQUE STRUCTURE
+//==============================================================================
+//=============================================================================================================================================================================================================== + +typedef struct params_unique { // NOTE(review): presumably the element type of d_unique[], which kernel() indexes with blockIdx.x; confirm against the d_unique declaration + + //====================================================================================================================================================== + // POINT ROW/COLUMN AND TEMPLATE ARRAYS + //====================================================================================================================================================== + + int *d_Row; // kernel() reads d_Row[point_no] and subtracts d_common.sSize to get in2_rowlow + int *d_Col; // kernel() reads d_Col[point_no] and subtracts d_common.sSize to get in2_collow + int *d_tRowLoc; + int *d_tColLoc; + fp *d_T; // template storage; kernel() takes &d_T[in_pointer] as the current template (d_in) + + //====================================================================================================================================================== + // POINT NUMBER + //====================================================================================================================================================== + + int point_no; // index applied to d_Row and d_Col in kernel() + + //====================================================================================================================================================== + // RIGHT TEMPLATE FROM TEMPLATE ARRAY + //====================================================================================================================================================== + + int in_pointer; // element offset into d_T of the template used by kernel() + + //====================================================================================================================================================== + // AREA AROUND POINT FROM FRAME + //====================================================================================================================================================== + + fp *d_in2; // d_common.in2_elem values copied by kernel() out of d_common_change.d_frame + + //====================================================================================================================================================== + // CONVOLUTION +
//====================================================================================================================================================== + + fp *d_conv; // one convolution sum per element, d_common.conv_elem entries written by kernel() + fp *d_in_mod; + + //====================================================================================================================================================== + // CUMULATIVE SUM + //====================================================================================================================================================== + + //==================================================================================================== + // PAD ARRAY, VERTICAL CUMULATIVE SUM + //==================================================================================================== + + fp *d_in2_pad_cumv; // zero-padded copy of d_in2 (refilled later from d_in2_sqr), then summed cumulatively down each column + + //==================================================================================================== + // SELECTION + //==================================================================================================== + + fp *d_in2_pad_cumv_sel; // window of d_in2_pad_cumv based at d_common.in2_pad_cumv_sel_rowlow / _collow + + //==================================================================================================== + // SELECTION 2, SUBTRACTION, HORIZONTAL CUMULATIVE SUM + //==================================================================================================== + + fp *d_in2_sub_cumh; // d_in2_pad_cumv_sel minus the sel2 window of d_in2_pad_cumv, then summed cumulatively along each row + + //==================================================================================================== + // SELECTION + //==================================================================================================== + + fp *d_in2_sub_cumh_sel; // window of d_in2_sub_cumh based at d_common.in2_sub_cumh_sel_rowlow / _collow + + //==================================================================================================== + // SELECTION 2, SUBTRACTION + //==================================================================================================== + + fp *d_in2_sub2; // d_in2_sub_cumh_sel minus the sel2 window of d_in2_sub_cumh + +
//====================================================================================================================================================== + // CUMULATIVE SUM 2 + //====================================================================================================================================================== + + //==================================================================================================== + // MULTIPLICATION + //==================================================================================================== + + fp *d_in2_sqr; // element-wise square of d_in2 + + //==================================================================================================== + // SELECTION 2, SUBTRACTION + //==================================================================================================== + + fp *d_in2_sqr_sub2; // counterpart of d_in2_sub2 computed from d_in2_sqr; kernel() then overwrites it with sqrt of (value minus d_in2_sub2 squared over d_common.in_elem), clamped at 0 before the sqrt + + //====================================================================================================================================================== + // FINAL + //====================================================================================================================================================== + + fp *d_in_sqr; // element-wise square of the current template d_in, d_common.in_sqr_elem entries + + //====================================================================================================================================================== + // TEMPLATE MASK + //====================================================================================================================================================== + + fp *d_tMask; + + //====================================================================================================================================================== + // POINT MASK INITIALIZE + //====================================================================================================================================================== + + fp *d_mask; + +
//====================================================================================================================================================== + // MASK CONVOLUTION + //====================================================================================================================================================== + + fp *d_mask_conv; + +} params_unique; + +//=============================================================================================================================================================================================================== +//=============================================================================================================================================================================================================== +// END OF STRUCTURE +//=============================================================================================================================================================================================================== +//=============================================================================================================================================================================================================== diff --git a/examples/heartwall/kernel.cu b/examples/heartwall/kernel.cu new file mode 100755 index 0000000..b9d1945 --- /dev/null +++ b/examples/heartwall/kernel.cu @@ -0,0 +1,1239 @@ +__global__ void kernel() { + + //====================================================================================================================================================== + // COMMON VARIABLES + //====================================================================================================================================================== + + fp *d_in; + int rot_row; + int rot_col; + int in2_rowlow; + int in2_collow; + int ic; + int jc; + int jp1; + int ja1, ja2; + int ip1; + int ia1, ia2; + int ja, jb; + int ia, ib; + float s; + int i; + int j; +
int row; + int col; + int ori_row; + int ori_col; + int position; + float sum; + int pos_ori; + float temp; + float temp2; + int location; + int cent; + int tMask_row; + int tMask_col; + float largest_value_current = 0; + float largest_value = 0; + int largest_coordinate_current = 0; + int largest_coordinate = 0; + float fin_max_val = 0; + int fin_max_coo = 0; + int largest_row; + int largest_col; + int offset_row; + int offset_col; + float in_partial_sum[51]; // WATCH THIS !!! HARDCODED VALUE + float in_sqr_partial_sum[51]; // WATCH THIS !!! HARDCODED VALUE + float in_final_sum; + float in_sqr_final_sum; + float mean; + float mean_sqr; + float variance; + float deviation; + float denomT; + float par_max_val[131]; // WATCH THIS !!! HARDCODED VALUE + int par_max_coo[131]; // WATCH THIS !!! HARDCODED VALUE + int pointer; + float d_in_mod_temp[2601]; + int ori_pointer; + int loc_pointer; + + //====================================================================================================================================================== + // THREAD PARAMETERS + //====================================================================================================================================================== + + int bx = blockIdx.x; // get current horizontal block index (0-n) + int tx = threadIdx.x; // get current horizontal thread index (0-n) + int ei_new = tx; + + //=============================================================================================================================================================================================================== + //=============================================================================================================================================================================================================== + // GENERATE TEMPLATE + 
//=============================================================================================================================================================================================================== + //=============================================================================================================================================================================================================== + printf("phase1\n"); + // generate templates based on the first frame only + //====================================================================================================================================================== + // GET POINTER TO TEMPLATE FOR THE POINT + //====================================================================================================================================================== + + // pointers to: current template for current point + d_in = &d_unique[bx].d_T[d_unique[bx].in_pointer]; + + //=============================================================================================================================================================================================================== + //=============================================================================================================================================================================================================== + // PROCESS POINTS + //=============================================================================================================================================================================================================== + //=============================================================================================================================================================================================================== + printf("phase2\n"); + // process points in all frames except for the first one + + 
//====================================================================================================================================================== + // SELECTION + //====================================================================================================================================================== + + in2_rowlow = + d_unique[bx].d_Row[d_unique[bx].point_no] - d_common.sSize; // (1 to n+1) + in2_collow = d_unique[bx].d_Col[d_unique[bx].point_no] - d_common.sSize; + + // work + ei_new = tx; + while (ei_new < d_common.in2_elem) { + + // figure out row/col location in new matrix + row = (ei_new + 1) % d_common.in2_rows - 1; // (0-n) row + col = (ei_new + 1) / d_common.in2_rows + 1 - 1; // (0-n) column + if ((ei_new + 1) % d_common.in2_rows == 0) { + row = d_common.in2_rows - 1; + col = col - 1; + } + + // figure out corresponding location in old matrix and copy values to new + // matrix + ori_row = row + in2_rowlow - 1; + ori_col = col + in2_collow - 1; + d_unique[bx].d_in2[ei_new] = + d_common_change.d_frame[ori_col * d_common.frame_rows + ori_row]; + + // go for second round + ei_new = ei_new + NUMBER_THREADS; + } + + //====================================================================================================================================================== + // SYNCHRONIZE THREADS + //====================================================================================================================================================== + + __syncthreads(); + + //====================================================================================================================================================== + // CONVOLUTION + //====================================================================================================================================================== + + //==================================================================================================== + // ROTATION + 
//==================================================================================================== + + // variables + d_in = &d_unique[bx].d_T[d_unique[bx].in_pointer]; + + // work + ei_new = tx; + while (ei_new < d_common.in_elem) { + + // figure out row/col location in padded array + row = (ei_new + 1) % d_common.in_rows - 1; // (0-n) row + col = (ei_new + 1) / d_common.in_rows + 1 - 1; // (0-n) column + if ((ei_new + 1) % d_common.in_rows == 0) { + row = d_common.in_rows - 1; + col = col - 1; + } + + // execution + rot_row = (d_common.in_rows - 1) - row; + rot_col = (d_common.in_rows - 1) - col; + d_in_mod_temp[ei_new] = d_in[rot_col * d_common.in_rows + rot_row]; + + // go for second round + ei_new = ei_new + NUMBER_THREADS; + } + + //==================================================================================================== + // SYNCHRONIZE THREADS + //==================================================================================================== + + __syncthreads(); + + //==================================================================================================== + // ACTUAL CONVOLUTION + //==================================================================================================== + + // work + ei_new = tx; + while (ei_new < d_common.conv_elem) { + + // figure out row/col location in array + ic = (ei_new + 1) % d_common.conv_rows; // (1-n) + jc = (ei_new + 1) / d_common.conv_rows + 1; // (1-n) + if ((ei_new + 1) % d_common.conv_rows == 0) { + ic = d_common.conv_rows; + jc = jc - 1; + } + + // + j = jc + d_common.joffset; + jp1 = j + 1; + if (d_common.in2_cols < jp1) { + ja1 = jp1 - d_common.in2_cols; + } else { + ja1 = 1; + } + if (d_common.in_cols < j) { + ja2 = d_common.in_cols; + } else { + ja2 = j; + } + + i = ic + d_common.ioffset; + ip1 = i + 1; + + if (d_common.in2_rows < ip1) { + ia1 = ip1 - d_common.in2_rows; + } else { + ia1 = 1; + } + if (d_common.in_rows < i) { + ia2 = d_common.in_rows; + } else { + ia2 = i; + } 
+ + s = 0; + + for (ja = ja1; ja <= ja2; ja++) { + jb = jp1 - ja; + for (ia = ia1; ia <= ia2; ia++) { + ib = ip1 - ia; + s = s + d_in_mod_temp[d_common.in_rows * (ja - 1) + ia - 1] * + d_unique[bx].d_in2[d_common.in2_rows * (jb - 1) + ib - 1]; + } + } + + // d_unique[bx].d_conv[d_common.conv_rows*(jc-1)+ic-1] = s; + d_unique[bx].d_conv[ei_new] = s; + + // go for second round + ei_new = ei_new + NUMBER_THREADS; + } + + //====================================================================================================================================================== + // SYNCHRONIZE THREADS + //====================================================================================================================================================== + + __syncthreads(); + + //====================================================================================================================================================== + // CUMULATIVE SUM + //====================================================================================================================================================== + + //==================================================================================================== + // PAD ARRAY, VERTICAL CUMULATIVE SUM + //==================================================================================================== + + //================================================== + // PADD ARRAY + //================================================== + + // work + ei_new = tx; + while (ei_new < d_common.in2_pad_cumv_elem) { + + // figure out row/col location in padded array + row = (ei_new + 1) % d_common.in2_pad_cumv_rows - 1; // (0-n) row + col = (ei_new + 1) / d_common.in2_pad_cumv_rows + 1 - 1; // (0-n) column + if ((ei_new + 1) % d_common.in2_pad_cumv_rows == 0) { + row = d_common.in2_pad_cumv_rows - 1; + col = col - 1; + } + + // execution + if (row > (d_common.in2_pad_add_rows - + 1) && // do if has numbers in original array + row < 
(d_common.in2_pad_add_rows + d_common.in2_rows) && + col > (d_common.in2_pad_add_cols - 1) && + col < (d_common.in2_pad_add_cols + d_common.in2_cols)) { + ori_row = row - d_common.in2_pad_add_rows; + ori_col = col - d_common.in2_pad_add_cols; + d_unique[bx].d_in2_pad_cumv[ei_new] = + d_unique[bx].d_in2[ori_col * d_common.in2_rows + ori_row]; + } else { // do if otherwise + d_unique[bx].d_in2_pad_cumv[ei_new] = 0; + } + + // go for second round + ei_new = ei_new + NUMBER_THREADS; + } + + //================================================== + // SYNCHRONIZE THREADS + //================================================== + + __syncthreads(); + + //================================================== + // VERTICAL CUMULATIVE SUM + //================================================== + + // work + ei_new = tx; + while (ei_new < d_common.in2_pad_cumv_cols) { + + // figure out column position + pos_ori = ei_new * d_common.in2_pad_cumv_rows; + + // variables + sum = 0; + + // loop through all rows + for (position = pos_ori; position < pos_ori + d_common.in2_pad_cumv_rows; + position = position + 1) { + d_unique[bx].d_in2_pad_cumv[position] = + d_unique[bx].d_in2_pad_cumv[position] + sum; + sum = d_unique[bx].d_in2_pad_cumv[position]; + } + + // go for second round + ei_new = ei_new + NUMBER_THREADS; + } + + //==================================================================================================== + // SYNCHRONIZE THREADS + //==================================================================================================== + + __syncthreads(); + + //==================================================================================================== + // SELECTION + //==================================================================================================== + + // work + ei_new = tx; + while (ei_new < d_common.in2_pad_cumv_sel_elem) { + + // figure out row/col location in new matrix + row = (ei_new + 1) % d_common.in2_pad_cumv_sel_rows - 1; // (0-n) row 
+ col = (ei_new + 1) / d_common.in2_pad_cumv_sel_rows + 1 - 1; // (0-n) column + if ((ei_new + 1) % d_common.in2_pad_cumv_sel_rows == 0) { + row = d_common.in2_pad_cumv_sel_rows - 1; + col = col - 1; + } + + // figure out corresponding location in old matrix and copy values to new + // matrix + ori_row = row + d_common.in2_pad_cumv_sel_rowlow - 1; + ori_col = col + d_common.in2_pad_cumv_sel_collow - 1; + d_unique[bx].d_in2_pad_cumv_sel[ei_new] = + d_unique[bx] + .d_in2_pad_cumv[ori_col * d_common.in2_pad_cumv_rows + ori_row]; + + // go for second round + ei_new = ei_new + NUMBER_THREADS; + } + + //==================================================================================================== + // SYNCHRONIZE THREADS + //==================================================================================================== + + __syncthreads(); + + //==================================================================================================== + // SELECTION 2, SUBTRACTION, HORIZONTAL CUMULATIVE SUM + //==================================================================================================== + + //================================================== + // SELECTION 2 + //================================================== + + // work + ei_new = tx; + while (ei_new < d_common.in2_sub_cumh_elem) { + + // figure out row/col location in new matrix + row = (ei_new + 1) % d_common.in2_sub_cumh_rows - 1; // (0-n) row + col = (ei_new + 1) / d_common.in2_sub_cumh_rows + 1 - 1; // (0-n) column + if ((ei_new + 1) % d_common.in2_sub_cumh_rows == 0) { + row = d_common.in2_sub_cumh_rows - 1; + col = col - 1; + } + + // figure out corresponding location in old matrix and copy values to new + // matrix + ori_row = row + d_common.in2_pad_cumv_sel2_rowlow - 1; + ori_col = col + d_common.in2_pad_cumv_sel2_collow - 1; + d_unique[bx].d_in2_sub_cumh[ei_new] = + d_unique[bx] + .d_in2_pad_cumv[ori_col * d_common.in2_pad_cumv_rows + ori_row]; + + // go for second round + 
ei_new = ei_new + NUMBER_THREADS; + } + + //================================================== + // SYNCHRONIZE THREADS + //================================================== + + __syncthreads(); + + //================================================== + // SUBTRACTION + //================================================== + + // work + ei_new = tx; + while (ei_new < d_common.in2_sub_cumh_elem) { + + // subtract + d_unique[bx].d_in2_sub_cumh[ei_new] = + d_unique[bx].d_in2_pad_cumv_sel[ei_new] - + d_unique[bx].d_in2_sub_cumh[ei_new]; + + // go for second round + ei_new = ei_new + NUMBER_THREADS; + } + + //================================================== + // SYNCHRONIZE THREADS + //================================================== + + __syncthreads(); + + //================================================== + // HORIZONTAL CUMULATIVE SUM + //================================================== + + // work + ei_new = tx; + while (ei_new < d_common.in2_sub_cumh_rows) { + + // figure out row position + pos_ori = ei_new; + + // variables + sum = 0; + + // loop through all rows + for (position = pos_ori; position < pos_ori + d_common.in2_sub_cumh_elem; + position = position + d_common.in2_sub_cumh_rows) { + d_unique[bx].d_in2_sub_cumh[position] = + d_unique[bx].d_in2_sub_cumh[position] + sum; + sum = d_unique[bx].d_in2_sub_cumh[position]; + } + + // go for second round + ei_new = ei_new + NUMBER_THREADS; + } + + //==================================================================================================== + // SYNCHRONIZE THREADS + //==================================================================================================== + + __syncthreads(); + + //==================================================================================================== + // SELECTION + //==================================================================================================== + + // work + ei_new = tx; + while (ei_new < d_common.in2_sub_cumh_sel_elem) { + + // 
figure out row/col location in new matrix + row = (ei_new + 1) % d_common.in2_sub_cumh_sel_rows - 1; // (0-n) row + col = (ei_new + 1) / d_common.in2_sub_cumh_sel_rows + 1 - 1; // (0-n) column + if ((ei_new + 1) % d_common.in2_sub_cumh_sel_rows == 0) { + row = d_common.in2_sub_cumh_sel_rows - 1; + col = col - 1; + } + + // figure out corresponding location in old matrix and copy values to new + // matrix + ori_row = row + d_common.in2_sub_cumh_sel_rowlow - 1; + ori_col = col + d_common.in2_sub_cumh_sel_collow - 1; + d_unique[bx].d_in2_sub_cumh_sel[ei_new] = + d_unique[bx] + .d_in2_sub_cumh[ori_col * d_common.in2_sub_cumh_rows + ori_row]; + + // go for second round + ei_new = ei_new + NUMBER_THREADS; + } + + //==================================================================================================== + // SYNCHRONIZE THREADS + //==================================================================================================== + + __syncthreads(); + + //==================================================================================================== + // SELECTION 2, SUBTRACTION + //==================================================================================================== + + //================================================== + // SELECTION 2 + //================================================== + + // work + ei_new = tx; + while (ei_new < d_common.in2_sub2_elem) { + + // figure out row/col location in new matrix + row = (ei_new + 1) % d_common.in2_sub2_rows - 1; // (0-n) row + col = (ei_new + 1) / d_common.in2_sub2_rows + 1 - 1; // (0-n) column + if ((ei_new + 1) % d_common.in2_sub2_rows == 0) { + row = d_common.in2_sub2_rows - 1; + col = col - 1; + } + + // figure out corresponding location in old matrix and copy values to new + // matrix + ori_row = row + d_common.in2_sub_cumh_sel2_rowlow - 1; + ori_col = col + d_common.in2_sub_cumh_sel2_collow - 1; + d_unique[bx].d_in2_sub2[ei_new] = + d_unique[bx] + .d_in2_sub_cumh[ori_col * 
d_common.in2_sub_cumh_rows + ori_row]; + + // go for second round + ei_new = ei_new + NUMBER_THREADS; + } + + //================================================== + // SYNCHRONIZE THREADS + //================================================== + + __syncthreads(); + + //================================================== + // SUBTRACTION + //================================================== + + // work + ei_new = tx; + while (ei_new < d_common.in2_sub2_elem) { + + // subtract + d_unique[bx].d_in2_sub2[ei_new] = d_unique[bx].d_in2_sub_cumh_sel[ei_new] - + d_unique[bx].d_in2_sub2[ei_new]; + + // go for second round + ei_new = ei_new + NUMBER_THREADS; + } + + //====================================================================================================================================================== + // SYNCHRONIZE THREADS + //====================================================================================================================================================== + + __syncthreads(); + + //====================================================================================================================================================== + // CUMULATIVE SUM 2 + //====================================================================================================================================================== + + //==================================================================================================== + // MULTIPLICATION + //==================================================================================================== + + // work + ei_new = tx; + while (ei_new < d_common.in2_sqr_elem) { + + temp = d_unique[bx].d_in2[ei_new]; + d_unique[bx].d_in2_sqr[ei_new] = temp * temp; + + // go for second round + ei_new = ei_new + NUMBER_THREADS; + } + + //==================================================================================================== + // SYNCHRONIZE THREADS + 
//==================================================================================================== + + __syncthreads(); + + //==================================================================================================== + // PAD ARRAY, VERTICAL CUMULATIVE SUM + //==================================================================================================== + + //================================================== + // PAD ARRAY + //================================================== + + // work + ei_new = tx; + while (ei_new < d_common.in2_pad_cumv_elem) { + + // figure out row/col location in padded array + row = (ei_new + 1) % d_common.in2_pad_cumv_rows - 1; // (0-n) row + col = (ei_new + 1) / d_common.in2_pad_cumv_rows + 1 - 1; // (0-n) column + if ((ei_new + 1) % d_common.in2_pad_cumv_rows == 0) { + row = d_common.in2_pad_cumv_rows - 1; + col = col - 1; + } + + // execution + if (row > (d_common.in2_pad_add_rows - + 1) && // do if has numbers in original array + row < (d_common.in2_pad_add_rows + d_common.in2_sqr_rows) && + col > (d_common.in2_pad_add_cols - 1) && + col < (d_common.in2_pad_add_cols + d_common.in2_sqr_cols)) { + ori_row = row - d_common.in2_pad_add_rows; + ori_col = col - d_common.in2_pad_add_cols; + d_unique[bx].d_in2_pad_cumv[ei_new] = + d_unique[bx].d_in2_sqr[ori_col * d_common.in2_sqr_rows + ori_row]; + } else { // do if otherwise + d_unique[bx].d_in2_pad_cumv[ei_new] = 0; + } + + // go for second round + ei_new = ei_new + NUMBER_THREADS; + } + + //================================================== + // SYNCHRONIZE THREADS + //================================================== + + __syncthreads(); + + //================================================== + // VERTICAL CUMULATIVE SUM + //================================================== + + // work + ei_new = tx; + while (ei_new < d_common.in2_pad_cumv_cols) { + + // figure out column position + pos_ori = ei_new * d_common.in2_pad_cumv_rows; + + // variables + sum = 0; + + // 
loop through all rows + for (position = pos_ori; position < pos_ori + d_common.in2_pad_cumv_rows; + position = position + 1) { + d_unique[bx].d_in2_pad_cumv[position] = + d_unique[bx].d_in2_pad_cumv[position] + sum; + sum = d_unique[bx].d_in2_pad_cumv[position]; + } + + // go for second round + ei_new = ei_new + NUMBER_THREADS; + } + + //==================================================================================================== + // SYNCHRONIZE THREADS + //==================================================================================================== + + __syncthreads(); + + //==================================================================================================== + // SELECTION + //==================================================================================================== + + // work + ei_new = tx; + while (ei_new < d_common.in2_pad_cumv_sel_elem) { + + // figure out row/col location in new matrix + row = (ei_new + 1) % d_common.in2_pad_cumv_sel_rows - 1; // (0-n) row + col = (ei_new + 1) / d_common.in2_pad_cumv_sel_rows + 1 - 1; // (0-n) column + if ((ei_new + 1) % d_common.in2_pad_cumv_sel_rows == 0) { + row = d_common.in2_pad_cumv_sel_rows - 1; + col = col - 1; + } + + // figure out corresponding location in old matrix and copy values to new + // matrix + ori_row = row + d_common.in2_pad_cumv_sel_rowlow - 1; + ori_col = col + d_common.in2_pad_cumv_sel_collow - 1; + d_unique[bx].d_in2_pad_cumv_sel[ei_new] = + d_unique[bx] + .d_in2_pad_cumv[ori_col * d_common.in2_pad_cumv_rows + ori_row]; + + // go for second round + ei_new = ei_new + NUMBER_THREADS; + } + + //==================================================================================================== + // SYNCHRONIZE THREADS + //==================================================================================================== + + __syncthreads(); + + //==================================================================================================== + // SELECTION 
2, SUBTRACTION, HORIZONTAL CUMULATIVE SUM + //==================================================================================================== + + //================================================== + // SELECTION 2 + //================================================== + + // work + ei_new = tx; + while (ei_new < d_common.in2_sub_cumh_elem) { + + // figure out row/col location in new matrix + row = (ei_new + 1) % d_common.in2_sub_cumh_rows - 1; // (0-n) row + col = (ei_new + 1) / d_common.in2_sub_cumh_rows + 1 - 1; // (0-n) column + if ((ei_new + 1) % d_common.in2_sub_cumh_rows == 0) { + row = d_common.in2_sub_cumh_rows - 1; + col = col - 1; + } + + // figure out corresponding location in old matrix and copy values to new + // matrix + ori_row = row + d_common.in2_pad_cumv_sel2_rowlow - 1; + ori_col = col + d_common.in2_pad_cumv_sel2_collow - 1; + d_unique[bx].d_in2_sub_cumh[ei_new] = + d_unique[bx] + .d_in2_pad_cumv[ori_col * d_common.in2_pad_cumv_rows + ori_row]; + + // go for second round + ei_new = ei_new + NUMBER_THREADS; + } + + //================================================== + // SYNCHRONIZE THREADS + //================================================== + + __syncthreads(); + + //================================================== + // SUBTRACTION + //================================================== + + // work + ei_new = tx; + while (ei_new < d_common.in2_sub_cumh_elem) { + + // subtract + d_unique[bx].d_in2_sub_cumh[ei_new] = + d_unique[bx].d_in2_pad_cumv_sel[ei_new] - + d_unique[bx].d_in2_sub_cumh[ei_new]; + + // go for second round + ei_new = ei_new + NUMBER_THREADS; + } + + //================================================== + // HORIZONTAL CUMULATIVE SUM + //================================================== + + // work + ei_new = tx; + while (ei_new < d_common.in2_sub_cumh_rows) { + + // figure out row position + pos_ori = ei_new; + + // variables + sum = 0; + + // loop through all rows + for (position = pos_ori; position < pos_ori + 
d_common.in2_sub_cumh_elem; + position = position + d_common.in2_sub_cumh_rows) { + d_unique[bx].d_in2_sub_cumh[position] = + d_unique[bx].d_in2_sub_cumh[position] + sum; + sum = d_unique[bx].d_in2_sub_cumh[position]; + } + + // go for second round + ei_new = ei_new + NUMBER_THREADS; + } + + //==================================================================================================== + // SYNCHRONIZE THREADS + //==================================================================================================== + + __syncthreads(); + + //==================================================================================================== + // SELECTION + //==================================================================================================== + + // work + ei_new = tx; + while (ei_new < d_common.in2_sub_cumh_sel_elem) { + + // figure out row/col location in new matrix + row = (ei_new + 1) % d_common.in2_sub_cumh_sel_rows - 1; // (0-n) row + col = (ei_new + 1) / d_common.in2_sub_cumh_sel_rows + 1 - 1; // (0-n) column + if ((ei_new + 1) % d_common.in2_sub_cumh_sel_rows == 0) { + row = d_common.in2_sub_cumh_sel_rows - 1; + col = col - 1; + } + + // figure out corresponding location in old matrix and copy values to new + // matrix + ori_row = row + d_common.in2_sub_cumh_sel_rowlow - 1; + ori_col = col + d_common.in2_sub_cumh_sel_collow - 1; + d_unique[bx].d_in2_sub_cumh_sel[ei_new] = + d_unique[bx] + .d_in2_sub_cumh[ori_col * d_common.in2_sub_cumh_rows + ori_row]; + + // go for second round + ei_new = ei_new + NUMBER_THREADS; + } + + //==================================================================================================== + // SYNCHRONIZE THREADS + //==================================================================================================== + + __syncthreads(); + + //==================================================================================================== + // SELECTION 2, SUBTRACTION + 
//==================================================================================================== + + //================================================== + // SELECTION 2 + //================================================== + + // work + ei_new = tx; + while (ei_new < d_common.in2_sub2_elem) { + + // figure out row/col location in new matrix + row = (ei_new + 1) % d_common.in2_sub2_rows - 1; // (0-n) row + col = (ei_new + 1) / d_common.in2_sub2_rows + 1 - 1; // (0-n) column + if ((ei_new + 1) % d_common.in2_sub2_rows == 0) { + row = d_common.in2_sub2_rows - 1; + col = col - 1; + } + + // figure out corresponding location in old matrix and copy values to new + // matrix + ori_row = row + d_common.in2_sub_cumh_sel2_rowlow - 1; + ori_col = col + d_common.in2_sub_cumh_sel2_collow - 1; + d_unique[bx].d_in2_sqr_sub2[ei_new] = + d_unique[bx] + .d_in2_sub_cumh[ori_col * d_common.in2_sub_cumh_rows + ori_row]; + + // go for second round + ei_new = ei_new + NUMBER_THREADS; + } + + //================================================== + // SYNCHRONIZE THREADS + //================================================== + + __syncthreads(); + + //================================================== + // SUBTRACTION + //================================================== + + // work + ei_new = tx; + while (ei_new < d_common.in2_sub2_elem) { + + // subtract + d_unique[bx].d_in2_sqr_sub2[ei_new] = + d_unique[bx].d_in2_sub_cumh_sel[ei_new] - + d_unique[bx].d_in2_sqr_sub2[ei_new]; + + // go for second round + ei_new = ei_new + NUMBER_THREADS; + } + + //====================================================================================================================================================== + // SYNCHRONIZE THREADS + //====================================================================================================================================================== + + __syncthreads(); + + 
//====================================================================================================================================================== + // FINAL + //====================================================================================================================================================== + + //==================================================================================================== + // DENOMINATOR A SAVE RESULT IN CUMULATIVE SUM A2 + //==================================================================================================== + + // work + ei_new = tx; + while (ei_new < d_common.in2_sub2_elem) { + + temp = d_unique[bx].d_in2_sub2[ei_new]; + temp2 = + d_unique[bx].d_in2_sqr_sub2[ei_new] - (temp * temp / d_common.in_elem); + if (temp2 < 0) { + temp2 = 0; + } + d_unique[bx].d_in2_sqr_sub2[ei_new] = sqrt(temp2); + + // go for second round + ei_new = ei_new + NUMBER_THREADS; + } + + //==================================================================================================== + // SYNCHRONIZE THREADS + //==================================================================================================== + + __syncthreads(); + + //==================================================================================================== + // MULTIPLICATION + //==================================================================================================== + + // work + ei_new = tx; + while (ei_new < d_common.in_sqr_elem) { + + temp = d_in[ei_new]; + d_unique[bx].d_in_sqr[ei_new] = temp * temp; + + // go for second round + ei_new = ei_new + NUMBER_THREADS; + } + + //==================================================================================================== + // SYNCHRONIZE THREADS + //==================================================================================================== + + __syncthreads(); + + 
//==================================================================================================== + // IN SUM + //==================================================================================================== + + // work + ei_new = tx; + while (ei_new < d_common.in_cols) { + + sum = 0; + for (i = 0; i < d_common.in_rows; i++) { + + sum = sum + d_in[ei_new * d_common.in_rows + i]; + } + in_partial_sum[ei_new] = sum; + + // go for second round + ei_new = ei_new + NUMBER_THREADS; + } + + //==================================================================================================== + // SYNCHRONIZE THREADS + //==================================================================================================== + + __syncthreads(); + + //==================================================================================================== + // IN_SQR SUM + //==================================================================================================== + + ei_new = tx; + while (ei_new < d_common.in_sqr_rows) { + + sum = 0; + for (i = 0; i < d_common.in_sqr_cols; i++) { + + sum = sum + d_unique[bx].d_in_sqr[ei_new + d_common.in_sqr_rows * i]; + } + in_sqr_partial_sum[ei_new] = sum; + + // go for second round + ei_new = ei_new + NUMBER_THREADS; + } + + //==================================================================================================== + // SYNCHRONIZE THREADS + //==================================================================================================== + + __syncthreads(); + + //==================================================================================================== + // FINAL SUMMATION + //==================================================================================================== + + in_final_sum = 0; + for (i = 0; i < d_common.in_cols; i++) { + in_final_sum = in_final_sum + in_partial_sum[i]; + } + + { + + in_sqr_final_sum = 0; + for (i = 0; i < d_common.in_sqr_cols; i++) { + in_sqr_final_sum = 
in_sqr_final_sum + in_sqr_partial_sum[i]; + } + + //==================================================================================================== + // SYNCHRONIZE THREADS + //==================================================================================================== + + __syncthreads(); + + //==================================================================================================== + // DENOMINATOR T + //==================================================================================================== + + mean = in_final_sum / + d_common.in_elem; // gets mean (average) value of element in ROI + mean_sqr = mean * mean; + variance = (in_sqr_final_sum / d_common.in_elem) - + mean_sqr; // gets variance of ROI + deviation = sqrt(variance); // gets standard deviation of ROI + + denomT = sqrt(float(d_common.in_elem - 1)) * deviation; + + //==================================================================================================== + // SYNCHRONIZE THREADS + //==================================================================================================== + + __syncthreads(); + + //==================================================================================================== + // DENOMINATOR SAVE RESULT IN CUMULATIVE SUM A2 + //==================================================================================================== + + // work + ei_new = tx; + while (ei_new < d_common.in2_sub2_elem) { + + d_unique[bx].d_in2_sqr_sub2[ei_new] = + d_unique[bx].d_in2_sqr_sub2[ei_new] * denomT; + + // go for second round + ei_new = ei_new + NUMBER_THREADS; + } + + //==================================================================================================== + // SYNCHRONIZE THREADS + //==================================================================================================== + + __syncthreads(); + + //==================================================================================================== + // NUMERATOR SAVE 
RESULT IN CONVOLUTION + //==================================================================================================== + + // work + ei_new = tx; + while (ei_new < d_common.conv_elem) { + + d_unique[bx].d_conv[ei_new] = + d_unique[bx].d_conv[ei_new] - + d_unique[bx].d_in2_sub2[ei_new] * in_final_sum / d_common.in_elem; + + // go for second round + ei_new = ei_new + NUMBER_THREADS; + } + + //==================================================================================================== + // SYNCHRONIZE THREADS + //==================================================================================================== + + __syncthreads(); + + //==================================================================================================== + // CORRELATION SAVE RESULT IN CUMULATIVE SUM A2 + //==================================================================================================== + + // work + ei_new = tx; + while (ei_new < d_common.in2_sub2_elem) { + + d_unique[bx].d_in2_sqr_sub2[ei_new] = + d_unique[bx].d_conv[ei_new] / d_unique[bx].d_in2_sqr_sub2[ei_new]; + + // go for second round + ei_new = ei_new + NUMBER_THREADS; + } + + //====================================================================================================================================================== + // SYNCHRONIZE THREADS + //====================================================================================================================================================== + + __syncthreads(); + + //====================================================================================================================================================== + // TEMPLATE MASK CREATE + //====================================================================================================================================================== + + cent = d_common.sSize + d_common.tSize + 1; + if (d_common_change.frame_no == 0) { + tMask_row = cent + 
d_unique[bx].d_Row[d_unique[bx].point_no] - + d_unique[bx].d_Row[d_unique[bx].point_no] - 1; + tMask_col = cent + d_unique[bx].d_Col[d_unique[bx].point_no] - + d_unique[bx].d_Col[d_unique[bx].point_no] - 1; + } else { + pointer = d_common_change.frame_no - 1 + + d_unique[bx].point_no * d_common.no_frames; + tMask_row = cent + d_unique[bx].d_tRowLoc[pointer] - + d_unique[bx].d_Row[d_unique[bx].point_no] - 1; + tMask_col = cent + d_unique[bx].d_tColLoc[pointer] - + d_unique[bx].d_Col[d_unique[bx].point_no] - 1; + } + + // work + ei_new = tx; + while (ei_new < d_common.tMask_elem) { + + location = tMask_col * d_common.tMask_rows + tMask_row; + + if (ei_new == location) { + d_unique[bx].d_tMask[ei_new] = 1; + } else { + d_unique[bx].d_tMask[ei_new] = 0; + } + + // go for second round + ei_new = ei_new + NUMBER_THREADS; + } + + //====================================================================================================================================================== + // SYNCHRONIZE THREADS + //====================================================================================================================================================== + + __syncthreads(); + + //====================================================================================================================================================== + // MASK CONVOLUTION + //====================================================================================================================================================== + + // work + ei_new = tx; + while (ei_new < d_common.mask_conv_elem) { + + // figure out row/col location in array + ic = (ei_new + 1) % d_common.mask_conv_rows; // (1-n) + jc = (ei_new + 1) / d_common.mask_conv_rows + 1; // (1-n) + if ((ei_new + 1) % d_common.mask_conv_rows == 0) { + ic = d_common.mask_conv_rows; + jc = jc - 1; + } + + // + j = jc + d_common.mask_conv_joffset; + jp1 = j + 1; + if (d_common.mask_cols < jp1) { + ja1 = jp1 - d_common.mask_cols; + } else { 
+ ja1 = 1; + } + if (d_common.tMask_cols < j) { + ja2 = d_common.tMask_cols; + } else { + ja2 = j; + } + + i = ic + d_common.mask_conv_ioffset; + ip1 = i + 1; + + if (d_common.mask_rows < ip1) { + ia1 = ip1 - d_common.mask_rows; + } else { + ia1 = 1; + } + if (d_common.tMask_rows < i) { + ia2 = d_common.tMask_rows; + } else { + ia2 = i; + } + + s = 0; + + for (ja = ja1; ja <= ja2; ja++) { + jb = jp1 - ja; + for (ia = ia1; ia <= ia2; ia++) { + ib = ip1 - ia; + s = s + + d_unique[bx].d_tMask[d_common.tMask_rows * (ja - 1) + ia - 1] * 1; + } + } + + // //d_unique[bx].d_mask_conv[d_common.mask_conv_rows*(jc-1)+ic-1] = s; + d_unique[bx].d_mask_conv[ei_new] = + d_unique[bx].d_in2_sqr_sub2[ei_new] * s; + + // go for second round + ei_new = ei_new + NUMBER_THREADS; + } + + //====================================================================================================================================================== + // SYNCHRONIZE THREADS + //====================================================================================================================================================== + + __syncthreads(); + + //====================================================================================================================================================== + // MAXIMUM VALUE + //====================================================================================================================================================== + + //==================================================================================================== + // INITIAL SEARCH + //==================================================================================================== + + ei_new = tx; + while (ei_new < d_common.mask_conv_rows) { + + for (i = 0; i < d_common.mask_conv_cols; i++) { + largest_coordinate_current = ei_new * d_common.mask_conv_rows + i; + largest_value_current = + abs(d_unique[bx].d_mask_conv[largest_coordinate_current]); + if (largest_value_current > 
largest_value) { + largest_coordinate = largest_coordinate_current; + largest_value = largest_value_current; + } + } + par_max_coo[ei_new] = largest_coordinate; + par_max_val[ei_new] = largest_value; + + // go for second round + ei_new = ei_new + NUMBER_THREADS; + } + + //==================================================================================================== + // SYNCHRONIZE THREADS + //==================================================================================================== + + __syncthreads(); + + //==================================================================================================== + // FINAL SEARCH + //==================================================================================================== + + for (i = 0; i < d_common.mask_conv_rows; i++) { + if (par_max_val[i] > fin_max_val) { + fin_max_val = par_max_val[i]; + fin_max_coo = par_max_coo[i]; + } + } + + // convert coordinate to row/col form + largest_row = (fin_max_coo + 1) % d_common.mask_conv_rows - 1; // (0-n) row + largest_col = (fin_max_coo + 1) / d_common.mask_conv_rows; // (0-n) column + if ((fin_max_coo + 1) % d_common.mask_conv_rows == 0) { + largest_row = d_common.mask_conv_rows - 1; + largest_col = largest_col - 1; + } + + // calculate offset + largest_row = largest_row + 1; // compensate to match MATLAB format (1-n) + largest_col = largest_col + 1; // compensate to match MATLAB format (1-n) + offset_row = + largest_row - d_common.in_rows - (d_common.sSize - d_common.tSize); + offset_col = + largest_col - d_common.in_cols - (d_common.sSize - d_common.tSize); + pointer = + d_common_change.frame_no + d_unique[bx].point_no * d_common.no_frames; + d_unique[bx].d_tRowLoc[pointer] = + d_unique[bx].d_Row[d_unique[bx].point_no] + offset_row; + d_unique[bx].d_tColLoc[pointer] = + d_unique[bx].d_Col[d_unique[bx].point_no] + offset_col; + + 
//====================================================================================================================================================== + // SYNCHRONIZE THREADS + //====================================================================================================================================================== + + __syncthreads(); + } diff --git a/examples/heartwall/main.cu b/examples/heartwall/main.cu new file mode 100644 index 0000000..ca80319 --- /dev/null +++ b/examples/heartwall/main.cu @@ -0,0 +1,795 @@ +//=============================================================================================================================================================================================================== +//=============================================================================================================================================================================================================== +// DEFINE / INCLUDE +//=============================================================================================================================================================================================================== +//=============================================================================================================================================================================================================== + +//====================================================================================================================================================== +// LIBRARIES +//====================================================================================================================================================== + +#include +#include +#include + +#include +#include +#include + +//====================================================================================================================================================== +// STRUCTURES, GLOBAL STRUCTURE VARIABLES 
+//======================================================================================================================================================
+
+#include "define.c"
+
+params_common_change common_change;
+__constant__ params_common_change d_common_change;
+
+params_common common;
+__constant__ params_common d_common;
+
+params_unique unique[ALL_POINTS]; // cannot determine size dynamically so choose
+                                  // more than usually needed
+__constant__ params_unique d_unique[ALL_POINTS];
+
+//======================================================================================================================================================
+// KERNEL CODE
+//======================================================================================================================================================
+
+#include "kernel.cu"
+
+// WRITE DATA FUNCTION
+//===============================================================================================================================================================================================================200
+//
+// Dumps the per-frame values of the tracked points to a text file.
+//
+// The file starts with a four-line header (total frames, frames processed,
+// endo point count, epi point count). Then, for every processed frame, it
+// holds an "endo" section with two tab-separated rows (input_a, input_b) and
+// an "epi" section with two tab-separated rows (input_2a, input_2b).
+//
+//   filename          path of the output file; opened with mode "w+", so an
+//                     existing file is truncated
+//   frameNo           written as "Total AVI Frames"; also the stride between
+//                     two consecutive points in all four input arrays
+//   frames_processed  number of frames to write (frames 0 .. n-1)
+//   endoPoints        number of values per frame read from input_a/input_b
+//   input_a, input_b  endo arrays, read at index [frame + point * frameNo]
+//                     (presumably row and column locations -- confirm
+//                     against the caller)
+//   epiPoints         number of values per frame read from input_2a/input_2b
+//   input_2a, input_2b epi arrays, same layout as the endo arrays
+//
+// If the file cannot be opened, a message is printed to stdout and the
+// function returns without writing anything.
+
+void write_data(char *filename, int frameNo, int frames_processed,
+                int endoPoints, int *input_a, int *input_b, int epiPoints,
+                int *input_2a, int *input_2b) {
+
+  //================================================================================80
+  // OPEN FILE FOR WRITING
+  //================================================================================80
+
+  FILE *fid = fopen(filename, "w+");
+  if (fid == NULL) {
+    printf("The file was not opened for writing\n");
+    return;
+  }
+
+  //================================================================================80
+  // WRITE VALUES TO THE FILE
+  //================================================================================80
+
+  // header
+  fprintf(fid, "Total AVI Frames: %d\n", frameNo);
+  fprintf(fid, "Frames Processed: %d\n", frames_processed);
+  fprintf(fid, "endoPoints: %d\n", endoPoints);
+  fprintf(fid, "epiPoints: %d", epiPoints);
+
+  // one section per processed frame; element (frame, point) of every input
+  // array lives at index frame + point * frameNo
+  for (int frame = 0; frame < frames_processed; frame++) {
+    fprintf(fid, "\n---Frame %d---", frame);
+
+    // the section titles contain no conversion specifier, so no extra
+    // argument is passed to fprintf
+    fprintf(fid, "\n--endo--\n");
+    for (int point = 0; point < endoPoints; point++) {
+      fprintf(fid, "%d\t", input_a[frame + point * frameNo]);
+    }
+    fprintf(fid, "\n");
+    for (int point = 0; point < endoPoints; point++) {
+      fprintf(fid, "%d\t", input_b[frame + point * frameNo]);
+    }
+
+    fprintf(fid, "\n--epi--\n");
+    for (int point = 0; point < epiPoints; point++) {
+      fprintf(fid, "%d\t", input_2a[frame + point * frameNo]);
+    }
+    fprintf(fid, "\n");
+    for (int point = 0; point < epiPoints; point++) {
+      fprintf(fid, "%d\t", input_2b[frame + point * frameNo]);
+    }
+  }
+
+  //================================================================================80
+  // CLOSE FILE
+  //================================================================================80
+
+  fclose(fid);
+}
+
+//===============================================================================================================================================================================================================
+//===============================================================================================================================================================================================================
+// MAIN FUNCTION
+//===============================================================================================================================================================================================================
+//=============================================================================================================================================================================================================== +int main(int argc, char *argv[]) { + cudaSetDevice(0); + printf("WG size of kernel = %d \n", NUMBER_THREADS); + //====================================================================================================================================================== + // VARIABLES + //====================================================================================================================================================== + + // CUDA kernel execution parameters + dim3 threads; + dim3 blocks; + + // counter + int i; + int frames_processed; + + // frames + char *video_file_name; + avi_t *frames; + fp *frame; + + //====================================================================================================================================================== + // FRAME + //====================================================================================================================================================== + + if (argc != 3) { + printf("ERROR: usage: heartwall \n"); + exit(1); + } + + // open movie file + video_file_name = argv[1]; + frames = (avi_t *)AVI_open_input_file(video_file_name, 1); // added casting + if (frames == NULL) { + AVI_print_error((char *)"Error with AVI_open_input_file"); + return -1; + } + + // common + common.no_frames = AVI_video_frames(frames); + common.frame_rows = AVI_video_height(frames); + common.frame_cols = AVI_video_width(frames); + common.frame_elem = common.frame_rows * common.frame_cols; + common.frame_mem = sizeof(fp) * common.frame_elem; + + // pointers + cudaMalloc((void **)&common_change.d_frame, common.frame_mem); + + //====================================================================================================================================================== + // CHECK INPUT 
ARGUMENTS + //====================================================================================================================================================== + + frames_processed = atoi(argv[2]); + if (frames_processed < 0 || frames_processed > common.no_frames) { + printf("ERROR: %d is an incorrect number of frames specified, select in " + "the range of 0-%d\n", + frames_processed, common.no_frames); + return 0; + } + + //====================================================================================================================================================== + // HARDCODED INPUTS FROM MATLAB + //====================================================================================================================================================== + + //==================================================================================================== + // CONSTANTS + //==================================================================================================== + + common.sSize = 40; + common.tSize = 25; + common.maxMove = 10; + common.alpha = 0.87; + + //==================================================================================================== + // ENDO POINTS + //==================================================================================================== + + common.endoPoints = ENDO_POINTS; + common.endo_mem = sizeof(int) * common.endoPoints; + + common.endoRow = (int *)malloc(common.endo_mem); + common.endoRow[0] = 369; + common.endoRow[1] = 400; + common.endoRow[2] = 429; + common.endoRow[3] = 452; + common.endoRow[4] = 476; + common.endoRow[5] = 486; + common.endoRow[6] = 479; + common.endoRow[7] = 458; + common.endoRow[8] = 433; + common.endoRow[9] = 404; + common.endoRow[10] = 374; + common.endoRow[11] = 346; + common.endoRow[12] = 318; + common.endoRow[13] = 294; + common.endoRow[14] = 277; + common.endoRow[15] = 269; + common.endoRow[16] = 275; + common.endoRow[17] = 287; + common.endoRow[18] = 311; + 
common.endoRow[19] = 339; + cudaMalloc((void **)&common.d_endoRow, common.endo_mem); + cudaMemcpy(common.d_endoRow, common.endoRow, common.endo_mem, + cudaMemcpyHostToDevice); + + common.endoCol = (int *)malloc(common.endo_mem); + common.endoCol[0] = 408; + common.endoCol[1] = 406; + common.endoCol[2] = 397; + common.endoCol[3] = 383; + common.endoCol[4] = 354; + common.endoCol[5] = 322; + common.endoCol[6] = 294; + common.endoCol[7] = 270; + common.endoCol[8] = 250; + common.endoCol[9] = 237; + common.endoCol[10] = 235; + common.endoCol[11] = 241; + common.endoCol[12] = 254; + common.endoCol[13] = 273; + common.endoCol[14] = 300; + common.endoCol[15] = 328; + common.endoCol[16] = 356; + common.endoCol[17] = 383; + common.endoCol[18] = 401; + common.endoCol[19] = 411; + cudaMalloc((void **)&common.d_endoCol, common.endo_mem); + cudaMemcpy(common.d_endoCol, common.endoCol, common.endo_mem, + cudaMemcpyHostToDevice); + + common.tEndoRowLoc = (int *)malloc(common.endo_mem * common.no_frames); + cudaMalloc((void **)&common.d_tEndoRowLoc, + common.endo_mem * common.no_frames); + + common.tEndoColLoc = (int *)malloc(common.endo_mem * common.no_frames); + cudaMalloc((void **)&common.d_tEndoColLoc, + common.endo_mem * common.no_frames); + + //==================================================================================================== + // EPI POINTS + //==================================================================================================== + + common.epiPoints = EPI_POINTS; + common.epi_mem = sizeof(int) * common.epiPoints; + + common.epiRow = (int *)malloc(common.epi_mem); + common.epiRow[0] = 390; + common.epiRow[1] = 419; + common.epiRow[2] = 448; + common.epiRow[3] = 474; + common.epiRow[4] = 501; + common.epiRow[5] = 519; + common.epiRow[6] = 535; + common.epiRow[7] = 542; + common.epiRow[8] = 543; + common.epiRow[9] = 538; + common.epiRow[10] = 528; + common.epiRow[11] = 511; + common.epiRow[12] = 491; + common.epiRow[13] = 466; + 
common.epiRow[14] = 438; + common.epiRow[15] = 406; + common.epiRow[16] = 376; + common.epiRow[17] = 347; + common.epiRow[18] = 318; + common.epiRow[19] = 291; + common.epiRow[20] = 275; + common.epiRow[21] = 259; + common.epiRow[22] = 256; + common.epiRow[23] = 252; + common.epiRow[24] = 252; + common.epiRow[25] = 257; + common.epiRow[26] = 266; + common.epiRow[27] = 283; + common.epiRow[28] = 305; + common.epiRow[29] = 331; + common.epiRow[30] = 360; + cudaMalloc((void **)&common.d_epiRow, common.epi_mem); + cudaMemcpy(common.d_epiRow, common.epiRow, common.epi_mem, + cudaMemcpyHostToDevice); + + common.epiCol = (int *)malloc(common.epi_mem); + common.epiCol[0] = 457; + common.epiCol[1] = 454; + common.epiCol[2] = 446; + common.epiCol[3] = 431; + common.epiCol[4] = 411; + common.epiCol[5] = 388; + common.epiCol[6] = 361; + common.epiCol[7] = 331; + common.epiCol[8] = 301; + common.epiCol[9] = 273; + common.epiCol[10] = 243; + common.epiCol[11] = 218; + common.epiCol[12] = 196; + common.epiCol[13] = 178; + common.epiCol[14] = 166; + common.epiCol[15] = 157; + common.epiCol[16] = 155; + common.epiCol[17] = 165; + common.epiCol[18] = 177; + common.epiCol[19] = 197; + common.epiCol[20] = 218; + common.epiCol[21] = 248; + common.epiCol[22] = 276; + common.epiCol[23] = 304; + common.epiCol[24] = 333; + common.epiCol[25] = 361; + common.epiCol[26] = 391; + common.epiCol[27] = 415; + common.epiCol[28] = 434; + common.epiCol[29] = 448; + common.epiCol[30] = 455; + cudaMalloc((void **)&common.d_epiCol, common.epi_mem); + cudaMemcpy(common.d_epiCol, common.epiCol, common.epi_mem, + cudaMemcpyHostToDevice); + + common.tEpiRowLoc = (int *)malloc(common.epi_mem * common.no_frames); + cudaMalloc((void **)&common.d_tEpiRowLoc, common.epi_mem * common.no_frames); + + common.tEpiColLoc = (int *)malloc(common.epi_mem * common.no_frames); + cudaMalloc((void **)&common.d_tEpiColLoc, common.epi_mem * common.no_frames); + + 
//==================================================================================================== + // ALL POINTS + //==================================================================================================== + + common.allPoints = ALL_POINTS; + + //====================================================================================================================================================== + // TEMPLATE SIZES + //====================================================================================================================================================== + + // common + common.in_rows = common.tSize + 1 + common.tSize; + common.in_cols = common.in_rows; + common.in_elem = common.in_rows * common.in_cols; + common.in_mem = sizeof(fp) * common.in_elem; + + //====================================================================================================================================================== + // CREATE ARRAY OF TEMPLATES FOR ALL POINTS + //====================================================================================================================================================== + + // common + cudaMalloc((void **)&common.d_endoT, common.in_mem * common.endoPoints); + cudaMalloc((void **)&common.d_epiT, common.in_mem * common.epiPoints); + + //====================================================================================================================================================== + // SPECIFIC TO ENDO OR EPI TO BE SET HERE + //====================================================================================================================================================== + + for (i = 0; i < common.endoPoints; i++) { + unique[i].point_no = i; + unique[i].d_Row = common.d_endoRow; + unique[i].d_Col = common.d_endoCol; + unique[i].d_tRowLoc = common.d_tEndoRowLoc; + unique[i].d_tColLoc = common.d_tEndoColLoc; + unique[i].d_T = common.d_endoT; + } + for (i = common.endoPoints; i < common.allPoints; 
i++) { + unique[i].point_no = i - common.endoPoints; + unique[i].d_Row = common.d_epiRow; + unique[i].d_Col = common.d_epiCol; + unique[i].d_tRowLoc = common.d_tEpiRowLoc; + unique[i].d_tColLoc = common.d_tEpiColLoc; + unique[i].d_T = common.d_epiT; + } + + //====================================================================================================================================================== + // RIGHT TEMPLATE FROM TEMPLATE ARRAY + //====================================================================================================================================================== + + // pointers + for (i = 0; i < common.allPoints; i++) { + unique[i].in_pointer = unique[i].point_no * common.in_elem; + } + + //====================================================================================================================================================== + // AREA AROUND POINT FROM FRAME + //====================================================================================================================================================== + + // common + common.in2_rows = 2 * common.sSize + 1; + common.in2_cols = 2 * common.sSize + 1; + common.in2_elem = common.in2_rows * common.in2_cols; + common.in2_mem = sizeof(float) * common.in2_elem; + + // pointers + for (i = 0; i < common.allPoints; i++) { + cudaMalloc((void **)&unique[i].d_in2, common.in2_mem); + } + + //====================================================================================================================================================== + // CONVOLUTION + //====================================================================================================================================================== + + // common + common.conv_rows = + common.in_rows + common.in2_rows - 1; // number of rows in I + common.conv_cols = + common.in_cols + common.in2_cols - 1; // number of columns in I + common.conv_elem = common.conv_rows * common.conv_cols; // number of elements + 
common.conv_mem = sizeof(float) * common.conv_elem; + common.ioffset = 0; + common.joffset = 0; + + // pointers + for (i = 0; i < common.allPoints; i++) { + cudaMalloc((void **)&unique[i].d_conv, common.conv_mem); + } + + //====================================================================================================================================================== + // CUMULATIVE SUM + //====================================================================================================================================================== + + //==================================================================================================== + // PADDING OF ARRAY, VERTICAL CUMULATIVE SUM + //==================================================================================================== + + // common + common.in2_pad_add_rows = common.in_rows; + common.in2_pad_add_cols = common.in_cols; + + common.in2_pad_cumv_rows = common.in2_rows + 2 * common.in2_pad_add_rows; + common.in2_pad_cumv_cols = common.in2_cols + 2 * common.in2_pad_add_cols; + common.in2_pad_cumv_elem = + common.in2_pad_cumv_rows * common.in2_pad_cumv_cols; + common.in2_pad_cumv_mem = sizeof(float) * common.in2_pad_cumv_elem; + + // pointers + for (i = 0; i < common.allPoints; i++) { + cudaMalloc((void **)&unique[i].d_in2_pad_cumv, common.in2_pad_cumv_mem); + } + + //==================================================================================================== + // SELECTION + //==================================================================================================== + + // common + common.in2_pad_cumv_sel_rowlow = 1 + common.in_rows; // (1 to n+1) + common.in2_pad_cumv_sel_rowhig = common.in2_pad_cumv_rows - 1; + common.in2_pad_cumv_sel_collow = 1; + common.in2_pad_cumv_sel_colhig = common.in2_pad_cumv_cols; + common.in2_pad_cumv_sel_rows = + common.in2_pad_cumv_sel_rowhig - common.in2_pad_cumv_sel_rowlow + 1; + common.in2_pad_cumv_sel_cols = + 
common.in2_pad_cumv_sel_colhig - common.in2_pad_cumv_sel_collow + 1; + common.in2_pad_cumv_sel_elem = + common.in2_pad_cumv_sel_rows * common.in2_pad_cumv_sel_cols; + common.in2_pad_cumv_sel_mem = sizeof(float) * common.in2_pad_cumv_sel_elem; + + // pointers + for (i = 0; i < common.allPoints; i++) { + cudaMalloc((void **)&unique[i].d_in2_pad_cumv_sel, + common.in2_pad_cumv_sel_mem); + } + + //==================================================================================================== + // SELECTION 2, SUBTRACTION, HORIZONTAL CUMULATIVE SUM + //==================================================================================================== + + // common + common.in2_pad_cumv_sel2_rowlow = 1; + common.in2_pad_cumv_sel2_rowhig = + common.in2_pad_cumv_rows - common.in_rows - 1; + common.in2_pad_cumv_sel2_collow = 1; + common.in2_pad_cumv_sel2_colhig = common.in2_pad_cumv_cols; + common.in2_sub_cumh_rows = + common.in2_pad_cumv_sel2_rowhig - common.in2_pad_cumv_sel2_rowlow + 1; + common.in2_sub_cumh_cols = + common.in2_pad_cumv_sel2_colhig - common.in2_pad_cumv_sel2_collow + 1; + common.in2_sub_cumh_elem = + common.in2_sub_cumh_rows * common.in2_sub_cumh_cols; + common.in2_sub_cumh_mem = sizeof(float) * common.in2_sub_cumh_elem; + + // pointers + for (i = 0; i < common.allPoints; i++) { + cudaMalloc((void **)&unique[i].d_in2_sub_cumh, common.in2_sub_cumh_mem); + } + + //==================================================================================================== + // SELECTION + //==================================================================================================== + + // common + common.in2_sub_cumh_sel_rowlow = 1; + common.in2_sub_cumh_sel_rowhig = common.in2_sub_cumh_rows; + common.in2_sub_cumh_sel_collow = 1 + common.in_cols; + common.in2_sub_cumh_sel_colhig = common.in2_sub_cumh_cols - 1; + common.in2_sub_cumh_sel_rows = + common.in2_sub_cumh_sel_rowhig - common.in2_sub_cumh_sel_rowlow + 1; + common.in2_sub_cumh_sel_cols = + 
common.in2_sub_cumh_sel_colhig - common.in2_sub_cumh_sel_collow + 1; + common.in2_sub_cumh_sel_elem = + common.in2_sub_cumh_sel_rows * common.in2_sub_cumh_sel_cols; + common.in2_sub_cumh_sel_mem = sizeof(float) * common.in2_sub_cumh_sel_elem; + + // pointers + for (i = 0; i < common.allPoints; i++) { + cudaMalloc((void **)&unique[i].d_in2_sub_cumh_sel, + common.in2_sub_cumh_sel_mem); + } + + //==================================================================================================== + // SELECTION 2, SUBTRACTION + //==================================================================================================== + + // common + common.in2_sub_cumh_sel2_rowlow = 1; + common.in2_sub_cumh_sel2_rowhig = common.in2_sub_cumh_rows; + common.in2_sub_cumh_sel2_collow = 1; + common.in2_sub_cumh_sel2_colhig = + common.in2_sub_cumh_cols - common.in_cols - 1; + common.in2_sub2_rows = + common.in2_sub_cumh_sel2_rowhig - common.in2_sub_cumh_sel2_rowlow + 1; + common.in2_sub2_cols = + common.in2_sub_cumh_sel2_colhig - common.in2_sub_cumh_sel2_collow + 1; + common.in2_sub2_elem = common.in2_sub2_rows * common.in2_sub2_cols; + common.in2_sub2_mem = sizeof(float) * common.in2_sub2_elem; + + // pointers + for (i = 0; i < common.allPoints; i++) { + cudaMalloc((void **)&unique[i].d_in2_sub2, common.in2_sub2_mem); + } + + //====================================================================================================================================================== + // CUMULATIVE SUM 2 + //====================================================================================================================================================== + + //==================================================================================================== + // MULTIPLICATION + //==================================================================================================== + + // common + common.in2_sqr_rows = common.in2_rows; + common.in2_sqr_cols = common.in2_cols; + 
common.in2_sqr_elem = common.in2_elem; + common.in2_sqr_mem = common.in2_mem; + + // pointers + for (i = 0; i < common.allPoints; i++) { + cudaMalloc((void **)&unique[i].d_in2_sqr, common.in2_sqr_mem); + } + + //==================================================================================================== + // SELECTION 2, SUBTRACTION + //==================================================================================================== + + // common + common.in2_sqr_sub2_rows = common.in2_sub2_rows; + common.in2_sqr_sub2_cols = common.in2_sub2_cols; + common.in2_sqr_sub2_elem = common.in2_sub2_elem; + common.in2_sqr_sub2_mem = common.in2_sub2_mem; + + // pointers + for (i = 0; i < common.allPoints; i++) { + cudaMalloc((void **)&unique[i].d_in2_sqr_sub2, common.in2_sqr_sub2_mem); + } + + //====================================================================================================================================================== + // FINAL + //====================================================================================================================================================== + + // common + common.in_sqr_rows = common.in_rows; + common.in_sqr_cols = common.in_cols; + common.in_sqr_elem = common.in_elem; + common.in_sqr_mem = common.in_mem; + + // pointers + for (i = 0; i < common.allPoints; i++) { + cudaMalloc((void **)&unique[i].d_in_sqr, common.in_sqr_mem); + } + + //====================================================================================================================================================== + // TEMPLATE MASK CREATE + //====================================================================================================================================================== + + // common + common.tMask_rows = common.in_rows + (common.sSize + 1 + common.sSize) - 1; + common.tMask_cols = common.tMask_rows; + common.tMask_elem = common.tMask_rows * common.tMask_cols; + common.tMask_mem = sizeof(float) * 
common.tMask_elem; + + // pointers + for (i = 0; i < common.allPoints; i++) { + cudaMalloc((void **)&unique[i].d_tMask, common.tMask_mem); + } + + //====================================================================================================================================================== + // POINT MASK INITIALIZE + //====================================================================================================================================================== + + // common + common.mask_rows = common.maxMove; + common.mask_cols = common.mask_rows; + common.mask_elem = common.mask_rows * common.mask_cols; + common.mask_mem = sizeof(float) * common.mask_elem; + + //====================================================================================================================================================== + // MASK CONVOLUTION + //====================================================================================================================================================== + + // common + common.mask_conv_rows = common.tMask_rows; // number of rows in I + common.mask_conv_cols = common.tMask_cols; // number of columns in I + common.mask_conv_elem = + common.mask_conv_rows * common.mask_conv_cols; // number of elements + common.mask_conv_mem = sizeof(float) * common.mask_conv_elem; + common.mask_conv_ioffset = (common.mask_rows - 1) / 2; + if ((common.mask_rows - 1) % 2 > 0.5) { + common.mask_conv_ioffset = common.mask_conv_ioffset + 1; + } + common.mask_conv_joffset = (common.mask_cols - 1) / 2; + if ((common.mask_cols - 1) % 2 > 0.5) { + common.mask_conv_joffset = common.mask_conv_joffset + 1; + } + + // pointers + for (i = 0; i < common.allPoints; i++) { + cudaMalloc((void **)&unique[i].d_mask_conv, common.mask_conv_mem); + } + + //====================================================================================================================================================== + // KERNEL + 
//====================================================================================================================================================== + + //==================================================================================================== + // THREAD BLOCK + //==================================================================================================== + + // All kernels operations within kernel use same max size of threads. Size of + // block size is set to the size appropriate for max size operation (on padded + // matrix). Other use subsets of that. + threads.x = NUMBER_THREADS; // define the number of threads in the block + threads.y = 1; + blocks.x = common.allPoints; // define the number of blocks in the grid + blocks.y = 1; + + //==================================================================================================== + // COPY ARGUMENTS + //==================================================================================================== + + cudaMemcpyToSymbol(d_common, &common, sizeof(params_common)); + cudaMemcpyToSymbol(d_unique, &unique, sizeof(params_unique) * ALL_POINTS); + + //==================================================================================================== + // PRINT FRAME PROGRESS START + //==================================================================================================== + + printf("frame progress: "); + fflush(NULL); + + //==================================================================================================== + // LAUNCH + //==================================================================================================== + + for (common_change.frame_no = 0; common_change.frame_no < frames_processed; + common_change.frame_no++) { + printf("get frame\n"); + // Extract a cropped version of the first frame from the video file + frame = get_frame( + frames, // pointer to video file + common_change.frame_no, // number of frame that needs to be returned + 0, // 
cropped? + 0, // scaled? + 1); // converted + printf("memcpy\n"); + // copy frame to GPU memory + cudaMemcpy(common_change.d_frame, frame, common.frame_mem, + cudaMemcpyHostToDevice); + printf("toSymbol\n"); + cudaMemcpyToSymbol(d_common_change, &common_change, + sizeof(params_common_change)); + + // launch GPU kernel + printf("launch\n"); + kernel<<<1, 32>>>(); + cudaDeviceSynchronize(); + printf("return\n"); + // free frame after each loop iteration, since AVI library allocates memory + // for every frame fetched + printf("free\n"); + free(frame); + + // print frame progress + printf("%d ", common_change.frame_no); + fflush(NULL); + } + + //==================================================================================================== + // PRINT FRAME PROGRESS END + //==================================================================================================== + + printf("\n"); + fflush(NULL); + + //==================================================================================================== + // OUTPUT + //==================================================================================================== + + cudaMemcpy(common.tEndoRowLoc, common.d_tEndoRowLoc, + common.endo_mem * common.no_frames, cudaMemcpyDeviceToHost); + cudaMemcpy(common.tEndoColLoc, common.d_tEndoColLoc, + common.endo_mem * common.no_frames, cudaMemcpyDeviceToHost); + + cudaMemcpy(common.tEpiRowLoc, common.d_tEpiRowLoc, + common.epi_mem * common.no_frames, cudaMemcpyDeviceToHost); + cudaMemcpy(common.tEpiColLoc, common.d_tEpiColLoc, + common.epi_mem * common.no_frames, cudaMemcpyDeviceToHost); + +#ifdef OUTPUT + + //==================================================50 + // DUMP DATA TO FILE + //==================================================50 + write_data("result.txt", common.no_frames, frames_processed, + common.endoPoints, common.tEndoRowLoc, common.tEndoColLoc, + common.epiPoints, common.tEpiRowLoc, common.tEpiColLoc); + + 
//==================================================50 + // End + //==================================================50 + +#endif + + //====================================================================================================================================================== + // DEALLOCATION + //====================================================================================================================================================== + + //==================================================================================================== + // COMMON + //==================================================================================================== + + // frame + cudaFree(common_change.d_frame); + + // endo points + free(common.endoRow); + free(common.endoCol); + free(common.tEndoRowLoc); + free(common.tEndoColLoc); + + cudaFree(common.d_endoRow); + cudaFree(common.d_endoCol); + cudaFree(common.d_tEndoRowLoc); + cudaFree(common.d_tEndoColLoc); + + cudaFree(common.d_endoT); + + // epi points + free(common.epiRow); + free(common.epiCol); + free(common.tEpiRowLoc); + free(common.tEpiColLoc); + + cudaFree(common.d_epiRow); + cudaFree(common.d_epiCol); + cudaFree(common.d_tEpiRowLoc); + cudaFree(common.d_tEpiColLoc); + + cudaFree(common.d_epiT); + + //==================================================================================================== + // POINTERS + //==================================================================================================== + + for (i = 0; i < common.allPoints; i++) { + cudaFree(unique[i].d_in2); + + cudaFree(unique[i].d_conv); + cudaFree(unique[i].d_in2_pad_cumv); + cudaFree(unique[i].d_in2_pad_cumv_sel); + cudaFree(unique[i].d_in2_sub_cumh); + cudaFree(unique[i].d_in2_sub_cumh_sel); + cudaFree(unique[i].d_in2_sub2); + cudaFree(unique[i].d_in2_sqr); + cudaFree(unique[i].d_in2_sqr_sub2); + cudaFree(unique[i].d_in_sqr); + + cudaFree(unique[i].d_tMask); + cudaFree(unique[i].d_mask_conv); + } +} + 
+//===============================================================================================================================================================================================================
+//===============================================================================================================================================================================================================
+// MAIN FUNCTION
+//===============================================================================================================================================================================================================
+//===============================================================================================================================================================================================================
diff --git a/examples/heartwall/run.sh b/examples/heartwall/run.sh
new file mode 100644
index 0000000..53465a2
--- /dev/null
+++ b/examples/heartwall/run.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+#
+# Build the heartwall example with clang, translate it with the COX
+# kernel/host translators, link it against the COX runtime and run it.
+#
+# COX_ROOT selects the repository checkout that contains build/ and
+# runtime/; it defaults to the path that used to be hard-coded here.
+
+# Stop at the first failing step instead of linking/running stale outputs.
+set -e
+
+COX_ROOT="${COX_ROOT:-/home/robinhan/repo/open_source_template}"
+
+# Build the AVI helper objects; the link step expects ./AVI/avilib.o and ./AVI/avimod.o.
+make -C AVI
+
+# -save-temps keeps the intermediate bitcode files that the translators consume below.
+clang++ -DOUTPUT main.cu -I./AVI --cuda-path=/usr/local/cuda-10.1 --cuda-gpu-arch=sm_61 -L/usr/local/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v
+
+# Translate the device and host bitcode.
+"${COX_ROOT}/build/compilation/kernelTranslator" main-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
+"${COX_ROOT}/build/compilation/hostTranslator" main-host-x86_64-unknown-linux-gnu.bc host.bc
+
+# Compile the translated bitcode to position-independent objects.
+llc --relocation-model=pic --filetype=obj kernel.bc
+llc --relocation-model=pic --filetype=obj host.bc
+
+# Link the objects with the x86 runtime and thread pool libraries.
+g++ -Wall -L"${COX_ROOT}/build/runtime" -L"${COX_ROOT}/build/runtime/threadPool" -o heartwall -fPIC -no-pie host.o kernel.o ./AVI/avilib.o ./AVI/avimod.o -lc -lx86Runtime -lthreadPool -lpthread
+
+./heartwall "${COX_ROOT}/runtime/examples/rodinia-data/heartwall/test.avi" 20
diff --git
a/examples/heartwall/setdevice.cu b/examples/heartwall/setdevice.cu new file mode 100755 index 0000000..d27bb48 --- /dev/null +++ b/examples/heartwall/setdevice.cu @@ -0,0 +1,5 @@ +//////////////////////////////////////////////////////////////////////////////// +// Set Device +//////////////////////////////////////////////////////////////////////////////// + +void setdevice(void) { cudaSetDevice(0); } diff --git a/examples/hotspot/hotspot-cuda-nvptx64-nvidia-cuda-sm_61.ll b/examples/hotspot/hotspot-cuda-nvptx64-nvidia-cuda-sm_61.ll new file mode 100644 index 0000000..90f6f17 --- /dev/null +++ b/examples/hotspot/hotspot-cuda-nvptx64-nvidia-cuda-sm_61.ll @@ -0,0 +1,719 @@ +; ModuleID = 'hotspot-cuda-nvptx64-nvidia-cuda-sm_61.bc' +source_filename = "hotspot.cu" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +%struct.__cuda_builtin_blockIdx_t = type { i8 } +%struct.__cuda_builtin_threadIdx_t = type { i8 } +%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 } + +$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any + +$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv = comdat any + +$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any + +$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv = comdat any + +@_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda = internal addrspace(3) global [16 x [16 x float]] undef, align 4 +@_ZZ14calculate_tempiPfS_S_iiiiffffffE13power_on_cuda = internal addrspace(3) global [16 x [16 x float]] undef, align 4 +@_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t = internal addrspace(3) global [16 x [16 x float]] undef, align 4 +@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1 +@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1 + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaMalloc(i8** %p, 
i64 %s) #0 { +entry: + %p.addr = alloca i8**, align 8 + %s.addr = alloca i64, align 8 + store i8** %p, i8*** %p.addr, align 8 + store i64 %s, i64* %s.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 { +entry: + %p.addr = alloca %struct.cudaFuncAttributes*, align 8 + %c.addr = alloca i8*, align 8 + store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8 + store i8* %c, i8** %c.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 { +entry: + %value.addr = alloca i32*, align 8 + %attr.addr = alloca i32, align 4 + %device.addr = alloca i32, align 4 + store i32* %value, i32** %value.addr, align 8 + store i32 %attr, i32* %attr.addr, align 4 + store i32 %device, i32* %device.addr, align 4 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaGetDevice(i32* %device) #0 { +entry: + %device.addr = alloca i32*, align 8 + store i32* %device, i32** %device.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 { +entry: + %numBlocks.addr = alloca i32*, align 8 + %func.addr = alloca i8*, align 8 + %blockSize.addr = alloca i32, align 4 + %dynamicSmemSize.addr = alloca i64, align 8 + store i32* %numBlocks, i32** %numBlocks.addr, align 8 + store i8* %func, i8** %func.addr, align 8 + store i32 %blockSize, i32* %blockSize.addr, align 4 + store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* 
%func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 { +entry: + %numBlocks.addr = alloca i32*, align 8 + %func.addr = alloca i8*, align 8 + %blockSize.addr = alloca i32, align 4 + %dynamicSmemSize.addr = alloca i64, align 8 + %flags.addr = alloca i32, align 4 + store i32* %numBlocks, i32** %numBlocks.addr, align 8 + store i8* %func, i8** %func.addr, align 8 + store i32 %blockSize, i32* %blockSize.addr, align 4 + store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8 + store i32 %flags, i32* %flags.addr, align 4 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define dso_local void @_Z14calculate_tempiPfS_S_iiiiffffff(i32 %iteration, float* %power, float* %temp_src, float* %temp_dst, i32 %grid_cols, i32 %grid_rows, i32 %border_cols, i32 %border_rows, float %Cap, float %Rx, float %Ry, float %Rz, float %step, float %time_elapsed) #0 { +entry: + %iteration.addr = alloca i32, align 4 + %power.addr = alloca float*, align 8 + %temp_src.addr = alloca float*, align 8 + %temp_dst.addr = alloca float*, align 8 + %grid_cols.addr = alloca i32, align 4 + %grid_rows.addr = alloca i32, align 4 + %border_cols.addr = alloca i32, align 4 + %border_rows.addr = alloca i32, align 4 + %Cap.addr = alloca float, align 4 + %Rx.addr = alloca float, align 4 + %Ry.addr = alloca float, align 4 + %Rz.addr = alloca float, align 4 + %step.addr = alloca float, align 4 + %time_elapsed.addr = alloca float, align 4 + %amb_temp = alloca float, align 4 + %step_div_Cap = alloca float, align 4 + %Rx_1 = alloca float, align 4 + %Ry_1 = alloca float, align 4 + %Rz_1 = alloca float, align 4 + %bx = alloca i32, align 4 + %by = alloca i32, align 4 + %tx = alloca i32, align 4 + %ty = alloca i32, align 4 + %small_block_rows = alloca i32, align 4 + %small_block_cols = alloca i32, align 4 + %blkY = alloca i32, align 4 + %blkX = alloca i32, align 4 + %blkYmax = alloca i32, align 4 + %blkXmax = alloca i32, align 4 + %yidx = alloca i32, align 4 + %xidx = alloca i32, align 4 
+ %loadYidx = alloca i32, align 4 + %loadXidx = alloca i32, align 4 + %index = alloca i32, align 4 + %validYmin = alloca i32, align 4 + %validYmax = alloca i32, align 4 + %validXmin = alloca i32, align 4 + %validXmax = alloca i32, align 4 + %N = alloca i32, align 4 + %S = alloca i32, align 4 + %W = alloca i32, align 4 + %E = alloca i32, align 4 + %computed = alloca i8, align 1 + %i = alloca i32, align 4 + store i32 %iteration, i32* %iteration.addr, align 4 + store float* %power, float** %power.addr, align 8 + store float* %temp_src, float** %temp_src.addr, align 8 + store float* %temp_dst, float** %temp_dst.addr, align 8 + store i32 %grid_cols, i32* %grid_cols.addr, align 4 + store i32 %grid_rows, i32* %grid_rows.addr, align 4 + store i32 %border_cols, i32* %border_cols.addr, align 4 + store i32 %border_rows, i32* %border_rows.addr, align 4 + store float %Cap, float* %Cap.addr, align 4 + store float %Rx, float* %Rx.addr, align 4 + store float %Ry, float* %Ry.addr, align 4 + store float %Rz, float* %Rz.addr, align 4 + store float %step, float* %step.addr, align 4 + store float %time_elapsed, float* %time_elapsed.addr, align 4 + store float 8.000000e+01, float* %amb_temp, align 4 + %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2 + store i32 %call, i32* %bx, align 4 + %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #2 + store i32 %call1, i32* %by, align 4 + %call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + store i32 %call2, i32* %tx, align 4 + %call3 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #2 + store i32 %call3, i32* %ty, align 4 + %0 = load float, float* %step.addr, align 4 + %1 = load float, float* %Cap.addr, align 4 + %div = fdiv float %0, %1 + store float %div, float* %step_div_Cap, align 4 + %2 = load float, float* %Rx.addr, align 4 + %div4 = fdiv float 1.000000e+00, %2 + store float %div4, float* %Rx_1, align 4 + %3 = load float, float* %Ry.addr, align 4 + 
%div5 = fdiv float 1.000000e+00, %3 + store float %div5, float* %Ry_1, align 4 + %4 = load float, float* %Rz.addr, align 4 + %div6 = fdiv float 1.000000e+00, %4 + store float %div6, float* %Rz_1, align 4 + %5 = load i32, i32* %iteration.addr, align 4 + %mul = mul nsw i32 %5, 2 + %sub = sub nsw i32 16, %mul + store i32 %sub, i32* %small_block_rows, align 4 + %6 = load i32, i32* %iteration.addr, align 4 + %mul7 = mul nsw i32 %6, 2 + %sub8 = sub nsw i32 16, %mul7 + store i32 %sub8, i32* %small_block_cols, align 4 + %7 = load i32, i32* %small_block_rows, align 4 + %8 = load i32, i32* %by, align 4 + %mul9 = mul nsw i32 %7, %8 + %9 = load i32, i32* %border_rows.addr, align 4 + %sub10 = sub nsw i32 %mul9, %9 + store i32 %sub10, i32* %blkY, align 4 + %10 = load i32, i32* %small_block_cols, align 4 + %11 = load i32, i32* %bx, align 4 + %mul11 = mul nsw i32 %10, %11 + %12 = load i32, i32* %border_cols.addr, align 4 + %sub12 = sub nsw i32 %mul11, %12 + store i32 %sub12, i32* %blkX, align 4 + %13 = load i32, i32* %blkY, align 4 + %add = add nsw i32 %13, 16 + %sub13 = sub nsw i32 %add, 1 + store i32 %sub13, i32* %blkYmax, align 4 + %14 = load i32, i32* %blkX, align 4 + %add14 = add nsw i32 %14, 16 + %sub15 = sub nsw i32 %add14, 1 + store i32 %sub15, i32* %blkXmax, align 4 + %15 = load i32, i32* %blkY, align 4 + %16 = load i32, i32* %ty, align 4 + %add16 = add nsw i32 %15, %16 + store i32 %add16, i32* %yidx, align 4 + %17 = load i32, i32* %blkX, align 4 + %18 = load i32, i32* %tx, align 4 + %add17 = add nsw i32 %17, %18 + store i32 %add17, i32* %xidx, align 4 + %19 = load i32, i32* %yidx, align 4 + store i32 %19, i32* %loadYidx, align 4 + %20 = load i32, i32* %xidx, align 4 + store i32 %20, i32* %loadXidx, align 4 + %21 = load i32, i32* %grid_cols.addr, align 4 + %22 = load i32, i32* %loadYidx, align 4 + %mul18 = mul nsw i32 %21, %22 + %23 = load i32, i32* %loadXidx, align 4 + %add19 = add nsw i32 %mul18, %23 + store i32 %add19, i32* %index, align 4 + %24 = load i32, i32* 
%loadYidx, align 4 + %cmp = icmp sge i32 %24, 0 + br i1 %cmp, label %land.lhs.true, label %if.end + +land.lhs.true: ; preds = %entry + %25 = load i32, i32* %loadYidx, align 4 + %26 = load i32, i32* %grid_rows.addr, align 4 + %sub20 = sub nsw i32 %26, 1 + %cmp21 = icmp sle i32 %25, %sub20 + br i1 %cmp21, label %land.lhs.true22, label %if.end + +land.lhs.true22: ; preds = %land.lhs.true + %27 = load i32, i32* %loadXidx, align 4 + %cmp23 = icmp sge i32 %27, 0 + br i1 %cmp23, label %land.lhs.true24, label %if.end + +land.lhs.true24: ; preds = %land.lhs.true22 + %28 = load i32, i32* %loadXidx, align 4 + %29 = load i32, i32* %grid_cols.addr, align 4 + %sub25 = sub nsw i32 %29, 1 + %cmp26 = icmp sle i32 %28, %sub25 + br i1 %cmp26, label %if.then, label %if.end + +if.then: ; preds = %land.lhs.true24 + %30 = load float*, float** %temp_src.addr, align 8 + %31 = load i32, i32* %index, align 4 + %idxprom = sext i32 %31 to i64 + %arrayidx = getelementptr inbounds float, float* %30, i64 %idxprom + %32 = load float, float* %arrayidx, align 4 + %33 = load i32, i32* %ty, align 4 + %idxprom27 = sext i32 %33 to i64 + %arrayidx28 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom27 + %34 = load i32, i32* %tx, align 4 + %idxprom29 = sext i32 %34 to i64 + %arrayidx30 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx28, i64 0, i64 %idxprom29 + store float %32, float* %arrayidx30, align 4 + %35 = load float*, float** %power.addr, align 8 + %36 = load i32, i32* %index, align 4 + %idxprom31 = sext i32 %36 to i64 + %arrayidx32 = getelementptr inbounds float, float* %35, i64 %idxprom31 + %37 = load float, float* %arrayidx32, align 4 + %38 = load i32, i32* %ty, align 4 + %idxprom33 = sext i32 %38 to i64 + %arrayidx34 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] 
addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE13power_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom33 + %39 = load i32, i32* %tx, align 4 + %idxprom35 = sext i32 %39 to i64 + %arrayidx36 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx34, i64 0, i64 %idxprom35 + store float %37, float* %arrayidx36, align 4 + br label %if.end + +if.end: ; preds = %if.then, %land.lhs.true24, %land.lhs.true22, %land.lhs.true, %entry + call void @llvm.nvvm.barrier0() + %40 = load i32, i32* %blkY, align 4 + %cmp37 = icmp slt i32 %40, 0 + br i1 %cmp37, label %cond.true, label %cond.false + +cond.true: ; preds = %if.end + %41 = load i32, i32* %blkY, align 4 + %sub38 = sub nsw i32 0, %41 + br label %cond.end + +cond.false: ; preds = %if.end + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ %sub38, %cond.true ], [ 0, %cond.false ] + store i32 %cond, i32* %validYmin, align 4 + %42 = load i32, i32* %blkYmax, align 4 + %43 = load i32, i32* %grid_rows.addr, align 4 + %sub39 = sub nsw i32 %43, 1 + %cmp40 = icmp sgt i32 %42, %sub39 + br i1 %cmp40, label %cond.true41, label %cond.false45 + +cond.true41: ; preds = %cond.end + %44 = load i32, i32* %blkYmax, align 4 + %45 = load i32, i32* %grid_rows.addr, align 4 + %sub42 = sub nsw i32 %44, %45 + %add43 = add nsw i32 %sub42, 1 + %sub44 = sub nsw i32 15, %add43 + br label %cond.end46 + +cond.false45: ; preds = %cond.end + br label %cond.end46 + +cond.end46: ; preds = %cond.false45, %cond.true41 + %cond47 = phi i32 [ %sub44, %cond.true41 ], [ 15, %cond.false45 ] + store i32 %cond47, i32* %validYmax, align 4 + %46 = load i32, i32* %blkX, align 4 + %cmp48 = icmp slt i32 %46, 0 + br i1 %cmp48, label %cond.true49, label %cond.false51 + +cond.true49: ; preds = %cond.end46 + %47 = load i32, i32* %blkX, align 4 + %sub50 = sub nsw i32 0, %47 + br label %cond.end52 + +cond.false51: ; preds = %cond.end46 + br label %cond.end52 + +cond.end52: ; preds = %cond.false51, %cond.true49 + %cond53 = phi 
i32 [ %sub50, %cond.true49 ], [ 0, %cond.false51 ] + store i32 %cond53, i32* %validXmin, align 4 + %48 = load i32, i32* %blkXmax, align 4 + %49 = load i32, i32* %grid_cols.addr, align 4 + %sub54 = sub nsw i32 %49, 1 + %cmp55 = icmp sgt i32 %48, %sub54 + br i1 %cmp55, label %cond.true56, label %cond.false60 + +cond.true56: ; preds = %cond.end52 + %50 = load i32, i32* %blkXmax, align 4 + %51 = load i32, i32* %grid_cols.addr, align 4 + %sub57 = sub nsw i32 %50, %51 + %add58 = add nsw i32 %sub57, 1 + %sub59 = sub nsw i32 15, %add58 + br label %cond.end61 + +cond.false60: ; preds = %cond.end52 + br label %cond.end61 + +cond.end61: ; preds = %cond.false60, %cond.true56 + %cond62 = phi i32 [ %sub59, %cond.true56 ], [ 15, %cond.false60 ] + store i32 %cond62, i32* %validXmax, align 4 + %52 = load i32, i32* %ty, align 4 + %sub63 = sub nsw i32 %52, 1 + store i32 %sub63, i32* %N, align 4 + %53 = load i32, i32* %ty, align 4 + %add64 = add nsw i32 %53, 1 + store i32 %add64, i32* %S, align 4 + %54 = load i32, i32* %tx, align 4 + %sub65 = sub nsw i32 %54, 1 + store i32 %sub65, i32* %W, align 4 + %55 = load i32, i32* %tx, align 4 + %add66 = add nsw i32 %55, 1 + store i32 %add66, i32* %E, align 4 + %56 = load i32, i32* %N, align 4 + %57 = load i32, i32* %validYmin, align 4 + %cmp67 = icmp slt i32 %56, %57 + br i1 %cmp67, label %cond.true68, label %cond.false69 + +cond.true68: ; preds = %cond.end61 + %58 = load i32, i32* %validYmin, align 4 + br label %cond.end70 + +cond.false69: ; preds = %cond.end61 + %59 = load i32, i32* %N, align 4 + br label %cond.end70 + +cond.end70: ; preds = %cond.false69, %cond.true68 + %cond71 = phi i32 [ %58, %cond.true68 ], [ %59, %cond.false69 ] + store i32 %cond71, i32* %N, align 4 + %60 = load i32, i32* %S, align 4 + %61 = load i32, i32* %validYmax, align 4 + %cmp72 = icmp sgt i32 %60, %61 + br i1 %cmp72, label %cond.true73, label %cond.false74 + +cond.true73: ; preds = %cond.end70 + %62 = load i32, i32* %validYmax, align 4 + br label %cond.end75 + 
+cond.false74: ; preds = %cond.end70 + %63 = load i32, i32* %S, align 4 + br label %cond.end75 + +cond.end75: ; preds = %cond.false74, %cond.true73 + %cond76 = phi i32 [ %62, %cond.true73 ], [ %63, %cond.false74 ] + store i32 %cond76, i32* %S, align 4 + %64 = load i32, i32* %W, align 4 + %65 = load i32, i32* %validXmin, align 4 + %cmp77 = icmp slt i32 %64, %65 + br i1 %cmp77, label %cond.true78, label %cond.false79 + +cond.true78: ; preds = %cond.end75 + %66 = load i32, i32* %validXmin, align 4 + br label %cond.end80 + +cond.false79: ; preds = %cond.end75 + %67 = load i32, i32* %W, align 4 + br label %cond.end80 + +cond.end80: ; preds = %cond.false79, %cond.true78 + %cond81 = phi i32 [ %66, %cond.true78 ], [ %67, %cond.false79 ] + store i32 %cond81, i32* %W, align 4 + %68 = load i32, i32* %E, align 4 + %69 = load i32, i32* %validXmax, align 4 + %cmp82 = icmp sgt i32 %68, %69 + br i1 %cmp82, label %cond.true83, label %cond.false84 + +cond.true83: ; preds = %cond.end80 + %70 = load i32, i32* %validXmax, align 4 + br label %cond.end85 + +cond.false84: ; preds = %cond.end80 + %71 = load i32, i32* %E, align 4 + br label %cond.end85 + +cond.end85: ; preds = %cond.false84, %cond.true83 + %cond86 = phi i32 [ %70, %cond.true83 ], [ %71, %cond.false84 ] + store i32 %cond86, i32* %E, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %cond.end85 + %72 = load i32, i32* %i, align 4 + %73 = load i32, i32* %iteration.addr, align 4 + %cmp87 = icmp slt i32 %72, %73 + br i1 %cmp87, label %for.body, label %for.end + +for.body: ; preds = %for.cond + store i8 0, i8* %computed, align 1 + %74 = load i32, i32* %tx, align 4 + %75 = load i32, i32* %i, align 4 + %add88 = add nsw i32 %75, 1 + %cmp89 = icmp sge i32 %74, %add88 + br i1 %cmp89, label %land.lhs.true90, label %if.end175 + +land.lhs.true90: ; preds = %for.body + %76 = load i32, i32* %tx, align 4 + %77 = load i32, i32* %i, align 4 + %sub91 = sub nsw i32 16, %77 + %sub92 = sub nsw i32 
%sub91, 2 + %cmp93 = icmp sle i32 %76, %sub92 + br i1 %cmp93, label %land.lhs.true94, label %if.end175 + +land.lhs.true94: ; preds = %land.lhs.true90 + %78 = load i32, i32* %ty, align 4 + %79 = load i32, i32* %i, align 4 + %add95 = add nsw i32 %79, 1 + %cmp96 = icmp sge i32 %78, %add95 + br i1 %cmp96, label %land.lhs.true97, label %if.end175 + +land.lhs.true97: ; preds = %land.lhs.true94 + %80 = load i32, i32* %ty, align 4 + %81 = load i32, i32* %i, align 4 + %sub98 = sub nsw i32 16, %81 + %sub99 = sub nsw i32 %sub98, 2 + %cmp100 = icmp sle i32 %80, %sub99 + br i1 %cmp100, label %land.lhs.true101, label %if.end175 + +land.lhs.true101: ; preds = %land.lhs.true97 + %82 = load i32, i32* %tx, align 4 + %83 = load i32, i32* %validXmin, align 4 + %cmp102 = icmp sge i32 %82, %83 + br i1 %cmp102, label %land.lhs.true103, label %if.end175 + +land.lhs.true103: ; preds = %land.lhs.true101 + %84 = load i32, i32* %tx, align 4 + %85 = load i32, i32* %validXmax, align 4 + %cmp104 = icmp sle i32 %84, %85 + br i1 %cmp104, label %land.lhs.true105, label %if.end175 + +land.lhs.true105: ; preds = %land.lhs.true103 + %86 = load i32, i32* %ty, align 4 + %87 = load i32, i32* %validYmin, align 4 + %cmp106 = icmp sge i32 %86, %87 + br i1 %cmp106, label %land.lhs.true107, label %if.end175 + +land.lhs.true107: ; preds = %land.lhs.true105 + %88 = load i32, i32* %ty, align 4 + %89 = load i32, i32* %validYmax, align 4 + %cmp108 = icmp sle i32 %88, %89 + br i1 %cmp108, label %if.then109, label %if.end175 + +if.then109: ; preds = %land.lhs.true107 + store i8 1, i8* %computed, align 1 + %90 = load i32, i32* %ty, align 4 + %idxprom110 = sext i32 %90 to i64 + %arrayidx111 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom110 + %91 = load i32, i32* %tx, align 4 + %idxprom112 = sext i32 %91 to i64 + %arrayidx113 = getelementptr inbounds [16 
x float], [16 x float]* %arrayidx111, i64 0, i64 %idxprom112 + %92 = load float, float* %arrayidx113, align 4 + %conv = fpext float %92 to double + %93 = load float, float* %step_div_Cap, align 4 + %conv114 = fpext float %93 to double + %94 = load i32, i32* %ty, align 4 + %idxprom115 = sext i32 %94 to i64 + %arrayidx116 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE13power_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom115 + %95 = load i32, i32* %tx, align 4 + %idxprom117 = sext i32 %95 to i64 + %arrayidx118 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx116, i64 0, i64 %idxprom117 + %96 = load float, float* %arrayidx118, align 4 + %conv119 = fpext float %96 to double + %97 = load i32, i32* %S, align 4 + %idxprom120 = sext i32 %97 to i64 + %arrayidx121 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom120 + %98 = load i32, i32* %tx, align 4 + %idxprom122 = sext i32 %98 to i64 + %arrayidx123 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx121, i64 0, i64 %idxprom122 + %99 = load float, float* %arrayidx123, align 4 + %100 = load i32, i32* %N, align 4 + %idxprom124 = sext i32 %100 to i64 + %arrayidx125 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom124 + %101 = load i32, i32* %tx, align 4 + %idxprom126 = sext i32 %101 to i64 + %arrayidx127 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx125, i64 0, i64 %idxprom126 + %102 = load float, float* %arrayidx127, align 4 + %add128 = fadd contract float %99, %102 + %conv129 = fpext float %add128 to double + %103 = load i32, i32* %ty, align 4 + 
%idxprom130 = sext i32 %103 to i64 + %arrayidx131 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom130 + %104 = load i32, i32* %tx, align 4 + %idxprom132 = sext i32 %104 to i64 + %arrayidx133 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx131, i64 0, i64 %idxprom132 + %105 = load float, float* %arrayidx133, align 4 + %conv134 = fpext float %105 to double + %mul135 = fmul contract double 2.000000e+00, %conv134 + %sub136 = fsub contract double %conv129, %mul135 + %106 = load float, float* %Ry_1, align 4 + %conv137 = fpext float %106 to double + %mul138 = fmul contract double %sub136, %conv137 + %add139 = fadd contract double %conv119, %mul138 + %107 = load i32, i32* %ty, align 4 + %idxprom140 = sext i32 %107 to i64 + %arrayidx141 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom140 + %108 = load i32, i32* %E, align 4 + %idxprom142 = sext i32 %108 to i64 + %arrayidx143 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx141, i64 0, i64 %idxprom142 + %109 = load float, float* %arrayidx143, align 4 + %110 = load i32, i32* %ty, align 4 + %idxprom144 = sext i32 %110 to i64 + %arrayidx145 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom144 + %111 = load i32, i32* %W, align 4 + %idxprom146 = sext i32 %111 to i64 + %arrayidx147 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx145, i64 0, i64 %idxprom146 + %112 = load float, float* %arrayidx147, align 4 + %add148 = fadd contract float %109, %112 + %conv149 = fpext float %add148 to double + %113 = load 
i32, i32* %ty, align 4 + %idxprom150 = sext i32 %113 to i64 + %arrayidx151 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom150 + %114 = load i32, i32* %tx, align 4 + %idxprom152 = sext i32 %114 to i64 + %arrayidx153 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx151, i64 0, i64 %idxprom152 + %115 = load float, float* %arrayidx153, align 4 + %conv154 = fpext float %115 to double + %mul155 = fmul contract double 2.000000e+00, %conv154 + %sub156 = fsub contract double %conv149, %mul155 + %116 = load float, float* %Rx_1, align 4 + %conv157 = fpext float %116 to double + %mul158 = fmul contract double %sub156, %conv157 + %add159 = fadd contract double %add139, %mul158 + %117 = load float, float* %amb_temp, align 4 + %118 = load i32, i32* %ty, align 4 + %idxprom160 = sext i32 %118 to i64 + %arrayidx161 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom160 + %119 = load i32, i32* %tx, align 4 + %idxprom162 = sext i32 %119 to i64 + %arrayidx163 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx161, i64 0, i64 %idxprom162 + %120 = load float, float* %arrayidx163, align 4 + %sub164 = fsub contract float %117, %120 + %121 = load float, float* %Rz_1, align 4 + %mul165 = fmul contract float %sub164, %121 + %conv166 = fpext float %mul165 to double + %add167 = fadd contract double %add159, %conv166 + %mul168 = fmul contract double %conv114, %add167 + %add169 = fadd contract double %conv, %mul168 + %conv170 = fptrunc double %add169 to float + %122 = load i32, i32* %ty, align 4 + %idxprom171 = sext i32 %122 to i64 + %arrayidx172 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] 
addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t to [16 x [16 x float]]*), i64 0, i64 %idxprom171 + %123 = load i32, i32* %tx, align 4 + %idxprom173 = sext i32 %123 to i64 + %arrayidx174 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx172, i64 0, i64 %idxprom173 + store float %conv170, float* %arrayidx174, align 4 + br label %if.end175 + +if.end175: ; preds = %if.then109, %land.lhs.true107, %land.lhs.true105, %land.lhs.true103, %land.lhs.true101, %land.lhs.true97, %land.lhs.true94, %land.lhs.true90, %for.body + call void @llvm.nvvm.barrier0() + %124 = load i32, i32* %i, align 4 + %125 = load i32, i32* %iteration.addr, align 4 + %sub176 = sub nsw i32 %125, 1 + %cmp177 = icmp eq i32 %124, %sub176 + br i1 %cmp177, label %if.then178, label %if.end179 + +if.then178: ; preds = %if.end175 + br label %for.end + +if.end179: ; preds = %if.end175 + %126 = load i8, i8* %computed, align 1 + %tobool = trunc i8 %126 to i1 + br i1 %tobool, label %if.then180, label %if.end189 + +if.then180: ; preds = %if.end179 + %127 = load i32, i32* %ty, align 4 + %idxprom181 = sext i32 %127 to i64 + %arrayidx182 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t to [16 x [16 x float]]*), i64 0, i64 %idxprom181 + %128 = load i32, i32* %tx, align 4 + %idxprom183 = sext i32 %128 to i64 + %arrayidx184 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx182, i64 0, i64 %idxprom183 + %129 = load float, float* %arrayidx184, align 4 + %130 = load i32, i32* %ty, align 4 + %idxprom185 = sext i32 %130 to i64 + %arrayidx186 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom185 + %131 = load i32, i32* %tx, align 4 + %idxprom187 = sext i32 %131 to i64 + %arrayidx188 = getelementptr inbounds [16 x float], 
[16 x float]* %arrayidx186, i64 0, i64 %idxprom187 + store float %129, float* %arrayidx188, align 4 + br label %if.end189 + +if.end189: ; preds = %if.then180, %if.end179 + call void @llvm.nvvm.barrier0() + br label %for.inc + +for.inc: ; preds = %if.end189 + %132 = load i32, i32* %i, align 4 + %inc = add nsw i32 %132, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %if.then178, %for.cond + %133 = load i8, i8* %computed, align 1 + %tobool190 = trunc i8 %133 to i1 + br i1 %tobool190, label %if.then191, label %if.end198 + +if.then191: ; preds = %for.end + %134 = load i32, i32* %ty, align 4 + %idxprom192 = sext i32 %134 to i64 + %arrayidx193 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t to [16 x [16 x float]]*), i64 0, i64 %idxprom192 + %135 = load i32, i32* %tx, align 4 + %idxprom194 = sext i32 %135 to i64 + %arrayidx195 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx193, i64 0, i64 %idxprom194 + %136 = load float, float* %arrayidx195, align 4 + %137 = load float*, float** %temp_dst.addr, align 8 + %138 = load i32, i32* %index, align 4 + %idxprom196 = sext i32 %138 to i64 + %arrayidx197 = getelementptr inbounds float, float* %137, i64 %idxprom196 + store float %136, float* %arrayidx197, align 4 + br label %if.end198 + +if.end198: ; preds = %if.then191, %for.end + ret void +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() + ret i32 %0 +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() + ret i32 %0 +} + +; Function Attrs: alwaysinline convergent nounwind +define 
linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + ret i32 %0 +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y() + ret i32 %0 +} + +; Function Attrs: convergent nounwind +declare void @llvm.nvvm.barrier0() #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #3 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #3 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #3 + +attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nounwind } +attributes #3 = { nounwind readnone } + +!llvm.module.flags = !{!0, !1, !2} +!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, 
!6, !6, !7, !7, !6} +!llvm.ident = !{!8} +!nvvmir.version = !{!9} + +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0} +!3 = !{void (i32, float*, float*, float*, i32, i32, i32, i32, float, float, float, float, float, float)* @_Z14calculate_tempiPfS_S_iiiiffffff, !"kernel", i32 1} +!4 = !{null, !"align", i32 8} +!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!6 = !{null, !"align", i32 16} +!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"} +!9 = !{i32 1, i32 4} diff --git a/examples/hotspot/hotspot-host-x86_64-unknown-linux-gnu.ll b/examples/hotspot/hotspot-host-x86_64-unknown-linux-gnu.ll new file mode 100644 index 0000000..76aac61 --- /dev/null +++ b/examples/hotspot/hotspot-host-x86_64-unknown-linux-gnu.ll @@ -0,0 +1,1022 @@ +; ModuleID = 'hotspot-host-x86_64-unknown-linux-gnu.bc' +source_filename = "hotspot.cu" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] } +%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 } +%struct.dim3 = type { i32, i32, i32 } +%struct.CUstream_st = type opaque + +$_ZN4dim3C2Ejjj = comdat any + +@t_chip = dso_local global float 0x3F40624DE0000000, align 4 +@chip_height = dso_local global float 0x3F90624DE0000000, align 4 +@chip_width = dso_local global float 0x3F90624DE0000000, align 4 +@amb_temp = dso_local global float 8.000000e+01, align 4 +@stderr = external dso_local global %struct._IO_FILE*, align 8 +@.str = private unnamed_addr constant [11 x i8] c"error: 
%s\0A\00", align 1 +@.str.1 = private unnamed_addr constant [2 x i8] c"w\00", align 1 +@.str.2 = private unnamed_addr constant [25 x i8] c"The file was not opened\0A\00", align 1 +@.str.3 = private unnamed_addr constant [7 x i8] c"%d\09%g\0A\00", align 1 +@.str.4 = private unnamed_addr constant [2 x i8] c"r\00", align 1 +@.str.5 = private unnamed_addr constant [25 x i8] c"not enough lines in file\00", align 1 +@.str.6 = private unnamed_addr constant [3 x i8] c"%f\00", align 1 +@.str.7 = private unnamed_addr constant [20 x i8] c"invalid file format\00", align 1 +@.str.8 = private unnamed_addr constant [100 x i8] c"Usage: %s \0A\00", align 1 +@.str.9 = private unnamed_addr constant [78 x i8] c"\09 - number of rows/cols in the grid (positive integer)\0A\00", align 1 +@.str.10 = private unnamed_addr constant [53 x i8] c"\09 - pyramid heigh(positive integer)\0A\00", align 1 +@.str.11 = private unnamed_addr constant [38 x i8] c"\09 - number of iterations\0A\00", align 1 +@.str.12 = private unnamed_addr constant [89 x i8] c"\09 - name of the file containing the initial temperature values of each cell\0A\00", align 1 +@.str.13 = private unnamed_addr constant [86 x i8] c"\09 - name of the file containing the dissipated power values of each cell\0A\00", align 1 +@.str.14 = private unnamed_addr constant [42 x i8] c"\09 - name of the output file\0A\00", align 1 +@.str.15 = private unnamed_addr constant [29 x i8] c"WG size of kernel = %d X %d\0A\00", align 1 +@.str.16 = private unnamed_addr constant [26 x i8] c"unable to allocate memory\00", align 1 +@.str.17 = private unnamed_addr constant [94 x i8] c"pyramidHeight: %d\0AgridSize: [%d, %d]\0Aborder:[%d, %d]\0AblockGrid:[%d, %d]\0AtargetBlock:[%d, %d]\0A\00", align 1 +@.str.18 = private unnamed_addr constant [43 x i8] c"Start computing the transient temperature\0A\00", align 1 +@.str.19 = private unnamed_addr constant [19 x i8] c"Ending simulation\0A\00", align 1 +@0 = private unnamed_addr constant [36 x i8] 
c"_Z14calculate_tempiPfS_S_iiiiffffff\00", align 1 +@1 = private constant [35409 x i8] c"P\EDU\BA\01\00\10\00@\8A\00\00\00\00\00\00\02\00\01\01@\00\00\00\A8v\00\00\00\00\00\00\00\00\00\00\00\00\00\00\07\00\01\00=\00\00\00\00\00\00\00\00\00\00\00\11\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\7FELF\02\01\013\07\00\00\00\00\00\00\00\02\00\BE\00e\00\00\00\00\00\00\00\00\00\00\00\00v\00\00\00\00\00\00\80s\00\00\00\00\00\00=\05=\00@\008\00\03\00@\00\0A\00\01\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00.text._Z14calculate_tempiPfS_S_iiiiffffff\00.nv.info._Z14calculate_tempiPfS_S_iiiiffffff\00.nv.shared._Z14calculate_tempiPfS_S_iiiiffffff\00.nv.global\00.nv.constant0._Z14calculate_tempiPfS_S_iiiiffffff\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00_Z14calculate_tempiPfS_S_iiiiffffff\00.text._Z14calculate_tempiPfS_S_iiiiffffff\00.nv.info._Z14calculate_tempiPfS_S_iiiiffffff\00.nv.shared._Z14calculate_tempiPfS_S_iiiiffffff\00.nv.global\00blockIdx\00threadIdx\00$_Z14calculate_tempiPfS_S_iiiiffffff$__cuda_sm20_rcp_rn_f32_slowpath\00$_Z14calculate_tempiPfS_S_iiiiffffff$__cuda_sm3x_div_rn_noftz_f32\00$_Z14calculate_tempiPfS_S_iiiiffffff$__cuda_sm3x_div_rn_noftz_f32_slowpath\00$___ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda__196\00$___ZZ14calculate_tempiPfS_S_iiiiffffffE13power_on_cuda__198\00$___ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t__200\00.nv.constant0._Z14calculate_tempiPfS_S_iiiiffffff\00_param\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00V\00\00\00\03\00\07\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\AD\00\00\00\03\00\08\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\DC\00\00\00\03\00\09\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\E7\00\00\00\01\00\09\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\F0\00\00\00\01\00\09\00\01\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\FA\00\00\00\22\00\07\00`]\00\00\00\00\00\00 
\04\00\00\00\00\00\00?\01\00\00\22\00\07\00\80a\00\00\00\00\00\00`\01\00\00\00\00\00\00\81\01\00\00\22\00\07\00\E0b\00\00\00\00\00\00`\08\00\00\00\00\00\00z\02\00\00\03\00\06\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\002\00\00\00\12\10\07\00\00\00\00\00\00\00\00\00@k\00\00\00\00\00\00\04/\08\00\0A\00\00\00\17\00\00\00\04#\08\00\08\00\00\00\00\00\00\00\04\12\08\00\08\00\00\00\00\00\00\00\04\11\08\00\08\00\00\00\00\00\00\00\04#\08\00\07\00\00\00\00\00\00\00\04\12\08\00\07\00\00\00\00\00\00\00\04\11\08\00\07\00\00\00\00\00\00\00\04#\08\00\06\00\00\00\00\00\00\00\04\12\08\00\06\00\00\00\00\00\00\00\04\11\08\00\06\00\00\00\00\00\00\00\04#\08\00\0A\00\00\00\00\00\00\00\04\12\08\00\0A\00\00\00\C0\00\00\00\04\11\08\00\0A\00\00\00\C0\00\00\00\010\00\00\01*\00\00\04\0A\08\00\09\00\00\00@\01H\00\03\19H\00\04\17\0C\00\00\00\00\00\0D\00D\00\00\F0\11\00\04\17\0C\00\00\00\00\00\0C\00@\00\00\F0\11\00\04\17\0C\00\00\00\00\00\0B\00<\00\00\F0\11\00\04\17\0C\00\00\00\00\00\0A\008\00\00\F0\11\00\04\17\0C\00\00\00\00\00\09\004\00\00\F0\11\00\04\17\0C\00\00\00\00\00\08\000\00\00\F0\11\00\04\17\0C\00\00\00\00\00\07\00,\00\00\F0\11\00\04\17\0C\00\00\00\00\00\06\00(\00\00\F0\11\00\04\17\0C\00\00\00\00\00\05\00$\00\00\F0\11\00\04\17\0C\00\00\00\00\00\04\00 
\00\00\F0\11\00\04\17\0C\00\00\00\00\00\03\00\18\00\00\F0!\00\04\17\0C\00\00\00\00\00\02\00\10\00\00\F0!\00\04\17\0C\00\00\00\00\00\01\00\08\00\00\F0!\00\04\17\0C\00\00\00\00\00\00\00\00\00\00\F0\11\00\03\1B\FF\00\04\1D\08\00h\09\00\00\D8\09\00\00\04\1C\04\00X]\00\00\04\1E\04\00\B0\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\EF\1F\E0\FD\03!\00\D3rd<3>;\0A\0Amov.u2\00\1B,e\00b;\0Acvta\8D\00\04%\00\13,\\\00\22ld\C8\00\02\18\00nrd2, [\CE\00\1E])\00\1F1)\00\01a0];\0Ast#\00\81[%SP+0],,\00\0A\16\00\128\16\00\222;\B6\00\01\D8\00a1, 
999(\00\02g\00\00\EF\00\18[\9E\01\03M\00\AF1;\0Aret;\0A\0A}\D6\01\1A\FE\02FuncGetAttributes\E1\01\0D#\00\0E\EC\01\0F+\00\06\0F\F7\01\1B\1F1\F7\01Q\1F1\F7\01!\0E\D9\00\0F\02\02\0F\0E8\01\0F\0D\02\8DhDevice\B4\00\0E\0E\02\0E$\00\0F\0F\02\00/32,\00\0B\1F1,\00\18\1F2<\02\13\1F2<\02\1F\1D4<\02\1F2<\02\0C\1F2<\02\13\01_\00\04;\02\0F\D9\00\07\1D]4\00\1F14\00\06\0Fp\02\10\0E\9A\01\0Fq\02\12(32q\02\0B\15\00!12\16\00\09\86\02\1F3\86\02\15\1F3\86\02#2Get\CB\00\0E}\02\05\1B\00\04\DA\00\0F\1C\02\13?3[8W\04.\0F\1B\02\0D\1F3W\04\19\04\B3\01\0D\D0\00\0F\AA\01\06\0F\05\04W\F0\04OccupancyMaxActiveBV\08\FE\03sPerMultiprocessor\9F\01\0F;\00\16\0EB\06\0FC\00%\0EJ\04\0FC\00\1E\1F2\86\00/\1F3\88\02\13O4[32\89\02\1C\1D3\89\02\1F4\89\02\0C\1F4\89\02\19\133\89\02\0F\F1\00\1E\0F\BC\04\00\0FK\00$/2]w\07\00\0FL\00$\0F\1F\05\01\0F\98\00%\0F\A7\07\1D\097\05\186M\05\04,\00\2224-\00\183\CF\03\1F2\CF\03\15\1F2\CF\03L\9FWithFlags\D8\03(\05D\00\0E\E1\03\0FL\00'\0F\EA\030\0CL\00\1F2\98\008\1F3\98\008\1F4H\04\13O5[40\EC\08.\0FH\04\0D\1F5\EC\08\1C\0F\F9\00+\1F]\9C\040\0D\9A\01\0F\A5\040\0D:\02\0F\AE\041\0D\DB\02\0F\B7\041\0D|\03\0F\C0\04I\08-\00\1F3$\0A6\F1\1Evisible .entry _Z14calculate_tempiPfS_S_iiiif\01\00\06\AC\04\00x\00\0F1\00\10\0E\99\04\00\DF\00\0F9\00\17\1F19\00%\1F29\00%/3,\E4\00$\1F49\00%\1F59\00%\1F69\00%\1679\00\1Ff9\00\1A\1F89\00%\1F99\00%/10:\00&\1F1:\00&\1F2:\00&\0F\9A\0A\14O6[19\9B\0A\16\A6pred %p<25U\06\8516 %rs<7\12\00\00\A8\00k%f<36>\8A\06'14%\00\00Z\00]fd<15\9F\06 88\A0\06P\09.shao\00\03\BB\00\124\BB\00\1FZ\01\01\0F0E12\18\00@_on_G\05o[1024]R\00,o3powerS\009\116\A4\00\14tK\00\0F\8F\07\08\1F6\8F\07\12\02v\01O6, 
[$\02\19\1D]B\00\1F5B\00\1B\1E2B\00\1F4B\00\1B\1E1B\00\1F3B\00\1B\1E0B\00\1F2B\00\1A\1E9A\00\1F1A\00\1A\1E8p\08\1F2I\01\1B\1F7B\00\00\0FI\01\1B\1F6B\00\00\0FI\01\1B\1F5B\00\00\0FI\01\1B\0F\CD\09\01\0F\84\00\1B\1F3f\09\00\0F\84\00\1B\0F\A8\09\01\0F\CE\01\1B\1F1\08\01\00\0FB\00\1B#0]\B4\03#to\8C\19\04\8A\00\144H\09\01\1F\00\0A\1C\00\115\1C\00\1F4;\00\05\146\B0\09\0F;\00\00\117\1C\00\1F6;\00\05\148\01\0A\0F;\00\00\119\1C\00\1A8\DA\09\03y\0E\0F3\0A\03\1A9\16\00\033\0A/d74\0A\03\1F54\0A\03\0Bx\0A\123E\00\1B2b\0A\134\89\00\1B4\17\00\02\\\00\122\\\00\15f\17\00\01\A1\00*f1\16\00\115r\00+f2\16\00\01q\00*f3\16\00\116p\00+f4\16\00\01o\00\1Bfn\00\126n\00\196\D4\0F\DA6, 1117782016\B5\00\137\FA\00\0A1\00\00\BA\01zctaid.x/\00\139/\00\197/\00\158/\00\1By/\00\03A\01\198/\00\00\DD\01\1Et\\\00#10X\01\199\BB\0B\130.\00\1Cy.\00\02o\01!30\18\03\02!\05%7,\1C\01\07\16\00%8,\8A\01\83;\0Adiv.rn\1A\00\229,5\00:%f8G\01\127\89\01\179F\00510,\BB\01V;\0ArcpG\00!11\E2\01\1A0D\00\128\B7\01(11E\00%2,\EA\01\0CE\00\113.\00\0B\12\02\128\E6\01(13E\00%4,\19\02\0CE\00!5,!\00\0BE\00\03\83\02\115E\00\03X\01$1,V\03S;\0Ashl\10\09332,\1D\00\09\E6\15A33, ;\02Bub.s\12\00#4,\18\00\005\00\0C\9C\01\02\95\03(34q\00\1F5q\00\03#6,\1D\00\191_\00\177_\00\0B\B4\02\131\07\18(37_\00&8,}\00\08\18\00%9,\8C\02\83;\0Amul.lod\00340,9\00\00'\00\074\00541,\D3\03\08\95\00542,7\00\1C4\9B\18\03r\04(42I\00&3,\B3\00\08\18\00%4,P\03\0C\95\00#5,9\00\00'\00\084\00%6,\7F\04\09\95\00&7,7\00\0C*\01\122S\03\184*\01648,\B3\00T;\0AaddJ\00#9,\1F\00,15H\00\02m\03\184&\01650,f\00\08H\00351,\1F\00\0EH\00\02\19\02(51H\00\192\90\00\06\18\00&3,\EB\03\09`\00#4,7\00\00%\00\0Bb\00\03\CB\05\185;\01)55\AA\00\06\18\00&6,{\04\09\C2\00#7,7\00\00%\00\0Cb\00\02\E9\01\185T\01658,\80\00\0B0\00\03/\06(580\00&9,N\00\0C0\00\02l\01\185l\01\146l\01\193H\02661,e\00\0BI\02362,9\00\00(\00\085\00\05X\01\1A4X\01664,8\00\0CX\01\02\C7\06(r6X\01)65\7F\00rsetp.ltN\003p1,\22\00\F2\0C0;\0A@%p1 bra 
LBB6_5;\0Abra.uni\10\0021;\0A\08\00\17:[\00\196[\00\06\18\00\04\E3\05\1A3\BC\02368,\1E\00#-1\8A\00\14g\8A\00#2,Q\00\00'\00\01\8D\00\1F2\8D\00\07\132\8D\00\182\8D\00\1992\01\0A\E8\00#3,\22\00\02\E8\00\1F3[\00\07\133[\00\173[\00)70[\00\06\18\00\181\F1\01\06\E8\00372,\1E\00\0E\E8\00#4,Q\00\00'\00\01\8D\00\1F4\8D\00\07\134\8D\00\124\8D\00\03\93\0A\05\8E\00\03\8A\03\02u\00&d1h\02\148\BE\05\032\00$2,!\00\132\A8\00\03\19\00$3,R\00\01'\00\08\C0\06\01\DC\01\00#\00\0Ae\00\194\F0\03\08e\00$5,!\00\176\96\0E rdJ\00\0F~\0F \03s\0A\02'\0F\05L\00\02\AF\0A*16\C7\00(8,\1D\00\195\B1\00\199?\04\07\B1\00\132\F6\00\1C9\16\01\00\B3\0B\06V\00\1822\08\00\1D\00\02\A7\07'6;\93\01\142\1E\05\198-\01/23\92\01\05\00\A2\0D\03!\00\0B|\00$5,Q\00\01'\00\08\92\01\227,\82\00\1A5e\00\196\92\01\07y\01\00\22\0A\0F\A5\10!\0Fz\01\02\132\0E\01\1A7\C8\00$9,\84\00\0A\93\01\01\12\0A\066\00\189\B2\00/31\93\01\05\02\D9\08-d3\A9\02\02d\08\04V\00)32\93\01\2233\93\01\1B7>\03\135>\03\D85:\0Abar.sync 0\BF\03\193\D4\06\0A\A8\03#5,\22\00!-1\A6\03\165\A6\03\0Ch\00\136h\00\1863\04\195\\\00Tneg.sN\18\00\1E\00\09\9E\1F#39:\17\09W\00\138W\00'7:,\00!74\C4\04\0D=\00/74>\00\04*8:9\1C\012\00\0B7\06\155\8E\17\06\1C\01\05V\02\1A2\F3\04\0F\DB\05\04\117\E9\01\187\DB\05#leK\01#6,Q\00\00'\00\01M\01\166M\01+10\B9\00\139\B9\00\179N\01/80\8E\00\03(81\8E\00\06o\09382,\1E\00\00<\00\08\A8\00#3,\1F\00)14F\01\03g\0A\0C\F5\06\05\F6\06\190\85\01\1296\09\0D@\00/79A\00\06\181A\00\224,4\00\0C\88\01\03\9E\0A\08\E2\00\193\16\09\0A\A4\02\01\F0\04\168\A4\02\177W\01\0D\CB\00\044\07\1815\07\1A8u\09\06\A7\02\01\FF\02\1B8\E4\00\023\0A\0CY\00\04\A5\06(13\E3\00\1D8\A9\02\02?\00/84@\00\06\184@\00\01/\02\1D1\C1\0B\02g\0F(r6#\01\06\AB\02\09U\0C/87\9E\07\03\118\AB\02\1F8\AB\02\02#8,Q\00\00'\00\01T\01\178T\01\1C6\BB\00\04b\04'15T\01/90\90\00\03\199.\08\06\AD\02392,\1E\00\00<\00\09U\03\01\13\00\0F\AD\02\00\02`\08\0C\8E\00\147\8E\00\196\89\01\0F\AD\02\00\02\1F\03\1F9A\00\06\09s\04\228,4\00\0C\8A\01\02\02\0B\081\0B\1F9\0D\0C\05394,\1F\00\1D-\D1\01\02\B3\0A\189\F5\0
2/95H\00\05#6,\1F\00\0CG\00\03#\11\189\19\02/97:\0C\04398,\1F\00\0D\8F\00\02T\10)r9\D7\00\1F9H\00\04C100, \00\0C\90\00\138P%\180q\06%10\FA\06\196\1B\02F102,\8C\05\04\E7\03\04\95\02$9,<\00\01*\00\01\97\02\179\97\02\0D\C8\01\04\FC\05(18\97\02\09b\00\08\0E\02\01\14\05\0CE\00$20E\00\08\89\05\07|\0A\1F8F\00\01/10G\00\05\192B\05\01\8D\11\005\00\0F\0E\02\03)11\1D\01\05W\02*726\01&4,:\05\0C\CB\03\02\D5\0A#10\98\00\124\E1\0C\1608\01,22\AC\00\04\AD\05\182\E5\0C)12e\00\08\F3\00\02\C7\05\0DG\00\04C\0C\182j\05\1A1\C5\00\0EG\00\1F3G\00\06\09X\05\131<\06\1F4\01\03\03)14!\01\05I\03*76:\01&6,Q\05\0Cp\02\03\9D\01\2205\07\00\126:\01\07\1C\0E\1D2\90\0D\04\C4\05(24\F3\00\195e\00\08\F3\00\02[\00\0DG\00\04J\09\182P\05)16\C5\00\0EG\00\1F6G\00\06\09\09\05\02\1F\0C?145\F3\03\03)17!\01\05<\04*80:\01&8,\01\05\0Dt\02$2,=\00\01+\00\02:\01\07\C9\0E,28\AC\00\04t\05(27\F3\00\0Ae\00\08\F3\00\02\18\07\0DG\00\046\09\182\F3\03)19\C5\00\0EG\00\1F9G\00\06\1997\0A\02K\03/46\E5\04\03\1A2\E8\07\120g\0F\0A*\00\03\D8\03\1C0r\00\143 
\04(30\B9\00\06h\04\1A8)\05\05\E5\0E\0F\B7\02\00$3,;\00\01)\00\02}\01\07\EB\0F,46}\00\04\F1\03\143\9E\09\02\93\1E\151\C1\00\148\86\00\02\ED\06\19s\98\04)12\18\06\07\A5\00\193\BE\00\071\06\03\BE\03\00!\00\1D1\8C\11$4,V\00\01*\00\02\C0\00\07\1E\10,40\C0\00\04\D4\09\193j\04/15\98\00\05\196\98\00\08\8F\05\127e\08\06\96\08\121\FF\0E\1316\03,16}\0A\02\F0\0F\141\07\00\03\EC\02\1F5\AF\00\09\05\\\11\193\AF\00\199\EE\07\07y\05\1F0G\01\05\02\A2\0F\01!\00\0EG\01$6,V\00\01*\00\02\98\00\1F6\98\00\09\04\BE\04\193\BE\04/22\98\00\05\1F3G\01\05-24G\01\02\D1\0F#12,\06\1D3G\01$7,m\00\01.\00\02\AF\00\1F7\AF\00\09\04v\0A\193&\05/26\F6\01\04.27\EC\05\06\01\14$8,=\00\01+\00\02\7F\00\1F8\7F\00\09\04^\05)36\7F\00\1F8\7F\00\05\1E91\05\06\FE\00$9,=\00\01+\00\02\7F\00\1F9\7F\00\09\041\05\1931\05/30\AD\01\04.31Z\09\05\FE\00\03\D9\04\133\07\00\02,\02/20\7F\00\09\04\\\09\193i\05/32\7F\00\05\1E3\A3\08\06\F4\14$1,=\00\01+\00\02\7F\00\1F1\7F\00\09\04\A1\05)39\B2\04\00$\0C\0F\B2\04\03\192z\11\1F4,\12\05/35\A5\137\02.\1A:d35\DC\11$7,\83\00\0B+\12(8,6\00\197\B1\00\0F\BE\13\06\02q\0F-d3\BE\13\02g\0E\04V\00(40B\13\128B\13\2241f\1F\00L$\03\1B\00\12d\1E\1C\188.\00%9,h\1C\0D/\00!2,\22\00\09D\01/42p\138\02P\0B+d4\DD\00'4,\1D\00*37\1D\00\03\07 \1E4\FA\00#20\FA\00\1E5\CB\00\01\D3\1C(20w\01)46\AD\0A\08w\01\02\C6\01\1D4\F1\13\194\C6\01\1B4\9A\00'9,$\00\0A\9A\00\131\9A\00\199\D6\14\1A5i\0C\07;\15\02\B9\1A\01!\00\0A\82\00(52\82\00*51\1D\00'3,$\00\0A\82\00\122\82\00#53e\06\05\A9\1D\00\E9\05\02\A2\00\00&\00\0D9\01\114\18\00a3;\0Afma5\00\02\9A,P\02J0dC0\01\00\01\1A\00\08\D4\15\01\AA#\03C\1E\0D`\00!6,\22\00\0C`\00&7,f\00\01#\00)d3:\01\194\F5\09\08:\01$5,!\00\0A3\03(563\03(55\A1\00\135\1D\01\0Al\18)57\94\0B\08e\00\03E\00\1D7e\00\199e\00\08j\03#26e\00\1D9\82\01#7,\85\00\00&\00\0D\22\01\118\18\00\1C7\22\01\1F9\82\01\0C\09|\00%8,\0A \0E\15\04\010\00,28a\00\02L 
\04h\00\02U\00\187U\00\05;\04\04\EB\11\05\D2\00\01l\06\01!\00\0Bo\04\05\A9\06#88&\1C\064\00%2,:\00.31\A6\00\02\1F\00\03o\01\07\A6\00'3,\AD\00-12\C6\00%4,\CB\04\05-\00\02^\00\02{\00\03)\00\133\1B\00\194\EE\04\00\B5\1C\0F\B0(\19\0F\E6\04\02\126\CD\03)60+\02\02\DE\1C\04\1D\00\0B\E6\04\00\1A\1D\06$\00\1A4\9E\19\126\0B\18\1C3M\0F\144g\0C/40\0D\18\04?134s\0C\04\09\14!\07\FD\11\02i\04\01\1F\00\05I\14\14n\8D\0C\03\D0\09\133\DC\0F\02D\0F\172\0A\0E,42\A3\00\04\8D\0C;41:\1A\00\04r\09\134\E7\0B\108\D9\07\06\85\0A\01$\13$ndQ+\01s\00\17sb\0C\22eq\1B\00\11p\D0\04\01!\00\00\EA\08\10!\11\00\07\91\00\0C\99\15$43w\00'3:\FB\03\1F6\02\1C\06\00H\1E\03!\00\0A\02\1C/66+\021\03\AA&\0A\F5\05\02S\1E\05\1D\00\09\FB\1B\1F6=\08\06\02\80\1D-d6=\08\00\B4\1D\06V\00\187'\06\02 \02\00#\00\0A:\09/72:\097\2273x\00\0B\95\00'4,\1D\00*65\1D\00(5,$\00\0A\0F\03\2275\0F\03\0D\DB\01\05Y\1E\1A4\0F\03\0A&\00\04i\0C\194i\0C/37\97\0D\05\02\DC\05\01!\00\1F1\01\10\04/38\01\10\05'46\DD\02\1F5\DD\02\05#6,\1F\00\0E\DD\02\02\F8\02\166\DD\02\08;\0F\0Cw\00$47w\00\187\DD\02/76\DD\02\05\00\DB\1A\03!\00\0A\DD\02/78\DD\020\2279^\00\198\DC\01\00\F7\1A\07\1D\00\09\1A\0B\1F8E\1D\06\02\FD\1A-d8E\1D\00*\1A\06V\00\188\EE\1F\133\E7\07883]\D8\1E\00\C5\19\04\93)\09}\00\1F5\D9\1E\05\02<\19\01!\00\0B}\00$7,R\00\01'\00\09\A8\02\2287\A8\02\0D\00!\04n\0D\C048:\0Aret;\0A\0A}\0A\00\00\00", section ".nv_fatbin", align 8 +@__cuda_fatbin_wrapper = internal constant { i32, i32, i8*, i8* } { i32 1180844977, i32 1, i8* getelementptr inbounds ([35409 x i8], [35409 x i8]* @1, i64 0, i64 0), i8* null }, section ".nvFatBinSegment", align 8 +@__cuda_gpubin_handle = internal global i8** null, align 8 +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* bitcast (void (i8*)* @__cuda_module_ctor to void ()*), i8* null }] + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z5fatalPc(i8* %s) #0 { +entry: + %s.addr = alloca i8*, align 8 + store i8* %s, i8** %s.addr, align 8 + %0 
= load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %1 = load i8*, i8** %s.addr, align 8 + %call = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %0, i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i64 0, i64 0), i8* %1) + ret void +} + +declare dso_local i32 @fprintf(%struct._IO_FILE*, i8*, ...) #1 + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z11writeoutputPfiiPc(float* %vect, i32 %grid_rows, i32 %grid_cols, i8* %file) #0 { +entry: + %vect.addr = alloca float*, align 8 + %grid_rows.addr = alloca i32, align 4 + %grid_cols.addr = alloca i32, align 4 + %file.addr = alloca i8*, align 8 + %i = alloca i32, align 4 + %j = alloca i32, align 4 + %index = alloca i32, align 4 + %fp = alloca %struct._IO_FILE*, align 8 + %str = alloca [256 x i8], align 16 + store float* %vect, float** %vect.addr, align 8 + store i32 %grid_rows, i32* %grid_rows.addr, align 4 + store i32 %grid_cols, i32* %grid_cols.addr, align 4 + store i8* %file, i8** %file.addr, align 8 + store i32 0, i32* %index, align 4 + %0 = load i8*, i8** %file.addr, align 8 + %call = call %struct._IO_FILE* @fopen(i8* %0, i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.1, i64 0, i64 0)) + store %struct._IO_FILE* %call, %struct._IO_FILE** %fp, align 8 + %cmp = icmp eq %struct._IO_FILE* %call, null + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %call1 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.2, i64 0, i64 0)) + br label %if.end + +if.end: ; preds = %if.then, %entry + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc10, %if.end + %1 = load i32, i32* %i, align 4 + %2 = load i32, i32* %grid_rows.addr, align 4 + %cmp2 = icmp slt i32 %1, %2 + br i1 %cmp2, label %for.body, label %for.end12 + +for.body: ; preds = %for.cond + store i32 0, i32* %j, align 4 + br label %for.cond3 + +for.cond3: ; preds = %for.inc, %for.body + %3 = load i32, i32* %j, align 4 + %4 = load i32, i32* %grid_cols.addr, align 4 + %cmp4 = icmp slt i32 %3, %4 + br i1 %cmp4, label %for.body5, label %for.end + +for.body5: ; preds = %for.cond3 + %arraydecay = getelementptr inbounds [256 x i8], [256 x i8]* %str, i64 0, i64 0 + %5 = load i32, i32* %index, align 4 + %6 = load float*, float** %vect.addr, align 8 + %7 = load i32, i32* %i, align 4 + %8 = load i32, i32* %grid_cols.addr, align 4 + %mul = mul nsw i32 %7, %8 + %9 = load i32, i32* %j, align 4 + %add = add nsw i32 %mul, %9 + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds float, float* %6, i64 %idxprom + %10 = load float, float* %arrayidx, align 4 + %conv = fpext float %10 to double + %call6 = call i32 (i8*, i8*, ...) 
@sprintf(i8* %arraydecay, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str.3, i64 0, i64 0), i32 %5, double %conv) #8 + %arraydecay7 = getelementptr inbounds [256 x i8], [256 x i8]* %str, i64 0, i64 0 + %11 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8 + %call8 = call i32 @fputs(i8* %arraydecay7, %struct._IO_FILE* %11) + %12 = load i32, i32* %index, align 4 + %inc = add nsw i32 %12, 1 + store i32 %inc, i32* %index, align 4 + br label %for.inc + +for.inc: ; preds = %for.body5 + %13 = load i32, i32* %j, align 4 + %inc9 = add nsw i32 %13, 1 + store i32 %inc9, i32* %j, align 4 + br label %for.cond3 + +for.end: ; preds = %for.cond3 + br label %for.inc10 + +for.inc10: ; preds = %for.end + %14 = load i32, i32* %i, align 4 + %inc11 = add nsw i32 %14, 1 + store i32 %inc11, i32* %i, align 4 + br label %for.cond + +for.end12: ; preds = %for.cond + %15 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8 + %call13 = call i32 @fclose(%struct._IO_FILE* %15) + ret void +} + +declare dso_local %struct._IO_FILE* @fopen(i8*, i8*) #1 + +declare dso_local i32 @printf(i8*, ...) #1 + +; Function Attrs: nounwind +declare dso_local i32 @sprintf(i8*, i8*, ...) 
#2 + +declare dso_local i32 @fputs(i8*, %struct._IO_FILE*) #1 + +declare dso_local i32 @fclose(%struct._IO_FILE*) #1 + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z9readinputPfiiPc(float* %vect, i32 %grid_rows, i32 %grid_cols, i8* %file) #0 { +entry: + %vect.addr = alloca float*, align 8 + %grid_rows.addr = alloca i32, align 4 + %grid_cols.addr = alloca i32, align 4 + %file.addr = alloca i8*, align 8 + %i = alloca i32, align 4 + %j = alloca i32, align 4 + %fp = alloca %struct._IO_FILE*, align 8 + %str = alloca [256 x i8], align 16 + %val = alloca float, align 4 + store float* %vect, float** %vect.addr, align 8 + store i32 %grid_rows, i32* %grid_rows.addr, align 4 + store i32 %grid_cols, i32* %grid_cols.addr, align 4 + store i8* %file, i8** %file.addr, align 8 + %0 = load i8*, i8** %file.addr, align 8 + %call = call %struct._IO_FILE* @fopen(i8* %0, i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.4, i64 0, i64 0)) + store %struct._IO_FILE* %call, %struct._IO_FILE** %fp, align 8 + %cmp = icmp eq %struct._IO_FILE* %call, null + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %call1 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.2, i64 0, i64 0)) + br label %if.end + +if.end: ; preds = %if.then, %entry + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc16, %if.end + %1 = load i32, i32* %i, align 4 + %2 = load i32, i32* %grid_rows.addr, align 4 + %sub = sub nsw i32 %2, 1 + %cmp2 = icmp sle i32 %1, %sub + br i1 %cmp2, label %for.body, label %for.end18 + +for.body: ; preds = %for.cond + store i32 0, i32* %j, align 4 + br label %for.cond3 + +for.cond3: ; preds = %for.inc, %for.body + %3 = load i32, i32* %j, align 4 + %4 = load i32, i32* %grid_cols.addr, align 4 + %sub4 = sub nsw i32 %4, 1 + %cmp5 = icmp sle i32 %3, %sub4 + br i1 %cmp5, label %for.body6, label %for.end + +for.body6: ; preds = %for.cond3 + %arraydecay = getelementptr inbounds [256 x i8], [256 x i8]* %str, i64 0, i64 0 + %5 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8 + %call7 = call i8* @fgets(i8* %arraydecay, i32 256, %struct._IO_FILE* %5) + %6 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8 + %call8 = call i32 @feof(%struct._IO_FILE* %6) #8 + %tobool = icmp ne i32 %call8, 0 + br i1 %tobool, label %if.then9, label %if.end10 + +if.then9: ; preds = %for.body6 + call void @_Z5fatalPc(i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.5, i64 0, i64 0)) + br label %if.end10 + +if.end10: ; preds = %if.then9, %for.body6 + %arraydecay11 = getelementptr inbounds [256 x i8], [256 x i8]* %str, i64 0, i64 0 + %call12 = call i32 (i8*, i8*, ...) 
@sscanf(i8* %arraydecay11, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.6, i64 0, i64 0), float* %val) #8 + %cmp13 = icmp ne i32 %call12, 1 + br i1 %cmp13, label %if.then14, label %if.end15 + +if.then14: ; preds = %if.end10 + call void @_Z5fatalPc(i8* getelementptr inbounds ([20 x i8], [20 x i8]* @.str.7, i64 0, i64 0)) + br label %if.end15 + +if.end15: ; preds = %if.then14, %if.end10 + %7 = load float, float* %val, align 4 + %8 = load float*, float** %vect.addr, align 8 + %9 = load i32, i32* %i, align 4 + %10 = load i32, i32* %grid_cols.addr, align 4 + %mul = mul nsw i32 %9, %10 + %11 = load i32, i32* %j, align 4 + %add = add nsw i32 %mul, %11 + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds float, float* %8, i64 %idxprom + store float %7, float* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %if.end15 + %12 = load i32, i32* %j, align 4 + %inc = add nsw i32 %12, 1 + store i32 %inc, i32* %j, align 4 + br label %for.cond3 + +for.end: ; preds = %for.cond3 + br label %for.inc16 + +for.inc16: ; preds = %for.end + %13 = load i32, i32* %i, align 4 + %inc17 = add nsw i32 %13, 1 + store i32 %inc17, i32* %i, align 4 + br label %for.cond + +for.end18: ; preds = %for.cond + %14 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8 + %call19 = call i32 @fclose(%struct._IO_FILE* %14) + ret void +} + +declare dso_local i8* @fgets(i8*, i32, %struct._IO_FILE*) #1 + +; Function Attrs: nounwind +declare dso_local i32 @feof(%struct._IO_FILE*) #2 + +; Function Attrs: nounwind +declare dso_local i32 @sscanf(i8*, i8*, ...) 
#2 + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z14calculate_tempiPfS_S_iiiiffffff(i32 %iteration, float* %power, float* %temp_src, float* %temp_dst, i32 %grid_cols, i32 %grid_rows, i32 %border_cols, i32 %border_rows, float %Cap, float %Rx, float %Ry, float %Rz, float %step, float %time_elapsed) #0 { +entry: + %iteration.addr = alloca i32, align 4 + %power.addr = alloca float*, align 8 + %temp_src.addr = alloca float*, align 8 + %temp_dst.addr = alloca float*, align 8 + %grid_cols.addr = alloca i32, align 4 + %grid_rows.addr = alloca i32, align 4 + %border_cols.addr = alloca i32, align 4 + %border_rows.addr = alloca i32, align 4 + %Cap.addr = alloca float, align 4 + %Rx.addr = alloca float, align 4 + %Ry.addr = alloca float, align 4 + %Rz.addr = alloca float, align 4 + %step.addr = alloca float, align 4 + %time_elapsed.addr = alloca float, align 4 + %grid_dim = alloca %struct.dim3, align 8 + %block_dim = alloca %struct.dim3, align 8 + %shmem_size = alloca i64, align 8 + %stream = alloca i8*, align 8 + %grid_dim.coerce = alloca { i64, i32 }, align 8 + %block_dim.coerce = alloca { i64, i32 }, align 8 + store i32 %iteration, i32* %iteration.addr, align 4 + store float* %power, float** %power.addr, align 8 + store float* %temp_src, float** %temp_src.addr, align 8 + store float* %temp_dst, float** %temp_dst.addr, align 8 + store i32 %grid_cols, i32* %grid_cols.addr, align 4 + store i32 %grid_rows, i32* %grid_rows.addr, align 4 + store i32 %border_cols, i32* %border_cols.addr, align 4 + store i32 %border_rows, i32* %border_rows.addr, align 4 + store float %Cap, float* %Cap.addr, align 4 + store float %Rx, float* %Rx.addr, align 4 + store float %Ry, float* %Ry.addr, align 4 + store float %Rz, float* %Rz.addr, align 4 + store float %step, float* %step.addr, align 4 + store float %time_elapsed, float* %time_elapsed.addr, align 4 + %kernel_args = alloca i8*, i64 14, align 16 + %0 = bitcast i32* %iteration.addr to i8* + %1 = getelementptr i8*, i8** 
%kernel_args, i32 0 + store i8* %0, i8** %1 + %2 = bitcast float** %power.addr to i8* + %3 = getelementptr i8*, i8** %kernel_args, i32 1 + store i8* %2, i8** %3 + %4 = bitcast float** %temp_src.addr to i8* + %5 = getelementptr i8*, i8** %kernel_args, i32 2 + store i8* %4, i8** %5 + %6 = bitcast float** %temp_dst.addr to i8* + %7 = getelementptr i8*, i8** %kernel_args, i32 3 + store i8* %6, i8** %7 + %8 = bitcast i32* %grid_cols.addr to i8* + %9 = getelementptr i8*, i8** %kernel_args, i32 4 + store i8* %8, i8** %9 + %10 = bitcast i32* %grid_rows.addr to i8* + %11 = getelementptr i8*, i8** %kernel_args, i32 5 + store i8* %10, i8** %11 + %12 = bitcast i32* %border_cols.addr to i8* + %13 = getelementptr i8*, i8** %kernel_args, i32 6 + store i8* %12, i8** %13 + %14 = bitcast i32* %border_rows.addr to i8* + %15 = getelementptr i8*, i8** %kernel_args, i32 7 + store i8* %14, i8** %15 + %16 = bitcast float* %Cap.addr to i8* + %17 = getelementptr i8*, i8** %kernel_args, i32 8 + store i8* %16, i8** %17 + %18 = bitcast float* %Rx.addr to i8* + %19 = getelementptr i8*, i8** %kernel_args, i32 9 + store i8* %18, i8** %19 + %20 = bitcast float* %Ry.addr to i8* + %21 = getelementptr i8*, i8** %kernel_args, i32 10 + store i8* %20, i8** %21 + %22 = bitcast float* %Rz.addr to i8* + %23 = getelementptr i8*, i8** %kernel_args, i32 11 + store i8* %22, i8** %23 + %24 = bitcast float* %step.addr to i8* + %25 = getelementptr i8*, i8** %kernel_args, i32 12 + store i8* %24, i8** %25 + %26 = bitcast float* %time_elapsed.addr to i8* + %27 = getelementptr i8*, i8** %kernel_args, i32 13 + store i8* %26, i8** %27 + %28 = call i32 @__cudaPopCallConfiguration(%struct.dim3* %grid_dim, %struct.dim3* %block_dim, i64* %shmem_size, i8** %stream) + %29 = load i64, i64* %shmem_size, align 8 + %30 = load i8*, i8** %stream, align 8 + %31 = bitcast { i64, i32 }* %grid_dim.coerce to i8* + %32 = bitcast %struct.dim3* %grid_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %31, i8* align 8 %32, i64 
12, i1 false) + %33 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 0 + %34 = load i64, i64* %33, align 8 + %35 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 1 + %36 = load i32, i32* %35, align 8 + %37 = bitcast { i64, i32 }* %block_dim.coerce to i8* + %38 = bitcast %struct.dim3* %block_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %37, i8* align 8 %38, i64 12, i1 false) + %39 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 0 + %40 = load i64, i64* %39, align 8 + %41 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 1 + %42 = load i32, i32* %41, align 8 + %43 = bitcast i8* %30 to %struct.CUstream_st* + %call = call i32 @cudaLaunchKernel(i8* bitcast (void (i32, float*, float*, float*, i32, i32, i32, i32, float, float, float, float, float, float)* @_Z14calculate_tempiPfS_S_iiiiffffff to i8*), i64 %34, i32 %36, i64 %40, i32 %42, i8** %kernel_args, i64 %29, %struct.CUstream_st* %43) + br label %setup.end + +setup.end: ; preds = %entry + ret void +} + +declare dso_local i32 @__cudaPopCallConfiguration(%struct.dim3*, %struct.dim3*, i64*, i8**) + +declare dso_local i32 @cudaLaunchKernel(i8*, i64, i32, i64, i32, i8**, i64, %struct.CUstream_st*) + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #3 + +; Function Attrs: noinline optnone uwtable +define dso_local i32 @_Z17compute_tran_tempPfPS_iiiiiiii(float* %MatrixPower, float** %MatrixTemp, i32 %col, i32 %row, i32 %total_iterations, i32 %num_iterations, i32 %blockCols, i32 %blockRows, i32 %borderCols, i32 %borderRows) #0 { +entry: + %MatrixPower.addr = alloca float*, align 8 + %MatrixTemp.addr = alloca float**, align 8 + %col.addr = alloca i32, align 4 + %row.addr = alloca i32, align 4 + %total_iterations.addr = alloca i32, align 4 + 
%num_iterations.addr = alloca i32, align 4 + %blockCols.addr = alloca i32, align 4 + %blockRows.addr = alloca i32, align 4 + %borderCols.addr = alloca i32, align 4 + %borderRows.addr = alloca i32, align 4 + %dimBlock = alloca %struct.dim3, align 4 + %dimGrid = alloca %struct.dim3, align 4 + %grid_height = alloca float, align 4 + %grid_width = alloca float, align 4 + %Cap = alloca float, align 4 + %Rx = alloca float, align 4 + %Ry = alloca float, align 4 + %Rz = alloca float, align 4 + %max_slope = alloca float, align 4 + %step = alloca float, align 4 + %t = alloca float, align 4 + %time_elapsed = alloca float, align 4 + %src = alloca i32, align 4 + %dst = alloca i32, align 4 + %temp = alloca i32, align 4 + %agg.tmp = alloca %struct.dim3, align 4 + %agg.tmp35 = alloca %struct.dim3, align 4 + %agg.tmp.coerce = alloca { i64, i32 }, align 4 + %agg.tmp35.coerce = alloca { i64, i32 }, align 4 + store float* %MatrixPower, float** %MatrixPower.addr, align 8 + store float** %MatrixTemp, float*** %MatrixTemp.addr, align 8 + store i32 %col, i32* %col.addr, align 4 + store i32 %row, i32* %row.addr, align 4 + store i32 %total_iterations, i32* %total_iterations.addr, align 4 + store i32 %num_iterations, i32* %num_iterations.addr, align 4 + store i32 %blockCols, i32* %blockCols.addr, align 4 + store i32 %blockRows, i32* %blockRows.addr, align 4 + store i32 %borderCols, i32* %borderCols.addr, align 4 + store i32 %borderRows, i32* %borderRows.addr, align 4 + call void @_ZN4dim3C2Ejjj(%struct.dim3* %dimBlock, i32 16, i32 16, i32 1) + %0 = load i32, i32* %blockCols.addr, align 4 + %1 = load i32, i32* %blockRows.addr, align 4 + call void @_ZN4dim3C2Ejjj(%struct.dim3* %dimGrid, i32 %0, i32 %1, i32 1) + %2 = load float, float* @chip_height, align 4 + %3 = load i32, i32* %row.addr, align 4 + %conv = sitofp i32 %3 to float + %div = fdiv float %2, %conv + store float %div, float* %grid_height, align 4 + %4 = load float, float* @chip_width, align 4 + %5 = load i32, i32* %col.addr, align 4 + 
%conv1 = sitofp i32 %5 to float + %div2 = fdiv float %4, %conv1 + store float %div2, float* %grid_width, align 4 + %6 = load float, float* @t_chip, align 4 + %conv3 = fpext float %6 to double + %mul = fmul contract double 8.750000e+05, %conv3 + %7 = load float, float* %grid_width, align 4 + %conv4 = fpext float %7 to double + %mul5 = fmul contract double %mul, %conv4 + %8 = load float, float* %grid_height, align 4 + %conv6 = fpext float %8 to double + %mul7 = fmul contract double %mul5, %conv6 + %conv8 = fptrunc double %mul7 to float + store float %conv8, float* %Cap, align 4 + %9 = load float, float* %grid_width, align 4 + %conv9 = fpext float %9 to double + %10 = load float, float* @t_chip, align 4 + %conv10 = fpext float %10 to double + %mul11 = fmul contract double 2.000000e+02, %conv10 + %11 = load float, float* %grid_height, align 4 + %conv12 = fpext float %11 to double + %mul13 = fmul contract double %mul11, %conv12 + %div14 = fdiv double %conv9, %mul13 + %conv15 = fptrunc double %div14 to float + store float %conv15, float* %Rx, align 4 + %12 = load float, float* %grid_height, align 4 + %conv16 = fpext float %12 to double + %13 = load float, float* @t_chip, align 4 + %conv17 = fpext float %13 to double + %mul18 = fmul contract double 2.000000e+02, %conv17 + %14 = load float, float* %grid_width, align 4 + %conv19 = fpext float %14 to double + %mul20 = fmul contract double %mul18, %conv19 + %div21 = fdiv double %conv16, %mul20 + %conv22 = fptrunc double %div21 to float + store float %conv22, float* %Ry, align 4 + %15 = load float, float* @t_chip, align 4 + %16 = load float, float* %grid_height, align 4 + %mul23 = fmul contract float 1.000000e+02, %16 + %17 = load float, float* %grid_width, align 4 + %mul24 = fmul contract float %mul23, %17 + %div25 = fdiv float %15, %mul24 + store float %div25, float* %Rz, align 4 + %18 = load float, float* @t_chip, align 4 + %conv26 = fpext float %18 to double + %mul27 = fmul contract double 5.000000e-01, %conv26 + %mul28 = 
fmul contract double %mul27, 1.750000e+06 + %div29 = fdiv double 3.000000e+06, %mul28 + %conv30 = fptrunc double %div29 to float + store float %conv30, float* %max_slope, align 4 + %19 = load float, float* %max_slope, align 4 + %conv31 = fpext float %19 to double + %div32 = fdiv double 1.000000e-03, %conv31 + %conv33 = fptrunc double %div32 to float + store float %conv33, float* %step, align 4 + store float 0x3F50624DE0000000, float* %time_elapsed, align 4 + store i32 1, i32* %src, align 4 + store i32 0, i32* %dst, align 4 + store float 0.000000e+00, float* %t, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %20 = load float, float* %t, align 4 + %21 = load i32, i32* %total_iterations.addr, align 4 + %conv34 = sitofp i32 %21 to float + %cmp = fcmp olt float %20, %conv34 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %22 = load i32, i32* %src, align 4 + store i32 %22, i32* %temp, align 4 + %23 = load i32, i32* %dst, align 4 + store i32 %23, i32* %src, align 4 + %24 = load i32, i32* %temp, align 4 + store i32 %24, i32* %dst, align 4 + %25 = bitcast %struct.dim3* %agg.tmp to i8* + %26 = bitcast %struct.dim3* %dimGrid to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %25, i8* align 4 %26, i64 12, i1 false) + %27 = bitcast %struct.dim3* %agg.tmp35 to i8* + %28 = bitcast %struct.dim3* %dimBlock to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %27, i8* align 4 %28, i64 12, i1 false) + %29 = bitcast { i64, i32 }* %agg.tmp.coerce to i8* + %30 = bitcast %struct.dim3* %agg.tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %29, i8* align 4 %30, i64 12, i1 false) + %31 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 0 + %32 = load i64, i64* %31, align 4 + %33 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 1 + %34 = load i32, i32* %33, align 4 + %35 = bitcast { i64, i32 }* %agg.tmp35.coerce to i8* + %36 = bitcast %struct.dim3* 
%agg.tmp35 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %35, i8* align 4 %36, i64 12, i1 false) + %37 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp35.coerce, i32 0, i32 0 + %38 = load i64, i64* %37, align 4 + %39 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp35.coerce, i32 0, i32 1 + %40 = load i32, i32* %39, align 4 + %call = call i32 @__cudaPushCallConfiguration(i64 %32, i32 %34, i64 %38, i32 %40, i64 0, i8* null) + %tobool = icmp ne i32 %call, 0 + br i1 %tobool, label %kcall.end, label %kcall.configok + +kcall.configok: ; preds = %for.body + %41 = load i32, i32* %num_iterations.addr, align 4 + %conv36 = sitofp i32 %41 to float + %42 = load i32, i32* %total_iterations.addr, align 4 + %conv37 = sitofp i32 %42 to float + %43 = load float, float* %t, align 4 + %sub = fsub contract float %conv37, %43 + %cmp38 = fcmp ole float %conv36, %sub + br i1 %cmp38, label %cond.true, label %cond.false + +cond.true: ; preds = %kcall.configok + %44 = load i32, i32* %num_iterations.addr, align 4 + %conv39 = sitofp i32 %44 to float + br label %cond.end + +cond.false: ; preds = %kcall.configok + %45 = load i32, i32* %total_iterations.addr, align 4 + %conv40 = sitofp i32 %45 to float + %46 = load float, float* %t, align 4 + %sub41 = fsub contract float %conv40, %46 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi float [ %conv39, %cond.true ], [ %sub41, %cond.false ] + %conv42 = fptosi float %cond to i32 + %47 = load float*, float** %MatrixPower.addr, align 8 + %48 = load float**, float*** %MatrixTemp.addr, align 8 + %49 = load i32, i32* %src, align 4 + %idxprom = sext i32 %49 to i64 + %arrayidx = getelementptr inbounds float*, float** %48, i64 %idxprom + %50 = load float*, float** %arrayidx, align 8 + %51 = load float**, float*** %MatrixTemp.addr, align 8 + %52 = load i32, i32* %dst, align 4 + %idxprom43 = sext i32 %52 to i64 + %arrayidx44 = getelementptr inbounds float*, float** %51, i64 %idxprom43 + %53 = 
load float*, float** %arrayidx44, align 8 + %54 = load i32, i32* %col.addr, align 4 + %55 = load i32, i32* %row.addr, align 4 + %56 = load i32, i32* %borderCols.addr, align 4 + %57 = load i32, i32* %borderRows.addr, align 4 + %58 = load float, float* %Cap, align 4 + %59 = load float, float* %Rx, align 4 + %60 = load float, float* %Ry, align 4 + %61 = load float, float* %Rz, align 4 + %62 = load float, float* %step, align 4 + %63 = load float, float* %time_elapsed, align 4 + call void @_Z14calculate_tempiPfS_S_iiiiffffff(i32 %conv42, float* %47, float* %50, float* %53, i32 %54, i32 %55, i32 %56, i32 %57, float %58, float %59, float %60, float %61, float %62, float %63) + br label %kcall.end + +kcall.end: ; preds = %cond.end, %for.body + %call45 = call i32 @cudaDeviceSynchronize() + br label %for.inc + +for.inc: ; preds = %kcall.end + %64 = load i32, i32* %num_iterations.addr, align 4 + %conv46 = sitofp i32 %64 to float + %65 = load float, float* %t, align 4 + %add = fadd contract float %65, %conv46 + store float %add, float* %t, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %66 = load i32, i32* %dst, align 4 + ret i32 %66 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN4dim3C2Ejjj(%struct.dim3* %this, i32 %vx, i32 %vy, i32 %vz) unnamed_addr #4 comdat align 2 { +entry: + %this.addr = alloca %struct.dim3*, align 8 + %vx.addr = alloca i32, align 4 + %vy.addr = alloca i32, align 4 + %vz.addr = alloca i32, align 4 + store %struct.dim3* %this, %struct.dim3** %this.addr, align 8 + store i32 %vx, i32* %vx.addr, align 4 + store i32 %vy, i32* %vy.addr, align 4 + store i32 %vz, i32* %vz.addr, align 4 + %this1 = load %struct.dim3*, %struct.dim3** %this.addr, align 8 + %x = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 0 + %0 = load i32, i32* %vx.addr, align 4 + store i32 %0, i32* %x, align 4 + %y = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 1 + %1 = load i32, 
i32* %vy.addr, align 4 + store i32 %1, i32* %y, align 4 + %z = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 2 + %2 = load i32, i32* %vz.addr, align 4 + store i32 %2, i32* %z, align 4 + ret void +} + +declare dso_local i32 @__cudaPushCallConfiguration(i64, i32, i64, i32, i64, i8*) #1 + +declare dso_local i32 @cudaDeviceSynchronize() #1 + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z5usageiPPc(i32 %argc, i8** %argv) #0 { +entry: + %argc.addr = alloca i32, align 4 + %argv.addr = alloca i8**, align 8 + store i32 %argc, i32* %argc.addr, align 4 + store i8** %argv, i8*** %argv.addr, align 8 + %0 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %1 = load i8**, i8*** %argv.addr, align 8 + %arrayidx = getelementptr inbounds i8*, i8** %1, i64 0 + %2 = load i8*, i8** %arrayidx, align 8 + %call = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %0, i8* getelementptr inbounds ([100 x i8], [100 x i8]* @.str.8, i64 0, i64 0), i8* %2) + %3 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %call1 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %3, i8* getelementptr inbounds ([78 x i8], [78 x i8]* @.str.9, i64 0, i64 0)) + %4 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %call2 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %4, i8* getelementptr inbounds ([53 x i8], [53 x i8]* @.str.10, i64 0, i64 0)) + %5 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %call3 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %5, i8* getelementptr inbounds ([38 x i8], [38 x i8]* @.str.11, i64 0, i64 0)) + %6 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %call4 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %6, i8* getelementptr inbounds ([89 x i8], [89 x i8]* @.str.12, i64 0, i64 0)) + %7 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %call5 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %7, i8* getelementptr inbounds ([86 x i8], [86 x i8]* @.str.13, i64 0, i64 0)) + %8 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %call6 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %8, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str.14, i64 0, i64 0)) + call void @exit(i32 1) #9 + unreachable +} + +; Function Attrs: noreturn nounwind +declare dso_local void @exit(i32) #5 + +; Function Attrs: noinline norecurse optnone uwtable +define dso_local i32 @main(i32 %argc, i8** %argv) #6 { +entry: + %retval = alloca i32, align 4 + %argc.addr = alloca i32, align 4 + %argv.addr = alloca i8**, align 8 + store i32 0, i32* %retval, align 4 + store i32 %argc, i32* %argc.addr, align 4 + store i8** %argv, i8*** %argv.addr, align 8 + %call = call i32 @cudaSetDevice(i32 0) + %call1 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([29 x i8], [29 x i8]* @.str.15, i64 0, i64 0), i32 16, i32 16) + %0 = load i32, i32* %argc.addr, align 4 + %1 = load i8**, i8*** %argv.addr, align 8 + call void @_Z3runiPPc(i32 %0, i8** %1) + ret i32 0 +} + +declare dso_local i32 @cudaSetDevice(i32) #1 + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z3runiPPc(i32 %argc, i8** %argv) #0 { +entry: + %argc.addr = alloca i32, align 4 + %argv.addr = alloca i8**, align 8 + %size = alloca i32, align 4 + %grid_rows = alloca i32, align 4 + %grid_cols = alloca i32, align 4 + %FilesavingTemp = alloca float*, align 8 + %FilesavingPower = alloca float*, align 8 + %MatrixOut = alloca float*, align 8 + %tfile = alloca i8*, align 8 + %pfile = alloca i8*, align 8 + %ofile = alloca i8*, align 8 + %total_iterations = alloca i32, align 4 + %pyramid_height = alloca i32, align 4 + %borderCols = alloca i32, align 4 + %borderRows = alloca i32, align 4 + %smallBlockCol = alloca i32, align 4 + %smallBlockRow = alloca i32, align 4 + %blockCols = alloca i32, align 4 + %blockRows = alloca i32, align 4 + %MatrixTemp = alloca [2 x float*], align 16 + %MatrixPower = alloca float*, align 8 + %ret = alloca i32, align 4 + store i32 %argc, i32* %argc.addr, align 4 + store i8** %argv, i8*** %argv.addr, align 8 + store i32 60, i32* %total_iterations, align 4 + store i32 1, i32* %pyramid_height, align 4 + %0 = load i32, i32* %argc.addr, align 4 + %cmp = icmp ne i32 %0, 7 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %1 = load i32, i32* %argc.addr, align 4 + %2 = load i8**, i8*** %argv.addr, align 8 + call void @_Z5usageiPPc(i32 %1, i8** %2) + br label %if.end + +if.end: ; preds = %if.then, %entry + %3 = load i8**, i8*** %argv.addr, align 8 + %arrayidx = getelementptr inbounds i8*, i8** %3, i64 1 + %4 = load i8*, i8** %arrayidx, align 8 + %call = call i32 @atoi(i8* %4) #10 + store i32 %call, i32* %grid_rows, align 4 + %cmp1 = icmp sle i32 %call, 0 + br i1 %cmp1, label 
%if.then13, label %lor.lhs.false + +lor.lhs.false: ; preds = %if.end + %5 = load i8**, i8*** %argv.addr, align 8 + %arrayidx2 = getelementptr inbounds i8*, i8** %5, i64 1 + %6 = load i8*, i8** %arrayidx2, align 8 + %call3 = call i32 @atoi(i8* %6) #10 + store i32 %call3, i32* %grid_cols, align 4 + %cmp4 = icmp sle i32 %call3, 0 + br i1 %cmp4, label %if.then13, label %lor.lhs.false5 + +lor.lhs.false5: ; preds = %lor.lhs.false + %7 = load i8**, i8*** %argv.addr, align 8 + %arrayidx6 = getelementptr inbounds i8*, i8** %7, i64 2 + %8 = load i8*, i8** %arrayidx6, align 8 + %call7 = call i32 @atoi(i8* %8) #10 + store i32 %call7, i32* %pyramid_height, align 4 + %cmp8 = icmp sle i32 %call7, 0 + br i1 %cmp8, label %if.then13, label %lor.lhs.false9 + +lor.lhs.false9: ; preds = %lor.lhs.false5 + %9 = load i8**, i8*** %argv.addr, align 8 + %arrayidx10 = getelementptr inbounds i8*, i8** %9, i64 3 + %10 = load i8*, i8** %arrayidx10, align 8 + %call11 = call i32 @atoi(i8* %10) #10 + store i32 %call11, i32* %total_iterations, align 4 + %cmp12 = icmp sle i32 %call11, 0 + br i1 %cmp12, label %if.then13, label %if.end14 + +if.then13: ; preds = %lor.lhs.false9, %lor.lhs.false5, %lor.lhs.false, %if.end + %11 = load i32, i32* %argc.addr, align 4 + %12 = load i8**, i8*** %argv.addr, align 8 + call void @_Z5usageiPPc(i32 %11, i8** %12) + br label %if.end14 + +if.end14: ; preds = %if.then13, %lor.lhs.false9 + %13 = load i8**, i8*** %argv.addr, align 8 + %arrayidx15 = getelementptr inbounds i8*, i8** %13, i64 4 + %14 = load i8*, i8** %arrayidx15, align 8 + store i8* %14, i8** %tfile, align 8 + %15 = load i8**, i8*** %argv.addr, align 8 + %arrayidx16 = getelementptr inbounds i8*, i8** %15, i64 5 + %16 = load i8*, i8** %arrayidx16, align 8 + store i8* %16, i8** %pfile, align 8 + %17 = load i8**, i8*** %argv.addr, align 8 + %arrayidx17 = getelementptr inbounds i8*, i8** %17, i64 6 + %18 = load i8*, i8** %arrayidx17, align 8 + store i8* %18, i8** %ofile, align 8 + %19 = load i32, i32* 
%grid_rows, align 4 + %20 = load i32, i32* %grid_cols, align 4 + %mul = mul nsw i32 %19, %20 + store i32 %mul, i32* %size, align 4 + %21 = load i32, i32* %pyramid_height, align 4 + %mul18 = mul nsw i32 %21, 2 + %div = sdiv i32 %mul18, 2 + store i32 %div, i32* %borderCols, align 4 + %22 = load i32, i32* %pyramid_height, align 4 + %mul19 = mul nsw i32 %22, 2 + %div20 = sdiv i32 %mul19, 2 + store i32 %div20, i32* %borderRows, align 4 + %23 = load i32, i32* %pyramid_height, align 4 + %mul21 = mul nsw i32 %23, 2 + %sub = sub nsw i32 16, %mul21 + store i32 %sub, i32* %smallBlockCol, align 4 + %24 = load i32, i32* %pyramid_height, align 4 + %mul22 = mul nsw i32 %24, 2 + %sub23 = sub nsw i32 16, %mul22 + store i32 %sub23, i32* %smallBlockRow, align 4 + %25 = load i32, i32* %grid_cols, align 4 + %26 = load i32, i32* %smallBlockCol, align 4 + %div24 = sdiv i32 %25, %26 + %27 = load i32, i32* %grid_cols, align 4 + %28 = load i32, i32* %smallBlockCol, align 4 + %rem = srem i32 %27, %28 + %cmp25 = icmp eq i32 %rem, 0 + %29 = zext i1 %cmp25 to i64 + %cond = select i1 %cmp25, i32 0, i32 1 + %add = add nsw i32 %div24, %cond + store i32 %add, i32* %blockCols, align 4 + %30 = load i32, i32* %grid_rows, align 4 + %31 = load i32, i32* %smallBlockRow, align 4 + %div26 = sdiv i32 %30, %31 + %32 = load i32, i32* %grid_rows, align 4 + %33 = load i32, i32* %smallBlockRow, align 4 + %rem27 = srem i32 %32, %33 + %cmp28 = icmp eq i32 %rem27, 0 + %34 = zext i1 %cmp28 to i64 + %cond29 = select i1 %cmp28, i32 0, i32 1 + %add30 = add nsw i32 %div26, %cond29 + store i32 %add30, i32* %blockRows, align 4 + %35 = load i32, i32* %size, align 4 + %conv = sext i32 %35 to i64 + %mul31 = mul i64 %conv, 4 + %call32 = call noalias i8* @malloc(i64 %mul31) #8 + %36 = bitcast i8* %call32 to float* + store float* %36, float** %FilesavingTemp, align 8 + %37 = load i32, i32* %size, align 4 + %conv33 = sext i32 %37 to i64 + %mul34 = mul i64 %conv33, 4 + %call35 = call noalias i8* @malloc(i64 %mul34) #8 + %38 = 
bitcast i8* %call35 to float* + store float* %38, float** %FilesavingPower, align 8 + %39 = load i32, i32* %size, align 4 + %conv36 = sext i32 %39 to i64 + %call37 = call noalias i8* @calloc(i64 %conv36, i64 4) #8 + %40 = bitcast i8* %call37 to float* + store float* %40, float** %MatrixOut, align 8 + %41 = load float*, float** %FilesavingPower, align 8 + %tobool = icmp ne float* %41, null + br i1 %tobool, label %lor.lhs.false38, label %if.then42 + +lor.lhs.false38: ; preds = %if.end14 + %42 = load float*, float** %FilesavingTemp, align 8 + %tobool39 = icmp ne float* %42, null + br i1 %tobool39, label %lor.lhs.false40, label %if.then42 + +lor.lhs.false40: ; preds = %lor.lhs.false38 + %43 = load float*, float** %MatrixOut, align 8 + %tobool41 = icmp ne float* %43, null + br i1 %tobool41, label %if.end43, label %if.then42 + +if.then42: ; preds = %lor.lhs.false40, %lor.lhs.false38, %if.end14 + call void @_Z5fatalPc(i8* getelementptr inbounds ([26 x i8], [26 x i8]* @.str.16, i64 0, i64 0)) + br label %if.end43 + +if.end43: ; preds = %if.then42, %lor.lhs.false40 + %44 = load i32, i32* %pyramid_height, align 4 + %45 = load i32, i32* %grid_cols, align 4 + %46 = load i32, i32* %grid_rows, align 4 + %47 = load i32, i32* %borderCols, align 4 + %48 = load i32, i32* %borderRows, align 4 + %49 = load i32, i32* %blockCols, align 4 + %50 = load i32, i32* %blockRows, align 4 + %51 = load i32, i32* %smallBlockCol, align 4 + %52 = load i32, i32* %smallBlockRow, align 4 + %call44 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([94 x i8], [94 x i8]* @.str.17, i64 0, i64 0), i32 %44, i32 %45, i32 %46, i32 %47, i32 %48, i32 %49, i32 %50, i32 %51, i32 %52) + %53 = load float*, float** %FilesavingTemp, align 8 + %54 = load i32, i32* %grid_rows, align 4 + %55 = load i32, i32* %grid_cols, align 4 + %56 = load i8*, i8** %tfile, align 8 + call void @_Z9readinputPfiiPc(float* %53, i32 %54, i32 %55, i8* %56) + %57 = load float*, float** %FilesavingPower, align 8 + %58 = load i32, i32* %grid_rows, align 4 + %59 = load i32, i32* %grid_cols, align 4 + %60 = load i8*, i8** %pfile, align 8 + call void @_Z9readinputPfiiPc(float* %57, i32 %58, i32 %59, i8* %60) + %arrayidx45 = getelementptr inbounds [2 x float*], [2 x float*]* %MatrixTemp, i64 0, i64 0 + %61 = bitcast float** %arrayidx45 to i8** + %62 = load i32, i32* %size, align 4 + %conv46 = sext i32 %62 to i64 + %mul47 = mul i64 4, %conv46 + %call48 = call i32 @cudaMalloc(i8** %61, i64 %mul47) + %arrayidx49 = getelementptr inbounds [2 x float*], [2 x float*]* %MatrixTemp, i64 0, i64 1 + %63 = bitcast float** %arrayidx49 to i8** + %64 = load i32, i32* %size, align 4 + %conv50 = sext i32 %64 to i64 + %mul51 = mul i64 4, %conv50 + %call52 = call i32 @cudaMalloc(i8** %63, i64 %mul51) + %arrayidx53 = getelementptr inbounds [2 x float*], [2 x float*]* %MatrixTemp, i64 0, i64 0 + %65 = load float*, float** %arrayidx53, align 16 + %66 = bitcast float* %65 to i8* + %67 = load float*, float** %FilesavingTemp, align 8 + %68 = bitcast float* %67 to i8* + %69 = load i32, i32* %size, align 4 + %conv54 = sext i32 %69 to i64 + %mul55 = mul i64 4, %conv54 + %call56 = call i32 @cudaMemcpy(i8* %66, i8* %68, i64 %mul55, i32 1) + %70 = bitcast float** %MatrixPower to i8** + %71 = load i32, i32* %size, align 4 + %conv57 = sext i32 %71 to i64 + %mul58 = mul i64 4, %conv57 + %call59 = call i32 @cudaMalloc(i8** %70, i64 %mul58) + %72 = load float*, float** %MatrixPower, align 8 + %73 = bitcast float* %72 to i8* + %74 = load float*, float** 
%FilesavingPower, align 8 + %75 = bitcast float* %74 to i8* + %76 = load i32, i32* %size, align 4 + %conv60 = sext i32 %76 to i64 + %mul61 = mul i64 4, %conv60 + %call62 = call i32 @cudaMemcpy(i8* %73, i8* %75, i64 %mul61, i32 1) + %call63 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([43 x i8], [43 x i8]* @.str.18, i64 0, i64 0)) + %77 = load float*, float** %MatrixPower, align 8 + %arraydecay = getelementptr inbounds [2 x float*], [2 x float*]* %MatrixTemp, i64 0, i64 0 + %78 = load i32, i32* %grid_cols, align 4 + %79 = load i32, i32* %grid_rows, align 4 + %80 = load i32, i32* %total_iterations, align 4 + %81 = load i32, i32* %pyramid_height, align 4 + %82 = load i32, i32* %blockCols, align 4 + %83 = load i32, i32* %blockRows, align 4 + %84 = load i32, i32* %borderCols, align 4 + %85 = load i32, i32* %borderRows, align 4 + %call64 = call i32 @_Z17compute_tran_tempPfPS_iiiiiiii(float* %77, float** %arraydecay, i32 %78, i32 %79, i32 %80, i32 %81, i32 %82, i32 %83, i32 %84, i32 %85) + store i32 %call64, i32* %ret, align 4 + %call65 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([19 x i8], [19 x i8]* @.str.19, i64 0, i64 0)) + %86 = load float*, float** %MatrixOut, align 8 + %87 = bitcast float* %86 to i8* + %88 = load i32, i32* %ret, align 4 + %idxprom = sext i32 %88 to i64 + %arrayidx66 = getelementptr inbounds [2 x float*], [2 x float*]* %MatrixTemp, i64 0, i64 %idxprom + %89 = load float*, float** %arrayidx66, align 8 + %90 = bitcast float* %89 to i8* + %91 = load i32, i32* %size, align 4 + %conv67 = sext i32 %91 to i64 + %mul68 = mul i64 4, %conv67 + %call69 = call i32 @cudaMemcpy(i8* %87, i8* %90, i64 %mul68, i32 2) + %92 = load float*, float** %MatrixOut, align 8 + %93 = load i32, i32* %grid_rows, align 4 + %94 = load i32, i32* %grid_cols, align 4 + %95 = load i8*, i8** %ofile, align 8 + call void @_Z11writeoutputPfiiPc(float* %92, i32 %93, i32 %94, i8* %95) + %96 = load float*, float** %MatrixPower, align 8 + %97 = bitcast float* %96 to i8* + %call70 = call i32 @cudaFree(i8* %97) + %arrayidx71 = getelementptr inbounds [2 x float*], [2 x float*]* %MatrixTemp, i64 0, i64 0 + %98 = load float*, float** %arrayidx71, align 16 + %99 = bitcast float* %98 to i8* + %call72 = call i32 @cudaFree(i8* %99) + %arrayidx73 = getelementptr inbounds [2 x float*], [2 x float*]* %MatrixTemp, i64 0, i64 1 + %100 = load float*, float** %arrayidx73, align 8 + %101 = bitcast float* %100 to i8* + %call74 = call i32 @cudaFree(i8* %101) + %102 = load float*, float** %MatrixOut, align 8 + %103 = bitcast float* %102 to i8* + call void @free(i8* %103) #8 + ret void +} + +; Function Attrs: nounwind readonly +declare dso_local i32 @atoi(i8*) #7 + +; Function Attrs: nounwind +declare dso_local noalias i8* @malloc(i64) #2 + +; Function Attrs: nounwind +declare dso_local noalias i8* @calloc(i64, i64) #2 + +declare dso_local i32 @cudaMalloc(i8**, i64) #1 + +declare dso_local i32 @cudaMemcpy(i8*, i8*, i64, i32) #1 + +declare dso_local i32 @cudaFree(i8*) #1 + +; Function Attrs: nounwind +declare dso_local void @free(i8*) #2 + +define 
internal void @__cuda_register_globals(i8** %0) { +entry: + %1 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void (i32, float*, float*, float*, i32, i32, i32, i32, float, float, float, float, float, float)* @_Z14calculate_tempiPfS_S_iiiiffffff to i8*), i8* getelementptr inbounds ([36 x i8], [36 x i8]* @0, i64 0, i64 0), i8* getelementptr inbounds ([36 x i8], [36 x i8]* @0, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null) + ret void +} + +declare dso_local i32 @__cudaRegisterFunction(i8**, i8*, i8*, i8*, i32, i8*, i8*, i8*, i8*, i32*) + +declare dso_local i32 @__cudaRegisterVar(i8**, i8*, i8*, i8*, i32, i32, i32, i32) + +declare dso_local i8** @__cudaRegisterFatBinary(i8*) + +define internal void @__cuda_module_ctor(i8* %0) { +entry: + %1 = call i8** @__cudaRegisterFatBinary(i8* bitcast ({ i32, i32, i8*, i8* }* @__cuda_fatbin_wrapper to i8*)) + store i8** %1, i8*** @__cuda_gpubin_handle, align 8 + call void @__cuda_register_globals(i8** %1) + call void @__cudaRegisterFatBinaryEnd(i8** %1) + %2 = call i32 @atexit(void (i8*)* @__cuda_module_dtor) + ret void +} + +declare dso_local void @__cudaRegisterFatBinaryEnd(i8**) + +declare dso_local void @__cudaUnregisterFatBinary(i8**) + +define internal void @__cuda_module_dtor(i8* %0) { +entry: + %1 = load i8**, i8*** @__cuda_gpubin_handle, align 8 + call void @__cudaUnregisterFatBinary(i8** %1) + ret void +} + +declare dso_local i32 @atexit(void (i8*)*) + +attributes #0 = { noinline optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { 
"correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { argmemonly nounwind willreturn } +attributes #4 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #5 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #6 = { noinline norecurse optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" 
"less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #7 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #8 = { nounwind } +attributes #9 = { noreturn nounwind } +attributes #10 = { nounwind readonly } + +!llvm.module.flags = !{!0, !1} +!llvm.ident = !{!2} + +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"} diff --git a/examples/hotspot/hotspot.cu b/examples/hotspot/hotspot.cu new file mode 100644 index 0000000..9788e82 --- /dev/null +++ b/examples/hotspot/hotspot.cu @@ -0,0 +1,353 @@ +#include +#include +#include +#include + +#ifdef RD_WG_SIZE_0_0 +#define BLOCK_SIZE RD_WG_SIZE_0_0 +#elif defined(RD_WG_SIZE_0) +#define BLOCK_SIZE RD_WG_SIZE_0 +#elif defined(RD_WG_SIZE) +#define BLOCK_SIZE RD_WG_SIZE +#else +#define BLOCK_SIZE 16 +#endif + +#define STR_SIZE 256 + +/* maximum power density possible (say 300W for a 10mm x 10mm chip) */ +#define MAX_PD (3.0e6) +/* required precision in degrees */ +#define PRECISION 0.001 +#define SPEC_HEAT_SI 1.75e6 +#define K_SI 100 +/* capacitance fitting factor */ +#define FACTOR_CHIP 0.5 + +/* chip parameters */ +float t_chip = 0.0005; +float chip_height = 0.016; +float chip_width = 
0.016; +/* ambient temperature, assuming no package at all */ +float amb_temp = 80.0; + +void run(int argc, char **argv); + +/* define timer macros */ +#define pin_stats_reset() startCycle() +#define pin_stats_pause(cycles) stopCycle(cycles) +#define pin_stats_dump(cycles) printf("timer: %Lu\n", cycles) + +void fatal(char *s) { fprintf(stderr, "error: %s\n", s); } + +void writeoutput(float *vect, int grid_rows, int grid_cols, char *file) { + + int i, j, index = 0; + FILE *fp; + char str[STR_SIZE]; + + if ((fp = fopen(file, "w")) == 0) + printf("The file was not opened\n"); + + for (i = 0; i < grid_rows; i++) + for (j = 0; j < grid_cols; j++) { + + sprintf(str, "%d\t%g\n", index, vect[i * grid_cols + j]); + fputs(str, fp); + index++; + } + + fclose(fp); +} + +void readinput(float *vect, int grid_rows, int grid_cols, char *file) { + + int i, j; + FILE *fp; + char str[STR_SIZE]; + float val; + + if ((fp = fopen(file, "r")) == 0) + printf("The file was not opened\n"); + + for (i = 0; i <= grid_rows - 1; i++) + for (j = 0; j <= grid_cols - 1; j++) { + fgets(str, STR_SIZE, fp); + if (feof(fp)) + fatal("not enough lines in file"); + // if ((sscanf(str, "%d%f", &index, &val) != 2) || (index != + // ((i-1)*(grid_cols-2)+j-1))) + if ((sscanf(str, "%f", &val) != 1)) + fatal("invalid file format"); + vect[i * grid_cols + j] = val; + } + + fclose(fp); +} + +#define IN_RANGE(x, min, max) ((x) >= (min) && (x) <= (max)) +#define CLAMP_RANGE(x, min, max) x = (x < (min)) ? min : ((x > (max)) ? max : x) +#define MIN(a, b) ((a) <= (b) ? 
(a) : (b)) + +__global__ void calculate_temp(int iteration, // number of iteration + float *power, // power input + float *temp_src, // temperature input/output + float *temp_dst, // temperature input/output + int grid_cols, // Col of grid + int grid_rows, // Row of grid + int border_cols, // border offset + int border_rows, // border offset + float Cap, // Capacitance + float Rx, float Ry, float Rz, float step, + float time_elapsed) { + + __shared__ float temp_on_cuda[BLOCK_SIZE][BLOCK_SIZE]; + __shared__ float power_on_cuda[BLOCK_SIZE][BLOCK_SIZE]; + __shared__ float temp_t[BLOCK_SIZE] + [BLOCK_SIZE]; // saving temparary temperature result + + float amb_temp = 80.0; + float step_div_Cap; + float Rx_1, Ry_1, Rz_1; + + int bx = blockIdx.x; + int by = blockIdx.y; + + int tx = threadIdx.x; + int ty = threadIdx.y; + + step_div_Cap = step / Cap; + + Rx_1 = 1 / Rx; + Ry_1 = 1 / Ry; + Rz_1 = 1 / Rz; + + // each block finally computes result for a small block + // after N iterations. + // it is the non-overlapping small blocks that cover + // all the input data + + // calculate the small block size + int small_block_rows = BLOCK_SIZE - iteration * 2; // EXPAND_RATE + int small_block_cols = BLOCK_SIZE - iteration * 2; // EXPAND_RATE + + // calculate the boundary for the block according to + // the boundary of its small block + int blkY = small_block_rows * by - border_rows; + int blkX = small_block_cols * bx - border_cols; + int blkYmax = blkY + BLOCK_SIZE - 1; + int blkXmax = blkX + BLOCK_SIZE - 1; + + // calculate the global thread coordination + int yidx = blkY + ty; + int xidx = blkX + tx; + + // load data if it is within the valid input range + int loadYidx = yidx, loadXidx = xidx; + int index = grid_cols * loadYidx + loadXidx; + + if (IN_RANGE(loadYidx, 0, grid_rows - 1) && + IN_RANGE(loadXidx, 0, grid_cols - 1)) { + temp_on_cuda[ty][tx] = temp_src[index]; // Load the temperature data from + // global memory to shared memory + power_on_cuda[ty][tx] = + power[index]; 
// Load the power data from global memory to shared memory + } + __syncthreads(); + + // effective range within this block that falls within + // the valid range of the input data + // used to rule out computation outside the boundary. + int validYmin = (blkY < 0) ? -blkY : 0; + int validYmax = (blkYmax > grid_rows - 1) + ? BLOCK_SIZE - 1 - (blkYmax - grid_rows + 1) + : BLOCK_SIZE - 1; + int validXmin = (blkX < 0) ? -blkX : 0; + int validXmax = (blkXmax > grid_cols - 1) + ? BLOCK_SIZE - 1 - (blkXmax - grid_cols + 1) + : BLOCK_SIZE - 1; + + int N = ty - 1; + int S = ty + 1; + int W = tx - 1; + int E = tx + 1; + + N = (N < validYmin) ? validYmin : N; + S = (S > validYmax) ? validYmax : S; + W = (W < validXmin) ? validXmin : W; + E = (E > validXmax) ? validXmax : E; + + bool computed; + for (int i = 0; i < iteration; i++) { + computed = false; + if (IN_RANGE(tx, i + 1, BLOCK_SIZE - i - 2) && + IN_RANGE(ty, i + 1, BLOCK_SIZE - i - 2) && + IN_RANGE(tx, validXmin, validXmax) && + IN_RANGE(ty, validYmin, validYmax)) { + computed = true; + temp_t[ty][tx] = + temp_on_cuda[ty][tx] + + step_div_Cap * (power_on_cuda[ty][tx] + + (temp_on_cuda[S][tx] + temp_on_cuda[N][tx] - + 2.0 * temp_on_cuda[ty][tx]) * + Ry_1 + + (temp_on_cuda[ty][E] + temp_on_cuda[ty][W] - + 2.0 * temp_on_cuda[ty][tx]) * + Rx_1 + + (amb_temp - temp_on_cuda[ty][tx]) * Rz_1); + } + __syncthreads(); + if (i == iteration - 1) + break; + if (computed) // Assign the computation range + temp_on_cuda[ty][tx] = temp_t[ty][tx]; + __syncthreads(); + } + + // update the global memory + // after the last iteration, only threads coordinated within the + // small block perform the calculation and switch on ``computed'' + if (computed) { + temp_dst[index] = temp_t[ty][tx]; + } +} + +/* + compute N time steps +*/ + +int compute_tran_temp(float *MatrixPower, float *MatrixTemp[2], int col, + int row, int total_iterations, int num_iterations, + int blockCols, int blockRows, int borderCols, + int borderRows) { + dim3 
dimBlock(BLOCK_SIZE, BLOCK_SIZE); + dim3 dimGrid(blockCols, blockRows); + + float grid_height = chip_height / row; + float grid_width = chip_width / col; + + float Cap = FACTOR_CHIP * SPEC_HEAT_SI * t_chip * grid_width * grid_height; + float Rx = grid_width / (2.0 * K_SI * t_chip * grid_height); + float Ry = grid_height / (2.0 * K_SI * t_chip * grid_width); + float Rz = t_chip / (K_SI * grid_height * grid_width); + + float max_slope = MAX_PD / (FACTOR_CHIP * t_chip * SPEC_HEAT_SI); + float step = PRECISION / max_slope; + float t; + float time_elapsed; + time_elapsed = 0.001; + + int src = 1, dst = 0; + + for (t = 0; t < total_iterations; t += num_iterations) { + int temp = src; + src = dst; + dst = temp; + calculate_temp<<>>( + MIN(num_iterations, total_iterations - t), MatrixPower, MatrixTemp[src], + MatrixTemp[dst], col, row, borderCols, borderRows, Cap, Rx, Ry, Rz, + step, time_elapsed); + cudaDeviceSynchronize(); + } + return dst; +} + +void usage(int argc, char **argv) { + fprintf(stderr, + "Usage: %s " + " \n", + argv[0]); + fprintf(stderr, "\t - number of rows/cols in the grid " + "(positive integer)\n"); + fprintf(stderr, "\t - pyramid heigh(positive integer)\n"); + fprintf(stderr, "\t - number of iterations\n"); + fprintf(stderr, "\t - name of the file containing the initial " + "temperature values of each cell\n"); + fprintf(stderr, "\t - name of the file containing the dissipated " + "power values of each cell\n"); + fprintf(stderr, "\t - name of the output file\n"); + exit(1); +} + +int main(int argc, char **argv) { + cudaSetDevice(0); + printf("WG size of kernel = %d X %d\n", BLOCK_SIZE, BLOCK_SIZE); + + run(argc, argv); + + return EXIT_SUCCESS; +} + +void run(int argc, char **argv) { + int size; + int grid_rows, grid_cols; + float *FilesavingTemp, *FilesavingPower, *MatrixOut; + char *tfile, *pfile, *ofile; + + int total_iterations = 60; + int pyramid_height = 1; // number of iterations + + if (argc != 7) + usage(argc, argv); + if ((grid_rows = 
atoi(argv[1])) <= 0 || (grid_cols = atoi(argv[1])) <= 0 || + (pyramid_height = atoi(argv[2])) <= 0 || + (total_iterations = atoi(argv[3])) <= 0) + usage(argc, argv); + + tfile = argv[4]; + pfile = argv[5]; + ofile = argv[6]; + + size = grid_rows * grid_cols; + +/* --------------- pyramid parameters --------------- */ +#define EXPAND_RATE \ + 2 // add one iteration will extend the pyramid base by 2 per each borderline + int borderCols = (pyramid_height)*EXPAND_RATE / 2; + int borderRows = (pyramid_height)*EXPAND_RATE / 2; + int smallBlockCol = BLOCK_SIZE - (pyramid_height)*EXPAND_RATE; + int smallBlockRow = BLOCK_SIZE - (pyramid_height)*EXPAND_RATE; + int blockCols = + grid_cols / smallBlockCol + ((grid_cols % smallBlockCol == 0) ? 0 : 1); + int blockRows = + grid_rows / smallBlockRow + ((grid_rows % smallBlockRow == 0) ? 0 : 1); + + FilesavingTemp = (float *)malloc(size * sizeof(float)); + FilesavingPower = (float *)malloc(size * sizeof(float)); + MatrixOut = (float *)calloc(size, sizeof(float)); + + if (!FilesavingPower || !FilesavingTemp || !MatrixOut) + fatal("unable to allocate memory"); + + printf("pyramidHeight: %d\ngridSize: [%d, %d]\nborder:[%d, " + "%d]\nblockGrid:[%d, %d]\ntargetBlock:[%d, %d]\n", + pyramid_height, grid_cols, grid_rows, borderCols, borderRows, + blockCols, blockRows, smallBlockCol, smallBlockRow); + + readinput(FilesavingTemp, grid_rows, grid_cols, tfile); + readinput(FilesavingPower, grid_rows, grid_cols, pfile); + + float *MatrixTemp[2], *MatrixPower; + cudaMalloc((void **)&MatrixTemp[0], sizeof(float) * size); + cudaMalloc((void **)&MatrixTemp[1], sizeof(float) * size); + cudaMemcpy(MatrixTemp[0], FilesavingTemp, sizeof(float) * size, + cudaMemcpyHostToDevice); + + cudaMalloc((void **)&MatrixPower, sizeof(float) * size); + cudaMemcpy(MatrixPower, FilesavingPower, sizeof(float) * size, + cudaMemcpyHostToDevice); + printf("Start computing the transient temperature\n"); + int ret = compute_tran_temp(MatrixPower, MatrixTemp, grid_cols, 
grid_rows, + total_iterations, pyramid_height, blockCols, + blockRows, borderCols, borderRows); + printf("Ending simulation\n"); + cudaMemcpy(MatrixOut, MatrixTemp[ret], sizeof(float) * size, + cudaMemcpyDeviceToHost); + + writeoutput(MatrixOut, grid_rows, grid_cols, ofile); + + cudaFree(MatrixPower); + cudaFree(MatrixTemp[0]); + cudaFree(MatrixTemp[1]); + free(MatrixOut); +} diff --git a/examples/hotspot/run.sh b/examples/hotspot/run.sh new file mode 100644 index 0000000..679325d --- /dev/null +++ b/examples/hotspot/run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +set -e +llvm-as hotspot-cuda-nvptx64-nvidia-cuda-sm_61.ll +llvm-as hotspot-host-x86_64-unknown-linux-gnu.ll +../../build/compilation/kernelTranslator hotspot-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc +../../build/compilation/hostTranslator hotspot-host-x86_64-unknown-linux-gnu.bc host.bc + +llc --relocation-model=pic --filetype=obj kernel.bc +llc --relocation-model=pic --filetype=obj host.bc + +g++ -Wall -L../../build/runtime -L../../build/runtime/threadPool \ + -o hotspot -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread + +export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH +./hotspot 512 2 2 ../../rodinia-data/hotspot/temp_512 ../../rodinia-data/hotspot/power_512 output.out +if head output.out | grep -q "323.829"; then + echo "Pass" +else + echo "Error result" + exit 1 +fi diff --git a/examples/hotspot3D/3D-cuda-nvptx64-nvidia-cuda-sm_61.ll b/examples/hotspot3D/3D-cuda-nvptx64-nvidia-cuda-sm_61.ll new file mode 100644 index 0000000..46c4ec2 --- /dev/null +++ b/examples/hotspot3D/3D-cuda-nvptx64-nvidia-cuda-sm_61.ll @@ -0,0 +1,587 @@ +; ModuleID = '3D-cuda-nvptx64-nvidia-cuda-sm_61.bc' +source_filename = "3D.cu" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +%struct.__cuda_builtin_blockDim_t = type { i8 } +%struct.__cuda_builtin_blockIdx_t = type { i8 } +%struct.__cuda_builtin_threadIdx_t = 
type { i8 } +%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 } + +$_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv = comdat any + +$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any + +$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any + +$_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv = comdat any + +$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv = comdat any + +$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv = comdat any + +@blockDim = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockDim_t, align 1 +@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1 +@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1 + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 { +entry: + %p.addr = alloca i8**, align 8 + %s.addr = alloca i64, align 8 + store i8** %p, i8*** %p.addr, align 8 + store i64 %s, i64* %s.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 { +entry: + %p.addr = alloca %struct.cudaFuncAttributes*, align 8 + %c.addr = alloca i8*, align 8 + store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8 + store i8* %c, i8** %c.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 { +entry: + %value.addr = alloca i32*, align 8 + %attr.addr = alloca i32, align 4 + %device.addr = alloca i32, align 4 + store i32* %value, i32** %value.addr, align 8 + store i32 %attr, i32* %attr.addr, align 4 + store i32 %device, i32* %device.addr, align 4 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local 
i32 @cudaGetDevice(i32* %device) #0 { +entry: + %device.addr = alloca i32*, align 8 + store i32* %device, i32** %device.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 { +entry: + %numBlocks.addr = alloca i32*, align 8 + %func.addr = alloca i8*, align 8 + %blockSize.addr = alloca i32, align 4 + %dynamicSmemSize.addr = alloca i64, align 8 + store i32* %numBlocks, i32** %numBlocks.addr, align 8 + store i8* %func, i8** %func.addr, align 8 + store i32 %blockSize, i32* %blockSize.addr, align 4 + store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 { +entry: + %numBlocks.addr = alloca i32*, align 8 + %func.addr = alloca i8*, align 8 + %blockSize.addr = alloca i32, align 4 + %dynamicSmemSize.addr = alloca i64, align 8 + %flags.addr = alloca i32, align 4 + store i32* %numBlocks, i32** %numBlocks.addr, align 8 + store i8* %func, i8** %func.addr, align 8 + store i32 %blockSize, i32* %blockSize.addr, align 4 + store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8 + store i32 %flags, i32* %flags.addr, align 4 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define dso_local void @_Z11hotspotOpt1PfS_S_fiiifffffff(float* %p, float* %tIn, float* %tOut, float %sdc, i32 %nx, i32 %ny, i32 %nz, float %ce, float %cw, float %cn, float %cs, float %ct, float %cb, float %cc) #0 { +entry: + %p.addr = alloca float*, align 8 + %tIn.addr = alloca float*, align 8 + %tOut.addr = alloca float*, align 8 + %sdc.addr = alloca float, align 4 + %nx.addr = alloca i32, align 4 + %ny.addr = alloca i32, align 4 + %nz.addr = alloca i32, align 
4 + %ce.addr = alloca float, align 4 + %cw.addr = alloca float, align 4 + %cn.addr = alloca float, align 4 + %cs.addr = alloca float, align 4 + %ct.addr = alloca float, align 4 + %cb.addr = alloca float, align 4 + %cc.addr = alloca float, align 4 + %amb_temp = alloca float, align 4 + %i = alloca i32, align 4 + %j = alloca i32, align 4 + %c = alloca i32, align 4 + %xy = alloca i32, align 4 + %W = alloca i32, align 4 + %E = alloca i32, align 4 + %N = alloca i32, align 4 + %S = alloca i32, align 4 + %temp1 = alloca float, align 4 + %temp2 = alloca float, align 4 + %temp3 = alloca float, align 4 + %k = alloca i32, align 4 + store float* %p, float** %p.addr, align 8 + store float* %tIn, float** %tIn.addr, align 8 + store float* %tOut, float** %tOut.addr, align 8 + store float %sdc, float* %sdc.addr, align 4 + store i32 %nx, i32* %nx.addr, align 4 + store i32 %ny, i32* %ny.addr, align 4 + store i32 %nz, i32* %nz.addr, align 4 + store float %ce, float* %ce.addr, align 4 + store float %cw, float* %cw.addr, align 4 + store float %cn, float* %cn.addr, align 4 + store float %cs, float* %cs.addr, align 4 + store float %ct, float* %ct.addr, align 4 + store float %cb, float* %cb.addr, align 4 + store float %cc, float* %cc.addr, align 4 + store float 8.000000e+01, float* %amb_temp, align 4 + %call = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3 + %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3 + %mul = mul i32 %call, %call1 + %call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3 + %add = add i32 %mul, %call2 + store i32 %add, i32* %i, align 4 + %call3 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #3 + %call4 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #3 + %mul5 = mul i32 %call3, %call4 + %call6 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #3 + %add7 = add i32 %mul5, %call6 + store i32 %add7, i32* %j, align 4 + %0 = load i32, i32* %i, align 4 + 
%1 = load i32, i32* %j, align 4 + %2 = load i32, i32* %nx.addr, align 4 + %mul8 = mul nsw i32 %1, %2 + %add9 = add nsw i32 %0, %mul8 + store i32 %add9, i32* %c, align 4 + %3 = load i32, i32* %nx.addr, align 4 + %4 = load i32, i32* %ny.addr, align 4 + %mul10 = mul nsw i32 %3, %4 + store i32 %mul10, i32* %xy, align 4 + %5 = load i32, i32* %i, align 4 + %cmp = icmp eq i32 %5, 0 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + %6 = load i32, i32* %c, align 4 + br label %cond.end + +cond.false: ; preds = %entry + %7 = load i32, i32* %c, align 4 + %sub = sub nsw i32 %7, 1 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ %6, %cond.true ], [ %sub, %cond.false ] + store i32 %cond, i32* %W, align 4 + %8 = load i32, i32* %i, align 4 + %9 = load i32, i32* %nx.addr, align 4 + %sub11 = sub nsw i32 %9, 1 + %cmp12 = icmp eq i32 %8, %sub11 + br i1 %cmp12, label %cond.true13, label %cond.false14 + +cond.true13: ; preds = %cond.end + %10 = load i32, i32* %c, align 4 + br label %cond.end16 + +cond.false14: ; preds = %cond.end + %11 = load i32, i32* %c, align 4 + %add15 = add nsw i32 %11, 1 + br label %cond.end16 + +cond.end16: ; preds = %cond.false14, %cond.true13 + %cond17 = phi i32 [ %10, %cond.true13 ], [ %add15, %cond.false14 ] + store i32 %cond17, i32* %E, align 4 + %12 = load i32, i32* %j, align 4 + %cmp18 = icmp eq i32 %12, 0 + br i1 %cmp18, label %cond.true19, label %cond.false20 + +cond.true19: ; preds = %cond.end16 + %13 = load i32, i32* %c, align 4 + br label %cond.end22 + +cond.false20: ; preds = %cond.end16 + %14 = load i32, i32* %c, align 4 + %15 = load i32, i32* %nx.addr, align 4 + %sub21 = sub nsw i32 %14, %15 + br label %cond.end22 + +cond.end22: ; preds = %cond.false20, %cond.true19 + %cond23 = phi i32 [ %13, %cond.true19 ], [ %sub21, %cond.false20 ] + store i32 %cond23, i32* %N, align 4 + %16 = load i32, i32* %j, align 4 + %17 = load i32, i32* %ny.addr, align 4 + %sub24 = sub nsw i32 %17, 1 + 
%cmp25 = icmp eq i32 %16, %sub24 + br i1 %cmp25, label %cond.true26, label %cond.false27 + +cond.true26: ; preds = %cond.end22 + %18 = load i32, i32* %c, align 4 + br label %cond.end29 + +cond.false27: ; preds = %cond.end22 + %19 = load i32, i32* %c, align 4 + %20 = load i32, i32* %nx.addr, align 4 + %add28 = add nsw i32 %19, %20 + br label %cond.end29 + +cond.end29: ; preds = %cond.false27, %cond.true26 + %cond30 = phi i32 [ %18, %cond.true26 ], [ %add28, %cond.false27 ] + store i32 %cond30, i32* %S, align 4 + %21 = load float*, float** %tIn.addr, align 8 + %22 = load i32, i32* %c, align 4 + %idxprom = sext i32 %22 to i64 + %arrayidx = getelementptr inbounds float, float* %21, i64 %idxprom + %23 = load float, float* %arrayidx, align 4 + store float %23, float* %temp2, align 4 + store float %23, float* %temp1, align 4 + %24 = load float*, float** %tIn.addr, align 8 + %25 = load i32, i32* %c, align 4 + %26 = load i32, i32* %xy, align 4 + %add31 = add nsw i32 %25, %26 + %idxprom32 = sext i32 %add31 to i64 + %arrayidx33 = getelementptr inbounds float, float* %24, i64 %idxprom32 + %27 = load float, float* %arrayidx33, align 4 + store float %27, float* %temp3, align 4 + %28 = load float, float* %cc.addr, align 4 + %29 = load float, float* %temp2, align 4 + %mul34 = fmul contract float %28, %29 + %30 = load float, float* %cw.addr, align 4 + %31 = load float*, float** %tIn.addr, align 8 + %32 = load i32, i32* %W, align 4 + %idxprom35 = sext i32 %32 to i64 + %arrayidx36 = getelementptr inbounds float, float* %31, i64 %idxprom35 + %33 = load float, float* %arrayidx36, align 4 + %mul37 = fmul contract float %30, %33 + %add38 = fadd contract float %mul34, %mul37 + %34 = load float, float* %ce.addr, align 4 + %35 = load float*, float** %tIn.addr, align 8 + %36 = load i32, i32* %E, align 4 + %idxprom39 = sext i32 %36 to i64 + %arrayidx40 = getelementptr inbounds float, float* %35, i64 %idxprom39 + %37 = load float, float* %arrayidx40, align 4 + %mul41 = fmul contract float %34, 
%37 + %add42 = fadd contract float %add38, %mul41 + %38 = load float, float* %cs.addr, align 4 + %39 = load float*, float** %tIn.addr, align 8 + %40 = load i32, i32* %S, align 4 + %idxprom43 = sext i32 %40 to i64 + %arrayidx44 = getelementptr inbounds float, float* %39, i64 %idxprom43 + %41 = load float, float* %arrayidx44, align 4 + %mul45 = fmul contract float %38, %41 + %add46 = fadd contract float %add42, %mul45 + %42 = load float, float* %cn.addr, align 4 + %43 = load float*, float** %tIn.addr, align 8 + %44 = load i32, i32* %N, align 4 + %idxprom47 = sext i32 %44 to i64 + %arrayidx48 = getelementptr inbounds float, float* %43, i64 %idxprom47 + %45 = load float, float* %arrayidx48, align 4 + %mul49 = fmul contract float %42, %45 + %add50 = fadd contract float %add46, %mul49 + %46 = load float, float* %cb.addr, align 4 + %47 = load float, float* %temp1, align 4 + %mul51 = fmul contract float %46, %47 + %add52 = fadd contract float %add50, %mul51 + %48 = load float, float* %ct.addr, align 4 + %49 = load float, float* %temp3, align 4 + %mul53 = fmul contract float %48, %49 + %add54 = fadd contract float %add52, %mul53 + %50 = load float, float* %sdc.addr, align 4 + %51 = load float*, float** %p.addr, align 8 + %52 = load i32, i32* %c, align 4 + %idxprom55 = sext i32 %52 to i64 + %arrayidx56 = getelementptr inbounds float, float* %51, i64 %idxprom55 + %53 = load float, float* %arrayidx56, align 4 + %mul57 = fmul contract float %50, %53 + %add58 = fadd contract float %add54, %mul57 + %54 = load float, float* %ct.addr, align 4 + %55 = load float, float* %amb_temp, align 4 + %mul59 = fmul contract float %54, %55 + %add60 = fadd contract float %add58, %mul59 + %56 = load float*, float** %tOut.addr, align 8 + %57 = load i32, i32* %c, align 4 + %idxprom61 = sext i32 %57 to i64 + %arrayidx62 = getelementptr inbounds float, float* %56, i64 %idxprom61 + store float %add60, float* %arrayidx62, align 4 + %58 = load i32, i32* %xy, align 4 + %59 = load i32, i32* %c, align 4 + 
%add63 = add nsw i32 %59, %58 + store i32 %add63, i32* %c, align 4 + %60 = load i32, i32* %xy, align 4 + %61 = load i32, i32* %W, align 4 + %add64 = add nsw i32 %61, %60 + store i32 %add64, i32* %W, align 4 + %62 = load i32, i32* %xy, align 4 + %63 = load i32, i32* %E, align 4 + %add65 = add nsw i32 %63, %62 + store i32 %add65, i32* %E, align 4 + %64 = load i32, i32* %xy, align 4 + %65 = load i32, i32* %N, align 4 + %add66 = add nsw i32 %65, %64 + store i32 %add66, i32* %N, align 4 + %66 = load i32, i32* %xy, align 4 + %67 = load i32, i32* %S, align 4 + %add67 = add nsw i32 %67, %66 + store i32 %add67, i32* %S, align 4 + store i32 1, i32* %k, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %cond.end29 + %68 = load i32, i32* %k, align 4 + %69 = load i32, i32* %nz.addr, align 4 + %sub68 = sub nsw i32 %69, 1 + %cmp69 = icmp slt i32 %68, %sub68 + br i1 %cmp69, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %70 = load float, float* %temp2, align 4 + store float %70, float* %temp1, align 4 + %71 = load float, float* %temp3, align 4 + store float %71, float* %temp2, align 4 + %72 = load float*, float** %tIn.addr, align 8 + %73 = load i32, i32* %c, align 4 + %74 = load i32, i32* %xy, align 4 + %add70 = add nsw i32 %73, %74 + %idxprom71 = sext i32 %add70 to i64 + %arrayidx72 = getelementptr inbounds float, float* %72, i64 %idxprom71 + %75 = load float, float* %arrayidx72, align 4 + store float %75, float* %temp3, align 4 + %76 = load float, float* %cc.addr, align 4 + %77 = load float, float* %temp2, align 4 + %mul73 = fmul contract float %76, %77 + %78 = load float, float* %cw.addr, align 4 + %79 = load float*, float** %tIn.addr, align 8 + %80 = load i32, i32* %W, align 4 + %idxprom74 = sext i32 %80 to i64 + %arrayidx75 = getelementptr inbounds float, float* %79, i64 %idxprom74 + %81 = load float, float* %arrayidx75, align 4 + %mul76 = fmul contract float %78, %81 + %add77 = fadd contract float %mul73, %mul76 + %82 = load float, float* 
%ce.addr, align 4 + %83 = load float*, float** %tIn.addr, align 8 + %84 = load i32, i32* %E, align 4 + %idxprom78 = sext i32 %84 to i64 + %arrayidx79 = getelementptr inbounds float, float* %83, i64 %idxprom78 + %85 = load float, float* %arrayidx79, align 4 + %mul80 = fmul contract float %82, %85 + %add81 = fadd contract float %add77, %mul80 + %86 = load float, float* %cs.addr, align 4 + %87 = load float*, float** %tIn.addr, align 8 + %88 = load i32, i32* %S, align 4 + %idxprom82 = sext i32 %88 to i64 + %arrayidx83 = getelementptr inbounds float, float* %87, i64 %idxprom82 + %89 = load float, float* %arrayidx83, align 4 + %mul84 = fmul contract float %86, %89 + %add85 = fadd contract float %add81, %mul84 + %90 = load float, float* %cn.addr, align 4 + %91 = load float*, float** %tIn.addr, align 8 + %92 = load i32, i32* %N, align 4 + %idxprom86 = sext i32 %92 to i64 + %arrayidx87 = getelementptr inbounds float, float* %91, i64 %idxprom86 + %93 = load float, float* %arrayidx87, align 4 + %mul88 = fmul contract float %90, %93 + %add89 = fadd contract float %add85, %mul88 + %94 = load float, float* %cb.addr, align 4 + %95 = load float, float* %temp1, align 4 + %mul90 = fmul contract float %94, %95 + %add91 = fadd contract float %add89, %mul90 + %96 = load float, float* %ct.addr, align 4 + %97 = load float, float* %temp3, align 4 + %mul92 = fmul contract float %96, %97 + %add93 = fadd contract float %add91, %mul92 + %98 = load float, float* %sdc.addr, align 4 + %99 = load float*, float** %p.addr, align 8 + %100 = load i32, i32* %c, align 4 + %idxprom94 = sext i32 %100 to i64 + %arrayidx95 = getelementptr inbounds float, float* %99, i64 %idxprom94 + %101 = load float, float* %arrayidx95, align 4 + %mul96 = fmul contract float %98, %101 + %add97 = fadd contract float %add93, %mul96 + %102 = load float, float* %ct.addr, align 4 + %103 = load float, float* %amb_temp, align 4 + %mul98 = fmul contract float %102, %103 + %add99 = fadd contract float %add97, %mul98 + %104 = load 
float*, float** %tOut.addr, align 8 + %105 = load i32, i32* %c, align 4 + %idxprom100 = sext i32 %105 to i64 + %arrayidx101 = getelementptr inbounds float, float* %104, i64 %idxprom100 + store float %add99, float* %arrayidx101, align 4 + %106 = load i32, i32* %xy, align 4 + %107 = load i32, i32* %c, align 4 + %add102 = add nsw i32 %107, %106 + store i32 %add102, i32* %c, align 4 + %108 = load i32, i32* %xy, align 4 + %109 = load i32, i32* %W, align 4 + %add103 = add nsw i32 %109, %108 + store i32 %add103, i32* %W, align 4 + %110 = load i32, i32* %xy, align 4 + %111 = load i32, i32* %E, align 4 + %add104 = add nsw i32 %111, %110 + store i32 %add104, i32* %E, align 4 + %112 = load i32, i32* %xy, align 4 + %113 = load i32, i32* %N, align 4 + %add105 = add nsw i32 %113, %112 + store i32 %add105, i32* %N, align 4 + %114 = load i32, i32* %xy, align 4 + %115 = load i32, i32* %S, align 4 + %add106 = add nsw i32 %115, %114 + store i32 %add106, i32* %S, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %116 = load i32, i32* %k, align 4 + %inc = add nsw i32 %116, 1 + store i32 %inc, i32* %k, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %117 = load float, float* %temp2, align 4 + store float %117, float* %temp1, align 4 + %118 = load float, float* %temp3, align 4 + store float %118, float* %temp2, align 4 + %119 = load float, float* %cc.addr, align 4 + %120 = load float, float* %temp2, align 4 + %mul107 = fmul contract float %119, %120 + %121 = load float, float* %cw.addr, align 4 + %122 = load float*, float** %tIn.addr, align 8 + %123 = load i32, i32* %W, align 4 + %idxprom108 = sext i32 %123 to i64 + %arrayidx109 = getelementptr inbounds float, float* %122, i64 %idxprom108 + %124 = load float, float* %arrayidx109, align 4 + %mul110 = fmul contract float %121, %124 + %add111 = fadd contract float %mul107, %mul110 + %125 = load float, float* %ce.addr, align 4 + %126 = load float*, float** %tIn.addr, align 8 + %127 = load i32, i32* %E, align 4 + 
%idxprom112 = sext i32 %127 to i64 + %arrayidx113 = getelementptr inbounds float, float* %126, i64 %idxprom112 + %128 = load float, float* %arrayidx113, align 4 + %mul114 = fmul contract float %125, %128 + %add115 = fadd contract float %add111, %mul114 + %129 = load float, float* %cs.addr, align 4 + %130 = load float*, float** %tIn.addr, align 8 + %131 = load i32, i32* %S, align 4 + %idxprom116 = sext i32 %131 to i64 + %arrayidx117 = getelementptr inbounds float, float* %130, i64 %idxprom116 + %132 = load float, float* %arrayidx117, align 4 + %mul118 = fmul contract float %129, %132 + %add119 = fadd contract float %add115, %mul118 + %133 = load float, float* %cn.addr, align 4 + %134 = load float*, float** %tIn.addr, align 8 + %135 = load i32, i32* %N, align 4 + %idxprom120 = sext i32 %135 to i64 + %arrayidx121 = getelementptr inbounds float, float* %134, i64 %idxprom120 + %136 = load float, float* %arrayidx121, align 4 + %mul122 = fmul contract float %133, %136 + %add123 = fadd contract float %add119, %mul122 + %137 = load float, float* %cb.addr, align 4 + %138 = load float, float* %temp1, align 4 + %mul124 = fmul contract float %137, %138 + %add125 = fadd contract float %add123, %mul124 + %139 = load float, float* %ct.addr, align 4 + %140 = load float, float* %temp3, align 4 + %mul126 = fmul contract float %139, %140 + %add127 = fadd contract float %add125, %mul126 + %141 = load float, float* %sdc.addr, align 4 + %142 = load float*, float** %p.addr, align 8 + %143 = load i32, i32* %c, align 4 + %idxprom128 = sext i32 %143 to i64 + %arrayidx129 = getelementptr inbounds float, float* %142, i64 %idxprom128 + %144 = load float, float* %arrayidx129, align 4 + %mul130 = fmul contract float %141, %144 + %add131 = fadd contract float %add127, %mul130 + %145 = load float, float* %ct.addr, align 4 + %146 = load float, float* %amb_temp, align 4 + %mul132 = fmul contract float %145, %146 + %add133 = fadd contract float %add131, %mul132 + %147 = load float*, float** 
%tOut.addr, align 8 + %148 = load i32, i32* %c, align 4 + %idxprom134 = sext i32 %148 to i64 + %arrayidx135 = getelementptr inbounds float, float* %147, i64 %idxprom134 + store float %add133, float* %arrayidx135, align 4 + ret void +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + ret i32 %0 +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() + ret i32 %0 +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + ret i32 %0 +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y() + ret i32 %0 +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() + ret i32 %0 +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y() + ret i32 %0 +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 + +; Function Attrs: nounwind readnone +declare i32 
@llvm.nvvm.read.ptx.sreg.ntid.y() #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #2 + +attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind readnone } +attributes #3 = { convergent nounwind } + +!llvm.module.flags = !{!0, !1, !2} +!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6} +!llvm.ident = !{!8} +!nvvmir.version = !{!9} + +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0} +!3 = !{void (float*, float*, float*, float, i32, i32, i32, float, float, float, float, float, float, float)* @_Z11hotspotOpt1PfS_S_fiiifffffff, !"kernel", i32 1} +!4 = !{null, !"align", i32 8} +!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!6 = !{null, !"align", i32 16} +!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!8 = !{!"clang version 10.0.1 
(https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"} +!9 = !{i32 1, i32 4} diff --git a/examples/hotspot3D/3D-host-x86_64-unknown-linux-gnu.ll b/examples/hotspot3D/3D-host-x86_64-unknown-linux-gnu.ll new file mode 100644 index 0000000..dba0e85 --- /dev/null +++ b/examples/hotspot3D/3D-host-x86_64-unknown-linux-gnu.ll @@ -0,0 +1,1507 @@ +; ModuleID = '3D-host-x86_64-unknown-linux-gnu.bc' +source_filename = "3D.cu" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] } +%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 } +%struct.timeval = type { i64, i64 } +%struct.timezone = type { i32, i32 } +%struct.dim3 = type { i32, i32, i32 } +%struct.CUstream_st = type opaque + +$_ZN4dim3C2Ejjj = comdat any + +$_ZSt4sqrtf = comdat any + +@.str = private unnamed_addr constant [16 x i8] c"Time: %.3f (s)\0A\00", align 1 +@t_chip = dso_local global float 0x3F40624DE0000000, align 4 +@chip_height = dso_local global float 0x3F90624DE0000000, align 4 +@chip_width = dso_local global float 0x3F90624DE0000000, align 4 +@amb_temp = dso_local global float 8.000000e+01, align 4 +@stderr = external dso_local global %struct._IO_FILE*, align 8 +@.str.1 = private unnamed_addr constant [11 x i8] c"Error: %s\0A\00", align 1 +@.str.2 = private unnamed_addr constant [2 x i8] c"r\00", align 1 +@.str.3 = private unnamed_addr constant [24 x i8] c"The file was not opened\00", align 1 +@.str.4 = private unnamed_addr constant [20 x i8] c"Error reading file\0A\00", align 1 +@.str.5 = private unnamed_addr constant [25 x i8] c"not enough lines in file\00", align 1 +@.str.6 = private unnamed_addr constant [3 x i8] c"%f\00", align 1 +@.str.7 = private 
unnamed_addr constant [20 x i8] c"invalid file format\00", align 1 +@.str.8 = private unnamed_addr constant [2 x i8] c"w\00", align 1 +@.str.9 = private unnamed_addr constant [25 x i8] c"The file was not opened\0A\00", align 1 +@.str.10 = private unnamed_addr constant [7 x i8] c"%d\09%g\0A\00", align 1 +@.str.11 = private unnamed_addr constant [81 x i8] c"Usage: %s \0A\00", align 1 +@.str.12 = private unnamed_addr constant [68 x i8] c"\09 - number of rows/cols in the grid (positive integer)\0A\00", align 1 +@.str.13 = private unnamed_addr constant [62 x i8] c"\09 - number of layers in the grid (positive integer)\0A\00", align 1 +@.str.14 = private unnamed_addr constant [37 x i8] c"\09 - number of iterations\0A\00", align 1 +@.str.15 = private unnamed_addr constant [83 x i8] c"\09 - name of the file containing the initial power values of each cell\0A\00", align 1 +@.str.16 = private unnamed_addr constant [88 x i8] c"\09 - name of the file containing the initial temperature values of each cell\0A\00", align 1 +@.str.17 = private unnamed_addr constant [28 x i8] c"\09!\00\D3rd<3>;\0A\0Amov.u2\00\1B,e\00b;\0Acvta\8D\00\04%\00\13,\\\00\22ld\C8\00\02\18\00nrd2, [\CE\00\1E])\00\1F1)\00\01a0];\0Ast#\00\81[%SP+0],,\00\0A\16\00\128\16\00\222;\B6\00\01\D8\00a1, 
999(\00\02g\00\00\EF\00\18[\9E\01\03M\00\AF1;\0Aret;\0A\0A}\D6\01\1A\FE\02FuncGetAttributes\E1\01\0D#\00\0E\EC\01\0F+\00\06\0F\F7\01\1B\1F1\F7\01Q\1F1\F7\01!\0E\D9\00\0F\02\02\0F\0E8\01\0F\0D\02\8DhDevice\B4\00\0E\0E\02\0E$\00\0F\0F\02\00/32,\00\0B\1F1,\00\18\1F2<\02\13\1F2<\02\1F\1D4<\02\1F2<\02\0C\1F2<\02\13\01_\00\04;\02\0F\D9\00\07\1D]4\00\1F14\00\06\0Fp\02\10\0E\9A\01\0Fq\02\12(32q\02\0B\15\00!12\16\00\09\86\02\1F3\86\02\15\1F3\86\02#2Get\CB\00\0E}\02\05\1B\00\04\DA\00\0F\1C\02\13?3[8W\04.\0F\1B\02\0D\1F3W\04\19\04\B3\01\0D\D0\00\0F\AA\01\06\0F\05\04W\F0\04OccupancyMaxActiveBV\08\FE\03sPerMultiprocessor\9F\01\0F;\00\16\0EB\06\0FC\00%\0EJ\04\0FC\00\1E\1F2\86\00/\1F3\88\02\13O4[32\89\02\1C\1D3\89\02\1F4\89\02\0C\1F4\89\02\19\133\89\02\0F\F1\00\1E\0F\BC\04\00\0FK\00$/2]w\07\00\0FL\00$\0F\1F\05\01\0F\98\00%\0F\A7\07\1D\097\05\186M\05\04,\00\2224-\00\183\CF\03\1F2\CF\03\15\1F2\CF\03L\9FWithFlags\D8\03(\05D\00\0E\E1\03\0FL\00'\0F\EA\030\0CL\00\1F2\98\008\1F3\98\008\1F4H\04\13O5[40\EC\08.\0FH\04\0D\1F5\EC\08\1C\0F\F9\00+\1F]\9C\040\0D\9A\01\0F\A5\040\0D:\02\0F\AE\041\0D\DB\02\0F\B7\041\0D|\03\0F\C0\04I\08-\00\1F3$\0A6\F2\1Avisible .entry _Z11hotspotOpt1PfS_S_fiiif\01\00\06\A9\04\00\A3\00\0F.\00\0D\0E\93\04\0F6\00\18\1F16\00\22\07g\04?f326\00\15\07Q\04\1Fu6\00\17\1F46\00\22\1F56\00\22\1F6\D8\00\22\1F76\00\22\1F86\00\22\1F96\00\22/107\00#\1F17\00#\0F\1F\02#\1F1m\0A\14O6[12&\06\16\95pred %p<6'\06\00\92\00k%f<94>J\06-95K\06/79L\06\0C\1F6L\06\12\02s\00O8, 
[\0B\01\16\1D]?\00\1F7?\00\18\1E2?\00\1F6?\00\18\1E1?\00\1F5?\00\18\1E0?\00\1F4?\00\17\1E9>\00\1F3>\00\17\1E8>\00\1F2>\00\17\1F7Y\07\00\0F\FA\00\18\1F6?\00\00\0F\FA\00\18\1F5?\00\00\0F\FB\00\18\1E4\FB\00\1F1>\00\17\1F3\A9\08\00\0F}\00\18\0F?\08\01\0Fy\01\18\0F)\08\01\0F\BD\00\18#0]\88\03#to\1D\18\04E\00\144\D9\07\01\1F\00\0A\1C\00\115\1C\00\1F4;\00\05\146A\08\0F;\00\00\117\1C\00\1F6;\00\05\148\92\08\0F;\00\00\119\1C\00\1F8\C4\08\02\1F9\C4\08\02\1A7\16\00\03\C4\08!d5\17\00\00\E9\01\07\C5\08\1Bf\E8\12\132\FE\12\0F\DB\08\03+14\17\00\03\1F\09\0B[\00\114\9F\00+f2\16\00\02q\00\1B3\16\00\01q\00*f4\16\00\115p\00\1BfX\00\115o\00*f6\16\00\126n\00\1B7\16\00\02n\00\1982\12\DA6, 1117782016\CB\00\136\F9\00\1961\00\00\B8\01intid.x\17\00\00\B0\013cta\18\00rul.lo.s\1B\00$9,8\00)r1}\00S20, %K\003add0\00521,6\00*20\93\00\137P\0A\09\F6\12$22\93\00\19y\17\00\153\93\00\1By\93\00424,8\00*r2s\0F\135\93\00\19y\93\00&6,6\00\1B5\93\00\02\F1\01!26\7F\03\03F\00%7,\B0\00\08\17\00%8,4\00\08\17\00%9,j\02\0B\A9\00330,8\00\00'\00\08\93\00331,i\00\00&\00\0B\AF\15\03@\0B\171e\00(32e\00\06\17\00%3,\CF\02\0C|\00#4,8\00\00'\00\0Bb\00\02\EF\0B(34K\00\185\F5\00rsetp.neL\003p1,!\00\F2\0C0;\0A@%p1 bra 
LBB6_2;\0Abra.uni\10\0021;\0A\08\00\16:Z\00%1,\D8\00\08\AC\01\129)\02\0BA\00\133A\00\172A\00(36B\00\07\D8\01\13,\1D\00.-1Y\00\1F2Y\00\04*3:\CD\0C\000\00\0B\1E\01\148\C0\0C\06\1D\01\0F\12\02\03(38\96\01\07\DE\01#9,\1E\00,-1L\01#2,P\00\00'\00\01O\01\162O\01\1B5\B5\00\134\B5\00\174\0E\01\1F4O\01\04\01Y\00\1B4A\00\136A\00\185A\00\1F0O\01\03#5,\1D\00\1E1X\00\1F5X\00\04\186N\01\01g\03\1B9q\0E\129\F4\03\08`\03(41I\03\0Ak\02#3,!\00\02k\02\163\1C\01\1B8\83\00\137\83\00\177\DB\00\1F7\1C\01\04\01V\00\1B7A\00\139A\00\188\1C\01\182B\00\06\17\00\183\E9\013sub3\01#8,4\00\00#\00\0Er\00\1F8r\00\04\1896\01\01,\02\1B9\A2\03\129\97\04\189\84\00\1F4\7F\04\02(45\03\04\06\CE\01346,\1E\00\0E\84\02#4,P\00\00'\00\01h\01\164h\01\1C1\93\03$10\D5\03\180\D6\03\1F0l\01\04\01W\00,10E\00\142E\00\08\1B\04/47o\01\03\0FX\03\04311,5\00\00$\00\0Fv\00\00\1F1v\00\06\182s\01\111\B2\01\1B9\FC\07#10^\19\122\8B\00\03\85\09\05\E8\00\02\A2\00\02\8A\00)d1\D6\044shl3\0D\02\\\00\01 \00\132\BC\00\03\19\00$3,P\00\01'\00\01e\00\02\A6\0A\01B\06\00\22\00\1A]\F3\07\2210L\08\1C9\17\00\144\17\00\07\A8\00\1A4\A8\00\03J\01\1F9a\01\02\155\D5\00\1A4\03\04#1,5\00\00$\00\01\BE\09\00\BF\00\05\F5\00\01#\04*51\F1\00$6,\1C\00\0B\F1\00$7,\99\00\01'\00\07\F1\00#10\F2\00\1C7\F2\00\02\97\1C8f10.\00%1,\03\09\08\17\00&2,'\01\08\18\00%3,\A0\09\09\22\01\1F8\CA\01\03\05#\01\1A8\CA\01\01\E0\08\02 \00\0A\D9\00\02\E3\08\22d1\E5\0A\192\AB\00\124\D9\00#21\90\07$rn\1A\00#5,\9C\00\00&\00W;\0Afma\1D\00#6,\E8\00\02\D7\00\00/\00\08V\00%7,\88\0A\08\BB\00\00\03\09\04e\05\0A\BB\00$3, \00\0B\BB\00\194\BB\00\183e\00\138\BB\00,4]\9E\00#9,\85\00\02&\00)%f\CD\01\00(\01\04\E4\0A\09\9E\00&5,Y\03\0A\9F\00\03Z\0C\1D5\9F\00\197\9F\00\09f\00\131\9F\00\1C7\9F\00\01\09\0A\01\86\00\02&\00\00\B1\00\089\00\05\0F\02\0A\F8\01\05\CF\09+96\9E\00$9, 
\00\0A\9E\00)30\9E\00\09e\00\03\F8\01-30\9E\00#5,\85\00\02&\00\00\B0\00\089\00%6,\F5\0B\08\17\00&7,\EC\03\0CR\00#8,9\00\02(\00\00d\00\08;\00%9,]\0C\07\17\00\00\BD\00\05c\03\0BR\00\01\8E\0A\019\00\02(\00\00d\00\08;\00\05}\0A\0AQ\03\143\98\03\0A\96\02/32\1B\05\04\00\AE\0A\03 \00\0BY\01$4,P\00\01'\00\08|\00\133Y\01\0D\07\01\015\00\01\9C\00\02&\00\00\C7\00\089\00%5,\08\0D\0C:\00\176\EF\00\01\9F\01)34\D8\00\05\B9\02*6]\A8\00$6, \00\03\A8\00\05\F9\0D\22rd;\0E(f3\EA\08(52h\05\06\7F\05\1F3\82\09\04#4,\1E\00\00;\00\0F\16\0C\02\185\B4\0B/55_\00\03\186\D2\04\07_\00#7,\1E\00\00;\00\0F\F5\0A\02(57H\00\1F8_\00\03\189v\04\06r\08360,\1E\00\00;\00\0F\06\0A\03\170H\00/61_\00\02(62\98\03\07_\00#3,\1E\00\00;\00\0F/\09\02\186\B4\0B/64_\00\03\0A\96\04\07`\00#6,\1F\00\00<\00\0F\1B\08\03\196M\0FL67, V#\03\9C\10\1C6m\0A$13\8A\08\173\00\09668,8\00\08\A5\00\04c\01\1A3\04\01370,\1E\00\05\D5\09\14g\D5\09#5,Q\00\00'\00\01\D5\09\175\D5\09\1C6\90\00\04[\0C614:5\03)65S\07\0Fu\08\01\186l\04)66U\04\0F\BC\08\01(66[\03/56\A6\08\01/71\A6\08\02/72\A6\08\03373,5\00\00$\00\0D\A6\08\02\E7\02)73|\04\00\CC\02\03\1C\00\0A|\04\02\B0\02\12d\16\038d58\DE\00\127|\04/59\A6\08\04(67.\00\1F8\A6\08\02/69\A6\08\03/70\A6\08\03\1F6p\0A\03/61\A6\08\04\02\11\03-d6p\0A\020\03\12d\96\03(d6u\06\137L\07,63\A6\08\00s\01\02\9C\00\00&\00\0B8\05\01x\01\01\E8\00\02\D7\00\00/\00\08V\00\1F4\A6\08\03/64\A6\08\04\02l\03\01 \00\0B\BB\00\196\BB\00\08r\02#75\BB\00,6]\9E\00#6,\85\00\02&\00\00\B0\00\089\00\1F7\A6\08\03/67\A6\08\05\02L\03-d63\02)69\9F\00\083\02\137E\09-69\9F\00#9,\86\00\02&\00\00\B1\00\079\00/80\A6\08\03/70\A6\08\04\02\FF\02\01 \00\0A\9E\00(72\9E\00(71e\00\03\F8\01\1D7\02\08382,\85\00\02&\00\00\B0\00\089\00\1F3\A6\08\02/84\A6\08\07385,9\00\02(\00\00d\00\08;\00\1F6\A6\08\02/87\A6\08\07388,9\00\02(\00\00d\00\08;\00\1F9\A6\08\03/73\A6\08\02/74\A6\08\04\01<\02\02 \00\0BY\01$6,P\00\01'\00\08\C1\0D\03\D0\0C\1D7\96\02\01\8D\12\01\9C\00\02&\00\00\C7\00\089\00\1F2\A6\08\06&93\EF\00\01\85\119f91\D8\00\1F7\A6\08\04\01^\02\02 
\00\03\A8\00\07\A6\08\137R\0E\08*\07\1F7*\07\03/75\A6\08\03\02\08\01\02'\01/74\A6\08\02(76H\00\1F7_\00\03\1F8\A6\08\03\01)\03\12r\C5\00/77\A6\08\02\187\18\11/80_\00\02/81\A6\08\03\01\EA\02\01\1E\00\00;\00\0F\A6\08\02(82H\00\1F3_\00\03\1F4\A6\08\03\01\F7\02\01\1E\00\00;\00\0F\A6\08\02(85H\00\1F6_\00\03\1F7\A6\08\04\01\05\03\01\1F\00\00<\00\0F\A6\08\03,88\ED\07\145\ED\07\08\08\14)89}\08\06\C4\00\01\A8\02\01\1F\00\0F\DE\08\04/90\DE\08\06\176N\08/37N\08\0E\183B\07/38N\08\0E\193\98\0C\1F9r\07\02/40r\07\03/41r\07\03/37r\07\02/38r\07\04\02\DE\15-d3 \0E\00y\00\03P\00\01'\00\08|\00\122!\04,40r\07\00[\14\02\9C\00\00&\00\0B\04\04\01\A4\13\01\E8\00\02\D7\00\00/\00\08V\00\1F5r\07\03/41r\07\04\02\AB\14-d4-\08(43\BB\00(42e\00\136\BB\00,3]\9E\00#7,\85\00\02&\00\00\B0\00\089\00\1F8r\07\03/44r\07\05\01K\00\02!\00\0B\9F\00\196\9F\00\08{\05#49\9F\00\0D{\05\00\E5\12\02\86\00\02&\00\00\B1\00\079\00/51r\07\03/47r\07\04\01J\00\02 \00\0B\9E\00\199\9E\00\08\A3\02\145\F8\01\0D\10\08\01\E6\0D\01\85\00\02&\00\00\B0\00\089\00\1F4r\07\02/55r\07\07\01\C3\0A\019\00\02(\00\00d\00\08;\00\1F7r\07\02/58r\07\07\01\1C\0B\019\00\02(\00\00d\00\08\DB\0A\1F0r\07\03/50r\07\02\1F53\15\05\01\FF\00\02 \00\0B\9C\0B$3,P\00\01'\00\08|\00\03\CB\08\1D5\96\02\01\11\0B\01\9C\00\02&\00\00\C7\00\089\00\1F3r\07\06&64\EF\00\126\EC\0A\09\0B\16/54r\07\04\01n\01\02 \00\03\A8\00\07r\07\2255,\0C\B04;\0Aret;\0A\0A}\0A\00\00\00\00\00\00\00", section ".nv_fatbin", align 8 +@__cuda_fatbin_wrapper = internal constant { i32, i32, i8*, i8* } { i32 1180844977, i32 1, i8* getelementptr inbounds ([27433 x i8], [27433 x i8]* @1, i64 0, i64 0), i8* null }, section ".nvFatBinSegment", align 8 +@__cuda_gpubin_handle = internal global i8** null, align 8 +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* bitcast (void (i8*)* @__cuda_module_ctor to void ()*), i8* null }] + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local i64 @_Z8get_timev() #0 { +entry: + %tv 
= alloca %struct.timeval, align 8 + %call = call i32 @gettimeofday(%struct.timeval* %tv, %struct.timezone* null) #8 + %tv_sec = getelementptr inbounds %struct.timeval, %struct.timeval* %tv, i32 0, i32 0 + %0 = load i64, i64* %tv_sec, align 8 + %mul = mul nsw i64 %0, 1000000 + %tv_usec = getelementptr inbounds %struct.timeval, %struct.timeval* %tv, i32 0, i32 1 + %1 = load i64, i64* %tv_usec, align 8 + %add = add nsw i64 %mul, %1 + ret i64 %add +} + +; Function Attrs: nounwind +declare dso_local i32 @gettimeofday(%struct.timeval*, %struct.timezone*) #1 + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z11hotspotOpt1PfS_S_fiiifffffff(float* %p, float* %tIn, float* %tOut, float %sdc, i32 %nx, i32 %ny, i32 %nz, float %ce, float %cw, float %cn, float %cs, float %ct, float %cb, float %cc) #2 { +entry: + %p.addr = alloca float*, align 8 + %tIn.addr = alloca float*, align 8 + %tOut.addr = alloca float*, align 8 + %sdc.addr = alloca float, align 4 + %nx.addr = alloca i32, align 4 + %ny.addr = alloca i32, align 4 + %nz.addr = alloca i32, align 4 + %ce.addr = alloca float, align 4 + %cw.addr = alloca float, align 4 + %cn.addr = alloca float, align 4 + %cs.addr = alloca float, align 4 + %ct.addr = alloca float, align 4 + %cb.addr = alloca float, align 4 + %cc.addr = alloca float, align 4 + %grid_dim = alloca %struct.dim3, align 8 + %block_dim = alloca %struct.dim3, align 8 + %shmem_size = alloca i64, align 8 + %stream = alloca i8*, align 8 + %grid_dim.coerce = alloca { i64, i32 }, align 8 + %block_dim.coerce = alloca { i64, i32 }, align 8 + store float* %p, float** %p.addr, align 8 + store float* %tIn, float** %tIn.addr, align 8 + store float* %tOut, float** %tOut.addr, align 8 + store float %sdc, float* %sdc.addr, align 4 + store i32 %nx, i32* %nx.addr, align 4 + store i32 %ny, i32* %ny.addr, align 4 + store i32 %nz, i32* %nz.addr, align 4 + store float %ce, float* %ce.addr, align 4 + store float %cw, float* %cw.addr, align 4 + store float %cn, float* 
%cn.addr, align 4 + store float %cs, float* %cs.addr, align 4 + store float %ct, float* %ct.addr, align 4 + store float %cb, float* %cb.addr, align 4 + store float %cc, float* %cc.addr, align 4 + %kernel_args = alloca i8*, i64 14, align 16 + %0 = bitcast float** %p.addr to i8* + %1 = getelementptr i8*, i8** %kernel_args, i32 0 + store i8* %0, i8** %1 + %2 = bitcast float** %tIn.addr to i8* + %3 = getelementptr i8*, i8** %kernel_args, i32 1 + store i8* %2, i8** %3 + %4 = bitcast float** %tOut.addr to i8* + %5 = getelementptr i8*, i8** %kernel_args, i32 2 + store i8* %4, i8** %5 + %6 = bitcast float* %sdc.addr to i8* + %7 = getelementptr i8*, i8** %kernel_args, i32 3 + store i8* %6, i8** %7 + %8 = bitcast i32* %nx.addr to i8* + %9 = getelementptr i8*, i8** %kernel_args, i32 4 + store i8* %8, i8** %9 + %10 = bitcast i32* %ny.addr to i8* + %11 = getelementptr i8*, i8** %kernel_args, i32 5 + store i8* %10, i8** %11 + %12 = bitcast i32* %nz.addr to i8* + %13 = getelementptr i8*, i8** %kernel_args, i32 6 + store i8* %12, i8** %13 + %14 = bitcast float* %ce.addr to i8* + %15 = getelementptr i8*, i8** %kernel_args, i32 7 + store i8* %14, i8** %15 + %16 = bitcast float* %cw.addr to i8* + %17 = getelementptr i8*, i8** %kernel_args, i32 8 + store i8* %16, i8** %17 + %18 = bitcast float* %cn.addr to i8* + %19 = getelementptr i8*, i8** %kernel_args, i32 9 + store i8* %18, i8** %19 + %20 = bitcast float* %cs.addr to i8* + %21 = getelementptr i8*, i8** %kernel_args, i32 10 + store i8* %20, i8** %21 + %22 = bitcast float* %ct.addr to i8* + %23 = getelementptr i8*, i8** %kernel_args, i32 11 + store i8* %22, i8** %23 + %24 = bitcast float* %cb.addr to i8* + %25 = getelementptr i8*, i8** %kernel_args, i32 12 + store i8* %24, i8** %25 + %26 = bitcast float* %cc.addr to i8* + %27 = getelementptr i8*, i8** %kernel_args, i32 13 + store i8* %26, i8** %27 + %28 = call i32 @__cudaPopCallConfiguration(%struct.dim3* %grid_dim, %struct.dim3* %block_dim, i64* %shmem_size, i8** %stream) + %29 = 
load i64, i64* %shmem_size, align 8 + %30 = load i8*, i8** %stream, align 8 + %31 = bitcast { i64, i32 }* %grid_dim.coerce to i8* + %32 = bitcast %struct.dim3* %grid_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %31, i8* align 8 %32, i64 12, i1 false) + %33 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 0 + %34 = load i64, i64* %33, align 8 + %35 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 1 + %36 = load i32, i32* %35, align 8 + %37 = bitcast { i64, i32 }* %block_dim.coerce to i8* + %38 = bitcast %struct.dim3* %block_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %37, i8* align 8 %38, i64 12, i1 false) + %39 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 0 + %40 = load i64, i64* %39, align 8 + %41 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 1 + %42 = load i32, i32* %41, align 8 + %43 = bitcast i8* %30 to %struct.CUstream_st* + %call = call i32 @cudaLaunchKernel(i8* bitcast (void (float*, float*, float*, float, i32, i32, i32, float, float, float, float, float, float, float)* @_Z11hotspotOpt1PfS_S_fiiifffffff to i8*), i64 %34, i32 %36, i64 %40, i32 %42, i8** %kernel_args, i64 %29, %struct.CUstream_st* %43) + br label %setup.end + +setup.end: ; preds = %entry + ret void +} + +declare dso_local i32 @__cudaPopCallConfiguration(%struct.dim3*, %struct.dim3*, i64*, i8**) + +declare dso_local i32 @cudaLaunchKernel(i8*, i64, i32, i64, i32, i8**, i64, %struct.CUstream_st*) + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #3 + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z12hotspot_opt1PfS_S_iiifffffi(float* %p, float* %tIn, float* %tOut, i32 %nx, i32 %ny, i32 %nz, float %Cap, float %Rx, float %Ry, float %Rz, float %dt, i32 %numiter) #2 { +entry: + 
%p.addr = alloca float*, align 8 + %tIn.addr = alloca float*, align 8 + %tOut.addr = alloca float*, align 8 + %nx.addr = alloca i32, align 4 + %ny.addr = alloca i32, align 4 + %nz.addr = alloca i32, align 4 + %Cap.addr = alloca float, align 4 + %Rx.addr = alloca float, align 4 + %Ry.addr = alloca float, align 4 + %Rz.addr = alloca float, align 4 + %dt.addr = alloca float, align 4 + %numiter.addr = alloca i32, align 4 + %ce = alloca float, align 4 + %cw = alloca float, align 4 + %cn = alloca float, align 4 + %cs = alloca float, align 4 + %ct = alloca float, align 4 + %cb = alloca float, align 4 + %cc = alloca float, align 4 + %stepDivCap = alloca float, align 4 + %s = alloca i64, align 8 + %tIn_d = alloca float*, align 8 + %tOut_d = alloca float*, align 8 + %p_d = alloca float*, align 8 + %block_dim = alloca %struct.dim3, align 4 + %grid_dim = alloca %struct.dim3, align 4 + %start = alloca i64, align 8 + %i = alloca i32, align 4 + %agg.tmp = alloca %struct.dim3, align 4 + %agg.tmp23 = alloca %struct.dim3, align 4 + %agg.tmp.coerce = alloca { i64, i32 }, align 4 + %agg.tmp23.coerce = alloca { i64, i32 }, align 4 + %t = alloca float*, align 8 + %stop = alloca i64, align 8 + %time = alloca float, align 4 + store float* %p, float** %p.addr, align 8 + store float* %tIn, float** %tIn.addr, align 8 + store float* %tOut, float** %tOut.addr, align 8 + store i32 %nx, i32* %nx.addr, align 4 + store i32 %ny, i32* %ny.addr, align 4 + store i32 %nz, i32* %nz.addr, align 4 + store float %Cap, float* %Cap.addr, align 4 + store float %Rx, float* %Rx.addr, align 4 + store float %Ry, float* %Ry.addr, align 4 + store float %Rz, float* %Rz.addr, align 4 + store float %dt, float* %dt.addr, align 4 + store i32 %numiter, i32* %numiter.addr, align 4 + %0 = load float, float* %dt.addr, align 4 + %1 = load float, float* %Cap.addr, align 4 + %div = fdiv float %0, %1 + store float %div, float* %stepDivCap, align 4 + %2 = load float, float* %stepDivCap, align 4 + %3 = load float, float* 
%Rx.addr, align 4 + %div1 = fdiv float %2, %3 + store float %div1, float* %cw, align 4 + store float %div1, float* %ce, align 4 + %4 = load float, float* %stepDivCap, align 4 + %5 = load float, float* %Ry.addr, align 4 + %div2 = fdiv float %4, %5 + store float %div2, float* %cs, align 4 + store float %div2, float* %cn, align 4 + %6 = load float, float* %stepDivCap, align 4 + %7 = load float, float* %Rz.addr, align 4 + %div3 = fdiv float %6, %7 + store float %div3, float* %cb, align 4 + store float %div3, float* %ct, align 4 + %8 = load float, float* %ce, align 4 + %conv = fpext float %8 to double + %mul = fmul contract double 2.000000e+00, %conv + %9 = load float, float* %cn, align 4 + %conv4 = fpext float %9 to double + %mul5 = fmul contract double 2.000000e+00, %conv4 + %add = fadd contract double %mul, %mul5 + %10 = load float, float* %ct, align 4 + %conv6 = fpext float %10 to double + %mul7 = fmul contract double 3.000000e+00, %conv6 + %add8 = fadd contract double %add, %mul7 + %sub = fsub contract double 1.000000e+00, %add8 + %conv9 = fptrunc double %sub to float + store float %conv9, float* %cc, align 4 + %11 = load i32, i32* %nx.addr, align 4 + %conv10 = sext i32 %11 to i64 + %mul11 = mul i64 4, %conv10 + %12 = load i32, i32* %ny.addr, align 4 + %conv12 = sext i32 %12 to i64 + %mul13 = mul i64 %mul11, %conv12 + %13 = load i32, i32* %nz.addr, align 4 + %conv14 = sext i32 %13 to i64 + %mul15 = mul i64 %mul13, %conv14 + store i64 %mul15, i64* %s, align 8 + %14 = bitcast float** %p_d to i8** + %15 = load i64, i64* %s, align 8 + %call = call i32 @cudaMalloc(i8** %14, i64 %15) + %16 = bitcast float** %tIn_d to i8** + %17 = load i64, i64* %s, align 8 + %call16 = call i32 @cudaMalloc(i8** %16, i64 %17) + %18 = bitcast float** %tOut_d to i8** + %19 = load i64, i64* %s, align 8 + %call17 = call i32 @cudaMalloc(i8** %18, i64 %19) + %20 = load float*, float** %tIn_d, align 8 + %21 = bitcast float* %20 to i8* + %22 = load float*, float** %tIn.addr, align 8 + %23 = 
bitcast float* %22 to i8* + %24 = load i64, i64* %s, align 8 + %call18 = call i32 @cudaMemcpy(i8* %21, i8* %23, i64 %24, i32 1) + %25 = load float*, float** %p_d, align 8 + %26 = bitcast float* %25 to i8* + %27 = load float*, float** %p.addr, align 8 + %28 = bitcast float* %27 to i8* + %29 = load i64, i64* %s, align 8 + %call19 = call i32 @cudaMemcpy(i8* %26, i8* %28, i64 %29, i32 1) + call void @_ZN4dim3C2Ejjj(%struct.dim3* %block_dim, i32 64, i32 4, i32 1) + %30 = load i32, i32* %nx.addr, align 4 + %div20 = sdiv i32 %30, 64 + %31 = load i32, i32* %ny.addr, align 4 + %div21 = sdiv i32 %31, 4 + call void @_ZN4dim3C2Ejjj(%struct.dim3* %grid_dim, i32 %div20, i32 %div21, i32 1) + %call22 = call i64 @_Z8get_timev() + store i64 %call22, i64* %start, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %32 = load i32, i32* %i, align 4 + %33 = load i32, i32* %numiter.addr, align 4 + %cmp = icmp slt i32 %32, %33 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %34 = bitcast %struct.dim3* %agg.tmp to i8* + %35 = bitcast %struct.dim3* %grid_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %34, i8* align 4 %35, i64 12, i1 false) + %36 = bitcast %struct.dim3* %agg.tmp23 to i8* + %37 = bitcast %struct.dim3* %block_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %36, i8* align 4 %37, i64 12, i1 false) + %38 = bitcast { i64, i32 }* %agg.tmp.coerce to i8* + %39 = bitcast %struct.dim3* %agg.tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %38, i8* align 4 %39, i64 12, i1 false) + %40 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 0 + %41 = load i64, i64* %40, align 4 + %42 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 1 + %43 = load i32, i32* %42, align 4 + %44 = bitcast { i64, i32 }* %agg.tmp23.coerce to i8* + %45 = bitcast %struct.dim3* %agg.tmp23 to i8* + call void 
@llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %44, i8* align 4 %45, i64 12, i1 false) + %46 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp23.coerce, i32 0, i32 0 + %47 = load i64, i64* %46, align 4 + %48 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp23.coerce, i32 0, i32 1 + %49 = load i32, i32* %48, align 4 + %call24 = call i32 @__cudaPushCallConfiguration(i64 %41, i32 %43, i64 %47, i32 %49, i64 0, i8* null) + %tobool = icmp ne i32 %call24, 0 + br i1 %tobool, label %kcall.end, label %kcall.configok + +kcall.configok: ; preds = %for.body + %50 = load float*, float** %p_d, align 8 + %51 = load float*, float** %tIn_d, align 8 + %52 = load float*, float** %tOut_d, align 8 + %53 = load float, float* %stepDivCap, align 4 + %54 = load i32, i32* %nx.addr, align 4 + %55 = load i32, i32* %ny.addr, align 4 + %56 = load i32, i32* %nz.addr, align 4 + %57 = load float, float* %ce, align 4 + %58 = load float, float* %cw, align 4 + %59 = load float, float* %cn, align 4 + %60 = load float, float* %cs, align 4 + %61 = load float, float* %ct, align 4 + %62 = load float, float* %cb, align 4 + %63 = load float, float* %cc, align 4 + call void @_Z11hotspotOpt1PfS_S_fiiifffffff(float* %50, float* %51, float* %52, float %53, i32 %54, i32 %55, i32 %56, float %57, float %58, float %59, float %60, float %61, float %62, float %63) + br label %kcall.end + +kcall.end: ; preds = %kcall.configok, %for.body + %64 = load float*, float** %tIn_d, align 8 + store float* %64, float** %t, align 8 + %65 = load float*, float** %tOut_d, align 8 + store float* %65, float** %tIn_d, align 8 + %66 = load float*, float** %t, align 8 + store float* %66, float** %tOut_d, align 8 + br label %for.inc + +for.inc: ; preds = %kcall.end + %67 = load i32, i32* %i, align 4 + %inc = add nsw i32 %67, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %call25 = call i32 @cudaDeviceSynchronize() + %call26 = call i64 @_Z8get_timev() + store i64 %call26, i64* 
%stop, align 8 + %68 = load i64, i64* %stop, align 8 + %69 = load i64, i64* %start, align 8 + %sub27 = sub nsw i64 %68, %69 + %conv28 = sitofp i64 %sub27 to double + %div29 = fdiv double %conv28, 1.000000e+06 + %conv30 = fptrunc double %div29 to float + store float %conv30, float* %time, align 4 + %70 = load float, float* %time, align 4 + %conv31 = fpext float %70 to double + %call32 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str, i64 0, i64 0), double %conv31) + %71 = load float*, float** %tOut.addr, align 8 + %72 = bitcast float* %71 to i8* + %73 = load float*, float** %tOut_d, align 8 + %74 = bitcast float* %73 to i8* + %75 = load i64, i64* %s, align 8 + %call33 = call i32 @cudaMemcpy(i8* %72, i8* %74, i64 %75, i32 2) + %76 = load float*, float** %p_d, align 8 + %77 = bitcast float* %76 to i8* + %call34 = call i32 @cudaFree(i8* %77) + %78 = load float*, float** %tIn_d, align 8 + %79 = bitcast float* %78 to i8* + %call35 = call i32 @cudaFree(i8* %79) + %80 = load float*, float** %tOut_d, align 8 + %81 = bitcast float* %80 to i8* + %call36 = call i32 @cudaFree(i8* %81) + ret void +} + +declare dso_local i32 @cudaMalloc(i8**, i64) #4 + +declare dso_local i32 @cudaMemcpy(i8*, i8*, i64, i32) #4 + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN4dim3C2Ejjj(%struct.dim3* %this, i32 %vx, i32 %vy, i32 %vz) unnamed_addr #0 comdat align 2 { +entry: + %this.addr = alloca %struct.dim3*, align 8 + %vx.addr = alloca i32, align 4 + %vy.addr = alloca i32, align 4 + %vz.addr = alloca i32, align 4 + store %struct.dim3* %this, %struct.dim3** %this.addr, align 8 + store i32 %vx, i32* %vx.addr, align 4 + store i32 %vy, i32* %vy.addr, align 4 + store i32 %vz, i32* %vz.addr, align 4 + %this1 = load %struct.dim3*, %struct.dim3** %this.addr, align 8 + %x = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 0 + %0 = load i32, i32* %vx.addr, align 4 + store i32 %0, i32* %x, align 4 + %y = 
getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 1 + %1 = load i32, i32* %vy.addr, align 4 + store i32 %1, i32* %y, align 4 + %z = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 2 + %2 = load i32, i32* %vz.addr, align 4 + store i32 %2, i32* %z, align 4 + ret void +} + +declare dso_local i32 @__cudaPushCallConfiguration(i64, i32, i64, i32, i64, i8*) #4 + +declare dso_local i32 @cudaDeviceSynchronize() #4 + +declare dso_local i32 @printf(i8*, ...) #4 + +declare dso_local i32 @cudaFree(i8*) #4 + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z5fatalPKc(i8* %s) #2 { +entry: + %s.addr = alloca i8*, align 8 + store i8* %s, i8** %s.addr, align 8 + %0 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %1 = load i8*, i8** %s.addr, align 8 + %call = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %0, i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str.1, i64 0, i64 0), i8* %1) + ret void +} + +declare dso_local i32 @fprintf(%struct._IO_FILE*, i8*, ...) 
#4 + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z9readinputPfiiiPc(float* %vect, i32 %grid_rows, i32 %grid_cols, i32 %layers, i8* %file) #2 { +entry: + %vect.addr = alloca float*, align 8 + %grid_rows.addr = alloca i32, align 4 + %grid_cols.addr = alloca i32, align 4 + %layers.addr = alloca i32, align 4 + %file.addr = alloca i8*, align 8 + %i = alloca i32, align 4 + %j = alloca i32, align 4 + %k = alloca i32, align 4 + %fp = alloca %struct._IO_FILE*, align 8 + %str = alloca [256 x i8], align 16 + %val = alloca float, align 4 + store float* %vect, float** %vect.addr, align 8 + store i32 %grid_rows, i32* %grid_rows.addr, align 4 + store i32 %grid_cols, i32* %grid_cols.addr, align 4 + store i32 %layers, i32* %layers.addr, align 4 + store i8* %file, i8** %file.addr, align 8 + %0 = load i8*, i8** %file.addr, align 8 + %call = call %struct._IO_FILE* @fopen(i8* %0, i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.2, i64 0, i64 0)) + store %struct._IO_FILE* %call, %struct._IO_FILE** %fp, align 8 + %cmp = icmp eq %struct._IO_FILE* %call, null + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + call void @_Z5fatalPKc(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.3, i64 0, i64 0)) + br label %if.end + +if.end: ; preds = %if.then, %entry + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc28, %if.end + %1 = load i32, i32* %i, align 4 + %2 = load i32, i32* %grid_rows.addr, align 4 + %sub = sub nsw i32 %2, 1 + %cmp1 = icmp sle i32 %1, %sub + br i1 %cmp1, label %for.body, label %for.end30 + +for.body: ; preds = %for.cond + store i32 0, i32* %j, align 4 + br label %for.cond2 + +for.cond2: ; preds = %for.inc25, %for.body + %3 = load i32, i32* %j, align 4 + %4 = load i32, i32* %grid_cols.addr, align 4 + %sub3 = sub nsw i32 %4, 1 + %cmp4 = icmp sle i32 %3, %sub3 + br i1 %cmp4, label %for.body5, label %for.end27 + +for.body5: ; preds = %for.cond2 + store i32 0, i32* %k, align 4 + br label 
%for.cond6 + +for.cond6: ; preds = %for.inc, %for.body5 + %5 = load i32, i32* %k, align 4 + %6 = load i32, i32* %layers.addr, align 4 + %sub7 = sub nsw i32 %6, 1 + %cmp8 = icmp sle i32 %5, %sub7 + br i1 %cmp8, label %for.body9, label %for.end + +for.body9: ; preds = %for.cond6 + %arraydecay = getelementptr inbounds [256 x i8], [256 x i8]* %str, i64 0, i64 0 + %7 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8 + %call10 = call i8* @fgets(i8* %arraydecay, i32 256, %struct._IO_FILE* %7) + %cmp11 = icmp eq i8* %call10, null + br i1 %cmp11, label %if.then12, label %if.end13 + +if.then12: ; preds = %for.body9 + call void @_Z5fatalPKc(i8* getelementptr inbounds ([20 x i8], [20 x i8]* @.str.4, i64 0, i64 0)) + br label %if.end13 + +if.end13: ; preds = %if.then12, %for.body9 + %8 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8 + %call14 = call i32 @feof(%struct._IO_FILE* %8) #8 + %tobool = icmp ne i32 %call14, 0 + br i1 %tobool, label %if.then15, label %if.end16 + +if.then15: ; preds = %if.end13 + call void @_Z5fatalPKc(i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.5, i64 0, i64 0)) + br label %if.end16 + +if.end16: ; preds = %if.then15, %if.end13 + %arraydecay17 = getelementptr inbounds [256 x i8], [256 x i8]* %str, i64 0, i64 0 + %call18 = call i32 (i8*, i8*, ...) 
@sscanf(i8* %arraydecay17, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.6, i64 0, i64 0), float* %val) #8 + %cmp19 = icmp ne i32 %call18, 1 + br i1 %cmp19, label %if.then20, label %if.end21 + +if.then20: ; preds = %if.end16 + call void @_Z5fatalPKc(i8* getelementptr inbounds ([20 x i8], [20 x i8]* @.str.7, i64 0, i64 0)) + br label %if.end21 + +if.end21: ; preds = %if.then20, %if.end16 + %9 = load float, float* %val, align 4 + %10 = load float*, float** %vect.addr, align 8 + %11 = load i32, i32* %i, align 4 + %12 = load i32, i32* %grid_cols.addr, align 4 + %mul = mul nsw i32 %11, %12 + %13 = load i32, i32* %j, align 4 + %add = add nsw i32 %mul, %13 + %14 = load i32, i32* %k, align 4 + %15 = load i32, i32* %grid_rows.addr, align 4 + %mul22 = mul nsw i32 %14, %15 + %16 = load i32, i32* %grid_cols.addr, align 4 + %mul23 = mul nsw i32 %mul22, %16 + %add24 = add nsw i32 %add, %mul23 + %idxprom = sext i32 %add24 to i64 + %arrayidx = getelementptr inbounds float, float* %10, i64 %idxprom + store float %9, float* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %if.end21 + %17 = load i32, i32* %k, align 4 + %inc = add nsw i32 %17, 1 + store i32 %inc, i32* %k, align 4 + br label %for.cond6 + +for.end: ; preds = %for.cond6 + br label %for.inc25 + +for.inc25: ; preds = %for.end + %18 = load i32, i32* %j, align 4 + %inc26 = add nsw i32 %18, 1 + store i32 %inc26, i32* %j, align 4 + br label %for.cond2 + +for.end27: ; preds = %for.cond2 + br label %for.inc28 + +for.inc28: ; preds = %for.end27 + %19 = load i32, i32* %i, align 4 + %inc29 = add nsw i32 %19, 1 + store i32 %inc29, i32* %i, align 4 + br label %for.cond + +for.end30: ; preds = %for.cond + %20 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8 + %call31 = call i32 @fclose(%struct._IO_FILE* %20) + ret void +} + +declare dso_local %struct._IO_FILE* @fopen(i8*, i8*) #4 + +declare dso_local i8* @fgets(i8*, i32, %struct._IO_FILE*) #4 + +; Function Attrs: nounwind +declare dso_local i32 
@feof(%struct._IO_FILE*) #1 + +; Function Attrs: nounwind +declare dso_local i32 @sscanf(i8*, i8*, ...) #1 + +declare dso_local i32 @fclose(%struct._IO_FILE*) #4 + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z11writeoutputPfiiiPc(float* %vect, i32 %grid_rows, i32 %grid_cols, i32 %layers, i8* %file) #2 { +entry: + %vect.addr = alloca float*, align 8 + %grid_rows.addr = alloca i32, align 4 + %grid_cols.addr = alloca i32, align 4 + %layers.addr = alloca i32, align 4 + %file.addr = alloca i8*, align 8 + %i = alloca i32, align 4 + %j = alloca i32, align 4 + %k = alloca i32, align 4 + %index = alloca i32, align 4 + %fp = alloca %struct._IO_FILE*, align 8 + %str = alloca [256 x i8], align 16 + store float* %vect, float** %vect.addr, align 8 + store i32 %grid_rows, i32* %grid_rows.addr, align 4 + store i32 %grid_cols, i32* %grid_cols.addr, align 4 + store i32 %layers, i32* %layers.addr, align 4 + store i8* %file, i8** %file.addr, align 8 + store i32 0, i32* %index, align 4 + %0 = load i8*, i8** %file.addr, align 8 + %call = call %struct._IO_FILE* @fopen(i8* %0, i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.8, i64 0, i64 0)) + store %struct._IO_FILE* %call, %struct._IO_FILE** %fp, align 8 + %cmp = icmp eq %struct._IO_FILE* %call, null + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %call1 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.9, i64 0, i64 0)) + br label %if.end + +if.end: ; preds = %if.then, %entry + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc19, %if.end + %1 = load i32, i32* %i, align 4 + %2 = load i32, i32* %grid_rows.addr, align 4 + %cmp2 = icmp slt i32 %1, %2 + br i1 %cmp2, label %for.body, label %for.end21 + +for.body: ; preds = %for.cond + store i32 0, i32* %j, align 4 + br label %for.cond3 + +for.cond3: ; preds = %for.inc16, %for.body + %3 = load i32, i32* %j, align 4 + %4 = load i32, i32* %grid_cols.addr, align 4 + %cmp4 = icmp slt i32 %3, %4 + br i1 %cmp4, label %for.body5, label %for.end18 + +for.body5: ; preds = %for.cond3 + store i32 0, i32* %k, align 4 + br label %for.cond6 + +for.cond6: ; preds = %for.inc, %for.body5 + %5 = load i32, i32* %k, align 4 + %6 = load i32, i32* %layers.addr, align 4 + %cmp7 = icmp slt i32 %5, %6 + br i1 %cmp7, label %for.body8, label %for.end + +for.body8: ; preds = %for.cond6 + %arraydecay = getelementptr inbounds [256 x i8], [256 x i8]* %str, i64 0, i64 0 + %7 = load i32, i32* %index, align 4 + %8 = load float*, float** %vect.addr, align 8 + %9 = load i32, i32* %i, align 4 + %10 = load i32, i32* %grid_cols.addr, align 4 + %mul = mul nsw i32 %9, %10 + %11 = load i32, i32* %j, align 4 + %add = add nsw i32 %mul, %11 + %12 = load i32, i32* %k, align 4 + %13 = load i32, i32* %grid_rows.addr, align 4 + %mul9 = mul nsw i32 %12, %13 + %14 = load i32, i32* %grid_cols.addr, align 4 + %mul10 = mul nsw i32 %mul9, %14 + %add11 = add nsw i32 %add, %mul10 + %idxprom = sext i32 %add11 to i64 + %arrayidx = getelementptr inbounds float, float* %8, i64 %idxprom + %15 = load float, float* %arrayidx, align 4 + %conv = fpext float %15 to double + %call12 = call i32 (i8*, i8*, ...) 
@sprintf(i8* %arraydecay, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str.10, i64 0, i64 0), i32 %7, double %conv) #8 + %arraydecay13 = getelementptr inbounds [256 x i8], [256 x i8]* %str, i64 0, i64 0 + %16 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8 + %call14 = call i32 @fputs(i8* %arraydecay13, %struct._IO_FILE* %16) + %17 = load i32, i32* %index, align 4 + %inc = add nsw i32 %17, 1 + store i32 %inc, i32* %index, align 4 + br label %for.inc + +for.inc: ; preds = %for.body8 + %18 = load i32, i32* %k, align 4 + %inc15 = add nsw i32 %18, 1 + store i32 %inc15, i32* %k, align 4 + br label %for.cond6 + +for.end: ; preds = %for.cond6 + br label %for.inc16 + +for.inc16: ; preds = %for.end + %19 = load i32, i32* %j, align 4 + %inc17 = add nsw i32 %19, 1 + store i32 %inc17, i32* %j, align 4 + br label %for.cond3 + +for.end18: ; preds = %for.cond3 + br label %for.inc19 + +for.inc19: ; preds = %for.end18 + %20 = load i32, i32* %i, align 4 + %inc20 = add nsw i32 %20, 1 + store i32 %inc20, i32* %i, align 4 + br label %for.cond + +for.end21: ; preds = %for.cond + %21 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8 + %call22 = call i32 @fclose(%struct._IO_FILE* %21) + ret void +} + +; Function Attrs: nounwind +declare dso_local i32 @sprintf(i8*, i8*, ...) 
#1 + +declare dso_local i32 @fputs(i8*, %struct._IO_FILE*) #4 + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local void @_Z14computeTempCPUPfS_S_iiifffffi(float* %pIn, float* %tIn, float* %tOut, i32 %nx, i32 %ny, i32 %nz, float %Cap, float %Rx, float %Ry, float %Rz, float %dt, i32 %numiter) #0 { +entry: + %pIn.addr = alloca float*, align 8 + %tIn.addr = alloca float*, align 8 + %tOut.addr = alloca float*, align 8 + %nx.addr = alloca i32, align 4 + %ny.addr = alloca i32, align 4 + %nz.addr = alloca i32, align 4 + %Cap.addr = alloca float, align 4 + %Rx.addr = alloca float, align 4 + %Ry.addr = alloca float, align 4 + %Rz.addr = alloca float, align 4 + %dt.addr = alloca float, align 4 + %numiter.addr = alloca i32, align 4 + %ce = alloca float, align 4 + %cw = alloca float, align 4 + %cn = alloca float, align 4 + %cs = alloca float, align 4 + %ct = alloca float, align 4 + %cb = alloca float, align 4 + %cc = alloca float, align 4 + %stepDivCap = alloca float, align 4 + %c = alloca i32, align 4 + %w = alloca i32, align 4 + %e = alloca i32, align 4 + %n = alloca i32, align 4 + %s = alloca i32, align 4 + %b = alloca i32, align 4 + %t = alloca i32, align 4 + %x = alloca i32, align 4 + %y = alloca i32, align 4 + %z = alloca i32, align 4 + %i = alloca i32, align 4 + %temp = alloca float*, align 8 + store float* %pIn, float** %pIn.addr, align 8 + store float* %tIn, float** %tIn.addr, align 8 + store float* %tOut, float** %tOut.addr, align 8 + store i32 %nx, i32* %nx.addr, align 4 + store i32 %ny, i32* %ny.addr, align 4 + store i32 %nz, i32* %nz.addr, align 4 + store float %Cap, float* %Cap.addr, align 4 + store float %Rx, float* %Rx.addr, align 4 + store float %Ry, float* %Ry.addr, align 4 + store float %Rz, float* %Rz.addr, align 4 + store float %dt, float* %dt.addr, align 4 + store i32 %numiter, i32* %numiter.addr, align 4 + %0 = load float, float* %dt.addr, align 4 + %1 = load float, float* %Cap.addr, align 4 + %div = fdiv float %0, %1 + store float 
%div, float* %stepDivCap, align 4 + %2 = load float, float* %stepDivCap, align 4 + %3 = load float, float* %Rx.addr, align 4 + %div1 = fdiv float %2, %3 + store float %div1, float* %cw, align 4 + store float %div1, float* %ce, align 4 + %4 = load float, float* %stepDivCap, align 4 + %5 = load float, float* %Ry.addr, align 4 + %div2 = fdiv float %4, %5 + store float %div2, float* %cs, align 4 + store float %div2, float* %cn, align 4 + %6 = load float, float* %stepDivCap, align 4 + %7 = load float, float* %Rz.addr, align 4 + %div3 = fdiv float %6, %7 + store float %div3, float* %cb, align 4 + store float %div3, float* %ct, align 4 + %8 = load float, float* %ce, align 4 + %conv = fpext float %8 to double + %mul = fmul contract double 2.000000e+00, %conv + %9 = load float, float* %cn, align 4 + %conv4 = fpext float %9 to double + %mul5 = fmul contract double 2.000000e+00, %conv4 + %add = fadd contract double %mul, %mul5 + %10 = load float, float* %ct, align 4 + %conv6 = fpext float %10 to double + %mul7 = fmul contract double 3.000000e+00, %conv6 + %add8 = fadd contract double %add, %mul7 + %sub = fsub contract double 1.000000e+00, %add8 + %conv9 = fptrunc double %sub to float + store float %conv9, float* %cc, align 4 + store i32 0, i32* %i, align 4 + br label %do.body + +do.body: ; preds = %do.cond, %entry + store i32 0, i32* %z, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc95, %do.body + %11 = load i32, i32* %z, align 4 + %12 = load i32, i32* %nz.addr, align 4 + %cmp = icmp slt i32 %11, %12 + br i1 %cmp, label %for.body, label %for.end97 + +for.body: ; preds = %for.cond + store i32 0, i32* %y, align 4 + br label %for.cond10 + +for.cond10: ; preds = %for.inc92, %for.body + %13 = load i32, i32* %y, align 4 + %14 = load i32, i32* %ny.addr, align 4 + %cmp11 = icmp slt i32 %13, %14 + br i1 %cmp11, label %for.body12, label %for.end94 + +for.body12: ; preds = %for.cond10 + store i32 0, i32* %x, align 4 + br label %for.cond13 + +for.cond13: ; preds = %for.inc, 
%for.body12 + %15 = load i32, i32* %x, align 4 + %16 = load i32, i32* %nx.addr, align 4 + %cmp14 = icmp slt i32 %15, %16 + br i1 %cmp14, label %for.body15, label %for.end + +for.body15: ; preds = %for.cond13 + %17 = load i32, i32* %x, align 4 + %18 = load i32, i32* %y, align 4 + %19 = load i32, i32* %nx.addr, align 4 + %mul16 = mul nsw i32 %18, %19 + %add17 = add nsw i32 %17, %mul16 + %20 = load i32, i32* %z, align 4 + %21 = load i32, i32* %nx.addr, align 4 + %mul18 = mul nsw i32 %20, %21 + %22 = load i32, i32* %ny.addr, align 4 + %mul19 = mul nsw i32 %mul18, %22 + %add20 = add nsw i32 %add17, %mul19 + store i32 %add20, i32* %c, align 4 + %23 = load i32, i32* %x, align 4 + %cmp21 = icmp eq i32 %23, 0 + br i1 %cmp21, label %cond.true, label %cond.false + +cond.true: ; preds = %for.body15 + %24 = load i32, i32* %c, align 4 + br label %cond.end + +cond.false: ; preds = %for.body15 + %25 = load i32, i32* %c, align 4 + %sub22 = sub nsw i32 %25, 1 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ %24, %cond.true ], [ %sub22, %cond.false ] + store i32 %cond, i32* %w, align 4 + %26 = load i32, i32* %x, align 4 + %27 = load i32, i32* %nx.addr, align 4 + %sub23 = sub nsw i32 %27, 1 + %cmp24 = icmp eq i32 %26, %sub23 + br i1 %cmp24, label %cond.true25, label %cond.false26 + +cond.true25: ; preds = %cond.end + %28 = load i32, i32* %c, align 4 + br label %cond.end28 + +cond.false26: ; preds = %cond.end + %29 = load i32, i32* %c, align 4 + %add27 = add nsw i32 %29, 1 + br label %cond.end28 + +cond.end28: ; preds = %cond.false26, %cond.true25 + %cond29 = phi i32 [ %28, %cond.true25 ], [ %add27, %cond.false26 ] + store i32 %cond29, i32* %e, align 4 + %30 = load i32, i32* %y, align 4 + %cmp30 = icmp eq i32 %30, 0 + br i1 %cmp30, label %cond.true31, label %cond.false32 + +cond.true31: ; preds = %cond.end28 + %31 = load i32, i32* %c, align 4 + br label %cond.end34 + +cond.false32: ; preds = %cond.end28 + %32 = load i32, i32* %c, align 4 + %33 = 
load i32, i32* %nx.addr, align 4 + %sub33 = sub nsw i32 %32, %33 + br label %cond.end34 + +cond.end34: ; preds = %cond.false32, %cond.true31 + %cond35 = phi i32 [ %31, %cond.true31 ], [ %sub33, %cond.false32 ] + store i32 %cond35, i32* %n, align 4 + %34 = load i32, i32* %y, align 4 + %35 = load i32, i32* %ny.addr, align 4 + %sub36 = sub nsw i32 %35, 1 + %cmp37 = icmp eq i32 %34, %sub36 + br i1 %cmp37, label %cond.true38, label %cond.false39 + +cond.true38: ; preds = %cond.end34 + %36 = load i32, i32* %c, align 4 + br label %cond.end41 + +cond.false39: ; preds = %cond.end34 + %37 = load i32, i32* %c, align 4 + %38 = load i32, i32* %nx.addr, align 4 + %add40 = add nsw i32 %37, %38 + br label %cond.end41 + +cond.end41: ; preds = %cond.false39, %cond.true38 + %cond42 = phi i32 [ %36, %cond.true38 ], [ %add40, %cond.false39 ] + store i32 %cond42, i32* %s, align 4 + %39 = load i32, i32* %z, align 4 + %cmp43 = icmp eq i32 %39, 0 + br i1 %cmp43, label %cond.true44, label %cond.false45 + +cond.true44: ; preds = %cond.end41 + %40 = load i32, i32* %c, align 4 + br label %cond.end48 + +cond.false45: ; preds = %cond.end41 + %41 = load i32, i32* %c, align 4 + %42 = load i32, i32* %nx.addr, align 4 + %43 = load i32, i32* %ny.addr, align 4 + %mul46 = mul nsw i32 %42, %43 + %sub47 = sub nsw i32 %41, %mul46 + br label %cond.end48 + +cond.end48: ; preds = %cond.false45, %cond.true44 + %cond49 = phi i32 [ %40, %cond.true44 ], [ %sub47, %cond.false45 ] + store i32 %cond49, i32* %b, align 4 + %44 = load i32, i32* %z, align 4 + %45 = load i32, i32* %nz.addr, align 4 + %sub50 = sub nsw i32 %45, 1 + %cmp51 = icmp eq i32 %44, %sub50 + br i1 %cmp51, label %cond.true52, label %cond.false53 + +cond.true52: ; preds = %cond.end48 + %46 = load i32, i32* %c, align 4 + br label %cond.end56 + +cond.false53: ; preds = %cond.end48 + %47 = load i32, i32* %c, align 4 + %48 = load i32, i32* %nx.addr, align 4 + %49 = load i32, i32* %ny.addr, align 4 + %mul54 = mul nsw i32 %48, %49 + %add55 = add nsw i32 
%47, %mul54 + br label %cond.end56 + +cond.end56: ; preds = %cond.false53, %cond.true52 + %cond57 = phi i32 [ %46, %cond.true52 ], [ %add55, %cond.false53 ] + store i32 %cond57, i32* %t, align 4 + %50 = load float*, float** %tIn.addr, align 8 + %51 = load i32, i32* %c, align 4 + %idxprom = sext i32 %51 to i64 + %arrayidx = getelementptr inbounds float, float* %50, i64 %idxprom + %52 = load float, float* %arrayidx, align 4 + %53 = load float, float* %cc, align 4 + %mul58 = fmul contract float %52, %53 + %54 = load float*, float** %tIn.addr, align 8 + %55 = load i32, i32* %n, align 4 + %idxprom59 = sext i32 %55 to i64 + %arrayidx60 = getelementptr inbounds float, float* %54, i64 %idxprom59 + %56 = load float, float* %arrayidx60, align 4 + %57 = load float, float* %cn, align 4 + %mul61 = fmul contract float %56, %57 + %add62 = fadd contract float %mul58, %mul61 + %58 = load float*, float** %tIn.addr, align 8 + %59 = load i32, i32* %s, align 4 + %idxprom63 = sext i32 %59 to i64 + %arrayidx64 = getelementptr inbounds float, float* %58, i64 %idxprom63 + %60 = load float, float* %arrayidx64, align 4 + %61 = load float, float* %cs, align 4 + %mul65 = fmul contract float %60, %61 + %add66 = fadd contract float %add62, %mul65 + %62 = load float*, float** %tIn.addr, align 8 + %63 = load i32, i32* %e, align 4 + %idxprom67 = sext i32 %63 to i64 + %arrayidx68 = getelementptr inbounds float, float* %62, i64 %idxprom67 + %64 = load float, float* %arrayidx68, align 4 + %65 = load float, float* %ce, align 4 + %mul69 = fmul contract float %64, %65 + %add70 = fadd contract float %add66, %mul69 + %66 = load float*, float** %tIn.addr, align 8 + %67 = load i32, i32* %w, align 4 + %idxprom71 = sext i32 %67 to i64 + %arrayidx72 = getelementptr inbounds float, float* %66, i64 %idxprom71 + %68 = load float, float* %arrayidx72, align 4 + %69 = load float, float* %cw, align 4 + %mul73 = fmul contract float %68, %69 + %add74 = fadd contract float %add70, %mul73 + %70 = load float*, float** 
%tIn.addr, align 8 + %71 = load i32, i32* %t, align 4 + %idxprom75 = sext i32 %71 to i64 + %arrayidx76 = getelementptr inbounds float, float* %70, i64 %idxprom75 + %72 = load float, float* %arrayidx76, align 4 + %73 = load float, float* %ct, align 4 + %mul77 = fmul contract float %72, %73 + %add78 = fadd contract float %add74, %mul77 + %74 = load float*, float** %tIn.addr, align 8 + %75 = load i32, i32* %b, align 4 + %idxprom79 = sext i32 %75 to i64 + %arrayidx80 = getelementptr inbounds float, float* %74, i64 %idxprom79 + %76 = load float, float* %arrayidx80, align 4 + %77 = load float, float* %cb, align 4 + %mul81 = fmul contract float %76, %77 + %add82 = fadd contract float %add78, %mul81 + %78 = load float, float* %dt.addr, align 4 + %79 = load float, float* %Cap.addr, align 4 + %div83 = fdiv float %78, %79 + %80 = load float*, float** %pIn.addr, align 8 + %81 = load i32, i32* %c, align 4 + %idxprom84 = sext i32 %81 to i64 + %arrayidx85 = getelementptr inbounds float, float* %80, i64 %idxprom84 + %82 = load float, float* %arrayidx85, align 4 + %mul86 = fmul contract float %div83, %82 + %add87 = fadd contract float %add82, %mul86 + %83 = load float, float* %ct, align 4 + %84 = load float, float* @amb_temp, align 4 + %mul88 = fmul contract float %83, %84 + %add89 = fadd contract float %add87, %mul88 + %85 = load float*, float** %tOut.addr, align 8 + %86 = load i32, i32* %c, align 4 + %idxprom90 = sext i32 %86 to i64 + %arrayidx91 = getelementptr inbounds float, float* %85, i64 %idxprom90 + store float %add89, float* %arrayidx91, align 4 + br label %for.inc + +for.inc: ; preds = %cond.end56 + %87 = load i32, i32* %x, align 4 + %inc = add nsw i32 %87, 1 + store i32 %inc, i32* %x, align 4 + br label %for.cond13 + +for.end: ; preds = %for.cond13 + br label %for.inc92 + +for.inc92: ; preds = %for.end + %88 = load i32, i32* %y, align 4 + %inc93 = add nsw i32 %88, 1 + store i32 %inc93, i32* %y, align 4 + br label %for.cond10 + +for.end94: ; preds = %for.cond10 + br 
label %for.inc95 + +for.inc95: ; preds = %for.end94 + %89 = load i32, i32* %z, align 4 + %inc96 = add nsw i32 %89, 1 + store i32 %inc96, i32* %z, align 4 + br label %for.cond + +for.end97: ; preds = %for.cond + %90 = load float*, float** %tIn.addr, align 8 + store float* %90, float** %temp, align 8 + %91 = load float*, float** %tOut.addr, align 8 + store float* %91, float** %tIn.addr, align 8 + %92 = load float*, float** %temp, align 8 + store float* %92, float** %tOut.addr, align 8 + %93 = load i32, i32* %i, align 4 + %inc98 = add nsw i32 %93, 1 + store i32 %inc98, i32* %i, align 4 + br label %do.cond + +do.cond: ; preds = %for.end97 + %94 = load i32, i32* %i, align 4 + %95 = load i32, i32* %numiter.addr, align 4 + %cmp99 = icmp slt i32 %94, %95 + br i1 %cmp99, label %do.body, label %do.end + +do.end: ; preds = %do.cond + ret void +} + +; Function Attrs: noinline optnone uwtable +define dso_local float @_Z8accuracyPfS_i(float* %arr1, float* %arr2, i32 %len) #2 { +entry: + %arr1.addr = alloca float*, align 8 + %arr2.addr = alloca float*, align 8 + %len.addr = alloca i32, align 4 + %err = alloca float, align 4 + %i = alloca i32, align 4 + store float* %arr1, float** %arr1.addr, align 8 + store float* %arr2, float** %arr2.addr, align 8 + store i32 %len, i32* %len.addr, align 4 + store float 0.000000e+00, float* %err, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %1 = load i32, i32* %len.addr, align 4 + %cmp = icmp slt i32 %0, %1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %2 = load float*, float** %arr1.addr, align 8 + %3 = load i32, i32* %i, align 4 + %idxprom = sext i32 %3 to i64 + %arrayidx = getelementptr inbounds float, float* %2, i64 %idxprom + %4 = load float, float* %arrayidx, align 4 + %5 = load float*, float** %arr2.addr, align 8 + %6 = load i32, i32* %i, align 4 + %idxprom1 = sext i32 %6 to i64 + %arrayidx2 = getelementptr 
inbounds float, float* %5, i64 %idxprom1 + %7 = load float, float* %arrayidx2, align 4 + %sub = fsub contract float %4, %7 + %8 = load float*, float** %arr1.addr, align 8 + %9 = load i32, i32* %i, align 4 + %idxprom3 = sext i32 %9 to i64 + %arrayidx4 = getelementptr inbounds float, float* %8, i64 %idxprom3 + %10 = load float, float* %arrayidx4, align 4 + %11 = load float*, float** %arr2.addr, align 8 + %12 = load i32, i32* %i, align 4 + %idxprom5 = sext i32 %12 to i64 + %arrayidx6 = getelementptr inbounds float, float* %11, i64 %idxprom5 + %13 = load float, float* %arrayidx6, align 4 + %sub7 = fsub contract float %10, %13 + %mul = fmul contract float %sub, %sub7 + %14 = load float, float* %err, align 4 + %add = fadd contract float %14, %mul + store float %add, float* %err, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %15 = load i32, i32* %i, align 4 + %inc = add nsw i32 %15, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %16 = load float, float* %err, align 4 + %17 = load i32, i32* %len.addr, align 4 + %conv = sitofp i32 %17 to float + %div = fdiv float %16, %conv + %call = call float @_ZSt4sqrtf(float %div) + ret float %call +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local float @_ZSt4sqrtf(float %__x) #0 comdat { +entry: + %__x.addr = alloca float, align 4 + store float %__x, float* %__x.addr, align 4 + %0 = load float, float* %__x.addr, align 4 + %call = call float @sqrtf(float %0) #8 + ret float %call +} + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z5usageiPPc(i32 %argc, i8** %argv) #2 { +entry: + %argc.addr = alloca i32, align 4 + %argv.addr = alloca i8**, align 8 + store i32 %argc, i32* %argc.addr, align 4 + store i8** %argv, i8*** %argv.addr, align 8 + %0 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %1 = load i8**, i8*** %argv.addr, align 8 + %arrayidx = getelementptr inbounds i8*, i8** %1, i64 0 + %2 = load i8*, 
i8** %arrayidx, align 8 + %call = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %0, i8* getelementptr inbounds ([81 x i8], [81 x i8]* @.str.11, i64 0, i64 0), i8* %2) + %3 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %call1 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %3, i8* getelementptr inbounds ([68 x i8], [68 x i8]* @.str.12, i64 0, i64 0)) + %4 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %call2 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %4, i8* getelementptr inbounds ([62 x i8], [62 x i8]* @.str.13, i64 0, i64 0)) + %5 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %call3 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %5, i8* getelementptr inbounds ([37 x i8], [37 x i8]* @.str.14, i64 0, i64 0)) + %6 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %call4 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %6, i8* getelementptr inbounds ([83 x i8], [83 x i8]* @.str.15, i64 0, i64 0)) + %7 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %call5 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %7, i8* getelementptr inbounds ([88 x i8], [88 x i8]* @.str.16, i64 0, i64 0)) + %8 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %call6 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %8, i8* getelementptr inbounds ([28 x i8], [28 x i8]* @.str.17, i64 0, i64 0)) + call void @exit(i32 1) #9 + unreachable +} + +; Function Attrs: noreturn nounwind +declare dso_local void @exit(i32) #5 + +; Function Attrs: noinline norecurse optnone uwtable +define dso_local i32 @main(i32 %argc, i8** %argv) #6 { +entry: + %retval = alloca i32, align 4 + %argc.addr = alloca i32, align 4 + %argv.addr = alloca i8**, align 8 + %pfile = alloca i8*, align 8 + %tfile = alloca i8*, align 8 + %ofile = alloca i8*, align 8 + %iterations = alloca i32, align 4 + %numCols = alloca i32, align 4 + %numRows = alloca i32, align 4 + %layers = alloca i32, align 4 + %dx = alloca float, align 4 + %dy = alloca float, align 4 + %dz = alloca float, align 4 + %Cap = alloca float, align 4 + %Rx = alloca float, align 4 + %Ry = alloca float, align 4 + %Rz = alloca float, align 4 + %max_slope = alloca float, align 4 + %dt = alloca float, align 4 + %powerIn = alloca float*, align 8 + %tempOut = alloca float*, align 8 + %tempIn = alloca float*, align 8 + %tempCopy = alloca float*, align 8 + %size = alloca i32, align 4 + %answer = alloca float*, align 8 + %acc = alloca float, align 4 + store i32 0, i32* %retval, align 4 + store i32 %argc, i32* %argc.addr, align 4 + store i8** %argv, i8*** %argv.addr, align 8 + %call = call i32 @cudaSetDevice(i32 0) + %0 = load i32, i32* %argc.addr, align 4 + %cmp = icmp ne i32 %0, 7 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %1 = load i32, i32* %argc.addr, align 4 + %2 = load i8**, i8*** %argv.addr, align 8 + call void @_Z5usageiPPc(i32 %1, i8** %2) + br label %if.end + +if.end: ; preds = %if.then, %entry + %3 = load i8**, i8*** %argv.addr, align 8 + %arrayidx = getelementptr inbounds i8*, i8** %3, i64 3 + %4 = load i8*, i8** %arrayidx, align 8 + %call1 = call i32 @atoi(i8* %4) #10 + store i32 %call1, i32* %iterations, align 4 + %5 = load i8**, i8*** %argv.addr, align 8 + %arrayidx2 = getelementptr inbounds 
i8*, i8** %5, i64 4 + %6 = load i8*, i8** %arrayidx2, align 8 + store i8* %6, i8** %pfile, align 8 + %7 = load i8**, i8*** %argv.addr, align 8 + %arrayidx3 = getelementptr inbounds i8*, i8** %7, i64 5 + %8 = load i8*, i8** %arrayidx3, align 8 + store i8* %8, i8** %tfile, align 8 + %9 = load i8**, i8*** %argv.addr, align 8 + %arrayidx4 = getelementptr inbounds i8*, i8** %9, i64 6 + %10 = load i8*, i8** %arrayidx4, align 8 + store i8* %10, i8** %ofile, align 8 + %11 = load i8**, i8*** %argv.addr, align 8 + %arrayidx5 = getelementptr inbounds i8*, i8** %11, i64 1 + %12 = load i8*, i8** %arrayidx5, align 8 + %call6 = call i32 @atoi(i8* %12) #10 + store i32 %call6, i32* %numCols, align 4 + %13 = load i8**, i8*** %argv.addr, align 8 + %arrayidx7 = getelementptr inbounds i8*, i8** %13, i64 1 + %14 = load i8*, i8** %arrayidx7, align 8 + %call8 = call i32 @atoi(i8* %14) #10 + store i32 %call8, i32* %numRows, align 4 + %15 = load i8**, i8*** %argv.addr, align 8 + %arrayidx9 = getelementptr inbounds i8*, i8** %15, i64 2 + %16 = load i8*, i8** %arrayidx9, align 8 + %call10 = call i32 @atoi(i8* %16) #10 + store i32 %call10, i32* %layers, align 4 + %17 = load float, float* @chip_height, align 4 + %18 = load i32, i32* %numRows, align 4 + %conv = sitofp i32 %18 to float + %div = fdiv float %17, %conv + store float %div, float* %dx, align 4 + %19 = load float, float* @chip_width, align 4 + %20 = load i32, i32* %numCols, align 4 + %conv11 = sitofp i32 %20 to float + %div12 = fdiv float %19, %conv11 + store float %div12, float* %dy, align 4 + %21 = load float, float* @t_chip, align 4 + %22 = load i32, i32* %layers, align 4 + %conv13 = sitofp i32 %22 to float + %div14 = fdiv float %21, %conv13 + store float %div14, float* %dz, align 4 + %23 = load float, float* @t_chip, align 4 + %conv15 = fpext float %23 to double + %mul = fmul contract double 8.750000e+05, %conv15 + %24 = load float, float* %dx, align 4 + %conv16 = fpext float %24 to double + %mul17 = fmul contract double %mul, 
%conv16 + %25 = load float, float* %dy, align 4 + %conv18 = fpext float %25 to double + %mul19 = fmul contract double %mul17, %conv18 + %conv20 = fptrunc double %mul19 to float + store float %conv20, float* %Cap, align 4 + %26 = load float, float* %dy, align 4 + %conv21 = fpext float %26 to double + %27 = load float, float* @t_chip, align 4 + %conv22 = fpext float %27 to double + %mul23 = fmul contract double 2.000000e+02, %conv22 + %28 = load float, float* %dx, align 4 + %conv24 = fpext float %28 to double + %mul25 = fmul contract double %mul23, %conv24 + %div26 = fdiv double %conv21, %mul25 + %conv27 = fptrunc double %div26 to float + store float %conv27, float* %Rx, align 4 + %29 = load float, float* %dx, align 4 + %conv28 = fpext float %29 to double + %30 = load float, float* @t_chip, align 4 + %conv29 = fpext float %30 to double + %mul30 = fmul contract double 2.000000e+02, %conv29 + %31 = load float, float* %dy, align 4 + %conv31 = fpext float %31 to double + %mul32 = fmul contract double %mul30, %conv31 + %div33 = fdiv double %conv28, %mul32 + %conv34 = fptrunc double %div33 to float + store float %conv34, float* %Ry, align 4 + %32 = load float, float* %dz, align 4 + %33 = load float, float* %dx, align 4 + %mul35 = fmul contract float 1.000000e+02, %33 + %34 = load float, float* %dy, align 4 + %mul36 = fmul contract float %mul35, %34 + %div37 = fdiv float %32, %mul36 + store float %div37, float* %Rz, align 4 + %35 = load float, float* @t_chip, align 4 + %conv38 = fpext float %35 to double + %mul39 = fmul contract double 5.000000e-01, %conv38 + %mul40 = fmul contract double %mul39, 1.750000e+06 + %div41 = fdiv double 3.000000e+06, %mul40 + %conv42 = fptrunc double %div41 to float + store float %conv42, float* %max_slope, align 4 + %36 = load float, float* %max_slope, align 4 + %conv43 = fpext float %36 to double + %div44 = fdiv double 1.000000e-03, %conv43 + %conv45 = fptrunc double %div44 to float + store float %conv45, float* %dt, align 4 + %37 = load i32, 
i32* %numCols, align 4 + %38 = load i32, i32* %numRows, align 4 + %mul46 = mul nsw i32 %37, %38 + %39 = load i32, i32* %layers, align 4 + %mul47 = mul nsw i32 %mul46, %39 + store i32 %mul47, i32* %size, align 4 + %40 = load i32, i32* %size, align 4 + %conv48 = sext i32 %40 to i64 + %call49 = call noalias i8* @calloc(i64 %conv48, i64 4) #8 + %41 = bitcast i8* %call49 to float* + store float* %41, float** %powerIn, align 8 + %42 = load i32, i32* %size, align 4 + %conv50 = sext i32 %42 to i64 + %mul51 = mul i64 %conv50, 4 + %call52 = call noalias i8* @malloc(i64 %mul51) #8 + %43 = bitcast i8* %call52 to float* + store float* %43, float** %tempCopy, align 8 + %44 = load i32, i32* %size, align 4 + %conv53 = sext i32 %44 to i64 + %call54 = call noalias i8* @calloc(i64 %conv53, i64 4) #8 + %45 = bitcast i8* %call54 to float* + store float* %45, float** %tempIn, align 8 + %46 = load i32, i32* %size, align 4 + %conv55 = sext i32 %46 to i64 + %call56 = call noalias i8* @calloc(i64 %conv55, i64 4) #8 + %47 = bitcast i8* %call56 to float* + store float* %47, float** %tempOut, align 8 + %48 = load i32, i32* %size, align 4 + %conv57 = sext i32 %48 to i64 + %call58 = call noalias i8* @calloc(i64 %conv57, i64 4) #8 + %49 = bitcast i8* %call58 to float* + store float* %49, float** %answer, align 8 + %50 = load float*, float** %powerIn, align 8 + %51 = load i32, i32* %numRows, align 4 + %52 = load i32, i32* %numCols, align 4 + %53 = load i32, i32* %layers, align 4 + %54 = load i8*, i8** %pfile, align 8 + call void @_Z9readinputPfiiiPc(float* %50, i32 %51, i32 %52, i32 %53, i8* %54) + %55 = load float*, float** %tempIn, align 8 + %56 = load i32, i32* %numRows, align 4 + %57 = load i32, i32* %numCols, align 4 + %58 = load i32, i32* %layers, align 4 + %59 = load i8*, i8** %tfile, align 8 + call void @_Z9readinputPfiiiPc(float* %55, i32 %56, i32 %57, i32 %58, i8* %59) + %60 = load float*, float** %tempCopy, align 8 + %61 = bitcast float* %60 to i8* + %62 = load float*, float** %tempIn, 
align 8 + %63 = bitcast float* %62 to i8* + %64 = load i32, i32* %size, align 4 + %conv59 = sext i32 %64 to i64 + %mul60 = mul i64 %conv59, 4 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %61, i8* align 4 %63, i64 %mul60, i1 false) + %65 = load float*, float** %powerIn, align 8 + %66 = load float*, float** %tempIn, align 8 + %67 = load float*, float** %tempOut, align 8 + %68 = load i32, i32* %numCols, align 4 + %69 = load i32, i32* %numRows, align 4 + %70 = load i32, i32* %layers, align 4 + %71 = load float, float* %Cap, align 4 + %72 = load float, float* %Rx, align 4 + %73 = load float, float* %Ry, align 4 + %74 = load float, float* %Rz, align 4 + %75 = load float, float* %dt, align 4 + %76 = load i32, i32* %iterations, align 4 + call void @_Z12hotspot_opt1PfS_S_iiifffffi(float* %65, float* %66, float* %67, i32 %68, i32 %69, i32 %70, float %71, float %72, float %73, float %74, float %75, i32 %76) + %77 = load float*, float** %powerIn, align 8 + %78 = load float*, float** %tempCopy, align 8 + %79 = load float*, float** %answer, align 8 + %80 = load i32, i32* %numCols, align 4 + %81 = load i32, i32* %numRows, align 4 + %82 = load i32, i32* %layers, align 4 + %83 = load float, float* %Cap, align 4 + %84 = load float, float* %Rx, align 4 + %85 = load float, float* %Ry, align 4 + %86 = load float, float* %Rz, align 4 + %87 = load float, float* %dt, align 4 + %88 = load i32, i32* %iterations, align 4 + call void @_Z14computeTempCPUPfS_S_iiifffffi(float* %77, float* %78, float* %79, i32 %80, i32 %81, i32 %82, float %83, float %84, float %85, float %86, float %87, i32 %88) + %89 = load float*, float** %tempOut, align 8 + %90 = load float*, float** %answer, align 8 + %91 = load i32, i32* %numRows, align 4 + %92 = load i32, i32* %numCols, align 4 + %mul61 = mul nsw i32 %91, %92 + %93 = load i32, i32* %layers, align 4 + %mul62 = mul nsw i32 %mul61, %93 + %call63 = call float @_Z8accuracyPfS_i(float* %89, float* %90, i32 %mul62) + store float %call63, float* %acc, align 
4 + %94 = load float, float* %acc, align 4 + %conv64 = fpext float %94 to double + %call65 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([14 x i8], [14 x i8]* @.str.18, i64 0, i64 0), double %conv64) + %95 = load float*, float** %tempOut, align 8 + %96 = load i32, i32* %numRows, align 4 + %97 = load i32, i32* %numCols, align 4 + %98 = load i32, i32* %layers, align 4 + %99 = load i8*, i8** %ofile, align 8 + call void @_Z11writeoutputPfiiiPc(float* %95, i32 %96, i32 %97, i32 %98, i8* %99) + %100 = load float*, float** %tempIn, align 8 + %101 = bitcast float* %100 to i8* + call void @free(i8* %101) #8 + %102 = load float*, float** %tempOut, align 8 + %103 = bitcast float* %102 to i8* + call void @free(i8* %103) #8 + %104 = load float*, float** %powerIn, align 8 + %105 = bitcast float* %104 to i8* + call void @free(i8* %105) #8 + ret i32 0 +} + +declare dso_local i32 @cudaSetDevice(i32) #4 + +; Function Attrs: nounwind readonly +declare dso_local i32 @atoi(i8*) #7 + +; Function Attrs: nounwind +declare dso_local noalias i8* @calloc(i64, i64) #1 + +; Function Attrs: nounwind +declare dso_local noalias i8* @malloc(i64) #1 + +; Function Attrs: nounwind +declare dso_local void @free(i8*) #1 + +; Function Attrs: nounwind +declare dso_local float @sqrtf(float) #1 + +define internal void @__cuda_register_globals(i8** %0) { +entry: + %1 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void (float*, float*, float*, float, i32, i32, i32, float, float, float, float, float, float, float)* @_Z11hotspotOpt1PfS_S_fiiifffffff to i8*), i8* getelementptr inbounds ([33 x i8], [33 x i8]* @0, i64 0, i64 0), i8* getelementptr inbounds ([33 x i8], [33 x i8]* @0, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null) + ret void +} + +declare dso_local i32 @__cudaRegisterFunction(i8**, i8*, i8*, i8*, i32, i8*, i8*, i8*, i8*, i32*) + +declare dso_local i32 @__cudaRegisterVar(i8**, i8*, i8*, i8*, i32, i32, i32, i32) + +declare dso_local i8** 
@__cudaRegisterFatBinary(i8*) + +define internal void @__cuda_module_ctor(i8* %0) { +entry: + %1 = call i8** @__cudaRegisterFatBinary(i8* bitcast ({ i32, i32, i8*, i8* }* @__cuda_fatbin_wrapper to i8*)) + store i8** %1, i8*** @__cuda_gpubin_handle, align 8 + call void @__cuda_register_globals(i8** %1) + call void @__cudaRegisterFatBinaryEnd(i8** %1) + %2 = call i32 @atexit(void (i8*)* @__cuda_module_dtor) + ret void +} + +declare dso_local void @__cudaRegisterFatBinaryEnd(i8**) + +declare dso_local void @__cudaUnregisterFatBinary(i8**) + +define internal void @__cuda_module_dtor(i8* %0) { +entry: + %1 = load i8**, i8*** @__cuda_gpubin_handle, align 8 + call void @__cudaUnregisterFatBinary(i8** %1) + ret void +} + +declare dso_local i32 @atexit(void (i8*)*) + +attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { noinline optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" 
"no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { argmemonly nounwind willreturn } +attributes #4 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #5 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #6 = { noinline norecurse optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #7 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" 
"target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #8 = { nounwind } +attributes #9 = { noreturn nounwind } +attributes #10 = { nounwind readonly } + +!llvm.module.flags = !{!0, !1} +!llvm.ident = !{!2} + +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"} diff --git a/examples/hotspot3D/3D.cu b/examples/hotspot3D/3D.cu new file mode 100644 index 0000000..51faa17 --- /dev/null +++ b/examples/hotspot3D/3D.cu @@ -0,0 +1,205 @@ +#include +#include +#include +#include +#include +#include + +#define BLOCK_SIZE 16 +#define STR_SIZE 256 + +#define block_x_ 128 +#define block_y_ 2 +#define block_z_ 1 +#define MAX_PD (3.0e6) +/* required precision in degrees */ +#define PRECISION 0.001 +#define SPEC_HEAT_SI 1.75e6 +#define K_SI 100 +/* capacitance fitting factor */ +#define FACTOR_CHIP 0.5 + +#include "opt1.cu" + +/* chip parameters */ +float t_chip = 0.0005; +float chip_height = 0.016; +float chip_width = 0.016; /* ambient temperature, assuming no package at all + */ +float amb_temp = 80.0; + +void fatal(const char *s) { fprintf(stderr, "Error: %s\n", s); } + +void readinput(float *vect, int grid_rows, int grid_cols, int layers, + char *file) { + int i, j, k; + FILE *fp; + char str[STR_SIZE]; + float val; + + if ((fp = fopen(file, "r")) == 0) + fatal("The file was not opened"); + + for (i = 0; i <= grid_rows - 1; i++) + for (j = 0; j <= grid_cols - 1; j++) + for (k = 0; k <= layers - 1; k++) { + if (fgets(str, STR_SIZE, fp) == NULL) + fatal("Error reading file\n"); + if (feof(fp)) + fatal("not enough lines in file"); + if ((sscanf(str, "%f", &val) != 1)) + fatal("invalid file format"); + vect[i * grid_cols + j + k * grid_rows * grid_cols] = val; + } + + fclose(fp); +} + +void writeoutput(float *vect, int grid_rows, int grid_cols, int layers, + char 
*file) { + + int i, j, k, index = 0; + FILE *fp; + char str[STR_SIZE]; + + if ((fp = fopen(file, "w")) == 0) + printf("The file was not opened\n"); + + for (i = 0; i < grid_rows; i++) + for (j = 0; j < grid_cols; j++) + for (k = 0; k < layers; k++) { + sprintf(str, "%d\t%g\n", index, + vect[i * grid_cols + j + k * grid_rows * grid_cols]); + fputs(str, fp); + index++; + } + + fclose(fp); +} + +void computeTempCPU(float *pIn, float *tIn, float *tOut, int nx, int ny, int nz, + float Cap, float Rx, float Ry, float Rz, float dt, + int numiter) { + float ce, cw, cn, cs, ct, cb, cc; + float stepDivCap = dt / Cap; + ce = cw = stepDivCap / Rx; + cn = cs = stepDivCap / Ry; + ct = cb = stepDivCap / Rz; + + cc = 1.0 - (2.0 * ce + 2.0 * cn + 3.0 * ct); + + int c, w, e, n, s, b, t; + int x, y, z; + int i = 0; + do { + for (z = 0; z < nz; z++) + for (y = 0; y < ny; y++) + for (x = 0; x < nx; x++) { + c = x + y * nx + z * nx * ny; + + w = (x == 0) ? c : c - 1; + e = (x == nx - 1) ? c : c + 1; + n = (y == 0) ? c : c - nx; + s = (y == ny - 1) ? c : c + nx; + b = (z == 0) ? c : c - nx * ny; + t = (z == nz - 1) ? 
c : c + nx * ny; + + tOut[c] = tIn[c] * cc + tIn[n] * cn + tIn[s] * cs + tIn[e] * ce + + tIn[w] * cw + tIn[t] * ct + tIn[b] * cb + + (dt / Cap) * pIn[c] + ct * amb_temp; + } + float *temp = tIn; + tIn = tOut; + tOut = temp; + i++; + } while (i < numiter); +} + +float accuracy(float *arr1, float *arr2, int len) { + float err = 0.0; + int i; + for (i = 0; i < len; i++) { + err += (arr1[i] - arr2[i]) * (arr1[i] - arr2[i]); + } + + return (float)sqrt(err / len); +} + +void usage(int argc, char **argv) { + fprintf(stderr, + "Usage: %s " + "\n", + argv[0]); + fprintf( + stderr, + "\t - number of rows/cols in the grid (positive integer)\n"); + fprintf(stderr, + "\t - number of layers in the grid (positive integer)\n"); + + fprintf(stderr, "\t - number of iterations\n"); + fprintf(stderr, "\t - name of the file containing the initial " + "power values of each cell\n"); + fprintf(stderr, "\t - name of the file containing the initial " + "temperature values of each cell\n"); + fprintf(stderr, "\t +template +__inline int compare_vectors(T *data1, T *data2, unsigned int size) { + printf("Comparing vectors: \n"); + bool match = true; + for (unsigned int i = 0; i < size; i++) + if (data1[i] != data2[i]) { + match = false; + printf("Diff: data1[%d]=%d, data1[%d]=%d.\n", i, data1[i], i, data2[i]); + } + + if (match) { + printf("PASS! vectors are matching!\n"); + return 0; + } else { + printf("FAIL! vectors are NOT matching!\n"); + exit(1); + return -1; + } +} + +#endif diff --git a/examples/huffman/cpuencode.cpp b/examples/huffman/cpuencode.cpp new file mode 100644 index 0000000..cb0ff4b --- /dev/null +++ b/examples/huffman/cpuencode.cpp @@ -0,0 +1,116 @@ +#include "stdafx.h" + +#include "cpuencode.h" +#include "print_helpers.h" + +using namespace std; + +#if 1 + +// The max. 
codeword length for each byte symbol is 32-bits + +extern "C" void cpu_vlc_encode(unsigned int *indata, unsigned int num_elements, + unsigned int *outdata, unsigned int *outsize, + unsigned int *codewords, + unsigned int *codewordlens) { + unsigned int *bitstreamPt = + (unsigned int *)outdata; /* Pointer to current byte */ + *bitstreamPt = 0x00000000U; + unsigned int startbit = 0; + unsigned int totalBytes = 0; + + for (unsigned int k = 0; k < num_elements; k++) { + unsigned int cw32 = 0; + unsigned int val32 = indata[k]; + unsigned int numbits = 0; + unsigned int mask32; + + for (unsigned int i = 0; i < 4; i++) { + unsigned char symbol = (unsigned char)(val32 >> (8 * (3 - i))); + cw32 = codewords[symbol]; + numbits = codewordlens[symbol]; + + while (numbits > 0) { + int writebits = min(32 - startbit, numbits); + if (numbits == writebits) + mask32 = (cw32 & ((1 << numbits) - 1)) + << (32 - startbit - + numbits); // first make sure that the start of the word + // is clean, then shift to the left as many + // places as you need + else + mask32 = cw32 >> + (numbits - writebits); // shift out the bits that can not fit + *bitstreamPt = (*bitstreamPt) | mask32; + numbits = numbits - writebits; + startbit = (startbit + writebits) % 32; + if (startbit == 0) { + bitstreamPt++; + *bitstreamPt = 0x00000000; + totalBytes += 4; + } + } + } + } + totalBytes += (startbit / 8) + + ((startbit % 8 == 0) ? 0 : 1); // return aligned to 8-bits + *outsize = totalBytes; +} + +////////////////////////////////////////////////////////////////////// +/// ALTERNATIVE CODER +/// ASSUMPTION: The max. length of 4 combined codewords can be 2x original data, +/// i.e. 
g 64 bits +/////////////////////////////////////////////////////////////////////// + +#else + +extern "C" void cpu_vlc_encode(unsigned int *indata, unsigned int num_elements, + unsigned int *outdata, unsigned int *outsize, + unsigned int *codewords, + unsigned int *codewordlens) { + unsigned int *bitstreamPt = + (unsigned int *)outdata; /* Pointer to current byte */ + // assume memset is done. + *bitstreamPt = 0x00000000U; + unsigned int startbit = 0; + unsigned int totalBytes = 0; + + for (unsigned int k = 0; k < num_elements; k++) { + unsigned long long cw64 = 0, mask64 = 0; + unsigned int val32 = indata[k]; + unsigned int numbits = 0; + unsigned int mask32, temp32; + + for (unsigned int i = 0; i < 4; i++) { + unsigned char symbol = (unsigned char)(val32 >> (8 * (3 - i))); + cw64 = (cw64 << codewordlens[symbol]) | codewords[symbol]; + numbits += codewordlens[symbol]; + // if (numbits>32) printf("WARRNING! Element %d is combined into numbits = + // %d!!!!!!!\n", k, numbits); + } + + while (numbits > 0) { + int writebits = min(32 - startbit, numbits); + if (numbits == writebits) { + temp32 = (unsigned int)cw64; //(cw64 & 0xFFFFFFFF); + mask32 = temp32 << (32 - startbit - numbits); + } else { + mask32 = (unsigned int)(cw64 >> (numbits - writebits)); + cw64 = cw64 & ((1 << (numbits - writebits)) - 1); + } + *bitstreamPt = (*bitstreamPt) | mask32; + numbits = numbits - writebits; + startbit = (startbit + writebits) % 32; + if (startbit == 0) { + bitstreamPt++; + *bitstreamPt = 0x00000000; + totalBytes += 4; + } + } + } + totalBytes += (startbit / 8) + + ((startbit % 8 == 0) ? 
0 : 1); // return aligned to 8-bits + *outsize = totalBytes; +} +#endif diff --git a/examples/huffman/cpuencode.h b/examples/huffman/cpuencode.h new file mode 100644 index 0000000..6c331fa --- /dev/null +++ b/examples/huffman/cpuencode.h @@ -0,0 +1,8 @@ +#ifndef _CE_H_ +#define _CE_H_ + +extern "C" void cpu_vlc_encode(unsigned int *indata, unsigned int num_elements, + unsigned int *outdata, unsigned int *outsize, + unsigned int *codewords, + unsigned int *codewordlens); +#endif diff --git a/examples/huffman/cuda_helpers.h b/examples/huffman/cuda_helpers.h new file mode 100644 index 0000000..3cf4524 --- /dev/null +++ b/examples/huffman/cuda_helpers.h @@ -0,0 +1,20 @@ +#ifndef __CUDA_HELPERS__ +#define __CUDA_HELPERS__ +#include +/************************************************************************/ +/* Init CUDA */ +/************************************************************************/ +#if __DEVICE_EMULATION__ + +bool InitCUDA(void) { return true; } + +#else +bool InitCUDA(void) { + + cudaSetDevice(0); + + printf("CUDA initialized.\n"); + return true; +} +#endif +#endif diff --git a/examples/huffman/cutil.h b/examples/huffman/cutil.h new file mode 100644 index 0000000..8757a22 --- /dev/null +++ b/examples/huffman/cutil.h @@ -0,0 +1,931 @@ +/* + * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + +/* + * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. 
Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + +/* CUda UTility Library */ + +#ifndef _CUTIL_H_ +#define _CUTIL_H_ + +#ifdef _WIN32 +#pragma warning(disable : 4996) // disable deprecated warning +#endif + +#include +#include + +// helper typedefs for building DLL +#ifdef _WIN32 +#ifdef BUILD_DLL +#define DLL_MAPPING __declspec(dllexport) +#else +#define DLL_MAPPING __declspec(dllimport) +#endif +#else +#define DLL_MAPPING +#endif + +#ifdef _WIN32 +#define CUTIL_API __stdcall +#else +#define CUTIL_API +#endif + +//////////////////////////////////////////////////////////////////////////// +//! CUT bool type +//////////////////////////////////////////////////////////////////////////// +enum CUTBoolean { CUTFalse = 0, CUTTrue = 1 }; + +//////////////////////////////////////////////////////////////////////////// +//! Deallocate memory allocated within Cutil +//! @param pointer to memory +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +void CUTIL_API cutFree(void *ptr); + +//////////////////////////////////////////////////////////////////////////// +//! Helper for bank conflict checking (should only be used with the +//! CUT_BANK_CHECKER macro) +//! @param tidx thread id in x dimension of block +//! @param tidy thread id in y dimension of block +//! @param tidz thread id in z dimension of block +//! @param bdimx block size in x dimension +//! @param bdimy block size in y dimension +//! @param bdimz block size in z dimension +//! @param file name of the source file where the access takes place +//! @param line line in the source file where the access takes place +//! @param aname name of the array which is accessed +//! 
@param index index into the array +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +void CUTIL_API cutCheckBankAccess(unsigned int tidx, unsigned int tidy, + unsigned int tidz, unsigned int bdimx, + unsigned int bdimy, unsigned int bdimz, + const char *file, const int line, + const char *aname, const int index); + +//////////////////////////////////////////////////////////////////////////// +//! Find the path for a filename +//! @return the path if succeeded, otherwise 0 +//! @param filename name of the file +//! @param executablePath optional absolute path of the executable +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +char *CUTIL_API cutFindFilePath(const char *filename, + const char *executablePath); + +//////////////////////////////////////////////////////////////////////////// +//! Read file \filename containing single precision floating point data +//! @return CUTTrue if reading the file succeeded, otherwise false +//! @param filename name of the source file +//! @param data uninitialized pointer, returned initialized and pointing to +//! the data read +//! @param len number of data elements in data, -1 on error +//! @note If a NULL pointer is passed to this function and it is +//! initialized within Cutil then cutFree() has to be used to +//! deallocate the memory +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutReadFilef(const char *filename, float **data, + unsigned int *len, bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Read file \filename containing double precision floating point data +//! @return CUTTrue if reading the file succeeded, otherwise false +//! @param filename name of the source file +//! @param data uninitialized pointer, returned initialized and pointing to +//! the data read +//! 
@param len number of data elements in data, -1 on error +//! @note If a NULL pointer is passed to this function and it is +//! initialized within Cutil then cutFree() has to be used to +//! deallocate the memory +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutReadFiled(const char *filename, double **data, + unsigned int *len, bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Read file \filename containing integer data +//! @return CUTTrue if reading the file succeeded, otherwise false +//! @param filename name of the source file +//! @param data uninitialized pointer, returned initialized and pointing to +//! the data read +//! @param len number of data elements in data, -1 on error +//! @note If a NULL pointer is passed to this function and it is +//! initialized within Cutil then cutFree() has to be used to +//! deallocate the memory +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutReadFilei(const char *filename, int **data, + unsigned int *len, bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Read file \filename containing unsigned integer data +//! @return CUTTrue if reading the file succeeded, otherwise false +//! @param filename name of the source file +//! @param data uninitialized pointer, returned initialized and pointing to +//! the data read +//! @param len number of data elements in data, -1 on error +//! @note If a NULL pointer is passed to this function and it is +//! initialized within Cutil then cutFree() has to be used to +//! 
deallocate the memory +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutReadFileui(const char *filename, unsigned int **data, + unsigned int *len, bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Read file \filename containing char / byte data +//! @return CUTTrue if reading the file succeeded, otherwise false +//! @param filename name of the source file +//! @param data uninitialized pointer, returned initialized and pointing to +//! the data read +//! @param len number of data elements in data, -1 on error +//! @note If a NULL pointer is passed to this function and it is +//! initialized within Cutil then cutFree() has to be used to +//! deallocate the memory +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutReadFileb(const char *filename, char **data, + unsigned int *len, bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Read file \filename containing unsigned char / byte data +//! @return CUTTrue if reading the file succeeded, otherwise false +//! @param filename name of the source file +//! @param data uninitialized pointer, returned initialized and pointing to +//! the data read +//! @param len number of data elements in data, -1 on error +//! @note If a NULL pointer is passed to this function and it is +//! initialized within Cutil then cutFree() has to be used to +//! deallocate the memory +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutReadFileub(const char *filename, unsigned char **data, + unsigned int *len, bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Write a data file \filename containing single precision floating point +//! data +//! 
@return CUTTrue if writing the file succeeded, otherwise false +//! @param filename name of the file to write +//! @param data pointer to data to write +//! @param len number of data elements in data, -1 on error +//! @param epsilon epsilon for comparison +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutWriteFilef(const char *filename, const float *data, + unsigned int len, const float epsilon, + bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Write a data file \filename containing double precision floating point +//! data +//! @return CUTTrue if writing the file succeeded, otherwise false +//! @param filename name of the file to write +//! @param data pointer to data to write +//! @param len number of data elements in data, -1 on error +//! @param epsilon epsilon for comparison +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutWriteFiled(const char *filename, const float *data, + unsigned int len, const double epsilon, + bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Write a data file \filename containing integer data +//! @return CUTTrue if writing the file succeeded, otherwise false +//! @param filename name of the file to write +//! @param data pointer to data to write +//! @param len number of data elements in data, -1 on error +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutWriteFilei(const char *filename, const int *data, + unsigned int len, bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Write a data file \filename containing unsigned integer data +//! @return CUTTrue if writing the file succeeded, otherwise false +//! @param filename name of the file to write +//! 
@param data pointer to data to write +//! @param len number of data elements in data, -1 on error +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutWriteFileui(const char *filename, + const unsigned int *data, unsigned int len, + bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Write a data file \filename containing char / byte data +//! @return CUTTrue if writing the file succeeded, otherwise false +//! @param filename name of the file to write +//! @param data pointer to data to write +//! @param len number of data elements in data, -1 on error +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutWriteFileb(const char *filename, const char *data, + unsigned int len, bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Write a data file \filename containing unsigned char / byte data +//! @return CUTTrue if writing the file succeeded, otherwise false +//! @param filename name of the file to write +//! @param data pointer to data to write +//! @param len number of data elements in data, -1 on error +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutWriteFileub(const char *filename, + const unsigned char *data, unsigned int len, + bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Load PGM image file (with unsigned char as data element type) +//! @return CUTTrue if reading the file succeeded, otherwise false +//! @param file name of the image file +//! @param data handle to the data read +//! @param w width of the image +//! @param h height of the image +//! @note If a NULL pointer is passed to this function and it is +//! initialized within Cutil then cutFree() has to be used to +//! 
deallocate the memory +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutLoadPGMub(const char *file, unsigned char **data, + unsigned int *w, unsigned int *h); + +//////////////////////////////////////////////////////////////////////////// +//! Load PPM image file (with unsigned char as data element type) +//! @return CUTTrue if reading the file succeeded, otherwise false +//! @param file name of the image file +//! @param data handle to the data read +//! @param w width of the image +//! @param h height of the image +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutLoadPPMub(const char *file, unsigned char **data, + unsigned int *w, unsigned int *h); + +//////////////////////////////////////////////////////////////////////////// +//! Load PPM image file (with unsigned char as data element type), padding +//! 4th component +//! @return CUTTrue if reading the file succeeded, otherwise false +//! @param file name of the image file +//! @param data handle to the data read +//! @param w width of the image +//! @param h height of the image +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutLoadPPM4ub(const char *file, unsigned char **data, + unsigned int *w, unsigned int *h); + +//////////////////////////////////////////////////////////////////////////// +//! Load PGM image file (with unsigned int as data element type) +//! @return CUTTrue if reading the file succeeded, otherwise false +//! @param file name of the image file +//! @param data handle to the data read +//! @param w width of the image +//! @param h height of the image +//! @note If a NULL pointer is passed to this function and it is +//! initialized within Cutil then cutFree() has to be used to +//! 
deallocate the memory +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutLoadPGMi(const char *file, unsigned int **data, + unsigned int *w, unsigned int *h); + +//////////////////////////////////////////////////////////////////////////// +//! Load PGM image file (with unsigned short as data element type) +//! @return CUTTrue if reading the file succeeded, otherwise false +//! @param file name of the image file +//! @param data handle to the data read +//! @param w width of the image +//! @param h height of the image +//! @note If a NULL pointer is passed to this function and it is +//! initialized withing Cutil then cutFree() has to be used to +//! deallocate the memory +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutLoadPGMs(const char *file, unsigned short **data, + unsigned int *w, unsigned int *h); + +//////////////////////////////////////////////////////////////////////////// +//! Load PGM image file (with float as data element type) +//! @param file name of the image file +//! @param data handle to the data read +//! @param w width of the image +//! @param h height of the image +//! @note If a NULL pointer is passed to this function and it is +//! initialized withing Cutil then cutFree() has to be used to +//! deallocate the memory +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutLoadPGMf(const char *file, float **data, + unsigned int *w, unsigned int *h); + +//////////////////////////////////////////////////////////////////////////// +//! Save PGM image file (with unsigned char as data element type) +//! @param file name of the image file +//! @param data handle to the data read +//! @param w width of the image +//! 
@param h height of the image +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutSavePGMub(const char *file, unsigned char *data, + unsigned int w, unsigned int h); + +//////////////////////////////////////////////////////////////////////////// +//! Save PPM image file (with unsigned char as data element type) +//! @param file name of the image file +//! @param data handle to the data read +//! @param w width of the image +//! @param h height of the image +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutSavePPMub(const char *file, unsigned char *data, + unsigned int w, unsigned int h); + +//////////////////////////////////////////////////////////////////////////// +//! Save PPM image file (with unsigned char as data element type, padded to +//! 4 bytes) +//! @param file name of the image file +//! @param data handle to the data read +//! @param w width of the image +//! @param h height of the image +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutSavePPM4ub(const char *file, unsigned char *data, + unsigned int w, unsigned int h); + +//////////////////////////////////////////////////////////////////////////// +//! Save PGM image file (with unsigned int as data element type) +//! @param file name of the image file +//! @param data handle to the data read +//! @param w width of the image +//! @param h height of the image +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutSavePGMi(const char *file, unsigned int *data, + unsigned int w, unsigned int h); + +//////////////////////////////////////////////////////////////////////////// +//! Save PGM image file (with unsigned short as data element type) +//! @param file name of the image file +//! @param data handle to the data read +//! 
@param w width of the image +//! @param h height of the image +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutSavePGMs(const char *file, unsigned short *data, + unsigned int w, unsigned int h); + +//////////////////////////////////////////////////////////////////////////// +//! Save PGM image file (with float as data element type) +//! @param file name of the image file +//! @param data handle to the data read +//! @param w width of the image +//! @param h height of the image +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutSavePGMf(const char *file, float *data, unsigned int w, + unsigned int h); + +//////////////////////////////////////////////////////////////////////////// +// Command line arguments: General notes +// * All command line arguments begin with '--' followed by the token; +// token and value are seperated by '='; example --samples=50 +// * Arrays have the form --model=[one.obj,two.obj,three.obj] +// (without whitespaces) +//////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////// +//! Check if command line argument \a flag-name is given +//! @return CUTTrue if command line argument \a flag_name has been given, +//! otherwise 0 +//! @param argc argc as passed to main() +//! @param argv argv as passed to main() +//! @param flag_name name of command line flag +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutCheckCmdLineFlag(const int argc, const char **argv, + const char *flag_name); + +//////////////////////////////////////////////////////////////////////////// +//! Get the value of a command line argument of type int +//! @return CUTTrue if command line argument \a arg_name has been given and +//! is of the requested type, otherwise CUTFalse +//! 
@param argc argc as passed to main() +//! @param argv argv as passed to main() +//! @param arg_name name of the command line argument +//! @param val value of the command line argument +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutGetCmdLineArgumenti(const int argc, const char **argv, + const char *arg_name, int *val); + +//////////////////////////////////////////////////////////////////////////// +//! Get the value of a command line argument of type float +//! @return CUTTrue if command line argument \a arg_name has been given and +//! is of the requested type, otherwise CUTFalse +//! @param argc argc as passed to main() +//! @param argv argv as passed to main() +//! @param arg_name name of the command line argument +//! @param val value of the command line argument +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutGetCmdLineArgumentf(const int argc, const char **argv, + const char *arg_name, float *val); + +//////////////////////////////////////////////////////////////////////////// +//! Get the value of a command line argument of type string +//! @return CUTTrue if command line argument \a arg_name has been given and +//! is of the requested type, otherwise CUTFalse +//! @param argc argc as passed to main() +//! @param argv argv as passed to main() +//! @param arg_name name of the command line argument +//! @param val value of the command line argument +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutGetCmdLineArgumentstr(const int argc, const char **argv, + const char *arg_name, char **val); + +//////////////////////////////////////////////////////////////////////////// +//! Get the value of a command line argument list those element are strings +//! @return CUTTrue if command line argument \a arg_name has been given and +//! 
is of the requested type, otherwise CUTFalse +//! @param argc argc as passed to main() +//! @param argv argv as passed to main() +//! @param arg_name name of the command line argument +//! @param val command line argument list +//! @param len length of the list / number of elements +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutGetCmdLineArgumentListstr(const int argc, + const char **argv, + const char *arg_name, + char **val, + unsigned int *len); + +//////////////////////////////////////////////////////////////////////////// +//! Extended assert +//! @return CUTTrue if the condition \a val holds, otherwise CUTFalse +//! @param val condition to test +//! @param file __FILE__ macro +//! @param line __LINE__ macro +//! @note This function should be used via the CONDITION(val) macro +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutCheckCondition(int val, const char *file, + const int line); + +//////////////////////////////////////////////////////////////////////////// +//! Compare two float arrays +//! @return CUTTrue if \a reference and \a data are identical, +//! otherwise CUTFalse +//! @param reference handle to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutComparef(const float *reference, const float *data, + const unsigned int len); + +//////////////////////////////////////////////////////////////////////////// +//! Compare two integer arrays +//! @return CUTTrue if \a reference and \a data are identical, +//! otherwise CUTFalse +//! @param reference handle to the reference data / gold image +//! @param data handle to the computed data +//! 
@param len number of elements in reference and data +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutComparei(const int *reference, const int *data, + const unsigned int len); + +//////////////////////////////////////////////////////////////////////////////// +//! Compare two unsigned integer arrays, with epsilon and threshold +//! @return CUTTrue if \a reference and \a data are identical, +//! otherwise CUTFalse +//! @param reference handle to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//! @param threshold tolerance % # of comparison errors (0.15f = 15%) +//////////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutCompareuit(const unsigned int *reference, + const unsigned int *data, + const unsigned int len, const float epsilon, + const float threshold); + +//////////////////////////////////////////////////////////////////////////// +//! Compare two unsigned char arrays +//! @return CUTTrue if \a reference and \a data are identical, +//! otherwise CUTFalse +//! @param reference handle to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutCompareub(const unsigned char *reference, + const unsigned char *data, + const unsigned int len); + +//////////////////////////////////////////////////////////////////////////////// +//! Compare two unsigned char arrays with a tolerance for # of byte errors +//! @return CUTTrue if \a reference and \a data are identical, +//! otherwise CUTFalse +//! @param reference handle to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//!
@param epsilon epsilon to use for the comparison +//! @param threshold tolerance % # of comparison errors (0.15f = 15%) +//////////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutCompareubt(const unsigned char *reference, + const unsigned char *data, + const unsigned int len, const float epsilon, + const float threshold); + +//////////////////////////////////////////////////////////////////////////////// +//! Compare two unsigned char arrays with an epsilon tolerance for equality +//! @return CUTTrue if \a reference and \a data are identical, +//! otherwise CUTFalse +//! @param reference handle to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//! @param epsilon epsilon to use for the comparison +//////////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutCompareube(const unsigned char *reference, + const unsigned char *data, + const unsigned int len, const float epsilon); + +//////////////////////////////////////////////////////////////////////////// +//! Compare two float arrays with an epsilon tolerance for equality +//! @return CUTTrue if \a reference and \a data are identical, +//! otherwise CUTFalse +//! @param reference handle to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//! @param epsilon epsilon to use for the comparison +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutComparefe(const float *reference, const float *data, + const unsigned int len, const float epsilon); + +//////////////////////////////////////////////////////////////////////////////// +//! Compare two float arrays with an epsilon tolerance for equality and a +//! threshold for # pixel errors +//!
@return CUTTrue if \a reference and \a data are identical, +//! otherwise CUTFalse +//! @param reference handle to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//! @param epsilon epsilon to use for the comparison +//////////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutComparefet(const float *reference, const float *data, + const unsigned int len, const float epsilon, + const float threshold); + +//////////////////////////////////////////////////////////////////////////// +//! Compare two float arrays using L2-norm with an epsilon tolerance for +//! equality +//! @return CUTTrue if \a reference and \a data are identical, +//! otherwise CUTFalse +//! @param reference handle to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//! @param epsilon epsilon to use for the comparison +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutCompareL2fe(const float *reference, const float *data, + const unsigned int len, + const float epsilon); + +//////////////////////////////////////////////////////////////////////////////// +//! Compare two PPM image files with an epsilon tolerance for equality +//! @return CUTTrue if \a src_file and \a ref_file are identical, +//! otherwise CUTFalse +//! @param src_file filename for the image to be compared +//! @param ref_file filename for the reference data / gold image +//! @param epsilon epsilon to use for the comparison +//! @param threshold threshold of pixels that can still mismatch to pass (i.e. +//! 0.15f = 15% must pass) @param verboseErrors output details of image mismatch +//!
to std::err +//////////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutComparePPM(const char *src_file, const char *ref_file, + const float epsilon, const float threshold, + bool verboseErrors = false); + +//////////////////////////////////////////////////////////////////////////// +//! Timer functionality + +//////////////////////////////////////////////////////////////////////////// +//! Create a new timer +//! @return CUTTrue if a timer has been created, otherwise CUTFalse +//! @param name name of the new timer, 0 if the creation failed +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutCreateTimer(unsigned int *name); + +//////////////////////////////////////////////////////////////////////////// +//! Delete a timer +//! @return CUTTrue if a timer has been deleted, otherwise CUTFalse +//! @param name name of the timer to delete +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutDeleteTimer(unsigned int name); + +//////////////////////////////////////////////////////////////////////////// +//! Start the timer with name \a name +//! @param name name of the timer to start +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutStartTimer(const unsigned int name); + +//////////////////////////////////////////////////////////////////////////// +//! Stop the timer with name \a name. Does not reset. +//! @param name name of the timer to stop +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutStopTimer(const unsigned int name); + +//////////////////////////////////////////////////////////////////////////// +//! Resets the timer's counter. +//! @param name name of the timer to reset.
+//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +CUTBoolean CUTIL_API cutResetTimer(const unsigned int name); + +//////////////////////////////////////////////////////////////////////////// +//! Returns total execution time in milliseconds for the timer over all +//! runs since the last reset or timer creation. +//! @param name name of the timer to return the time of +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +float CUTIL_API cutGetTimerValue(const unsigned int name); + +//////////////////////////////////////////////////////////////////////////// +//! Return the average time in milliseconds for timer execution as the +//! total time for the timer divided by the number of completed (stopped) +//! runs the timer has made. +//! Excludes the current running time if the timer is currently running. +//! @param name name of the timer to return the time of +//////////////////////////////////////////////////////////////////////////// +DLL_MAPPING +float CUTIL_API cutGetAverageTimerValue(const unsigned int name); + +//////////////////////////////////////////////////////////////////////////// +//!
Macros + +#if CUDART_VERSION >= 4000 +#define CUT_DEVICE_SYNCHRONIZE() cudaDeviceSynchronize(); +#else +#define CUT_DEVICE_SYNCHRONIZE() cudaThreadSynchronize(); +#endif + +#if CUDART_VERSION >= 4000 +#define CUT_DEVICE_RESET() cudaDeviceReset(); +#else +#define CUT_DEVICE_RESET() cudaThreadExit(); +#endif + +// This is for the CUTIL bank checker +#ifdef _DEBUG +#if __DEVICE_EMULATION__ +// Interface for bank conflict checker +#define CUT_BANK_CHECKER(array, index) \ + (cutCheckBankAccess(threadIdx.x, threadIdx.y, threadIdx.z, blockDim.x, \ + blockDim.y, blockDim.z, __FILE__, __LINE__, #array, \ + index), \ + array[index]) +#else +#define CUT_BANK_CHECKER(array, index) array[index] +#endif +#else +#define CUT_BANK_CHECKER(array, index) array[index] +#endif + +#define CU_SAFE_CALL_NO_SYNC(call) \ + { \ + CUresult err = call; \ + if (CUDA_SUCCESS != err) { \ + fprintf(stderr, "Cuda driver error %x in file '%s' in line %i.\n", err, \ + __FILE__, __LINE__); \ + exit(EXIT_FAILURE); \ + } \ + } + +#define CU_SAFE_CALL(call) CU_SAFE_CALL_NO_SYNC(call); + +#define CU_SAFE_CTX_SYNC() \ + { \ + CUresult err = cuCtxSynchronize(); \ + if (CUDA_SUCCESS != err) { \ + fprintf(stderr, "Cuda driver error %x in file '%s' in line %i.\n", err, \ + __FILE__, __LINE__); \ + exit(EXIT_FAILURE); \ + } \ + } + +#define CUDA_SAFE_CALL_NO_SYNC(call) \ + { \ + cudaError err = call; \ + if (cudaSuccess != err) { \ + fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", __FILE__, \ + __LINE__, cudaGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ + } + +#define CUDA_SAFE_CALL(call) CUDA_SAFE_CALL_NO_SYNC(call); + +#define CUDA_SAFE_THREAD_SYNC() \ + { \ + cudaError err = CUT_DEVICE_SYNCHRONIZE(); \ + if (cudaSuccess != err) { \ + fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", __FILE__, \ + __LINE__, cudaGetErrorString(err)); \ + } \ + } + +#define CUFFT_SAFE_CALL(call) \ + { \ + cufftResult err = call; \ + if (CUFFT_SUCCESS != err) { \ + fprintf(stderr, "CUFFT error 
in file '%s' in line %i.\n", __FILE__, \ + __LINE__); \ + exit(EXIT_FAILURE); \ + } \ + } + +#define CUT_SAFE_CALL(call) \ + if (CUTTrue != call) { \ + fprintf(stderr, "Cut error in file '%s' in line %i.\n", __FILE__, \ + __LINE__); \ + exit(EXIT_FAILURE); \ + } + +//! Check for CUDA error +#ifdef _DEBUG +#define CUT_CHECK_ERROR(errorMessage) \ + { \ + cudaError_t err = cudaGetLastError(); \ + if (cudaSuccess != err) { \ + fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ + errorMessage, __FILE__, __LINE__, cudaGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ + err = CUT_DEVICE_SYNCHRONIZE(); \ + if (cudaSuccess != err) { \ + fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ + errorMessage, __FILE__, __LINE__, cudaGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ + } +#else +#define CUT_CHECK_ERROR(errorMessage) \ + { \ + cudaError_t err = cudaGetLastError(); \ + if (cudaSuccess != err) { \ + fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ + errorMessage, __FILE__, __LINE__, cudaGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ + } +#endif + +//! Check for malloc error +#define CUT_SAFE_MALLOC(mallocCall) \ + { \ + if (!(mallocCall)) { \ + fprintf(stderr, "Host malloc failure in file '%s' in line %i\n", \ + __FILE__, __LINE__); \ + exit(EXIT_FAILURE); \ + } \ + } \ + while (0) \ + ; + +//! 
Check if conditon is true (flexible assert) +#define CUT_CONDITION(val) \ + if (CUTFalse == cutCheckCondition(val, __FILE__, __LINE__)) { \ + exit(EXIT_FAILURE); \ + } + +#if __DEVICE_EMULATION__ + +#define CUT_DEVICE_INIT(ARGC, ARGV) + +#else + +#define CUT_DEVICE_INIT(ARGC, ARGV) \ + { \ + int deviceCount; \ + CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceCount(&deviceCount)); \ + if (deviceCount == 0) { \ + fprintf(stderr, "cutil error: no devices supporting CUDA.\n"); \ + exit(EXIT_FAILURE); \ + } \ + int dev = 0; \ + cutGetCmdLineArgumenti(ARGC, (const char **)ARGV, "device", &dev); \ + if (dev < 0) \ + dev = 0; \ + if (dev > deviceCount - 1) \ + dev = deviceCount - 1; \ + cudaDeviceProp deviceProp; \ + CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceProperties(&deviceProp, dev)); \ + if (cutCheckCmdLineFlag(ARGC, (const char **)ARGV, "quiet") == CUTFalse) \ + fprintf(stderr, "Using device %d: %s\n", dev, deviceProp.name); \ + CUDA_SAFE_CALL(cudaSetDevice(dev)); \ + } + +//! Check for CUDA context lost +#define CUDA_CHECK_CTX_LOST(errorMessage) \ + { \ + cudaError_t err = cudaGetLastError(); \ + if (cudaSuccess != err) { \ + fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ + errorMessage, __FILE__, __LINE__, cudaGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ + err = CUT_DEVICE_SYNCHRONIZE(); \ + if (cudaSuccess != err) { \ + fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ + errorMessage, __FILE__, __LINE__, cudaGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ + } + +//! 
Check for CUDA context lost +#define CU_CHECK_CTX_LOST(errorMessage) \ + { \ + cudaError_t err = cudaGetLastError(); \ + if (CUDA_ERROR_INVALID_CONTEXT != err) { \ + fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ + errorMessage, __FILE__, __LINE__, cudaGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ + err = CUT_DEVICE_SYNCHRONIZE(); \ + if (cudaSuccess != err) { \ + fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ + errorMessage, __FILE__, __LINE__, cudaGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ + } + +#endif + +#define CUT_DEVICE_INIT_DRV(cuDevice, ARGC, ARGV) \ + { \ + cuDevice = 0; \ + int deviceCount = 0; \ + CUresult err = cuInit(0); \ + if (CUDA_SUCCESS == err) \ + CU_SAFE_CALL_NO_SYNC(cuDeviceGetCount(&deviceCount)); \ + if (deviceCount == 0) { \ + fprintf(stderr, "cutil error: no devices supporting CUDA\n"); \ + exit(EXIT_FAILURE); \ + } \ + int dev = 0; \ + cutGetCmdLineArgumenti(ARGC, (const char **)ARGV, "device", &dev); \ + if (dev < 0) \ + dev = 0; \ + if (dev > deviceCount - 1) \ + dev = deviceCount - 1; \ + CU_SAFE_CALL_NO_SYNC(cuDeviceGet(&cuDevice, dev)); \ + char name[100]; \ + cuDeviceGetName(name, 100, cuDevice); \ + if (cutCheckCmdLineFlag(ARGC, (const char **)ARGV, "quiet") == CUTFalse) \ + fprintf(stderr, "Using device %d: %s\n", dev, name); \ + } + +#define CUT_EXIT(argc, argv) \ + if (!cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) { \ + printf("\nPress ENTER to exit...\n"); \ + fflush(stdout); \ + fflush(stderr); \ + getchar(); \ + } \ + exit(EXIT_SUCCESS); + +#endif // #ifndef _CUTIL_H_ diff --git a/examples/huffman/hist.cu b/examples/huffman/hist.cu new file mode 100644 index 0000000..0ff3f31 --- /dev/null +++ b/examples/huffman/hist.cu @@ -0,0 +1,104 @@ +/* + * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. 
+ * + * NVIDIA Corporation and its licensors retain all intellectual property and * + * proprietary rights in and to this software and related documentation. Any + * use, reproduction, disclosure, or distribution of this software and related + * documentation without an express license agreement from NVIDIA Corporation is + * strictly prohibited. + * + * Please refer to the applicable NVIDIA end user license agreement (EULA) + * associated with this source code for terms and conditions that govern + * your use of this NVIDIA software. + * + */ + +#include +#include + +#define CHECK(ans) \ + { gpuAssert((ans), __FILE__, __LINE__); } +inline void gpuAssert(cudaError_t code, const char *file, int line, + bool abort = true) { + if (code != cudaSuccess) { + fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, + line); + if (abort) + exit(code); + } +} + +using namespace std; + +#define SIZE (100 * 1024 * 1024) + +__global__ void histo_kernel(unsigned char *buffer, long size, + unsigned int *histo) { + + __shared__ unsigned int temp[256]; + + temp[threadIdx.x] = 0; + __syncthreads(); + + int i = threadIdx.x + blockIdx.x * blockDim.x; + int offset = blockDim.x * gridDim.x; + while (i < size) { + atomicAdd(&temp[buffer[i]], 1); + i += offset; + } + + __syncthreads(); + atomicAdd(&(histo[threadIdx.x]), temp[threadIdx.x]); +} + +int runHisto(char *file, unsigned int *freq, unsigned int memSize, + unsigned int *source) { + + FILE *f = fopen(file, "rb"); + if (!f) { + perror(file); + exit(1); + } + fseek(f, 0, SEEK_SET); + size_t result = fread(source, 1, memSize, f); + if (result != memSize) + fputs("Cannot read input file", stderr); + + fclose(f); + + unsigned char *buffer = (unsigned char *)source; + + int blocks = 2; + + // allocate memory on the GPU for the file's data + int partSize = memSize / 32; + int totalNum = memSize / sizeof(unsigned int); + int partialNum = partSize / sizeof(unsigned int); + + unsigned char *dev_buffer0; + unsigned char 
*dev_buffer1; + unsigned int *dev_histo; + cudaMalloc((void **)&dev_buffer0, partSize); + cudaMalloc((void **)&dev_buffer1, partSize); + cudaMalloc((void **)&dev_histo, 256 * sizeof(int)); + cudaMemset(dev_histo, 0, 256 * sizeof(int)); + + for (int i = 0; i < totalNum; i += partialNum * 2) { + CHECK( + cudaMemcpy(dev_buffer0, buffer + i, partSize, cudaMemcpyHostToDevice)); + CHECK(cudaMemcpy(dev_buffer1, buffer + i + partialNum, partSize, + cudaMemcpyHostToDevice)); + + // kernel launch - 2x the number of mps gave best timing + histo_kernel<<>>(dev_buffer0, partSize, dev_histo); + cudaDeviceSynchronize(); + histo_kernel<<>>(dev_buffer1, partSize, dev_histo); + cudaDeviceSynchronize(); + } + cudaMemcpy(freq, dev_histo, 256 * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_histo); + cudaFree(dev_buffer0); + cudaFree(dev_buffer1); + return 0; +} diff --git a/examples/huffman/huffTree.h b/examples/huffman/huffTree.h new file mode 100644 index 0000000..8a37568 --- /dev/null +++ b/examples/huffman/huffTree.h @@ -0,0 +1,90 @@ +#include "stdio.h" +#include +#include // for CHAR_BIT +#include +#include +#include +#include +#include + +using namespace std; + +const int UniqueSymbols = 1 << CHAR_BIT; +void printBits(unsigned int val, int numbits) { + for (int i = numbits - 1; i >= 0; i--) + putchar('0' + ((val >> i) & 1)); +} + +typedef vector HuffCode; +typedef map HuffCodeMap; + +class INode { +public: + const int f; + virtual ~INode() {} + +protected: + INode(int f) : f(f) {} +}; + +class InternalNode : public INode { +public: + INode *const left; + INode *const right; + + InternalNode(INode *c0, INode *c1) + : INode(c0->f + c1->f), left(c0), right(c1) {} + ~InternalNode() { + delete left; + delete right; + } +}; + +class LeafNode : public INode { +public: + const char c; + + LeafNode(int f, char c) : INode(f), c(c) {} +}; + +struct NodeCmp { + bool operator()(const INode *lhs, const INode *rhs) const { + return lhs->f > rhs->f; + } +}; + +INode *BuildTree(unsigned 
int (&frequencies)[UniqueSymbols]) { + std::priority_queue, NodeCmp> trees; + + for (int i = 0; i < UniqueSymbols; ++i) { + if (frequencies[i] != 0) + trees.push(new LeafNode(frequencies[i], (char)i)); + } + while (trees.size() > 1) { + INode *childR = trees.top(); + trees.pop(); + + INode *childL = trees.top(); + trees.pop(); + + INode *parent = new InternalNode(childR, childL); + trees.push(parent); + } + return trees.top(); +} + +void GenerateCodes(const INode *node, const HuffCode &prefix, + HuffCodeMap &outCodes) { + if (const LeafNode *lf = dynamic_cast(node)) { + outCodes[lf->c] = prefix; + } else if (const InternalNode *in = + dynamic_cast(node)) { + HuffCode leftPrefix = prefix; + leftPrefix.push_back(false); + GenerateCodes(in->left, leftPrefix, outCodes); + + HuffCode rightPrefix = prefix; + rightPrefix.push_back(true); + GenerateCodes(in->right, rightPrefix, outCodes); + } +} diff --git a/examples/huffman/load_data.h b/examples/huffman/load_data.h new file mode 100644 index 0000000..b0149ab --- /dev/null +++ b/examples/huffman/load_data.h @@ -0,0 +1,65 @@ +#ifndef _LOADTESTDATA_H_ +#define _LOADTESTDATA_H_ + +//#include "testdatagen.h" +#include "hist.cu" +#include "huffTree.h" + +inline void initParams(char *file_name, uint num_block_threads, + uint &num_blocks, uint &num_elements, uint &mem_size, + uint symbol_type_size) { + if (file_name == NULL) { + num_elements = num_blocks * num_block_threads; + mem_size = num_elements * symbol_type_size; + } else { + FILE *f = fopen(file_name, "rb"); + if (!f) { + perror(file_name); + exit(1); + } + fseek(f, 0, SEEK_END); + mem_size = ftell(f); + fclose(f); + num_elements = mem_size / symbol_type_size; + // todo add check if we need 1 more block! 
+ num_blocks = num_elements / num_block_threads; + } +} + +inline void loadData(char *file_name, uint *sourceData, uint *codewords, + uint *codewordlens, uint num_elements, uint mem_size, + double &H) { + if (file_name == NULL) { + printf("No input file\n"); + exit(-1); + } else { + unsigned int freqs[UniqueSymbols] = {0}; + runHisto(file_name, freqs, mem_size, sourceData); + INode *root = BuildTree(freqs); + + HuffCodeMap codes; + GenerateCodes(root, HuffCode(), codes); + delete root; + + for (HuffCodeMap::const_iterator it = codes.begin(); it != codes.end(); + ++it) { + unsigned int count = distance(it->second.begin(), it->second.end()); + for (int i = 0; i < count; i++) + if (it->second[i]) + codewords[(unsigned int)(it->first)] += + (uint)pow(2.0f, (int)count - i - 1); + codewordlens[(unsigned int)(it->first)] = count; + } + + H = 0.0; + for (unsigned int i = 0; i < 256; i++) + if (freqs[i] > 0) { + double p = (double)freqs[i] / (double)mem_size; + H += p * log(p) / log(2.0); + } + H = -H; + printf("\n%s, %u bytes, entropy %f\n\n", file_name, mem_size, H); + } +} + +#endif diff --git a/examples/huffman/main_test_cu-cuda-nvptx64-nvidia-cuda-sm_61.ll b/examples/huffman/main_test_cu-cuda-nvptx64-nvidia-cuda-sm_61.ll new file mode 100644 index 0000000..47206d1 --- /dev/null +++ b/examples/huffman/main_test_cu-cuda-nvptx64-nvidia-cuda-sm_61.ll @@ -0,0 +1,1933 @@ +; ModuleID = 'main_test_cu-cuda-nvptx64-nvidia-cuda-sm_61.bc' +source_filename = "main_test_cu.cu" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +%struct.__cuda_builtin_threadIdx_t = type { i8 } +%struct.__cuda_builtin_blockIdx_t = type { i8 } +%struct.__cuda_builtin_blockDim_t = type { i8 } +%struct.__cuda_builtin_gridDim_t = type { i8 } +%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 } + +$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any + +$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv 
= comdat any + +$_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv = comdat any + +$_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv = comdat any + +@_ZZ12histo_kernelPhlPjE4temp = internal addrspace(3) global [256 x i32] undef, align 4 +@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1 +@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1 +@blockDim = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockDim_t, align 1 +@gridDim = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_gridDim_t, align 1 +@_ZZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_E2sm = internal addrspace(3) global [3072 x i32] undef, align 4 +@_ZZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_E5kcmax = internal addrspace(3) global i32 undef, align 4 +@_ZZL10uniformAddPjS_iiiE3uni = internal addrspace(3) global i32 undef, align 4 +@_ZZL7prescanILb1ELb0EEvPjPKjS0_iiiE6s_data = internal addrspace(3) global [3072 x i32] undef, align 4 +@_ZZL7prescanILb1ELb1EEvPjPKjS0_iiiE6s_data = internal addrspace(3) global [3072 x i32] undef, align 4 +@_ZZL7prescanILb0ELb0EEvPjPKjS0_iiiE6s_data = internal addrspace(3) global [3072 x i32] undef, align 4 +@_ZZL7prescanILb0ELb1EEvPjPKjS0_iiiE6s_data = internal addrspace(3) global [3072 x i32] undef, align 4 + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 { +entry: + %p.addr = alloca i8**, align 8 + %s.addr = alloca i64, align 8 + store i8** %p, i8*** %p.addr, align 8 + store i64 %s, i64* %s.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 { +entry: + %p.addr = alloca %struct.cudaFuncAttributes*, align 8 + %c.addr = alloca i8*, align 8 + store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8 + store i8* %c, i8** 
%c.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 { +entry: + %value.addr = alloca i32*, align 8 + %attr.addr = alloca i32, align 4 + %device.addr = alloca i32, align 4 + store i32* %value, i32** %value.addr, align 8 + store i32 %attr, i32* %attr.addr, align 4 + store i32 %device, i32* %device.addr, align 4 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaGetDevice(i32* %device) #0 { +entry: + %device.addr = alloca i32*, align 8 + store i32* %device, i32** %device.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 { +entry: + %numBlocks.addr = alloca i32*, align 8 + %func.addr = alloca i8*, align 8 + %blockSize.addr = alloca i32, align 4 + %dynamicSmemSize.addr = alloca i64, align 8 + store i32* %numBlocks, i32** %numBlocks.addr, align 8 + store i8* %func, i8** %func.addr, align 8 + store i32 %blockSize, i32* %blockSize.addr, align 4 + store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 { +entry: + %numBlocks.addr = alloca i32*, align 8 + %func.addr = alloca i8*, align 8 + %blockSize.addr = alloca i32, align 4 + %dynamicSmemSize.addr = alloca i64, align 8 + %flags.addr = alloca i32, align 4 + store i32* %numBlocks, i32** %numBlocks.addr, align 8 + store i8* %func, i8** %func.addr, align 8 + store i32 %blockSize, i32* %blockSize.addr, align 4 + store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8 + store i32 %flags, i32* %flags.addr, align 4 + 
ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define dso_local void @_Z12histo_kernelPhlPj(i8* %buffer, i64 %size, i32* %histo) #0 { +entry: + %buffer.addr = alloca i8*, align 8 + %size.addr = alloca i64, align 8 + %histo.addr = alloca i32*, align 8 + %i = alloca i32, align 4 + %offset = alloca i32, align 4 + store i8* %buffer, i8** %buffer.addr, align 8 + store i64 %size, i64* %size.addr, align 8 + store i32* %histo, i32** %histo.addr, align 8 + %call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + %idxprom = zext i32 %call to i64 + %arrayidx = getelementptr inbounds [256 x i32], [256 x i32]* addrspacecast ([256 x i32] addrspace(3)* @_ZZ12histo_kernelPhlPjE4temp to [256 x i32]*), i64 0, i64 %idxprom + store i32 0, i32* %arrayidx, align 4 + call void @llvm.nvvm.barrier0() + %call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + %call2 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2 + %call3 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2 + %mul = mul i32 %call2, %call3 + %add = add i32 %call1, %mul + store i32 %add, i32* %i, align 4 + %call4 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2 + %call5 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #2 + %mul6 = mul i32 %call4, %call5 + store i32 %mul6, i32* %offset, align 4 + br label %while.cond + +while.cond: ; preds = %while.body, %entry + %0 = load i32, i32* %i, align 4 + %conv = sext i32 %0 to i64 + %1 = load i64, i64* %size.addr, align 8 + %cmp = icmp slt i64 %conv, %1 + br i1 %cmp, label %while.body, label %while.end + +while.body: ; preds = %while.cond + %2 = load i8*, i8** %buffer.addr, align 8 + %3 = load i32, i32* %i, align 4 + %idxprom7 = sext i32 %3 to i64 + %arrayidx8 = getelementptr inbounds i8, i8* %2, i64 %idxprom7 + %4 = load i8, i8* %arrayidx8, align 1 + %idxprom9 = zext i8 %4 to i64 + %arrayidx10 = getelementptr inbounds [256 x i32], [256 x i32]* 
addrspacecast ([256 x i32] addrspace(3)* @_ZZ12histo_kernelPhlPjE4temp to [256 x i32]*), i64 0, i64 %idxprom9 + %call11 = call i32 @_ZL9atomicAddPjj(i32* %arrayidx10, i32 1) #2 + %5 = load i32, i32* %offset, align 4 + %6 = load i32, i32* %i, align 4 + %add12 = add nsw i32 %6, %5 + store i32 %add12, i32* %i, align 4 + br label %while.cond + +while.end: ; preds = %while.cond + call void @llvm.nvvm.barrier0() + %7 = load i32*, i32** %histo.addr, align 8 + %call13 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + %idxprom14 = zext i32 %call13 to i64 + %arrayidx15 = getelementptr inbounds i32, i32* %7, i64 %idxprom14 + %call16 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + %idxprom17 = zext i32 %call16 to i64 + %arrayidx18 = getelementptr inbounds [256 x i32], [256 x i32]* addrspacecast ([256 x i32] addrspace(3)* @_ZZ12histo_kernelPhlPjE4temp to [256 x i32]*), i64 0, i64 %idxprom17 + %8 = load i32, i32* %arrayidx18, align 4 + %call19 = call i32 @_ZL9atomicAddPjj(i32* %arrayidx15, i32 %8) #2 + ret void +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + ret i32 %0 +} + +; Function Attrs: convergent nounwind +declare void @llvm.nvvm.barrier0() #2 + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() + ret i32 %0 +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + ret i32 %0 +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() 
#1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() + ret i32 %0 +} + +; Function Attrs: convergent noinline nounwind optnone +define internal i32 @_ZL9atomicAddPjj(i32* %address, i32 %val) #0 { +entry: + %address.addr = alloca i32*, align 8 + %val.addr = alloca i32, align 4 + store i32* %address, i32** %address.addr, align 8 + store i32 %val, i32* %val.addr, align 4 + %0 = load i32*, i32** %address.addr, align 8 + %1 = load i32, i32* %val.addr, align 4 + %call = call i32 @_ZL12__uAtomicAddPjj(i32* %0, i32 %1) #2 + ret i32 %call +} + +; Function Attrs: convergent noinline nounwind optnone +define dso_local void @_ZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_(i32* %data, i32* %gm_codewords, i32* %gm_codewordlens, i32* %cw32, i32* %cw32len, i32* %cw32idx, i32* %out, i32* %outidx) #0 { +entry: + %data.addr = alloca i32*, align 8 + %gm_codewords.addr = alloca i32*, align 8 + %gm_codewordlens.addr = alloca i32*, align 8 + %cw32.addr = alloca i32*, align 8 + %cw32len.addr = alloca i32*, align 8 + %cw32idx.addr = alloca i32*, align 8 + %out.addr = alloca i32*, align 8 + %outidx.addr = alloca i32*, align 8 + %kn = alloca i32, align 4 + %k = alloca i32, align 4 + %kc = alloca i32, align 4 + %startbit = alloca i32, align 4 + %wrbits = alloca i32, align 4 + %cw64 = alloca i64, align 8 + %val32 = alloca i32, align 4 + %codewordlen = alloca i32, align 4 + %tmpbyte = alloca i8, align 1 + %tmpcwlen = alloca i8, align 1 + %tmpcw32 = alloca i32, align 4 + %codewords = alloca i32*, align 8 + %codewordlens = alloca i32*, align 8 + %as = alloca i32*, align 8 + %i = alloca i32, align 4 + %offset = alloca i32, align 4 + %d = alloca i32, align 4 + %ai = alloca i8, align 1 + %bi = alloca i8, align 1 + %d56 = alloca i32, align 4 + %ai64 = alloca i8, align 1 + %bi70 = alloca i8, align 1 + %t = alloca i32, align 4 + store i32* %data, i32** %data.addr, align 8 + store i32* %gm_codewords, i32** %gm_codewords.addr, align 8 + store i32* %gm_codewordlens, i32** 
%gm_codewordlens.addr, align 8 + store i32* %cw32, i32** %cw32.addr, align 8 + store i32* %cw32len, i32** %cw32len.addr, align 8 + store i32* %cw32idx, i32** %cw32idx.addr, align 8 + store i32* %out, i32** %out.addr, align 8 + store i32* %outidx, i32** %outidx.addr, align 8 + %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2 + %call1 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2 + %mul = mul i32 %call, %call1 + %call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + %add = add i32 %mul, %call2 + store i32 %add, i32* %kn, align 4 + %call3 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + store i32 %call3, i32* %k, align 4 + store i64 0, i64* %cw64, align 8 + store i32 0, i32* %codewordlen, align 4 + store i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_E2sm to [3072 x i32]*), i64 0, i64 0), i32** %codewords, align 8 + store i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_E2sm to [3072 x i32]*), i64 0, i64 256), i32** %codewordlens, align 8 + store i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_E2sm to [3072 x i32]*), i64 0, i64 512), i32** %as, align 8 + %0 = load i32*, i32** %gm_codewords.addr, align 8 + %1 = load i32, i32* %k, align 4 + %idxprom = zext i32 %1 to i64 + %arrayidx = getelementptr inbounds i32, i32* %0, i64 %idxprom + %2 = load i32, i32* %arrayidx, align 4 + %3 = load i32*, i32** %codewords, align 8 + %4 = load i32, i32* %k, align 4 + %idxprom4 = zext i32 %4 to i64 + %arrayidx5 = getelementptr inbounds i32, i32* %3, i64 %idxprom4 + store i32 %2, i32* %arrayidx5, align 4 + %5 = load i32*, i32** %gm_codewordlens.addr, align 8 + %6 = load i32, i32* %k, align 4 + 
%idxprom6 = zext i32 %6 to i64 + %arrayidx7 = getelementptr inbounds i32, i32* %5, i64 %idxprom6 + %7 = load i32, i32* %arrayidx7, align 4 + %8 = load i32*, i32** %codewordlens, align 8 + %9 = load i32, i32* %k, align 4 + %idxprom8 = zext i32 %9 to i64 + %arrayidx9 = getelementptr inbounds i32, i32* %8, i64 %idxprom8 + store i32 %7, i32* %arrayidx9, align 4 + %10 = load i32*, i32** %data.addr, align 8 + %11 = load i32, i32* %kn, align 4 + %idxprom10 = zext i32 %11 to i64 + %arrayidx11 = getelementptr inbounds i32, i32* %10, i64 %idxprom10 + %12 = load i32, i32* %arrayidx11, align 4 + store i32 %12, i32* %val32, align 4 + call void @llvm.nvvm.barrier0() + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %13 = load i32, i32* %i, align 4 + %cmp = icmp ult i32 %13, 4 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %14 = load i32, i32* %val32, align 4 + %15 = load i32, i32* %i, align 4 + %sub = sub i32 3, %15 + %mul12 = mul i32 %sub, 8 + %shr = lshr i32 %14, %mul12 + %conv = trunc i32 %shr to i8 + store i8 %conv, i8* %tmpbyte, align 1 + %16 = load i32*, i32** %codewords, align 8 + %17 = load i8, i8* %tmpbyte, align 1 + %idxprom13 = zext i8 %17 to i64 + %arrayidx14 = getelementptr inbounds i32, i32* %16, i64 %idxprom13 + %18 = load i32, i32* %arrayidx14, align 4 + store i32 %18, i32* %tmpcw32, align 4 + %19 = load i32*, i32** %codewordlens, align 8 + %20 = load i8, i8* %tmpbyte, align 1 + %idxprom15 = zext i8 %20 to i64 + %arrayidx16 = getelementptr inbounds i32, i32* %19, i64 %idxprom15 + %21 = load i32, i32* %arrayidx16, align 4 + %conv17 = trunc i32 %21 to i8 + store i8 %conv17, i8* %tmpcwlen, align 1 + %22 = load i64, i64* %cw64, align 8 + %23 = load i8, i8* %tmpcwlen, align 1 + %conv18 = zext i8 %23 to i32 + %sh_prom = zext i32 %conv18 to i64 + %shl = shl i64 %22, %sh_prom + %24 = load i32, i32* %tmpcw32, align 4 + %conv19 = zext i32 %24 to i64 + %or = or i64 %shl, %conv19 + store i64 %or, 
i64* %cw64, align 8 + %25 = load i8, i8* %tmpcwlen, align 1 + %conv20 = zext i8 %25 to i32 + %26 = load i32, i32* %codewordlen, align 4 + %add21 = add i32 %26, %conv20 + store i32 %add21, i32* %codewordlen, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %27 = load i32, i32* %i, align 4 + %inc = add i32 %27, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %28 = load i32, i32* %codewordlen, align 4 + %29 = load i32*, i32** %as, align 8 + %30 = load i32, i32* %k, align 4 + %idxprom22 = zext i32 %30 to i64 + %arrayidx23 = getelementptr inbounds i32, i32* %29, i64 %idxprom22 + store i32 %28, i32* %arrayidx23, align 4 + call void @llvm.nvvm.barrier0() + store i32 1, i32* %offset, align 4 + %call24 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2 + %shr25 = lshr i32 %call24, 1 + store i32 %shr25, i32* %d, align 4 + br label %for.cond26 + +for.cond26: ; preds = %for.inc46, %for.end + %31 = load i32, i32* %d, align 4 + %cmp27 = icmp ugt i32 %31, 0 + br i1 %cmp27, label %for.body28, label %for.end48 + +for.body28: ; preds = %for.cond26 + call void @llvm.nvvm.barrier0() + %32 = load i32, i32* %k, align 4 + %33 = load i32, i32* %d, align 4 + %cmp29 = icmp ult i32 %32, %33 + br i1 %cmp29, label %if.then, label %if.end + +if.then: ; preds = %for.body28 + %34 = load i32, i32* %offset, align 4 + %35 = load i32, i32* %k, align 4 + %mul30 = mul i32 2, %35 + %add31 = add i32 %mul30, 1 + %mul32 = mul i32 %34, %add31 + %sub33 = sub i32 %mul32, 1 + %conv34 = trunc i32 %sub33 to i8 + store i8 %conv34, i8* %ai, align 1 + %36 = load i32, i32* %offset, align 4 + %37 = load i32, i32* %k, align 4 + %mul35 = mul i32 2, %37 + %add36 = add i32 %mul35, 2 + %mul37 = mul i32 %36, %add36 + %sub38 = sub i32 %mul37, 1 + %conv39 = trunc i32 %sub38 to i8 + store i8 %conv39, i8* %bi, align 1 + %38 = load i32*, i32** %as, align 8 + %39 = load i8, i8* %ai, align 1 + %idxprom40 = zext i8 %39 to i64 + %arrayidx41 = getelementptr 
inbounds i32, i32* %38, i64 %idxprom40 + %40 = load i32, i32* %arrayidx41, align 4 + %41 = load i32*, i32** %as, align 8 + %42 = load i8, i8* %bi, align 1 + %idxprom42 = zext i8 %42 to i64 + %arrayidx43 = getelementptr inbounds i32, i32* %41, i64 %idxprom42 + %43 = load i32, i32* %arrayidx43, align 4 + %add44 = add i32 %43, %40 + store i32 %add44, i32* %arrayidx43, align 4 + br label %if.end + +if.end: ; preds = %if.then, %for.body28 + %44 = load i32, i32* %offset, align 4 + %mul45 = mul i32 %44, 2 + store i32 %mul45, i32* %offset, align 4 + br label %for.inc46 + +for.inc46: ; preds = %if.end + %45 = load i32, i32* %d, align 4 + %shr47 = lshr i32 %45, 1 + store i32 %shr47, i32* %d, align 4 + br label %for.cond26 + +for.end48: ; preds = %for.cond26 + %46 = load i32, i32* %k, align 4 + %cmp49 = icmp eq i32 %46, 0 + br i1 %cmp49, label %if.then50, label %if.end55 + +if.then50: ; preds = %for.end48 + %47 = load i32*, i32** %as, align 8 + %call51 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2 + %sub52 = sub i32 %call51, 1 + %idxprom53 = zext i32 %sub52 to i64 + %arrayidx54 = getelementptr inbounds i32, i32* %47, i64 %idxprom53 + store i32 0, i32* %arrayidx54, align 4 + br label %if.end55 + +if.end55: ; preds = %if.then50, %for.end48 + store i32 1, i32* %d56, align 4 + br label %for.cond57 + +for.cond57: ; preds = %for.inc86, %if.end55 + %48 = load i32, i32* %d56, align 4 + %call58 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2 + %cmp59 = icmp ult i32 %48, %call58 + br i1 %cmp59, label %for.body60, label %for.end88 + +for.body60: ; preds = %for.cond57 + %49 = load i32, i32* %offset, align 4 + %shr61 = lshr i32 %49, 1 + store i32 %shr61, i32* %offset, align 4 + call void @llvm.nvvm.barrier0() + %50 = load i32, i32* %k, align 4 + %51 = load i32, i32* %d56, align 4 + %cmp62 = icmp ult i32 %50, %51 + br i1 %cmp62, label %if.then63, label %if.end85 + +if.then63: ; preds = %for.body60 + %52 = load i32, i32* %offset, align 4 + %53 = 
load i32, i32* %k, align 4 + %mul65 = mul i32 2, %53 + %add66 = add i32 %mul65, 1 + %mul67 = mul i32 %52, %add66 + %sub68 = sub i32 %mul67, 1 + %conv69 = trunc i32 %sub68 to i8 + store i8 %conv69, i8* %ai64, align 1 + %54 = load i32, i32* %offset, align 4 + %55 = load i32, i32* %k, align 4 + %mul71 = mul i32 2, %55 + %add72 = add i32 %mul71, 2 + %mul73 = mul i32 %54, %add72 + %sub74 = sub i32 %mul73, 1 + %conv75 = trunc i32 %sub74 to i8 + store i8 %conv75, i8* %bi70, align 1 + %56 = load i32*, i32** %as, align 8 + %57 = load i8, i8* %ai64, align 1 + %idxprom76 = zext i8 %57 to i64 + %arrayidx77 = getelementptr inbounds i32, i32* %56, i64 %idxprom76 + %58 = load i32, i32* %arrayidx77, align 4 + store i32 %58, i32* %t, align 4 + %59 = load i32*, i32** %as, align 8 + %60 = load i8, i8* %bi70, align 1 + %idxprom78 = zext i8 %60 to i64 + %arrayidx79 = getelementptr inbounds i32, i32* %59, i64 %idxprom78 + %61 = load i32, i32* %arrayidx79, align 4 + %62 = load i32*, i32** %as, align 8 + %63 = load i8, i8* %ai64, align 1 + %idxprom80 = zext i8 %63 to i64 + %arrayidx81 = getelementptr inbounds i32, i32* %62, i64 %idxprom80 + store i32 %61, i32* %arrayidx81, align 4 + %64 = load i32, i32* %t, align 4 + %65 = load i32*, i32** %as, align 8 + %66 = load i8, i8* %bi70, align 1 + %idxprom82 = zext i8 %66 to i64 + %arrayidx83 = getelementptr inbounds i32, i32* %65, i64 %idxprom82 + %67 = load i32, i32* %arrayidx83, align 4 + %add84 = add i32 %67, %64 + store i32 %add84, i32* %arrayidx83, align 4 + br label %if.end85 + +if.end85: ; preds = %if.then63, %for.body60 + br label %for.inc86 + +for.inc86: ; preds = %if.end85 + %68 = load i32, i32* %d56, align 4 + %mul87 = mul i32 %68, 2 + store i32 %mul87, i32* %d56, align 4 + br label %for.cond57 + +for.end88: ; preds = %for.cond57 + call void @llvm.nvvm.barrier0() + %69 = load i32, i32* %k, align 4 + %call89 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2 + %sub90 = sub i32 %call89, 1 + %cmp91 = icmp eq i32 %69, 
%sub90 + br i1 %cmp91, label %if.then92, label %if.end102 + +if.then92: ; preds = %for.end88 + %70 = load i32*, i32** %as, align 8 + %71 = load i32, i32* %k, align 4 + %idxprom93 = zext i32 %71 to i64 + %arrayidx94 = getelementptr inbounds i32, i32* %70, i64 %idxprom93 + %72 = load i32, i32* %arrayidx94, align 4 + %73 = load i32, i32* %codewordlen, align 4 + %add95 = add i32 %72, %73 + %74 = load i32*, i32** %outidx.addr, align 8 + %call96 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2 + %idxprom97 = zext i32 %call96 to i64 + %arrayidx98 = getelementptr inbounds i32, i32* %74, i64 %idxprom97 + store i32 %add95, i32* %arrayidx98, align 4 + %75 = load i32*, i32** %as, align 8 + %76 = load i32, i32* %k, align 4 + %idxprom99 = zext i32 %76 to i64 + %arrayidx100 = getelementptr inbounds i32, i32* %75, i64 %idxprom99 + %77 = load i32, i32* %arrayidx100, align 4 + %78 = load i32, i32* %codewordlen, align 4 + %add101 = add i32 %77, %78 + %div = udiv i32 %add101, 32 + store i32 %div, i32* addrspacecast (i32 addrspace(3)* @_ZZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_E5kcmax to i32*), align 4 + br label %if.end102 + +if.end102: ; preds = %if.then92, %for.end88 + %79 = load i32*, i32** %as, align 8 + %80 = load i32, i32* %k, align 4 + %idxprom103 = zext i32 %80 to i64 + %arrayidx104 = getelementptr inbounds i32, i32* %79, i64 %idxprom103 + %81 = load i32, i32* %arrayidx104, align 4 + %div105 = udiv i32 %81, 32 + store i32 %div105, i32* %kc, align 4 + %82 = load i32*, i32** %as, align 8 + %83 = load i32, i32* %k, align 4 + %idxprom106 = zext i32 %83 to i64 + %arrayidx107 = getelementptr inbounds i32, i32* %82, i64 %idxprom106 + %84 = load i32, i32* %arrayidx107, align 4 + %rem = urem i32 %84, 32 + store i32 %rem, i32* %startbit, align 4 + %85 = load i32*, i32** %as, align 8 + %86 = load i32, i32* %k, align 4 + %idxprom108 = zext i32 %86 to i64 + %arrayidx109 = getelementptr inbounds i32, i32* %85, i64 %idxprom108 + store i32 0, i32* %arrayidx109, 
align 4 + call void @llvm.nvvm.barrier0() + %87 = load i32, i32* %codewordlen, align 4 + %88 = load i32, i32* %startbit, align 4 + %sub110 = sub i32 32, %88 + %cmp111 = icmp ugt i32 %87, %sub110 + br i1 %cmp111, label %cond.true, label %cond.false + +cond.true: ; preds = %if.end102 + %89 = load i32, i32* %startbit, align 4 + %sub112 = sub i32 32, %89 + br label %cond.end + +cond.false: ; preds = %if.end102 + %90 = load i32, i32* %codewordlen, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ %sub112, %cond.true ], [ %90, %cond.false ] + store i32 %cond, i32* %wrbits, align 4 + %91 = load i64, i64* %cw64, align 8 + %92 = load i32, i32* %codewordlen, align 4 + %93 = load i32, i32* %wrbits, align 4 + %sub113 = sub i32 %92, %93 + %sh_prom114 = zext i32 %sub113 to i64 + %shr115 = lshr i64 %91, %sh_prom114 + %conv116 = trunc i64 %shr115 to i32 + store i32 %conv116, i32* %tmpcw32, align 4 + %94 = load i32*, i32** %as, align 8 + %95 = load i32, i32* %kc, align 4 + %idxprom117 = zext i32 %95 to i64 + %arrayidx118 = getelementptr inbounds i32, i32* %94, i64 %idxprom117 + %96 = load i32, i32* %tmpcw32, align 4 + %97 = load i32, i32* %startbit, align 4 + %sub119 = sub i32 32, %97 + %98 = load i32, i32* %wrbits, align 4 + %sub120 = sub i32 %sub119, %98 + %shl121 = shl i32 %96, %sub120 + %call122 = call i32 @_ZL8atomicOrPjj(i32* %arrayidx118, i32 %shl121) #2 + %99 = load i32, i32* %wrbits, align 4 + %100 = load i32, i32* %codewordlen, align 4 + %sub123 = sub i32 %100, %99 + store i32 %sub123, i32* %codewordlen, align 4 + %101 = load i32, i32* %codewordlen, align 4 + %tobool = icmp ne i32 %101, 0 + br i1 %tobool, label %if.then124, label %if.end143 + +if.then124: ; preds = %cond.end + %102 = load i32, i32* %codewordlen, align 4 + %cmp125 = icmp ugt i32 %102, 32 + br i1 %cmp125, label %cond.true126, label %cond.false127 + +cond.true126: ; preds = %if.then124 + br label %cond.end128 + +cond.false127: ; preds = %if.then124 + %103 = load 
i32, i32* %codewordlen, align 4 + br label %cond.end128 + +cond.end128: ; preds = %cond.false127, %cond.true126 + %cond129 = phi i32 [ 32, %cond.true126 ], [ %103, %cond.false127 ] + store i32 %cond129, i32* %wrbits, align 4 + %104 = load i64, i64* %cw64, align 8 + %105 = load i32, i32* %codewordlen, align 4 + %106 = load i32, i32* %wrbits, align 4 + %sub130 = sub i32 %105, %106 + %sh_prom131 = zext i32 %sub130 to i64 + %shr132 = lshr i64 %104, %sh_prom131 + %conv133 = trunc i64 %shr132 to i32 + %107 = load i32, i32* %wrbits, align 4 + %shl134 = shl i32 1, %107 + %sub135 = sub nsw i32 %shl134, 1 + %and = and i32 %conv133, %sub135 + store i32 %and, i32* %tmpcw32, align 4 + %108 = load i32*, i32** %as, align 8 + %109 = load i32, i32* %kc, align 4 + %add136 = add i32 %109, 1 + %idxprom137 = zext i32 %add136 to i64 + %arrayidx138 = getelementptr inbounds i32, i32* %108, i64 %idxprom137 + %110 = load i32, i32* %tmpcw32, align 4 + %111 = load i32, i32* %wrbits, align 4 + %sub139 = sub i32 32, %111 + %shl140 = shl i32 %110, %sub139 + %call141 = call i32 @_ZL8atomicOrPjj(i32* %arrayidx138, i32 %shl140) #2 + %112 = load i32, i32* %wrbits, align 4 + %113 = load i32, i32* %codewordlen, align 4 + %sub142 = sub i32 %113, %112 + store i32 %sub142, i32* %codewordlen, align 4 + br label %if.end143 + +if.end143: ; preds = %cond.end128, %cond.end + %114 = load i32, i32* %codewordlen, align 4 + %tobool144 = icmp ne i32 %114, 0 + br i1 %tobool144, label %if.then145, label %if.end157 + +if.then145: ; preds = %if.end143 + %115 = load i64, i64* %cw64, align 8 + %116 = load i32, i32* %codewordlen, align 4 + %shl146 = shl i32 1, %116 + %sub147 = sub nsw i32 %shl146, 1 + %conv148 = sext i32 %sub147 to i64 + %and149 = and i64 %115, %conv148 + %conv150 = trunc i64 %and149 to i32 + store i32 %conv150, i32* %tmpcw32, align 4 + %117 = load i32*, i32** %as, align 8 + %118 = load i32, i32* %kc, align 4 + %add151 = add i32 %118, 2 + %idxprom152 = zext i32 %add151 to i64 + %arrayidx153 = 
getelementptr inbounds i32, i32* %117, i64 %idxprom152 + %119 = load i32, i32* %tmpcw32, align 4 + %120 = load i32, i32* %codewordlen, align 4 + %sub154 = sub i32 32, %120 + %shl155 = shl i32 %119, %sub154 + %call156 = call i32 @_ZL8atomicOrPjj(i32* %arrayidx153, i32 %shl155) #2 + br label %if.end157 + +if.end157: ; preds = %if.then145, %if.end143 + call void @llvm.nvvm.barrier0() + %121 = load i32, i32* %k, align 4 + %122 = load i32, i32* addrspacecast (i32 addrspace(3)* @_ZZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_E5kcmax to i32*), align 4 + %cmp158 = icmp ule i32 %121, %122 + br i1 %cmp158, label %if.then159, label %if.end164 + +if.then159: ; preds = %if.end157 + %123 = load i32*, i32** %as, align 8 + %124 = load i32, i32* %k, align 4 + %idxprom160 = zext i32 %124 to i64 + %arrayidx161 = getelementptr inbounds i32, i32* %123, i64 %idxprom160 + %125 = load i32, i32* %arrayidx161, align 4 + %126 = load i32*, i32** %out.addr, align 8 + %127 = load i32, i32* %kn, align 4 + %idxprom162 = zext i32 %127 to i64 + %arrayidx163 = getelementptr inbounds i32, i32* %126, i64 %idxprom162 + store i32 %125, i32* %arrayidx163, align 4 + br label %if.end164 + +if.end164: ; preds = %if.then159, %if.end157 + ret void +} + +; Function Attrs: convergent noinline nounwind optnone +define internal i32 @_ZL8atomicOrPjj(i32* %address, i32 %val) #0 { +entry: + %address.addr = alloca i32*, align 8 + %val.addr = alloca i32, align 4 + store i32* %address, i32** %address.addr, align 8 + store i32 %val, i32* %val.addr, align 4 + %0 = load i32*, i32** %address.addr, align 8 + %1 = load i32, i32* %val.addr, align 4 + %call = call i32 @_ZL11__uAtomicOrPjj(i32* %0, i32 %1) #2 + ret i32 %call +} + +; Function Attrs: convergent noinline nounwind optnone +define dso_local void @_ZL5pack2PjS_S_S_j(i32* %srcData, i32* %cindex, i32* %cindex2, i32* %dstData, i32 %original_num_block_elements) #0 { +entry: + %srcData.addr = alloca i32*, align 8 + %cindex.addr = alloca i32*, align 8 + %cindex2.addr = 
alloca i32*, align 8 + %dstData.addr = alloca i32*, align 8 + %original_num_block_elements.addr = alloca i32, align 4 + %tid = alloca i32, align 4 + %offset = alloca i32, align 4 + %bitsize = alloca i32, align 4 + %pos = alloca i32, align 4 + %dword = alloca i32, align 4 + %bit = alloca i32, align 4 + %i = alloca i32, align 4 + %dw = alloca i32, align 4 + %tmp = alloca i32, align 4 + store i32* %srcData, i32** %srcData.addr, align 8 + store i32* %cindex, i32** %cindex.addr, align 8 + store i32* %cindex2, i32** %cindex2.addr, align 8 + store i32* %dstData, i32** %dstData.addr, align 8 + store i32 %original_num_block_elements, i32* %original_num_block_elements.addr, align 4 + %call = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2 + %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2 + %mul = mul i32 %call, %call1 + %call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + %add = add i32 %mul, %call2 + store i32 %add, i32* %tid, align 4 + %0 = load i32, i32* %tid, align 4 + %1 = load i32, i32* %original_num_block_elements.addr, align 4 + %mul3 = mul i32 %0, %1 + store i32 %mul3, i32* %offset, align 4 + %2 = load i32*, i32** %cindex.addr, align 8 + %3 = load i32, i32* %tid, align 4 + %idxprom = zext i32 %3 to i64 + %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom + %4 = load i32, i32* %arrayidx, align 4 + store i32 %4, i32* %bitsize, align 4 + %5 = load i32*, i32** %cindex2.addr, align 8 + %6 = load i32, i32* %tid, align 4 + %idxprom4 = zext i32 %6 to i64 + %arrayidx5 = getelementptr inbounds i32, i32* %5, i64 %idxprom4 + %7 = load i32, i32* %arrayidx5, align 4 + store i32 %7, i32* %pos, align 4 + %8 = load i32, i32* %pos, align 4 + %div = udiv i32 %8, 32 + store i32 %div, i32* %dword, align 4 + %9 = load i32, i32* %pos, align 4 + %rem = urem i32 %9, 32 + store i32 %rem, i32* %bit, align 4 + %10 = load i32*, i32** %srcData.addr, align 8 + %11 = load i32, i32* %offset, align 4 + %idxprom6 = 
zext i32 %11 to i64 + %arrayidx7 = getelementptr inbounds i32, i32* %10, i64 %idxprom6 + %12 = load i32, i32* %arrayidx7, align 4 + store i32 %12, i32* %dw, align 4 + %13 = load i32, i32* %dw, align 4 + %14 = load i32, i32* %bit, align 4 + %shr = lshr i32 %13, %14 + store i32 %shr, i32* %tmp, align 4 + %15 = load i32*, i32** %dstData.addr, align 8 + %16 = load i32, i32* %dword, align 4 + %idxprom8 = zext i32 %16 to i64 + %arrayidx9 = getelementptr inbounds i32, i32* %15, i64 %idxprom8 + %17 = load i32, i32* %tmp, align 4 + %call10 = call i32 @_ZL8atomicOrPjj(i32* %arrayidx9, i32 %17) #2 + %18 = load i32, i32* %bit, align 4 + %cmp = icmp eq i32 %18, 0 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + %19 = load i32, i32* %dw, align 4 + %20 = load i32, i32* %bit, align 4 + %sub = sub i32 32, %20 + %shl = shl i32 %19, %sub + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 0, %cond.true ], [ %shl, %cond.false ] + store i32 %cond, i32* %tmp, align 4 + store i32 1, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %cond.end + %21 = load i32, i32* %i, align 4 + %22 = load i32, i32* %bitsize, align 4 + %div11 = udiv i32 %22, 32 + %cmp12 = icmp ult i32 %21, %div11 + br i1 %cmp12, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %23 = load i32*, i32** %srcData.addr, align 8 + %24 = load i32, i32* %offset, align 4 + %25 = load i32, i32* %i, align 4 + %add13 = add i32 %24, %25 + %idxprom14 = zext i32 %add13 to i64 + %arrayidx15 = getelementptr inbounds i32, i32* %23, i64 %idxprom14 + %26 = load i32, i32* %arrayidx15, align 4 + store i32 %26, i32* %dw, align 4 + %27 = load i32, i32* %dw, align 4 + %28 = load i32, i32* %bit, align 4 + %shr16 = lshr i32 %27, %28 + %29 = load i32, i32* %tmp, align 4 + %or = or i32 %29, %shr16 + store i32 %or, i32* %tmp, align 4 + %30 = load i32, i32* %tmp, align 4 + %31 = load i32*, 
i32** %dstData.addr, align 8 + %32 = load i32, i32* %dword, align 4 + %33 = load i32, i32* %i, align 4 + %add17 = add i32 %32, %33 + %idxprom18 = zext i32 %add17 to i64 + %arrayidx19 = getelementptr inbounds i32, i32* %31, i64 %idxprom18 + store i32 %30, i32* %arrayidx19, align 4 + %34 = load i32, i32* %bit, align 4 + %cmp20 = icmp eq i32 %34, 0 + br i1 %cmp20, label %cond.true21, label %cond.false22 + +cond.true21: ; preds = %for.body + br label %cond.end25 + +cond.false22: ; preds = %for.body + %35 = load i32, i32* %dw, align 4 + %36 = load i32, i32* %bit, align 4 + %sub23 = sub i32 32, %36 + %shl24 = shl i32 %35, %sub23 + br label %cond.end25 + +cond.end25: ; preds = %cond.false22, %cond.true21 + %cond26 = phi i32 [ 0, %cond.true21 ], [ %shl24, %cond.false22 ] + store i32 %cond26, i32* %tmp, align 4 + br label %for.inc + +for.inc: ; preds = %cond.end25 + %37 = load i32, i32* %i, align 4 + %inc = add i32 %37, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %38 = load i32, i32* %bit, align 4 + %cmp27 = icmp ne i32 %38, 0 + br i1 %cmp27, label %if.then, label %lor.lhs.false + +lor.lhs.false: ; preds = %for.end + %39 = load i32, i32* %bitsize, align 4 + %rem28 = urem i32 %39, 32 + %cmp29 = icmp ne i32 %rem28, 0 + br i1 %cmp29, label %if.then, label %if.end + +if.then: ; preds = %lor.lhs.false, %for.end + %40 = load i32*, i32** %dstData.addr, align 8 + %41 = load i32, i32* %dword, align 4 + %42 = load i32, i32* %i, align 4 + %add30 = add i32 %41, %42 + %idxprom31 = zext i32 %add30 to i64 + %arrayidx32 = getelementptr inbounds i32, i32* %40, i64 %idxprom31 + %43 = load i32, i32* %tmp, align 4 + %call33 = call i32 @_ZL8atomicOrPjj(i32* %arrayidx32, i32 %43) #2 + br label %if.end + +if.end: ; preds = %if.then, %lor.lhs.false + %44 = load i32, i32* %bitsize, align 4 + %rem34 = urem i32 %44, 32 + %cmp35 = icmp ne i32 %rem34, 0 + br i1 %cmp35, label %if.then36, label %if.end57 + +if.then36: ; preds = %if.end + %45 = load i32*, 
i32** %srcData.addr, align 8 + %46 = load i32, i32* %offset, align 4 + %47 = load i32, i32* %i, align 4 + %add37 = add i32 %46, %47 + %idxprom38 = zext i32 %add37 to i64 + %arrayidx39 = getelementptr inbounds i32, i32* %45, i64 %idxprom38 + %48 = load i32, i32* %arrayidx39, align 4 + store i32 %48, i32* %dw, align 4 + %49 = load i32*, i32** %dstData.addr, align 8 + %50 = load i32, i32* %dword, align 4 + %51 = load i32, i32* %i, align 4 + %add40 = add i32 %50, %51 + %idxprom41 = zext i32 %add40 to i64 + %arrayidx42 = getelementptr inbounds i32, i32* %49, i64 %idxprom41 + %52 = load i32, i32* %dw, align 4 + %53 = load i32, i32* %bit, align 4 + %shr43 = lshr i32 %52, %53 + %call44 = call i32 @_ZL8atomicOrPjj(i32* %arrayidx42, i32 %shr43) #2 + %54 = load i32*, i32** %dstData.addr, align 8 + %55 = load i32, i32* %dword, align 4 + %56 = load i32, i32* %i, align 4 + %add45 = add i32 %55, %56 + %add46 = add i32 %add45, 1 + %idxprom47 = zext i32 %add46 to i64 + %arrayidx48 = getelementptr inbounds i32, i32* %54, i64 %idxprom47 + %57 = load i32, i32* %bit, align 4 + %cmp49 = icmp eq i32 %57, 0 + br i1 %cmp49, label %cond.true50, label %cond.false51 + +cond.true50: ; preds = %if.then36 + br label %cond.end54 + +cond.false51: ; preds = %if.then36 + %58 = load i32, i32* %dw, align 4 + %59 = load i32, i32* %bit, align 4 + %sub52 = sub i32 32, %59 + %shl53 = shl i32 %58, %sub52 + br label %cond.end54 + +cond.end54: ; preds = %cond.false51, %cond.true50 + %cond55 = phi i32 [ 0, %cond.true50 ], [ %shl53, %cond.false51 ] + %call56 = call i32 @_ZL8atomicOrPjj(i32* %arrayidx48, i32 %cond55) #2 + br label %if.end57 + +if.end57: ; preds = %cond.end54, %if.end + ret void +} + +; Function Attrs: convergent noinline nounwind optnone +define dso_local void @_ZL10uniformAddPjS_iii(i32* %g_data, i32* %uniforms, i32 %n, i32 %blockOffset, i32 %baseIndex) #0 { +entry: + %g_data.addr = alloca i32*, align 8 + %uniforms.addr = alloca i32*, align 8 + %n.addr = alloca i32, align 4 + %blockOffset.addr 
= alloca i32, align 4 + %baseIndex.addr = alloca i32, align 4 + %address = alloca i32, align 4 + store i32* %g_data, i32** %g_data.addr, align 8 + store i32* %uniforms, i32** %uniforms.addr, align 8 + store i32 %n, i32* %n.addr, align 4 + store i32 %blockOffset, i32* %blockOffset.addr, align 4 + store i32 %baseIndex, i32* %baseIndex.addr, align 4 + %call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + %cmp = icmp eq i32 %call, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %0 = load i32*, i32** %uniforms.addr, align 8 + %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2 + %1 = load i32, i32* %blockOffset.addr, align 4 + %add = add i32 %call1, %1 + %idxprom = zext i32 %add to i64 + %arrayidx = getelementptr inbounds i32, i32* %0, i64 %idxprom + %2 = load i32, i32* %arrayidx, align 4 + store i32 %2, i32* addrspacecast (i32 addrspace(3)* @_ZZL10uniformAddPjS_iiiE3uni to i32*), align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + %call2 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2 + %call3 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2 + %shl = shl i32 %call3, 1 + %call4 = call i32 @_ZL7__mul24ii(i32 %call2, i32 %shl) #2 + %3 = load i32, i32* %baseIndex.addr, align 4 + %add5 = add nsw i32 %call4, %3 + %call6 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + %add7 = add i32 %add5, %call6 + store i32 %add7, i32* %address, align 4 + call void @llvm.nvvm.barrier0() + %4 = load i32, i32* addrspacecast (i32 addrspace(3)* @_ZZL10uniformAddPjS_iiiE3uni to i32*), align 4 + %5 = load i32*, i32** %g_data.addr, align 8 + %6 = load i32, i32* %address, align 4 + %idxprom8 = zext i32 %6 to i64 + %arrayidx9 = getelementptr inbounds i32, i32* %5, i64 %idxprom8 + %7 = load i32, i32* %arrayidx9, align 4 + %add10 = add i32 %7, %4 + store i32 %add10, i32* %arrayidx9, align 4 + %call11 = call i32 
@_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + %call12 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2 + %add13 = add i32 %call11, %call12 + %8 = load i32, i32* %n.addr, align 4 + %cmp14 = icmp ult i32 %add13, %8 + %conv = zext i1 %cmp14 to i32 + %9 = load i32, i32* addrspacecast (i32 addrspace(3)* @_ZZL10uniformAddPjS_iiiE3uni to i32*), align 4 + %mul = mul i32 %conv, %9 + %10 = load i32*, i32** %g_data.addr, align 8 + %11 = load i32, i32* %address, align 4 + %call15 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2 + %add16 = add i32 %11, %call15 + %idxprom17 = zext i32 %add16 to i64 + %arrayidx18 = getelementptr inbounds i32, i32* %10, i64 %idxprom17 + %12 = load i32, i32* %arrayidx18, align 4 + %add19 = add i32 %12, %mul + store i32 %add19, i32* %arrayidx18, align 4 + ret void +} + +; Function Attrs: alwaysinline convergent nounwind +define internal i32 @_ZL7__mul24ii(i32 %__a, i32 %__b) #1 { +entry: + %__a.addr = alloca i32, align 4 + %__b.addr = alloca i32, align 4 + store i32 %__a, i32* %__a.addr, align 4 + store i32 %__b, i32* %__b.addr, align 4 + %0 = load i32, i32* %__a.addr, align 4 + %1 = load i32, i32* %__b.addr, align 4 + %call = call i32 @__nv_mul24(i32 %0, i32 %1) #2 + ret i32 %call +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #3 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #3 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() #3 + +; Function Attrs: alwaysinline convergent nounwind +define internal i32 @_ZL12__uAtomicAddPjj(i32* %__p, i32 %__v) #1 { +entry: + %__p.addr = alloca i32*, align 8 + %__v.addr = alloca i32, align 4 + store i32* %__p, i32** %__p.addr, align 8 + store i32 %__v, i32* %__v.addr, align 4 + %0 = load i32*, i32** %__p.addr, align 8 + %1 = load i32, i32* %__v.addr, 
align 4 + %2 = atomicrmw add i32* %0, i32 %1 seq_cst + ret i32 %2 +} + +; Function Attrs: alwaysinline convergent nounwind +define internal i32 @_ZL11__uAtomicOrPjj(i32* %__p, i32 %__v) #1 { +entry: + %__p.addr = alloca i32*, align 8 + %__v.addr = alloca i32, align 4 + store i32* %__p, i32** %__p.addr, align 8 + store i32 %__v, i32* %__v.addr, align 4 + %0 = load i32*, i32** %__p.addr, align 8 + %1 = load i32, i32* %__v.addr, align 4 + %2 = atomicrmw or i32* %0, i32 %1 seq_cst + ret i32 %2 +} + +; Function Attrs: convergent noinline nounwind optnone +define dso_local void @_ZL7prescanILb1ELb0EEvPjPKjS0_iii(i32* %g_odata, i32* %g_idata, i32* %g_blockSums, i32 %n, i32 %blockIndex, i32 %baseIndex) #0 { +entry: + %g_odata.addr = alloca i32*, align 8 + %g_idata.addr = alloca i32*, align 8 + %g_blockSums.addr = alloca i32*, align 8 + %n.addr = alloca i32, align 4 + %blockIndex.addr = alloca i32, align 4 + %baseIndex.addr = alloca i32, align 4 + %ai = alloca i32, align 4 + %bi = alloca i32, align 4 + %mem_ai = alloca i32, align 4 + %mem_bi = alloca i32, align 4 + %bankOffsetA = alloca i32, align 4 + %bankOffsetB = alloca i32, align 4 + store i32* %g_odata, i32** %g_odata.addr, align 8 + store i32* %g_idata, i32** %g_idata.addr, align 8 + store i32* %g_blockSums, i32** %g_blockSums.addr, align 8 + store i32 %n, i32* %n.addr, align 4 + store i32 %blockIndex, i32* %blockIndex.addr, align 4 + store i32 %baseIndex, i32* %baseIndex.addr, align 4 + %0 = load i32*, i32** %g_idata.addr, align 8 + %1 = load i32, i32* %n.addr, align 4 + %2 = load i32, i32* %baseIndex.addr, align 4 + %cmp = icmp eq i32 %2, 0 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2 + %call1 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2 + %shl = shl i32 %call1, 1 + %call2 = call i32 @_ZL7__mul24ii(i32 %call, i32 %shl) #2 + br label %cond.end + +cond.false: ; preds = %entry + %3 = 
load i32, i32* %baseIndex.addr, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ %call2, %cond.true ], [ %3, %cond.false ] + call void @_ZL22loadSharedChunkFromMemILb0EEvPjPKjiiRiS3_S3_S3_S3_S3_(i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL7prescanILb1ELb0EEvPjPKjS0_iiiE6s_data to [3072 x i32]*), i64 0, i64 0), i32* %0, i32 %1, i32 %cond, i32* dereferenceable(4) %ai, i32* dereferenceable(4) %bi, i32* dereferenceable(4) %mem_ai, i32* dereferenceable(4) %mem_bi, i32* dereferenceable(4) %bankOffsetA, i32* dereferenceable(4) %bankOffsetB) #2 + %4 = load i32, i32* %blockIndex.addr, align 4 + %5 = load i32*, i32** %g_blockSums.addr, align 8 + call void @_ZL12prescanBlockILb1EEvPjiS0_(i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL7prescanILb1ELb0EEvPjPKjS0_iiiE6s_data to [3072 x i32]*), i64 0, i64 0), i32 %4, i32* %5) #2 + %6 = load i32*, i32** %g_odata.addr, align 8 + %7 = load i32, i32* %n.addr, align 4 + %8 = load i32, i32* %ai, align 4 + %9 = load i32, i32* %bi, align 4 + %10 = load i32, i32* %mem_ai, align 4 + %11 = load i32, i32* %mem_bi, align 4 + %12 = load i32, i32* %bankOffsetA, align 4 + %13 = load i32, i32* %bankOffsetB, align 4 + call void @_ZL21storeSharedChunkToMemILb0EEvPjPKjiiiiiii(i32* %6, i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL7prescanILb1ELb0EEvPjPKjS0_iiiE6s_data to [3072 x i32]*), i64 0, i64 0), i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13) #2 + ret void +} + +; Function Attrs: convergent noinline nounwind optnone +define internal void @_ZL22loadSharedChunkFromMemILb0EEvPjPKjiiRiS3_S3_S3_S3_S3_(i32* %s_data, i32* %g_idata, i32 %n, i32 %baseIndex, i32* dereferenceable(4) %ai, i32* dereferenceable(4) %bi, i32* dereferenceable(4) %mem_ai, i32* dereferenceable(4) %mem_bi, i32* dereferenceable(4) %bankOffsetA, i32* 
dereferenceable(4) %bankOffsetB) #0 { +entry: + %s_data.addr = alloca i32*, align 8 + %g_idata.addr = alloca i32*, align 8 + %n.addr = alloca i32, align 4 + %baseIndex.addr = alloca i32, align 4 + %ai.addr = alloca i32*, align 8 + %bi.addr = alloca i32*, align 8 + %mem_ai.addr = alloca i32*, align 8 + %mem_bi.addr = alloca i32*, align 8 + %bankOffsetA.addr = alloca i32*, align 8 + %bankOffsetB.addr = alloca i32*, align 8 + %thid = alloca i32, align 4 + store i32* %s_data, i32** %s_data.addr, align 8 + store i32* %g_idata, i32** %g_idata.addr, align 8 + store i32 %n, i32* %n.addr, align 4 + store i32 %baseIndex, i32* %baseIndex.addr, align 4 + store i32* %ai, i32** %ai.addr, align 8 + store i32* %bi, i32** %bi.addr, align 8 + store i32* %mem_ai, i32** %mem_ai.addr, align 8 + store i32* %mem_bi, i32** %mem_bi.addr, align 8 + store i32* %bankOffsetA, i32** %bankOffsetA.addr, align 8 + store i32* %bankOffsetB, i32** %bankOffsetB.addr, align 8 + %call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + store i32 %call, i32* %thid, align 4 + %0 = load i32, i32* %baseIndex.addr, align 4 + %call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + %add = add i32 %0, %call1 + %1 = load i32*, i32** %mem_ai.addr, align 8 + store i32 %add, i32* %1, align 4 + %2 = load i32*, i32** %mem_ai.addr, align 8 + %3 = load i32, i32* %2, align 4 + %call2 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2 + %add3 = add i32 %3, %call2 + %4 = load i32*, i32** %mem_bi.addr, align 8 + store i32 %add3, i32* %4, align 4 + %5 = load i32, i32* %thid, align 4 + %6 = load i32*, i32** %ai.addr, align 8 + store i32 %5, i32* %6, align 4 + %7 = load i32, i32* %thid, align 4 + %call4 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2 + %add5 = add i32 %7, %call4 + %8 = load i32*, i32** %bi.addr, align 8 + store i32 %add5, i32* %8, align 4 + %9 = load i32*, i32** %ai.addr, align 8 + %10 = load i32, i32* %9, align 4 + %shr = ashr 
i32 %10, 4 + %11 = load i32*, i32** %bankOffsetA.addr, align 8 + store i32 %shr, i32* %11, align 4 + %12 = load i32*, i32** %bi.addr, align 8 + %13 = load i32, i32* %12, align 4 + %shr6 = ashr i32 %13, 4 + %14 = load i32*, i32** %bankOffsetB.addr, align 8 + store i32 %shr6, i32* %14, align 4 + %15 = load i32*, i32** %g_idata.addr, align 8 + %16 = load i32*, i32** %mem_ai.addr, align 8 + %17 = load i32, i32* %16, align 4 + %idxprom = sext i32 %17 to i64 + %arrayidx = getelementptr inbounds i32, i32* %15, i64 %idxprom + %18 = load i32, i32* %arrayidx, align 4 + %19 = load i32*, i32** %s_data.addr, align 8 + %20 = load i32*, i32** %ai.addr, align 8 + %21 = load i32, i32* %20, align 4 + %22 = load i32*, i32** %bankOffsetA.addr, align 8 + %23 = load i32, i32* %22, align 4 + %add7 = add nsw i32 %21, %23 + %idxprom8 = sext i32 %add7 to i64 + %arrayidx9 = getelementptr inbounds i32, i32* %19, i64 %idxprom8 + store i32 %18, i32* %arrayidx9, align 4 + %24 = load i32*, i32** %g_idata.addr, align 8 + %25 = load i32*, i32** %mem_bi.addr, align 8 + %26 = load i32, i32* %25, align 4 + %idxprom10 = sext i32 %26 to i64 + %arrayidx11 = getelementptr inbounds i32, i32* %24, i64 %idxprom10 + %27 = load i32, i32* %arrayidx11, align 4 + %28 = load i32*, i32** %s_data.addr, align 8 + %29 = load i32*, i32** %bi.addr, align 8 + %30 = load i32, i32* %29, align 4 + %31 = load i32*, i32** %bankOffsetB.addr, align 8 + %32 = load i32, i32* %31, align 4 + %add12 = add nsw i32 %30, %32 + %idxprom13 = sext i32 %add12 to i64 + %arrayidx14 = getelementptr inbounds i32, i32* %28, i64 %idxprom13 + store i32 %27, i32* %arrayidx14, align 4 + ret void +} + +; Function Attrs: convergent noinline nounwind optnone +define internal void @_ZL12prescanBlockILb1EEvPjiS0_(i32* %data, i32 %blockIndex, i32* %blockSums) #0 { +entry: + %data.addr = alloca i32*, align 8 + %blockIndex.addr = alloca i32, align 4 + %blockSums.addr = alloca i32*, align 8 + %stride = alloca i32, align 4 + store i32* %data, i32** 
%data.addr, align 8 + store i32 %blockIndex, i32* %blockIndex.addr, align 4 + store i32* %blockSums, i32** %blockSums.addr, align 8 + %0 = load i32*, i32** %data.addr, align 8 + %call = call i32 @_ZL8buildSumPj(i32* %0) #2 + store i32 %call, i32* %stride, align 4 + %1 = load i32*, i32** %data.addr, align 8 + %2 = load i32*, i32** %blockSums.addr, align 8 + %3 = load i32, i32* %blockIndex.addr, align 4 + %cmp = icmp eq i32 %3, 0 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2 + br label %cond.end + +cond.false: ; preds = %entry + %4 = load i32, i32* %blockIndex.addr, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ %call1, %cond.true ], [ %4, %cond.false ] + call void @_ZL16clearLastElementILb1EEvPjS0_i(i32* %1, i32* %2, i32 %cond) #2 + %5 = load i32*, i32** %data.addr, align 8 + %6 = load i32, i32* %stride, align 4 + call void @_ZL16scanRootToLeavesPjj(i32* %5, i32 %6) #2 + ret void +} + +; Function Attrs: convergent noinline nounwind optnone +define internal void @_ZL21storeSharedChunkToMemILb0EEvPjPKjiiiiiii(i32* %g_odata, i32* %s_data, i32 %n, i32 %ai, i32 %bi, i32 %mem_ai, i32 %mem_bi, i32 %bankOffsetA, i32 %bankOffsetB) #0 { +entry: + %g_odata.addr = alloca i32*, align 8 + %s_data.addr = alloca i32*, align 8 + %n.addr = alloca i32, align 4 + %ai.addr = alloca i32, align 4 + %bi.addr = alloca i32, align 4 + %mem_ai.addr = alloca i32, align 4 + %mem_bi.addr = alloca i32, align 4 + %bankOffsetA.addr = alloca i32, align 4 + %bankOffsetB.addr = alloca i32, align 4 + store i32* %g_odata, i32** %g_odata.addr, align 8 + store i32* %s_data, i32** %s_data.addr, align 8 + store i32 %n, i32* %n.addr, align 4 + store i32 %ai, i32* %ai.addr, align 4 + store i32 %bi, i32* %bi.addr, align 4 + store i32 %mem_ai, i32* %mem_ai.addr, align 4 + store i32 %mem_bi, i32* %mem_bi.addr, align 4 + store i32 %bankOffsetA, i32* 
%bankOffsetA.addr, align 4 + store i32 %bankOffsetB, i32* %bankOffsetB.addr, align 4 + call void @llvm.nvvm.barrier0() + %0 = load i32*, i32** %s_data.addr, align 8 + %1 = load i32, i32* %ai.addr, align 4 + %2 = load i32, i32* %bankOffsetA.addr, align 4 + %add = add nsw i32 %1, %2 + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds i32, i32* %0, i64 %idxprom + %3 = load i32, i32* %arrayidx, align 4 + %4 = load i32*, i32** %g_odata.addr, align 8 + %5 = load i32, i32* %mem_ai.addr, align 4 + %idxprom1 = sext i32 %5 to i64 + %arrayidx2 = getelementptr inbounds i32, i32* %4, i64 %idxprom1 + store i32 %3, i32* %arrayidx2, align 4 + %6 = load i32*, i32** %s_data.addr, align 8 + %7 = load i32, i32* %bi.addr, align 4 + %8 = load i32, i32* %bankOffsetB.addr, align 4 + %add3 = add nsw i32 %7, %8 + %idxprom4 = sext i32 %add3 to i64 + %arrayidx5 = getelementptr inbounds i32, i32* %6, i64 %idxprom4 + %9 = load i32, i32* %arrayidx5, align 4 + %10 = load i32*, i32** %g_odata.addr, align 8 + %11 = load i32, i32* %mem_bi.addr, align 4 + %idxprom6 = sext i32 %11 to i64 + %arrayidx7 = getelementptr inbounds i32, i32* %10, i64 %idxprom6 + store i32 %9, i32* %arrayidx7, align 4 + ret void +} + +; Function Attrs: convergent noinline nounwind optnone +define internal i32 @_ZL8buildSumPj(i32* %s_data) #0 { +entry: + %s_data.addr = alloca i32*, align 8 + %thid = alloca i32, align 4 + %stride = alloca i32, align 4 + %d = alloca i32, align 4 + %i = alloca i32, align 4 + %ai = alloca i32, align 4 + %bi = alloca i32, align 4 + store i32* %s_data, i32** %s_data.addr, align 8 + %call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + store i32 %call, i32* %thid, align 4 + store i32 1, i32* %stride, align 4 + %call1 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2 + store i32 %call1, i32* %d, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %d, align 4 + %cmp = icmp sgt i32 %0, 0 + br i1 %cmp, 
label %for.body, label %for.end + +for.body: ; preds = %for.cond + call void @llvm.nvvm.barrier0() + %1 = load i32, i32* %thid, align 4 + %2 = load i32, i32* %d, align 4 + %cmp2 = icmp ult i32 %1, %2 + br i1 %cmp2, label %if.then, label %if.end + +if.then: ; preds = %for.body + %3 = load i32, i32* %stride, align 4 + %call3 = call i32 @_ZL7__mul24ii(i32 2, i32 %3) #2 + %4 = load i32, i32* %thid, align 4 + %call4 = call i32 @_ZL7__mul24ii(i32 %call3, i32 %4) #2 + store i32 %call4, i32* %i, align 4 + %5 = load i32, i32* %i, align 4 + %6 = load i32, i32* %stride, align 4 + %add = add i32 %5, %6 + %sub = sub i32 %add, 1 + store i32 %sub, i32* %ai, align 4 + %7 = load i32, i32* %ai, align 4 + %8 = load i32, i32* %stride, align 4 + %add5 = add i32 %7, %8 + store i32 %add5, i32* %bi, align 4 + %9 = load i32, i32* %ai, align 4 + %shr = ashr i32 %9, 4 + %10 = load i32, i32* %ai, align 4 + %add6 = add nsw i32 %10, %shr + store i32 %add6, i32* %ai, align 4 + %11 = load i32, i32* %bi, align 4 + %shr7 = ashr i32 %11, 4 + %12 = load i32, i32* %bi, align 4 + %add8 = add nsw i32 %12, %shr7 + store i32 %add8, i32* %bi, align 4 + %13 = load i32*, i32** %s_data.addr, align 8 + %14 = load i32, i32* %ai, align 4 + %idxprom = sext i32 %14 to i64 + %arrayidx = getelementptr inbounds i32, i32* %13, i64 %idxprom + %15 = load i32, i32* %arrayidx, align 4 + %16 = load i32*, i32** %s_data.addr, align 8 + %17 = load i32, i32* %bi, align 4 + %idxprom9 = sext i32 %17 to i64 + %arrayidx10 = getelementptr inbounds i32, i32* %16, i64 %idxprom9 + %18 = load i32, i32* %arrayidx10, align 4 + %add11 = add i32 %18, %15 + store i32 %add11, i32* %arrayidx10, align 4 + br label %if.end + +if.end: ; preds = %if.then, %for.body + %19 = load i32, i32* %stride, align 4 + %mul = mul i32 %19, 2 + store i32 %mul, i32* %stride, align 4 + br label %for.inc + +for.inc: ; preds = %if.end + %20 = load i32, i32* %d, align 4 + %shr12 = ashr i32 %20, 1 + store i32 %shr12, i32* %d, align 4 + br label %for.cond + +for.end: 
; preds = %for.cond + %21 = load i32, i32* %stride, align 4 + ret i32 %21 +} + +; Function Attrs: convergent noinline nounwind optnone +define internal void @_ZL16clearLastElementILb1EEvPjS0_i(i32* %s_data, i32* %g_blockSums, i32 %blockIndex) #0 { +entry: + %s_data.addr = alloca i32*, align 8 + %g_blockSums.addr = alloca i32*, align 8 + %blockIndex.addr = alloca i32, align 4 + %index = alloca i32, align 4 + store i32* %s_data, i32** %s_data.addr, align 8 + store i32* %g_blockSums, i32** %g_blockSums.addr, align 8 + store i32 %blockIndex, i32* %blockIndex.addr, align 4 + %call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + %cmp = icmp eq i32 %call, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %call1 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2 + %shl = shl i32 %call1, 1 + %sub = sub i32 %shl, 1 + store i32 %sub, i32* %index, align 4 + %0 = load i32, i32* %index, align 4 + %shr = ashr i32 %0, 4 + %1 = load i32, i32* %index, align 4 + %add = add nsw i32 %1, %shr + store i32 %add, i32* %index, align 4 + %2 = load i32*, i32** %s_data.addr, align 8 + %3 = load i32, i32* %index, align 4 + %idxprom = sext i32 %3 to i64 + %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom + %4 = load i32, i32* %arrayidx, align 4 + %5 = load i32*, i32** %g_blockSums.addr, align 8 + %6 = load i32, i32* %blockIndex.addr, align 4 + %idxprom2 = sext i32 %6 to i64 + %arrayidx3 = getelementptr inbounds i32, i32* %5, i64 %idxprom2 + store i32 %4, i32* %arrayidx3, align 4 + %7 = load i32*, i32** %s_data.addr, align 8 + %8 = load i32, i32* %index, align 4 + %idxprom4 = sext i32 %8 to i64 + %arrayidx5 = getelementptr inbounds i32, i32* %7, i64 %idxprom4 + store i32 0, i32* %arrayidx5, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +; Function Attrs: convergent noinline nounwind optnone +define internal void @_ZL16scanRootToLeavesPjj(i32* %s_data, i32 %stride) #0 { +entry: + 
%s_data.addr = alloca i32*, align 8 + %stride.addr = alloca i32, align 4 + %thid = alloca i32, align 4 + %d = alloca i32, align 4 + %i = alloca i32, align 4 + %ai = alloca i32, align 4 + %bi = alloca i32, align 4 + %t = alloca i32, align 4 + store i32* %s_data, i32** %s_data.addr, align 8 + store i32 %stride, i32* %stride.addr, align 4 + %call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + store i32 %call, i32* %thid, align 4 + store i32 1, i32* %d, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %d, align 4 + %call1 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2 + %cmp = icmp ule i32 %0, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32, i32* %stride.addr, align 4 + %shr = lshr i32 %1, 1 + store i32 %shr, i32* %stride.addr, align 4 + call void @llvm.nvvm.barrier0() + %2 = load i32, i32* %thid, align 4 + %3 = load i32, i32* %d, align 4 + %cmp2 = icmp ult i32 %2, %3 + br i1 %cmp2, label %if.then, label %if.end + +if.then: ; preds = %for.body + %4 = load i32, i32* %stride.addr, align 4 + %call3 = call i32 @_ZL7__mul24ii(i32 2, i32 %4) #2 + %5 = load i32, i32* %thid, align 4 + %call4 = call i32 @_ZL7__mul24ii(i32 %call3, i32 %5) #2 + store i32 %call4, i32* %i, align 4 + %6 = load i32, i32* %i, align 4 + %7 = load i32, i32* %stride.addr, align 4 + %add = add i32 %6, %7 + %sub = sub i32 %add, 1 + store i32 %sub, i32* %ai, align 4 + %8 = load i32, i32* %ai, align 4 + %9 = load i32, i32* %stride.addr, align 4 + %add5 = add i32 %8, %9 + store i32 %add5, i32* %bi, align 4 + %10 = load i32, i32* %ai, align 4 + %shr6 = ashr i32 %10, 4 + %11 = load i32, i32* %ai, align 4 + %add7 = add nsw i32 %11, %shr6 + store i32 %add7, i32* %ai, align 4 + %12 = load i32, i32* %bi, align 4 + %shr8 = ashr i32 %12, 4 + %13 = load i32, i32* %bi, align 4 + %add9 = add nsw i32 %13, %shr8 + store i32 %add9, i32* %bi, align 4 + %14 = load i32*, i32** 
%s_data.addr, align 8 + %15 = load i32, i32* %ai, align 4 + %idxprom = sext i32 %15 to i64 + %arrayidx = getelementptr inbounds i32, i32* %14, i64 %idxprom + %16 = load i32, i32* %arrayidx, align 4 + store i32 %16, i32* %t, align 4 + %17 = load i32*, i32** %s_data.addr, align 8 + %18 = load i32, i32* %bi, align 4 + %idxprom10 = sext i32 %18 to i64 + %arrayidx11 = getelementptr inbounds i32, i32* %17, i64 %idxprom10 + %19 = load i32, i32* %arrayidx11, align 4 + %20 = load i32*, i32** %s_data.addr, align 8 + %21 = load i32, i32* %ai, align 4 + %idxprom12 = sext i32 %21 to i64 + %arrayidx13 = getelementptr inbounds i32, i32* %20, i64 %idxprom12 + store i32 %19, i32* %arrayidx13, align 4 + %22 = load i32, i32* %t, align 4 + %23 = load i32*, i32** %s_data.addr, align 8 + %24 = load i32, i32* %bi, align 4 + %idxprom14 = sext i32 %24 to i64 + %arrayidx15 = getelementptr inbounds i32, i32* %23, i64 %idxprom14 + %25 = load i32, i32* %arrayidx15, align 4 + %add16 = add i32 %25, %22 + store i32 %add16, i32* %arrayidx15, align 4 + br label %if.end + +if.end: ; preds = %if.then, %for.body + br label %for.inc + +for.inc: ; preds = %if.end + %26 = load i32, i32* %d, align 4 + %mul = mul nsw i32 %26, 2 + store i32 %mul, i32* %d, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: convergent noinline nounwind optnone +define dso_local void @_ZL7prescanILb1ELb1EEvPjPKjS0_iii(i32* %g_odata, i32* %g_idata, i32* %g_blockSums, i32 %n, i32 %blockIndex, i32 %baseIndex) #0 { +entry: + %g_odata.addr = alloca i32*, align 8 + %g_idata.addr = alloca i32*, align 8 + %g_blockSums.addr = alloca i32*, align 8 + %n.addr = alloca i32, align 4 + %blockIndex.addr = alloca i32, align 4 + %baseIndex.addr = alloca i32, align 4 + %ai = alloca i32, align 4 + %bi = alloca i32, align 4 + %mem_ai = alloca i32, align 4 + %mem_bi = alloca i32, align 4 + %bankOffsetA = alloca i32, align 4 + %bankOffsetB = alloca i32, align 4 + store i32* %g_odata, i32** %g_odata.addr, 
align 8 + store i32* %g_idata, i32** %g_idata.addr, align 8 + store i32* %g_blockSums, i32** %g_blockSums.addr, align 8 + store i32 %n, i32* %n.addr, align 4 + store i32 %blockIndex, i32* %blockIndex.addr, align 4 + store i32 %baseIndex, i32* %baseIndex.addr, align 4 + %0 = load i32*, i32** %g_idata.addr, align 8 + %1 = load i32, i32* %n.addr, align 4 + %2 = load i32, i32* %baseIndex.addr, align 4 + %cmp = icmp eq i32 %2, 0 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2 + %call1 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2 + %shl = shl i32 %call1, 1 + %call2 = call i32 @_ZL7__mul24ii(i32 %call, i32 %shl) #2 + br label %cond.end + +cond.false: ; preds = %entry + %3 = load i32, i32* %baseIndex.addr, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ %call2, %cond.true ], [ %3, %cond.false ] + call void @_ZL22loadSharedChunkFromMemILb1EEvPjPKjiiRiS3_S3_S3_S3_S3_(i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL7prescanILb1ELb1EEvPjPKjS0_iiiE6s_data to [3072 x i32]*), i64 0, i64 0), i32* %0, i32 %1, i32 %cond, i32* dereferenceable(4) %ai, i32* dereferenceable(4) %bi, i32* dereferenceable(4) %mem_ai, i32* dereferenceable(4) %mem_bi, i32* dereferenceable(4) %bankOffsetA, i32* dereferenceable(4) %bankOffsetB) #2 + %4 = load i32, i32* %blockIndex.addr, align 4 + %5 = load i32*, i32** %g_blockSums.addr, align 8 + call void @_ZL12prescanBlockILb1EEvPjiS0_(i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL7prescanILb1ELb1EEvPjPKjS0_iiiE6s_data to [3072 x i32]*), i64 0, i64 0), i32 %4, i32* %5) #2 + %6 = load i32*, i32** %g_odata.addr, align 8 + %7 = load i32, i32* %n.addr, align 4 + %8 = load i32, i32* %ai, align 4 + %9 = load i32, i32* %bi, align 4 + %10 = load i32, i32* %mem_ai, align 4 + 
%11 = load i32, i32* %mem_bi, align 4 + %12 = load i32, i32* %bankOffsetA, align 4 + %13 = load i32, i32* %bankOffsetB, align 4 + call void @_ZL21storeSharedChunkToMemILb1EEvPjPKjiiiiiii(i32* %6, i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL7prescanILb1ELb1EEvPjPKjS0_iiiE6s_data to [3072 x i32]*), i64 0, i64 0), i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13) #2 + ret void +} + +; Function Attrs: convergent noinline nounwind optnone +define internal void @_ZL22loadSharedChunkFromMemILb1EEvPjPKjiiRiS3_S3_S3_S3_S3_(i32* %s_data, i32* %g_idata, i32 %n, i32 %baseIndex, i32* dereferenceable(4) %ai, i32* dereferenceable(4) %bi, i32* dereferenceable(4) %mem_ai, i32* dereferenceable(4) %mem_bi, i32* dereferenceable(4) %bankOffsetA, i32* dereferenceable(4) %bankOffsetB) #0 { +entry: + %s_data.addr = alloca i32*, align 8 + %g_idata.addr = alloca i32*, align 8 + %n.addr = alloca i32, align 4 + %baseIndex.addr = alloca i32, align 4 + %ai.addr = alloca i32*, align 8 + %bi.addr = alloca i32*, align 8 + %mem_ai.addr = alloca i32*, align 8 + %mem_bi.addr = alloca i32*, align 8 + %bankOffsetA.addr = alloca i32*, align 8 + %bankOffsetB.addr = alloca i32*, align 8 + %thid = alloca i32, align 4 + store i32* %s_data, i32** %s_data.addr, align 8 + store i32* %g_idata, i32** %g_idata.addr, align 8 + store i32 %n, i32* %n.addr, align 4 + store i32 %baseIndex, i32* %baseIndex.addr, align 4 + store i32* %ai, i32** %ai.addr, align 8 + store i32* %bi, i32** %bi.addr, align 8 + store i32* %mem_ai, i32** %mem_ai.addr, align 8 + store i32* %mem_bi, i32** %mem_bi.addr, align 8 + store i32* %bankOffsetA, i32** %bankOffsetA.addr, align 8 + store i32* %bankOffsetB, i32** %bankOffsetB.addr, align 8 + %call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + store i32 %call, i32* %thid, align 4 + %0 = load i32, i32* %baseIndex.addr, align 4 + %call1 = call i32 
@_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + %add = add i32 %0, %call1 + %1 = load i32*, i32** %mem_ai.addr, align 8 + store i32 %add, i32* %1, align 4 + %2 = load i32*, i32** %mem_ai.addr, align 8 + %3 = load i32, i32* %2, align 4 + %call2 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2 + %add3 = add i32 %3, %call2 + %4 = load i32*, i32** %mem_bi.addr, align 8 + store i32 %add3, i32* %4, align 4 + %5 = load i32, i32* %thid, align 4 + %6 = load i32*, i32** %ai.addr, align 8 + store i32 %5, i32* %6, align 4 + %7 = load i32, i32* %thid, align 4 + %call4 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2 + %add5 = add i32 %7, %call4 + %8 = load i32*, i32** %bi.addr, align 8 + store i32 %add5, i32* %8, align 4 + %9 = load i32*, i32** %ai.addr, align 8 + %10 = load i32, i32* %9, align 4 + %shr = ashr i32 %10, 4 + %11 = load i32*, i32** %bankOffsetA.addr, align 8 + store i32 %shr, i32* %11, align 4 + %12 = load i32*, i32** %bi.addr, align 8 + %13 = load i32, i32* %12, align 4 + %shr6 = ashr i32 %13, 4 + %14 = load i32*, i32** %bankOffsetB.addr, align 8 + store i32 %shr6, i32* %14, align 4 + %15 = load i32*, i32** %g_idata.addr, align 8 + %16 = load i32*, i32** %mem_ai.addr, align 8 + %17 = load i32, i32* %16, align 4 + %idxprom = sext i32 %17 to i64 + %arrayidx = getelementptr inbounds i32, i32* %15, i64 %idxprom + %18 = load i32, i32* %arrayidx, align 4 + %19 = load i32*, i32** %s_data.addr, align 8 + %20 = load i32*, i32** %ai.addr, align 8 + %21 = load i32, i32* %20, align 4 + %22 = load i32*, i32** %bankOffsetA.addr, align 8 + %23 = load i32, i32* %22, align 4 + %add7 = add nsw i32 %21, %23 + %idxprom8 = sext i32 %add7 to i64 + %arrayidx9 = getelementptr inbounds i32, i32* %19, i64 %idxprom8 + store i32 %18, i32* %arrayidx9, align 4 + %24 = load i32*, i32** %bi.addr, align 8 + %25 = load i32, i32* %24, align 4 + %26 = load i32, i32* %n.addr, align 4 + %cmp = icmp slt i32 %25, %26 + br i1 %cmp, label %cond.true, 
label %cond.false + +cond.true: ; preds = %entry + %27 = load i32*, i32** %g_idata.addr, align 8 + %28 = load i32*, i32** %mem_bi.addr, align 8 + %29 = load i32, i32* %28, align 4 + %idxprom10 = sext i32 %29 to i64 + %arrayidx11 = getelementptr inbounds i32, i32* %27, i64 %idxprom10 + %30 = load i32, i32* %arrayidx11, align 4 + br label %cond.end + +cond.false: ; preds = %entry + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ %30, %cond.true ], [ 0, %cond.false ] + %31 = load i32*, i32** %s_data.addr, align 8 + %32 = load i32*, i32** %bi.addr, align 8 + %33 = load i32, i32* %32, align 4 + %34 = load i32*, i32** %bankOffsetB.addr, align 8 + %35 = load i32, i32* %34, align 4 + %add12 = add nsw i32 %33, %35 + %idxprom13 = sext i32 %add12 to i64 + %arrayidx14 = getelementptr inbounds i32, i32* %31, i64 %idxprom13 + store i32 %cond, i32* %arrayidx14, align 4 + ret void +} + +; Function Attrs: convergent noinline nounwind optnone +define internal void @_ZL21storeSharedChunkToMemILb1EEvPjPKjiiiiiii(i32* %g_odata, i32* %s_data, i32 %n, i32 %ai, i32 %bi, i32 %mem_ai, i32 %mem_bi, i32 %bankOffsetA, i32 %bankOffsetB) #0 { +entry: + %g_odata.addr = alloca i32*, align 8 + %s_data.addr = alloca i32*, align 8 + %n.addr = alloca i32, align 4 + %ai.addr = alloca i32, align 4 + %bi.addr = alloca i32, align 4 + %mem_ai.addr = alloca i32, align 4 + %mem_bi.addr = alloca i32, align 4 + %bankOffsetA.addr = alloca i32, align 4 + %bankOffsetB.addr = alloca i32, align 4 + store i32* %g_odata, i32** %g_odata.addr, align 8 + store i32* %s_data, i32** %s_data.addr, align 8 + store i32 %n, i32* %n.addr, align 4 + store i32 %ai, i32* %ai.addr, align 4 + store i32 %bi, i32* %bi.addr, align 4 + store i32 %mem_ai, i32* %mem_ai.addr, align 4 + store i32 %mem_bi, i32* %mem_bi.addr, align 4 + store i32 %bankOffsetA, i32* %bankOffsetA.addr, align 4 + store i32 %bankOffsetB, i32* %bankOffsetB.addr, align 4 + call void @llvm.nvvm.barrier0() + %0 = load i32*, i32** 
%s_data.addr, align 8 + %1 = load i32, i32* %ai.addr, align 4 + %2 = load i32, i32* %bankOffsetA.addr, align 4 + %add = add nsw i32 %1, %2 + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds i32, i32* %0, i64 %idxprom + %3 = load i32, i32* %arrayidx, align 4 + %4 = load i32*, i32** %g_odata.addr, align 8 + %5 = load i32, i32* %mem_ai.addr, align 4 + %idxprom1 = sext i32 %5 to i64 + %arrayidx2 = getelementptr inbounds i32, i32* %4, i64 %idxprom1 + store i32 %3, i32* %arrayidx2, align 4 + %6 = load i32, i32* %bi.addr, align 4 + %7 = load i32, i32* %n.addr, align 4 + %cmp = icmp slt i32 %6, %7 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %8 = load i32*, i32** %s_data.addr, align 8 + %9 = load i32, i32* %bi.addr, align 4 + %10 = load i32, i32* %bankOffsetB.addr, align 4 + %add3 = add nsw i32 %9, %10 + %idxprom4 = sext i32 %add3 to i64 + %arrayidx5 = getelementptr inbounds i32, i32* %8, i64 %idxprom4 + %11 = load i32, i32* %arrayidx5, align 4 + %12 = load i32*, i32** %g_odata.addr, align 8 + %13 = load i32, i32* %mem_bi.addr, align 4 + %idxprom6 = sext i32 %13 to i64 + %arrayidx7 = getelementptr inbounds i32, i32* %12, i64 %idxprom6 + store i32 %11, i32* %arrayidx7, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +; Function Attrs: convergent noinline nounwind optnone +define dso_local void @_ZL7prescanILb0ELb0EEvPjPKjS0_iii(i32* %g_odata, i32* %g_idata, i32* %g_blockSums, i32 %n, i32 %blockIndex, i32 %baseIndex) #0 { +entry: + %g_odata.addr = alloca i32*, align 8 + %g_idata.addr = alloca i32*, align 8 + %g_blockSums.addr = alloca i32*, align 8 + %n.addr = alloca i32, align 4 + %blockIndex.addr = alloca i32, align 4 + %baseIndex.addr = alloca i32, align 4 + %ai = alloca i32, align 4 + %bi = alloca i32, align 4 + %mem_ai = alloca i32, align 4 + %mem_bi = alloca i32, align 4 + %bankOffsetA = alloca i32, align 4 + %bankOffsetB = alloca i32, align 4 + store i32* %g_odata, i32** %g_odata.addr, 
align 8 + store i32* %g_idata, i32** %g_idata.addr, align 8 + store i32* %g_blockSums, i32** %g_blockSums.addr, align 8 + store i32 %n, i32* %n.addr, align 4 + store i32 %blockIndex, i32* %blockIndex.addr, align 4 + store i32 %baseIndex, i32* %baseIndex.addr, align 4 + %0 = load i32*, i32** %g_idata.addr, align 8 + %1 = load i32, i32* %n.addr, align 4 + %2 = load i32, i32* %baseIndex.addr, align 4 + %cmp = icmp eq i32 %2, 0 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2 + %call1 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2 + %shl = shl i32 %call1, 1 + %call2 = call i32 @_ZL7__mul24ii(i32 %call, i32 %shl) #2 + br label %cond.end + +cond.false: ; preds = %entry + %3 = load i32, i32* %baseIndex.addr, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ %call2, %cond.true ], [ %3, %cond.false ] + call void @_ZL22loadSharedChunkFromMemILb0EEvPjPKjiiRiS3_S3_S3_S3_S3_(i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL7prescanILb0ELb0EEvPjPKjS0_iiiE6s_data to [3072 x i32]*), i64 0, i64 0), i32* %0, i32 %1, i32 %cond, i32* dereferenceable(4) %ai, i32* dereferenceable(4) %bi, i32* dereferenceable(4) %mem_ai, i32* dereferenceable(4) %mem_bi, i32* dereferenceable(4) %bankOffsetA, i32* dereferenceable(4) %bankOffsetB) #2 + %4 = load i32, i32* %blockIndex.addr, align 4 + %5 = load i32*, i32** %g_blockSums.addr, align 8 + call void @_ZL12prescanBlockILb0EEvPjiS0_(i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL7prescanILb0ELb0EEvPjPKjS0_iiiE6s_data to [3072 x i32]*), i64 0, i64 0), i32 %4, i32* %5) #2 + %6 = load i32*, i32** %g_odata.addr, align 8 + %7 = load i32, i32* %n.addr, align 4 + %8 = load i32, i32* %ai, align 4 + %9 = load i32, i32* %bi, align 4 + %10 = load i32, i32* %mem_ai, align 4 + 
%11 = load i32, i32* %mem_bi, align 4 + %12 = load i32, i32* %bankOffsetA, align 4 + %13 = load i32, i32* %bankOffsetB, align 4 + call void @_ZL21storeSharedChunkToMemILb0EEvPjPKjiiiiiii(i32* %6, i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL7prescanILb0ELb0EEvPjPKjS0_iiiE6s_data to [3072 x i32]*), i64 0, i64 0), i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13) #2 + ret void +} + +; Function Attrs: convergent noinline nounwind optnone +define internal void @_ZL12prescanBlockILb0EEvPjiS0_(i32* %data, i32 %blockIndex, i32* %blockSums) #0 { +entry: + %data.addr = alloca i32*, align 8 + %blockIndex.addr = alloca i32, align 4 + %blockSums.addr = alloca i32*, align 8 + %stride = alloca i32, align 4 + store i32* %data, i32** %data.addr, align 8 + store i32 %blockIndex, i32* %blockIndex.addr, align 4 + store i32* %blockSums, i32** %blockSums.addr, align 8 + %0 = load i32*, i32** %data.addr, align 8 + %call = call i32 @_ZL8buildSumPj(i32* %0) #2 + store i32 %call, i32* %stride, align 4 + %1 = load i32*, i32** %data.addr, align 8 + %2 = load i32*, i32** %blockSums.addr, align 8 + %3 = load i32, i32* %blockIndex.addr, align 4 + %cmp = icmp eq i32 %3, 0 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2 + br label %cond.end + +cond.false: ; preds = %entry + %4 = load i32, i32* %blockIndex.addr, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ %call1, %cond.true ], [ %4, %cond.false ] + call void @_ZL16clearLastElementILb0EEvPjS0_i(i32* %1, i32* %2, i32 %cond) #2 + %5 = load i32*, i32** %data.addr, align 8 + %6 = load i32, i32* %stride, align 4 + call void @_ZL16scanRootToLeavesPjj(i32* %5, i32 %6) #2 + ret void +} + +; Function Attrs: convergent noinline nounwind optnone +define internal void @_ZL16clearLastElementILb0EEvPjS0_i(i32* %s_data, i32* 
%g_blockSums, i32 %blockIndex) #0 { +entry: + %s_data.addr = alloca i32*, align 8 + %g_blockSums.addr = alloca i32*, align 8 + %blockIndex.addr = alloca i32, align 4 + %index = alloca i32, align 4 + store i32* %s_data, i32** %s_data.addr, align 8 + store i32* %g_blockSums, i32** %g_blockSums.addr, align 8 + store i32 %blockIndex, i32* %blockIndex.addr, align 4 + %call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + %cmp = icmp eq i32 %call, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %call1 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2 + %shl = shl i32 %call1, 1 + %sub = sub i32 %shl, 1 + store i32 %sub, i32* %index, align 4 + %0 = load i32, i32* %index, align 4 + %shr = ashr i32 %0, 4 + %1 = load i32, i32* %index, align 4 + %add = add nsw i32 %1, %shr + store i32 %add, i32* %index, align 4 + %2 = load i32*, i32** %s_data.addr, align 8 + %3 = load i32, i32* %index, align 4 + %idxprom = sext i32 %3 to i64 + %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom + store i32 0, i32* %arrayidx, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +; Function Attrs: convergent noinline nounwind optnone +define dso_local void @_ZL7prescanILb0ELb1EEvPjPKjS0_iii(i32* %g_odata, i32* %g_idata, i32* %g_blockSums, i32 %n, i32 %blockIndex, i32 %baseIndex) #0 { +entry: + %g_odata.addr = alloca i32*, align 8 + %g_idata.addr = alloca i32*, align 8 + %g_blockSums.addr = alloca i32*, align 8 + %n.addr = alloca i32, align 4 + %blockIndex.addr = alloca i32, align 4 + %baseIndex.addr = alloca i32, align 4 + %ai = alloca i32, align 4 + %bi = alloca i32, align 4 + %mem_ai = alloca i32, align 4 + %mem_bi = alloca i32, align 4 + %bankOffsetA = alloca i32, align 4 + %bankOffsetB = alloca i32, align 4 + store i32* %g_odata, i32** %g_odata.addr, align 8 + store i32* %g_idata, i32** %g_idata.addr, align 8 + store i32* %g_blockSums, i32** %g_blockSums.addr, align 8 + store i32 %n, 
i32* %n.addr, align 4 + store i32 %blockIndex, i32* %blockIndex.addr, align 4 + store i32 %baseIndex, i32* %baseIndex.addr, align 4 + %0 = load i32*, i32** %g_idata.addr, align 8 + %1 = load i32, i32* %n.addr, align 4 + %2 = load i32, i32* %baseIndex.addr, align 4 + %cmp = icmp eq i32 %2, 0 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2 + %call1 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2 + %shl = shl i32 %call1, 1 + %call2 = call i32 @_ZL7__mul24ii(i32 %call, i32 %shl) #2 + br label %cond.end + +cond.false: ; preds = %entry + %3 = load i32, i32* %baseIndex.addr, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ %call2, %cond.true ], [ %3, %cond.false ] + call void @_ZL22loadSharedChunkFromMemILb1EEvPjPKjiiRiS3_S3_S3_S3_S3_(i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL7prescanILb0ELb1EEvPjPKjS0_iiiE6s_data to [3072 x i32]*), i64 0, i64 0), i32* %0, i32 %1, i32 %cond, i32* dereferenceable(4) %ai, i32* dereferenceable(4) %bi, i32* dereferenceable(4) %mem_ai, i32* dereferenceable(4) %mem_bi, i32* dereferenceable(4) %bankOffsetA, i32* dereferenceable(4) %bankOffsetB) #2 + %4 = load i32, i32* %blockIndex.addr, align 4 + %5 = load i32*, i32** %g_blockSums.addr, align 8 + call void @_ZL12prescanBlockILb0EEvPjiS0_(i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL7prescanILb0ELb1EEvPjPKjS0_iiiE6s_data to [3072 x i32]*), i64 0, i64 0), i32 %4, i32* %5) #2 + %6 = load i32*, i32** %g_odata.addr, align 8 + %7 = load i32, i32* %n.addr, align 4 + %8 = load i32, i32* %ai, align 4 + %9 = load i32, i32* %bi, align 4 + %10 = load i32, i32* %mem_ai, align 4 + %11 = load i32, i32* %mem_bi, align 4 + %12 = load i32, i32* %bankOffsetA, align 4 + %13 = load i32, i32* %bankOffsetB, align 4 + call 
void @_ZL21storeSharedChunkToMemILb1EEvPjPKjiiiiiii(i32* %6, i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL7prescanILb0ELb1EEvPjPKjS0_iiiE6s_data to [3072 x i32]*), i64 0, i64 0), i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13) #2 + ret void +} + +; Function Attrs: alwaysinline convergent inlinehint nounwind +define internal i32 @__nv_mul24(i32 %x, i32 %y) #4 { + %1 = call i32 @llvm.nvvm.mul24.i(i32 %x, i32 %y) + ret i32 %1 +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.mul24.i(i32, i32) #3 + +attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nounwind } +attributes #3 = { nounwind readnone } +attributes #4 = { alwaysinline convergent inlinehint nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" 
"use-soft-float"="false" } + +!llvm.module.flags = !{!0, !1, !2} +!nvvm.annotations = !{!3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !11, !13, !13, !13, !13, !14, !14, !13} +!llvm.ident = !{!15} +!nvvmir.version = !{!16} + +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0} +!3 = !{void (i8*, i64, i32*)* @_Z12histo_kernelPhlPj, !"kernel", i32 1} +!4 = !{void (i32*, i32*, i32*, i32*, i32*, i32*, i32*, i32*)* @_ZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_, !"kernel", i32 1} +!5 = !{void (i32*, i32*, i32*, i32*, i32)* @_ZL5pack2PjS_S_S_j, !"kernel", i32 1} +!6 = !{void (i32*, i32*, i32, i32, i32)* @_ZL10uniformAddPjS_iii, !"kernel", i32 1} +!7 = !{void (i32*, i32*, i32*, i32, i32, i32)* @_ZL7prescanILb1ELb0EEvPjPKjS0_iii, !"kernel", i32 1} +!8 = !{void (i32*, i32*, i32*, i32, i32, i32)* @_ZL7prescanILb1ELb1EEvPjPKjS0_iii, !"kernel", i32 1} +!9 = !{void (i32*, i32*, i32*, i32, i32, i32)* @_ZL7prescanILb0ELb0EEvPjPKjS0_iii, !"kernel", i32 1} +!10 = !{void (i32*, i32*, i32*, i32, i32, i32)* @_ZL7prescanILb0ELb1EEvPjPKjS0_iii, !"kernel", i32 1} +!11 = !{null, !"align", i32 8} +!12 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!13 = !{null, !"align", i32 16} +!14 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!15 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"} +!16 = !{i32 1, i32 4} diff --git a/examples/huffman/main_test_cu-host-x86_64-unknown-linux-gnu.ll b/examples/huffman/main_test_cu-host-x86_64-unknown-linux-gnu.ll new file mode 100644 index 0000000..27e66c9 --- /dev/null +++ b/examples/huffman/main_test_cu-host-x86_64-unknown-linux-gnu.ll @@ -0,0 +1,12230 @@ +; ModuleID = 'main_test_cu-host-x86_64-unknown-linux-gnu.bc' +source_filename = "main_test_cu.cu" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = 
"x86_64-unknown-linux-gnu" + +%"class.std::ios_base::Init" = type { i8 } +%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] } +%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 } +%struct.dim3 = type { i32, i32, i32 } +%struct.CUstream_st = type opaque +%class.INode = type <{ i32 (...)**, i32, [4 x i8] }> +%"class.std::priority_queue" = type <{ %"class.std::vector", %struct.NodeCmp, [7 x i8] }> +%"class.std::vector" = type { %"struct.std::_Vector_base" } +%"struct.std::_Vector_base" = type { %"struct.std::_Vector_base >::_Vector_impl" } +%"struct.std::_Vector_base >::_Vector_impl" = type { %class.INode**, %class.INode**, %class.INode** } +%struct.NodeCmp = type { i8 } +%class.LeafNode = type { %class.INode.base, i8, [3 x i8] } +%class.INode.base = type <{ i32 (...)**, i32 }> +%class.InternalNode = type { %class.INode.base, %class.INode*, %class.INode* } +%"class.__gnu_cxx::__normal_iterator" = type { %class.INode** } +%"class.std::allocator" = type { i8 } +%"class.std::vector.0" = type { %"struct.std::_Bvector_base" } +%"struct.std::_Bvector_base" = type { %"struct.std::_Bvector_base >::_Bvector_impl" } +%"struct.std::_Bvector_base >::_Bvector_impl" = type { %"struct.std::_Bit_iterator", %"struct.std::_Bit_iterator", i64* } +%"struct.std::_Bit_iterator" = type { %"struct.std::_Bit_iterator_base.base", [4 x i8] } +%"struct.std::_Bit_iterator_base.base" = type <{ i64*, i32 }> +%"class.std::map" = type { %"class.std::_Rb_tree" } +%"class.std::_Rb_tree" = type { %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Rb_tree_impl" } +%"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Rb_tree_impl" = type { %"struct.std::_Rb_tree_key_compare", %"struct.std::_Rb_tree_header" } +%"struct.std::_Rb_tree_key_compare" = type { 
%"struct.std::less" } +%"struct.std::less" = type { i8 } +%"struct.std::_Rb_tree_header" = type { %"struct.std::_Rb_tree_node_base", i64 } +%"struct.std::_Rb_tree_node_base" = type { i32, %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* } +%"struct.std::_Rb_tree_iterator" = type { %"struct.std::_Rb_tree_node_base"* } +%"struct.std::pair" = type { i8, %"class.std::vector.0" } +%"struct.std::_Bit_const_iterator" = type { %"struct.std::_Bit_iterator_base.base", [4 x i8] } +%"class.std::allocator.13" = type { i8 } +%"class.std::allocator.1" = type { i8 } +%"struct.std::_Bit_reference" = type { i64*, i64 } +%"struct.std::_Bit_iterator_base" = type <{ i64*, i32, [4 x i8] }> +%struct.timeval = type { i64, i64 } +%struct.timezone = type { i32, i32 } +%"struct.std::_Rb_tree_const_iterator" = type { %"struct.std::_Rb_tree_node_base"* } +%"struct.std::random_access_iterator_tag" = type { i8 } +%"struct.std::_Rb_tree_node" = type { %"struct.std::_Rb_tree_node_base", %"struct.std::pair" } +%"class.std::allocator.4" = type { i8 } +%"class.__gnu_cxx::new_allocator.5" = type { i8 } +%"class.__gnu_cxx::new_allocator.2" = type { i8 } +%"struct.std::iterator" = type { i8 } +%"class.std::allocator.7" = type { i8 } +%"class.__gnu_cxx::new_allocator.8" = type { i8 } +%"class.__gnu_cxx::new_allocator" = type { i8 } +%"class.__gnu_cxx::__normal_iterator.10" = type { %class.INode** } +%"struct.__gnu_cxx::__ops::_Iter_comp_iter" = type { %struct.NodeCmp } +%"struct.__gnu_cxx::__ops::_Iter_comp_val" = type { %struct.NodeCmp } +%"class.std::__pair_base" = type { i8 } +%"struct.std::_Select1st" = type { i8 } +%"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Alloc_node" = type { %"class.std::_Rb_tree"* } +%"struct.std::pair.11" = type { %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* } +%"class.std::__pair_base.12" = type { i8 } +%"class.__gnu_cxx::new_allocator.14" = type { i8 
} + +$_Z9gpuAssert9cudaErrorPKcib = comdat any + +$_ZN4dim3C2Ejjj = comdat any + +$_ZNSt6vectorIP5INodeSaIS1_EEC2Ev = comdat any + +$_ZNSt14priority_queueIP5INodeSt6vectorIS1_SaIS1_EE7NodeCmpEC2ERKS5_RKS4_ = comdat any + +$_ZNSt6vectorIP5INodeSaIS1_EED2Ev = comdat any + +$__clang_call_terminate = comdat any + +$_ZNSt14priority_queueIP5INodeSt6vectorIS1_SaIS1_EE7NodeCmpE4pushERKS1_ = comdat any + +$_ZN8LeafNodeC2Eic = comdat any + +$_ZNKSt14priority_queueIP5INodeSt6vectorIS1_SaIS1_EE7NodeCmpE4sizeEv = comdat any + +$_ZNKSt14priority_queueIP5INodeSt6vectorIS1_SaIS1_EE7NodeCmpE3topEv = comdat any + +$_ZNSt14priority_queueIP5INodeSt6vectorIS1_SaIS1_EE7NodeCmpE3popEv = comdat any + +$_ZN12InternalNodeC2EP5INodeS1_ = comdat any + +$_ZNSt14priority_queueIP5INodeSt6vectorIS1_SaIS1_EE7NodeCmpED2Ev = comdat any + +$_ZNSt3mapIhSt6vectorIbSaIbEESt4lessIhESaISt4pairIKhS2_EEEixERS6_ = comdat any + +$_ZNSt6vectorIbSaIbEEaSERKS1_ = comdat any + +$_ZNSt6vectorIbSaIbEEC2ERKS1_ = comdat any + +$_ZNSt6vectorIbSaIbEE9push_backEb = comdat any + +$_ZNSt6vectorIbSaIbEED2Ev = comdat any + +$_Z10initParamsPcjRjS0_S0_j = comdat any + +$_Z8loadDataPcPjS0_S0_jjRd = comdat any + +$_Z15compare_vectorsIjEiPT_S1_j = comdat any + +$_ZN5INodeC2Ei = comdat any + +$_ZN8LeafNodeD2Ev = comdat any + +$_ZN8LeafNodeD0Ev = comdat any + +$_ZN5INodeD2Ev = comdat any + +$_ZN5INodeD0Ev = comdat any + +$_ZN12InternalNodeD2Ev = comdat any + +$_ZN12InternalNodeD0Ev = comdat any + +$_ZNSt3mapIhSt6vectorIbSaIbEESt4lessIhESaISt4pairIKhS2_EEEC2Ev = comdat any + +$_ZNSt6vectorIbSaIbEEC2Ev = comdat any + +$_ZNSt3mapIhSt6vectorIbSaIbEESt4lessIhESaISt4pairIKhS2_EEE5beginEv = comdat any + +$_ZNSt23_Rb_tree_const_iteratorISt4pairIKhSt6vectorIbSaIbEEEEC2ERKSt17_Rb_tree_iteratorIS5_E = comdat any + +$_ZNKSt23_Rb_tree_const_iteratorISt4pairIKhSt6vectorIbSaIbEEEEneERKS6_ = comdat any + +$_ZNSt3mapIhSt6vectorIbSaIbEESt4lessIhESaISt4pairIKhS2_EEE3endEv = comdat any + 
+$_ZSt8distanceISt19_Bit_const_iteratorENSt15iterator_traitsIT_E15difference_typeES2_S2_ = comdat any + +$_ZNKSt23_Rb_tree_const_iteratorISt4pairIKhSt6vectorIbSaIbEEEEptEv = comdat any + +$_ZNKSt6vectorIbSaIbEE5beginEv = comdat any + +$_ZNKSt6vectorIbSaIbEE3endEv = comdat any + +$_ZNKSt6vectorIbSaIbEEixEm = comdat any + +$_ZSt3powfi = comdat any + +$_ZNSt23_Rb_tree_const_iteratorISt4pairIKhSt6vectorIbSaIbEEEEppEv = comdat any + +$_ZNSt3mapIhSt6vectorIbSaIbEESt4lessIhESaISt4pairIKhS2_EEED2Ev = comdat any + +$_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EEC2Ev = comdat any + +$_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE13_Rb_tree_implIS9_Lb0EEC2Ev = comdat any + +$_ZNSaISt13_Rb_tree_nodeISt4pairIKhSt6vectorIbSaIbEEEEEC2Ev = comdat any + +$_ZNSt20_Rb_tree_key_compareISt4lessIhEEC2Ev = comdat any + +$_ZNSt15_Rb_tree_headerC2Ev = comdat any + +$_ZNSaISt13_Rb_tree_nodeISt4pairIKhSt6vectorIbSaIbEEEEED2Ev = comdat any + +$_ZN9__gnu_cxx13new_allocatorISt13_Rb_tree_nodeISt4pairIKhSt6vectorIbSaIbEEEEEC2Ev = comdat any + +$_ZNSt15_Rb_tree_header8_M_resetEv = comdat any + +$_ZN9__gnu_cxx13new_allocatorISt13_Rb_tree_nodeISt4pairIKhSt6vectorIbSaIbEEEEED2Ev = comdat any + +$_ZNSt13_Bvector_baseISaIbEEC2Ev = comdat any + +$_ZNSt13_Bvector_baseISaIbEE13_Bvector_implC2Ev = comdat any + +$_ZNSaImEC2Ev = comdat any + +$_ZNSt13_Bit_iteratorC2Ev = comdat any + +$_ZN9__gnu_cxx13new_allocatorImEC2Ev = comdat any + +$_ZNSt18_Bit_iterator_baseC2EPmj = comdat any + +$_ZN9__gnu_cxx13new_allocatorImED2Ev = comdat any + +$_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE5beginEv = comdat any + +$_ZNSt17_Rb_tree_iteratorISt4pairIKhSt6vectorIbSaIbEEEEC2EPSt18_Rb_tree_node_base = comdat any + +$_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE3endEv = comdat any + 
+$_ZSt10__distanceISt19_Bit_const_iteratorENSt15iterator_traitsIT_E15difference_typeES2_S2_St26random_access_iterator_tag = comdat any + +$_ZSt19__iterator_categoryISt19_Bit_const_iteratorENSt15iterator_traitsIT_E17iterator_categoryERKS2_ = comdat any + +$_ZStmiRKSt18_Bit_iterator_baseS1_ = comdat any + +$_ZNKSt13_Rb_tree_nodeISt4pairIKhSt6vectorIbSaIbEEEE9_M_valptrEv = comdat any + +$_ZSt11__addressofIKSt4pairIKhSt6vectorIbSaIbEEEEPT_RS7_ = comdat any + +$_ZNSt19_Bit_const_iteratorC2ERKSt13_Bit_iterator = comdat any + +$_ZNSt19_Bit_const_iteratorC2EPmj = comdat any + +$_ZNKSt19_Bit_const_iteratordeEv = comdat any + +$_ZNSt14_Bit_referenceC2EPmm = comdat any + +$_ZNKSt14_Bit_referencecvbEv = comdat any + +$_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EED2Ev = comdat any + +$_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE8_M_eraseEPSt13_Rb_tree_nodeIS5_E = comdat any + +$_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE8_M_beginEv = comdat any + +$_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE13_Rb_tree_implIS9_Lb0EED2Ev = comdat any + +$_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE8_S_rightEPSt18_Rb_tree_node_base = comdat any + +$_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE7_S_leftEPSt18_Rb_tree_node_base = comdat any + +$_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE12_M_drop_nodeEPSt13_Rb_tree_nodeIS5_E = comdat any + +$_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE15_M_destroy_nodeEPSt13_Rb_tree_nodeIS5_E = comdat any + +$_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE11_M_put_nodeEPSt13_Rb_tree_nodeIS5_E = comdat any + +$_ZNKSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE13get_allocatorEv = comdat any + 
+$_ZN9__gnu_cxx13new_allocatorISt4pairIKhSt6vectorIbSaIbEEEE7destroyEPS6_ = comdat any + +$_ZNSt13_Rb_tree_nodeISt4pairIKhSt6vectorIbSaIbEEEE9_M_valptrEv = comdat any + +$_ZNSaISt4pairIKhSt6vectorIbSaIbEEEED2Ev = comdat any + +$_ZNKSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE21_M_get_Node_allocatorEv = comdat any + +$_ZNSaISt4pairIKhSt6vectorIbSaIbEEEEC2ISt13_Rb_tree_nodeIS4_EEERKSaIT_E = comdat any + +$_ZN9__gnu_cxx13new_allocatorISt4pairIKhSt6vectorIbSaIbEEEEC2Ev = comdat any + +$_ZNSt4pairIKhSt6vectorIbSaIbEEED2Ev = comdat any + +$_ZSt11__addressofISt4pairIKhSt6vectorIbSaIbEEEEPT_RS6_ = comdat any + +$_ZN9__gnu_cxx13new_allocatorISt4pairIKhSt6vectorIbSaIbEEEED2Ev = comdat any + +$_ZN9__gnu_cxx14__alloc_traitsISaISt13_Rb_tree_nodeISt4pairIKhSt6vectorIbSaIbEEEEEE10deallocateERS9_PS8_m = comdat any + +$_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE21_M_get_Node_allocatorEv = comdat any + +$_ZN9__gnu_cxx13new_allocatorISt13_Rb_tree_nodeISt4pairIKhSt6vectorIbSaIbEEEEE10deallocateEPS8_m = comdat any + +$_ZSt4ceilf = comdat any + +$_Z12isPowerOfTwoi = comdat any + +$_Z9floorPow2i = comdat any + +$_ZSt5frexpfPi = comdat any + +$_ZNSt12_Vector_baseIP5INodeSaIS1_EEC2Ev = comdat any + +$_ZNSt12_Vector_baseIP5INodeSaIS1_EE12_Vector_implC2Ev = comdat any + +$_ZNSaIP5INodeEC2Ev = comdat any + +$_ZN9__gnu_cxx13new_allocatorIP5INodeEC2Ev = comdat any + +$_ZSt8_DestroyIPP5INodeS1_EvT_S3_RSaIT0_E = comdat any + +$_ZNSt12_Vector_baseIP5INodeSaIS1_EE19_M_get_Tp_allocatorEv = comdat any + +$_ZNSt12_Vector_baseIP5INodeSaIS1_EED2Ev = comdat any + +$_ZSt8_DestroyIPP5INodeEvT_S3_ = comdat any + +$_ZNSt12_Destroy_auxILb1EE9__destroyIPP5INodeEEvT_S5_ = comdat any + +$_ZNSt12_Vector_baseIP5INodeSaIS1_EE13_M_deallocateEPS1_m = comdat any + +$_ZNSt12_Vector_baseIP5INodeSaIS1_EE12_Vector_implD2Ev = comdat any + +$_ZN9__gnu_cxx14__alloc_traitsISaIP5INodeEE10deallocateERS3_PS2_m = comdat any + 
+$_ZN9__gnu_cxx13new_allocatorIP5INodeE10deallocateEPS2_m = comdat any + +$_ZNSaIP5INodeED2Ev = comdat any + +$_ZN9__gnu_cxx13new_allocatorIP5INodeED2Ev = comdat any + +$_ZNSt6vectorIP5INodeSaIS1_EEC2ERKS3_ = comdat any + +$_ZSt9make_heapIN9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS3_SaIS3_EEEE7NodeCmpEvT_SA_T0_ = comdat any + +$_ZNSt6vectorIP5INodeSaIS1_EE5beginEv = comdat any + +$_ZNSt6vectorIP5INodeSaIS1_EE3endEv = comdat any + +$_ZNKSt6vectorIP5INodeSaIS1_EE4sizeEv = comdat any + +$_ZN9__gnu_cxx14__alloc_traitsISaIP5INodeEE17_S_select_on_copyERKS3_ = comdat any + +$_ZNKSt12_Vector_baseIP5INodeSaIS1_EE19_M_get_Tp_allocatorEv = comdat any + +$_ZNSt12_Vector_baseIP5INodeSaIS1_EEC2EmRKS2_ = comdat any + +$_ZSt22__uninitialized_copy_aIN9__gnu_cxx17__normal_iteratorIPKP5INodeSt6vectorIS3_SaIS3_EEEEPS3_S3_ET0_T_SC_SB_RSaIT1_E = comdat any + +$_ZNKSt6vectorIP5INodeSaIS1_EE5beginEv = comdat any + +$_ZNKSt6vectorIP5INodeSaIS1_EE3endEv = comdat any + +$_ZNSt12_Vector_baseIP5INodeSaIS1_EE12_Vector_implC2ERKS2_ = comdat any + +$_ZNSt12_Vector_baseIP5INodeSaIS1_EE17_M_create_storageEm = comdat any + +$_ZNSaIP5INodeEC2ERKS1_ = comdat any + +$_ZN9__gnu_cxx13new_allocatorIP5INodeEC2ERKS3_ = comdat any + +$_ZNSt12_Vector_baseIP5INodeSaIS1_EE11_M_allocateEm = comdat any + +$_ZN9__gnu_cxx14__alloc_traitsISaIP5INodeEE8allocateERS3_m = comdat any + +$_ZN9__gnu_cxx13new_allocatorIP5INodeE8allocateEmPKv = comdat any + +$_ZNK9__gnu_cxx13new_allocatorIP5INodeE8max_sizeEv = comdat any + +$_ZSt18uninitialized_copyIN9__gnu_cxx17__normal_iteratorIPKP5INodeSt6vectorIS3_SaIS3_EEEEPS3_ET0_T_SC_SB_ = comdat any + +$_ZNSt20__uninitialized_copyILb1EE13__uninit_copyIN9__gnu_cxx17__normal_iteratorIPKP5INodeSt6vectorIS5_SaIS5_EEEEPS5_EET0_T_SE_SD_ = comdat any + +$_ZSt4copyIN9__gnu_cxx17__normal_iteratorIPKP5INodeSt6vectorIS3_SaIS3_EEEEPS3_ET0_T_SC_SB_ = comdat any + +$_ZSt14__copy_move_a2ILb0EN9__gnu_cxx17__normal_iteratorIPKP5INodeSt6vectorIS3_SaIS3_EEEEPS3_ET1_T0_SC_SB_ = comdat any + 
+$_ZSt12__miter_baseIN9__gnu_cxx17__normal_iteratorIPKP5INodeSt6vectorIS3_SaIS3_EEEEET_SA_ = comdat any + +$_ZSt13__copy_move_aILb0EPKP5INodePS1_ET1_T0_S6_S5_ = comdat any + +$_ZSt12__niter_baseIPKP5INodeSt6vectorIS1_SaIS1_EEET_N9__gnu_cxx17__normal_iteratorIS7_T0_EE = comdat any + +$_ZSt12__niter_baseIPP5INodeET_S3_ = comdat any + +$_ZNSt11__copy_moveILb0ELb1ESt26random_access_iterator_tagE8__copy_mIP5INodeEEPT_PKS5_S8_S6_ = comdat any + +$_ZNK9__gnu_cxx17__normal_iteratorIPKP5INodeSt6vectorIS2_SaIS2_EEE4baseEv = comdat any + +$_ZN9__gnu_cxx17__normal_iteratorIPKP5INodeSt6vectorIS2_SaIS2_EEEC2ERKS4_ = comdat any + +$_ZN9__gnu_cxx5__ops15_Iter_comp_iterI7NodeCmpEC2ES2_ = comdat any + +$_ZSt11__make_heapIN9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS3_SaIS3_EEEENS0_5__ops15_Iter_comp_iterI7NodeCmpEEEvT_SD_RT0_ = comdat any + +$_ZN9__gnu_cxxmiIPP5INodeSt6vectorIS2_SaIS2_EEEENS_17__normal_iteratorIT_T0_E15difference_typeERKSA_SD_ = comdat any + +$_ZNK9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEEplEl = comdat any + +$_ZNK9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEEdeEv = comdat any + +$_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS3_SaIS3_EEEElS3_NS0_5__ops15_Iter_comp_iterI7NodeCmpEEEvT_T0_SE_T1_T2_ = comdat any + +$_ZNK9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEE4baseEv = comdat any + +$_ZN9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEEC2ERKS3_ = comdat any + +$_ZN9__gnu_cxx5__ops15_Iter_comp_iterI7NodeCmpEclINS_17__normal_iteratorIPP5INodeSt6vectorIS7_SaIS7_EEEESC_EEbT_T0_ = comdat any + +$_ZN9__gnu_cxx5__ops14_Iter_comp_valI7NodeCmpEC2ERKNS0_15_Iter_comp_iterIS2_EE = comdat any + +$_ZSt11__push_heapIN9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS3_SaIS3_EEEElS3_NS0_5__ops14_Iter_comp_valI7NodeCmpEEEvT_T0_SE_T1_RT2_ = comdat any + +$_ZNK7NodeCmpclEPK5INodeS2_ = comdat any + 
+$_ZN9__gnu_cxx5__ops14_Iter_comp_valI7NodeCmpEclINS_17__normal_iteratorIPP5INodeSt6vectorIS7_SaIS7_EEEES7_EEbT_RT0_ = comdat any + +$_ZNSt6vectorIP5INodeSaIS1_EE9push_backERKS1_ = comdat any + +$_ZSt9push_heapIN9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS3_SaIS3_EEEE7NodeCmpEvT_SA_T0_ = comdat any + +$_ZN9__gnu_cxx14__alloc_traitsISaIP5INodeEE9constructIS2_EEvRS3_PS2_RKT_ = comdat any + +$_ZNSt6vectorIP5INodeSaIS1_EE17_M_realloc_insertEN9__gnu_cxx17__normal_iteratorIPS1_S3_EERKS1_ = comdat any + +$_ZN9__gnu_cxx13new_allocatorIP5INodeE9constructEPS2_RKS2_ = comdat any + +$_ZNKSt6vectorIP5INodeSaIS1_EE12_M_check_lenEmPKc = comdat any + +$_ZSt34__uninitialized_move_if_noexcept_aIPP5INodeS2_SaIS1_EET0_T_S5_S4_RT1_ = comdat any + +$_ZN9__gnu_cxx14__alloc_traitsISaIP5INodeEE7destroyERS3_PS2_ = comdat any + +$_ZNKSt6vectorIP5INodeSaIS1_EE8max_sizeEv = comdat any + +$_ZSt3maxImERKT_S2_S2_ = comdat any + +$_ZN9__gnu_cxx14__alloc_traitsISaIP5INodeEE8max_sizeERKS3_ = comdat any + +$_ZSt22__uninitialized_copy_aIPP5INodeS2_S1_ET0_T_S4_S3_RSaIT1_E = comdat any + +$_ZSt18uninitialized_copyIPP5INodeS2_ET0_T_S4_S3_ = comdat any + +$_ZNSt20__uninitialized_copyILb1EE13__uninit_copyIPP5INodeS4_EET0_T_S6_S5_ = comdat any + +$_ZSt4copyIPP5INodeS2_ET0_T_S4_S3_ = comdat any + +$_ZSt14__copy_move_a2ILb0EPP5INodeS2_ET1_T0_S4_S3_ = comdat any + +$_ZSt12__miter_baseIPP5INodeET_S3_ = comdat any + +$_ZSt13__copy_move_aILb0EPP5INodeS2_ET1_T0_S4_S3_ = comdat any + +$_ZN9__gnu_cxx13new_allocatorIP5INodeE7destroyEPS2_ = comdat any + +$_ZN9__gnu_cxx5__ops14_Iter_comp_valI7NodeCmpEC2ES2_ = comdat any + +$_ZNK9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEEmiEl = comdat any + +$_ZNKSt6vectorIP5INodeSaIS1_EE5frontEv = comdat any + +$_ZNK9__gnu_cxx17__normal_iteratorIPKP5INodeSt6vectorIS2_SaIS2_EEEdeEv = comdat any + +$_ZSt8pop_heapIN9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS3_SaIS3_EEEE7NodeCmpEvT_SA_T0_ = comdat any + +$_ZNSt6vectorIP5INodeSaIS1_EE8pop_backEv = comdat any + 
+$_ZN9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEEmmEv = comdat any + +$_ZSt10__pop_heapIN9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS3_SaIS3_EEEENS0_5__ops15_Iter_comp_iterI7NodeCmpEEEvT_SD_SD_RT0_ = comdat any + +$_ZNSt3mapIhSt6vectorIbSaIbEESt4lessIhESaISt4pairIKhS2_EEE11lower_boundERS6_ = comdat any + +$_ZNKSt17_Rb_tree_iteratorISt4pairIKhSt6vectorIbSaIbEEEEeqERKS6_ = comdat any + +$_ZNKSt3mapIhSt6vectorIbSaIbEESt4lessIhESaISt4pairIKhS2_EEE8key_compEv = comdat any + +$_ZNKSt4lessIhEclERKhS2_ = comdat any + +$_ZNKSt17_Rb_tree_iteratorISt4pairIKhSt6vectorIbSaIbEEEEdeEv = comdat any + +$_ZNSt3mapIhSt6vectorIbSaIbEESt4lessIhESaISt4pairIKhS2_EEE6insertESt17_Rb_tree_iteratorIS7_ERKS7_ = comdat any + +$_ZNSt4pairIKhSt6vectorIbSaIbEEEC2ERS0_RKS3_ = comdat any + +$_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE11lower_boundERS1_ = comdat any + +$_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE14_M_lower_boundEPSt13_Rb_tree_nodeIS5_EPSt18_Rb_tree_node_baseRS1_ = comdat any + +$_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE6_M_endEv = comdat any + +$_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE6_S_keyEPKSt13_Rb_tree_nodeIS5_E = comdat any + +$_ZNKSt10_Select1stISt4pairIKhSt6vectorIbSaIbEEEEclERKS5_ = comdat any + +$_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE8_S_valueEPKSt13_Rb_tree_nodeIS5_E = comdat any + +$_ZNKSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE8key_compEv = comdat any + +$_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE17_M_insert_unique_ESt23_Rb_tree_const_iteratorIS5_ERKS5_ = comdat any + +$_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE11_Alloc_nodeC2ERSB_ = comdat any + 
+$_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE17_M_insert_unique_INSB_11_Alloc_nodeEEESt17_Rb_tree_iteratorIS5_ESt23_Rb_tree_const_iteratorIS5_ERKS5_RT_ = comdat any + +$_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE29_M_get_insert_hint_unique_posESt23_Rb_tree_const_iteratorIS5_ERS1_ = comdat any + +$_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE10_M_insert_INSB_11_Alloc_nodeEEESt17_Rb_tree_iteratorIS5_EPSt18_Rb_tree_node_baseSH_RKS5_RT_ = comdat any + +$_ZNKSt23_Rb_tree_const_iteratorISt4pairIKhSt6vectorIbSaIbEEEE13_M_const_castEv = comdat any + +$_ZNKSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE4sizeEv = comdat any + +$_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE6_S_keyEPKSt18_Rb_tree_node_base = comdat any + +$_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE12_M_rightmostEv = comdat any + +$_ZNSt4pairIPSt18_Rb_tree_node_baseS1_EC2ERKS1_S4_ = comdat any + +$_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE24_M_get_insert_unique_posERS1_ = comdat any + +$_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE11_M_leftmostEv = comdat any + +$_ZNSt17_Rb_tree_iteratorISt4pairIKhSt6vectorIbSaIbEEEEmmEv = comdat any + +$_ZNSt17_Rb_tree_iteratorISt4pairIKhSt6vectorIbSaIbEEEEppEv = comdat any + +$_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE8_S_valueEPKSt18_Rb_tree_node_base = comdat any + +$_ZNKSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE11_Alloc_nodeclIS5_EEPSt13_Rb_tree_nodeIS5_ERKT_ = comdat any + +$_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE14_M_create_nodeERKS5_ = comdat any + +$_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE11_M_get_nodeEv = comdat any + 
+$_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE17_M_construct_nodeEPSt13_Rb_tree_nodeIS5_ERKS5_ = comdat any + +$_ZN9__gnu_cxx14__alloc_traitsISaISt13_Rb_tree_nodeISt4pairIKhSt6vectorIbSaIbEEEEEE8allocateERS9_m = comdat any + +$_ZN9__gnu_cxx13new_allocatorISt13_Rb_tree_nodeISt4pairIKhSt6vectorIbSaIbEEEEE8allocateEmPKv = comdat any + +$_ZNK9__gnu_cxx13new_allocatorISt13_Rb_tree_nodeISt4pairIKhSt6vectorIbSaIbEEEEE8max_sizeEv = comdat any + +$_ZN9__gnu_cxx13new_allocatorISt4pairIKhSt6vectorIbSaIbEEEE9constructEPS6_RKS6_ = comdat any + +$_ZNSt4pairIKhSt6vectorIbSaIbEEEC2ERKS4_ = comdat any + +$_ZNKSt6vectorIbSaIbEE4sizeEv = comdat any + +$_ZNKSt6vectorIbSaIbEE8capacityEv = comdat any + +$_ZNSt13_Bvector_baseISaIbEE13_M_deallocateEv = comdat any + +$_ZNSt6vectorIbSaIbEE13_M_initializeEm = comdat any + +$_ZNSt6vectorIbSaIbEE15_M_copy_alignedESt19_Bit_const_iteratorS2_St13_Bit_iterator = comdat any + +$_ZNSt6vectorIbSaIbEE5beginEv = comdat any + +$_ZNKSt13_Bvector_baseISaIbEE13_Bvector_impl11_M_end_addrEv = comdat any + +$_ZSt11__addressofImEPT_RS0_ = comdat any + +$_ZN9__gnu_cxx14__alloc_traitsISaImEE10deallocateERS1_Pmm = comdat any + +$_ZN9__gnu_cxx13new_allocatorImE10deallocateEPmm = comdat any + +$_ZNSt13_Bvector_baseISaIbEE11_M_allocateEm = comdat any + +$_ZNSt13_Bvector_baseISaIbEE8_S_nwordEm = comdat any + +$_ZNSt13_Bit_iteratorC2EPmj = comdat any + +$_ZNKSt13_Bit_iteratorplEl = comdat any + +$_ZN9__gnu_cxx14__alloc_traitsISaImEE8allocateERS1_m = comdat any + +$_ZN9__gnu_cxx13new_allocatorImE8allocateEmPKv = comdat any + +$_ZNK9__gnu_cxx13new_allocatorImE8max_sizeEv = comdat any + +$_ZNSt13_Bit_iteratorpLEl = comdat any + +$_ZNSt18_Bit_iterator_base7_M_incrEl = comdat any + +$_ZSt4copyIPmS0_ET0_T_S2_S1_ = comdat any + +$_ZSt4copyISt19_Bit_const_iteratorSt13_Bit_iteratorET0_T_S3_S2_ = comdat any + +$_ZSt14__copy_move_a2ILb0EPmS0_ET1_T0_S2_S1_ = comdat any + +$_ZSt12__miter_baseIPmET_S1_ = comdat any + 
+$_ZSt13__copy_move_aILb0EPmS0_ET1_T0_S2_S1_ = comdat any + +$_ZSt12__niter_baseIPmET_S1_ = comdat any + +$_ZNSt11__copy_moveILb0ELb1ESt26random_access_iterator_tagE8__copy_mImEEPT_PKS3_S6_S4_ = comdat any + +$_ZSt14__copy_move_a2ILb0ESt19_Bit_const_iteratorSt13_Bit_iteratorET1_T0_S3_S2_ = comdat any + +$_ZSt12__miter_baseISt19_Bit_const_iteratorET_S1_ = comdat any + +$_ZSt13__copy_move_aILb0ESt19_Bit_const_iteratorSt13_Bit_iteratorET1_T0_S3_S2_ = comdat any + +$_ZSt12__niter_baseISt19_Bit_const_iteratorET_S1_ = comdat any + +$_ZSt12__niter_baseISt13_Bit_iteratorET_S1_ = comdat any + +$_ZNSt11__copy_moveILb0ELb0ESt26random_access_iterator_tagE8__copy_mISt19_Bit_const_iteratorSt13_Bit_iteratorEET0_T_S6_S5_ = comdat any + +$_ZNKSt13_Bit_iteratordeEv = comdat any + +$_ZNSt14_Bit_referenceaSEb = comdat any + +$_ZNSt19_Bit_const_iteratorppEv = comdat any + +$_ZNSt13_Bit_iteratorppEv = comdat any + +$_ZNSt18_Bit_iterator_base10_M_bump_upEv = comdat any + +$_ZN9__gnu_cxx14__alloc_traitsISaImEE17_S_select_on_copyERKS1_ = comdat any + +$_ZNKSt13_Bvector_baseISaIbEE20_M_get_Bit_allocatorEv = comdat any + +$_ZNSaIbEC2ImEERKSaIT_E = comdat any + +$_ZNSt13_Bvector_baseISaIbEEC2ERKS0_ = comdat any + +$_ZNSaIbED2Ev = comdat any + +$_ZNSt13_Bvector_baseISaIbEED2Ev = comdat any + +$_ZN9__gnu_cxx13new_allocatorIbEC2Ev = comdat any + +$_ZNSaImEC2IbEERKSaIT_E = comdat any + +$_ZNSt13_Bvector_baseISaIbEE13_Bvector_implC2ERKSaImE = comdat any + +$_ZNSaImED2Ev = comdat any + +$_ZNSaImEC2ERKS_ = comdat any + +$_ZN9__gnu_cxx13new_allocatorImEC2ERKS1_ = comdat any + +$_ZN9__gnu_cxx13new_allocatorIbED2Ev = comdat any + +$_ZNSt13_Bvector_baseISaIbEE13_Bvector_implD2Ev = comdat any + +$_ZNSt13_Bit_iteratorppEi = comdat any + +$_ZNSt6vectorIbSaIbEE13_M_insert_auxESt13_Bit_iteratorb = comdat any + +$_ZNSt6vectorIbSaIbEE3endEv = comdat any + +$_ZSt13copy_backwardISt13_Bit_iteratorS0_ET0_T_S2_S1_ = comdat any + +$_ZNKSt6vectorIbSaIbEE12_M_check_lenEmPKc = comdat any + 
+$_ZSt4copyISt13_Bit_iteratorS0_ET0_T_S2_S1_ = comdat any + +$_ZSt23__copy_move_backward_a2ILb0ESt13_Bit_iteratorS0_ET1_T0_S2_S1_ = comdat any + +$_ZSt12__miter_baseISt13_Bit_iteratorET_S1_ = comdat any + +$_ZSt22__copy_move_backward_aILb0ESt13_Bit_iteratorS0_ET1_T0_S2_S1_ = comdat any + +$_ZNSt20__copy_move_backwardILb0ELb0ESt26random_access_iterator_tagE13__copy_move_bISt13_Bit_iteratorS3_EET0_T_S5_S4_ = comdat any + +$_ZNSt13_Bit_iteratormmEv = comdat any + +$_ZNSt14_Bit_referenceaSERKS_ = comdat any + +$_ZNSt18_Bit_iterator_base12_M_bump_downEv = comdat any + +$_ZNKSt6vectorIbSaIbEE8max_sizeEv = comdat any + +$_ZN9__gnu_cxx14__alloc_traitsISaImEE8max_sizeERKS1_ = comdat any + +$_ZSt14__copy_move_a2ILb0ESt13_Bit_iteratorS0_ET1_T0_S2_S1_ = comdat any + +$_ZSt13__copy_move_aILb0ESt13_Bit_iteratorS0_ET1_T0_S2_S1_ = comdat any + +$_ZNSt11__copy_moveILb0ELb0ESt26random_access_iterator_tagE8__copy_mISt13_Bit_iteratorS3_EET0_T_S5_S4_ = comdat any + +$_ZTS5INode = comdat any + +$_ZTI5INode = comdat any + +$_ZTS8LeafNode = comdat any + +$_ZTI8LeafNode = comdat any + +$_ZTS12InternalNode = comdat any + +$_ZTI12InternalNode = comdat any + +$_ZTV8LeafNode = comdat any + +$_ZTV5INode = comdat any + +$_ZTV12InternalNode = comdat any + +@.str = private unnamed_addr constant [19 x i8] c"CUDA initialized.\0A\00", align 1 +@_ZStL8__ioinit = internal global %"class.std::ios_base::Init" zeroinitializer, align 1 +@__dso_handle = external hidden global i8 +@.str.1 = private unnamed_addr constant [3 x i8] c"rb\00", align 1 +@.str.2 = private unnamed_addr constant [23 x i8] c"Cannot read input file\00", align 1 +@stderr = external dso_local global %struct._IO_FILE*, align 8 +@.str.3 = private unnamed_addr constant [10 x i8] c"./hist.cu\00", align 1 +@_ZTVN10__cxxabiv117__class_type_infoE = external dso_local global i8* +@_ZTS5INode = linkonce_odr dso_local constant [7 x i8] c"5INode\00", comdat, align 1 +@_ZTI5INode = linkonce_odr dso_local constant { i8*, i8* } { i8* bitcast (i8** 
getelementptr inbounds (i8*, i8** @_ZTVN10__cxxabiv117__class_type_infoE, i64 2) to i8*), i8* getelementptr inbounds ([7 x i8], [7 x i8]* @_ZTS5INode, i32 0, i32 0) }, comdat, align 8 +@_ZTVN10__cxxabiv120__si_class_type_infoE = external dso_local global i8* +@_ZTS8LeafNode = linkonce_odr dso_local constant [10 x i8] c"8LeafNode\00", comdat, align 1 +@_ZTI8LeafNode = linkonce_odr dso_local constant { i8*, i8*, i8* } { i8* bitcast (i8** getelementptr inbounds (i8*, i8** @_ZTVN10__cxxabiv120__si_class_type_infoE, i64 2) to i8*), i8* getelementptr inbounds ([10 x i8], [10 x i8]* @_ZTS8LeafNode, i32 0, i32 0), i8* bitcast ({ i8*, i8* }* @_ZTI5INode to i8*) }, comdat, align 8 +@_ZTS12InternalNode = linkonce_odr dso_local constant [15 x i8] c"12InternalNode\00", comdat, align 1 +@_ZTI12InternalNode = linkonce_odr dso_local constant { i8*, i8*, i8* } { i8* bitcast (i8** getelementptr inbounds (i8*, i8** @_ZTVN10__cxxabiv120__si_class_type_infoE, i64 2) to i8*), i8* getelementptr inbounds ([15 x i8], [15 x i8]* @_ZTS12InternalNode, i32 0, i32 0), i8* bitcast ({ i8*, i8* }* @_ZTI5INode to i8*) }, comdat, align 8 +@.str.4 = private unnamed_addr constant [27 x i8] c"CUDA! 
Starting VLC Tests!\0A\00", align 1 +@.str.5 = private unnamed_addr constant [98 x i8] c"Parameters: num_elements: %d, num_blocks: %d, num_block_threads: %d\0A----------------------------\0A\00", align 1 +@.str.6 = private unnamed_addr constant [42 x i8] c"Cuda error in file '%s' in line %i : %s.\0A\00", align 1 +@.str.7 = private unnamed_addr constant [16 x i8] c"main_test_cu.cu\00", align 1 +@.str.8 = private unnamed_addr constant [34 x i8] c"CPU Encoding time (CPU): %f (ms)\0A\00", align 1 +@.str.9 = private unnamed_addr constant [23 x i8] c"CPU Encoded to %d [B]\0A\00", align 1 +@.str.10 = private unnamed_addr constant [40 x i8] c"Num_blocks to be passed to scan is %d.\0A\00", align 1 +@.str.11 = private unnamed_addr constant [46 x i8] c"Cuda error: %s in file '%s' in line %i : %s.\0A\00", align 1 +@.str.12 = private unnamed_addr constant [31 x i8] c"Pack2 Kernel execution failed\0A\00", align 1 +@.str.13 = private unnamed_addr constant [21 x i8] c"GPUassert: %s %s %d\0A\00", align 1 +@_ZTV8LeafNode = linkonce_odr dso_local unnamed_addr constant { [4 x i8*] } { [4 x i8*] [i8* null, i8* bitcast ({ i8*, i8*, i8* }* @_ZTI8LeafNode to i8*), i8* bitcast (void (%class.LeafNode*)* @_ZN8LeafNodeD2Ev to i8*), i8* bitcast (void (%class.LeafNode*)* @_ZN8LeafNodeD0Ev to i8*)] }, comdat, align 8 +@_ZTV5INode = linkonce_odr dso_local unnamed_addr constant { [4 x i8*] } { [4 x i8*] [i8* null, i8* bitcast ({ i8*, i8* }* @_ZTI5INode to i8*), i8* bitcast (void (%class.INode*)* @_ZN5INodeD2Ev to i8*), i8* bitcast (void (%class.INode*)* @_ZN5INodeD0Ev to i8*)] }, comdat, align 8 +@_ZTV12InternalNode = linkonce_odr dso_local unnamed_addr constant { [4 x i8*] } { [4 x i8*] [i8* null, i8* bitcast ({ i8*, i8*, i8* }* @_ZTI12InternalNode to i8*), i8* bitcast (void (%class.InternalNode*)* @_ZN12InternalNodeD2Ev to i8*), i8* bitcast (void (%class.InternalNode*)* @_ZN12InternalNodeD0Ev to i8*)] }, comdat, align 8 +@.str.14 = private unnamed_addr constant [15 x i8] c"No input file\0A\00", 
align 1 +@.str.15 = private unnamed_addr constant [28 x i8] c"\0A%s, %u bytes, entropy %f\0A\0A\00", align 1 +@_ZL18g_numEltsAllocated = internal global i32 0, align 4 +@.str.16 = private unnamed_addr constant [24 x i8] c"g_numEltsAllocated == 0\00", align 1 +@.str.17 = private unnamed_addr constant [10 x i8] c"./scan.cu\00", align 1 +@__PRETTY_FUNCTION__._ZL17preallocBlockSumsj = private unnamed_addr constant [37 x i8] c"void preallocBlockSums(unsigned int)\00", align 1 +@_ZL15g_scanBlockSums = internal global i32** null, align 8 +@_ZL20g_numLevelsAllocated = internal global i32 0, align 4 +@.str.18 = private unnamed_addr constant [18 x i8] c"preallocBlockSums\00", align 1 +@.str.19 = private unnamed_addr constant [37 x i8] c"prescanArrayRecursive before kernels\00", align 1 +@.str.20 = private unnamed_addr constant [21 x i8] c"prescanWithBlockSums\00", align 1 +@.str.21 = private unnamed_addr constant [24 x i8] c"prescanNP2WithBlockSums\00", align 1 +@.str.22 = private unnamed_addr constant [11 x i8] c"uniformAdd\00", align 1 +@.str.23 = private unnamed_addr constant [8 x i8] c"prescan\00", align 1 +@.str.24 = private unnamed_addr constant [11 x i8] c"prescanNP2\00", align 1 +@.str.25 = private unnamed_addr constant [17 x i8] c"deallocBlockSums\00", align 1 +@.str.26 = private unnamed_addr constant [26 x i8] c"vector::_M_realloc_insert\00", align 1 +@.str.27 = private unnamed_addr constant [28 x i8] c"vector::_M_insert_aux\00", align 1 +@.str.28 = private unnamed_addr constant [21 x i8] c"Comparing vectors: \0A\00", align 1 +@.str.29 = private unnamed_addr constant [36 x i8] c"Diff: data1[%d]=%d, data1[%d]=%d.\0A\00", align 1 +@.str.30 = private unnamed_addr constant [29 x i8] c"PASS! vectors are matching!\0A\00", align 1 +@.str.31 = private unnamed_addr constant [33 x i8] c"FAIL! 
vectors are NOT matching!\0A\00", align 1 +@0 = private unnamed_addr constant [22 x i8] c"_Z12histo_kernelPhlPj\00", align 1 +@1 = private unnamed_addr constant [50 x i8] c"_ZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_\00", align 1 +@2 = private unnamed_addr constant [34 x i8] c"_ZL7prescanILb1ELb0EEvPjPKjS0_iii\00", align 1 +@3 = private unnamed_addr constant [34 x i8] c"_ZL7prescanILb1ELb1EEvPjPKjS0_iii\00", align 1 +@4 = private unnamed_addr constant [23 x i8] c"_ZL10uniformAddPjS_iii\00", align 1 +@5 = private unnamed_addr constant [34 x i8] c"_ZL7prescanILb0ELb0EEvPjPKjS0_iii\00", align 1 +@6 = private unnamed_addr constant [34 x i8] c"_ZL7prescanILb0ELb1EEvPjPKjS0_iii\00", align 1 +@7 = private unnamed_addr constant [19 x i8] c"_ZL5pack2PjS_S_S_j\00", align 1 +@8 = private constant [176489 x i8] c"P\EDU\BA\01\00\10\00X\B1\02\00\00\00\00\00\02\00\01\01@\00\00\00h\82\02\00\00\00\00\00\00\00\00\00\00\00\00\00\07\00\01\00=\00\00\00\00\00\00\00\00\00\00\00\11\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\7FELF\02\01\013\07\00\00\00\00\00\00\00\02\00\BE\00e\00\00\00\00\00\00\00\00\00\00\00\C0\81\02\00\00\00\00\00\80x\02\00\00\00\00\00=\05=\00@\008\00\03\00@\00%\00\01\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00.text._ZL7prescanILb0ELb1EEvPjPKjS0_iii\00.nv.info._ZL7prescanILb0ELb1EEvPjPKjS0_iii\00.nv.shared._ZL7prescanILb0ELb1EEvPjPKjS0_iii\00.nv.global\00.nv.constant0._ZL7prescanILb0ELb1EEvPjPKjS0_iii\00.text._ZL7prescanILb0ELb0EEvPjPKjS0_iii\00.nv.info._ZL7prescanILb0ELb0EEvPjPKjS0_iii\00.nv.shared._ZL7prescanILb0ELb0EEvPjPKjS0_iii\00.nv.constant0._ZL7prescanILb0ELb0EEvPjPKjS0_iii\00.text._ZL7prescanILb1ELb1EEvPjPKjS0_iii\00.nv.info._ZL7prescanILb1ELb1EEvPjPKjS0_iii\00.nv.shared._ZL7prescanILb1ELb1EEvPjPKjS0_iii\00.nv.constant0._ZL7prescanILb1ELb1EEvPjPKjS0_iii\00.text._ZL7prescanILb1ELb0EEvPjPKjS0_iii\00.nv.info._ZL7prescanILb1ELb0EEvPjPKjS0_iii\00.nv.shared._ZL7prescanILb1ELb0EEvPjPKjS0_iii\00.nv.constant0._ZL7presca
nILb1ELb0EEvPjPKjS0_iii\00.text._ZL10uniformAddPjS_iii\00.nv.info._ZL10uniformAddPjS_iii\00.nv.shared._ZL10uniformAddPjS_iii\00.nv.constant0._ZL10uniformAddPjS_iii\00.text._ZL5pack2PjS_S_S_j\00.nv.info._ZL5pack2PjS_S_S_j\00.nv.shared._ZL5pack2PjS_S_S_j\00.nv.constant0._ZL5pack2PjS_S_S_j\00.text._ZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_\00.nv.info._ZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_\00.nv.shared._ZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_\00.nv.constant0._ZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_\00.text._Z12histo_kernelPhlPj\00.nv.info._Z12histo_kernelPhlPj\00.nv.shared._Z12histo_kernelPhlPj\00.nv.constant0._Z12histo_kernelPhlPj\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00_ZL7prescanILb0ELb1EEvPjPKjS0_iii\00.text._ZL7prescanILb0ELb1EEvPjPKjS0_iii\00.nv.info._ZL7prescanILb0ELb1EEvPjPKjS0_iii\00.nv.shared._ZL7prescanILb0ELb1EEvPjPKjS0_iii\00.nv.global\00threadIdx\00blockIdx\00blockDim\00gridDim\00$_ZL7prescanILb0ELb1EEvPjPKjS0_iii$_ZL12prescanBlockILb0EEvPjiS0_\00$_ZL7prescanILb0ELb1EEvPjPKjS0_iii$_ZL16clearLastElementILb0EEvPjS0_i\00$_ZL7prescanILb0ELb1EEvPjPKjS0_iii$_ZL16scanRootToLeavesPjj\00$_ZL7prescanILb0ELb1EEvPjPKjS0_iii$_ZL21storeSharedChunkToMemILb1EEvPjPKjiiiiiii\00$_ZL7prescanILb0ELb1EEvPjPKjS0_iii$_ZL22loadSharedChunkFromMemILb1EEvPjPKjiiRiS3_S3_S3_S3_S3_\00$_ZL7prescanILb0ELb1EEvPjPKjS0_iii$_ZL8buildSumPj\00$___ZZL7prescanILb0ELb1EEvPjPKjS0_iiiE6s_data__2725\00.nv.constant0._ZL7prescanILb0ELb1EEvPjPKjS0_iii\00_param\00_ZL7prescanILb0ELb0EEvPjPKjS0_iii\00.text._ZL7prescanILb0ELb0EEvPjPKjS0_iii\00.nv.info._ZL7prescanILb0ELb0EEvPjPKjS0_iii\00.nv.shared._ZL7prescanILb0ELb0EEvPjPKjS0_iii\00$_ZL7prescanILb0ELb0EEvPjPKjS0_iii$_ZL12prescanBlockILb0EEvPjiS0_\00$_ZL7prescanILb0ELb0EEvPjPKjS0_iii$_ZL16clearLastElementILb0EEvPjS0_i\00$_ZL7prescanILb0ELb0EEvPjPKjS0_iii$_ZL16scanRootToLeavesPjj\00$_ZL7prescanILb0ELb0EEvPjPKjS0_iii$_ZL21storeSharedChunkToMemILb0EEvPjPKjiiiiiii\00$_ZL7prescanILb0ELb0EEvPjPKjS0_iii$_
ZL22loadSharedChunkFromMemILb0EEvPjPKjiiRiS3_S3_S3_S3_S3_\00$_ZL7prescanILb0ELb0EEvPjPKjS0_iii$_ZL8buildSumPj\00$___ZZL7prescanILb0ELb0EEvPjPKjS0_iiiE6s_data__2426\00.nv.constant0._ZL7prescanILb0ELb0EEvPjPKjS0_iii\00_ZL7prescanILb1ELb1EEvPjPKjS0_iii\00.text._ZL7prescanILb1ELb1EEvPjPKjS0_iii\00.nv.info._ZL7prescanILb1ELb1EEvPjPKjS0_iii\00.nv.shared._ZL7prescanILb1ELb1EEvPjPKjS0_iii\00$_ZL7prescanILb1ELb1EEvPjPKjS0_iii$_ZL12prescanBlockILb1EEvPjiS0_\00$_ZL7prescanILb1ELb1EEvPjPKjS0_iii$_ZL16clearLastElementILb1EEvPjS0_i\00$_ZL7prescanILb1ELb1EEvPjPKjS0_iii$_ZL16scanRootToLeavesPjj\00$_ZL7prescanILb1ELb1EEvPjPKjS0_iii$_ZL21storeSharedChunkToMemILb1EEvPjPKjiiiiiii\00$_ZL7prescanILb1ELb1EEvPjPKjS0_iii$_ZL22loadSharedChunkFromMemILb1EEvPjPKjiiRiS3_S3_S3_S3_S3_\00$_ZL7prescanILb1ELb1EEvPjPKjS0_iii$_ZL8buildSumPj\00$___ZZL7prescanILb1ELb1EEvPjPKjS0_iiiE6s_data__2059\00.nv.constant0._ZL7prescanILb1ELb1EEvPjPKjS0_iii\00_ZL7prescanILb1ELb0EEvPjPKjS0_iii\00.text._ZL7prescanILb1ELb0EEvPjPKjS0_iii\00.nv.info._ZL7prescanILb1ELb0EEvPjPKjS0_iii\00.nv.shared._ZL7prescanILb1ELb0EEvPjPKjS0_iii\00$_ZL7prescanILb1ELb0EEvPjPKjS0_iii$_ZL12prescanBlockILb1EEvPjiS0_\00$_ZL7prescanILb1ELb0EEvPjPKjS0_iii$_ZL16clearLastElementILb1EEvPjS0_i\00$_ZL7prescanILb1ELb0EEvPjPKjS0_iii$_ZL16scanRootToLeavesPjj\00$_ZL7prescanILb1ELb0EEvPjPKjS0_iii$_ZL21storeSharedChunkToMemILb0EEvPjPKjiiiiiii\00$_ZL7prescanILb1ELb0EEvPjPKjS0_iii$_ZL22loadSharedChunkFromMemILb0EEvPjPKjiiRiS3_S3_S3_S3_S3_\00$_ZL7prescanILb1ELb0EEvPjPKjS0_iii$_ZL8buildSumPj\00$___ZZL7prescanILb1ELb0EEvPjPKjS0_iiiE6s_data__1377\00.nv.constant0._ZL7prescanILb1ELb0EEvPjPKjS0_iii\00_ZL10uniformAddPjS_iii\00.text._ZL10uniformAddPjS_iii\00.nv.info._ZL10uniformAddPjS_iii\00.nv.shared._ZL10uniformAddPjS_iii\00$___ZZL10uniformAddPjS_iiiE3uni__1283\00.nv.constant0._ZL10uniformAddPjS_iii\00_ZL5pack2PjS_S_S_j\00.text._ZL5pack2PjS_S_S_j\00.nv.info._ZL5pack2PjS_S_S_j\00.nv.shared._ZL5pack2PjS_S_S_j\00$_ZL5pack2PjS_S_S_j$_ZL8atomicOrPjj\00.nv.constant0._ZL
5pack2PjS_S_S_j\00_ZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_\00.text._ZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_\00.nv.info._ZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_\00.nv.shared._ZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_\00$_ZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_$_ZL8atomicOrPjj\00$___ZZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_E2sm__437\00$___ZZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_E5kcmax__439\00.nv.constant0._ZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_\00_Z12histo_kernelPhlPj\00.text._Z12histo_kernelPhlPj\00.nv.info._Z12histo_kernelPhlPj\00.nv.shared._Z12histo_kernelPhlPj\00$_Z12histo_kernelPhlPj$_ZL9atomicAddPjj\00$___ZZ12histo_kernelPhlPjE4temp__294\00.nv.constant0._Z12histo_kernelPhlPj\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00T\00\00\00\03\00\15\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\A7\00\00\00\03\00\1D\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\D4\00\00\00\03\00\1E\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\DF\00\00\00\01\00\1E\00\01\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\E9\00\00\00\01\00\1E\00\03\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\F2\00\00\00\01\00\1E\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\FB\00\00\00\01\00\1E\00\02\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\03\01\00\00\02\00\15\00\80\0F\00\00\00\00\00\00\A0\06\00\00\00\00\00\00E\01\00\00\02\00\15\00 
\16\00\00\00\00\00\00\10\05\00\00\00\00\00\00\8B\01\00\00\02\00\15\000\1B\00\00\00\00\00\00\A0\14\00\00\00\00\00\00\C7\01\00\00\02\00\15\00\D0/\00\00\00\00\00\00p\0C\00\00\00\00\00\00\18\02\00\00\02\00\15\00@<\00\00\00\00\00\00\F0\18\00\00\00\00\00\00v\02\00\00\02\00\15\000U\00\00\00\00\00\00\90\11\00\00\00\00\00\00\DC\02\00\00\03\00\0D\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\005\03\00\00\03\00\16\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\88\03\00\00\03\00\1F\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\B5\03\00\00\02\00\16\00\80\0F\00\00\00\00\00\00\A0\06\00\00\00\00\00\00\F7\03\00\00\02\00\16\00 \16\00\00\00\00\00\00\10\05\00\00\00\00\00\00=\04\00\00\02\00\16\000\1B\00\00\00\00\00\00\A0\14\00\00\00\00\00\00y\04\00\00\02\00\16\00\D0/\00\00\00\00\00\00\88\0B\00\00\00\00\00\00\CA\04\00\00\02\00\16\00X;\00\00\00\00\00\00(\17\00\00\00\00\00\00(\05\00\00\02\00\16\00\80R\00\00\00\00\00\00\80\11\00\00\00\00\00\00\8E\05\00\00\03\00\0E\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\E0\05\00\00\03\00\17\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\003\06\00\00\03\00 \00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00`\06\00\00\02\00\17\00\80\0F\00\00\00\00\00\00\A0\06\00\00\00\00\00\00\A2\06\00\00\02\00\17\00 \16\00\00\00\00\00\00\F8\07\00\00\00\00\00\00\E8\06\00\00\02\00\17\00\18\1E\00\00\00\00\00\00\A0\14\00\00\00\00\00\00$\07\00\00\02\00\17\00\B82\00\00\00\00\00\00x\0C\00\00\00\00\00\00u\07\00\00\02\00\17\000?\00\00\00\00\00\00\E8\18\00\00\00\00\00\00\D3\07\00\00\02\00\17\00\18X\00\00\00\00\00\00h\11\00\00\00\00\00\009\08\00\00\03\00\0F\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\8B\08\00\00\03\00\18\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\DE\08\00\00\03\00!\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\0B\09\00\00\02\00\18\00\80\0F\00\00\00\00\00\00\A0\06\00\00\00\00\00\00M\09\00\00\02\00\18\00 
\16\00\00\00\00\00\00\F8\07\00\00\00\00\00\00\93\09\00\00\02\00\18\00\18\1E\00\00\00\00\00\00\A0\14\00\00\00\00\00\00\CF\09\00\00\02\00\18\00\B82\00\00\00\00\00\00\88\0B\00\00\00\00\00\00 \0A\00\00\02\00\18\00@>\00\00\00\00\00\000\17\00\00\00\00\00\00~\0A\00\00\02\00\18\00pU\00\00\00\00\00\00\90\11\00\00\00\00\00\00\E4\0A\00\00\03\00\10\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00+\0B\00\00\03\00\19\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00h\0B\00\00\03\00\22\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\B0\0B\00\00\03\00\11\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\E8\0B\00\00\03\00\1A\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00;\0C\00\00\02\00\1A\00\98)\00\00\00\00\00\00h\04\00\00\00\00\00\00_\0C\00\00\03\00\12\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\B2\0C\00\00\03\00\1B\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00%\0D\00\00\03\00#\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00b\0D\00\00\02\00\1B\00\B0Z\00\00\00\00\00\00\90\04\00\00\00\00\00\00&\0E\00\00\03\00\13\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00|\0E\00\00\03\00\1C\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\B7\0E\00\00\03\00$\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\D8\0E\00\00\02\00\1C\00`\0C\00\00\00\00\00\00\A0\04\00\00\00\00\00\00%\0F\00\00\03\00\14\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\002\00\00\00\12\10\15\00\00\00\00\00\00\00\00\00\C0f\00\00\00\00\00\00\13\03\00\00\12\10\16\00\00\00\00\00\00\00\00\00\00d\00\00\00\00\00\00\BE\05\00\00\12\10\17\00\00\00\00\00\00\00\00\00\80i\00\00\00\00\00\00i\08\00\00\12\10\18\00\00\00\00\00\00\00\00\00\00g\00\00\00\00\00\00\14\0B\00\00\12\10\19\00\00\00\00\00\00\00\00\00\00\0F\00\00\00\00\00\00\D5\0B\00\00\12\10\1A\00\00\00\00\00\00\00\00\00\00.\00\00\00\00\00\00\80\0C\00\00\12\10\1B\00\00\00\00\00\00\00\00\00@_\00\00\00\00\00\00f\0E\00\00\12\10\1C\00\00\00\00\00\00\00\00\00\00\11\00\00\00\00\00\00\04/\08\00?\00\00\00\0C\00\00\00\04#\08\006\00\00\00\00\00\00\00\04\12\08\006
\00\00\00\00\00\00\00\04\11\08\006\00\00\00\00\00\00\00\04#\08\00?\00\00\00\00\00\00\00\04\12\08\00?\00\00\00@\00\00\00\04\11\08\00?\00\00\00@\00\00\00\04/\08\00>\00\00\00\18\00\00\00\04#\08\002\00\00\00\00\00\00\00\04\12\08\002\00\00\00\00\00\00\00\04\11\08\002\00\00\00\00\00\00\00\04#\08\00>\00\00\00\00\00\00\00\04\12\08\00>\00\00\00\C8\00\00\00\04\11\08\00>\00\00\00\C8\00\00\00\04/\08\00=\00\00\00\0F\00\00\00\04#\08\00.\00\00\00\00\00\00\00\04\12\08\00.\00\00\00\00\00\00\00\04\11\08\00.\00\00\00\00\00\00\00\04#\08\00=\00\00\00\00\00\00\00\04\12\08\00=\00\00\00h\00\00\00\04\11\08\00=\00\00\00h\00\00\00\04/\08\00<\00\00\00\0E\00\00\00\04#\08\00<\00\00\00\00\00\00\00\04\12\08\00<\00\00\00(\00\00\00\04\11\08\00<\00\00\00(\00\00\00\04/\08\00;\00\00\00\1C\00\00\00\04#\08\00(\00\00\00\00\00\00\00\04\12\08\00(\00\00\00\00\00\00\00\04\11\08\00(\00\00\00\00\00\00\00\04#\08\00'\00\00\00\00\00\00\00\04\12\08\00'\00\00\00\00\00\00\00\04\11\08\00'\00\00\00\00\00\00\00\04#\08\00&\00\00\00\00\00\00\00\04\12\08\00&\00\00\00\00\00\00\00\04\11\08\00&\00\00\00\00\00\00\00\04#\08\00%\00\00\00\00\00\00\00\04\12\08\00%\00\00\00\00\00\00\00\04\11\08\00%\00\00\00\00\00\00\00\04#\08\00$\00\00\00\00\00\00\00\04\12\08\00$\00\00\00\00\00\00\00\04\11\08\00$\00\00\00\00\00\00\00\04#\08\00#\00\00\00\00\00\00\00\04\12\08\00#\00\00\00\00\00\00\00\04\11\08\00#\00\00\00\00\00\00\00\04#\08\00;\00\00\00\00\00\00\00\04\12\08\00;\00\00\00\A0\00\00\00\04\11\08\00;\00\00\00\A0\00\00\00\04/\08\00:\00\00\00\1C\00\00\00\04#\08\00\1F\00\00\00\00\00\00\00\04\12\08\00\1F\00\00\00\00\00\00\00\04\11\08\00\1F\00\00\00\00\00\00\00\04#\08\00\1E\00\00\00\00\00\00\00\04\12\08\00\1E\00\00\00\00\00\00\00\04\11\08\00\1E\00\00\00\00\00\00\00\04#\08\00\1D\00\00\00\00\00\00\00\04\12\08\00\1D\00\00\00\00\00\00\00\04\11\08\00\1D\00\00\00\00\00\00\00\04#\08\00\1C\00\00\00\00\00\00\00\04\12\08\00\1C\00\00\00\00\00\00\00\04\11\08\00\1C\00\00\00\00\00\00\00\04#\08\00\1B\00\00\00\00\00\00\00\04\12\08\00\1B\00\00\00\00\00\00\00\04
\11\08\00\1B\00\00\00\00\00\00\00\04#\08\00\1A\00\00\00\00\00\00\00\04\12\08\00\1A\00\00\00\00\00\00\00\04\11\08\00\1A\00\00\00\00\00\00\00\04#\08\00:\00\00\00\00\00\00\00\04\12\08\00:\00\00\00\A0\00\00\00\04\11\08\00:\00\00\00\A0\00\00\00\04/\08\009\00\00\00\1C\00\00\00\04#\08\00\16\00\00\00\00\00\00\00\04\12\08\00\16\00\00\00\00\00\00\00\04\11\08\00\16\00\00\00\00\00\00\00\04#\08\00\15\00\00\00\00\00\00\00\04\12\08\00\15\00\00\00\00\00\00\00\04\11\08\00\15\00\00\00\00\00\00\00\04#\08\00\14\00\00\00\00\00\00\00\04\12\08\00\14\00\00\00\00\00\00\00\04\11\08\00\14\00\00\00\00\00\00\00\04#\08\00\13\00\00\00\00\00\00\00\04\12\08\00\13\00\00\00\00\00\00\00\04\11\08\00\13\00\00\00\00\00\00\00\04#\08\00\12\00\00\00\00\00\00\00\04\12\08\00\12\00\00\00\00\00\00\00\04\11\08\00\12\00\00\00\00\00\00\00\04#\08\00\11\00\00\00\00\00\00\00\04\12\08\00\11\00\00\00\00\00\00\00\04\11\08\00\11\00\00\00\00\00\00\00\04#\08\009\00\00\00\00\00\00\00\04\12\08\009\00\00\00\A0\00\00\00\04\11\08\009\00\00\00\A0\00\00\00\04/\08\008\00\00\00\1C\00\00\00\04#\08\00\0D\00\00\00\00\00\00\00\04\12\08\00\0D\00\00\00\00\00\00\00\04\11\08\00\0D\00\00\00\00\00\00\00\04#\08\00\0C\00\00\00\00\00\00\00\04\12\08\00\0C\00\00\00\00\00\00\00\04\11\08\00\0C\00\00\00\00\00\00\00\04#\08\00\0B\00\00\00\00\00\00\00\04\12\08\00\0B\00\00\00\00\00\00\00\04\11\08\00\0B\00\00\00\00\00\00\00\04#\08\00\0A\00\00\00\00\00\00\00\04\12\08\00\0A\00\00\00\00\00\00\00\04\11\08\00\0A\00\00\00\00\00\00\00\04#\08\00\09\00\00\00\00\00\00\00\04\12\08\00\09\00\00\00\00\00\00\00\04\11\08\00\09\00\00\00\00\00\00\00\04#\08\00\08\00\00\00\00\00\00\00\04\12\08\00\08\00\00\00\00\00\00\00\04\11\08\00\08\00\00\00\00\00\00\00\04#\08\008\00\00\00\00\00\00\00\04\12\08\008\00\00\00\A0\00\00\00\04\11\08\008\00\00\00\A0\00\00\00\010\00\00\01*\00\00\04\0A\08\00\0E\00\00\00@\01$\00\03\19$\00\04\17\0C\00\00\00\00\00\05\00 
\00\00\F0\11\00\04\17\0C\00\00\00\00\00\04\00\1C\00\00\F0\11\00\04\17\0C\00\00\00\00\00\03\00\18\00\00\F0\11\00\04\17\0C\00\00\00\00\00\02\00\10\00\00\F0!\00\04\17\0C\00\00\00\00\00\01\00\08\00\00\F0!\00\04\17\0C\00\00\00\00\00\00\00\00\00\00\F0!\00\03\1B\FF\00\04\1D\08\00x\06\00\00\10\14\00\00\04\1C\04\00x\0F\00\00\04\1E\04\00`\00\00\00\010\00\00\01*\00\00\04\0A\08\00\17\00\00\00@\01$\00\03\19$\00\04\17\0C\00\00\00\00\00\05\00 \00\00\F0\11\00\04\17\0C\00\00\00\00\00\04\00\1C\00\00\F0\11\00\04\17\0C\00\00\00\00\00\03\00\18\00\00\F0\11\00\04\17\0C\00\00\00\00\00\02\00\10\00\00\F0!\00\04\17\0C\00\00\00\00\00\01\00\08\00\00\F0!\00\04\17\0C\00\00\00\00\00\00\00\00\00\00\F0!\00\03\1B\FF\00\04\1D\08\00x\06\00\00\10\14\00\00\04\1C\04\00x\0F\00\00\04\1E\04\00`\00\00\00\010\00\00\01*\00\00\04\0A\08\00 \00\00\00@\01$\00\03\19$\00\04\17\0C\00\00\00\00\00\05\00 \00\00\F0\11\00\04\17\0C\00\00\00\00\00\04\00\1C\00\00\F0\11\00\04\17\0C\00\00\00\00\00\03\00\18\00\00\F0\11\00\04\17\0C\00\00\00\00\00\02\00\10\00\00\F0!\00\04\17\0C\00\00\00\00\00\01\00\08\00\00\F0!\00\04\17\0C\00\00\00\00\00\00\00\00\00\00\F0!\00\03\1B\FF\00\04\1D\08\00x\06\00\00\10\14\00\00\04\1C\04\00x\0F\00\00\04\1E\04\00`\00\00\00\010\00\00\01*\00\00\04\0A\08\00)\00\00\00@\01$\00\03\19$\00\04\17\0C\00\00\00\00\00\05\00 
\00\00\F0\11\00\04\17\0C\00\00\00\00\00\04\00\1C\00\00\F0\11\00\04\17\0C\00\00\00\00\00\03\00\18\00\00\F0\11\00\04\17\0C\00\00\00\00\00\02\00\10\00\00\F0!\00\04\17\0C\00\00\00\00\00\01\00\08\00\00\F0!\00\04\17\0C\00\00\00\00\00\00\00\00\00\00\F0!\00\03\1B\FF\00\04\1D\08\00x\06\00\00\10\14\00\00\04\1C\04\00x\0F\00\00\04\1E\04\00`\00\00\00\010\00\00\01*\00\00\04\0A\08\00,\00\00\00@\01\1C\00\03\19\1C\00\04\17\0C\00\00\00\00\00\04\00\18\00\00\F0\11\00\04\17\0C\00\00\00\00\00\03\00\14\00\00\F0\11\00\04\17\0C\00\00\00\00\00\02\00\10\00\00\F0\11\00\04\17\0C\00\00\00\00\00\01\00\08\00\00\F0!\00\04\17\0C\00\00\00\00\00\00\00\00\00\00\F0!\00\03\1B\FF\00\04\1D\08\00\D0\04\00\00\B8\06\00\00\04\1C\04\00\B8\0E\00\00\04\1E\04\00 \00\00\00\010\00\00\01*\00\00\04\0A\08\00/\00\00\00@\01$\00\03\19$\00\04\17\0C\00\00\00\00\00\04\00 \00\00\F0\11\00\04\17\0C\00\00\00\00\00\03\00\18\00\00\F0!\00\04\17\0C\00\00\00\00\00\02\00\10\00\00\F0!\00\04\17\0C\00\00\00\00\00\01\00\08\00\00\F0!\00\04\17\0C\00\00\00\00\00\00\00\00\00\00\F0!\00\03\1B\FF\00\04\1D\04\00\D0\04\00\00\04\1C\04\00\90)\00\00\04\1E\04\00P\00\00\00\010\00\00\01*\00\00\04\0A\08\003\00\00\00@\01@\00\03\19@\00\04\17\0C\00\00\00\00\00\07\008\00\00\F0!\00\04\17\0C\00\00\00\00\00\06\000\00\00\F0!\00\04\17\0C\00\00\00\00\00\05\00(\00\00\F0!\00\04\17\0C\00\00\00\00\00\04\00 
\00\00\F0!\00\04\17\0C\00\00\00\00\00\03\00\18\00\00\F0!\00\04\17\0C\00\00\00\00\00\02\00\10\00\00\F0!\00\04\17\0C\00\00\00\00\00\01\00\08\00\00\F0!\00\04\17\0C\00\00\00\00\00\00\00\00\00\00\F0!\00\03\1B\FF\00\04\1D\08\00\D0\07\00\00X9\00\00\04\1C\04\00\A8Z\00\00\04\1E\04\00P\00\00\00\010\00\00\01*\00\00\04\0A\08\007\00\00\00@\01\18\00\03\19\18\00\04\17\0C\00\00\00\00\00\02\00\10\00\00\F0!\00\04\17\0C\00\00\00\00\00\01\00\08\00\00\F0!\00\04\17\0C\00\00\00\00\00\00\00\00\00\00\F0!\00\03\1B\FF\00\04\1D\04\00\98\04\00\00\04\1C\04\00X\0C\00\00\04\1E\04\00@\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\0
0\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\
00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00
\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\0
0\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\EF\1F\E0\FD\03\00\00\18 \00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\E5\04\00\00\01\00\00\00\06\00\10\00\00\00\00\00\00\00\00\00\00\00\00\00\80g\02\00\00\00\00\00\00\11\00\00\00\00\00\00\03\00\00\00?\00\00\0C 
\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\85\00\00\00\08\00\00\00\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\80x\02\00\00\00\00\00\000\00\00\00\00\00\00\00\00\00\00\15\00\00\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\B2\00\00\00\08\00\00\00\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\80x\02\00\00\00\00\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00@\01\00\00\08\00\00\00\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\80x\02\00\00\00\00\00\000\00\00\00\00\00\00\00\00\00\00\16\00\00\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\F0\01\00\00\08\00\00\00\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\80x\02\00\00\00\00\00\000\00\00\00\00\00\00\00\00\00\00\17\00\00\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\A0\02\00\00\08\00\00\00\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\80x\02\00\00\00\00\00\000\00\00\00\00\00\00\00\00\00\00\18\00\00\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00:\03\00\00\08\00\00\00\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\80x\02\00\00\00\00\00\04\00\00\00\00\00\00\00\00\00\00\00\19\00\00\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00h\04\00\00\08\00\00\00\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\80x\02\00\00\00\00\00\040\00\00\00\00\00\00\00\00\00\00\1B\00\00\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00 
\05\00\00\08\00\00\00\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\80x\02\00\00\00\00\00\00\04\00\00\00\00\00\00\00\00\00\00\1C\00\00\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\06\00\00\00\05\00\00\00\C0\81\02\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\A8\00\00\00\00\00\00\00\A8\00\00\00\00\00\00\00\08\00\00\00\00\00\00\00\01\00\00\00\05\00\00\00\C4$\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\A8S\02\00\00\00\00\00\A8S\02\00\00\00\00\00\08\00\00\00\00\00\00\00\01\00\00\00\06\00\00\00\80x\02\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\040\00\00\00\00\00\00\08\00\00\00\00\00\00\00\01\00\01\01H\00\00\00h.\00\00\00\00\00\00d.\00\00@\00\00\00\04\00\06\00=\00\00\00\00\00\00\00\00\00\00\00\11 \00\00\00\00\00\00\00\00\00\00\00\00\00\00&\18\01\00\00\00\00\00\00\00\00\00\00\00\00\00\F0 \0A\0A\0A\0A.version 6.4\0A.target sm_61\0A.address_size 64.\00\F0\03func (.param .b32 \12\00\F5\0E_retval0) _ZL9atomicAddPjj\0A(\0A-\00-64\1F\00\11_\1C\00H_0,\0AS\00\0F&\00\04_1\0A)\0A;\83\00\12\128>\00/Or\82\00\02\08\1E\00\0F\81\00\06\0E%\00\09\80\00\F8\1F_ZL22loadSharedChunkFromMemILb0EEvPjPKjiiRiS3_\03\00\0E\90\00\0FI\00$\0E\BB\00\0FP\002\1D1\0B\01\0FP\00+\1F2P\00<\1F3\F0\00<\1F4P\00<\1F5P\00<\1F6P\00<\1F7P\00<\1F8P\00<\1A9f\03\F1\0312prescanBlockILb1C\00>iS0J\03\0F-\00\09\0F\E9\03\02\0F4\00\13/1,h\00\1F\1B2,\04g1store\14\01-To\12\01\01\01\00\0E\1F\04\0F<\00\17\0E\12\04\0FC\00%\0E\05\04\0FC\00\1E\1F2C\00/\1F3C\00/\1F4C\00/\1F5C\00/\1F6C\00/\1F7C\00/\1F8@\07\17\9FbuildSumP?\07\01\06\1D\00\04q\02\0B\B3\03\F5\026clearLastElement\22\03>S0_\E2\02\0F1\00\0D\0E\D7\02\0F8\00\1A\0E\CC\02\0F8\00\13\1C2\D6\00\00\F1\03\CERootToLeaves]\08\0F'\00\02\0Ff\08\05\0F.\00\03\0Fo\08!\1F1o\085\1F1o\08<\1F1o\08<\1F1o\08<\1F1o\08<\1F1o\08<\1F1o\08<\1F1o\08<\1F1o\08<\1F1o\08<\1F1o\08\1E\0FU\05\07\1F1\A9\07(\1F1\A9\07/\1F1\A9\07/\1F1\A9\07/\1F1\A9\07/\1F1\A9\07/\1F1\A9\07/\1F1\A9\07/\1F1\A9\07/\1F1\A9\07\0F\0Fn\0A\01\1F0\
03\0B\19\1F0\03\0B \1F0\03\0B \1E0\03\0B\0F\16\08\0F\1F0\16\08\1D\1F0\16\08$\1F0\16\08$\1F0\16\08\03\FF\14\0A.global .align 1 .b8 threadIdx[1];#\00\03\11b.\01\0F\22\00\0E?Dim\22\00\07Dgrid!\00\04\01\00o.weak \DC\09\0E\AAcudaMalloco\01\06\18\00\0EW\01\0F \00\02\00\9D\08f{\0A.loc\9E\00\118\9E\00!__\15\00\A0_depot0[16\C7\002regI\00;%SP\0F\00\15L\10\00\8932 %r<2>!\00\D3rd<3>;\0A\0Amov.u2\00\1B,e\00b;\0Acvta\8D\00\04%\00\13,\\\00\22ld\C8\00\02\18\00nrd2, [\CE\00\1E])\00\1F1)\00\01a0];\0Ast#\00\81[%SP+0],,\00\0A\16\00\128\16\00\222;\B6\00\01\D8\00a1, 999(\00\02g\00\00\EF\00\18[\9E\01\03M\00\AF1;\0Aret;\0A\0A}\D6\01\1A\FE\02FuncGetAttributes\E1\01\0D#\00\0E\EC\01\0F+\00\06\0F\F7\01\1B\1F1\F7\01Q\1F1\F7\01!\0E\D9\00\0F\02\02\0F\0E8\01\0F\0D\02\8DhDevice\B4\00\0E\0E\02\0E$\00\0F \06\03\0F,\00\0A\1F1,\00\18\1F2<\02\13\1F2<\02\1F\1D4<\02\1F2<\02\0C\1F2<\02\13\01_\00\04;\02\0F\D9\00\07\1D]4\00\1F14\00\06\0Fp\02\10\0E\9A\01\0Fq\02\12(32q\02\0B\15\00!12\16\00\09\86\02\1F3\86\02\15\1F3\86\02#2Get\CB\00\0E}\02\05\1B\00\07B\10\0F\1C\02\10?3[8W\04.\0F\1B\02\0D\1F3W\04\19\04\B3\01\0D\D0\00\0F\AA\01\06\0F\05\04W\F1\03OccupancyMaxActive\AB\09\FE\03sPerMultiprocessor\9F\01\0F;\00\16\0EB\06\0FC\00%\0EJ\04\0FC\00\1E\1F2\86\00/\1F3\88\02\13O4[32\89\02\1C\1D3\89\02\1F4\89\02\0C\1F4\89\02\19\133\89\02\0F\F1\00\1E\0F\BC\04\00\0FK\00$/2]w\07\00\0FL\00$\0F\1F\05\01\0F\98\00%\0F\A7\07\1D\097\05\186M\05\04,\00\2224-\00\183\CF\03\1F2\CF\03\15\1F2\CF\03L\9FWithFlags\D8\03(\05D\00\0E\E1\03\0FL\00'\0F\EA\030\0CL\00\1F2\98\008\1F3\98\008\1F4H\04\13O5[40\EC\08.\0FH\04\0D\1F5\EC\08\1C\0F\F9\00+\1F]\9C\040\0D\9A\01\0F\A5\040\0D:\02\0F\AE\041\0D\DB\02\0F\B7\041\0D|\03\0F\C0\04I\08-\00\1F3$\0A6\F6\15visible .entry _Z12histo_kernelPhlPj\9E\04\00\98\00\0F#\00\02\0E}\04\0F+\00\0D\1F1+\00\10\0F\8F\0C\1B\1F6\EB\07\18xpred 
%pu\0A\02\B5\03.19\DE\0E\100\B7\031\09.s\16\14\04\81\00\124\81\00\1FZ\B8\00\01\A0E4temp[102e\03\0F\F1\03\08\1F69\08\1D\0F'\01\02\0F(\03\0C\0F\86\01\02\0F\07\03\0C\0F\E5\01\0A\13]\B6\00#to\F5\12\04:\00\144\AC\02\01\1F\00\0A\1C\00\115\1C\00\1F4;\00\05\146*\03\0F;\00\00\117\1C\00\1F6\\\03\02\1F7\\\03\09)64\\\03+d5\D5\0Ba%tid.xt\00\00-\00\03\19\00#d8\89\03\05\B9\01Ord9,\EB\01\0A\03\B7\00\02\22\02\05H\01\110u\00\00\93\03$hlN\02311,c\00\832;\0Aadd.s\18\00'2,4\00+11\B0\08\170\00\04&rd#\0E\ABbar.sync 0\0B\04B%cta\DD\00\06\17\00\00\9F\01\13n\F3\00\81mul.lo.s\19\00#5,5\00#r4\95\00\01\17\00#6,'\01*r5\7F\04\03\96\04\186]\00\00\A5\01\16nu\00\08_\00#8,}\00+r7H\00\02\AF\01\F2\048;\0Abra.uni LBB6_1;\0A\08\004:\0Al\8C\00Ed13,}\00\01\B4\02\04f\01$4,\FB\01\92;\0Asetp.geU\014p1,9\00\01(\00\A2;\0A@%p1 brag\00\1B3w\00\132w\00'2:_\00422,p\02\17;\8E\00\192\8E\00\07\C8\01424,7\00\01'\00\02\AB\00\118\0D\00!5,\BE\01\013\00\07\13\02426,\1F\00\192u\02/27v\02 228,<\00\0A\97\00(9,\1D\00\09\BA\01\00!\01v1;\0A{ \0A\09\0B\05\00`\00\03\F0\03Ireg;\BC\09\01\0B\00\180j\06\00i\03\02\16\00\04\98\03929;\A4\09\01\0B\00\1C1\9E\06\02\16\00\04=\0F\1A33\00\03\B6\06a;\0Acall\A7\01\14(g\0B<, \0Az(R, \0A(\0A\84\00\22, \09\0071\0A)\DE\04\02\E1\05\01/\02\06\10\07\84;\0A} 
\0A\09ld\0B\01%6,\9A\02&;\0A\17\00\1D7\EE\01\02\18\00#8,\1E\00\00;\00\0F$\03\02\1F1\DD\02\05)3:\BD\03\07\D1\02%5,\B6\04\08\A7\01\1F9\B0\04\06\111A\02\0Bh\04$7,\1B\00\0Bi\04$8,f\00\01'\00\09r\02\1F1\E8\04!\132\B4\04\1A9r\02(1,\1D\00\187O\01\130\09\03/1]w\02=/18w\02\1F\1F0w\02Q\1F1w\02\00\04\86\09\0F\AB+\1E\0F\AA+<\0F\\\09\10\1F7G\11\1F\1F5\FF\0C\1E\1F7\FF\0C\18\02\A5\08\0E\D3\00\0F\D4\08\0A\0F(\01\03\1F]-\08\03\1F1\D0\03\03\08\AB\03\182\AA\03\05\DF\02\182-\04\0F\0F\0C\00\1F2\05\16\01\182X\00\1A3t\06\05\D5\07\05\E8\06\00\C7\00\00\F4-\06\D9\07\00P\03\143\DF\0B\0F\03\0C\0E\1F4\03\0C\0A\D3L26vlc_encode\05\04\91_sm64huff\05\1FES1_S\02\00\0C\1F\0C\0F?\00\1C\0E;\0C\0FG\00)\1F1G\003\1F2G\003\1F3G\003\1F4G\003\1F5G\003\1F6G\003\1F7z\04\13_8[168\D7\0D\1D-13\D8\0D\1E6\90\04?143\DA\0D\0A\0F\D8\00\1C\BFE2sm[12288]V\00\03\00\AD\03\0FW\00 o5kcmaxH\0E\0A\1F8H\0E\19\128\0C\05\0F\B5\01#\0F\\\05\00\1F7P\00(\1F6P\00\00\1F6P\00(\1F5P\00\00\1F5P\00(\0F%\13\01\1F4P\00(/3]\D8\0F\00\0FP\00(\0F\F4\0F\02\0FP\00(\0F<\07\02\0FP\00(\0F,\10\07\02\91\0C\1F8\F1\0F\02\06:\0F\0F<\00\03\03E\0F\1F7=\00\03\152C\0F\0F>\00\04\143P\10\0F>\00\01\1345\0E\0F>\00\06\145K\10\0F>\00\01\03\C7\10\1F5>\00\06\197\07\11\0B>\00\158,\0B\0F>\00\04\1F9a\11\08\07\87\0B\0F6\01\03\04\92\0B\0F>\00\03\03\B1\0E\0F6\01\06/23\A2\11\08\04\F6\0E\1F3\F1\08\03\1F4\A5\11\03\0F\A6\11\04/20\04\15\03*18\18\00\03\05\15+d1\1A\12\144\0C\0C\1B4\18\00\03x\00\1B1x\00\145\F1\09\09D\11\1F6D\11\03\1F7D\11\05$8,2\11\1F7\B6\0D\04\078\0E$0,2\00\1A9\0D\0A%64\94\0C\09\17\00\02\BB\00\1A9\1F\10;5, +\01\148[\01\0A\EA\12;1, Q\00#10\FD\0C\0A\E7\12\00\8B\10\0F\F9\06#\0F\1A\0E\03\157o\10\09\E9\01\03\C0\12\0D\A5\10\04\BB\10L, 105\02$12N\02\1F8\DA\10\02|7, 20485\00\168\8F\10\07`\0B\170L\0B\05\E6\0FEd31,o\01\09M\0F432, \00\0AM\0F433,P\00\01'\00\08\E5\0E\132\95\0B)3]{\00&4,\05\01\0AL\00$5,!\00\03L\00\07\F2\13\2235\AF\01\0A'\0C\1F6\80\0C\02?d37\C8\00\05$8, 
\00\0B\C8\00$9,Q\00\01'\00\08\C8\00\133\C8\00\199\C8\00640,\98\01\09L\00441,!\00\03L\00\07\C8\00%41\EB\11\07L\00\1F2\F0\0C\01$d4\F2\0C\1B61\13444, \00\0A\C7\00445,P\00\01'\00\08\C7\00\03$\0D:45]\0A\03\139x\19\1A4M\15\09-\03%36-\03\07\CC\11\138\CC\11\138\A9\14\03i\00&5,6\00\04\92\14\02M\00\03\92\14\00\22\00\1A3\8D\14;8_4[\00\132[\00\182[\00549,\B5\00\18;s\00\190t\00\07\FA\03!51l\004sub{\04452,\19\00\01:\00\03Z\01\03\1D\00$3,$\00\006\00%hrH\00$4,\81\00\01'\00\02!\01\148M\04\03\B6\04)54\C9\12)31\EC\02\05\09\15\101P\03\05:\00\0A\B7\12\03R\03\01#\00\0B\B9\12\03\1A\03$13\D8\15\193\04\01\03^\15L134]\EC\04\03h%*55\9F\00\195\C3\02\07\9F\00\1F6\9F\00\07%7,#\00\0C\9F\00%8,W\00\02+\00\09\9F\00\146\9F\00+8]=\01\03\DB\03*56\9E\00%9,\D4\05\06\9D\00F157,8\00\0A\9C\00\03r\03#13\A5\01\08\80\00 d1\97\03\05\0F\01F;\0Aor8\00)2,@\00/41U\06\038142\89\00\198\89\00\07\8D\02&9,d\06\09\F6\06460,!\00\01A\00\0F\93\06\04+60\0C\03\133\0C\03\183\0C\03)61\F4\02\08g\00$2,!\00\1F1\C9\03\04/62\CA\03\04>4:\0A\0D\16\02\C9\00\07\FC\04\05\19\00\1A2A\06\1F4y\05\05448, \00\0B\B2\04$9,R\00\01'\00\0A-\05\139\AE\00\0F\E8\19\05.17\E3\00\03\FE\08)17)\00\178\B7\08\07\D7\03\02\8A\01\1F8F\00\00\03\D2\03\1B9(\01\135(\01\175(\01\00y\0A\056\00\03\F2\04\22eq\8E\013p2,\22\00\110\F2\04\162\F2\04\1C1\E7\01\136\\\00\1E6\1A\17\03R\07\188_\01\07\18\00\1E9\81\00$ges\05$2,<\00\01+\00\02y\05\07\88\00\1B8\87\00\137\87\00\187\0B\02\163\E4\00\09\A5\06\1F14\08\00\03\19\00$2, \00\121o\03\03\18\00$3,\1F\00\1B1C\0A\03\E2\04\2213(\03)33\D7\02\02X\04\01$\00*-1C\04\03\DA\0A9135\9B\00\1F6\B4\00\05\1F7\B4\00\05$8, \00\1A1|\00$9, 
\00\1C2\B5\00\027\04\121<\0891399\00\02\0D\08\01$\00\0D\B5\00\03\DE\02)41\F8\04-24]\03\02\96\05\02?\1B\03\EF\00\0A\FA\04\03B\1B\01#\00\0CW\1D%7,W\00\02+\00\09\15\08\03\A5\09:127l\00&8,\A6\00\0Bl\00%9,#\00\0C\02\06\1B0l\00\199l\00\143\02\06\0B\17\05\02\CF\08\2214\19\02\1A4\F9\09#13\D0\03\1C4\22\08\138\A8\02\198\22\08\195\F4\01\07\DC\01\02c\04\01!\00\0F3\04\05+46c\00\139c\00\199c\00\1A7n\03\06N\04\02\E6\04\01!\00\0FP\04\05/48Q\04\04(10R\04\0AU\03\00\D0\03#neQ\04#3,!\00\02Q\04\173Q\04\0D\D5\05\05\A0\09\181\D3\1D)50b\02\07h\1F\172&\05\06\AE\0D\02\FA\0E\00\1D\00--1|\1B\02H\09)23%\02\02d\09-d50\0C\02`\09\22d5)\02\195\C0#,24\E7\1F\125\D1\17,24\D8\00\04\1D\0A812:A\00\0F\DB\01\00\135* \1C5C\00\04T\07\181U\07\00H\03\058\00\09\9C\00\08\F7\0E\0A`\05#4,9\00\00&\00\01\93\01\174\93\01\0C\EF\01$14x\00\09j\07\1C0\B7\02\05T\02\120\B7\02\1F0\B7\02\08?06;1\06\04/071\06\03)08\E8\00\0B1\06$1,<\00\01+\00\021\06\07\AB\0B,16\D5\00\05\C5\02\195\D5\00\1F9\80\05\04/10\80\05\04\03{\00\00 \00\094\06\03\C4\06/114\06\00\02\CE\12\2210\A2\05)12[\04\02\CE\12\01$\00\0C\7F\05\145\D4\0C\09&\0D/15\B4\00\05\1F6\B4\00\05$7, \00\0A4\06\03\1D\1F/174\06\00\03y\08\121\D3\06)189\00\02\C9\12\01$\00\0D\B5\00\137\07\0F\0A2\1F/094\06\04\05j\01\00\EF\00\0B\88#%1,#\00\0B\C8\05\05\8C#\03\EB\05)11n\08\131\C8\05,12i\0C\136\92\02\1A2\D3\06/13\9F\00\05&4,\D9\00\0B\9F\00%5,#\00\0C\9F\00%6,W\00\02+\00\09\9F\00\04\D3\06*16l\00\1F7\0B\01\07%8,#\00\0Cl\00\1B9l\00\0A\A2\0A#11\C5\07\1A2\FB\10\05\F1\00\1A6:\0B\01\DC\01\0F\DE\07\03\1F1\0B\01\06\03\9B\14\01#\00\0C\DE\07%3,W\00\02+\00\09\87\00\144\DE\07*3]b\02$5, \00\01\AE\00\0B]%\133\D4\00\0D5\05\04\A2\0A;16:\1A\00\045\0A\1916\0A\0BP\05\076\03\02\97\08\01!\00\0F\B3\05\04?127\B4\05\06\1F8\22\0B\04\0A!\0B\07\D9\06\1F9\D9\06\02\02\B0\08\142h\03\09g\07#5,P\00\00'\00\01\03\05\165\02\05,20\9B\00\04*\08)19j\07\1F4)\0D\04/55)\0D\04\00'\0F\03 
\00\0B:\07$7,R\00\01'\00\08\19\1F\03\91\03)57/\1F\1F2\87\0E\04\02\81\0B\02E\10\193P\13\05\D3\0E\1A5F\11(34\0C\16\0C\01\08\01\22\03)34\C7\00\02\EA\0E\01\1C\00\0A\C7\00\00\95\0E\03j\00\01'\00\09z\02\126\1D\13\193\1D\13/62D\01\04/63D\01\04464, \00\0B}\00$5,R\00\01'\00\08.\01\03>\11\196\CC\0F/36D\01\05#7,5\00\00%\00\08\B1\07\02C\0C\00 \00\195Z\16?66,\FC\1C)\0F]\16\00\136\17*\1A6D\01\02\22\06\1C3y\08$20Q\13(0:^\01\1F8^\01\05\1F9^\01\04\127\DB\01\1D6\DB\01471,R\00\01'\00\08H\01\129^\01)71\DD\08\02:\0D\00\1D\00\0B`)\127\03\04\194\E6\06\1F7\09\02\05\1F7\09\02\05474, \00\0B\AB\00$5,R\00\01'\00\07N\00#41\AB\00\00\09\023and\B1\04\02$\12\00\1D\00\1B3\B0\04\127H\08)42\AC\00\0F\22\11\05\1F7\22\11\05\1377\01\1D7\AC\00$9,R\00\01'\00\09\D1*\1C33\0B\127\90\06/43\13\0A\04-44\D4\11\02\18\00\04\91\04\1A7\FB\03\006\0D(32A\15\00\DE\0C\02\18\00\006\00\037\05\14l=\0A#6,e\00\00)\00\017\05\1767\05\0C\A1\0C$21\95\02\08B\16/48\8C\00\04\1C9\8C\00#1,\17\00\005\00\09\F0\11#63\AD$\0Ap\00\04B\06>22:\B0$\02\13\01\0DE\00\1F2E\00\06+3:t,\003\00\0B\DD$\05s0\06\D2\04,80\DA\14\02\85,\1F0\9B\01\03\00F\0D\04L\00\08\FD\00\02e\0D\02K\0D#51X\03\03b\00$1,i\00\00(\00\0F\0F\16\03)d8\A6\09\1F8_\03\05583,\96\03\09\B3\02\128_\03\1D8_\03485,R\00\01'\00\08\CD\00\193\86\15\06\FD\00\184\F4\01\06\17\00\185\FC\00\06\80\05\02'\07\02\0D\07\195\EC\1B,57%\02\02l\06\02@\07(56\86\08\02\BB\06\02\94\0E/58\8B)=/85\8B)\1E/59\8B)\1C\08\FBS\0F\8A)\14/60\01,\09\05n\07\198\84\02)63\9C\02\06\85\02\02l\07\02)\03/62\D1\16\03\09i\07*65J\00\09-\15#7,\22\00\02\DC\10\177;\04\0D\D0\06\04I\0F\182I\0F.66^\00\14l}\1A#8,\22\00\123~\1A\178_\00\0C\D3\0E$25_\00\185\E5\03\136^\02\08%\04\02\02\01\0C\AD\0A$27A\00\186\DB\04\0Fk\04\06\023\05\0FE\00\06\187\86\00\02\BE\0B/64k\04\02\09\E4\19/86k\04\02/68k\04\03/69k\04\03\02\F4\07\126A\01\1A9k\04$7,i\00\00(\00\03\1F\0A#32 
\00\03\15\08\198\8E\17\00x\07\181\89\03\00\A2\07\02\17\00\02`\00\06A\15\02\C3\07\137\C0\0F\06\90\07\02\C2\07\02^\00/74\E1\04\03\1A7\0B\01\0F\EA\08\04(76\DF\04\07z\00#7,\1E\00\0E\F5\12\1189\04)77\0F\05\129\19\09\1D8\19\09491,\81\00\01'\00\08}\00\1F8\0F\05\03\197\89\01\06(\02,80\DE\04\02\EB\05\02\EA\05(79U\01\02\7F\05\02X\08\1F8\E00>/91\DE\04\1E/82\DE\04O/83\DE\04\09/85\DE\04\02/86\DE\04\04\02\DD\02\02\DC\02/85\DE\04\03\1D8\FA\03\04\99\16\182\9A\16/88\F8\04\08#9,\22\00\02\F8\04\169\99\04,30^\00\04j\0E\192j\0E/92\E9\03\02/89q\04\04+90\83\03\02\A5\02\02\C4\02\198\83\03\00[\00\02 \00\0F\FF\15\00\119\BA\03#92\9C\03\03\0A\03$4,\94\00\01#\00\0F\80\08\04\199.\1F/95\A0\03\03/93\A0\03\03\02e\00\129}\08\0C\96\00\02\112\0A\94\0E497,\1C\00\0B\A0\03$8,\81\00\01'\00\08}\00\1F5\A0\03\03/96X\01\05\0D\7F\08\02^\00\02}\00\199\7F\08399,c\00\00&\00\0F\A1\03>\0F\0A2\1F\1F9\7F\08P/10\80\08\01\0A\EF\1E\04\AB\0E/30\0A\1D\05-02\E9\11\03\CC\01\1F9g\0F=\01\B1\00\01Z\00\09\81\00\04\CD\1A.00\E5\22$0,\A4\00\01*\00\02A\17\170\D5\03\0CS\0C$31\F2\00\09\A9\19?101\94\0B\04\0B\01\01\08.\14\00\9A\00\04\22\00\0BA\22504,W\00\02+\00\09\D3\00\04.\14\1B0a8\05\9D\18\1C4\85\00\1F6\A1$\04\03=\18\02\22\00\0C\85\00%8,V\00\02+\00\0A~\14\04\AD\22\1C0\0D\09$32%\01%2:\BE1\0Fc_\1D\0Fb_:\0F\BE/\10\1F984Q\1F984\1F\0E\D2\00\0F74\09\0E%\01\0F64\A7\05V\1A\0F54=\825pack2PjS\04\0D5@\0C \00\0E\F73\0F(\00\0A\1F1(\00\14\1F2(\00\14\08\9A3/32(\00\00\0F\1FD\1BO10[7}@\1E\168\EA\02\8C16 %rs<5\B72\1E9ED\1F5nO\0E\1F0\10\03\18\03\BD1\0F\FD\00\04\0FN1\01\0F1\00\09\1F3/1\01\0F1\00\09\1F2\101\01\0F1\00\09\1F1\F10\01\0F1\00\01\0F\AE@\0F\03k\18\0Fs@\0B\0F;\00\03\1E8\C6/\0E\D90\0F\C4/\04\1F0\C4/\08\04\95\1C\1F0>\00\06\1F2\C4/\08\04\A5\1B\1F2\86D\03\1F3\86/\02/11\C9\04\03\1F9\85/\03\1F7\89D\02\09\DF\0E\08l&\0A\09/\0F\1F@\03\0A\0E/\0A\B7.\0F*\1F8\1A.\03/19\93\00\04\05=2\0C\93\00\03\C7=%18\9B1\0B\16?\03\C7=\09\93\00\03\C3\08\09\F4=\188E\09\0F\83(\01\0B`\0E\04\07\1D/9;j(\00)48\9E\18$21f(\0C\9E\18\03\E6 
\0A\91\1F\1F2p.\03\05\B6\1E+40\1D\01\04\DE2\0D\1D\01\195\EDA\08\E8\13\04\87\1F?25]\E21\02\09\AA\1E\192\EA.\06\17\00\184\90\1C\064\01\02w\00\03\97$\1F4*2\02)25\F1\00\1C6\0F\14\03\1E\0Fd\0FP/68r\06\0F\05\C7\09\00\1A/\0F\A4\18\09\9110uniform\02N]S_iii\A8\18\0E$\00\0E\05n\0F,\00\0F\1D1`\18\0F,\00\07\1F2,\00\18\1F3,\00\11\0F\BC\18\1C\1E1\DC\\\0F\BC\18\0E\0DaK.31\C0c/3>\09K\0B\0F\BB\00\00_E3uni\ECJ\0A/11\DF\18\18\03L\18\0F$\01\08\1E]Z]\0F4\00\0D\0F\E5\\\01\0F4\00\06\0F\A9I\0F\0FE\02\00\0F\93\1C\0F\0F\A6\02\00\0F\F1\18\0E\1E3<\18\0E\BBY\0F\F6H\03\1F5\FDG\08\05H\19\0F^\18\00\1F6]\18\03\1F4&\1D\05\0F\1EX\00\1F2r\18\02\1C3\D9X\08\9D8\0Ag\13\1E4f\13\1C1f\13\141f\13)1_\83 \1F7\8CU\03\185\CD\18\06#\06\08=V\07\8C\11\15,\FB\05\0FkZ\03\0BH'\1F,!Z\03&0,\99\1A\09\EB\0F\04\B7 \0A\0A\22\00>\19\0Fd\03\0B\0F\ED!\01\06\02%\1A\08\89F\0A\BBE+32RB\16,7\00\1C5\81A\0F\A3\1A\00&8,6\00\1F7\A3\1A\03\1F8\D4A\00\06\C6\1B\0F\B9\01\1F\06\B7L'ld[\19\01\BF\01\0B\D0\22\1F5\AE\1A\03\1E6\AE\1A\0FJX\1D\0B\A9\19\00#\00\09I\01\06\A8\19\02\D9\19\08d\02\158\A9\19\070\00\172`Y\1F01\19\01-24\E7,%2,:\00\01\DA77elpP\19\009>\10,&\00\08N\00\175)\01\09\0B\1C\03\D8<\1E4S\19/19F\01\01\05Q\19\0B\94D\02P\19\152\C9\00\0B\06\04\02\12\01\1A2@\0E\07AY\0B\84\19'2,\0DN\04\DB\1A\07s\18\00#\00\1F]\E97\03\02\18\19\0705\132O\05\1F0\DD!\0B\137Ct\01\0Au\07\B7|.S0D\09\0F/\00\0E\0EO\09\0F7\00\19\1F17\00#\0Ee\09\0F7\00\12\1F37\00#\1F47\00#\1F5\B2\09\14?2[7\EBb/.23\B2\09\1F1\11U\09\0F\C5\00\0D\85E6s_data\05U\0F\C6\09\09\1F2\A5\22\1F\0FC\01\13\0E\9D\09\1F6?\00\18\0F\10\0A\00\1F5?\00\18\0F\01#\02\0F@\00\18\0F\10#\02\0F@\00\18\0F\1F#\02\0F@\00\18/0]\02\0A\0B\1F4\02\0A\0B\0F;\00\03\1F7\F2\22\08\03\F77\0F&T\0B\1F2&T\0C\0F\D7\22\01\1F0z\0A\03\1F8\D7\22\03\1F6e\0A\02\1F5\BC\07\02\1F6\13\22\02\197\1E\06\0F\7F!\02\181_\08\0A\C0\09\05v 
\0E&\1E\1E8\BF\0A\1C2\BF\0A\142\BF\0A?2_1\8C\09\9A\1A2\8B\09\0B\8CE\1Dr\E7\00\143\E7\00.2:\E4(\01B\01\0CC\00\1F3C\00\06\1D3\F4\19.22C\0B\0F\F6\04\16\0FQ\0B\0D\01\AF]\07\A9%\00\EE\04*44\18\00\144\18\00\1A8\18\00\135\18\00*52\18\00\146\18\00\1A6\18\00\137\18\00*60\18\00\148\18\00\0FQ\15>/124\00\00\1F14\00\02\1F1p\12\09\1F2p\12\02\162`s\0D2\00\1F32\00\02\163\B5*\0D\97\00\1F4\97\00\02\184)'\0D4\00\1F54\00\02\1554\00\0Fh\00\01\1F64\00\02\1564\00\1F54\00\00\1F74\00\02\1574\00\1F64\00\00\1F84\00\02\1584\00\1F74\00\00\1F94\00\02\189Hc\05\C5\13\1F\0A*\8A'\0F\E5\13\01\04\09\00\142\09\00\143\09\00\144\09\00\145\09\00\146\09\00\147\09\00\148\09\00<9\0A)\BEe\1C4\EB'\0B\AC\0B\01.\0C\0F\F8\02H\0Fqd\14\0F,\02\01\1F2g\00\02\04\F8\02\02\D6Y\08\90\01\0F\D8\8A\08\0Ft\01\0A,\0A)U\18\1F2\0DW\02/15\AD\06\02(16\C7\1B\0E!g\0A#1\1F1\99\18\03\191\BE\1E\0E\DE'\0A\89#\06\0DL\0F\BE\01?/20\B6\04 \1F2\B7\04\1F\1F5\B8\04\1E/163\00\00\1F43\00\02&4+\F3Q\0D3\00\1F53\00\02\04\B8\04\0F/g\02\1F63\00\02\04\B7\04\1F1\8B'\01\1F73\00\02\04\B6\04/203\00\00\1F83\00\02\04\B5\04,21\80\04\0FC\8B\16\0Fs\04@.\0A)\B1g\0FY\92)\0FX\92\FF\FF\FF\13\0F\06\11\11?3[8\94w\1D\1F2\95w\00\1F0P3\0D\1F3^e\1D\0F\02\01/\0Fge\01\0FY\001\1F8pe\01\0FY\001\1F7ye\01\0FY\001\1F6\82e\01\0FY\001\1F5\8Be\01\0FY\001\0FS\1C\01\0FX\001\0Fw\1C\01\0FX\001\0FDf\01\0F\B1\002\0FMf\01\0F\B2\00+\0Fhy>\05l\1C\05\F1}\0F~y\08\0Cyd\1F4xd\03\1F5wd\03\1F6vd\03\0B\C1v%64\A8\12\0A\D0u\04\AE\11\0E'L.3;aE\1B2\E4%\03\08\1D\0B\8AI\199{\12\06\15\18\1E9JE\1F1\80.\03\1C6}\1C\05\F2\1C\0EoP\09\22e\07,\13\191\1A4\08]R\04w\1C\05\B3-\189\0CA\08WR\0C=;\0A\BA\1C\0A\BB>\0FPA\03\06\FD\12\0C\A0\00\183\0A\0D\09\9BY\09\02\13\0B\0C6\0A\A3%'12c\1A\00\E5&\04Y[\04\00\13\1F4\A3\1B\02+56s\00\1F5?c\00/16\AB\0D\03\04\D0>\1A6s\00\145\CD\1C\0As\00\05\FA(\0CC\01\1F7\F26\01\1B84\0E\0Bi\0F\1B4.w\04\0F\1C/9]\E3\1A\1B\1E8\E3\1A\04\097\1A2\03\02\1C2\FE<\06\BB5\0F\8F\01\01\04\BB6\0ALx/25\99\0E\02'18\CC5\072\02\179\9E\1C\02\F09\00\C0\00\05\F5\00\156\E7\1C\07\F2\00\04\12g\0D\F2\00'8,fi*27\A5\1B
\1F8\E47\00/29\84\01\02+30\8B\0F\18s\DDf\00 \00\0F\DCf\1B.29\F9,/20\DCf\0B\0A\84\01\1F3K\10\03\04HW>35]\D5f\0E\0C7\04Og/6]\9C[\00\141\BE\15\0B\84\01.37\9E[\0F\06g\0F\1F4\BAf\01%39\CC\0E\0D\07\0E\0F\FA\9C\08\0F\F9\9C\8C\0Fg\0B\11\1F4X\7F2\1D0y\0B\1F7x\0B\0D\0FW\87\1A\03\14\0A\0F\F7\00\10\0EG\09\0F<\00\16\1F14\0A\01\0F=\00\15\0F\B6\08\04\1F3\D9A\01\1F48%\06(ld_\0B\04n\03\0F\17\13;\0F\EA,\1E\06h\9C\07~\11\0F\E0,\01\0F\CC)\00\0F\09&\00\0Al\06\0E\D9\05\0FEC\06\1C7\0C~\0AK\1B\1E7K\1B\1C4K\1B\144K\1B\1A4K\1B\1F1K\1B\03\1F9\D48\00\144d\1A/4_hS\01\0A\9CY\1E9\E8\1A\08A\00\0EdS\1F9\F7+I\0F\04\15\14\0F\03\15\1F\1C3\C0\16\0F\D9\9D\0B\0F\C4\16\1C\1F6x\1C\01\1F8\F6\17G\1F6\FDM\1F\0D\00\01\0EO\9E\0F\13-\02\0E|\14\0F\A9\A2\1E\0F\A8\A2\FF\FFM\0FC\08\11?5[4\0F\92\1D.16\B0y\1F92\08\0D\1F5U$\1C\0F\F4\00\22\0Fa$\00\0FK\00$\1F7m$\00\0FK\00$\1F6\D6\08\00\0FK\00$\0FB%\00\1F3K\00$\0F\0B\13\01\0FK\00$\0F\FE\12\01\0FK\00$\0F\F1\12\01\0F\97\00%\0F\E4\12\01\0F\98\00\1E\0F\D7\12T\0Bf\09\1F3o/\02\1F4\0A%1\0F\DA\84\03\08\8CL\0E\A6\07\09\BC\07\1E9H+\0E9w\0B\ED\0D\154:w\07R?\02\B6&\0D\0Aa\16,\101\09>e\03\94\0E\0A@ \1F7\03\88\02\188\EE/\0F\C0/\1F\09\F4O\0FI\12\01\1F1\97\1F\02\0F\028\03\06c%\0C\\l\0A\EB.\0B2\01\03\19\00\1B4\E4r\03\E7\1E\0D@Q\03d.\1F1k.\00\1F5k.\0B\0A:\01\0A\B1\12\0Fk.#\0E;.,15\9A\0F\0F:\A9\16\0F9\A9\09\0F\F7\98\1B.16\EA\06\0FL5 \1D7\FC\06\0F\FB\06\0E\1F6\0D\99\1D\0E\E7\00\0F\D7P\0B\0F:\8D\00\0F\C63\05\09\C7E\0F\D63\06\0A\D8\0C\05\8F(\0F\EC3\02\08\1E\0D\04D\89\00\09\00\0F\\\\\00'32\130\0A\074\1B1\FD\0D<6_6[\00\142[\00\1F2|U\04\0FKI\03\1D7}\00\0A\16F\04\F7\16\09\14F<6_4\7F\00\143\7F\00/3:#\05\00\0D:\0E\1F2\A8)\16\1F8\03\17\01\0A\A7)\1F133\08\0BuM\0Ey\04\0Az\17\05\C8\10/12\DD\9C\02\0A=\04\1F4\81$\0A.12\90\00\0C\923\0Fb3\01\1F6\A0$\01\0B\05N\1E8\B25\0F\DA\15\01\07p\14\03\BC2\06\9Bl\0B\FD\06/20\BB$\01.40\CC\14\0Ew\00\0C\CD\14\0F\E7M\00/23 
L\01)40\E3\17$25\B62\09\CF\8F\0C\CD2\0F\BE\00\01\1F6\CD2\01\05\02I\0C\CD2\0A_\00\169\E42/28\BE\00\03\1F9\C3\11\09\04\B4\15\0F\86M\04\03\96\15\0D\1BG\02\0E\09\08\1F\13\07\E2J9rd5]\00\09\BF&\07\B2W\03\F0Q\0D\CAc\07]\00\1F7sj\00\0A\C0\01'32Zj\0A\98f$8]oj\0A\F0\03\144\F0\03\09\92J\1D3[\08\05\F3K\02\1E\00\0FT\05\03\1D3O\04\145_\00,5:\ECi\06\7F\07\04\D8\01\03\BAJ\0EPK\04\87\05\1F6\88\05\06\08@?\195\BD\00\0FUV\0D\0D\B5\07\0F\96\B0\0F\0F\95\B0\98\0F2\08\11O7[24_\17/\0E~=/153\08\0D\0Fp\8D \0F\FB\00\14\0F\ED<\01\0FA\00\19\0F\F9<\01\0F\82\00\12\0FK\0D;\0A\8FN\0Fb<\07\1E2X\16\1C7X\16\147.\03\1B7\C8O\0F\8B1\02\02\EA\03\0BSs\04\C6\1F\1F-\E6\0D\03\1F5\A6\08\01\0A\1A\06\02af\0B\B9\05\09\98\1F\0FX\00\00\1F8eZ\08\06{\0D\08o\00\0F\7F\0D%\1F9~\0D\09\1F8~\0D\02/16~\0D4\1E9}\0D\0E\B9\0C\1E2\E8\00\0F4\0D\1D\0A\863\0B\D8\97\06\FE\85\0A;\02\142;\02\0D\FB]\0F\EF\B4\05\0F\EE\B4L\0F\D9\04\11O8[56\0B\0D/\0E\90\9B\1F19\1C\0E\1F8\D9\04!\0F\F1\00\00\0F\8E\04\11\0EV\01\0F#^&\0Fo\04\00\0F<\0D\12\1F16\0D\11\04\E4\80/186\0D\07\0A\E1@\0A\F0y\09\C3\84\02yl\0B\F3\04\01\C2\7F\0As\00\03\B8\02\0A\C7\84\09B\0D\07\EFQ\03\9C\04\0E\1F\01\0FU\12\04\0E\1C\0D\0E?\12\0F\8F\0D\04\04B\12\09\8F\0D\0C\85\85\141z\82\1B1{\82\09\A5N\0A\B0Z\0F\91\0D\02/11:7\02\1F0\C6@L/28\93\0D\03\1F4\93\0D\03\1F5\E71\01\0E\07\99\0E\93\0D\0F\F5@\1A\0E\052\1F3\052\03/24;@\00$19d\0D\0C%@/21\93\0D\01\0FkY\08\0B\93\0D\1E4w\00\0FkY\0D)44kY\05d\0D\0F\93\0D\02\02\17\0D\0C{\07*28>}\0F\BE\00\01\09dY\1F9\93\0D\03\06\E1?\09_\00&31\F8?/30\BE\00\02/31\93\0DX\162\93\0D\0F0\\\00\0A\8Dw\1F6\8B\00\03\1F7\C1\0D\04\03H<\0D\DCh\16,p\08,8;c\0D\00\EA(\09\E0\07\1F0\E9\00\04\05\98_\0F\B7\9F\04\1F67E\08\09\A7\88,34)5\0Be(\0B\EF\00/14\F0\00\04\04\0B\15\0E\1F|\196\F6\9E\1F5\E5w\00*16\B7\0E\176\CDw\1A4\C3\00\1F6\FA\0D\02\01*\05\03\09\00\1B:\1A\00\03s\0E\0A#\86-37\F2\15\1A3MY\0F\C4\06\04\1F8\C5\06\06/6:\B1B\19\1F1\B1B\1B\1F1\B1B#\1F1\B1B#\1F1\B1B#\1F1\B1B#\1F1\B1B#\1F1\B1B*\1F9\B1Bs\1F1\B1B(\1F9\B1B.\1F1\B1B+\1F1\B1B+\1F1\B1B,\1F1\B1B,\1F1\B1B,\1F1\B1B\FF\A9\1C9\0E\11
\149\C9\05\1F9\B1B\CA\149B\0C\1F9\B1B&\07C\00\0F\B1B%\1F1\B1B\FF\FF\FE\1F1\B1B\FF\FF\FF\FFo\1F1\B1B\83\0F\16\05\09\0F\9A\CC\FF\FF\FF\13\0F0\C3\11\1E0\B1B\0F\06\11 \0E\12\1B/41E\C3\0D\1F0\C3B;\1F1\C3BE\1F1\C3BE\1F1\C3BE\1F1\C3BE\1F1\C3BE\1F1\C3B\22\0F\B1\002\0F\C5/\00\0F\B0\00\10\0F\C3BE\1F1\C3BE\1F1\C3BR\1F3\AE!\02\1F4\C3B\83\1F5\C3B\05\0F\BB\22\0B\0BF_\02\F7\8A\0F\C3B\17\1F7\C3B\10\1F8\C3B\02\0E2\93\0E\B9\A7\0F\C4B\17\0A\EC\1D\1F1\C6B#\0E\BA\12\0F\C7B\01\05RB\0C\A3\00\0F\C7B\0D\1F3\C7B\11\1E4\C7B\0ETB\0F\C7B\16\1F5\C7B\11\1F6\C7B\03\03u\B7\0F\C7B\1D\1F7\C7Bu\1F8\C7B1\1F9\C7B\19.20\C7B\0FZ\1F\00\0F\C7B\02/21\C7B8\1F8\C7B\02/32\F0A\01?29]\BE\1F\01(16\EE\BB\07\8A'\05\F8_\08_\A7\1C2Vw\142Vw\00\09\00\1F:\1A\AA\09\03\17\00\1F1UC\04\04\96B\1B1UC\035p\0D\C5\A6)34Np\1C3m\17\00\22\00\0A\14\16\1E8a>\1425w\00\D8\00\1E:C\9E\0C>\00/24?\00\06\0Etw/28\BCC\02\1F0\A5C\03\0B\B5\01\03Ia\0B\CF\12/37\D3C\03\176\EDn\07\90\02&7,y!\1E6\D3C\03~.\0Ap\01\03\BAC\0D\F8'\04\A9n\03\\\AB\0A\FA'\1D4\E6\C7\0F]\D8 \0F\\\D8\FF\FFM\0FV\0E\11\1E1s6\0FA. 
\1F8o=\1F/21o=9\1F1o=7\1F1o=7\1F1o=7\1F1o=7\1F1o=7\1F1o=7\1F1o=8\1F1o=8\1F1o=\FF\FF\0C\0FX=\12\0Fp\0A\05\06\0A \09p\0A\0C`n\142`n\1B2`n\0F\E6=\04\0F\EF\0B\04\1E5\E6=\0F\A0l\01\0F\E6=\03\1F6\E6=/\1F7\E6=p\1C7Y\01\142Y\01\1F2\12(\16\1F0\C3j\1B\1F0\C3j#\1F0\C3j#\1F0\C3j#\1F0\C3j#\1F0\C3j#\1F0\C3j-\1F2\C3jp\1F0\C3j+\1F2\C3j+\1F0\C3j+\1F0\C3j+\1F0\C3j,\1F0\C3j,\1F0\C3j,\1F0\C3j\FF\AC\1C2\C3j\142\C3j\1F2\C3j\CA\142\80j\1F2\C3j&\08C\00\0F\C3j!\1F0\C3j\FF\FF\FF\FF\8A\1F0\C3j\FF\FF\FFd\0F\86\03\08\0F\B2\EE\8C\0Ff\0E\11\1F3\BC\\d/23\BC\\1\1F0\BC\\(\1F0\BC\\)\1F0\BC\\\FF\92,23D\0D\143D\0D\1F3\BC\\%$23]\0C\1F3\BC\\#\08A\00\0F\BC\\\DA\1F0\BC\\\FF\1A\0FB\01\0C\0Fa\F4\98\0F\85\06\11\1E4\E2K\0F\85\06S\1F4\E1K4\1F0\E1K-\1F0\E1K-\1F0\E1K}\1C29b\1429b/24\E1K\FE\092\06\0B\F4\BA\04\98\13\0AK\01\142K\01\0F\D5\19\1B\1F1\D5\19\18\0F\E7A#\0F7\00\07\1F17\00#\0F\D5\19\0B\0F\E7A#\0E7\00\0F\D5\19\13\0F\E7A-/25\D5\19s\0F\E7A(/25\D5\19.\1F1\D5\19+\1F1\D5\19(\0F\E7A,\0E\A6\02\0F\D5\19\1C\0F\E7A,\1F0\E7A\FF\AC,25\14\07\145\14\07\1F5\D5\19\CA\145\F4\0C\1F5\D5\19&\07C\00\0F\D5\19\22\0F\E7A\FF\FF\FF\FF\8A\1F0\E7A\FF\FF\FFVP;\0A\0A}\0A\00\00\00\00\00", section ".nv_fatbin", align 8 +@__cuda_fatbin_wrapper = internal constant { i32, i32, i8*, i8* } { i32 1180844977, i32 1, i8* getelementptr inbounds ([176489 x i8], [176489 x i8]* @8, i64 0, i64 0), i8* null }, section ".nvFatBinSegment", align 8 +@__cuda_gpubin_handle = internal global i8** null, align 8 +@llvm.global_ctors = appending global [2 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @_GLOBAL__sub_I_main_test_cu.cu, i8* null }, { i32, void ()*, i8* } { i32 65535, void ()* bitcast (void (i8*)* @__cuda_module_ctor to void ()*), i8* null }] + +; Function Attrs: noinline optnone uwtable +define dso_local zeroext i1 @_Z8InitCUDAv() #0 { +entry: + %call = call i32 @cudaSetDevice(i32 0) + %call1 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([19 x i8], [19 x i8]* @.str, i64 0, i64 0)) + ret i1 true +} + +declare dso_local i32 @cudaSetDevice(i32) #1 + +declare dso_local i32 @printf(i8*, ...) #1 + +; Function Attrs: noinline uwtable +define internal void @__cxx_global_var_init() #2 section ".text.startup" { +entry: + call void @_ZNSt8ios_base4InitC1Ev(%"class.std::ios_base::Init"* @_ZStL8__ioinit) + %0 = call i32 @__cxa_atexit(void (i8*)* bitcast (void (%"class.std::ios_base::Init"*)* @_ZNSt8ios_base4InitD1Ev to void (i8*)*), i8* getelementptr inbounds (%"class.std::ios_base::Init", %"class.std::ios_base::Init"* @_ZStL8__ioinit, i32 0, i32 0), i8* @__dso_handle) #3 + ret void +} + +declare dso_local void @_ZNSt8ios_base4InitC1Ev(%"class.std::ios_base::Init"*) unnamed_addr #1 + +declare dso_local void @_ZNSt8ios_base4InitD1Ev(%"class.std::ios_base::Init"*) unnamed_addr #1 + +; Function Attrs: nounwind +declare dso_local i32 @__cxa_atexit(void (i8*)*, i8*, i8*) #3 + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z12histo_kernelPhlPj(i8* %buffer, i64 %size, i32* %histo) #0 { +entry: + %buffer.addr = alloca i8*, align 8 + %size.addr = alloca i64, align 8 + %histo.addr = alloca i32*, align 8 + %grid_dim = alloca %struct.dim3, align 8 + %block_dim = alloca %struct.dim3, align 8 + %shmem_size = alloca i64, align 8 + %stream = alloca i8*, align 8 + %grid_dim.coerce = alloca { i64, i32 }, align 8 + %block_dim.coerce = alloca { i64, i32 }, align 8 + store i8* %buffer, i8** %buffer.addr, align 8 + store i64 %size, i64* %size.addr, align 8 + store i32* %histo, i32** %histo.addr, align 8 + %kernel_args = alloca i8*, i64 3, align 16 + %0 = bitcast i8** %buffer.addr to i8* + %1 = getelementptr i8*, i8** %kernel_args, i32 0 + store i8* %0, i8** %1 + %2 = bitcast i64* %size.addr to i8* + %3 = getelementptr i8*, i8** %kernel_args, i32 1 + store i8* %2, i8** %3 + %4 = bitcast i32** %histo.addr to i8* + %5 = getelementptr i8*, i8** %kernel_args, i32 2 + store i8* %4, 
i8** %5 + %6 = call i32 @__cudaPopCallConfiguration(%struct.dim3* %grid_dim, %struct.dim3* %block_dim, i64* %shmem_size, i8** %stream) + %7 = load i64, i64* %shmem_size, align 8 + %8 = load i8*, i8** %stream, align 8 + %9 = bitcast { i64, i32 }* %grid_dim.coerce to i8* + %10 = bitcast %struct.dim3* %grid_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %9, i8* align 8 %10, i64 12, i1 false) + %11 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 0 + %12 = load i64, i64* %11, align 8 + %13 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 1 + %14 = load i32, i32* %13, align 8 + %15 = bitcast { i64, i32 }* %block_dim.coerce to i8* + %16 = bitcast %struct.dim3* %block_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %15, i8* align 8 %16, i64 12, i1 false) + %17 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 0 + %18 = load i64, i64* %17, align 8 + %19 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 1 + %20 = load i32, i32* %19, align 8 + %21 = bitcast i8* %8 to %struct.CUstream_st* + %call = call i32 @cudaLaunchKernel(i8* bitcast (void (i8*, i64, i32*)* @_Z12histo_kernelPhlPj to i8*), i64 %12, i32 %14, i64 %18, i32 %20, i8** %kernel_args, i64 %7, %struct.CUstream_st* %21) + br label %setup.end + +setup.end: ; preds = %entry + ret void +} + +declare dso_local i32 @__cudaPopCallConfiguration(%struct.dim3*, %struct.dim3*, i64*, i8**) + +declare dso_local i32 @cudaLaunchKernel(i8*, i64, i32, i64, i32, i8**, i64, %struct.CUstream_st*) + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #4 + +; Function Attrs: noinline optnone uwtable +define dso_local i32 @_Z8runHistoPcPjjS0_(i8* %file, i32* %freq, i32 %memSize, i32* %source) #0 { +entry: + %file.addr = alloca i8*, align 8 + %freq.addr = alloca 
i32*, align 8 + %memSize.addr = alloca i32, align 4 + %source.addr = alloca i32*, align 8 + %f = alloca %struct._IO_FILE*, align 8 + %result = alloca i64, align 8 + %buffer = alloca i8*, align 8 + %blocks = alloca i32, align 4 + %partSize = alloca i32, align 4 + %totalNum = alloca i32, align 4 + %partialNum = alloca i32, align 4 + %dev_buffer0 = alloca i8*, align 8 + %dev_buffer1 = alloca i8*, align 8 + %dev_histo = alloca i32*, align 8 + %i = alloca i32, align 4 + %agg.tmp = alloca %struct.dim3, align 4 + %agg.tmp29 = alloca %struct.dim3, align 4 + %agg.tmp.coerce = alloca { i64, i32 }, align 4 + %agg.tmp29.coerce = alloca { i64, i32 }, align 4 + %agg.tmp34 = alloca %struct.dim3, align 4 + %agg.tmp36 = alloca %struct.dim3, align 4 + %agg.tmp34.coerce = alloca { i64, i32 }, align 4 + %agg.tmp36.coerce = alloca { i64, i32 }, align 4 + store i8* %file, i8** %file.addr, align 8 + store i32* %freq, i32** %freq.addr, align 8 + store i32 %memSize, i32* %memSize.addr, align 4 + store i32* %source, i32** %source.addr, align 8 + %0 = load i8*, i8** %file.addr, align 8 + %call = call %struct._IO_FILE* @fopen(i8* %0, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.1, i64 0, i64 0)) + store %struct._IO_FILE* %call, %struct._IO_FILE** %f, align 8 + %1 = load %struct._IO_FILE*, %struct._IO_FILE** %f, align 8 + %tobool = icmp ne %struct._IO_FILE* %1, null + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %entry + %2 = load i8*, i8** %file.addr, align 8 + call void @perror(i8* %2) + call void @exit(i32 1) #16 + unreachable + +if.end: ; preds = %entry + %3 = load %struct._IO_FILE*, %struct._IO_FILE** %f, align 8 + %call1 = call i32 @fseek(%struct._IO_FILE* %3, i64 0, i32 0) + %4 = load i32*, i32** %source.addr, align 8 + %5 = bitcast i32* %4 to i8* + %6 = load i32, i32* %memSize.addr, align 4 + %conv = zext i32 %6 to i64 + %7 = load %struct._IO_FILE*, %struct._IO_FILE** %f, align 8 + %call2 = call i64 @fread(i8* %5, i64 1, i64 %conv, %struct._IO_FILE* %7) + 
store i64 %call2, i64* %result, align 8 + %8 = load i64, i64* %result, align 8 + %9 = load i32, i32* %memSize.addr, align 4 + %conv3 = zext i32 %9 to i64 + %cmp = icmp ne i64 %8, %conv3 + br i1 %cmp, label %if.then4, label %if.end6 + +if.then4: ; preds = %if.end + %10 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %call5 = call i32 @fputs(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str.2, i64 0, i64 0), %struct._IO_FILE* %10) + br label %if.end6 + +if.end6: ; preds = %if.then4, %if.end + %11 = load %struct._IO_FILE*, %struct._IO_FILE** %f, align 8 + %call7 = call i32 @fclose(%struct._IO_FILE* %11) + %12 = load i32*, i32** %source.addr, align 8 + %13 = bitcast i32* %12 to i8* + store i8* %13, i8** %buffer, align 8 + store i32 2, i32* %blocks, align 4 + %14 = load i32, i32* %memSize.addr, align 4 + %div = udiv i32 %14, 32 + store i32 %div, i32* %partSize, align 4 + %15 = load i32, i32* %memSize.addr, align 4 + %conv8 = zext i32 %15 to i64 + %div9 = udiv i64 %conv8, 4 + %conv10 = trunc i64 %div9 to i32 + store i32 %conv10, i32* %totalNum, align 4 + %16 = load i32, i32* %partSize, align 4 + %conv11 = sext i32 %16 to i64 + %div12 = udiv i64 %conv11, 4 + %conv13 = trunc i64 %div12 to i32 + store i32 %conv13, i32* %partialNum, align 4 + %17 = load i32, i32* %partSize, align 4 + %conv14 = sext i32 %17 to i64 + %call15 = call i32 @cudaMalloc(i8** %dev_buffer0, i64 %conv14) + %18 = load i32, i32* %partSize, align 4 + %conv16 = sext i32 %18 to i64 + %call17 = call i32 @cudaMalloc(i8** %dev_buffer1, i64 %conv16) + %19 = bitcast i32** %dev_histo to i8** + %call18 = call i32 @cudaMalloc(i8** %19, i64 1024) + %20 = load i32*, i32** %dev_histo, align 8 + %21 = bitcast i32* %20 to i8* + %call19 = call i32 @cudaMemset(i8* %21, i32 0, i64 1024) + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %if.end6 + %22 = load i32, i32* %i, align 4 + %23 = load i32, i32* %totalNum, align 4 + %cmp20 = icmp slt i32 %22, %23 + br i1 
%cmp20, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %24 = load i8*, i8** %dev_buffer0, align 8 + %25 = load i8*, i8** %buffer, align 8 + %26 = load i32, i32* %i, align 4 + %idx.ext = sext i32 %26 to i64 + %add.ptr = getelementptr inbounds i8, i8* %25, i64 %idx.ext + %27 = load i32, i32* %partSize, align 4 + %conv21 = sext i32 %27 to i64 + %call22 = call i32 @cudaMemcpy(i8* %24, i8* %add.ptr, i64 %conv21, i32 1) + call void @_Z9gpuAssert9cudaErrorPKcib(i32 %call22, i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str.3, i64 0, i64 0), i32 88, i1 zeroext true) + %28 = load i8*, i8** %dev_buffer1, align 8 + %29 = load i8*, i8** %buffer, align 8 + %30 = load i32, i32* %i, align 4 + %idx.ext23 = sext i32 %30 to i64 + %add.ptr24 = getelementptr inbounds i8, i8* %29, i64 %idx.ext23 + %31 = load i32, i32* %partialNum, align 4 + %idx.ext25 = sext i32 %31 to i64 + %add.ptr26 = getelementptr inbounds i8, i8* %add.ptr24, i64 %idx.ext25 + %32 = load i32, i32* %partSize, align 4 + %conv27 = sext i32 %32 to i64 + %call28 = call i32 @cudaMemcpy(i8* %28, i8* %add.ptr26, i64 %conv27, i32 1) + call void @_Z9gpuAssert9cudaErrorPKcib(i32 %call28, i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str.3, i64 0, i64 0), i32 90, i1 zeroext true) + %33 = load i32, i32* %blocks, align 4 + %mul = mul nsw i32 %33, 2 + call void @_ZN4dim3C2Ejjj(%struct.dim3* %agg.tmp, i32 %mul, i32 1, i32 1) + call void @_ZN4dim3C2Ejjj(%struct.dim3* %agg.tmp29, i32 256, i32 1, i32 1) + %34 = bitcast { i64, i32 }* %agg.tmp.coerce to i8* + %35 = bitcast %struct.dim3* %agg.tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %34, i8* align 4 %35, i64 12, i1 false) + %36 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 0 + %37 = load i64, i64* %36, align 4 + %38 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 1 + %39 = load i32, i32* %38, align 4 + %40 = bitcast { i64, i32 }* %agg.tmp29.coerce to i8* + %41 = 
bitcast %struct.dim3* %agg.tmp29 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %40, i8* align 4 %41, i64 12, i1 false) + %42 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp29.coerce, i32 0, i32 0 + %43 = load i64, i64* %42, align 4 + %44 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp29.coerce, i32 0, i32 1 + %45 = load i32, i32* %44, align 4 + %call30 = call i32 @__cudaPushCallConfiguration(i64 %37, i32 %39, i64 %43, i32 %45, i64 0, i8* null) + %tobool31 = icmp ne i32 %call30, 0 + br i1 %tobool31, label %kcall.end, label %kcall.configok + +kcall.configok: ; preds = %for.body + %46 = load i8*, i8** %dev_buffer0, align 8 + %47 = load i32, i32* %partSize, align 4 + %conv32 = sext i32 %47 to i64 + %48 = load i32*, i32** %dev_histo, align 8 + call void @_Z12histo_kernelPhlPj(i8* %46, i64 %conv32, i32* %48) + br label %kcall.end + +kcall.end: ; preds = %kcall.configok, %for.body + %call33 = call i32 @cudaDeviceSynchronize() + %49 = load i32, i32* %blocks, align 4 + %mul35 = mul nsw i32 %49, 2 + call void @_ZN4dim3C2Ejjj(%struct.dim3* %agg.tmp34, i32 %mul35, i32 1, i32 1) + call void @_ZN4dim3C2Ejjj(%struct.dim3* %agg.tmp36, i32 256, i32 1, i32 1) + %50 = bitcast { i64, i32 }* %agg.tmp34.coerce to i8* + %51 = bitcast %struct.dim3* %agg.tmp34 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %50, i8* align 4 %51, i64 12, i1 false) + %52 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp34.coerce, i32 0, i32 0 + %53 = load i64, i64* %52, align 4 + %54 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp34.coerce, i32 0, i32 1 + %55 = load i32, i32* %54, align 4 + %56 = bitcast { i64, i32 }* %agg.tmp36.coerce to i8* + %57 = bitcast %struct.dim3* %agg.tmp36 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %56, i8* align 4 %57, i64 12, i1 false) + %58 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp36.coerce, i32 0, i32 0 + %59 = load i64, i64* %58, align 4 + %60 = getelementptr inbounds 
{ i64, i32 }, { i64, i32 }* %agg.tmp36.coerce, i32 0, i32 1 + %61 = load i32, i32* %60, align 4 + %call37 = call i32 @__cudaPushCallConfiguration(i64 %53, i32 %55, i64 %59, i32 %61, i64 0, i8* null) + %tobool38 = icmp ne i32 %call37, 0 + br i1 %tobool38, label %kcall.end41, label %kcall.configok39 + +kcall.configok39: ; preds = %kcall.end + %62 = load i8*, i8** %dev_buffer1, align 8 + %63 = load i32, i32* %partSize, align 4 + %conv40 = sext i32 %63 to i64 + %64 = load i32*, i32** %dev_histo, align 8 + call void @_Z12histo_kernelPhlPj(i8* %62, i64 %conv40, i32* %64) + br label %kcall.end41 + +kcall.end41: ; preds = %kcall.configok39, %kcall.end + %call42 = call i32 @cudaDeviceSynchronize() + br label %for.inc + +for.inc: ; preds = %kcall.end41 + %65 = load i32, i32* %partialNum, align 4 + %mul43 = mul nsw i32 %65, 2 + %66 = load i32, i32* %i, align 4 + %add = add nsw i32 %66, %mul43 + store i32 %add, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %67 = load i32*, i32** %freq.addr, align 8 + %68 = bitcast i32* %67 to i8* + %69 = load i32*, i32** %dev_histo, align 8 + %70 = bitcast i32* %69 to i8* + %call44 = call i32 @cudaMemcpy(i8* %68, i8* %70, i64 1024, i32 2) + %71 = load i32*, i32** %dev_histo, align 8 + %72 = bitcast i32* %71 to i8* + %call45 = call i32 @cudaFree(i8* %72) + %73 = load i8*, i8** %dev_buffer0, align 8 + %call46 = call i32 @cudaFree(i8* %73) + %74 = load i8*, i8** %dev_buffer1, align 8 + %call47 = call i32 @cudaFree(i8* %74) + ret i32 0 +} + +declare dso_local %struct._IO_FILE* @fopen(i8*, i8*) #1 + +declare dso_local void @perror(i8*) #1 + +; Function Attrs: noreturn nounwind +declare dso_local void @exit(i32) #5 + +declare dso_local i32 @fseek(%struct._IO_FILE*, i64, i32) #1 + +declare dso_local i64 @fread(i8*, i64, i64, %struct._IO_FILE*) #1 + +declare dso_local i32 @fputs(i8*, %struct._IO_FILE*) #1 + +declare dso_local i32 @fclose(%struct._IO_FILE*) #1 + +declare dso_local i32 @cudaMalloc(i8**, i64) #1 + +declare 
dso_local i32 @cudaMemset(i8*, i32, i64) #1 + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_Z9gpuAssert9cudaErrorPKcib(i32 %code, i8* %file, i32 %line, i1 zeroext %abort) #0 comdat { +entry: + %code.addr = alloca i32, align 4 + %file.addr = alloca i8*, align 8 + %line.addr = alloca i32, align 4 + %abort.addr = alloca i8, align 1 + store i32 %code, i32* %code.addr, align 4 + store i8* %file, i8** %file.addr, align 8 + store i32 %line, i32* %line.addr, align 4 + %frombool = zext i1 %abort to i8 + store i8 %frombool, i8* %abort.addr, align 1 + %0 = load i32, i32* %code.addr, align 4 + %cmp = icmp ne i32 %0, 0 + br i1 %cmp, label %if.then, label %if.end3 + +if.then: ; preds = %entry + %1 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %2 = load i32, i32* %code.addr, align 4 + %call = call i8* @cudaGetErrorString(i32 %2) + %3 = load i8*, i8** %file.addr, align 8 + %4 = load i32, i32* %line.addr, align 4 + %call1 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([21 x i8], [21 x i8]* @.str.13, i64 0, i64 0), i8* %call, i8* %3, i32 %4) + %5 = load i8, i8* %abort.addr, align 1 + %tobool = trunc i8 %5 to i1 + br i1 %tobool, label %if.then2, label %if.end + +if.then2: ; preds = %if.then + %6 = load i32, i32* %code.addr, align 4 + call void @exit(i32 %6) #16 + unreachable + +if.end: ; preds = %if.then + br label %if.end3 + +if.end3: ; preds = %if.end, %entry + ret void +} + +declare dso_local i32 @cudaMemcpy(i8*, i8*, i64, i32) #1 + +declare dso_local i32 @__cudaPushCallConfiguration(i64, i32, i64, i32, i64, i8*) #1 + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN4dim3C2Ejjj(%struct.dim3* %this, i32 %vx, i32 %vy, i32 %vz) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %struct.dim3*, align 8 + %vx.addr = alloca i32, align 4 + %vy.addr = alloca i32, align 4 + %vz.addr = alloca i32, align 4 + store %struct.dim3* %this, %struct.dim3** %this.addr, align 8 + store i32 %vx, i32* %vx.addr, align 4 + store i32 %vy, i32* %vy.addr, align 4 + store i32 %vz, i32* %vz.addr, align 4 + %this1 = load %struct.dim3*, %struct.dim3** %this.addr, align 8 + %x = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 0 + %0 = load i32, i32* %vx.addr, align 4 + store i32 %0, i32* %x, align 4 + %y = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 1 + %1 = load i32, i32* %vy.addr, align 4 + store i32 %1, i32* %y, align 4 + %z = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 2 + %2 = load i32, i32* %vz.addr, align 4 + store i32 %2, i32* %z, align 4 + ret void +} + +declare dso_local i32 @cudaDeviceSynchronize() #1 + +declare dso_local i32 @cudaFree(i8*) #1 + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z9printBitsji(i32 %val, i32 %numbits) #0 { +entry: + %val.addr = alloca i32, align 4 + %numbits.addr = alloca i32, align 4 + %i = alloca i32, align 4 + 
store i32 %val, i32* %val.addr, align 4 + store i32 %numbits, i32* %numbits.addr, align 4 + %0 = load i32, i32* %numbits.addr, align 4 + %sub = sub nsw i32 %0, 1 + store i32 %sub, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %1 = load i32, i32* %i, align 4 + %cmp = icmp sge i32 %1, 0 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %2 = load i32, i32* %val.addr, align 4 + %3 = load i32, i32* %i, align 4 + %shr = lshr i32 %2, %3 + %and = and i32 %shr, 1 + %add = add i32 48, %and + %call = call i32 @putchar(i32 %add) + br label %for.inc + +for.inc: ; preds = %for.body + %4 = load i32, i32* %i, align 4 + %dec = add nsw i32 %4, -1 + store i32 %dec, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +declare dso_local i32 @putchar(i32) #1 + +; Function Attrs: noinline optnone uwtable +define dso_local %class.INode* @_Z9BuildTreeRA256_j([256 x i32]* dereferenceable(1024) %frequencies) #0 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %frequencies.addr = alloca [256 x i32]*, align 8 + %trees = alloca %"class.std::priority_queue", align 8 + %ref.tmp = alloca %struct.NodeCmp, align 1 + %ref.tmp1 = alloca %"class.std::vector", align 8 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + %i = alloca i32, align 4 + %ref.tmp4 = alloca %class.INode*, align 8 + %childR = alloca %class.INode*, align 8 + %childL = alloca %class.INode*, align 8 + %parent = alloca %class.INode*, align 8 + store [256 x i32]* %frequencies, [256 x i32]** %frequencies.addr, align 8 + call void @_ZNSt6vectorIP5INodeSaIS1_EEC2Ev(%"class.std::vector"* %ref.tmp1) + invoke void @_ZNSt14priority_queueIP5INodeSt6vectorIS1_SaIS1_EE7NodeCmpEC2ERKS5_RKS4_(%"class.std::priority_queue"* %trees, %struct.NodeCmp* dereferenceable(1) %ref.tmp, %"class.std::vector"* dereferenceable(24) %ref.tmp1) + to label %invoke.cont unwind label %lpad + +invoke.cont: ; preds = %entry + call void 
@_ZNSt6vectorIP5INodeSaIS1_EED2Ev(%"class.std::vector"* %ref.tmp1) + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %invoke.cont + %0 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %0, 256 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load [256 x i32]*, [256 x i32]** %frequencies.addr, align 8 + %2 = load i32, i32* %i, align 4 + %idxprom = sext i32 %2 to i64 + %arrayidx = getelementptr inbounds [256 x i32], [256 x i32]* %1, i64 0, i64 %idxprom + %3 = load i32, i32* %arrayidx, align 4 + %cmp3 = icmp ne i32 %3, 0 + br i1 %cmp3, label %if.then, label %if.end + +if.then: ; preds = %for.body + %call = invoke i8* @_Znwm(i64 16) #17 + to label %invoke.cont6 unwind label %lpad5 + +invoke.cont6: ; preds = %if.then + %4 = bitcast i8* %call to %class.LeafNode* + %5 = load [256 x i32]*, [256 x i32]** %frequencies.addr, align 8 + %6 = load i32, i32* %i, align 4 + %idxprom7 = sext i32 %6 to i64 + %arrayidx8 = getelementptr inbounds [256 x i32], [256 x i32]* %5, i64 0, i64 %idxprom7 + %7 = load i32, i32* %arrayidx8, align 4 + %8 = load i32, i32* %i, align 4 + %conv = trunc i32 %8 to i8 + invoke void @_ZN8LeafNodeC2Eic(%class.LeafNode* %4, i32 %7, i8 signext %conv) + to label %invoke.cont10 unwind label %lpad9 + +invoke.cont10: ; preds = %invoke.cont6 + %9 = bitcast %class.LeafNode* %4 to %class.INode* + store %class.INode* %9, %class.INode** %ref.tmp4, align 8 + invoke void @_ZNSt14priority_queueIP5INodeSt6vectorIS1_SaIS1_EE7NodeCmpE4pushERKS1_(%"class.std::priority_queue"* %trees, %class.INode** dereferenceable(8) %ref.tmp4) + to label %invoke.cont11 unwind label %lpad5 + +invoke.cont11: ; preds = %invoke.cont10 + br label %if.end + +lpad: ; preds = %entry + %10 = landingpad { i8*, i32 } + cleanup + %11 = extractvalue { i8*, i32 } %10, 0 + store i8* %11, i8** %exn.slot, align 8 + %12 = extractvalue { i8*, i32 } %10, 1 + store i32 %12, i32* %ehselector.slot, align 4 + invoke void 
@_ZNSt6vectorIP5INodeSaIS1_EED2Ev(%"class.std::vector"* %ref.tmp1) + to label %invoke.cont2 unwind label %terminate.lpad + +invoke.cont2: ; preds = %lpad + br label %eh.resume + +lpad5: ; preds = %while.end, %invoke.cont24, %invoke.cont20, %invoke.cont18, %invoke.cont17, %invoke.cont15, %while.body, %while.cond, %invoke.cont10, %if.then + %13 = landingpad { i8*, i32 } + cleanup + %14 = extractvalue { i8*, i32 } %13, 0 + store i8* %14, i8** %exn.slot, align 8 + %15 = extractvalue { i8*, i32 } %13, 1 + store i32 %15, i32* %ehselector.slot, align 4 + br label %ehcleanup + +lpad9: ; preds = %invoke.cont6 + %16 = landingpad { i8*, i32 } + cleanup + %17 = extractvalue { i8*, i32 } %16, 0 + store i8* %17, i8** %exn.slot, align 8 + %18 = extractvalue { i8*, i32 } %16, 1 + store i32 %18, i32* %ehselector.slot, align 4 + call void @_ZdlPv(i8* %call) #18 + br label %ehcleanup + +if.end: ; preds = %invoke.cont11, %for.body + br label %for.inc + +for.inc: ; preds = %if.end + %19 = load i32, i32* %i, align 4 + %inc = add nsw i32 %19, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + br label %while.cond + +while.cond: ; preds = %invoke.cont25, %for.end + %call13 = invoke i64 @_ZNKSt14priority_queueIP5INodeSt6vectorIS1_SaIS1_EE7NodeCmpE4sizeEv(%"class.std::priority_queue"* %trees) + to label %invoke.cont12 unwind label %lpad5 + +invoke.cont12: ; preds = %while.cond + %cmp14 = icmp ugt i64 %call13, 1 + br i1 %cmp14, label %while.body, label %while.end + +while.body: ; preds = %invoke.cont12 + %call16 = invoke dereferenceable(8) %class.INode** @_ZNKSt14priority_queueIP5INodeSt6vectorIS1_SaIS1_EE7NodeCmpE3topEv(%"class.std::priority_queue"* %trees) + to label %invoke.cont15 unwind label %lpad5 + +invoke.cont15: ; preds = %while.body + %20 = load %class.INode*, %class.INode** %call16, align 8 + store %class.INode* %20, %class.INode** %childR, align 8 + invoke void 
@_ZNSt14priority_queueIP5INodeSt6vectorIS1_SaIS1_EE7NodeCmpE3popEv(%"class.std::priority_queue"* %trees) + to label %invoke.cont17 unwind label %lpad5 + +invoke.cont17: ; preds = %invoke.cont15 + %call19 = invoke dereferenceable(8) %class.INode** @_ZNKSt14priority_queueIP5INodeSt6vectorIS1_SaIS1_EE7NodeCmpE3topEv(%"class.std::priority_queue"* %trees) + to label %invoke.cont18 unwind label %lpad5 + +invoke.cont18: ; preds = %invoke.cont17 + %21 = load %class.INode*, %class.INode** %call19, align 8 + store %class.INode* %21, %class.INode** %childL, align 8 + invoke void @_ZNSt14priority_queueIP5INodeSt6vectorIS1_SaIS1_EE7NodeCmpE3popEv(%"class.std::priority_queue"* %trees) + to label %invoke.cont20 unwind label %lpad5 + +invoke.cont20: ; preds = %invoke.cont18 + %call22 = invoke i8* @_Znwm(i64 32) #17 + to label %invoke.cont21 unwind label %lpad5 + +invoke.cont21: ; preds = %invoke.cont20 + %22 = bitcast i8* %call22 to %class.InternalNode* + %23 = load %class.INode*, %class.INode** %childR, align 8 + %24 = load %class.INode*, %class.INode** %childL, align 8 + invoke void @_ZN12InternalNodeC2EP5INodeS1_(%class.InternalNode* %22, %class.INode* %23, %class.INode* %24) + to label %invoke.cont24 unwind label %lpad23 + +invoke.cont24: ; preds = %invoke.cont21 + %25 = bitcast %class.InternalNode* %22 to %class.INode* + store %class.INode* %25, %class.INode** %parent, align 8 + invoke void @_ZNSt14priority_queueIP5INodeSt6vectorIS1_SaIS1_EE7NodeCmpE4pushERKS1_(%"class.std::priority_queue"* %trees, %class.INode** dereferenceable(8) %parent) + to label %invoke.cont25 unwind label %lpad5 + +invoke.cont25: ; preds = %invoke.cont24 + br label %while.cond + +lpad23: ; preds = %invoke.cont21 + %26 = landingpad { i8*, i32 } + cleanup + %27 = extractvalue { i8*, i32 } %26, 0 + store i8* %27, i8** %exn.slot, align 8 + %28 = extractvalue { i8*, i32 } %26, 1 + store i32 %28, i32* %ehselector.slot, align 4 + call void @_ZdlPv(i8* %call22) #18 + br label %ehcleanup + +while.end: ; preds = 
%invoke.cont12 + %call27 = invoke dereferenceable(8) %class.INode** @_ZNKSt14priority_queueIP5INodeSt6vectorIS1_SaIS1_EE7NodeCmpE3topEv(%"class.std::priority_queue"* %trees) + to label %invoke.cont26 unwind label %lpad5 + +invoke.cont26: ; preds = %while.end + %29 = load %class.INode*, %class.INode** %call27, align 8 + call void @_ZNSt14priority_queueIP5INodeSt6vectorIS1_SaIS1_EE7NodeCmpED2Ev(%"class.std::priority_queue"* %trees) + ret %class.INode* %29 + +ehcleanup: ; preds = %lpad23, %lpad9, %lpad5 + invoke void @_ZNSt14priority_queueIP5INodeSt6vectorIS1_SaIS1_EE7NodeCmpED2Ev(%"class.std::priority_queue"* %trees) + to label %invoke.cont28 unwind label %terminate.lpad + +invoke.cont28: ; preds = %ehcleanup + br label %eh.resume + +eh.resume: ; preds = %invoke.cont28, %invoke.cont2 + %exn = load i8*, i8** %exn.slot, align 8 + %sel = load i32, i32* %ehselector.slot, align 4 + %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn, 0 + %lpad.val29 = insertvalue { i8*, i32 } %lpad.val, i32 %sel, 1 + resume { i8*, i32 } %lpad.val29 + +terminate.lpad: ; preds = %ehcleanup, %lpad + %30 = landingpad { i8*, i32 } + catch i8* null + %31 = extractvalue { i8*, i32 } %30, 0 + call void @__clang_call_terminate(i8* %31) #16 + unreachable +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt6vectorIP5INodeSaIS1_EEC2Ev(%"class.std::vector"* %this) unnamed_addr #0 comdat align 2 { +entry: + %this.addr = alloca %"class.std::vector"*, align 8 + store %"class.std::vector"* %this, %"class.std::vector"** %this.addr, align 8 + %this1 = load %"class.std::vector"*, %"class.std::vector"** %this.addr, align 8 + %0 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + call void @_ZNSt12_Vector_baseIP5INodeSaIS1_EEC2Ev(%"struct.std::_Vector_base"* %0) + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void 
@_ZNSt14priority_queueIP5INodeSt6vectorIS1_SaIS1_EE7NodeCmpEC2ERKS5_RKS4_(%"class.std::priority_queue"* %this, %struct.NodeCmp* dereferenceable(1) %__x, %"class.std::vector"* dereferenceable(24) %__s) unnamed_addr #0 comdat align 2 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %this.addr = alloca %"class.std::priority_queue"*, align 8 + %__x.addr = alloca %struct.NodeCmp*, align 8 + %__s.addr = alloca %"class.std::vector"*, align 8 + %agg.tmp = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + %agg.tmp3 = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %agg.tmp8 = alloca %struct.NodeCmp, align 1 + store %"class.std::priority_queue"* %this, %"class.std::priority_queue"** %this.addr, align 8 + store %struct.NodeCmp* %__x, %struct.NodeCmp** %__x.addr, align 8 + store %"class.std::vector"* %__s, %"class.std::vector"** %__s.addr, align 8 + %this1 = load %"class.std::priority_queue"*, %"class.std::priority_queue"** %this.addr, align 8 + %c = getelementptr inbounds %"class.std::priority_queue", %"class.std::priority_queue"* %this1, i32 0, i32 0 + %0 = load %"class.std::vector"*, %"class.std::vector"** %__s.addr, align 8 + call void @_ZNSt6vectorIP5INodeSaIS1_EEC2ERKS3_(%"class.std::vector"* %c, %"class.std::vector"* dereferenceable(24) %0) + %comp = getelementptr inbounds %"class.std::priority_queue", %"class.std::priority_queue"* %this1, i32 0, i32 1 + %1 = load %struct.NodeCmp*, %struct.NodeCmp** %__x.addr, align 8 + %c2 = getelementptr inbounds %"class.std::priority_queue", %"class.std::priority_queue"* %this1, i32 0, i32 0 + %call = invoke %class.INode** @_ZNSt6vectorIP5INodeSaIS1_EE5beginEv(%"class.std::vector"* %c2) + to label %invoke.cont unwind label %lpad + +invoke.cont: ; preds = %entry + %coerce.dive = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %agg.tmp, i32 0, i32 0 + store %class.INode** %call, 
%class.INode*** %coerce.dive, align 8 + %c4 = getelementptr inbounds %"class.std::priority_queue", %"class.std::priority_queue"* %this1, i32 0, i32 0 + %call6 = invoke %class.INode** @_ZNSt6vectorIP5INodeSaIS1_EE3endEv(%"class.std::vector"* %c4) + to label %invoke.cont5 unwind label %lpad + +invoke.cont5: ; preds = %invoke.cont + %coerce.dive7 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %agg.tmp3, i32 0, i32 0 + store %class.INode** %call6, %class.INode*** %coerce.dive7, align 8 + %comp9 = getelementptr inbounds %"class.std::priority_queue", %"class.std::priority_queue"* %this1, i32 0, i32 1 + %coerce.dive10 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %agg.tmp, i32 0, i32 0 + %2 = load %class.INode**, %class.INode*** %coerce.dive10, align 8 + %coerce.dive11 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %agg.tmp3, i32 0, i32 0 + %3 = load %class.INode**, %class.INode*** %coerce.dive11, align 8 + invoke void @_ZSt9make_heapIN9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS3_SaIS3_EEEE7NodeCmpEvT_SA_T0_(%class.INode** %2, %class.INode** %3) + to label %invoke.cont12 unwind label %lpad + +invoke.cont12: ; preds = %invoke.cont5 + ret void + +lpad: ; preds = %invoke.cont5, %invoke.cont, %entry + %4 = landingpad { i8*, i32 } + cleanup + %5 = extractvalue { i8*, i32 } %4, 0 + store i8* %5, i8** %exn.slot, align 8 + %6 = extractvalue { i8*, i32 } %4, 1 + store i32 %6, i32* %ehselector.slot, align 4 + invoke void @_ZNSt6vectorIP5INodeSaIS1_EED2Ev(%"class.std::vector"* %c) + to label %invoke.cont13 unwind label %terminate.lpad + +invoke.cont13: ; preds = %lpad + br label %eh.resume + +eh.resume: ; preds = %invoke.cont13 + %exn = load i8*, i8** %exn.slot, align 8 + %sel = load i32, i32* %ehselector.slot, align 4 + %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn, 0 + %lpad.val14 = insertvalue { i8*, 
i32 } %lpad.val, i32 %sel, 1 + resume { i8*, i32 } %lpad.val14 + +terminate.lpad: ; preds = %lpad + %7 = landingpad { i8*, i32 } + catch i8* null + %8 = extractvalue { i8*, i32 } %7, 0 + call void @__clang_call_terminate(i8* %8) #16 + unreachable +} + +declare dso_local i32 @__gxx_personality_v0(...) + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt6vectorIP5INodeSaIS1_EED2Ev(%"class.std::vector"* %this) unnamed_addr #0 comdat align 2 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %this.addr = alloca %"class.std::vector"*, align 8 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + store %"class.std::vector"* %this, %"class.std::vector"** %this.addr, align 8 + %this1 = load %"class.std::vector"*, %"class.std::vector"** %this.addr, align 8 + %0 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %0, i32 0, i32 0 + %_M_start = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl, i32 0, i32 0 + %1 = load %class.INode**, %class.INode*** %_M_start, align 8 + %2 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl2 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %2, i32 0, i32 0 + %_M_finish = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl2, i32 0, i32 1 + %3 = load %class.INode**, %class.INode*** %_M_finish, align 8 + %4 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %call = invoke dereferenceable(1) %"class.std::allocator"* @_ZNSt12_Vector_baseIP5INodeSaIS1_EE19_M_get_Tp_allocatorEv(%"struct.std::_Vector_base"* %4) + to label %invoke.cont unwind label %lpad + +invoke.cont: ; preds = %entry + invoke void 
@_ZSt8_DestroyIPP5INodeS1_EvT_S3_RSaIT0_E(%class.INode** %1, %class.INode** %3, %"class.std::allocator"* dereferenceable(1) %call) + to label %invoke.cont3 unwind label %lpad + +invoke.cont3: ; preds = %invoke.cont + %5 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + call void @_ZNSt12_Vector_baseIP5INodeSaIS1_EED2Ev(%"struct.std::_Vector_base"* %5) + ret void + +lpad: ; preds = %invoke.cont, %entry + %6 = landingpad { i8*, i32 } + cleanup + %7 = extractvalue { i8*, i32 } %6, 0 + store i8* %7, i8** %exn.slot, align 8 + %8 = extractvalue { i8*, i32 } %6, 1 + store i32 %8, i32* %ehselector.slot, align 4 + %9 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + invoke void @_ZNSt12_Vector_baseIP5INodeSaIS1_EED2Ev(%"struct.std::_Vector_base"* %9) + to label %invoke.cont4 unwind label %terminate.lpad + +invoke.cont4: ; preds = %lpad + br label %eh.resume + +eh.resume: ; preds = %invoke.cont4 + %exn = load i8*, i8** %exn.slot, align 8 + %sel = load i32, i32* %ehselector.slot, align 4 + %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn, 0 + %lpad.val5 = insertvalue { i8*, i32 } %lpad.val, i32 %sel, 1 + resume { i8*, i32 } %lpad.val5 + +terminate.lpad: ; preds = %lpad + %10 = landingpad { i8*, i32 } + catch i8* null + %11 = extractvalue { i8*, i32 } %10, 0 + call void @__clang_call_terminate(i8* %11) #16 + unreachable +} + +; Function Attrs: noinline noreturn nounwind +define linkonce_odr hidden void @__clang_call_terminate(i8* %0) #7 comdat { + %2 = call i8* @__cxa_begin_catch(i8* %0) #3 + call void @_ZSt9terminatev() #16 + unreachable +} + +declare dso_local i8* @__cxa_begin_catch(i8*) + +declare dso_local void @_ZSt9terminatev() + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt14priority_queueIP5INodeSt6vectorIS1_SaIS1_EE7NodeCmpE4pushERKS1_(%"class.std::priority_queue"* %this, %class.INode** dereferenceable(8) %__x) #0 comdat align 2 { +entry: + %this.addr = alloca 
%"class.std::priority_queue"*, align 8 + %__x.addr = alloca %class.INode**, align 8 + %agg.tmp = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %agg.tmp3 = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %agg.tmp7 = alloca %struct.NodeCmp, align 1 + store %"class.std::priority_queue"* %this, %"class.std::priority_queue"** %this.addr, align 8 + store %class.INode** %__x, %class.INode*** %__x.addr, align 8 + %this1 = load %"class.std::priority_queue"*, %"class.std::priority_queue"** %this.addr, align 8 + %c = getelementptr inbounds %"class.std::priority_queue", %"class.std::priority_queue"* %this1, i32 0, i32 0 + %0 = load %class.INode**, %class.INode*** %__x.addr, align 8 + call void @_ZNSt6vectorIP5INodeSaIS1_EE9push_backERKS1_(%"class.std::vector"* %c, %class.INode** dereferenceable(8) %0) + %c2 = getelementptr inbounds %"class.std::priority_queue", %"class.std::priority_queue"* %this1, i32 0, i32 0 + %call = call %class.INode** @_ZNSt6vectorIP5INodeSaIS1_EE5beginEv(%"class.std::vector"* %c2) + %coerce.dive = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %agg.tmp, i32 0, i32 0 + store %class.INode** %call, %class.INode*** %coerce.dive, align 8 + %c4 = getelementptr inbounds %"class.std::priority_queue", %"class.std::priority_queue"* %this1, i32 0, i32 0 + %call5 = call %class.INode** @_ZNSt6vectorIP5INodeSaIS1_EE3endEv(%"class.std::vector"* %c4) + %coerce.dive6 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %agg.tmp3, i32 0, i32 0 + store %class.INode** %call5, %class.INode*** %coerce.dive6, align 8 + %comp = getelementptr inbounds %"class.std::priority_queue", %"class.std::priority_queue"* %this1, i32 0, i32 1 + %coerce.dive8 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %agg.tmp, i32 0, i32 0 + %1 = load %class.INode**, %class.INode*** %coerce.dive8, align 8 + %coerce.dive9 = 
getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %agg.tmp3, i32 0, i32 0 + %2 = load %class.INode**, %class.INode*** %coerce.dive9, align 8 + call void @_ZSt9push_heapIN9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS3_SaIS3_EEEE7NodeCmpEvT_SA_T0_(%class.INode** %1, %class.INode** %2) + ret void +} + +; Function Attrs: nobuiltin +declare dso_local noalias i8* @_Znwm(i64) #8 + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZN8LeafNodeC2Eic(%class.LeafNode* %this, i32 %f, i8 signext %c) unnamed_addr #0 comdat align 2 { +entry: + %this.addr = alloca %class.LeafNode*, align 8 + %f.addr = alloca i32, align 4 + %c.addr = alloca i8, align 1 + store %class.LeafNode* %this, %class.LeafNode** %this.addr, align 8 + store i32 %f, i32* %f.addr, align 4 + store i8 %c, i8* %c.addr, align 1 + %this1 = load %class.LeafNode*, %class.LeafNode** %this.addr, align 8 + %0 = bitcast %class.LeafNode* %this1 to %class.INode* + %1 = load i32, i32* %f.addr, align 4 + call void @_ZN5INodeC2Ei(%class.INode* %0, i32 %1) + %2 = bitcast %class.LeafNode* %this1 to i32 (...)*** + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [4 x i8*] }, { [4 x i8*] }* @_ZTV8LeafNode, i32 0, inrange i32 0, i32 2) to i32 (...)**), i32 (...)*** %2, align 8 + %c2 = getelementptr inbounds %class.LeafNode, %class.LeafNode* %this1, i32 0, i32 1 + %3 = load i8, i8* %c.addr, align 1 + store i8 %3, i8* %c2, align 4 + ret void +} + +; Function Attrs: nobuiltin nounwind +declare dso_local void @_ZdlPv(i8*) #9 + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local i64 @_ZNKSt14priority_queueIP5INodeSt6vectorIS1_SaIS1_EE7NodeCmpE4sizeEv(%"class.std::priority_queue"* %this) #6 comdat align 2 { +entry: + %this.addr = alloca %"class.std::priority_queue"*, align 8 + store %"class.std::priority_queue"* %this, %"class.std::priority_queue"** %this.addr, align 8 + %this1 = load 
%"class.std::priority_queue"*, %"class.std::priority_queue"** %this.addr, align 8 + %c = getelementptr inbounds %"class.std::priority_queue", %"class.std::priority_queue"* %this1, i32 0, i32 0 + %call = call i64 @_ZNKSt6vectorIP5INodeSaIS1_EE4sizeEv(%"class.std::vector"* %c) + ret i64 %call +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local dereferenceable(8) %class.INode** @_ZNKSt14priority_queueIP5INodeSt6vectorIS1_SaIS1_EE7NodeCmpE3topEv(%"class.std::priority_queue"* %this) #0 comdat align 2 { +entry: + %this.addr = alloca %"class.std::priority_queue"*, align 8 + store %"class.std::priority_queue"* %this, %"class.std::priority_queue"** %this.addr, align 8 + %this1 = load %"class.std::priority_queue"*, %"class.std::priority_queue"** %this.addr, align 8 + %c = getelementptr inbounds %"class.std::priority_queue", %"class.std::priority_queue"* %this1, i32 0, i32 0 + %call = call dereferenceable(8) %class.INode** @_ZNKSt6vectorIP5INodeSaIS1_EE5frontEv(%"class.std::vector"* %c) + ret %class.INode** %call +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt14priority_queueIP5INodeSt6vectorIS1_SaIS1_EE7NodeCmpE3popEv(%"class.std::priority_queue"* %this) #0 comdat align 2 { +entry: + %this.addr = alloca %"class.std::priority_queue"*, align 8 + %agg.tmp = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %agg.tmp2 = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %agg.tmp6 = alloca %struct.NodeCmp, align 1 + store %"class.std::priority_queue"* %this, %"class.std::priority_queue"** %this.addr, align 8 + %this1 = load %"class.std::priority_queue"*, %"class.std::priority_queue"** %this.addr, align 8 + %c = getelementptr inbounds %"class.std::priority_queue", %"class.std::priority_queue"* %this1, i32 0, i32 0 + %call = call %class.INode** @_ZNSt6vectorIP5INodeSaIS1_EE5beginEv(%"class.std::vector"* %c) + %coerce.dive = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", 
%"class.__gnu_cxx::__normal_iterator"* %agg.tmp, i32 0, i32 0 + store %class.INode** %call, %class.INode*** %coerce.dive, align 8 + %c3 = getelementptr inbounds %"class.std::priority_queue", %"class.std::priority_queue"* %this1, i32 0, i32 0 + %call4 = call %class.INode** @_ZNSt6vectorIP5INodeSaIS1_EE3endEv(%"class.std::vector"* %c3) + %coerce.dive5 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %agg.tmp2, i32 0, i32 0 + store %class.INode** %call4, %class.INode*** %coerce.dive5, align 8 + %comp = getelementptr inbounds %"class.std::priority_queue", %"class.std::priority_queue"* %this1, i32 0, i32 1 + %coerce.dive7 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %agg.tmp, i32 0, i32 0 + %0 = load %class.INode**, %class.INode*** %coerce.dive7, align 8 + %coerce.dive8 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %agg.tmp2, i32 0, i32 0 + %1 = load %class.INode**, %class.INode*** %coerce.dive8, align 8 + call void @_ZSt8pop_heapIN9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS3_SaIS3_EEEE7NodeCmpEvT_SA_T0_(%class.INode** %0, %class.INode** %1) + %c9 = getelementptr inbounds %"class.std::priority_queue", %"class.std::priority_queue"* %this1, i32 0, i32 0 + call void @_ZNSt6vectorIP5INodeSaIS1_EE8pop_backEv(%"class.std::vector"* %c9) + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN12InternalNodeC2EP5INodeS1_(%class.InternalNode* %this, %class.INode* %c0, %class.INode* %c1) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %class.InternalNode*, align 8 + %c0.addr = alloca %class.INode*, align 8 + %c1.addr = alloca %class.INode*, align 8 + store %class.InternalNode* %this, %class.InternalNode** %this.addr, align 8 + store %class.INode* %c0, %class.INode** %c0.addr, align 8 + store %class.INode* %c1, %class.INode** %c1.addr, align 8 
+ %this1 = load %class.InternalNode*, %class.InternalNode** %this.addr, align 8 + %0 = bitcast %class.InternalNode* %this1 to %class.INode* + %1 = load %class.INode*, %class.INode** %c0.addr, align 8 + %f = getelementptr inbounds %class.INode, %class.INode* %1, i32 0, i32 1 + %2 = load i32, i32* %f, align 8 + %3 = load %class.INode*, %class.INode** %c1.addr, align 8 + %f2 = getelementptr inbounds %class.INode, %class.INode* %3, i32 0, i32 1 + %4 = load i32, i32* %f2, align 8 + %add = add nsw i32 %2, %4 + call void @_ZN5INodeC2Ei(%class.INode* %0, i32 %add) + %5 = bitcast %class.InternalNode* %this1 to i32 (...)*** + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [4 x i8*] }, { [4 x i8*] }* @_ZTV12InternalNode, i32 0, inrange i32 0, i32 2) to i32 (...)**), i32 (...)*** %5, align 8 + %left = getelementptr inbounds %class.InternalNode, %class.InternalNode* %this1, i32 0, i32 1 + %6 = load %class.INode*, %class.INode** %c0.addr, align 8 + store %class.INode* %6, %class.INode** %left, align 8 + %right = getelementptr inbounds %class.InternalNode, %class.InternalNode* %this1, i32 0, i32 2 + %7 = load %class.INode*, %class.INode** %c1.addr, align 8 + store %class.INode* %7, %class.INode** %right, align 8 + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt14priority_queueIP5INodeSt6vectorIS1_SaIS1_EE7NodeCmpED2Ev(%"class.std::priority_queue"* %this) unnamed_addr #0 comdat align 2 { +entry: + %this.addr = alloca %"class.std::priority_queue"*, align 8 + store %"class.std::priority_queue"* %this, %"class.std::priority_queue"** %this.addr, align 8 + %this1 = load %"class.std::priority_queue"*, %"class.std::priority_queue"** %this.addr, align 8 + %c = getelementptr inbounds %"class.std::priority_queue", %"class.std::priority_queue"* %this1, i32 0, i32 0 + call void @_ZNSt6vectorIP5INodeSaIS1_EED2Ev(%"class.std::vector"* %c) + ret void +} + +; Function Attrs: noinline optnone uwtable +define dso_local void 
@_Z13GenerateCodesPK5INodeRKSt6vectorIbSaIbEERSt3mapIhS4_St4lessIhESaISt4pairIKhS4_EEE(%class.INode* %node, %"class.std::vector.0"* dereferenceable(40) %prefix, %"class.std::map"* dereferenceable(48) %outCodes) #0 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %node.addr = alloca %class.INode*, align 8 + %prefix.addr = alloca %"class.std::vector.0"*, align 8 + %outCodes.addr = alloca %"class.std::map"*, align 8 + %lf = alloca %class.LeafNode*, align 8 + %ref.tmp = alloca i8, align 1 + %in = alloca %class.InternalNode*, align 8 + %leftPrefix = alloca %"class.std::vector.0", align 8 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + %rightPrefix = alloca %"class.std::vector.0", align 8 + store %class.INode* %node, %class.INode** %node.addr, align 8 + store %"class.std::vector.0"* %prefix, %"class.std::vector.0"** %prefix.addr, align 8 + store %"class.std::map"* %outCodes, %"class.std::map"** %outCodes.addr, align 8 + %0 = load %class.INode*, %class.INode** %node.addr, align 8 + %1 = icmp eq %class.INode* %0, null + br i1 %1, label %dynamic_cast.null, label %dynamic_cast.notnull + +dynamic_cast.notnull: ; preds = %entry + %2 = bitcast %class.INode* %0 to i8* + %3 = call i8* @__dynamic_cast(i8* %2, i8* bitcast ({ i8*, i8* }* @_ZTI5INode to i8*), i8* bitcast ({ i8*, i8*, i8* }* @_ZTI8LeafNode to i8*), i64 0) #3 + %4 = bitcast i8* %3 to %class.LeafNode* + br label %dynamic_cast.end + +dynamic_cast.null: ; preds = %entry + br label %dynamic_cast.end + +dynamic_cast.end: ; preds = %dynamic_cast.null, %dynamic_cast.notnull + %5 = phi %class.LeafNode* [ %4, %dynamic_cast.notnull ], [ null, %dynamic_cast.null ] + store %class.LeafNode* %5, %class.LeafNode** %lf, align 8 + %6 = load %class.LeafNode*, %class.LeafNode** %lf, align 8 + %tobool = icmp ne %class.LeafNode* %6, null + br i1 %tobool, label %if.then, label %if.else + +if.then: ; preds = %dynamic_cast.end + %7 = load %"class.std::vector.0"*, %"class.std::vector.0"** %prefix.addr, 
align 8 + %8 = load %"class.std::map"*, %"class.std::map"** %outCodes.addr, align 8 + %9 = load %class.LeafNode*, %class.LeafNode** %lf, align 8 + %c = getelementptr inbounds %class.LeafNode, %class.LeafNode* %9, i32 0, i32 1 + %10 = load i8, i8* %c, align 4 + store i8 %10, i8* %ref.tmp, align 1 + %call = call dereferenceable(40) %"class.std::vector.0"* @_ZNSt3mapIhSt6vectorIbSaIbEESt4lessIhESaISt4pairIKhS2_EEEixERS6_(%"class.std::map"* %8, i8* dereferenceable(1) %ref.tmp) + %call1 = call dereferenceable(40) %"class.std::vector.0"* @_ZNSt6vectorIbSaIbEEaSERKS1_(%"class.std::vector.0"* %call, %"class.std::vector.0"* dereferenceable(40) %7) + br label %if.end15 + +if.else: ; preds = %dynamic_cast.end + %11 = load %class.INode*, %class.INode** %node.addr, align 8 + %12 = icmp eq %class.INode* %11, null + br i1 %12, label %dynamic_cast.null3, label %dynamic_cast.notnull2 + +dynamic_cast.notnull2: ; preds = %if.else + %13 = bitcast %class.INode* %11 to i8* + %14 = call i8* @__dynamic_cast(i8* %13, i8* bitcast ({ i8*, i8* }* @_ZTI5INode to i8*), i8* bitcast ({ i8*, i8*, i8* }* @_ZTI12InternalNode to i8*), i64 0) #3 + %15 = bitcast i8* %14 to %class.InternalNode* + br label %dynamic_cast.end4 + +dynamic_cast.null3: ; preds = %if.else + br label %dynamic_cast.end4 + +dynamic_cast.end4: ; preds = %dynamic_cast.null3, %dynamic_cast.notnull2 + %16 = phi %class.InternalNode* [ %15, %dynamic_cast.notnull2 ], [ null, %dynamic_cast.null3 ] + store %class.InternalNode* %16, %class.InternalNode** %in, align 8 + %17 = load %class.InternalNode*, %class.InternalNode** %in, align 8 + %tobool5 = icmp ne %class.InternalNode* %17, null + br i1 %tobool5, label %if.then6, label %if.end + +if.then6: ; preds = %dynamic_cast.end4 + %18 = load %"class.std::vector.0"*, %"class.std::vector.0"** %prefix.addr, align 8 + call void @_ZNSt6vectorIbSaIbEEC2ERKS1_(%"class.std::vector.0"* %leftPrefix, %"class.std::vector.0"* dereferenceable(40) %18) + invoke void 
@_ZNSt6vectorIbSaIbEE9push_backEb(%"class.std::vector.0"* %leftPrefix, i1 zeroext false) + to label %invoke.cont unwind label %lpad + +invoke.cont: ; preds = %if.then6 + %19 = load %class.InternalNode*, %class.InternalNode** %in, align 8 + %left = getelementptr inbounds %class.InternalNode, %class.InternalNode* %19, i32 0, i32 1 + %20 = load %class.INode*, %class.INode** %left, align 8 + %21 = load %"class.std::map"*, %"class.std::map"** %outCodes.addr, align 8 + invoke void @_Z13GenerateCodesPK5INodeRKSt6vectorIbSaIbEERSt3mapIhS4_St4lessIhESaISt4pairIKhS4_EEE(%class.INode* %20, %"class.std::vector.0"* dereferenceable(40) %leftPrefix, %"class.std::map"* dereferenceable(48) %21) + to label %invoke.cont7 unwind label %lpad + +invoke.cont7: ; preds = %invoke.cont + %22 = load %"class.std::vector.0"*, %"class.std::vector.0"** %prefix.addr, align 8 + invoke void @_ZNSt6vectorIbSaIbEEC2ERKS1_(%"class.std::vector.0"* %rightPrefix, %"class.std::vector.0"* dereferenceable(40) %22) + to label %invoke.cont8 unwind label %lpad + +invoke.cont8: ; preds = %invoke.cont7 + invoke void @_ZNSt6vectorIbSaIbEE9push_backEb(%"class.std::vector.0"* %rightPrefix, i1 zeroext true) + to label %invoke.cont10 unwind label %lpad9 + +invoke.cont10: ; preds = %invoke.cont8 + %23 = load %class.InternalNode*, %class.InternalNode** %in, align 8 + %right = getelementptr inbounds %class.InternalNode, %class.InternalNode* %23, i32 0, i32 2 + %24 = load %class.INode*, %class.INode** %right, align 8 + %25 = load %"class.std::map"*, %"class.std::map"** %outCodes.addr, align 8 + invoke void @_Z13GenerateCodesPK5INodeRKSt6vectorIbSaIbEERSt3mapIhS4_St4lessIhESaISt4pairIKhS4_EEE(%class.INode* %24, %"class.std::vector.0"* dereferenceable(40) %rightPrefix, %"class.std::map"* dereferenceable(48) %25) + to label %invoke.cont11 unwind label %lpad9 + +invoke.cont11: ; preds = %invoke.cont10 + invoke void @_ZNSt6vectorIbSaIbEED2Ev(%"class.std::vector.0"* %rightPrefix) + to label %invoke.cont12 unwind label %lpad + 
+invoke.cont12: ; preds = %invoke.cont11 + call void @_ZNSt6vectorIbSaIbEED2Ev(%"class.std::vector.0"* %leftPrefix) + br label %if.end + +lpad: ; preds = %invoke.cont11, %invoke.cont7, %invoke.cont, %if.then6 + %26 = landingpad { i8*, i32 } + cleanup + %27 = extractvalue { i8*, i32 } %26, 0 + store i8* %27, i8** %exn.slot, align 8 + %28 = extractvalue { i8*, i32 } %26, 1 + store i32 %28, i32* %ehselector.slot, align 4 + br label %ehcleanup + +lpad9: ; preds = %invoke.cont10, %invoke.cont8 + %29 = landingpad { i8*, i32 } + cleanup + %30 = extractvalue { i8*, i32 } %29, 0 + store i8* %30, i8** %exn.slot, align 8 + %31 = extractvalue { i8*, i32 } %29, 1 + store i32 %31, i32* %ehselector.slot, align 4 + invoke void @_ZNSt6vectorIbSaIbEED2Ev(%"class.std::vector.0"* %rightPrefix) + to label %invoke.cont13 unwind label %terminate.lpad + +invoke.cont13: ; preds = %lpad9 + br label %ehcleanup + +ehcleanup: ; preds = %invoke.cont13, %lpad + invoke void @_ZNSt6vectorIbSaIbEED2Ev(%"class.std::vector.0"* %leftPrefix) + to label %invoke.cont14 unwind label %terminate.lpad + +invoke.cont14: ; preds = %ehcleanup + br label %eh.resume + +if.end: ; preds = %invoke.cont12, %dynamic_cast.end4 + br label %if.end15 + +if.end15: ; preds = %if.end, %if.then + ret void + +eh.resume: ; preds = %invoke.cont14 + %exn = load i8*, i8** %exn.slot, align 8 + %sel = load i32, i32* %ehselector.slot, align 4 + %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn, 0 + %lpad.val16 = insertvalue { i8*, i32 } %lpad.val, i32 %sel, 1 + resume { i8*, i32 } %lpad.val16 + +terminate.lpad: ; preds = %ehcleanup, %lpad9 + %32 = landingpad { i8*, i32 } + catch i8* null + %33 = extractvalue { i8*, i32 } %32, 0 + call void @__clang_call_terminate(i8* %33) #16 + unreachable +} + +; Function Attrs: nounwind readonly +declare dso_local i8* @__dynamic_cast(i8*, i8*, i8*, i64) #10 + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local dereferenceable(40) %"class.std::vector.0"* 
@_ZNSt3mapIhSt6vectorIbSaIbEESt4lessIhESaISt4pairIKhS2_EEEixERS6_(%"class.std::map"* %this, i8* dereferenceable(1) %__k) #0 comdat align 2 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %this.addr = alloca %"class.std::map"*, align 8 + %__k.addr = alloca i8*, align 8 + %__i = alloca %"struct.std::_Rb_tree_iterator", align 8 + %ref.tmp = alloca %"struct.std::_Rb_tree_iterator", align 8 + %ref.tmp5 = alloca %"struct.std::less", align 1 + %undef.agg.tmp = alloca %"struct.std::less", align 1 + %ref.tmp8 = alloca %"struct.std::_Rb_tree_iterator", align 8 + %agg.tmp = alloca %"struct.std::_Rb_tree_iterator", align 8 + %ref.tmp9 = alloca %"struct.std::pair", align 8 + %ref.tmp10 = alloca %"class.std::vector.0", align 8 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + store %"class.std::map"* %this, %"class.std::map"** %this.addr, align 8 + store i8* %__k, i8** %__k.addr, align 8 + %this1 = load %"class.std::map"*, %"class.std::map"** %this.addr, align 8 + %0 = load i8*, i8** %__k.addr, align 8 + %call = call %"struct.std::_Rb_tree_node_base"* @_ZNSt3mapIhSt6vectorIbSaIbEESt4lessIhESaISt4pairIKhS2_EEE11lower_boundERS6_(%"class.std::map"* %this1, i8* dereferenceable(1) %0) + %coerce.dive = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %__i, i32 0, i32 0 + store %"struct.std::_Rb_tree_node_base"* %call, %"struct.std::_Rb_tree_node_base"** %coerce.dive, align 8 + %call2 = call %"struct.std::_Rb_tree_node_base"* @_ZNSt3mapIhSt6vectorIbSaIbEESt4lessIhESaISt4pairIKhS2_EEE3endEv(%"class.std::map"* %this1) + %coerce.dive3 = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %ref.tmp, i32 0, i32 0 + store %"struct.std::_Rb_tree_node_base"* %call2, %"struct.std::_Rb_tree_node_base"** %coerce.dive3, align 8 + %call4 = call zeroext i1 @_ZNKSt17_Rb_tree_iteratorISt4pairIKhSt6vectorIbSaIbEEEEeqERKS6_(%"struct.std::_Rb_tree_iterator"* %__i, 
%"struct.std::_Rb_tree_iterator"* dereferenceable(8) %ref.tmp) + br i1 %call4, label %lor.end, label %lor.rhs + +lor.rhs: ; preds = %entry + call void @_ZNKSt3mapIhSt6vectorIbSaIbEESt4lessIhESaISt4pairIKhS2_EEE8key_compEv(%"class.std::map"* %this1) + %1 = load i8*, i8** %__k.addr, align 8 + %call6 = call dereferenceable(48) %"struct.std::pair"* @_ZNKSt17_Rb_tree_iteratorISt4pairIKhSt6vectorIbSaIbEEEEdeEv(%"struct.std::_Rb_tree_iterator"* %__i) + %first = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %call6, i32 0, i32 0 + %call7 = call zeroext i1 @_ZNKSt4lessIhEclERKhS2_(%"struct.std::less"* %ref.tmp5, i8* dereferenceable(1) %1, i8* dereferenceable(1) %first) + br label %lor.end + +lor.end: ; preds = %lor.rhs, %entry + %2 = phi i1 [ true, %entry ], [ %call7, %lor.rhs ] + br i1 %2, label %if.then, label %if.end + +if.then: ; preds = %lor.end + %3 = bitcast %"struct.std::_Rb_tree_iterator"* %agg.tmp to i8* + %4 = bitcast %"struct.std::_Rb_tree_iterator"* %__i to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %3, i8* align 8 %4, i64 8, i1 false) + %5 = load i8*, i8** %__k.addr, align 8 + call void @_ZNSt6vectorIbSaIbEEC2Ev(%"class.std::vector.0"* %ref.tmp10) + invoke void @_ZNSt4pairIKhSt6vectorIbSaIbEEEC2ERS0_RKS3_(%"struct.std::pair"* %ref.tmp9, i8* dereferenceable(1) %5, %"class.std::vector.0"* dereferenceable(40) %ref.tmp10) + to label %invoke.cont unwind label %lpad + +invoke.cont: ; preds = %if.then + %coerce.dive11 = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %agg.tmp, i32 0, i32 0 + %6 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %coerce.dive11, align 8 + %call14 = invoke %"struct.std::_Rb_tree_node_base"* @_ZNSt3mapIhSt6vectorIbSaIbEESt4lessIhESaISt4pairIKhS2_EEE6insertESt17_Rb_tree_iteratorIS7_ERKS7_(%"class.std::map"* %this1, %"struct.std::_Rb_tree_node_base"* %6, %"struct.std::pair"* dereferenceable(48) %ref.tmp9) + to label %invoke.cont13 unwind 
label %lpad12 + +invoke.cont13: ; preds = %invoke.cont + %coerce.dive15 = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %ref.tmp8, i32 0, i32 0 + store %"struct.std::_Rb_tree_node_base"* %call14, %"struct.std::_Rb_tree_node_base"** %coerce.dive15, align 8 + %7 = bitcast %"struct.std::_Rb_tree_iterator"* %__i to i8* + %8 = bitcast %"struct.std::_Rb_tree_iterator"* %ref.tmp8 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %7, i8* align 8 %8, i64 8, i1 false) + invoke void @_ZNSt4pairIKhSt6vectorIbSaIbEEED2Ev(%"struct.std::pair"* %ref.tmp9) + to label %invoke.cont16 unwind label %lpad + +invoke.cont16: ; preds = %invoke.cont13 + call void @_ZNSt6vectorIbSaIbEED2Ev(%"class.std::vector.0"* %ref.tmp10) + br label %if.end + +lpad: ; preds = %invoke.cont13, %if.then + %9 = landingpad { i8*, i32 } + cleanup + %10 = extractvalue { i8*, i32 } %9, 0 + store i8* %10, i8** %exn.slot, align 8 + %11 = extractvalue { i8*, i32 } %9, 1 + store i32 %11, i32* %ehselector.slot, align 4 + br label %ehcleanup + +lpad12: ; preds = %invoke.cont + %12 = landingpad { i8*, i32 } + cleanup + %13 = extractvalue { i8*, i32 } %12, 0 + store i8* %13, i8** %exn.slot, align 8 + %14 = extractvalue { i8*, i32 } %12, 1 + store i32 %14, i32* %ehselector.slot, align 4 + invoke void @_ZNSt4pairIKhSt6vectorIbSaIbEEED2Ev(%"struct.std::pair"* %ref.tmp9) + to label %invoke.cont17 unwind label %terminate.lpad + +invoke.cont17: ; preds = %lpad12 + br label %ehcleanup + +ehcleanup: ; preds = %invoke.cont17, %lpad + invoke void @_ZNSt6vectorIbSaIbEED2Ev(%"class.std::vector.0"* %ref.tmp10) + to label %invoke.cont18 unwind label %terminate.lpad + +invoke.cont18: ; preds = %ehcleanup + br label %eh.resume + +if.end: ; preds = %invoke.cont16, %lor.end + %call19 = call dereferenceable(48) %"struct.std::pair"* @_ZNKSt17_Rb_tree_iteratorISt4pairIKhSt6vectorIbSaIbEEEEdeEv(%"struct.std::_Rb_tree_iterator"* %__i) + %second = getelementptr inbounds %"struct.std::pair", 
%"struct.std::pair"* %call19, i32 0, i32 1 + ret %"class.std::vector.0"* %second + +eh.resume: ; preds = %invoke.cont18 + %exn = load i8*, i8** %exn.slot, align 8 + %sel = load i32, i32* %ehselector.slot, align 4 + %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn, 0 + %lpad.val20 = insertvalue { i8*, i32 } %lpad.val, i32 %sel, 1 + resume { i8*, i32 } %lpad.val20 + +terminate.lpad: ; preds = %ehcleanup, %lpad12 + %15 = landingpad { i8*, i32 } + catch i8* null + %16 = extractvalue { i8*, i32 } %15, 0 + call void @__clang_call_terminate(i8* %16) #16 + unreachable +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local dereferenceable(40) %"class.std::vector.0"* @_ZNSt6vectorIbSaIbEEaSERKS1_(%"class.std::vector.0"* %this, %"class.std::vector.0"* dereferenceable(40) %__x) #0 comdat align 2 { +entry: + %retval = alloca %"class.std::vector.0"*, align 8 + %this.addr = alloca %"class.std::vector.0"*, align 8 + %__x.addr = alloca %"class.std::vector.0"*, align 8 + %ref.tmp = alloca %"struct.std::_Bit_iterator", align 8 + %agg.tmp = alloca %"struct.std::_Bit_const_iterator", align 8 + %agg.tmp8 = alloca %"struct.std::_Bit_const_iterator", align 8 + %agg.tmp10 = alloca %"struct.std::_Bit_iterator", align 8 + store %"class.std::vector.0"* %this, %"class.std::vector.0"** %this.addr, align 8 + store %"class.std::vector.0"* %__x, %"class.std::vector.0"** %__x.addr, align 8 + %this1 = load %"class.std::vector.0"*, %"class.std::vector.0"** %this.addr, align 8 + %0 = load %"class.std::vector.0"*, %"class.std::vector.0"** %__x.addr, align 8 + %cmp = icmp eq %"class.std::vector.0"* %0, %this1 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + store %"class.std::vector.0"* %this1, %"class.std::vector.0"** %retval, align 8 + br label %return + +if.end: ; preds = %entry + %1 = load %"class.std::vector.0"*, %"class.std::vector.0"** %__x.addr, align 8 + %call = call i64 @_ZNKSt6vectorIbSaIbEE4sizeEv(%"class.std::vector.0"* %1) + %call2 = 
call i64 @_ZNKSt6vectorIbSaIbEE8capacityEv(%"class.std::vector.0"* %this1) + %cmp3 = icmp ugt i64 %call, %call2 + br i1 %cmp3, label %if.then4, label %if.end6 + +if.then4: ; preds = %if.end + %2 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Bvector_base"* + call void @_ZNSt13_Bvector_baseISaIbEE13_M_deallocateEv(%"struct.std::_Bvector_base"* %2) + %3 = load %"class.std::vector.0"*, %"class.std::vector.0"** %__x.addr, align 8 + %call5 = call i64 @_ZNKSt6vectorIbSaIbEE4sizeEv(%"class.std::vector.0"* %3) + call void @_ZNSt6vectorIbSaIbEE13_M_initializeEm(%"class.std::vector.0"* %this1, i64 %call5) + br label %if.end6 + +if.end6: ; preds = %if.then4, %if.end + %4 = load %"class.std::vector.0"*, %"class.std::vector.0"** %__x.addr, align 8 + %call7 = call { i64*, i32 } @_ZNKSt6vectorIbSaIbEE5beginEv(%"class.std::vector.0"* %4) + %5 = bitcast %"struct.std::_Bit_const_iterator"* %agg.tmp to { i64*, i32 }* + %6 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %5, i32 0, i32 0 + %7 = extractvalue { i64*, i32 } %call7, 0 + store i64* %7, i64** %6, align 8 + %8 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %5, i32 0, i32 1 + %9 = extractvalue { i64*, i32 } %call7, 1 + store i32 %9, i32* %8, align 8 + %10 = load %"class.std::vector.0"*, %"class.std::vector.0"** %__x.addr, align 8 + %call9 = call { i64*, i32 } @_ZNKSt6vectorIbSaIbEE3endEv(%"class.std::vector.0"* %10) + %11 = bitcast %"struct.std::_Bit_const_iterator"* %agg.tmp8 to { i64*, i32 }* + %12 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %11, i32 0, i32 0 + %13 = extractvalue { i64*, i32 } %call9, 0 + store i64* %13, i64** %12, align 8 + %14 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %11, i32 0, i32 1 + %15 = extractvalue { i64*, i32 } %call9, 1 + store i32 %15, i32* %14, align 8 + %call11 = call { i64*, i32 } @_ZNSt6vectorIbSaIbEE5beginEv(%"class.std::vector.0"* %this1) + %16 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp10 to { i64*, i32 }* + %17 = getelementptr 
inbounds { i64*, i32 }, { i64*, i32 }* %16, i32 0, i32 0 + %18 = extractvalue { i64*, i32 } %call11, 0 + store i64* %18, i64** %17, align 8 + %19 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %16, i32 0, i32 1 + %20 = extractvalue { i64*, i32 } %call11, 1 + store i32 %20, i32* %19, align 8 + %21 = bitcast %"struct.std::_Bit_const_iterator"* %agg.tmp to { i64*, i32 }* + %22 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %21, i32 0, i32 0 + %23 = load i64*, i64** %22, align 8 + %24 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %21, i32 0, i32 1 + %25 = load i32, i32* %24, align 8 + %26 = bitcast %"struct.std::_Bit_const_iterator"* %agg.tmp8 to { i64*, i32 }* + %27 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %26, i32 0, i32 0 + %28 = load i64*, i64** %27, align 8 + %29 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %26, i32 0, i32 1 + %30 = load i32, i32* %29, align 8 + %call12 = call { i64*, i32 } @_ZNSt6vectorIbSaIbEE15_M_copy_alignedESt19_Bit_const_iteratorS2_St13_Bit_iterator(%"class.std::vector.0"* %this1, i64* %23, i32 %25, i64* %28, i32 %30, %"struct.std::_Bit_iterator"* byval(%"struct.std::_Bit_iterator") align 8 %agg.tmp10) + %31 = bitcast %"struct.std::_Bit_iterator"* %ref.tmp to { i64*, i32 }* + %32 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %31, i32 0, i32 0 + %33 = extractvalue { i64*, i32 } %call12, 0 + store i64* %33, i64** %32, align 8 + %34 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %31, i32 0, i32 1 + %35 = extractvalue { i64*, i32 } %call12, 1 + store i32 %35, i32* %34, align 8 + %36 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Bvector_base"* + %_M_impl = getelementptr inbounds %"struct.std::_Bvector_base", %"struct.std::_Bvector_base"* %36, i32 0, i32 0 + %_M_finish = getelementptr inbounds %"struct.std::_Bvector_base >::_Bvector_impl", %"struct.std::_Bvector_base >::_Bvector_impl"* %_M_impl, i32 0, i32 1 + %37 = bitcast %"struct.std::_Bit_iterator"* %_M_finish to 
i8* + %38 = bitcast %"struct.std::_Bit_iterator"* %ref.tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %37, i8* align 8 %38, i64 12, i1 false) + store %"class.std::vector.0"* %this1, %"class.std::vector.0"** %retval, align 8 + br label %return + +return: ; preds = %if.end6, %if.then + %39 = load %"class.std::vector.0"*, %"class.std::vector.0"** %retval, align 8 + ret %"class.std::vector.0"* %39 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt6vectorIbSaIbEEC2ERKS1_(%"class.std::vector.0"* %this, %"class.std::vector.0"* dereferenceable(40) %__x) unnamed_addr #0 comdat align 2 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %this.addr = alloca %"class.std::vector.0"*, align 8 + %__x.addr = alloca %"class.std::vector.0"*, align 8 + %ref.tmp = alloca %"class.std::allocator.13", align 1 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + %agg.tmp = alloca %"struct.std::_Bit_const_iterator", align 8 + %agg.tmp9 = alloca %"struct.std::_Bit_const_iterator", align 8 + %agg.tmp12 = alloca %"struct.std::_Bit_iterator", align 8 + %coerce = alloca %"struct.std::_Bit_iterator", align 8 + store %"class.std::vector.0"* %this, %"class.std::vector.0"** %this.addr, align 8 + store %"class.std::vector.0"* %__x, %"class.std::vector.0"** %__x.addr, align 8 + %this1 = load %"class.std::vector.0"*, %"class.std::vector.0"** %this.addr, align 8 + %0 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Bvector_base"* + %1 = load %"class.std::vector.0"*, %"class.std::vector.0"** %__x.addr, align 8 + %2 = bitcast %"class.std::vector.0"* %1 to %"struct.std::_Bvector_base"* + %call = call dereferenceable(1) %"class.std::allocator.1"* @_ZNKSt13_Bvector_baseISaIbEE20_M_get_Bit_allocatorEv(%"struct.std::_Bvector_base"* %2) + %call2 = call dereferenceable(1) %"class.std::allocator.1"* @_ZN9__gnu_cxx14__alloc_traitsISaImEE17_S_select_on_copyERKS1_(%"class.std::allocator.1"* dereferenceable(1) %call) + 
call void @_ZNSaIbEC2ImEERKSaIT_E(%"class.std::allocator.13"* %ref.tmp, %"class.std::allocator.1"* dereferenceable(1) %call2) #3 + invoke void @_ZNSt13_Bvector_baseISaIbEEC2ERKS0_(%"struct.std::_Bvector_base"* %0, %"class.std::allocator.13"* dereferenceable(1) %ref.tmp) + to label %invoke.cont unwind label %lpad + +invoke.cont: ; preds = %entry + call void @_ZNSaIbED2Ev(%"class.std::allocator.13"* %ref.tmp) #3 + %3 = load %"class.std::vector.0"*, %"class.std::vector.0"** %__x.addr, align 8 + %call5 = invoke i64 @_ZNKSt6vectorIbSaIbEE4sizeEv(%"class.std::vector.0"* %3) + to label %invoke.cont4 unwind label %lpad3 + +invoke.cont4: ; preds = %invoke.cont + invoke void @_ZNSt6vectorIbSaIbEE13_M_initializeEm(%"class.std::vector.0"* %this1, i64 %call5) + to label %invoke.cont6 unwind label %lpad3 + +invoke.cont6: ; preds = %invoke.cont4 + %4 = load %"class.std::vector.0"*, %"class.std::vector.0"** %__x.addr, align 8 + %call8 = invoke { i64*, i32 } @_ZNKSt6vectorIbSaIbEE5beginEv(%"class.std::vector.0"* %4) + to label %invoke.cont7 unwind label %lpad3 + +invoke.cont7: ; preds = %invoke.cont6 + %5 = bitcast %"struct.std::_Bit_const_iterator"* %agg.tmp to { i64*, i32 }* + %6 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %5, i32 0, i32 0 + %7 = extractvalue { i64*, i32 } %call8, 0 + store i64* %7, i64** %6, align 8 + %8 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %5, i32 0, i32 1 + %9 = extractvalue { i64*, i32 } %call8, 1 + store i32 %9, i32* %8, align 8 + %10 = load %"class.std::vector.0"*, %"class.std::vector.0"** %__x.addr, align 8 + %call11 = invoke { i64*, i32 } @_ZNKSt6vectorIbSaIbEE3endEv(%"class.std::vector.0"* %10) + to label %invoke.cont10 unwind label %lpad3 + +invoke.cont10: ; preds = %invoke.cont7 + %11 = bitcast %"struct.std::_Bit_const_iterator"* %agg.tmp9 to { i64*, i32 }* + %12 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %11, i32 0, i32 0 + %13 = extractvalue { i64*, i32 } %call11, 0 + store i64* %13, i64** %12, align 8 + %14 = 
getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %11, i32 0, i32 1 + %15 = extractvalue { i64*, i32 } %call11, 1 + store i32 %15, i32* %14, align 8 + %16 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Bvector_base"* + %_M_impl = getelementptr inbounds %"struct.std::_Bvector_base", %"struct.std::_Bvector_base"* %16, i32 0, i32 0 + %_M_start = getelementptr inbounds %"struct.std::_Bvector_base >::_Bvector_impl", %"struct.std::_Bvector_base >::_Bvector_impl"* %_M_impl, i32 0, i32 0 + %17 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp12 to i8* + %18 = bitcast %"struct.std::_Bit_iterator"* %_M_start to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %17, i8* align 8 %18, i64 16, i1 false) + %19 = bitcast %"struct.std::_Bit_const_iterator"* %agg.tmp to { i64*, i32 }* + %20 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %19, i32 0, i32 0 + %21 = load i64*, i64** %20, align 8 + %22 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %19, i32 0, i32 1 + %23 = load i32, i32* %22, align 8 + %24 = bitcast %"struct.std::_Bit_const_iterator"* %agg.tmp9 to { i64*, i32 }* + %25 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %24, i32 0, i32 0 + %26 = load i64*, i64** %25, align 8 + %27 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %24, i32 0, i32 1 + %28 = load i32, i32* %27, align 8 + %call14 = invoke { i64*, i32 } @_ZNSt6vectorIbSaIbEE15_M_copy_alignedESt19_Bit_const_iteratorS2_St13_Bit_iterator(%"class.std::vector.0"* %this1, i64* %21, i32 %23, i64* %26, i32 %28, %"struct.std::_Bit_iterator"* byval(%"struct.std::_Bit_iterator") align 8 %agg.tmp12) + to label %invoke.cont13 unwind label %lpad3 + +invoke.cont13: ; preds = %invoke.cont10 + %29 = bitcast %"struct.std::_Bit_iterator"* %coerce to { i64*, i32 }* + %30 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %29, i32 0, i32 0 + %31 = extractvalue { i64*, i32 } %call14, 0 + store i64* %31, i64** %30, align 8 + %32 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* 
%29, i32 0, i32 1 + %33 = extractvalue { i64*, i32 } %call14, 1 + store i32 %33, i32* %32, align 8 + ret void + +lpad: ; preds = %entry + %34 = landingpad { i8*, i32 } + cleanup + %35 = extractvalue { i8*, i32 } %34, 0 + store i8* %35, i8** %exn.slot, align 8 + %36 = extractvalue { i8*, i32 } %34, 1 + store i32 %36, i32* %ehselector.slot, align 4 + call void @_ZNSaIbED2Ev(%"class.std::allocator.13"* %ref.tmp) #3 + br label %eh.resume + +lpad3: ; preds = %invoke.cont10, %invoke.cont7, %invoke.cont6, %invoke.cont4, %invoke.cont + %37 = landingpad { i8*, i32 } + cleanup + %38 = extractvalue { i8*, i32 } %37, 0 + store i8* %38, i8** %exn.slot, align 8 + %39 = extractvalue { i8*, i32 } %37, 1 + store i32 %39, i32* %ehselector.slot, align 4 + %40 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Bvector_base"* + invoke void @_ZNSt13_Bvector_baseISaIbEED2Ev(%"struct.std::_Bvector_base"* %40) + to label %invoke.cont15 unwind label %terminate.lpad + +invoke.cont15: ; preds = %lpad3 + br label %eh.resume + +eh.resume: ; preds = %invoke.cont15, %lpad + %exn = load i8*, i8** %exn.slot, align 8 + %sel = load i32, i32* %ehselector.slot, align 4 + %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn, 0 + %lpad.val16 = insertvalue { i8*, i32 } %lpad.val, i32 %sel, 1 + resume { i8*, i32 } %lpad.val16 + +terminate.lpad: ; preds = %lpad3 + %41 = landingpad { i8*, i32 } + catch i8* null + %42 = extractvalue { i8*, i32 } %41, 0 + call void @__clang_call_terminate(i8* %42) #16 + unreachable +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt6vectorIbSaIbEE9push_backEb(%"class.std::vector.0"* %this, i1 zeroext %__x) #0 comdat align 2 { +entry: + %this.addr = alloca %"class.std::vector.0"*, align 8 + %__x.addr = alloca i8, align 1 + %ref.tmp = alloca %"struct.std::_Bit_reference", align 8 + %ref.tmp3 = alloca %"struct.std::_Bit_iterator", align 8 + %agg.tmp = alloca %"struct.std::_Bit_iterator", align 8 + store %"class.std::vector.0"* 
%this, %"class.std::vector.0"** %this.addr, align 8 + %frombool = zext i1 %__x to i8 + store i8 %frombool, i8* %__x.addr, align 1 + %this1 = load %"class.std::vector.0"*, %"class.std::vector.0"** %this.addr, align 8 + %0 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Bvector_base"* + %_M_impl = getelementptr inbounds %"struct.std::_Bvector_base", %"struct.std::_Bvector_base"* %0, i32 0, i32 0 + %_M_finish = getelementptr inbounds %"struct.std::_Bvector_base >::_Bvector_impl", %"struct.std::_Bvector_base >::_Bvector_impl"* %_M_impl, i32 0, i32 1 + %1 = bitcast %"struct.std::_Bit_iterator"* %_M_finish to %"struct.std::_Bit_iterator_base"* + %_M_p = getelementptr inbounds %"struct.std::_Bit_iterator_base", %"struct.std::_Bit_iterator_base"* %1, i32 0, i32 0 + %2 = load i64*, i64** %_M_p, align 8 + %3 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Bvector_base"* + %_M_impl2 = getelementptr inbounds %"struct.std::_Bvector_base", %"struct.std::_Bvector_base"* %3, i32 0, i32 0 + %call = call i64* @_ZNKSt13_Bvector_baseISaIbEE13_Bvector_impl11_M_end_addrEv(%"struct.std::_Bvector_base >::_Bvector_impl"* %_M_impl2) + %cmp = icmp ne i64* %2, %call + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %4 = load i8, i8* %__x.addr, align 1 + %tobool = trunc i8 %4 to i1 + %5 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Bvector_base"* + %_M_impl4 = getelementptr inbounds %"struct.std::_Bvector_base", %"struct.std::_Bvector_base"* %5, i32 0, i32 0 + %_M_finish5 = getelementptr inbounds %"struct.std::_Bvector_base >::_Bvector_impl", %"struct.std::_Bvector_base >::_Bvector_impl"* %_M_impl4, i32 0, i32 1 + %call6 = call { i64*, i32 } @_ZNSt13_Bit_iteratorppEi(%"struct.std::_Bit_iterator"* %_M_finish5, i32 0) + %6 = bitcast %"struct.std::_Bit_iterator"* %ref.tmp3 to { i64*, i32 }* + %7 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %6, i32 0, i32 0 + %8 = extractvalue { i64*, i32 } %call6, 0 + store i64* %8, i64** %7, 
align 8 + %9 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %6, i32 0, i32 1 + %10 = extractvalue { i64*, i32 } %call6, 1 + store i32 %10, i32* %9, align 8 + %call7 = call { i64*, i64 } @_ZNKSt13_Bit_iteratordeEv(%"struct.std::_Bit_iterator"* %ref.tmp3) + %11 = bitcast %"struct.std::_Bit_reference"* %ref.tmp to { i64*, i64 }* + %12 = getelementptr inbounds { i64*, i64 }, { i64*, i64 }* %11, i32 0, i32 0 + %13 = extractvalue { i64*, i64 } %call7, 0 + store i64* %13, i64** %12, align 8 + %14 = getelementptr inbounds { i64*, i64 }, { i64*, i64 }* %11, i32 0, i32 1 + %15 = extractvalue { i64*, i64 } %call7, 1 + store i64 %15, i64* %14, align 8 + %call8 = call dereferenceable(16) %"struct.std::_Bit_reference"* @_ZNSt14_Bit_referenceaSEb(%"struct.std::_Bit_reference"* %ref.tmp, i1 zeroext %tobool) + br label %if.end + +if.else: ; preds = %entry + %call9 = call { i64*, i32 } @_ZNSt6vectorIbSaIbEE3endEv(%"class.std::vector.0"* %this1) + %16 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp to { i64*, i32 }* + %17 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %16, i32 0, i32 0 + %18 = extractvalue { i64*, i32 } %call9, 0 + store i64* %18, i64** %17, align 8 + %19 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %16, i32 0, i32 1 + %20 = extractvalue { i64*, i32 } %call9, 1 + store i32 %20, i32* %19, align 8 + %21 = load i8, i8* %__x.addr, align 1 + %tobool10 = trunc i8 %21 to i1 + %22 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp to { i64*, i32 }* + %23 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %22, i32 0, i32 0 + %24 = load i64*, i64** %23, align 8 + %25 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %22, i32 0, i32 1 + %26 = load i32, i32* %25, align 8 + call void @_ZNSt6vectorIbSaIbEE13_M_insert_auxESt13_Bit_iteratorb(%"class.std::vector.0"* %this1, i64* %24, i32 %26, i1 zeroext %tobool10) + br label %if.end + +if.end: ; preds = %if.else, %if.then + ret void +} + +; Function Attrs: noinline optnone uwtable +define 
linkonce_odr dso_local void @_ZNSt6vectorIbSaIbEED2Ev(%"class.std::vector.0"* %this) unnamed_addr #0 comdat align 2 { +entry: + %this.addr = alloca %"class.std::vector.0"*, align 8 + store %"class.std::vector.0"* %this, %"class.std::vector.0"** %this.addr, align 8 + %this1 = load %"class.std::vector.0"*, %"class.std::vector.0"** %this.addr, align 8 + %0 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Bvector_base"* + call void @_ZNSt13_Bvector_baseISaIbEED2Ev(%"struct.std::_Bvector_base"* %0) + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local i64 @_Z8get_timev() #6 { +entry: + %tv = alloca %struct.timeval, align 8 + %call = call i32 @gettimeofday(%struct.timeval* %tv, %struct.timezone* null) #3 + %tv_sec = getelementptr inbounds %struct.timeval, %struct.timeval* %tv, i32 0, i32 0 + %0 = load i64, i64* %tv_sec, align 8 + %mul = mul nsw i64 %0, 1000000 + %tv_usec = getelementptr inbounds %struct.timeval, %struct.timeval* %tv, i32 0, i32 1 + %1 = load i64, i64* %tv_usec, align 8 + %add = add nsw i64 %mul, %1 + ret i64 %add +} + +; Function Attrs: nounwind +declare dso_local i32 @gettimeofday(%struct.timeval*, %struct.timezone*) #11 + +; Function Attrs: noinline norecurse optnone uwtable +define dso_local i32 @main(i32 %argc, i8** %argv) #12 { +entry: + %retval = alloca i32, align 4 + %argc.addr = alloca i32, align 4 + %argv.addr = alloca i8**, align 8 + %num_block_threads = alloca i32, align 4 + %i = alloca i32, align 4 + store i32 0, i32* %retval, align 4 + store i32 %argc, i32* %argc.addr, align 4 + store i8** %argv, i8*** %argv.addr, align 8 + %call = call zeroext i1 @_Z8InitCUDAv() + br i1 %call, label %if.end, label %if.then + +if.then: ; preds = %entry + store i32 0, i32* %retval, align 4 + br label %return + +if.end: ; preds = %entry + store i32 256, i32* %num_block_threads, align 4 + %0 = load i32, i32* %argc.addr, align 4 + %cmp = icmp sgt i32 %0, 1 + br i1 %cmp, label %if.then1, label %if.else + +if.then1: ; 
preds = %if.end + store i32 1, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %if.then1 + %1 = load i32, i32* %i, align 4 + %2 = load i32, i32* %argc.addr, align 4 + %cmp2 = icmp slt i32 %1, %2 + br i1 %cmp2, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %3 = load i8**, i8*** %argv.addr, align 8 + %4 = load i32, i32* %i, align 4 + %idxprom = sext i32 %4 to i64 + %arrayidx = getelementptr inbounds i8*, i8** %3, i64 %idxprom + %5 = load i8*, i8** %arrayidx, align 8 + %6 = load i32, i32* %num_block_threads, align 4 + call void @_Z10runVLCTestPcjj(i8* %5, i32 %6, i32 1) + br label %for.inc + +for.inc: ; preds = %for.body + %7 = load i32, i32* %i, align 4 + %inc = add nsw i32 %7, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + br label %if.end3 + +if.else: ; preds = %if.end + %8 = load i32, i32* %num_block_threads, align 4 + call void @_Z10runVLCTestPcjj(i8* null, i32 %8, i32 1024) + br label %if.end3 + +if.end3: ; preds = %if.else, %for.end + store i32 0, i32* %retval, align 4 + br label %return + +return: ; preds = %if.end3, %if.then + %9 = load i32, i32* %retval, align 4 + ret i32 %9 +} + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z10runVLCTestPcjj(i8* %file_name, i32 %num_block_threads, i32 %num_blocks) #0 { +entry: + %file_name.addr = alloca i8*, align 8 + %num_block_threads.addr = alloca i32, align 4 + %num_blocks.addr = alloca i32, align 4 + %num_elements = alloca i32, align 4 + %mem_size = alloca i32, align 4 + %symbol_type_size = alloca i32, align 4 + %H = alloca double, align 8 + %sourceData = alloca i32*, align 8 + %destData = alloca i32*, align 8 + %crefData = alloca i32*, align 8 + %codewords = alloca i32*, align 8 + %codewordlens = alloca i32*, align 8 + %cw32 = alloca i32*, align 8 + %cw32len = alloca i32*, align 8 + %cw32idx = alloca i32*, align 8 + %cindex2 = alloca i32*, align 8 + %d_sourceData = alloca i32*, align 8 + %d_destData = alloca 
i32*, align 8 + %d_destDataPacked = alloca i32*, align 8 + %d_codewords = alloca i32*, align 8 + %d_codewordlens = alloca i32*, align 8 + %d_cw32 = alloca i32*, align 8 + %d_cw32len = alloca i32*, align 8 + %d_cw32idx = alloca i32*, align 8 + %d_cindex = alloca i32*, align 8 + %d_cindex2 = alloca i32*, align 8 + %err = alloca i32, align 4 + %err37 = alloca i32, align 4 + %err45 = alloca i32, align 4 + %err53 = alloca i32, align 4 + %err62 = alloca i32, align 4 + %err71 = alloca i32, align 4 + %err79 = alloca i32, align 4 + %err87 = alloca i32, align 4 + %err95 = alloca i32, align 4 + %err104 = alloca i32, align 4 + %err113 = alloca i32, align 4 + %err121 = alloca i32, align 4 + %err130 = alloca i32, align 4 + %err139 = alloca i32, align 4 + %grid_size = alloca %struct.dim3, align 4 + %block_size = alloca %struct.dim3, align 4 + %sm_size = alloca i32, align 4 + %NT = alloca i32, align 4 + %refbytesize = alloca i32, align 4 + %timer = alloca i64, align 8 + %msec = alloca float, align 4 + %num_ints = alloca i32, align 4 + %i = alloca i32, align 4 + %agg.tmp = alloca %struct.dim3, align 4 + %agg.tmp167 = alloca %struct.dim3, align 4 + %agg.tmp.coerce = alloca { i64, i32 }, align 4 + %agg.tmp167.coerce = alloca { i64, i32 }, align 4 + %num_scan_elements = alloca i32, align 4 + %agg.tmp174 = alloca %struct.dim3, align 4 + %agg.tmp176 = alloca %struct.dim3, align 4 + %agg.tmp174.coerce = alloca { i64, i32 }, align 4 + %agg.tmp176.coerce = alloca { i64, i32 }, align 4 + %err183 = alloca i32, align 4 + %err190 = alloca i32, align 4 + %err199 = alloca i32, align 4 + %err206 = alloca i32, align 4 + %err213 = alloca i32, align 4 + %err220 = alloca i32, align 4 + %err227 = alloca i32, align 4 + %err234 = alloca i32, align 4 + %err241 = alloca i32, align 4 + %err248 = alloca i32, align 4 + %err255 = alloca i32, align 4 + %err262 = alloca i32, align 4 + store i8* %file_name, i8** %file_name.addr, align 8 + store i32 %num_block_threads, i32* %num_block_threads.addr, align 4 + 
store i32 %num_blocks, i32* %num_blocks.addr, align 4 + %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([27 x i8], [27 x i8]* @.str.4, i64 0, i64 0)) + store i32 4, i32* %symbol_type_size, align 4 + %0 = load i8*, i8** %file_name.addr, align 8 + %1 = load i32, i32* %num_block_threads.addr, align 4 + %2 = load i32, i32* %symbol_type_size, align 4 + call void @_Z10initParamsPcjRjS0_S0_j(i8* %0, i32 %1, i32* dereferenceable(4) %num_blocks.addr, i32* dereferenceable(4) %num_elements, i32* dereferenceable(4) %mem_size, i32 %2) + %3 = load i32, i32* %num_elements, align 4 + %4 = load i32, i32* %num_blocks.addr, align 4 + %5 = load i32, i32* %num_block_threads.addr, align 4 + %call1 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([98 x i8], [98 x i8]* @.str.5, i64 0, i64 0), i32 %3, i32 %4, i32 %5) + %6 = load i32, i32* %mem_size, align 4 + %conv = zext i32 %6 to i64 + %call2 = call noalias i8* @malloc(i64 %conv) #3 + %7 = bitcast i8* %call2 to i32* + store i32* %7, i32** %sourceData, align 8 + %8 = load i32, i32* %mem_size, align 4 + %conv3 = zext i32 %8 to i64 + %call4 = call noalias i8* @malloc(i64 %conv3) #3 + %9 = bitcast i8* %call4 to i32* + store i32* %9, i32** %destData, align 8 + %10 = load i32, i32* %mem_size, align 4 + %conv5 = zext i32 %10 to i64 + %call6 = call noalias i8* @malloc(i64 %conv5) #3 + %11 = bitcast i8* %call6 to i32* + store i32* %11, i32** %crefData, align 8 + %12 = load i32, i32* %symbol_type_size, align 4 + %mul = mul i32 256, %12 + %conv7 = zext i32 %mul to i64 + %call8 = call noalias i8* @malloc(i64 %conv7) #3 + %13 = bitcast i8* %call8 to i32* + store i32* %13, i32** %codewords, align 8 + %14 = load i32, i32* %symbol_type_size, align 4 + %mul9 = mul i32 256, %14 + %conv10 = zext i32 %mul9 to i64 + %call11 = call noalias i8* @malloc(i64 %conv10) #3 + %15 = bitcast i8* %call11 to i32* + store i32* %15, i32** %codewordlens, align 8 + %16 = load i32, i32* %mem_size, align 4 + %conv12 = zext i32 %16 to i64 + %call13 = 
call noalias i8* @malloc(i64 %conv12) #3 + %17 = bitcast i8* %call13 to i32* + store i32* %17, i32** %cw32, align 8 + %18 = load i32, i32* %mem_size, align 4 + %conv14 = zext i32 %18 to i64 + %call15 = call noalias i8* @malloc(i64 %conv14) #3 + %19 = bitcast i8* %call15 to i32* + store i32* %19, i32** %cw32len, align 8 + %20 = load i32, i32* %mem_size, align 4 + %conv16 = zext i32 %20 to i64 + %call17 = call noalias i8* @malloc(i64 %conv16) #3 + %21 = bitcast i8* %call17 to i32* + store i32* %21, i32** %cw32idx, align 8 + %22 = load i32, i32* %num_blocks.addr, align 4 + %conv18 = zext i32 %22 to i64 + %mul19 = mul i64 %conv18, 4 + %call20 = call noalias i8* @malloc(i64 %mul19) #3 + %23 = bitcast i8* %call20 to i32* + store i32* %23, i32** %cindex2, align 8 + %24 = load i32*, i32** %sourceData, align 8 + %25 = bitcast i32* %24 to i8* + %26 = load i32, i32* %mem_size, align 4 + %conv21 = zext i32 %26 to i64 + call void @llvm.memset.p0i8.i64(i8* align 4 %25, i8 0, i64 %conv21, i1 false) + %27 = load i32*, i32** %destData, align 8 + %28 = bitcast i32* %27 to i8* + %29 = load i32, i32* %mem_size, align 4 + %conv22 = zext i32 %29 to i64 + call void @llvm.memset.p0i8.i64(i8* align 4 %28, i8 0, i64 %conv22, i1 false) + %30 = load i32*, i32** %crefData, align 8 + %31 = bitcast i32* %30 to i8* + %32 = load i32, i32* %mem_size, align 4 + %conv23 = zext i32 %32 to i64 + call void @llvm.memset.p0i8.i64(i8* align 4 %31, i8 0, i64 %conv23, i1 false) + %33 = load i32*, i32** %cw32, align 8 + %34 = bitcast i32* %33 to i8* + %35 = load i32, i32* %mem_size, align 4 + %conv24 = zext i32 %35 to i64 + call void @llvm.memset.p0i8.i64(i8* align 4 %34, i8 0, i64 %conv24, i1 false) + %36 = load i32*, i32** %cw32len, align 8 + %37 = bitcast i32* %36 to i8* + %38 = load i32, i32* %mem_size, align 4 + %conv25 = zext i32 %38 to i64 + call void @llvm.memset.p0i8.i64(i8* align 4 %37, i8 0, i64 %conv25, i1 false) + %39 = load i32*, i32** %cw32idx, align 8 + %40 = bitcast i32* %39 to i8* + %41 = 
load i32, i32* %mem_size, align 4 + %conv26 = zext i32 %41 to i64 + call void @llvm.memset.p0i8.i64(i8* align 4 %40, i8 0, i64 %conv26, i1 false) + %42 = load i32*, i32** %codewords, align 8 + %43 = bitcast i32* %42 to i8* + %44 = load i32, i32* %symbol_type_size, align 4 + %mul27 = mul i32 256, %44 + %conv28 = zext i32 %mul27 to i64 + call void @llvm.memset.p0i8.i64(i8* align 4 %43, i8 0, i64 %conv28, i1 false) + %45 = load i32*, i32** %codewordlens, align 8 + %46 = bitcast i32* %45 to i8* + %47 = load i32, i32* %symbol_type_size, align 4 + %mul29 = mul i32 256, %47 + %conv30 = zext i32 %mul29 to i64 + call void @llvm.memset.p0i8.i64(i8* align 4 %46, i8 0, i64 %conv30, i1 false) + %48 = load i32*, i32** %cindex2, align 8 + %49 = bitcast i32* %48 to i8* + %50 = load i32, i32* %num_blocks.addr, align 4 + %conv31 = zext i32 %50 to i64 + %mul32 = mul i64 %conv31, 4 + call void @llvm.memset.p0i8.i64(i8* align 4 %49, i8 0, i64 %mul32, i1 false) + %51 = load i8*, i8** %file_name.addr, align 8 + %52 = load i32*, i32** %sourceData, align 8 + %53 = load i32*, i32** %codewords, align 8 + %54 = load i32*, i32** %codewordlens, align 8 + %55 = load i32, i32* %num_elements, align 4 + %56 = load i32, i32* %mem_size, align 4 + call void @_Z8loadDataPcPjS0_S0_jjRd(i8* %51, i32* %52, i32* %53, i32* %54, i32 %55, i32 %56, double* dereferenceable(8) %H) + %57 = bitcast i32** %d_sourceData to i8** + %58 = load i32, i32* %mem_size, align 4 + %conv33 = zext i32 %58 to i64 + %call34 = call i32 @cudaMalloc(i8** %57, i64 %conv33) + store i32 %call34, i32* %err, align 4 + %59 = load i32, i32* %err, align 4 + %cmp = icmp ne i32 0, %59 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %60 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %61 = load i32, i32* %err, align 4 + %call35 = call i8* @cudaGetErrorString(i32 %61) + %call36 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %60, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str.6, i64 0, i64 0), i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.7, i64 0, i64 0), i32 107, i8* %call35) + call void @exit(i32 1) #16 + unreachable + +if.end: ; preds = %entry + %62 = bitcast i32** %d_destData to i8** + %63 = load i32, i32* %mem_size, align 4 + %conv38 = zext i32 %63 to i64 + %call39 = call i32 @cudaMalloc(i8** %62, i64 %conv38) + store i32 %call39, i32* %err37, align 4 + %64 = load i32, i32* %err37, align 4 + %cmp40 = icmp ne i32 0, %64 + br i1 %cmp40, label %if.then41, label %if.end44 + +if.then41: ; preds = %if.end + %65 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %66 = load i32, i32* %err37, align 4 + %call42 = call i8* @cudaGetErrorString(i32 %66) + %call43 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %65, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str.6, i64 0, i64 0), i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.7, i64 0, i64 0), i32 108, i8* %call42) + call void @exit(i32 1) #16 + unreachable + +if.end44: ; preds = %if.end + %67 = bitcast i32** %d_destDataPacked to i8** + %68 = load i32, i32* %mem_size, align 4 + %conv46 = zext i32 %68 to i64 + %call47 = call i32 @cudaMalloc(i8** %67, i64 %conv46) + store i32 %call47, i32* %err45, align 4 + %69 = load i32, i32* %err45, align 4 + %cmp48 = icmp ne i32 0, %69 + br i1 %cmp48, label %if.then49, label %if.end52 + +if.then49: ; preds = %if.end44 + %70 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %71 = load i32, i32* %err45, align 4 + %call50 = call i8* @cudaGetErrorString(i32 %71) + %call51 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %70, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str.6, i64 0, i64 0), i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.7, i64 0, i64 0), i32 109, i8* %call50) + call void @exit(i32 1) #16 + unreachable + +if.end52: ; preds = %if.end44 + %72 = bitcast i32** %d_codewords to i8** + %73 = load i32, i32* %symbol_type_size, align 4 + %mul54 = mul i32 256, %73 + %conv55 = zext i32 %mul54 to i64 + %call56 = call i32 @cudaMalloc(i8** %72, i64 %conv55) + store i32 %call56, i32* %err53, align 4 + %74 = load i32, i32* %err53, align 4 + %cmp57 = icmp ne i32 0, %74 + br i1 %cmp57, label %if.then58, label %if.end61 + +if.then58: ; preds = %if.end52 + %75 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %76 = load i32, i32* %err53, align 4 + %call59 = call i8* @cudaGetErrorString(i32 %76) + %call60 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %75, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str.6, i64 0, i64 0), i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.7, i64 0, i64 0), i32 112, i8* %call59) + call void @exit(i32 1) #16 + unreachable + +if.end61: ; preds = %if.end52 + %77 = bitcast i32** %d_codewordlens to i8** + %78 = load i32, i32* %symbol_type_size, align 4 + %mul63 = mul i32 256, %78 + %conv64 = zext i32 %mul63 to i64 + %call65 = call i32 @cudaMalloc(i8** %77, i64 %conv64) + store i32 %call65, i32* %err62, align 4 + %79 = load i32, i32* %err62, align 4 + %cmp66 = icmp ne i32 0, %79 + br i1 %cmp66, label %if.then67, label %if.end70 + +if.then67: ; preds = %if.end61 + %80 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %81 = load i32, i32* %err62, align 4 + %call68 = call i8* @cudaGetErrorString(i32 %81) + %call69 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %80, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str.6, i64 0, i64 0), i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.7, i64 0, i64 0), i32 114, i8* %call68) + call void @exit(i32 1) #16 + unreachable + +if.end70: ; preds = %if.end61 + %82 = bitcast i32** %d_cw32 to i8** + %83 = load i32, i32* %mem_size, align 4 + %conv72 = zext i32 %83 to i64 + %call73 = call i32 @cudaMalloc(i8** %82, i64 %conv72) + store i32 %call73, i32* %err71, align 4 + %84 = load i32, i32* %err71, align 4 + %cmp74 = icmp ne i32 0, %84 + br i1 %cmp74, label %if.then75, label %if.end78 + +if.then75: ; preds = %if.end70 + %85 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %86 = load i32, i32* %err71, align 4 + %call76 = call i8* @cudaGetErrorString(i32 %86) + %call77 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %85, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str.6, i64 0, i64 0), i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.7, i64 0, i64 0), i32 116, i8* %call76) + call void @exit(i32 1) #16 + unreachable + +if.end78: ; preds = %if.end70 + %87 = bitcast i32** %d_cw32len to i8** + %88 = load i32, i32* %mem_size, align 4 + %conv80 = zext i32 %88 to i64 + %call81 = call i32 @cudaMalloc(i8** %87, i64 %conv80) + store i32 %call81, i32* %err79, align 4 + %89 = load i32, i32* %err79, align 4 + %cmp82 = icmp ne i32 0, %89 + br i1 %cmp82, label %if.then83, label %if.end86 + +if.then83: ; preds = %if.end78 + %90 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %91 = load i32, i32* %err79, align 4 + %call84 = call i8* @cudaGetErrorString(i32 %91) + %call85 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %90, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str.6, i64 0, i64 0), i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.7, i64 0, i64 0), i32 117, i8* %call84) + call void @exit(i32 1) #16 + unreachable + +if.end86: ; preds = %if.end78 + %92 = bitcast i32** %d_cw32idx to i8** + %93 = load i32, i32* %mem_size, align 4 + %conv88 = zext i32 %93 to i64 + %call89 = call i32 @cudaMalloc(i8** %92, i64 %conv88) + store i32 %call89, i32* %err87, align 4 + %94 = load i32, i32* %err87, align 4 + %cmp90 = icmp ne i32 0, %94 + br i1 %cmp90, label %if.then91, label %if.end94 + +if.then91: ; preds = %if.end86 + %95 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %96 = load i32, i32* %err87, align 4 + %call92 = call i8* @cudaGetErrorString(i32 %96) + %call93 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %95, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str.6, i64 0, i64 0), i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.7, i64 0, i64 0), i32 118, i8* %call92) + call void @exit(i32 1) #16 + unreachable + +if.end94: ; preds = %if.end86 + %97 = bitcast i32** %d_cindex to i8** + %98 = load i32, i32* %num_blocks.addr, align 4 + %conv96 = zext i32 %98 to i64 + %mul97 = mul i64 %conv96, 4 + %call98 = call i32 @cudaMalloc(i8** %97, i64 %mul97) + store i32 %call98, i32* %err95, align 4 + %99 = load i32, i32* %err95, align 4 + %cmp99 = icmp ne i32 0, %99 + br i1 %cmp99, label %if.then100, label %if.end103 + +if.then100: ; preds = %if.end94 + %100 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %101 = load i32, i32* %err95, align 4 + %call101 = call i8* @cudaGetErrorString(i32 %101) + %call102 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %100, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str.6, i64 0, i64 0), i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.7, i64 0, i64 0), i32 121, i8* %call101) + call void @exit(i32 1) #16 + unreachable + +if.end103: ; preds = %if.end94 + %102 = bitcast i32** %d_cindex2 to i8** + %103 = load i32, i32* %num_blocks.addr, align 4 + %conv105 = zext i32 %103 to i64 + %mul106 = mul i64 %conv105, 4 + %call107 = call i32 @cudaMalloc(i8** %102, i64 %mul106) + store i32 %call107, i32* %err104, align 4 + %104 = load i32, i32* %err104, align 4 + %cmp108 = icmp ne i32 0, %104 + br i1 %cmp108, label %if.then109, label %if.end112 + +if.then109: ; preds = %if.end103 + %105 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %106 = load i32, i32* %err104, align 4 + %call110 = call i8* @cudaGetErrorString(i32 %106) + %call111 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %105, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str.6, i64 0, i64 0), i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.7, i64 0, i64 0), i32 123, i8* %call110) + call void @exit(i32 1) #16 + unreachable + +if.end112: ; preds = %if.end103 + %107 = load i32*, i32** %d_sourceData, align 8 + %108 = bitcast i32* %107 to i8* + %109 = load i32*, i32** %sourceData, align 8 + %110 = bitcast i32* %109 to i8* + %111 = load i32, i32* %mem_size, align 4 + %conv114 = zext i32 %111 to i64 + %call115 = call i32 @cudaMemcpy(i8* %108, i8* %110, i64 %conv114, i32 1) + store i32 %call115, i32* %err113, align 4 + %112 = load i32, i32* %err113, align 4 + %cmp116 = icmp ne i32 0, %112 + br i1 %cmp116, label %if.then117, label %if.end120 + +if.then117: ; preds = %if.end112 + %113 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %114 = load i32, i32* %err113, align 4 + %call118 = call i8* @cudaGetErrorString(i32 %114) + %call119 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %113, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str.6, i64 0, i64 0), i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.7, i64 0, i64 0), i32 141, i8* %call118) + call void @exit(i32 1) #16 + unreachable + +if.end120: ; preds = %if.end112 + %115 = load i32*, i32** %d_codewords, align 8 + %116 = bitcast i32* %115 to i8* + %117 = load i32*, i32** %codewords, align 8 + %118 = bitcast i32* %117 to i8* + %119 = load i32, i32* %symbol_type_size, align 4 + %mul122 = mul i32 256, %119 + %conv123 = zext i32 %mul122 to i64 + %call124 = call i32 @cudaMemcpy(i8* %116, i8* %118, i64 %conv123, i32 1) + store i32 %call124, i32* %err121, align 4 + %120 = load i32, i32* %err121, align 4 + %cmp125 = icmp ne i32 0, %120 + br i1 %cmp125, label %if.then126, label %if.end129 + +if.then126: ; preds = %if.end120 + %121 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %122 = load i32, i32* %err121, align 4 + %call127 = call i8* @cudaGetErrorString(i32 %122) + %call128 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %121, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str.6, i64 0, i64 0), i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.7, i64 0, i64 0), i32 144, i8* %call127) + call void @exit(i32 1) #16 + unreachable + +if.end129: ; preds = %if.end120 + %123 = load i32*, i32** %d_codewordlens, align 8 + %124 = bitcast i32* %123 to i8* + %125 = load i32*, i32** %codewordlens, align 8 + %126 = bitcast i32* %125 to i8* + %127 = load i32, i32* %symbol_type_size, align 4 + %mul131 = mul i32 256, %127 + %conv132 = zext i32 %mul131 to i64 + %call133 = call i32 @cudaMemcpy(i8* %124, i8* %126, i64 %conv132, i32 1) + store i32 %call133, i32* %err130, align 4 + %128 = load i32, i32* %err130, align 4 + %cmp134 = icmp ne i32 0, %128 + br i1 %cmp134, label %if.then135, label %if.end138 + +if.then135: ; preds = %if.end129 + %129 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %130 = load i32, i32* %err130, align 4 + %call136 = call i8* @cudaGetErrorString(i32 %130) + %call137 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %129, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str.6, i64 0, i64 0), i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.7, i64 0, i64 0), i32 147, i8* %call136) + call void @exit(i32 1) #16 + unreachable + +if.end138: ; preds = %if.end129 + %131 = load i32*, i32** %d_destData, align 8 + %132 = bitcast i32* %131 to i8* + %133 = load i32*, i32** %destData, align 8 + %134 = bitcast i32* %133 to i8* + %135 = load i32, i32* %mem_size, align 4 + %conv140 = zext i32 %135 to i64 + %call141 = call i32 @cudaMemcpy(i8* %132, i8* %134, i64 %conv140, i32 1) + store i32 %call141, i32* %err139, align 4 + %136 = load i32, i32* %err139, align 4 + %cmp142 = icmp ne i32 0, %136 + br i1 %cmp142, label %if.then143, label %if.end146 + +if.then143: ; preds = %if.end138 + %137 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %138 = load i32, i32* %err139, align 4 + %call144 = call i8* @cudaGetErrorString(i32 %138) + %call145 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %137, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str.6, i64 0, i64 0), i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.7, i64 0, i64 0), i32 149, i8* %call144) + call void @exit(i32 1) #16 + unreachable + +if.end146: ; preds = %if.end138 + %139 = load i32, i32* %num_blocks.addr, align 4 + call void @_ZN4dim3C2Ejjj(%struct.dim3* %grid_size, i32 %139, i32 1, i32 1) + %140 = load i32, i32* %num_block_threads.addr, align 4 + call void @_ZN4dim3C2Ejjj(%struct.dim3* %block_size, i32 %140, i32 1, i32 1) + store i32 10, i32* %NT, align 4 + %call147 = call i64 @_Z8get_timev() + store i64 %call147, i64* %timer, align 8 + %141 = load i32*, i32** %sourceData, align 8 + %142 = load i32, i32* %num_elements, align 4 + %143 = load i32*, i32** %crefData, align 8 + %144 = load i32*, i32** %codewords, align 8 + %145 = load i32*, i32** %codewordlens, align 8 + call void @cpu_vlc_encode(i32* %141, i32 %142, i32* %143, i32* %refbytesize, i32* %144, i32* %145) + %call148 = call i64 @_Z8get_timev() + %146 = load i64, i64* %timer, align 8 + %sub = sub nsw i64 %call148, %146 + %conv149 = sitofp i64 %sub to double + %div = fdiv double %conv149, 1.000000e+03 + %conv150 = fptrunc double %div to float + store float %conv150, float* %msec, align 4 + %147 = load float, float* %msec, align 4 + %conv151 = fpext float %147 to double + %call152 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([34 x i8], [34 x i8]* @.str.8, i64 0, i64 0), double %conv151) + %148 = load i32, i32* %refbytesize, align 4 + %call153 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str.9, i64 0, i64 0), i32 %148) + %149 = load i32, i32* %refbytesize, align 4 + %div154 = udiv i32 %149, 4 + %150 = load i32, i32* %refbytesize, align 4 + %rem = urem i32 %150, 4 + %cmp155 = icmp eq i32 %rem, 0 + %151 = zext i1 %cmp155 to i64 + %cond = select i1 %cmp155, i32 0, i32 1 + %add = add i32 %div154, %cond + store i32 %add, i32* %num_ints, align 4 + %152 = load i32, i32* %num_blocks.addr, align 4 + %x = getelementptr inbounds %struct.dim3, %struct.dim3* %grid_size, i32 0, i32 0 + store i32 %152, i32* %x, align 4 + %153 = load i32, i32* %num_block_threads.addr, align 4 + %x156 = getelementptr inbounds %struct.dim3, %struct.dim3* %block_size, i32 0, i32 0 + store i32 %153, i32* %x156, align 4 + %x157 = getelementptr inbounds %struct.dim3, %struct.dim3* %block_size, i32 0, i32 0 + %154 = load i32, i32* %x157, align 4 + %conv158 = zext i32 %154 to i64 + %mul159 = mul i64 %conv158, 4 + %conv160 = trunc i64 %mul159 to i32 + store i32 %conv160, i32* %sm_size, align 4 + %x161 = getelementptr inbounds %struct.dim3, %struct.dim3* %block_size, i32 0, i32 0 + %155 = load i32, i32* %x161, align 4 + %conv162 = zext i32 %155 to i64 + %mul163 = mul i64 %conv162, 4 + %add164 = add i64 2048, %mul163 + %conv165 = trunc i64 %add164 to i32 + store i32 %conv165, i32* %sm_size, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %if.end146 + %156 = load i32, i32* %i, align 4 + %157 = load i32, i32* %NT, align 4 + %cmp166 = icmp ult i32 %156, %157 + br i1 %cmp166, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %158 = bitcast %struct.dim3* %agg.tmp to i8* + %159 = bitcast %struct.dim3* %grid_size to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %158, i8* align 4 %159, i64 12, i1 false) + %160 = bitcast %struct.dim3* %agg.tmp167 to i8* + %161 = bitcast %struct.dim3* %block_size to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %160, i8* align 4 
%161, i64 12, i1 false) + %162 = bitcast { i64, i32 }* %agg.tmp.coerce to i8* + %163 = bitcast %struct.dim3* %agg.tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %162, i8* align 4 %163, i64 12, i1 false) + %164 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 0 + %165 = load i64, i64* %164, align 4 + %166 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 1 + %167 = load i32, i32* %166, align 4 + %168 = bitcast { i64, i32 }* %agg.tmp167.coerce to i8* + %169 = bitcast %struct.dim3* %agg.tmp167 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %168, i8* align 4 %169, i64 12, i1 false) + %170 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp167.coerce, i32 0, i32 0 + %171 = load i64, i64* %170, align 4 + %172 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp167.coerce, i32 0, i32 1 + %173 = load i32, i32* %172, align 4 + %call168 = call i32 @__cudaPushCallConfiguration(i64 %165, i32 %167, i64 %171, i32 %173, i64 0, i8* null) + %tobool = icmp ne i32 %call168, 0 + br i1 %tobool, label %kcall.end, label %kcall.configok + +kcall.configok: ; preds = %for.body + %174 = load i32*, i32** %d_sourceData, align 8 + %175 = load i32*, i32** %d_codewords, align 8 + %176 = load i32*, i32** %d_codewordlens, align 8 + %177 = load i32*, i32** %d_cw32, align 8 + %178 = load i32*, i32** %d_cw32len, align 8 + %179 = load i32*, i32** %d_cw32idx, align 8 + %180 = load i32*, i32** %d_destData, align 8 + %181 = load i32*, i32** %d_cindex, align 8 + call void @_ZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_(i32* %174, i32* %175, i32* %176, i32* %177, i32* %178, i32* %179, i32* %180, i32* %181) + br label %kcall.end + +kcall.end: ; preds = %kcall.configok, %for.body + %call169 = call i32 @cudaThreadSynchronize() + br label %for.inc + +for.inc: ; preds = %kcall.end + %182 = load i32, i32* %i, align 4 + %inc = add nsw i32 %182, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + 
+for.end: ; preds = %for.cond + %x170 = getelementptr inbounds %struct.dim3, %struct.dim3* %grid_size, i32 0, i32 0 + %183 = load i32, i32* %x170, align 4 + store i32 %183, i32* %num_scan_elements, align 4 + %184 = load i32, i32* %num_scan_elements, align 4 + call void @_ZL17preallocBlockSumsj(i32 %184) + %185 = load i32*, i32** %d_destDataPacked, align 8 + %186 = bitcast i32* %185 to i8* + %187 = load i32, i32* %mem_size, align 4 + %conv171 = zext i32 %187 to i64 + %call172 = call i32 @cudaMemset(i8* %186, i32 0, i64 %conv171) + %188 = load i32, i32* %num_scan_elements, align 4 + %call173 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([40 x i8], [40 x i8]* @.str.10, i64 0, i64 0), i32 %188) + %189 = load i32*, i32** %d_cindex2, align 8 + %190 = load i32*, i32** %d_cindex, align 8 + %191 = load i32, i32* %num_scan_elements, align 4 + call void @_ZL12prescanArrayPjS_i(i32* %189, i32* %190, i32 %191) + %192 = load i32, i32* %num_scan_elements, align 4 + %div175 = udiv i32 %192, 32 + call void @_ZN4dim3C2Ejjj(%struct.dim3* %agg.tmp174, i32 %div175, i32 1, i32 1) + call void @_ZN4dim3C2Ejjj(%struct.dim3* %agg.tmp176, i32 32, i32 1, i32 1) + %193 = bitcast { i64, i32 }* %agg.tmp174.coerce to i8* + %194 = bitcast %struct.dim3* %agg.tmp174 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %193, i8* align 4 %194, i64 12, i1 false) + %195 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp174.coerce, i32 0, i32 0 + %196 = load i64, i64* %195, align 4 + %197 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp174.coerce, i32 0, i32 1 + %198 = load i32, i32* %197, align 4 + %199 = bitcast { i64, i32 }* %agg.tmp176.coerce to i8* + %200 = bitcast %struct.dim3* %agg.tmp176 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %199, i8* align 4 %200, i64 12, i1 false) + %201 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp176.coerce, i32 0, i32 0 + %202 = load i64, i64* %201, align 4 + %203 = getelementptr inbounds { i64, i32 
}, { i64, i32 }* %agg.tmp176.coerce, i32 0, i32 1 + %204 = load i32, i32* %203, align 4 + %call177 = call i32 @__cudaPushCallConfiguration(i64 %196, i32 %198, i64 %202, i32 %204, i64 0, i8* null) + %tobool178 = icmp ne i32 %call177, 0 + br i1 %tobool178, label %kcall.end181, label %kcall.configok179 + +kcall.configok179: ; preds = %for.end + %205 = load i32*, i32** %d_destData, align 8 + %206 = load i32*, i32** %d_cindex, align 8 + %207 = load i32*, i32** %d_cindex2, align 8 + %208 = load i32*, i32** %d_destDataPacked, align 8 + %209 = load i32, i32* %num_elements, align 4 + %210 = load i32, i32* %num_scan_elements, align 4 + %div180 = udiv i32 %209, %210 + call void @_ZL5pack2PjS_S_S_j(i32* %205, i32* %206, i32* %207, i32* %208, i32 %div180) + br label %kcall.end181 + +kcall.end181: ; preds = %kcall.configok179, %for.end + %call182 = call i32 @cudaThreadSynchronize() + %call184 = call i32 @cudaGetLastError() + store i32 %call184, i32* %err183, align 4 + %211 = load i32, i32* %err183, align 4 + %cmp185 = icmp ne i32 0, %211 + br i1 %cmp185, label %if.then186, label %if.end189 + +if.then186: ; preds = %kcall.end181 + %212 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %213 = load i32, i32* %err183, align 4 + %call187 = call i8* @cudaGetErrorString(i32 %213) + %call188 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %212, i8* getelementptr inbounds ([46 x i8], [46 x i8]* @.str.11, i64 0, i64 0), i8* getelementptr inbounds ([31 x i8], [31 x i8]* @.str.12, i64 0, i64 0), i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.7, i64 0, i64 0), i32 198, i8* %call187) + call void @exit(i32 1) #16 + unreachable + +if.end189: ; preds = %kcall.end181 + call void @_ZL16deallocBlockSumsv() + %214 = load i32*, i32** %destData, align 8 + %215 = bitcast i32* %214 to i8* + %216 = load i32*, i32** %d_destDataPacked, align 8 + %217 = bitcast i32* %216 to i8* + %218 = load i32, i32* %mem_size, align 4 + %conv191 = zext i32 %218 to i64 + %call192 = call i32 @cudaMemcpy(i8* %215, i8* %217, i64 %conv191, i32 2) + store i32 %call192, i32* %err190, align 4 + %219 = load i32, i32* %err190, align 4 + %cmp193 = icmp ne i32 0, %219 + br i1 %cmp193, label %if.then194, label %if.end197 + +if.then194: ; preds = %if.end189 + %220 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %221 = load i32, i32* %err190, align 4 + %call195 = call i8* @cudaGetErrorString(i32 %221) + %call196 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %220, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str.6, i64 0, i64 0), i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.7, i64 0, i64 0), i32 203, i8* %call195) + call void @exit(i32 1) #16 + unreachable + +if.end197: ; preds = %if.end189 + %222 = load i32*, i32** %crefData, align 8 + %223 = load i32*, i32** %destData, align 8 + %224 = load i32, i32* %num_ints, align 4 + %call198 = call i32 @_Z15compare_vectorsIjEiPT_S1_j(i32* %222, i32* %223, i32 %224) + %225 = load i32*, i32** %sourceData, align 8 + %226 = bitcast i32* %225 to i8* + call void @free(i8* %226) #3 + %227 = load i32*, i32** %destData, align 8 + %228 = bitcast i32* %227 to i8* + call void @free(i8* %228) #3 + %229 = load i32*, i32** %codewords, align 8 + %230 = bitcast i32* %229 to i8* + call void @free(i8* %230) #3 + %231 = load i32*, i32** %codewordlens, align 8 + %232 = bitcast i32* %231 to i8* + call void @free(i8* %232) #3 + %233 = load i32*, i32** %cw32, align 8 + %234 = bitcast i32* %233 to i8* + call void @free(i8* %234) #3 + %235 = load i32*, i32** %cw32len, align 8 + %236 = bitcast i32* %235 to i8* + call void @free(i8* %236) #3 + %237 = load i32*, i32** %crefData, align 8 + %238 = bitcast i32* %237 to i8* + call void @free(i8* %238) #3 + %239 = load i32*, i32** %d_sourceData, align 8 + %240 = bitcast i32* %239 to i8* + %call200 = call i32 @cudaFree(i8* %240) + store i32 %call200, i32* %err199, align 4 + %241 = load i32, i32* %err199, align 4 + %cmp201 = icmp ne i32 0, %241 + br i1 %cmp201, label %if.then202, label %if.end205 + +if.then202: ; preds = %if.end197 + %242 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %243 = load i32, i32* %err199, align 4 + %call203 = call i8* @cudaGetErrorString(i32 %243) + %call204 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %242, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str.6, i64 0, i64 0), i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.7, i64 0, i64 0), i32 214, i8* %call203) + call void @exit(i32 1) #16 + unreachable + +if.end205: ; preds = %if.end197 + %244 = load i32*, i32** %d_destData, align 8 + %245 = bitcast i32* %244 to i8* + %call207 = call i32 @cudaFree(i8* %245) + store i32 %call207, i32* %err206, align 4 + %246 = load i32, i32* %err206, align 4 + %cmp208 = icmp ne i32 0, %246 + br i1 %cmp208, label %if.then209, label %if.end212 + +if.then209: ; preds = %if.end205 + %247 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %248 = load i32, i32* %err206, align 4 + %call210 = call i8* @cudaGetErrorString(i32 %248) + %call211 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %247, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str.6, i64 0, i64 0), i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.7, i64 0, i64 0), i32 215, i8* %call210) + call void @exit(i32 1) #16 + unreachable + +if.end212: ; preds = %if.end205 + %249 = load i32*, i32** %d_destDataPacked, align 8 + %250 = bitcast i32* %249 to i8* + %call214 = call i32 @cudaFree(i8* %250) + store i32 %call214, i32* %err213, align 4 + %251 = load i32, i32* %err213, align 4 + %cmp215 = icmp ne i32 0, %251 + br i1 %cmp215, label %if.then216, label %if.end219 + +if.then216: ; preds = %if.end212 + %252 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %253 = load i32, i32* %err213, align 4 + %call217 = call i8* @cudaGetErrorString(i32 %253) + %call218 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %252, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str.6, i64 0, i64 0), i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.7, i64 0, i64 0), i32 216, i8* %call217) + call void @exit(i32 1) #16 + unreachable + +if.end219: ; preds = %if.end212 + %254 = load i32*, i32** %d_codewords, align 8 + %255 = bitcast i32* %254 to i8* + %call221 = call i32 @cudaFree(i8* %255) + store i32 %call221, i32* %err220, align 4 + %256 = load i32, i32* %err220, align 4 + %cmp222 = icmp ne i32 0, %256 + br i1 %cmp222, label %if.then223, label %if.end226 + +if.then223: ; preds = %if.end219 + %257 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %258 = load i32, i32* %err220, align 4 + %call224 = call i8* @cudaGetErrorString(i32 %258) + %call225 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %257, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str.6, i64 0, i64 0), i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.7, i64 0, i64 0), i32 217, i8* %call224) + call void @exit(i32 1) #16 + unreachable + +if.end226: ; preds = %if.end219 + %259 = load i32*, i32** %d_codewordlens, align 8 + %260 = bitcast i32* %259 to i8* + %call228 = call i32 @cudaFree(i8* %260) + store i32 %call228, i32* %err227, align 4 + %261 = load i32, i32* %err227, align 4 + %cmp229 = icmp ne i32 0, %261 + br i1 %cmp229, label %if.then230, label %if.end233 + +if.then230: ; preds = %if.end226 + %262 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %263 = load i32, i32* %err227, align 4 + %call231 = call i8* @cudaGetErrorString(i32 %263) + %call232 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %262, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str.6, i64 0, i64 0), i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.7, i64 0, i64 0), i32 218, i8* %call231) + call void @exit(i32 1) #16 + unreachable + +if.end233: ; preds = %if.end226 + %264 = load i32*, i32** %d_cw32, align 8 + %265 = bitcast i32* %264 to i8* + %call235 = call i32 @cudaFree(i8* %265) + store i32 %call235, i32* %err234, align 4 + %266 = load i32, i32* %err234, align 4 + %cmp236 = icmp ne i32 0, %266 + br i1 %cmp236, label %if.then237, label %if.end240 + +if.then237: ; preds = %if.end233 + %267 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %268 = load i32, i32* %err234, align 4 + %call238 = call i8* @cudaGetErrorString(i32 %268) + %call239 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %267, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str.6, i64 0, i64 0), i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.7, i64 0, i64 0), i32 219, i8* %call238) + call void @exit(i32 1) #16 + unreachable + +if.end240: ; preds = %if.end233 + %269 = load i32*, i32** %d_cw32len, align 8 + %270 = bitcast i32* %269 to i8* + %call242 = call i32 @cudaFree(i8* %270) + store i32 %call242, i32* %err241, align 4 + %271 = load i32, i32* %err241, align 4 + %cmp243 = icmp ne i32 0, %271 + br i1 %cmp243, label %if.then244, label %if.end247 + +if.then244: ; preds = %if.end240 + %272 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %273 = load i32, i32* %err241, align 4 + %call245 = call i8* @cudaGetErrorString(i32 %273) + %call246 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %272, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str.6, i64 0, i64 0), i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.7, i64 0, i64 0), i32 220, i8* %call245) + call void @exit(i32 1) #16 + unreachable + +if.end247: ; preds = %if.end240 + %274 = load i32*, i32** %d_cw32idx, align 8 + %275 = bitcast i32* %274 to i8* + %call249 = call i32 @cudaFree(i8* %275) + store i32 %call249, i32* %err248, align 4 + %276 = load i32, i32* %err248, align 4 + %cmp250 = icmp ne i32 0, %276 + br i1 %cmp250, label %if.then251, label %if.end254 + +if.then251: ; preds = %if.end247 + %277 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %278 = load i32, i32* %err248, align 4 + %call252 = call i8* @cudaGetErrorString(i32 %278) + %call253 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %277, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str.6, i64 0, i64 0), i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.7, i64 0, i64 0), i32 221, i8* %call252) + call void @exit(i32 1) #16 + unreachable + +if.end254: ; preds = %if.end247 + %279 = load i32*, i32** %d_cindex, align 8 + %280 = bitcast i32* %279 to i8* + %call256 = call i32 @cudaFree(i8* %280) + store i32 %call256, i32* %err255, align 4 + %281 = load i32, i32* %err255, align 4 + %cmp257 = icmp ne i32 0, %281 + br i1 %cmp257, label %if.then258, label %if.end261 + +if.then258: ; preds = %if.end254 + %282 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %283 = load i32, i32* %err255, align 4 + %call259 = call i8* @cudaGetErrorString(i32 %283) + %call260 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %282, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str.6, i64 0, i64 0), i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.7, i64 0, i64 0), i32 222, i8* %call259) + call void @exit(i32 1) #16 + unreachable + +if.end261: ; preds = %if.end254 + %284 = load i32*, i32** %d_cindex2, align 8 + %285 = bitcast i32* %284 to i8* + %call263 = call i32 @cudaFree(i8* %285) + store i32 %call263, i32* %err262, align 4 + %286 = load i32, i32* %err262, align 4 + %cmp264 = icmp ne i32 0, %286 + br i1 %cmp264, label %if.then265, label %if.end268 + +if.then265: ; preds = %if.end261 + %287 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %288 = load i32, i32* %err262, align 4 + %call266 = call i8* @cudaGetErrorString(i32 %288) + %call267 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %287, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str.6, i64 0, i64 0), i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.7, i64 0, i64 0), i32 223, i8* %call266) + call void @exit(i32 1) #16 + unreachable + +if.end268: ; preds = %if.end261 + %289 = load i32*, i32** %cindex2, align 8 + %290 = bitcast i32* %289 to i8* + call void @free(i8* %290) #3 + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_Z10initParamsPcjRjS0_S0_j(i8* %file_name, i32 %num_block_threads, i32* dereferenceable(4) %num_blocks, i32* dereferenceable(4) %num_elements, i32* dereferenceable(4) %mem_size, i32 %symbol_type_size) #0 comdat { +entry: + %file_name.addr = alloca i8*, align 8 + %num_block_threads.addr = alloca i32, align 4 + %num_blocks.addr = alloca i32*, align 8 + %num_elements.addr = alloca i32*, align 8 + %mem_size.addr = alloca i32*, align 8 + %symbol_type_size.addr = alloca i32, align 4 + %f = alloca %struct._IO_FILE*, align 8 + store i8* %file_name, i8** %file_name.addr, align 8 + store i32 %num_block_threads, i32* %num_block_threads.addr, align 4 + store i32* %num_blocks, i32** 
%num_blocks.addr, align 8 + store i32* %num_elements, i32** %num_elements.addr, align 8 + store i32* %mem_size, i32** %mem_size.addr, align 8 + store i32 %symbol_type_size, i32* %symbol_type_size.addr, align 4 + %0 = load i8*, i8** %file_name.addr, align 8 + %cmp = icmp eq i8* %0, null + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %1 = load i32*, i32** %num_blocks.addr, align 8 + %2 = load i32, i32* %1, align 4 + %3 = load i32, i32* %num_block_threads.addr, align 4 + %mul = mul i32 %2, %3 + %4 = load i32*, i32** %num_elements.addr, align 8 + store i32 %mul, i32* %4, align 4 + %5 = load i32*, i32** %num_elements.addr, align 8 + %6 = load i32, i32* %5, align 4 + %7 = load i32, i32* %symbol_type_size.addr, align 4 + %mul1 = mul i32 %6, %7 + %8 = load i32*, i32** %mem_size.addr, align 8 + store i32 %mul1, i32* %8, align 4 + br label %if.end7 + +if.else: ; preds = %entry + %9 = load i8*, i8** %file_name.addr, align 8 + %call = call %struct._IO_FILE* @fopen(i8* %9, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.1, i64 0, i64 0)) + store %struct._IO_FILE* %call, %struct._IO_FILE** %f, align 8 + %10 = load %struct._IO_FILE*, %struct._IO_FILE** %f, align 8 + %tobool = icmp ne %struct._IO_FILE* %10, null + br i1 %tobool, label %if.end, label %if.then2 + +if.then2: ; preds = %if.else + %11 = load i8*, i8** %file_name.addr, align 8 + call void @perror(i8* %11) + call void @exit(i32 1) #16 + unreachable + +if.end: ; preds = %if.else + %12 = load %struct._IO_FILE*, %struct._IO_FILE** %f, align 8 + %call3 = call i32 @fseek(%struct._IO_FILE* %12, i64 0, i32 2) + %13 = load %struct._IO_FILE*, %struct._IO_FILE** %f, align 8 + %call4 = call i64 @ftell(%struct._IO_FILE* %13) + %conv = trunc i64 %call4 to i32 + %14 = load i32*, i32** %mem_size.addr, align 8 + store i32 %conv, i32* %14, align 4 + %15 = load %struct._IO_FILE*, %struct._IO_FILE** %f, align 8 + %call5 = call i32 @fclose(%struct._IO_FILE* %15) + %16 = load i32*, i32** %mem_size.addr, 
align 8 + %17 = load i32, i32* %16, align 4 + %18 = load i32, i32* %symbol_type_size.addr, align 4 + %div = udiv i32 %17, %18 + %19 = load i32*, i32** %num_elements.addr, align 8 + store i32 %div, i32* %19, align 4 + %20 = load i32*, i32** %num_elements.addr, align 8 + %21 = load i32, i32* %20, align 4 + %22 = load i32, i32* %num_block_threads.addr, align 4 + %div6 = udiv i32 %21, %22 + %23 = load i32*, i32** %num_blocks.addr, align 8 + store i32 %div6, i32* %23, align 4 + br label %if.end7 + +if.end7: ; preds = %if.end, %if.then + ret void +} + +; Function Attrs: nounwind +declare dso_local noalias i8* @malloc(i64) #11 + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg) #4 + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_Z8loadDataPcPjS0_S0_jjRd(i8* %file_name, i32* %sourceData, i32* %codewords, i32* %codewordlens, i32 %num_elements, i32 %mem_size, double* dereferenceable(8) %H) #0 comdat personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %file_name.addr = alloca i8*, align 8 + %sourceData.addr = alloca i32*, align 8 + %codewords.addr = alloca i32*, align 8 + %codewordlens.addr = alloca i32*, align 8 + %num_elements.addr = alloca i32, align 4 + %mem_size.addr = alloca i32, align 4 + %H.addr = alloca double*, align 8 + %freqs = alloca [256 x i32], align 16 + %root = alloca %class.INode*, align 8 + %codes = alloca %"class.std::map", align 8 + %ref.tmp = alloca %"class.std::vector.0", align 8 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + %it = alloca %"struct.std::_Rb_tree_const_iterator", align 8 + %ref.tmp8 = alloca %"struct.std::_Rb_tree_iterator", align 8 + %ref.tmp12 = alloca %"struct.std::_Rb_tree_const_iterator", align 8 + %ref.tmp13 = alloca %"struct.std::_Rb_tree_iterator", align 8 + %count = alloca i32, align 4 + %agg.tmp = alloca %"struct.std::_Bit_const_iterator", align 8 + %agg.tmp24 = alloca 
%"struct.std::_Bit_const_iterator", align 8 + %i = alloca i32, align 4 + %i59 = alloca i32, align 4 + %p = alloca double, align 8 + store i8* %file_name, i8** %file_name.addr, align 8 + store i32* %sourceData, i32** %sourceData.addr, align 8 + store i32* %codewords, i32** %codewords.addr, align 8 + store i32* %codewordlens, i32** %codewordlens.addr, align 8 + store i32 %num_elements, i32* %num_elements.addr, align 4 + store i32 %mem_size, i32* %mem_size.addr, align 4 + store double* %H, double** %H.addr, align 8 + %0 = load i8*, i8** %file_name.addr, align 8 + %cmp = icmp eq i8* %0, null + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.14, i64 0, i64 0)) + call void @exit(i32 -1) #16 + unreachable + +if.else: ; preds = %entry + %1 = bitcast [256 x i32]* %freqs to i8* + call void @llvm.memset.p0i8.i64(i8* align 16 %1, i8 0, i64 1024, i1 false) + %2 = load i8*, i8** %file_name.addr, align 8 + %arraydecay = getelementptr inbounds [256 x i32], [256 x i32]* %freqs, i64 0, i64 0 + %3 = load i32, i32* %mem_size.addr, align 4 + %4 = load i32*, i32** %sourceData.addr, align 8 + %call1 = call i32 @_Z8runHistoPcPjjS0_(i8* %2, i32* %arraydecay, i32 %3, i32* %4) + %call2 = call %class.INode* @_Z9BuildTreeRA256_j([256 x i32]* dereferenceable(1024) %freqs) + store %class.INode* %call2, %class.INode** %root, align 8 + call void @_ZNSt3mapIhSt6vectorIbSaIbEESt4lessIhESaISt4pairIKhS2_EEEC2Ev(%"class.std::map"* %codes) + %5 = load %class.INode*, %class.INode** %root, align 8 + invoke void @_ZNSt6vectorIbSaIbEEC2Ev(%"class.std::vector.0"* %ref.tmp) + to label %invoke.cont unwind label %lpad + +invoke.cont: ; preds = %if.else + invoke void @_Z13GenerateCodesPK5INodeRKSt6vectorIbSaIbEERSt3mapIhS4_St4lessIhESaISt4pairIKhS4_EEE(%class.INode* %5, %"class.std::vector.0"* dereferenceable(40) %ref.tmp, %"class.std::map"* dereferenceable(48) %codes) + to label %invoke.cont4 
unwind label %lpad3 + +invoke.cont4: ; preds = %invoke.cont + invoke void @_ZNSt6vectorIbSaIbEED2Ev(%"class.std::vector.0"* %ref.tmp) + to label %invoke.cont5 unwind label %lpad + +invoke.cont5: ; preds = %invoke.cont4 + %6 = load %class.INode*, %class.INode** %root, align 8 + %isnull = icmp eq %class.INode* %6, null + br i1 %isnull, label %delete.end, label %delete.notnull + +delete.notnull: ; preds = %invoke.cont5 + %7 = bitcast %class.INode* %6 to void (%class.INode*)*** + %vtable = load void (%class.INode*)**, void (%class.INode*)*** %7, align 8 + %vfn = getelementptr inbounds void (%class.INode*)*, void (%class.INode*)** %vtable, i64 1 + %8 = load void (%class.INode*)*, void (%class.INode*)** %vfn, align 8 + invoke void %8(%class.INode* %6) + to label %invoke.cont7 unwind label %lpad + +invoke.cont7: ; preds = %delete.notnull + br label %delete.end + +delete.end: ; preds = %invoke.cont7, %invoke.cont5 + %call10 = invoke %"struct.std::_Rb_tree_node_base"* @_ZNSt3mapIhSt6vectorIbSaIbEESt4lessIhESaISt4pairIKhS2_EEE5beginEv(%"class.std::map"* %codes) + to label %invoke.cont9 unwind label %lpad + +invoke.cont9: ; preds = %delete.end + %coerce.dive = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %ref.tmp8, i32 0, i32 0 + store %"struct.std::_Rb_tree_node_base"* %call10, %"struct.std::_Rb_tree_node_base"** %coerce.dive, align 8 + invoke void @_ZNSt23_Rb_tree_const_iteratorISt4pairIKhSt6vectorIbSaIbEEEEC2ERKSt17_Rb_tree_iteratorIS5_E(%"struct.std::_Rb_tree_const_iterator"* %it, %"struct.std::_Rb_tree_iterator"* dereferenceable(8) %ref.tmp8) + to label %invoke.cont11 unwind label %lpad + +invoke.cont11: ; preds = %invoke.cont9 + br label %for.cond + +for.cond: ; preds = %invoke.cont56, %invoke.cont11 + %call15 = invoke %"struct.std::_Rb_tree_node_base"* @_ZNSt3mapIhSt6vectorIbSaIbEESt4lessIhESaISt4pairIKhS2_EEE3endEv(%"class.std::map"* %codes) + to label %invoke.cont14 unwind label %lpad + +invoke.cont14: ; preds = %for.cond 
+ %coerce.dive16 = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %ref.tmp13, i32 0, i32 0 + store %"struct.std::_Rb_tree_node_base"* %call15, %"struct.std::_Rb_tree_node_base"** %coerce.dive16, align 8 + invoke void @_ZNSt23_Rb_tree_const_iteratorISt4pairIKhSt6vectorIbSaIbEEEEC2ERKSt17_Rb_tree_iteratorIS5_E(%"struct.std::_Rb_tree_const_iterator"* %ref.tmp12, %"struct.std::_Rb_tree_iterator"* dereferenceable(8) %ref.tmp13) + to label %invoke.cont17 unwind label %lpad + +invoke.cont17: ; preds = %invoke.cont14 + %call19 = invoke zeroext i1 @_ZNKSt23_Rb_tree_const_iteratorISt4pairIKhSt6vectorIbSaIbEEEEneERKS6_(%"struct.std::_Rb_tree_const_iterator"* %it, %"struct.std::_Rb_tree_const_iterator"* dereferenceable(8) %ref.tmp12) + to label %invoke.cont18 unwind label %lpad + +invoke.cont18: ; preds = %invoke.cont17 + br i1 %call19, label %for.body, label %for.end58 + +for.body: ; preds = %invoke.cont18 + %call21 = invoke %"struct.std::pair"* @_ZNKSt23_Rb_tree_const_iteratorISt4pairIKhSt6vectorIbSaIbEEEEptEv(%"struct.std::_Rb_tree_const_iterator"* %it) + to label %invoke.cont20 unwind label %lpad + +invoke.cont20: ; preds = %for.body + %second = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %call21, i32 0, i32 1 + %call23 = invoke { i64*, i32 } @_ZNKSt6vectorIbSaIbEE5beginEv(%"class.std::vector.0"* %second) + to label %invoke.cont22 unwind label %lpad + +invoke.cont22: ; preds = %invoke.cont20 + %9 = bitcast %"struct.std::_Bit_const_iterator"* %agg.tmp to { i64*, i32 }* + %10 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %9, i32 0, i32 0 + %11 = extractvalue { i64*, i32 } %call23, 0 + store i64* %11, i64** %10, align 8 + %12 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %9, i32 0, i32 1 + %13 = extractvalue { i64*, i32 } %call23, 1 + store i32 %13, i32* %12, align 8 + %call26 = invoke %"struct.std::pair"* 
@_ZNKSt23_Rb_tree_const_iteratorISt4pairIKhSt6vectorIbSaIbEEEEptEv(%"struct.std::_Rb_tree_const_iterator"* %it) + to label %invoke.cont25 unwind label %lpad + +invoke.cont25: ; preds = %invoke.cont22 + %second27 = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %call26, i32 0, i32 1 + %call29 = invoke { i64*, i32 } @_ZNKSt6vectorIbSaIbEE3endEv(%"class.std::vector.0"* %second27) + to label %invoke.cont28 unwind label %lpad + +invoke.cont28: ; preds = %invoke.cont25 + %14 = bitcast %"struct.std::_Bit_const_iterator"* %agg.tmp24 to { i64*, i32 }* + %15 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %14, i32 0, i32 0 + %16 = extractvalue { i64*, i32 } %call29, 0 + store i64* %16, i64** %15, align 8 + %17 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %14, i32 0, i32 1 + %18 = extractvalue { i64*, i32 } %call29, 1 + store i32 %18, i32* %17, align 8 + %19 = bitcast %"struct.std::_Bit_const_iterator"* %agg.tmp to { i64*, i32 }* + %20 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %19, i32 0, i32 0 + %21 = load i64*, i64** %20, align 8 + %22 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %19, i32 0, i32 1 + %23 = load i32, i32* %22, align 8 + %24 = bitcast %"struct.std::_Bit_const_iterator"* %agg.tmp24 to { i64*, i32 }* + %25 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %24, i32 0, i32 0 + %26 = load i64*, i64** %25, align 8 + %27 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %24, i32 0, i32 1 + %28 = load i32, i32* %27, align 8 + %call31 = invoke i64 @_ZSt8distanceISt19_Bit_const_iteratorENSt15iterator_traitsIT_E15difference_typeES2_S2_(i64* %21, i32 %23, i64* %26, i32 %28) + to label %invoke.cont30 unwind label %lpad + +invoke.cont30: ; preds = %invoke.cont28 + %conv = trunc i64 %call31 to i32 + store i32 %conv, i32* %count, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond32 + +for.cond32: ; preds = %for.inc, %invoke.cont30 + %29 = load i32, i32* %i, align 4 + %30 = load i32, i32* %count, 
align 4 + %cmp33 = icmp ult i32 %29, %30 + br i1 %cmp33, label %for.body34, label %for.end + +for.body34: ; preds = %for.cond32 + %call36 = invoke %"struct.std::pair"* @_ZNKSt23_Rb_tree_const_iteratorISt4pairIKhSt6vectorIbSaIbEEEEptEv(%"struct.std::_Rb_tree_const_iterator"* %it) + to label %invoke.cont35 unwind label %lpad + +invoke.cont35: ; preds = %for.body34 + %second37 = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %call36, i32 0, i32 1 + %31 = load i32, i32* %i, align 4 + %conv38 = sext i32 %31 to i64 + %call40 = invoke zeroext i1 @_ZNKSt6vectorIbSaIbEEixEm(%"class.std::vector.0"* %second37, i64 %conv38) + to label %invoke.cont39 unwind label %lpad + +invoke.cont39: ; preds = %invoke.cont35 + br i1 %call40, label %if.then41, label %if.end + +if.then41: ; preds = %invoke.cont39 + %32 = load i32, i32* %count, align 4 + %33 = load i32, i32* %i, align 4 + %sub = sub nsw i32 %32, %33 + %sub42 = sub nsw i32 %sub, 1 + %call44 = invoke float @_ZSt3powfi(float 2.000000e+00, i32 %sub42) + to label %invoke.cont43 unwind label %lpad + +invoke.cont43: ; preds = %if.then41 + %conv45 = fptoui float %call44 to i32 + %34 = load i32*, i32** %codewords.addr, align 8 + %call47 = invoke %"struct.std::pair"* @_ZNKSt23_Rb_tree_const_iteratorISt4pairIKhSt6vectorIbSaIbEEEEptEv(%"struct.std::_Rb_tree_const_iterator"* %it) + to label %invoke.cont46 unwind label %lpad + +invoke.cont46: ; preds = %invoke.cont43 + %first = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %call47, i32 0, i32 0 + %35 = load i8, i8* %first, align 8 + %conv48 = zext i8 %35 to i32 + %idxprom = zext i32 %conv48 to i64 + %arrayidx = getelementptr inbounds i32, i32* %34, i64 %idxprom + %36 = load i32, i32* %arrayidx, align 4 + %add = add i32 %36, %conv45 + store i32 %add, i32* %arrayidx, align 4 + br label %if.end + +lpad: ; preds = %for.end78, %for.inc55, %for.end, %invoke.cont43, %if.then41, %invoke.cont35, %for.body34, %invoke.cont28, %invoke.cont25, %invoke.cont22, 
%invoke.cont20, %for.body, %invoke.cont17, %invoke.cont14, %for.cond, %invoke.cont9, %delete.end, %delete.notnull, %invoke.cont4, %if.else + %37 = landingpad { i8*, i32 } + cleanup + %38 = extractvalue { i8*, i32 } %37, 0 + store i8* %38, i8** %exn.slot, align 8 + %39 = extractvalue { i8*, i32 } %37, 1 + store i32 %39, i32* %ehselector.slot, align 4 + br label %ehcleanup + +lpad3: ; preds = %invoke.cont + %40 = landingpad { i8*, i32 } + cleanup + %41 = extractvalue { i8*, i32 } %40, 0 + store i8* %41, i8** %exn.slot, align 8 + %42 = extractvalue { i8*, i32 } %40, 1 + store i32 %42, i32* %ehselector.slot, align 4 + invoke void @_ZNSt6vectorIbSaIbEED2Ev(%"class.std::vector.0"* %ref.tmp) + to label %invoke.cont6 unwind label %terminate.lpad + +invoke.cont6: ; preds = %lpad3 + br label %ehcleanup + +if.end: ; preds = %invoke.cont46, %invoke.cont39 + br label %for.inc + +for.inc: ; preds = %if.end + %43 = load i32, i32* %i, align 4 + %inc = add nsw i32 %43, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond32 + +for.end: ; preds = %for.cond32 + %44 = load i32, i32* %count, align 4 + %45 = load i32*, i32** %codewordlens.addr, align 8 + %call50 = invoke %"struct.std::pair"* @_ZNKSt23_Rb_tree_const_iteratorISt4pairIKhSt6vectorIbSaIbEEEEptEv(%"struct.std::_Rb_tree_const_iterator"* %it) + to label %invoke.cont49 unwind label %lpad + +invoke.cont49: ; preds = %for.end + %first51 = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %call50, i32 0, i32 0 + %46 = load i8, i8* %first51, align 8 + %conv52 = zext i8 %46 to i32 + %idxprom53 = zext i32 %conv52 to i64 + %arrayidx54 = getelementptr inbounds i32, i32* %45, i64 %idxprom53 + store i32 %44, i32* %arrayidx54, align 4 + br label %for.inc55 + +for.inc55: ; preds = %invoke.cont49 + %call57 = invoke dereferenceable(8) %"struct.std::_Rb_tree_const_iterator"* @_ZNSt23_Rb_tree_const_iteratorISt4pairIKhSt6vectorIbSaIbEEEEppEv(%"struct.std::_Rb_tree_const_iterator"* %it) + to label %invoke.cont56 unwind label 
%lpad + +invoke.cont56: ; preds = %for.inc55 + br label %for.cond + +for.end58: ; preds = %invoke.cont18 + %47 = load double*, double** %H.addr, align 8 + store double 0.000000e+00, double* %47, align 8 + store i32 0, i32* %i59, align 4 + br label %for.cond60 + +for.cond60: ; preds = %for.inc76, %for.end58 + %48 = load i32, i32* %i59, align 4 + %cmp61 = icmp ult i32 %48, 256 + br i1 %cmp61, label %for.body62, label %for.end78 + +for.body62: ; preds = %for.cond60 + %49 = load i32, i32* %i59, align 4 + %idxprom63 = zext i32 %49 to i64 + %arrayidx64 = getelementptr inbounds [256 x i32], [256 x i32]* %freqs, i64 0, i64 %idxprom63 + %50 = load i32, i32* %arrayidx64, align 4 + %cmp65 = icmp ugt i32 %50, 0 + br i1 %cmp65, label %if.then66, label %if.end75 + +if.then66: ; preds = %for.body62 + %51 = load i32, i32* %i59, align 4 + %idxprom67 = zext i32 %51 to i64 + %arrayidx68 = getelementptr inbounds [256 x i32], [256 x i32]* %freqs, i64 0, i64 %idxprom67 + %52 = load i32, i32* %arrayidx68, align 4 + %conv69 = uitofp i32 %52 to double + %53 = load i32, i32* %mem_size.addr, align 4 + %conv70 = uitofp i32 %53 to double + %div = fdiv double %conv69, %conv70 + store double %div, double* %p, align 8 + %54 = load double, double* %p, align 8 + %55 = load double, double* %p, align 8 + %call71 = call double @log(double %55) #3 + %mul = fmul contract double %54, %call71 + %call72 = call double @log(double 2.000000e+00) #3 + %div73 = fdiv double %mul, %call72 + %56 = load double*, double** %H.addr, align 8 + %57 = load double, double* %56, align 8 + %add74 = fadd contract double %57, %div73 + store double %add74, double* %56, align 8 + br label %if.end75 + +if.end75: ; preds = %if.then66, %for.body62 + br label %for.inc76 + +for.inc76: ; preds = %if.end75 + %58 = load i32, i32* %i59, align 4 + %inc77 = add i32 %58, 1 + store i32 %inc77, i32* %i59, align 4 + br label %for.cond60 + +for.end78: ; preds = %for.cond60 + %59 = load double*, double** %H.addr, align 8 + %60 = load double, 
double* %59, align 8 + %fneg = fneg double %60 + %61 = load double*, double** %H.addr, align 8 + store double %fneg, double* %61, align 8 + %62 = load i8*, i8** %file_name.addr, align 8 + %63 = load i32, i32* %mem_size.addr, align 4 + %64 = load double*, double** %H.addr, align 8 + %65 = load double, double* %64, align 8 + %call80 = invoke i32 (i8*, ...) @printf(i8* getelementptr inbounds ([28 x i8], [28 x i8]* @.str.15, i64 0, i64 0), i8* %62, i32 %63, double %65) + to label %invoke.cont79 unwind label %lpad + +invoke.cont79: ; preds = %for.end78 + call void @_ZNSt3mapIhSt6vectorIbSaIbEESt4lessIhESaISt4pairIKhS2_EEED2Ev(%"class.std::map"* %codes) + br label %if.end82 + +ehcleanup: ; preds = %invoke.cont6, %lpad + invoke void @_ZNSt3mapIhSt6vectorIbSaIbEESt4lessIhESaISt4pairIKhS2_EEED2Ev(%"class.std::map"* %codes) + to label %invoke.cont81 unwind label %terminate.lpad + +invoke.cont81: ; preds = %ehcleanup + br label %eh.resume + +if.end82: ; preds = %invoke.cont79 + ret void + +eh.resume: ; preds = %invoke.cont81 + %exn = load i8*, i8** %exn.slot, align 8 + %sel = load i32, i32* %ehselector.slot, align 4 + %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn, 0 + %lpad.val83 = insertvalue { i8*, i32 } %lpad.val, i32 %sel, 1 + resume { i8*, i32 } %lpad.val83 + +terminate.lpad: ; preds = %ehcleanup, %lpad3 + %66 = landingpad { i8*, i32 } + catch i8* null + %67 = extractvalue { i8*, i32 } %66, 0 + call void @__clang_call_terminate(i8* %67) #16 + unreachable +} + +declare dso_local i32 @fprintf(%struct._IO_FILE*, i8*, ...) 
#1 + +declare dso_local i8* @cudaGetErrorString(i32) #1 + +declare dso_local void @cpu_vlc_encode(i32*, i32, i32*, i32*, i32*, i32*) #1 + +; Function Attrs: noinline optnone uwtable +define internal void @_ZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_(i32* %data, i32* %gm_codewords, i32* %gm_codewordlens, i32* %cw32, i32* %cw32len, i32* %cw32idx, i32* %out, i32* %outidx) #0 { +entry: + %data.addr = alloca i32*, align 8 + %gm_codewords.addr = alloca i32*, align 8 + %gm_codewordlens.addr = alloca i32*, align 8 + %cw32.addr = alloca i32*, align 8 + %cw32len.addr = alloca i32*, align 8 + %cw32idx.addr = alloca i32*, align 8 + %out.addr = alloca i32*, align 8 + %outidx.addr = alloca i32*, align 8 + %grid_dim = alloca %struct.dim3, align 8 + %block_dim = alloca %struct.dim3, align 8 + %shmem_size = alloca i64, align 8 + %stream = alloca i8*, align 8 + %grid_dim.coerce = alloca { i64, i32 }, align 8 + %block_dim.coerce = alloca { i64, i32 }, align 8 + store i32* %data, i32** %data.addr, align 8 + store i32* %gm_codewords, i32** %gm_codewords.addr, align 8 + store i32* %gm_codewordlens, i32** %gm_codewordlens.addr, align 8 + store i32* %cw32, i32** %cw32.addr, align 8 + store i32* %cw32len, i32** %cw32len.addr, align 8 + store i32* %cw32idx, i32** %cw32idx.addr, align 8 + store i32* %out, i32** %out.addr, align 8 + store i32* %outidx, i32** %outidx.addr, align 8 + %kernel_args = alloca i8*, i64 8, align 16 + %0 = bitcast i32** %data.addr to i8* + %1 = getelementptr i8*, i8** %kernel_args, i32 0 + store i8* %0, i8** %1 + %2 = bitcast i32** %gm_codewords.addr to i8* + %3 = getelementptr i8*, i8** %kernel_args, i32 1 + store i8* %2, i8** %3 + %4 = bitcast i32** %gm_codewordlens.addr to i8* + %5 = getelementptr i8*, i8** %kernel_args, i32 2 + store i8* %4, i8** %5 + %6 = bitcast i32** %cw32.addr to i8* + %7 = getelementptr i8*, i8** %kernel_args, i32 3 + store i8* %6, i8** %7 + %8 = bitcast i32** %cw32len.addr to i8* + %9 = getelementptr i8*, i8** %kernel_args, i32 4 + 
store i8* %8, i8** %9 + %10 = bitcast i32** %cw32idx.addr to i8* + %11 = getelementptr i8*, i8** %kernel_args, i32 5 + store i8* %10, i8** %11 + %12 = bitcast i32** %out.addr to i8* + %13 = getelementptr i8*, i8** %kernel_args, i32 6 + store i8* %12, i8** %13 + %14 = bitcast i32** %outidx.addr to i8* + %15 = getelementptr i8*, i8** %kernel_args, i32 7 + store i8* %14, i8** %15 + %16 = call i32 @__cudaPopCallConfiguration(%struct.dim3* %grid_dim, %struct.dim3* %block_dim, i64* %shmem_size, i8** %stream) + %17 = load i64, i64* %shmem_size, align 8 + %18 = load i8*, i8** %stream, align 8 + %19 = bitcast { i64, i32 }* %grid_dim.coerce to i8* + %20 = bitcast %struct.dim3* %grid_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %19, i8* align 8 %20, i64 12, i1 false) + %21 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 0 + %22 = load i64, i64* %21, align 8 + %23 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 1 + %24 = load i32, i32* %23, align 8 + %25 = bitcast { i64, i32 }* %block_dim.coerce to i8* + %26 = bitcast %struct.dim3* %block_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %25, i8* align 8 %26, i64 12, i1 false) + %27 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 0 + %28 = load i64, i64* %27, align 8 + %29 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 1 + %30 = load i32, i32* %29, align 8 + %31 = bitcast i8* %18 to %struct.CUstream_st* + %call = call i32 @cudaLaunchKernel(i8* bitcast (void (i32*, i32*, i32*, i32*, i32*, i32*, i32*, i32*)* @_ZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_ to i8*), i64 %22, i32 %24, i64 %28, i32 %30, i8** %kernel_args, i64 %17, %struct.CUstream_st* %31) + br label %setup.end + +setup.end: ; preds = %entry + ret void +} + +declare dso_local i32 @cudaThreadSynchronize() #1 + +; Function Attrs: noinline optnone uwtable +define internal void 
@_ZL17preallocBlockSumsj(i32 %maxNumElements) #0 { +entry: + %maxNumElements.addr = alloca i32, align 4 + %blockSize = alloca i32, align 4 + %numElts = alloca i32, align 4 + %level = alloca i32, align 4 + %numBlocks = alloca i32, align 4 + %numBlocks19 = alloca i32, align 4 + %err = alloca i32, align 4 + %err52 = alloca i32, align 4 + store i32 %maxNumElements, i32* %maxNumElements.addr, align 4 + %0 = load i32, i32* @_ZL18g_numEltsAllocated, align 4 + %cmp = icmp eq i32 %0, 0 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + call void @__assert_fail(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.16, i64 0, i64 0), i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str.17, i64 0, i64 0), i32 63, i8* getelementptr inbounds ([37 x i8], [37 x i8]* @__PRETTY_FUNCTION__._ZL17preallocBlockSumsj, i64 0, i64 0)) #16 + unreachable + +1: ; No predecessors! + br label %cond.end + +cond.end: ; preds = %1, %cond.true + %2 = load i32, i32* %maxNumElements.addr, align 4 + store i32 %2, i32* @_ZL18g_numEltsAllocated, align 4 + store i32 256, i32* %blockSize, align 4 + %3 = load i32, i32* %maxNumElements.addr, align 4 + store i32 %3, i32* %numElts, align 4 + store i32 0, i32* %level, align 4 + br label %do.body + +do.body: ; preds = %do.cond, %cond.end + %4 = load i32, i32* %numElts, align 4 + %conv = uitofp i32 %4 to float + %5 = load i32, i32* %blockSize, align 4 + %conv1 = uitofp i32 %5 to float + %mul = fmul contract float 2.000000e+00, %conv1 + %div = fdiv float %conv, %mul + %call = call float @_ZSt4ceilf(float %div) + %conv2 = fptosi float %call to i32 + %cmp3 = icmp sgt i32 1, %conv2 + br i1 %cmp3, label %cond.true4, label %cond.false5 + +cond.true4: ; preds = %do.body + br label %cond.end12 + +cond.false5: ; preds = %do.body + %6 = load i32, i32* %numElts, align 4 + %conv6 = uitofp i32 %6 to float + %7 = load i32, i32* %blockSize, align 4 + %conv7 = uitofp i32 %7 to float + 
%mul8 = fmul contract float 2.000000e+00, %conv7 + %div9 = fdiv float %conv6, %mul8 + %call10 = call float @_ZSt4ceilf(float %div9) + %conv11 = fptosi float %call10 to i32 + br label %cond.end12 + +cond.end12: ; preds = %cond.false5, %cond.true4 + %cond = phi i32 [ 1, %cond.true4 ], [ %conv11, %cond.false5 ] + store i32 %cond, i32* %numBlocks, align 4 + %8 = load i32, i32* %numBlocks, align 4 + %cmp13 = icmp ugt i32 %8, 1 + br i1 %cmp13, label %if.then, label %if.end + +if.then: ; preds = %cond.end12 + %9 = load i32, i32* %level, align 4 + %inc = add nsw i32 %9, 1 + store i32 %inc, i32* %level, align 4 + br label %if.end + +if.end: ; preds = %if.then, %cond.end12 + %10 = load i32, i32* %numBlocks, align 4 + store i32 %10, i32* %numElts, align 4 + br label %do.cond + +do.cond: ; preds = %if.end + %11 = load i32, i32* %numElts, align 4 + %cmp14 = icmp ugt i32 %11, 1 + br i1 %cmp14, label %do.body, label %do.end + +do.end: ; preds = %do.cond + %12 = load i32, i32* %level, align 4 + %conv15 = sext i32 %12 to i64 + %mul16 = mul i64 %conv15, 8 + %call17 = call noalias i8* @malloc(i64 %mul16) #3 + %13 = bitcast i8* %call17 to i32** + store i32** %13, i32*** @_ZL15g_scanBlockSums, align 8 + %14 = load i32, i32* %level, align 4 + store i32 %14, i32* @_ZL20g_numLevelsAllocated, align 4 + %15 = load i32, i32* %maxNumElements.addr, align 4 + store i32 %15, i32* %numElts, align 4 + store i32 0, i32* %level, align 4 + br label %do.body18 + +do.body18: ; preds = %do.cond49, %do.end + %16 = load i32, i32* %numElts, align 4 + %conv20 = uitofp i32 %16 to float + %17 = load i32, i32* %blockSize, align 4 + %conv21 = uitofp i32 %17 to float + %mul22 = fmul contract float 2.000000e+00, %conv21 + %div23 = fdiv float %conv20, %mul22 + %call24 = call float @_ZSt4ceilf(float %div23) + %conv25 = fptosi float %call24 to i32 + %cmp26 = icmp sgt i32 1, %conv25 + br i1 %cmp26, label %cond.true27, label %cond.false28 + +cond.true27: ; preds = %do.body18 + br label %cond.end35 + +cond.false28: ; 
preds = %do.body18 + %18 = load i32, i32* %numElts, align 4 + %conv29 = uitofp i32 %18 to float + %19 = load i32, i32* %blockSize, align 4 + %conv30 = uitofp i32 %19 to float + %mul31 = fmul contract float 2.000000e+00, %conv30 + %div32 = fdiv float %conv29, %mul31 + %call33 = call float @_ZSt4ceilf(float %div32) + %conv34 = fptosi float %call33 to i32 + br label %cond.end35 + +cond.end35: ; preds = %cond.false28, %cond.true27 + %cond36 = phi i32 [ 1, %cond.true27 ], [ %conv34, %cond.false28 ] + store i32 %cond36, i32* %numBlocks19, align 4 + %20 = load i32, i32* %numBlocks19, align 4 + %cmp37 = icmp ugt i32 %20, 1 + br i1 %cmp37, label %if.then38, label %if.end48 + +if.then38: ; preds = %cond.end35 + %21 = load i32**, i32*** @_ZL15g_scanBlockSums, align 8 + %22 = load i32, i32* %level, align 4 + %inc39 = add nsw i32 %22, 1 + store i32 %inc39, i32* %level, align 4 + %idxprom = sext i32 %22 to i64 + %arrayidx = getelementptr inbounds i32*, i32** %21, i64 %idxprom + %23 = bitcast i32** %arrayidx to i8** + %24 = load i32, i32* %numBlocks19, align 4 + %conv40 = zext i32 %24 to i64 + %mul41 = mul i64 %conv40, 4 + %call42 = call i32 @cudaMalloc(i8** %23, i64 %mul41) + store i32 %call42, i32* %err, align 4 + %25 = load i32, i32* %err, align 4 + %cmp43 = icmp ne i32 0, %25 + br i1 %cmp43, label %if.then44, label %if.end47 + +if.then44: ; preds = %if.then38 + %26 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %27 = load i32, i32* %err, align 4 + %call45 = call i8* @cudaGetErrorString(i32 %27) + %call46 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %26, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str.6, i64 0, i64 0), i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str.17, i64 0, i64 0), i32 89, i8* %call45) + call void @exit(i32 1) #16 + unreachable + +if.end47: ; preds = %if.then38 + br label %if.end48 + +if.end48: ; preds = %if.end47, %cond.end35 + %28 = load i32, i32* %numBlocks19, align 4 + store i32 %28, i32* %numElts, align 4 + br label %do.cond49 + +do.cond49: ; preds = %if.end48 + %29 = load i32, i32* %numElts, align 4 + %cmp50 = icmp ugt i32 %29, 1 + br i1 %cmp50, label %do.body18, label %do.end51 + +do.end51: ; preds = %do.cond49 + %call53 = call i32 @cudaGetLastError() + store i32 %call53, i32* %err52, align 4 + %30 = load i32, i32* %err52, align 4 + %cmp54 = icmp ne i32 0, %30 + br i1 %cmp54, label %if.then55, label %if.end58 + +if.then55: ; preds = %do.end51 + %31 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %32 = load i32, i32* %err52, align 4 + %call56 = call i8* @cudaGetErrorString(i32 %32) + %call57 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %31, i8* getelementptr inbounds ([46 x i8], [46 x i8]* @.str.11, i64 0, i64 0), i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str.18, i64 0, i64 0), i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str.17, i64 0, i64 0), i32 93, i8* %call56) + call void @exit(i32 1) #16 + unreachable + +if.end58: ; preds = %do.end51 + ret void +} + +; Function Attrs: noinline optnone uwtable +define internal void @_ZL12prescanArrayPjS_i(i32* %outArray, i32* %inArray, i32 %numElements) #0 { +entry: + %outArray.addr = alloca i32*, align 8 + %inArray.addr = alloca i32*, align 8 + %numElements.addr = alloca i32, align 4 + store i32* %outArray, i32** %outArray.addr, align 8 + store i32* %inArray, i32** %inArray.addr, align 8 + store i32 %numElements, i32* %numElements.addr, align 4 + %0 = load i32*, i32** %outArray.addr, align 8 + %1 = load i32*, i32** %inArray.addr, align 8 + %2 = load i32, i32* %numElements.addr, align 4 + call void @_ZL21prescanArrayRecursivePjPKjii(i32* %0, i32* %1, i32 %2, i32 0) + ret void +} + +; Function Attrs: noinline optnone uwtable +define internal void @_ZL5pack2PjS_S_S_j(i32* %srcData, i32* %cindex, i32* %cindex2, i32* %dstData, i32 %original_num_block_elements) #0 { +entry: + %srcData.addr = alloca i32*, align 8 + %cindex.addr = alloca i32*, align 8 + %cindex2.addr = alloca i32*, align 8 + %dstData.addr = alloca i32*, align 8 + %original_num_block_elements.addr = alloca i32, align 4 + %grid_dim = alloca %struct.dim3, align 8 + %block_dim = alloca %struct.dim3, align 8 + %shmem_size = alloca i64, align 8 + %stream = alloca i8*, align 8 + %grid_dim.coerce = alloca { i64, i32 }, align 8 + %block_dim.coerce = alloca { i64, i32 }, align 8 + store i32* %srcData, i32** %srcData.addr, align 8 + store i32* %cindex, i32** %cindex.addr, align 8 + store i32* %cindex2, i32** %cindex2.addr, align 8 + store i32* %dstData, i32** %dstData.addr, align 8 + store i32 %original_num_block_elements, i32* %original_num_block_elements.addr, 
align 4 + %kernel_args = alloca i8*, i64 5, align 16 + %0 = bitcast i32** %srcData.addr to i8* + %1 = getelementptr i8*, i8** %kernel_args, i32 0 + store i8* %0, i8** %1 + %2 = bitcast i32** %cindex.addr to i8* + %3 = getelementptr i8*, i8** %kernel_args, i32 1 + store i8* %2, i8** %3 + %4 = bitcast i32** %cindex2.addr to i8* + %5 = getelementptr i8*, i8** %kernel_args, i32 2 + store i8* %4, i8** %5 + %6 = bitcast i32** %dstData.addr to i8* + %7 = getelementptr i8*, i8** %kernel_args, i32 3 + store i8* %6, i8** %7 + %8 = bitcast i32* %original_num_block_elements.addr to i8* + %9 = getelementptr i8*, i8** %kernel_args, i32 4 + store i8* %8, i8** %9 + %10 = call i32 @__cudaPopCallConfiguration(%struct.dim3* %grid_dim, %struct.dim3* %block_dim, i64* %shmem_size, i8** %stream) + %11 = load i64, i64* %shmem_size, align 8 + %12 = load i8*, i8** %stream, align 8 + %13 = bitcast { i64, i32 }* %grid_dim.coerce to i8* + %14 = bitcast %struct.dim3* %grid_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %13, i8* align 8 %14, i64 12, i1 false) + %15 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 0 + %16 = load i64, i64* %15, align 8 + %17 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 1 + %18 = load i32, i32* %17, align 8 + %19 = bitcast { i64, i32 }* %block_dim.coerce to i8* + %20 = bitcast %struct.dim3* %block_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %19, i8* align 8 %20, i64 12, i1 false) + %21 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 0 + %22 = load i64, i64* %21, align 8 + %23 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 1 + %24 = load i32, i32* %23, align 8 + %25 = bitcast i8* %12 to %struct.CUstream_st* + %call = call i32 @cudaLaunchKernel(i8* bitcast (void (i32*, i32*, i32*, i32*, i32)* @_ZL5pack2PjS_S_S_j to i8*), i64 %16, i32 %18, i64 %22, i32 %24, i8** %kernel_args, i64 %11, 
%struct.CUstream_st* %25) + br label %setup.end + +setup.end: ; preds = %entry + ret void +} + +declare dso_local i32 @cudaGetLastError() #1 + +; Function Attrs: noinline optnone uwtable +define internal void @_ZL16deallocBlockSumsv() #0 { +entry: + %i = alloca i32, align 4 + %err = alloca i32, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %1 = load i32, i32* @_ZL20g_numLevelsAllocated, align 4 + %cmp = icmp ult i32 %0, %1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %2 = load i32**, i32*** @_ZL15g_scanBlockSums, align 8 + %3 = load i32, i32* %i, align 4 + %idxprom = zext i32 %3 to i64 + %arrayidx = getelementptr inbounds i32*, i32** %2, i64 %idxprom + %4 = load i32*, i32** %arrayidx, align 8 + %5 = bitcast i32* %4 to i8* + %call = call i32 @cudaFree(i8* %5) + br label %for.inc + +for.inc: ; preds = %for.body + %6 = load i32, i32* %i, align 4 + %inc = add i32 %6, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %call1 = call i32 @cudaGetLastError() + store i32 %call1, i32* %err, align 4 + %7 = load i32, i32* %err, align 4 + %cmp2 = icmp ne i32 0, %7 + br i1 %cmp2, label %if.then, label %if.end + +if.then: ; preds = %for.end + %8 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %9 = load i32, i32* %err, align 4 + %call3 = call i8* @cudaGetErrorString(i32 %9) + %call4 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %8, i8* getelementptr inbounds ([46 x i8], [46 x i8]* @.str.11, i64 0, i64 0), i8* getelementptr inbounds ([17 x i8], [17 x i8]* @.str.25, i64 0, i64 0), i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str.17, i64 0, i64 0), i32 101, i8* %call3) + call void @exit(i32 1) #16 + unreachable + +if.end: ; preds = %for.end + %10 = load i32**, i32*** @_ZL15g_scanBlockSums, align 8 + %11 = bitcast i32** %10 to i8** + %12 = bitcast i8** %11 to i8* + call void @free(i8* %12) #3 + store i32** null, i32*** @_ZL15g_scanBlockSums, align 8 + store i32 0, i32* @_ZL18g_numEltsAllocated, align 4 + store i32 0, i32* @_ZL20g_numLevelsAllocated, align 4 + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local i32 @_Z15compare_vectorsIjEiPT_S1_j(i32* %data1, i32* %data2, i32 %size) #0 comdat { +entry: + %data1.addr = alloca i32*, align 8 + %data2.addr = alloca i32*, align 8 + %size.addr = alloca i32, align 4 + %match = alloca i8, align 1 + %i = alloca i32, align 4 + store i32* %data1, i32** %data1.addr, align 8 + store i32* %data2, i32** %data2.addr, align 8 + store i32 %size, i32* %size.addr, align 4 + %call = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([21 x i8], [21 x i8]* @.str.28, i64 0, i64 0)) + store i8 1, i8* %match, align 1 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %1 = load i32, i32* %size.addr, align 4 + %cmp = icmp ult i32 %0, %1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %2 = load i32*, i32** %data1.addr, align 8 + %3 = load i32, i32* %i, align 4 + %idxprom = zext i32 %3 to i64 + %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom + %4 = load i32, i32* %arrayidx, align 4 + %5 = load i32*, i32** %data2.addr, align 8 + %6 = load i32, i32* %i, align 4 + %idxprom1 = zext i32 %6 to i64 + %arrayidx2 = getelementptr inbounds i32, i32* %5, i64 %idxprom1 + %7 = load i32, i32* %arrayidx2, align 4 + %cmp3 = icmp ne i32 %4, %7 + br i1 %cmp3, label %if.then, label %if.end + +if.then: ; preds = %for.body + store i8 0, i8* %match, align 1 + %8 = load i32, i32* %i, align 4 + %9 = load i32*, i32** %data1.addr, align 8 + %10 = load i32, i32* %i, align 4 + %idxprom4 = zext i32 %10 to i64 + %arrayidx5 = getelementptr inbounds i32, i32* %9, i64 %idxprom4 + %11 = load i32, i32* %arrayidx5, align 4 + %12 = load i32, i32* %i, align 4 + %13 = load i32*, i32** %data2.addr, align 8 + %14 = load i32, i32* %i, align 4 + %idxprom6 = zext i32 %14 to i64 + %arrayidx7 = getelementptr inbounds i32, i32* %13, i64 %idxprom6 + %15 = load i32, i32* %arrayidx7, align 4 + %call8 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.str.29, i64 0, i64 0), i32 %8, i32 %11, i32 %12, i32 %15) + br label %if.end + +if.end: ; preds = %if.then, %for.body + br label %for.inc + +for.inc: ; preds = %if.end + %16 = load i32, i32* %i, align 4 + %inc = add i32 %16, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %17 = load i8, i8* %match, align 1 + %tobool = trunc i8 %17 to i1 + br i1 %tobool, label %if.then9, label %if.else + +if.then9: ; preds = %for.end + %call10 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([29 x i8], [29 x i8]* @.str.30, i64 0, i64 0)) + ret i32 0 + +if.else: ; preds = %for.end + %call11 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([33 x i8], [33 x i8]* @.str.31, i64 0, i64 0)) + call void @exit(i32 1) #16 + unreachable +} + +; Function Attrs: nounwind +declare dso_local void @free(i8*) #11 + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN5INodeC2Ei(%class.INode* %this, i32 %f) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %class.INode*, align 8 + %f.addr = alloca i32, align 4 + store %class.INode* %this, %class.INode** %this.addr, align 8 + store i32 %f, i32* %f.addr, align 4 + %this1 = load %class.INode*, %class.INode** %this.addr, align 8 + %0 = bitcast %class.INode* %this1 to i32 (...)*** + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [4 x i8*] }, { [4 x i8*] }* @_ZTV5INode, i32 0, inrange i32 0, i32 2) to i32 (...)**), i32 (...)*** %0, align 8 + %f2 = getelementptr inbounds %class.INode, %class.INode* %this1, i32 0, i32 1 + %1 = load i32, i32* %f.addr, align 4 + store i32 %1, i32* %f2, align 8 + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN8LeafNodeD2Ev(%class.LeafNode* %this) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %class.LeafNode*, align 8 + store %class.LeafNode* %this, %class.LeafNode** 
%this.addr, align 8 + %this1 = load %class.LeafNode*, %class.LeafNode** %this.addr, align 8 + %0 = bitcast %class.LeafNode* %this1 to %class.INode* + call void @_ZN5INodeD2Ev(%class.INode* %0) + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZN8LeafNodeD0Ev(%class.LeafNode* %this) unnamed_addr #0 comdat align 2 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %this.addr = alloca %class.LeafNode*, align 8 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + store %class.LeafNode* %this, %class.LeafNode** %this.addr, align 8 + %this1 = load %class.LeafNode*, %class.LeafNode** %this.addr, align 8 + invoke void @_ZN8LeafNodeD2Ev(%class.LeafNode* %this1) + to label %invoke.cont unwind label %lpad + +invoke.cont: ; preds = %entry + %0 = bitcast %class.LeafNode* %this1 to i8* + call void @_ZdlPv(i8* %0) #18 + ret void + +lpad: ; preds = %entry + %1 = landingpad { i8*, i32 } + cleanup + %2 = extractvalue { i8*, i32 } %1, 0 + store i8* %2, i8** %exn.slot, align 8 + %3 = extractvalue { i8*, i32 } %1, 1 + store i32 %3, i32* %ehselector.slot, align 4 + %4 = bitcast %class.LeafNode* %this1 to i8* + call void @_ZdlPv(i8* %4) #18 + br label %eh.resume + +eh.resume: ; preds = %lpad + %exn = load i8*, i8** %exn.slot, align 8 + %sel = load i32, i32* %ehselector.slot, align 4 + %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn, 0 + %lpad.val2 = insertvalue { i8*, i32 } %lpad.val, i32 %sel, 1 + resume { i8*, i32 } %lpad.val2 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN5INodeD2Ev(%class.INode* %this) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %class.INode*, align 8 + store %class.INode* %this, %class.INode** %this.addr, align 8 + %this1 = load %class.INode*, %class.INode** %this.addr, align 8 + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZN5INodeD0Ev(%class.INode* %this) 
unnamed_addr #0 comdat align 2 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %this.addr = alloca %class.INode*, align 8 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + store %class.INode* %this, %class.INode** %this.addr, align 8 + %this1 = load %class.INode*, %class.INode** %this.addr, align 8 + invoke void @_ZN5INodeD2Ev(%class.INode* %this1) + to label %invoke.cont unwind label %lpad + +invoke.cont: ; preds = %entry + %0 = bitcast %class.INode* %this1 to i8* + call void @_ZdlPv(i8* %0) #18 + ret void + +lpad: ; preds = %entry + %1 = landingpad { i8*, i32 } + cleanup + %2 = extractvalue { i8*, i32 } %1, 0 + store i8* %2, i8** %exn.slot, align 8 + %3 = extractvalue { i8*, i32 } %1, 1 + store i32 %3, i32* %ehselector.slot, align 4 + %4 = bitcast %class.INode* %this1 to i8* + call void @_ZdlPv(i8* %4) #18 + br label %eh.resume + +eh.resume: ; preds = %lpad + %exn = load i8*, i8** %exn.slot, align 8 + %sel = load i32, i32* %ehselector.slot, align 4 + %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn, 0 + %lpad.val2 = insertvalue { i8*, i32 } %lpad.val, i32 %sel, 1 + resume { i8*, i32 } %lpad.val2 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZN12InternalNodeD2Ev(%class.InternalNode* %this) unnamed_addr #0 comdat align 2 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %this.addr = alloca %class.InternalNode*, align 8 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + store %class.InternalNode* %this, %class.InternalNode** %this.addr, align 8 + %this1 = load %class.InternalNode*, %class.InternalNode** %this.addr, align 8 + %0 = bitcast %class.InternalNode* %this1 to i32 (...)*** + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [4 x i8*] }, { [4 x i8*] }* @_ZTV12InternalNode, i32 0, inrange i32 0, i32 2) to i32 (...)**), i32 (...)*** %0, align 8 + %left = getelementptr inbounds %class.InternalNode, %class.InternalNode* %this1, i32 0, 
i32 1 + %1 = load %class.INode*, %class.INode** %left, align 8 + %isnull = icmp eq %class.INode* %1, null + br i1 %isnull, label %delete.end, label %delete.notnull + +delete.notnull: ; preds = %entry + %2 = bitcast %class.INode* %1 to void (%class.INode*)*** + %vtable = load void (%class.INode*)**, void (%class.INode*)*** %2, align 8 + %vfn = getelementptr inbounds void (%class.INode*)*, void (%class.INode*)** %vtable, i64 1 + %3 = load void (%class.INode*)*, void (%class.INode*)** %vfn, align 8 + invoke void %3(%class.INode* %1) + to label %invoke.cont unwind label %lpad + +invoke.cont: ; preds = %delete.notnull + br label %delete.end + +delete.end: ; preds = %invoke.cont, %entry + %right = getelementptr inbounds %class.InternalNode, %class.InternalNode* %this1, i32 0, i32 2 + %4 = load %class.INode*, %class.INode** %right, align 8 + %isnull2 = icmp eq %class.INode* %4, null + br i1 %isnull2, label %delete.end7, label %delete.notnull3 + +delete.notnull3: ; preds = %delete.end + %5 = bitcast %class.INode* %4 to void (%class.INode*)*** + %vtable4 = load void (%class.INode*)**, void (%class.INode*)*** %5, align 8 + %vfn5 = getelementptr inbounds void (%class.INode*)*, void (%class.INode*)** %vtable4, i64 1 + %6 = load void (%class.INode*)*, void (%class.INode*)** %vfn5, align 8 + invoke void %6(%class.INode* %4) + to label %invoke.cont6 unwind label %lpad + +invoke.cont6: ; preds = %delete.notnull3 + br label %delete.end7 + +delete.end7: ; preds = %invoke.cont6, %delete.end + %7 = bitcast %class.InternalNode* %this1 to %class.INode* + call void @_ZN5INodeD2Ev(%class.INode* %7) + ret void + +lpad: ; preds = %delete.notnull3, %delete.notnull + %8 = landingpad { i8*, i32 } + cleanup + %9 = extractvalue { i8*, i32 } %8, 0 + store i8* %9, i8** %exn.slot, align 8 + %10 = extractvalue { i8*, i32 } %8, 1 + store i32 %10, i32* %ehselector.slot, align 4 + %11 = bitcast %class.InternalNode* %this1 to %class.INode* + invoke void @_ZN5INodeD2Ev(%class.INode* %11) + to label 
%invoke.cont8 unwind label %terminate.lpad + +invoke.cont8: ; preds = %lpad + br label %eh.resume + +eh.resume: ; preds = %invoke.cont8 + %exn = load i8*, i8** %exn.slot, align 8 + %sel = load i32, i32* %ehselector.slot, align 4 + %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn, 0 + %lpad.val9 = insertvalue { i8*, i32 } %lpad.val, i32 %sel, 1 + resume { i8*, i32 } %lpad.val9 + +terminate.lpad: ; preds = %lpad + %12 = landingpad { i8*, i32 } + catch i8* null + %13 = extractvalue { i8*, i32 } %12, 0 + call void @__clang_call_terminate(i8* %13) #16 + unreachable +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZN12InternalNodeD0Ev(%class.InternalNode* %this) unnamed_addr #0 comdat align 2 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %this.addr = alloca %class.InternalNode*, align 8 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + store %class.InternalNode* %this, %class.InternalNode** %this.addr, align 8 + %this1 = load %class.InternalNode*, %class.InternalNode** %this.addr, align 8 + invoke void @_ZN12InternalNodeD2Ev(%class.InternalNode* %this1) + to label %invoke.cont unwind label %lpad + +invoke.cont: ; preds = %entry + %0 = bitcast %class.InternalNode* %this1 to i8* + call void @_ZdlPv(i8* %0) #18 + ret void + +lpad: ; preds = %entry + %1 = landingpad { i8*, i32 } + cleanup + %2 = extractvalue { i8*, i32 } %1, 0 + store i8* %2, i8** %exn.slot, align 8 + %3 = extractvalue { i8*, i32 } %1, 1 + store i32 %3, i32* %ehselector.slot, align 4 + %4 = bitcast %class.InternalNode* %this1 to i8* + call void @_ZdlPv(i8* %4) #18 + br label %eh.resume + +eh.resume: ; preds = %lpad + %exn = load i8*, i8** %exn.slot, align 8 + %sel = load i32, i32* %ehselector.slot, align 4 + %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn, 0 + %lpad.val2 = insertvalue { i8*, i32 } %lpad.val, i32 %sel, 1 + resume { i8*, i32 } %lpad.val2 +} + +declare dso_local i64 @ftell(%struct._IO_FILE*) #1 + +; Function 
Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt3mapIhSt6vectorIbSaIbEESt4lessIhESaISt4pairIKhS2_EEEC2Ev(%"class.std::map"* %this) unnamed_addr #0 comdat align 2 { +entry: + %this.addr = alloca %"class.std::map"*, align 8 + store %"class.std::map"* %this, %"class.std::map"** %this.addr, align 8 + %this1 = load %"class.std::map"*, %"class.std::map"** %this.addr, align 8 + %_M_t = getelementptr inbounds %"class.std::map", %"class.std::map"* %this1, i32 0, i32 0 + call void @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EEC2Ev(%"class.std::_Rb_tree"* %_M_t) + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt6vectorIbSaIbEEC2Ev(%"class.std::vector.0"* %this) unnamed_addr #0 comdat align 2 { +entry: + %this.addr = alloca %"class.std::vector.0"*, align 8 + store %"class.std::vector.0"* %this, %"class.std::vector.0"** %this.addr, align 8 + %this1 = load %"class.std::vector.0"*, %"class.std::vector.0"** %this.addr, align 8 + %0 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Bvector_base"* + call void @_ZNSt13_Bvector_baseISaIbEEC2Ev(%"struct.std::_Bvector_base"* %0) + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %"struct.std::_Rb_tree_node_base"* @_ZNSt3mapIhSt6vectorIbSaIbEESt4lessIhESaISt4pairIKhS2_EEE5beginEv(%"class.std::map"* %this) #0 comdat align 2 { +entry: + %retval = alloca %"struct.std::_Rb_tree_iterator", align 8 + %this.addr = alloca %"class.std::map"*, align 8 + store %"class.std::map"* %this, %"class.std::map"** %this.addr, align 8 + %this1 = load %"class.std::map"*, %"class.std::map"** %this.addr, align 8 + %_M_t = getelementptr inbounds %"class.std::map", %"class.std::map"* %this1, i32 0, i32 0 + %call = call %"struct.std::_Rb_tree_node_base"* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE5beginEv(%"class.std::_Rb_tree"* %_M_t) + %coerce.dive = getelementptr 
inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %retval, i32 0, i32 0 + store %"struct.std::_Rb_tree_node_base"* %call, %"struct.std::_Rb_tree_node_base"** %coerce.dive, align 8 + %coerce.dive2 = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %retval, i32 0, i32 0 + %0 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %coerce.dive2, align 8 + ret %"struct.std::_Rb_tree_node_base"* %0 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZNSt23_Rb_tree_const_iteratorISt4pairIKhSt6vectorIbSaIbEEEEC2ERKSt17_Rb_tree_iteratorIS5_E(%"struct.std::_Rb_tree_const_iterator"* %this, %"struct.std::_Rb_tree_iterator"* dereferenceable(8) %__it) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Rb_tree_const_iterator"*, align 8 + %__it.addr = alloca %"struct.std::_Rb_tree_iterator"*, align 8 + store %"struct.std::_Rb_tree_const_iterator"* %this, %"struct.std::_Rb_tree_const_iterator"** %this.addr, align 8 + store %"struct.std::_Rb_tree_iterator"* %__it, %"struct.std::_Rb_tree_iterator"** %__it.addr, align 8 + %this1 = load %"struct.std::_Rb_tree_const_iterator"*, %"struct.std::_Rb_tree_const_iterator"** %this.addr, align 8 + %_M_node = getelementptr inbounds %"struct.std::_Rb_tree_const_iterator", %"struct.std::_Rb_tree_const_iterator"* %this1, i32 0, i32 0 + %0 = load %"struct.std::_Rb_tree_iterator"*, %"struct.std::_Rb_tree_iterator"** %__it.addr, align 8 + %_M_node2 = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %0, i32 0, i32 0 + %1 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %_M_node2, align 8 + store %"struct.std::_Rb_tree_node_base"* %1, %"struct.std::_Rb_tree_node_base"** %_M_node, align 8 + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local zeroext i1 
@_ZNKSt23_Rb_tree_const_iteratorISt4pairIKhSt6vectorIbSaIbEEEEneERKS6_(%"struct.std::_Rb_tree_const_iterator"* %this, %"struct.std::_Rb_tree_const_iterator"* dereferenceable(8) %__x) #6 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Rb_tree_const_iterator"*, align 8 + %__x.addr = alloca %"struct.std::_Rb_tree_const_iterator"*, align 8 + store %"struct.std::_Rb_tree_const_iterator"* %this, %"struct.std::_Rb_tree_const_iterator"** %this.addr, align 8 + store %"struct.std::_Rb_tree_const_iterator"* %__x, %"struct.std::_Rb_tree_const_iterator"** %__x.addr, align 8 + %this1 = load %"struct.std::_Rb_tree_const_iterator"*, %"struct.std::_Rb_tree_const_iterator"** %this.addr, align 8 + %_M_node = getelementptr inbounds %"struct.std::_Rb_tree_const_iterator", %"struct.std::_Rb_tree_const_iterator"* %this1, i32 0, i32 0 + %0 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %_M_node, align 8 + %1 = load %"struct.std::_Rb_tree_const_iterator"*, %"struct.std::_Rb_tree_const_iterator"** %__x.addr, align 8 + %_M_node2 = getelementptr inbounds %"struct.std::_Rb_tree_const_iterator", %"struct.std::_Rb_tree_const_iterator"* %1, i32 0, i32 0 + %2 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %_M_node2, align 8 + %cmp = icmp ne %"struct.std::_Rb_tree_node_base"* %0, %2 + ret i1 %cmp +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %"struct.std::_Rb_tree_node_base"* @_ZNSt3mapIhSt6vectorIbSaIbEESt4lessIhESaISt4pairIKhS2_EEE3endEv(%"class.std::map"* %this) #0 comdat align 2 { +entry: + %retval = alloca %"struct.std::_Rb_tree_iterator", align 8 + %this.addr = alloca %"class.std::map"*, align 8 + store %"class.std::map"* %this, %"class.std::map"** %this.addr, align 8 + %this1 = load %"class.std::map"*, %"class.std::map"** %this.addr, align 8 + %_M_t = getelementptr inbounds %"class.std::map", %"class.std::map"* %this1, i32 0, i32 0 + %call = call %"struct.std::_Rb_tree_node_base"* 
@_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE3endEv(%"class.std::_Rb_tree"* %_M_t) + %coerce.dive = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %retval, i32 0, i32 0 + store %"struct.std::_Rb_tree_node_base"* %call, %"struct.std::_Rb_tree_node_base"** %coerce.dive, align 8 + %coerce.dive2 = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %retval, i32 0, i32 0 + %0 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %coerce.dive2, align 8 + ret %"struct.std::_Rb_tree_node_base"* %0 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local i64 @_ZSt8distanceISt19_Bit_const_iteratorENSt15iterator_traitsIT_E15difference_typeES2_S2_(i64* %__first.coerce0, i32 %__first.coerce1, i64* %__last.coerce0, i32 %__last.coerce1) #0 comdat { +entry: + %__first = alloca %"struct.std::_Bit_const_iterator", align 8 + %__last = alloca %"struct.std::_Bit_const_iterator", align 8 + %agg.tmp = alloca %"struct.std::_Bit_const_iterator", align 8 + %agg.tmp1 = alloca %"struct.std::_Bit_const_iterator", align 8 + %agg.tmp2 = alloca %"struct.std::random_access_iterator_tag", align 1 + %undef.agg.tmp = alloca %"struct.std::random_access_iterator_tag", align 1 + %0 = bitcast %"struct.std::_Bit_const_iterator"* %__first to { i64*, i32 }* + %1 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %0, i32 0, i32 0 + store i64* %__first.coerce0, i64** %1, align 8 + %2 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %0, i32 0, i32 1 + store i32 %__first.coerce1, i32* %2, align 8 + %3 = bitcast %"struct.std::_Bit_const_iterator"* %__last to { i64*, i32 }* + %4 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %3, i32 0, i32 0 + store i64* %__last.coerce0, i64** %4, align 8 + %5 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %3, i32 0, i32 1 + store i32 %__last.coerce1, i32* %5, align 8 + %6 = bitcast 
%"struct.std::_Bit_const_iterator"* %agg.tmp to i8* + %7 = bitcast %"struct.std::_Bit_const_iterator"* %__first to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %6, i8* align 8 %7, i64 16, i1 false) + %8 = bitcast %"struct.std::_Bit_const_iterator"* %agg.tmp1 to i8* + %9 = bitcast %"struct.std::_Bit_const_iterator"* %__last to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %8, i8* align 8 %9, i64 16, i1 false) + call void @_ZSt19__iterator_categoryISt19_Bit_const_iteratorENSt15iterator_traitsIT_E17iterator_categoryERKS2_(%"struct.std::_Bit_const_iterator"* dereferenceable(16) %__first) + %10 = bitcast %"struct.std::_Bit_const_iterator"* %agg.tmp to { i64*, i32 }* + %11 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %10, i32 0, i32 0 + %12 = load i64*, i64** %11, align 8 + %13 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %10, i32 0, i32 1 + %14 = load i32, i32* %13, align 8 + %15 = bitcast %"struct.std::_Bit_const_iterator"* %agg.tmp1 to { i64*, i32 }* + %16 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %15, i32 0, i32 0 + %17 = load i64*, i64** %16, align 8 + %18 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %15, i32 0, i32 1 + %19 = load i32, i32* %18, align 8 + %call = call i64 @_ZSt10__distanceISt19_Bit_const_iteratorENSt15iterator_traitsIT_E15difference_typeES2_S2_St26random_access_iterator_tag(i64* %12, i32 %14, i64* %17, i32 %19) + ret i64 %call +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %"struct.std::pair"* @_ZNKSt23_Rb_tree_const_iteratorISt4pairIKhSt6vectorIbSaIbEEEEptEv(%"struct.std::_Rb_tree_const_iterator"* %this) #0 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Rb_tree_const_iterator"*, align 8 + store %"struct.std::_Rb_tree_const_iterator"* %this, %"struct.std::_Rb_tree_const_iterator"** %this.addr, align 8 + %this1 = load %"struct.std::_Rb_tree_const_iterator"*, %"struct.std::_Rb_tree_const_iterator"** %this.addr, align 8 + %_M_node = 
getelementptr inbounds %"struct.std::_Rb_tree_const_iterator", %"struct.std::_Rb_tree_const_iterator"* %this1, i32 0, i32 0 + %0 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %_M_node, align 8 + %1 = bitcast %"struct.std::_Rb_tree_node_base"* %0 to %"struct.std::_Rb_tree_node"* + %call = call %"struct.std::pair"* @_ZNKSt13_Rb_tree_nodeISt4pairIKhSt6vectorIbSaIbEEEE9_M_valptrEv(%"struct.std::_Rb_tree_node"* %1) + ret %"struct.std::pair"* %call +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local { i64*, i32 } @_ZNKSt6vectorIbSaIbEE5beginEv(%"class.std::vector.0"* %this) #0 comdat align 2 { +entry: + %retval = alloca %"struct.std::_Bit_const_iterator", align 8 + %this.addr = alloca %"class.std::vector.0"*, align 8 + store %"class.std::vector.0"* %this, %"class.std::vector.0"** %this.addr, align 8 + %this1 = load %"class.std::vector.0"*, %"class.std::vector.0"** %this.addr, align 8 + %0 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Bvector_base"* + %_M_impl = getelementptr inbounds %"struct.std::_Bvector_base", %"struct.std::_Bvector_base"* %0, i32 0, i32 0 + %_M_start = getelementptr inbounds %"struct.std::_Bvector_base >::_Bvector_impl", %"struct.std::_Bvector_base >::_Bvector_impl"* %_M_impl, i32 0, i32 0 + call void @_ZNSt19_Bit_const_iteratorC2ERKSt13_Bit_iterator(%"struct.std::_Bit_const_iterator"* %retval, %"struct.std::_Bit_iterator"* dereferenceable(16) %_M_start) + %1 = bitcast %"struct.std::_Bit_const_iterator"* %retval to { i64*, i32 }* + %2 = load { i64*, i32 }, { i64*, i32 }* %1, align 8 + ret { i64*, i32 } %2 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local { i64*, i32 } @_ZNKSt6vectorIbSaIbEE3endEv(%"class.std::vector.0"* %this) #0 comdat align 2 { +entry: + %retval = alloca %"struct.std::_Bit_const_iterator", align 8 + %this.addr = alloca %"class.std::vector.0"*, align 8 + store %"class.std::vector.0"* %this, %"class.std::vector.0"** %this.addr, 
align 8 + %this1 = load %"class.std::vector.0"*, %"class.std::vector.0"** %this.addr, align 8 + %0 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Bvector_base"* + %_M_impl = getelementptr inbounds %"struct.std::_Bvector_base", %"struct.std::_Bvector_base"* %0, i32 0, i32 0 + %_M_finish = getelementptr inbounds %"struct.std::_Bvector_base >::_Bvector_impl", %"struct.std::_Bvector_base >::_Bvector_impl"* %_M_impl, i32 0, i32 1 + call void @_ZNSt19_Bit_const_iteratorC2ERKSt13_Bit_iterator(%"struct.std::_Bit_const_iterator"* %retval, %"struct.std::_Bit_iterator"* dereferenceable(16) %_M_finish) + %1 = bitcast %"struct.std::_Bit_const_iterator"* %retval to { i64*, i32 }* + %2 = load { i64*, i32 }, { i64*, i32 }* %1, align 8 + ret { i64*, i32 } %2 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local zeroext i1 @_ZNKSt6vectorIbSaIbEEixEm(%"class.std::vector.0"* %this, i64 %__n) #0 comdat align 2 { +entry: + %this.addr = alloca %"class.std::vector.0"*, align 8 + %__n.addr = alloca i64, align 8 + %ref.tmp = alloca %"struct.std::_Bit_const_iterator", align 8 + store %"class.std::vector.0"* %this, %"class.std::vector.0"** %this.addr, align 8 + store i64 %__n, i64* %__n.addr, align 8 + %this1 = load %"class.std::vector.0"*, %"class.std::vector.0"** %this.addr, align 8 + %0 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Bvector_base"* + %_M_impl = getelementptr inbounds %"struct.std::_Bvector_base", %"struct.std::_Bvector_base"* %0, i32 0, i32 0 + %_M_start = getelementptr inbounds %"struct.std::_Bvector_base >::_Bvector_impl", %"struct.std::_Bvector_base >::_Bvector_impl"* %_M_impl, i32 0, i32 0 + %1 = bitcast %"struct.std::_Bit_iterator"* %_M_start to %"struct.std::_Bit_iterator_base"* + %_M_p = getelementptr inbounds %"struct.std::_Bit_iterator_base", %"struct.std::_Bit_iterator_base"* %1, i32 0, i32 0 + %2 = load i64*, i64** %_M_p, align 8 + %3 = load i64, i64* %__n.addr, align 8 + %div = udiv i64 %3, 64 + %add.ptr = 
getelementptr inbounds i64, i64* %2, i64 %div + %4 = load i64, i64* %__n.addr, align 8 + %rem = urem i64 %4, 64 + %conv = trunc i64 %rem to i32 + call void @_ZNSt19_Bit_const_iteratorC2EPmj(%"struct.std::_Bit_const_iterator"* %ref.tmp, i64* %add.ptr, i32 %conv) + %call = call zeroext i1 @_ZNKSt19_Bit_const_iteratordeEv(%"struct.std::_Bit_const_iterator"* %ref.tmp) + ret i1 %call +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local float @_ZSt3powfi(float %__x, i32 %__n) #6 comdat { +entry: + %__x.addr = alloca float, align 4 + %__n.addr = alloca i32, align 4 + store float %__x, float* %__x.addr, align 4 + store i32 %__n, i32* %__n.addr, align 4 + %0 = load float, float* %__x.addr, align 4 + %1 = load i32, i32* %__n.addr, align 4 + %2 = call float @llvm.powi.f32(float %0, i32 %1) + ret float %2 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local dereferenceable(8) %"struct.std::_Rb_tree_const_iterator"* @_ZNSt23_Rb_tree_const_iteratorISt4pairIKhSt6vectorIbSaIbEEEEppEv(%"struct.std::_Rb_tree_const_iterator"* %this) #6 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Rb_tree_const_iterator"*, align 8 + store %"struct.std::_Rb_tree_const_iterator"* %this, %"struct.std::_Rb_tree_const_iterator"** %this.addr, align 8 + %this1 = load %"struct.std::_Rb_tree_const_iterator"*, %"struct.std::_Rb_tree_const_iterator"** %this.addr, align 8 + %_M_node = getelementptr inbounds %"struct.std::_Rb_tree_const_iterator", %"struct.std::_Rb_tree_const_iterator"* %this1, i32 0, i32 0 + %0 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %_M_node, align 8 + %call = call %"struct.std::_Rb_tree_node_base"* @_ZSt18_Rb_tree_incrementPKSt18_Rb_tree_node_base(%"struct.std::_Rb_tree_node_base"* %0) #10 + %_M_node2 = getelementptr inbounds %"struct.std::_Rb_tree_const_iterator", %"struct.std::_Rb_tree_const_iterator"* %this1, i32 0, i32 0 + store %"struct.std::_Rb_tree_node_base"* 
%call, %"struct.std::_Rb_tree_node_base"** %_M_node2, align 8 + ret %"struct.std::_Rb_tree_const_iterator"* %this1 +} + +; Function Attrs: nounwind +declare dso_local double @log(double) #11 + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt3mapIhSt6vectorIbSaIbEESt4lessIhESaISt4pairIKhS2_EEED2Ev(%"class.std::map"* %this) unnamed_addr #0 comdat align 2 { +entry: + %this.addr = alloca %"class.std::map"*, align 8 + store %"class.std::map"* %this, %"class.std::map"** %this.addr, align 8 + %this1 = load %"class.std::map"*, %"class.std::map"** %this.addr, align 8 + %_M_t = getelementptr inbounds %"class.std::map", %"class.std::map"* %this1, i32 0, i32 0 + call void @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EED2Ev(%"class.std::_Rb_tree"* %_M_t) + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EEC2Ev(%"class.std::_Rb_tree"* %this) unnamed_addr #0 comdat align 2 { +entry: + %this.addr = alloca %"class.std::_Rb_tree"*, align 8 + store %"class.std::_Rb_tree"* %this, %"class.std::_Rb_tree"** %this.addr, align 8 + %this1 = load %"class.std::_Rb_tree"*, %"class.std::_Rb_tree"** %this.addr, align 8 + %_M_impl = getelementptr inbounds %"class.std::_Rb_tree", %"class.std::_Rb_tree"* %this1, i32 0, i32 0 + call void @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE13_Rb_tree_implIS9_Lb0EEC2Ev(%"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Rb_tree_impl"* %_M_impl) + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE13_Rb_tree_implIS9_Lb0EEC2Ev(%"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Rb_tree_impl"* %this) unnamed_addr #0 comdat align 2 
personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %this.addr = alloca %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Rb_tree_impl"*, align 8 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + store %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Rb_tree_impl"* %this, %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Rb_tree_impl"** %this.addr, align 8 + %this1 = load %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Rb_tree_impl"*, %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Rb_tree_impl"** %this.addr, align 8 + %0 = bitcast %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Rb_tree_impl"* %this1 to %"class.std::allocator.4"* + call void @_ZNSaISt13_Rb_tree_nodeISt4pairIKhSt6vectorIbSaIbEEEEEC2Ev(%"class.std::allocator.4"* %0) #3 + %1 = bitcast %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Rb_tree_impl"* %this1 to %"struct.std::_Rb_tree_key_compare"* + invoke void @_ZNSt20_Rb_tree_key_compareISt4lessIhEEC2Ev(%"struct.std::_Rb_tree_key_compare"* %1) + to label %invoke.cont unwind label %lpad + +invoke.cont: ; preds = %entry + %2 = bitcast %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Rb_tree_impl"* %this1 to i8* + %3 = getelementptr inbounds i8, i8* %2, i64 8 + %4 = bitcast i8* %3 to %"struct.std::_Rb_tree_header"* + invoke void @_ZNSt15_Rb_tree_headerC2Ev(%"struct.std::_Rb_tree_header"* %4) + to label %invoke.cont2 unwind label %lpad + +invoke.cont2: ; preds = %invoke.cont + ret void + +lpad: ; preds = %invoke.cont, %entry + %5 = landingpad { i8*, i32 } + cleanup + %6 = extractvalue { i8*, i32 } %5, 0 + store i8* %6, i8** %exn.slot, align 8 + %7 = extractvalue { i8*, i32 } %5, 1 + store i32 %7, i32* %ehselector.slot, 
align 4 + %8 = bitcast %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Rb_tree_impl"* %this1 to %"class.std::allocator.4"* + call void @_ZNSaISt13_Rb_tree_nodeISt4pairIKhSt6vectorIbSaIbEEEEED2Ev(%"class.std::allocator.4"* %8) #3 + br label %eh.resume + +eh.resume: ; preds = %lpad + %exn = load i8*, i8** %exn.slot, align 8 + %sel = load i32, i32* %ehselector.slot, align 4 + %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn, 0 + %lpad.val3 = insertvalue { i8*, i32 } %lpad.val, i32 %sel, 1 + resume { i8*, i32 } %lpad.val3 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZNSaISt13_Rb_tree_nodeISt4pairIKhSt6vectorIbSaIbEEEEEC2Ev(%"class.std::allocator.4"* %this) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %"class.std::allocator.4"*, align 8 + store %"class.std::allocator.4"* %this, %"class.std::allocator.4"** %this.addr, align 8 + %this1 = load %"class.std::allocator.4"*, %"class.std::allocator.4"** %this.addr, align 8 + %0 = bitcast %"class.std::allocator.4"* %this1 to %"class.__gnu_cxx::new_allocator.5"* + call void @_ZN9__gnu_cxx13new_allocatorISt13_Rb_tree_nodeISt4pairIKhSt6vectorIbSaIbEEEEEC2Ev(%"class.__gnu_cxx::new_allocator.5"* %0) #3 + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZNSt20_Rb_tree_key_compareISt4lessIhEEC2Ev(%"struct.std::_Rb_tree_key_compare"* %this) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Rb_tree_key_compare"*, align 8 + store %"struct.std::_Rb_tree_key_compare"* %this, %"struct.std::_Rb_tree_key_compare"** %this.addr, align 8 + %this1 = load %"struct.std::_Rb_tree_key_compare"*, %"struct.std::_Rb_tree_key_compare"** %this.addr, align 8 + %_M_key_compare = getelementptr inbounds %"struct.std::_Rb_tree_key_compare", %"struct.std::_Rb_tree_key_compare"* %this1, i32 0, i32 0 + ret void +} + +; Function Attrs: noinline optnone uwtable +define 
linkonce_odr dso_local void @_ZNSt15_Rb_tree_headerC2Ev(%"struct.std::_Rb_tree_header"* %this) unnamed_addr #0 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Rb_tree_header"*, align 8 + store %"struct.std::_Rb_tree_header"* %this, %"struct.std::_Rb_tree_header"** %this.addr, align 8 + %this1 = load %"struct.std::_Rb_tree_header"*, %"struct.std::_Rb_tree_header"** %this.addr, align 8 + %_M_header = getelementptr inbounds %"struct.std::_Rb_tree_header", %"struct.std::_Rb_tree_header"* %this1, i32 0, i32 0 + %_M_header2 = getelementptr inbounds %"struct.std::_Rb_tree_header", %"struct.std::_Rb_tree_header"* %this1, i32 0, i32 0 + %_M_color = getelementptr inbounds %"struct.std::_Rb_tree_node_base", %"struct.std::_Rb_tree_node_base"* %_M_header2, i32 0, i32 0 + store i32 0, i32* %_M_color, align 8 + call void @_ZNSt15_Rb_tree_header8_M_resetEv(%"struct.std::_Rb_tree_header"* %this1) + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZNSaISt13_Rb_tree_nodeISt4pairIKhSt6vectorIbSaIbEEEEED2Ev(%"class.std::allocator.4"* %this) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %"class.std::allocator.4"*, align 8 + store %"class.std::allocator.4"* %this, %"class.std::allocator.4"** %this.addr, align 8 + %this1 = load %"class.std::allocator.4"*, %"class.std::allocator.4"** %this.addr, align 8 + %0 = bitcast %"class.std::allocator.4"* %this1 to %"class.__gnu_cxx::new_allocator.5"* + call void @_ZN9__gnu_cxx13new_allocatorISt13_Rb_tree_nodeISt4pairIKhSt6vectorIbSaIbEEEEED2Ev(%"class.__gnu_cxx::new_allocator.5"* %0) #3 + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx13new_allocatorISt13_Rb_tree_nodeISt4pairIKhSt6vectorIbSaIbEEEEEC2Ev(%"class.__gnu_cxx::new_allocator.5"* %this) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::new_allocator.5"*, align 8 + store 
%"class.__gnu_cxx::new_allocator.5"* %this, %"class.__gnu_cxx::new_allocator.5"** %this.addr, align 8 + %this1 = load %"class.__gnu_cxx::new_allocator.5"*, %"class.__gnu_cxx::new_allocator.5"** %this.addr, align 8 + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZNSt15_Rb_tree_header8_M_resetEv(%"struct.std::_Rb_tree_header"* %this) #6 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Rb_tree_header"*, align 8 + store %"struct.std::_Rb_tree_header"* %this, %"struct.std::_Rb_tree_header"** %this.addr, align 8 + %this1 = load %"struct.std::_Rb_tree_header"*, %"struct.std::_Rb_tree_header"** %this.addr, align 8 + %_M_header = getelementptr inbounds %"struct.std::_Rb_tree_header", %"struct.std::_Rb_tree_header"* %this1, i32 0, i32 0 + %_M_parent = getelementptr inbounds %"struct.std::_Rb_tree_node_base", %"struct.std::_Rb_tree_node_base"* %_M_header, i32 0, i32 1 + store %"struct.std::_Rb_tree_node_base"* null, %"struct.std::_Rb_tree_node_base"** %_M_parent, align 8 + %_M_header2 = getelementptr inbounds %"struct.std::_Rb_tree_header", %"struct.std::_Rb_tree_header"* %this1, i32 0, i32 0 + %_M_header3 = getelementptr inbounds %"struct.std::_Rb_tree_header", %"struct.std::_Rb_tree_header"* %this1, i32 0, i32 0 + %_M_left = getelementptr inbounds %"struct.std::_Rb_tree_node_base", %"struct.std::_Rb_tree_node_base"* %_M_header3, i32 0, i32 2 + store %"struct.std::_Rb_tree_node_base"* %_M_header2, %"struct.std::_Rb_tree_node_base"** %_M_left, align 8 + %_M_header4 = getelementptr inbounds %"struct.std::_Rb_tree_header", %"struct.std::_Rb_tree_header"* %this1, i32 0, i32 0 + %_M_header5 = getelementptr inbounds %"struct.std::_Rb_tree_header", %"struct.std::_Rb_tree_header"* %this1, i32 0, i32 0 + %_M_right = getelementptr inbounds %"struct.std::_Rb_tree_node_base", %"struct.std::_Rb_tree_node_base"* %_M_header5, i32 0, i32 3 + store %"struct.std::_Rb_tree_node_base"* %_M_header4, 
%"struct.std::_Rb_tree_node_base"** %_M_right, align 8 + %_M_node_count = getelementptr inbounds %"struct.std::_Rb_tree_header", %"struct.std::_Rb_tree_header"* %this1, i32 0, i32 1 + store i64 0, i64* %_M_node_count, align 8 + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx13new_allocatorISt13_Rb_tree_nodeISt4pairIKhSt6vectorIbSaIbEEEEED2Ev(%"class.__gnu_cxx::new_allocator.5"* %this) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::new_allocator.5"*, align 8 + store %"class.__gnu_cxx::new_allocator.5"* %this, %"class.__gnu_cxx::new_allocator.5"** %this.addr, align 8 + %this1 = load %"class.__gnu_cxx::new_allocator.5"*, %"class.__gnu_cxx::new_allocator.5"** %this.addr, align 8 + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt13_Bvector_baseISaIbEEC2Ev(%"struct.std::_Bvector_base"* %this) unnamed_addr #0 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Bvector_base"*, align 8 + store %"struct.std::_Bvector_base"* %this, %"struct.std::_Bvector_base"** %this.addr, align 8 + %this1 = load %"struct.std::_Bvector_base"*, %"struct.std::_Bvector_base"** %this.addr, align 8 + %_M_impl = getelementptr inbounds %"struct.std::_Bvector_base", %"struct.std::_Bvector_base"* %this1, i32 0, i32 0 + call void @_ZNSt13_Bvector_baseISaIbEE13_Bvector_implC2Ev(%"struct.std::_Bvector_base >::_Bvector_impl"* %_M_impl) + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt13_Bvector_baseISaIbEE13_Bvector_implC2Ev(%"struct.std::_Bvector_base >::_Bvector_impl"* %this) unnamed_addr #0 comdat align 2 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %this.addr = alloca %"struct.std::_Bvector_base >::_Bvector_impl"*, align 8 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + store %"struct.std::_Bvector_base >::_Bvector_impl"* %this, 
%"struct.std::_Bvector_base >::_Bvector_impl"** %this.addr, align 8 + %this1 = load %"struct.std::_Bvector_base >::_Bvector_impl"*, %"struct.std::_Bvector_base >::_Bvector_impl"** %this.addr, align 8 + %0 = bitcast %"struct.std::_Bvector_base >::_Bvector_impl"* %this1 to %"class.std::allocator.1"* + call void @_ZNSaImEC2Ev(%"class.std::allocator.1"* %0) #3 + %_M_start = getelementptr inbounds %"struct.std::_Bvector_base >::_Bvector_impl", %"struct.std::_Bvector_base >::_Bvector_impl"* %this1, i32 0, i32 0 + invoke void @_ZNSt13_Bit_iteratorC2Ev(%"struct.std::_Bit_iterator"* %_M_start) + to label %invoke.cont unwind label %lpad + +invoke.cont: ; preds = %entry + %_M_finish = getelementptr inbounds %"struct.std::_Bvector_base >::_Bvector_impl", %"struct.std::_Bvector_base >::_Bvector_impl"* %this1, i32 0, i32 1 + invoke void @_ZNSt13_Bit_iteratorC2Ev(%"struct.std::_Bit_iterator"* %_M_finish) + to label %invoke.cont2 unwind label %lpad + +invoke.cont2: ; preds = %invoke.cont + %_M_end_of_storage = getelementptr inbounds %"struct.std::_Bvector_base >::_Bvector_impl", %"struct.std::_Bvector_base >::_Bvector_impl"* %this1, i32 0, i32 2 + store i64* null, i64** %_M_end_of_storage, align 8 + ret void + +lpad: ; preds = %invoke.cont, %entry + %1 = landingpad { i8*, i32 } + cleanup + %2 = extractvalue { i8*, i32 } %1, 0 + store i8* %2, i8** %exn.slot, align 8 + %3 = extractvalue { i8*, i32 } %1, 1 + store i32 %3, i32* %ehselector.slot, align 4 + %4 = bitcast %"struct.std::_Bvector_base >::_Bvector_impl"* %this1 to %"class.std::allocator.1"* + call void @_ZNSaImED2Ev(%"class.std::allocator.1"* %4) #3 + br label %eh.resume + +eh.resume: ; preds = %lpad + %exn = load i8*, i8** %exn.slot, align 8 + %sel = load i32, i32* %ehselector.slot, align 4 + %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn, 0 + %lpad.val3 = insertvalue { i8*, i32 } %lpad.val, i32 %sel, 1 + resume { i8*, i32 } %lpad.val3 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr 
dso_local void @_ZNSaImEC2Ev(%"class.std::allocator.1"* %this) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %"class.std::allocator.1"*, align 8 + store %"class.std::allocator.1"* %this, %"class.std::allocator.1"** %this.addr, align 8 + %this1 = load %"class.std::allocator.1"*, %"class.std::allocator.1"** %this.addr, align 8 + %0 = bitcast %"class.std::allocator.1"* %this1 to %"class.__gnu_cxx::new_allocator.2"* + call void @_ZN9__gnu_cxx13new_allocatorImEC2Ev(%"class.__gnu_cxx::new_allocator.2"* %0) #3 + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt13_Bit_iteratorC2Ev(%"struct.std::_Bit_iterator"* %this) unnamed_addr #0 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Bit_iterator"*, align 8 + store %"struct.std::_Bit_iterator"* %this, %"struct.std::_Bit_iterator"** %this.addr, align 8 + %this1 = load %"struct.std::_Bit_iterator"*, %"struct.std::_Bit_iterator"** %this.addr, align 8 + %0 = bitcast %"struct.std::_Bit_iterator"* %this1 to %"struct.std::_Bit_iterator_base"* + call void @_ZNSt18_Bit_iterator_baseC2EPmj(%"struct.std::_Bit_iterator_base"* %0, i64* null, i32 0) + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx13new_allocatorImEC2Ev(%"class.__gnu_cxx::new_allocator.2"* %this) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::new_allocator.2"*, align 8 + store %"class.__gnu_cxx::new_allocator.2"* %this, %"class.__gnu_cxx::new_allocator.2"** %this.addr, align 8 + %this1 = load %"class.__gnu_cxx::new_allocator.2"*, %"class.__gnu_cxx::new_allocator.2"** %this.addr, align 8 + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZNSt18_Bit_iterator_baseC2EPmj(%"struct.std::_Bit_iterator_base"* %this, i64* %__x, i32 %__y) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Bit_iterator_base"*, align 8 + 
%__x.addr = alloca i64*, align 8 + %__y.addr = alloca i32, align 4 + store %"struct.std::_Bit_iterator_base"* %this, %"struct.std::_Bit_iterator_base"** %this.addr, align 8 + store i64* %__x, i64** %__x.addr, align 8 + store i32 %__y, i32* %__y.addr, align 4 + %this1 = load %"struct.std::_Bit_iterator_base"*, %"struct.std::_Bit_iterator_base"** %this.addr, align 8 + %0 = bitcast %"struct.std::_Bit_iterator_base"* %this1 to %"struct.std::iterator"* + %_M_p = getelementptr inbounds %"struct.std::_Bit_iterator_base", %"struct.std::_Bit_iterator_base"* %this1, i32 0, i32 0 + %1 = load i64*, i64** %__x.addr, align 8 + store i64* %1, i64** %_M_p, align 8 + %_M_offset = getelementptr inbounds %"struct.std::_Bit_iterator_base", %"struct.std::_Bit_iterator_base"* %this1, i32 0, i32 1 + %2 = load i32, i32* %__y.addr, align 4 + store i32 %2, i32* %_M_offset, align 8 + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx13new_allocatorImED2Ev(%"class.__gnu_cxx::new_allocator.2"* %this) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::new_allocator.2"*, align 8 + store %"class.__gnu_cxx::new_allocator.2"* %this, %"class.__gnu_cxx::new_allocator.2"** %this.addr, align 8 + %this1 = load %"class.__gnu_cxx::new_allocator.2"*, %"class.__gnu_cxx::new_allocator.2"** %this.addr, align 8 + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %"struct.std::_Rb_tree_node_base"* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE5beginEv(%"class.std::_Rb_tree"* %this) #0 comdat align 2 { +entry: + %retval = alloca %"struct.std::_Rb_tree_iterator", align 8 + %this.addr = alloca %"class.std::_Rb_tree"*, align 8 + store %"class.std::_Rb_tree"* %this, %"class.std::_Rb_tree"** %this.addr, align 8 + %this1 = load %"class.std::_Rb_tree"*, %"class.std::_Rb_tree"** %this.addr, align 8 + %_M_impl = getelementptr inbounds 
%"class.std::_Rb_tree", %"class.std::_Rb_tree"* %this1, i32 0, i32 0 + %0 = bitcast %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Rb_tree_impl"* %_M_impl to i8* + %add.ptr = getelementptr inbounds i8, i8* %0, i64 8 + %1 = bitcast i8* %add.ptr to %"struct.std::_Rb_tree_header"* + %_M_header = getelementptr inbounds %"struct.std::_Rb_tree_header", %"struct.std::_Rb_tree_header"* %1, i32 0, i32 0 + %_M_left = getelementptr inbounds %"struct.std::_Rb_tree_node_base", %"struct.std::_Rb_tree_node_base"* %_M_header, i32 0, i32 2 + %2 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %_M_left, align 8 + call void @_ZNSt17_Rb_tree_iteratorISt4pairIKhSt6vectorIbSaIbEEEEC2EPSt18_Rb_tree_node_base(%"struct.std::_Rb_tree_iterator"* %retval, %"struct.std::_Rb_tree_node_base"* %2) + %coerce.dive = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %retval, i32 0, i32 0 + %3 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %coerce.dive, align 8 + ret %"struct.std::_Rb_tree_node_base"* %3 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZNSt17_Rb_tree_iteratorISt4pairIKhSt6vectorIbSaIbEEEEC2EPSt18_Rb_tree_node_base(%"struct.std::_Rb_tree_iterator"* %this, %"struct.std::_Rb_tree_node_base"* %__x) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Rb_tree_iterator"*, align 8 + %__x.addr = alloca %"struct.std::_Rb_tree_node_base"*, align 8 + store %"struct.std::_Rb_tree_iterator"* %this, %"struct.std::_Rb_tree_iterator"** %this.addr, align 8 + store %"struct.std::_Rb_tree_node_base"* %__x, %"struct.std::_Rb_tree_node_base"** %__x.addr, align 8 + %this1 = load %"struct.std::_Rb_tree_iterator"*, %"struct.std::_Rb_tree_iterator"** %this.addr, align 8 + %_M_node = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %this1, i32 0, i32 0 + %0 
= load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %__x.addr, align 8 + store %"struct.std::_Rb_tree_node_base"* %0, %"struct.std::_Rb_tree_node_base"** %_M_node, align 8 + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %"struct.std::_Rb_tree_node_base"* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE3endEv(%"class.std::_Rb_tree"* %this) #0 comdat align 2 { +entry: + %retval = alloca %"struct.std::_Rb_tree_iterator", align 8 + %this.addr = alloca %"class.std::_Rb_tree"*, align 8 + store %"class.std::_Rb_tree"* %this, %"class.std::_Rb_tree"** %this.addr, align 8 + %this1 = load %"class.std::_Rb_tree"*, %"class.std::_Rb_tree"** %this.addr, align 8 + %_M_impl = getelementptr inbounds %"class.std::_Rb_tree", %"class.std::_Rb_tree"* %this1, i32 0, i32 0 + %0 = bitcast %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Rb_tree_impl"* %_M_impl to i8* + %add.ptr = getelementptr inbounds i8, i8* %0, i64 8 + %1 = bitcast i8* %add.ptr to %"struct.std::_Rb_tree_header"* + %_M_header = getelementptr inbounds %"struct.std::_Rb_tree_header", %"struct.std::_Rb_tree_header"* %1, i32 0, i32 0 + call void @_ZNSt17_Rb_tree_iteratorISt4pairIKhSt6vectorIbSaIbEEEEC2EPSt18_Rb_tree_node_base(%"struct.std::_Rb_tree_iterator"* %retval, %"struct.std::_Rb_tree_node_base"* %_M_header) + %coerce.dive = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %retval, i32 0, i32 0 + %2 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %coerce.dive, align 8 + ret %"struct.std::_Rb_tree_node_base"* %2 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local i64 @_ZSt10__distanceISt19_Bit_const_iteratorENSt15iterator_traitsIT_E15difference_typeES2_S2_St26random_access_iterator_tag(i64* %__first.coerce0, i32 %__first.coerce1, i64* %__last.coerce0, i32 %__last.coerce1) #0 comdat 
{ +entry: + %__first = alloca %"struct.std::_Bit_const_iterator", align 8 + %__last = alloca %"struct.std::_Bit_const_iterator", align 8 + %0 = alloca %"struct.std::random_access_iterator_tag", align 1 + %1 = bitcast %"struct.std::_Bit_const_iterator"* %__first to { i64*, i32 }* + %2 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %1, i32 0, i32 0 + store i64* %__first.coerce0, i64** %2, align 8 + %3 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %1, i32 0, i32 1 + store i32 %__first.coerce1, i32* %3, align 8 + %4 = bitcast %"struct.std::_Bit_const_iterator"* %__last to { i64*, i32 }* + %5 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %4, i32 0, i32 0 + store i64* %__last.coerce0, i64** %5, align 8 + %6 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %4, i32 0, i32 1 + store i32 %__last.coerce1, i32* %6, align 8 + %7 = bitcast %"struct.std::_Bit_const_iterator"* %__last to %"struct.std::_Bit_iterator_base"* + %8 = bitcast %"struct.std::_Bit_const_iterator"* %__first to %"struct.std::_Bit_iterator_base"* + %call = call i64 @_ZStmiRKSt18_Bit_iterator_baseS1_(%"struct.std::_Bit_iterator_base"* dereferenceable(16) %7, %"struct.std::_Bit_iterator_base"* dereferenceable(16) %8) + ret i64 %call +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZSt19__iterator_categoryISt19_Bit_const_iteratorENSt15iterator_traitsIT_E17iterator_categoryERKS2_(%"struct.std::_Bit_const_iterator"* dereferenceable(16) %0) #6 comdat { +entry: + %.addr = alloca %"struct.std::_Bit_const_iterator"*, align 8 + store %"struct.std::_Bit_const_iterator"* %0, %"struct.std::_Bit_const_iterator"** %.addr, align 8 + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local i64 @_ZStmiRKSt18_Bit_iterator_baseS1_(%"struct.std::_Bit_iterator_base"* dereferenceable(16) %__x, %"struct.std::_Bit_iterator_base"* dereferenceable(16) %__y) #6 comdat { +entry: + %__x.addr = alloca 
%"struct.std::_Bit_iterator_base"*, align 8 + %__y.addr = alloca %"struct.std::_Bit_iterator_base"*, align 8 + store %"struct.std::_Bit_iterator_base"* %__x, %"struct.std::_Bit_iterator_base"** %__x.addr, align 8 + store %"struct.std::_Bit_iterator_base"* %__y, %"struct.std::_Bit_iterator_base"** %__y.addr, align 8 + %0 = load %"struct.std::_Bit_iterator_base"*, %"struct.std::_Bit_iterator_base"** %__x.addr, align 8 + %_M_p = getelementptr inbounds %"struct.std::_Bit_iterator_base", %"struct.std::_Bit_iterator_base"* %0, i32 0, i32 0 + %1 = load i64*, i64** %_M_p, align 8 + %2 = load %"struct.std::_Bit_iterator_base"*, %"struct.std::_Bit_iterator_base"** %__y.addr, align 8 + %_M_p1 = getelementptr inbounds %"struct.std::_Bit_iterator_base", %"struct.std::_Bit_iterator_base"* %2, i32 0, i32 0 + %3 = load i64*, i64** %_M_p1, align 8 + %sub.ptr.lhs.cast = ptrtoint i64* %1 to i64 + %sub.ptr.rhs.cast = ptrtoint i64* %3 to i64 + %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast + %sub.ptr.div = sdiv exact i64 %sub.ptr.sub, 8 + %mul = mul nsw i64 64, %sub.ptr.div + %4 = load %"struct.std::_Bit_iterator_base"*, %"struct.std::_Bit_iterator_base"** %__x.addr, align 8 + %_M_offset = getelementptr inbounds %"struct.std::_Bit_iterator_base", %"struct.std::_Bit_iterator_base"* %4, i32 0, i32 1 + %5 = load i32, i32* %_M_offset, align 8 + %conv = zext i32 %5 to i64 + %add = add nsw i64 %mul, %conv + %6 = load %"struct.std::_Bit_iterator_base"*, %"struct.std::_Bit_iterator_base"** %__y.addr, align 8 + %_M_offset2 = getelementptr inbounds %"struct.std::_Bit_iterator_base", %"struct.std::_Bit_iterator_base"* %6, i32 0, i32 1 + %7 = load i32, i32* %_M_offset2, align 8 + %conv3 = zext i32 %7 to i64 + %sub = sub nsw i64 %add, %conv3 + ret i64 %sub +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %"struct.std::pair"* @_ZNKSt13_Rb_tree_nodeISt4pairIKhSt6vectorIbSaIbEEEE9_M_valptrEv(%"struct.std::_Rb_tree_node"* %this) #0 comdat align 2 { +entry: + 
%this.addr = alloca %"struct.std::_Rb_tree_node"*, align 8 + store %"struct.std::_Rb_tree_node"* %this, %"struct.std::_Rb_tree_node"** %this.addr, align 8 + %this1 = load %"struct.std::_Rb_tree_node"*, %"struct.std::_Rb_tree_node"** %this.addr, align 8 + %_M_value_field = getelementptr inbounds %"struct.std::_Rb_tree_node", %"struct.std::_Rb_tree_node"* %this1, i32 0, i32 1 + %call = call %"struct.std::pair"* @_ZSt11__addressofIKSt4pairIKhSt6vectorIbSaIbEEEEPT_RS7_(%"struct.std::pair"* dereferenceable(48) %_M_value_field) + ret %"struct.std::pair"* %call +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local %"struct.std::pair"* @_ZSt11__addressofIKSt4pairIKhSt6vectorIbSaIbEEEEPT_RS7_(%"struct.std::pair"* dereferenceable(48) %__r) #6 comdat { +entry: + %__r.addr = alloca %"struct.std::pair"*, align 8 + store %"struct.std::pair"* %__r, %"struct.std::pair"** %__r.addr, align 8 + %0 = load %"struct.std::pair"*, %"struct.std::pair"** %__r.addr, align 8 + ret %"struct.std::pair"* %0 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZNSt19_Bit_const_iteratorC2ERKSt13_Bit_iterator(%"struct.std::_Bit_const_iterator"* %this, %"struct.std::_Bit_iterator"* dereferenceable(16) %__x) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Bit_const_iterator"*, align 8 + %__x.addr = alloca %"struct.std::_Bit_iterator"*, align 8 + store %"struct.std::_Bit_const_iterator"* %this, %"struct.std::_Bit_const_iterator"** %this.addr, align 8 + store %"struct.std::_Bit_iterator"* %__x, %"struct.std::_Bit_iterator"** %__x.addr, align 8 + %this1 = load %"struct.std::_Bit_const_iterator"*, %"struct.std::_Bit_const_iterator"** %this.addr, align 8 + %0 = bitcast %"struct.std::_Bit_const_iterator"* %this1 to %"struct.std::_Bit_iterator_base"* + %1 = load %"struct.std::_Bit_iterator"*, %"struct.std::_Bit_iterator"** %__x.addr, align 8 + %2 = bitcast %"struct.std::_Bit_iterator"* %1 to 
%"struct.std::_Bit_iterator_base"* + %_M_p = getelementptr inbounds %"struct.std::_Bit_iterator_base", %"struct.std::_Bit_iterator_base"* %2, i32 0, i32 0 + %3 = load i64*, i64** %_M_p, align 8 + %4 = load %"struct.std::_Bit_iterator"*, %"struct.std::_Bit_iterator"** %__x.addr, align 8 + %5 = bitcast %"struct.std::_Bit_iterator"* %4 to %"struct.std::_Bit_iterator_base"* + %_M_offset = getelementptr inbounds %"struct.std::_Bit_iterator_base", %"struct.std::_Bit_iterator_base"* %5, i32 0, i32 1 + %6 = load i32, i32* %_M_offset, align 8 + call void @_ZNSt18_Bit_iterator_baseC2EPmj(%"struct.std::_Bit_iterator_base"* %0, i64* %3, i32 %6) + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZNSt19_Bit_const_iteratorC2EPmj(%"struct.std::_Bit_const_iterator"* %this, i64* %__x, i32 %__y) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Bit_const_iterator"*, align 8 + %__x.addr = alloca i64*, align 8 + %__y.addr = alloca i32, align 4 + store %"struct.std::_Bit_const_iterator"* %this, %"struct.std::_Bit_const_iterator"** %this.addr, align 8 + store i64* %__x, i64** %__x.addr, align 8 + store i32 %__y, i32* %__y.addr, align 4 + %this1 = load %"struct.std::_Bit_const_iterator"*, %"struct.std::_Bit_const_iterator"** %this.addr, align 8 + %0 = bitcast %"struct.std::_Bit_const_iterator"* %this1 to %"struct.std::_Bit_iterator_base"* + %1 = load i64*, i64** %__x.addr, align 8 + %2 = load i32, i32* %__y.addr, align 4 + call void @_ZNSt18_Bit_iterator_baseC2EPmj(%"struct.std::_Bit_iterator_base"* %0, i64* %1, i32 %2) + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local zeroext i1 @_ZNKSt19_Bit_const_iteratordeEv(%"struct.std::_Bit_const_iterator"* %this) #0 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Bit_const_iterator"*, align 8 + %ref.tmp = alloca %"struct.std::_Bit_reference", align 8 + store %"struct.std::_Bit_const_iterator"* %this, 
%"struct.std::_Bit_const_iterator"** %this.addr, align 8 + %this1 = load %"struct.std::_Bit_const_iterator"*, %"struct.std::_Bit_const_iterator"** %this.addr, align 8 + %0 = bitcast %"struct.std::_Bit_const_iterator"* %this1 to %"struct.std::_Bit_iterator_base"* + %_M_p = getelementptr inbounds %"struct.std::_Bit_iterator_base", %"struct.std::_Bit_iterator_base"* %0, i32 0, i32 0 + %1 = load i64*, i64** %_M_p, align 8 + %2 = bitcast %"struct.std::_Bit_const_iterator"* %this1 to %"struct.std::_Bit_iterator_base"* + %_M_offset = getelementptr inbounds %"struct.std::_Bit_iterator_base", %"struct.std::_Bit_iterator_base"* %2, i32 0, i32 1 + %3 = load i32, i32* %_M_offset, align 8 + %sh_prom = zext i32 %3 to i64 + %shl = shl i64 1, %sh_prom + call void @_ZNSt14_Bit_referenceC2EPmm(%"struct.std::_Bit_reference"* %ref.tmp, i64* %1, i64 %shl) + %call = call zeroext i1 @_ZNKSt14_Bit_referencecvbEv(%"struct.std::_Bit_reference"* %ref.tmp) + ret i1 %call +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZNSt14_Bit_referenceC2EPmm(%"struct.std::_Bit_reference"* %this, i64* %__x, i64 %__y) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Bit_reference"*, align 8 + %__x.addr = alloca i64*, align 8 + %__y.addr = alloca i64, align 8 + store %"struct.std::_Bit_reference"* %this, %"struct.std::_Bit_reference"** %this.addr, align 8 + store i64* %__x, i64** %__x.addr, align 8 + store i64 %__y, i64* %__y.addr, align 8 + %this1 = load %"struct.std::_Bit_reference"*, %"struct.std::_Bit_reference"** %this.addr, align 8 + %_M_p = getelementptr inbounds %"struct.std::_Bit_reference", %"struct.std::_Bit_reference"* %this1, i32 0, i32 0 + %0 = load i64*, i64** %__x.addr, align 8 + store i64* %0, i64** %_M_p, align 8 + %_M_mask = getelementptr inbounds %"struct.std::_Bit_reference", %"struct.std::_Bit_reference"* %this1, i32 0, i32 1 + %1 = load i64, i64* %__y.addr, align 8 + store i64 %1, i64* %_M_mask, align 8 + ret 
void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local zeroext i1 @_ZNKSt14_Bit_referencecvbEv(%"struct.std::_Bit_reference"* %this) #6 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Bit_reference"*, align 8 + store %"struct.std::_Bit_reference"* %this, %"struct.std::_Bit_reference"** %this.addr, align 8 + %this1 = load %"struct.std::_Bit_reference"*, %"struct.std::_Bit_reference"** %this.addr, align 8 + %_M_p = getelementptr inbounds %"struct.std::_Bit_reference", %"struct.std::_Bit_reference"* %this1, i32 0, i32 0 + %0 = load i64*, i64** %_M_p, align 8 + %1 = load i64, i64* %0, align 8 + %_M_mask = getelementptr inbounds %"struct.std::_Bit_reference", %"struct.std::_Bit_reference"* %this1, i32 0, i32 1 + %2 = load i64, i64* %_M_mask, align 8 + %and = and i64 %1, %2 + %tobool = icmp ne i64 %and, 0 + %lnot = xor i1 %tobool, true + %lnot2 = xor i1 %lnot, true + ret i1 %lnot2 +} + +; Function Attrs: nounwind readnone speculatable willreturn +declare float @llvm.powi.f32(float, i32) #13 + +; Function Attrs: nounwind readonly +declare dso_local %"struct.std::_Rb_tree_node_base"* @_ZSt18_Rb_tree_incrementPKSt18_Rb_tree_node_base(%"struct.std::_Rb_tree_node_base"*) #14 + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EED2Ev(%"class.std::_Rb_tree"* %this) unnamed_addr #0 comdat align 2 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %this.addr = alloca %"class.std::_Rb_tree"*, align 8 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + store %"class.std::_Rb_tree"* %this, %"class.std::_Rb_tree"** %this.addr, align 8 + %this1 = load %"class.std::_Rb_tree"*, %"class.std::_Rb_tree"** %this.addr, align 8 + %call = invoke %"struct.std::_Rb_tree_node"* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE8_M_beginEv(%"class.std::_Rb_tree"* %this1) + 
to label %invoke.cont unwind label %lpad + +invoke.cont: ; preds = %entry + invoke void @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE8_M_eraseEPSt13_Rb_tree_nodeIS5_E(%"class.std::_Rb_tree"* %this1, %"struct.std::_Rb_tree_node"* %call) + to label %invoke.cont2 unwind label %lpad + +invoke.cont2: ; preds = %invoke.cont + %_M_impl = getelementptr inbounds %"class.std::_Rb_tree", %"class.std::_Rb_tree"* %this1, i32 0, i32 0 + call void @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE13_Rb_tree_implIS9_Lb0EED2Ev(%"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Rb_tree_impl"* %_M_impl) #3 + ret void + +lpad: ; preds = %invoke.cont, %entry + %0 = landingpad { i8*, i32 } + cleanup + %1 = extractvalue { i8*, i32 } %0, 0 + store i8* %1, i8** %exn.slot, align 8 + %2 = extractvalue { i8*, i32 } %0, 1 + store i32 %2, i32* %ehselector.slot, align 4 + %_M_impl3 = getelementptr inbounds %"class.std::_Rb_tree", %"class.std::_Rb_tree"* %this1, i32 0, i32 0 + call void @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE13_Rb_tree_implIS9_Lb0EED2Ev(%"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Rb_tree_impl"* %_M_impl3) #3 + br label %eh.resume + +eh.resume: ; preds = %lpad + %exn = load i8*, i8** %exn.slot, align 8 + %sel = load i32, i32* %ehselector.slot, align 4 + %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn, 0 + %lpad.val4 = insertvalue { i8*, i32 } %lpad.val, i32 %sel, 1 + resume { i8*, i32 } %lpad.val4 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE8_M_eraseEPSt13_Rb_tree_nodeIS5_E(%"class.std::_Rb_tree"* %this, %"struct.std::_Rb_tree_node"* %__x) #0 comdat align 2 { +entry: + %this.addr = alloca %"class.std::_Rb_tree"*, align 8 + %__x.addr = alloca 
%"struct.std::_Rb_tree_node"*, align 8 + %__y = alloca %"struct.std::_Rb_tree_node"*, align 8 + store %"class.std::_Rb_tree"* %this, %"class.std::_Rb_tree"** %this.addr, align 8 + store %"struct.std::_Rb_tree_node"* %__x, %"struct.std::_Rb_tree_node"** %__x.addr, align 8 + %this1 = load %"class.std::_Rb_tree"*, %"class.std::_Rb_tree"** %this.addr, align 8 + br label %while.cond + +while.cond: ; preds = %while.body, %entry + %0 = load %"struct.std::_Rb_tree_node"*, %"struct.std::_Rb_tree_node"** %__x.addr, align 8 + %cmp = icmp ne %"struct.std::_Rb_tree_node"* %0, null + br i1 %cmp, label %while.body, label %while.end + +while.body: ; preds = %while.cond + %1 = load %"struct.std::_Rb_tree_node"*, %"struct.std::_Rb_tree_node"** %__x.addr, align 8 + %2 = bitcast %"struct.std::_Rb_tree_node"* %1 to %"struct.std::_Rb_tree_node_base"* + %call = call %"struct.std::_Rb_tree_node"* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE8_S_rightEPSt18_Rb_tree_node_base(%"struct.std::_Rb_tree_node_base"* %2) + call void @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE8_M_eraseEPSt13_Rb_tree_nodeIS5_E(%"class.std::_Rb_tree"* %this1, %"struct.std::_Rb_tree_node"* %call) + %3 = load %"struct.std::_Rb_tree_node"*, %"struct.std::_Rb_tree_node"** %__x.addr, align 8 + %4 = bitcast %"struct.std::_Rb_tree_node"* %3 to %"struct.std::_Rb_tree_node_base"* + %call2 = call %"struct.std::_Rb_tree_node"* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE7_S_leftEPSt18_Rb_tree_node_base(%"struct.std::_Rb_tree_node_base"* %4) + store %"struct.std::_Rb_tree_node"* %call2, %"struct.std::_Rb_tree_node"** %__y, align 8 + %5 = load %"struct.std::_Rb_tree_node"*, %"struct.std::_Rb_tree_node"** %__x.addr, align 8 + call void @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE12_M_drop_nodeEPSt13_Rb_tree_nodeIS5_E(%"class.std::_Rb_tree"* %this1, %"struct.std::_Rb_tree_node"* %5) 
+ %6 = load %"struct.std::_Rb_tree_node"*, %"struct.std::_Rb_tree_node"** %__y, align 8 + store %"struct.std::_Rb_tree_node"* %6, %"struct.std::_Rb_tree_node"** %__x.addr, align 8 + br label %while.cond + +while.end: ; preds = %while.cond + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local %"struct.std::_Rb_tree_node"* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE8_M_beginEv(%"class.std::_Rb_tree"* %this) #6 comdat align 2 { +entry: + %this.addr = alloca %"class.std::_Rb_tree"*, align 8 + store %"class.std::_Rb_tree"* %this, %"class.std::_Rb_tree"** %this.addr, align 8 + %this1 = load %"class.std::_Rb_tree"*, %"class.std::_Rb_tree"** %this.addr, align 8 + %_M_impl = getelementptr inbounds %"class.std::_Rb_tree", %"class.std::_Rb_tree"* %this1, i32 0, i32 0 + %0 = bitcast %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Rb_tree_impl"* %_M_impl to i8* + %add.ptr = getelementptr inbounds i8, i8* %0, i64 8 + %1 = bitcast i8* %add.ptr to %"struct.std::_Rb_tree_header"* + %_M_header = getelementptr inbounds %"struct.std::_Rb_tree_header", %"struct.std::_Rb_tree_header"* %1, i32 0, i32 0 + %_M_parent = getelementptr inbounds %"struct.std::_Rb_tree_node_base", %"struct.std::_Rb_tree_node_base"* %_M_header, i32 0, i32 1 + %2 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %_M_parent, align 8 + %3 = bitcast %"struct.std::_Rb_tree_node_base"* %2 to %"struct.std::_Rb_tree_node"* + ret %"struct.std::_Rb_tree_node"* %3 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE13_Rb_tree_implIS9_Lb0EED2Ev(%"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Rb_tree_impl"* %this) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Rb_tree > >, 
std::_Select1st > > >, std::less, std::allocator > > > >::_Rb_tree_impl"*, align 8 + store %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Rb_tree_impl"* %this, %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Rb_tree_impl"** %this.addr, align 8 + %this1 = load %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Rb_tree_impl"*, %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Rb_tree_impl"** %this.addr, align 8 + %0 = bitcast %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Rb_tree_impl"* %this1 to %"class.std::allocator.4"* + call void @_ZNSaISt13_Rb_tree_nodeISt4pairIKhSt6vectorIbSaIbEEEEED2Ev(%"class.std::allocator.4"* %0) #3 + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local %"struct.std::_Rb_tree_node"* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE8_S_rightEPSt18_Rb_tree_node_base(%"struct.std::_Rb_tree_node_base"* %__x) #6 comdat align 2 { +entry: + %__x.addr = alloca %"struct.std::_Rb_tree_node_base"*, align 8 + store %"struct.std::_Rb_tree_node_base"* %__x, %"struct.std::_Rb_tree_node_base"** %__x.addr, align 8 + %0 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %__x.addr, align 8 + %_M_right = getelementptr inbounds %"struct.std::_Rb_tree_node_base", %"struct.std::_Rb_tree_node_base"* %0, i32 0, i32 3 + %1 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %_M_right, align 8 + %2 = bitcast %"struct.std::_Rb_tree_node_base"* %1 to %"struct.std::_Rb_tree_node"* + ret %"struct.std::_Rb_tree_node"* %2 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local %"struct.std::_Rb_tree_node"* 
@_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE7_S_leftEPSt18_Rb_tree_node_base(%"struct.std::_Rb_tree_node_base"* %__x) #6 comdat align 2 { +entry: + %__x.addr = alloca %"struct.std::_Rb_tree_node_base"*, align 8 + store %"struct.std::_Rb_tree_node_base"* %__x, %"struct.std::_Rb_tree_node_base"** %__x.addr, align 8 + %0 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %__x.addr, align 8 + %_M_left = getelementptr inbounds %"struct.std::_Rb_tree_node_base", %"struct.std::_Rb_tree_node_base"* %0, i32 0, i32 2 + %1 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %_M_left, align 8 + %2 = bitcast %"struct.std::_Rb_tree_node_base"* %1 to %"struct.std::_Rb_tree_node"* + ret %"struct.std::_Rb_tree_node"* %2 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE12_M_drop_nodeEPSt13_Rb_tree_nodeIS5_E(%"class.std::_Rb_tree"* %this, %"struct.std::_Rb_tree_node"* %__p) #0 comdat align 2 { +entry: + %this.addr = alloca %"class.std::_Rb_tree"*, align 8 + %__p.addr = alloca %"struct.std::_Rb_tree_node"*, align 8 + store %"class.std::_Rb_tree"* %this, %"class.std::_Rb_tree"** %this.addr, align 8 + store %"struct.std::_Rb_tree_node"* %__p, %"struct.std::_Rb_tree_node"** %__p.addr, align 8 + %this1 = load %"class.std::_Rb_tree"*, %"class.std::_Rb_tree"** %this.addr, align 8 + %0 = load %"struct.std::_Rb_tree_node"*, %"struct.std::_Rb_tree_node"** %__p.addr, align 8 + call void @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE15_M_destroy_nodeEPSt13_Rb_tree_nodeIS5_E(%"class.std::_Rb_tree"* %this1, %"struct.std::_Rb_tree_node"* %0) + %1 = load %"struct.std::_Rb_tree_node"*, %"struct.std::_Rb_tree_node"** %__p.addr, align 8 + call void 
@_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE11_M_put_nodeEPSt13_Rb_tree_nodeIS5_E(%"class.std::_Rb_tree"* %this1, %"struct.std::_Rb_tree_node"* %1) + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE15_M_destroy_nodeEPSt13_Rb_tree_nodeIS5_E(%"class.std::_Rb_tree"* %this, %"struct.std::_Rb_tree_node"* %__p) #0 comdat align 2 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %this.addr = alloca %"class.std::_Rb_tree"*, align 8 + %__p.addr = alloca %"struct.std::_Rb_tree_node"*, align 8 + %ref.tmp = alloca %"class.std::allocator.7", align 1 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + store %"class.std::_Rb_tree"* %this, %"class.std::_Rb_tree"** %this.addr, align 8 + store %"struct.std::_Rb_tree_node"* %__p, %"struct.std::_Rb_tree_node"** %__p.addr, align 8 + %this1 = load %"class.std::_Rb_tree"*, %"class.std::_Rb_tree"** %this.addr, align 8 + call void @_ZNKSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE13get_allocatorEv(%"class.std::allocator.7"* sret %ref.tmp, %"class.std::_Rb_tree"* %this1) + %0 = bitcast %"class.std::allocator.7"* %ref.tmp to %"class.__gnu_cxx::new_allocator.8"* + %1 = load %"struct.std::_Rb_tree_node"*, %"struct.std::_Rb_tree_node"** %__p.addr, align 8 + %call = invoke %"struct.std::pair"* @_ZNSt13_Rb_tree_nodeISt4pairIKhSt6vectorIbSaIbEEEE9_M_valptrEv(%"struct.std::_Rb_tree_node"* %1) + to label %invoke.cont unwind label %lpad + +invoke.cont: ; preds = %entry + invoke void @_ZN9__gnu_cxx13new_allocatorISt4pairIKhSt6vectorIbSaIbEEEE7destroyEPS6_(%"class.__gnu_cxx::new_allocator.8"* %0, %"struct.std::pair"* %call) + to label %invoke.cont2 unwind label %lpad + +invoke.cont2: ; preds = %invoke.cont + call void @_ZNSaISt4pairIKhSt6vectorIbSaIbEEEED2Ev(%"class.std::allocator.7"* %ref.tmp) #3 + ret void + +lpad: ; 
preds = %invoke.cont, %entry + %2 = landingpad { i8*, i32 } + cleanup + %3 = extractvalue { i8*, i32 } %2, 0 + store i8* %3, i8** %exn.slot, align 8 + %4 = extractvalue { i8*, i32 } %2, 1 + store i32 %4, i32* %ehselector.slot, align 4 + call void @_ZNSaISt4pairIKhSt6vectorIbSaIbEEEED2Ev(%"class.std::allocator.7"* %ref.tmp) #3 + br label %eh.resume + +eh.resume: ; preds = %lpad + %exn = load i8*, i8** %exn.slot, align 8 + %sel = load i32, i32* %ehselector.slot, align 4 + %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn, 0 + %lpad.val3 = insertvalue { i8*, i32 } %lpad.val, i32 %sel, 1 + resume { i8*, i32 } %lpad.val3 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE11_M_put_nodeEPSt13_Rb_tree_nodeIS5_E(%"class.std::_Rb_tree"* %this, %"struct.std::_Rb_tree_node"* %__p) #0 comdat align 2 { +entry: + %this.addr = alloca %"class.std::_Rb_tree"*, align 8 + %__p.addr = alloca %"struct.std::_Rb_tree_node"*, align 8 + store %"class.std::_Rb_tree"* %this, %"class.std::_Rb_tree"** %this.addr, align 8 + store %"struct.std::_Rb_tree_node"* %__p, %"struct.std::_Rb_tree_node"** %__p.addr, align 8 + %this1 = load %"class.std::_Rb_tree"*, %"class.std::_Rb_tree"** %this.addr, align 8 + %call = call dereferenceable(1) %"class.std::allocator.4"* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE21_M_get_Node_allocatorEv(%"class.std::_Rb_tree"* %this1) + %0 = load %"struct.std::_Rb_tree_node"*, %"struct.std::_Rb_tree_node"** %__p.addr, align 8 + call void @_ZN9__gnu_cxx14__alloc_traitsISaISt13_Rb_tree_nodeISt4pairIKhSt6vectorIbSaIbEEEEEE10deallocateERS9_PS8_m(%"class.std::allocator.4"* dereferenceable(1) %call, %"struct.std::_Rb_tree_node"* %0, i64 1) + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void 
@_ZNKSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE13get_allocatorEv(%"class.std::allocator.7"* noalias sret %agg.result, %"class.std::_Rb_tree"* %this) #0 comdat align 2 { +entry: + %result.ptr = alloca i8*, align 8 + %this.addr = alloca %"class.std::_Rb_tree"*, align 8 + %0 = bitcast %"class.std::allocator.7"* %agg.result to i8* + store i8* %0, i8** %result.ptr, align 8 + store %"class.std::_Rb_tree"* %this, %"class.std::_Rb_tree"** %this.addr, align 8 + %this1 = load %"class.std::_Rb_tree"*, %"class.std::_Rb_tree"** %this.addr, align 8 + %call = call dereferenceable(1) %"class.std::allocator.4"* @_ZNKSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE21_M_get_Node_allocatorEv(%"class.std::_Rb_tree"* %this1) + call void @_ZNSaISt4pairIKhSt6vectorIbSaIbEEEEC2ISt13_Rb_tree_nodeIS4_EEERKSaIT_E(%"class.std::allocator.7"* %agg.result, %"class.std::allocator.4"* dereferenceable(1) %call) #3 + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx13new_allocatorISt4pairIKhSt6vectorIbSaIbEEEE7destroyEPS6_(%"class.__gnu_cxx::new_allocator.8"* %this, %"struct.std::pair"* %__p) #0 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::new_allocator.8"*, align 8 + %__p.addr = alloca %"struct.std::pair"*, align 8 + store %"class.__gnu_cxx::new_allocator.8"* %this, %"class.__gnu_cxx::new_allocator.8"** %this.addr, align 8 + store %"struct.std::pair"* %__p, %"struct.std::pair"** %__p.addr, align 8 + %this1 = load %"class.__gnu_cxx::new_allocator.8"*, %"class.__gnu_cxx::new_allocator.8"** %this.addr, align 8 + %0 = load %"struct.std::pair"*, %"struct.std::pair"** %__p.addr, align 8 + call void @_ZNSt4pairIKhSt6vectorIbSaIbEEED2Ev(%"struct.std::pair"* %0) + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %"struct.std::pair"* @_ZNSt13_Rb_tree_nodeISt4pairIKhSt6vectorIbSaIbEEEE9_M_valptrEv(%"struct.std::_Rb_tree_node"* 
%this) #0 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Rb_tree_node"*, align 8 + store %"struct.std::_Rb_tree_node"* %this, %"struct.std::_Rb_tree_node"** %this.addr, align 8 + %this1 = load %"struct.std::_Rb_tree_node"*, %"struct.std::_Rb_tree_node"** %this.addr, align 8 + %_M_value_field = getelementptr inbounds %"struct.std::_Rb_tree_node", %"struct.std::_Rb_tree_node"* %this1, i32 0, i32 1 + %call = call %"struct.std::pair"* @_ZSt11__addressofISt4pairIKhSt6vectorIbSaIbEEEEPT_RS6_(%"struct.std::pair"* dereferenceable(48) %_M_value_field) + ret %"struct.std::pair"* %call +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZNSaISt4pairIKhSt6vectorIbSaIbEEEED2Ev(%"class.std::allocator.7"* %this) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %"class.std::allocator.7"*, align 8 + store %"class.std::allocator.7"* %this, %"class.std::allocator.7"** %this.addr, align 8 + %this1 = load %"class.std::allocator.7"*, %"class.std::allocator.7"** %this.addr, align 8 + %0 = bitcast %"class.std::allocator.7"* %this1 to %"class.__gnu_cxx::new_allocator.8"* + call void @_ZN9__gnu_cxx13new_allocatorISt4pairIKhSt6vectorIbSaIbEEEED2Ev(%"class.__gnu_cxx::new_allocator.8"* %0) #3 + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local dereferenceable(1) %"class.std::allocator.4"* @_ZNKSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE21_M_get_Node_allocatorEv(%"class.std::_Rb_tree"* %this) #6 comdat align 2 { +entry: + %this.addr = alloca %"class.std::_Rb_tree"*, align 8 + store %"class.std::_Rb_tree"* %this, %"class.std::_Rb_tree"** %this.addr, align 8 + %this1 = load %"class.std::_Rb_tree"*, %"class.std::_Rb_tree"** %this.addr, align 8 + %_M_impl = getelementptr inbounds %"class.std::_Rb_tree", %"class.std::_Rb_tree"* %this1, i32 0, i32 0 + %0 = bitcast %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > 
>::_Rb_tree_impl"* %_M_impl to %"class.std::allocator.4"* + ret %"class.std::allocator.4"* %0 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZNSaISt4pairIKhSt6vectorIbSaIbEEEEC2ISt13_Rb_tree_nodeIS4_EEERKSaIT_E(%"class.std::allocator.7"* %this, %"class.std::allocator.4"* dereferenceable(1) %0) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %"class.std::allocator.7"*, align 8 + %.addr = alloca %"class.std::allocator.4"*, align 8 + store %"class.std::allocator.7"* %this, %"class.std::allocator.7"** %this.addr, align 8 + store %"class.std::allocator.4"* %0, %"class.std::allocator.4"** %.addr, align 8 + %this1 = load %"class.std::allocator.7"*, %"class.std::allocator.7"** %this.addr, align 8 + %1 = bitcast %"class.std::allocator.7"* %this1 to %"class.__gnu_cxx::new_allocator.8"* + call void @_ZN9__gnu_cxx13new_allocatorISt4pairIKhSt6vectorIbSaIbEEEEC2Ev(%"class.__gnu_cxx::new_allocator.8"* %1) #3 + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx13new_allocatorISt4pairIKhSt6vectorIbSaIbEEEEC2Ev(%"class.__gnu_cxx::new_allocator.8"* %this) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::new_allocator.8"*, align 8 + store %"class.__gnu_cxx::new_allocator.8"* %this, %"class.__gnu_cxx::new_allocator.8"** %this.addr, align 8 + %this1 = load %"class.__gnu_cxx::new_allocator.8"*, %"class.__gnu_cxx::new_allocator.8"** %this.addr, align 8 + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt4pairIKhSt6vectorIbSaIbEEED2Ev(%"struct.std::pair"* %this) unnamed_addr #0 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::pair"*, align 8 + store %"struct.std::pair"* %this, %"struct.std::pair"** %this.addr, align 8 + %this1 = load %"struct.std::pair"*, %"struct.std::pair"** %this.addr, align 8 + %second = getelementptr inbounds %"struct.std::pair", 
%"struct.std::pair"* %this1, i32 0, i32 1 + call void @_ZNSt6vectorIbSaIbEED2Ev(%"class.std::vector.0"* %second) + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local %"struct.std::pair"* @_ZSt11__addressofISt4pairIKhSt6vectorIbSaIbEEEEPT_RS6_(%"struct.std::pair"* dereferenceable(48) %__r) #6 comdat { +entry: + %__r.addr = alloca %"struct.std::pair"*, align 8 + store %"struct.std::pair"* %__r, %"struct.std::pair"** %__r.addr, align 8 + %0 = load %"struct.std::pair"*, %"struct.std::pair"** %__r.addr, align 8 + ret %"struct.std::pair"* %0 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx13new_allocatorISt4pairIKhSt6vectorIbSaIbEEEED2Ev(%"class.__gnu_cxx::new_allocator.8"* %this) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::new_allocator.8"*, align 8 + store %"class.__gnu_cxx::new_allocator.8"* %this, %"class.__gnu_cxx::new_allocator.8"** %this.addr, align 8 + %this1 = load %"class.__gnu_cxx::new_allocator.8"*, %"class.__gnu_cxx::new_allocator.8"** %this.addr, align 8 + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx14__alloc_traitsISaISt13_Rb_tree_nodeISt4pairIKhSt6vectorIbSaIbEEEEEE10deallocateERS9_PS8_m(%"class.std::allocator.4"* dereferenceable(1) %__a, %"struct.std::_Rb_tree_node"* %__p, i64 %__n) #0 comdat align 2 { +entry: + %__a.addr = alloca %"class.std::allocator.4"*, align 8 + %__p.addr = alloca %"struct.std::_Rb_tree_node"*, align 8 + %__n.addr = alloca i64, align 8 + store %"class.std::allocator.4"* %__a, %"class.std::allocator.4"** %__a.addr, align 8 + store %"struct.std::_Rb_tree_node"* %__p, %"struct.std::_Rb_tree_node"** %__p.addr, align 8 + store i64 %__n, i64* %__n.addr, align 8 + %0 = load %"class.std::allocator.4"*, %"class.std::allocator.4"** %__a.addr, align 8 + %1 = bitcast %"class.std::allocator.4"* %0 to %"class.__gnu_cxx::new_allocator.5"* + 
%2 = load %"struct.std::_Rb_tree_node"*, %"struct.std::_Rb_tree_node"** %__p.addr, align 8 + %3 = load i64, i64* %__n.addr, align 8 + call void @_ZN9__gnu_cxx13new_allocatorISt13_Rb_tree_nodeISt4pairIKhSt6vectorIbSaIbEEEEE10deallocateEPS8_m(%"class.__gnu_cxx::new_allocator.5"* %1, %"struct.std::_Rb_tree_node"* %2, i64 %3) + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local dereferenceable(1) %"class.std::allocator.4"* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE21_M_get_Node_allocatorEv(%"class.std::_Rb_tree"* %this) #6 comdat align 2 { +entry: + %this.addr = alloca %"class.std::_Rb_tree"*, align 8 + store %"class.std::_Rb_tree"* %this, %"class.std::_Rb_tree"** %this.addr, align 8 + %this1 = load %"class.std::_Rb_tree"*, %"class.std::_Rb_tree"** %this.addr, align 8 + %_M_impl = getelementptr inbounds %"class.std::_Rb_tree", %"class.std::_Rb_tree"* %this1, i32 0, i32 0 + %0 = bitcast %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Rb_tree_impl"* %_M_impl to %"class.std::allocator.4"* + ret %"class.std::allocator.4"* %0 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx13new_allocatorISt13_Rb_tree_nodeISt4pairIKhSt6vectorIbSaIbEEEEE10deallocateEPS8_m(%"class.__gnu_cxx::new_allocator.5"* %this, %"struct.std::_Rb_tree_node"* %__p, i64 %0) #6 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::new_allocator.5"*, align 8 + %__p.addr = alloca %"struct.std::_Rb_tree_node"*, align 8 + %.addr = alloca i64, align 8 + store %"class.__gnu_cxx::new_allocator.5"* %this, %"class.__gnu_cxx::new_allocator.5"** %this.addr, align 8 + store %"struct.std::_Rb_tree_node"* %__p, %"struct.std::_Rb_tree_node"** %__p.addr, align 8 + store i64 %0, i64* %.addr, align 8 + %this1 = load %"class.__gnu_cxx::new_allocator.5"*, %"class.__gnu_cxx::new_allocator.5"** %this.addr, align 8 + %1 = load 
%"struct.std::_Rb_tree_node"*, %"struct.std::_Rb_tree_node"** %__p.addr, align 8 + %2 = bitcast %"struct.std::_Rb_tree_node"* %1 to i8* + call void @_ZdlPv(i8* %2) #3 + ret void +} + +; Function Attrs: noreturn nounwind +declare dso_local void @__assert_fail(i8*, i8*, i32, i8*) #5 + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local float @_ZSt4ceilf(float %__x) #6 comdat { +entry: + %__x.addr = alloca float, align 4 + store float %__x, float* %__x.addr, align 4 + %0 = load float, float* %__x.addr, align 4 + %1 = call float @llvm.ceil.f32(float %0) + ret float %1 +} + +; Function Attrs: nounwind readnone speculatable willreturn +declare float @llvm.ceil.f32(float) #13 + +; Function Attrs: noinline optnone uwtable +define internal void @_ZL21prescanArrayRecursivePjPKjii(i32* %outArray, i32* %inArray, i32 %numElements, i32 %level) #0 { +entry: + %outArray.addr = alloca i32*, align 8 + %inArray.addr = alloca i32*, align 8 + %numElements.addr = alloca i32, align 4 + %level.addr = alloca i32, align 4 + %blockSize = alloca i32, align 4 + %numBlocks = alloca i32, align 4 + %numThreads = alloca i32, align 4 + %numEltsPerBlock = alloca i32, align 4 + %numEltsLastBlock = alloca i32, align 4 + %numThreadsLastBlock = alloca i32, align 4 + %np2LastBlock = alloca i32, align 4 + %sharedMemLastBlock = alloca i32, align 4 + %extraSpace = alloca i32, align 4 + %extraSpace39 = alloca i32, align 4 + %sharedMemSize = alloca i32, align 4 + %grid = alloca %struct.dim3, align 4 + %threads = alloca %struct.dim3, align 4 + %err = alloca i32, align 4 + %agg.tmp = alloca %struct.dim3, align 4 + %agg.tmp60 = alloca %struct.dim3, align 4 + %agg.tmp.coerce = alloca { i64, i32 }, align 4 + %agg.tmp60.coerce = alloca { i64, i32 }, align 4 + %err64 = alloca i32, align 4 + %agg.tmp73 = alloca %struct.dim3, align 4 + %agg.tmp74 = alloca %struct.dim3, align 4 + %agg.tmp73.coerce = alloca { i64, i32 }, align 4 + %agg.tmp74.coerce = alloca { i64, i32 }, align 4 + %err84 
= alloca i32, align 4 + %agg.tmp97 = alloca %struct.dim3, align 4 + %agg.tmp98 = alloca %struct.dim3, align 4 + %agg.tmp97.coerce = alloca { i64, i32 }, align 4 + %agg.tmp98.coerce = alloca { i64, i32 }, align 4 + %err107 = alloca i32, align 4 + %agg.tmp116 = alloca %struct.dim3, align 4 + %agg.tmp117 = alloca %struct.dim3, align 4 + %agg.tmp116.coerce = alloca { i64, i32 }, align 4 + %agg.tmp117.coerce = alloca { i64, i32 }, align 4 + %err127 = alloca i32, align 4 + %agg.tmp138 = alloca %struct.dim3, align 4 + %agg.tmp139 = alloca %struct.dim3, align 4 + %agg.tmp138.coerce = alloca { i64, i32 }, align 4 + %agg.tmp139.coerce = alloca { i64, i32 }, align 4 + %err146 = alloca i32, align 4 + %agg.tmp154 = alloca %struct.dim3, align 4 + %agg.tmp155 = alloca %struct.dim3, align 4 + %agg.tmp154.coerce = alloca { i64, i32 }, align 4 + %agg.tmp155.coerce = alloca { i64, i32 }, align 4 + %err161 = alloca i32, align 4 + store i32* %outArray, i32** %outArray.addr, align 8 + store i32* %inArray, i32** %inArray.addr, align 8 + store i32 %numElements, i32* %numElements.addr, align 4 + store i32 %level, i32* %level.addr, align 4 + store i32 256, i32* %blockSize, align 4 + %0 = load i32, i32* %numElements.addr, align 4 + %conv = sitofp i32 %0 to float + %1 = load i32, i32* %blockSize, align 4 + %conv1 = uitofp i32 %1 to float + %mul = fmul contract float 2.000000e+00, %conv1 + %div = fdiv float %conv, %mul + %call = call float @_ZSt4ceilf(float %div) + %conv2 = fptosi float %call to i32 + %cmp = icmp sgt i32 1, %conv2 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + %2 = load i32, i32* %numElements.addr, align 4 + %conv3 = sitofp i32 %2 to float + %3 = load i32, i32* %blockSize, align 4 + %conv4 = uitofp i32 %3 to float + %mul5 = fmul contract float 2.000000e+00, %conv4 + %div6 = fdiv float %conv3, %mul5 + %call7 = call float @_ZSt4ceilf(float %div6) + %conv8 = fptosi float %call7 to i32 + br 
label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 1, %cond.true ], [ %conv8, %cond.false ] + store i32 %cond, i32* %numBlocks, align 4 + %4 = load i32, i32* %numBlocks, align 4 + %cmp9 = icmp ugt i32 %4, 1 + br i1 %cmp9, label %if.then, label %if.else + +if.then: ; preds = %cond.end + %5 = load i32, i32* %blockSize, align 4 + store i32 %5, i32* %numThreads, align 4 + br label %if.end15 + +if.else: ; preds = %cond.end + %6 = load i32, i32* %numElements.addr, align 4 + %call10 = call zeroext i1 @_Z12isPowerOfTwoi(i32 %6) + br i1 %call10, label %if.then11, label %if.else13 + +if.then11: ; preds = %if.else + %7 = load i32, i32* %numElements.addr, align 4 + %div12 = sdiv i32 %7, 2 + store i32 %div12, i32* %numThreads, align 4 + br label %if.end + +if.else13: ; preds = %if.else + %8 = load i32, i32* %numElements.addr, align 4 + %call14 = call i32 @_Z9floorPow2i(i32 %8) + store i32 %call14, i32* %numThreads, align 4 + br label %if.end + +if.end: ; preds = %if.else13, %if.then11 + br label %if.end15 + +if.end15: ; preds = %if.end, %if.then + %9 = load i32, i32* %numThreads, align 4 + %mul16 = mul i32 %9, 2 + store i32 %mul16, i32* %numEltsPerBlock, align 4 + %10 = load i32, i32* %numElements.addr, align 4 + %11 = load i32, i32* %numBlocks, align 4 + %sub = sub i32 %11, 1 + %12 = load i32, i32* %numEltsPerBlock, align 4 + %mul17 = mul i32 %sub, %12 + %sub18 = sub i32 %10, %mul17 + store i32 %sub18, i32* %numEltsLastBlock, align 4 + %13 = load i32, i32* %numEltsLastBlock, align 4 + %div19 = udiv i32 %13, 2 + %cmp20 = icmp ugt i32 1, %div19 + br i1 %cmp20, label %cond.true21, label %cond.false22 + +cond.true21: ; preds = %if.end15 + br label %cond.end24 + +cond.false22: ; preds = %if.end15 + %14 = load i32, i32* %numEltsLastBlock, align 4 + %div23 = udiv i32 %14, 2 + br label %cond.end24 + +cond.end24: ; preds = %cond.false22, %cond.true21 + %cond25 = phi i32 [ 1, %cond.true21 ], [ %div23, %cond.false22 ] + store i32 %cond25, i32* 
%numThreadsLastBlock, align 4 + store i32 0, i32* %np2LastBlock, align 4 + store i32 0, i32* %sharedMemLastBlock, align 4 + %15 = load i32, i32* %numEltsLastBlock, align 4 + %16 = load i32, i32* %numEltsPerBlock, align 4 + %cmp26 = icmp ne i32 %15, %16 + br i1 %cmp26, label %if.then27, label %if.end38 + +if.then27: ; preds = %cond.end24 + store i32 1, i32* %np2LastBlock, align 4 + %17 = load i32, i32* %numEltsLastBlock, align 4 + %call28 = call zeroext i1 @_Z12isPowerOfTwoi(i32 %17) + br i1 %call28, label %if.end31, label %if.then29 + +if.then29: ; preds = %if.then27 + %18 = load i32, i32* %numEltsLastBlock, align 4 + %call30 = call i32 @_Z9floorPow2i(i32 %18) + store i32 %call30, i32* %numThreadsLastBlock, align 4 + br label %if.end31 + +if.end31: ; preds = %if.then29, %if.then27 + %19 = load i32, i32* %numThreadsLastBlock, align 4 + %mul32 = mul i32 2, %19 + %div33 = udiv i32 %mul32, 16 + store i32 %div33, i32* %extraSpace, align 4 + %20 = load i32, i32* %numThreadsLastBlock, align 4 + %mul34 = mul i32 2, %20 + %21 = load i32, i32* %extraSpace, align 4 + %add = add i32 %mul34, %21 + %conv35 = zext i32 %add to i64 + %mul36 = mul i64 4, %conv35 + %conv37 = trunc i64 %mul36 to i32 + store i32 %conv37, i32* %sharedMemLastBlock, align 4 + br label %if.end38 + +if.end38: ; preds = %if.end31, %cond.end24 + %22 = load i32, i32* %numEltsPerBlock, align 4 + %div40 = udiv i32 %22, 16 + store i32 %div40, i32* %extraSpace39, align 4 + %23 = load i32, i32* %numEltsPerBlock, align 4 + %24 = load i32, i32* %extraSpace39, align 4 + %add41 = add i32 %23, %24 + %conv42 = zext i32 %add41 to i64 + %mul43 = mul i64 4, %conv42 + %conv44 = trunc i64 %mul43 to i32 + store i32 %conv44, i32* %sharedMemSize, align 4 + %25 = load i32, i32* %numBlocks, align 4 + %26 = load i32, i32* %np2LastBlock, align 4 + %sub45 = sub i32 %25, %26 + %cmp46 = icmp ugt i32 1, %sub45 + br i1 %cmp46, label %cond.true47, label %cond.false48 + +cond.true47: ; preds = %if.end38 + br label %cond.end50 + 
+cond.false48: ; preds = %if.end38 + %27 = load i32, i32* %numBlocks, align 4 + %28 = load i32, i32* %np2LastBlock, align 4 + %sub49 = sub i32 %27, %28 + br label %cond.end50 + +cond.end50: ; preds = %cond.false48, %cond.true47 + %cond51 = phi i32 [ 1, %cond.true47 ], [ %sub49, %cond.false48 ] + call void @_ZN4dim3C2Ejjj(%struct.dim3* %grid, i32 %cond51, i32 1, i32 1) + %29 = load i32, i32* %numThreads, align 4 + call void @_ZN4dim3C2Ejjj(%struct.dim3* %threads, i32 %29, i32 1, i32 1) + %call52 = call i32 @cudaGetLastError() + store i32 %call52, i32* %err, align 4 + %30 = load i32, i32* %err, align 4 + %cmp53 = icmp ne i32 0, %30 + br i1 %cmp53, label %if.then54, label %if.end57 + +if.then54: ; preds = %cond.end50 + %31 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %32 = load i32, i32* %err, align 4 + %call55 = call i8* @cudaGetErrorString(i32 %32) + %call56 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %31, i8* getelementptr inbounds ([46 x i8], [46 x i8]* @.str.11, i64 0, i64 0), i8* getelementptr inbounds ([37 x i8], [37 x i8]* @.str.19, i64 0, i64 0), i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str.17, i64 0, i64 0), i32 163, i8* %call55) + call void @exit(i32 1) #16 + unreachable + +if.end57: ; preds = %cond.end50 + %33 = load i32, i32* %numBlocks, align 4 + %cmp58 = icmp ugt i32 %33, 1 + br i1 %cmp58, label %if.then59, label %if.else135 + +if.then59: ; preds = %if.end57 + %34 = bitcast %struct.dim3* %agg.tmp to i8* + %35 = bitcast %struct.dim3* %grid to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %34, i8* align 4 %35, i64 12, i1 false) + %36 = bitcast %struct.dim3* %agg.tmp60 to i8* + %37 = bitcast %struct.dim3* %threads to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %36, i8* align 4 %37, i64 12, i1 false) + %38 = bitcast { i64, i32 }* %agg.tmp.coerce to i8* + %39 = bitcast %struct.dim3* %agg.tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %38, i8* align 4 %39, i64 12, i1 
false) + %40 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 0 + %41 = load i64, i64* %40, align 4 + %42 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 1 + %43 = load i32, i32* %42, align 4 + %44 = bitcast { i64, i32 }* %agg.tmp60.coerce to i8* + %45 = bitcast %struct.dim3* %agg.tmp60 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %44, i8* align 4 %45, i64 12, i1 false) + %46 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp60.coerce, i32 0, i32 0 + %47 = load i64, i64* %46, align 4 + %48 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp60.coerce, i32 0, i32 1 + %49 = load i32, i32* %48, align 4 + %call61 = call i32 @__cudaPushCallConfiguration(i64 %41, i32 %43, i64 %47, i32 %49, i64 0, i8* null) + %tobool = icmp ne i32 %call61, 0 + br i1 %tobool, label %kcall.end, label %kcall.configok + +kcall.configok: ; preds = %if.then59 + %50 = load i32*, i32** %outArray.addr, align 8 + %51 = load i32*, i32** %inArray.addr, align 8 + %52 = load i32**, i32*** @_ZL15g_scanBlockSums, align 8 + %53 = load i32, i32* %level.addr, align 4 + %idxprom = sext i32 %53 to i64 + %arrayidx = getelementptr inbounds i32*, i32** %52, i64 %idxprom + %54 = load i32*, i32** %arrayidx, align 8 + %55 = load i32, i32* %numThreads, align 4 + %mul62 = mul i32 %55, 2 + call void @_ZL7prescanILb1ELb0EEvPjPKjS0_iii(i32* %50, i32* %51, i32* %54, i32 %mul62, i32 0, i32 0) + br label %kcall.end + +kcall.end: ; preds = %kcall.configok, %if.then59 + %call63 = call i32 @cudaThreadSynchronize() + %call65 = call i32 @cudaGetLastError() + store i32 %call65, i32* %err64, align 4 + %56 = load i32, i32* %err64, align 4 + %cmp66 = icmp ne i32 0, %56 + br i1 %cmp66, label %if.then67, label %if.end70 + +if.then67: ; preds = %kcall.end + %57 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %58 = load i32, i32* %err64, align 4 + %call68 = call i8* @cudaGetErrorString(i32 %58) + %call69 = call i32 
(%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %57, i8* getelementptr inbounds ([46 x i8], [46 x i8]* @.str.11, i64 0, i64 0), i8* getelementptr inbounds ([21 x i8], [21 x i8]* @.str.20, i64 0, i64 0), i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str.17, i64 0, i64 0), i32 170, i8* %call68) + call void @exit(i32 1) #16 + unreachable + +if.end70: ; preds = %kcall.end + %59 = load i32, i32* %np2LastBlock, align 4 + %tobool71 = icmp ne i32 %59, 0 + br i1 %tobool71, label %if.then72, label %if.end91 + +if.then72: ; preds = %if.end70 + call void @_ZN4dim3C2Ejjj(%struct.dim3* %agg.tmp73, i32 1, i32 1, i32 1) + %60 = load i32, i32* %numThreadsLastBlock, align 4 + call void @_ZN4dim3C2Ejjj(%struct.dim3* %agg.tmp74, i32 %60, i32 1, i32 1) + %61 = bitcast { i64, i32 }* %agg.tmp73.coerce to i8* + %62 = bitcast %struct.dim3* %agg.tmp73 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %61, i8* align 4 %62, i64 12, i1 false) + %63 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp73.coerce, i32 0, i32 0 + %64 = load i64, i64* %63, align 4 + %65 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp73.coerce, i32 0, i32 1 + %66 = load i32, i32* %65, align 4 + %67 = bitcast { i64, i32 }* %agg.tmp74.coerce to i8* + %68 = bitcast %struct.dim3* %agg.tmp74 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %67, i8* align 4 %68, i64 12, i1 false) + %69 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp74.coerce, i32 0, i32 0 + %70 = load i64, i64* %69, align 4 + %71 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp74.coerce, i32 0, i32 1 + %72 = load i32, i32* %71, align 4 + %call75 = call i32 @__cudaPushCallConfiguration(i64 %64, i32 %66, i64 %70, i32 %72, i64 0, i8* null) + %tobool76 = icmp ne i32 %call75, 0 + br i1 %tobool76, label %kcall.end82, label %kcall.configok77 + +kcall.configok77: ; preds = %if.then72 + %73 = load i32*, i32** %outArray.addr, align 8 + %74 = load i32*, i32** %inArray.addr, align 8 + 
%75 = load i32**, i32*** @_ZL15g_scanBlockSums, align 8 + %76 = load i32, i32* %level.addr, align 4 + %idxprom78 = sext i32 %76 to i64 + %arrayidx79 = getelementptr inbounds i32*, i32** %75, i64 %idxprom78 + %77 = load i32*, i32** %arrayidx79, align 8 + %78 = load i32, i32* %numEltsLastBlock, align 4 + %79 = load i32, i32* %numBlocks, align 4 + %sub80 = sub i32 %79, 1 + %80 = load i32, i32* %numElements.addr, align 4 + %81 = load i32, i32* %numEltsLastBlock, align 4 + %sub81 = sub i32 %80, %81 + call void @_ZL7prescanILb1ELb1EEvPjPKjS0_iii(i32* %73, i32* %74, i32* %77, i32 %78, i32 %sub80, i32 %sub81) + br label %kcall.end82 + +kcall.end82: ; preds = %kcall.configok77, %if.then72 + %call83 = call i32 @cudaThreadSynchronize() + %call85 = call i32 @cudaGetLastError() + store i32 %call85, i32* %err84, align 4 + %82 = load i32, i32* %err84, align 4 + %cmp86 = icmp ne i32 0, %82 + br i1 %cmp86, label %if.then87, label %if.end90 + +if.then87: ; preds = %kcall.end82 + %83 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %84 = load i32, i32* %err84, align 4 + %call88 = call i8* @cudaGetErrorString(i32 %84) + %call89 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %83, i8* getelementptr inbounds ([46 x i8], [46 x i8]* @.str.11, i64 0, i64 0), i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.21, i64 0, i64 0), i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str.17, i64 0, i64 0), i32 176, i8* %call88) + call void @exit(i32 1) #16 + unreachable + +if.end90: ; preds = %kcall.end82 + br label %if.end91 + +if.end91: ; preds = %if.end90, %if.end70 + %85 = load i32**, i32*** @_ZL15g_scanBlockSums, align 8 + %86 = load i32, i32* %level.addr, align 4 + %idxprom92 = sext i32 %86 to i64 + %arrayidx93 = getelementptr inbounds i32*, i32** %85, i64 %idxprom92 + %87 = load i32*, i32** %arrayidx93, align 8 + %88 = load i32**, i32*** @_ZL15g_scanBlockSums, align 8 + %89 = load i32, i32* %level.addr, align 4 + %idxprom94 = sext i32 %89 to i64 + %arrayidx95 = getelementptr inbounds i32*, i32** %88, i64 %idxprom94 + %90 = load i32*, i32** %arrayidx95, align 8 + %91 = load i32, i32* %numBlocks, align 4 + %92 = load i32, i32* %level.addr, align 4 + %add96 = add nsw i32 %92, 1 + call void @_ZL21prescanArrayRecursivePjPKjii(i32* %87, i32* %90, i32 %91, i32 %add96) + %93 = bitcast %struct.dim3* %agg.tmp97 to i8* + %94 = bitcast %struct.dim3* %grid to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %93, i8* align 4 %94, i64 12, i1 false) + %95 = bitcast %struct.dim3* %agg.tmp98 to i8* + %96 = bitcast %struct.dim3* %threads to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %95, i8* align 4 %96, i64 12, i1 false) + %97 = bitcast { i64, i32 }* %agg.tmp97.coerce to i8* + %98 = bitcast %struct.dim3* %agg.tmp97 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %97, i8* align 4 %98, i64 12, i1 false) + %99 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp97.coerce, i32 0, i32 0 + %100 = load i64, i64* %99, align 4 + %101 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp97.coerce, i32 0, i32 1 + %102 = load i32, i32* %101, align 4 + %103 = bitcast { i64, i32 }* 
%agg.tmp98.coerce to i8* + %104 = bitcast %struct.dim3* %agg.tmp98 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %103, i8* align 4 %104, i64 12, i1 false) + %105 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp98.coerce, i32 0, i32 0 + %106 = load i64, i64* %105, align 4 + %107 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp98.coerce, i32 0, i32 1 + %108 = load i32, i32* %107, align 4 + %call99 = call i32 @__cudaPushCallConfiguration(i64 %100, i32 %102, i64 %106, i32 %108, i64 0, i8* null) + %tobool100 = icmp ne i32 %call99, 0 + br i1 %tobool100, label %kcall.end105, label %kcall.configok101 + +kcall.configok101: ; preds = %if.end91 + %109 = load i32*, i32** %outArray.addr, align 8 + %110 = load i32**, i32*** @_ZL15g_scanBlockSums, align 8 + %111 = load i32, i32* %level.addr, align 4 + %idxprom102 = sext i32 %111 to i64 + %arrayidx103 = getelementptr inbounds i32*, i32** %110, i64 %idxprom102 + %112 = load i32*, i32** %arrayidx103, align 8 + %113 = load i32, i32* %numElements.addr, align 4 + %114 = load i32, i32* %numEltsLastBlock, align 4 + %sub104 = sub i32 %113, %114 + call void @_ZL10uniformAddPjS_iii(i32* %109, i32* %112, i32 %sub104, i32 0, i32 0) + br label %kcall.end105 + +kcall.end105: ; preds = %kcall.configok101, %if.end91 + %call106 = call i32 @cudaThreadSynchronize() + %call108 = call i32 @cudaGetLastError() + store i32 %call108, i32* %err107, align 4 + %115 = load i32, i32* %err107, align 4 + %cmp109 = icmp ne i32 0, %115 + br i1 %cmp109, label %if.then110, label %if.end113 + +if.then110: ; preds = %kcall.end105 + %116 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %117 = load i32, i32* %err107, align 4 + %call111 = call i8* @cudaGetErrorString(i32 %117) + %call112 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %116, i8* getelementptr inbounds ([46 x i8], [46 x i8]* @.str.11, i64 0, i64 0), i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str.22, i64 0, i64 0), i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str.17, i64 0, i64 0), i32 190, i8* %call111) + call void @exit(i32 1) #16 + unreachable + +if.end113: ; preds = %kcall.end105 + %118 = load i32, i32* %np2LastBlock, align 4 + %tobool114 = icmp ne i32 %118, 0 + br i1 %tobool114, label %if.then115, label %if.end134 + +if.then115: ; preds = %if.end113 + call void @_ZN4dim3C2Ejjj(%struct.dim3* %agg.tmp116, i32 1, i32 1, i32 1) + %119 = load i32, i32* %numThreadsLastBlock, align 4 + call void @_ZN4dim3C2Ejjj(%struct.dim3* %agg.tmp117, i32 %119, i32 1, i32 1) + %120 = bitcast { i64, i32 }* %agg.tmp116.coerce to i8* + %121 = bitcast %struct.dim3* %agg.tmp116 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %120, i8* align 4 %121, i64 12, i1 false) + %122 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp116.coerce, i32 0, i32 0 + %123 = load i64, i64* %122, align 4 + %124 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp116.coerce, i32 0, i32 1 + %125 = load i32, i32* %124, align 4 + %126 = bitcast { i64, i32 }* %agg.tmp117.coerce to i8* + %127 = bitcast %struct.dim3* %agg.tmp117 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %126, i8* align 4 %127, i64 12, i1 false) + %128 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp117.coerce, i32 0, i32 0 + %129 = load i64, i64* %128, align 4 + %130 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp117.coerce, i32 0, i32 1 + %131 = load i32, i32* %130, align 4 + %call118 = call i32 @__cudaPushCallConfiguration(i64 %123, i32 %125, i64 %129, i32 %131, i64 0, i8* null) + %tobool119 = icmp ne i32 %call118, 0 + br i1 %tobool119, label %kcall.end125, label %kcall.configok120 + +kcall.configok120: ; preds = %if.then115 + %132 = load i32*, i32** %outArray.addr, align 8 + %133 = load i32**, 
i32*** @_ZL15g_scanBlockSums, align 8 + %134 = load i32, i32* %level.addr, align 4 + %idxprom121 = sext i32 %134 to i64 + %arrayidx122 = getelementptr inbounds i32*, i32** %133, i64 %idxprom121 + %135 = load i32*, i32** %arrayidx122, align 8 + %136 = load i32, i32* %numEltsLastBlock, align 4 + %137 = load i32, i32* %numBlocks, align 4 + %sub123 = sub i32 %137, 1 + %138 = load i32, i32* %numElements.addr, align 4 + %139 = load i32, i32* %numEltsLastBlock, align 4 + %sub124 = sub i32 %138, %139 + call void @_ZL10uniformAddPjS_iii(i32* %132, i32* %135, i32 %136, i32 %sub123, i32 %sub124) + br label %kcall.end125 + +kcall.end125: ; preds = %kcall.configok120, %if.then115 + %call126 = call i32 @cudaThreadSynchronize() + %call128 = call i32 @cudaGetLastError() + store i32 %call128, i32* %err127, align 4 + %140 = load i32, i32* %err127, align 4 + %cmp129 = icmp ne i32 0, %140 + br i1 %cmp129, label %if.then130, label %if.end133 + +if.then130: ; preds = %kcall.end125 + %141 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %142 = load i32, i32* %err127, align 4 + %call131 = call i8* @cudaGetErrorString(i32 %142) + %call132 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %141, i8* getelementptr inbounds ([46 x i8], [46 x i8]* @.str.11, i64 0, i64 0), i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str.22, i64 0, i64 0), i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str.17, i64 0, i64 0), i32 196, i8* %call131) + call void @exit(i32 1) #16 + unreachable + +if.end133: ; preds = %kcall.end125 + br label %if.end134 + +if.end134: ; preds = %if.end133, %if.end113 + br label %if.end169 + +if.else135: ; preds = %if.end57 + %143 = load i32, i32* %numElements.addr, align 4 + %call136 = call zeroext i1 @_Z12isPowerOfTwoi(i32 %143) + br i1 %call136, label %if.then137, label %if.else153 + +if.then137: ; preds = %if.else135 + %144 = bitcast %struct.dim3* %agg.tmp138 to i8* + %145 = bitcast %struct.dim3* %grid to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %144, i8* align 4 %145, i64 12, i1 false) + %146 = bitcast %struct.dim3* %agg.tmp139 to i8* + %147 = bitcast %struct.dim3* %threads to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %146, i8* align 4 %147, i64 12, i1 false) + %148 = bitcast { i64, i32 }* %agg.tmp138.coerce to i8* + %149 = bitcast %struct.dim3* %agg.tmp138 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %148, i8* align 4 %149, i64 12, i1 false) + %150 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp138.coerce, i32 0, i32 0 + %151 = load i64, i64* %150, align 4 + %152 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp138.coerce, i32 0, i32 1 + %153 = load i32, i32* %152, align 4 + %154 = bitcast { i64, i32 }* %agg.tmp139.coerce to i8* + %155 = bitcast %struct.dim3* %agg.tmp139 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %154, i8* align 4 %155, i64 12, i1 false) + %156 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp139.coerce, i32 0, i32 0 + %157 = load i64, i64* %156, align 4 + %158 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp139.coerce, i32 0, i32 1 + %159 = load i32, i32* %158, align 4 + 
%call140 = call i32 @__cudaPushCallConfiguration(i64 %151, i32 %153, i64 %157, i32 %159, i64 0, i8* null) + %tobool141 = icmp ne i32 %call140, 0 + br i1 %tobool141, label %kcall.end144, label %kcall.configok142 + +kcall.configok142: ; preds = %if.then137 + %160 = load i32*, i32** %outArray.addr, align 8 + %161 = load i32*, i32** %inArray.addr, align 8 + %162 = load i32, i32* %numThreads, align 4 + %mul143 = mul i32 %162, 2 + call void @_ZL7prescanILb0ELb0EEvPjPKjS0_iii(i32* %160, i32* %161, i32* null, i32 %mul143, i32 0, i32 0) + br label %kcall.end144 + +kcall.end144: ; preds = %kcall.configok142, %if.then137 + %call145 = call i32 @cudaThreadSynchronize() + %call147 = call i32 @cudaGetLastError() + store i32 %call147, i32* %err146, align 4 + %163 = load i32, i32* %err146, align 4 + %cmp148 = icmp ne i32 0, %163 + br i1 %cmp148, label %if.then149, label %if.end152 + +if.then149: ; preds = %kcall.end144 + %164 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %165 = load i32, i32* %err146, align 4 + %call150 = call i8* @cudaGetErrorString(i32 %165) + %call151 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %164, i8* getelementptr inbounds ([46 x i8], [46 x i8]* @.str.11, i64 0, i64 0), i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str.23, i64 0, i64 0), i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str.17, i64 0, i64 0), i32 202, i8* %call150) + call void @exit(i32 1) #16 + unreachable + +if.end152: ; preds = %kcall.end144 + br label %if.end168 + +if.else153: ; preds = %if.else135 + %166 = bitcast %struct.dim3* %agg.tmp154 to i8* + %167 = bitcast %struct.dim3* %grid to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %166, i8* align 4 %167, i64 12, i1 false) + %168 = bitcast %struct.dim3* %agg.tmp155 to i8* + %169 = bitcast %struct.dim3* %threads to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %168, i8* align 4 %169, i64 12, i1 false) + %170 = bitcast { i64, i32 }* %agg.tmp154.coerce to i8* + %171 = bitcast %struct.dim3* %agg.tmp154 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %170, i8* align 4 %171, i64 12, i1 false) + %172 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp154.coerce, i32 0, i32 0 + %173 = load i64, i64* %172, align 4 + %174 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp154.coerce, i32 0, i32 1 + %175 = load i32, i32* %174, align 4 + %176 = bitcast { i64, i32 }* %agg.tmp155.coerce to i8* + %177 = bitcast %struct.dim3* %agg.tmp155 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %176, i8* align 4 %177, i64 12, i1 false) + %178 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp155.coerce, i32 0, i32 0 + %179 = load i64, i64* %178, align 4 + %180 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp155.coerce, i32 0, i32 1 + %181 = load i32, i32* %180, align 4 + %call156 = call i32 @__cudaPushCallConfiguration(i64 %173, i32 %175, i64 %179, i32 %181, i64 0, i8* null) + %tobool157 = icmp ne i32 %call156, 0 + br i1 %tobool157, label %kcall.end159, label %kcall.configok158 + +kcall.configok158: ; preds = %if.else153 + %182 = load i32*, 
i32** %outArray.addr, align 8 + %183 = load i32*, i32** %inArray.addr, align 8 + %184 = load i32, i32* %numElements.addr, align 4 + call void @_ZL7prescanILb0ELb1EEvPjPKjS0_iii(i32* %182, i32* %183, i32* null, i32 %184, i32 0, i32 0) + br label %kcall.end159 + +kcall.end159: ; preds = %kcall.configok158, %if.else153 + %call160 = call i32 @cudaThreadSynchronize() + %call162 = call i32 @cudaGetLastError() + store i32 %call162, i32* %err161, align 4 + %185 = load i32, i32* %err161, align 4 + %cmp163 = icmp ne i32 0, %185 + br i1 %cmp163, label %if.then164, label %if.end167 + +if.then164: ; preds = %kcall.end159 + %186 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %187 = load i32, i32* %err161, align 4 + %call165 = call i8* @cudaGetErrorString(i32 %187) + %call166 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %186, i8* getelementptr inbounds ([46 x i8], [46 x i8]* @.str.11, i64 0, i64 0), i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str.24, i64 0, i64 0), i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str.17, i64 0, i64 0), i32 207, i8* %call165) + call void @exit(i32 1) #16 + unreachable + +if.end167: ; preds = %kcall.end159 + br label %if.end168 + +if.end168: ; preds = %if.end167, %if.end152 + br label %if.end169 + +if.end169: ; preds = %if.end168, %if.end134 + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local zeroext i1 @_Z12isPowerOfTwoi(i32 %n) #6 comdat { +entry: + %n.addr = alloca i32, align 4 + store i32 %n, i32* %n.addr, align 4 + %0 = load i32, i32* %n.addr, align 4 + %1 = load i32, i32* %n.addr, align 4 + %sub = sub nsw i32 %1, 1 + %and = and i32 %0, %sub + %cmp = icmp eq i32 %and, 0 + ret i1 %cmp +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local i32 @_Z9floorPow2i(i32 %n) #0 comdat { +entry: + %n.addr = alloca i32, align 4 + %exp = alloca i32, align 4 + store i32 %n, i32* %n.addr, align 4 + %0 = load i32, i32* %n.addr, align 4 + 
%conv = sitofp i32 %0 to float + %call = call float @_ZSt5frexpfPi(float %conv, i32* %exp) + %1 = load i32, i32* %exp, align 4 + %sub = sub nsw i32 %1, 1 + %shl = shl i32 1, %sub + ret i32 %shl +} + +; Function Attrs: noinline optnone uwtable +define internal void @_ZL7prescanILb1ELb0EEvPjPKjS0_iii(i32* %g_odata, i32* %g_idata, i32* %g_blockSums, i32 %n, i32 %blockIndex, i32 %baseIndex) #0 { +entry: + %g_odata.addr = alloca i32*, align 8 + %g_idata.addr = alloca i32*, align 8 + %g_blockSums.addr = alloca i32*, align 8 + %n.addr = alloca i32, align 4 + %blockIndex.addr = alloca i32, align 4 + %baseIndex.addr = alloca i32, align 4 + %grid_dim = alloca %struct.dim3, align 8 + %block_dim = alloca %struct.dim3, align 8 + %shmem_size = alloca i64, align 8 + %stream = alloca i8*, align 8 + %grid_dim.coerce = alloca { i64, i32 }, align 8 + %block_dim.coerce = alloca { i64, i32 }, align 8 + store i32* %g_odata, i32** %g_odata.addr, align 8 + store i32* %g_idata, i32** %g_idata.addr, align 8 + store i32* %g_blockSums, i32** %g_blockSums.addr, align 8 + store i32 %n, i32* %n.addr, align 4 + store i32 %blockIndex, i32* %blockIndex.addr, align 4 + store i32 %baseIndex, i32* %baseIndex.addr, align 4 + %kernel_args = alloca i8*, i64 6, align 16 + %0 = bitcast i32** %g_odata.addr to i8* + %1 = getelementptr i8*, i8** %kernel_args, i32 0 + store i8* %0, i8** %1 + %2 = bitcast i32** %g_idata.addr to i8* + %3 = getelementptr i8*, i8** %kernel_args, i32 1 + store i8* %2, i8** %3 + %4 = bitcast i32** %g_blockSums.addr to i8* + %5 = getelementptr i8*, i8** %kernel_args, i32 2 + store i8* %4, i8** %5 + %6 = bitcast i32* %n.addr to i8* + %7 = getelementptr i8*, i8** %kernel_args, i32 3 + store i8* %6, i8** %7 + %8 = bitcast i32* %blockIndex.addr to i8* + %9 = getelementptr i8*, i8** %kernel_args, i32 4 + store i8* %8, i8** %9 + %10 = bitcast i32* %baseIndex.addr to i8* + %11 = getelementptr i8*, i8** %kernel_args, i32 5 + store i8* %10, i8** %11 + %12 = call i32 
@__cudaPopCallConfiguration(%struct.dim3* %grid_dim, %struct.dim3* %block_dim, i64* %shmem_size, i8** %stream) + %13 = load i64, i64* %shmem_size, align 8 + %14 = load i8*, i8** %stream, align 8 + %15 = bitcast { i64, i32 }* %grid_dim.coerce to i8* + %16 = bitcast %struct.dim3* %grid_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %15, i8* align 8 %16, i64 12, i1 false) + %17 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 0 + %18 = load i64, i64* %17, align 8 + %19 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 1 + %20 = load i32, i32* %19, align 8 + %21 = bitcast { i64, i32 }* %block_dim.coerce to i8* + %22 = bitcast %struct.dim3* %block_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %21, i8* align 8 %22, i64 12, i1 false) + %23 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 0 + %24 = load i64, i64* %23, align 8 + %25 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 1 + %26 = load i32, i32* %25, align 8 + %27 = bitcast i8* %14 to %struct.CUstream_st* + %call = call i32 @cudaLaunchKernel(i8* bitcast (void (i32*, i32*, i32*, i32, i32, i32)* @_ZL7prescanILb1ELb0EEvPjPKjS0_iii to i8*), i64 %18, i32 %20, i64 %24, i32 %26, i8** %kernel_args, i64 %13, %struct.CUstream_st* %27) + br label %setup.end + +setup.end: ; preds = %entry + ret void +} + +; Function Attrs: noinline optnone uwtable +define internal void @_ZL7prescanILb1ELb1EEvPjPKjS0_iii(i32* %g_odata, i32* %g_idata, i32* %g_blockSums, i32 %n, i32 %blockIndex, i32 %baseIndex) #0 { +entry: + %g_odata.addr = alloca i32*, align 8 + %g_idata.addr = alloca i32*, align 8 + %g_blockSums.addr = alloca i32*, align 8 + %n.addr = alloca i32, align 4 + %blockIndex.addr = alloca i32, align 4 + %baseIndex.addr = alloca i32, align 4 + %grid_dim = alloca %struct.dim3, align 8 + %block_dim = alloca %struct.dim3, align 8 + %shmem_size = alloca i64, align 8 + %stream 
= alloca i8*, align 8 + %grid_dim.coerce = alloca { i64, i32 }, align 8 + %block_dim.coerce = alloca { i64, i32 }, align 8 + store i32* %g_odata, i32** %g_odata.addr, align 8 + store i32* %g_idata, i32** %g_idata.addr, align 8 + store i32* %g_blockSums, i32** %g_blockSums.addr, align 8 + store i32 %n, i32* %n.addr, align 4 + store i32 %blockIndex, i32* %blockIndex.addr, align 4 + store i32 %baseIndex, i32* %baseIndex.addr, align 4 + %kernel_args = alloca i8*, i64 6, align 16 + %0 = bitcast i32** %g_odata.addr to i8* + %1 = getelementptr i8*, i8** %kernel_args, i32 0 + store i8* %0, i8** %1 + %2 = bitcast i32** %g_idata.addr to i8* + %3 = getelementptr i8*, i8** %kernel_args, i32 1 + store i8* %2, i8** %3 + %4 = bitcast i32** %g_blockSums.addr to i8* + %5 = getelementptr i8*, i8** %kernel_args, i32 2 + store i8* %4, i8** %5 + %6 = bitcast i32* %n.addr to i8* + %7 = getelementptr i8*, i8** %kernel_args, i32 3 + store i8* %6, i8** %7 + %8 = bitcast i32* %blockIndex.addr to i8* + %9 = getelementptr i8*, i8** %kernel_args, i32 4 + store i8* %8, i8** %9 + %10 = bitcast i32* %baseIndex.addr to i8* + %11 = getelementptr i8*, i8** %kernel_args, i32 5 + store i8* %10, i8** %11 + %12 = call i32 @__cudaPopCallConfiguration(%struct.dim3* %grid_dim, %struct.dim3* %block_dim, i64* %shmem_size, i8** %stream) + %13 = load i64, i64* %shmem_size, align 8 + %14 = load i8*, i8** %stream, align 8 + %15 = bitcast { i64, i32 }* %grid_dim.coerce to i8* + %16 = bitcast %struct.dim3* %grid_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %15, i8* align 8 %16, i64 12, i1 false) + %17 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 0 + %18 = load i64, i64* %17, align 8 + %19 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 1 + %20 = load i32, i32* %19, align 8 + %21 = bitcast { i64, i32 }* %block_dim.coerce to i8* + %22 = bitcast %struct.dim3* %block_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 
%21, i8* align 8 %22, i64 12, i1 false) + %23 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 0 + %24 = load i64, i64* %23, align 8 + %25 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 1 + %26 = load i32, i32* %25, align 8 + %27 = bitcast i8* %14 to %struct.CUstream_st* + %call = call i32 @cudaLaunchKernel(i8* bitcast (void (i32*, i32*, i32*, i32, i32, i32)* @_ZL7prescanILb1ELb1EEvPjPKjS0_iii to i8*), i64 %18, i32 %20, i64 %24, i32 %26, i8** %kernel_args, i64 %13, %struct.CUstream_st* %27) + br label %setup.end + +setup.end: ; preds = %entry + ret void +} + +; Function Attrs: noinline optnone uwtable +define internal void @_ZL10uniformAddPjS_iii(i32* %g_data, i32* %uniforms, i32 %n, i32 %blockOffset, i32 %baseIndex) #0 { +entry: + %g_data.addr = alloca i32*, align 8 + %uniforms.addr = alloca i32*, align 8 + %n.addr = alloca i32, align 4 + %blockOffset.addr = alloca i32, align 4 + %baseIndex.addr = alloca i32, align 4 + %grid_dim = alloca %struct.dim3, align 8 + %block_dim = alloca %struct.dim3, align 8 + %shmem_size = alloca i64, align 8 + %stream = alloca i8*, align 8 + %grid_dim.coerce = alloca { i64, i32 }, align 8 + %block_dim.coerce = alloca { i64, i32 }, align 8 + store i32* %g_data, i32** %g_data.addr, align 8 + store i32* %uniforms, i32** %uniforms.addr, align 8 + store i32 %n, i32* %n.addr, align 4 + store i32 %blockOffset, i32* %blockOffset.addr, align 4 + store i32 %baseIndex, i32* %baseIndex.addr, align 4 + %kernel_args = alloca i8*, i64 5, align 16 + %0 = bitcast i32** %g_data.addr to i8* + %1 = getelementptr i8*, i8** %kernel_args, i32 0 + store i8* %0, i8** %1 + %2 = bitcast i32** %uniforms.addr to i8* + %3 = getelementptr i8*, i8** %kernel_args, i32 1 + store i8* %2, i8** %3 + %4 = bitcast i32* %n.addr to i8* + %5 = getelementptr i8*, i8** %kernel_args, i32 2 + store i8* %4, i8** %5 + %6 = bitcast i32* %blockOffset.addr to i8* + %7 = getelementptr i8*, i8** %kernel_args, i32 3 + 
store i8* %6, i8** %7 + %8 = bitcast i32* %baseIndex.addr to i8* + %9 = getelementptr i8*, i8** %kernel_args, i32 4 + store i8* %8, i8** %9 + %10 = call i32 @__cudaPopCallConfiguration(%struct.dim3* %grid_dim, %struct.dim3* %block_dim, i64* %shmem_size, i8** %stream) + %11 = load i64, i64* %shmem_size, align 8 + %12 = load i8*, i8** %stream, align 8 + %13 = bitcast { i64, i32 }* %grid_dim.coerce to i8* + %14 = bitcast %struct.dim3* %grid_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %13, i8* align 8 %14, i64 12, i1 false) + %15 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 0 + %16 = load i64, i64* %15, align 8 + %17 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 1 + %18 = load i32, i32* %17, align 8 + %19 = bitcast { i64, i32 }* %block_dim.coerce to i8* + %20 = bitcast %struct.dim3* %block_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %19, i8* align 8 %20, i64 12, i1 false) + %21 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 0 + %22 = load i64, i64* %21, align 8 + %23 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 1 + %24 = load i32, i32* %23, align 8 + %25 = bitcast i8* %12 to %struct.CUstream_st* + %call = call i32 @cudaLaunchKernel(i8* bitcast (void (i32*, i32*, i32, i32, i32)* @_ZL10uniformAddPjS_iii to i8*), i64 %16, i32 %18, i64 %22, i32 %24, i8** %kernel_args, i64 %11, %struct.CUstream_st* %25) + br label %setup.end + +setup.end: ; preds = %entry + ret void +} + +; Function Attrs: noinline optnone uwtable +define internal void @_ZL7prescanILb0ELb0EEvPjPKjS0_iii(i32* %g_odata, i32* %g_idata, i32* %g_blockSums, i32 %n, i32 %blockIndex, i32 %baseIndex) #0 { +entry: + %g_odata.addr = alloca i32*, align 8 + %g_idata.addr = alloca i32*, align 8 + %g_blockSums.addr = alloca i32*, align 8 + %n.addr = alloca i32, align 4 + %blockIndex.addr = alloca i32, align 4 + %baseIndex.addr = alloca i32, 
align 4 + %grid_dim = alloca %struct.dim3, align 8 + %block_dim = alloca %struct.dim3, align 8 + %shmem_size = alloca i64, align 8 + %stream = alloca i8*, align 8 + %grid_dim.coerce = alloca { i64, i32 }, align 8 + %block_dim.coerce = alloca { i64, i32 }, align 8 + store i32* %g_odata, i32** %g_odata.addr, align 8 + store i32* %g_idata, i32** %g_idata.addr, align 8 + store i32* %g_blockSums, i32** %g_blockSums.addr, align 8 + store i32 %n, i32* %n.addr, align 4 + store i32 %blockIndex, i32* %blockIndex.addr, align 4 + store i32 %baseIndex, i32* %baseIndex.addr, align 4 + %kernel_args = alloca i8*, i64 6, align 16 + %0 = bitcast i32** %g_odata.addr to i8* + %1 = getelementptr i8*, i8** %kernel_args, i32 0 + store i8* %0, i8** %1 + %2 = bitcast i32** %g_idata.addr to i8* + %3 = getelementptr i8*, i8** %kernel_args, i32 1 + store i8* %2, i8** %3 + %4 = bitcast i32** %g_blockSums.addr to i8* + %5 = getelementptr i8*, i8** %kernel_args, i32 2 + store i8* %4, i8** %5 + %6 = bitcast i32* %n.addr to i8* + %7 = getelementptr i8*, i8** %kernel_args, i32 3 + store i8* %6, i8** %7 + %8 = bitcast i32* %blockIndex.addr to i8* + %9 = getelementptr i8*, i8** %kernel_args, i32 4 + store i8* %8, i8** %9 + %10 = bitcast i32* %baseIndex.addr to i8* + %11 = getelementptr i8*, i8** %kernel_args, i32 5 + store i8* %10, i8** %11 + %12 = call i32 @__cudaPopCallConfiguration(%struct.dim3* %grid_dim, %struct.dim3* %block_dim, i64* %shmem_size, i8** %stream) + %13 = load i64, i64* %shmem_size, align 8 + %14 = load i8*, i8** %stream, align 8 + %15 = bitcast { i64, i32 }* %grid_dim.coerce to i8* + %16 = bitcast %struct.dim3* %grid_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %15, i8* align 8 %16, i64 12, i1 false) + %17 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 0 + %18 = load i64, i64* %17, align 8 + %19 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 1 + %20 = load i32, i32* %19, align 8 + %21 = bitcast 
{ i64, i32 }* %block_dim.coerce to i8* + %22 = bitcast %struct.dim3* %block_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %21, i8* align 8 %22, i64 12, i1 false) + %23 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 0 + %24 = load i64, i64* %23, align 8 + %25 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 1 + %26 = load i32, i32* %25, align 8 + %27 = bitcast i8* %14 to %struct.CUstream_st* + %call = call i32 @cudaLaunchKernel(i8* bitcast (void (i32*, i32*, i32*, i32, i32, i32)* @_ZL7prescanILb0ELb0EEvPjPKjS0_iii to i8*), i64 %18, i32 %20, i64 %24, i32 %26, i8** %kernel_args, i64 %13, %struct.CUstream_st* %27) + br label %setup.end + +setup.end: ; preds = %entry + ret void +} + +; Function Attrs: noinline optnone uwtable +define internal void @_ZL7prescanILb0ELb1EEvPjPKjS0_iii(i32* %g_odata, i32* %g_idata, i32* %g_blockSums, i32 %n, i32 %blockIndex, i32 %baseIndex) #0 { +entry: + %g_odata.addr = alloca i32*, align 8 + %g_idata.addr = alloca i32*, align 8 + %g_blockSums.addr = alloca i32*, align 8 + %n.addr = alloca i32, align 4 + %blockIndex.addr = alloca i32, align 4 + %baseIndex.addr = alloca i32, align 4 + %grid_dim = alloca %struct.dim3, align 8 + %block_dim = alloca %struct.dim3, align 8 + %shmem_size = alloca i64, align 8 + %stream = alloca i8*, align 8 + %grid_dim.coerce = alloca { i64, i32 }, align 8 + %block_dim.coerce = alloca { i64, i32 }, align 8 + store i32* %g_odata, i32** %g_odata.addr, align 8 + store i32* %g_idata, i32** %g_idata.addr, align 8 + store i32* %g_blockSums, i32** %g_blockSums.addr, align 8 + store i32 %n, i32* %n.addr, align 4 + store i32 %blockIndex, i32* %blockIndex.addr, align 4 + store i32 %baseIndex, i32* %baseIndex.addr, align 4 + %kernel_args = alloca i8*, i64 6, align 16 + %0 = bitcast i32** %g_odata.addr to i8* + %1 = getelementptr i8*, i8** %kernel_args, i32 0 + store i8* %0, i8** %1 + %2 = bitcast i32** %g_idata.addr to i8* + %3 = 
getelementptr i8*, i8** %kernel_args, i32 1 + store i8* %2, i8** %3 + %4 = bitcast i32** %g_blockSums.addr to i8* + %5 = getelementptr i8*, i8** %kernel_args, i32 2 + store i8* %4, i8** %5 + %6 = bitcast i32* %n.addr to i8* + %7 = getelementptr i8*, i8** %kernel_args, i32 3 + store i8* %6, i8** %7 + %8 = bitcast i32* %blockIndex.addr to i8* + %9 = getelementptr i8*, i8** %kernel_args, i32 4 + store i8* %8, i8** %9 + %10 = bitcast i32* %baseIndex.addr to i8* + %11 = getelementptr i8*, i8** %kernel_args, i32 5 + store i8* %10, i8** %11 + %12 = call i32 @__cudaPopCallConfiguration(%struct.dim3* %grid_dim, %struct.dim3* %block_dim, i64* %shmem_size, i8** %stream) + %13 = load i64, i64* %shmem_size, align 8 + %14 = load i8*, i8** %stream, align 8 + %15 = bitcast { i64, i32 }* %grid_dim.coerce to i8* + %16 = bitcast %struct.dim3* %grid_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %15, i8* align 8 %16, i64 12, i1 false) + %17 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 0 + %18 = load i64, i64* %17, align 8 + %19 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 1 + %20 = load i32, i32* %19, align 8 + %21 = bitcast { i64, i32 }* %block_dim.coerce to i8* + %22 = bitcast %struct.dim3* %block_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %21, i8* align 8 %22, i64 12, i1 false) + %23 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 0 + %24 = load i64, i64* %23, align 8 + %25 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 1 + %26 = load i32, i32* %25, align 8 + %27 = bitcast i8* %14 to %struct.CUstream_st* + %call = call i32 @cudaLaunchKernel(i8* bitcast (void (i32*, i32*, i32*, i32, i32, i32)* @_ZL7prescanILb0ELb1EEvPjPKjS0_iii to i8*), i64 %18, i32 %20, i64 %24, i32 %26, i8** %kernel_args, i64 %13, %struct.CUstream_st* %27) + br label %setup.end + +setup.end: ; preds = %entry + ret void +} + +; Function 
Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local float @_ZSt5frexpfPi(float %__x, i32* %__exp) #6 comdat { +entry: + %__x.addr = alloca float, align 4 + %__exp.addr = alloca i32*, align 8 + store float %__x, float* %__x.addr, align 4 + store i32* %__exp, i32** %__exp.addr, align 8 + %0 = load float, float* %__x.addr, align 4 + %1 = load i32*, i32** %__exp.addr, align 8 + %call = call float @frexpf(float %0, i32* %1) #3 + ret float %call +} + +; Function Attrs: nounwind +declare dso_local float @frexpf(float, i32*) #11 + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt12_Vector_baseIP5INodeSaIS1_EEC2Ev(%"struct.std::_Vector_base"* %this) unnamed_addr #0 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Vector_base"*, align 8 + store %"struct.std::_Vector_base"* %this, %"struct.std::_Vector_base"** %this.addr, align 8 + %this1 = load %"struct.std::_Vector_base"*, %"struct.std::_Vector_base"** %this.addr, align 8 + %_M_impl = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %this1, i32 0, i32 0 + call void @_ZNSt12_Vector_baseIP5INodeSaIS1_EE12_Vector_implC2Ev(%"struct.std::_Vector_base >::_Vector_impl"* %_M_impl) + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZNSt12_Vector_baseIP5INodeSaIS1_EE12_Vector_implC2Ev(%"struct.std::_Vector_base >::_Vector_impl"* %this) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Vector_base >::_Vector_impl"*, align 8 + store %"struct.std::_Vector_base >::_Vector_impl"* %this, %"struct.std::_Vector_base >::_Vector_impl"** %this.addr, align 8 + %this1 = load %"struct.std::_Vector_base >::_Vector_impl"*, %"struct.std::_Vector_base >::_Vector_impl"** %this.addr, align 8 + %0 = bitcast %"struct.std::_Vector_base >::_Vector_impl"* %this1 to %"class.std::allocator"* + call void @_ZNSaIP5INodeEC2Ev(%"class.std::allocator"* %0) #3 + %_M_start = 
getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %this1, i32 0, i32 0 + store %class.INode** null, %class.INode*** %_M_start, align 8 + %_M_finish = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %this1, i32 0, i32 1 + store %class.INode** null, %class.INode*** %_M_finish, align 8 + %_M_end_of_storage = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %this1, i32 0, i32 2 + store %class.INode** null, %class.INode*** %_M_end_of_storage, align 8 + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZNSaIP5INodeEC2Ev(%"class.std::allocator"* %this) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %"class.std::allocator"*, align 8 + store %"class.std::allocator"* %this, %"class.std::allocator"** %this.addr, align 8 + %this1 = load %"class.std::allocator"*, %"class.std::allocator"** %this.addr, align 8 + %0 = bitcast %"class.std::allocator"* %this1 to %"class.__gnu_cxx::new_allocator"* + call void @_ZN9__gnu_cxx13new_allocatorIP5INodeEC2Ev(%"class.__gnu_cxx::new_allocator"* %0) #3 + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx13new_allocatorIP5INodeEC2Ev(%"class.__gnu_cxx::new_allocator"* %this) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::new_allocator"*, align 8 + store %"class.__gnu_cxx::new_allocator"* %this, %"class.__gnu_cxx::new_allocator"** %this.addr, align 8 + %this1 = load %"class.__gnu_cxx::new_allocator"*, %"class.__gnu_cxx::new_allocator"** %this.addr, align 8 + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZSt8_DestroyIPP5INodeS1_EvT_S3_RSaIT0_E(%class.INode** %__first, %class.INode** %__last, %"class.std::allocator"* dereferenceable(1) %0) 
#0 comdat { +entry: + %__first.addr = alloca %class.INode**, align 8 + %__last.addr = alloca %class.INode**, align 8 + %.addr = alloca %"class.std::allocator"*, align 8 + store %class.INode** %__first, %class.INode*** %__first.addr, align 8 + store %class.INode** %__last, %class.INode*** %__last.addr, align 8 + store %"class.std::allocator"* %0, %"class.std::allocator"** %.addr, align 8 + %1 = load %class.INode**, %class.INode*** %__first.addr, align 8 + %2 = load %class.INode**, %class.INode*** %__last.addr, align 8 + call void @_ZSt8_DestroyIPP5INodeEvT_S3_(%class.INode** %1, %class.INode** %2) + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local dereferenceable(1) %"class.std::allocator"* @_ZNSt12_Vector_baseIP5INodeSaIS1_EE19_M_get_Tp_allocatorEv(%"struct.std::_Vector_base"* %this) #6 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Vector_base"*, align 8 + store %"struct.std::_Vector_base"* %this, %"struct.std::_Vector_base"** %this.addr, align 8 + %this1 = load %"struct.std::_Vector_base"*, %"struct.std::_Vector_base"** %this.addr, align 8 + %_M_impl = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %this1, i32 0, i32 0 + %0 = bitcast %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl to %"class.std::allocator"* + ret %"class.std::allocator"* %0 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt12_Vector_baseIP5INodeSaIS1_EED2Ev(%"struct.std::_Vector_base"* %this) unnamed_addr #0 comdat align 2 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %this.addr = alloca %"struct.std::_Vector_base"*, align 8 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + store %"struct.std::_Vector_base"* %this, %"struct.std::_Vector_base"** %this.addr, align 8 + %this1 = load %"struct.std::_Vector_base"*, %"struct.std::_Vector_base"** %this.addr, align 8 + %_M_impl = getelementptr inbounds 
%"struct.std::_Vector_base", %"struct.std::_Vector_base"* %this1, i32 0, i32 0 + %_M_start = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl, i32 0, i32 0 + %0 = load %class.INode**, %class.INode*** %_M_start, align 8 + %_M_impl2 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %this1, i32 0, i32 0 + %_M_end_of_storage = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl2, i32 0, i32 2 + %1 = load %class.INode**, %class.INode*** %_M_end_of_storage, align 8 + %_M_impl3 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %this1, i32 0, i32 0 + %_M_start4 = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl3, i32 0, i32 0 + %2 = load %class.INode**, %class.INode*** %_M_start4, align 8 + %sub.ptr.lhs.cast = ptrtoint %class.INode** %1 to i64 + %sub.ptr.rhs.cast = ptrtoint %class.INode** %2 to i64 + %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast + %sub.ptr.div = sdiv exact i64 %sub.ptr.sub, 8 + invoke void @_ZNSt12_Vector_baseIP5INodeSaIS1_EE13_M_deallocateEPS1_m(%"struct.std::_Vector_base"* %this1, %class.INode** %0, i64 %sub.ptr.div) + to label %invoke.cont unwind label %lpad + +invoke.cont: ; preds = %entry + %_M_impl5 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %this1, i32 0, i32 0 + call void @_ZNSt12_Vector_baseIP5INodeSaIS1_EE12_Vector_implD2Ev(%"struct.std::_Vector_base >::_Vector_impl"* %_M_impl5) #3 + ret void + +lpad: ; preds = %entry + %3 = landingpad { i8*, i32 } + cleanup + %4 = extractvalue { i8*, i32 } %3, 0 + store i8* %4, i8** %exn.slot, align 8 + %5 = extractvalue { i8*, i32 } %3, 1 + store i32 %5, i32* %ehselector.slot, align 4 + %_M_impl6 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* 
%this1, i32 0, i32 0 + call void @_ZNSt12_Vector_baseIP5INodeSaIS1_EE12_Vector_implD2Ev(%"struct.std::_Vector_base >::_Vector_impl"* %_M_impl6) #3 + br label %eh.resume + +eh.resume: ; preds = %lpad + %exn = load i8*, i8** %exn.slot, align 8 + %sel = load i32, i32* %ehselector.slot, align 4 + %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn, 0 + %lpad.val7 = insertvalue { i8*, i32 } %lpad.val, i32 %sel, 1 + resume { i8*, i32 } %lpad.val7 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZSt8_DestroyIPP5INodeEvT_S3_(%class.INode** %__first, %class.INode** %__last) #0 comdat { +entry: + %__first.addr = alloca %class.INode**, align 8 + %__last.addr = alloca %class.INode**, align 8 + store %class.INode** %__first, %class.INode*** %__first.addr, align 8 + store %class.INode** %__last, %class.INode*** %__last.addr, align 8 + %0 = load %class.INode**, %class.INode*** %__first.addr, align 8 + %1 = load %class.INode**, %class.INode*** %__last.addr, align 8 + call void @_ZNSt12_Destroy_auxILb1EE9__destroyIPP5INodeEEvT_S5_(%class.INode** %0, %class.INode** %1) + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZNSt12_Destroy_auxILb1EE9__destroyIPP5INodeEEvT_S5_(%class.INode** %0, %class.INode** %1) #6 comdat align 2 { +entry: + %.addr = alloca %class.INode**, align 8 + %.addr1 = alloca %class.INode**, align 8 + store %class.INode** %0, %class.INode*** %.addr, align 8 + store %class.INode** %1, %class.INode*** %.addr1, align 8 + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt12_Vector_baseIP5INodeSaIS1_EE13_M_deallocateEPS1_m(%"struct.std::_Vector_base"* %this, %class.INode** %__p, i64 %__n) #0 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Vector_base"*, align 8 + %__p.addr = alloca %class.INode**, align 8 + %__n.addr = alloca i64, align 8 + store %"struct.std::_Vector_base"* %this, %"struct.std::_Vector_base"** 
%this.addr, align 8 + store %class.INode** %__p, %class.INode*** %__p.addr, align 8 + store i64 %__n, i64* %__n.addr, align 8 + %this1 = load %"struct.std::_Vector_base"*, %"struct.std::_Vector_base"** %this.addr, align 8 + %0 = load %class.INode**, %class.INode*** %__p.addr, align 8 + %tobool = icmp ne %class.INode** %0, null + br i1 %tobool, label %if.then, label %if.end + +if.then: ; preds = %entry + %_M_impl = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %this1, i32 0, i32 0 + %1 = bitcast %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl to %"class.std::allocator"* + %2 = load %class.INode**, %class.INode*** %__p.addr, align 8 + %3 = load i64, i64* %__n.addr, align 8 + call void @_ZN9__gnu_cxx14__alloc_traitsISaIP5INodeEE10deallocateERS3_PS2_m(%"class.std::allocator"* dereferenceable(1) %1, %class.INode** %2, i64 %3) + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZNSt12_Vector_baseIP5INodeSaIS1_EE12_Vector_implD2Ev(%"struct.std::_Vector_base >::_Vector_impl"* %this) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Vector_base >::_Vector_impl"*, align 8 + store %"struct.std::_Vector_base >::_Vector_impl"* %this, %"struct.std::_Vector_base >::_Vector_impl"** %this.addr, align 8 + %this1 = load %"struct.std::_Vector_base >::_Vector_impl"*, %"struct.std::_Vector_base >::_Vector_impl"** %this.addr, align 8 + %0 = bitcast %"struct.std::_Vector_base >::_Vector_impl"* %this1 to %"class.std::allocator"* + call void @_ZNSaIP5INodeED2Ev(%"class.std::allocator"* %0) #3 + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx14__alloc_traitsISaIP5INodeEE10deallocateERS3_PS2_m(%"class.std::allocator"* dereferenceable(1) %__a, %class.INode** %__p, i64 %__n) #0 comdat align 2 { +entry: + %__a.addr = alloca %"class.std::allocator"*, align 8 
+ %__p.addr = alloca %class.INode**, align 8 + %__n.addr = alloca i64, align 8 + store %"class.std::allocator"* %__a, %"class.std::allocator"** %__a.addr, align 8 + store %class.INode** %__p, %class.INode*** %__p.addr, align 8 + store i64 %__n, i64* %__n.addr, align 8 + %0 = load %"class.std::allocator"*, %"class.std::allocator"** %__a.addr, align 8 + %1 = bitcast %"class.std::allocator"* %0 to %"class.__gnu_cxx::new_allocator"* + %2 = load %class.INode**, %class.INode*** %__p.addr, align 8 + %3 = load i64, i64* %__n.addr, align 8 + call void @_ZN9__gnu_cxx13new_allocatorIP5INodeE10deallocateEPS2_m(%"class.__gnu_cxx::new_allocator"* %1, %class.INode** %2, i64 %3) + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx13new_allocatorIP5INodeE10deallocateEPS2_m(%"class.__gnu_cxx::new_allocator"* %this, %class.INode** %__p, i64 %0) #6 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::new_allocator"*, align 8 + %__p.addr = alloca %class.INode**, align 8 + %.addr = alloca i64, align 8 + store %"class.__gnu_cxx::new_allocator"* %this, %"class.__gnu_cxx::new_allocator"** %this.addr, align 8 + store %class.INode** %__p, %class.INode*** %__p.addr, align 8 + store i64 %0, i64* %.addr, align 8 + %this1 = load %"class.__gnu_cxx::new_allocator"*, %"class.__gnu_cxx::new_allocator"** %this.addr, align 8 + %1 = load %class.INode**, %class.INode*** %__p.addr, align 8 + %2 = bitcast %class.INode** %1 to i8* + call void @_ZdlPv(i8* %2) #3 + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZNSaIP5INodeED2Ev(%"class.std::allocator"* %this) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %"class.std::allocator"*, align 8 + store %"class.std::allocator"* %this, %"class.std::allocator"** %this.addr, align 8 + %this1 = load %"class.std::allocator"*, %"class.std::allocator"** %this.addr, align 8 + %0 = bitcast %"class.std::allocator"* %this1 
to %"class.__gnu_cxx::new_allocator"* + call void @_ZN9__gnu_cxx13new_allocatorIP5INodeED2Ev(%"class.__gnu_cxx::new_allocator"* %0) #3 + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx13new_allocatorIP5INodeED2Ev(%"class.__gnu_cxx::new_allocator"* %this) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::new_allocator"*, align 8 + store %"class.__gnu_cxx::new_allocator"* %this, %"class.__gnu_cxx::new_allocator"** %this.addr, align 8 + %this1 = load %"class.__gnu_cxx::new_allocator"*, %"class.__gnu_cxx::new_allocator"** %this.addr, align 8 + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt6vectorIP5INodeSaIS1_EEC2ERKS3_(%"class.std::vector"* %this, %"class.std::vector"* dereferenceable(24) %__x) unnamed_addr #0 comdat align 2 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %this.addr = alloca %"class.std::vector"*, align 8 + %__x.addr = alloca %"class.std::vector"*, align 8 + %agg.tmp = alloca %"class.__gnu_cxx::__normal_iterator.10", align 8 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + %agg.tmp5 = alloca %"class.__gnu_cxx::__normal_iterator.10", align 8 + store %"class.std::vector"* %this, %"class.std::vector"** %this.addr, align 8 + store %"class.std::vector"* %__x, %"class.std::vector"** %__x.addr, align 8 + %this1 = load %"class.std::vector"*, %"class.std::vector"** %this.addr, align 8 + %0 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %1 = load %"class.std::vector"*, %"class.std::vector"** %__x.addr, align 8 + %call = call i64 @_ZNKSt6vectorIP5INodeSaIS1_EE4sizeEv(%"class.std::vector"* %1) + %2 = load %"class.std::vector"*, %"class.std::vector"** %__x.addr, align 8 + %3 = bitcast %"class.std::vector"* %2 to %"struct.std::_Vector_base"* + %call2 = call dereferenceable(1) %"class.std::allocator"* 
@_ZNKSt12_Vector_baseIP5INodeSaIS1_EE19_M_get_Tp_allocatorEv(%"struct.std::_Vector_base"* %3) + %call3 = call dereferenceable(1) %"class.std::allocator"* @_ZN9__gnu_cxx14__alloc_traitsISaIP5INodeEE17_S_select_on_copyERKS3_(%"class.std::allocator"* dereferenceable(1) %call2) + call void @_ZNSt12_Vector_baseIP5INodeSaIS1_EEC2EmRKS2_(%"struct.std::_Vector_base"* %0, i64 %call, %"class.std::allocator"* dereferenceable(1) %call3) + %4 = load %"class.std::vector"*, %"class.std::vector"** %__x.addr, align 8 + %call4 = invoke %class.INode** @_ZNKSt6vectorIP5INodeSaIS1_EE5beginEv(%"class.std::vector"* %4) + to label %invoke.cont unwind label %lpad + +invoke.cont: ; preds = %entry + %coerce.dive = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.10", %"class.__gnu_cxx::__normal_iterator.10"* %agg.tmp, i32 0, i32 0 + store %class.INode** %call4, %class.INode*** %coerce.dive, align 8 + %5 = load %"class.std::vector"*, %"class.std::vector"** %__x.addr, align 8 + %call7 = invoke %class.INode** @_ZNKSt6vectorIP5INodeSaIS1_EE3endEv(%"class.std::vector"* %5) + to label %invoke.cont6 unwind label %lpad + +invoke.cont6: ; preds = %invoke.cont + %coerce.dive8 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.10", %"class.__gnu_cxx::__normal_iterator.10"* %agg.tmp5, i32 0, i32 0 + store %class.INode** %call7, %class.INode*** %coerce.dive8, align 8 + %6 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %6, i32 0, i32 0 + %_M_start = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl, i32 0, i32 0 + %7 = load %class.INode**, %class.INode*** %_M_start, align 8 + %8 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %call10 = invoke dereferenceable(1) %"class.std::allocator"* @_ZNSt12_Vector_baseIP5INodeSaIS1_EE19_M_get_Tp_allocatorEv(%"struct.std::_Vector_base"* 
%8) + to label %invoke.cont9 unwind label %lpad + +invoke.cont9: ; preds = %invoke.cont6 + %coerce.dive11 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.10", %"class.__gnu_cxx::__normal_iterator.10"* %agg.tmp, i32 0, i32 0 + %9 = load %class.INode**, %class.INode*** %coerce.dive11, align 8 + %coerce.dive12 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.10", %"class.__gnu_cxx::__normal_iterator.10"* %agg.tmp5, i32 0, i32 0 + %10 = load %class.INode**, %class.INode*** %coerce.dive12, align 8 + %call14 = invoke %class.INode** @_ZSt22__uninitialized_copy_aIN9__gnu_cxx17__normal_iteratorIPKP5INodeSt6vectorIS3_SaIS3_EEEEPS3_S3_ET0_T_SC_SB_RSaIT1_E(%class.INode** %9, %class.INode** %10, %class.INode** %7, %"class.std::allocator"* dereferenceable(1) %call10) + to label %invoke.cont13 unwind label %lpad + +invoke.cont13: ; preds = %invoke.cont9 + %11 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl15 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %11, i32 0, i32 0 + %_M_finish = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl15, i32 0, i32 1 + store %class.INode** %call14, %class.INode*** %_M_finish, align 8 + ret void + +lpad: ; preds = %invoke.cont9, %invoke.cont6, %invoke.cont, %entry + %12 = landingpad { i8*, i32 } + cleanup + %13 = extractvalue { i8*, i32 } %12, 0 + store i8* %13, i8** %exn.slot, align 8 + %14 = extractvalue { i8*, i32 } %12, 1 + store i32 %14, i32* %ehselector.slot, align 4 + %15 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + invoke void @_ZNSt12_Vector_baseIP5INodeSaIS1_EED2Ev(%"struct.std::_Vector_base"* %15) + to label %invoke.cont16 unwind label %terminate.lpad + +invoke.cont16: ; preds = %lpad + br label %eh.resume + +eh.resume: ; preds = %invoke.cont16 + %exn = load i8*, i8** %exn.slot, align 8 + %sel = load i32, i32* %ehselector.slot, align 4 + 
%lpad.val = insertvalue { i8*, i32 } undef, i8* %exn, 0 + %lpad.val17 = insertvalue { i8*, i32 } %lpad.val, i32 %sel, 1 + resume { i8*, i32 } %lpad.val17 + +terminate.lpad: ; preds = %lpad + %16 = landingpad { i8*, i32 } + catch i8* null + %17 = extractvalue { i8*, i32 } %16, 0 + call void @__clang_call_terminate(i8* %17) #16 + unreachable +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZSt9make_heapIN9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS3_SaIS3_EEEE7NodeCmpEvT_SA_T0_(%class.INode** %__first.coerce, %class.INode** %__last.coerce) #0 comdat { +entry: + %__first = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %__last = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %__comp = alloca %struct.NodeCmp, align 1 + %__cmp = alloca %"struct.__gnu_cxx::__ops::_Iter_comp_iter", align 1 + %agg.tmp = alloca %struct.NodeCmp, align 1 + %agg.tmp2 = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %agg.tmp3 = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %coerce.dive = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %__first, i32 0, i32 0 + store %class.INode** %__first.coerce, %class.INode*** %coerce.dive, align 8 + %coerce.dive1 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %__last, i32 0, i32 0 + store %class.INode** %__last.coerce, %class.INode*** %coerce.dive1, align 8 + call void @_ZN9__gnu_cxx5__ops15_Iter_comp_iterI7NodeCmpEC2ES2_(%"struct.__gnu_cxx::__ops::_Iter_comp_iter"* %__cmp) + %0 = bitcast %"class.__gnu_cxx::__normal_iterator"* %agg.tmp2 to i8* + %1 = bitcast %"class.__gnu_cxx::__normal_iterator"* %__first to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %0, i8* align 8 %1, i64 8, i1 false) + %2 = bitcast %"class.__gnu_cxx::__normal_iterator"* %agg.tmp3 to i8* + %3 = bitcast %"class.__gnu_cxx::__normal_iterator"* %__last to i8* + call void 
@llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %2, i8* align 8 %3, i64 8, i1 false) + %coerce.dive4 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %agg.tmp2, i32 0, i32 0 + %4 = load %class.INode**, %class.INode*** %coerce.dive4, align 8 + %coerce.dive5 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %agg.tmp3, i32 0, i32 0 + %5 = load %class.INode**, %class.INode*** %coerce.dive5, align 8 + call void @_ZSt11__make_heapIN9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS3_SaIS3_EEEENS0_5__ops15_Iter_comp_iterI7NodeCmpEEEvT_SD_RT0_(%class.INode** %4, %class.INode** %5, %"struct.__gnu_cxx::__ops::_Iter_comp_iter"* dereferenceable(1) %__cmp) + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %class.INode** @_ZNSt6vectorIP5INodeSaIS1_EE5beginEv(%"class.std::vector"* %this) #0 comdat align 2 { +entry: + %retval = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %this.addr = alloca %"class.std::vector"*, align 8 + store %"class.std::vector"* %this, %"class.std::vector"** %this.addr, align 8 + %this1 = load %"class.std::vector"*, %"class.std::vector"** %this.addr, align 8 + %0 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %0, i32 0, i32 0 + %_M_start = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl, i32 0, i32 0 + call void @_ZN9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEEC2ERKS3_(%"class.__gnu_cxx::__normal_iterator"* %retval, %class.INode*** dereferenceable(8) %_M_start) + %coerce.dive = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %retval, i32 0, i32 0 + %1 = load %class.INode**, %class.INode*** %coerce.dive, align 8 + ret %class.INode** %1 +} + +; Function 
Attrs: noinline optnone uwtable +define linkonce_odr dso_local %class.INode** @_ZNSt6vectorIP5INodeSaIS1_EE3endEv(%"class.std::vector"* %this) #0 comdat align 2 { +entry: + %retval = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %this.addr = alloca %"class.std::vector"*, align 8 + store %"class.std::vector"* %this, %"class.std::vector"** %this.addr, align 8 + %this1 = load %"class.std::vector"*, %"class.std::vector"** %this.addr, align 8 + %0 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %0, i32 0, i32 0 + %_M_finish = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl, i32 0, i32 1 + call void @_ZN9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEEC2ERKS3_(%"class.__gnu_cxx::__normal_iterator"* %retval, %class.INode*** dereferenceable(8) %_M_finish) + %coerce.dive = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %retval, i32 0, i32 0 + %1 = load %class.INode**, %class.INode*** %coerce.dive, align 8 + ret %class.INode** %1 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local i64 @_ZNKSt6vectorIP5INodeSaIS1_EE4sizeEv(%"class.std::vector"* %this) #6 comdat align 2 { +entry: + %this.addr = alloca %"class.std::vector"*, align 8 + store %"class.std::vector"* %this, %"class.std::vector"** %this.addr, align 8 + %this1 = load %"class.std::vector"*, %"class.std::vector"** %this.addr, align 8 + %0 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %0, i32 0, i32 0 + %_M_finish = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl, i32 0, i32 1 + %1 = load %class.INode**, %class.INode*** %_M_finish, 
align 8 + %2 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl2 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %2, i32 0, i32 0 + %_M_start = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl2, i32 0, i32 0 + %3 = load %class.INode**, %class.INode*** %_M_start, align 8 + %sub.ptr.lhs.cast = ptrtoint %class.INode** %1 to i64 + %sub.ptr.rhs.cast = ptrtoint %class.INode** %3 to i64 + %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast + %sub.ptr.div = sdiv exact i64 %sub.ptr.sub, 8 + ret i64 %sub.ptr.div +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local dereferenceable(1) %"class.std::allocator"* @_ZN9__gnu_cxx14__alloc_traitsISaIP5INodeEE17_S_select_on_copyERKS3_(%"class.std::allocator"* dereferenceable(1) %__a) #6 comdat align 2 { +entry: + %__a.addr = alloca %"class.std::allocator"*, align 8 + store %"class.std::allocator"* %__a, %"class.std::allocator"** %__a.addr, align 8 + %0 = load %"class.std::allocator"*, %"class.std::allocator"** %__a.addr, align 8 + ret %"class.std::allocator"* %0 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local dereferenceable(1) %"class.std::allocator"* @_ZNKSt12_Vector_baseIP5INodeSaIS1_EE19_M_get_Tp_allocatorEv(%"struct.std::_Vector_base"* %this) #6 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Vector_base"*, align 8 + store %"struct.std::_Vector_base"* %this, %"struct.std::_Vector_base"** %this.addr, align 8 + %this1 = load %"struct.std::_Vector_base"*, %"struct.std::_Vector_base"** %this.addr, align 8 + %_M_impl = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %this1, i32 0, i32 0 + %0 = bitcast %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl to %"class.std::allocator"* + ret %"class.std::allocator"* %0 +} + +; Function Attrs: noinline optnone uwtable 
+define linkonce_odr dso_local void @_ZNSt12_Vector_baseIP5INodeSaIS1_EEC2EmRKS2_(%"struct.std::_Vector_base"* %this, i64 %__n, %"class.std::allocator"* dereferenceable(1) %__a) unnamed_addr #0 comdat align 2 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %this.addr = alloca %"struct.std::_Vector_base"*, align 8 + %__n.addr = alloca i64, align 8 + %__a.addr = alloca %"class.std::allocator"*, align 8 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + store %"struct.std::_Vector_base"* %this, %"struct.std::_Vector_base"** %this.addr, align 8 + store i64 %__n, i64* %__n.addr, align 8 + store %"class.std::allocator"* %__a, %"class.std::allocator"** %__a.addr, align 8 + %this1 = load %"struct.std::_Vector_base"*, %"struct.std::_Vector_base"** %this.addr, align 8 + %_M_impl = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %this1, i32 0, i32 0 + %0 = load %"class.std::allocator"*, %"class.std::allocator"** %__a.addr, align 8 + call void @_ZNSt12_Vector_baseIP5INodeSaIS1_EE12_Vector_implC2ERKS2_(%"struct.std::_Vector_base >::_Vector_impl"* %_M_impl, %"class.std::allocator"* dereferenceable(1) %0) + %1 = load i64, i64* %__n.addr, align 8 + invoke void @_ZNSt12_Vector_baseIP5INodeSaIS1_EE17_M_create_storageEm(%"struct.std::_Vector_base"* %this1, i64 %1) + to label %invoke.cont unwind label %lpad + +invoke.cont: ; preds = %entry + ret void + +lpad: ; preds = %entry + %2 = landingpad { i8*, i32 } + cleanup + %3 = extractvalue { i8*, i32 } %2, 0 + store i8* %3, i8** %exn.slot, align 8 + %4 = extractvalue { i8*, i32 } %2, 1 + store i32 %4, i32* %ehselector.slot, align 4 + call void @_ZNSt12_Vector_baseIP5INodeSaIS1_EE12_Vector_implD2Ev(%"struct.std::_Vector_base >::_Vector_impl"* %_M_impl) #3 + br label %eh.resume + +eh.resume: ; preds = %lpad + %exn = load i8*, i8** %exn.slot, align 8 + %sel = load i32, i32* %ehselector.slot, align 4 + %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn, 0 + %lpad.val2 = 
insertvalue { i8*, i32 } %lpad.val, i32 %sel, 1 + resume { i8*, i32 } %lpad.val2 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %class.INode** @_ZSt22__uninitialized_copy_aIN9__gnu_cxx17__normal_iteratorIPKP5INodeSt6vectorIS3_SaIS3_EEEEPS3_S3_ET0_T_SC_SB_RSaIT1_E(%class.INode** %__first.coerce, %class.INode** %__last.coerce, %class.INode** %__result, %"class.std::allocator"* dereferenceable(1) %0) #0 comdat { +entry: + %__first = alloca %"class.__gnu_cxx::__normal_iterator.10", align 8 + %__last = alloca %"class.__gnu_cxx::__normal_iterator.10", align 8 + %__result.addr = alloca %class.INode**, align 8 + %.addr = alloca %"class.std::allocator"*, align 8 + %agg.tmp = alloca %"class.__gnu_cxx::__normal_iterator.10", align 8 + %agg.tmp2 = alloca %"class.__gnu_cxx::__normal_iterator.10", align 8 + %coerce.dive = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.10", %"class.__gnu_cxx::__normal_iterator.10"* %__first, i32 0, i32 0 + store %class.INode** %__first.coerce, %class.INode*** %coerce.dive, align 8 + %coerce.dive1 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.10", %"class.__gnu_cxx::__normal_iterator.10"* %__last, i32 0, i32 0 + store %class.INode** %__last.coerce, %class.INode*** %coerce.dive1, align 8 + store %class.INode** %__result, %class.INode*** %__result.addr, align 8 + store %"class.std::allocator"* %0, %"class.std::allocator"** %.addr, align 8 + %1 = bitcast %"class.__gnu_cxx::__normal_iterator.10"* %agg.tmp to i8* + %2 = bitcast %"class.__gnu_cxx::__normal_iterator.10"* %__first to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %1, i8* align 8 %2, i64 8, i1 false) + %3 = bitcast %"class.__gnu_cxx::__normal_iterator.10"* %agg.tmp2 to i8* + %4 = bitcast %"class.__gnu_cxx::__normal_iterator.10"* %__last to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %3, i8* align 8 %4, i64 8, i1 false) + %5 = load %class.INode**, %class.INode*** %__result.addr, align 8 + %coerce.dive3 = 
getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.10", %"class.__gnu_cxx::__normal_iterator.10"* %agg.tmp, i32 0, i32 0 + %6 = load %class.INode**, %class.INode*** %coerce.dive3, align 8 + %coerce.dive4 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.10", %"class.__gnu_cxx::__normal_iterator.10"* %agg.tmp2, i32 0, i32 0 + %7 = load %class.INode**, %class.INode*** %coerce.dive4, align 8 + %call = call %class.INode** @_ZSt18uninitialized_copyIN9__gnu_cxx17__normal_iteratorIPKP5INodeSt6vectorIS3_SaIS3_EEEEPS3_ET0_T_SC_SB_(%class.INode** %6, %class.INode** %7, %class.INode** %5) + ret %class.INode** %call +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %class.INode** @_ZNKSt6vectorIP5INodeSaIS1_EE5beginEv(%"class.std::vector"* %this) #0 comdat align 2 { +entry: + %retval = alloca %"class.__gnu_cxx::__normal_iterator.10", align 8 + %this.addr = alloca %"class.std::vector"*, align 8 + store %"class.std::vector"* %this, %"class.std::vector"** %this.addr, align 8 + %this1 = load %"class.std::vector"*, %"class.std::vector"** %this.addr, align 8 + %0 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %0, i32 0, i32 0 + %_M_start = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl, i32 0, i32 0 + call void @_ZN9__gnu_cxx17__normal_iteratorIPKP5INodeSt6vectorIS2_SaIS2_EEEC2ERKS4_(%"class.__gnu_cxx::__normal_iterator.10"* %retval, %class.INode*** dereferenceable(8) %_M_start) + %coerce.dive = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.10", %"class.__gnu_cxx::__normal_iterator.10"* %retval, i32 0, i32 0 + %1 = load %class.INode**, %class.INode*** %coerce.dive, align 8 + ret %class.INode** %1 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %class.INode** 
@_ZNKSt6vectorIP5INodeSaIS1_EE3endEv(%"class.std::vector"* %this) #0 comdat align 2 { +entry: + %retval = alloca %"class.__gnu_cxx::__normal_iterator.10", align 8 + %this.addr = alloca %"class.std::vector"*, align 8 + store %"class.std::vector"* %this, %"class.std::vector"** %this.addr, align 8 + %this1 = load %"class.std::vector"*, %"class.std::vector"** %this.addr, align 8 + %0 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %0, i32 0, i32 0 + %_M_finish = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl, i32 0, i32 1 + call void @_ZN9__gnu_cxx17__normal_iteratorIPKP5INodeSt6vectorIS2_SaIS2_EEEC2ERKS4_(%"class.__gnu_cxx::__normal_iterator.10"* %retval, %class.INode*** dereferenceable(8) %_M_finish) + %coerce.dive = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.10", %"class.__gnu_cxx::__normal_iterator.10"* %retval, i32 0, i32 0 + %1 = load %class.INode**, %class.INode*** %coerce.dive, align 8 + ret %class.INode** %1 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZNSt12_Vector_baseIP5INodeSaIS1_EE12_Vector_implC2ERKS2_(%"struct.std::_Vector_base >::_Vector_impl"* %this, %"class.std::allocator"* dereferenceable(1) %__a) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Vector_base >::_Vector_impl"*, align 8 + %__a.addr = alloca %"class.std::allocator"*, align 8 + store %"struct.std::_Vector_base >::_Vector_impl"* %this, %"struct.std::_Vector_base >::_Vector_impl"** %this.addr, align 8 + store %"class.std::allocator"* %__a, %"class.std::allocator"** %__a.addr, align 8 + %this1 = load %"struct.std::_Vector_base >::_Vector_impl"*, %"struct.std::_Vector_base >::_Vector_impl"** %this.addr, align 8 + %0 = bitcast %"struct.std::_Vector_base >::_Vector_impl"* %this1 to %"class.std::allocator"* + 
%1 = load %"class.std::allocator"*, %"class.std::allocator"** %__a.addr, align 8 + call void @_ZNSaIP5INodeEC2ERKS1_(%"class.std::allocator"* %0, %"class.std::allocator"* dereferenceable(1) %1) #3 + %_M_start = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %this1, i32 0, i32 0 + store %class.INode** null, %class.INode*** %_M_start, align 8 + %_M_finish = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %this1, i32 0, i32 1 + store %class.INode** null, %class.INode*** %_M_finish, align 8 + %_M_end_of_storage = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %this1, i32 0, i32 2 + store %class.INode** null, %class.INode*** %_M_end_of_storage, align 8 + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt12_Vector_baseIP5INodeSaIS1_EE17_M_create_storageEm(%"struct.std::_Vector_base"* %this, i64 %__n) #0 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Vector_base"*, align 8 + %__n.addr = alloca i64, align 8 + store %"struct.std::_Vector_base"* %this, %"struct.std::_Vector_base"** %this.addr, align 8 + store i64 %__n, i64* %__n.addr, align 8 + %this1 = load %"struct.std::_Vector_base"*, %"struct.std::_Vector_base"** %this.addr, align 8 + %0 = load i64, i64* %__n.addr, align 8 + %call = call %class.INode** @_ZNSt12_Vector_baseIP5INodeSaIS1_EE11_M_allocateEm(%"struct.std::_Vector_base"* %this1, i64 %0) + %_M_impl = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %this1, i32 0, i32 0 + %_M_start = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl, i32 0, i32 0 + store %class.INode** %call, %class.INode*** %_M_start, align 8 + %_M_impl2 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* 
%this1, i32 0, i32 0 + %_M_start3 = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl2, i32 0, i32 0 + %1 = load %class.INode**, %class.INode*** %_M_start3, align 8 + %_M_impl4 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %this1, i32 0, i32 0 + %_M_finish = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl4, i32 0, i32 1 + store %class.INode** %1, %class.INode*** %_M_finish, align 8 + %_M_impl5 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %this1, i32 0, i32 0 + %_M_start6 = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl5, i32 0, i32 0 + %2 = load %class.INode**, %class.INode*** %_M_start6, align 8 + %3 = load i64, i64* %__n.addr, align 8 + %add.ptr = getelementptr inbounds %class.INode*, %class.INode** %2, i64 %3 + %_M_impl7 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %this1, i32 0, i32 0 + %_M_end_of_storage = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl7, i32 0, i32 2 + store %class.INode** %add.ptr, %class.INode*** %_M_end_of_storage, align 8 + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZNSaIP5INodeEC2ERKS1_(%"class.std::allocator"* %this, %"class.std::allocator"* dereferenceable(1) %__a) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %"class.std::allocator"*, align 8 + %__a.addr = alloca %"class.std::allocator"*, align 8 + store %"class.std::allocator"* %this, %"class.std::allocator"** %this.addr, align 8 + store %"class.std::allocator"* %__a, %"class.std::allocator"** %__a.addr, align 8 + %this1 = load %"class.std::allocator"*, %"class.std::allocator"** %this.addr, align 8 + %0 = 
bitcast %"class.std::allocator"* %this1 to %"class.__gnu_cxx::new_allocator"* + %1 = load %"class.std::allocator"*, %"class.std::allocator"** %__a.addr, align 8 + %2 = bitcast %"class.std::allocator"* %1 to %"class.__gnu_cxx::new_allocator"* + call void @_ZN9__gnu_cxx13new_allocatorIP5INodeEC2ERKS3_(%"class.__gnu_cxx::new_allocator"* %0, %"class.__gnu_cxx::new_allocator"* dereferenceable(1) %2) #3 + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx13new_allocatorIP5INodeEC2ERKS3_(%"class.__gnu_cxx::new_allocator"* %this, %"class.__gnu_cxx::new_allocator"* dereferenceable(1) %0) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::new_allocator"*, align 8 + %.addr = alloca %"class.__gnu_cxx::new_allocator"*, align 8 + store %"class.__gnu_cxx::new_allocator"* %this, %"class.__gnu_cxx::new_allocator"** %this.addr, align 8 + store %"class.__gnu_cxx::new_allocator"* %0, %"class.__gnu_cxx::new_allocator"** %.addr, align 8 + %this1 = load %"class.__gnu_cxx::new_allocator"*, %"class.__gnu_cxx::new_allocator"** %this.addr, align 8 + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %class.INode** @_ZNSt12_Vector_baseIP5INodeSaIS1_EE11_M_allocateEm(%"struct.std::_Vector_base"* %this, i64 %__n) #0 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Vector_base"*, align 8 + %__n.addr = alloca i64, align 8 + store %"struct.std::_Vector_base"* %this, %"struct.std::_Vector_base"** %this.addr, align 8 + store i64 %__n, i64* %__n.addr, align 8 + %this1 = load %"struct.std::_Vector_base"*, %"struct.std::_Vector_base"** %this.addr, align 8 + %0 = load i64, i64* %__n.addr, align 8 + %cmp = icmp ne i64 %0, 0 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + %_M_impl = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %this1, i32 0, i32 0 + %1 = bitcast %"struct.std::_Vector_base 
>::_Vector_impl"* %_M_impl to %"class.std::allocator"* + %2 = load i64, i64* %__n.addr, align 8 + %call = call %class.INode** @_ZN9__gnu_cxx14__alloc_traitsISaIP5INodeEE8allocateERS3_m(%"class.std::allocator"* dereferenceable(1) %1, i64 %2) + br label %cond.end + +cond.false: ; preds = %entry + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi %class.INode** [ %call, %cond.true ], [ null, %cond.false ] + ret %class.INode** %cond +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %class.INode** @_ZN9__gnu_cxx14__alloc_traitsISaIP5INodeEE8allocateERS3_m(%"class.std::allocator"* dereferenceable(1) %__a, i64 %__n) #0 comdat align 2 { +entry: + %__a.addr = alloca %"class.std::allocator"*, align 8 + %__n.addr = alloca i64, align 8 + store %"class.std::allocator"* %__a, %"class.std::allocator"** %__a.addr, align 8 + store i64 %__n, i64* %__n.addr, align 8 + %0 = load %"class.std::allocator"*, %"class.std::allocator"** %__a.addr, align 8 + %1 = bitcast %"class.std::allocator"* %0 to %"class.__gnu_cxx::new_allocator"* + %2 = load i64, i64* %__n.addr, align 8 + %call = call %class.INode** @_ZN9__gnu_cxx13new_allocatorIP5INodeE8allocateEmPKv(%"class.__gnu_cxx::new_allocator"* %1, i64 %2, i8* null) + ret %class.INode** %call +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %class.INode** @_ZN9__gnu_cxx13new_allocatorIP5INodeE8allocateEmPKv(%"class.__gnu_cxx::new_allocator"* %this, i64 %__n, i8* %0) #0 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::new_allocator"*, align 8 + %__n.addr = alloca i64, align 8 + %.addr = alloca i8*, align 8 + store %"class.__gnu_cxx::new_allocator"* %this, %"class.__gnu_cxx::new_allocator"** %this.addr, align 8 + store i64 %__n, i64* %__n.addr, align 8 + store i8* %0, i8** %.addr, align 8 + %this1 = load %"class.__gnu_cxx::new_allocator"*, %"class.__gnu_cxx::new_allocator"** %this.addr, align 8 + %1 = load i64, i64* %__n.addr, align 8 + 
%call = call i64 @_ZNK9__gnu_cxx13new_allocatorIP5INodeE8max_sizeEv(%"class.__gnu_cxx::new_allocator"* %this1) #3 + %cmp = icmp ugt i64 %1, %call + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + call void @_ZSt17__throw_bad_allocv() #19 + unreachable + +if.end: ; preds = %entry + %2 = load i64, i64* %__n.addr, align 8 + %mul = mul i64 %2, 8 + %call2 = call i8* @_Znwm(i64 %mul) + %3 = bitcast i8* %call2 to %class.INode** + ret %class.INode** %3 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local i64 @_ZNK9__gnu_cxx13new_allocatorIP5INodeE8max_sizeEv(%"class.__gnu_cxx::new_allocator"* %this) #6 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::new_allocator"*, align 8 + store %"class.__gnu_cxx::new_allocator"* %this, %"class.__gnu_cxx::new_allocator"** %this.addr, align 8 + %this1 = load %"class.__gnu_cxx::new_allocator"*, %"class.__gnu_cxx::new_allocator"** %this.addr, align 8 + ret i64 2305843009213693951 +} + +; Function Attrs: noreturn +declare dso_local void @_ZSt17__throw_bad_allocv() #15 + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %class.INode** @_ZSt18uninitialized_copyIN9__gnu_cxx17__normal_iteratorIPKP5INodeSt6vectorIS3_SaIS3_EEEEPS3_ET0_T_SC_SB_(%class.INode** %__first.coerce, %class.INode** %__last.coerce, %class.INode** %__result) #0 comdat { +entry: + %__first = alloca %"class.__gnu_cxx::__normal_iterator.10", align 8 + %__last = alloca %"class.__gnu_cxx::__normal_iterator.10", align 8 + %__result.addr = alloca %class.INode**, align 8 + %__assignable = alloca i8, align 1 + %agg.tmp = alloca %"class.__gnu_cxx::__normal_iterator.10", align 8 + %agg.tmp2 = alloca %"class.__gnu_cxx::__normal_iterator.10", align 8 + %coerce.dive = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.10", %"class.__gnu_cxx::__normal_iterator.10"* %__first, i32 0, i32 0 + store %class.INode** %__first.coerce, %class.INode*** %coerce.dive, align 8 + 
%coerce.dive1 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.10", %"class.__gnu_cxx::__normal_iterator.10"* %__last, i32 0, i32 0 + store %class.INode** %__last.coerce, %class.INode*** %coerce.dive1, align 8 + store %class.INode** %__result, %class.INode*** %__result.addr, align 8 + store i8 1, i8* %__assignable, align 1 + %0 = bitcast %"class.__gnu_cxx::__normal_iterator.10"* %agg.tmp to i8* + %1 = bitcast %"class.__gnu_cxx::__normal_iterator.10"* %__first to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %0, i8* align 8 %1, i64 8, i1 false) + %2 = bitcast %"class.__gnu_cxx::__normal_iterator.10"* %agg.tmp2 to i8* + %3 = bitcast %"class.__gnu_cxx::__normal_iterator.10"* %__last to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %2, i8* align 8 %3, i64 8, i1 false) + %4 = load %class.INode**, %class.INode*** %__result.addr, align 8 + %coerce.dive3 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.10", %"class.__gnu_cxx::__normal_iterator.10"* %agg.tmp, i32 0, i32 0 + %5 = load %class.INode**, %class.INode*** %coerce.dive3, align 8 + %coerce.dive4 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.10", %"class.__gnu_cxx::__normal_iterator.10"* %agg.tmp2, i32 0, i32 0 + %6 = load %class.INode**, %class.INode*** %coerce.dive4, align 8 + %call = call %class.INode** @_ZNSt20__uninitialized_copyILb1EE13__uninit_copyIN9__gnu_cxx17__normal_iteratorIPKP5INodeSt6vectorIS5_SaIS5_EEEEPS5_EET0_T_SE_SD_(%class.INode** %5, %class.INode** %6, %class.INode** %4) + ret %class.INode** %call +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %class.INode** @_ZNSt20__uninitialized_copyILb1EE13__uninit_copyIN9__gnu_cxx17__normal_iteratorIPKP5INodeSt6vectorIS5_SaIS5_EEEEPS5_EET0_T_SE_SD_(%class.INode** %__first.coerce, %class.INode** %__last.coerce, %class.INode** %__result) #0 comdat align 2 { +entry: + %__first = alloca %"class.__gnu_cxx::__normal_iterator.10", align 8 + %__last = alloca 
%"class.__gnu_cxx::__normal_iterator.10", align 8 + %__result.addr = alloca %class.INode**, align 8 + %agg.tmp = alloca %"class.__gnu_cxx::__normal_iterator.10", align 8 + %agg.tmp2 = alloca %"class.__gnu_cxx::__normal_iterator.10", align 8 + %coerce.dive = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.10", %"class.__gnu_cxx::__normal_iterator.10"* %__first, i32 0, i32 0 + store %class.INode** %__first.coerce, %class.INode*** %coerce.dive, align 8 + %coerce.dive1 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.10", %"class.__gnu_cxx::__normal_iterator.10"* %__last, i32 0, i32 0 + store %class.INode** %__last.coerce, %class.INode*** %coerce.dive1, align 8 + store %class.INode** %__result, %class.INode*** %__result.addr, align 8 + %0 = bitcast %"class.__gnu_cxx::__normal_iterator.10"* %agg.tmp to i8* + %1 = bitcast %"class.__gnu_cxx::__normal_iterator.10"* %__first to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %0, i8* align 8 %1, i64 8, i1 false) + %2 = bitcast %"class.__gnu_cxx::__normal_iterator.10"* %agg.tmp2 to i8* + %3 = bitcast %"class.__gnu_cxx::__normal_iterator.10"* %__last to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %2, i8* align 8 %3, i64 8, i1 false) + %4 = load %class.INode**, %class.INode*** %__result.addr, align 8 + %coerce.dive3 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.10", %"class.__gnu_cxx::__normal_iterator.10"* %agg.tmp, i32 0, i32 0 + %5 = load %class.INode**, %class.INode*** %coerce.dive3, align 8 + %coerce.dive4 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.10", %"class.__gnu_cxx::__normal_iterator.10"* %agg.tmp2, i32 0, i32 0 + %6 = load %class.INode**, %class.INode*** %coerce.dive4, align 8 + %call = call %class.INode** @_ZSt4copyIN9__gnu_cxx17__normal_iteratorIPKP5INodeSt6vectorIS3_SaIS3_EEEEPS3_ET0_T_SC_SB_(%class.INode** %5, %class.INode** %6, %class.INode** %4) + ret %class.INode** %call +} + +; Function Attrs: noinline optnone uwtable +define 
linkonce_odr dso_local %class.INode** @_ZSt4copyIN9__gnu_cxx17__normal_iteratorIPKP5INodeSt6vectorIS3_SaIS3_EEEEPS3_ET0_T_SC_SB_(%class.INode** %__first.coerce, %class.INode** %__last.coerce, %class.INode** %__result) #0 comdat { +entry: + %__first = alloca %"class.__gnu_cxx::__normal_iterator.10", align 8 + %__last = alloca %"class.__gnu_cxx::__normal_iterator.10", align 8 + %__result.addr = alloca %class.INode**, align 8 + %agg.tmp = alloca %"class.__gnu_cxx::__normal_iterator.10", align 8 + %agg.tmp2 = alloca %"class.__gnu_cxx::__normal_iterator.10", align 8 + %agg.tmp5 = alloca %"class.__gnu_cxx::__normal_iterator.10", align 8 + %agg.tmp6 = alloca %"class.__gnu_cxx::__normal_iterator.10", align 8 + %coerce.dive = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.10", %"class.__gnu_cxx::__normal_iterator.10"* %__first, i32 0, i32 0 + store %class.INode** %__first.coerce, %class.INode*** %coerce.dive, align 8 + %coerce.dive1 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.10", %"class.__gnu_cxx::__normal_iterator.10"* %__last, i32 0, i32 0 + store %class.INode** %__last.coerce, %class.INode*** %coerce.dive1, align 8 + store %class.INode** %__result, %class.INode*** %__result.addr, align 8 + %0 = bitcast %"class.__gnu_cxx::__normal_iterator.10"* %agg.tmp2 to i8* + %1 = bitcast %"class.__gnu_cxx::__normal_iterator.10"* %__first to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %0, i8* align 8 %1, i64 8, i1 false) + %coerce.dive3 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.10", %"class.__gnu_cxx::__normal_iterator.10"* %agg.tmp2, i32 0, i32 0 + %2 = load %class.INode**, %class.INode*** %coerce.dive3, align 8 + %call = call %class.INode** @_ZSt12__miter_baseIN9__gnu_cxx17__normal_iteratorIPKP5INodeSt6vectorIS3_SaIS3_EEEEET_SA_(%class.INode** %2) + %coerce.dive4 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.10", %"class.__gnu_cxx::__normal_iterator.10"* %agg.tmp, i32 0, i32 0 + store %class.INode** 
%call, %class.INode*** %coerce.dive4, align 8 + %3 = bitcast %"class.__gnu_cxx::__normal_iterator.10"* %agg.tmp6 to i8* + %4 = bitcast %"class.__gnu_cxx::__normal_iterator.10"* %__last to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %3, i8* align 8 %4, i64 8, i1 false) + %coerce.dive7 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.10", %"class.__gnu_cxx::__normal_iterator.10"* %agg.tmp6, i32 0, i32 0 + %5 = load %class.INode**, %class.INode*** %coerce.dive7, align 8 + %call8 = call %class.INode** @_ZSt12__miter_baseIN9__gnu_cxx17__normal_iteratorIPKP5INodeSt6vectorIS3_SaIS3_EEEEET_SA_(%class.INode** %5) + %coerce.dive9 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.10", %"class.__gnu_cxx::__normal_iterator.10"* %agg.tmp5, i32 0, i32 0 + store %class.INode** %call8, %class.INode*** %coerce.dive9, align 8 + %6 = load %class.INode**, %class.INode*** %__result.addr, align 8 + %coerce.dive10 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.10", %"class.__gnu_cxx::__normal_iterator.10"* %agg.tmp, i32 0, i32 0 + %7 = load %class.INode**, %class.INode*** %coerce.dive10, align 8 + %coerce.dive11 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.10", %"class.__gnu_cxx::__normal_iterator.10"* %agg.tmp5, i32 0, i32 0 + %8 = load %class.INode**, %class.INode*** %coerce.dive11, align 8 + %call12 = call %class.INode** @_ZSt14__copy_move_a2ILb0EN9__gnu_cxx17__normal_iteratorIPKP5INodeSt6vectorIS3_SaIS3_EEEEPS3_ET1_T0_SC_SB_(%class.INode** %7, %class.INode** %8, %class.INode** %6) + ret %class.INode** %call12 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %class.INode** @_ZSt14__copy_move_a2ILb0EN9__gnu_cxx17__normal_iteratorIPKP5INodeSt6vectorIS3_SaIS3_EEEEPS3_ET1_T0_SC_SB_(%class.INode** %__first.coerce, %class.INode** %__last.coerce, %class.INode** %__result) #0 comdat { +entry: + %__first = alloca %"class.__gnu_cxx::__normal_iterator.10", align 8 + %__last = alloca 
%"class.__gnu_cxx::__normal_iterator.10", align 8 + %__result.addr = alloca %class.INode**, align 8 + %agg.tmp = alloca %"class.__gnu_cxx::__normal_iterator.10", align 8 + %agg.tmp3 = alloca %"class.__gnu_cxx::__normal_iterator.10", align 8 + %coerce.dive = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.10", %"class.__gnu_cxx::__normal_iterator.10"* %__first, i32 0, i32 0 + store %class.INode** %__first.coerce, %class.INode*** %coerce.dive, align 8 + %coerce.dive1 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.10", %"class.__gnu_cxx::__normal_iterator.10"* %__last, i32 0, i32 0 + store %class.INode** %__last.coerce, %class.INode*** %coerce.dive1, align 8 + store %class.INode** %__result, %class.INode*** %__result.addr, align 8 + %0 = bitcast %"class.__gnu_cxx::__normal_iterator.10"* %agg.tmp to i8* + %1 = bitcast %"class.__gnu_cxx::__normal_iterator.10"* %__first to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %0, i8* align 8 %1, i64 8, i1 false) + %coerce.dive2 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.10", %"class.__gnu_cxx::__normal_iterator.10"* %agg.tmp, i32 0, i32 0 + %2 = load %class.INode**, %class.INode*** %coerce.dive2, align 8 + %call = call %class.INode** @_ZSt12__niter_baseIPKP5INodeSt6vectorIS1_SaIS1_EEET_N9__gnu_cxx17__normal_iteratorIS7_T0_EE(%class.INode** %2) + %3 = bitcast %"class.__gnu_cxx::__normal_iterator.10"* %agg.tmp3 to i8* + %4 = bitcast %"class.__gnu_cxx::__normal_iterator.10"* %__last to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %3, i8* align 8 %4, i64 8, i1 false) + %coerce.dive4 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.10", %"class.__gnu_cxx::__normal_iterator.10"* %agg.tmp3, i32 0, i32 0 + %5 = load %class.INode**, %class.INode*** %coerce.dive4, align 8 + %call5 = call %class.INode** @_ZSt12__niter_baseIPKP5INodeSt6vectorIS1_SaIS1_EEET_N9__gnu_cxx17__normal_iteratorIS7_T0_EE(%class.INode** %5) + %6 = load %class.INode**, %class.INode*** 
%__result.addr, align 8 + %call6 = call %class.INode** @_ZSt12__niter_baseIPP5INodeET_S3_(%class.INode** %6) + %call7 = call %class.INode** @_ZSt13__copy_move_aILb0EPKP5INodePS1_ET1_T0_S6_S5_(%class.INode** %call, %class.INode** %call5, %class.INode** %call6) + ret %class.INode** %call7 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local %class.INode** @_ZSt12__miter_baseIN9__gnu_cxx17__normal_iteratorIPKP5INodeSt6vectorIS3_SaIS3_EEEEET_SA_(%class.INode** %__it.coerce) #6 comdat { +entry: + %retval = alloca %"class.__gnu_cxx::__normal_iterator.10", align 8 + %__it = alloca %"class.__gnu_cxx::__normal_iterator.10", align 8 + %coerce.dive = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.10", %"class.__gnu_cxx::__normal_iterator.10"* %__it, i32 0, i32 0 + store %class.INode** %__it.coerce, %class.INode*** %coerce.dive, align 8 + %0 = bitcast %"class.__gnu_cxx::__normal_iterator.10"* %retval to i8* + %1 = bitcast %"class.__gnu_cxx::__normal_iterator.10"* %__it to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %0, i8* align 8 %1, i64 8, i1 false) + %coerce.dive1 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.10", %"class.__gnu_cxx::__normal_iterator.10"* %retval, i32 0, i32 0 + %2 = load %class.INode**, %class.INode*** %coerce.dive1, align 8 + ret %class.INode** %2 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %class.INode** @_ZSt13__copy_move_aILb0EPKP5INodePS1_ET1_T0_S6_S5_(%class.INode** %__first, %class.INode** %__last, %class.INode** %__result) #0 comdat { +entry: + %__first.addr = alloca %class.INode**, align 8 + %__last.addr = alloca %class.INode**, align 8 + %__result.addr = alloca %class.INode**, align 8 + %__simple = alloca i8, align 1 + store %class.INode** %__first, %class.INode*** %__first.addr, align 8 + store %class.INode** %__last, %class.INode*** %__last.addr, align 8 + store %class.INode** %__result, %class.INode*** %__result.addr, align 8 + store 
i8 1, i8* %__simple, align 1 + %0 = load %class.INode**, %class.INode*** %__first.addr, align 8 + %1 = load %class.INode**, %class.INode*** %__last.addr, align 8 + %2 = load %class.INode**, %class.INode*** %__result.addr, align 8 + %call = call %class.INode** @_ZNSt11__copy_moveILb0ELb1ESt26random_access_iterator_tagE8__copy_mIP5INodeEEPT_PKS5_S8_S6_(%class.INode** %0, %class.INode** %1, %class.INode** %2) + ret %class.INode** %call +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %class.INode** @_ZSt12__niter_baseIPKP5INodeSt6vectorIS1_SaIS1_EEET_N9__gnu_cxx17__normal_iteratorIS7_T0_EE(%class.INode** %__it.coerce) #0 comdat { +entry: + %__it = alloca %"class.__gnu_cxx::__normal_iterator.10", align 8 + %coerce.dive = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.10", %"class.__gnu_cxx::__normal_iterator.10"* %__it, i32 0, i32 0 + store %class.INode** %__it.coerce, %class.INode*** %coerce.dive, align 8 + %call = call dereferenceable(8) %class.INode*** @_ZNK9__gnu_cxx17__normal_iteratorIPKP5INodeSt6vectorIS2_SaIS2_EEE4baseEv(%"class.__gnu_cxx::__normal_iterator.10"* %__it) + %0 = load %class.INode**, %class.INode*** %call, align 8 + ret %class.INode** %0 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local %class.INode** @_ZSt12__niter_baseIPP5INodeET_S3_(%class.INode** %__it) #6 comdat { +entry: + %__it.addr = alloca %class.INode**, align 8 + store %class.INode** %__it, %class.INode*** %__it.addr, align 8 + %0 = load %class.INode**, %class.INode*** %__it.addr, align 8 + ret %class.INode** %0 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local %class.INode** @_ZNSt11__copy_moveILb0ELb1ESt26random_access_iterator_tagE8__copy_mIP5INodeEEPT_PKS5_S8_S6_(%class.INode** %__first, %class.INode** %__last, %class.INode** %__result) #6 comdat align 2 { +entry: + %__first.addr = alloca %class.INode**, align 8 + %__last.addr = alloca %class.INode**, align 8 + 
%__result.addr = alloca %class.INode**, align 8 + %_Num = alloca i64, align 8 + store %class.INode** %__first, %class.INode*** %__first.addr, align 8 + store %class.INode** %__last, %class.INode*** %__last.addr, align 8 + store %class.INode** %__result, %class.INode*** %__result.addr, align 8 + %0 = load %class.INode**, %class.INode*** %__last.addr, align 8 + %1 = load %class.INode**, %class.INode*** %__first.addr, align 8 + %sub.ptr.lhs.cast = ptrtoint %class.INode** %0 to i64 + %sub.ptr.rhs.cast = ptrtoint %class.INode** %1 to i64 + %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast + %sub.ptr.div = sdiv exact i64 %sub.ptr.sub, 8 + store i64 %sub.ptr.div, i64* %_Num, align 8 + %2 = load i64, i64* %_Num, align 8 + %tobool = icmp ne i64 %2, 0 + br i1 %tobool, label %if.then, label %if.end + +if.then: ; preds = %entry + %3 = load %class.INode**, %class.INode*** %__result.addr, align 8 + %4 = bitcast %class.INode** %3 to i8* + %5 = load %class.INode**, %class.INode*** %__first.addr, align 8 + %6 = bitcast %class.INode** %5 to i8* + %7 = load i64, i64* %_Num, align 8 + %mul = mul i64 8, %7 + call void @llvm.memmove.p0i8.p0i8.i64(i8* align 8 %4, i8* align 8 %6, i64 %mul, i1 false) + br label %if.end + +if.end: ; preds = %if.then, %entry + %8 = load %class.INode**, %class.INode*** %__result.addr, align 8 + %9 = load i64, i64* %_Num, align 8 + %add.ptr = getelementptr inbounds %class.INode*, %class.INode** %8, i64 %9 + ret %class.INode** %add.ptr +} + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1 immarg) #4 + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local dereferenceable(8) %class.INode*** @_ZNK9__gnu_cxx17__normal_iteratorIPKP5INodeSt6vectorIS2_SaIS2_EEE4baseEv(%"class.__gnu_cxx::__normal_iterator.10"* %this) #6 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::__normal_iterator.10"*, align 8 + store 
%"class.__gnu_cxx::__normal_iterator.10"* %this, %"class.__gnu_cxx::__normal_iterator.10"** %this.addr, align 8 + %this1 = load %"class.__gnu_cxx::__normal_iterator.10"*, %"class.__gnu_cxx::__normal_iterator.10"** %this.addr, align 8 + %_M_current = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.10", %"class.__gnu_cxx::__normal_iterator.10"* %this1, i32 0, i32 0 + ret %class.INode*** %_M_current +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx17__normal_iteratorIPKP5INodeSt6vectorIS2_SaIS2_EEEC2ERKS4_(%"class.__gnu_cxx::__normal_iterator.10"* %this, %class.INode*** dereferenceable(8) %__i) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::__normal_iterator.10"*, align 8 + %__i.addr = alloca %class.INode***, align 8 + store %"class.__gnu_cxx::__normal_iterator.10"* %this, %"class.__gnu_cxx::__normal_iterator.10"** %this.addr, align 8 + store %class.INode*** %__i, %class.INode**** %__i.addr, align 8 + %this1 = load %"class.__gnu_cxx::__normal_iterator.10"*, %"class.__gnu_cxx::__normal_iterator.10"** %this.addr, align 8 + %_M_current = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.10", %"class.__gnu_cxx::__normal_iterator.10"* %this1, i32 0, i32 0 + %0 = load %class.INode***, %class.INode**** %__i.addr, align 8 + %1 = load %class.INode**, %class.INode*** %0, align 8 + store %class.INode** %1, %class.INode*** %_M_current, align 8 + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx5__ops15_Iter_comp_iterI7NodeCmpEC2ES2_(%"struct.__gnu_cxx::__ops::_Iter_comp_iter"* %this) unnamed_addr #6 comdat align 2 { +entry: + %__comp = alloca %struct.NodeCmp, align 1 + %this.addr = alloca %"struct.__gnu_cxx::__ops::_Iter_comp_iter"*, align 8 + store %"struct.__gnu_cxx::__ops::_Iter_comp_iter"* %this, %"struct.__gnu_cxx::__ops::_Iter_comp_iter"** %this.addr, align 8 + %this1 = load 
%"struct.__gnu_cxx::__ops::_Iter_comp_iter"*, %"struct.__gnu_cxx::__ops::_Iter_comp_iter"** %this.addr, align 8 + %_M_comp = getelementptr inbounds %"struct.__gnu_cxx::__ops::_Iter_comp_iter", %"struct.__gnu_cxx::__ops::_Iter_comp_iter"* %this1, i32 0, i32 0 + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZSt11__make_heapIN9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS3_SaIS3_EEEENS0_5__ops15_Iter_comp_iterI7NodeCmpEEEvT_SD_RT0_(%class.INode** %__first.coerce, %class.INode** %__last.coerce, %"struct.__gnu_cxx::__ops::_Iter_comp_iter"* dereferenceable(1) %__comp) #0 comdat { +entry: + %__first = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %__last = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %__comp.addr = alloca %"struct.__gnu_cxx::__ops::_Iter_comp_iter"*, align 8 + %__len = alloca i64, align 8 + %__parent = alloca i64, align 8 + %__value = alloca %class.INode*, align 8 + %ref.tmp = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %agg.tmp = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %agg.tmp6 = alloca %"struct.__gnu_cxx::__ops::_Iter_comp_iter", align 1 + %coerce.dive = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %__first, i32 0, i32 0 + store %class.INode** %__first.coerce, %class.INode*** %coerce.dive, align 8 + %coerce.dive1 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %__last, i32 0, i32 0 + store %class.INode** %__last.coerce, %class.INode*** %coerce.dive1, align 8 + store %"struct.__gnu_cxx::__ops::_Iter_comp_iter"* %__comp, %"struct.__gnu_cxx::__ops::_Iter_comp_iter"** %__comp.addr, align 8 + %call = call i64 @_ZN9__gnu_cxxmiIPP5INodeSt6vectorIS2_SaIS2_EEEENS_17__normal_iteratorIT_T0_E15difference_typeERKSA_SD_(%"class.__gnu_cxx::__normal_iterator"* dereferenceable(8) %__last, %"class.__gnu_cxx::__normal_iterator"* dereferenceable(8) %__first) + 
%cmp = icmp slt i64 %call, 2 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + br label %return + +if.end: ; preds = %entry + %call2 = call i64 @_ZN9__gnu_cxxmiIPP5INodeSt6vectorIS2_SaIS2_EEEENS_17__normal_iteratorIT_T0_E15difference_typeERKSA_SD_(%"class.__gnu_cxx::__normal_iterator"* dereferenceable(8) %__last, %"class.__gnu_cxx::__normal_iterator"* dereferenceable(8) %__first) + store i64 %call2, i64* %__len, align 8 + %0 = load i64, i64* %__len, align 8 + %sub = sub nsw i64 %0, 2 + %div = sdiv i64 %sub, 2 + store i64 %div, i64* %__parent, align 8 + br label %while.body + +while.body: ; preds = %if.end, %if.end10 + %1 = load i64, i64* %__parent, align 8 + %call3 = call %class.INode** @_ZNK9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEEplEl(%"class.__gnu_cxx::__normal_iterator"* %__first, i64 %1) + %coerce.dive4 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %ref.tmp, i32 0, i32 0 + store %class.INode** %call3, %class.INode*** %coerce.dive4, align 8 + %call5 = call dereferenceable(8) %class.INode** @_ZNK9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEEdeEv(%"class.__gnu_cxx::__normal_iterator"* %ref.tmp) + %2 = load %class.INode*, %class.INode** %call5, align 8 + store %class.INode* %2, %class.INode** %__value, align 8 + %3 = bitcast %"class.__gnu_cxx::__normal_iterator"* %agg.tmp to i8* + %4 = bitcast %"class.__gnu_cxx::__normal_iterator"* %__first to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %3, i8* align 8 %4, i64 8, i1 false) + %5 = load i64, i64* %__parent, align 8 + %6 = load i64, i64* %__len, align 8 + %7 = load %class.INode*, %class.INode** %__value, align 8 + %8 = load %"struct.__gnu_cxx::__ops::_Iter_comp_iter"*, %"struct.__gnu_cxx::__ops::_Iter_comp_iter"** %__comp.addr, align 8 + %9 = bitcast %"struct.__gnu_cxx::__ops::_Iter_comp_iter"* %agg.tmp6 to i8* + %10 = bitcast %"struct.__gnu_cxx::__ops::_Iter_comp_iter"* %8 to i8* + call void 
@llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %9, i8* align 1 %10, i64 1, i1 false) + %coerce.dive7 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %agg.tmp, i32 0, i32 0 + %11 = load %class.INode**, %class.INode*** %coerce.dive7, align 8 + call void @_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS3_SaIS3_EEEElS3_NS0_5__ops15_Iter_comp_iterI7NodeCmpEEEvT_T0_SE_T1_T2_(%class.INode** %11, i64 %5, i64 %6, %class.INode* %7) + %12 = load i64, i64* %__parent, align 8 + %cmp8 = icmp eq i64 %12, 0 + br i1 %cmp8, label %if.then9, label %if.end10 + +if.then9: ; preds = %while.body + br label %return + +if.end10: ; preds = %while.body + %13 = load i64, i64* %__parent, align 8 + %dec = add nsw i64 %13, -1 + store i64 %dec, i64* %__parent, align 8 + br label %while.body + +return: ; preds = %if.then9, %if.then + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local i64 @_ZN9__gnu_cxxmiIPP5INodeSt6vectorIS2_SaIS2_EEEENS_17__normal_iteratorIT_T0_E15difference_typeERKSA_SD_(%"class.__gnu_cxx::__normal_iterator"* dereferenceable(8) %__lhs, %"class.__gnu_cxx::__normal_iterator"* dereferenceable(8) %__rhs) #0 comdat { +entry: + %__lhs.addr = alloca %"class.__gnu_cxx::__normal_iterator"*, align 8 + %__rhs.addr = alloca %"class.__gnu_cxx::__normal_iterator"*, align 8 + store %"class.__gnu_cxx::__normal_iterator"* %__lhs, %"class.__gnu_cxx::__normal_iterator"** %__lhs.addr, align 8 + store %"class.__gnu_cxx::__normal_iterator"* %__rhs, %"class.__gnu_cxx::__normal_iterator"** %__rhs.addr, align 8 + %0 = load %"class.__gnu_cxx::__normal_iterator"*, %"class.__gnu_cxx::__normal_iterator"** %__lhs.addr, align 8 + %call = call dereferenceable(8) %class.INode*** @_ZNK9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEE4baseEv(%"class.__gnu_cxx::__normal_iterator"* %0) + %1 = load %class.INode**, %class.INode*** %call, align 8 + %2 = load %"class.__gnu_cxx::__normal_iterator"*, 
%"class.__gnu_cxx::__normal_iterator"** %__rhs.addr, align 8 + %call1 = call dereferenceable(8) %class.INode*** @_ZNK9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEE4baseEv(%"class.__gnu_cxx::__normal_iterator"* %2) + %3 = load %class.INode**, %class.INode*** %call1, align 8 + %sub.ptr.lhs.cast = ptrtoint %class.INode** %1 to i64 + %sub.ptr.rhs.cast = ptrtoint %class.INode** %3 to i64 + %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast + %sub.ptr.div = sdiv exact i64 %sub.ptr.sub, 8 + ret i64 %sub.ptr.div +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %class.INode** @_ZNK9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEEplEl(%"class.__gnu_cxx::__normal_iterator"* %this, i64 %__n) #0 comdat align 2 { +entry: + %retval = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %this.addr = alloca %"class.__gnu_cxx::__normal_iterator"*, align 8 + %__n.addr = alloca i64, align 8 + %ref.tmp = alloca %class.INode**, align 8 + store %"class.__gnu_cxx::__normal_iterator"* %this, %"class.__gnu_cxx::__normal_iterator"** %this.addr, align 8 + store i64 %__n, i64* %__n.addr, align 8 + %this1 = load %"class.__gnu_cxx::__normal_iterator"*, %"class.__gnu_cxx::__normal_iterator"** %this.addr, align 8 + %_M_current = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %this1, i32 0, i32 0 + %0 = load %class.INode**, %class.INode*** %_M_current, align 8 + %1 = load i64, i64* %__n.addr, align 8 + %add.ptr = getelementptr inbounds %class.INode*, %class.INode** %0, i64 %1 + store %class.INode** %add.ptr, %class.INode*** %ref.tmp, align 8 + call void @_ZN9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEEC2ERKS3_(%"class.__gnu_cxx::__normal_iterator"* %retval, %class.INode*** dereferenceable(8) %ref.tmp) + %coerce.dive = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %retval, i32 0, i32 0 + %2 = load 
%class.INode**, %class.INode*** %coerce.dive, align 8 + ret %class.INode** %2 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local dereferenceable(8) %class.INode** @_ZNK9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEEdeEv(%"class.__gnu_cxx::__normal_iterator"* %this) #6 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::__normal_iterator"*, align 8 + store %"class.__gnu_cxx::__normal_iterator"* %this, %"class.__gnu_cxx::__normal_iterator"** %this.addr, align 8 + %this1 = load %"class.__gnu_cxx::__normal_iterator"*, %"class.__gnu_cxx::__normal_iterator"** %this.addr, align 8 + %_M_current = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %this1, i32 0, i32 0 + %0 = load %class.INode**, %class.INode*** %_M_current, align 8 + ret %class.INode** %0 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS3_SaIS3_EEEElS3_NS0_5__ops15_Iter_comp_iterI7NodeCmpEEEvT_T0_SE_T1_T2_(%class.INode** %__first.coerce, i64 %__holeIndex, i64 %__len, %class.INode* %__value) #0 comdat { +entry: + %__first = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %__comp = alloca %"struct.__gnu_cxx::__ops::_Iter_comp_iter", align 1 + %__holeIndex.addr = alloca i64, align 8 + %__len.addr = alloca i64, align 8 + %__value.addr = alloca %class.INode*, align 8 + %__topIndex = alloca i64, align 8 + %__secondChild = alloca i64, align 8 + %agg.tmp = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %agg.tmp2 = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %ref.tmp = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %ref.tmp12 = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %ref.tmp23 = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %ref.tmp28 = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %__cmp = alloca 
%"struct.__gnu_cxx::__ops::_Iter_comp_val", align 1 + %agg.tmp34 = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %coerce.dive = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %__first, i32 0, i32 0 + store %class.INode** %__first.coerce, %class.INode*** %coerce.dive, align 8 + store i64 %__holeIndex, i64* %__holeIndex.addr, align 8 + store i64 %__len, i64* %__len.addr, align 8 + store %class.INode* %__value, %class.INode** %__value.addr, align 8 + %0 = load i64, i64* %__holeIndex.addr, align 8 + store i64 %0, i64* %__topIndex, align 8 + %1 = load i64, i64* %__holeIndex.addr, align 8 + store i64 %1, i64* %__secondChild, align 8 + br label %while.cond + +while.cond: ; preds = %if.end, %entry + %2 = load i64, i64* %__secondChild, align 8 + %3 = load i64, i64* %__len.addr, align 8 + %sub = sub nsw i64 %3, 1 + %div = sdiv i64 %sub, 2 + %cmp = icmp slt i64 %2, %div + br i1 %cmp, label %while.body, label %while.end + +while.body: ; preds = %while.cond + %4 = load i64, i64* %__secondChild, align 8 + %add = add nsw i64 %4, 1 + %mul = mul nsw i64 2, %add + store i64 %mul, i64* %__secondChild, align 8 + %5 = load i64, i64* %__secondChild, align 8 + %call = call %class.INode** @_ZNK9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEEplEl(%"class.__gnu_cxx::__normal_iterator"* %__first, i64 %5) + %coerce.dive1 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %agg.tmp, i32 0, i32 0 + store %class.INode** %call, %class.INode*** %coerce.dive1, align 8 + %6 = load i64, i64* %__secondChild, align 8 + %sub3 = sub nsw i64 %6, 1 + %call4 = call %class.INode** @_ZNK9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEEplEl(%"class.__gnu_cxx::__normal_iterator"* %__first, i64 %sub3) + %coerce.dive5 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %agg.tmp2, i32 0, i32 0 + store %class.INode** %call4, 
%class.INode*** %coerce.dive5, align 8 + %coerce.dive6 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %agg.tmp, i32 0, i32 0 + %7 = load %class.INode**, %class.INode*** %coerce.dive6, align 8 + %coerce.dive7 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %agg.tmp2, i32 0, i32 0 + %8 = load %class.INode**, %class.INode*** %coerce.dive7, align 8 + %call8 = call zeroext i1 @_ZN9__gnu_cxx5__ops15_Iter_comp_iterI7NodeCmpEclINS_17__normal_iteratorIPP5INodeSt6vectorIS7_SaIS7_EEEESC_EEbT_T0_(%"struct.__gnu_cxx::__ops::_Iter_comp_iter"* %__comp, %class.INode** %7, %class.INode** %8) + br i1 %call8, label %if.then, label %if.end + +if.then: ; preds = %while.body + %9 = load i64, i64* %__secondChild, align 8 + %dec = add nsw i64 %9, -1 + store i64 %dec, i64* %__secondChild, align 8 + br label %if.end + +if.end: ; preds = %if.then, %while.body + %10 = load i64, i64* %__secondChild, align 8 + %call9 = call %class.INode** @_ZNK9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEEplEl(%"class.__gnu_cxx::__normal_iterator"* %__first, i64 %10) + %coerce.dive10 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %ref.tmp, i32 0, i32 0 + store %class.INode** %call9, %class.INode*** %coerce.dive10, align 8 + %call11 = call dereferenceable(8) %class.INode** @_ZNK9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEEdeEv(%"class.__gnu_cxx::__normal_iterator"* %ref.tmp) + %11 = load %class.INode*, %class.INode** %call11, align 8 + %12 = load i64, i64* %__holeIndex.addr, align 8 + %call13 = call %class.INode** @_ZNK9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEEplEl(%"class.__gnu_cxx::__normal_iterator"* %__first, i64 %12) + %coerce.dive14 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %ref.tmp12, i32 0, i32 0 + store %class.INode** %call13, 
%class.INode*** %coerce.dive14, align 8 + %call15 = call dereferenceable(8) %class.INode** @_ZNK9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEEdeEv(%"class.__gnu_cxx::__normal_iterator"* %ref.tmp12) + store %class.INode* %11, %class.INode** %call15, align 8 + %13 = load i64, i64* %__secondChild, align 8 + store i64 %13, i64* %__holeIndex.addr, align 8 + br label %while.cond + +while.end: ; preds = %while.cond + %14 = load i64, i64* %__len.addr, align 8 + %and = and i64 %14, 1 + %cmp16 = icmp eq i64 %and, 0 + br i1 %cmp16, label %land.lhs.true, label %if.end33 + +land.lhs.true: ; preds = %while.end + %15 = load i64, i64* %__secondChild, align 8 + %16 = load i64, i64* %__len.addr, align 8 + %sub17 = sub nsw i64 %16, 2 + %div18 = sdiv i64 %sub17, 2 + %cmp19 = icmp eq i64 %15, %div18 + br i1 %cmp19, label %if.then20, label %if.end33 + +if.then20: ; preds = %land.lhs.true + %17 = load i64, i64* %__secondChild, align 8 + %add21 = add nsw i64 %17, 1 + %mul22 = mul nsw i64 2, %add21 + store i64 %mul22, i64* %__secondChild, align 8 + %18 = load i64, i64* %__secondChild, align 8 + %sub24 = sub nsw i64 %18, 1 + %call25 = call %class.INode** @_ZNK9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEEplEl(%"class.__gnu_cxx::__normal_iterator"* %__first, i64 %sub24) + %coerce.dive26 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %ref.tmp23, i32 0, i32 0 + store %class.INode** %call25, %class.INode*** %coerce.dive26, align 8 + %call27 = call dereferenceable(8) %class.INode** @_ZNK9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEEdeEv(%"class.__gnu_cxx::__normal_iterator"* %ref.tmp23) + %19 = load %class.INode*, %class.INode** %call27, align 8 + %20 = load i64, i64* %__holeIndex.addr, align 8 + %call29 = call %class.INode** @_ZNK9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEEplEl(%"class.__gnu_cxx::__normal_iterator"* %__first, i64 %20) + %coerce.dive30 = getelementptr inbounds 
%"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %ref.tmp28, i32 0, i32 0 + store %class.INode** %call29, %class.INode*** %coerce.dive30, align 8 + %call31 = call dereferenceable(8) %class.INode** @_ZNK9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEEdeEv(%"class.__gnu_cxx::__normal_iterator"* %ref.tmp28) + store %class.INode* %19, %class.INode** %call31, align 8 + %21 = load i64, i64* %__secondChild, align 8 + %sub32 = sub nsw i64 %21, 1 + store i64 %sub32, i64* %__holeIndex.addr, align 8 + br label %if.end33 + +if.end33: ; preds = %if.then20, %land.lhs.true, %while.end + call void @_ZN9__gnu_cxx5__ops14_Iter_comp_valI7NodeCmpEC2ERKNS0_15_Iter_comp_iterIS2_EE(%"struct.__gnu_cxx::__ops::_Iter_comp_val"* %__cmp, %"struct.__gnu_cxx::__ops::_Iter_comp_iter"* dereferenceable(1) %__comp) + %22 = bitcast %"class.__gnu_cxx::__normal_iterator"* %agg.tmp34 to i8* + %23 = bitcast %"class.__gnu_cxx::__normal_iterator"* %__first to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %22, i8* align 8 %23, i64 8, i1 false) + %24 = load i64, i64* %__holeIndex.addr, align 8 + %25 = load i64, i64* %__topIndex, align 8 + %26 = load %class.INode*, %class.INode** %__value.addr, align 8 + %coerce.dive35 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %agg.tmp34, i32 0, i32 0 + %27 = load %class.INode**, %class.INode*** %coerce.dive35, align 8 + call void @_ZSt11__push_heapIN9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS3_SaIS3_EEEElS3_NS0_5__ops14_Iter_comp_valI7NodeCmpEEEvT_T0_SE_T1_RT2_(%class.INode** %27, i64 %24, i64 %25, %class.INode* %26, %"struct.__gnu_cxx::__ops::_Iter_comp_val"* dereferenceable(1) %__cmp) + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local dereferenceable(8) %class.INode*** @_ZNK9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEE4baseEv(%"class.__gnu_cxx::__normal_iterator"* %this) #6 comdat align 2 { 
+entry: + %this.addr = alloca %"class.__gnu_cxx::__normal_iterator"*, align 8 + store %"class.__gnu_cxx::__normal_iterator"* %this, %"class.__gnu_cxx::__normal_iterator"** %this.addr, align 8 + %this1 = load %"class.__gnu_cxx::__normal_iterator"*, %"class.__gnu_cxx::__normal_iterator"** %this.addr, align 8 + %_M_current = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %this1, i32 0, i32 0 + ret %class.INode*** %_M_current +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEEC2ERKS3_(%"class.__gnu_cxx::__normal_iterator"* %this, %class.INode*** dereferenceable(8) %__i) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::__normal_iterator"*, align 8 + %__i.addr = alloca %class.INode***, align 8 + store %"class.__gnu_cxx::__normal_iterator"* %this, %"class.__gnu_cxx::__normal_iterator"** %this.addr, align 8 + store %class.INode*** %__i, %class.INode**** %__i.addr, align 8 + %this1 = load %"class.__gnu_cxx::__normal_iterator"*, %"class.__gnu_cxx::__normal_iterator"** %this.addr, align 8 + %_M_current = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %this1, i32 0, i32 0 + %0 = load %class.INode***, %class.INode**** %__i.addr, align 8 + %1 = load %class.INode**, %class.INode*** %0, align 8 + store %class.INode** %1, %class.INode*** %_M_current, align 8 + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local zeroext i1 @_ZN9__gnu_cxx5__ops15_Iter_comp_iterI7NodeCmpEclINS_17__normal_iteratorIPP5INodeSt6vectorIS7_SaIS7_EEEESC_EEbT_T0_(%"struct.__gnu_cxx::__ops::_Iter_comp_iter"* %this, %class.INode** %__it1.coerce, %class.INode** %__it2.coerce) #0 comdat align 2 { +entry: + %__it1 = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %__it2 = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + 
%this.addr = alloca %"struct.__gnu_cxx::__ops::_Iter_comp_iter"*, align 8 + %coerce.dive = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %__it1, i32 0, i32 0 + store %class.INode** %__it1.coerce, %class.INode*** %coerce.dive, align 8 + %coerce.dive1 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %__it2, i32 0, i32 0 + store %class.INode** %__it2.coerce, %class.INode*** %coerce.dive1, align 8 + store %"struct.__gnu_cxx::__ops::_Iter_comp_iter"* %this, %"struct.__gnu_cxx::__ops::_Iter_comp_iter"** %this.addr, align 8 + %this2 = load %"struct.__gnu_cxx::__ops::_Iter_comp_iter"*, %"struct.__gnu_cxx::__ops::_Iter_comp_iter"** %this.addr, align 8 + %_M_comp = getelementptr inbounds %"struct.__gnu_cxx::__ops::_Iter_comp_iter", %"struct.__gnu_cxx::__ops::_Iter_comp_iter"* %this2, i32 0, i32 0 + %call = call dereferenceable(8) %class.INode** @_ZNK9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEEdeEv(%"class.__gnu_cxx::__normal_iterator"* %__it1) + %0 = load %class.INode*, %class.INode** %call, align 8 + %call3 = call dereferenceable(8) %class.INode** @_ZNK9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEEdeEv(%"class.__gnu_cxx::__normal_iterator"* %__it2) + %1 = load %class.INode*, %class.INode** %call3, align 8 + %call4 = call zeroext i1 @_ZNK7NodeCmpclEPK5INodeS2_(%struct.NodeCmp* %_M_comp, %class.INode* %0, %class.INode* %1) + ret i1 %call4 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx5__ops14_Iter_comp_valI7NodeCmpEC2ERKNS0_15_Iter_comp_iterIS2_EE(%"struct.__gnu_cxx::__ops::_Iter_comp_val"* %this, %"struct.__gnu_cxx::__ops::_Iter_comp_iter"* dereferenceable(1) %__comp) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %"struct.__gnu_cxx::__ops::_Iter_comp_val"*, align 8 + %__comp.addr = alloca %"struct.__gnu_cxx::__ops::_Iter_comp_iter"*, align 8 + store 
%"struct.__gnu_cxx::__ops::_Iter_comp_val"* %this, %"struct.__gnu_cxx::__ops::_Iter_comp_val"** %this.addr, align 8 + store %"struct.__gnu_cxx::__ops::_Iter_comp_iter"* %__comp, %"struct.__gnu_cxx::__ops::_Iter_comp_iter"** %__comp.addr, align 8 + %this1 = load %"struct.__gnu_cxx::__ops::_Iter_comp_val"*, %"struct.__gnu_cxx::__ops::_Iter_comp_val"** %this.addr, align 8 + %_M_comp = getelementptr inbounds %"struct.__gnu_cxx::__ops::_Iter_comp_val", %"struct.__gnu_cxx::__ops::_Iter_comp_val"* %this1, i32 0, i32 0 + %0 = load %"struct.__gnu_cxx::__ops::_Iter_comp_iter"*, %"struct.__gnu_cxx::__ops::_Iter_comp_iter"** %__comp.addr, align 8 + %_M_comp2 = getelementptr inbounds %"struct.__gnu_cxx::__ops::_Iter_comp_iter", %"struct.__gnu_cxx::__ops::_Iter_comp_iter"* %0, i32 0, i32 0 + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZSt11__push_heapIN9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS3_SaIS3_EEEElS3_NS0_5__ops14_Iter_comp_valI7NodeCmpEEEvT_T0_SE_T1_RT2_(%class.INode** %__first.coerce, i64 %__holeIndex, i64 %__topIndex, %class.INode* %__value, %"struct.__gnu_cxx::__ops::_Iter_comp_val"* dereferenceable(1) %__comp) #0 comdat { +entry: + %__first = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %__holeIndex.addr = alloca i64, align 8 + %__topIndex.addr = alloca i64, align 8 + %__value.addr = alloca %class.INode*, align 8 + %__comp.addr = alloca %"struct.__gnu_cxx::__ops::_Iter_comp_val"*, align 8 + %__parent = alloca i64, align 8 + %agg.tmp = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %ref.tmp = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %ref.tmp7 = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %ref.tmp13 = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %coerce.dive = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %__first, i32 0, i32 0 + store %class.INode** %__first.coerce, %class.INode*** %coerce.dive, 
align 8 + store i64 %__holeIndex, i64* %__holeIndex.addr, align 8 + store i64 %__topIndex, i64* %__topIndex.addr, align 8 + store %class.INode* %__value, %class.INode** %__value.addr, align 8 + store %"struct.__gnu_cxx::__ops::_Iter_comp_val"* %__comp, %"struct.__gnu_cxx::__ops::_Iter_comp_val"** %__comp.addr, align 8 + %0 = load i64, i64* %__holeIndex.addr, align 8 + %sub = sub nsw i64 %0, 1 + %div = sdiv i64 %sub, 2 + store i64 %div, i64* %__parent, align 8 + br label %while.cond + +while.cond: ; preds = %while.body, %entry + %1 = load i64, i64* %__holeIndex.addr, align 8 + %2 = load i64, i64* %__topIndex.addr, align 8 + %cmp = icmp sgt i64 %1, %2 + br i1 %cmp, label %land.rhs, label %land.end + +land.rhs: ; preds = %while.cond + %3 = load %"struct.__gnu_cxx::__ops::_Iter_comp_val"*, %"struct.__gnu_cxx::__ops::_Iter_comp_val"** %__comp.addr, align 8 + %4 = load i64, i64* %__parent, align 8 + %call = call %class.INode** @_ZNK9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEEplEl(%"class.__gnu_cxx::__normal_iterator"* %__first, i64 %4) + %coerce.dive1 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %agg.tmp, i32 0, i32 0 + store %class.INode** %call, %class.INode*** %coerce.dive1, align 8 + %coerce.dive2 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %agg.tmp, i32 0, i32 0 + %5 = load %class.INode**, %class.INode*** %coerce.dive2, align 8 + %call3 = call zeroext i1 @_ZN9__gnu_cxx5__ops14_Iter_comp_valI7NodeCmpEclINS_17__normal_iteratorIPP5INodeSt6vectorIS7_SaIS7_EEEES7_EEbT_RT0_(%"struct.__gnu_cxx::__ops::_Iter_comp_val"* %3, %class.INode** %5, %class.INode** dereferenceable(8) %__value.addr) + br label %land.end + +land.end: ; preds = %land.rhs, %while.cond + %6 = phi i1 [ false, %while.cond ], [ %call3, %land.rhs ] + br i1 %6, label %while.body, label %while.end + +while.body: ; preds = %land.end + %7 = load i64, i64* %__parent, align 8 + 
%call4 = call %class.INode** @_ZNK9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEEplEl(%"class.__gnu_cxx::__normal_iterator"* %__first, i64 %7) + %coerce.dive5 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %ref.tmp, i32 0, i32 0 + store %class.INode** %call4, %class.INode*** %coerce.dive5, align 8 + %call6 = call dereferenceable(8) %class.INode** @_ZNK9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEEdeEv(%"class.__gnu_cxx::__normal_iterator"* %ref.tmp) + %8 = load %class.INode*, %class.INode** %call6, align 8 + %9 = load i64, i64* %__holeIndex.addr, align 8 + %call8 = call %class.INode** @_ZNK9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEEplEl(%"class.__gnu_cxx::__normal_iterator"* %__first, i64 %9) + %coerce.dive9 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %ref.tmp7, i32 0, i32 0 + store %class.INode** %call8, %class.INode*** %coerce.dive9, align 8 + %call10 = call dereferenceable(8) %class.INode** @_ZNK9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEEdeEv(%"class.__gnu_cxx::__normal_iterator"* %ref.tmp7) + store %class.INode* %8, %class.INode** %call10, align 8 + %10 = load i64, i64* %__parent, align 8 + store i64 %10, i64* %__holeIndex.addr, align 8 + %11 = load i64, i64* %__holeIndex.addr, align 8 + %sub11 = sub nsw i64 %11, 1 + %div12 = sdiv i64 %sub11, 2 + store i64 %div12, i64* %__parent, align 8 + br label %while.cond + +while.end: ; preds = %land.end + %12 = load %class.INode*, %class.INode** %__value.addr, align 8 + %13 = load i64, i64* %__holeIndex.addr, align 8 + %call14 = call %class.INode** @_ZNK9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEEplEl(%"class.__gnu_cxx::__normal_iterator"* %__first, i64 %13) + %coerce.dive15 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %ref.tmp13, i32 0, i32 0 + store %class.INode** %call14, 
%class.INode*** %coerce.dive15, align 8 + %call16 = call dereferenceable(8) %class.INode** @_ZNK9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEEdeEv(%"class.__gnu_cxx::__normal_iterator"* %ref.tmp13) + store %class.INode* %12, %class.INode** %call16, align 8 + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local zeroext i1 @_ZNK7NodeCmpclEPK5INodeS2_(%struct.NodeCmp* %this, %class.INode* %lhs, %class.INode* %rhs) #6 comdat align 2 { +entry: + %this.addr = alloca %struct.NodeCmp*, align 8 + %lhs.addr = alloca %class.INode*, align 8 + %rhs.addr = alloca %class.INode*, align 8 + store %struct.NodeCmp* %this, %struct.NodeCmp** %this.addr, align 8 + store %class.INode* %lhs, %class.INode** %lhs.addr, align 8 + store %class.INode* %rhs, %class.INode** %rhs.addr, align 8 + %this1 = load %struct.NodeCmp*, %struct.NodeCmp** %this.addr, align 8 + %0 = load %class.INode*, %class.INode** %lhs.addr, align 8 + %f = getelementptr inbounds %class.INode, %class.INode* %0, i32 0, i32 1 + %1 = load i32, i32* %f, align 8 + %2 = load %class.INode*, %class.INode** %rhs.addr, align 8 + %f2 = getelementptr inbounds %class.INode, %class.INode* %2, i32 0, i32 1 + %3 = load i32, i32* %f2, align 8 + %cmp = icmp sgt i32 %1, %3 + ret i1 %cmp +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local zeroext i1 @_ZN9__gnu_cxx5__ops14_Iter_comp_valI7NodeCmpEclINS_17__normal_iteratorIPP5INodeSt6vectorIS7_SaIS7_EEEES7_EEbT_RT0_(%"struct.__gnu_cxx::__ops::_Iter_comp_val"* %this, %class.INode** %__it.coerce, %class.INode** dereferenceable(8) %__val) #6 comdat align 2 { +entry: + %__it = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %this.addr = alloca %"struct.__gnu_cxx::__ops::_Iter_comp_val"*, align 8 + %__val.addr = alloca %class.INode**, align 8 + %coerce.dive = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %__it, i32 0, i32 0 + store 
%class.INode** %__it.coerce, %class.INode*** %coerce.dive, align 8 + store %"struct.__gnu_cxx::__ops::_Iter_comp_val"* %this, %"struct.__gnu_cxx::__ops::_Iter_comp_val"** %this.addr, align 8 + store %class.INode** %__val, %class.INode*** %__val.addr, align 8 + %this1 = load %"struct.__gnu_cxx::__ops::_Iter_comp_val"*, %"struct.__gnu_cxx::__ops::_Iter_comp_val"** %this.addr, align 8 + %_M_comp = getelementptr inbounds %"struct.__gnu_cxx::__ops::_Iter_comp_val", %"struct.__gnu_cxx::__ops::_Iter_comp_val"* %this1, i32 0, i32 0 + %call = call dereferenceable(8) %class.INode** @_ZNK9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEEdeEv(%"class.__gnu_cxx::__normal_iterator"* %__it) + %0 = load %class.INode*, %class.INode** %call, align 8 + %1 = load %class.INode**, %class.INode*** %__val.addr, align 8 + %2 = load %class.INode*, %class.INode** %1, align 8 + %call2 = call zeroext i1 @_ZNK7NodeCmpclEPK5INodeS2_(%struct.NodeCmp* %_M_comp, %class.INode* %0, %class.INode* %2) + ret i1 %call2 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt6vectorIP5INodeSaIS1_EE9push_backERKS1_(%"class.std::vector"* %this, %class.INode** dereferenceable(8) %__x) #0 comdat align 2 { +entry: + %this.addr = alloca %"class.std::vector"*, align 8 + %__x.addr = alloca %class.INode**, align 8 + %agg.tmp = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + store %"class.std::vector"* %this, %"class.std::vector"** %this.addr, align 8 + store %class.INode** %__x, %class.INode*** %__x.addr, align 8 + %this1 = load %"class.std::vector"*, %"class.std::vector"** %this.addr, align 8 + %0 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %0, i32 0, i32 0 + %_M_finish = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl, i32 0, i32 1 + %1 = load %class.INode**, 
%class.INode*** %_M_finish, align 8 + %2 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl2 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %2, i32 0, i32 0 + %_M_end_of_storage = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl2, i32 0, i32 2 + %3 = load %class.INode**, %class.INode*** %_M_end_of_storage, align 8 + %cmp = icmp ne %class.INode** %1, %3 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %4 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl3 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %4, i32 0, i32 0 + %5 = bitcast %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl3 to %"class.std::allocator"* + %6 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl4 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %6, i32 0, i32 0 + %_M_finish5 = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl4, i32 0, i32 1 + %7 = load %class.INode**, %class.INode*** %_M_finish5, align 8 + %8 = load %class.INode**, %class.INode*** %__x.addr, align 8 + call void @_ZN9__gnu_cxx14__alloc_traitsISaIP5INodeEE9constructIS2_EEvRS3_PS2_RKT_(%"class.std::allocator"* dereferenceable(1) %5, %class.INode** %7, %class.INode** dereferenceable(8) %8) + %9 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl6 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %9, i32 0, i32 0 + %_M_finish7 = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl6, i32 0, i32 1 + %10 = load %class.INode**, %class.INode*** %_M_finish7, align 8 + %incdec.ptr = getelementptr inbounds %class.INode*, %class.INode** %10, i32 
1 + store %class.INode** %incdec.ptr, %class.INode*** %_M_finish7, align 8 + br label %if.end + +if.else: ; preds = %entry + %call = call %class.INode** @_ZNSt6vectorIP5INodeSaIS1_EE3endEv(%"class.std::vector"* %this1) + %coerce.dive = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %agg.tmp, i32 0, i32 0 + store %class.INode** %call, %class.INode*** %coerce.dive, align 8 + %11 = load %class.INode**, %class.INode*** %__x.addr, align 8 + %coerce.dive8 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %agg.tmp, i32 0, i32 0 + %12 = load %class.INode**, %class.INode*** %coerce.dive8, align 8 + call void @_ZNSt6vectorIP5INodeSaIS1_EE17_M_realloc_insertEN9__gnu_cxx17__normal_iteratorIPS1_S3_EERKS1_(%"class.std::vector"* %this1, %class.INode** %12, %class.INode** dereferenceable(8) %11) + br label %if.end + +if.end: ; preds = %if.else, %if.then + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZSt9push_heapIN9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS3_SaIS3_EEEE7NodeCmpEvT_SA_T0_(%class.INode** %__first.coerce, %class.INode** %__last.coerce) #0 comdat { +entry: + %__first = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %__last = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %__comp = alloca %struct.NodeCmp, align 1 + %__cmp = alloca %"struct.__gnu_cxx::__ops::_Iter_comp_val", align 1 + %agg.tmp = alloca %struct.NodeCmp, align 1 + %__value = alloca %class.INode*, align 8 + %ref.tmp = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %agg.tmp4 = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %coerce.dive = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %__first, i32 0, i32 0 + store %class.INode** %__first.coerce, %class.INode*** %coerce.dive, align 8 + %coerce.dive1 = getelementptr inbounds 
%"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %__last, i32 0, i32 0 + store %class.INode** %__last.coerce, %class.INode*** %coerce.dive1, align 8 + call void @_ZN9__gnu_cxx5__ops14_Iter_comp_valI7NodeCmpEC2ES2_(%"struct.__gnu_cxx::__ops::_Iter_comp_val"* %__cmp) + %call = call %class.INode** @_ZNK9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEEmiEl(%"class.__gnu_cxx::__normal_iterator"* %__last, i64 1) + %coerce.dive2 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %ref.tmp, i32 0, i32 0 + store %class.INode** %call, %class.INode*** %coerce.dive2, align 8 + %call3 = call dereferenceable(8) %class.INode** @_ZNK9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEEdeEv(%"class.__gnu_cxx::__normal_iterator"* %ref.tmp) + %0 = load %class.INode*, %class.INode** %call3, align 8 + store %class.INode* %0, %class.INode** %__value, align 8 + %1 = bitcast %"class.__gnu_cxx::__normal_iterator"* %agg.tmp4 to i8* + %2 = bitcast %"class.__gnu_cxx::__normal_iterator"* %__first to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %1, i8* align 8 %2, i64 8, i1 false) + %call5 = call i64 @_ZN9__gnu_cxxmiIPP5INodeSt6vectorIS2_SaIS2_EEEENS_17__normal_iteratorIT_T0_E15difference_typeERKSA_SD_(%"class.__gnu_cxx::__normal_iterator"* dereferenceable(8) %__last, %"class.__gnu_cxx::__normal_iterator"* dereferenceable(8) %__first) + %sub = sub nsw i64 %call5, 1 + %3 = load %class.INode*, %class.INode** %__value, align 8 + %coerce.dive6 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %agg.tmp4, i32 0, i32 0 + %4 = load %class.INode**, %class.INode*** %coerce.dive6, align 8 + call void @_ZSt11__push_heapIN9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS3_SaIS3_EEEElS3_NS0_5__ops14_Iter_comp_valI7NodeCmpEEEvT_T0_SE_T1_RT2_(%class.INode** %4, i64 %sub, i64 0, %class.INode* %3, %"struct.__gnu_cxx::__ops::_Iter_comp_val"* 
dereferenceable(1) %__cmp) + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx14__alloc_traitsISaIP5INodeEE9constructIS2_EEvRS3_PS2_RKT_(%"class.std::allocator"* dereferenceable(1) %__a, %class.INode** %__p, %class.INode** dereferenceable(8) %__arg) #0 comdat align 2 { +entry: + %__a.addr = alloca %"class.std::allocator"*, align 8 + %__p.addr = alloca %class.INode**, align 8 + %__arg.addr = alloca %class.INode**, align 8 + store %"class.std::allocator"* %__a, %"class.std::allocator"** %__a.addr, align 8 + store %class.INode** %__p, %class.INode*** %__p.addr, align 8 + store %class.INode** %__arg, %class.INode*** %__arg.addr, align 8 + %0 = load %"class.std::allocator"*, %"class.std::allocator"** %__a.addr, align 8 + %1 = bitcast %"class.std::allocator"* %0 to %"class.__gnu_cxx::new_allocator"* + %2 = load %class.INode**, %class.INode*** %__p.addr, align 8 + %3 = load %class.INode**, %class.INode*** %__arg.addr, align 8 + call void @_ZN9__gnu_cxx13new_allocatorIP5INodeE9constructEPS2_RKS2_(%"class.__gnu_cxx::new_allocator"* %1, %class.INode** %2, %class.INode** dereferenceable(8) %3) + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt6vectorIP5INodeSaIS1_EE17_M_realloc_insertEN9__gnu_cxx17__normal_iteratorIPS1_S3_EERKS1_(%"class.std::vector"* %this, %class.INode** %__position.coerce, %class.INode** dereferenceable(8) %__x) #0 comdat align 2 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %__position = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %this.addr = alloca %"class.std::vector"*, align 8 + %__x.addr = alloca %class.INode**, align 8 + %__len = alloca i64, align 8 + %__elems_before = alloca i64, align 8 + %ref.tmp = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %__new_start = alloca %class.INode**, align 8 + %__new_finish = alloca %class.INode**, align 8 + %exn.slot = alloca i8* + %ehselector.slot = 
alloca i32 + %coerce.dive = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %__position, i32 0, i32 0 + store %class.INode** %__position.coerce, %class.INode*** %coerce.dive, align 8 + store %"class.std::vector"* %this, %"class.std::vector"** %this.addr, align 8 + store %class.INode** %__x, %class.INode*** %__x.addr, align 8 + %this1 = load %"class.std::vector"*, %"class.std::vector"** %this.addr, align 8 + %call = call i64 @_ZNKSt6vectorIP5INodeSaIS1_EE12_M_check_lenEmPKc(%"class.std::vector"* %this1, i64 1, i8* getelementptr inbounds ([26 x i8], [26 x i8]* @.str.26, i64 0, i64 0)) + store i64 %call, i64* %__len, align 8 + %call2 = call %class.INode** @_ZNSt6vectorIP5INodeSaIS1_EE5beginEv(%"class.std::vector"* %this1) + %coerce.dive3 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %ref.tmp, i32 0, i32 0 + store %class.INode** %call2, %class.INode*** %coerce.dive3, align 8 + %call4 = call i64 @_ZN9__gnu_cxxmiIPP5INodeSt6vectorIS2_SaIS2_EEEENS_17__normal_iteratorIT_T0_E15difference_typeERKSA_SD_(%"class.__gnu_cxx::__normal_iterator"* dereferenceable(8) %__position, %"class.__gnu_cxx::__normal_iterator"* dereferenceable(8) %ref.tmp) + store i64 %call4, i64* %__elems_before, align 8 + %0 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %1 = load i64, i64* %__len, align 8 + %call5 = call %class.INode** @_ZNSt12_Vector_baseIP5INodeSaIS1_EE11_M_allocateEm(%"struct.std::_Vector_base"* %0, i64 %1) + store %class.INode** %call5, %class.INode*** %__new_start, align 8 + %2 = load %class.INode**, %class.INode*** %__new_start, align 8 + store %class.INode** %2, %class.INode*** %__new_finish, align 8 + %3 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %3, i32 0, i32 0 + %4 = bitcast %"struct.std::_Vector_base >::_Vector_impl"* 
%_M_impl to %"class.std::allocator"* + %5 = load %class.INode**, %class.INode*** %__new_start, align 8 + %6 = load i64, i64* %__elems_before, align 8 + %add.ptr = getelementptr inbounds %class.INode*, %class.INode** %5, i64 %6 + %7 = load %class.INode**, %class.INode*** %__x.addr, align 8 + invoke void @_ZN9__gnu_cxx14__alloc_traitsISaIP5INodeEE9constructIS2_EEvRS3_PS2_RKT_(%"class.std::allocator"* dereferenceable(1) %4, %class.INode** %add.ptr, %class.INode** dereferenceable(8) %7) + to label %invoke.cont unwind label %lpad + +invoke.cont: ; preds = %entry + store %class.INode** null, %class.INode*** %__new_finish, align 8 + %8 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl6 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %8, i32 0, i32 0 + %_M_start = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl6, i32 0, i32 0 + %9 = load %class.INode**, %class.INode*** %_M_start, align 8 + %call8 = invoke dereferenceable(8) %class.INode*** @_ZNK9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEE4baseEv(%"class.__gnu_cxx::__normal_iterator"* %__position) + to label %invoke.cont7 unwind label %lpad + +invoke.cont7: ; preds = %invoke.cont + %10 = load %class.INode**, %class.INode*** %call8, align 8 + %11 = load %class.INode**, %class.INode*** %__new_start, align 8 + %12 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %call10 = invoke dereferenceable(1) %"class.std::allocator"* @_ZNSt12_Vector_baseIP5INodeSaIS1_EE19_M_get_Tp_allocatorEv(%"struct.std::_Vector_base"* %12) + to label %invoke.cont9 unwind label %lpad + +invoke.cont9: ; preds = %invoke.cont7 + %call12 = invoke %class.INode** @_ZSt34__uninitialized_move_if_noexcept_aIPP5INodeS2_SaIS1_EET0_T_S5_S4_RT1_(%class.INode** %9, %class.INode** %10, %class.INode** %11, %"class.std::allocator"* dereferenceable(1) %call10) + to label %invoke.cont11 unwind 
label %lpad + +invoke.cont11: ; preds = %invoke.cont9 + store %class.INode** %call12, %class.INode*** %__new_finish, align 8 + %13 = load %class.INode**, %class.INode*** %__new_finish, align 8 + %incdec.ptr = getelementptr inbounds %class.INode*, %class.INode** %13, i32 1 + store %class.INode** %incdec.ptr, %class.INode*** %__new_finish, align 8 + %call14 = invoke dereferenceable(8) %class.INode*** @_ZNK9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEE4baseEv(%"class.__gnu_cxx::__normal_iterator"* %__position) + to label %invoke.cont13 unwind label %lpad + +invoke.cont13: ; preds = %invoke.cont11 + %14 = load %class.INode**, %class.INode*** %call14, align 8 + %15 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl15 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %15, i32 0, i32 0 + %_M_finish = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl15, i32 0, i32 1 + %16 = load %class.INode**, %class.INode*** %_M_finish, align 8 + %17 = load %class.INode**, %class.INode*** %__new_finish, align 8 + %18 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %call17 = invoke dereferenceable(1) %"class.std::allocator"* @_ZNSt12_Vector_baseIP5INodeSaIS1_EE19_M_get_Tp_allocatorEv(%"struct.std::_Vector_base"* %18) + to label %invoke.cont16 unwind label %lpad + +invoke.cont16: ; preds = %invoke.cont13 + %call19 = invoke %class.INode** @_ZSt34__uninitialized_move_if_noexcept_aIPP5INodeS2_SaIS1_EET0_T_S5_S4_RT1_(%class.INode** %14, %class.INode** %16, %class.INode** %17, %"class.std::allocator"* dereferenceable(1) %call17) + to label %invoke.cont18 unwind label %lpad + +invoke.cont18: ; preds = %invoke.cont16 + store %class.INode** %call19, %class.INode*** %__new_finish, align 8 + br label %try.cont + +lpad: ; preds = %invoke.cont16, %invoke.cont13, %invoke.cont11, %invoke.cont9, %invoke.cont7, %invoke.cont, %entry + 
%19 = landingpad { i8*, i32 } + catch i8* null + %20 = extractvalue { i8*, i32 } %19, 0 + store i8* %20, i8** %exn.slot, align 8 + %21 = extractvalue { i8*, i32 } %19, 1 + store i32 %21, i32* %ehselector.slot, align 4 + br label %catch + +catch: ; preds = %lpad + %exn = load i8*, i8** %exn.slot, align 8 + %22 = call i8* @__cxa_begin_catch(i8* %exn) #3 + %23 = load %class.INode**, %class.INode*** %__new_finish, align 8 + %tobool = icmp ne %class.INode** %23, null + br i1 %tobool, label %if.else, label %if.then + +if.then: ; preds = %catch + %24 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl20 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %24, i32 0, i32 0 + %25 = bitcast %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl20 to %"class.std::allocator"* + %26 = load %class.INode**, %class.INode*** %__new_start, align 8 + %27 = load i64, i64* %__elems_before, align 8 + %add.ptr21 = getelementptr inbounds %class.INode*, %class.INode** %26, i64 %27 + invoke void @_ZN9__gnu_cxx14__alloc_traitsISaIP5INodeEE7destroyERS3_PS2_(%"class.std::allocator"* dereferenceable(1) %25, %class.INode** %add.ptr21) + to label %invoke.cont23 unwind label %lpad22 + +invoke.cont23: ; preds = %if.then + br label %if.end + +lpad22: ; preds = %invoke.cont27, %if.end, %invoke.cont24, %if.else, %if.then + %28 = landingpad { i8*, i32 } + cleanup + %29 = extractvalue { i8*, i32 } %28, 0 + store i8* %29, i8** %exn.slot, align 8 + %30 = extractvalue { i8*, i32 } %28, 1 + store i32 %30, i32* %ehselector.slot, align 4 + invoke void @__cxa_end_catch() + to label %invoke.cont28 unwind label %terminate.lpad + +if.else: ; preds = %catch + %31 = load %class.INode**, %class.INode*** %__new_start, align 8 + %32 = load %class.INode**, %class.INode*** %__new_finish, align 8 + %33 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %call25 = invoke dereferenceable(1) %"class.std::allocator"* 
@_ZNSt12_Vector_baseIP5INodeSaIS1_EE19_M_get_Tp_allocatorEv(%"struct.std::_Vector_base"* %33) + to label %invoke.cont24 unwind label %lpad22 + +invoke.cont24: ; preds = %if.else + invoke void @_ZSt8_DestroyIPP5INodeS1_EvT_S3_RSaIT0_E(%class.INode** %31, %class.INode** %32, %"class.std::allocator"* dereferenceable(1) %call25) + to label %invoke.cont26 unwind label %lpad22 + +invoke.cont26: ; preds = %invoke.cont24 + br label %if.end + +if.end: ; preds = %invoke.cont26, %invoke.cont23 + %34 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %35 = load %class.INode**, %class.INode*** %__new_start, align 8 + %36 = load i64, i64* %__len, align 8 + invoke void @_ZNSt12_Vector_baseIP5INodeSaIS1_EE13_M_deallocateEPS1_m(%"struct.std::_Vector_base"* %34, %class.INode** %35, i64 %36) + to label %invoke.cont27 unwind label %lpad22 + +invoke.cont27: ; preds = %if.end + invoke void @__cxa_rethrow() #19 + to label %unreachable unwind label %lpad22 + +invoke.cont28: ; preds = %lpad22 + br label %eh.resume + +try.cont: ; preds = %invoke.cont18 + %37 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl29 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %37, i32 0, i32 0 + %_M_start30 = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl29, i32 0, i32 0 + %38 = load %class.INode**, %class.INode*** %_M_start30, align 8 + %39 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl31 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %39, i32 0, i32 0 + %_M_finish32 = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl31, i32 0, i32 1 + %40 = load %class.INode**, %class.INode*** %_M_finish32, align 8 + %41 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %call33 = call 
dereferenceable(1) %"class.std::allocator"* @_ZNSt12_Vector_baseIP5INodeSaIS1_EE19_M_get_Tp_allocatorEv(%"struct.std::_Vector_base"* %41) + call void @_ZSt8_DestroyIPP5INodeS1_EvT_S3_RSaIT0_E(%class.INode** %38, %class.INode** %40, %"class.std::allocator"* dereferenceable(1) %call33) + %42 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %43 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl34 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %43, i32 0, i32 0 + %_M_start35 = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl34, i32 0, i32 0 + %44 = load %class.INode**, %class.INode*** %_M_start35, align 8 + %45 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl36 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %45, i32 0, i32 0 + %_M_end_of_storage = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl36, i32 0, i32 2 + %46 = load %class.INode**, %class.INode*** %_M_end_of_storage, align 8 + %47 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl37 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %47, i32 0, i32 0 + %_M_start38 = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl37, i32 0, i32 0 + %48 = load %class.INode**, %class.INode*** %_M_start38, align 8 + %sub.ptr.lhs.cast = ptrtoint %class.INode** %46 to i64 + %sub.ptr.rhs.cast = ptrtoint %class.INode** %48 to i64 + %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast + %sub.ptr.div = sdiv exact i64 %sub.ptr.sub, 8 + call void @_ZNSt12_Vector_baseIP5INodeSaIS1_EE13_M_deallocateEPS1_m(%"struct.std::_Vector_base"* %42, %class.INode** %44, i64 %sub.ptr.div) + %49 = load 
%class.INode**, %class.INode*** %__new_start, align 8 + %50 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl39 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %50, i32 0, i32 0 + %_M_start40 = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl39, i32 0, i32 0 + store %class.INode** %49, %class.INode*** %_M_start40, align 8 + %51 = load %class.INode**, %class.INode*** %__new_finish, align 8 + %52 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl41 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %52, i32 0, i32 0 + %_M_finish42 = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl41, i32 0, i32 1 + store %class.INode** %51, %class.INode*** %_M_finish42, align 8 + %53 = load %class.INode**, %class.INode*** %__new_start, align 8 + %54 = load i64, i64* %__len, align 8 + %add.ptr43 = getelementptr inbounds %class.INode*, %class.INode** %53, i64 %54 + %55 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl44 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %55, i32 0, i32 0 + %_M_end_of_storage45 = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl44, i32 0, i32 2 + store %class.INode** %add.ptr43, %class.INode*** %_M_end_of_storage45, align 8 + ret void + +eh.resume: ; preds = %invoke.cont28 + %exn46 = load i8*, i8** %exn.slot, align 8 + %sel = load i32, i32* %ehselector.slot, align 4 + %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn46, 0 + %lpad.val47 = insertvalue { i8*, i32 } %lpad.val, i32 %sel, 1 + resume { i8*, i32 } %lpad.val47 + +terminate.lpad: ; preds = %lpad22 + %56 = landingpad { i8*, i32 } + catch i8* null + %57 = extractvalue { i8*, i32 } %56, 0 + 
call void @__clang_call_terminate(i8* %57) #16 + unreachable + +unreachable: ; preds = %invoke.cont27 + unreachable +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx13new_allocatorIP5INodeE9constructEPS2_RKS2_(%"class.__gnu_cxx::new_allocator"* %this, %class.INode** %__p, %class.INode** dereferenceable(8) %__val) #6 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::new_allocator"*, align 8 + %__p.addr = alloca %class.INode**, align 8 + %__val.addr = alloca %class.INode**, align 8 + store %"class.__gnu_cxx::new_allocator"* %this, %"class.__gnu_cxx::new_allocator"** %this.addr, align 8 + store %class.INode** %__p, %class.INode*** %__p.addr, align 8 + store %class.INode** %__val, %class.INode*** %__val.addr, align 8 + %this1 = load %"class.__gnu_cxx::new_allocator"*, %"class.__gnu_cxx::new_allocator"** %this.addr, align 8 + %0 = load %class.INode**, %class.INode*** %__p.addr, align 8 + %1 = bitcast %class.INode** %0 to i8* + %2 = bitcast i8* %1 to %class.INode** + %3 = load %class.INode**, %class.INode*** %__val.addr, align 8 + %4 = load %class.INode*, %class.INode** %3, align 8 + store %class.INode* %4, %class.INode** %2, align 8 + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local i64 @_ZNKSt6vectorIP5INodeSaIS1_EE12_M_check_lenEmPKc(%"class.std::vector"* %this, i64 %__n, i8* %__s) #0 comdat align 2 { +entry: + %this.addr = alloca %"class.std::vector"*, align 8 + %__n.addr = alloca i64, align 8 + %__s.addr = alloca i8*, align 8 + %__len = alloca i64, align 8 + %ref.tmp = alloca i64, align 8 + store %"class.std::vector"* %this, %"class.std::vector"** %this.addr, align 8 + store i64 %__n, i64* %__n.addr, align 8 + store i8* %__s, i8** %__s.addr, align 8 + %this1 = load %"class.std::vector"*, %"class.std::vector"** %this.addr, align 8 + %call = call i64 @_ZNKSt6vectorIP5INodeSaIS1_EE8max_sizeEv(%"class.std::vector"* %this1) + %call2 = call i64 
@_ZNKSt6vectorIP5INodeSaIS1_EE4sizeEv(%"class.std::vector"* %this1) + %sub = sub i64 %call, %call2 + %0 = load i64, i64* %__n.addr, align 8 + %cmp = icmp ult i64 %sub, %0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %1 = load i8*, i8** %__s.addr, align 8 + call void @_ZSt20__throw_length_errorPKc(i8* %1) #19 + unreachable + +if.end: ; preds = %entry + %call3 = call i64 @_ZNKSt6vectorIP5INodeSaIS1_EE4sizeEv(%"class.std::vector"* %this1) + %call4 = call i64 @_ZNKSt6vectorIP5INodeSaIS1_EE4sizeEv(%"class.std::vector"* %this1) + store i64 %call4, i64* %ref.tmp, align 8 + %call5 = call dereferenceable(8) i64* @_ZSt3maxImERKT_S2_S2_(i64* dereferenceable(8) %ref.tmp, i64* dereferenceable(8) %__n.addr) + %2 = load i64, i64* %call5, align 8 + %add = add i64 %call3, %2 + store i64 %add, i64* %__len, align 8 + %3 = load i64, i64* %__len, align 8 + %call6 = call i64 @_ZNKSt6vectorIP5INodeSaIS1_EE4sizeEv(%"class.std::vector"* %this1) + %cmp7 = icmp ult i64 %3, %call6 + br i1 %cmp7, label %cond.true, label %lor.lhs.false + +lor.lhs.false: ; preds = %if.end + %4 = load i64, i64* %__len, align 8 + %call8 = call i64 @_ZNKSt6vectorIP5INodeSaIS1_EE8max_sizeEv(%"class.std::vector"* %this1) + %cmp9 = icmp ugt i64 %4, %call8 + br i1 %cmp9, label %cond.true, label %cond.false + +cond.true: ; preds = %lor.lhs.false, %if.end + %call10 = call i64 @_ZNKSt6vectorIP5INodeSaIS1_EE8max_sizeEv(%"class.std::vector"* %this1) + br label %cond.end + +cond.false: ; preds = %lor.lhs.false + %5 = load i64, i64* %__len, align 8 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i64 [ %call10, %cond.true ], [ %5, %cond.false ] + ret i64 %cond +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %class.INode** @_ZSt34__uninitialized_move_if_noexcept_aIPP5INodeS2_SaIS1_EET0_T_S5_S4_RT1_(%class.INode** %__first, %class.INode** %__last, %class.INode** %__result, %"class.std::allocator"* dereferenceable(1) %__alloc) #0 
comdat { +entry: + %__first.addr = alloca %class.INode**, align 8 + %__last.addr = alloca %class.INode**, align 8 + %__result.addr = alloca %class.INode**, align 8 + %__alloc.addr = alloca %"class.std::allocator"*, align 8 + store %class.INode** %__first, %class.INode*** %__first.addr, align 8 + store %class.INode** %__last, %class.INode*** %__last.addr, align 8 + store %class.INode** %__result, %class.INode*** %__result.addr, align 8 + store %"class.std::allocator"* %__alloc, %"class.std::allocator"** %__alloc.addr, align 8 + %0 = load %class.INode**, %class.INode*** %__first.addr, align 8 + %1 = load %class.INode**, %class.INode*** %__last.addr, align 8 + %2 = load %class.INode**, %class.INode*** %__result.addr, align 8 + %3 = load %"class.std::allocator"*, %"class.std::allocator"** %__alloc.addr, align 8 + %call = call %class.INode** @_ZSt22__uninitialized_copy_aIPP5INodeS2_S1_ET0_T_S4_S3_RSaIT1_E(%class.INode** %0, %class.INode** %1, %class.INode** %2, %"class.std::allocator"* dereferenceable(1) %3) + ret %class.INode** %call +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx14__alloc_traitsISaIP5INodeEE7destroyERS3_PS2_(%"class.std::allocator"* dereferenceable(1) %__a, %class.INode** %__p) #0 comdat align 2 { +entry: + %__a.addr = alloca %"class.std::allocator"*, align 8 + %__p.addr = alloca %class.INode**, align 8 + store %"class.std::allocator"* %__a, %"class.std::allocator"** %__a.addr, align 8 + store %class.INode** %__p, %class.INode*** %__p.addr, align 8 + %0 = load %"class.std::allocator"*, %"class.std::allocator"** %__a.addr, align 8 + %1 = bitcast %"class.std::allocator"* %0 to %"class.__gnu_cxx::new_allocator"* + %2 = load %class.INode**, %class.INode*** %__p.addr, align 8 + call void @_ZN9__gnu_cxx13new_allocatorIP5INodeE7destroyEPS2_(%"class.__gnu_cxx::new_allocator"* %1, %class.INode** %2) + ret void +} + +declare dso_local void @__cxa_rethrow() + +declare dso_local void @__cxa_end_catch() + +; 
Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local i64 @_ZNKSt6vectorIP5INodeSaIS1_EE8max_sizeEv(%"class.std::vector"* %this) #0 comdat align 2 { +entry: + %this.addr = alloca %"class.std::vector"*, align 8 + store %"class.std::vector"* %this, %"class.std::vector"** %this.addr, align 8 + %this1 = load %"class.std::vector"*, %"class.std::vector"** %this.addr, align 8 + %0 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %call = call dereferenceable(1) %"class.std::allocator"* @_ZNKSt12_Vector_baseIP5INodeSaIS1_EE19_M_get_Tp_allocatorEv(%"struct.std::_Vector_base"* %0) + %call2 = call i64 @_ZN9__gnu_cxx14__alloc_traitsISaIP5INodeEE8max_sizeERKS3_(%"class.std::allocator"* dereferenceable(1) %call) + ret i64 %call2 +} + +; Function Attrs: noreturn +declare dso_local void @_ZSt20__throw_length_errorPKc(i8*) #15 + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local dereferenceable(8) i64* @_ZSt3maxImERKT_S2_S2_(i64* dereferenceable(8) %__a, i64* dereferenceable(8) %__b) #6 comdat { +entry: + %retval = alloca i64*, align 8 + %__a.addr = alloca i64*, align 8 + %__b.addr = alloca i64*, align 8 + store i64* %__a, i64** %__a.addr, align 8 + store i64* %__b, i64** %__b.addr, align 8 + %0 = load i64*, i64** %__a.addr, align 8 + %1 = load i64, i64* %0, align 8 + %2 = load i64*, i64** %__b.addr, align 8 + %3 = load i64, i64* %2, align 8 + %cmp = icmp ult i64 %1, %3 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %4 = load i64*, i64** %__b.addr, align 8 + store i64* %4, i64** %retval, align 8 + br label %return + +if.end: ; preds = %entry + %5 = load i64*, i64** %__a.addr, align 8 + store i64* %5, i64** %retval, align 8 + br label %return + +return: ; preds = %if.end, %if.then + %6 = load i64*, i64** %retval, align 8 + ret i64* %6 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local i64 
@_ZN9__gnu_cxx14__alloc_traitsISaIP5INodeEE8max_sizeERKS3_(%"class.std::allocator"* dereferenceable(1) %__a) #6 comdat align 2 { +entry: + %__a.addr = alloca %"class.std::allocator"*, align 8 + store %"class.std::allocator"* %__a, %"class.std::allocator"** %__a.addr, align 8 + %0 = load %"class.std::allocator"*, %"class.std::allocator"** %__a.addr, align 8 + %1 = bitcast %"class.std::allocator"* %0 to %"class.__gnu_cxx::new_allocator"* + %call = call i64 @_ZNK9__gnu_cxx13new_allocatorIP5INodeE8max_sizeEv(%"class.__gnu_cxx::new_allocator"* %1) #3 + ret i64 %call +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %class.INode** @_ZSt22__uninitialized_copy_aIPP5INodeS2_S1_ET0_T_S4_S3_RSaIT1_E(%class.INode** %__first, %class.INode** %__last, %class.INode** %__result, %"class.std::allocator"* dereferenceable(1) %0) #0 comdat { +entry: + %__first.addr = alloca %class.INode**, align 8 + %__last.addr = alloca %class.INode**, align 8 + %__result.addr = alloca %class.INode**, align 8 + %.addr = alloca %"class.std::allocator"*, align 8 + store %class.INode** %__first, %class.INode*** %__first.addr, align 8 + store %class.INode** %__last, %class.INode*** %__last.addr, align 8 + store %class.INode** %__result, %class.INode*** %__result.addr, align 8 + store %"class.std::allocator"* %0, %"class.std::allocator"** %.addr, align 8 + %1 = load %class.INode**, %class.INode*** %__first.addr, align 8 + %2 = load %class.INode**, %class.INode*** %__last.addr, align 8 + %3 = load %class.INode**, %class.INode*** %__result.addr, align 8 + %call = call %class.INode** @_ZSt18uninitialized_copyIPP5INodeS2_ET0_T_S4_S3_(%class.INode** %1, %class.INode** %2, %class.INode** %3) + ret %class.INode** %call +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %class.INode** @_ZSt18uninitialized_copyIPP5INodeS2_ET0_T_S4_S3_(%class.INode** %__first, %class.INode** %__last, %class.INode** %__result) #0 comdat { +entry: + %__first.addr = alloca 
%class.INode**, align 8 + %__last.addr = alloca %class.INode**, align 8 + %__result.addr = alloca %class.INode**, align 8 + %__assignable = alloca i8, align 1 + store %class.INode** %__first, %class.INode*** %__first.addr, align 8 + store %class.INode** %__last, %class.INode*** %__last.addr, align 8 + store %class.INode** %__result, %class.INode*** %__result.addr, align 8 + store i8 1, i8* %__assignable, align 1 + %0 = load %class.INode**, %class.INode*** %__first.addr, align 8 + %1 = load %class.INode**, %class.INode*** %__last.addr, align 8 + %2 = load %class.INode**, %class.INode*** %__result.addr, align 8 + %call = call %class.INode** @_ZNSt20__uninitialized_copyILb1EE13__uninit_copyIPP5INodeS4_EET0_T_S6_S5_(%class.INode** %0, %class.INode** %1, %class.INode** %2) + ret %class.INode** %call +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %class.INode** @_ZNSt20__uninitialized_copyILb1EE13__uninit_copyIPP5INodeS4_EET0_T_S6_S5_(%class.INode** %__first, %class.INode** %__last, %class.INode** %__result) #0 comdat align 2 { +entry: + %__first.addr = alloca %class.INode**, align 8 + %__last.addr = alloca %class.INode**, align 8 + %__result.addr = alloca %class.INode**, align 8 + store %class.INode** %__first, %class.INode*** %__first.addr, align 8 + store %class.INode** %__last, %class.INode*** %__last.addr, align 8 + store %class.INode** %__result, %class.INode*** %__result.addr, align 8 + %0 = load %class.INode**, %class.INode*** %__first.addr, align 8 + %1 = load %class.INode**, %class.INode*** %__last.addr, align 8 + %2 = load %class.INode**, %class.INode*** %__result.addr, align 8 + %call = call %class.INode** @_ZSt4copyIPP5INodeS2_ET0_T_S4_S3_(%class.INode** %0, %class.INode** %1, %class.INode** %2) + ret %class.INode** %call +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %class.INode** @_ZSt4copyIPP5INodeS2_ET0_T_S4_S3_(%class.INode** %__first, %class.INode** %__last, %class.INode** %__result) #0 
comdat { +entry: + %__first.addr = alloca %class.INode**, align 8 + %__last.addr = alloca %class.INode**, align 8 + %__result.addr = alloca %class.INode**, align 8 + store %class.INode** %__first, %class.INode*** %__first.addr, align 8 + store %class.INode** %__last, %class.INode*** %__last.addr, align 8 + store %class.INode** %__result, %class.INode*** %__result.addr, align 8 + %0 = load %class.INode**, %class.INode*** %__first.addr, align 8 + %call = call %class.INode** @_ZSt12__miter_baseIPP5INodeET_S3_(%class.INode** %0) + %1 = load %class.INode**, %class.INode*** %__last.addr, align 8 + %call1 = call %class.INode** @_ZSt12__miter_baseIPP5INodeET_S3_(%class.INode** %1) + %2 = load %class.INode**, %class.INode*** %__result.addr, align 8 + %call2 = call %class.INode** @_ZSt14__copy_move_a2ILb0EPP5INodeS2_ET1_T0_S4_S3_(%class.INode** %call, %class.INode** %call1, %class.INode** %2) + ret %class.INode** %call2 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %class.INode** @_ZSt14__copy_move_a2ILb0EPP5INodeS2_ET1_T0_S4_S3_(%class.INode** %__first, %class.INode** %__last, %class.INode** %__result) #0 comdat { +entry: + %__first.addr = alloca %class.INode**, align 8 + %__last.addr = alloca %class.INode**, align 8 + %__result.addr = alloca %class.INode**, align 8 + store %class.INode** %__first, %class.INode*** %__first.addr, align 8 + store %class.INode** %__last, %class.INode*** %__last.addr, align 8 + store %class.INode** %__result, %class.INode*** %__result.addr, align 8 + %0 = load %class.INode**, %class.INode*** %__first.addr, align 8 + %call = call %class.INode** @_ZSt12__niter_baseIPP5INodeET_S3_(%class.INode** %0) + %1 = load %class.INode**, %class.INode*** %__last.addr, align 8 + %call1 = call %class.INode** @_ZSt12__niter_baseIPP5INodeET_S3_(%class.INode** %1) + %2 = load %class.INode**, %class.INode*** %__result.addr, align 8 + %call2 = call %class.INode** @_ZSt12__niter_baseIPP5INodeET_S3_(%class.INode** %2) + %call3 = call 
%class.INode** @_ZSt13__copy_move_aILb0EPP5INodeS2_ET1_T0_S4_S3_(%class.INode** %call, %class.INode** %call1, %class.INode** %call2) + ret %class.INode** %call3 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local %class.INode** @_ZSt12__miter_baseIPP5INodeET_S3_(%class.INode** %__it) #6 comdat { +entry: + %__it.addr = alloca %class.INode**, align 8 + store %class.INode** %__it, %class.INode*** %__it.addr, align 8 + %0 = load %class.INode**, %class.INode*** %__it.addr, align 8 + ret %class.INode** %0 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local %class.INode** @_ZSt13__copy_move_aILb0EPP5INodeS2_ET1_T0_S4_S3_(%class.INode** %__first, %class.INode** %__last, %class.INode** %__result) #6 comdat { +entry: + %__first.addr = alloca %class.INode**, align 8 + %__last.addr = alloca %class.INode**, align 8 + %__result.addr = alloca %class.INode**, align 8 + %__simple = alloca i8, align 1 + store %class.INode** %__first, %class.INode*** %__first.addr, align 8 + store %class.INode** %__last, %class.INode*** %__last.addr, align 8 + store %class.INode** %__result, %class.INode*** %__result.addr, align 8 + store i8 1, i8* %__simple, align 1 + %0 = load %class.INode**, %class.INode*** %__first.addr, align 8 + %1 = load %class.INode**, %class.INode*** %__last.addr, align 8 + %2 = load %class.INode**, %class.INode*** %__result.addr, align 8 + %call = call %class.INode** @_ZNSt11__copy_moveILb0ELb1ESt26random_access_iterator_tagE8__copy_mIP5INodeEEPT_PKS5_S8_S6_(%class.INode** %0, %class.INode** %1, %class.INode** %2) + ret %class.INode** %call +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx13new_allocatorIP5INodeE7destroyEPS2_(%"class.__gnu_cxx::new_allocator"* %this, %class.INode** %__p) #6 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::new_allocator"*, align 8 + %__p.addr = alloca %class.INode**, align 8 + store 
%"class.__gnu_cxx::new_allocator"* %this, %"class.__gnu_cxx::new_allocator"** %this.addr, align 8 + store %class.INode** %__p, %class.INode*** %__p.addr, align 8 + %this1 = load %"class.__gnu_cxx::new_allocator"*, %"class.__gnu_cxx::new_allocator"** %this.addr, align 8 + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx5__ops14_Iter_comp_valI7NodeCmpEC2ES2_(%"struct.__gnu_cxx::__ops::_Iter_comp_val"* %this) unnamed_addr #6 comdat align 2 { +entry: + %__comp = alloca %struct.NodeCmp, align 1 + %this.addr = alloca %"struct.__gnu_cxx::__ops::_Iter_comp_val"*, align 8 + store %"struct.__gnu_cxx::__ops::_Iter_comp_val"* %this, %"struct.__gnu_cxx::__ops::_Iter_comp_val"** %this.addr, align 8 + %this1 = load %"struct.__gnu_cxx::__ops::_Iter_comp_val"*, %"struct.__gnu_cxx::__ops::_Iter_comp_val"** %this.addr, align 8 + %_M_comp = getelementptr inbounds %"struct.__gnu_cxx::__ops::_Iter_comp_val", %"struct.__gnu_cxx::__ops::_Iter_comp_val"* %this1, i32 0, i32 0 + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %class.INode** @_ZNK9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEEmiEl(%"class.__gnu_cxx::__normal_iterator"* %this, i64 %__n) #0 comdat align 2 { +entry: + %retval = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %this.addr = alloca %"class.__gnu_cxx::__normal_iterator"*, align 8 + %__n.addr = alloca i64, align 8 + %ref.tmp = alloca %class.INode**, align 8 + store %"class.__gnu_cxx::__normal_iterator"* %this, %"class.__gnu_cxx::__normal_iterator"** %this.addr, align 8 + store i64 %__n, i64* %__n.addr, align 8 + %this1 = load %"class.__gnu_cxx::__normal_iterator"*, %"class.__gnu_cxx::__normal_iterator"** %this.addr, align 8 + %_M_current = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %this1, i32 0, i32 0 + %0 = load %class.INode**, %class.INode*** %_M_current, align 8 + %1 = load i64, 
i64* %__n.addr, align 8 + %idx.neg = sub i64 0, %1 + %add.ptr = getelementptr inbounds %class.INode*, %class.INode** %0, i64 %idx.neg + store %class.INode** %add.ptr, %class.INode*** %ref.tmp, align 8 + call void @_ZN9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEEC2ERKS3_(%"class.__gnu_cxx::__normal_iterator"* %retval, %class.INode*** dereferenceable(8) %ref.tmp) + %coerce.dive = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %retval, i32 0, i32 0 + %2 = load %class.INode**, %class.INode*** %coerce.dive, align 8 + ret %class.INode** %2 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local dereferenceable(8) %class.INode** @_ZNKSt6vectorIP5INodeSaIS1_EE5frontEv(%"class.std::vector"* %this) #0 comdat align 2 { +entry: + %this.addr = alloca %"class.std::vector"*, align 8 + %ref.tmp = alloca %"class.__gnu_cxx::__normal_iterator.10", align 8 + store %"class.std::vector"* %this, %"class.std::vector"** %this.addr, align 8 + %this1 = load %"class.std::vector"*, %"class.std::vector"** %this.addr, align 8 + %call = call %class.INode** @_ZNKSt6vectorIP5INodeSaIS1_EE5beginEv(%"class.std::vector"* %this1) + %coerce.dive = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.10", %"class.__gnu_cxx::__normal_iterator.10"* %ref.tmp, i32 0, i32 0 + store %class.INode** %call, %class.INode*** %coerce.dive, align 8 + %call2 = call dereferenceable(8) %class.INode** @_ZNK9__gnu_cxx17__normal_iteratorIPKP5INodeSt6vectorIS2_SaIS2_EEEdeEv(%"class.__gnu_cxx::__normal_iterator.10"* %ref.tmp) + ret %class.INode** %call2 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local dereferenceable(8) %class.INode** @_ZNK9__gnu_cxx17__normal_iteratorIPKP5INodeSt6vectorIS2_SaIS2_EEEdeEv(%"class.__gnu_cxx::__normal_iterator.10"* %this) #6 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::__normal_iterator.10"*, align 8 + store 
%"class.__gnu_cxx::__normal_iterator.10"* %this, %"class.__gnu_cxx::__normal_iterator.10"** %this.addr, align 8 + %this1 = load %"class.__gnu_cxx::__normal_iterator.10"*, %"class.__gnu_cxx::__normal_iterator.10"** %this.addr, align 8 + %_M_current = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.10", %"class.__gnu_cxx::__normal_iterator.10"* %this1, i32 0, i32 0 + %0 = load %class.INode**, %class.INode*** %_M_current, align 8 + ret %class.INode** %0 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZSt8pop_heapIN9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS3_SaIS3_EEEE7NodeCmpEvT_SA_T0_(%class.INode** %__first.coerce, %class.INode** %__last.coerce) #0 comdat { +entry: + %__first = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %__last = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %__comp = alloca %struct.NodeCmp, align 1 + %__cmp = alloca %"struct.__gnu_cxx::__ops::_Iter_comp_iter", align 1 + %agg.tmp = alloca %struct.NodeCmp, align 1 + %agg.tmp3 = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %agg.tmp4 = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %agg.tmp5 = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %coerce.dive = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %__first, i32 0, i32 0 + store %class.INode** %__first.coerce, %class.INode*** %coerce.dive, align 8 + %coerce.dive1 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %__last, i32 0, i32 0 + store %class.INode** %__last.coerce, %class.INode*** %coerce.dive1, align 8 + %call = call i64 @_ZN9__gnu_cxxmiIPP5INodeSt6vectorIS2_SaIS2_EEEENS_17__normal_iteratorIT_T0_E15difference_typeERKSA_SD_(%"class.__gnu_cxx::__normal_iterator"* dereferenceable(8) %__last, %"class.__gnu_cxx::__normal_iterator"* dereferenceable(8) %__first) + %cmp = icmp sgt i64 %call, 1 + br i1 %cmp, label %if.then, label %if.end 
+ +if.then: ; preds = %entry + call void @_ZN9__gnu_cxx5__ops15_Iter_comp_iterI7NodeCmpEC2ES2_(%"struct.__gnu_cxx::__ops::_Iter_comp_iter"* %__cmp) + %call2 = call dereferenceable(8) %"class.__gnu_cxx::__normal_iterator"* @_ZN9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEEmmEv(%"class.__gnu_cxx::__normal_iterator"* %__last) + %0 = bitcast %"class.__gnu_cxx::__normal_iterator"* %agg.tmp3 to i8* + %1 = bitcast %"class.__gnu_cxx::__normal_iterator"* %__first to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %0, i8* align 8 %1, i64 8, i1 false) + %2 = bitcast %"class.__gnu_cxx::__normal_iterator"* %agg.tmp4 to i8* + %3 = bitcast %"class.__gnu_cxx::__normal_iterator"* %__last to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %2, i8* align 8 %3, i64 8, i1 false) + %4 = bitcast %"class.__gnu_cxx::__normal_iterator"* %agg.tmp5 to i8* + %5 = bitcast %"class.__gnu_cxx::__normal_iterator"* %__last to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %4, i8* align 8 %5, i64 8, i1 false) + %coerce.dive6 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %agg.tmp3, i32 0, i32 0 + %6 = load %class.INode**, %class.INode*** %coerce.dive6, align 8 + %coerce.dive7 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %agg.tmp4, i32 0, i32 0 + %7 = load %class.INode**, %class.INode*** %coerce.dive7, align 8 + %coerce.dive8 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %agg.tmp5, i32 0, i32 0 + %8 = load %class.INode**, %class.INode*** %coerce.dive8, align 8 + call void @_ZSt10__pop_heapIN9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS3_SaIS3_EEEENS0_5__ops15_Iter_comp_iterI7NodeCmpEEEvT_SD_SD_RT0_(%class.INode** %6, %class.INode** %7, %class.INode** %8, %"struct.__gnu_cxx::__ops::_Iter_comp_iter"* dereferenceable(1) %__cmp) + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} 
+ +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt6vectorIP5INodeSaIS1_EE8pop_backEv(%"class.std::vector"* %this) #0 comdat align 2 { +entry: + %this.addr = alloca %"class.std::vector"*, align 8 + store %"class.std::vector"* %this, %"class.std::vector"** %this.addr, align 8 + %this1 = load %"class.std::vector"*, %"class.std::vector"** %this.addr, align 8 + %0 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %0, i32 0, i32 0 + %_M_finish = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl, i32 0, i32 1 + %1 = load %class.INode**, %class.INode*** %_M_finish, align 8 + %incdec.ptr = getelementptr inbounds %class.INode*, %class.INode** %1, i32 -1 + store %class.INode** %incdec.ptr, %class.INode*** %_M_finish, align 8 + %2 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl2 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %2, i32 0, i32 0 + %3 = bitcast %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl2 to %"class.std::allocator"* + %4 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl3 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %4, i32 0, i32 0 + %_M_finish4 = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl3, i32 0, i32 1 + %5 = load %class.INode**, %class.INode*** %_M_finish4, align 8 + call void @_ZN9__gnu_cxx14__alloc_traitsISaIP5INodeEE7destroyERS3_PS2_(%"class.std::allocator"* dereferenceable(1) %3, %class.INode** %5) + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local dereferenceable(8) %"class.__gnu_cxx::__normal_iterator"* 
@_ZN9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEEmmEv(%"class.__gnu_cxx::__normal_iterator"* %this) #6 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::__normal_iterator"*, align 8 + store %"class.__gnu_cxx::__normal_iterator"* %this, %"class.__gnu_cxx::__normal_iterator"** %this.addr, align 8 + %this1 = load %"class.__gnu_cxx::__normal_iterator"*, %"class.__gnu_cxx::__normal_iterator"** %this.addr, align 8 + %_M_current = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %this1, i32 0, i32 0 + %0 = load %class.INode**, %class.INode*** %_M_current, align 8 + %incdec.ptr = getelementptr inbounds %class.INode*, %class.INode** %0, i32 -1 + store %class.INode** %incdec.ptr, %class.INode*** %_M_current, align 8 + ret %"class.__gnu_cxx::__normal_iterator"* %this1 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZSt10__pop_heapIN9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS3_SaIS3_EEEENS0_5__ops15_Iter_comp_iterI7NodeCmpEEEvT_SD_SD_RT0_(%class.INode** %__first.coerce, %class.INode** %__last.coerce, %class.INode** %__result.coerce, %"struct.__gnu_cxx::__ops::_Iter_comp_iter"* dereferenceable(1) %__comp) #0 comdat { +entry: + %__first = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %__last = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %__result = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %__comp.addr = alloca %"struct.__gnu_cxx::__ops::_Iter_comp_iter"*, align 8 + %__value = alloca %class.INode*, align 8 + %agg.tmp = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %agg.tmp6 = alloca %"struct.__gnu_cxx::__ops::_Iter_comp_iter", align 1 + %coerce.dive = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %__first, i32 0, i32 0 + store %class.INode** %__first.coerce, %class.INode*** %coerce.dive, align 8 + %coerce.dive1 = getelementptr inbounds 
%"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %__last, i32 0, i32 0 + store %class.INode** %__last.coerce, %class.INode*** %coerce.dive1, align 8 + %coerce.dive2 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %__result, i32 0, i32 0 + store %class.INode** %__result.coerce, %class.INode*** %coerce.dive2, align 8 + store %"struct.__gnu_cxx::__ops::_Iter_comp_iter"* %__comp, %"struct.__gnu_cxx::__ops::_Iter_comp_iter"** %__comp.addr, align 8 + %call = call dereferenceable(8) %class.INode** @_ZNK9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEEdeEv(%"class.__gnu_cxx::__normal_iterator"* %__result) + %0 = load %class.INode*, %class.INode** %call, align 8 + store %class.INode* %0, %class.INode** %__value, align 8 + %call3 = call dereferenceable(8) %class.INode** @_ZNK9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEEdeEv(%"class.__gnu_cxx::__normal_iterator"* %__first) + %1 = load %class.INode*, %class.INode** %call3, align 8 + %call4 = call dereferenceable(8) %class.INode** @_ZNK9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS2_SaIS2_EEEdeEv(%"class.__gnu_cxx::__normal_iterator"* %__result) + store %class.INode* %1, %class.INode** %call4, align 8 + %2 = bitcast %"class.__gnu_cxx::__normal_iterator"* %agg.tmp to i8* + %3 = bitcast %"class.__gnu_cxx::__normal_iterator"* %__first to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %2, i8* align 8 %3, i64 8, i1 false) + %call5 = call i64 @_ZN9__gnu_cxxmiIPP5INodeSt6vectorIS2_SaIS2_EEEENS_17__normal_iteratorIT_T0_E15difference_typeERKSA_SD_(%"class.__gnu_cxx::__normal_iterator"* dereferenceable(8) %__last, %"class.__gnu_cxx::__normal_iterator"* dereferenceable(8) %__first) + %4 = load %class.INode*, %class.INode** %__value, align 8 + %5 = load %"struct.__gnu_cxx::__ops::_Iter_comp_iter"*, %"struct.__gnu_cxx::__ops::_Iter_comp_iter"** %__comp.addr, align 8 + %6 = bitcast 
%"struct.__gnu_cxx::__ops::_Iter_comp_iter"* %agg.tmp6 to i8* + %7 = bitcast %"struct.__gnu_cxx::__ops::_Iter_comp_iter"* %5 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %6, i8* align 1 %7, i64 1, i1 false) + %coerce.dive7 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %agg.tmp, i32 0, i32 0 + %8 = load %class.INode**, %class.INode*** %coerce.dive7, align 8 + call void @_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIPP5INodeSt6vectorIS3_SaIS3_EEEElS3_NS0_5__ops15_Iter_comp_iterI7NodeCmpEEEvT_T0_SE_T1_T2_(%class.INode** %8, i64 0, i64 %call5, %class.INode* %4) + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %"struct.std::_Rb_tree_node_base"* @_ZNSt3mapIhSt6vectorIbSaIbEESt4lessIhESaISt4pairIKhS2_EEE11lower_boundERS6_(%"class.std::map"* %this, i8* dereferenceable(1) %__x) #0 comdat align 2 { +entry: + %retval = alloca %"struct.std::_Rb_tree_iterator", align 8 + %this.addr = alloca %"class.std::map"*, align 8 + %__x.addr = alloca i8*, align 8 + store %"class.std::map"* %this, %"class.std::map"** %this.addr, align 8 + store i8* %__x, i8** %__x.addr, align 8 + %this1 = load %"class.std::map"*, %"class.std::map"** %this.addr, align 8 + %_M_t = getelementptr inbounds %"class.std::map", %"class.std::map"* %this1, i32 0, i32 0 + %0 = load i8*, i8** %__x.addr, align 8 + %call = call %"struct.std::_Rb_tree_node_base"* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE11lower_boundERS1_(%"class.std::_Rb_tree"* %_M_t, i8* dereferenceable(1) %0) + %coerce.dive = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %retval, i32 0, i32 0 + store %"struct.std::_Rb_tree_node_base"* %call, %"struct.std::_Rb_tree_node_base"** %coerce.dive, align 8 + %coerce.dive2 = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %retval, i32 0, i32 0 + %1 = load 
%"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %coerce.dive2, align 8 + ret %"struct.std::_Rb_tree_node_base"* %1 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local zeroext i1 @_ZNKSt17_Rb_tree_iteratorISt4pairIKhSt6vectorIbSaIbEEEEeqERKS6_(%"struct.std::_Rb_tree_iterator"* %this, %"struct.std::_Rb_tree_iterator"* dereferenceable(8) %__x) #6 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Rb_tree_iterator"*, align 8 + %__x.addr = alloca %"struct.std::_Rb_tree_iterator"*, align 8 + store %"struct.std::_Rb_tree_iterator"* %this, %"struct.std::_Rb_tree_iterator"** %this.addr, align 8 + store %"struct.std::_Rb_tree_iterator"* %__x, %"struct.std::_Rb_tree_iterator"** %__x.addr, align 8 + %this1 = load %"struct.std::_Rb_tree_iterator"*, %"struct.std::_Rb_tree_iterator"** %this.addr, align 8 + %_M_node = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %this1, i32 0, i32 0 + %0 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %_M_node, align 8 + %1 = load %"struct.std::_Rb_tree_iterator"*, %"struct.std::_Rb_tree_iterator"** %__x.addr, align 8 + %_M_node2 = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %1, i32 0, i32 0 + %2 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %_M_node2, align 8 + %cmp = icmp eq %"struct.std::_Rb_tree_node_base"* %0, %2 + ret i1 %cmp +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNKSt3mapIhSt6vectorIbSaIbEESt4lessIhESaISt4pairIKhS2_EEE8key_compEv(%"class.std::map"* %this) #0 comdat align 2 { +entry: + %this.addr = alloca %"class.std::map"*, align 8 + %undef.agg.tmp = alloca %"struct.std::less", align 1 + store %"class.std::map"* %this, %"class.std::map"** %this.addr, align 8 + %this1 = load %"class.std::map"*, %"class.std::map"** %this.addr, align 8 + %_M_t = getelementptr inbounds 
%"class.std::map", %"class.std::map"* %this1, i32 0, i32 0 + call void @_ZNKSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE8key_compEv(%"class.std::_Rb_tree"* %_M_t) + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local zeroext i1 @_ZNKSt4lessIhEclERKhS2_(%"struct.std::less"* %this, i8* dereferenceable(1) %__x, i8* dereferenceable(1) %__y) #6 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::less"*, align 8 + %__x.addr = alloca i8*, align 8 + %__y.addr = alloca i8*, align 8 + store %"struct.std::less"* %this, %"struct.std::less"** %this.addr, align 8 + store i8* %__x, i8** %__x.addr, align 8 + store i8* %__y, i8** %__y.addr, align 8 + %this1 = load %"struct.std::less"*, %"struct.std::less"** %this.addr, align 8 + %0 = load i8*, i8** %__x.addr, align 8 + %1 = load i8, i8* %0, align 1 + %conv = zext i8 %1 to i32 + %2 = load i8*, i8** %__y.addr, align 8 + %3 = load i8, i8* %2, align 1 + %conv2 = zext i8 %3 to i32 + %cmp = icmp slt i32 %conv, %conv2 + ret i1 %cmp +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local dereferenceable(48) %"struct.std::pair"* @_ZNKSt17_Rb_tree_iteratorISt4pairIKhSt6vectorIbSaIbEEEEdeEv(%"struct.std::_Rb_tree_iterator"* %this) #0 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Rb_tree_iterator"*, align 8 + store %"struct.std::_Rb_tree_iterator"* %this, %"struct.std::_Rb_tree_iterator"** %this.addr, align 8 + %this1 = load %"struct.std::_Rb_tree_iterator"*, %"struct.std::_Rb_tree_iterator"** %this.addr, align 8 + %_M_node = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %this1, i32 0, i32 0 + %0 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %_M_node, align 8 + %1 = bitcast %"struct.std::_Rb_tree_node_base"* %0 to %"struct.std::_Rb_tree_node"* + %call = call %"struct.std::pair"* 
@_ZNSt13_Rb_tree_nodeISt4pairIKhSt6vectorIbSaIbEEEE9_M_valptrEv(%"struct.std::_Rb_tree_node"* %1) + ret %"struct.std::pair"* %call +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %"struct.std::_Rb_tree_node_base"* @_ZNSt3mapIhSt6vectorIbSaIbEESt4lessIhESaISt4pairIKhS2_EEE6insertESt17_Rb_tree_iteratorIS7_ERKS7_(%"class.std::map"* %this, %"struct.std::_Rb_tree_node_base"* %__position.coerce, %"struct.std::pair"* dereferenceable(48) %__x) #0 comdat align 2 { +entry: + %retval = alloca %"struct.std::_Rb_tree_iterator", align 8 + %__position = alloca %"struct.std::_Rb_tree_iterator", align 8 + %this.addr = alloca %"class.std::map"*, align 8 + %__x.addr = alloca %"struct.std::pair"*, align 8 + %agg.tmp = alloca %"struct.std::_Rb_tree_const_iterator", align 8 + %coerce.dive = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %__position, i32 0, i32 0 + store %"struct.std::_Rb_tree_node_base"* %__position.coerce, %"struct.std::_Rb_tree_node_base"** %coerce.dive, align 8 + store %"class.std::map"* %this, %"class.std::map"** %this.addr, align 8 + store %"struct.std::pair"* %__x, %"struct.std::pair"** %__x.addr, align 8 + %this1 = load %"class.std::map"*, %"class.std::map"** %this.addr, align 8 + %_M_t = getelementptr inbounds %"class.std::map", %"class.std::map"* %this1, i32 0, i32 0 + call void @_ZNSt23_Rb_tree_const_iteratorISt4pairIKhSt6vectorIbSaIbEEEEC2ERKSt17_Rb_tree_iteratorIS5_E(%"struct.std::_Rb_tree_const_iterator"* %agg.tmp, %"struct.std::_Rb_tree_iterator"* dereferenceable(8) %__position) + %0 = load %"struct.std::pair"*, %"struct.std::pair"** %__x.addr, align 8 + %coerce.dive2 = getelementptr inbounds %"struct.std::_Rb_tree_const_iterator", %"struct.std::_Rb_tree_const_iterator"* %agg.tmp, i32 0, i32 0 + %1 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %coerce.dive2, align 8 + %call = call %"struct.std::_Rb_tree_node_base"* 
@_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE17_M_insert_unique_ESt23_Rb_tree_const_iteratorIS5_ERKS5_(%"class.std::_Rb_tree"* %_M_t, %"struct.std::_Rb_tree_node_base"* %1, %"struct.std::pair"* dereferenceable(48) %0) + %coerce.dive3 = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %retval, i32 0, i32 0 + store %"struct.std::_Rb_tree_node_base"* %call, %"struct.std::_Rb_tree_node_base"** %coerce.dive3, align 8 + %coerce.dive4 = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %retval, i32 0, i32 0 + %2 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %coerce.dive4, align 8 + ret %"struct.std::_Rb_tree_node_base"* %2 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt4pairIKhSt6vectorIbSaIbEEEC2ERS0_RKS3_(%"struct.std::pair"* %this, i8* dereferenceable(1) %__a, %"class.std::vector.0"* dereferenceable(40) %__b) unnamed_addr #0 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::pair"*, align 8 + %__a.addr = alloca i8*, align 8 + %__b.addr = alloca %"class.std::vector.0"*, align 8 + store %"struct.std::pair"* %this, %"struct.std::pair"** %this.addr, align 8 + store i8* %__a, i8** %__a.addr, align 8 + store %"class.std::vector.0"* %__b, %"class.std::vector.0"** %__b.addr, align 8 + %this1 = load %"struct.std::pair"*, %"struct.std::pair"** %this.addr, align 8 + %0 = bitcast %"struct.std::pair"* %this1 to %"class.std::__pair_base"* + %first = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %this1, i32 0, i32 0 + %1 = load i8*, i8** %__a.addr, align 8 + %2 = load i8, i8* %1, align 1 + store i8 %2, i8* %first, align 8 + %second = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %this1, i32 0, i32 1 + %3 = load %"class.std::vector.0"*, %"class.std::vector.0"** %__b.addr, align 8 + call void @_ZNSt6vectorIbSaIbEEC2ERKS1_(%"class.std::vector.0"* 
%second, %"class.std::vector.0"* dereferenceable(40) %3) + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %"struct.std::_Rb_tree_node_base"* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE11lower_boundERS1_(%"class.std::_Rb_tree"* %this, i8* dereferenceable(1) %__k) #0 comdat align 2 { +entry: + %retval = alloca %"struct.std::_Rb_tree_iterator", align 8 + %this.addr = alloca %"class.std::_Rb_tree"*, align 8 + %__k.addr = alloca i8*, align 8 + store %"class.std::_Rb_tree"* %this, %"class.std::_Rb_tree"** %this.addr, align 8 + store i8* %__k, i8** %__k.addr, align 8 + %this1 = load %"class.std::_Rb_tree"*, %"class.std::_Rb_tree"** %this.addr, align 8 + %call = call %"struct.std::_Rb_tree_node"* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE8_M_beginEv(%"class.std::_Rb_tree"* %this1) + %call2 = call %"struct.std::_Rb_tree_node_base"* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE6_M_endEv(%"class.std::_Rb_tree"* %this1) + %0 = load i8*, i8** %__k.addr, align 8 + %call3 = call %"struct.std::_Rb_tree_node_base"* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE14_M_lower_boundEPSt13_Rb_tree_nodeIS5_EPSt18_Rb_tree_node_baseRS1_(%"class.std::_Rb_tree"* %this1, %"struct.std::_Rb_tree_node"* %call, %"struct.std::_Rb_tree_node_base"* %call2, i8* dereferenceable(1) %0) + %coerce.dive = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %retval, i32 0, i32 0 + store %"struct.std::_Rb_tree_node_base"* %call3, %"struct.std::_Rb_tree_node_base"** %coerce.dive, align 8 + %coerce.dive4 = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %retval, i32 0, i32 0 + %1 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %coerce.dive4, align 8 + ret %"struct.std::_Rb_tree_node_base"* %1 +} + +; Function 
Attrs: noinline optnone uwtable +define linkonce_odr dso_local %"struct.std::_Rb_tree_node_base"* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE14_M_lower_boundEPSt13_Rb_tree_nodeIS5_EPSt18_Rb_tree_node_baseRS1_(%"class.std::_Rb_tree"* %this, %"struct.std::_Rb_tree_node"* %__x, %"struct.std::_Rb_tree_node_base"* %__y, i8* dereferenceable(1) %__k) #0 comdat align 2 { +entry: + %retval = alloca %"struct.std::_Rb_tree_iterator", align 8 + %this.addr = alloca %"class.std::_Rb_tree"*, align 8 + %__x.addr = alloca %"struct.std::_Rb_tree_node"*, align 8 + %__y.addr = alloca %"struct.std::_Rb_tree_node_base"*, align 8 + %__k.addr = alloca i8*, align 8 + store %"class.std::_Rb_tree"* %this, %"class.std::_Rb_tree"** %this.addr, align 8 + store %"struct.std::_Rb_tree_node"* %__x, %"struct.std::_Rb_tree_node"** %__x.addr, align 8 + store %"struct.std::_Rb_tree_node_base"* %__y, %"struct.std::_Rb_tree_node_base"** %__y.addr, align 8 + store i8* %__k, i8** %__k.addr, align 8 + %this1 = load %"class.std::_Rb_tree"*, %"class.std::_Rb_tree"** %this.addr, align 8 + br label %while.cond + +while.cond: ; preds = %if.end, %entry + %0 = load %"struct.std::_Rb_tree_node"*, %"struct.std::_Rb_tree_node"** %__x.addr, align 8 + %cmp = icmp ne %"struct.std::_Rb_tree_node"* %0, null + br i1 %cmp, label %while.body, label %while.end + +while.body: ; preds = %while.cond + %_M_impl = getelementptr inbounds %"class.std::_Rb_tree", %"class.std::_Rb_tree"* %this1, i32 0, i32 0 + %1 = bitcast %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Rb_tree_impl"* %_M_impl to %"struct.std::_Rb_tree_key_compare"* + %_M_key_compare = getelementptr inbounds %"struct.std::_Rb_tree_key_compare", %"struct.std::_Rb_tree_key_compare"* %1, i32 0, i32 0 + %2 = load %"struct.std::_Rb_tree_node"*, %"struct.std::_Rb_tree_node"** %__x.addr, align 8 + %call = call dereferenceable(1) i8* 
@_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE6_S_keyEPKSt13_Rb_tree_nodeIS5_E(%"struct.std::_Rb_tree_node"* %2) + %3 = load i8*, i8** %__k.addr, align 8 + %call2 = call zeroext i1 @_ZNKSt4lessIhEclERKhS2_(%"struct.std::less"* %_M_key_compare, i8* dereferenceable(1) %call, i8* dereferenceable(1) %3) + br i1 %call2, label %if.else, label %if.then + +if.then: ; preds = %while.body + %4 = load %"struct.std::_Rb_tree_node"*, %"struct.std::_Rb_tree_node"** %__x.addr, align 8 + %5 = bitcast %"struct.std::_Rb_tree_node"* %4 to %"struct.std::_Rb_tree_node_base"* + store %"struct.std::_Rb_tree_node_base"* %5, %"struct.std::_Rb_tree_node_base"** %__y.addr, align 8 + %6 = load %"struct.std::_Rb_tree_node"*, %"struct.std::_Rb_tree_node"** %__x.addr, align 8 + %7 = bitcast %"struct.std::_Rb_tree_node"* %6 to %"struct.std::_Rb_tree_node_base"* + %call3 = call %"struct.std::_Rb_tree_node"* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE7_S_leftEPSt18_Rb_tree_node_base(%"struct.std::_Rb_tree_node_base"* %7) + store %"struct.std::_Rb_tree_node"* %call3, %"struct.std::_Rb_tree_node"** %__x.addr, align 8 + br label %if.end + +if.else: ; preds = %while.body + %8 = load %"struct.std::_Rb_tree_node"*, %"struct.std::_Rb_tree_node"** %__x.addr, align 8 + %9 = bitcast %"struct.std::_Rb_tree_node"* %8 to %"struct.std::_Rb_tree_node_base"* + %call4 = call %"struct.std::_Rb_tree_node"* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE8_S_rightEPSt18_Rb_tree_node_base(%"struct.std::_Rb_tree_node_base"* %9) + store %"struct.std::_Rb_tree_node"* %call4, %"struct.std::_Rb_tree_node"** %__x.addr, align 8 + br label %if.end + +if.end: ; preds = %if.else, %if.then + br label %while.cond + +while.end: ; preds = %while.cond + %10 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %__y.addr, align 8 + call void 
@_ZNSt17_Rb_tree_iteratorISt4pairIKhSt6vectorIbSaIbEEEEC2EPSt18_Rb_tree_node_base(%"struct.std::_Rb_tree_iterator"* %retval, %"struct.std::_Rb_tree_node_base"* %10) + %coerce.dive = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %retval, i32 0, i32 0 + %11 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %coerce.dive, align 8 + ret %"struct.std::_Rb_tree_node_base"* %11 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local %"struct.std::_Rb_tree_node_base"* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE6_M_endEv(%"class.std::_Rb_tree"* %this) #6 comdat align 2 { +entry: + %this.addr = alloca %"class.std::_Rb_tree"*, align 8 + store %"class.std::_Rb_tree"* %this, %"class.std::_Rb_tree"** %this.addr, align 8 + %this1 = load %"class.std::_Rb_tree"*, %"class.std::_Rb_tree"** %this.addr, align 8 + %_M_impl = getelementptr inbounds %"class.std::_Rb_tree", %"class.std::_Rb_tree"* %this1, i32 0, i32 0 + %0 = bitcast %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Rb_tree_impl"* %_M_impl to i8* + %add.ptr = getelementptr inbounds i8, i8* %0, i64 8 + %1 = bitcast i8* %add.ptr to %"struct.std::_Rb_tree_header"* + %_M_header = getelementptr inbounds %"struct.std::_Rb_tree_header", %"struct.std::_Rb_tree_header"* %1, i32 0, i32 0 + ret %"struct.std::_Rb_tree_node_base"* %_M_header +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local dereferenceable(1) i8* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE6_S_keyEPKSt13_Rb_tree_nodeIS5_E(%"struct.std::_Rb_tree_node"* %__x) #0 comdat align 2 { +entry: + %__x.addr = alloca %"struct.std::_Rb_tree_node"*, align 8 + %ref.tmp = alloca %"struct.std::_Select1st", align 1 + store %"struct.std::_Rb_tree_node"* %__x, %"struct.std::_Rb_tree_node"** %__x.addr, align 8 + %0 = load 
%"struct.std::_Rb_tree_node"*, %"struct.std::_Rb_tree_node"** %__x.addr, align 8 + %call = call dereferenceable(48) %"struct.std::pair"* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE8_S_valueEPKSt13_Rb_tree_nodeIS5_E(%"struct.std::_Rb_tree_node"* %0) + %call1 = call dereferenceable(1) i8* @_ZNKSt10_Select1stISt4pairIKhSt6vectorIbSaIbEEEEclERKS5_(%"struct.std::_Select1st"* %ref.tmp, %"struct.std::pair"* dereferenceable(48) %call) + ret i8* %call1 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local dereferenceable(1) i8* @_ZNKSt10_Select1stISt4pairIKhSt6vectorIbSaIbEEEEclERKS5_(%"struct.std::_Select1st"* %this, %"struct.std::pair"* dereferenceable(48) %__x) #6 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Select1st"*, align 8 + %__x.addr = alloca %"struct.std::pair"*, align 8 + store %"struct.std::_Select1st"* %this, %"struct.std::_Select1st"** %this.addr, align 8 + store %"struct.std::pair"* %__x, %"struct.std::pair"** %__x.addr, align 8 + %this1 = load %"struct.std::_Select1st"*, %"struct.std::_Select1st"** %this.addr, align 8 + %0 = load %"struct.std::pair"*, %"struct.std::pair"** %__x.addr, align 8 + %first = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %0, i32 0, i32 0 + ret i8* %first +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local dereferenceable(48) %"struct.std::pair"* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE8_S_valueEPKSt13_Rb_tree_nodeIS5_E(%"struct.std::_Rb_tree_node"* %__x) #0 comdat align 2 { +entry: + %__x.addr = alloca %"struct.std::_Rb_tree_node"*, align 8 + store %"struct.std::_Rb_tree_node"* %__x, %"struct.std::_Rb_tree_node"** %__x.addr, align 8 + %0 = load %"struct.std::_Rb_tree_node"*, %"struct.std::_Rb_tree_node"** %__x.addr, align 8 + %call = call %"struct.std::pair"* @_ZNKSt13_Rb_tree_nodeISt4pairIKhSt6vectorIbSaIbEEEE9_M_valptrEv(%"struct.std::_Rb_tree_node"* 
%0) + ret %"struct.std::pair"* %call +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZNKSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE8key_compEv(%"class.std::_Rb_tree"* %this) #6 comdat align 2 { +entry: + %this.addr = alloca %"class.std::_Rb_tree"*, align 8 + store %"class.std::_Rb_tree"* %this, %"class.std::_Rb_tree"** %this.addr, align 8 + %this1 = load %"class.std::_Rb_tree"*, %"class.std::_Rb_tree"** %this.addr, align 8 + %_M_impl = getelementptr inbounds %"class.std::_Rb_tree", %"class.std::_Rb_tree"* %this1, i32 0, i32 0 + %0 = bitcast %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Rb_tree_impl"* %_M_impl to %"struct.std::_Rb_tree_key_compare"* + %_M_key_compare = getelementptr inbounds %"struct.std::_Rb_tree_key_compare", %"struct.std::_Rb_tree_key_compare"* %0, i32 0, i32 0 + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %"struct.std::_Rb_tree_node_base"* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE17_M_insert_unique_ESt23_Rb_tree_const_iteratorIS5_ERKS5_(%"class.std::_Rb_tree"* %this, %"struct.std::_Rb_tree_node_base"* %__pos.coerce, %"struct.std::pair"* dereferenceable(48) %__x) #0 comdat align 2 { +entry: + %retval = alloca %"struct.std::_Rb_tree_iterator", align 8 + %__pos = alloca %"struct.std::_Rb_tree_const_iterator", align 8 + %this.addr = alloca %"class.std::_Rb_tree"*, align 8 + %__x.addr = alloca %"struct.std::pair"*, align 8 + %__an = alloca %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Alloc_node", align 8 + %agg.tmp = alloca %"struct.std::_Rb_tree_const_iterator", align 8 + %coerce.dive = getelementptr inbounds %"struct.std::_Rb_tree_const_iterator", %"struct.std::_Rb_tree_const_iterator"* %__pos, i32 0, i32 0 + store %"struct.std::_Rb_tree_node_base"* %__pos.coerce, %"struct.std::_Rb_tree_node_base"** 
%coerce.dive, align 8 + store %"class.std::_Rb_tree"* %this, %"class.std::_Rb_tree"** %this.addr, align 8 + store %"struct.std::pair"* %__x, %"struct.std::pair"** %__x.addr, align 8 + %this1 = load %"class.std::_Rb_tree"*, %"class.std::_Rb_tree"** %this.addr, align 8 + call void @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE11_Alloc_nodeC2ERSB_(%"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Alloc_node"* %__an, %"class.std::_Rb_tree"* dereferenceable(48) %this1) + %0 = bitcast %"struct.std::_Rb_tree_const_iterator"* %agg.tmp to i8* + %1 = bitcast %"struct.std::_Rb_tree_const_iterator"* %__pos to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %0, i8* align 8 %1, i64 8, i1 false) + %2 = load %"struct.std::pair"*, %"struct.std::pair"** %__x.addr, align 8 + %coerce.dive2 = getelementptr inbounds %"struct.std::_Rb_tree_const_iterator", %"struct.std::_Rb_tree_const_iterator"* %agg.tmp, i32 0, i32 0 + %3 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %coerce.dive2, align 8 + %call = call %"struct.std::_Rb_tree_node_base"* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE17_M_insert_unique_INSB_11_Alloc_nodeEEESt17_Rb_tree_iteratorIS5_ESt23_Rb_tree_const_iteratorIS5_ERKS5_RT_(%"class.std::_Rb_tree"* %this1, %"struct.std::_Rb_tree_node_base"* %3, %"struct.std::pair"* dereferenceable(48) %2, %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Alloc_node"* dereferenceable(8) %__an) + %coerce.dive3 = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %retval, i32 0, i32 0 + store %"struct.std::_Rb_tree_node_base"* %call, %"struct.std::_Rb_tree_node_base"** %coerce.dive3, align 8 + %coerce.dive4 = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %retval, i32 0, i32 0 + %4 = load %"struct.std::_Rb_tree_node_base"*, 
%"struct.std::_Rb_tree_node_base"** %coerce.dive4, align 8 + ret %"struct.std::_Rb_tree_node_base"* %4 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE11_Alloc_nodeC2ERSB_(%"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Alloc_node"* %this, %"class.std::_Rb_tree"* dereferenceable(48) %__t) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Alloc_node"*, align 8 + %__t.addr = alloca %"class.std::_Rb_tree"*, align 8 + store %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Alloc_node"* %this, %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Alloc_node"** %this.addr, align 8 + store %"class.std::_Rb_tree"* %__t, %"class.std::_Rb_tree"** %__t.addr, align 8 + %this1 = load %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Alloc_node"*, %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Alloc_node"** %this.addr, align 8 + %_M_t = getelementptr inbounds %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Alloc_node", %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Alloc_node"* %this1, i32 0, i32 0 + %0 = load %"class.std::_Rb_tree"*, %"class.std::_Rb_tree"** %__t.addr, align 8 + store %"class.std::_Rb_tree"* %0, %"class.std::_Rb_tree"** %_M_t, align 8 + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %"struct.std::_Rb_tree_node_base"* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE17_M_insert_unique_INSB_11_Alloc_nodeEEESt17_Rb_tree_iteratorIS5_ESt23_Rb_tree_const_iteratorIS5_ERKS5_RT_(%"class.std::_Rb_tree"* %this, 
%"struct.std::_Rb_tree_node_base"* %__position.coerce, %"struct.std::pair"* dereferenceable(48) %__v, %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Alloc_node"* dereferenceable(8) %__node_gen) #0 comdat align 2 { +entry: + %retval = alloca %"struct.std::_Rb_tree_iterator", align 8 + %__position = alloca %"struct.std::_Rb_tree_const_iterator", align 8 + %this.addr = alloca %"class.std::_Rb_tree"*, align 8 + %__v.addr = alloca %"struct.std::pair"*, align 8 + %__node_gen.addr = alloca %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Alloc_node"*, align 8 + %__res = alloca %"struct.std::pair.11", align 8 + %agg.tmp = alloca %"struct.std::_Rb_tree_const_iterator", align 8 + %ref.tmp = alloca %"struct.std::_Select1st", align 1 + %coerce.dive = getelementptr inbounds %"struct.std::_Rb_tree_const_iterator", %"struct.std::_Rb_tree_const_iterator"* %__position, i32 0, i32 0 + store %"struct.std::_Rb_tree_node_base"* %__position.coerce, %"struct.std::_Rb_tree_node_base"** %coerce.dive, align 8 + store %"class.std::_Rb_tree"* %this, %"class.std::_Rb_tree"** %this.addr, align 8 + store %"struct.std::pair"* %__v, %"struct.std::pair"** %__v.addr, align 8 + store %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Alloc_node"* %__node_gen, %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Alloc_node"** %__node_gen.addr, align 8 + %this1 = load %"class.std::_Rb_tree"*, %"class.std::_Rb_tree"** %this.addr, align 8 + %0 = bitcast %"struct.std::_Rb_tree_const_iterator"* %agg.tmp to i8* + %1 = bitcast %"struct.std::_Rb_tree_const_iterator"* %__position to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %0, i8* align 8 %1, i64 8, i1 false) + %2 = load %"struct.std::pair"*, %"struct.std::pair"** %__v.addr, align 8 + %call = call dereferenceable(1) i8* 
@_ZNKSt10_Select1stISt4pairIKhSt6vectorIbSaIbEEEEclERKS5_(%"struct.std::_Select1st"* %ref.tmp, %"struct.std::pair"* dereferenceable(48) %2) + %coerce.dive2 = getelementptr inbounds %"struct.std::_Rb_tree_const_iterator", %"struct.std::_Rb_tree_const_iterator"* %agg.tmp, i32 0, i32 0 + %3 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %coerce.dive2, align 8 + %call3 = call { %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* } @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE29_M_get_insert_hint_unique_posESt23_Rb_tree_const_iteratorIS5_ERS1_(%"class.std::_Rb_tree"* %this1, %"struct.std::_Rb_tree_node_base"* %3, i8* dereferenceable(1) %call) + %4 = bitcast %"struct.std::pair.11"* %__res to { %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* }* + %5 = getelementptr inbounds { %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* }, { %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* }* %4, i32 0, i32 0 + %6 = extractvalue { %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* } %call3, 0 + store %"struct.std::_Rb_tree_node_base"* %6, %"struct.std::_Rb_tree_node_base"** %5, align 8 + %7 = getelementptr inbounds { %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* }, { %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* }* %4, i32 0, i32 1 + %8 = extractvalue { %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* } %call3, 1 + store %"struct.std::_Rb_tree_node_base"* %8, %"struct.std::_Rb_tree_node_base"** %7, align 8 + %second = getelementptr inbounds %"struct.std::pair.11", %"struct.std::pair.11"* %__res, i32 0, i32 1 + %9 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %second, align 8 + %tobool = icmp ne %"struct.std::_Rb_tree_node_base"* %9, null + br i1 %tobool, label %if.then, label %if.end + +if.then: ; preds = 
%entry + %first = getelementptr inbounds %"struct.std::pair.11", %"struct.std::pair.11"* %__res, i32 0, i32 0 + %10 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %first, align 8 + %second4 = getelementptr inbounds %"struct.std::pair.11", %"struct.std::pair.11"* %__res, i32 0, i32 1 + %11 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %second4, align 8 + %12 = load %"struct.std::pair"*, %"struct.std::pair"** %__v.addr, align 8 + %13 = load %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Alloc_node"*, %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Alloc_node"** %__node_gen.addr, align 8 + %call5 = call %"struct.std::_Rb_tree_node_base"* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE10_M_insert_INSB_11_Alloc_nodeEEESt17_Rb_tree_iteratorIS5_EPSt18_Rb_tree_node_baseSH_RKS5_RT_(%"class.std::_Rb_tree"* %this1, %"struct.std::_Rb_tree_node_base"* %10, %"struct.std::_Rb_tree_node_base"* %11, %"struct.std::pair"* dereferenceable(48) %12, %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Alloc_node"* dereferenceable(8) %13) + %coerce.dive6 = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %retval, i32 0, i32 0 + store %"struct.std::_Rb_tree_node_base"* %call5, %"struct.std::_Rb_tree_node_base"** %coerce.dive6, align 8 + br label %return + +if.end: ; preds = %entry + %first7 = getelementptr inbounds %"struct.std::pair.11", %"struct.std::pair.11"* %__res, i32 0, i32 0 + %14 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %first7, align 8 + call void @_ZNSt17_Rb_tree_iteratorISt4pairIKhSt6vectorIbSaIbEEEEC2EPSt18_Rb_tree_node_base(%"struct.std::_Rb_tree_iterator"* %retval, %"struct.std::_Rb_tree_node_base"* %14) + br label %return + +return: ; preds = %if.end, %if.then + %coerce.dive8 = 
getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %retval, i32 0, i32 0 + %15 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %coerce.dive8, align 8 + ret %"struct.std::_Rb_tree_node_base"* %15 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local { %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* } @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE29_M_get_insert_hint_unique_posESt23_Rb_tree_const_iteratorIS5_ERS1_(%"class.std::_Rb_tree"* %this, %"struct.std::_Rb_tree_node_base"* %__position.coerce, i8* dereferenceable(1) %__k) #0 comdat align 2 { +entry: + %retval = alloca %"struct.std::pair.11", align 8 + %__position = alloca %"struct.std::_Rb_tree_const_iterator", align 8 + %this.addr = alloca %"class.std::_Rb_tree"*, align 8 + %__k.addr = alloca i8*, align 8 + %__pos = alloca %"struct.std::_Rb_tree_iterator", align 8 + %ref.tmp = alloca %"struct.std::_Rb_tree_node_base"*, align 8 + %__before = alloca %"struct.std::_Rb_tree_iterator", align 8 + %ref.tmp37 = alloca %"struct.std::_Rb_tree_node_base"*, align 8 + %__after = alloca %"struct.std::_Rb_tree_iterator", align 8 + %ref.tmp55 = alloca %"struct.std::_Rb_tree_node_base"*, align 8 + %ref.tmp69 = alloca %"struct.std::_Rb_tree_node_base"*, align 8 + %ref.tmp78 = alloca %"struct.std::_Rb_tree_node_base"*, align 8 + %coerce.dive = getelementptr inbounds %"struct.std::_Rb_tree_const_iterator", %"struct.std::_Rb_tree_const_iterator"* %__position, i32 0, i32 0 + store %"struct.std::_Rb_tree_node_base"* %__position.coerce, %"struct.std::_Rb_tree_node_base"** %coerce.dive, align 8 + store %"class.std::_Rb_tree"* %this, %"class.std::_Rb_tree"** %this.addr, align 8 + store i8* %__k, i8** %__k.addr, align 8 + %this1 = load %"class.std::_Rb_tree"*, %"class.std::_Rb_tree"** %this.addr, align 8 + %call = call %"struct.std::_Rb_tree_node_base"* 
@_ZNKSt23_Rb_tree_const_iteratorISt4pairIKhSt6vectorIbSaIbEEEE13_M_const_castEv(%"struct.std::_Rb_tree_const_iterator"* %__position) + %coerce.dive2 = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %__pos, i32 0, i32 0 + store %"struct.std::_Rb_tree_node_base"* %call, %"struct.std::_Rb_tree_node_base"** %coerce.dive2, align 8 + %_M_node = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %__pos, i32 0, i32 0 + %0 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %_M_node, align 8 + %call3 = call %"struct.std::_Rb_tree_node_base"* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE6_M_endEv(%"class.std::_Rb_tree"* %this1) + %cmp = icmp eq %"struct.std::_Rb_tree_node_base"* %0, %call3 + br i1 %cmp, label %if.then, label %if.else12 + +if.then: ; preds = %entry + %call4 = call i64 @_ZNKSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE4sizeEv(%"class.std::_Rb_tree"* %this1) + %cmp5 = icmp ugt i64 %call4, 0 + br i1 %cmp5, label %land.lhs.true, label %if.else + +land.lhs.true: ; preds = %if.then + %_M_impl = getelementptr inbounds %"class.std::_Rb_tree", %"class.std::_Rb_tree"* %this1, i32 0, i32 0 + %1 = bitcast %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Rb_tree_impl"* %_M_impl to %"struct.std::_Rb_tree_key_compare"* + %_M_key_compare = getelementptr inbounds %"struct.std::_Rb_tree_key_compare", %"struct.std::_Rb_tree_key_compare"* %1, i32 0, i32 0 + %call6 = call dereferenceable(8) %"struct.std::_Rb_tree_node_base"** @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE12_M_rightmostEv(%"class.std::_Rb_tree"* %this1) + %2 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %call6, align 8 + %call7 = call dereferenceable(1) i8* 
@_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE6_S_keyEPKSt18_Rb_tree_node_base(%"struct.std::_Rb_tree_node_base"* %2) + %3 = load i8*, i8** %__k.addr, align 8 + %call8 = call zeroext i1 @_ZNKSt4lessIhEclERKhS2_(%"struct.std::less"* %_M_key_compare, i8* dereferenceable(1) %call7, i8* dereferenceable(1) %3) + br i1 %call8, label %if.then9, label %if.else + +if.then9: ; preds = %land.lhs.true + store %"struct.std::_Rb_tree_node_base"* null, %"struct.std::_Rb_tree_node_base"** %ref.tmp, align 8 + %call10 = call dereferenceable(8) %"struct.std::_Rb_tree_node_base"** @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE12_M_rightmostEv(%"class.std::_Rb_tree"* %this1) + call void @_ZNSt4pairIPSt18_Rb_tree_node_baseS1_EC2ERKS1_S4_(%"struct.std::pair.11"* %retval, %"struct.std::_Rb_tree_node_base"** dereferenceable(8) %ref.tmp, %"struct.std::_Rb_tree_node_base"** dereferenceable(8) %call10) + br label %return + +if.else: ; preds = %land.lhs.true, %if.then + %4 = load i8*, i8** %__k.addr, align 8 + %call11 = call { %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* } @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE24_M_get_insert_unique_posERS1_(%"class.std::_Rb_tree"* %this1, i8* dereferenceable(1) %4) + %5 = bitcast %"struct.std::pair.11"* %retval to { %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* }* + %6 = getelementptr inbounds { %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* }, { %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* }* %5, i32 0, i32 0 + %7 = extractvalue { %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* } %call11, 0 + store %"struct.std::_Rb_tree_node_base"* %7, %"struct.std::_Rb_tree_node_base"** %6, align 8 + %8 = getelementptr inbounds { %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* }, { 
%"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* }* %5, i32 0, i32 1 + %9 = extractvalue { %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* } %call11, 1 + store %"struct.std::_Rb_tree_node_base"* %9, %"struct.std::_Rb_tree_node_base"** %8, align 8 + br label %return + +if.else12: ; preds = %entry + %_M_impl13 = getelementptr inbounds %"class.std::_Rb_tree", %"class.std::_Rb_tree"* %this1, i32 0, i32 0 + %10 = bitcast %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Rb_tree_impl"* %_M_impl13 to %"struct.std::_Rb_tree_key_compare"* + %_M_key_compare14 = getelementptr inbounds %"struct.std::_Rb_tree_key_compare", %"struct.std::_Rb_tree_key_compare"* %10, i32 0, i32 0 + %11 = load i8*, i8** %__k.addr, align 8 + %_M_node15 = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %__pos, i32 0, i32 0 + %12 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %_M_node15, align 8 + %call16 = call dereferenceable(1) i8* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE6_S_keyEPKSt18_Rb_tree_node_base(%"struct.std::_Rb_tree_node_base"* %12) + %call17 = call zeroext i1 @_ZNKSt4lessIhEclERKhS2_(%"struct.std::less"* %_M_key_compare14, i8* dereferenceable(1) %11, i8* dereferenceable(1) %call16) + br i1 %call17, label %if.then18, label %if.else44 + +if.then18: ; preds = %if.else12 + %13 = bitcast %"struct.std::_Rb_tree_iterator"* %__before to i8* + %14 = bitcast %"struct.std::_Rb_tree_iterator"* %__pos to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %13, i8* align 8 %14, i64 8, i1 false) + %_M_node19 = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %__pos, i32 0, i32 0 + %15 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %_M_node19, align 8 + %call20 = call dereferenceable(8) %"struct.std::_Rb_tree_node_base"** 
@_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE11_M_leftmostEv(%"class.std::_Rb_tree"* %this1) + %16 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %call20, align 8 + %cmp21 = icmp eq %"struct.std::_Rb_tree_node_base"* %15, %16 + br i1 %cmp21, label %if.then22, label %if.else25 + +if.then22: ; preds = %if.then18 + %call23 = call dereferenceable(8) %"struct.std::_Rb_tree_node_base"** @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE11_M_leftmostEv(%"class.std::_Rb_tree"* %this1) + %call24 = call dereferenceable(8) %"struct.std::_Rb_tree_node_base"** @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE11_M_leftmostEv(%"class.std::_Rb_tree"* %this1) + call void @_ZNSt4pairIPSt18_Rb_tree_node_baseS1_EC2ERKS1_S4_(%"struct.std::pair.11"* %retval, %"struct.std::_Rb_tree_node_base"** dereferenceable(8) %call23, %"struct.std::_Rb_tree_node_base"** dereferenceable(8) %call24) + br label %return + +if.else25: ; preds = %if.then18 + %_M_impl26 = getelementptr inbounds %"class.std::_Rb_tree", %"class.std::_Rb_tree"* %this1, i32 0, i32 0 + %17 = bitcast %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Rb_tree_impl"* %_M_impl26 to %"struct.std::_Rb_tree_key_compare"* + %_M_key_compare27 = getelementptr inbounds %"struct.std::_Rb_tree_key_compare", %"struct.std::_Rb_tree_key_compare"* %17, i32 0, i32 0 + %call28 = call dereferenceable(8) %"struct.std::_Rb_tree_iterator"* @_ZNSt17_Rb_tree_iteratorISt4pairIKhSt6vectorIbSaIbEEEEmmEv(%"struct.std::_Rb_tree_iterator"* %__before) + %_M_node29 = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %call28, i32 0, i32 0 + %18 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %_M_node29, align 8 + %call30 = call dereferenceable(1) i8* 
@_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE6_S_keyEPKSt18_Rb_tree_node_base(%"struct.std::_Rb_tree_node_base"* %18) + %19 = load i8*, i8** %__k.addr, align 8 + %call31 = call zeroext i1 @_ZNKSt4lessIhEclERKhS2_(%"struct.std::less"* %_M_key_compare27, i8* dereferenceable(1) %call30, i8* dereferenceable(1) %19) + br i1 %call31, label %if.then32, label %if.else42 + +if.then32: ; preds = %if.else25 + %_M_node33 = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %__before, i32 0, i32 0 + %20 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %_M_node33, align 8 + %call34 = call %"struct.std::_Rb_tree_node"* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE8_S_rightEPSt18_Rb_tree_node_base(%"struct.std::_Rb_tree_node_base"* %20) + %cmp35 = icmp eq %"struct.std::_Rb_tree_node"* %call34, null + br i1 %cmp35, label %if.then36, label %if.else39 + +if.then36: ; preds = %if.then32 + store %"struct.std::_Rb_tree_node_base"* null, %"struct.std::_Rb_tree_node_base"** %ref.tmp37, align 8 + %_M_node38 = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %__before, i32 0, i32 0 + call void @_ZNSt4pairIPSt18_Rb_tree_node_baseS1_EC2ERKS1_S4_(%"struct.std::pair.11"* %retval, %"struct.std::_Rb_tree_node_base"** dereferenceable(8) %ref.tmp37, %"struct.std::_Rb_tree_node_base"** dereferenceable(8) %_M_node38) + br label %return + +if.else39: ; preds = %if.then32 + %_M_node40 = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %__pos, i32 0, i32 0 + %_M_node41 = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %__pos, i32 0, i32 0 + call void @_ZNSt4pairIPSt18_Rb_tree_node_baseS1_EC2ERKS1_S4_(%"struct.std::pair.11"* %retval, %"struct.std::_Rb_tree_node_base"** dereferenceable(8) %_M_node40, %"struct.std::_Rb_tree_node_base"** 
dereferenceable(8) %_M_node41) + br label %return + +if.else42: ; preds = %if.else25 + %21 = load i8*, i8** %__k.addr, align 8 + %call43 = call { %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* } @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE24_M_get_insert_unique_posERS1_(%"class.std::_Rb_tree"* %this1, i8* dereferenceable(1) %21) + %22 = bitcast %"struct.std::pair.11"* %retval to { %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* }* + %23 = getelementptr inbounds { %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* }, { %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* }* %22, i32 0, i32 0 + %24 = extractvalue { %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* } %call43, 0 + store %"struct.std::_Rb_tree_node_base"* %24, %"struct.std::_Rb_tree_node_base"** %23, align 8 + %25 = getelementptr inbounds { %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* }, { %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* }* %22, i32 0, i32 1 + %26 = extractvalue { %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* } %call43, 1 + store %"struct.std::_Rb_tree_node_base"* %26, %"struct.std::_Rb_tree_node_base"** %25, align 8 + br label %return + +if.else44: ; preds = %if.else12 + %_M_impl45 = getelementptr inbounds %"class.std::_Rb_tree", %"class.std::_Rb_tree"* %this1, i32 0, i32 0 + %27 = bitcast %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Rb_tree_impl"* %_M_impl45 to %"struct.std::_Rb_tree_key_compare"* + %_M_key_compare46 = getelementptr inbounds %"struct.std::_Rb_tree_key_compare", %"struct.std::_Rb_tree_key_compare"* %27, i32 0, i32 0 + %_M_node47 = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %__pos, i32 0, i32 0 + %28 = load %"struct.std::_Rb_tree_node_base"*, 
%"struct.std::_Rb_tree_node_base"** %_M_node47, align 8 + %call48 = call dereferenceable(1) i8* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE6_S_keyEPKSt18_Rb_tree_node_base(%"struct.std::_Rb_tree_node_base"* %28) + %29 = load i8*, i8** %__k.addr, align 8 + %call49 = call zeroext i1 @_ZNKSt4lessIhEclERKhS2_(%"struct.std::less"* %_M_key_compare46, i8* dereferenceable(1) %call48, i8* dereferenceable(1) %29) + br i1 %call49, label %if.then50, label %if.else76 + +if.then50: ; preds = %if.else44 + %30 = bitcast %"struct.std::_Rb_tree_iterator"* %__after to i8* + %31 = bitcast %"struct.std::_Rb_tree_iterator"* %__pos to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %30, i8* align 8 %31, i64 8, i1 false) + %_M_node51 = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %__pos, i32 0, i32 0 + %32 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %_M_node51, align 8 + %call52 = call dereferenceable(8) %"struct.std::_Rb_tree_node_base"** @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE12_M_rightmostEv(%"class.std::_Rb_tree"* %this1) + %33 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %call52, align 8 + %cmp53 = icmp eq %"struct.std::_Rb_tree_node_base"* %32, %33 + br i1 %cmp53, label %if.then54, label %if.else57 + +if.then54: ; preds = %if.then50 + store %"struct.std::_Rb_tree_node_base"* null, %"struct.std::_Rb_tree_node_base"** %ref.tmp55, align 8 + %call56 = call dereferenceable(8) %"struct.std::_Rb_tree_node_base"** @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE12_M_rightmostEv(%"class.std::_Rb_tree"* %this1) + call void @_ZNSt4pairIPSt18_Rb_tree_node_baseS1_EC2ERKS1_S4_(%"struct.std::pair.11"* %retval, %"struct.std::_Rb_tree_node_base"** dereferenceable(8) %ref.tmp55, %"struct.std::_Rb_tree_node_base"** dereferenceable(8) %call56) + br label %return + 
+if.else57: ; preds = %if.then50 + %_M_impl58 = getelementptr inbounds %"class.std::_Rb_tree", %"class.std::_Rb_tree"* %this1, i32 0, i32 0 + %34 = bitcast %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Rb_tree_impl"* %_M_impl58 to %"struct.std::_Rb_tree_key_compare"* + %_M_key_compare59 = getelementptr inbounds %"struct.std::_Rb_tree_key_compare", %"struct.std::_Rb_tree_key_compare"* %34, i32 0, i32 0 + %35 = load i8*, i8** %__k.addr, align 8 + %call60 = call dereferenceable(8) %"struct.std::_Rb_tree_iterator"* @_ZNSt17_Rb_tree_iteratorISt4pairIKhSt6vectorIbSaIbEEEEppEv(%"struct.std::_Rb_tree_iterator"* %__after) + %_M_node61 = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %call60, i32 0, i32 0 + %36 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %_M_node61, align 8 + %call62 = call dereferenceable(1) i8* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE6_S_keyEPKSt18_Rb_tree_node_base(%"struct.std::_Rb_tree_node_base"* %36) + %call63 = call zeroext i1 @_ZNKSt4lessIhEclERKhS2_(%"struct.std::less"* %_M_key_compare59, i8* dereferenceable(1) %35, i8* dereferenceable(1) %call62) + br i1 %call63, label %if.then64, label %if.else74 + +if.then64: ; preds = %if.else57 + %_M_node65 = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %__pos, i32 0, i32 0 + %37 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %_M_node65, align 8 + %call66 = call %"struct.std::_Rb_tree_node"* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE8_S_rightEPSt18_Rb_tree_node_base(%"struct.std::_Rb_tree_node_base"* %37) + %cmp67 = icmp eq %"struct.std::_Rb_tree_node"* %call66, null + br i1 %cmp67, label %if.then68, label %if.else71 + +if.then68: ; preds = %if.then64 + store %"struct.std::_Rb_tree_node_base"* null, %"struct.std::_Rb_tree_node_base"** 
%ref.tmp69, align 8 + %_M_node70 = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %__pos, i32 0, i32 0 + call void @_ZNSt4pairIPSt18_Rb_tree_node_baseS1_EC2ERKS1_S4_(%"struct.std::pair.11"* %retval, %"struct.std::_Rb_tree_node_base"** dereferenceable(8) %ref.tmp69, %"struct.std::_Rb_tree_node_base"** dereferenceable(8) %_M_node70) + br label %return + +if.else71: ; preds = %if.then64 + %_M_node72 = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %__after, i32 0, i32 0 + %_M_node73 = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %__after, i32 0, i32 0 + call void @_ZNSt4pairIPSt18_Rb_tree_node_baseS1_EC2ERKS1_S4_(%"struct.std::pair.11"* %retval, %"struct.std::_Rb_tree_node_base"** dereferenceable(8) %_M_node72, %"struct.std::_Rb_tree_node_base"** dereferenceable(8) %_M_node73) + br label %return + +if.else74: ; preds = %if.else57 + %38 = load i8*, i8** %__k.addr, align 8 + %call75 = call { %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* } @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE24_M_get_insert_unique_posERS1_(%"class.std::_Rb_tree"* %this1, i8* dereferenceable(1) %38) + %39 = bitcast %"struct.std::pair.11"* %retval to { %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* }* + %40 = getelementptr inbounds { %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* }, { %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* }* %39, i32 0, i32 0 + %41 = extractvalue { %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* } %call75, 0 + store %"struct.std::_Rb_tree_node_base"* %41, %"struct.std::_Rb_tree_node_base"** %40, align 8 + %42 = getelementptr inbounds { %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* }, { %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* }* 
%39, i32 0, i32 1 + %43 = extractvalue { %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* } %call75, 1 + store %"struct.std::_Rb_tree_node_base"* %43, %"struct.std::_Rb_tree_node_base"** %42, align 8 + br label %return + +if.else76: ; preds = %if.else44 + %_M_node77 = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %__pos, i32 0, i32 0 + store %"struct.std::_Rb_tree_node_base"* null, %"struct.std::_Rb_tree_node_base"** %ref.tmp78, align 8 + call void @_ZNSt4pairIPSt18_Rb_tree_node_baseS1_EC2ERKS1_S4_(%"struct.std::pair.11"* %retval, %"struct.std::_Rb_tree_node_base"** dereferenceable(8) %_M_node77, %"struct.std::_Rb_tree_node_base"** dereferenceable(8) %ref.tmp78) + br label %return + +return: ; preds = %if.else76, %if.else74, %if.else71, %if.then68, %if.then54, %if.else42, %if.else39, %if.then36, %if.then22, %if.else, %if.then9 + %44 = bitcast %"struct.std::pair.11"* %retval to { %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* }* + %45 = load { %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* }, { %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* }* %44, align 8 + ret { %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* } %45 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %"struct.std::_Rb_tree_node_base"* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE10_M_insert_INSB_11_Alloc_nodeEEESt17_Rb_tree_iteratorIS5_EPSt18_Rb_tree_node_baseSH_RKS5_RT_(%"class.std::_Rb_tree"* %this, %"struct.std::_Rb_tree_node_base"* %__x, %"struct.std::_Rb_tree_node_base"* %__p, %"struct.std::pair"* dereferenceable(48) %__v, %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Alloc_node"* dereferenceable(8) %__node_gen) #0 comdat align 2 { +entry: + %retval = alloca %"struct.std::_Rb_tree_iterator", align 8 + %this.addr = alloca 
%"class.std::_Rb_tree"*, align 8 + %__x.addr = alloca %"struct.std::_Rb_tree_node_base"*, align 8 + %__p.addr = alloca %"struct.std::_Rb_tree_node_base"*, align 8 + %__v.addr = alloca %"struct.std::pair"*, align 8 + %__node_gen.addr = alloca %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Alloc_node"*, align 8 + %__insert_left = alloca i8, align 1 + %ref.tmp = alloca %"struct.std::_Select1st", align 1 + %__z = alloca %"struct.std::_Rb_tree_node"*, align 8 + store %"class.std::_Rb_tree"* %this, %"class.std::_Rb_tree"** %this.addr, align 8 + store %"struct.std::_Rb_tree_node_base"* %__x, %"struct.std::_Rb_tree_node_base"** %__x.addr, align 8 + store %"struct.std::_Rb_tree_node_base"* %__p, %"struct.std::_Rb_tree_node_base"** %__p.addr, align 8 + store %"struct.std::pair"* %__v, %"struct.std::pair"** %__v.addr, align 8 + store %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Alloc_node"* %__node_gen, %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Alloc_node"** %__node_gen.addr, align 8 + %this1 = load %"class.std::_Rb_tree"*, %"class.std::_Rb_tree"** %this.addr, align 8 + %0 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %__x.addr, align 8 + %cmp = icmp ne %"struct.std::_Rb_tree_node_base"* %0, null + br i1 %cmp, label %lor.end, label %lor.lhs.false + +lor.lhs.false: ; preds = %entry + %1 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %__p.addr, align 8 + %call = call %"struct.std::_Rb_tree_node_base"* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE6_M_endEv(%"class.std::_Rb_tree"* %this1) + %cmp2 = icmp eq %"struct.std::_Rb_tree_node_base"* %1, %call + br i1 %cmp2, label %lor.end, label %lor.rhs + +lor.rhs: ; preds = %lor.lhs.false + %_M_impl = getelementptr inbounds %"class.std::_Rb_tree", %"class.std::_Rb_tree"* %this1, i32 0, i32 0 + %2 = bitcast 
%"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Rb_tree_impl"* %_M_impl to %"struct.std::_Rb_tree_key_compare"* + %_M_key_compare = getelementptr inbounds %"struct.std::_Rb_tree_key_compare", %"struct.std::_Rb_tree_key_compare"* %2, i32 0, i32 0 + %3 = load %"struct.std::pair"*, %"struct.std::pair"** %__v.addr, align 8 + %call3 = call dereferenceable(1) i8* @_ZNKSt10_Select1stISt4pairIKhSt6vectorIbSaIbEEEEclERKS5_(%"struct.std::_Select1st"* %ref.tmp, %"struct.std::pair"* dereferenceable(48) %3) + %4 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %__p.addr, align 8 + %call4 = call dereferenceable(1) i8* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE6_S_keyEPKSt18_Rb_tree_node_base(%"struct.std::_Rb_tree_node_base"* %4) + %call5 = call zeroext i1 @_ZNKSt4lessIhEclERKhS2_(%"struct.std::less"* %_M_key_compare, i8* dereferenceable(1) %call3, i8* dereferenceable(1) %call4) + br label %lor.end + +lor.end: ; preds = %lor.rhs, %lor.lhs.false, %entry + %5 = phi i1 [ true, %lor.lhs.false ], [ true, %entry ], [ %call5, %lor.rhs ] + %frombool = zext i1 %5 to i8 + store i8 %frombool, i8* %__insert_left, align 1 + %6 = load %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Alloc_node"*, %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Alloc_node"** %__node_gen.addr, align 8 + %7 = load %"struct.std::pair"*, %"struct.std::pair"** %__v.addr, align 8 + %call6 = call %"struct.std::_Rb_tree_node"* @_ZNKSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE11_Alloc_nodeclIS5_EEPSt13_Rb_tree_nodeIS5_ERKT_(%"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Alloc_node"* %6, %"struct.std::pair"* dereferenceable(48) %7) + store %"struct.std::_Rb_tree_node"* %call6, %"struct.std::_Rb_tree_node"** %__z, align 8 + %8 = load i8, i8* %__insert_left, align 1 
+ %tobool = trunc i8 %8 to i1 + %9 = load %"struct.std::_Rb_tree_node"*, %"struct.std::_Rb_tree_node"** %__z, align 8 + %10 = bitcast %"struct.std::_Rb_tree_node"* %9 to %"struct.std::_Rb_tree_node_base"* + %11 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %__p.addr, align 8 + %_M_impl7 = getelementptr inbounds %"class.std::_Rb_tree", %"class.std::_Rb_tree"* %this1, i32 0, i32 0 + %12 = bitcast %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Rb_tree_impl"* %_M_impl7 to i8* + %add.ptr = getelementptr inbounds i8, i8* %12, i64 8 + %13 = bitcast i8* %add.ptr to %"struct.std::_Rb_tree_header"* + %_M_header = getelementptr inbounds %"struct.std::_Rb_tree_header", %"struct.std::_Rb_tree_header"* %13, i32 0, i32 0 + call void @_ZSt29_Rb_tree_insert_and_rebalancebPSt18_Rb_tree_node_baseS0_RS_(i1 zeroext %tobool, %"struct.std::_Rb_tree_node_base"* %10, %"struct.std::_Rb_tree_node_base"* %11, %"struct.std::_Rb_tree_node_base"* dereferenceable(32) %_M_header) #3 + %_M_impl8 = getelementptr inbounds %"class.std::_Rb_tree", %"class.std::_Rb_tree"* %this1, i32 0, i32 0 + %14 = bitcast %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Rb_tree_impl"* %_M_impl8 to i8* + %add.ptr9 = getelementptr inbounds i8, i8* %14, i64 8 + %15 = bitcast i8* %add.ptr9 to %"struct.std::_Rb_tree_header"* + %_M_node_count = getelementptr inbounds %"struct.std::_Rb_tree_header", %"struct.std::_Rb_tree_header"* %15, i32 0, i32 1 + %16 = load i64, i64* %_M_node_count, align 8 + %inc = add i64 %16, 1 + store i64 %inc, i64* %_M_node_count, align 8 + %17 = load %"struct.std::_Rb_tree_node"*, %"struct.std::_Rb_tree_node"** %__z, align 8 + %18 = bitcast %"struct.std::_Rb_tree_node"* %17 to %"struct.std::_Rb_tree_node_base"* + call void @_ZNSt17_Rb_tree_iteratorISt4pairIKhSt6vectorIbSaIbEEEEC2EPSt18_Rb_tree_node_base(%"struct.std::_Rb_tree_iterator"* %retval, %"struct.std::_Rb_tree_node_base"* %18) + 
%coerce.dive = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %retval, i32 0, i32 0 + %19 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %coerce.dive, align 8 + ret %"struct.std::_Rb_tree_node_base"* %19 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %"struct.std::_Rb_tree_node_base"* @_ZNKSt23_Rb_tree_const_iteratorISt4pairIKhSt6vectorIbSaIbEEEE13_M_const_castEv(%"struct.std::_Rb_tree_const_iterator"* %this) #0 comdat align 2 { +entry: + %retval = alloca %"struct.std::_Rb_tree_iterator", align 8 + %this.addr = alloca %"struct.std::_Rb_tree_const_iterator"*, align 8 + store %"struct.std::_Rb_tree_const_iterator"* %this, %"struct.std::_Rb_tree_const_iterator"** %this.addr, align 8 + %this1 = load %"struct.std::_Rb_tree_const_iterator"*, %"struct.std::_Rb_tree_const_iterator"** %this.addr, align 8 + %_M_node = getelementptr inbounds %"struct.std::_Rb_tree_const_iterator", %"struct.std::_Rb_tree_const_iterator"* %this1, i32 0, i32 0 + %0 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %_M_node, align 8 + call void @_ZNSt17_Rb_tree_iteratorISt4pairIKhSt6vectorIbSaIbEEEEC2EPSt18_Rb_tree_node_base(%"struct.std::_Rb_tree_iterator"* %retval, %"struct.std::_Rb_tree_node_base"* %0) + %coerce.dive = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %retval, i32 0, i32 0 + %1 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %coerce.dive, align 8 + ret %"struct.std::_Rb_tree_node_base"* %1 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local i64 @_ZNKSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE4sizeEv(%"class.std::_Rb_tree"* %this) #6 comdat align 2 { +entry: + %this.addr = alloca %"class.std::_Rb_tree"*, align 8 + store %"class.std::_Rb_tree"* %this, %"class.std::_Rb_tree"** %this.addr, align 8 + 
%this1 = load %"class.std::_Rb_tree"*, %"class.std::_Rb_tree"** %this.addr, align 8 + %_M_impl = getelementptr inbounds %"class.std::_Rb_tree", %"class.std::_Rb_tree"* %this1, i32 0, i32 0 + %0 = bitcast %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Rb_tree_impl"* %_M_impl to i8* + %add.ptr = getelementptr inbounds i8, i8* %0, i64 8 + %1 = bitcast i8* %add.ptr to %"struct.std::_Rb_tree_header"* + %_M_node_count = getelementptr inbounds %"struct.std::_Rb_tree_header", %"struct.std::_Rb_tree_header"* %1, i32 0, i32 1 + %2 = load i64, i64* %_M_node_count, align 8 + ret i64 %2 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local dereferenceable(1) i8* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE6_S_keyEPKSt18_Rb_tree_node_base(%"struct.std::_Rb_tree_node_base"* %__x) #0 comdat align 2 { +entry: + %__x.addr = alloca %"struct.std::_Rb_tree_node_base"*, align 8 + %ref.tmp = alloca %"struct.std::_Select1st", align 1 + store %"struct.std::_Rb_tree_node_base"* %__x, %"struct.std::_Rb_tree_node_base"** %__x.addr, align 8 + %0 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %__x.addr, align 8 + %call = call dereferenceable(48) %"struct.std::pair"* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE8_S_valueEPKSt18_Rb_tree_node_base(%"struct.std::_Rb_tree_node_base"* %0) + %call1 = call dereferenceable(1) i8* @_ZNKSt10_Select1stISt4pairIKhSt6vectorIbSaIbEEEEclERKS5_(%"struct.std::_Select1st"* %ref.tmp, %"struct.std::pair"* dereferenceable(48) %call) + ret i8* %call1 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local dereferenceable(8) %"struct.std::_Rb_tree_node_base"** @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE12_M_rightmostEv(%"class.std::_Rb_tree"* %this) #6 comdat align 2 { +entry: + %this.addr = alloca %"class.std::_Rb_tree"*, 
align 8 + store %"class.std::_Rb_tree"* %this, %"class.std::_Rb_tree"** %this.addr, align 8 + %this1 = load %"class.std::_Rb_tree"*, %"class.std::_Rb_tree"** %this.addr, align 8 + %_M_impl = getelementptr inbounds %"class.std::_Rb_tree", %"class.std::_Rb_tree"* %this1, i32 0, i32 0 + %0 = bitcast %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Rb_tree_impl"* %_M_impl to i8* + %add.ptr = getelementptr inbounds i8, i8* %0, i64 8 + %1 = bitcast i8* %add.ptr to %"struct.std::_Rb_tree_header"* + %_M_header = getelementptr inbounds %"struct.std::_Rb_tree_header", %"struct.std::_Rb_tree_header"* %1, i32 0, i32 0 + %_M_right = getelementptr inbounds %"struct.std::_Rb_tree_node_base", %"struct.std::_Rb_tree_node_base"* %_M_header, i32 0, i32 3 + ret %"struct.std::_Rb_tree_node_base"** %_M_right +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZNSt4pairIPSt18_Rb_tree_node_baseS1_EC2ERKS1_S4_(%"struct.std::pair.11"* %this, %"struct.std::_Rb_tree_node_base"** dereferenceable(8) %__a, %"struct.std::_Rb_tree_node_base"** dereferenceable(8) %__b) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::pair.11"*, align 8 + %__a.addr = alloca %"struct.std::_Rb_tree_node_base"**, align 8 + %__b.addr = alloca %"struct.std::_Rb_tree_node_base"**, align 8 + store %"struct.std::pair.11"* %this, %"struct.std::pair.11"** %this.addr, align 8 + store %"struct.std::_Rb_tree_node_base"** %__a, %"struct.std::_Rb_tree_node_base"*** %__a.addr, align 8 + store %"struct.std::_Rb_tree_node_base"** %__b, %"struct.std::_Rb_tree_node_base"*** %__b.addr, align 8 + %this1 = load %"struct.std::pair.11"*, %"struct.std::pair.11"** %this.addr, align 8 + %0 = bitcast %"struct.std::pair.11"* %this1 to %"class.std::__pair_base.12"* + %first = getelementptr inbounds %"struct.std::pair.11", %"struct.std::pair.11"* %this1, i32 0, i32 0 + %1 = load %"struct.std::_Rb_tree_node_base"**, 
%"struct.std::_Rb_tree_node_base"*** %__a.addr, align 8 + %2 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %1, align 8 + store %"struct.std::_Rb_tree_node_base"* %2, %"struct.std::_Rb_tree_node_base"** %first, align 8 + %second = getelementptr inbounds %"struct.std::pair.11", %"struct.std::pair.11"* %this1, i32 0, i32 1 + %3 = load %"struct.std::_Rb_tree_node_base"**, %"struct.std::_Rb_tree_node_base"*** %__b.addr, align 8 + %4 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %3, align 8 + store %"struct.std::_Rb_tree_node_base"* %4, %"struct.std::_Rb_tree_node_base"** %second, align 8 + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local { %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* } @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE24_M_get_insert_unique_posERS1_(%"class.std::_Rb_tree"* %this, i8* dereferenceable(1) %__k) #0 comdat align 2 { +entry: + %retval = alloca %"struct.std::pair.11", align 8 + %this.addr = alloca %"class.std::_Rb_tree"*, align 8 + %__k.addr = alloca i8*, align 8 + %__x = alloca %"struct.std::_Rb_tree_node"*, align 8 + %__y = alloca %"struct.std::_Rb_tree_node_base"*, align 8 + %__comp = alloca i8, align 1 + %__j = alloca %"struct.std::_Rb_tree_iterator", align 8 + %ref.tmp = alloca %"struct.std::_Rb_tree_iterator", align 8 + %ref.tmp11 = alloca %"struct.std::_Rb_tree_node_base"*, align 8 + %ref.tmp19 = alloca %"struct.std::_Rb_tree_node_base"*, align 8 + %ref.tmp22 = alloca %"struct.std::_Rb_tree_node_base"*, align 8 + store %"class.std::_Rb_tree"* %this, %"class.std::_Rb_tree"** %this.addr, align 8 + store i8* %__k, i8** %__k.addr, align 8 + %this1 = load %"class.std::_Rb_tree"*, %"class.std::_Rb_tree"** %this.addr, align 8 + %call = call %"struct.std::_Rb_tree_node"* 
@_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE8_M_beginEv(%"class.std::_Rb_tree"* %this1) + store %"struct.std::_Rb_tree_node"* %call, %"struct.std::_Rb_tree_node"** %__x, align 8 + %call2 = call %"struct.std::_Rb_tree_node_base"* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE6_M_endEv(%"class.std::_Rb_tree"* %this1) + store %"struct.std::_Rb_tree_node_base"* %call2, %"struct.std::_Rb_tree_node_base"** %__y, align 8 + store i8 1, i8* %__comp, align 1 + br label %while.cond + +while.cond: ; preds = %cond.end, %entry + %0 = load %"struct.std::_Rb_tree_node"*, %"struct.std::_Rb_tree_node"** %__x, align 8 + %cmp = icmp ne %"struct.std::_Rb_tree_node"* %0, null + br i1 %cmp, label %while.body, label %while.end + +while.body: ; preds = %while.cond + %1 = load %"struct.std::_Rb_tree_node"*, %"struct.std::_Rb_tree_node"** %__x, align 8 + %2 = bitcast %"struct.std::_Rb_tree_node"* %1 to %"struct.std::_Rb_tree_node_base"* + store %"struct.std::_Rb_tree_node_base"* %2, %"struct.std::_Rb_tree_node_base"** %__y, align 8 + %_M_impl = getelementptr inbounds %"class.std::_Rb_tree", %"class.std::_Rb_tree"* %this1, i32 0, i32 0 + %3 = bitcast %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Rb_tree_impl"* %_M_impl to %"struct.std::_Rb_tree_key_compare"* + %_M_key_compare = getelementptr inbounds %"struct.std::_Rb_tree_key_compare", %"struct.std::_Rb_tree_key_compare"* %3, i32 0, i32 0 + %4 = load i8*, i8** %__k.addr, align 8 + %5 = load %"struct.std::_Rb_tree_node"*, %"struct.std::_Rb_tree_node"** %__x, align 8 + %call3 = call dereferenceable(1) i8* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE6_S_keyEPKSt13_Rb_tree_nodeIS5_E(%"struct.std::_Rb_tree_node"* %5) + %call4 = call zeroext i1 @_ZNKSt4lessIhEclERKhS2_(%"struct.std::less"* %_M_key_compare, i8* dereferenceable(1) %4, i8* dereferenceable(1) %call3) + %frombool = zext i1 %call4 to i8 
+ store i8 %frombool, i8* %__comp, align 1 + %6 = load i8, i8* %__comp, align 1 + %tobool = trunc i8 %6 to i1 + br i1 %tobool, label %cond.true, label %cond.false + +cond.true: ; preds = %while.body + %7 = load %"struct.std::_Rb_tree_node"*, %"struct.std::_Rb_tree_node"** %__x, align 8 + %8 = bitcast %"struct.std::_Rb_tree_node"* %7 to %"struct.std::_Rb_tree_node_base"* + %call5 = call %"struct.std::_Rb_tree_node"* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE7_S_leftEPSt18_Rb_tree_node_base(%"struct.std::_Rb_tree_node_base"* %8) + br label %cond.end + +cond.false: ; preds = %while.body + %9 = load %"struct.std::_Rb_tree_node"*, %"struct.std::_Rb_tree_node"** %__x, align 8 + %10 = bitcast %"struct.std::_Rb_tree_node"* %9 to %"struct.std::_Rb_tree_node_base"* + %call6 = call %"struct.std::_Rb_tree_node"* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE8_S_rightEPSt18_Rb_tree_node_base(%"struct.std::_Rb_tree_node_base"* %10) + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi %"struct.std::_Rb_tree_node"* [ %call5, %cond.true ], [ %call6, %cond.false ] + store %"struct.std::_Rb_tree_node"* %cond, %"struct.std::_Rb_tree_node"** %__x, align 8 + br label %while.cond + +while.end: ; preds = %while.cond + %11 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %__y, align 8 + call void @_ZNSt17_Rb_tree_iteratorISt4pairIKhSt6vectorIbSaIbEEEEC2EPSt18_Rb_tree_node_base(%"struct.std::_Rb_tree_iterator"* %__j, %"struct.std::_Rb_tree_node_base"* %11) + %12 = load i8, i8* %__comp, align 1 + %tobool7 = trunc i8 %12 to i1 + br i1 %tobool7, label %if.then, label %if.end13 + +if.then: ; preds = %while.end + %call8 = call %"struct.std::_Rb_tree_node_base"* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE5beginEv(%"class.std::_Rb_tree"* %this1) + %coerce.dive = getelementptr inbounds %"struct.std::_Rb_tree_iterator", 
%"struct.std::_Rb_tree_iterator"* %ref.tmp, i32 0, i32 0 + store %"struct.std::_Rb_tree_node_base"* %call8, %"struct.std::_Rb_tree_node_base"** %coerce.dive, align 8 + %call9 = call zeroext i1 @_ZNKSt17_Rb_tree_iteratorISt4pairIKhSt6vectorIbSaIbEEEEeqERKS6_(%"struct.std::_Rb_tree_iterator"* %__j, %"struct.std::_Rb_tree_iterator"* dereferenceable(8) %ref.tmp) + br i1 %call9, label %if.then10, label %if.else + +if.then10: ; preds = %if.then + %13 = load %"struct.std::_Rb_tree_node"*, %"struct.std::_Rb_tree_node"** %__x, align 8 + %14 = bitcast %"struct.std::_Rb_tree_node"* %13 to %"struct.std::_Rb_tree_node_base"* + store %"struct.std::_Rb_tree_node_base"* %14, %"struct.std::_Rb_tree_node_base"** %ref.tmp11, align 8 + call void @_ZNSt4pairIPSt18_Rb_tree_node_baseS1_EC2ERKS1_S4_(%"struct.std::pair.11"* %retval, %"struct.std::_Rb_tree_node_base"** dereferenceable(8) %ref.tmp11, %"struct.std::_Rb_tree_node_base"** dereferenceable(8) %__y) + br label %return + +if.else: ; preds = %if.then + %call12 = call dereferenceable(8) %"struct.std::_Rb_tree_iterator"* @_ZNSt17_Rb_tree_iteratorISt4pairIKhSt6vectorIbSaIbEEEEmmEv(%"struct.std::_Rb_tree_iterator"* %__j) + br label %if.end + +if.end: ; preds = %if.else + br label %if.end13 + +if.end13: ; preds = %if.end, %while.end + %_M_impl14 = getelementptr inbounds %"class.std::_Rb_tree", %"class.std::_Rb_tree"* %this1, i32 0, i32 0 + %15 = bitcast %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Rb_tree_impl"* %_M_impl14 to %"struct.std::_Rb_tree_key_compare"* + %_M_key_compare15 = getelementptr inbounds %"struct.std::_Rb_tree_key_compare", %"struct.std::_Rb_tree_key_compare"* %15, i32 0, i32 0 + %_M_node = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %__j, i32 0, i32 0 + %16 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %_M_node, align 8 + %call16 = call dereferenceable(1) i8* 
@_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE6_S_keyEPKSt18_Rb_tree_node_base(%"struct.std::_Rb_tree_node_base"* %16) + %17 = load i8*, i8** %__k.addr, align 8 + %call17 = call zeroext i1 @_ZNKSt4lessIhEclERKhS2_(%"struct.std::less"* %_M_key_compare15, i8* dereferenceable(1) %call16, i8* dereferenceable(1) %17) + br i1 %call17, label %if.then18, label %if.end20 + +if.then18: ; preds = %if.end13 + %18 = load %"struct.std::_Rb_tree_node"*, %"struct.std::_Rb_tree_node"** %__x, align 8 + %19 = bitcast %"struct.std::_Rb_tree_node"* %18 to %"struct.std::_Rb_tree_node_base"* + store %"struct.std::_Rb_tree_node_base"* %19, %"struct.std::_Rb_tree_node_base"** %ref.tmp19, align 8 + call void @_ZNSt4pairIPSt18_Rb_tree_node_baseS1_EC2ERKS1_S4_(%"struct.std::pair.11"* %retval, %"struct.std::_Rb_tree_node_base"** dereferenceable(8) %ref.tmp19, %"struct.std::_Rb_tree_node_base"** dereferenceable(8) %__y) + br label %return + +if.end20: ; preds = %if.end13 + %_M_node21 = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %__j, i32 0, i32 0 + store %"struct.std::_Rb_tree_node_base"* null, %"struct.std::_Rb_tree_node_base"** %ref.tmp22, align 8 + call void @_ZNSt4pairIPSt18_Rb_tree_node_baseS1_EC2ERKS1_S4_(%"struct.std::pair.11"* %retval, %"struct.std::_Rb_tree_node_base"** dereferenceable(8) %_M_node21, %"struct.std::_Rb_tree_node_base"** dereferenceable(8) %ref.tmp22) + br label %return + +return: ; preds = %if.end20, %if.then18, %if.then10 + %20 = bitcast %"struct.std::pair.11"* %retval to { %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* }* + %21 = load { %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* }, { %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* }* %20, align 8 + ret { %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* } %21 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr 
dso_local dereferenceable(8) %"struct.std::_Rb_tree_node_base"** @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE11_M_leftmostEv(%"class.std::_Rb_tree"* %this) #6 comdat align 2 { +entry: + %this.addr = alloca %"class.std::_Rb_tree"*, align 8 + store %"class.std::_Rb_tree"* %this, %"class.std::_Rb_tree"** %this.addr, align 8 + %this1 = load %"class.std::_Rb_tree"*, %"class.std::_Rb_tree"** %this.addr, align 8 + %_M_impl = getelementptr inbounds %"class.std::_Rb_tree", %"class.std::_Rb_tree"* %this1, i32 0, i32 0 + %0 = bitcast %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Rb_tree_impl"* %_M_impl to i8* + %add.ptr = getelementptr inbounds i8, i8* %0, i64 8 + %1 = bitcast i8* %add.ptr to %"struct.std::_Rb_tree_header"* + %_M_header = getelementptr inbounds %"struct.std::_Rb_tree_header", %"struct.std::_Rb_tree_header"* %1, i32 0, i32 0 + %_M_left = getelementptr inbounds %"struct.std::_Rb_tree_node_base", %"struct.std::_Rb_tree_node_base"* %_M_header, i32 0, i32 2 + ret %"struct.std::_Rb_tree_node_base"** %_M_left +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local dereferenceable(8) %"struct.std::_Rb_tree_iterator"* @_ZNSt17_Rb_tree_iteratorISt4pairIKhSt6vectorIbSaIbEEEEmmEv(%"struct.std::_Rb_tree_iterator"* %this) #6 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Rb_tree_iterator"*, align 8 + store %"struct.std::_Rb_tree_iterator"* %this, %"struct.std::_Rb_tree_iterator"** %this.addr, align 8 + %this1 = load %"struct.std::_Rb_tree_iterator"*, %"struct.std::_Rb_tree_iterator"** %this.addr, align 8 + %_M_node = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %this1, i32 0, i32 0 + %0 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %_M_node, align 8 + %call = call %"struct.std::_Rb_tree_node_base"* 
@_ZSt18_Rb_tree_decrementPSt18_Rb_tree_node_base(%"struct.std::_Rb_tree_node_base"* %0) #10 + %_M_node2 = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %this1, i32 0, i32 0 + store %"struct.std::_Rb_tree_node_base"* %call, %"struct.std::_Rb_tree_node_base"** %_M_node2, align 8 + ret %"struct.std::_Rb_tree_iterator"* %this1 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local dereferenceable(8) %"struct.std::_Rb_tree_iterator"* @_ZNSt17_Rb_tree_iteratorISt4pairIKhSt6vectorIbSaIbEEEEppEv(%"struct.std::_Rb_tree_iterator"* %this) #6 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Rb_tree_iterator"*, align 8 + store %"struct.std::_Rb_tree_iterator"* %this, %"struct.std::_Rb_tree_iterator"** %this.addr, align 8 + %this1 = load %"struct.std::_Rb_tree_iterator"*, %"struct.std::_Rb_tree_iterator"** %this.addr, align 8 + %_M_node = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %this1, i32 0, i32 0 + %0 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %_M_node, align 8 + %call = call %"struct.std::_Rb_tree_node_base"* @_ZSt18_Rb_tree_incrementPSt18_Rb_tree_node_base(%"struct.std::_Rb_tree_node_base"* %0) #10 + %_M_node2 = getelementptr inbounds %"struct.std::_Rb_tree_iterator", %"struct.std::_Rb_tree_iterator"* %this1, i32 0, i32 0 + store %"struct.std::_Rb_tree_node_base"* %call, %"struct.std::_Rb_tree_node_base"** %_M_node2, align 8 + ret %"struct.std::_Rb_tree_iterator"* %this1 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local dereferenceable(48) %"struct.std::pair"* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE8_S_valueEPKSt18_Rb_tree_node_base(%"struct.std::_Rb_tree_node_base"* %__x) #0 comdat align 2 { +entry: + %__x.addr = alloca %"struct.std::_Rb_tree_node_base"*, align 8 + store %"struct.std::_Rb_tree_node_base"* %__x, 
%"struct.std::_Rb_tree_node_base"** %__x.addr, align 8 + %0 = load %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"** %__x.addr, align 8 + %1 = bitcast %"struct.std::_Rb_tree_node_base"* %0 to %"struct.std::_Rb_tree_node"* + %call = call %"struct.std::pair"* @_ZNKSt13_Rb_tree_nodeISt4pairIKhSt6vectorIbSaIbEEEE9_M_valptrEv(%"struct.std::_Rb_tree_node"* %1) + ret %"struct.std::pair"* %call +} + +; Function Attrs: nounwind readonly +declare dso_local %"struct.std::_Rb_tree_node_base"* @_ZSt18_Rb_tree_decrementPSt18_Rb_tree_node_base(%"struct.std::_Rb_tree_node_base"*) #14 + +; Function Attrs: nounwind readonly +declare dso_local %"struct.std::_Rb_tree_node_base"* @_ZSt18_Rb_tree_incrementPSt18_Rb_tree_node_base(%"struct.std::_Rb_tree_node_base"*) #14 + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %"struct.std::_Rb_tree_node"* @_ZNKSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE11_Alloc_nodeclIS5_EEPSt13_Rb_tree_nodeIS5_ERKT_(%"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Alloc_node"* %this, %"struct.std::pair"* dereferenceable(48) %__arg) #0 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Alloc_node"*, align 8 + %__arg.addr = alloca %"struct.std::pair"*, align 8 + store %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Alloc_node"* %this, %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Alloc_node"** %this.addr, align 8 + store %"struct.std::pair"* %__arg, %"struct.std::pair"** %__arg.addr, align 8 + %this1 = load %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Alloc_node"*, %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Alloc_node"** %this.addr, align 8 + %_M_t = getelementptr inbounds %"struct.std::_Rb_tree > 
>, std::_Select1st > > >, std::less, std::allocator > > > >::_Alloc_node", %"struct.std::_Rb_tree > >, std::_Select1st > > >, std::less, std::allocator > > > >::_Alloc_node"* %this1, i32 0, i32 0 + %0 = load %"class.std::_Rb_tree"*, %"class.std::_Rb_tree"** %_M_t, align 8 + %1 = load %"struct.std::pair"*, %"struct.std::pair"** %__arg.addr, align 8 + %call = call %"struct.std::_Rb_tree_node"* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE14_M_create_nodeERKS5_(%"class.std::_Rb_tree"* %0, %"struct.std::pair"* dereferenceable(48) %1) + ret %"struct.std::_Rb_tree_node"* %call +} + +; Function Attrs: nounwind +declare dso_local void @_ZSt29_Rb_tree_insert_and_rebalancebPSt18_Rb_tree_node_baseS0_RS_(i1 zeroext, %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"*, %"struct.std::_Rb_tree_node_base"* dereferenceable(32)) #11 + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %"struct.std::_Rb_tree_node"* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE14_M_create_nodeERKS5_(%"class.std::_Rb_tree"* %this, %"struct.std::pair"* dereferenceable(48) %__x) #0 comdat align 2 { +entry: + %this.addr = alloca %"class.std::_Rb_tree"*, align 8 + %__x.addr = alloca %"struct.std::pair"*, align 8 + %__tmp = alloca %"struct.std::_Rb_tree_node"*, align 8 + store %"class.std::_Rb_tree"* %this, %"class.std::_Rb_tree"** %this.addr, align 8 + store %"struct.std::pair"* %__x, %"struct.std::pair"** %__x.addr, align 8 + %this1 = load %"class.std::_Rb_tree"*, %"class.std::_Rb_tree"** %this.addr, align 8 + %call = call %"struct.std::_Rb_tree_node"* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE11_M_get_nodeEv(%"class.std::_Rb_tree"* %this1) + store %"struct.std::_Rb_tree_node"* %call, %"struct.std::_Rb_tree_node"** %__tmp, align 8 + %0 = load %"struct.std::_Rb_tree_node"*, %"struct.std::_Rb_tree_node"** %__tmp, align 8 + %1 = load 
%"struct.std::pair"*, %"struct.std::pair"** %__x.addr, align 8 + call void @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE17_M_construct_nodeEPSt13_Rb_tree_nodeIS5_ERKS5_(%"class.std::_Rb_tree"* %this1, %"struct.std::_Rb_tree_node"* %0, %"struct.std::pair"* dereferenceable(48) %1) + %2 = load %"struct.std::_Rb_tree_node"*, %"struct.std::_Rb_tree_node"** %__tmp, align 8 + ret %"struct.std::_Rb_tree_node"* %2 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %"struct.std::_Rb_tree_node"* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE11_M_get_nodeEv(%"class.std::_Rb_tree"* %this) #0 comdat align 2 { +entry: + %this.addr = alloca %"class.std::_Rb_tree"*, align 8 + store %"class.std::_Rb_tree"* %this, %"class.std::_Rb_tree"** %this.addr, align 8 + %this1 = load %"class.std::_Rb_tree"*, %"class.std::_Rb_tree"** %this.addr, align 8 + %call = call dereferenceable(1) %"class.std::allocator.4"* @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE21_M_get_Node_allocatorEv(%"class.std::_Rb_tree"* %this1) + %call2 = call %"struct.std::_Rb_tree_node"* @_ZN9__gnu_cxx14__alloc_traitsISaISt13_Rb_tree_nodeISt4pairIKhSt6vectorIbSaIbEEEEEE8allocateERS9_m(%"class.std::allocator.4"* dereferenceable(1) %call, i64 1) + ret %"struct.std::_Rb_tree_node"* %call2 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE17_M_construct_nodeEPSt13_Rb_tree_nodeIS5_ERKS5_(%"class.std::_Rb_tree"* %this, %"struct.std::_Rb_tree_node"* %__node, %"struct.std::pair"* dereferenceable(48) %__x) #0 comdat align 2 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %this.addr = alloca %"class.std::_Rb_tree"*, align 8 + %__node.addr = alloca %"struct.std::_Rb_tree_node"*, align 8 + %__x.addr = alloca %"struct.std::pair"*, align 8 + %ref.tmp = 
alloca %"class.std::allocator.7", align 1 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + store %"class.std::_Rb_tree"* %this, %"class.std::_Rb_tree"** %this.addr, align 8 + store %"struct.std::_Rb_tree_node"* %__node, %"struct.std::_Rb_tree_node"** %__node.addr, align 8 + store %"struct.std::pair"* %__x, %"struct.std::pair"** %__x.addr, align 8 + %this1 = load %"class.std::_Rb_tree"*, %"class.std::_Rb_tree"** %this.addr, align 8 + invoke void @_ZNKSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE13get_allocatorEv(%"class.std::allocator.7"* sret %ref.tmp, %"class.std::_Rb_tree"* %this1) + to label %invoke.cont unwind label %lpad + +invoke.cont: ; preds = %entry + %0 = bitcast %"class.std::allocator.7"* %ref.tmp to %"class.__gnu_cxx::new_allocator.8"* + %1 = load %"struct.std::_Rb_tree_node"*, %"struct.std::_Rb_tree_node"** %__node.addr, align 8 + %call = invoke %"struct.std::pair"* @_ZNSt13_Rb_tree_nodeISt4pairIKhSt6vectorIbSaIbEEEE9_M_valptrEv(%"struct.std::_Rb_tree_node"* %1) + to label %invoke.cont3 unwind label %lpad2 + +invoke.cont3: ; preds = %invoke.cont + %2 = load %"struct.std::pair"*, %"struct.std::pair"** %__x.addr, align 8 + invoke void @_ZN9__gnu_cxx13new_allocatorISt4pairIKhSt6vectorIbSaIbEEEE9constructEPS6_RKS6_(%"class.__gnu_cxx::new_allocator.8"* %0, %"struct.std::pair"* %call, %"struct.std::pair"* dereferenceable(48) %2) + to label %invoke.cont4 unwind label %lpad2 + +invoke.cont4: ; preds = %invoke.cont3 + call void @_ZNSaISt4pairIKhSt6vectorIbSaIbEEEED2Ev(%"class.std::allocator.7"* %ref.tmp) #3 + br label %try.cont + +lpad: ; preds = %entry + %3 = landingpad { i8*, i32 } + catch i8* null + %4 = extractvalue { i8*, i32 } %3, 0 + store i8* %4, i8** %exn.slot, align 8 + %5 = extractvalue { i8*, i32 } %3, 1 + store i32 %5, i32* %ehselector.slot, align 4 + br label %catch + +lpad2: ; preds = %invoke.cont3, %invoke.cont + %6 = landingpad { i8*, i32 } + catch i8* null + %7 = extractvalue { i8*, i32 } %6, 0 + 
store i8* %7, i8** %exn.slot, align 8 + %8 = extractvalue { i8*, i32 } %6, 1 + store i32 %8, i32* %ehselector.slot, align 4 + call void @_ZNSaISt4pairIKhSt6vectorIbSaIbEEEED2Ev(%"class.std::allocator.7"* %ref.tmp) #3 + br label %catch + +catch: ; preds = %lpad2, %lpad + %exn = load i8*, i8** %exn.slot, align 8 + %9 = call i8* @__cxa_begin_catch(i8* %exn) #3 + %10 = load %"struct.std::_Rb_tree_node"*, %"struct.std::_Rb_tree_node"** %__node.addr, align 8 + invoke void @_ZNSt8_Rb_treeIhSt4pairIKhSt6vectorIbSaIbEEESt10_Select1stIS5_ESt4lessIhESaIS5_EE11_M_put_nodeEPSt13_Rb_tree_nodeIS5_E(%"class.std::_Rb_tree"* %this1, %"struct.std::_Rb_tree_node"* %10) + to label %invoke.cont6 unwind label %lpad5 + +invoke.cont6: ; preds = %catch + invoke void @__cxa_rethrow() #19 + to label %unreachable unwind label %lpad5 + +lpad5: ; preds = %invoke.cont6, %catch + %11 = landingpad { i8*, i32 } + cleanup + %12 = extractvalue { i8*, i32 } %11, 0 + store i8* %12, i8** %exn.slot, align 8 + %13 = extractvalue { i8*, i32 } %11, 1 + store i32 %13, i32* %ehselector.slot, align 4 + invoke void @__cxa_end_catch() + to label %invoke.cont7 unwind label %terminate.lpad + +invoke.cont7: ; preds = %lpad5 + br label %eh.resume + +try.cont: ; preds = %invoke.cont4 + ret void + +eh.resume: ; preds = %invoke.cont7 + %exn8 = load i8*, i8** %exn.slot, align 8 + %sel = load i32, i32* %ehselector.slot, align 4 + %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn8, 0 + %lpad.val9 = insertvalue { i8*, i32 } %lpad.val, i32 %sel, 1 + resume { i8*, i32 } %lpad.val9 + +terminate.lpad: ; preds = %lpad5 + %14 = landingpad { i8*, i32 } + catch i8* null + %15 = extractvalue { i8*, i32 } %14, 0 + call void @__clang_call_terminate(i8* %15) #16 + unreachable + +unreachable: ; preds = %invoke.cont6 + unreachable +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %"struct.std::_Rb_tree_node"* 
@_ZN9__gnu_cxx14__alloc_traitsISaISt13_Rb_tree_nodeISt4pairIKhSt6vectorIbSaIbEEEEEE8allocateERS9_m(%"class.std::allocator.4"* dereferenceable(1) %__a, i64 %__n) #0 comdat align 2 { +entry: + %__a.addr = alloca %"class.std::allocator.4"*, align 8 + %__n.addr = alloca i64, align 8 + store %"class.std::allocator.4"* %__a, %"class.std::allocator.4"** %__a.addr, align 8 + store i64 %__n, i64* %__n.addr, align 8 + %0 = load %"class.std::allocator.4"*, %"class.std::allocator.4"** %__a.addr, align 8 + %1 = bitcast %"class.std::allocator.4"* %0 to %"class.__gnu_cxx::new_allocator.5"* + %2 = load i64, i64* %__n.addr, align 8 + %call = call %"struct.std::_Rb_tree_node"* @_ZN9__gnu_cxx13new_allocatorISt13_Rb_tree_nodeISt4pairIKhSt6vectorIbSaIbEEEEE8allocateEmPKv(%"class.__gnu_cxx::new_allocator.5"* %1, i64 %2, i8* null) + ret %"struct.std::_Rb_tree_node"* %call +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %"struct.std::_Rb_tree_node"* @_ZN9__gnu_cxx13new_allocatorISt13_Rb_tree_nodeISt4pairIKhSt6vectorIbSaIbEEEEE8allocateEmPKv(%"class.__gnu_cxx::new_allocator.5"* %this, i64 %__n, i8* %0) #0 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::new_allocator.5"*, align 8 + %__n.addr = alloca i64, align 8 + %.addr = alloca i8*, align 8 + store %"class.__gnu_cxx::new_allocator.5"* %this, %"class.__gnu_cxx::new_allocator.5"** %this.addr, align 8 + store i64 %__n, i64* %__n.addr, align 8 + store i8* %0, i8** %.addr, align 8 + %this1 = load %"class.__gnu_cxx::new_allocator.5"*, %"class.__gnu_cxx::new_allocator.5"** %this.addr, align 8 + %1 = load i64, i64* %__n.addr, align 8 + %call = call i64 @_ZNK9__gnu_cxx13new_allocatorISt13_Rb_tree_nodeISt4pairIKhSt6vectorIbSaIbEEEEE8max_sizeEv(%"class.__gnu_cxx::new_allocator.5"* %this1) #3 + %cmp = icmp ugt i64 %1, %call + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + call void @_ZSt17__throw_bad_allocv() #19 + unreachable + +if.end: ; preds = %entry + %2 = load i64, 
i64* %__n.addr, align 8 + %mul = mul i64 %2, 80 + %call2 = call i8* @_Znwm(i64 %mul) + %3 = bitcast i8* %call2 to %"struct.std::_Rb_tree_node"* + ret %"struct.std::_Rb_tree_node"* %3 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local i64 @_ZNK9__gnu_cxx13new_allocatorISt13_Rb_tree_nodeISt4pairIKhSt6vectorIbSaIbEEEEE8max_sizeEv(%"class.__gnu_cxx::new_allocator.5"* %this) #6 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::new_allocator.5"*, align 8 + store %"class.__gnu_cxx::new_allocator.5"* %this, %"class.__gnu_cxx::new_allocator.5"** %this.addr, align 8 + %this1 = load %"class.__gnu_cxx::new_allocator.5"*, %"class.__gnu_cxx::new_allocator.5"** %this.addr, align 8 + ret i64 230584300921369395 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx13new_allocatorISt4pairIKhSt6vectorIbSaIbEEEE9constructEPS6_RKS6_(%"class.__gnu_cxx::new_allocator.8"* %this, %"struct.std::pair"* %__p, %"struct.std::pair"* dereferenceable(48) %__val) #0 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::new_allocator.8"*, align 8 + %__p.addr = alloca %"struct.std::pair"*, align 8 + %__val.addr = alloca %"struct.std::pair"*, align 8 + store %"class.__gnu_cxx::new_allocator.8"* %this, %"class.__gnu_cxx::new_allocator.8"** %this.addr, align 8 + store %"struct.std::pair"* %__p, %"struct.std::pair"** %__p.addr, align 8 + store %"struct.std::pair"* %__val, %"struct.std::pair"** %__val.addr, align 8 + %this1 = load %"class.__gnu_cxx::new_allocator.8"*, %"class.__gnu_cxx::new_allocator.8"** %this.addr, align 8 + %0 = load %"struct.std::pair"*, %"struct.std::pair"** %__p.addr, align 8 + %1 = bitcast %"struct.std::pair"* %0 to i8* + %2 = bitcast i8* %1 to %"struct.std::pair"* + %3 = load %"struct.std::pair"*, %"struct.std::pair"** %__val.addr, align 8 + call void @_ZNSt4pairIKhSt6vectorIbSaIbEEEC2ERKS4_(%"struct.std::pair"* %2, %"struct.std::pair"* dereferenceable(48) %3) + ret void 
+} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt4pairIKhSt6vectorIbSaIbEEEC2ERKS4_(%"struct.std::pair"* %this, %"struct.std::pair"* dereferenceable(48) %0) unnamed_addr #0 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::pair"*, align 8 + %.addr = alloca %"struct.std::pair"*, align 8 + store %"struct.std::pair"* %this, %"struct.std::pair"** %this.addr, align 8 + store %"struct.std::pair"* %0, %"struct.std::pair"** %.addr, align 8 + %this1 = load %"struct.std::pair"*, %"struct.std::pair"** %this.addr, align 8 + %1 = bitcast %"struct.std::pair"* %this1 to %"class.std::__pair_base"* + %2 = load %"struct.std::pair"*, %"struct.std::pair"** %.addr, align 8 + %3 = bitcast %"struct.std::pair"* %2 to %"class.std::__pair_base"* + %first = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %this1, i32 0, i32 0 + %4 = load %"struct.std::pair"*, %"struct.std::pair"** %.addr, align 8 + %first2 = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %4, i32 0, i32 0 + %5 = load i8, i8* %first2, align 8 + store i8 %5, i8* %first, align 8 + %second = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %this1, i32 0, i32 1 + %6 = load %"struct.std::pair"*, %"struct.std::pair"** %.addr, align 8 + %second3 = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %6, i32 0, i32 1 + call void @_ZNSt6vectorIbSaIbEEC2ERKS1_(%"class.std::vector.0"* %second, %"class.std::vector.0"* dereferenceable(40) %second3) + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local i64 @_ZNKSt6vectorIbSaIbEE4sizeEv(%"class.std::vector.0"* %this) #0 comdat align 2 { +entry: + %this.addr = alloca %"class.std::vector.0"*, align 8 + %ref.tmp = alloca %"struct.std::_Bit_const_iterator", align 8 + %ref.tmp2 = alloca %"struct.std::_Bit_const_iterator", align 8 + store %"class.std::vector.0"* %this, %"class.std::vector.0"** %this.addr, align 8 + %this1 = load 
%"class.std::vector.0"*, %"class.std::vector.0"** %this.addr, align 8 + %call = call { i64*, i32 } @_ZNKSt6vectorIbSaIbEE3endEv(%"class.std::vector.0"* %this1) + %0 = bitcast %"struct.std::_Bit_const_iterator"* %ref.tmp to { i64*, i32 }* + %1 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %0, i32 0, i32 0 + %2 = extractvalue { i64*, i32 } %call, 0 + store i64* %2, i64** %1, align 8 + %3 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %0, i32 0, i32 1 + %4 = extractvalue { i64*, i32 } %call, 1 + store i32 %4, i32* %3, align 8 + %5 = bitcast %"struct.std::_Bit_const_iterator"* %ref.tmp to %"struct.std::_Bit_iterator_base"* + %call3 = call { i64*, i32 } @_ZNKSt6vectorIbSaIbEE5beginEv(%"class.std::vector.0"* %this1) + %6 = bitcast %"struct.std::_Bit_const_iterator"* %ref.tmp2 to { i64*, i32 }* + %7 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %6, i32 0, i32 0 + %8 = extractvalue { i64*, i32 } %call3, 0 + store i64* %8, i64** %7, align 8 + %9 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %6, i32 0, i32 1 + %10 = extractvalue { i64*, i32 } %call3, 1 + store i32 %10, i32* %9, align 8 + %11 = bitcast %"struct.std::_Bit_const_iterator"* %ref.tmp2 to %"struct.std::_Bit_iterator_base"* + %call4 = call i64 @_ZStmiRKSt18_Bit_iterator_baseS1_(%"struct.std::_Bit_iterator_base"* dereferenceable(16) %5, %"struct.std::_Bit_iterator_base"* dereferenceable(16) %11) + ret i64 %call4 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local i64 @_ZNKSt6vectorIbSaIbEE8capacityEv(%"class.std::vector.0"* %this) #0 comdat align 2 { +entry: + %this.addr = alloca %"class.std::vector.0"*, align 8 + %ref.tmp = alloca %"struct.std::_Bit_const_iterator", align 8 + %ref.tmp2 = alloca %"struct.std::_Bit_const_iterator", align 8 + store %"class.std::vector.0"* %this, %"class.std::vector.0"** %this.addr, align 8 + %this1 = load %"class.std::vector.0"*, %"class.std::vector.0"** %this.addr, align 8 + %0 = bitcast %"class.std::vector.0"* %this1 to 
%"struct.std::_Bvector_base"* + %_M_impl = getelementptr inbounds %"struct.std::_Bvector_base", %"struct.std::_Bvector_base"* %0, i32 0, i32 0 + %call = call i64* @_ZNKSt13_Bvector_baseISaIbEE13_Bvector_impl11_M_end_addrEv(%"struct.std::_Bvector_base >::_Bvector_impl"* %_M_impl) + call void @_ZNSt19_Bit_const_iteratorC2EPmj(%"struct.std::_Bit_const_iterator"* %ref.tmp, i64* %call, i32 0) + %1 = bitcast %"struct.std::_Bit_const_iterator"* %ref.tmp to %"struct.std::_Bit_iterator_base"* + %call3 = call { i64*, i32 } @_ZNKSt6vectorIbSaIbEE5beginEv(%"class.std::vector.0"* %this1) + %2 = bitcast %"struct.std::_Bit_const_iterator"* %ref.tmp2 to { i64*, i32 }* + %3 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %2, i32 0, i32 0 + %4 = extractvalue { i64*, i32 } %call3, 0 + store i64* %4, i64** %3, align 8 + %5 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %2, i32 0, i32 1 + %6 = extractvalue { i64*, i32 } %call3, 1 + store i32 %6, i32* %5, align 8 + %7 = bitcast %"struct.std::_Bit_const_iterator"* %ref.tmp2 to %"struct.std::_Bit_iterator_base"* + %call4 = call i64 @_ZStmiRKSt18_Bit_iterator_baseS1_(%"struct.std::_Bit_iterator_base"* dereferenceable(16) %1, %"struct.std::_Bit_iterator_base"* dereferenceable(16) %7) + ret i64 %call4 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt13_Bvector_baseISaIbEE13_M_deallocateEv(%"struct.std::_Bvector_base"* %this) #0 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Bvector_base"*, align 8 + %__n = alloca i64, align 8 + %ref.tmp = alloca %"struct.std::_Bit_iterator", align 8 + store %"struct.std::_Bvector_base"* %this, %"struct.std::_Bvector_base"** %this.addr, align 8 + %this1 = load %"struct.std::_Bvector_base"*, %"struct.std::_Bvector_base"** %this.addr, align 8 + %_M_impl = getelementptr inbounds %"struct.std::_Bvector_base", %"struct.std::_Bvector_base"* %this1, i32 0, i32 0 + %_M_start = getelementptr inbounds %"struct.std::_Bvector_base >::_Bvector_impl", 
%"struct.std::_Bvector_base >::_Bvector_impl"* %_M_impl, i32 0, i32 0 + %0 = bitcast %"struct.std::_Bit_iterator"* %_M_start to %"struct.std::_Bit_iterator_base"* + %_M_p = getelementptr inbounds %"struct.std::_Bit_iterator_base", %"struct.std::_Bit_iterator_base"* %0, i32 0, i32 0 + %1 = load i64*, i64** %_M_p, align 8 + %tobool = icmp ne i64* %1, null + br i1 %tobool, label %if.then, label %if.end + +if.then: ; preds = %entry + %_M_impl2 = getelementptr inbounds %"struct.std::_Bvector_base", %"struct.std::_Bvector_base"* %this1, i32 0, i32 0 + %call = call i64* @_ZNKSt13_Bvector_baseISaIbEE13_Bvector_impl11_M_end_addrEv(%"struct.std::_Bvector_base >::_Bvector_impl"* %_M_impl2) + %_M_impl3 = getelementptr inbounds %"struct.std::_Bvector_base", %"struct.std::_Bvector_base"* %this1, i32 0, i32 0 + %_M_start4 = getelementptr inbounds %"struct.std::_Bvector_base >::_Bvector_impl", %"struct.std::_Bvector_base >::_Bvector_impl"* %_M_impl3, i32 0, i32 0 + %2 = bitcast %"struct.std::_Bit_iterator"* %_M_start4 to %"struct.std::_Bit_iterator_base"* + %_M_p5 = getelementptr inbounds %"struct.std::_Bit_iterator_base", %"struct.std::_Bit_iterator_base"* %2, i32 0, i32 0 + %3 = load i64*, i64** %_M_p5, align 8 + %sub.ptr.lhs.cast = ptrtoint i64* %call to i64 + %sub.ptr.rhs.cast = ptrtoint i64* %3 to i64 + %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast + %sub.ptr.div = sdiv exact i64 %sub.ptr.sub, 8 + store i64 %sub.ptr.div, i64* %__n, align 8 + %_M_impl6 = getelementptr inbounds %"struct.std::_Bvector_base", %"struct.std::_Bvector_base"* %this1, i32 0, i32 0 + %4 = bitcast %"struct.std::_Bvector_base >::_Bvector_impl"* %_M_impl6 to %"class.std::allocator.1"* + %_M_impl7 = getelementptr inbounds %"struct.std::_Bvector_base", %"struct.std::_Bvector_base"* %this1, i32 0, i32 0 + %_M_end_of_storage = getelementptr inbounds %"struct.std::_Bvector_base >::_Bvector_impl", %"struct.std::_Bvector_base >::_Bvector_impl"* %_M_impl7, i32 0, i32 2 + %5 = load i64*, i64** 
%_M_end_of_storage, align 8 + %6 = load i64, i64* %__n, align 8 + %idx.neg = sub i64 0, %6 + %add.ptr = getelementptr inbounds i64, i64* %5, i64 %idx.neg + %7 = load i64, i64* %__n, align 8 + call void @_ZN9__gnu_cxx14__alloc_traitsISaImEE10deallocateERS1_Pmm(%"class.std::allocator.1"* dereferenceable(1) %4, i64* %add.ptr, i64 %7) + call void @_ZNSt13_Bit_iteratorC2Ev(%"struct.std::_Bit_iterator"* %ref.tmp) + %_M_impl8 = getelementptr inbounds %"struct.std::_Bvector_base", %"struct.std::_Bvector_base"* %this1, i32 0, i32 0 + %_M_finish = getelementptr inbounds %"struct.std::_Bvector_base >::_Bvector_impl", %"struct.std::_Bvector_base >::_Bvector_impl"* %_M_impl8, i32 0, i32 1 + %8 = bitcast %"struct.std::_Bit_iterator"* %_M_finish to i8* + %9 = bitcast %"struct.std::_Bit_iterator"* %ref.tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %8, i8* align 8 %9, i64 12, i1 false) + %_M_impl9 = getelementptr inbounds %"struct.std::_Bvector_base", %"struct.std::_Bvector_base"* %this1, i32 0, i32 0 + %_M_start10 = getelementptr inbounds %"struct.std::_Bvector_base >::_Bvector_impl", %"struct.std::_Bvector_base >::_Bvector_impl"* %_M_impl9, i32 0, i32 0 + %10 = bitcast %"struct.std::_Bit_iterator"* %_M_start10 to i8* + %11 = bitcast %"struct.std::_Bit_iterator"* %_M_finish to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %10, i8* align 8 %11, i64 12, i1 false) + %_M_impl11 = getelementptr inbounds %"struct.std::_Bvector_base", %"struct.std::_Bvector_base"* %this1, i32 0, i32 0 + %_M_end_of_storage12 = getelementptr inbounds %"struct.std::_Bvector_base >::_Bvector_impl", %"struct.std::_Bvector_base >::_Bvector_impl"* %_M_impl11, i32 0, i32 2 + store i64* null, i64** %_M_end_of_storage12, align 8 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt6vectorIbSaIbEE13_M_initializeEm(%"class.std::vector.0"* %this, i64 %__n) #0 comdat align 2 { +entry: + 
%this.addr = alloca %"class.std::vector.0"*, align 8 + %__n.addr = alloca i64, align 8 + %__q = alloca i64*, align 8 + %ref.tmp = alloca %"struct.std::_Bit_iterator", align 8 + %ref.tmp7 = alloca %"struct.std::_Bit_iterator", align 8 + %ref.tmp10 = alloca %"struct.std::_Bit_iterator", align 8 + store %"class.std::vector.0"* %this, %"class.std::vector.0"** %this.addr, align 8 + store i64 %__n, i64* %__n.addr, align 8 + %this1 = load %"class.std::vector.0"*, %"class.std::vector.0"** %this.addr, align 8 + %0 = load i64, i64* %__n.addr, align 8 + %tobool = icmp ne i64 %0, 0 + br i1 %tobool, label %if.then, label %if.else + +if.then: ; preds = %entry + %1 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Bvector_base"* + %2 = load i64, i64* %__n.addr, align 8 + %call = call i64* @_ZNSt13_Bvector_baseISaIbEE11_M_allocateEm(%"struct.std::_Bvector_base"* %1, i64 %2) + store i64* %call, i64** %__q, align 8 + %3 = load i64*, i64** %__q, align 8 + %4 = load i64, i64* %__n.addr, align 8 + %call2 = call i64 @_ZNSt13_Bvector_baseISaIbEE8_S_nwordEm(i64 %4) + %add.ptr = getelementptr inbounds i64, i64* %3, i64 %call2 + %5 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Bvector_base"* + %_M_impl = getelementptr inbounds %"struct.std::_Bvector_base", %"struct.std::_Bvector_base"* %5, i32 0, i32 0 + %_M_end_of_storage = getelementptr inbounds %"struct.std::_Bvector_base >::_Bvector_impl", %"struct.std::_Bvector_base >::_Bvector_impl"* %_M_impl, i32 0, i32 2 + store i64* %add.ptr, i64** %_M_end_of_storage, align 8 + %6 = load i64*, i64** %__q, align 8 + %call3 = call i64* @_ZSt11__addressofImEPT_RS0_(i64* dereferenceable(8) %6) + call void @_ZNSt13_Bit_iteratorC2EPmj(%"struct.std::_Bit_iterator"* %ref.tmp, i64* %call3, i32 0) + %7 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Bvector_base"* + %_M_impl4 = getelementptr inbounds %"struct.std::_Bvector_base", %"struct.std::_Bvector_base"* %7, i32 0, i32 0 + %_M_start = getelementptr inbounds 
%"struct.std::_Bvector_base >::_Bvector_impl", %"struct.std::_Bvector_base >::_Bvector_impl"* %_M_impl4, i32 0, i32 0 + %8 = bitcast %"struct.std::_Bit_iterator"* %_M_start to i8* + %9 = bitcast %"struct.std::_Bit_iterator"* %ref.tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %8, i8* align 8 %9, i64 12, i1 false) + br label %if.end + +if.else: ; preds = %entry + %10 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Bvector_base"* + %_M_impl5 = getelementptr inbounds %"struct.std::_Bvector_base", %"struct.std::_Bvector_base"* %10, i32 0, i32 0 + %_M_end_of_storage6 = getelementptr inbounds %"struct.std::_Bvector_base >::_Bvector_impl", %"struct.std::_Bvector_base >::_Bvector_impl"* %_M_impl5, i32 0, i32 2 + store i64* null, i64** %_M_end_of_storage6, align 8 + call void @_ZNSt13_Bit_iteratorC2EPmj(%"struct.std::_Bit_iterator"* %ref.tmp7, i64* null, i32 0) + %11 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Bvector_base"* + %_M_impl8 = getelementptr inbounds %"struct.std::_Bvector_base", %"struct.std::_Bvector_base"* %11, i32 0, i32 0 + %_M_start9 = getelementptr inbounds %"struct.std::_Bvector_base >::_Bvector_impl", %"struct.std::_Bvector_base >::_Bvector_impl"* %_M_impl8, i32 0, i32 0 + %12 = bitcast %"struct.std::_Bit_iterator"* %_M_start9 to i8* + %13 = bitcast %"struct.std::_Bit_iterator"* %ref.tmp7 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %12, i8* align 8 %13, i64 12, i1 false) + br label %if.end + +if.end: ; preds = %if.else, %if.then + %14 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Bvector_base"* + %_M_impl11 = getelementptr inbounds %"struct.std::_Bvector_base", %"struct.std::_Bvector_base"* %14, i32 0, i32 0 + %_M_start12 = getelementptr inbounds %"struct.std::_Bvector_base >::_Bvector_impl", %"struct.std::_Bvector_base >::_Bvector_impl"* %_M_impl11, i32 0, i32 0 + %15 = load i64, i64* %__n.addr, align 8 + %call13 = call { i64*, i32 } 
@_ZNKSt13_Bit_iteratorplEl(%"struct.std::_Bit_iterator"* %_M_start12, i64 %15) + %16 = bitcast %"struct.std::_Bit_iterator"* %ref.tmp10 to { i64*, i32 }* + %17 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %16, i32 0, i32 0 + %18 = extractvalue { i64*, i32 } %call13, 0 + store i64* %18, i64** %17, align 8 + %19 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %16, i32 0, i32 1 + %20 = extractvalue { i64*, i32 } %call13, 1 + store i32 %20, i32* %19, align 8 + %21 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Bvector_base"* + %_M_impl14 = getelementptr inbounds %"struct.std::_Bvector_base", %"struct.std::_Bvector_base"* %21, i32 0, i32 0 + %_M_finish = getelementptr inbounds %"struct.std::_Bvector_base >::_Bvector_impl", %"struct.std::_Bvector_base >::_Bvector_impl"* %_M_impl14, i32 0, i32 1 + %22 = bitcast %"struct.std::_Bit_iterator"* %_M_finish to i8* + %23 = bitcast %"struct.std::_Bit_iterator"* %ref.tmp10 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %22, i8* align 8 %23, i64 12, i1 false) + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local { i64*, i32 } @_ZNSt6vectorIbSaIbEE15_M_copy_alignedESt19_Bit_const_iteratorS2_St13_Bit_iterator(%"class.std::vector.0"* %this, i64* %__first.coerce0, i32 %__first.coerce1, i64* %__last.coerce0, i32 %__last.coerce1, %"struct.std::_Bit_iterator"* byval(%"struct.std::_Bit_iterator") align 8 %__result) #0 comdat align 2 { +entry: + %retval = alloca %"struct.std::_Bit_iterator", align 8 + %__first = alloca %"struct.std::_Bit_const_iterator", align 8 + %__last = alloca %"struct.std::_Bit_const_iterator", align 8 + %this.addr = alloca %"class.std::vector.0"*, align 8 + %__q = alloca i64*, align 8 + %agg.tmp = alloca %"struct.std::_Bit_const_iterator", align 8 + %agg.tmp5 = alloca %"struct.std::_Bit_const_iterator", align 8 + %agg.tmp6 = alloca %"struct.std::_Bit_iterator", align 8 + %0 = bitcast %"struct.std::_Bit_const_iterator"* %__first to { i64*, i32 
}* + %1 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %0, i32 0, i32 0 + store i64* %__first.coerce0, i64** %1, align 8 + %2 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %0, i32 0, i32 1 + store i32 %__first.coerce1, i32* %2, align 8 + %3 = bitcast %"struct.std::_Bit_const_iterator"* %__last to { i64*, i32 }* + %4 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %3, i32 0, i32 0 + store i64* %__last.coerce0, i64** %4, align 8 + %5 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %3, i32 0, i32 1 + store i32 %__last.coerce1, i32* %5, align 8 + store %"class.std::vector.0"* %this, %"class.std::vector.0"** %this.addr, align 8 + %this1 = load %"class.std::vector.0"*, %"class.std::vector.0"** %this.addr, align 8 + %6 = bitcast %"struct.std::_Bit_const_iterator"* %__first to %"struct.std::_Bit_iterator_base"* + %_M_p = getelementptr inbounds %"struct.std::_Bit_iterator_base", %"struct.std::_Bit_iterator_base"* %6, i32 0, i32 0 + %7 = load i64*, i64** %_M_p, align 8 + %8 = bitcast %"struct.std::_Bit_const_iterator"* %__last to %"struct.std::_Bit_iterator_base"* + %_M_p2 = getelementptr inbounds %"struct.std::_Bit_iterator_base", %"struct.std::_Bit_iterator_base"* %8, i32 0, i32 0 + %9 = load i64*, i64** %_M_p2, align 8 + %10 = bitcast %"struct.std::_Bit_iterator"* %__result to %"struct.std::_Bit_iterator_base"* + %_M_p3 = getelementptr inbounds %"struct.std::_Bit_iterator_base", %"struct.std::_Bit_iterator_base"* %10, i32 0, i32 0 + %11 = load i64*, i64** %_M_p3, align 8 + %call = call i64* @_ZSt4copyIPmS0_ET0_T_S2_S1_(i64* %7, i64* %9, i64* %11) + store i64* %call, i64** %__q, align 8 + %12 = bitcast %"struct.std::_Bit_const_iterator"* %__last to %"struct.std::_Bit_iterator_base"* + %_M_p4 = getelementptr inbounds %"struct.std::_Bit_iterator_base", %"struct.std::_Bit_iterator_base"* %12, i32 0, i32 0 + %13 = load i64*, i64** %_M_p4, align 8 + call void @_ZNSt19_Bit_const_iteratorC2EPmj(%"struct.std::_Bit_const_iterator"* %agg.tmp, i64* %13, 
i32 0) + %14 = bitcast %"struct.std::_Bit_const_iterator"* %agg.tmp5 to i8* + %15 = bitcast %"struct.std::_Bit_const_iterator"* %__last to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %14, i8* align 8 %15, i64 16, i1 false) + %16 = load i64*, i64** %__q, align 8 + call void @_ZNSt13_Bit_iteratorC2EPmj(%"struct.std::_Bit_iterator"* %agg.tmp6, i64* %16, i32 0) + %17 = bitcast %"struct.std::_Bit_const_iterator"* %agg.tmp to { i64*, i32 }* + %18 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %17, i32 0, i32 0 + %19 = load i64*, i64** %18, align 8 + %20 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %17, i32 0, i32 1 + %21 = load i32, i32* %20, align 8 + %22 = bitcast %"struct.std::_Bit_const_iterator"* %agg.tmp5 to { i64*, i32 }* + %23 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %22, i32 0, i32 0 + %24 = load i64*, i64** %23, align 8 + %25 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %22, i32 0, i32 1 + %26 = load i32, i32* %25, align 8 + %27 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp6 to { i64*, i32 }* + %28 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %27, i32 0, i32 0 + %29 = load i64*, i64** %28, align 8 + %30 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %27, i32 0, i32 1 + %31 = load i32, i32* %30, align 8 + %call7 = call { i64*, i32 } @_ZSt4copyISt19_Bit_const_iteratorSt13_Bit_iteratorET0_T_S3_S2_(i64* %19, i32 %21, i64* %24, i32 %26, i64* %29, i32 %31) + %32 = bitcast %"struct.std::_Bit_iterator"* %retval to { i64*, i32 }* + %33 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %32, i32 0, i32 0 + %34 = extractvalue { i64*, i32 } %call7, 0 + store i64* %34, i64** %33, align 8 + %35 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %32, i32 0, i32 1 + %36 = extractvalue { i64*, i32 } %call7, 1 + store i32 %36, i32* %35, align 8 + %37 = bitcast %"struct.std::_Bit_iterator"* %retval to { i64*, i32 }* + %38 = load { i64*, i32 }, { i64*, i32 }* %37, align 8 + ret { i64*, i32 } %38 +} + 
+; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local { i64*, i32 } @_ZNSt6vectorIbSaIbEE5beginEv(%"class.std::vector.0"* %this) #6 comdat align 2 { +entry: + %retval = alloca %"struct.std::_Bit_iterator", align 8 + %this.addr = alloca %"class.std::vector.0"*, align 8 + store %"class.std::vector.0"* %this, %"class.std::vector.0"** %this.addr, align 8 + %this1 = load %"class.std::vector.0"*, %"class.std::vector.0"** %this.addr, align 8 + %0 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Bvector_base"* + %_M_impl = getelementptr inbounds %"struct.std::_Bvector_base", %"struct.std::_Bvector_base"* %0, i32 0, i32 0 + %_M_start = getelementptr inbounds %"struct.std::_Bvector_base >::_Bvector_impl", %"struct.std::_Bvector_base >::_Bvector_impl"* %_M_impl, i32 0, i32 0 + %1 = bitcast %"struct.std::_Bit_iterator"* %retval to i8* + %2 = bitcast %"struct.std::_Bit_iterator"* %_M_start to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %1, i8* align 8 %2, i64 16, i1 false) + %3 = bitcast %"struct.std::_Bit_iterator"* %retval to { i64*, i32 }* + %4 = load { i64*, i32 }, { i64*, i32 }* %3, align 8 + ret { i64*, i32 } %4 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local i64* @_ZNKSt13_Bvector_baseISaIbEE13_Bvector_impl11_M_end_addrEv(%"struct.std::_Bvector_base >::_Bvector_impl"* %this) #0 comdat align 2 { +entry: + %retval = alloca i64*, align 8 + %this.addr = alloca %"struct.std::_Bvector_base >::_Bvector_impl"*, align 8 + store %"struct.std::_Bvector_base >::_Bvector_impl"* %this, %"struct.std::_Bvector_base >::_Bvector_impl"** %this.addr, align 8 + %this1 = load %"struct.std::_Bvector_base >::_Bvector_impl"*, %"struct.std::_Bvector_base >::_Bvector_impl"** %this.addr, align 8 + %_M_end_of_storage = getelementptr inbounds %"struct.std::_Bvector_base >::_Bvector_impl", %"struct.std::_Bvector_base >::_Bvector_impl"* %this1, i32 0, i32 2 + %0 = load i64*, i64** %_M_end_of_storage, align 8 + %tobool = 
icmp ne i64* %0, null + br i1 %tobool, label %if.then, label %if.end + +if.then: ; preds = %entry + %_M_end_of_storage2 = getelementptr inbounds %"struct.std::_Bvector_base >::_Bvector_impl", %"struct.std::_Bvector_base >::_Bvector_impl"* %this1, i32 0, i32 2 + %1 = load i64*, i64** %_M_end_of_storage2, align 8 + %arrayidx = getelementptr inbounds i64, i64* %1, i64 -1 + %call = call i64* @_ZSt11__addressofImEPT_RS0_(i64* dereferenceable(8) %arrayidx) + %add.ptr = getelementptr inbounds i64, i64* %call, i64 1 + store i64* %add.ptr, i64** %retval, align 8 + br label %return + +if.end: ; preds = %entry + store i64* null, i64** %retval, align 8 + br label %return + +return: ; preds = %if.end, %if.then + %2 = load i64*, i64** %retval, align 8 + ret i64* %2 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local i64* @_ZSt11__addressofImEPT_RS0_(i64* dereferenceable(8) %__r) #6 comdat { +entry: + %__r.addr = alloca i64*, align 8 + store i64* %__r, i64** %__r.addr, align 8 + %0 = load i64*, i64** %__r.addr, align 8 + ret i64* %0 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx14__alloc_traitsISaImEE10deallocateERS1_Pmm(%"class.std::allocator.1"* dereferenceable(1) %__a, i64* %__p, i64 %__n) #0 comdat align 2 { +entry: + %__a.addr = alloca %"class.std::allocator.1"*, align 8 + %__p.addr = alloca i64*, align 8 + %__n.addr = alloca i64, align 8 + store %"class.std::allocator.1"* %__a, %"class.std::allocator.1"** %__a.addr, align 8 + store i64* %__p, i64** %__p.addr, align 8 + store i64 %__n, i64* %__n.addr, align 8 + %0 = load %"class.std::allocator.1"*, %"class.std::allocator.1"** %__a.addr, align 8 + %1 = bitcast %"class.std::allocator.1"* %0 to %"class.__gnu_cxx::new_allocator.2"* + %2 = load i64*, i64** %__p.addr, align 8 + %3 = load i64, i64* %__n.addr, align 8 + call void @_ZN9__gnu_cxx13new_allocatorImE10deallocateEPmm(%"class.__gnu_cxx::new_allocator.2"* %1, i64* %2, i64 %3) + ret 
void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx13new_allocatorImE10deallocateEPmm(%"class.__gnu_cxx::new_allocator.2"* %this, i64* %__p, i64 %0) #6 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::new_allocator.2"*, align 8 + %__p.addr = alloca i64*, align 8 + %.addr = alloca i64, align 8 + store %"class.__gnu_cxx::new_allocator.2"* %this, %"class.__gnu_cxx::new_allocator.2"** %this.addr, align 8 + store i64* %__p, i64** %__p.addr, align 8 + store i64 %0, i64* %.addr, align 8 + %this1 = load %"class.__gnu_cxx::new_allocator.2"*, %"class.__gnu_cxx::new_allocator.2"** %this.addr, align 8 + %1 = load i64*, i64** %__p.addr, align 8 + %2 = bitcast i64* %1 to i8* + call void @_ZdlPv(i8* %2) #3 + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local i64* @_ZNSt13_Bvector_baseISaIbEE11_M_allocateEm(%"struct.std::_Bvector_base"* %this, i64 %__n) #0 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Bvector_base"*, align 8 + %__n.addr = alloca i64, align 8 + store %"struct.std::_Bvector_base"* %this, %"struct.std::_Bvector_base"** %this.addr, align 8 + store i64 %__n, i64* %__n.addr, align 8 + %this1 = load %"struct.std::_Bvector_base"*, %"struct.std::_Bvector_base"** %this.addr, align 8 + %_M_impl = getelementptr inbounds %"struct.std::_Bvector_base", %"struct.std::_Bvector_base"* %this1, i32 0, i32 0 + %0 = bitcast %"struct.std::_Bvector_base >::_Bvector_impl"* %_M_impl to %"class.std::allocator.1"* + %1 = load i64, i64* %__n.addr, align 8 + %call = call i64 @_ZNSt13_Bvector_baseISaIbEE8_S_nwordEm(i64 %1) + %call2 = call i64* @_ZN9__gnu_cxx14__alloc_traitsISaImEE8allocateERS1_m(%"class.std::allocator.1"* dereferenceable(1) %0, i64 %call) + ret i64* %call2 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local i64 @_ZNSt13_Bvector_baseISaIbEE8_S_nwordEm(i64 %__n) #6 comdat align 2 { +entry: + %__n.addr = alloca 
i64, align 8 + store i64 %__n, i64* %__n.addr, align 8 + %0 = load i64, i64* %__n.addr, align 8 + %add = add i64 %0, 64 + %sub = sub i64 %add, 1 + %div = udiv i64 %sub, 64 + ret i64 %div +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZNSt13_Bit_iteratorC2EPmj(%"struct.std::_Bit_iterator"* %this, i64* %__x, i32 %__y) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Bit_iterator"*, align 8 + %__x.addr = alloca i64*, align 8 + %__y.addr = alloca i32, align 4 + store %"struct.std::_Bit_iterator"* %this, %"struct.std::_Bit_iterator"** %this.addr, align 8 + store i64* %__x, i64** %__x.addr, align 8 + store i32 %__y, i32* %__y.addr, align 4 + %this1 = load %"struct.std::_Bit_iterator"*, %"struct.std::_Bit_iterator"** %this.addr, align 8 + %0 = bitcast %"struct.std::_Bit_iterator"* %this1 to %"struct.std::_Bit_iterator_base"* + %1 = load i64*, i64** %__x.addr, align 8 + %2 = load i32, i32* %__y.addr, align 4 + call void @_ZNSt18_Bit_iterator_baseC2EPmj(%"struct.std::_Bit_iterator_base"* %0, i64* %1, i32 %2) + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local { i64*, i32 } @_ZNKSt13_Bit_iteratorplEl(%"struct.std::_Bit_iterator"* %this, i64 %__i) #0 comdat align 2 { +entry: + %retval = alloca %"struct.std::_Bit_iterator", align 8 + %this.addr = alloca %"struct.std::_Bit_iterator"*, align 8 + %__i.addr = alloca i64, align 8 + %__tmp = alloca %"struct.std::_Bit_iterator", align 8 + store %"struct.std::_Bit_iterator"* %this, %"struct.std::_Bit_iterator"** %this.addr, align 8 + store i64 %__i, i64* %__i.addr, align 8 + %this1 = load %"struct.std::_Bit_iterator"*, %"struct.std::_Bit_iterator"** %this.addr, align 8 + %0 = bitcast %"struct.std::_Bit_iterator"* %__tmp to i8* + %1 = bitcast %"struct.std::_Bit_iterator"* %this1 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %0, i8* align 8 %1, i64 16, i1 false) + %2 = load i64, i64* %__i.addr, align 8 + %call 
= call dereferenceable(16) %"struct.std::_Bit_iterator"* @_ZNSt13_Bit_iteratorpLEl(%"struct.std::_Bit_iterator"* %__tmp, i64 %2) + %3 = bitcast %"struct.std::_Bit_iterator"* %retval to i8* + %4 = bitcast %"struct.std::_Bit_iterator"* %call to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %3, i8* align 8 %4, i64 16, i1 false) + %5 = bitcast %"struct.std::_Bit_iterator"* %retval to { i64*, i32 }* + %6 = load { i64*, i32 }, { i64*, i32 }* %5, align 8 + ret { i64*, i32 } %6 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local i64* @_ZN9__gnu_cxx14__alloc_traitsISaImEE8allocateERS1_m(%"class.std::allocator.1"* dereferenceable(1) %__a, i64 %__n) #0 comdat align 2 { +entry: + %__a.addr = alloca %"class.std::allocator.1"*, align 8 + %__n.addr = alloca i64, align 8 + store %"class.std::allocator.1"* %__a, %"class.std::allocator.1"** %__a.addr, align 8 + store i64 %__n, i64* %__n.addr, align 8 + %0 = load %"class.std::allocator.1"*, %"class.std::allocator.1"** %__a.addr, align 8 + %1 = bitcast %"class.std::allocator.1"* %0 to %"class.__gnu_cxx::new_allocator.2"* + %2 = load i64, i64* %__n.addr, align 8 + %call = call i64* @_ZN9__gnu_cxx13new_allocatorImE8allocateEmPKv(%"class.__gnu_cxx::new_allocator.2"* %1, i64 %2, i8* null) + ret i64* %call +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local i64* @_ZN9__gnu_cxx13new_allocatorImE8allocateEmPKv(%"class.__gnu_cxx::new_allocator.2"* %this, i64 %__n, i8* %0) #0 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::new_allocator.2"*, align 8 + %__n.addr = alloca i64, align 8 + %.addr = alloca i8*, align 8 + store %"class.__gnu_cxx::new_allocator.2"* %this, %"class.__gnu_cxx::new_allocator.2"** %this.addr, align 8 + store i64 %__n, i64* %__n.addr, align 8 + store i8* %0, i8** %.addr, align 8 + %this1 = load %"class.__gnu_cxx::new_allocator.2"*, %"class.__gnu_cxx::new_allocator.2"** %this.addr, align 8 + %1 = load i64, i64* %__n.addr, align 8 + %call = 
call i64 @_ZNK9__gnu_cxx13new_allocatorImE8max_sizeEv(%"class.__gnu_cxx::new_allocator.2"* %this1) #3 + %cmp = icmp ugt i64 %1, %call + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + call void @_ZSt17__throw_bad_allocv() #19 + unreachable + +if.end: ; preds = %entry + %2 = load i64, i64* %__n.addr, align 8 + %mul = mul i64 %2, 8 + %call2 = call i8* @_Znwm(i64 %mul) + %3 = bitcast i8* %call2 to i64* + ret i64* %3 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local i64 @_ZNK9__gnu_cxx13new_allocatorImE8max_sizeEv(%"class.__gnu_cxx::new_allocator.2"* %this) #6 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::new_allocator.2"*, align 8 + store %"class.__gnu_cxx::new_allocator.2"* %this, %"class.__gnu_cxx::new_allocator.2"** %this.addr, align 8 + %this1 = load %"class.__gnu_cxx::new_allocator.2"*, %"class.__gnu_cxx::new_allocator.2"** %this.addr, align 8 + ret i64 2305843009213693951 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local dereferenceable(16) %"struct.std::_Bit_iterator"* @_ZNSt13_Bit_iteratorpLEl(%"struct.std::_Bit_iterator"* %this, i64 %__i) #0 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Bit_iterator"*, align 8 + %__i.addr = alloca i64, align 8 + store %"struct.std::_Bit_iterator"* %this, %"struct.std::_Bit_iterator"** %this.addr, align 8 + store i64 %__i, i64* %__i.addr, align 8 + %this1 = load %"struct.std::_Bit_iterator"*, %"struct.std::_Bit_iterator"** %this.addr, align 8 + %0 = bitcast %"struct.std::_Bit_iterator"* %this1 to %"struct.std::_Bit_iterator_base"* + %1 = load i64, i64* %__i.addr, align 8 + call void @_ZNSt18_Bit_iterator_base7_M_incrEl(%"struct.std::_Bit_iterator_base"* %0, i64 %1) + ret %"struct.std::_Bit_iterator"* %this1 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZNSt18_Bit_iterator_base7_M_incrEl(%"struct.std::_Bit_iterator_base"* %this, i64 %__i) #6 comdat 
align 2 { +entry: + %this.addr = alloca %"struct.std::_Bit_iterator_base"*, align 8 + %__i.addr = alloca i64, align 8 + %__n = alloca i64, align 8 + store %"struct.std::_Bit_iterator_base"* %this, %"struct.std::_Bit_iterator_base"** %this.addr, align 8 + store i64 %__i, i64* %__i.addr, align 8 + %this1 = load %"struct.std::_Bit_iterator_base"*, %"struct.std::_Bit_iterator_base"** %this.addr, align 8 + %0 = load i64, i64* %__i.addr, align 8 + %_M_offset = getelementptr inbounds %"struct.std::_Bit_iterator_base", %"struct.std::_Bit_iterator_base"* %this1, i32 0, i32 1 + %1 = load i32, i32* %_M_offset, align 8 + %conv = zext i32 %1 to i64 + %add = add nsw i64 %0, %conv + store i64 %add, i64* %__n, align 8 + %2 = load i64, i64* %__n, align 8 + %div = sdiv i64 %2, 64 + %_M_p = getelementptr inbounds %"struct.std::_Bit_iterator_base", %"struct.std::_Bit_iterator_base"* %this1, i32 0, i32 0 + %3 = load i64*, i64** %_M_p, align 8 + %add.ptr = getelementptr inbounds i64, i64* %3, i64 %div + store i64* %add.ptr, i64** %_M_p, align 8 + %4 = load i64, i64* %__n, align 8 + %rem = srem i64 %4, 64 + store i64 %rem, i64* %__n, align 8 + %5 = load i64, i64* %__n, align 8 + %cmp = icmp slt i64 %5, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %6 = load i64, i64* %__n, align 8 + %add2 = add nsw i64 %6, 64 + store i64 %add2, i64* %__n, align 8 + %_M_p3 = getelementptr inbounds %"struct.std::_Bit_iterator_base", %"struct.std::_Bit_iterator_base"* %this1, i32 0, i32 0 + %7 = load i64*, i64** %_M_p3, align 8 + %incdec.ptr = getelementptr inbounds i64, i64* %7, i32 -1 + store i64* %incdec.ptr, i64** %_M_p3, align 8 + br label %if.end + +if.end: ; preds = %if.then, %entry + %8 = load i64, i64* %__n, align 8 + %conv4 = trunc i64 %8 to i32 + %_M_offset5 = getelementptr inbounds %"struct.std::_Bit_iterator_base", %"struct.std::_Bit_iterator_base"* %this1, i32 0, i32 1 + store i32 %conv4, i32* %_M_offset5, align 8 + ret void +} + +; Function Attrs: noinline 
optnone uwtable +define linkonce_odr dso_local i64* @_ZSt4copyIPmS0_ET0_T_S2_S1_(i64* %__first, i64* %__last, i64* %__result) #0 comdat { +entry: + %__first.addr = alloca i64*, align 8 + %__last.addr = alloca i64*, align 8 + %__result.addr = alloca i64*, align 8 + store i64* %__first, i64** %__first.addr, align 8 + store i64* %__last, i64** %__last.addr, align 8 + store i64* %__result, i64** %__result.addr, align 8 + %0 = load i64*, i64** %__first.addr, align 8 + %call = call i64* @_ZSt12__miter_baseIPmET_S1_(i64* %0) + %1 = load i64*, i64** %__last.addr, align 8 + %call1 = call i64* @_ZSt12__miter_baseIPmET_S1_(i64* %1) + %2 = load i64*, i64** %__result.addr, align 8 + %call2 = call i64* @_ZSt14__copy_move_a2ILb0EPmS0_ET1_T0_S2_S1_(i64* %call, i64* %call1, i64* %2) + ret i64* %call2 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local { i64*, i32 } @_ZSt4copyISt19_Bit_const_iteratorSt13_Bit_iteratorET0_T_S3_S2_(i64* %__first.coerce0, i32 %__first.coerce1, i64* %__last.coerce0, i32 %__last.coerce1, i64* %__result.coerce0, i32 %__result.coerce1) #0 comdat { +entry: + %retval = alloca %"struct.std::_Bit_iterator", align 8 + %__first = alloca %"struct.std::_Bit_const_iterator", align 8 + %__last = alloca %"struct.std::_Bit_const_iterator", align 8 + %__result = alloca %"struct.std::_Bit_iterator", align 8 + %agg.tmp = alloca %"struct.std::_Bit_const_iterator", align 8 + %agg.tmp1 = alloca %"struct.std::_Bit_const_iterator", align 8 + %agg.tmp2 = alloca %"struct.std::_Bit_const_iterator", align 8 + %agg.tmp3 = alloca %"struct.std::_Bit_const_iterator", align 8 + %agg.tmp5 = alloca %"struct.std::_Bit_iterator", align 8 + %0 = bitcast %"struct.std::_Bit_const_iterator"* %__first to { i64*, i32 }* + %1 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %0, i32 0, i32 0 + store i64* %__first.coerce0, i64** %1, align 8 + %2 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %0, i32 0, i32 1 + store i32 %__first.coerce1, i32* %2, align 8 + 
%3 = bitcast %"struct.std::_Bit_const_iterator"* %__last to { i64*, i32 }* + %4 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %3, i32 0, i32 0 + store i64* %__last.coerce0, i64** %4, align 8 + %5 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %3, i32 0, i32 1 + store i32 %__last.coerce1, i32* %5, align 8 + %6 = bitcast %"struct.std::_Bit_iterator"* %__result to { i64*, i32 }* + %7 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %6, i32 0, i32 0 + store i64* %__result.coerce0, i64** %7, align 8 + %8 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %6, i32 0, i32 1 + store i32 %__result.coerce1, i32* %8, align 8 + %9 = bitcast %"struct.std::_Bit_const_iterator"* %agg.tmp1 to i8* + %10 = bitcast %"struct.std::_Bit_const_iterator"* %__first to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %9, i8* align 8 %10, i64 16, i1 false) + %11 = bitcast %"struct.std::_Bit_const_iterator"* %agg.tmp1 to { i64*, i32 }* + %12 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %11, i32 0, i32 0 + %13 = load i64*, i64** %12, align 8 + %14 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %11, i32 0, i32 1 + %15 = load i32, i32* %14, align 8 + %call = call { i64*, i32 } @_ZSt12__miter_baseISt19_Bit_const_iteratorET_S1_(i64* %13, i32 %15) + %16 = bitcast %"struct.std::_Bit_const_iterator"* %agg.tmp to { i64*, i32 }* + %17 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %16, i32 0, i32 0 + %18 = extractvalue { i64*, i32 } %call, 0 + store i64* %18, i64** %17, align 8 + %19 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %16, i32 0, i32 1 + %20 = extractvalue { i64*, i32 } %call, 1 + store i32 %20, i32* %19, align 8 + %21 = bitcast %"struct.std::_Bit_const_iterator"* %agg.tmp3 to i8* + %22 = bitcast %"struct.std::_Bit_const_iterator"* %__last to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %21, i8* align 8 %22, i64 16, i1 false) + %23 = bitcast %"struct.std::_Bit_const_iterator"* %agg.tmp3 to { i64*, i32 }* + %24 = 
getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %23, i32 0, i32 0 + %25 = load i64*, i64** %24, align 8 + %26 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %23, i32 0, i32 1 + %27 = load i32, i32* %26, align 8 + %call4 = call { i64*, i32 } @_ZSt12__miter_baseISt19_Bit_const_iteratorET_S1_(i64* %25, i32 %27) + %28 = bitcast %"struct.std::_Bit_const_iterator"* %agg.tmp2 to { i64*, i32 }* + %29 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %28, i32 0, i32 0 + %30 = extractvalue { i64*, i32 } %call4, 0 + store i64* %30, i64** %29, align 8 + %31 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %28, i32 0, i32 1 + %32 = extractvalue { i64*, i32 } %call4, 1 + store i32 %32, i32* %31, align 8 + %33 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp5 to i8* + %34 = bitcast %"struct.std::_Bit_iterator"* %__result to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %33, i8* align 8 %34, i64 16, i1 false) + %35 = bitcast %"struct.std::_Bit_const_iterator"* %agg.tmp to { i64*, i32 }* + %36 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %35, i32 0, i32 0 + %37 = load i64*, i64** %36, align 8 + %38 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %35, i32 0, i32 1 + %39 = load i32, i32* %38, align 8 + %40 = bitcast %"struct.std::_Bit_const_iterator"* %agg.tmp2 to { i64*, i32 }* + %41 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %40, i32 0, i32 0 + %42 = load i64*, i64** %41, align 8 + %43 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %40, i32 0, i32 1 + %44 = load i32, i32* %43, align 8 + %45 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp5 to { i64*, i32 }* + %46 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %45, i32 0, i32 0 + %47 = load i64*, i64** %46, align 8 + %48 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %45, i32 0, i32 1 + %49 = load i32, i32* %48, align 8 + %call6 = call { i64*, i32 } @_ZSt14__copy_move_a2ILb0ESt19_Bit_const_iteratorSt13_Bit_iteratorET1_T0_S3_S2_(i64* %37, i32 %39, 
i64* %42, i32 %44, i64* %47, i32 %49) + %50 = bitcast %"struct.std::_Bit_iterator"* %retval to { i64*, i32 }* + %51 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %50, i32 0, i32 0 + %52 = extractvalue { i64*, i32 } %call6, 0 + store i64* %52, i64** %51, align 8 + %53 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %50, i32 0, i32 1 + %54 = extractvalue { i64*, i32 } %call6, 1 + store i32 %54, i32* %53, align 8 + %55 = bitcast %"struct.std::_Bit_iterator"* %retval to { i64*, i32 }* + %56 = load { i64*, i32 }, { i64*, i32 }* %55, align 8 + ret { i64*, i32 } %56 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local i64* @_ZSt14__copy_move_a2ILb0EPmS0_ET1_T0_S2_S1_(i64* %__first, i64* %__last, i64* %__result) #0 comdat { +entry: + %__first.addr = alloca i64*, align 8 + %__last.addr = alloca i64*, align 8 + %__result.addr = alloca i64*, align 8 + store i64* %__first, i64** %__first.addr, align 8 + store i64* %__last, i64** %__last.addr, align 8 + store i64* %__result, i64** %__result.addr, align 8 + %0 = load i64*, i64** %__first.addr, align 8 + %call = call i64* @_ZSt12__niter_baseIPmET_S1_(i64* %0) + %1 = load i64*, i64** %__last.addr, align 8 + %call1 = call i64* @_ZSt12__niter_baseIPmET_S1_(i64* %1) + %2 = load i64*, i64** %__result.addr, align 8 + %call2 = call i64* @_ZSt12__niter_baseIPmET_S1_(i64* %2) + %call3 = call i64* @_ZSt13__copy_move_aILb0EPmS0_ET1_T0_S2_S1_(i64* %call, i64* %call1, i64* %call2) + ret i64* %call3 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local i64* @_ZSt12__miter_baseIPmET_S1_(i64* %__it) #6 comdat { +entry: + %__it.addr = alloca i64*, align 8 + store i64* %__it, i64** %__it.addr, align 8 + %0 = load i64*, i64** %__it.addr, align 8 + ret i64* %0 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local i64* @_ZSt13__copy_move_aILb0EPmS0_ET1_T0_S2_S1_(i64* %__first, i64* %__last, i64* %__result) #0 comdat { +entry: + %__first.addr = 
alloca i64*, align 8 + %__last.addr = alloca i64*, align 8 + %__result.addr = alloca i64*, align 8 + %__simple = alloca i8, align 1 + store i64* %__first, i64** %__first.addr, align 8 + store i64* %__last, i64** %__last.addr, align 8 + store i64* %__result, i64** %__result.addr, align 8 + store i8 1, i8* %__simple, align 1 + %0 = load i64*, i64** %__first.addr, align 8 + %1 = load i64*, i64** %__last.addr, align 8 + %2 = load i64*, i64** %__result.addr, align 8 + %call = call i64* @_ZNSt11__copy_moveILb0ELb1ESt26random_access_iterator_tagE8__copy_mImEEPT_PKS3_S6_S4_(i64* %0, i64* %1, i64* %2) + ret i64* %call +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local i64* @_ZSt12__niter_baseIPmET_S1_(i64* %__it) #6 comdat { +entry: + %__it.addr = alloca i64*, align 8 + store i64* %__it, i64** %__it.addr, align 8 + %0 = load i64*, i64** %__it.addr, align 8 + ret i64* %0 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local i64* @_ZNSt11__copy_moveILb0ELb1ESt26random_access_iterator_tagE8__copy_mImEEPT_PKS3_S6_S4_(i64* %__first, i64* %__last, i64* %__result) #6 comdat align 2 { +entry: + %__first.addr = alloca i64*, align 8 + %__last.addr = alloca i64*, align 8 + %__result.addr = alloca i64*, align 8 + %_Num = alloca i64, align 8 + store i64* %__first, i64** %__first.addr, align 8 + store i64* %__last, i64** %__last.addr, align 8 + store i64* %__result, i64** %__result.addr, align 8 + %0 = load i64*, i64** %__last.addr, align 8 + %1 = load i64*, i64** %__first.addr, align 8 + %sub.ptr.lhs.cast = ptrtoint i64* %0 to i64 + %sub.ptr.rhs.cast = ptrtoint i64* %1 to i64 + %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast + %sub.ptr.div = sdiv exact i64 %sub.ptr.sub, 8 + store i64 %sub.ptr.div, i64* %_Num, align 8 + %2 = load i64, i64* %_Num, align 8 + %tobool = icmp ne i64 %2, 0 + br i1 %tobool, label %if.then, label %if.end + +if.then: ; preds = %entry + %3 = load i64*, i64** %__result.addr, align 
8 + %4 = bitcast i64* %3 to i8* + %5 = load i64*, i64** %__first.addr, align 8 + %6 = bitcast i64* %5 to i8* + %7 = load i64, i64* %_Num, align 8 + %mul = mul i64 8, %7 + call void @llvm.memmove.p0i8.p0i8.i64(i8* align 8 %4, i8* align 8 %6, i64 %mul, i1 false) + br label %if.end + +if.end: ; preds = %if.then, %entry + %8 = load i64*, i64** %__result.addr, align 8 + %9 = load i64, i64* %_Num, align 8 + %add.ptr = getelementptr inbounds i64, i64* %8, i64 %9 + ret i64* %add.ptr +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local { i64*, i32 } @_ZSt14__copy_move_a2ILb0ESt19_Bit_const_iteratorSt13_Bit_iteratorET1_T0_S3_S2_(i64* %__first.coerce0, i32 %__first.coerce1, i64* %__last.coerce0, i32 %__last.coerce1, i64* %__result.coerce0, i32 %__result.coerce1) #0 comdat { +entry: + %retval = alloca %"struct.std::_Bit_iterator", align 8 + %__first = alloca %"struct.std::_Bit_const_iterator", align 8 + %__last = alloca %"struct.std::_Bit_const_iterator", align 8 + %__result = alloca %"struct.std::_Bit_iterator", align 8 + %agg.tmp = alloca %"struct.std::_Bit_const_iterator", align 8 + %agg.tmp1 = alloca %"struct.std::_Bit_const_iterator", align 8 + %agg.tmp2 = alloca %"struct.std::_Bit_const_iterator", align 8 + %agg.tmp3 = alloca %"struct.std::_Bit_const_iterator", align 8 + %agg.tmp5 = alloca %"struct.std::_Bit_iterator", align 8 + %agg.tmp6 = alloca %"struct.std::_Bit_iterator", align 8 + %0 = bitcast %"struct.std::_Bit_const_iterator"* %__first to { i64*, i32 }* + %1 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %0, i32 0, i32 0 + store i64* %__first.coerce0, i64** %1, align 8 + %2 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %0, i32 0, i32 1 + store i32 %__first.coerce1, i32* %2, align 8 + %3 = bitcast %"struct.std::_Bit_const_iterator"* %__last to { i64*, i32 }* + %4 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %3, i32 0, i32 0 + store i64* %__last.coerce0, i64** %4, align 8 + %5 = getelementptr inbounds { i64*, 
i32 }, { i64*, i32 }* %3, i32 0, i32 1 + store i32 %__last.coerce1, i32* %5, align 8 + %6 = bitcast %"struct.std::_Bit_iterator"* %__result to { i64*, i32 }* + %7 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %6, i32 0, i32 0 + store i64* %__result.coerce0, i64** %7, align 8 + %8 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %6, i32 0, i32 1 + store i32 %__result.coerce1, i32* %8, align 8 + %9 = bitcast %"struct.std::_Bit_const_iterator"* %agg.tmp1 to i8* + %10 = bitcast %"struct.std::_Bit_const_iterator"* %__first to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %9, i8* align 8 %10, i64 16, i1 false) + %11 = bitcast %"struct.std::_Bit_const_iterator"* %agg.tmp1 to { i64*, i32 }* + %12 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %11, i32 0, i32 0 + %13 = load i64*, i64** %12, align 8 + %14 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %11, i32 0, i32 1 + %15 = load i32, i32* %14, align 8 + %call = call { i64*, i32 } @_ZSt12__niter_baseISt19_Bit_const_iteratorET_S1_(i64* %13, i32 %15) + %16 = bitcast %"struct.std::_Bit_const_iterator"* %agg.tmp to { i64*, i32 }* + %17 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %16, i32 0, i32 0 + %18 = extractvalue { i64*, i32 } %call, 0 + store i64* %18, i64** %17, align 8 + %19 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %16, i32 0, i32 1 + %20 = extractvalue { i64*, i32 } %call, 1 + store i32 %20, i32* %19, align 8 + %21 = bitcast %"struct.std::_Bit_const_iterator"* %agg.tmp3 to i8* + %22 = bitcast %"struct.std::_Bit_const_iterator"* %__last to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %21, i8* align 8 %22, i64 16, i1 false) + %23 = bitcast %"struct.std::_Bit_const_iterator"* %agg.tmp3 to { i64*, i32 }* + %24 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %23, i32 0, i32 0 + %25 = load i64*, i64** %24, align 8 + %26 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %23, i32 0, i32 1 + %27 = load i32, i32* %26, align 8 + %call4 = call { 
i64*, i32 } @_ZSt12__niter_baseISt19_Bit_const_iteratorET_S1_(i64* %25, i32 %27) + %28 = bitcast %"struct.std::_Bit_const_iterator"* %agg.tmp2 to { i64*, i32 }* + %29 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %28, i32 0, i32 0 + %30 = extractvalue { i64*, i32 } %call4, 0 + store i64* %30, i64** %29, align 8 + %31 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %28, i32 0, i32 1 + %32 = extractvalue { i64*, i32 } %call4, 1 + store i32 %32, i32* %31, align 8 + %33 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp6 to i8* + %34 = bitcast %"struct.std::_Bit_iterator"* %__result to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %33, i8* align 8 %34, i64 16, i1 false) + %35 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp6 to { i64*, i32 }* + %36 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %35, i32 0, i32 0 + %37 = load i64*, i64** %36, align 8 + %38 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %35, i32 0, i32 1 + %39 = load i32, i32* %38, align 8 + %call7 = call { i64*, i32 } @_ZSt12__niter_baseISt13_Bit_iteratorET_S1_(i64* %37, i32 %39) + %40 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp5 to { i64*, i32 }* + %41 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %40, i32 0, i32 0 + %42 = extractvalue { i64*, i32 } %call7, 0 + store i64* %42, i64** %41, align 8 + %43 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %40, i32 0, i32 1 + %44 = extractvalue { i64*, i32 } %call7, 1 + store i32 %44, i32* %43, align 8 + %45 = bitcast %"struct.std::_Bit_const_iterator"* %agg.tmp to { i64*, i32 }* + %46 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %45, i32 0, i32 0 + %47 = load i64*, i64** %46, align 8 + %48 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %45, i32 0, i32 1 + %49 = load i32, i32* %48, align 8 + %50 = bitcast %"struct.std::_Bit_const_iterator"* %agg.tmp2 to { i64*, i32 }* + %51 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %50, i32 0, i32 0 + %52 = load i64*, i64** %51, align 8 
+ %53 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %50, i32 0, i32 1 + %54 = load i32, i32* %53, align 8 + %55 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp5 to { i64*, i32 }* + %56 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %55, i32 0, i32 0 + %57 = load i64*, i64** %56, align 8 + %58 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %55, i32 0, i32 1 + %59 = load i32, i32* %58, align 8 + %call8 = call { i64*, i32 } @_ZSt13__copy_move_aILb0ESt19_Bit_const_iteratorSt13_Bit_iteratorET1_T0_S3_S2_(i64* %47, i32 %49, i64* %52, i32 %54, i64* %57, i32 %59) + %60 = bitcast %"struct.std::_Bit_iterator"* %retval to { i64*, i32 }* + %61 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %60, i32 0, i32 0 + %62 = extractvalue { i64*, i32 } %call8, 0 + store i64* %62, i64** %61, align 8 + %63 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %60, i32 0, i32 1 + %64 = extractvalue { i64*, i32 } %call8, 1 + store i32 %64, i32* %63, align 8 + %65 = bitcast %"struct.std::_Bit_iterator"* %retval to { i64*, i32 }* + %66 = load { i64*, i32 }, { i64*, i32 }* %65, align 8 + ret { i64*, i32 } %66 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local { i64*, i32 } @_ZSt12__miter_baseISt19_Bit_const_iteratorET_S1_(i64* %__it.coerce0, i32 %__it.coerce1) #6 comdat { +entry: + %retval = alloca %"struct.std::_Bit_const_iterator", align 8 + %__it = alloca %"struct.std::_Bit_const_iterator", align 8 + %0 = bitcast %"struct.std::_Bit_const_iterator"* %__it to { i64*, i32 }* + %1 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %0, i32 0, i32 0 + store i64* %__it.coerce0, i64** %1, align 8 + %2 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %0, i32 0, i32 1 + store i32 %__it.coerce1, i32* %2, align 8 + %3 = bitcast %"struct.std::_Bit_const_iterator"* %retval to i8* + %4 = bitcast %"struct.std::_Bit_const_iterator"* %__it to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %3, i8* align 8 %4, i64 16, i1 
false) + %5 = bitcast %"struct.std::_Bit_const_iterator"* %retval to { i64*, i32 }* + %6 = load { i64*, i32 }, { i64*, i32 }* %5, align 8 + ret { i64*, i32 } %6 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local { i64*, i32 } @_ZSt13__copy_move_aILb0ESt19_Bit_const_iteratorSt13_Bit_iteratorET1_T0_S3_S2_(i64* %__first.coerce0, i32 %__first.coerce1, i64* %__last.coerce0, i32 %__last.coerce1, i64* %__result.coerce0, i32 %__result.coerce1) #0 comdat { +entry: + %retval = alloca %"struct.std::_Bit_iterator", align 8 + %__first = alloca %"struct.std::_Bit_const_iterator", align 8 + %__last = alloca %"struct.std::_Bit_const_iterator", align 8 + %__result = alloca %"struct.std::_Bit_iterator", align 8 + %__simple = alloca i8, align 1 + %agg.tmp = alloca %"struct.std::_Bit_const_iterator", align 8 + %agg.tmp1 = alloca %"struct.std::_Bit_const_iterator", align 8 + %agg.tmp2 = alloca %"struct.std::_Bit_iterator", align 8 + %0 = bitcast %"struct.std::_Bit_const_iterator"* %__first to { i64*, i32 }* + %1 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %0, i32 0, i32 0 + store i64* %__first.coerce0, i64** %1, align 8 + %2 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %0, i32 0, i32 1 + store i32 %__first.coerce1, i32* %2, align 8 + %3 = bitcast %"struct.std::_Bit_const_iterator"* %__last to { i64*, i32 }* + %4 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %3, i32 0, i32 0 + store i64* %__last.coerce0, i64** %4, align 8 + %5 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %3, i32 0, i32 1 + store i32 %__last.coerce1, i32* %5, align 8 + %6 = bitcast %"struct.std::_Bit_iterator"* %__result to { i64*, i32 }* + %7 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %6, i32 0, i32 0 + store i64* %__result.coerce0, i64** %7, align 8 + %8 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %6, i32 0, i32 1 + store i32 %__result.coerce1, i32* %8, align 8 + store i8 0, i8* %__simple, align 1 + %9 = bitcast 
%"struct.std::_Bit_const_iterator"* %agg.tmp to i8* + %10 = bitcast %"struct.std::_Bit_const_iterator"* %__first to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %9, i8* align 8 %10, i64 16, i1 false) + %11 = bitcast %"struct.std::_Bit_const_iterator"* %agg.tmp1 to i8* + %12 = bitcast %"struct.std::_Bit_const_iterator"* %__last to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %11, i8* align 8 %12, i64 16, i1 false) + %13 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp2 to i8* + %14 = bitcast %"struct.std::_Bit_iterator"* %__result to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %13, i8* align 8 %14, i64 16, i1 false) + %15 = bitcast %"struct.std::_Bit_const_iterator"* %agg.tmp to { i64*, i32 }* + %16 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %15, i32 0, i32 0 + %17 = load i64*, i64** %16, align 8 + %18 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %15, i32 0, i32 1 + %19 = load i32, i32* %18, align 8 + %20 = bitcast %"struct.std::_Bit_const_iterator"* %agg.tmp1 to { i64*, i32 }* + %21 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %20, i32 0, i32 0 + %22 = load i64*, i64** %21, align 8 + %23 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %20, i32 0, i32 1 + %24 = load i32, i32* %23, align 8 + %25 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp2 to { i64*, i32 }* + %26 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %25, i32 0, i32 0 + %27 = load i64*, i64** %26, align 8 + %28 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %25, i32 0, i32 1 + %29 = load i32, i32* %28, align 8 + %call = call { i64*, i32 } @_ZNSt11__copy_moveILb0ELb0ESt26random_access_iterator_tagE8__copy_mISt19_Bit_const_iteratorSt13_Bit_iteratorEET0_T_S6_S5_(i64* %17, i32 %19, i64* %22, i32 %24, i64* %27, i32 %29) + %30 = bitcast %"struct.std::_Bit_iterator"* %retval to { i64*, i32 }* + %31 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %30, i32 0, i32 0 + %32 = extractvalue { i64*, i32 } %call, 0 + store i64* 
%32, i64** %31, align 8 + %33 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %30, i32 0, i32 1 + %34 = extractvalue { i64*, i32 } %call, 1 + store i32 %34, i32* %33, align 8 + %35 = bitcast %"struct.std::_Bit_iterator"* %retval to { i64*, i32 }* + %36 = load { i64*, i32 }, { i64*, i32 }* %35, align 8 + ret { i64*, i32 } %36 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local { i64*, i32 } @_ZSt12__niter_baseISt19_Bit_const_iteratorET_S1_(i64* %__it.coerce0, i32 %__it.coerce1) #6 comdat { +entry: + %retval = alloca %"struct.std::_Bit_const_iterator", align 8 + %__it = alloca %"struct.std::_Bit_const_iterator", align 8 + %0 = bitcast %"struct.std::_Bit_const_iterator"* %__it to { i64*, i32 }* + %1 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %0, i32 0, i32 0 + store i64* %__it.coerce0, i64** %1, align 8 + %2 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %0, i32 0, i32 1 + store i32 %__it.coerce1, i32* %2, align 8 + %3 = bitcast %"struct.std::_Bit_const_iterator"* %retval to i8* + %4 = bitcast %"struct.std::_Bit_const_iterator"* %__it to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %3, i8* align 8 %4, i64 16, i1 false) + %5 = bitcast %"struct.std::_Bit_const_iterator"* %retval to { i64*, i32 }* + %6 = load { i64*, i32 }, { i64*, i32 }* %5, align 8 + ret { i64*, i32 } %6 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local { i64*, i32 } @_ZSt12__niter_baseISt13_Bit_iteratorET_S1_(i64* %__it.coerce0, i32 %__it.coerce1) #6 comdat { +entry: + %retval = alloca %"struct.std::_Bit_iterator", align 8 + %__it = alloca %"struct.std::_Bit_iterator", align 8 + %0 = bitcast %"struct.std::_Bit_iterator"* %__it to { i64*, i32 }* + %1 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %0, i32 0, i32 0 + store i64* %__it.coerce0, i64** %1, align 8 + %2 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %0, i32 0, i32 1 + store i32 %__it.coerce1, i32* %2, align 8 + %3 = 
bitcast %"struct.std::_Bit_iterator"* %retval to i8* + %4 = bitcast %"struct.std::_Bit_iterator"* %__it to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %3, i8* align 8 %4, i64 16, i1 false) + %5 = bitcast %"struct.std::_Bit_iterator"* %retval to { i64*, i32 }* + %6 = load { i64*, i32 }, { i64*, i32 }* %5, align 8 + ret { i64*, i32 } %6 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local { i64*, i32 } @_ZNSt11__copy_moveILb0ELb0ESt26random_access_iterator_tagE8__copy_mISt19_Bit_const_iteratorSt13_Bit_iteratorEET0_T_S6_S5_(i64* %__first.coerce0, i32 %__first.coerce1, i64* %__last.coerce0, i32 %__last.coerce1, i64* %__result.coerce0, i32 %__result.coerce1) #0 comdat align 2 { +entry: + %retval = alloca %"struct.std::_Bit_iterator", align 8 + %__first = alloca %"struct.std::_Bit_const_iterator", align 8 + %__last = alloca %"struct.std::_Bit_const_iterator", align 8 + %__result = alloca %"struct.std::_Bit_iterator", align 8 + %__n = alloca i64, align 8 + %ref.tmp = alloca %"struct.std::_Bit_reference", align 8 + %0 = bitcast %"struct.std::_Bit_const_iterator"* %__first to { i64*, i32 }* + %1 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %0, i32 0, i32 0 + store i64* %__first.coerce0, i64** %1, align 8 + %2 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %0, i32 0, i32 1 + store i32 %__first.coerce1, i32* %2, align 8 + %3 = bitcast %"struct.std::_Bit_const_iterator"* %__last to { i64*, i32 }* + %4 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %3, i32 0, i32 0 + store i64* %__last.coerce0, i64** %4, align 8 + %5 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %3, i32 0, i32 1 + store i32 %__last.coerce1, i32* %5, align 8 + %6 = bitcast %"struct.std::_Bit_iterator"* %__result to { i64*, i32 }* + %7 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %6, i32 0, i32 0 + store i64* %__result.coerce0, i64** %7, align 8 + %8 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %6, i32 0, i32 1 + store 
i32 %__result.coerce1, i32* %8, align 8 + %9 = bitcast %"struct.std::_Bit_const_iterator"* %__last to %"struct.std::_Bit_iterator_base"* + %10 = bitcast %"struct.std::_Bit_const_iterator"* %__first to %"struct.std::_Bit_iterator_base"* + %call = call i64 @_ZStmiRKSt18_Bit_iterator_baseS1_(%"struct.std::_Bit_iterator_base"* dereferenceable(16) %9, %"struct.std::_Bit_iterator_base"* dereferenceable(16) %10) + store i64 %call, i64* %__n, align 8 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %11 = load i64, i64* %__n, align 8 + %cmp = icmp sgt i64 %11, 0 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %call1 = call zeroext i1 @_ZNKSt19_Bit_const_iteratordeEv(%"struct.std::_Bit_const_iterator"* %__first) + %call2 = call { i64*, i64 } @_ZNKSt13_Bit_iteratordeEv(%"struct.std::_Bit_iterator"* %__result) + %12 = bitcast %"struct.std::_Bit_reference"* %ref.tmp to { i64*, i64 }* + %13 = getelementptr inbounds { i64*, i64 }, { i64*, i64 }* %12, i32 0, i32 0 + %14 = extractvalue { i64*, i64 } %call2, 0 + store i64* %14, i64** %13, align 8 + %15 = getelementptr inbounds { i64*, i64 }, { i64*, i64 }* %12, i32 0, i32 1 + %16 = extractvalue { i64*, i64 } %call2, 1 + store i64 %16, i64* %15, align 8 + %call3 = call dereferenceable(16) %"struct.std::_Bit_reference"* @_ZNSt14_Bit_referenceaSEb(%"struct.std::_Bit_reference"* %ref.tmp, i1 zeroext %call1) + %call4 = call dereferenceable(16) %"struct.std::_Bit_const_iterator"* @_ZNSt19_Bit_const_iteratorppEv(%"struct.std::_Bit_const_iterator"* %__first) + %call5 = call dereferenceable(16) %"struct.std::_Bit_iterator"* @_ZNSt13_Bit_iteratorppEv(%"struct.std::_Bit_iterator"* %__result) + br label %for.inc + +for.inc: ; preds = %for.body + %17 = load i64, i64* %__n, align 8 + %dec = add nsw i64 %17, -1 + store i64 %dec, i64* %__n, align 8 + br label %for.cond + +for.end: ; preds = %for.cond + %18 = bitcast %"struct.std::_Bit_iterator"* %retval to i8* + %19 = bitcast 
%"struct.std::_Bit_iterator"* %__result to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %18, i8* align 8 %19, i64 16, i1 false) + %20 = bitcast %"struct.std::_Bit_iterator"* %retval to { i64*, i32 }* + %21 = load { i64*, i32 }, { i64*, i32 }* %20, align 8 + ret { i64*, i32 } %21 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local { i64*, i64 } @_ZNKSt13_Bit_iteratordeEv(%"struct.std::_Bit_iterator"* %this) #0 comdat align 2 { +entry: + %retval = alloca %"struct.std::_Bit_reference", align 8 + %this.addr = alloca %"struct.std::_Bit_iterator"*, align 8 + store %"struct.std::_Bit_iterator"* %this, %"struct.std::_Bit_iterator"** %this.addr, align 8 + %this1 = load %"struct.std::_Bit_iterator"*, %"struct.std::_Bit_iterator"** %this.addr, align 8 + %0 = bitcast %"struct.std::_Bit_iterator"* %this1 to %"struct.std::_Bit_iterator_base"* + %_M_p = getelementptr inbounds %"struct.std::_Bit_iterator_base", %"struct.std::_Bit_iterator_base"* %0, i32 0, i32 0 + %1 = load i64*, i64** %_M_p, align 8 + %2 = bitcast %"struct.std::_Bit_iterator"* %this1 to %"struct.std::_Bit_iterator_base"* + %_M_offset = getelementptr inbounds %"struct.std::_Bit_iterator_base", %"struct.std::_Bit_iterator_base"* %2, i32 0, i32 1 + %3 = load i32, i32* %_M_offset, align 8 + %sh_prom = zext i32 %3 to i64 + %shl = shl i64 1, %sh_prom + call void @_ZNSt14_Bit_referenceC2EPmm(%"struct.std::_Bit_reference"* %retval, i64* %1, i64 %shl) + %4 = bitcast %"struct.std::_Bit_reference"* %retval to { i64*, i64 }* + %5 = load { i64*, i64 }, { i64*, i64 }* %4, align 8 + ret { i64*, i64 } %5 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local dereferenceable(16) %"struct.std::_Bit_reference"* @_ZNSt14_Bit_referenceaSEb(%"struct.std::_Bit_reference"* %this, i1 zeroext %__x) #6 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Bit_reference"*, align 8 + %__x.addr = alloca i8, align 1 + store %"struct.std::_Bit_reference"* %this, 
%"struct.std::_Bit_reference"** %this.addr, align 8 + %frombool = zext i1 %__x to i8 + store i8 %frombool, i8* %__x.addr, align 1 + %this1 = load %"struct.std::_Bit_reference"*, %"struct.std::_Bit_reference"** %this.addr, align 8 + %0 = load i8, i8* %__x.addr, align 1 + %tobool = trunc i8 %0 to i1 + br i1 %tobool, label %if.then, label %if.else + +if.then: ; preds = %entry + %_M_mask = getelementptr inbounds %"struct.std::_Bit_reference", %"struct.std::_Bit_reference"* %this1, i32 0, i32 1 + %1 = load i64, i64* %_M_mask, align 8 + %_M_p = getelementptr inbounds %"struct.std::_Bit_reference", %"struct.std::_Bit_reference"* %this1, i32 0, i32 0 + %2 = load i64*, i64** %_M_p, align 8 + %3 = load i64, i64* %2, align 8 + %or = or i64 %3, %1 + store i64 %or, i64* %2, align 8 + br label %if.end + +if.else: ; preds = %entry + %_M_mask2 = getelementptr inbounds %"struct.std::_Bit_reference", %"struct.std::_Bit_reference"* %this1, i32 0, i32 1 + %4 = load i64, i64* %_M_mask2, align 8 + %neg = xor i64 %4, -1 + %_M_p3 = getelementptr inbounds %"struct.std::_Bit_reference", %"struct.std::_Bit_reference"* %this1, i32 0, i32 0 + %5 = load i64*, i64** %_M_p3, align 8 + %6 = load i64, i64* %5, align 8 + %and = and i64 %6, %neg + store i64 %and, i64* %5, align 8 + br label %if.end + +if.end: ; preds = %if.else, %if.then + ret %"struct.std::_Bit_reference"* %this1 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local dereferenceable(16) %"struct.std::_Bit_const_iterator"* @_ZNSt19_Bit_const_iteratorppEv(%"struct.std::_Bit_const_iterator"* %this) #0 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Bit_const_iterator"*, align 8 + store %"struct.std::_Bit_const_iterator"* %this, %"struct.std::_Bit_const_iterator"** %this.addr, align 8 + %this1 = load %"struct.std::_Bit_const_iterator"*, %"struct.std::_Bit_const_iterator"** %this.addr, align 8 + %0 = bitcast %"struct.std::_Bit_const_iterator"* %this1 to %"struct.std::_Bit_iterator_base"* + call void 
@_ZNSt18_Bit_iterator_base10_M_bump_upEv(%"struct.std::_Bit_iterator_base"* %0) + ret %"struct.std::_Bit_const_iterator"* %this1 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local dereferenceable(16) %"struct.std::_Bit_iterator"* @_ZNSt13_Bit_iteratorppEv(%"struct.std::_Bit_iterator"* %this) #6 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Bit_iterator"*, align 8 + store %"struct.std::_Bit_iterator"* %this, %"struct.std::_Bit_iterator"** %this.addr, align 8 + %this1 = load %"struct.std::_Bit_iterator"*, %"struct.std::_Bit_iterator"** %this.addr, align 8 + %0 = bitcast %"struct.std::_Bit_iterator"* %this1 to %"struct.std::_Bit_iterator_base"* + call void @_ZNSt18_Bit_iterator_base10_M_bump_upEv(%"struct.std::_Bit_iterator_base"* %0) + ret %"struct.std::_Bit_iterator"* %this1 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZNSt18_Bit_iterator_base10_M_bump_upEv(%"struct.std::_Bit_iterator_base"* %this) #6 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Bit_iterator_base"*, align 8 + store %"struct.std::_Bit_iterator_base"* %this, %"struct.std::_Bit_iterator_base"** %this.addr, align 8 + %this1 = load %"struct.std::_Bit_iterator_base"*, %"struct.std::_Bit_iterator_base"** %this.addr, align 8 + %_M_offset = getelementptr inbounds %"struct.std::_Bit_iterator_base", %"struct.std::_Bit_iterator_base"* %this1, i32 0, i32 1 + %0 = load i32, i32* %_M_offset, align 8 + %inc = add i32 %0, 1 + store i32 %inc, i32* %_M_offset, align 8 + %cmp = icmp eq i32 %0, 63 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %_M_offset2 = getelementptr inbounds %"struct.std::_Bit_iterator_base", %"struct.std::_Bit_iterator_base"* %this1, i32 0, i32 1 + store i32 0, i32* %_M_offset2, align 8 + %_M_p = getelementptr inbounds %"struct.std::_Bit_iterator_base", %"struct.std::_Bit_iterator_base"* %this1, i32 0, i32 0 + %1 = load i64*, i64** %_M_p, align 8 + 
%incdec.ptr = getelementptr inbounds i64, i64* %1, i32 1 + store i64* %incdec.ptr, i64** %_M_p, align 8 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local dereferenceable(1) %"class.std::allocator.1"* @_ZN9__gnu_cxx14__alloc_traitsISaImEE17_S_select_on_copyERKS1_(%"class.std::allocator.1"* dereferenceable(1) %__a) #6 comdat align 2 { +entry: + %__a.addr = alloca %"class.std::allocator.1"*, align 8 + store %"class.std::allocator.1"* %__a, %"class.std::allocator.1"** %__a.addr, align 8 + %0 = load %"class.std::allocator.1"*, %"class.std::allocator.1"** %__a.addr, align 8 + ret %"class.std::allocator.1"* %0 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local dereferenceable(1) %"class.std::allocator.1"* @_ZNKSt13_Bvector_baseISaIbEE20_M_get_Bit_allocatorEv(%"struct.std::_Bvector_base"* %this) #6 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Bvector_base"*, align 8 + store %"struct.std::_Bvector_base"* %this, %"struct.std::_Bvector_base"** %this.addr, align 8 + %this1 = load %"struct.std::_Bvector_base"*, %"struct.std::_Bvector_base"** %this.addr, align 8 + %_M_impl = getelementptr inbounds %"struct.std::_Bvector_base", %"struct.std::_Bvector_base"* %this1, i32 0, i32 0 + %0 = bitcast %"struct.std::_Bvector_base >::_Bvector_impl"* %_M_impl to %"class.std::allocator.1"* + ret %"class.std::allocator.1"* %0 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZNSaIbEC2ImEERKSaIT_E(%"class.std::allocator.13"* %this, %"class.std::allocator.1"* dereferenceable(1) %0) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %"class.std::allocator.13"*, align 8 + %.addr = alloca %"class.std::allocator.1"*, align 8 + store %"class.std::allocator.13"* %this, %"class.std::allocator.13"** %this.addr, align 8 + store %"class.std::allocator.1"* %0, %"class.std::allocator.1"** 
%.addr, align 8 + %this1 = load %"class.std::allocator.13"*, %"class.std::allocator.13"** %this.addr, align 8 + %1 = bitcast %"class.std::allocator.13"* %this1 to %"class.__gnu_cxx::new_allocator.14"* + call void @_ZN9__gnu_cxx13new_allocatorIbEC2Ev(%"class.__gnu_cxx::new_allocator.14"* %1) #3 + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt13_Bvector_baseISaIbEEC2ERKS0_(%"struct.std::_Bvector_base"* %this, %"class.std::allocator.13"* dereferenceable(1) %__a) unnamed_addr #0 comdat align 2 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %this.addr = alloca %"struct.std::_Bvector_base"*, align 8 + %__a.addr = alloca %"class.std::allocator.13"*, align 8 + %ref.tmp = alloca %"class.std::allocator.1", align 1 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + store %"struct.std::_Bvector_base"* %this, %"struct.std::_Bvector_base"** %this.addr, align 8 + store %"class.std::allocator.13"* %__a, %"class.std::allocator.13"** %__a.addr, align 8 + %this1 = load %"struct.std::_Bvector_base"*, %"struct.std::_Bvector_base"** %this.addr, align 8 + %_M_impl = getelementptr inbounds %"struct.std::_Bvector_base", %"struct.std::_Bvector_base"* %this1, i32 0, i32 0 + %0 = load %"class.std::allocator.13"*, %"class.std::allocator.13"** %__a.addr, align 8 + call void @_ZNSaImEC2IbEERKSaIT_E(%"class.std::allocator.1"* %ref.tmp, %"class.std::allocator.13"* dereferenceable(1) %0) #3 + invoke void @_ZNSt13_Bvector_baseISaIbEE13_Bvector_implC2ERKSaImE(%"struct.std::_Bvector_base >::_Bvector_impl"* %_M_impl, %"class.std::allocator.1"* dereferenceable(1) %ref.tmp) + to label %invoke.cont unwind label %lpad + +invoke.cont: ; preds = %entry + call void @_ZNSaImED2Ev(%"class.std::allocator.1"* %ref.tmp) #3 + ret void + +lpad: ; preds = %entry + %1 = landingpad { i8*, i32 } + cleanup + %2 = extractvalue { i8*, i32 } %1, 0 + store i8* %2, i8** %exn.slot, align 8 + %3 = extractvalue { i8*, i32 } %1, 1 + store 
i32 %3, i32* %ehselector.slot, align 4 + call void @_ZNSaImED2Ev(%"class.std::allocator.1"* %ref.tmp) #3 + br label %eh.resume + +eh.resume: ; preds = %lpad + %exn = load i8*, i8** %exn.slot, align 8 + %sel = load i32, i32* %ehselector.slot, align 4 + %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn, 0 + %lpad.val2 = insertvalue { i8*, i32 } %lpad.val, i32 %sel, 1 + resume { i8*, i32 } %lpad.val2 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZNSaIbED2Ev(%"class.std::allocator.13"* %this) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %"class.std::allocator.13"*, align 8 + store %"class.std::allocator.13"* %this, %"class.std::allocator.13"** %this.addr, align 8 + %this1 = load %"class.std::allocator.13"*, %"class.std::allocator.13"** %this.addr, align 8 + %0 = bitcast %"class.std::allocator.13"* %this1 to %"class.__gnu_cxx::new_allocator.14"* + call void @_ZN9__gnu_cxx13new_allocatorIbED2Ev(%"class.__gnu_cxx::new_allocator.14"* %0) #3 + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt13_Bvector_baseISaIbEED2Ev(%"struct.std::_Bvector_base"* %this) unnamed_addr #0 comdat align 2 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %this.addr = alloca %"struct.std::_Bvector_base"*, align 8 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + store %"struct.std::_Bvector_base"* %this, %"struct.std::_Bvector_base"** %this.addr, align 8 + %this1 = load %"struct.std::_Bvector_base"*, %"struct.std::_Bvector_base"** %this.addr, align 8 + invoke void @_ZNSt13_Bvector_baseISaIbEE13_M_deallocateEv(%"struct.std::_Bvector_base"* %this1) + to label %invoke.cont unwind label %lpad + +invoke.cont: ; preds = %entry + %_M_impl = getelementptr inbounds %"struct.std::_Bvector_base", %"struct.std::_Bvector_base"* %this1, i32 0, i32 0 + call void @_ZNSt13_Bvector_baseISaIbEE13_Bvector_implD2Ev(%"struct.std::_Bvector_base 
>::_Bvector_impl"* %_M_impl) #3 + ret void + +lpad: ; preds = %entry + %0 = landingpad { i8*, i32 } + cleanup + %1 = extractvalue { i8*, i32 } %0, 0 + store i8* %1, i8** %exn.slot, align 8 + %2 = extractvalue { i8*, i32 } %0, 1 + store i32 %2, i32* %ehselector.slot, align 4 + %_M_impl2 = getelementptr inbounds %"struct.std::_Bvector_base", %"struct.std::_Bvector_base"* %this1, i32 0, i32 0 + call void @_ZNSt13_Bvector_baseISaIbEE13_Bvector_implD2Ev(%"struct.std::_Bvector_base >::_Bvector_impl"* %_M_impl2) #3 + br label %eh.resume + +eh.resume: ; preds = %lpad + %exn = load i8*, i8** %exn.slot, align 8 + %sel = load i32, i32* %ehselector.slot, align 4 + %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn, 0 + %lpad.val3 = insertvalue { i8*, i32 } %lpad.val, i32 %sel, 1 + resume { i8*, i32 } %lpad.val3 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx13new_allocatorIbEC2Ev(%"class.__gnu_cxx::new_allocator.14"* %this) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::new_allocator.14"*, align 8 + store %"class.__gnu_cxx::new_allocator.14"* %this, %"class.__gnu_cxx::new_allocator.14"** %this.addr, align 8 + %this1 = load %"class.__gnu_cxx::new_allocator.14"*, %"class.__gnu_cxx::new_allocator.14"** %this.addr, align 8 + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZNSaImEC2IbEERKSaIT_E(%"class.std::allocator.1"* %this, %"class.std::allocator.13"* dereferenceable(1) %0) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %"class.std::allocator.1"*, align 8 + %.addr = alloca %"class.std::allocator.13"*, align 8 + store %"class.std::allocator.1"* %this, %"class.std::allocator.1"** %this.addr, align 8 + store %"class.std::allocator.13"* %0, %"class.std::allocator.13"** %.addr, align 8 + %this1 = load %"class.std::allocator.1"*, %"class.std::allocator.1"** %this.addr, align 8 + %1 = bitcast 
%"class.std::allocator.1"* %this1 to %"class.__gnu_cxx::new_allocator.2"* + call void @_ZN9__gnu_cxx13new_allocatorImEC2Ev(%"class.__gnu_cxx::new_allocator.2"* %1) #3 + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt13_Bvector_baseISaIbEE13_Bvector_implC2ERKSaImE(%"struct.std::_Bvector_base >::_Bvector_impl"* %this, %"class.std::allocator.1"* dereferenceable(1) %__a) unnamed_addr #0 comdat align 2 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %this.addr = alloca %"struct.std::_Bvector_base >::_Bvector_impl"*, align 8 + %__a.addr = alloca %"class.std::allocator.1"*, align 8 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + store %"struct.std::_Bvector_base >::_Bvector_impl"* %this, %"struct.std::_Bvector_base >::_Bvector_impl"** %this.addr, align 8 + store %"class.std::allocator.1"* %__a, %"class.std::allocator.1"** %__a.addr, align 8 + %this1 = load %"struct.std::_Bvector_base >::_Bvector_impl"*, %"struct.std::_Bvector_base >::_Bvector_impl"** %this.addr, align 8 + %0 = bitcast %"struct.std::_Bvector_base >::_Bvector_impl"* %this1 to %"class.std::allocator.1"* + %1 = load %"class.std::allocator.1"*, %"class.std::allocator.1"** %__a.addr, align 8 + call void @_ZNSaImEC2ERKS_(%"class.std::allocator.1"* %0, %"class.std::allocator.1"* dereferenceable(1) %1) #3 + %_M_start = getelementptr inbounds %"struct.std::_Bvector_base >::_Bvector_impl", %"struct.std::_Bvector_base >::_Bvector_impl"* %this1, i32 0, i32 0 + invoke void @_ZNSt13_Bit_iteratorC2Ev(%"struct.std::_Bit_iterator"* %_M_start) + to label %invoke.cont unwind label %lpad + +invoke.cont: ; preds = %entry + %_M_finish = getelementptr inbounds %"struct.std::_Bvector_base >::_Bvector_impl", %"struct.std::_Bvector_base >::_Bvector_impl"* %this1, i32 0, i32 1 + invoke void @_ZNSt13_Bit_iteratorC2Ev(%"struct.std::_Bit_iterator"* %_M_finish) + to label %invoke.cont2 unwind label %lpad + +invoke.cont2: ; preds = %invoke.cont 
+ %_M_end_of_storage = getelementptr inbounds %"struct.std::_Bvector_base >::_Bvector_impl", %"struct.std::_Bvector_base >::_Bvector_impl"* %this1, i32 0, i32 2 + store i64* null, i64** %_M_end_of_storage, align 8 + ret void + +lpad: ; preds = %invoke.cont, %entry + %2 = landingpad { i8*, i32 } + cleanup + %3 = extractvalue { i8*, i32 } %2, 0 + store i8* %3, i8** %exn.slot, align 8 + %4 = extractvalue { i8*, i32 } %2, 1 + store i32 %4, i32* %ehselector.slot, align 4 + %5 = bitcast %"struct.std::_Bvector_base >::_Bvector_impl"* %this1 to %"class.std::allocator.1"* + call void @_ZNSaImED2Ev(%"class.std::allocator.1"* %5) #3 + br label %eh.resume + +eh.resume: ; preds = %lpad + %exn = load i8*, i8** %exn.slot, align 8 + %sel = load i32, i32* %ehselector.slot, align 4 + %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn, 0 + %lpad.val3 = insertvalue { i8*, i32 } %lpad.val, i32 %sel, 1 + resume { i8*, i32 } %lpad.val3 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZNSaImED2Ev(%"class.std::allocator.1"* %this) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %"class.std::allocator.1"*, align 8 + store %"class.std::allocator.1"* %this, %"class.std::allocator.1"** %this.addr, align 8 + %this1 = load %"class.std::allocator.1"*, %"class.std::allocator.1"** %this.addr, align 8 + %0 = bitcast %"class.std::allocator.1"* %this1 to %"class.__gnu_cxx::new_allocator.2"* + call void @_ZN9__gnu_cxx13new_allocatorImED2Ev(%"class.__gnu_cxx::new_allocator.2"* %0) #3 + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZNSaImEC2ERKS_(%"class.std::allocator.1"* %this, %"class.std::allocator.1"* dereferenceable(1) %__a) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %"class.std::allocator.1"*, align 8 + %__a.addr = alloca %"class.std::allocator.1"*, align 8 + store %"class.std::allocator.1"* %this, %"class.std::allocator.1"** %this.addr, align 8 + store 
%"class.std::allocator.1"* %__a, %"class.std::allocator.1"** %__a.addr, align 8 + %this1 = load %"class.std::allocator.1"*, %"class.std::allocator.1"** %this.addr, align 8 + %0 = bitcast %"class.std::allocator.1"* %this1 to %"class.__gnu_cxx::new_allocator.2"* + %1 = load %"class.std::allocator.1"*, %"class.std::allocator.1"** %__a.addr, align 8 + %2 = bitcast %"class.std::allocator.1"* %1 to %"class.__gnu_cxx::new_allocator.2"* + call void @_ZN9__gnu_cxx13new_allocatorImEC2ERKS1_(%"class.__gnu_cxx::new_allocator.2"* %0, %"class.__gnu_cxx::new_allocator.2"* dereferenceable(1) %2) #3 + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx13new_allocatorImEC2ERKS1_(%"class.__gnu_cxx::new_allocator.2"* %this, %"class.__gnu_cxx::new_allocator.2"* dereferenceable(1) %0) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::new_allocator.2"*, align 8 + %.addr = alloca %"class.__gnu_cxx::new_allocator.2"*, align 8 + store %"class.__gnu_cxx::new_allocator.2"* %this, %"class.__gnu_cxx::new_allocator.2"** %this.addr, align 8 + store %"class.__gnu_cxx::new_allocator.2"* %0, %"class.__gnu_cxx::new_allocator.2"** %.addr, align 8 + %this1 = load %"class.__gnu_cxx::new_allocator.2"*, %"class.__gnu_cxx::new_allocator.2"** %this.addr, align 8 + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx13new_allocatorIbED2Ev(%"class.__gnu_cxx::new_allocator.14"* %this) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::new_allocator.14"*, align 8 + store %"class.__gnu_cxx::new_allocator.14"* %this, %"class.__gnu_cxx::new_allocator.14"** %this.addr, align 8 + %this1 = load %"class.__gnu_cxx::new_allocator.14"*, %"class.__gnu_cxx::new_allocator.14"** %this.addr, align 8 + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void 
@_ZNSt13_Bvector_baseISaIbEE13_Bvector_implD2Ev(%"struct.std::_Bvector_base >::_Bvector_impl"* %this) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Bvector_base >::_Bvector_impl"*, align 8 + store %"struct.std::_Bvector_base >::_Bvector_impl"* %this, %"struct.std::_Bvector_base >::_Bvector_impl"** %this.addr, align 8 + %this1 = load %"struct.std::_Bvector_base >::_Bvector_impl"*, %"struct.std::_Bvector_base >::_Bvector_impl"** %this.addr, align 8 + %0 = bitcast %"struct.std::_Bvector_base >::_Bvector_impl"* %this1 to %"class.std::allocator.1"* + call void @_ZNSaImED2Ev(%"class.std::allocator.1"* %0) #3 + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local { i64*, i32 } @_ZNSt13_Bit_iteratorppEi(%"struct.std::_Bit_iterator"* %this, i32 %0) #6 comdat align 2 { +entry: + %retval = alloca %"struct.std::_Bit_iterator", align 8 + %this.addr = alloca %"struct.std::_Bit_iterator"*, align 8 + %.addr = alloca i32, align 4 + store %"struct.std::_Bit_iterator"* %this, %"struct.std::_Bit_iterator"** %this.addr, align 8 + store i32 %0, i32* %.addr, align 4 + %this1 = load %"struct.std::_Bit_iterator"*, %"struct.std::_Bit_iterator"** %this.addr, align 8 + %1 = bitcast %"struct.std::_Bit_iterator"* %retval to i8* + %2 = bitcast %"struct.std::_Bit_iterator"* %this1 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %1, i8* align 8 %2, i64 16, i1 false) + %3 = bitcast %"struct.std::_Bit_iterator"* %this1 to %"struct.std::_Bit_iterator_base"* + call void @_ZNSt18_Bit_iterator_base10_M_bump_upEv(%"struct.std::_Bit_iterator_base"* %3) + %4 = bitcast %"struct.std::_Bit_iterator"* %retval to { i64*, i32 }* + %5 = load { i64*, i32 }, { i64*, i32 }* %4, align 8 + ret { i64*, i32 } %5 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt6vectorIbSaIbEE13_M_insert_auxESt13_Bit_iteratorb(%"class.std::vector.0"* %this, i64* %__position.coerce0, i32 %__position.coerce1, i1 
zeroext %__x) #0 comdat align 2 { +entry: + %__position = alloca %"struct.std::_Bit_iterator", align 8 + %this.addr = alloca %"class.std::vector.0"*, align 8 + %__x.addr = alloca i8, align 1 + %agg.tmp = alloca %"struct.std::_Bit_iterator", align 8 + %agg.tmp3 = alloca %"struct.std::_Bit_iterator", align 8 + %agg.tmp6 = alloca %"struct.std::_Bit_iterator", align 8 + %coerce = alloca %"struct.std::_Bit_iterator", align 8 + %ref.tmp = alloca %"struct.std::_Bit_reference", align 8 + %__len = alloca i64, align 8 + %__q = alloca i64*, align 8 + %__start = alloca %"struct.std::_Bit_iterator", align 8 + %__i = alloca %"struct.std::_Bit_iterator", align 8 + %agg.tmp19 = alloca %"struct.std::_Bit_const_iterator", align 8 + %ref.tmp20 = alloca %"struct.std::_Bit_iterator", align 8 + %agg.tmp22 = alloca %"struct.std::_Bit_const_iterator", align 8 + %agg.tmp23 = alloca %"struct.std::_Bit_iterator", align 8 + %ref.tmp26 = alloca %"struct.std::_Bit_reference", align 8 + %ref.tmp27 = alloca %"struct.std::_Bit_iterator", align 8 + %__finish = alloca %"struct.std::_Bit_iterator", align 8 + %agg.tmp31 = alloca %"struct.std::_Bit_iterator", align 8 + %agg.tmp32 = alloca %"struct.std::_Bit_iterator", align 8 + %agg.tmp34 = alloca %"struct.std::_Bit_iterator", align 8 + %0 = bitcast %"struct.std::_Bit_iterator"* %__position to { i64*, i32 }* + %1 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %0, i32 0, i32 0 + store i64* %__position.coerce0, i64** %1, align 8 + %2 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %0, i32 0, i32 1 + store i32 %__position.coerce1, i32* %2, align 8 + store %"class.std::vector.0"* %this, %"class.std::vector.0"** %this.addr, align 8 + %frombool = zext i1 %__x to i8 + store i8 %frombool, i8* %__x.addr, align 1 + %this1 = load %"class.std::vector.0"*, %"class.std::vector.0"** %this.addr, align 8 + %3 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Bvector_base"* + %_M_impl = getelementptr inbounds %"struct.std::_Bvector_base", 
%"struct.std::_Bvector_base"* %3, i32 0, i32 0 + %_M_finish = getelementptr inbounds %"struct.std::_Bvector_base >::_Bvector_impl", %"struct.std::_Bvector_base >::_Bvector_impl"* %_M_impl, i32 0, i32 1 + %4 = bitcast %"struct.std::_Bit_iterator"* %_M_finish to %"struct.std::_Bit_iterator_base"* + %_M_p = getelementptr inbounds %"struct.std::_Bit_iterator_base", %"struct.std::_Bit_iterator_base"* %4, i32 0, i32 0 + %5 = load i64*, i64** %_M_p, align 8 + %6 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Bvector_base"* + %_M_impl2 = getelementptr inbounds %"struct.std::_Bvector_base", %"struct.std::_Bvector_base"* %6, i32 0, i32 0 + %call = call i64* @_ZNKSt13_Bvector_baseISaIbEE13_Bvector_impl11_M_end_addrEv(%"struct.std::_Bvector_base >::_Bvector_impl"* %_M_impl2) + %cmp = icmp ne i64* %5, %call + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %7 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp to i8* + %8 = bitcast %"struct.std::_Bit_iterator"* %__position to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %7, i8* align 8 %8, i64 16, i1 false) + %9 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Bvector_base"* + %_M_impl4 = getelementptr inbounds %"struct.std::_Bvector_base", %"struct.std::_Bvector_base"* %9, i32 0, i32 0 + %_M_finish5 = getelementptr inbounds %"struct.std::_Bvector_base >::_Bvector_impl", %"struct.std::_Bvector_base >::_Bvector_impl"* %_M_impl4, i32 0, i32 1 + %10 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp3 to i8* + %11 = bitcast %"struct.std::_Bit_iterator"* %_M_finish5 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %10, i8* align 8 %11, i64 16, i1 false) + %12 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Bvector_base"* + %_M_impl7 = getelementptr inbounds %"struct.std::_Bvector_base", %"struct.std::_Bvector_base"* %12, i32 0, i32 0 + %_M_finish8 = getelementptr inbounds %"struct.std::_Bvector_base >::_Bvector_impl", %"struct.std::_Bvector_base 
>::_Bvector_impl"* %_M_impl7, i32 0, i32 1 + %call9 = call { i64*, i32 } @_ZNKSt13_Bit_iteratorplEl(%"struct.std::_Bit_iterator"* %_M_finish8, i64 1) + %13 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp6 to { i64*, i32 }* + %14 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %13, i32 0, i32 0 + %15 = extractvalue { i64*, i32 } %call9, 0 + store i64* %15, i64** %14, align 8 + %16 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %13, i32 0, i32 1 + %17 = extractvalue { i64*, i32 } %call9, 1 + store i32 %17, i32* %16, align 8 + %18 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp to { i64*, i32 }* + %19 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %18, i32 0, i32 0 + %20 = load i64*, i64** %19, align 8 + %21 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %18, i32 0, i32 1 + %22 = load i32, i32* %21, align 8 + %23 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp3 to { i64*, i32 }* + %24 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %23, i32 0, i32 0 + %25 = load i64*, i64** %24, align 8 + %26 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %23, i32 0, i32 1 + %27 = load i32, i32* %26, align 8 + %28 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp6 to { i64*, i32 }* + %29 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %28, i32 0, i32 0 + %30 = load i64*, i64** %29, align 8 + %31 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %28, i32 0, i32 1 + %32 = load i32, i32* %31, align 8 + %call10 = call { i64*, i32 } @_ZSt13copy_backwardISt13_Bit_iteratorS0_ET0_T_S2_S1_(i64* %20, i32 %22, i64* %25, i32 %27, i64* %30, i32 %32) + %33 = bitcast %"struct.std::_Bit_iterator"* %coerce to { i64*, i32 }* + %34 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %33, i32 0, i32 0 + %35 = extractvalue { i64*, i32 } %call10, 0 + store i64* %35, i64** %34, align 8 + %36 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %33, i32 0, i32 1 + %37 = extractvalue { i64*, i32 } %call10, 1 + store i32 %37, i32* %36, align 8 + 
%38 = load i8, i8* %__x.addr, align 1 + %tobool = trunc i8 %38 to i1 + %call11 = call { i64*, i64 } @_ZNKSt13_Bit_iteratordeEv(%"struct.std::_Bit_iterator"* %__position) + %39 = bitcast %"struct.std::_Bit_reference"* %ref.tmp to { i64*, i64 }* + %40 = getelementptr inbounds { i64*, i64 }, { i64*, i64 }* %39, i32 0, i32 0 + %41 = extractvalue { i64*, i64 } %call11, 0 + store i64* %41, i64** %40, align 8 + %42 = getelementptr inbounds { i64*, i64 }, { i64*, i64 }* %39, i32 0, i32 1 + %43 = extractvalue { i64*, i64 } %call11, 1 + store i64 %43, i64* %42, align 8 + %call12 = call dereferenceable(16) %"struct.std::_Bit_reference"* @_ZNSt14_Bit_referenceaSEb(%"struct.std::_Bit_reference"* %ref.tmp, i1 zeroext %tobool) + %44 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Bvector_base"* + %_M_impl13 = getelementptr inbounds %"struct.std::_Bvector_base", %"struct.std::_Bvector_base"* %44, i32 0, i32 0 + %_M_finish14 = getelementptr inbounds %"struct.std::_Bvector_base >::_Bvector_impl", %"struct.std::_Bvector_base >::_Bvector_impl"* %_M_impl13, i32 0, i32 1 + %call15 = call dereferenceable(16) %"struct.std::_Bit_iterator"* @_ZNSt13_Bit_iteratorppEv(%"struct.std::_Bit_iterator"* %_M_finish14) + br label %if.end + +if.else: ; preds = %entry + %call16 = call i64 @_ZNKSt6vectorIbSaIbEE12_M_check_lenEmPKc(%"class.std::vector.0"* %this1, i64 1, i8* getelementptr inbounds ([28 x i8], [28 x i8]* @.str.27, i64 0, i64 0)) + store i64 %call16, i64* %__len, align 8 + %45 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Bvector_base"* + %46 = load i64, i64* %__len, align 8 + %call17 = call i64* @_ZNSt13_Bvector_baseISaIbEE11_M_allocateEm(%"struct.std::_Bvector_base"* %45, i64 %46) + store i64* %call17, i64** %__q, align 8 + %47 = load i64*, i64** %__q, align 8 + %call18 = call i64* @_ZSt11__addressofImEPT_RS0_(i64* dereferenceable(8) %47) + call void @_ZNSt13_Bit_iteratorC2EPmj(%"struct.std::_Bit_iterator"* %__start, i64* %call18, i32 0) + %call21 = call { i64*, 
i32 } @_ZNSt6vectorIbSaIbEE5beginEv(%"class.std::vector.0"* %this1) + %48 = bitcast %"struct.std::_Bit_iterator"* %ref.tmp20 to { i64*, i32 }* + %49 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %48, i32 0, i32 0 + %50 = extractvalue { i64*, i32 } %call21, 0 + store i64* %50, i64** %49, align 8 + %51 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %48, i32 0, i32 1 + %52 = extractvalue { i64*, i32 } %call21, 1 + store i32 %52, i32* %51, align 8 + call void @_ZNSt19_Bit_const_iteratorC2ERKSt13_Bit_iterator(%"struct.std::_Bit_const_iterator"* %agg.tmp19, %"struct.std::_Bit_iterator"* dereferenceable(16) %ref.tmp20) + call void @_ZNSt19_Bit_const_iteratorC2ERKSt13_Bit_iterator(%"struct.std::_Bit_const_iterator"* %agg.tmp22, %"struct.std::_Bit_iterator"* dereferenceable(16) %__position) + %53 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp23 to i8* + %54 = bitcast %"struct.std::_Bit_iterator"* %__start to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %53, i8* align 8 %54, i64 16, i1 false) + %55 = bitcast %"struct.std::_Bit_const_iterator"* %agg.tmp19 to { i64*, i32 }* + %56 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %55, i32 0, i32 0 + %57 = load i64*, i64** %56, align 8 + %58 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %55, i32 0, i32 1 + %59 = load i32, i32* %58, align 8 + %60 = bitcast %"struct.std::_Bit_const_iterator"* %agg.tmp22 to { i64*, i32 }* + %61 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %60, i32 0, i32 0 + %62 = load i64*, i64** %61, align 8 + %63 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %60, i32 0, i32 1 + %64 = load i32, i32* %63, align 8 + %call24 = call { i64*, i32 } @_ZNSt6vectorIbSaIbEE15_M_copy_alignedESt19_Bit_const_iteratorS2_St13_Bit_iterator(%"class.std::vector.0"* %this1, i64* %57, i32 %59, i64* %62, i32 %64, %"struct.std::_Bit_iterator"* byval(%"struct.std::_Bit_iterator") align 8 %agg.tmp23) + %65 = bitcast %"struct.std::_Bit_iterator"* %__i to { i64*, i32 }* + %66 = 
getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %65, i32 0, i32 0 + %67 = extractvalue { i64*, i32 } %call24, 0 + store i64* %67, i64** %66, align 8 + %68 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %65, i32 0, i32 1 + %69 = extractvalue { i64*, i32 } %call24, 1 + store i32 %69, i32* %68, align 8 + %70 = load i8, i8* %__x.addr, align 1 + %tobool25 = trunc i8 %70 to i1 + %call28 = call { i64*, i32 } @_ZNSt13_Bit_iteratorppEi(%"struct.std::_Bit_iterator"* %__i, i32 0) + %71 = bitcast %"struct.std::_Bit_iterator"* %ref.tmp27 to { i64*, i32 }* + %72 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %71, i32 0, i32 0 + %73 = extractvalue { i64*, i32 } %call28, 0 + store i64* %73, i64** %72, align 8 + %74 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %71, i32 0, i32 1 + %75 = extractvalue { i64*, i32 } %call28, 1 + store i32 %75, i32* %74, align 8 + %call29 = call { i64*, i64 } @_ZNKSt13_Bit_iteratordeEv(%"struct.std::_Bit_iterator"* %ref.tmp27) + %76 = bitcast %"struct.std::_Bit_reference"* %ref.tmp26 to { i64*, i64 }* + %77 = getelementptr inbounds { i64*, i64 }, { i64*, i64 }* %76, i32 0, i32 0 + %78 = extractvalue { i64*, i64 } %call29, 0 + store i64* %78, i64** %77, align 8 + %79 = getelementptr inbounds { i64*, i64 }, { i64*, i64 }* %76, i32 0, i32 1 + %80 = extractvalue { i64*, i64 } %call29, 1 + store i64 %80, i64* %79, align 8 + %call30 = call dereferenceable(16) %"struct.std::_Bit_reference"* @_ZNSt14_Bit_referenceaSEb(%"struct.std::_Bit_reference"* %ref.tmp26, i1 zeroext %tobool25) + %81 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp31 to i8* + %82 = bitcast %"struct.std::_Bit_iterator"* %__position to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %81, i8* align 8 %82, i64 16, i1 false) + %call33 = call { i64*, i32 } @_ZNSt6vectorIbSaIbEE3endEv(%"class.std::vector.0"* %this1) + %83 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp32 to { i64*, i32 }* + %84 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %83, i32 
0, i32 0 + %85 = extractvalue { i64*, i32 } %call33, 0 + store i64* %85, i64** %84, align 8 + %86 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %83, i32 0, i32 1 + %87 = extractvalue { i64*, i32 } %call33, 1 + store i32 %87, i32* %86, align 8 + %88 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp34 to i8* + %89 = bitcast %"struct.std::_Bit_iterator"* %__i to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %88, i8* align 8 %89, i64 16, i1 false) + %90 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp31 to { i64*, i32 }* + %91 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %90, i32 0, i32 0 + %92 = load i64*, i64** %91, align 8 + %93 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %90, i32 0, i32 1 + %94 = load i32, i32* %93, align 8 + %95 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp32 to { i64*, i32 }* + %96 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %95, i32 0, i32 0 + %97 = load i64*, i64** %96, align 8 + %98 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %95, i32 0, i32 1 + %99 = load i32, i32* %98, align 8 + %100 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp34 to { i64*, i32 }* + %101 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %100, i32 0, i32 0 + %102 = load i64*, i64** %101, align 8 + %103 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %100, i32 0, i32 1 + %104 = load i32, i32* %103, align 8 + %call35 = call { i64*, i32 } @_ZSt4copyISt13_Bit_iteratorS0_ET0_T_S2_S1_(i64* %92, i32 %94, i64* %97, i32 %99, i64* %102, i32 %104) + %105 = bitcast %"struct.std::_Bit_iterator"* %__finish to { i64*, i32 }* + %106 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %105, i32 0, i32 0 + %107 = extractvalue { i64*, i32 } %call35, 0 + store i64* %107, i64** %106, align 8 + %108 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %105, i32 0, i32 1 + %109 = extractvalue { i64*, i32 } %call35, 1 + store i32 %109, i32* %108, align 8 + %110 = bitcast %"class.std::vector.0"* %this1 to 
%"struct.std::_Bvector_base"* + call void @_ZNSt13_Bvector_baseISaIbEE13_M_deallocateEv(%"struct.std::_Bvector_base"* %110) + %111 = load i64*, i64** %__q, align 8 + %112 = load i64, i64* %__len, align 8 + %call36 = call i64 @_ZNSt13_Bvector_baseISaIbEE8_S_nwordEm(i64 %112) + %add.ptr = getelementptr inbounds i64, i64* %111, i64 %call36 + %113 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Bvector_base"* + %_M_impl37 = getelementptr inbounds %"struct.std::_Bvector_base", %"struct.std::_Bvector_base"* %113, i32 0, i32 0 + %_M_end_of_storage = getelementptr inbounds %"struct.std::_Bvector_base >::_Bvector_impl", %"struct.std::_Bvector_base >::_Bvector_impl"* %_M_impl37, i32 0, i32 2 + store i64* %add.ptr, i64** %_M_end_of_storage, align 8 + %114 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Bvector_base"* + %_M_impl38 = getelementptr inbounds %"struct.std::_Bvector_base", %"struct.std::_Bvector_base"* %114, i32 0, i32 0 + %_M_start = getelementptr inbounds %"struct.std::_Bvector_base >::_Bvector_impl", %"struct.std::_Bvector_base >::_Bvector_impl"* %_M_impl38, i32 0, i32 0 + %115 = bitcast %"struct.std::_Bit_iterator"* %_M_start to i8* + %116 = bitcast %"struct.std::_Bit_iterator"* %__start to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %115, i8* align 8 %116, i64 12, i1 false) + %117 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Bvector_base"* + %_M_impl39 = getelementptr inbounds %"struct.std::_Bvector_base", %"struct.std::_Bvector_base"* %117, i32 0, i32 0 + %_M_finish40 = getelementptr inbounds %"struct.std::_Bvector_base >::_Bvector_impl", %"struct.std::_Bvector_base >::_Bvector_impl"* %_M_impl39, i32 0, i32 1 + %118 = bitcast %"struct.std::_Bit_iterator"* %_M_finish40 to i8* + %119 = bitcast %"struct.std::_Bit_iterator"* %__finish to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %118, i8* align 8 %119, i64 12, i1 false) + br label %if.end + +if.end: ; preds = %if.else, %if.then + ret void +} + +; 
Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local { i64*, i32 } @_ZNSt6vectorIbSaIbEE3endEv(%"class.std::vector.0"* %this) #6 comdat align 2 { +entry: + %retval = alloca %"struct.std::_Bit_iterator", align 8 + %this.addr = alloca %"class.std::vector.0"*, align 8 + store %"class.std::vector.0"* %this, %"class.std::vector.0"** %this.addr, align 8 + %this1 = load %"class.std::vector.0"*, %"class.std::vector.0"** %this.addr, align 8 + %0 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Bvector_base"* + %_M_impl = getelementptr inbounds %"struct.std::_Bvector_base", %"struct.std::_Bvector_base"* %0, i32 0, i32 0 + %_M_finish = getelementptr inbounds %"struct.std::_Bvector_base >::_Bvector_impl", %"struct.std::_Bvector_base >::_Bvector_impl"* %_M_impl, i32 0, i32 1 + %1 = bitcast %"struct.std::_Bit_iterator"* %retval to i8* + %2 = bitcast %"struct.std::_Bit_iterator"* %_M_finish to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %1, i8* align 8 %2, i64 16, i1 false) + %3 = bitcast %"struct.std::_Bit_iterator"* %retval to { i64*, i32 }* + %4 = load { i64*, i32 }, { i64*, i32 }* %3, align 8 + ret { i64*, i32 } %4 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local { i64*, i32 } @_ZSt13copy_backwardISt13_Bit_iteratorS0_ET0_T_S2_S1_(i64* %__first.coerce0, i32 %__first.coerce1, i64* %__last.coerce0, i32 %__last.coerce1, i64* %__result.coerce0, i32 %__result.coerce1) #0 comdat { +entry: + %retval = alloca %"struct.std::_Bit_iterator", align 8 + %__first = alloca %"struct.std::_Bit_iterator", align 8 + %__last = alloca %"struct.std::_Bit_iterator", align 8 + %__result = alloca %"struct.std::_Bit_iterator", align 8 + %agg.tmp = alloca %"struct.std::_Bit_iterator", align 8 + %agg.tmp1 = alloca %"struct.std::_Bit_iterator", align 8 + %agg.tmp2 = alloca %"struct.std::_Bit_iterator", align 8 + %agg.tmp3 = alloca %"struct.std::_Bit_iterator", align 8 + %agg.tmp5 = alloca %"struct.std::_Bit_iterator", align 8 
+ %0 = bitcast %"struct.std::_Bit_iterator"* %__first to { i64*, i32 }* + %1 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %0, i32 0, i32 0 + store i64* %__first.coerce0, i64** %1, align 8 + %2 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %0, i32 0, i32 1 + store i32 %__first.coerce1, i32* %2, align 8 + %3 = bitcast %"struct.std::_Bit_iterator"* %__last to { i64*, i32 }* + %4 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %3, i32 0, i32 0 + store i64* %__last.coerce0, i64** %4, align 8 + %5 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %3, i32 0, i32 1 + store i32 %__last.coerce1, i32* %5, align 8 + %6 = bitcast %"struct.std::_Bit_iterator"* %__result to { i64*, i32 }* + %7 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %6, i32 0, i32 0 + store i64* %__result.coerce0, i64** %7, align 8 + %8 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %6, i32 0, i32 1 + store i32 %__result.coerce1, i32* %8, align 8 + %9 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp1 to i8* + %10 = bitcast %"struct.std::_Bit_iterator"* %__first to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %9, i8* align 8 %10, i64 16, i1 false) + %11 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp1 to { i64*, i32 }* + %12 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %11, i32 0, i32 0 + %13 = load i64*, i64** %12, align 8 + %14 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %11, i32 0, i32 1 + %15 = load i32, i32* %14, align 8 + %call = call { i64*, i32 } @_ZSt12__miter_baseISt13_Bit_iteratorET_S1_(i64* %13, i32 %15) + %16 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp to { i64*, i32 }* + %17 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %16, i32 0, i32 0 + %18 = extractvalue { i64*, i32 } %call, 0 + store i64* %18, i64** %17, align 8 + %19 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %16, i32 0, i32 1 + %20 = extractvalue { i64*, i32 } %call, 1 + store i32 %20, i32* %19, align 8 + %21 = bitcast 
%"struct.std::_Bit_iterator"* %agg.tmp3 to i8* + %22 = bitcast %"struct.std::_Bit_iterator"* %__last to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %21, i8* align 8 %22, i64 16, i1 false) + %23 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp3 to { i64*, i32 }* + %24 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %23, i32 0, i32 0 + %25 = load i64*, i64** %24, align 8 + %26 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %23, i32 0, i32 1 + %27 = load i32, i32* %26, align 8 + %call4 = call { i64*, i32 } @_ZSt12__miter_baseISt13_Bit_iteratorET_S1_(i64* %25, i32 %27) + %28 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp2 to { i64*, i32 }* + %29 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %28, i32 0, i32 0 + %30 = extractvalue { i64*, i32 } %call4, 0 + store i64* %30, i64** %29, align 8 + %31 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %28, i32 0, i32 1 + %32 = extractvalue { i64*, i32 } %call4, 1 + store i32 %32, i32* %31, align 8 + %33 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp5 to i8* + %34 = bitcast %"struct.std::_Bit_iterator"* %__result to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %33, i8* align 8 %34, i64 16, i1 false) + %35 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp to { i64*, i32 }* + %36 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %35, i32 0, i32 0 + %37 = load i64*, i64** %36, align 8 + %38 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %35, i32 0, i32 1 + %39 = load i32, i32* %38, align 8 + %40 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp2 to { i64*, i32 }* + %41 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %40, i32 0, i32 0 + %42 = load i64*, i64** %41, align 8 + %43 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %40, i32 0, i32 1 + %44 = load i32, i32* %43, align 8 + %45 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp5 to { i64*, i32 }* + %46 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %45, i32 0, i32 0 + %47 = load i64*, i64** 
%46, align 8 + %48 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %45, i32 0, i32 1 + %49 = load i32, i32* %48, align 8 + %call6 = call { i64*, i32 } @_ZSt23__copy_move_backward_a2ILb0ESt13_Bit_iteratorS0_ET1_T0_S2_S1_(i64* %37, i32 %39, i64* %42, i32 %44, i64* %47, i32 %49) + %50 = bitcast %"struct.std::_Bit_iterator"* %retval to { i64*, i32 }* + %51 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %50, i32 0, i32 0 + %52 = extractvalue { i64*, i32 } %call6, 0 + store i64* %52, i64** %51, align 8 + %53 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %50, i32 0, i32 1 + %54 = extractvalue { i64*, i32 } %call6, 1 + store i32 %54, i32* %53, align 8 + %55 = bitcast %"struct.std::_Bit_iterator"* %retval to { i64*, i32 }* + %56 = load { i64*, i32 }, { i64*, i32 }* %55, align 8 + ret { i64*, i32 } %56 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local i64 @_ZNKSt6vectorIbSaIbEE12_M_check_lenEmPKc(%"class.std::vector.0"* %this, i64 %__n, i8* %__s) #0 comdat align 2 { +entry: + %this.addr = alloca %"class.std::vector.0"*, align 8 + %__n.addr = alloca i64, align 8 + %__s.addr = alloca i8*, align 8 + %__len = alloca i64, align 8 + %ref.tmp = alloca i64, align 8 + store %"class.std::vector.0"* %this, %"class.std::vector.0"** %this.addr, align 8 + store i64 %__n, i64* %__n.addr, align 8 + store i8* %__s, i8** %__s.addr, align 8 + %this1 = load %"class.std::vector.0"*, %"class.std::vector.0"** %this.addr, align 8 + %call = call i64 @_ZNKSt6vectorIbSaIbEE8max_sizeEv(%"class.std::vector.0"* %this1) + %call2 = call i64 @_ZNKSt6vectorIbSaIbEE4sizeEv(%"class.std::vector.0"* %this1) + %sub = sub i64 %call, %call2 + %0 = load i64, i64* %__n.addr, align 8 + %cmp = icmp ult i64 %sub, %0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %1 = load i8*, i8** %__s.addr, align 8 + call void @_ZSt20__throw_length_errorPKc(i8* %1) #19 + unreachable + +if.end: ; preds = %entry + %call3 = call i64 
@_ZNKSt6vectorIbSaIbEE4sizeEv(%"class.std::vector.0"* %this1) + %call4 = call i64 @_ZNKSt6vectorIbSaIbEE4sizeEv(%"class.std::vector.0"* %this1) + store i64 %call4, i64* %ref.tmp, align 8 + %call5 = call dereferenceable(8) i64* @_ZSt3maxImERKT_S2_S2_(i64* dereferenceable(8) %ref.tmp, i64* dereferenceable(8) %__n.addr) + %2 = load i64, i64* %call5, align 8 + %add = add i64 %call3, %2 + store i64 %add, i64* %__len, align 8 + %3 = load i64, i64* %__len, align 8 + %call6 = call i64 @_ZNKSt6vectorIbSaIbEE4sizeEv(%"class.std::vector.0"* %this1) + %cmp7 = icmp ult i64 %3, %call6 + br i1 %cmp7, label %cond.true, label %lor.lhs.false + +lor.lhs.false: ; preds = %if.end + %4 = load i64, i64* %__len, align 8 + %call8 = call i64 @_ZNKSt6vectorIbSaIbEE8max_sizeEv(%"class.std::vector.0"* %this1) + %cmp9 = icmp ugt i64 %4, %call8 + br i1 %cmp9, label %cond.true, label %cond.false + +cond.true: ; preds = %lor.lhs.false, %if.end + %call10 = call i64 @_ZNKSt6vectorIbSaIbEE8max_sizeEv(%"class.std::vector.0"* %this1) + br label %cond.end + +cond.false: ; preds = %lor.lhs.false + %5 = load i64, i64* %__len, align 8 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i64 [ %call10, %cond.true ], [ %5, %cond.false ] + ret i64 %cond +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local { i64*, i32 } @_ZSt4copyISt13_Bit_iteratorS0_ET0_T_S2_S1_(i64* %__first.coerce0, i32 %__first.coerce1, i64* %__last.coerce0, i32 %__last.coerce1, i64* %__result.coerce0, i32 %__result.coerce1) #0 comdat { +entry: + %retval = alloca %"struct.std::_Bit_iterator", align 8 + %__first = alloca %"struct.std::_Bit_iterator", align 8 + %__last = alloca %"struct.std::_Bit_iterator", align 8 + %__result = alloca %"struct.std::_Bit_iterator", align 8 + %agg.tmp = alloca %"struct.std::_Bit_iterator", align 8 + %agg.tmp1 = alloca %"struct.std::_Bit_iterator", align 8 + %agg.tmp2 = alloca %"struct.std::_Bit_iterator", align 8 + %agg.tmp3 = alloca 
%"struct.std::_Bit_iterator", align 8 + %agg.tmp5 = alloca %"struct.std::_Bit_iterator", align 8 + %0 = bitcast %"struct.std::_Bit_iterator"* %__first to { i64*, i32 }* + %1 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %0, i32 0, i32 0 + store i64* %__first.coerce0, i64** %1, align 8 + %2 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %0, i32 0, i32 1 + store i32 %__first.coerce1, i32* %2, align 8 + %3 = bitcast %"struct.std::_Bit_iterator"* %__last to { i64*, i32 }* + %4 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %3, i32 0, i32 0 + store i64* %__last.coerce0, i64** %4, align 8 + %5 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %3, i32 0, i32 1 + store i32 %__last.coerce1, i32* %5, align 8 + %6 = bitcast %"struct.std::_Bit_iterator"* %__result to { i64*, i32 }* + %7 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %6, i32 0, i32 0 + store i64* %__result.coerce0, i64** %7, align 8 + %8 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %6, i32 0, i32 1 + store i32 %__result.coerce1, i32* %8, align 8 + %9 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp1 to i8* + %10 = bitcast %"struct.std::_Bit_iterator"* %__first to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %9, i8* align 8 %10, i64 16, i1 false) + %11 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp1 to { i64*, i32 }* + %12 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %11, i32 0, i32 0 + %13 = load i64*, i64** %12, align 8 + %14 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %11, i32 0, i32 1 + %15 = load i32, i32* %14, align 8 + %call = call { i64*, i32 } @_ZSt12__miter_baseISt13_Bit_iteratorET_S1_(i64* %13, i32 %15) + %16 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp to { i64*, i32 }* + %17 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %16, i32 0, i32 0 + %18 = extractvalue { i64*, i32 } %call, 0 + store i64* %18, i64** %17, align 8 + %19 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %16, i32 0, i32 1 + %20 = 
extractvalue { i64*, i32 } %call, 1 + store i32 %20, i32* %19, align 8 + %21 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp3 to i8* + %22 = bitcast %"struct.std::_Bit_iterator"* %__last to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %21, i8* align 8 %22, i64 16, i1 false) + %23 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp3 to { i64*, i32 }* + %24 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %23, i32 0, i32 0 + %25 = load i64*, i64** %24, align 8 + %26 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %23, i32 0, i32 1 + %27 = load i32, i32* %26, align 8 + %call4 = call { i64*, i32 } @_ZSt12__miter_baseISt13_Bit_iteratorET_S1_(i64* %25, i32 %27) + %28 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp2 to { i64*, i32 }* + %29 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %28, i32 0, i32 0 + %30 = extractvalue { i64*, i32 } %call4, 0 + store i64* %30, i64** %29, align 8 + %31 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %28, i32 0, i32 1 + %32 = extractvalue { i64*, i32 } %call4, 1 + store i32 %32, i32* %31, align 8 + %33 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp5 to i8* + %34 = bitcast %"struct.std::_Bit_iterator"* %__result to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %33, i8* align 8 %34, i64 16, i1 false) + %35 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp to { i64*, i32 }* + %36 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %35, i32 0, i32 0 + %37 = load i64*, i64** %36, align 8 + %38 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %35, i32 0, i32 1 + %39 = load i32, i32* %38, align 8 + %40 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp2 to { i64*, i32 }* + %41 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %40, i32 0, i32 0 + %42 = load i64*, i64** %41, align 8 + %43 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %40, i32 0, i32 1 + %44 = load i32, i32* %43, align 8 + %45 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp5 to { i64*, i32 }* + %46 = 
getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %45, i32 0, i32 0 + %47 = load i64*, i64** %46, align 8 + %48 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %45, i32 0, i32 1 + %49 = load i32, i32* %48, align 8 + %call6 = call { i64*, i32 } @_ZSt14__copy_move_a2ILb0ESt13_Bit_iteratorS0_ET1_T0_S2_S1_(i64* %37, i32 %39, i64* %42, i32 %44, i64* %47, i32 %49) + %50 = bitcast %"struct.std::_Bit_iterator"* %retval to { i64*, i32 }* + %51 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %50, i32 0, i32 0 + %52 = extractvalue { i64*, i32 } %call6, 0 + store i64* %52, i64** %51, align 8 + %53 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %50, i32 0, i32 1 + %54 = extractvalue { i64*, i32 } %call6, 1 + store i32 %54, i32* %53, align 8 + %55 = bitcast %"struct.std::_Bit_iterator"* %retval to { i64*, i32 }* + %56 = load { i64*, i32 }, { i64*, i32 }* %55, align 8 + ret { i64*, i32 } %56 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local { i64*, i32 } @_ZSt23__copy_move_backward_a2ILb0ESt13_Bit_iteratorS0_ET1_T0_S2_S1_(i64* %__first.coerce0, i32 %__first.coerce1, i64* %__last.coerce0, i32 %__last.coerce1, i64* %__result.coerce0, i32 %__result.coerce1) #0 comdat { +entry: + %retval = alloca %"struct.std::_Bit_iterator", align 8 + %__first = alloca %"struct.std::_Bit_iterator", align 8 + %__last = alloca %"struct.std::_Bit_iterator", align 8 + %__result = alloca %"struct.std::_Bit_iterator", align 8 + %agg.tmp = alloca %"struct.std::_Bit_iterator", align 8 + %agg.tmp1 = alloca %"struct.std::_Bit_iterator", align 8 + %agg.tmp2 = alloca %"struct.std::_Bit_iterator", align 8 + %agg.tmp3 = alloca %"struct.std::_Bit_iterator", align 8 + %agg.tmp5 = alloca %"struct.std::_Bit_iterator", align 8 + %agg.tmp6 = alloca %"struct.std::_Bit_iterator", align 8 + %0 = bitcast %"struct.std::_Bit_iterator"* %__first to { i64*, i32 }* + %1 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %0, i32 0, i32 0 + store i64* %__first.coerce0, 
i64** %1, align 8 + %2 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %0, i32 0, i32 1 + store i32 %__first.coerce1, i32* %2, align 8 + %3 = bitcast %"struct.std::_Bit_iterator"* %__last to { i64*, i32 }* + %4 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %3, i32 0, i32 0 + store i64* %__last.coerce0, i64** %4, align 8 + %5 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %3, i32 0, i32 1 + store i32 %__last.coerce1, i32* %5, align 8 + %6 = bitcast %"struct.std::_Bit_iterator"* %__result to { i64*, i32 }* + %7 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %6, i32 0, i32 0 + store i64* %__result.coerce0, i64** %7, align 8 + %8 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %6, i32 0, i32 1 + store i32 %__result.coerce1, i32* %8, align 8 + %9 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp1 to i8* + %10 = bitcast %"struct.std::_Bit_iterator"* %__first to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %9, i8* align 8 %10, i64 16, i1 false) + %11 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp1 to { i64*, i32 }* + %12 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %11, i32 0, i32 0 + %13 = load i64*, i64** %12, align 8 + %14 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %11, i32 0, i32 1 + %15 = load i32, i32* %14, align 8 + %call = call { i64*, i32 } @_ZSt12__niter_baseISt13_Bit_iteratorET_S1_(i64* %13, i32 %15) + %16 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp to { i64*, i32 }* + %17 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %16, i32 0, i32 0 + %18 = extractvalue { i64*, i32 } %call, 0 + store i64* %18, i64** %17, align 8 + %19 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %16, i32 0, i32 1 + %20 = extractvalue { i64*, i32 } %call, 1 + store i32 %20, i32* %19, align 8 + %21 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp3 to i8* + %22 = bitcast %"struct.std::_Bit_iterator"* %__last to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %21, i8* align 8 %22, i64 16, i1 
false) + %23 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp3 to { i64*, i32 }* + %24 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %23, i32 0, i32 0 + %25 = load i64*, i64** %24, align 8 + %26 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %23, i32 0, i32 1 + %27 = load i32, i32* %26, align 8 + %call4 = call { i64*, i32 } @_ZSt12__niter_baseISt13_Bit_iteratorET_S1_(i64* %25, i32 %27) + %28 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp2 to { i64*, i32 }* + %29 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %28, i32 0, i32 0 + %30 = extractvalue { i64*, i32 } %call4, 0 + store i64* %30, i64** %29, align 8 + %31 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %28, i32 0, i32 1 + %32 = extractvalue { i64*, i32 } %call4, 1 + store i32 %32, i32* %31, align 8 + %33 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp6 to i8* + %34 = bitcast %"struct.std::_Bit_iterator"* %__result to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %33, i8* align 8 %34, i64 16, i1 false) + %35 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp6 to { i64*, i32 }* + %36 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %35, i32 0, i32 0 + %37 = load i64*, i64** %36, align 8 + %38 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %35, i32 0, i32 1 + %39 = load i32, i32* %38, align 8 + %call7 = call { i64*, i32 } @_ZSt12__niter_baseISt13_Bit_iteratorET_S1_(i64* %37, i32 %39) + %40 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp5 to { i64*, i32 }* + %41 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %40, i32 0, i32 0 + %42 = extractvalue { i64*, i32 } %call7, 0 + store i64* %42, i64** %41, align 8 + %43 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %40, i32 0, i32 1 + %44 = extractvalue { i64*, i32 } %call7, 1 + store i32 %44, i32* %43, align 8 + %45 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp to { i64*, i32 }* + %46 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %45, i32 0, i32 0 + %47 = load i64*, i64** %46, align 8 
+ %48 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %45, i32 0, i32 1 + %49 = load i32, i32* %48, align 8 + %50 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp2 to { i64*, i32 }* + %51 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %50, i32 0, i32 0 + %52 = load i64*, i64** %51, align 8 + %53 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %50, i32 0, i32 1 + %54 = load i32, i32* %53, align 8 + %55 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp5 to { i64*, i32 }* + %56 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %55, i32 0, i32 0 + %57 = load i64*, i64** %56, align 8 + %58 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %55, i32 0, i32 1 + %59 = load i32, i32* %58, align 8 + %call8 = call { i64*, i32 } @_ZSt22__copy_move_backward_aILb0ESt13_Bit_iteratorS0_ET1_T0_S2_S1_(i64* %47, i32 %49, i64* %52, i32 %54, i64* %57, i32 %59) + %60 = bitcast %"struct.std::_Bit_iterator"* %retval to { i64*, i32 }* + %61 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %60, i32 0, i32 0 + %62 = extractvalue { i64*, i32 } %call8, 0 + store i64* %62, i64** %61, align 8 + %63 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %60, i32 0, i32 1 + %64 = extractvalue { i64*, i32 } %call8, 1 + store i32 %64, i32* %63, align 8 + %65 = bitcast %"struct.std::_Bit_iterator"* %retval to { i64*, i32 }* + %66 = load { i64*, i32 }, { i64*, i32 }* %65, align 8 + ret { i64*, i32 } %66 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local { i64*, i32 } @_ZSt12__miter_baseISt13_Bit_iteratorET_S1_(i64* %__it.coerce0, i32 %__it.coerce1) #6 comdat { +entry: + %retval = alloca %"struct.std::_Bit_iterator", align 8 + %__it = alloca %"struct.std::_Bit_iterator", align 8 + %0 = bitcast %"struct.std::_Bit_iterator"* %__it to { i64*, i32 }* + %1 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %0, i32 0, i32 0 + store i64* %__it.coerce0, i64** %1, align 8 + %2 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %0, 
i32 0, i32 1 + store i32 %__it.coerce1, i32* %2, align 8 + %3 = bitcast %"struct.std::_Bit_iterator"* %retval to i8* + %4 = bitcast %"struct.std::_Bit_iterator"* %__it to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %3, i8* align 8 %4, i64 16, i1 false) + %5 = bitcast %"struct.std::_Bit_iterator"* %retval to { i64*, i32 }* + %6 = load { i64*, i32 }, { i64*, i32 }* %5, align 8 + ret { i64*, i32 } %6 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local { i64*, i32 } @_ZSt22__copy_move_backward_aILb0ESt13_Bit_iteratorS0_ET1_T0_S2_S1_(i64* %__first.coerce0, i32 %__first.coerce1, i64* %__last.coerce0, i32 %__last.coerce1, i64* %__result.coerce0, i32 %__result.coerce1) #0 comdat { +entry: + %retval = alloca %"struct.std::_Bit_iterator", align 8 + %__first = alloca %"struct.std::_Bit_iterator", align 8 + %__last = alloca %"struct.std::_Bit_iterator", align 8 + %__result = alloca %"struct.std::_Bit_iterator", align 8 + %__simple = alloca i8, align 1 + %agg.tmp = alloca %"struct.std::_Bit_iterator", align 8 + %agg.tmp1 = alloca %"struct.std::_Bit_iterator", align 8 + %agg.tmp2 = alloca %"struct.std::_Bit_iterator", align 8 + %0 = bitcast %"struct.std::_Bit_iterator"* %__first to { i64*, i32 }* + %1 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %0, i32 0, i32 0 + store i64* %__first.coerce0, i64** %1, align 8 + %2 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %0, i32 0, i32 1 + store i32 %__first.coerce1, i32* %2, align 8 + %3 = bitcast %"struct.std::_Bit_iterator"* %__last to { i64*, i32 }* + %4 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %3, i32 0, i32 0 + store i64* %__last.coerce0, i64** %4, align 8 + %5 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %3, i32 0, i32 1 + store i32 %__last.coerce1, i32* %5, align 8 + %6 = bitcast %"struct.std::_Bit_iterator"* %__result to { i64*, i32 }* + %7 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %6, i32 0, i32 0 + store i64* %__result.coerce0, i64** 
%7, align 8 + %8 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %6, i32 0, i32 1 + store i32 %__result.coerce1, i32* %8, align 8 + store i8 0, i8* %__simple, align 1 + %9 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp to i8* + %10 = bitcast %"struct.std::_Bit_iterator"* %__first to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %9, i8* align 8 %10, i64 16, i1 false) + %11 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp1 to i8* + %12 = bitcast %"struct.std::_Bit_iterator"* %__last to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %11, i8* align 8 %12, i64 16, i1 false) + %13 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp2 to i8* + %14 = bitcast %"struct.std::_Bit_iterator"* %__result to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %13, i8* align 8 %14, i64 16, i1 false) + %15 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp to { i64*, i32 }* + %16 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %15, i32 0, i32 0 + %17 = load i64*, i64** %16, align 8 + %18 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %15, i32 0, i32 1 + %19 = load i32, i32* %18, align 8 + %20 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp1 to { i64*, i32 }* + %21 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %20, i32 0, i32 0 + %22 = load i64*, i64** %21, align 8 + %23 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %20, i32 0, i32 1 + %24 = load i32, i32* %23, align 8 + %25 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp2 to { i64*, i32 }* + %26 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %25, i32 0, i32 0 + %27 = load i64*, i64** %26, align 8 + %28 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %25, i32 0, i32 1 + %29 = load i32, i32* %28, align 8 + %call = call { i64*, i32 } @_ZNSt20__copy_move_backwardILb0ELb0ESt26random_access_iterator_tagE13__copy_move_bISt13_Bit_iteratorS3_EET0_T_S5_S4_(i64* %17, i32 %19, i64* %22, i32 %24, i64* %27, i32 %29) + %30 = bitcast %"struct.std::_Bit_iterator"* %retval to { 
i64*, i32 }* + %31 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %30, i32 0, i32 0 + %32 = extractvalue { i64*, i32 } %call, 0 + store i64* %32, i64** %31, align 8 + %33 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %30, i32 0, i32 1 + %34 = extractvalue { i64*, i32 } %call, 1 + store i32 %34, i32* %33, align 8 + %35 = bitcast %"struct.std::_Bit_iterator"* %retval to { i64*, i32 }* + %36 = load { i64*, i32 }, { i64*, i32 }* %35, align 8 + ret { i64*, i32 } %36 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local { i64*, i32 } @_ZNSt20__copy_move_backwardILb0ELb0ESt26random_access_iterator_tagE13__copy_move_bISt13_Bit_iteratorS3_EET0_T_S5_S4_(i64* %__first.coerce0, i32 %__first.coerce1, i64* %__last.coerce0, i32 %__last.coerce1, i64* %__result.coerce0, i32 %__result.coerce1) #0 comdat align 2 { +entry: + %retval = alloca %"struct.std::_Bit_iterator", align 8 + %__first = alloca %"struct.std::_Bit_iterator", align 8 + %__last = alloca %"struct.std::_Bit_iterator", align 8 + %__result = alloca %"struct.std::_Bit_iterator", align 8 + %__n = alloca i64, align 8 + %ref.tmp = alloca %"struct.std::_Bit_reference", align 8 + %ref.tmp3 = alloca %"struct.std::_Bit_reference", align 8 + %0 = bitcast %"struct.std::_Bit_iterator"* %__first to { i64*, i32 }* + %1 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %0, i32 0, i32 0 + store i64* %__first.coerce0, i64** %1, align 8 + %2 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %0, i32 0, i32 1 + store i32 %__first.coerce1, i32* %2, align 8 + %3 = bitcast %"struct.std::_Bit_iterator"* %__last to { i64*, i32 }* + %4 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %3, i32 0, i32 0 + store i64* %__last.coerce0, i64** %4, align 8 + %5 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %3, i32 0, i32 1 + store i32 %__last.coerce1, i32* %5, align 8 + %6 = bitcast %"struct.std::_Bit_iterator"* %__result to { i64*, i32 }* + %7 = getelementptr inbounds { i64*, i32 }, { 
i64*, i32 }* %6, i32 0, i32 0 + store i64* %__result.coerce0, i64** %7, align 8 + %8 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %6, i32 0, i32 1 + store i32 %__result.coerce1, i32* %8, align 8 + %9 = bitcast %"struct.std::_Bit_iterator"* %__last to %"struct.std::_Bit_iterator_base"* + %10 = bitcast %"struct.std::_Bit_iterator"* %__first to %"struct.std::_Bit_iterator_base"* + %call = call i64 @_ZStmiRKSt18_Bit_iterator_baseS1_(%"struct.std::_Bit_iterator_base"* dereferenceable(16) %9, %"struct.std::_Bit_iterator_base"* dereferenceable(16) %10) + store i64 %call, i64* %__n, align 8 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %11 = load i64, i64* %__n, align 8 + %cmp = icmp sgt i64 %11, 0 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %call1 = call dereferenceable(16) %"struct.std::_Bit_iterator"* @_ZNSt13_Bit_iteratormmEv(%"struct.std::_Bit_iterator"* %__last) + %call2 = call { i64*, i64 } @_ZNKSt13_Bit_iteratordeEv(%"struct.std::_Bit_iterator"* %call1) + %12 = bitcast %"struct.std::_Bit_reference"* %ref.tmp to { i64*, i64 }* + %13 = getelementptr inbounds { i64*, i64 }, { i64*, i64 }* %12, i32 0, i32 0 + %14 = extractvalue { i64*, i64 } %call2, 0 + store i64* %14, i64** %13, align 8 + %15 = getelementptr inbounds { i64*, i64 }, { i64*, i64 }* %12, i32 0, i32 1 + %16 = extractvalue { i64*, i64 } %call2, 1 + store i64 %16, i64* %15, align 8 + %call4 = call dereferenceable(16) %"struct.std::_Bit_iterator"* @_ZNSt13_Bit_iteratormmEv(%"struct.std::_Bit_iterator"* %__result) + %call5 = call { i64*, i64 } @_ZNKSt13_Bit_iteratordeEv(%"struct.std::_Bit_iterator"* %call4) + %17 = bitcast %"struct.std::_Bit_reference"* %ref.tmp3 to { i64*, i64 }* + %18 = getelementptr inbounds { i64*, i64 }, { i64*, i64 }* %17, i32 0, i32 0 + %19 = extractvalue { i64*, i64 } %call5, 0 + store i64* %19, i64** %18, align 8 + %20 = getelementptr inbounds { i64*, i64 }, { i64*, i64 }* %17, i32 0, i32 1 + %21 = extractvalue { i64*, 
i64 } %call5, 1 + store i64 %21, i64* %20, align 8 + %call6 = call dereferenceable(16) %"struct.std::_Bit_reference"* @_ZNSt14_Bit_referenceaSERKS_(%"struct.std::_Bit_reference"* %ref.tmp3, %"struct.std::_Bit_reference"* dereferenceable(16) %ref.tmp) + br label %for.inc + +for.inc: ; preds = %for.body + %22 = load i64, i64* %__n, align 8 + %dec = add nsw i64 %22, -1 + store i64 %dec, i64* %__n, align 8 + br label %for.cond + +for.end: ; preds = %for.cond + %23 = bitcast %"struct.std::_Bit_iterator"* %retval to i8* + %24 = bitcast %"struct.std::_Bit_iterator"* %__result to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %23, i8* align 8 %24, i64 16, i1 false) + %25 = bitcast %"struct.std::_Bit_iterator"* %retval to { i64*, i32 }* + %26 = load { i64*, i32 }, { i64*, i32 }* %25, align 8 + ret { i64*, i32 } %26 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local dereferenceable(16) %"struct.std::_Bit_iterator"* @_ZNSt13_Bit_iteratormmEv(%"struct.std::_Bit_iterator"* %this) #0 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Bit_iterator"*, align 8 + store %"struct.std::_Bit_iterator"* %this, %"struct.std::_Bit_iterator"** %this.addr, align 8 + %this1 = load %"struct.std::_Bit_iterator"*, %"struct.std::_Bit_iterator"** %this.addr, align 8 + %0 = bitcast %"struct.std::_Bit_iterator"* %this1 to %"struct.std::_Bit_iterator_base"* + call void @_ZNSt18_Bit_iterator_base12_M_bump_downEv(%"struct.std::_Bit_iterator_base"* %0) + ret %"struct.std::_Bit_iterator"* %this1 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local dereferenceable(16) %"struct.std::_Bit_reference"* @_ZNSt14_Bit_referenceaSERKS_(%"struct.std::_Bit_reference"* %this, %"struct.std::_Bit_reference"* dereferenceable(16) %__x) #6 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Bit_reference"*, align 8 + %__x.addr = alloca %"struct.std::_Bit_reference"*, align 8 + store %"struct.std::_Bit_reference"* %this, 
%"struct.std::_Bit_reference"** %this.addr, align 8 + store %"struct.std::_Bit_reference"* %__x, %"struct.std::_Bit_reference"** %__x.addr, align 8 + %this1 = load %"struct.std::_Bit_reference"*, %"struct.std::_Bit_reference"** %this.addr, align 8 + %0 = load %"struct.std::_Bit_reference"*, %"struct.std::_Bit_reference"** %__x.addr, align 8 + %call = call zeroext i1 @_ZNKSt14_Bit_referencecvbEv(%"struct.std::_Bit_reference"* %0) + %call2 = call dereferenceable(16) %"struct.std::_Bit_reference"* @_ZNSt14_Bit_referenceaSEb(%"struct.std::_Bit_reference"* %this1, i1 zeroext %call) + ret %"struct.std::_Bit_reference"* %call2 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZNSt18_Bit_iterator_base12_M_bump_downEv(%"struct.std::_Bit_iterator_base"* %this) #6 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Bit_iterator_base"*, align 8 + store %"struct.std::_Bit_iterator_base"* %this, %"struct.std::_Bit_iterator_base"** %this.addr, align 8 + %this1 = load %"struct.std::_Bit_iterator_base"*, %"struct.std::_Bit_iterator_base"** %this.addr, align 8 + %_M_offset = getelementptr inbounds %"struct.std::_Bit_iterator_base", %"struct.std::_Bit_iterator_base"* %this1, i32 0, i32 1 + %0 = load i32, i32* %_M_offset, align 8 + %dec = add i32 %0, -1 + store i32 %dec, i32* %_M_offset, align 8 + %cmp = icmp eq i32 %0, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %_M_offset2 = getelementptr inbounds %"struct.std::_Bit_iterator_base", %"struct.std::_Bit_iterator_base"* %this1, i32 0, i32 1 + store i32 63, i32* %_M_offset2, align 8 + %_M_p = getelementptr inbounds %"struct.std::_Bit_iterator_base", %"struct.std::_Bit_iterator_base"* %this1, i32 0, i32 0 + %1 = load i64*, i64** %_M_p, align 8 + %incdec.ptr = getelementptr inbounds i64, i64* %1, i32 -1 + store i64* %incdec.ptr, i64** %_M_p, align 8 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +; Function Attrs: noinline 
optnone uwtable +define linkonce_odr dso_local i64 @_ZNKSt6vectorIbSaIbEE8max_sizeEv(%"class.std::vector.0"* %this) #0 comdat align 2 { +entry: + %this.addr = alloca %"class.std::vector.0"*, align 8 + %__isize = alloca i64, align 8 + %__asize = alloca i64, align 8 + store %"class.std::vector.0"* %this, %"class.std::vector.0"** %this.addr, align 8 + %this1 = load %"class.std::vector.0"*, %"class.std::vector.0"** %this.addr, align 8 + store i64 9223372036854775744, i64* %__isize, align 8 + %0 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Bvector_base"* + %call = call dereferenceable(1) %"class.std::allocator.1"* @_ZNKSt13_Bvector_baseISaIbEE20_M_get_Bit_allocatorEv(%"struct.std::_Bvector_base"* %0) + %call2 = call i64 @_ZN9__gnu_cxx14__alloc_traitsISaImEE8max_sizeERKS1_(%"class.std::allocator.1"* dereferenceable(1) %call) + store i64 %call2, i64* %__asize, align 8 + %1 = load i64, i64* %__asize, align 8 + %cmp = icmp ule i64 %1, 144115188075855871 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + %2 = load i64, i64* %__asize, align 8 + %mul = mul i64 %2, 64 + br label %cond.end + +cond.false: ; preds = %entry + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i64 [ %mul, %cond.true ], [ 9223372036854775744, %cond.false ] + ret i64 %cond +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local i64 @_ZN9__gnu_cxx14__alloc_traitsISaImEE8max_sizeERKS1_(%"class.std::allocator.1"* dereferenceable(1) %__a) #6 comdat align 2 { +entry: + %__a.addr = alloca %"class.std::allocator.1"*, align 8 + store %"class.std::allocator.1"* %__a, %"class.std::allocator.1"** %__a.addr, align 8 + %0 = load %"class.std::allocator.1"*, %"class.std::allocator.1"** %__a.addr, align 8 + %1 = bitcast %"class.std::allocator.1"* %0 to %"class.__gnu_cxx::new_allocator.2"* + %call = call i64 @_ZNK9__gnu_cxx13new_allocatorImE8max_sizeEv(%"class.__gnu_cxx::new_allocator.2"* %1) #3 + ret i64 %call 
+} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local { i64*, i32 } @_ZSt14__copy_move_a2ILb0ESt13_Bit_iteratorS0_ET1_T0_S2_S1_(i64* %__first.coerce0, i32 %__first.coerce1, i64* %__last.coerce0, i32 %__last.coerce1, i64* %__result.coerce0, i32 %__result.coerce1) #0 comdat { +entry: + %retval = alloca %"struct.std::_Bit_iterator", align 8 + %__first = alloca %"struct.std::_Bit_iterator", align 8 + %__last = alloca %"struct.std::_Bit_iterator", align 8 + %__result = alloca %"struct.std::_Bit_iterator", align 8 + %agg.tmp = alloca %"struct.std::_Bit_iterator", align 8 + %agg.tmp1 = alloca %"struct.std::_Bit_iterator", align 8 + %agg.tmp2 = alloca %"struct.std::_Bit_iterator", align 8 + %agg.tmp3 = alloca %"struct.std::_Bit_iterator", align 8 + %agg.tmp5 = alloca %"struct.std::_Bit_iterator", align 8 + %agg.tmp6 = alloca %"struct.std::_Bit_iterator", align 8 + %0 = bitcast %"struct.std::_Bit_iterator"* %__first to { i64*, i32 }* + %1 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %0, i32 0, i32 0 + store i64* %__first.coerce0, i64** %1, align 8 + %2 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %0, i32 0, i32 1 + store i32 %__first.coerce1, i32* %2, align 8 + %3 = bitcast %"struct.std::_Bit_iterator"* %__last to { i64*, i32 }* + %4 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %3, i32 0, i32 0 + store i64* %__last.coerce0, i64** %4, align 8 + %5 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %3, i32 0, i32 1 + store i32 %__last.coerce1, i32* %5, align 8 + %6 = bitcast %"struct.std::_Bit_iterator"* %__result to { i64*, i32 }* + %7 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %6, i32 0, i32 0 + store i64* %__result.coerce0, i64** %7, align 8 + %8 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %6, i32 0, i32 1 + store i32 %__result.coerce1, i32* %8, align 8 + %9 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp1 to i8* + %10 = bitcast %"struct.std::_Bit_iterator"* %__first to i8* + call void 
@llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %9, i8* align 8 %10, i64 16, i1 false) + %11 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp1 to { i64*, i32 }* + %12 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %11, i32 0, i32 0 + %13 = load i64*, i64** %12, align 8 + %14 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %11, i32 0, i32 1 + %15 = load i32, i32* %14, align 8 + %call = call { i64*, i32 } @_ZSt12__niter_baseISt13_Bit_iteratorET_S1_(i64* %13, i32 %15) + %16 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp to { i64*, i32 }* + %17 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %16, i32 0, i32 0 + %18 = extractvalue { i64*, i32 } %call, 0 + store i64* %18, i64** %17, align 8 + %19 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %16, i32 0, i32 1 + %20 = extractvalue { i64*, i32 } %call, 1 + store i32 %20, i32* %19, align 8 + %21 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp3 to i8* + %22 = bitcast %"struct.std::_Bit_iterator"* %__last to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %21, i8* align 8 %22, i64 16, i1 false) + %23 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp3 to { i64*, i32 }* + %24 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %23, i32 0, i32 0 + %25 = load i64*, i64** %24, align 8 + %26 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %23, i32 0, i32 1 + %27 = load i32, i32* %26, align 8 + %call4 = call { i64*, i32 } @_ZSt12__niter_baseISt13_Bit_iteratorET_S1_(i64* %25, i32 %27) + %28 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp2 to { i64*, i32 }* + %29 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %28, i32 0, i32 0 + %30 = extractvalue { i64*, i32 } %call4, 0 + store i64* %30, i64** %29, align 8 + %31 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %28, i32 0, i32 1 + %32 = extractvalue { i64*, i32 } %call4, 1 + store i32 %32, i32* %31, align 8 + %33 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp6 to i8* + %34 = bitcast %"struct.std::_Bit_iterator"* %__result to 
i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %33, i8* align 8 %34, i64 16, i1 false) + %35 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp6 to { i64*, i32 }* + %36 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %35, i32 0, i32 0 + %37 = load i64*, i64** %36, align 8 + %38 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %35, i32 0, i32 1 + %39 = load i32, i32* %38, align 8 + %call7 = call { i64*, i32 } @_ZSt12__niter_baseISt13_Bit_iteratorET_S1_(i64* %37, i32 %39) + %40 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp5 to { i64*, i32 }* + %41 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %40, i32 0, i32 0 + %42 = extractvalue { i64*, i32 } %call7, 0 + store i64* %42, i64** %41, align 8 + %43 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %40, i32 0, i32 1 + %44 = extractvalue { i64*, i32 } %call7, 1 + store i32 %44, i32* %43, align 8 + %45 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp to { i64*, i32 }* + %46 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %45, i32 0, i32 0 + %47 = load i64*, i64** %46, align 8 + %48 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %45, i32 0, i32 1 + %49 = load i32, i32* %48, align 8 + %50 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp2 to { i64*, i32 }* + %51 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %50, i32 0, i32 0 + %52 = load i64*, i64** %51, align 8 + %53 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %50, i32 0, i32 1 + %54 = load i32, i32* %53, align 8 + %55 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp5 to { i64*, i32 }* + %56 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %55, i32 0, i32 0 + %57 = load i64*, i64** %56, align 8 + %58 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %55, i32 0, i32 1 + %59 = load i32, i32* %58, align 8 + %call8 = call { i64*, i32 } @_ZSt13__copy_move_aILb0ESt13_Bit_iteratorS0_ET1_T0_S2_S1_(i64* %47, i32 %49, i64* %52, i32 %54, i64* %57, i32 %59) + %60 = bitcast %"struct.std::_Bit_iterator"* 
%retval to { i64*, i32 }* + %61 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %60, i32 0, i32 0 + %62 = extractvalue { i64*, i32 } %call8, 0 + store i64* %62, i64** %61, align 8 + %63 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %60, i32 0, i32 1 + %64 = extractvalue { i64*, i32 } %call8, 1 + store i32 %64, i32* %63, align 8 + %65 = bitcast %"struct.std::_Bit_iterator"* %retval to { i64*, i32 }* + %66 = load { i64*, i32 }, { i64*, i32 }* %65, align 8 + ret { i64*, i32 } %66 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local { i64*, i32 } @_ZSt13__copy_move_aILb0ESt13_Bit_iteratorS0_ET1_T0_S2_S1_(i64* %__first.coerce0, i32 %__first.coerce1, i64* %__last.coerce0, i32 %__last.coerce1, i64* %__result.coerce0, i32 %__result.coerce1) #0 comdat { +entry: + %retval = alloca %"struct.std::_Bit_iterator", align 8 + %__first = alloca %"struct.std::_Bit_iterator", align 8 + %__last = alloca %"struct.std::_Bit_iterator", align 8 + %__result = alloca %"struct.std::_Bit_iterator", align 8 + %__simple = alloca i8, align 1 + %agg.tmp = alloca %"struct.std::_Bit_iterator", align 8 + %agg.tmp1 = alloca %"struct.std::_Bit_iterator", align 8 + %agg.tmp2 = alloca %"struct.std::_Bit_iterator", align 8 + %0 = bitcast %"struct.std::_Bit_iterator"* %__first to { i64*, i32 }* + %1 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %0, i32 0, i32 0 + store i64* %__first.coerce0, i64** %1, align 8 + %2 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %0, i32 0, i32 1 + store i32 %__first.coerce1, i32* %2, align 8 + %3 = bitcast %"struct.std::_Bit_iterator"* %__last to { i64*, i32 }* + %4 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %3, i32 0, i32 0 + store i64* %__last.coerce0, i64** %4, align 8 + %5 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %3, i32 0, i32 1 + store i32 %__last.coerce1, i32* %5, align 8 + %6 = bitcast %"struct.std::_Bit_iterator"* %__result to { i64*, i32 }* + %7 = getelementptr inbounds { i64*, 
i32 }, { i64*, i32 }* %6, i32 0, i32 0 + store i64* %__result.coerce0, i64** %7, align 8 + %8 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %6, i32 0, i32 1 + store i32 %__result.coerce1, i32* %8, align 8 + store i8 0, i8* %__simple, align 1 + %9 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp to i8* + %10 = bitcast %"struct.std::_Bit_iterator"* %__first to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %9, i8* align 8 %10, i64 16, i1 false) + %11 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp1 to i8* + %12 = bitcast %"struct.std::_Bit_iterator"* %__last to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %11, i8* align 8 %12, i64 16, i1 false) + %13 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp2 to i8* + %14 = bitcast %"struct.std::_Bit_iterator"* %__result to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %13, i8* align 8 %14, i64 16, i1 false) + %15 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp to { i64*, i32 }* + %16 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %15, i32 0, i32 0 + %17 = load i64*, i64** %16, align 8 + %18 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %15, i32 0, i32 1 + %19 = load i32, i32* %18, align 8 + %20 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp1 to { i64*, i32 }* + %21 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %20, i32 0, i32 0 + %22 = load i64*, i64** %21, align 8 + %23 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %20, i32 0, i32 1 + %24 = load i32, i32* %23, align 8 + %25 = bitcast %"struct.std::_Bit_iterator"* %agg.tmp2 to { i64*, i32 }* + %26 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %25, i32 0, i32 0 + %27 = load i64*, i64** %26, align 8 + %28 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %25, i32 0, i32 1 + %29 = load i32, i32* %28, align 8 + %call = call { i64*, i32 } @_ZNSt11__copy_moveILb0ELb0ESt26random_access_iterator_tagE8__copy_mISt13_Bit_iteratorS3_EET0_T_S5_S4_(i64* %17, i32 %19, i64* %22, i32 %24, i64* %27, i32 
%29) + %30 = bitcast %"struct.std::_Bit_iterator"* %retval to { i64*, i32 }* + %31 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %30, i32 0, i32 0 + %32 = extractvalue { i64*, i32 } %call, 0 + store i64* %32, i64** %31, align 8 + %33 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %30, i32 0, i32 1 + %34 = extractvalue { i64*, i32 } %call, 1 + store i32 %34, i32* %33, align 8 + %35 = bitcast %"struct.std::_Bit_iterator"* %retval to { i64*, i32 }* + %36 = load { i64*, i32 }, { i64*, i32 }* %35, align 8 + ret { i64*, i32 } %36 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local { i64*, i32 } @_ZNSt11__copy_moveILb0ELb0ESt26random_access_iterator_tagE8__copy_mISt13_Bit_iteratorS3_EET0_T_S5_S4_(i64* %__first.coerce0, i32 %__first.coerce1, i64* %__last.coerce0, i32 %__last.coerce1, i64* %__result.coerce0, i32 %__result.coerce1) #0 comdat align 2 { +entry: + %retval = alloca %"struct.std::_Bit_iterator", align 8 + %__first = alloca %"struct.std::_Bit_iterator", align 8 + %__last = alloca %"struct.std::_Bit_iterator", align 8 + %__result = alloca %"struct.std::_Bit_iterator", align 8 + %__n = alloca i64, align 8 + %ref.tmp = alloca %"struct.std::_Bit_reference", align 8 + %ref.tmp2 = alloca %"struct.std::_Bit_reference", align 8 + %0 = bitcast %"struct.std::_Bit_iterator"* %__first to { i64*, i32 }* + %1 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %0, i32 0, i32 0 + store i64* %__first.coerce0, i64** %1, align 8 + %2 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %0, i32 0, i32 1 + store i32 %__first.coerce1, i32* %2, align 8 + %3 = bitcast %"struct.std::_Bit_iterator"* %__last to { i64*, i32 }* + %4 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %3, i32 0, i32 0 + store i64* %__last.coerce0, i64** %4, align 8 + %5 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %3, i32 0, i32 1 + store i32 %__last.coerce1, i32* %5, align 8 + %6 = bitcast %"struct.std::_Bit_iterator"* %__result to { i64*, i32 }* 
+ %7 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %6, i32 0, i32 0 + store i64* %__result.coerce0, i64** %7, align 8 + %8 = getelementptr inbounds { i64*, i32 }, { i64*, i32 }* %6, i32 0, i32 1 + store i32 %__result.coerce1, i32* %8, align 8 + %9 = bitcast %"struct.std::_Bit_iterator"* %__last to %"struct.std::_Bit_iterator_base"* + %10 = bitcast %"struct.std::_Bit_iterator"* %__first to %"struct.std::_Bit_iterator_base"* + %call = call i64 @_ZStmiRKSt18_Bit_iterator_baseS1_(%"struct.std::_Bit_iterator_base"* dereferenceable(16) %9, %"struct.std::_Bit_iterator_base"* dereferenceable(16) %10) + store i64 %call, i64* %__n, align 8 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %11 = load i64, i64* %__n, align 8 + %cmp = icmp sgt i64 %11, 0 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %call1 = call { i64*, i64 } @_ZNKSt13_Bit_iteratordeEv(%"struct.std::_Bit_iterator"* %__first) + %12 = bitcast %"struct.std::_Bit_reference"* %ref.tmp to { i64*, i64 }* + %13 = getelementptr inbounds { i64*, i64 }, { i64*, i64 }* %12, i32 0, i32 0 + %14 = extractvalue { i64*, i64 } %call1, 0 + store i64* %14, i64** %13, align 8 + %15 = getelementptr inbounds { i64*, i64 }, { i64*, i64 }* %12, i32 0, i32 1 + %16 = extractvalue { i64*, i64 } %call1, 1 + store i64 %16, i64* %15, align 8 + %call3 = call { i64*, i64 } @_ZNKSt13_Bit_iteratordeEv(%"struct.std::_Bit_iterator"* %__result) + %17 = bitcast %"struct.std::_Bit_reference"* %ref.tmp2 to { i64*, i64 }* + %18 = getelementptr inbounds { i64*, i64 }, { i64*, i64 }* %17, i32 0, i32 0 + %19 = extractvalue { i64*, i64 } %call3, 0 + store i64* %19, i64** %18, align 8 + %20 = getelementptr inbounds { i64*, i64 }, { i64*, i64 }* %17, i32 0, i32 1 + %21 = extractvalue { i64*, i64 } %call3, 1 + store i64 %21, i64* %20, align 8 + %call4 = call dereferenceable(16) %"struct.std::_Bit_reference"* @_ZNSt14_Bit_referenceaSERKS_(%"struct.std::_Bit_reference"* %ref.tmp2, 
%"struct.std::_Bit_reference"* dereferenceable(16) %ref.tmp) + %call5 = call dereferenceable(16) %"struct.std::_Bit_iterator"* @_ZNSt13_Bit_iteratorppEv(%"struct.std::_Bit_iterator"* %__first) + %call6 = call dereferenceable(16) %"struct.std::_Bit_iterator"* @_ZNSt13_Bit_iteratorppEv(%"struct.std::_Bit_iterator"* %__result) + br label %for.inc + +for.inc: ; preds = %for.body + %22 = load i64, i64* %__n, align 8 + %dec = add nsw i64 %22, -1 + store i64 %dec, i64* %__n, align 8 + br label %for.cond + +for.end: ; preds = %for.cond + %23 = bitcast %"struct.std::_Bit_iterator"* %retval to i8* + %24 = bitcast %"struct.std::_Bit_iterator"* %__result to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %23, i8* align 8 %24, i64 16, i1 false) + %25 = bitcast %"struct.std::_Bit_iterator"* %retval to { i64*, i32 }* + %26 = load { i64*, i32 }, { i64*, i32 }* %25, align 8 + ret { i64*, i32 } %26 +} + +; Function Attrs: noinline uwtable +define internal void @_GLOBAL__sub_I_main_test_cu.cu() #2 section ".text.startup" { +entry: + call void @__cxx_global_var_init() + ret void +} + +define internal void @__cuda_register_globals(i8** %0) { +entry: + %1 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void (i8*, i64, i32*)* @_Z12histo_kernelPhlPj to i8*), i8* getelementptr inbounds ([22 x i8], [22 x i8]* @0, i64 0, i64 0), i8* getelementptr inbounds ([22 x i8], [22 x i8]* @0, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null) + %2 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void (i32*, i32*, i32*, i32*, i32*, i32*, i32*, i32*)* @_ZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_ to i8*), i8* getelementptr inbounds ([50 x i8], [50 x i8]* @1, i64 0, i64 0), i8* getelementptr inbounds ([50 x i8], [50 x i8]* @1, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null) + %3 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void (i32*, i32*, i32*, i32, i32, i32)* @_ZL7prescanILb1ELb0EEvPjPKjS0_iii to i8*), i8* 
getelementptr inbounds ([34 x i8], [34 x i8]* @2, i64 0, i64 0), i8* getelementptr inbounds ([34 x i8], [34 x i8]* @2, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null) + %4 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void (i32*, i32*, i32*, i32, i32, i32)* @_ZL7prescanILb1ELb1EEvPjPKjS0_iii to i8*), i8* getelementptr inbounds ([34 x i8], [34 x i8]* @3, i64 0, i64 0), i8* getelementptr inbounds ([34 x i8], [34 x i8]* @3, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null) + %5 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void (i32*, i32*, i32, i32, i32)* @_ZL10uniformAddPjS_iii to i8*), i8* getelementptr inbounds ([23 x i8], [23 x i8]* @4, i64 0, i64 0), i8* getelementptr inbounds ([23 x i8], [23 x i8]* @4, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null) + %6 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void (i32*, i32*, i32*, i32, i32, i32)* @_ZL7prescanILb0ELb0EEvPjPKjS0_iii to i8*), i8* getelementptr inbounds ([34 x i8], [34 x i8]* @5, i64 0, i64 0), i8* getelementptr inbounds ([34 x i8], [34 x i8]* @5, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null) + %7 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void (i32*, i32*, i32*, i32, i32, i32)* @_ZL7prescanILb0ELb1EEvPjPKjS0_iii to i8*), i8* getelementptr inbounds ([34 x i8], [34 x i8]* @6, i64 0, i64 0), i8* getelementptr inbounds ([34 x i8], [34 x i8]* @6, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null) + %8 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void (i32*, i32*, i32*, i32*, i32)* @_ZL5pack2PjS_S_S_j to i8*), i8* getelementptr inbounds ([19 x i8], [19 x i8]* @7, i64 0, i64 0), i8* getelementptr inbounds ([19 x i8], [19 x i8]* @7, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null) + ret void +} + +declare dso_local i32 @__cudaRegisterFunction(i8**, i8*, i8*, i8*, i32, i8*, i8*, i8*, i8*, i32*) + +declare 
dso_local i32 @__cudaRegisterVar(i8**, i8*, i8*, i8*, i32, i32, i32, i32) + +declare dso_local i8** @__cudaRegisterFatBinary(i8*) + +define internal void @__cuda_module_ctor(i8* %0) { +entry: + %1 = call i8** @__cudaRegisterFatBinary(i8* bitcast ({ i32, i32, i8*, i8* }* @__cuda_fatbin_wrapper to i8*)) + store i8** %1, i8*** @__cuda_gpubin_handle, align 8 + call void @__cuda_register_globals(i8** %1) + call void @__cudaRegisterFatBinaryEnd(i8** %1) + %2 = call i32 @atexit(void (i8*)* @__cuda_module_dtor) + ret void +} + +declare dso_local void @__cudaRegisterFatBinaryEnd(i8**) + +declare dso_local void @__cudaUnregisterFatBinary(i8**) + +define internal void @__cuda_module_dtor(i8* %0) { +entry: + %1 = load i8**, i8*** @__cuda_gpubin_handle, align 8 + call void @__cudaUnregisterFatBinary(i8** %1) + ret void +} + +declare dso_local i32 @atexit(void (i8*)*) + +attributes #0 = { noinline optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { noinline uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" 
"no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nounwind } +attributes #4 = { argmemonly nounwind willreturn } +attributes #5 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #6 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #7 = { noinline noreturn nounwind } +attributes #8 = { nobuiltin "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #9 = { nobuiltin nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" 
"no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #10 = { nounwind readonly } +attributes #11 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #12 = { noinline norecurse optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #13 = { nounwind readnone speculatable willreturn } +attributes #14 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #15 = { noreturn "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" 
"stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #16 = { noreturn nounwind } +attributes #17 = { builtin } +attributes #18 = { builtin nounwind } +attributes #19 = { noreturn } + +!llvm.module.flags = !{!0, !1} +!llvm.ident = !{!2} + +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"} diff --git a/examples/huffman/main_test_cu.cu b/examples/huffman/main_test_cu.cu new file mode 100755 index 0000000..229250a --- /dev/null +++ b/examples/huffman/main_test_cu.cu @@ -0,0 +1,225 @@ +/* + * PAVLE - Parallel Variable-Length Encoder for CUDA. Main file. + * + * Copyright (C) 2009 Ana Balevic + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the MIT License. Read the full licence: + * http://www.opensource.org/licenses/mit-license.php + * + * If you find this program useful, please contact me and reference PAVLE home + * page in your work. 
+ * + */ + +#include "comparison_helpers.h" +#include "cuda_helpers.h" +#include "load_data.h" +#include "print_helpers.h" +#include "stats_logger.h" +#include "stdafx.h" +#include +#include + +//#include "vlc_kernel_gm32.cu" +//#include "vlc_kernel_sm32.cu" +#include "vlc_kernel_sm64huff.cu" +//#include "vlc_kernel_dpt.cu" +//#include "vlc_kernel_dptt.cu" +//#include "scan_kernel.cu" +#include "cpuencode.h" +#include "pack_kernels.cu" +#include "scan.cu" + +long long get_time() { + struct timeval tv; + gettimeofday(&tv, NULL); + return (tv.tv_sec * 1000000) + tv.tv_usec; +} +void runVLCTest(char *file_name, uint num_block_threads, uint num_blocks = 1); + +extern "C" void cpu_vlc_encode(unsigned int *indata, unsigned int num_elements, + unsigned int *outdata, unsigned int *outsize, + unsigned int *codewords, + unsigned int *codewordlens); + +int main(int argc, char *argv[]) { + if (!InitCUDA()) { + return 0; + } + unsigned int num_block_threads = 256; + if (argc > 1) + for (int i = 1; i < argc; i++) + runVLCTest(argv[i], num_block_threads); + else { + runVLCTest(NULL, num_block_threads, 1024); + } + return 0; +} + +void runVLCTest(char *file_name, uint num_block_threads, uint num_blocks) { + printf("CUDA! 
Starting VLC Tests!\n"); + unsigned int + num_elements; // uint num_elements = num_blocks * num_block_threads; + unsigned int mem_size; // uint mem_size = num_elements * sizeof(int); + unsigned int symbol_type_size = sizeof(int); + //////// LOAD DATA /////////////// + double H; // entropy + initParams(file_name, num_block_threads, num_blocks, num_elements, mem_size, + symbol_type_size); + printf("Parameters: num_elements: %d, num_blocks: %d, num_block_threads: " + "%d\n----------------------------\n", + num_elements, num_blocks, num_block_threads); + ////////LOAD DATA /////////////// + uint *sourceData = (uint *)malloc(mem_size); + uint *destData = (uint *)malloc(mem_size); + uint *crefData = (uint *)malloc(mem_size); + + uint *codewords = (uint *)malloc(NUM_SYMBOLS * symbol_type_size); + uint *codewordlens = (uint *)malloc(NUM_SYMBOLS * symbol_type_size); + + uint *cw32 = (uint *)malloc(mem_size); + uint *cw32len = (uint *)malloc(mem_size); + uint *cw32idx = (uint *)malloc(mem_size); + + uint *cindex2 = (uint *)malloc(num_blocks * sizeof(int)); + + memset(sourceData, 0, mem_size); + memset(destData, 0, mem_size); + memset(crefData, 0, mem_size); + memset(cw32, 0, mem_size); + memset(cw32len, 0, mem_size); + memset(cw32idx, 0, mem_size); + memset(codewords, 0, NUM_SYMBOLS * symbol_type_size); + memset(codewordlens, 0, NUM_SYMBOLS * symbol_type_size); + memset(cindex2, 0, num_blocks * sizeof(int)); + //////// LOAD DATA /////////////// + loadData(file_name, sourceData, codewords, codewordlens, num_elements, + mem_size, H); + + //////// LOAD DATA /////////////// + + unsigned int *d_sourceData, *d_destData, *d_destDataPacked; + unsigned int *d_codewords, *d_codewordlens; + unsigned int *d_cw32, *d_cw32len, *d_cw32idx, *d_cindex, *d_cindex2; + + CUDA_SAFE_CALL(cudaMalloc((void **)&d_sourceData, mem_size)); + CUDA_SAFE_CALL(cudaMalloc((void **)&d_destData, mem_size)); + CUDA_SAFE_CALL(cudaMalloc((void **)&d_destDataPacked, mem_size)); + + CUDA_SAFE_CALL( + 
cudaMalloc((void **)&d_codewords, NUM_SYMBOLS * symbol_type_size)); + CUDA_SAFE_CALL( + cudaMalloc((void **)&d_codewordlens, NUM_SYMBOLS * symbol_type_size)); + + CUDA_SAFE_CALL(cudaMalloc((void **)&d_cw32, mem_size)); + CUDA_SAFE_CALL(cudaMalloc((void **)&d_cw32len, mem_size)); + CUDA_SAFE_CALL(cudaMalloc((void **)&d_cw32idx, mem_size)); + + CUDA_SAFE_CALL( + cudaMalloc((void **)&d_cindex, num_blocks * sizeof(unsigned int))); + CUDA_SAFE_CALL( + cudaMalloc((void **)&d_cindex2, num_blocks * sizeof(unsigned int))); + // printf("source data\n"); + // for (int i = 0; i < 200; i++) { + // printf("%d ", sourceData[i]); + // } + // printf("\n"); + // printf("codewords\n"); + // for (int i = 0; i < 200; i++) { + // printf("%d ", codewords[i]); + // } + // printf("\n"); + // printf("codeword lens\n"); + // for (int i = 0; i < 200; i++) { + // printf("%d ", codewordlens[i]); + // } + // printf("\n"); + // return; + CUDA_SAFE_CALL( + cudaMemcpy(d_sourceData, sourceData, mem_size, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpy(d_codewords, codewords, + NUM_SYMBOLS * symbol_type_size, + cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpy(d_codewordlens, codewordlens, + NUM_SYMBOLS * symbol_type_size, + cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL( + cudaMemcpy(d_destData, destData, mem_size, cudaMemcpyHostToDevice)); + + dim3 grid_size(num_blocks, 1, 1); + dim3 block_size(num_block_threads, 1, 1); + unsigned int sm_size; + + unsigned int NT = 10; // number of runs for each execution time + + //////////////////* CPU ENCODER */////////////////////////////////// + unsigned int refbytesize; + long long timer = get_time(); + cpu_vlc_encode((unsigned int *)sourceData, num_elements, + (unsigned int *)crefData, &refbytesize, codewords, + codewordlens); + float msec = (float)((get_time() - timer) / 1000.0); + printf("CPU Encoding time (CPU): %f (ms)\n", msec); + printf("CPU Encoded to %d [B]\n", refbytesize); + unsigned int num_ints = refbytesize / 4 + ((refbytesize % 4 == 0) ? 
0 : 1); + //////////////////* END CPU */////////////////////////////////// + + //////////////////* SM64HUFF KERNEL */////////////////////////////////// + grid_size.x = num_blocks; + block_size.x = num_block_threads; + sm_size = block_size.x * sizeof(unsigned int); +#ifdef CACHECWLUT + sm_size = 2 * NUM_SYMBOLS * sizeof(int) + block_size.x * sizeof(unsigned int); +#endif + + for (int i = 0; i < NT; i++) { + vlc_encode_kernel_sm64huff<<>>( + d_sourceData, d_codewords, d_codewordlens, +#ifdef TESTING + d_cw32, d_cw32len, d_cw32idx, +#endif + d_destData, d_cindex); // testedOK2 + cudaThreadSynchronize(); + } + // //////////////////* END KERNEL */////////////////////////////////// + +#ifdef TESTING + unsigned int num_scan_elements = grid_size.x; + preallocBlockSums(num_scan_elements); + cudaMemset(d_destDataPacked, 0, mem_size); + printf("Num_blocks to be passed to scan is %d.\n", num_scan_elements); + prescanArray(d_cindex2, d_cindex, num_scan_elements); + pack2<<>>( + (unsigned int *)d_destData, d_cindex, d_cindex2, + (unsigned int *)d_destDataPacked, num_elements / num_scan_elements); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Pack2 Kernel execution failed\n"); + deallocBlockSums(); + // return; + + CUDA_SAFE_CALL( + cudaMemcpy(destData, d_destDataPacked, mem_size, cudaMemcpyDeviceToHost)); + compare_vectors((unsigned int *)crefData, (unsigned int *)destData, num_ints); +#endif + + free(sourceData); + free(destData); + free(codewords); + free(codewordlens); + free(cw32); + free(cw32len); + free(crefData); + CUDA_SAFE_CALL(cudaFree(d_sourceData)); + CUDA_SAFE_CALL(cudaFree(d_destData)); + CUDA_SAFE_CALL(cudaFree(d_destDataPacked)); + CUDA_SAFE_CALL(cudaFree(d_codewords)); + CUDA_SAFE_CALL(cudaFree(d_codewordlens)); + CUDA_SAFE_CALL(cudaFree(d_cw32)); + CUDA_SAFE_CALL(cudaFree(d_cw32len)); + CUDA_SAFE_CALL(cudaFree(d_cw32idx)); + CUDA_SAFE_CALL(cudaFree(d_cindex)); + CUDA_SAFE_CALL(cudaFree(d_cindex2)); + free(cindex2); +} diff --git 
a/examples/huffman/pabio_kernels_v2.cu b/examples/huffman/pabio_kernels_v2.cu new file mode 100644 index 0000000..3474cb2 --- /dev/null +++ b/examples/huffman/pabio_kernels_v2.cu @@ -0,0 +1,62 @@ +/* + * Copyright Ana Balevic, 2008-2009. All rights reserved. + */ +#ifndef _PABIO_KERNEL2_H_ +#define _PABIO_KERNEL2_H_ + +#include "parameters.h" + +/* PARALLEL PUT BITS IMPLEMENTATION (CUDA1.1+ compatible) +* Set numbits in the destination word out[kc] starting from the position startbit +* Implementation comments: +* Second atomic operation actually sets these bits to the value stored in the codeword; the other bits are left unotuched +* First atomic operation is a necessary prepration - we change only the bits that will be affected by the codeword to be written to 1s +* in order for set bits to work with using atomicand. +* TODOs: benchmark performance 1) gm atomics vs sm atomics; 2) memset at init time vs. atomicOr +*/ +__device__ void static put_bits_atomic2(unsigned int* out, unsigned int kc, + unsigned int startbit, unsigned int numbits, + unsigned int codeword) { + unsigned int cw32 = codeword; + unsigned int restbits = 32-startbit-numbits; + + /* 1. Prepare the memory location */ +#ifndef MEMSET0 //Can remove this part if the contents of the memory are already set to all 0s + unsigned int mask = ((1< 0000...001111 + mask<<=restbits; //fill in zeros at the back positions -> 0000...001111000 -> 11111110000111111111111 (in order not to and other positions) + atomicAnd(&out[kc], ~mask); //set 0s in the destination from startbit in the len of numbits +#endif + + /* 2. Write the codeword */ + cw32 = cw32< 0000...001111 + mask<<=restbits; //fill in zeros at the back positions -> 0000...001111000 -> 11111110000111111111111 (in order not to and other positions) + atomicAnd(&out[kc], ~mask); //set 0s in the destination from startbit in the len of numbits +#endif + + /* 2. 
Write the codeword */ + if (startbit == 0 && restbits == 0) { + out[kc] = cw32; + } else { + cw32 = cw32<> bit; // cut off those bits that do not fit into the initial + // location in destData[] + atomicOr(&dstData[dword], tmp); // fill up this initial location + tmp = (bit == 0) ? 0 : (dw << 32 - bit); + for (i = 1; i < bitsize / 32; + i++) { // from now on, we have exclusive access to destData[] + dw = srcData[offset + i]; // load next dword from srcData[] + tmp |= dw >> bit; // fill up tmp + dstData[dword + i] = tmp; // write complete dword to destData[] + tmp = (bit == 0) ? 0 : (dw << 32 - bit); + } + // exclusive access to dstData[] ends here + // the remaining block can, or rather should be further optimized + // write the remaining bits in tmp, UNLESS bit is 0 and bitsize is divisible + // by 32, in this case do nothing + if (bit != 0 || bitsize % 32 != 0) + atomicOr(&dstData[dword + i], tmp); + if (bitsize % 32 != 0) { + dw = srcData[offset + i]; + atomicOr(&dstData[dword + i], dw >> bit); + atomicOr(&dstData[dword + i + 1], (bit == 0) ? 0 : (dw << 32 - bit)); + } +} + +#endif diff --git a/examples/huffman/parameters.h b/examples/huffman/parameters.h new file mode 100644 index 0000000..d008df4 --- /dev/null +++ b/examples/huffman/parameters.h @@ -0,0 +1,27 @@ +#ifndef _PARAMS_H_ +#define _PARAMS_H_ + +typedef unsigned int uint; +typedef unsigned char uint8; + +#define BENCH 0 +/* 0 - MEASURE TIME, NO TESTING +** 1 - TEST +** 2 - TEST & VERBOSE +*/ +#define TESTING + +#define DPT 4 // data (dwords) per thread + +#define CACHECWLUT // MAX DPT = 8 +//#define CACHESRCDATA // MAX DPT = 4 + +#define SMATOMICS + +#define MEMSET0 + +#define MAX_SM_BLOCK_SIZE_GPU 16384 // B + +#define NUM_SYMBOLS 256 // fixed to 256. 
+ +#endif diff --git a/examples/huffman/print_helpers.h b/examples/huffman/print_helpers.h new file mode 100644 index 0000000..e84e990 --- /dev/null +++ b/examples/huffman/print_helpers.h @@ -0,0 +1,217 @@ +#ifndef _PRINT_HELPERS_H_ +#define _PRINT_HELPERS_H_ + +#include "parameters.h" +#include + +__inline void printdbg_data_bin(const char *filename, unsigned int *data, + unsigned int num_ints) { + FILE *dump = fopen((const char *)filename, "wt"); + for (unsigned int i = 0; i < num_ints; i++) { + unsigned int mask = 0x80000000; + for (unsigned int j = 0; j < 32; j++) { + if (data[i] & mask) + fprintf(dump, "1"); // printf("1"); + else + fprintf(dump, "0"); // printf("0"); + mask = mask >> 1; + } + fprintf(dump, "\n"); + } + fclose(dump); +} +__inline void printdbg_data_int(const char *filename, unsigned int *data, + unsigned int num_ints) { + FILE *dump = fopen((const char *)filename, "wt"); + for (unsigned int i = 0; i < num_ints; i++) { + fprintf(dump, "%d: %d\n", i, data[i]); + } + fclose(dump); +} + +__inline void printdbg_gpu_data_detailed(FILE *gpudump, unsigned int *cw32, + unsigned int *cw32len, + unsigned int *cw32idx, + unsigned int num_elements) { + for (unsigned int i = 0; i < num_elements; i++) { + fprintf(gpudump, "bp: %d, kc: %d, startbit: %d, cwlen: %d, cw:\t\t", + cw32idx[i], cw32idx[i] / 32, cw32idx[i] % 32, cw32len[i]); + // print codeword: + unsigned int mask = 0x80000000; + mask = mask >> (32 - cw32len[i]); + for (unsigned int j = 0; j < cw32len[i]; j++) { + if (cw32[i] & mask) + fprintf(gpudump, "1"); // printf("1"); + else + fprintf(gpudump, "0"); // printf("0"); + mask = mask >> 1; + } + fprintf(gpudump, "\n"); + } +} + +__inline void printdbg_gpu_data_detailed2(const char *filename, + unsigned int *cw32, + unsigned int *cw32len, + unsigned int *cw32idx, + unsigned int num_elements) { + FILE *gpudump = fopen((const char *)filename, "wt"); + for (unsigned int i = 0; i < num_elements; i++) { + fprintf(gpudump, "bp: %d, kc: %d, startbit: %d, 
cwlen: %d, cw:\t\t", + cw32idx[i], cw32idx[i] / 32, cw32idx[i] % 32, cw32len[i]); + // print codeword: + unsigned int mask = 0x80000000; + mask = mask >> (32 - cw32len[i]); + for (unsigned int j = 0; j < cw32len[i]; j++) { + if (cw32[i] & mask) + fprintf(gpudump, "1"); // printf("1"); + else + fprintf(gpudump, "0"); // printf("0"); + mask = mask >> 1; + } + fprintf(gpudump, "\n"); + } + fclose(gpudump); +} + +/************************************************************************/ +/* BIT PRINTS */ +/************************************************************************/ +__inline void printBits(unsigned char number) { + unsigned char mask = 0x80; + for (unsigned int j = 0; j < 8; j++) { + if (number & mask) + printf("1"); + else + printf("0"); + mask = mask >> 1; + } + printf(" "); +} +__inline void print32Bits(unsigned int number) { + unsigned int mask = 0x80000000; + for (unsigned int j = 0; j < 32; j++) { + if (number & mask) + printf("1"); + else + printf("0"); + mask = mask >> 1; + } + printf("\n"); +} +__inline void print32BitsM(unsigned int marker) { + for (unsigned int j = 0; j < 32; j++) { + if (marker == (j + 1)) + printf("|"); + else + printf("."); + } + printf("\n"); +} +__inline void print_array_char_as_bits(unsigned char *a, unsigned int len) { + + printf( + " ========================= Printing vector =======================\n"); + printf("Total number of elements is %d\n", len); + for (unsigned int i = 0; i < len; i++) { + printf("a[%d]=%d \t", i, a[i]); + printBits(a[i]); + printf("\n"); + } + printf("\n"); + printf( + " ==================================================================\n"); +} + +__inline void print_array_ints_as_bits(unsigned int *a, unsigned int len) { + + printf( + " ========================= Printing vector =======================\n"); + for (unsigned int i = 0; i < len; i++) { + print32Bits(a[i]); + printf("\n"); + } + printf("\n"); + printf( + " ==================================================================\n"); +} + 
+__inline void print_compare_array_ints_as_bits(unsigned int *a, unsigned int *b, + unsigned int len) { + + printf( + " ========================= Printing vector =======================\n"); + for (unsigned int i = 0; i < len; i++) { + print32Bits(a[i]); + print32Bits(b[i]); + printf("\n"); + } + printf("\n"); + printf( + " ==================================================================\n"); +} + +__inline void print_array_in_hex(unsigned int *a, unsigned int len) { + + printf( + " ========================= Printing vector =======================\n"); + // printf("Total number of elements is %d\n", len); + for (unsigned int i = 0; i < len; i++) { + printf("%#X\t", a[i]); + } + + printf("\n"); + printf( + " ==================================================================\n"); +} + +/************************************************************************/ +/* ARRAY PRINTS */ +/***********************************************************************/ + +template __inline void print_array(T *a, unsigned int len) { + + printf( + " ========================= Printing vector =======================\n"); + printf("Total number of elements is %d\n", len); + for (unsigned int i = 0; i < len; i++) { + printf("a[%d]=%d \t", i, a[i]); + } + + printf("\n"); + printf( + " ==================================================================\n"); +} + +template +__inline void print_rled_arrays(ST *rle_symbols, CT *rle_counts, + unsigned int rle_len) { + ST current_symbol; + CT current_count; + printf(" ========================= Printing RLE vector " + "=======================\n"); + printf(" Total number of RL Pairs is %d\n", rle_len); + for (unsigned int k = 0; k < rle_len; k++) { + current_symbol = rle_symbols[k]; + current_count = rle_counts[k]; + printf("(%d,%d) ,\t", current_symbol, current_count); + } + printf("\n"); +} + +__inline void print_packed_rle_array(unsigned int *rle, unsigned int rle_len) { + unsigned short current_symbol; + unsigned short current_count; + 
printf(" ========================= Printing RLE vector " + "=======================\n"); + printf(" Total number of RL Pairs is %d\n", rle_len); + for (unsigned int k = 0; k < rle_len; k++) { + current_symbol = (unsigned short)(rle[k] >> 16); // get the higher half-word + current_count = + (unsigned short)rle[k] & 0x0000FFFFF; // get the shorter half-word + printf("(%d,%d) ,\t", current_symbol, current_count); + } + printf("\n"); +} + +#endif // _PRINT_HELPERS_H_ diff --git a/examples/huffman/run.sh b/examples/huffman/run.sh new file mode 100644 index 0000000..97c57ec --- /dev/null +++ b/examples/huffman/run.sh @@ -0,0 +1,20 @@ +#!/bin/bash +set -e +# clang++ main_test_cu.cu --cuda-path=/usr/local/cuda-10.1 --cuda-gpu-arch=sm_61 -L/usr/local/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v +clang -c -emit-llvm cpuencode.cpp +llvm-as main_test_cu-cuda-nvptx64-nvidia-cuda-sm_61.ll +llvm-as main_test_cu-host-x86_64-unknown-linux-gnu.ll + +../../build/compilation/kernelTranslator main_test_cu-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc +../../build/compilation/hostTranslator main_test_cu-host-x86_64-unknown-linux-gnu.bc host.bc + +llc --relocation-model=pic --filetype=obj kernel.bc +llc --relocation-model=pic --filetype=obj host.bc +llc --relocation-model=pic --filetype=obj cpuencode.bc + +g++ -Wall -L../../build/runtime \ + -L../../build/runtime/threadPool -o pavle \ + -fPIC -no-pie host.o kernel.o cpuencode.o -lc -lx86Runtime -lthreadPool -lpthread + +export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH +./pavle ../../rodinia-data/huffman/test1024_H2.206587175259.in diff --git a/examples/huffman/scan.cu b/examples/huffman/scan.cu new file mode 100755 index 0000000..2dd0ddf --- /dev/null +++ b/examples/huffman/scan.cu @@ -0,0 +1,216 @@ +/* + * Copyright 1993-2006 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO USER: + * + * This source code is subject to NVIDIA ownership rights under U.S. 
and + * international Copyright laws. + * + * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE + * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR + * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE + * OR PERFORMANCE OF THIS SOURCE CODE. + * + * U.S. Government End Users. This source code is a "commercial item" as + * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of + * "commercial computer software" and "commercial computer software + * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) + * and is provided to the U.S. Government only as a commercial end item. + * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through + * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the + * source code with only those rights set forth herein. + */ + +#ifndef _PRESCAN_CU_ +#define _PRESCAN_CU_ + +// includes, kernels +#include "cutil.h" +#include "scanLargeArray_kernel.cu" +#include +#include + +#define max(a, b) (a > b ? 
a : b) +inline bool isPowerOfTwo(int n) { return ((n & (n - 1)) == 0); } + +inline int floorPow2(int n) { +#ifdef WIN32 + // method 2 + return 1 << (int)logb((float)n); +#else + // method 1 + // float nf = (float)n; + // return 1 << (((*(int*)&nf) >> 23) - 127); + int exp; + frexp((float)n, &exp); + return 1 << (exp - 1); +#endif +} + +#define BLOCK_SIZE 256 + +static unsigned int **g_scanBlockSums; +static unsigned int g_numEltsAllocated = 0; +static unsigned int g_numLevelsAllocated = 0; + +static void preallocBlockSums(unsigned int maxNumElements) { + assert(g_numEltsAllocated == 0); // shouldn't be called + + g_numEltsAllocated = maxNumElements; + + unsigned int blockSize = BLOCK_SIZE; // max size of the thread blocks + unsigned int numElts = maxNumElements; + int level = 0; + + do { + unsigned int numBlocks = + max(1, (int)ceil((float)numElts / (2.f * blockSize))); + if (numBlocks > 1) + level++; + numElts = numBlocks; + } while (numElts > 1); + + g_scanBlockSums = (unsigned int **)malloc(level * sizeof(unsigned int *)); + g_numLevelsAllocated = level; + numElts = maxNumElements; + level = 0; + + do { + unsigned int numBlocks = + max(1, (int)ceil((float)numElts / (2.f * blockSize))); + if (numBlocks > 1) + CUDA_SAFE_CALL(cudaMalloc((void **)&g_scanBlockSums[level++], + numBlocks * sizeof(unsigned int))); + numElts = numBlocks; + } while (numElts > 1); + + CUT_CHECK_ERROR("preallocBlockSums"); +} + +static void deallocBlockSums() { + for (unsigned int i = 0; i < g_numLevelsAllocated; i++) { + cudaFree(g_scanBlockSums[i]); + } + + CUT_CHECK_ERROR("deallocBlockSums"); + + free((void **)g_scanBlockSums); + + g_scanBlockSums = 0; + g_numEltsAllocated = 0; + g_numLevelsAllocated = 0; +} + +static void prescanArrayRecursive(unsigned int *outArray, + const unsigned int *inArray, int numElements, + int level) { + unsigned int blockSize = BLOCK_SIZE; // max size of the thread blocks + unsigned int numBlocks = + max(1, (int)ceil((float)numElements / (2.f * blockSize))); 
+ unsigned int numThreads; + + if (numBlocks > 1) + numThreads = blockSize; + else if (isPowerOfTwo(numElements)) + numThreads = numElements / 2; + else + numThreads = floorPow2(numElements); + + unsigned int numEltsPerBlock = numThreads * 2; + + // if this is a non-power-of-2 array, the last block will be non-full + // compute the smallest power of 2 able to compute its scan. + unsigned int numEltsLastBlock = + numElements - (numBlocks - 1) * numEltsPerBlock; + unsigned int numThreadsLastBlock = max(1, numEltsLastBlock / 2); + unsigned int np2LastBlock = 0; + unsigned int sharedMemLastBlock = 0; + + if (numEltsLastBlock != numEltsPerBlock) { + np2LastBlock = 1; + + if (!isPowerOfTwo(numEltsLastBlock)) + numThreadsLastBlock = floorPow2(numEltsLastBlock); + + unsigned int extraSpace = (2 * numThreadsLastBlock) / NUM_BANKS; + sharedMemLastBlock = + sizeof(unsigned int) * (2 * numThreadsLastBlock + extraSpace); + } + + // padding space is used to avoid shared memory bank conflicts + unsigned int extraSpace = numEltsPerBlock / NUM_BANKS; + unsigned int sharedMemSize = + sizeof(unsigned int) * (numEltsPerBlock + extraSpace); + +#ifdef DEBUG + if (numBlocks > 1) { + assert(g_numEltsAllocated >= numElements); + } +#endif + + // setup execution parameters + // if NP2, we process the last block separately + dim3 grid(max(1, numBlocks - np2LastBlock), 1, 1); + dim3 threads(numThreads, 1, 1); + + // make sure there are no CUDA errors before we start + CUT_CHECK_ERROR("prescanArrayRecursive before kernels"); + + // execute the scan + if (numBlocks > 1) { + prescan<<>>( + outArray, inArray, g_scanBlockSums[level], numThreads * 2, 0, 0); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("prescanWithBlockSums"); + if (np2LastBlock) { + prescan<<<1, numThreadsLastBlock>>>( + outArray, inArray, g_scanBlockSums[level], numEltsLastBlock, + numBlocks - 1, numElements - numEltsLastBlock); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("prescanNP2WithBlockSums"); + } + + // After scanning all 
the sub-blocks, we are mostly done. But now we + // need to take all of the last values of the sub-blocks and scan those. + // This will give us a new value that must be sdded to each block to + // get the final results. + // recursive (CPU) call + prescanArrayRecursive(g_scanBlockSums[level], g_scanBlockSums[level], + numBlocks, level + 1); + + uniformAdd<<>>(outArray, g_scanBlockSums[level], + numElements - numEltsLastBlock, 0, 0); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("uniformAdd"); + if (np2LastBlock) { + uniformAdd<<<1, numThreadsLastBlock>>>(outArray, g_scanBlockSums[level], + numEltsLastBlock, numBlocks - 1, + numElements - numEltsLastBlock); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("uniformAdd"); + } + } else if (isPowerOfTwo(numElements)) { + prescan + <<>>(outArray, inArray, 0, numThreads * 2, 0, 0); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("prescan"); + } else { + prescan + <<>>(outArray, inArray, 0, numElements, 0, 0); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("prescanNP2"); + } +} + +static void prescanArray(unsigned int *outArray, unsigned int *inArray, + int numElements) { + prescanArrayRecursive(outArray, inArray, numElements, 0); +} + +#endif // _PRESCAN_CU_ diff --git a/examples/huffman/scanLargeArray_kernel.cu b/examples/huffman/scanLargeArray_kernel.cu new file mode 100644 index 0000000..acfca30 --- /dev/null +++ b/examples/huffman/scanLargeArray_kernel.cu @@ -0,0 +1,237 @@ +/* + * Copyright 1993-2006 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO USER: + * + * This source code is subject to NVIDIA ownership rights under U.S. and + * international Copyright laws. + * + * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE + * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR + * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE + * OR PERFORMANCE OF THIS SOURCE CODE. + * + * U.S. Government End Users. This source code is a "commercial item" as + * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of + * "commercial computer software" and "commercial computer software + * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) + * and is provided to the U.S. Government only as a commercial end item. + * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through + * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the + * source code with only those rights set forth herein. + */ + +#ifndef _SCAN_BEST_KERNEL_CU_ +#define _SCAN_BEST_KERNEL_CU_ + +// Define this to more rigorously avoid bank conflicts, +// even at the lower (root) levels of the tree +// Note that due to the higher addressing overhead, performance +// is lower with ZERO_BANK_CONFLICTS enabled. It is provided +// as an example. +//#define ZERO_BANK_CONFLICTS + +// 16 banks on G80 +#define NUM_BANKS 16 +#define LOG_NUM_BANKS 4 + +#ifdef ZERO_BANK_CONFLICTS +#define CONFLICT_FREE_OFFSET(index) \ + ((index) >> LOG_NUM_BANKS + (index) >> (2 * LOG_NUM_BANKS)) +#else +#define CONFLICT_FREE_OFFSET(index) ((index) >> LOG_NUM_BANKS) +#endif + +/////////////////////////////////////////////////////////////////////////////// +// Work-efficient compute implementation of scan, one thread per 2 elements +// Work-efficient: O(log(n)) steps, and O(n) adds. +// Also shared storage efficient: Uses n + n/NUM_BANKS shared memory -- no +// ping-ponging Also avoids most bank conflicts using single-element offsets +// every NUM_BANKS elements. 
+// +// In addition, If ZERO_BANK_CONFLICTS is defined, uses +// n + n/NUM_BANKS + n/(NUM_BANKS*NUM_BANKS) +// shared memory. If ZERO_BANK_CONFLICTS is defined, avoids ALL bank conflicts +// using single-element offsets every NUM_BANKS elements, plus additional +// single-element offsets after every NUM_BANKS^2 elements. +// +// Uses a balanced tree type algorithm. See Blelloch, 1990 "Prefix Sums +// and Their Applications", or Prins and Chatterjee PRAM course notes: +// http://www.cs.unc.edu/~prins/Classes/203/Handouts/pram.pdf +// +// This work-efficient version is based on the algorithm presented in Guy +// Blelloch's excellent paper "Prefix sums and their applications". +// http://www-2.cs.cmu.edu/afs/cs.cmu.edu/project/scandal/public/papers/CMU-CS-90-190.html +// +// Pro: Work Efficient, very few bank conflicts (or zero if ZERO_BANK_CONFLICTS +// is defined) Con: More instructions to compute bank-conflict-free shared +// memory addressing, and slightly more shared memory storage used. +// + +template +__device__ static void +loadSharedChunkFromMem(unsigned int *s_data, const unsigned int *g_idata, int n, + int baseIndex, int &ai, int &bi, int &mem_ai, + int &mem_bi, int &bankOffsetA, int &bankOffsetB) { + int thid = threadIdx.x; + mem_ai = baseIndex + threadIdx.x; + mem_bi = mem_ai + blockDim.x; + + ai = thid; + bi = thid + blockDim.x; + + // compute spacing to avoid bank conflicts + bankOffsetA = CONFLICT_FREE_OFFSET(ai); + bankOffsetB = CONFLICT_FREE_OFFSET(bi); + + // Cache the computational window in shared memory + // pad values beyond n with zeros + s_data[ai + bankOffsetA] = g_idata[mem_ai]; + + if (isNP2) // compile-time decision + { + s_data[bi + bankOffsetB] = (bi < n) ? 
g_idata[mem_bi] : 0; + } else { + s_data[bi + bankOffsetB] = g_idata[mem_bi]; + } +} + +template +__device__ static void +storeSharedChunkToMem(unsigned int *g_odata, const unsigned int *s_data, int n, + int ai, int bi, int mem_ai, int mem_bi, int bankOffsetA, + int bankOffsetB) { + __syncthreads(); + + // write results to global memory + g_odata[mem_ai] = s_data[ai + bankOffsetA]; + if (isNP2) // compile-time decision + { + if (bi < n) + g_odata[mem_bi] = s_data[bi + bankOffsetB]; + } else { + g_odata[mem_bi] = s_data[bi + bankOffsetB]; + } +} + +template +__device__ static void clearLastElement(unsigned int *s_data, + unsigned int *g_blockSums, + int blockIndex) { + if (threadIdx.x == 0) { + int index = (blockDim.x << 1) - 1; + index += CONFLICT_FREE_OFFSET(index); + + if (storeSum) // compile-time decision + { + // write this block's total sum to the corresponding index in the + // blockSums array + g_blockSums[blockIndex] = s_data[index]; + } + + // zero the last element in the scan so it will propagate back to the front + s_data[index] = 0; + } +} + +__device__ static unsigned int buildSum(unsigned int *s_data) { + unsigned int thid = threadIdx.x; + unsigned int stride = 1; + + // build the sum in place up the tree + for (int d = blockDim.x; d > 0; d >>= 1) { + __syncthreads(); + + if (thid < d) { + int i = __mul24(__mul24(2, stride), thid); + int ai = i + stride - 1; + int bi = ai + stride; + + ai += CONFLICT_FREE_OFFSET(ai); + bi += CONFLICT_FREE_OFFSET(bi); + + s_data[bi] += s_data[ai]; + } + + stride *= 2; + } + + return stride; +} + +__device__ static void scanRootToLeaves(unsigned int *s_data, + unsigned int stride) { + unsigned int thid = threadIdx.x; + + // traverse down the tree building the scan in place + for (int d = 1; d <= blockDim.x; d *= 2) { + stride >>= 1; + + __syncthreads(); + + if (thid < d) { + int i = __mul24(__mul24(2, stride), thid); + int ai = i + stride - 1; + int bi = ai + stride; + + ai += CONFLICT_FREE_OFFSET(ai); + bi += 
CONFLICT_FREE_OFFSET(bi); + + unsigned int t = s_data[ai]; + s_data[ai] = s_data[bi]; + s_data[bi] += t; + } + } +} + +template +__device__ static void prescanBlock(unsigned int *data, int blockIndex, + unsigned int *blockSums) { + int stride = buildSum(data); // build the sum in place up the tree + clearLastElement(data, blockSums, + (blockIndex == 0) ? blockIdx.x : blockIndex); + scanRootToLeaves(data, stride); // traverse down tree to build the scan +} + +template +__global__ static void +prescan(unsigned int *g_odata, const unsigned int *g_idata, + unsigned int *g_blockSums, int n, int blockIndex, int baseIndex) { + int ai, bi, mem_ai, mem_bi, bankOffsetA, bankOffsetB; + __shared__ unsigned int s_data[3072]; + + // load data into shared memory + loadSharedChunkFromMem( + s_data, g_idata, n, + (baseIndex == 0) ? __mul24(blockIdx.x, (blockDim.x << 1)) : baseIndex, ai, + bi, mem_ai, mem_bi, bankOffsetA, bankOffsetB); + // scan the data in each block + prescanBlock(s_data, blockIndex, g_blockSums); + // write results to device memory + storeSharedChunkToMem(g_odata, s_data, n, ai, bi, mem_ai, mem_bi, + bankOffsetA, bankOffsetB); +} + +__global__ static void uniformAdd(unsigned int *g_data, unsigned int *uniforms, + int n, int blockOffset, int baseIndex) { + __shared__ unsigned int uni; + if (threadIdx.x == 0) + uni = uniforms[blockIdx.x + blockOffset]; + + unsigned int address = + __mul24(blockIdx.x, (blockDim.x << 1)) + baseIndex + threadIdx.x; + + __syncthreads(); + + // note two adds per thread + g_data[address] += uni; + g_data[address + blockDim.x] += (threadIdx.x + blockDim.x < n) * uni; +} + +#endif // #ifndef _SCAN_BEST_KERNEL_CU_ diff --git a/examples/huffman/stats_logger.cpp b/examples/huffman/stats_logger.cpp new file mode 100644 index 0000000..460efac --- /dev/null +++ b/examples/huffman/stats_logger.cpp @@ -0,0 +1,43 @@ +/* + * Copyright 2009 Tjark Bringewat. All rights reserved. 
+ */ + +#include "stats_logger.h" +#include "stdafx.h" +#include +#include +#include + +std::map filenames; + +void LogStats(const char *graphname, const char *seriesname, float xValue, + float yValue, const char *xAxisQuantity, + const char *yAxisQuantity, const char *xAxisUnit, + const char *yAxisUnit, const char *xAxisScaleType, + const char *yAxisScaleType, unsigned int seriesnumber, + const char *description) { + std::ostringstream temp, temp2; + temp << graphname << "__" << seriesname; + size_t exists = filenames.count(temp.str()); + if (!exists) + filenames[temp.str()] = seriesnumber; + temp2 << graphname << "__" << filenames[temp.str()] << "_" << seriesname + << ".txt"; + FILE *f; + if (!exists) { + f = fopen(temp2.str().c_str(), "wt"); + fprintf(f, "SERIES_NAME\n%s\n", seriesname); + fprintf(f, "X_AXIS_QUANTITY\n%s\n", xAxisQuantity); + fprintf(f, "Y_AXIS_QUANTITY\n%s\n", yAxisQuantity); + fprintf(f, "X_AXIS_UNIT\n%s\n", xAxisUnit); + fprintf(f, "Y_AXIS_UNIT\n%s\n", yAxisUnit); + fprintf(f, "X_AXIS_SCALE_TYPE\n%s\n", xAxisScaleType); + fprintf(f, "Y_AXIS_SCALE_TYPE\n%s\n", yAxisScaleType); + fprintf(f, "DESCRIPTION\n%s\n", description); + fprintf(f, "__DATA__\n"); + } else { + f = fopen(temp2.str().c_str(), "at"); + } + fprintf(f, "%f %f\n", xValue, yValue); + fclose(f); +} diff --git a/examples/huffman/stats_logger.h b/examples/huffman/stats_logger.h new file mode 100644 index 0000000..c9381ab --- /dev/null +++ b/examples/huffman/stats_logger.h @@ -0,0 +1,45 @@ +/* + * Copyright Tjark Bringewat. All rights reserved. 
+ */ + +#ifndef _STATS_LOGGER_H_ +#define _STATS_LOGGER_H_ + +#include +#pragma warning(disable : 4996) + +extern "C" void +LogStats(const char *graphname, const char *seriesname, float xValue, + float yValue, const char *xAxisQuantity, const char *yAxisQuantity, + const char *xAxisUnit = "", const char *yAxisUnit = "", + const char *xAxisScaleType = "lin", const char *yAxisScaleType = "lin", + unsigned int seriesnumber = 0, const char *description = ""); + +inline void LogStats2( + const char *graph, // Groups several functions into one graph. Only appears + // in the file name. + const char *function, // Name of the particular function. Appears in file + // name and legend. + float yValue, float xValue, const char *yAxisName = "Time", + const char *yAxisUnit = "ms", const char *xAxisName = "Data size", + const char *xAxisUnit = "MB", + const char *yAxisScaleType = "lin", // Can be lin or log for linear or + // logarithmic scale, respectively. + const char *xAxisScaleType = "log", + unsigned int fId = + 0, // Determines the order in which different functions are plotted to a + // common graph. Only appears in the file name. 
+ const char *description = "") { + LogStats(graph, function, xValue, yValue, xAxisName, yAxisName, xAxisUnit, + yAxisUnit, xAxisScaleType, yAxisScaleType, fId, description); + if (strcmp(xAxisUnit, "MB") == 0 && strcmp(yAxisUnit, "ms") == 0) { + char buffer[100]; + strcpy(buffer, graph); + strcat(buffer, "_datarate"); + LogStats(buffer, function, xValue, (xValue * 1000.0f) / (yValue * 1024.0f), + xAxisName, "Data rate", xAxisUnit, "GB/s", xAxisScaleType, + yAxisScaleType, fId, description); + } +} + +#endif diff --git a/examples/huffman/stdafx.h b/examples/huffman/stdafx.h new file mode 100644 index 0000000..f75dd45 --- /dev/null +++ b/examples/huffman/stdafx.h @@ -0,0 +1,11 @@ +#pragma once + +#include "cutil.h" +#include +#include +#include +#include +#include +#include +#include +#include diff --git a/examples/huffman/testdatagen.h b/examples/huffman/testdatagen.h new file mode 100644 index 0000000..605a5af --- /dev/null +++ b/examples/huffman/testdatagen.h @@ -0,0 +1,83 @@ +#ifndef _TESTDATA_GEN_H_ +#define _TESTDATA_GEN_H_ + +#include "parameters.h" + +template +__inline__ void generateRLETestData(T *data, unsigned int num_blocks, + unsigned int num_block_threads) { + unsigned int i, j; + + /* generate first block*/ + for (i = 0; i < num_block_threads; i += 8) { + data[i] = 1; + data[i + 1] = 2; + data[i + 2] = 3; + data[i + 3] = 3; + data[i + 4] = 3; + data[i + 5] = 4; + data[i + 6] = 4; + data[i + 7] = 5; + } + /* copy contents of the first block to all other blocks (for testing only)*/ + for (j = 1; j < num_blocks; j++) + for (i = 0; i < num_block_threads; i++) + *(data + j * num_block_threads + i) = data[i]; +} + +template +__inline__ void generateRLETestDataLongRuns1(T *data, unsigned int num_blocks, + unsigned int num_block_threads, + unsigned int avg_run_len) { + unsigned int i, j; + + /* generate first block*/ + for (i = 0; i < num_block_threads / avg_run_len; i++) + for (j = 0; j < avg_run_len; j++) + data[i * avg_run_len + j] = i; + + /* copy 
contents of the first block to all other blocks (for testing only)*/ + for (j = 1; j < num_blocks; j++) + for (i = 0; i < num_block_threads; i++) + *(data + j * num_block_threads + i) = data[i]; +} + +// VLE TEST DATA VER2.0 + +// for testing only: generates codewords of the following lengths: 1, 2, 3, 4, +// 4, 5, 6, 7 +// and dummy odewords: 1, 10, 100, 1000, 1000, 10000, 100000, 1000000 +// equals 0x01, 0x02, 0x4, 0x8, 0x8, 0x10, 0x20, 0x40 +// num_symbols =256. Must be multiple of 8. +inline void generateCodewords(unsigned int *codewords, + unsigned int *codewordlens, + unsigned int num_symbols) { + unsigned int idx, i, j, numbits, k; // val, k; + /* Generate codeword lengths*/ + for (j = 0; j < num_symbols / 8; j++) { + for (i = 0; i < 4; i++) { // generate first half of length 1,2 3, 4 + idx = j * 8 + i; + codewordlens[idx] = i % 4 + 1; + } + for (i = 0; i < 4; i++) { // generate first half of length 4, 5, 6, 7 + idx = j * 8 + 4 + i; + codewordlens[idx] = i % 4 + 4; + } + } + /* Generate codewords*/ + for (k = 0; k < num_symbols; k++) { + numbits = codewordlens[k]; + codewords[k] = 0x01 << (numbits - 1); + } +} + +inline void generateData(unsigned int *data, unsigned int num_elements, + unsigned int *codewords, unsigned int *codewordlens, + unsigned int num_symbols) { + unsigned int i; + for (i = 0; i < num_elements; i++) { + data[i] = (unsigned int)(((float)rand() / (RAND_MAX + 1)) * num_symbols); + } +} + +#endif diff --git a/examples/huffman/vlc_kernel_sm64huff.cu b/examples/huffman/vlc_kernel_sm64huff.cu new file mode 100755 index 0000000..9a88015 --- /dev/null +++ b/examples/huffman/vlc_kernel_sm64huff.cu @@ -0,0 +1,160 @@ +#ifndef _VLC_SM64HUFF_KERNEL_H_ +#define _VLC_SM64HUFF_KERNEL_H_ + +#include "pabio_kernels_v2.cu" +#include "parameters.h" +#include + +#ifdef SMATOMICS + +/* HUFFMAN-FRIENDLY PAVLE + CHARACTERISTICS: + 1. CACHE CW_LUT INTO SM, LOAD AS 2 INT ARRAYS + 2. PARALLEL PREFIX SUM + 3. 
PARALLEL BIT I/O USING SHARED-MEMORY ATOMIC OPERATIONS (COMAPTIBLE WITH + CUDA1.3+) + + NOTES & ASSUMPTIONS: + - HUFFMAN-CODING FRIENDLY, SUPPORTS CODEWORDS OF 2X SIZE OF ORIGINAL + SYMBOLS (BYTES). - NUMBER OF THREADS PER BLOCK IS 256; IF YOU WANT TO PLAY + WITH DIFFERENT NUMBERS, THE CW CACHING SHOULD BE MODIFIED (SEE DPT* KERNELS) + - SM usage: 1x size of the input data (REUSE) + size of CWLUT + TURN ON CACHING FOR HIGH ENTROPY DATA! +*/ + +__global__ static void +vlc_encode_kernel_sm64huff(unsigned int *data, const unsigned int *gm_codewords, + const unsigned int *gm_codewordlens, +#ifdef TESTING + unsigned int *cw32, unsigned int *cw32len, + unsigned int *cw32idx, +#endif + unsigned int *out, unsigned int *outidx) { + + unsigned int kn = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int k = threadIdx.x; + unsigned int kc, startbit, wrbits; + + unsigned long long cw64 = 0; + unsigned int val32, codewordlen = 0; + unsigned char tmpbyte, tmpcwlen; + unsigned int tmpcw32; + + __shared__ unsigned int sm[3072]; + __shared__ unsigned int kcmax; + +#ifdef CACHECWLUT + unsigned int *codewords = (unsigned int *)sm; + unsigned int *codewordlens = (unsigned int *)(sm + NUM_SYMBOLS); + unsigned int *as = (unsigned int *)(sm + 2 * NUM_SYMBOLS); + + /* Load the codewords and the original data*/ + codewords[k] = gm_codewords[k]; + codewordlens[k] = gm_codewordlens[k]; + val32 = data[kn]; + __syncthreads(); + for (unsigned int i = 0; i < 4; i++) { + tmpbyte = (unsigned char)(val32 >> ((3 - i) * 8)); + tmpcw32 = codewords[tmpbyte]; + tmpcwlen = codewordlens[tmpbyte]; + cw64 = (cw64 << tmpcwlen) | tmpcw32; + codewordlen += tmpcwlen; + } +#else + unsigned int *as = (unsigned int *)sm; + val32 = data[kn]; + for (unsigned int i = 0; i < 4; i++) { + tmpbyte = (unsigned char)(val32 >> ((3 - i) * 8)); + tmpcw32 = gm_codewords[tmpbyte]; + tmpcwlen = gm_codewordlens[tmpbyte]; + cw64 = (cw64 << tmpcwlen) | tmpcw32; + codewordlen += tmpcwlen; + } +#endif + as[k] = codewordlen; + 
__syncthreads(); + + /* Prefix sum of codeword lengths (denoted in bits) [inplace implementation] + */ + unsigned int offset = 1; + + /* Build the sum in place up the tree */ + for (unsigned int d = (blockDim.x) >> 1; d > 0; d >>= 1) { + __syncthreads(); + if (k < d) { + unsigned char ai = offset * (2 * k + 1) - 1; + unsigned char bi = offset * (2 * k + 2) - 1; + as[bi] += as[ai]; + } + offset *= 2; + } + + /* scan back down the tree */ + /* clear the last element */ + if (k == 0) + as[blockDim.x - 1] = 0; + + // traverse down the tree building the scan in place + for (unsigned int d = 1; d < blockDim.x; d *= 2) { + offset >>= 1; + __syncthreads(); + if (k < d) { + unsigned char ai = offset * (2 * k + 1) - 1; + unsigned char bi = offset * (2 * k + 2) - 1; + unsigned int t = as[ai]; + as[ai] = as[bi]; + as[bi] += t; + } + } + __syncthreads(); + + if (k == blockDim.x - 1) { + outidx[blockIdx.x] = as[k] + codewordlen; + kcmax = (as[k] + codewordlen) / 32; + // printf("kcmax: %d\n", kcmax); + } + + /* Write the codes */ + kc = as[k] / 32; + startbit = as[k] % 32; + as[k] = 0U; + __syncthreads(); + + /* Part 1*/ + wrbits = codewordlen > (32 - startbit) ? (32 - startbit) : codewordlen; + tmpcw32 = (unsigned int)(cw64 >> (codewordlen - wrbits)); + // if (wrbits == 32) as[kc] = tmpcw32; + // //unnecessary overhead; increases number of branches else + atomicOr(&as[kc], tmpcw32 << (32 - startbit - + wrbits)); // shift left in case it's shorter + // then the available space + codewordlen -= wrbits; + + /*Part 2*/ + if (codewordlen) { + wrbits = codewordlen > 32 ? 
32 : codewordlen; + tmpcw32 = + (unsigned int)(cw64 >> (codewordlen - wrbits)) & ((1 << wrbits) - 1); + // if (wrbits == 32) as[kc+1] = tmpcw32; + // else + atomicOr(&as[kc + 1], tmpcw32 << (32 - wrbits)); + codewordlen -= wrbits; + } + + /*Part 3*/ + if (codewordlen) { + tmpcw32 = (unsigned int)(cw64 & ((1 << codewordlen) - 1)); + // if (wrbits == 32) as[kc+2] = tmpcw32; + // else + atomicOr(&as[kc + 2], tmpcw32 << (32 - codewordlen)); + } + + __syncthreads(); + + if (k <= kcmax) + out[kn] = as[k]; +} +////////////////////////////////////////////////////////////////////////////// +#endif + +#endif diff --git a/examples/lud/common-host-x86_64-unknown-linux-gnu.ll b/examples/lud/common-host-x86_64-unknown-linux-gnu.ll new file mode 100644 index 0000000..194405b --- /dev/null +++ b/examples/lud/common-host-x86_64-unknown-linux-gnu.ll @@ -0,0 +1,1291 @@ +; ModuleID = 'common-host-x86_64-unknown-linux-gnu.bc' +source_filename = "./common/common.c" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.__stopwatch_t = type { %struct.timeval, %struct.timeval } +%struct.timeval = type { i64, i64 } +%struct.timezone = type { i32, i32 } +%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] } +%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 } + +$_ZSt4fabsf = comdat any + +$_ZSt3expf = comdat any + +@.str = private unnamed_addr constant [3 x i8] c"rb\00", align 1 +@.str.1 = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 +@.str.2 = private unnamed_addr constant [4 x i8] c"%f \00", align 1 +@.str.3 = private unnamed_addr constant [35 x i8] c"dismatch at (%d, %d): (o)%f (n)%f\0A\00", align 1 +@.str.4 = private unnamed_addr constant [6 x i8] c"PASS\0A\00", align 1 +@.str.5 = private unnamed_addr 
constant [2 x i8] c"\0A\00", align 1 + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local void @stopwatch_start(%struct.__stopwatch_t* %sw) #0 { +entry: + %sw.addr = alloca %struct.__stopwatch_t*, align 8 + store %struct.__stopwatch_t* %sw, %struct.__stopwatch_t** %sw.addr, align 8 + %0 = load %struct.__stopwatch_t*, %struct.__stopwatch_t** %sw.addr, align 8 + %cmp = icmp eq %struct.__stopwatch_t* %0, null + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + br label %return + +if.end: ; preds = %entry + %1 = load %struct.__stopwatch_t*, %struct.__stopwatch_t** %sw.addr, align 8 + %begin = getelementptr inbounds %struct.__stopwatch_t, %struct.__stopwatch_t* %1, i32 0, i32 0 + %2 = bitcast %struct.timeval* %begin to i8* + call void @llvm.memset.p0i8.i64(i8* align 8 %2, i8 0, i64 16, i1 false) + %3 = load %struct.__stopwatch_t*, %struct.__stopwatch_t** %sw.addr, align 8 + %end = getelementptr inbounds %struct.__stopwatch_t, %struct.__stopwatch_t* %3, i32 0, i32 1 + %4 = bitcast %struct.timeval* %end to i8* + call void @llvm.memset.p0i8.i64(i8* align 8 %4, i8 0, i64 16, i1 false) + %5 = load %struct.__stopwatch_t*, %struct.__stopwatch_t** %sw.addr, align 8 + %begin1 = getelementptr inbounds %struct.__stopwatch_t, %struct.__stopwatch_t* %5, i32 0, i32 0 + %call = call i32 @gettimeofday(%struct.timeval* %begin1, %struct.timezone* null) #5 + br label %return + +return: ; preds = %if.end, %if.then + ret void +} + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg) #1 + +; Function Attrs: nounwind +declare dso_local i32 @gettimeofday(%struct.timeval*, %struct.timezone*) #2 + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local void @stopwatch_stop(%struct.__stopwatch_t* %sw) #0 { +entry: + %sw.addr = alloca %struct.__stopwatch_t*, align 8 + store %struct.__stopwatch_t* %sw, %struct.__stopwatch_t** %sw.addr, align 8 + %0 = load 
%struct.__stopwatch_t*, %struct.__stopwatch_t** %sw.addr, align 8 + %cmp = icmp eq %struct.__stopwatch_t* %0, null + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + br label %return + +if.end: ; preds = %entry + %1 = load %struct.__stopwatch_t*, %struct.__stopwatch_t** %sw.addr, align 8 + %end = getelementptr inbounds %struct.__stopwatch_t, %struct.__stopwatch_t* %1, i32 0, i32 1 + %call = call i32 @gettimeofday(%struct.timeval* %end, %struct.timezone* null) #5 + br label %return + +return: ; preds = %if.end, %if.then + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local double @get_interval_by_sec(%struct.__stopwatch_t* %sw) #0 { +entry: + %retval = alloca double, align 8 + %sw.addr = alloca %struct.__stopwatch_t*, align 8 + store %struct.__stopwatch_t* %sw, %struct.__stopwatch_t** %sw.addr, align 8 + %0 = load %struct.__stopwatch_t*, %struct.__stopwatch_t** %sw.addr, align 8 + %cmp = icmp eq %struct.__stopwatch_t* %0, null + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + store double 0.000000e+00, double* %retval, align 8 + br label %return + +if.end: ; preds = %entry + %1 = load %struct.__stopwatch_t*, %struct.__stopwatch_t** %sw.addr, align 8 + %end = getelementptr inbounds %struct.__stopwatch_t, %struct.__stopwatch_t* %1, i32 0, i32 1 + %tv_sec = getelementptr inbounds %struct.timeval, %struct.timeval* %end, i32 0, i32 0 + %2 = load i64, i64* %tv_sec, align 8 + %3 = load %struct.__stopwatch_t*, %struct.__stopwatch_t** %sw.addr, align 8 + %begin = getelementptr inbounds %struct.__stopwatch_t, %struct.__stopwatch_t* %3, i32 0, i32 0 + %tv_sec1 = getelementptr inbounds %struct.timeval, %struct.timeval* %begin, i32 0, i32 0 + %4 = load i64, i64* %tv_sec1, align 8 + %sub = sub nsw i64 %2, %4 + %conv = sitofp i64 %sub to double + %5 = load %struct.__stopwatch_t*, %struct.__stopwatch_t** %sw.addr, align 8 + %end2 = getelementptr inbounds %struct.__stopwatch_t, %struct.__stopwatch_t* 
%5, i32 0, i32 1 + %tv_usec = getelementptr inbounds %struct.timeval, %struct.timeval* %end2, i32 0, i32 1 + %6 = load i64, i64* %tv_usec, align 8 + %7 = load %struct.__stopwatch_t*, %struct.__stopwatch_t** %sw.addr, align 8 + %begin3 = getelementptr inbounds %struct.__stopwatch_t, %struct.__stopwatch_t* %7, i32 0, i32 0 + %tv_usec4 = getelementptr inbounds %struct.timeval, %struct.timeval* %begin3, i32 0, i32 1 + %8 = load i64, i64* %tv_usec4, align 8 + %sub5 = sub nsw i64 %6, %8 + %conv6 = sitofp i64 %sub5 to double + %div = fdiv double %conv6, 1.000000e+06 + %add = fadd double %conv, %div + store double %add, double* %retval, align 8 + br label %return + +return: ; preds = %if.end, %if.then + %9 = load double, double* %retval, align 8 + ret double %9 +} + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local i32 @get_interval_by_usec(%struct.__stopwatch_t* %sw) #0 { +entry: + %retval = alloca i32, align 4 + %sw.addr = alloca %struct.__stopwatch_t*, align 8 + store %struct.__stopwatch_t* %sw, %struct.__stopwatch_t** %sw.addr, align 8 + %0 = load %struct.__stopwatch_t*, %struct.__stopwatch_t** %sw.addr, align 8 + %cmp = icmp eq %struct.__stopwatch_t* %0, null + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + store i32 0, i32* %retval, align 4 + br label %return + +if.end: ; preds = %entry + %1 = load %struct.__stopwatch_t*, %struct.__stopwatch_t** %sw.addr, align 8 + %end = getelementptr inbounds %struct.__stopwatch_t, %struct.__stopwatch_t* %1, i32 0, i32 1 + %tv_sec = getelementptr inbounds %struct.timeval, %struct.timeval* %end, i32 0, i32 0 + %2 = load i64, i64* %tv_sec, align 8 + %3 = load %struct.__stopwatch_t*, %struct.__stopwatch_t** %sw.addr, align 8 + %begin = getelementptr inbounds %struct.__stopwatch_t, %struct.__stopwatch_t* %3, i32 0, i32 0 + %tv_sec1 = getelementptr inbounds %struct.timeval, %struct.timeval* %begin, i32 0, i32 0 + %4 = load i64, i64* %tv_sec1, align 8 + %sub = sub nsw i64 %2, %4 + %mul = mul 
nsw i64 %sub, 1000000 + %5 = load %struct.__stopwatch_t*, %struct.__stopwatch_t** %sw.addr, align 8 + %end2 = getelementptr inbounds %struct.__stopwatch_t, %struct.__stopwatch_t* %5, i32 0, i32 1 + %tv_usec = getelementptr inbounds %struct.timeval, %struct.timeval* %end2, i32 0, i32 1 + %6 = load i64, i64* %tv_usec, align 8 + %7 = load %struct.__stopwatch_t*, %struct.__stopwatch_t** %sw.addr, align 8 + %begin3 = getelementptr inbounds %struct.__stopwatch_t, %struct.__stopwatch_t* %7, i32 0, i32 0 + %tv_usec4 = getelementptr inbounds %struct.timeval, %struct.timeval* %begin3, i32 0, i32 1 + %8 = load i64, i64* %tv_usec4, align 8 + %sub5 = sub nsw i64 %6, %8 + %add = add nsw i64 %mul, %sub5 + %conv = trunc i64 %add to i32 + store i32 %conv, i32* %retval, align 4 + br label %return + +return: ; preds = %if.end, %if.then + %9 = load i32, i32* %retval, align 4 + ret i32 %9 +} + +; Function Attrs: noinline optnone uwtable +define dso_local i32 @create_matrix_from_file(float** %mp, i8* %filename, i32* %size_p) #3 { +entry: + %retval = alloca i32, align 4 + %mp.addr = alloca float**, align 8 + %filename.addr = alloca i8*, align 8 + %size_p.addr = alloca i32*, align 8 + %i = alloca i32, align 4 + %j = alloca i32, align 4 + %size = alloca i32, align 4 + %m = alloca float*, align 8 + %fp = alloca %struct._IO_FILE*, align 8 + store float** %mp, float*** %mp.addr, align 8 + store i8* %filename, i8** %filename.addr, align 8 + store i32* %size_p, i32** %size_p.addr, align 8 + store %struct._IO_FILE* null, %struct._IO_FILE** %fp, align 8 + %0 = load i8*, i8** %filename.addr, align 8 + %call = call %struct._IO_FILE* @fopen(i8* %0, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i64 0, i64 0)) + store %struct._IO_FILE* %call, %struct._IO_FILE** %fp, align 8 + %1 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8 + %cmp = icmp eq %struct._IO_FILE* %1, null + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + store i32 1, i32* %retval, align 4 + br 
label %return + +if.end: ; preds = %entry + %2 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8 + %call1 = call i32 (%struct._IO_FILE*, i8*, ...) @fscanf(%struct._IO_FILE* %2, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str.1, i64 0, i64 0), i32* %size) + %3 = load i32, i32* %size, align 4 + %conv = sext i32 %3 to i64 + %mul = mul i64 4, %conv + %4 = load i32, i32* %size, align 4 + %conv2 = sext i32 %4 to i64 + %mul3 = mul i64 %mul, %conv2 + %call4 = call noalias i8* @malloc(i64 %mul3) #5 + %5 = bitcast i8* %call4 to float* + store float* %5, float** %m, align 8 + %6 = load float*, float** %m, align 8 + %cmp5 = icmp eq float* %6, null + br i1 %cmp5, label %if.then6, label %if.end8 + +if.then6: ; preds = %if.end + %7 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8 + %call7 = call i32 @fclose(%struct._IO_FILE* %7) + store i32 1, i32* %retval, align 4 + br label %return + +if.end8: ; preds = %if.end + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc17, %if.end8 + %8 = load i32, i32* %i, align 4 + %9 = load i32, i32* %size, align 4 + %cmp9 = icmp slt i32 %8, %9 + br i1 %cmp9, label %for.body, label %for.end19 + +for.body: ; preds = %for.cond + store i32 0, i32* %j, align 4 + br label %for.cond10 + +for.cond10: ; preds = %for.inc, %for.body + %10 = load i32, i32* %j, align 4 + %11 = load i32, i32* %size, align 4 + %cmp11 = icmp slt i32 %10, %11 + br i1 %cmp11, label %for.body12, label %for.end + +for.body12: ; preds = %for.cond10 + %12 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8 + %13 = load float*, float** %m, align 8 + %14 = load i32, i32* %i, align 4 + %15 = load i32, i32* %size, align 4 + %mul13 = mul nsw i32 %14, %15 + %idx.ext = sext i32 %mul13 to i64 + %add.ptr = getelementptr inbounds float, float* %13, i64 %idx.ext + %16 = load i32, i32* %j, align 4 + %idx.ext14 = sext i32 %16 to i64 + %add.ptr15 = getelementptr inbounds float, float* %add.ptr, i64 %idx.ext14 + %call16 = call i32 
(%struct._IO_FILE*, i8*, ...) @fscanf(%struct._IO_FILE* %12, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str.2, i64 0, i64 0), float* %add.ptr15) + br label %for.inc + +for.inc: ; preds = %for.body12 + %17 = load i32, i32* %j, align 4 + %inc = add nsw i32 %17, 1 + store i32 %inc, i32* %j, align 4 + br label %for.cond10 + +for.end: ; preds = %for.cond10 + br label %for.inc17 + +for.inc17: ; preds = %for.end + %18 = load i32, i32* %i, align 4 + %inc18 = add nsw i32 %18, 1 + store i32 %inc18, i32* %i, align 4 + br label %for.cond + +for.end19: ; preds = %for.cond + %19 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8 + %call20 = call i32 @fclose(%struct._IO_FILE* %19) + %20 = load i32, i32* %size, align 4 + %21 = load i32*, i32** %size_p.addr, align 8 + store i32 %20, i32* %21, align 4 + %22 = load float*, float** %m, align 8 + %23 = load float**, float*** %mp.addr, align 8 + store float* %22, float** %23, align 8 + store i32 0, i32* %retval, align 4 + br label %return + +return: ; preds = %for.end19, %if.then6, %if.then + %24 = load i32, i32* %retval, align 4 + ret i32 %24 +} + +declare dso_local %struct._IO_FILE* @fopen(i8*, i8*) #4 + +declare dso_local i32 @fscanf(%struct._IO_FILE*, i8*, ...) 
#4 + +; Function Attrs: nounwind +declare dso_local noalias i8* @malloc(i64) #2 + +declare dso_local i32 @fclose(%struct._IO_FILE*) #4 + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local i32 @create_matrix_from_random(float** %mp, i32 %size) #0 { +entry: + %retval = alloca i32, align 4 + %mp.addr = alloca float**, align 8 + %size.addr = alloca i32, align 4 + %l = alloca float*, align 8 + %u = alloca float*, align 8 + %m = alloca float*, align 8 + %i = alloca i32, align 4 + %j = alloca i32, align 4 + %k = alloca i32, align 4 + store float** %mp, float*** %mp.addr, align 8 + store i32 %size, i32* %size.addr, align 4 + %call = call i64 @time(i64* null) #5 + %conv = trunc i64 %call to i32 + call void @srand(i32 %conv) #5 + %0 = load i32, i32* %size.addr, align 4 + %1 = load i32, i32* %size.addr, align 4 + %mul = mul nsw i32 %0, %1 + %conv1 = sext i32 %mul to i64 + %mul2 = mul i64 %conv1, 4 + %call3 = call noalias i8* @malloc(i64 %mul2) #5 + %2 = bitcast i8* %call3 to float* + store float* %2, float** %l, align 8 + %3 = load float*, float** %l, align 8 + %cmp = icmp eq float* %3, null + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + store i32 1, i32* %retval, align 4 + br label %return + +if.end: ; preds = %entry + %4 = load i32, i32* %size.addr, align 4 + %5 = load i32, i32* %size.addr, align 4 + %mul4 = mul nsw i32 %4, %5 + %conv5 = sext i32 %mul4 to i64 + %mul6 = mul i64 %conv5, 4 + %call7 = call noalias i8* @malloc(i64 %mul6) #5 + %6 = bitcast i8* %call7 to float* + store float* %6, float** %u, align 8 + %7 = load float*, float** %u, align 8 + %cmp8 = icmp eq float* %7, null + br i1 %cmp8, label %if.then9, label %if.end10 + +if.then9: ; preds = %if.end + %8 = load float*, float** %l, align 8 + %9 = bitcast float* %8 to i8* + call void @free(i8* %9) #5 + store i32 1, i32* %retval, align 4 + br label %return + +if.end10: ; preds = %if.end + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = 
%for.inc33, %if.end10 + %10 = load i32, i32* %i, align 4 + %11 = load i32, i32* %size.addr, align 4 + %cmp11 = icmp slt i32 %10, %11 + br i1 %cmp11, label %for.body, label %for.end35 + +for.body: ; preds = %for.cond + store i32 0, i32* %j, align 4 + br label %for.cond12 + +for.cond12: ; preds = %for.inc, %for.body + %12 = load i32, i32* %j, align 4 + %13 = load i32, i32* %size.addr, align 4 + %cmp13 = icmp slt i32 %12, %13 + br i1 %cmp13, label %for.body14, label %for.end + +for.body14: ; preds = %for.cond12 + %14 = load i32, i32* %i, align 4 + %15 = load i32, i32* %j, align 4 + %cmp15 = icmp sgt i32 %14, %15 + br i1 %cmp15, label %if.then16, label %if.else + +if.then16: ; preds = %for.body14 + %call17 = call i32 @rand() #5 + %conv18 = sitofp i32 %call17 to float + %div = fdiv float %conv18, 0x41E0000000000000 + %16 = load float*, float** %l, align 8 + %17 = load i32, i32* %i, align 4 + %18 = load i32, i32* %size.addr, align 4 + %mul19 = mul nsw i32 %17, %18 + %19 = load i32, i32* %j, align 4 + %add = add nsw i32 %mul19, %19 + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds float, float* %16, i64 %idxprom + store float %div, float* %arrayidx, align 4 + br label %if.end32 + +if.else: ; preds = %for.body14 + %20 = load i32, i32* %i, align 4 + %21 = load i32, i32* %j, align 4 + %cmp20 = icmp eq i32 %20, %21 + br i1 %cmp20, label %if.then21, label %if.else26 + +if.then21: ; preds = %if.else + %22 = load float*, float** %l, align 8 + %23 = load i32, i32* %i, align 4 + %24 = load i32, i32* %size.addr, align 4 + %mul22 = mul nsw i32 %23, %24 + %25 = load i32, i32* %j, align 4 + %add23 = add nsw i32 %mul22, %25 + %idxprom24 = sext i32 %add23 to i64 + %arrayidx25 = getelementptr inbounds float, float* %22, i64 %idxprom24 + store float 1.000000e+00, float* %arrayidx25, align 4 + br label %if.end31 + +if.else26: ; preds = %if.else + %26 = load float*, float** %l, align 8 + %27 = load i32, i32* %i, align 4 + %28 = load i32, i32* %size.addr, align 4 + 
%mul27 = mul nsw i32 %27, %28 + %29 = load i32, i32* %j, align 4 + %add28 = add nsw i32 %mul27, %29 + %idxprom29 = sext i32 %add28 to i64 + %arrayidx30 = getelementptr inbounds float, float* %26, i64 %idxprom29 + store float 0.000000e+00, float* %arrayidx30, align 4 + br label %if.end31 + +if.end31: ; preds = %if.else26, %if.then21 + br label %if.end32 + +if.end32: ; preds = %if.end31, %if.then16 + br label %for.inc + +for.inc: ; preds = %if.end32 + %30 = load i32, i32* %j, align 4 + %inc = add nsw i32 %30, 1 + store i32 %inc, i32* %j, align 4 + br label %for.cond12 + +for.end: ; preds = %for.cond12 + br label %for.inc33 + +for.inc33: ; preds = %for.end + %31 = load i32, i32* %i, align 4 + %inc34 = add nsw i32 %31, 1 + store i32 %inc34, i32* %i, align 4 + br label %for.cond + +for.end35: ; preds = %for.cond + store i32 0, i32* %j, align 4 + br label %for.cond36 + +for.cond36: ; preds = %for.inc60, %for.end35 + %32 = load i32, i32* %j, align 4 + %33 = load i32, i32* %size.addr, align 4 + %cmp37 = icmp slt i32 %32, %33 + br i1 %cmp37, label %for.body38, label %for.end62 + +for.body38: ; preds = %for.cond36 + store i32 0, i32* %i, align 4 + br label %for.cond39 + +for.cond39: ; preds = %for.inc57, %for.body38 + %34 = load i32, i32* %i, align 4 + %35 = load i32, i32* %size.addr, align 4 + %cmp40 = icmp slt i32 %34, %35 + br i1 %cmp40, label %for.body41, label %for.end59 + +for.body41: ; preds = %for.cond39 + %36 = load i32, i32* %i, align 4 + %37 = load i32, i32* %j, align 4 + %cmp42 = icmp sgt i32 %36, %37 + br i1 %cmp42, label %if.then43, label %if.else48 + +if.then43: ; preds = %for.body41 + %38 = load float*, float** %u, align 8 + %39 = load i32, i32* %j, align 4 + %40 = load i32, i32* %size.addr, align 4 + %mul44 = mul nsw i32 %39, %40 + %41 = load i32, i32* %i, align 4 + %add45 = add nsw i32 %mul44, %41 + %idxprom46 = sext i32 %add45 to i64 + %arrayidx47 = getelementptr inbounds float, float* %38, i64 %idxprom46 + store float 0.000000e+00, float* %arrayidx47, 
align 4 + br label %if.end56 + +if.else48: ; preds = %for.body41 + %call49 = call i32 @rand() #5 + %conv50 = sitofp i32 %call49 to float + %div51 = fdiv float %conv50, 0x41E0000000000000 + %42 = load float*, float** %u, align 8 + %43 = load i32, i32* %j, align 4 + %44 = load i32, i32* %size.addr, align 4 + %mul52 = mul nsw i32 %43, %44 + %45 = load i32, i32* %i, align 4 + %add53 = add nsw i32 %mul52, %45 + %idxprom54 = sext i32 %add53 to i64 + %arrayidx55 = getelementptr inbounds float, float* %42, i64 %idxprom54 + store float %div51, float* %arrayidx55, align 4 + br label %if.end56 + +if.end56: ; preds = %if.else48, %if.then43 + br label %for.inc57 + +for.inc57: ; preds = %if.end56 + %46 = load i32, i32* %i, align 4 + %inc58 = add nsw i32 %46, 1 + store i32 %inc58, i32* %i, align 4 + br label %for.cond39 + +for.end59: ; preds = %for.cond39 + br label %for.inc60 + +for.inc60: ; preds = %for.end59 + %47 = load i32, i32* %j, align 4 + %inc61 = add nsw i32 %47, 1 + store i32 %inc61, i32* %j, align 4 + br label %for.cond36 + +for.end62: ; preds = %for.cond36 + store i32 0, i32* %i, align 4 + br label %for.cond63 + +for.cond63: ; preds = %for.inc92, %for.end62 + %48 = load i32, i32* %i, align 4 + %49 = load i32, i32* %size.addr, align 4 + %cmp64 = icmp slt i32 %48, %49 + br i1 %cmp64, label %for.body65, label %for.end94 + +for.body65: ; preds = %for.cond63 + store i32 0, i32* %j, align 4 + br label %for.cond66 + +for.cond66: ; preds = %for.inc89, %for.body65 + %50 = load i32, i32* %j, align 4 + %51 = load i32, i32* %size.addr, align 4 + %cmp67 = icmp slt i32 %50, %51 + br i1 %cmp67, label %for.body68, label %for.end91 + +for.body68: ; preds = %for.cond66 + store i32 0, i32* %k, align 4 + br label %for.cond69 + +for.cond69: ; preds = %for.inc86, %for.body68 + %52 = load i32, i32* %k, align 4 + %53 = load i32, i32* %i, align 4 + %54 = load i32, i32* %j, align 4 + %cmp70 = icmp slt i32 %53, %54 + br i1 %cmp70, label %cond.true, label %cond.false + +cond.true: ; preds = 
%for.cond69 + %55 = load i32, i32* %i, align 4 + br label %cond.end + +cond.false: ; preds = %for.cond69 + %56 = load i32, i32* %j, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ %55, %cond.true ], [ %56, %cond.false ] + %cmp71 = icmp sle i32 %52, %cond + br i1 %cmp71, label %for.body72, label %for.end88 + +for.body72: ; preds = %cond.end + %57 = load float*, float** %l, align 8 + %58 = load i32, i32* %i, align 4 + %59 = load i32, i32* %size.addr, align 4 + %mul73 = mul nsw i32 %58, %59 + %60 = load i32, i32* %k, align 4 + %add74 = add nsw i32 %mul73, %60 + %idxprom75 = sext i32 %add74 to i64 + %arrayidx76 = getelementptr inbounds float, float* %57, i64 %idxprom75 + %61 = load float, float* %arrayidx76, align 4 + %62 = load float*, float** %u, align 8 + %63 = load i32, i32* %j, align 4 + %64 = load i32, i32* %size.addr, align 4 + %mul77 = mul nsw i32 %63, %64 + %65 = load i32, i32* %k, align 4 + %add78 = add nsw i32 %mul77, %65 + %idxprom79 = sext i32 %add78 to i64 + %arrayidx80 = getelementptr inbounds float, float* %62, i64 %idxprom79 + %66 = load float, float* %arrayidx80, align 4 + %mul81 = fmul float %61, %66 + %67 = load float*, float** %m, align 8 + %68 = load i32, i32* %i, align 4 + %69 = load i32, i32* %size.addr, align 4 + %mul82 = mul nsw i32 %68, %69 + %70 = load i32, i32* %j, align 4 + %add83 = add nsw i32 %mul82, %70 + %idxprom84 = sext i32 %add83 to i64 + %arrayidx85 = getelementptr inbounds float, float* %67, i64 %idxprom84 + store float %mul81, float* %arrayidx85, align 4 + br label %for.inc86 + +for.inc86: ; preds = %for.body72 + %71 = load i32, i32* %k, align 4 + %inc87 = add nsw i32 %71, 1 + store i32 %inc87, i32* %k, align 4 + br label %for.cond69 + +for.end88: ; preds = %cond.end + br label %for.inc89 + +for.inc89: ; preds = %for.end88 + %72 = load i32, i32* %j, align 4 + %inc90 = add nsw i32 %72, 1 + store i32 %inc90, i32* %j, align 4 + br label %for.cond66 + +for.end91: ; preds = %for.cond66 + 
br label %for.inc92 + +for.inc92: ; preds = %for.end91 + %73 = load i32, i32* %i, align 4 + %inc93 = add nsw i32 %73, 1 + store i32 %inc93, i32* %i, align 4 + br label %for.cond63 + +for.end94: ; preds = %for.cond63 + %74 = load float*, float** %l, align 8 + %75 = bitcast float* %74 to i8* + call void @free(i8* %75) #5 + %76 = load float*, float** %u, align 8 + %77 = bitcast float* %76 to i8* + call void @free(i8* %77) #5 + %78 = load float*, float** %m, align 8 + %79 = load float**, float*** %mp.addr, align 8 + store float* %78, float** %79, align 8 + store i32 0, i32* %retval, align 4 + br label %return + +return: ; preds = %for.end94, %if.then9, %if.then + %80 = load i32, i32* %retval, align 4 + ret i32 %80 +} + +; Function Attrs: nounwind +declare dso_local void @srand(i32) #2 + +; Function Attrs: nounwind +declare dso_local i64 @time(i64*) #2 + +; Function Attrs: nounwind +declare dso_local void @free(i8*) #2 + +; Function Attrs: nounwind +declare dso_local i32 @rand() #2 + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local void @matrix_multiply(float* %inputa, float* %inputb, float* %output, i32 %size) #0 { +entry: + %inputa.addr = alloca float*, align 8 + %inputb.addr = alloca float*, align 8 + %output.addr = alloca float*, align 8 + %size.addr = alloca i32, align 4 + %i = alloca i32, align 4 + %j = alloca i32, align 4 + %k = alloca i32, align 4 + store float* %inputa, float** %inputa.addr, align 8 + store float* %inputb, float** %inputb.addr, align 8 + store float* %output, float** %output.addr, align 8 + store i32 %size, i32* %size.addr, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc19, %entry + %0 = load i32, i32* %i, align 4 + %1 = load i32, i32* %size.addr, align 4 + %cmp = icmp slt i32 %0, %1 + br i1 %cmp, label %for.body, label %for.end21 + +for.body: ; preds = %for.cond + store i32 0, i32* %k, align 4 + br label %for.cond1 + +for.cond1: ; preds = %for.inc16, %for.body + %2 = load i32, 
i32* %k, align 4 + %3 = load i32, i32* %size.addr, align 4 + %cmp2 = icmp slt i32 %2, %3 + br i1 %cmp2, label %for.body3, label %for.end18 + +for.body3: ; preds = %for.cond1 + store i32 0, i32* %j, align 4 + br label %for.cond4 + +for.cond4: ; preds = %for.inc, %for.body3 + %4 = load i32, i32* %j, align 4 + %5 = load i32, i32* %size.addr, align 4 + %cmp5 = icmp slt i32 %4, %5 + br i1 %cmp5, label %for.body6, label %for.end + +for.body6: ; preds = %for.cond4 + %6 = load float*, float** %inputa.addr, align 8 + %7 = load i32, i32* %i, align 4 + %8 = load i32, i32* %size.addr, align 4 + %mul = mul nsw i32 %7, %8 + %9 = load i32, i32* %k, align 4 + %add = add nsw i32 %mul, %9 + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds float, float* %6, i64 %idxprom + %10 = load float, float* %arrayidx, align 4 + %11 = load float*, float** %inputb.addr, align 8 + %12 = load i32, i32* %k, align 4 + %13 = load i32, i32* %size.addr, align 4 + %mul7 = mul nsw i32 %12, %13 + %14 = load i32, i32* %j, align 4 + %add8 = add nsw i32 %mul7, %14 + %idxprom9 = sext i32 %add8 to i64 + %arrayidx10 = getelementptr inbounds float, float* %11, i64 %idxprom9 + %15 = load float, float* %arrayidx10, align 4 + %mul11 = fmul float %10, %15 + %16 = load float*, float** %output.addr, align 8 + %17 = load i32, i32* %i, align 4 + %18 = load i32, i32* %size.addr, align 4 + %mul12 = mul nsw i32 %17, %18 + %19 = load i32, i32* %j, align 4 + %add13 = add nsw i32 %mul12, %19 + %idxprom14 = sext i32 %add13 to i64 + %arrayidx15 = getelementptr inbounds float, float* %16, i64 %idxprom14 + store float %mul11, float* %arrayidx15, align 4 + br label %for.inc + +for.inc: ; preds = %for.body6 + %20 = load i32, i32* %j, align 4 + %inc = add nsw i32 %20, 1 + store i32 %inc, i32* %j, align 4 + br label %for.cond4 + +for.end: ; preds = %for.cond4 + br label %for.inc16 + +for.inc16: ; preds = %for.end + %21 = load i32, i32* %k, align 4 + %inc17 = add nsw i32 %21, 1 + store i32 %inc17, i32* %k, align 4 + 
br label %for.cond1 + +for.end18: ; preds = %for.cond1 + br label %for.inc19 + +for.inc19: ; preds = %for.end18 + %22 = load i32, i32* %i, align 4 + %inc20 = add nsw i32 %22, 1 + store i32 %inc20, i32* %i, align 4 + br label %for.cond + +for.end21: ; preds = %for.cond + ret void +} + +; Function Attrs: noinline optnone uwtable +define dso_local i32 @lud_verify(float* %m, float* %lu, i32 %matrix_dim) #3 { +entry: + %m.addr = alloca float*, align 8 + %lu.addr = alloca float*, align 8 + %matrix_dim.addr = alloca i32, align 4 + %i = alloca i32, align 4 + %j = alloca i32, align 4 + %k = alloca i32, align 4 + %tmp = alloca float*, align 8 + %sum = alloca float, align 4 + %l = alloca float, align 4 + %u = alloca float, align 4 + store float* %m, float** %m.addr, align 8 + store float* %lu, float** %lu.addr, align 8 + store i32 %matrix_dim, i32* %matrix_dim.addr, align 4 + %0 = load i32, i32* %matrix_dim.addr, align 4 + %1 = load i32, i32* %matrix_dim.addr, align 4 + %mul = mul nsw i32 %0, %1 + %conv = sext i32 %mul to i64 + %mul1 = mul i64 %conv, 4 + %call = call noalias i8* @malloc(i64 %mul1) #5 + %2 = bitcast i8* %call to float* + store float* %2, float** %tmp, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc24, %entry + %3 = load i32, i32* %i, align 4 + %4 = load i32, i32* %matrix_dim.addr, align 4 + %cmp = icmp slt i32 %3, %4 + br i1 %cmp, label %for.body, label %for.end26 + +for.body: ; preds = %for.cond + store i32 0, i32* %j, align 4 + br label %for.cond2 + +for.cond2: ; preds = %for.inc21, %for.body + %5 = load i32, i32* %j, align 4 + %6 = load i32, i32* %matrix_dim.addr, align 4 + %cmp3 = icmp slt i32 %5, %6 + br i1 %cmp3, label %for.body4, label %for.end23 + +for.body4: ; preds = %for.cond2 + store float 0.000000e+00, float* %sum, align 4 + store i32 0, i32* %k, align 4 + br label %for.cond5 + +for.cond5: ; preds = %for.inc, %for.body4 + %7 = load i32, i32* %k, align 4 + %8 = load i32, i32* %i, align 4 + %9 = load i32, 
i32* %j, align 4 + %cmp6 = icmp slt i32 %8, %9 + br i1 %cmp6, label %cond.true, label %cond.false + +cond.true: ; preds = %for.cond5 + %10 = load i32, i32* %i, align 4 + br label %cond.end + +cond.false: ; preds = %for.cond5 + %11 = load i32, i32* %j, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ %10, %cond.true ], [ %11, %cond.false ] + %cmp7 = icmp sle i32 %7, %cond + br i1 %cmp7, label %for.body8, label %for.end + +for.body8: ; preds = %cond.end + %12 = load i32, i32* %i, align 4 + %13 = load i32, i32* %k, align 4 + %cmp9 = icmp eq i32 %12, %13 + br i1 %cmp9, label %if.then, label %if.else + +if.then: ; preds = %for.body8 + store float 1.000000e+00, float* %l, align 4 + br label %if.end + +if.else: ; preds = %for.body8 + %14 = load float*, float** %lu.addr, align 8 + %15 = load i32, i32* %i, align 4 + %16 = load i32, i32* %matrix_dim.addr, align 4 + %mul10 = mul nsw i32 %15, %16 + %17 = load i32, i32* %k, align 4 + %add = add nsw i32 %mul10, %17 + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds float, float* %14, i64 %idxprom + %18 = load float, float* %arrayidx, align 4 + store float %18, float* %l, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %19 = load float*, float** %lu.addr, align 8 + %20 = load i32, i32* %k, align 4 + %21 = load i32, i32* %matrix_dim.addr, align 4 + %mul11 = mul nsw i32 %20, %21 + %22 = load i32, i32* %j, align 4 + %add12 = add nsw i32 %mul11, %22 + %idxprom13 = sext i32 %add12 to i64 + %arrayidx14 = getelementptr inbounds float, float* %19, i64 %idxprom13 + %23 = load float, float* %arrayidx14, align 4 + store float %23, float* %u, align 4 + %24 = load float, float* %l, align 4 + %25 = load float, float* %u, align 4 + %mul15 = fmul float %24, %25 + %26 = load float, float* %sum, align 4 + %add16 = fadd float %26, %mul15 + store float %add16, float* %sum, align 4 + br label %for.inc + +for.inc: ; preds = %if.end + %27 = load i32, i32* %k, align 
4 + %inc = add nsw i32 %27, 1 + store i32 %inc, i32* %k, align 4 + br label %for.cond5 + +for.end: ; preds = %cond.end + %28 = load float, float* %sum, align 4 + %29 = load float*, float** %tmp, align 8 + %30 = load i32, i32* %i, align 4 + %31 = load i32, i32* %matrix_dim.addr, align 4 + %mul17 = mul nsw i32 %30, %31 + %32 = load i32, i32* %j, align 4 + %add18 = add nsw i32 %mul17, %32 + %idxprom19 = sext i32 %add18 to i64 + %arrayidx20 = getelementptr inbounds float, float* %29, i64 %idxprom19 + store float %28, float* %arrayidx20, align 4 + br label %for.inc21 + +for.inc21: ; preds = %for.end + %33 = load i32, i32* %j, align 4 + %inc22 = add nsw i32 %33, 1 + store i32 %inc22, i32* %j, align 4 + br label %for.cond2 + +for.end23: ; preds = %for.cond2 + br label %for.inc24 + +for.inc24: ; preds = %for.end23 + %34 = load i32, i32* %i, align 4 + %inc25 = add nsw i32 %34, 1 + store i32 %inc25, i32* %i, align 4 + br label %for.cond + +for.end26: ; preds = %for.cond + store i32 0, i32* %i, align 4 + br label %for.cond27 + +for.cond27: ; preds = %for.inc60, %for.end26 + %35 = load i32, i32* %i, align 4 + %36 = load i32, i32* %matrix_dim.addr, align 4 + %cmp28 = icmp slt i32 %35, %36 + br i1 %cmp28, label %for.body29, label %for.end62 + +for.body29: ; preds = %for.cond27 + store i32 0, i32* %j, align 4 + br label %for.cond30 + +for.cond30: ; preds = %for.inc57, %for.body29 + %37 = load i32, i32* %j, align 4 + %38 = load i32, i32* %matrix_dim.addr, align 4 + %cmp31 = icmp slt i32 %37, %38 + br i1 %cmp31, label %for.body32, label %for.end59 + +for.body32: ; preds = %for.cond30 + %39 = load float*, float** %m.addr, align 8 + %40 = load i32, i32* %i, align 4 + %41 = load i32, i32* %matrix_dim.addr, align 4 + %mul33 = mul nsw i32 %40, %41 + %42 = load i32, i32* %j, align 4 + %add34 = add nsw i32 %mul33, %42 + %idxprom35 = sext i32 %add34 to i64 + %arrayidx36 = getelementptr inbounds float, float* %39, i64 %idxprom35 + %43 = load float, float* %arrayidx36, align 4 + %44 = load 
float*, float** %tmp, align 8 + %45 = load i32, i32* %i, align 4 + %46 = load i32, i32* %matrix_dim.addr, align 4 + %mul37 = mul nsw i32 %45, %46 + %47 = load i32, i32* %j, align 4 + %add38 = add nsw i32 %mul37, %47 + %idxprom39 = sext i32 %add38 to i64 + %arrayidx40 = getelementptr inbounds float, float* %44, i64 %idxprom39 + %48 = load float, float* %arrayidx40, align 4 + %sub = fsub float %43, %48 + %call41 = call float @_ZSt4fabsf(float %sub) + %conv42 = fpext float %call41 to double + %cmp43 = fcmp ogt double %conv42, 1.000000e-04 + br i1 %cmp43, label %if.then44, label %if.end56 + +if.then44: ; preds = %for.body32 + %49 = load i32, i32* %i, align 4 + %50 = load i32, i32* %j, align 4 + %51 = load float*, float** %m.addr, align 8 + %52 = load i32, i32* %i, align 4 + %53 = load i32, i32* %matrix_dim.addr, align 4 + %mul45 = mul nsw i32 %52, %53 + %54 = load i32, i32* %j, align 4 + %add46 = add nsw i32 %mul45, %54 + %idxprom47 = sext i32 %add46 to i64 + %arrayidx48 = getelementptr inbounds float, float* %51, i64 %idxprom47 + %55 = load float, float* %arrayidx48, align 4 + %conv49 = fpext float %55 to double + %56 = load float*, float** %tmp, align 8 + %57 = load i32, i32* %i, align 4 + %58 = load i32, i32* %matrix_dim.addr, align 4 + %mul50 = mul nsw i32 %57, %58 + %59 = load i32, i32* %j, align 4 + %add51 = add nsw i32 %mul50, %59 + %idxprom52 = sext i32 %add51 to i64 + %arrayidx53 = getelementptr inbounds float, float* %56, i64 %idxprom52 + %60 = load float, float* %arrayidx53, align 4 + %conv54 = fpext float %60 to double + %call55 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([35 x i8], [35 x i8]* @.str.3, i64 0, i64 0), i32 %49, i32 %50, double %conv49, double %conv54) + br label %if.end56 + +if.end56: ; preds = %if.then44, %for.body32 + br label %for.inc57 + +for.inc57: ; preds = %if.end56 + %61 = load i32, i32* %j, align 4 + %inc58 = add nsw i32 %61, 1 + store i32 %inc58, i32* %j, align 4 + br label %for.cond30 + +for.end59: ; preds = %for.cond30 + br label %for.inc60 + +for.inc60: ; preds = %for.end59 + %62 = load i32, i32* %i, align 4 + %inc61 = add nsw i32 %62, 1 + store i32 %inc61, i32* %i, align 4 + br label %for.cond27 + +for.end62: ; preds = %for.cond27 + %call63 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str.4, i64 0, i64 0)) + %63 = load float*, float** %tmp, align 8 + %64 = bitcast float* %63 to i8* + call void @free(i8* %64) #5 + ret i32 0 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local float @_ZSt4fabsf(float %__x) #0 comdat { +entry: + %__x.addr = alloca float, align 4 + store float %__x, float* %__x.addr, align 4 + %0 = load float, float* %__x.addr, align 4 + %1 = call float @llvm.fabs.f32(float %0) + ret float %1 +} + +declare dso_local i32 @printf(i8*, ...) 
#4 + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local void @matrix_duplicate(float* %src, float** %dst, i32 %matrix_dim) #0 { +entry: + %src.addr = alloca float*, align 8 + %dst.addr = alloca float**, align 8 + %matrix_dim.addr = alloca i32, align 4 + %s = alloca i32, align 4 + %p = alloca float*, align 8 + store float* %src, float** %src.addr, align 8 + store float** %dst, float*** %dst.addr, align 8 + store i32 %matrix_dim, i32* %matrix_dim.addr, align 4 + %0 = load i32, i32* %matrix_dim.addr, align 4 + %1 = load i32, i32* %matrix_dim.addr, align 4 + %mul = mul nsw i32 %0, %1 + %conv = sext i32 %mul to i64 + %mul1 = mul i64 %conv, 4 + %conv2 = trunc i64 %mul1 to i32 + store i32 %conv2, i32* %s, align 4 + %2 = load i32, i32* %s, align 4 + %conv3 = sext i32 %2 to i64 + %call = call noalias i8* @malloc(i64 %conv3) #5 + %3 = bitcast i8* %call to float* + store float* %3, float** %p, align 8 + %4 = load float*, float** %p, align 8 + %5 = bitcast float* %4 to i8* + %6 = load float*, float** %src.addr, align 8 + %7 = bitcast float* %6 to i8* + %8 = load i32, i32* %s, align 4 + %conv4 = sext i32 %8 to i64 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %5, i8* align 4 %7, i64 %conv4, i1 false) + %9 = load float*, float** %p, align 8 + %10 = load float**, float*** %dst.addr, align 8 + store float* %9, float** %10, align 8 + ret void +} + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #1 + +; Function Attrs: noinline optnone uwtable +define dso_local void @print_matrix(float* %m, i32 %matrix_dim) #3 { +entry: + %m.addr = alloca float*, align 8 + %matrix_dim.addr = alloca i32, align 4 + %i = alloca i32, align 4 + %j = alloca i32, align 4 + store float* %m, float** %m.addr, align 8 + store i32 %matrix_dim, i32* %matrix_dim.addr, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc5, 
%entry + %0 = load i32, i32* %i, align 4 + %1 = load i32, i32* %matrix_dim.addr, align 4 + %cmp = icmp slt i32 %0, %1 + br i1 %cmp, label %for.body, label %for.end7 + +for.body: ; preds = %for.cond + store i32 0, i32* %j, align 4 + br label %for.cond1 + +for.cond1: ; preds = %for.inc, %for.body + %2 = load i32, i32* %j, align 4 + %3 = load i32, i32* %matrix_dim.addr, align 4 + %cmp2 = icmp slt i32 %2, %3 + br i1 %cmp2, label %for.body3, label %for.end + +for.body3: ; preds = %for.cond1 + %4 = load float*, float** %m.addr, align 8 + %5 = load i32, i32* %i, align 4 + %6 = load i32, i32* %matrix_dim.addr, align 4 + %mul = mul nsw i32 %5, %6 + %7 = load i32, i32* %j, align 4 + %add = add nsw i32 %mul, %7 + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds float, float* %4, i64 %idxprom + %8 = load float, float* %arrayidx, align 4 + %conv = fpext float %8 to double + %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str.2, i64 0, i64 0), double %conv) + br label %for.inc + +for.inc: ; preds = %for.body3 + %9 = load i32, i32* %j, align 4 + %inc = add nsw i32 %9, 1 + store i32 %inc, i32* %j, align 4 + br label %for.cond1 + +for.end: ; preds = %for.cond1 + %call4 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.5, i64 0, i64 0)) + br label %for.inc5 + +for.inc5: ; preds = %for.end + %10 = load i32, i32* %i, align 4 + %inc6 = add nsw i32 %10, 1 + store i32 %inc6, i32* %i, align 4 + br label %for.cond + +for.end7: ; preds = %for.cond + ret void +} + +; Function Attrs: noinline optnone uwtable +define dso_local i32 @create_matrix(float** %mp, i32 %size) #3 { +entry: + %retval = alloca i32, align 4 + %mp.addr = alloca float**, align 8 + %size.addr = alloca i32, align 4 + %m = alloca float*, align 8 + %i = alloca i32, align 4 + %j = alloca i32, align 4 + %lamda = alloca float, align 4 + %saved_stack = alloca i8*, align 8 + %__vla_expr0 = alloca i64, align 8 + %coe_i = alloca float, align 4 + %cleanup.dest.slot = alloca i32, align 4 + store float** %mp, float*** %mp.addr, align 8 + store i32 %size, i32* %size.addr, align 4 + store float 0xBF50624DE0000000, float* %lamda, align 4 + %0 = load i32, i32* %size.addr, align 4 + %mul = mul nsw i32 2, %0 + %sub = sub nsw i32 %mul, 1 + %1 = zext i32 %sub to i64 + %2 = call i8* @llvm.stacksave() + store i8* %2, i8** %saved_stack, align 8 + %vla = alloca float, i64 %1, align 16 + store i64 %1, i64* %__vla_expr0, align 8 + store float 0.000000e+00, float* %coe_i, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %3 = load i32, i32* %i, align 4 + %4 = load i32, i32* %size.addr, align 4 + %cmp = icmp slt i32 %3, %4 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %5 = load float, float* %lamda, align 4 + %6 = load i32, i32* %i, align 4 + %conv = sitofp i32 %6 to float + %mul1 = fmul float %5, %conv + %call = call float @_ZSt3expf(float %mul1) + %mul2 = fmul float 1.000000e+01, %call + store float %mul2, float* %coe_i, align 4 + %7 = load i32, i32* %size.addr, align 4 + %sub3 = sub nsw i32 %7, 1 + %8 = load i32, i32* %i, align 4 + %add = add nsw i32 %sub3, %8 + store i32 %add, i32* %j, align 4 + %9 
= load float, float* %coe_i, align 4 + %10 = load i32, i32* %j, align 4 + %idxprom = sext i32 %10 to i64 + %arrayidx = getelementptr inbounds float, float* %vla, i64 %idxprom + store float %9, float* %arrayidx, align 4 + %11 = load i32, i32* %size.addr, align 4 + %sub4 = sub nsw i32 %11, 1 + %12 = load i32, i32* %i, align 4 + %sub5 = sub nsw i32 %sub4, %12 + store i32 %sub5, i32* %j, align 4 + %13 = load float, float* %coe_i, align 4 + %14 = load i32, i32* %j, align 4 + %idxprom6 = sext i32 %14 to i64 + %arrayidx7 = getelementptr inbounds float, float* %vla, i64 %idxprom6 + store float %13, float* %arrayidx7, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %15 = load i32, i32* %i, align 4 + %inc = add nsw i32 %15, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %16 = load i32, i32* %size.addr, align 4 + %conv8 = sext i32 %16 to i64 + %mul9 = mul i64 4, %conv8 + %17 = load i32, i32* %size.addr, align 4 + %conv10 = sext i32 %17 to i64 + %mul11 = mul i64 %mul9, %conv10 + %call12 = call noalias i8* @malloc(i64 %mul11) #5 + %18 = bitcast i8* %call12 to float* + store float* %18, float** %m, align 8 + %19 = load float*, float** %m, align 8 + %cmp13 = icmp eq float* %19, null + br i1 %cmp13, label %if.then, label %if.end + +if.then: ; preds = %for.end + store i32 1, i32* %retval, align 4 + store i32 1, i32* %cleanup.dest.slot, align 4 + br label %cleanup + +if.end: ; preds = %for.end + store i32 0, i32* %i, align 4 + br label %for.cond14 + +for.cond14: ; preds = %for.inc32, %if.end + %20 = load i32, i32* %i, align 4 + %21 = load i32, i32* %size.addr, align 4 + %cmp15 = icmp slt i32 %20, %21 + br i1 %cmp15, label %for.body16, label %for.end34 + +for.body16: ; preds = %for.cond14 + store i32 0, i32* %j, align 4 + br label %for.cond17 + +for.cond17: ; preds = %for.inc29, %for.body16 + %22 = load i32, i32* %j, align 4 + %23 = load i32, i32* %size.addr, align 4 + %cmp18 = icmp slt i32 %22, %23 + br i1 %cmp18, label 
%for.body19, label %for.end31 + +for.body19: ; preds = %for.cond17 + %24 = load i32, i32* %size.addr, align 4 + %sub20 = sub nsw i32 %24, 1 + %25 = load i32, i32* %i, align 4 + %sub21 = sub nsw i32 %sub20, %25 + %26 = load i32, i32* %j, align 4 + %add22 = add nsw i32 %sub21, %26 + %idxprom23 = sext i32 %add22 to i64 + %arrayidx24 = getelementptr inbounds float, float* %vla, i64 %idxprom23 + %27 = load float, float* %arrayidx24, align 4 + %28 = load float*, float** %m, align 8 + %29 = load i32, i32* %i, align 4 + %30 = load i32, i32* %size.addr, align 4 + %mul25 = mul nsw i32 %29, %30 + %31 = load i32, i32* %j, align 4 + %add26 = add nsw i32 %mul25, %31 + %idxprom27 = sext i32 %add26 to i64 + %arrayidx28 = getelementptr inbounds float, float* %28, i64 %idxprom27 + store float %27, float* %arrayidx28, align 4 + br label %for.inc29 + +for.inc29: ; preds = %for.body19 + %32 = load i32, i32* %j, align 4 + %inc30 = add nsw i32 %32, 1 + store i32 %inc30, i32* %j, align 4 + br label %for.cond17 + +for.end31: ; preds = %for.cond17 + br label %for.inc32 + +for.inc32: ; preds = %for.end31 + %33 = load i32, i32* %i, align 4 + %inc33 = add nsw i32 %33, 1 + store i32 %inc33, i32* %i, align 4 + br label %for.cond14 + +for.end34: ; preds = %for.cond14 + %34 = load float*, float** %m, align 8 + %35 = load float**, float*** %mp.addr, align 8 + store float* %34, float** %35, align 8 + store i32 0, i32* %retval, align 4 + store i32 1, i32* %cleanup.dest.slot, align 4 + br label %cleanup + +cleanup: ; preds = %for.end34, %if.then + %36 = load i8*, i8** %saved_stack, align 8 + call void @llvm.stackrestore(i8* %36) + %37 = load i32, i32* %retval, align 4 + ret i32 %37 +} + +; Function Attrs: nounwind +declare i8* @llvm.stacksave() #5 + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local float @_ZSt3expf(float %__x) #0 comdat { +entry: + %__x.addr = alloca float, align 4 + store float %__x, float* %__x.addr, align 4 + %0 = load float, float* %__x.addr, 
align 4 + %call = call float @expf(float %0) #5 + ret float %call +} + +; Function Attrs: nounwind +declare void @llvm.stackrestore(i8*) #5 + +; Function Attrs: nounwind readnone speculatable willreturn +declare float @llvm.fabs.f32(float) #6 + +; Function Attrs: nounwind +declare dso_local float @expf(float) #2 + +attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind willreturn } +attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { noinline optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" 
"no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #5 = { nounwind } +attributes #6 = { nounwind readnone speculatable willreturn } + +!llvm.module.flags = !{!0, !1} +!llvm.ident = !{!2} + +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"} diff --git a/examples/lud/lud-host-x86_64-unknown-linux-gnu.ll b/examples/lud/lud-host-x86_64-unknown-linux-gnu.ll new file mode 100644 index 0000000..2044338 --- /dev/null +++ b/examples/lud/lud-host-x86_64-unknown-linux-gnu.ll @@ -0,0 +1,326 @@ +; ModuleID = 'lud-host-x86_64-unknown-linux-gnu.bc' +source_filename = "cuda/lud.cu" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.option = type { i8*, i32, i32*, i32 } +%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] } +%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 } +%struct.__stopwatch_t = type { %struct.timeval, %struct.timeval } +%struct.timeval = type { i64, i64 } + +@.str = private unnamed_addr constant [29 x i8] c"WG size of kernel = %d X %d\0A\00", align 1 +@.str.1 = private unnamed_addr constant [8 x i8] c"::vs:i:\00", align 1 +@_ZL12long_options = internal global [4 x %struct.option] [%struct.option { i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str.15, i32 0, i32 0), i32 1, i32* null, i32 105 }, %struct.option { i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str.16, i32 0, i32 0), i32 1, i32* null, i32 115 }, 
%struct.option { i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str.17, i32 0, i32 0), i32 0, i32* null, i32 118 }, %struct.option zeroinitializer], align 16 +@optarg = external dso_local global i8*, align 8 +@_ZL9do_verify = internal global i32 0, align 4 +@.str.2 = private unnamed_addr constant [44 x i8] c"Generate input matrix internally, size =%d\0A\00", align 1 +@stderr = external dso_local global %struct._IO_FILE*, align 8 +@.str.3 = private unnamed_addr constant [16 x i8] c"invalid option\0A\00", align 1 +@.str.4 = private unnamed_addr constant [18 x i8] c"missing argument\0A\00", align 1 +@.str.5 = private unnamed_addr constant [47 x i8] c"Usage: %s [-v] [-s matrix_size|-i input_file]\0A\00", align 1 +@optind = external dso_local global i32, align 4 +@.str.6 = private unnamed_addr constant [29 x i8] c"Reading matrix from file %s\0A\00", align 1 +@.str.7 = private unnamed_addr constant [34 x i8] c"error create matrix from file %s\0A\00", align 1 +@.str.8 = private unnamed_addr constant [36 x i8] c"Creating matrix internally size=%d\0A\00", align 1 +@.str.9 = private unnamed_addr constant [40 x i8] c"error create matrix internally size=%d\0A\00", align 1 +@.str.10 = private unnamed_addr constant [26 x i8] c"No input file specified!\0A\00", align 1 +@.str.11 = private unnamed_addr constant [12 x i8] c"Before LUD\0A\00", align 1 +@.str.12 = private unnamed_addr constant [24 x i8] c"Time consumed(ms): %lf\0A\00", align 1 +@.str.13 = private unnamed_addr constant [11 x i8] c"After LUD\0A\00", align 1 +@.str.14 = private unnamed_addr constant [15 x i8] c">>>Verify<<<<\0A\00", align 1 +@.str.15 = private unnamed_addr constant [6 x i8] c"input\00", align 1 +@.str.16 = private unnamed_addr constant [5 x i8] c"size\00", align 1 +@.str.17 = private unnamed_addr constant [7 x i8] c"verify\00", align 1 + +; Function Attrs: noinline norecurse optnone uwtable +define dso_local i32 @main(i32 %argc, i8** %argv) #0 { +entry: + %retval = alloca i32, align 4 + %argc.addr = 
alloca i32, align 4 + %argv.addr = alloca i8**, align 8 + %matrix_dim = alloca i32, align 4 + %opt = alloca i32, align 4 + %option_index = alloca i32, align 4 + %ret = alloca i32, align 4 + %input_file = alloca i8*, align 8 + %m = alloca float*, align 8 + %d_m = alloca float*, align 8 + %mm = alloca float*, align 8 + %sw = alloca %struct.__stopwatch_t, align 8 + store i32 0, i32* %retval, align 4 + store i32 %argc, i32* %argc.addr, align 4 + store i8** %argv, i8*** %argv.addr, align 8 + %call = call i32 @cudaSetDevice(i32 0) + %call1 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([29 x i8], [29 x i8]* @.str, i64 0, i64 0), i32 16, i32 16) + store i32 32, i32* %matrix_dim, align 4 + store i32 0, i32* %option_index, align 4 + store i8* null, i8** %input_file, align 8 + br label %while.cond + +while.cond: ; preds = %sw.epilog, %entry + %0 = load i32, i32* %argc.addr, align 4 + %1 = load i8**, i8*** %argv.addr, align 8 + %call2 = call i32 @getopt_long(i32 %0, i8** %1, i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str.1, i64 0, i64 0), %struct.option* getelementptr inbounds ([4 x %struct.option], [4 x %struct.option]* @_ZL12long_options, i64 0, i64 0), i32* %option_index) #5 + store i32 %call2, i32* %opt, align 4 + %cmp = icmp ne i32 %call2, -1 + br i1 %cmp, label %while.body, label %while.end + +while.body: ; preds = %while.cond + %2 = load i32, i32* %opt, align 4 + switch i32 %2, label %sw.default [ + i32 105, label %sw.bb + i32 118, label %sw.bb3 + i32 115, label %sw.bb4 + i32 63, label %sw.bb7 + i32 58, label %sw.bb9 + ] + +sw.bb: ; preds = %while.body + %3 = load i8*, i8** @optarg, align 8 + store i8* %3, i8** %input_file, align 8 + br label %sw.epilog + +sw.bb3: ; preds = %while.body + store i32 1, i32* @_ZL9do_verify, align 4 + br label %sw.epilog + +sw.bb4: ; preds = %while.body + %4 = load i8*, i8** @optarg, align 8 + %call5 = call i32 @atoi(i8* %4) #6 + store i32 %call5, i32* %matrix_dim, align 4 + %5 = load i32, i32* %matrix_dim, align 4 + 
%call6 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([44 x i8], [44 x i8]* @.str.2, i64 0, i64 0), i32 %5) + br label %sw.epilog + +sw.bb7: ; preds = %while.body + %6 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %call8 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %6, i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0)) + br label %sw.epilog + +sw.bb9: ; preds = %while.body + %7 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %call10 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %7, i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str.4, i64 0, i64 0)) + br label %sw.epilog + +sw.default: ; preds = %while.body + %8 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %9 = load i8**, i8*** %argv.addr, align 8 + %arrayidx = getelementptr inbounds i8*, i8** %9, i64 0 + %10 = load i8*, i8** %arrayidx, align 8 + %call11 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %8, i8* getelementptr inbounds ([47 x i8], [47 x i8]* @.str.5, i64 0, i64 0), i8* %10) + call void @exit(i32 1) #7 + unreachable + +sw.epilog: ; preds = %sw.bb9, %sw.bb7, %sw.bb4, %sw.bb3, %sw.bb + br label %while.cond + +while.end: ; preds = %while.cond + %11 = load i32, i32* @optind, align 4 + %12 = load i32, i32* %argc.addr, align 4 + %cmp12 = icmp slt i32 %11, %12 + br i1 %cmp12, label %if.then, label %lor.lhs.false + +lor.lhs.false: ; preds = %while.end + %13 = load i32, i32* @optind, align 4 + %cmp13 = icmp eq i32 %13, 1 + br i1 %cmp13, label %if.then, label %if.end + +if.then: ; preds = %lor.lhs.false, %while.end + %14 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %15 = load i8**, i8*** %argv.addr, align 8 + %arrayidx14 = getelementptr inbounds i8*, i8** %15, i64 0 + %16 = load i8*, i8** %arrayidx14, align 8 + %call15 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %14, i8* getelementptr inbounds ([47 x i8], [47 x i8]* @.str.5, i64 0, i64 0), i8* %16) + call void @exit(i32 1) #7 + unreachable + +if.end: ; preds = %lor.lhs.false + %17 = load i8*, i8** %input_file, align 8 + %tobool = icmp ne i8* %17, null + br i1 %tobool, label %if.then16, label %if.else + +if.then16: ; preds = %if.end + %18 = load i8*, i8** %input_file, align 8 + %call17 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([29 x i8], [29 x i8]* @.str.6, i64 0, i64 0), i8* %18) + %19 = load i8*, i8** %input_file, align 8 + %call18 = call i32 @create_matrix_from_file(float** %m, i8* %19, i32* %matrix_dim) + store i32 %call18, i32* %ret, align 4 + %20 = load i32, i32* %ret, align 4 + %cmp19 = icmp ne i32 %20, 0 + br i1 %cmp19, label %if.then20, label %if.end22 + +if.then20: ; preds = %if.then16 + store float* null, float** %m, align 8 + %21 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %22 = load i8*, i8** %input_file, align 8 + %call21 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %21, i8* getelementptr inbounds ([34 x i8], [34 x i8]* @.str.7, i64 0, i64 0), i8* %22) + call void @exit(i32 1) #7 + unreachable + +if.end22: ; preds = %if.then16 + br label %if.end34 + +if.else: ; preds = %if.end + %23 = load i32, i32* %matrix_dim, align 4 + %tobool23 = icmp ne i32 %23, 0 + br i1 %tobool23, label %if.then24, label %if.else31 + +if.then24: ; preds = %if.else + %24 = load i32, i32* %matrix_dim, align 4 + %call25 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.str.8, i64 0, i64 0), i32 %24) + %25 = load i32, i32* %matrix_dim, align 4 + %call26 = call i32 @create_matrix(float** %m, i32 %25) + store i32 %call26, i32* %ret, align 4 + %26 = load i32, i32* %ret, align 4 + %cmp27 = icmp ne i32 %26, 0 + br i1 %cmp27, label %if.then28, label %if.end30 + +if.then28: ; preds = %if.then24 + store float* null, float** %m, align 8 + %27 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %28 = load i32, i32* %matrix_dim, align 4 + %call29 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %27, i8* getelementptr inbounds ([40 x i8], [40 x i8]* @.str.9, i64 0, i64 0), i32 %28) + call void @exit(i32 1) #7 + unreachable + +if.end30: ; preds = %if.then24 + br label %if.end33 + +if.else31: ; preds = %if.else + %call32 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([26 x i8], [26 x i8]* @.str.10, i64 0, i64 0)) + call void @exit(i32 1) #7 + unreachable + +if.end33: ; preds = %if.end30 + br label %if.end34 + +if.end34: ; preds = %if.end33, %if.end22 + %29 = load i32, i32* @_ZL9do_verify, align 4 + %tobool35 = icmp ne i32 %29, 0 + br i1 %tobool35, label %if.then36, label %if.end38 + +if.then36: ; preds = %if.end34 + %call37 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([12 x i8], [12 x i8]* @.str.11, i64 0, i64 0)) + %30 = load float*, float** %m, align 8 + %31 = load i32, i32* %matrix_dim, align 4 + call void @matrix_duplicate(float* %30, float** %mm, i32 %31) + br label %if.end38 + +if.end38: ; preds = %if.then36, %if.end34 + %32 = bitcast float** %d_m to i8** + %33 = load i32, i32* %matrix_dim, align 4 + %34 = load i32, i32* %matrix_dim, align 4 + %mul = mul nsw i32 %33, %34 + %conv = sext i32 %mul to i64 + %mul39 = mul i64 %conv, 4 + %call40 = call i32 @cudaMalloc(i8** %32, i64 %mul39) + call void @stopwatch_start(%struct.__stopwatch_t* %sw) + %35 = load float*, float** %d_m, align 8 + %36 = bitcast float* %35 to i8* + %37 = load float*, float** %m, align 8 + %38 = bitcast float* %37 to i8* + %39 = load i32, i32* %matrix_dim, align 4 + %40 = load i32, i32* %matrix_dim, align 4 + %mul41 = mul nsw i32 %39, %40 + %conv42 = sext i32 %mul41 to i64 + %mul43 = mul i64 %conv42, 4 + %call44 = call i32 @cudaMemcpy(i8* %36, i8* %38, i64 %mul43, i32 1) + %41 = load float*, float** %d_m, align 8 + %42 = load i32, i32* %matrix_dim, align 4 + call void @_Z8lud_cudaPfi(float* %41, i32 %42) + %43 = load float*, float** %m, align 8 + %44 = bitcast float* %43 to i8* + %45 = load float*, float** %d_m, align 8 + %46 = bitcast float* %45 to i8* + %47 = load i32, i32* %matrix_dim, align 4 + %48 = load i32, i32* %matrix_dim, align 4 + %mul45 = mul nsw i32 %47, %48 + %conv46 = sext i32 %mul45 to i64 + %mul47 = mul i64 %conv46, 4 + %call48 = call i32 @cudaMemcpy(i8* %44, i8* %46, i64 %mul47, i32 2) + call void @stopwatch_stop(%struct.__stopwatch_t* %sw) + %call49 = call double @get_interval_by_sec(%struct.__stopwatch_t* %sw) + %mul50 = fmul contract double 1.000000e+03, %call49 + %call51 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.12, i64 0, i64 0), double %mul50) + %49 = load float*, float** %d_m, align 8 + %50 = bitcast float* %49 to i8* + %call52 = call i32 @cudaFree(i8* %50) + %51 = load i32, i32* @_ZL9do_verify, align 4 + %tobool53 = icmp ne i32 %51, 0 + br i1 %tobool53, label %if.then54, label %if.end58 + +if.then54: ; preds = %if.end38 + %call55 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str.13, i64 0, i64 0)) + %call56 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.14, i64 0, i64 0)) + %52 = load float*, float** %mm, align 8 + %53 = load float*, float** %m, align 8 + %54 = load i32, i32* %matrix_dim, align 4 + %call57 = call i32 @lud_verify(float* %52, float* %53, i32 %54) + %55 = load float*, float** %mm, align 8 + %56 = bitcast float* %55 to i8* + call void @free(i8* %56) #5 + br label %if.end58 + +if.end58: ; preds = %if.then54, %if.end38 + %57 = load float*, float** %m, align 8 + %58 = bitcast float* %57 to i8* + call void @free(i8* %58) #5 + ret i32 0 +} + +declare dso_local i32 @cudaSetDevice(i32) #1 + +declare dso_local i32 @printf(i8*, ...) #1 + +; Function Attrs: nounwind +declare dso_local i32 @getopt_long(i32, i8**, i8*, %struct.option*, i32*) #2 + +; Function Attrs: nounwind readonly +declare dso_local i32 @atoi(i8*) #3 + +declare dso_local i32 @fprintf(%struct._IO_FILE*, i8*, ...) 
#1 + +; Function Attrs: noreturn nounwind +declare dso_local void @exit(i32) #4 + +declare dso_local i32 @create_matrix_from_file(float**, i8*, i32*) #1 + +declare dso_local i32 @create_matrix(float**, i32) #1 + +declare dso_local void @matrix_duplicate(float*, float**, i32) #1 + +declare dso_local i32 @cudaMalloc(i8**, i64) #1 + +declare dso_local void @stopwatch_start(%struct.__stopwatch_t*) #1 + +declare dso_local i32 @cudaMemcpy(i8*, i8*, i64, i32) #1 + +declare dso_local void @_Z8lud_cudaPfi(float*, i32) #1 + +declare dso_local void @stopwatch_stop(%struct.__stopwatch_t*) #1 + +declare dso_local double @get_interval_by_sec(%struct.__stopwatch_t*) #1 + +declare dso_local i32 @cudaFree(i8*) #1 + +declare dso_local i32 @lud_verify(float*, float*, i32) #1 + +; Function Attrs: nounwind +declare dso_local void @free(i8*) #2 + +attributes #0 = { noinline norecurse optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" 
"no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #5 = { nounwind } +attributes #6 = { nounwind readonly } +attributes #7 = { noreturn nounwind } + +!llvm.module.flags = !{!0, !1} +!llvm.ident = !{!2} + +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"} diff --git a/examples/lud/lud_kernel-cuda-nvptx64-nvidia-cuda-sm_50.ll b/examples/lud/lud_kernel-cuda-nvptx64-nvidia-cuda-sm_50.ll new file mode 100644 index 0000000..9bdbe7d --- /dev/null +++ b/examples/lud/lud_kernel-cuda-nvptx64-nvidia-cuda-sm_50.ll @@ -0,0 +1,1001 @@ +; ModuleID = 'lud_kernel-cuda-nvptx64-nvidia-cuda-sm_50.bc' +source_filename = "cuda/lud_kernel.cu" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +%struct.__cuda_builtin_threadIdx_t = type { 
i8 } +%struct.__cuda_builtin_blockIdx_t = type { i8 } +%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 } + +$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any + +$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any + +$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv = comdat any + +$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv = comdat any + +@_ZZ12lud_diagonalPfiiE6shadow = internal addrspace(3) global [16 x [16 x float]] undef, align 4 +@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1 +@_ZZ13lud_perimeterPfiiE3dia = internal addrspace(3) global [16 x [16 x float]] undef, align 4 +@_ZZ13lud_perimeterPfiiE8peri_row = internal addrspace(3) global [16 x [16 x float]] undef, align 4 +@_ZZ13lud_perimeterPfiiE8peri_col = internal addrspace(3) global [16 x [16 x float]] undef, align 4 +@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1 +@_ZZ12lud_internalPfiiE8peri_row = internal addrspace(3) global [16 x [16 x float]] undef, align 4 +@_ZZ12lud_internalPfiiE8peri_col = internal addrspace(3) global [16 x [16 x float]] undef, align 4 + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 { +entry: + %p.addr = alloca i8**, align 8 + %s.addr = alloca i64, align 8 + store i8** %p, i8*** %p.addr, align 8 + store i64 %s, i64* %s.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 { +entry: + %p.addr = alloca %struct.cudaFuncAttributes*, align 8 + %c.addr = alloca i8*, align 8 + store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8 + store i8* %c, i8** %c.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 
@cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 { +entry: + %value.addr = alloca i32*, align 8 + %attr.addr = alloca i32, align 4 + %device.addr = alloca i32, align 4 + store i32* %value, i32** %value.addr, align 8 + store i32 %attr, i32* %attr.addr, align 4 + store i32 %device, i32* %device.addr, align 4 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaGetDevice(i32* %device) #0 { +entry: + %device.addr = alloca i32*, align 8 + store i32* %device, i32** %device.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 { +entry: + %numBlocks.addr = alloca i32*, align 8 + %func.addr = alloca i8*, align 8 + %blockSize.addr = alloca i32, align 4 + %dynamicSmemSize.addr = alloca i64, align 8 + store i32* %numBlocks, i32** %numBlocks.addr, align 8 + store i8* %func, i8** %func.addr, align 8 + store i32 %blockSize, i32* %blockSize.addr, align 4 + store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 { +entry: + %numBlocks.addr = alloca i32*, align 8 + %func.addr = alloca i8*, align 8 + %blockSize.addr = alloca i32, align 4 + %dynamicSmemSize.addr = alloca i64, align 8 + %flags.addr = alloca i32, align 4 + store i32* %numBlocks, i32** %numBlocks.addr, align 8 + store i8* %func, i8** %func.addr, align 8 + store i32 %blockSize, i32* %blockSize.addr, align 4 + store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8 + store i32 %flags, i32* %flags.addr, align 4 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define dso_local void 
@_Z12lud_diagonalPfii(float* %m, i32 %matrix_dim, i32 %offset) #0 { +entry: + %m.addr = alloca float*, align 8 + %matrix_dim.addr = alloca i32, align 4 + %offset.addr = alloca i32, align 4 + %i = alloca i32, align 4 + %j = alloca i32, align 4 + %array_offset = alloca i32, align 4 + store float* %m, float** %m.addr, align 8 + store i32 %matrix_dim, i32* %matrix_dim.addr, align 4 + store i32 %offset, i32* %offset.addr, align 4 + %0 = load i32, i32* %offset.addr, align 4 + %1 = load i32, i32* %matrix_dim.addr, align 4 + %mul = mul nsw i32 %0, %1 + %2 = load i32, i32* %offset.addr, align 4 + %add = add nsw i32 %mul, %2 + store i32 %add, i32* %array_offset, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %3 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %3, 16 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %4 = load float*, float** %m.addr, align 8 + %5 = load i32, i32* %array_offset, align 4 + %call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + %add1 = add i32 %5, %call + %idxprom = zext i32 %add1 to i64 + %arrayidx = getelementptr inbounds float, float* %4, i64 %idxprom + %6 = load float, float* %arrayidx, align 4 + %7 = load i32, i32* %i, align 4 + %idxprom2 = sext i32 %7 to i64 + %arrayidx3 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ12lud_diagonalPfiiE6shadow to [16 x [16 x float]]*), i64 0, i64 %idxprom2 + %call4 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + %idxprom5 = zext i32 %call4 to i64 + %arrayidx6 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx3, i64 0, i64 %idxprom5 + store float %6, float* %arrayidx6, align 4 + %8 = load i32, i32* %matrix_dim.addr, align 4 + %9 = load i32, i32* %array_offset, align 4 + %add7 = add nsw i32 %9, %8 + store i32 %add7, i32* %array_offset, align 4 + br label %for.inc + +for.inc: ; preds = 
%for.body + %10 = load i32, i32* %i, align 4 + %inc = add nsw i32 %10, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + call void @llvm.nvvm.barrier0() + store i32 0, i32* %i, align 4 + br label %for.cond8 + +for.cond8: ; preds = %for.inc72, %for.end + %11 = load i32, i32* %i, align 4 + %cmp9 = icmp slt i32 %11, 15 + br i1 %cmp9, label %for.body10, label %for.end74 + +for.body10: ; preds = %for.cond8 + %call11 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + %12 = load i32, i32* %i, align 4 + %cmp12 = icmp ugt i32 %call11, %12 + br i1 %cmp12, label %if.then, label %if.end + +if.then: ; preds = %for.body10 + store i32 0, i32* %j, align 4 + br label %for.cond13 + +for.cond13: ; preds = %for.inc31, %if.then + %13 = load i32, i32* %j, align 4 + %14 = load i32, i32* %i, align 4 + %cmp14 = icmp slt i32 %13, %14 + br i1 %cmp14, label %for.body15, label %for.end33 + +for.body15: ; preds = %for.cond13 + %call16 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + %idxprom17 = zext i32 %call16 to i64 + %arrayidx18 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ12lud_diagonalPfiiE6shadow to [16 x [16 x float]]*), i64 0, i64 %idxprom17 + %15 = load i32, i32* %j, align 4 + %idxprom19 = sext i32 %15 to i64 + %arrayidx20 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx18, i64 0, i64 %idxprom19 + %16 = load float, float* %arrayidx20, align 4 + %17 = load i32, i32* %j, align 4 + %idxprom21 = sext i32 %17 to i64 + %arrayidx22 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ12lud_diagonalPfiiE6shadow to [16 x [16 x float]]*), i64 0, i64 %idxprom21 + %18 = load i32, i32* %i, align 4 + %idxprom23 = sext i32 %18 to i64 + %arrayidx24 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx22, i64 0, i64 %idxprom23 + %19 = load float, float* 
%arrayidx24, align 4 + %mul25 = fmul contract float %16, %19 + %call26 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + %idxprom27 = zext i32 %call26 to i64 + %arrayidx28 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ12lud_diagonalPfiiE6shadow to [16 x [16 x float]]*), i64 0, i64 %idxprom27 + %20 = load i32, i32* %i, align 4 + %idxprom29 = sext i32 %20 to i64 + %arrayidx30 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx28, i64 0, i64 %idxprom29 + %21 = load float, float* %arrayidx30, align 4 + %sub = fsub contract float %21, %mul25 + store float %sub, float* %arrayidx30, align 4 + br label %for.inc31 + +for.inc31: ; preds = %for.body15 + %22 = load i32, i32* %j, align 4 + %inc32 = add nsw i32 %22, 1 + store i32 %inc32, i32* %j, align 4 + br label %for.cond13 + +for.end33: ; preds = %for.cond13 + %23 = load i32, i32* %i, align 4 + %idxprom34 = sext i32 %23 to i64 + %arrayidx35 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ12lud_diagonalPfiiE6shadow to [16 x [16 x float]]*), i64 0, i64 %idxprom34 + %24 = load i32, i32* %i, align 4 + %idxprom36 = sext i32 %24 to i64 + %arrayidx37 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx35, i64 0, i64 %idxprom36 + %25 = load float, float* %arrayidx37, align 4 + %call38 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + %idxprom39 = zext i32 %call38 to i64 + %arrayidx40 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ12lud_diagonalPfiiE6shadow to [16 x [16 x float]]*), i64 0, i64 %idxprom39 + %26 = load i32, i32* %i, align 4 + %idxprom41 = sext i32 %26 to i64 + %arrayidx42 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx40, i64 0, i64 %idxprom41 + %27 = load float, float* %arrayidx42, align 4 + %div = fdiv float %27, %25 + 
store float %div, float* %arrayidx42, align 4 + br label %if.end + +if.end: ; preds = %for.end33, %for.body10 + call void @llvm.nvvm.barrier0() + %call43 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + %28 = load i32, i32* %i, align 4 + %cmp44 = icmp ugt i32 %call43, %28 + br i1 %cmp44, label %if.then45, label %if.end71 + +if.then45: ; preds = %if.end + store i32 0, i32* %j, align 4 + br label %for.cond46 + +for.cond46: ; preds = %for.inc68, %if.then45 + %29 = load i32, i32* %j, align 4 + %30 = load i32, i32* %i, align 4 + %add47 = add nsw i32 %30, 1 + %cmp48 = icmp slt i32 %29, %add47 + br i1 %cmp48, label %for.body49, label %for.end70 + +for.body49: ; preds = %for.cond46 + %31 = load i32, i32* %i, align 4 + %add50 = add nsw i32 %31, 1 + %idxprom51 = sext i32 %add50 to i64 + %arrayidx52 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ12lud_diagonalPfiiE6shadow to [16 x [16 x float]]*), i64 0, i64 %idxprom51 + %32 = load i32, i32* %j, align 4 + %idxprom53 = sext i32 %32 to i64 + %arrayidx54 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx52, i64 0, i64 %idxprom53 + %33 = load float, float* %arrayidx54, align 4 + %34 = load i32, i32* %j, align 4 + %idxprom55 = sext i32 %34 to i64 + %arrayidx56 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ12lud_diagonalPfiiE6shadow to [16 x [16 x float]]*), i64 0, i64 %idxprom55 + %call57 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + %idxprom58 = zext i32 %call57 to i64 + %arrayidx59 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx56, i64 0, i64 %idxprom58 + %35 = load float, float* %arrayidx59, align 4 + %mul60 = fmul contract float %33, %35 + %36 = load i32, i32* %i, align 4 + %add61 = add nsw i32 %36, 1 + %idxprom62 = sext i32 %add61 to i64 + %arrayidx63 = getelementptr inbounds [16 x [16 x float]], [16 x [16 
x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ12lud_diagonalPfiiE6shadow to [16 x [16 x float]]*), i64 0, i64 %idxprom62 + %call64 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + %idxprom65 = zext i32 %call64 to i64 + %arrayidx66 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx63, i64 0, i64 %idxprom65 + %37 = load float, float* %arrayidx66, align 4 + %sub67 = fsub contract float %37, %mul60 + store float %sub67, float* %arrayidx66, align 4 + br label %for.inc68 + +for.inc68: ; preds = %for.body49 + %38 = load i32, i32* %j, align 4 + %inc69 = add nsw i32 %38, 1 + store i32 %inc69, i32* %j, align 4 + br label %for.cond46 + +for.end70: ; preds = %for.cond46 + br label %if.end71 + +if.end71: ; preds = %for.end70, %if.end + call void @llvm.nvvm.barrier0() + br label %for.inc72 + +for.inc72: ; preds = %if.end71 + %39 = load i32, i32* %i, align 4 + %inc73 = add nsw i32 %39, 1 + store i32 %inc73, i32* %i, align 4 + br label %for.cond8 + +for.end74: ; preds = %for.cond8 + %40 = load i32, i32* %offset.addr, align 4 + %add75 = add nsw i32 %40, 1 + %41 = load i32, i32* %matrix_dim.addr, align 4 + %mul76 = mul nsw i32 %add75, %41 + %42 = load i32, i32* %offset.addr, align 4 + %add77 = add nsw i32 %mul76, %42 + store i32 %add77, i32* %array_offset, align 4 + store i32 1, i32* %i, align 4 + br label %for.cond78 + +for.cond78: ; preds = %for.inc91, %for.end74 + %43 = load i32, i32* %i, align 4 + %cmp79 = icmp slt i32 %43, 16 + br i1 %cmp79, label %for.body80, label %for.end93 + +for.body80: ; preds = %for.cond78 + %44 = load i32, i32* %i, align 4 + %idxprom81 = sext i32 %44 to i64 + %arrayidx82 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ12lud_diagonalPfiiE6shadow to [16 x [16 x float]]*), i64 0, i64 %idxprom81 + %call83 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + %idxprom84 = zext i32 %call83 to i64 + %arrayidx85 = 
getelementptr inbounds [16 x float], [16 x float]* %arrayidx82, i64 0, i64 %idxprom84 + %45 = load float, float* %arrayidx85, align 4 + %46 = load float*, float** %m.addr, align 8 + %47 = load i32, i32* %array_offset, align 4 + %call86 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + %add87 = add i32 %47, %call86 + %idxprom88 = zext i32 %add87 to i64 + %arrayidx89 = getelementptr inbounds float, float* %46, i64 %idxprom88 + store float %45, float* %arrayidx89, align 4 + %48 = load i32, i32* %matrix_dim.addr, align 4 + %49 = load i32, i32* %array_offset, align 4 + %add90 = add nsw i32 %49, %48 + store i32 %add90, i32* %array_offset, align 4 + br label %for.inc91 + +for.inc91: ; preds = %for.body80 + %50 = load i32, i32* %i, align 4 + %inc92 = add nsw i32 %50, 1 + store i32 %inc92, i32* %i, align 4 + br label %for.cond78 + +for.end93: ; preds = %for.cond78 + ret void +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + ret i32 %0 +} + +; Function Attrs: convergent nounwind +declare void @llvm.nvvm.barrier0() #2 + +; Function Attrs: convergent noinline nounwind optnone +define dso_local void @_Z13lud_perimeterPfii(float* %m, i32 %matrix_dim, i32 %offset) #0 { +entry: + %m.addr = alloca float*, align 8 + %matrix_dim.addr = alloca i32, align 4 + %offset.addr = alloca i32, align 4 + %i = alloca i32, align 4 + %j = alloca i32, align 4 + %array_offset = alloca i32, align 4 + %idx = alloca i32, align 4 + store float* %m, float** %m.addr, align 8 + store i32 %matrix_dim, i32* %matrix_dim.addr, align 4 + store i32 %offset, i32* %offset.addr, align 4 + %call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + %cmp = icmp ult i32 %call, 16 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %call1 = call i32 
@_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + store i32 %call1, i32* %idx, align 4 + %0 = load i32, i32* %offset.addr, align 4 + %1 = load i32, i32* %matrix_dim.addr, align 4 + %mul = mul nsw i32 %0, %1 + %2 = load i32, i32* %offset.addr, align 4 + %add = add nsw i32 %mul, %2 + store i32 %add, i32* %array_offset, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %if.then + %3 = load i32, i32* %i, align 4 + %cmp2 = icmp slt i32 %3, 8 + br i1 %cmp2, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %4 = load float*, float** %m.addr, align 8 + %5 = load i32, i32* %array_offset, align 4 + %6 = load i32, i32* %idx, align 4 + %add3 = add nsw i32 %5, %6 + %idxprom = sext i32 %add3 to i64 + %arrayidx = getelementptr inbounds float, float* %4, i64 %idxprom + %7 = load float, float* %arrayidx, align 4 + %8 = load i32, i32* %i, align 4 + %idxprom4 = sext i32 %8 to i64 + %arrayidx5 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ13lud_perimeterPfiiE3dia to [16 x [16 x float]]*), i64 0, i64 %idxprom4 + %9 = load i32, i32* %idx, align 4 + %idxprom6 = sext i32 %9 to i64 + %arrayidx7 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx5, i64 0, i64 %idxprom6 + store float %7, float* %arrayidx7, align 4 + %10 = load i32, i32* %matrix_dim.addr, align 4 + %11 = load i32, i32* %array_offset, align 4 + %add8 = add nsw i32 %11, %10 + store i32 %add8, i32* %array_offset, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %12 = load i32, i32* %i, align 4 + %inc = add nsw i32 %12, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %13 = load i32, i32* %offset.addr, align 4 + %14 = load i32, i32* %matrix_dim.addr, align 4 + %mul9 = mul nsw i32 %13, %14 + %15 = load i32, i32* %offset.addr, align 4 + %add10 = add nsw i32 %mul9, %15 + store i32 %add10, i32* %array_offset, align 4 + store 
i32 0, i32* %i, align 4 + br label %for.cond11 + +for.cond11: ; preds = %for.inc26, %for.end + %16 = load i32, i32* %i, align 4 + %cmp12 = icmp slt i32 %16, 16 + br i1 %cmp12, label %for.body13, label %for.end28 + +for.body13: ; preds = %for.cond11 + %17 = load float*, float** %m.addr, align 8 + %18 = load i32, i32* %array_offset, align 4 + %call14 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2 + %add15 = add i32 %call14, 1 + %mul16 = mul i32 %add15, 16 + %add17 = add i32 %18, %mul16 + %19 = load i32, i32* %idx, align 4 + %add18 = add i32 %add17, %19 + %idxprom19 = zext i32 %add18 to i64 + %arrayidx20 = getelementptr inbounds float, float* %17, i64 %idxprom19 + %20 = load float, float* %arrayidx20, align 4 + %21 = load i32, i32* %i, align 4 + %idxprom21 = sext i32 %21 to i64 + %arrayidx22 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ13lud_perimeterPfiiE8peri_row to [16 x [16 x float]]*), i64 0, i64 %idxprom21 + %22 = load i32, i32* %idx, align 4 + %idxprom23 = sext i32 %22 to i64 + %arrayidx24 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx22, i64 0, i64 %idxprom23 + store float %20, float* %arrayidx24, align 4 + %23 = load i32, i32* %matrix_dim.addr, align 4 + %24 = load i32, i32* %array_offset, align 4 + %add25 = add nsw i32 %24, %23 + store i32 %add25, i32* %array_offset, align 4 + br label %for.inc26 + +for.inc26: ; preds = %for.body13 + %25 = load i32, i32* %i, align 4 + %inc27 = add nsw i32 %25, 1 + store i32 %inc27, i32* %i, align 4 + br label %for.cond11 + +for.end28: ; preds = %for.cond11 + br label %if.end + +if.else: ; preds = %entry + %call29 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + %sub = sub i32 %call29, 16 + store i32 %sub, i32* %idx, align 4 + %26 = load i32, i32* %offset.addr, align 4 + %add30 = add nsw i32 %26, 8 + %27 = load i32, i32* %matrix_dim.addr, align 4 + %mul31 = mul nsw i32 %add30, %27 + %28 = 
load i32, i32* %offset.addr, align 4 + %add32 = add nsw i32 %mul31, %28 + store i32 %add32, i32* %array_offset, align 4 + store i32 8, i32* %i, align 4 + br label %for.cond33 + +for.cond33: ; preds = %for.inc44, %if.else + %29 = load i32, i32* %i, align 4 + %cmp34 = icmp slt i32 %29, 16 + br i1 %cmp34, label %for.body35, label %for.end46 + +for.body35: ; preds = %for.cond33 + %30 = load float*, float** %m.addr, align 8 + %31 = load i32, i32* %array_offset, align 4 + %32 = load i32, i32* %idx, align 4 + %add36 = add nsw i32 %31, %32 + %idxprom37 = sext i32 %add36 to i64 + %arrayidx38 = getelementptr inbounds float, float* %30, i64 %idxprom37 + %33 = load float, float* %arrayidx38, align 4 + %34 = load i32, i32* %i, align 4 + %idxprom39 = sext i32 %34 to i64 + %arrayidx40 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ13lud_perimeterPfiiE3dia to [16 x [16 x float]]*), i64 0, i64 %idxprom39 + %35 = load i32, i32* %idx, align 4 + %idxprom41 = sext i32 %35 to i64 + %arrayidx42 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx40, i64 0, i64 %idxprom41 + store float %33, float* %arrayidx42, align 4 + %36 = load i32, i32* %matrix_dim.addr, align 4 + %37 = load i32, i32* %array_offset, align 4 + %add43 = add nsw i32 %37, %36 + store i32 %add43, i32* %array_offset, align 4 + br label %for.inc44 + +for.inc44: ; preds = %for.body35 + %38 = load i32, i32* %i, align 4 + %inc45 = add nsw i32 %38, 1 + store i32 %inc45, i32* %i, align 4 + br label %for.cond33 + +for.end46: ; preds = %for.cond33 + %39 = load i32, i32* %offset.addr, align 4 + %call47 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2 + %add48 = add i32 %call47, 1 + %mul49 = mul i32 %add48, 16 + %add50 = add i32 %39, %mul49 + %40 = load i32, i32* %matrix_dim.addr, align 4 + %mul51 = mul i32 %add50, %40 + %41 = load i32, i32* %offset.addr, align 4 + %add52 = add i32 %mul51, %41 + store i32 %add52, i32* %array_offset, 
align 4 + store i32 0, i32* %i, align 4 + br label %for.cond53 + +for.cond53: ; preds = %for.inc64, %for.end46 + %42 = load i32, i32* %i, align 4 + %cmp54 = icmp slt i32 %42, 16 + br i1 %cmp54, label %for.body55, label %for.end66 + +for.body55: ; preds = %for.cond53 + %43 = load float*, float** %m.addr, align 8 + %44 = load i32, i32* %array_offset, align 4 + %45 = load i32, i32* %idx, align 4 + %add56 = add nsw i32 %44, %45 + %idxprom57 = sext i32 %add56 to i64 + %arrayidx58 = getelementptr inbounds float, float* %43, i64 %idxprom57 + %46 = load float, float* %arrayidx58, align 4 + %47 = load i32, i32* %i, align 4 + %idxprom59 = sext i32 %47 to i64 + %arrayidx60 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ13lud_perimeterPfiiE8peri_col to [16 x [16 x float]]*), i64 0, i64 %idxprom59 + %48 = load i32, i32* %idx, align 4 + %idxprom61 = sext i32 %48 to i64 + %arrayidx62 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx60, i64 0, i64 %idxprom61 + store float %46, float* %arrayidx62, align 4 + %49 = load i32, i32* %matrix_dim.addr, align 4 + %50 = load i32, i32* %array_offset, align 4 + %add63 = add nsw i32 %50, %49 + store i32 %add63, i32* %array_offset, align 4 + br label %for.inc64 + +for.inc64: ; preds = %for.body55 + %51 = load i32, i32* %i, align 4 + %inc65 = add nsw i32 %51, 1 + store i32 %inc65, i32* %i, align 4 + br label %for.cond53 + +for.end66: ; preds = %for.cond53 + br label %if.end + +if.end: ; preds = %for.end66, %for.end28 + call void @llvm.nvvm.barrier0() + %call67 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + %cmp68 = icmp ult i32 %call67, 16 + br i1 %cmp68, label %if.then69, label %if.else97 + +if.then69: ; preds = %if.end + %call70 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + store i32 %call70, i32* %idx, align 4 + store i32 1, i32* %i, align 4 + br label %for.cond71 + +for.cond71: ; preds = %for.inc94, 
%if.then69 + %52 = load i32, i32* %i, align 4 + %cmp72 = icmp slt i32 %52, 16 + br i1 %cmp72, label %for.body73, label %for.end96 + +for.body73: ; preds = %for.cond71 + store i32 0, i32* %j, align 4 + br label %for.cond74 + +for.cond74: ; preds = %for.inc91, %for.body73 + %53 = load i32, i32* %j, align 4 + %54 = load i32, i32* %i, align 4 + %cmp75 = icmp slt i32 %53, %54 + br i1 %cmp75, label %for.body76, label %for.end93 + +for.body76: ; preds = %for.cond74 + %55 = load i32, i32* %i, align 4 + %idxprom77 = sext i32 %55 to i64 + %arrayidx78 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ13lud_perimeterPfiiE3dia to [16 x [16 x float]]*), i64 0, i64 %idxprom77 + %56 = load i32, i32* %j, align 4 + %idxprom79 = sext i32 %56 to i64 + %arrayidx80 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx78, i64 0, i64 %idxprom79 + %57 = load float, float* %arrayidx80, align 4 + %58 = load i32, i32* %j, align 4 + %idxprom81 = sext i32 %58 to i64 + %arrayidx82 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ13lud_perimeterPfiiE8peri_row to [16 x [16 x float]]*), i64 0, i64 %idxprom81 + %59 = load i32, i32* %idx, align 4 + %idxprom83 = sext i32 %59 to i64 + %arrayidx84 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx82, i64 0, i64 %idxprom83 + %60 = load float, float* %arrayidx84, align 4 + %mul85 = fmul contract float %57, %60 + %61 = load i32, i32* %i, align 4 + %idxprom86 = sext i32 %61 to i64 + %arrayidx87 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ13lud_perimeterPfiiE8peri_row to [16 x [16 x float]]*), i64 0, i64 %idxprom86 + %62 = load i32, i32* %idx, align 4 + %idxprom88 = sext i32 %62 to i64 + %arrayidx89 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx87, i64 0, i64 %idxprom88 + %63 = load float, float* %arrayidx89, 
align 4 + %sub90 = fsub contract float %63, %mul85 + store float %sub90, float* %arrayidx89, align 4 + br label %for.inc91 + +for.inc91: ; preds = %for.body76 + %64 = load i32, i32* %j, align 4 + %inc92 = add nsw i32 %64, 1 + store i32 %inc92, i32* %j, align 4 + br label %for.cond74 + +for.end93: ; preds = %for.cond74 + br label %for.inc94 + +for.inc94: ; preds = %for.end93 + %65 = load i32, i32* %i, align 4 + %inc95 = add nsw i32 %65, 1 + store i32 %inc95, i32* %i, align 4 + br label %for.cond71 + +for.end96: ; preds = %for.cond71 + br label %if.end134 + +if.else97: ; preds = %if.end + %call98 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + %sub99 = sub i32 %call98, 16 + store i32 %sub99, i32* %idx, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond100 + +for.cond100: ; preds = %for.inc131, %if.else97 + %66 = load i32, i32* %i, align 4 + %cmp101 = icmp slt i32 %66, 16 + br i1 %cmp101, label %for.body102, label %for.end133 + +for.body102: ; preds = %for.cond100 + store i32 0, i32* %j, align 4 + br label %for.cond103 + +for.cond103: ; preds = %for.inc120, %for.body102 + %67 = load i32, i32* %j, align 4 + %68 = load i32, i32* %i, align 4 + %cmp104 = icmp slt i32 %67, %68 + br i1 %cmp104, label %for.body105, label %for.end122 + +for.body105: ; preds = %for.cond103 + %69 = load i32, i32* %idx, align 4 + %idxprom106 = sext i32 %69 to i64 + %arrayidx107 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ13lud_perimeterPfiiE8peri_col to [16 x [16 x float]]*), i64 0, i64 %idxprom106 + %70 = load i32, i32* %j, align 4 + %idxprom108 = sext i32 %70 to i64 + %arrayidx109 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx107, i64 0, i64 %idxprom108 + %71 = load float, float* %arrayidx109, align 4 + %72 = load i32, i32* %j, align 4 + %idxprom110 = sext i32 %72 to i64 + %arrayidx111 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 
x [16 x float]] addrspace(3)* @_ZZ13lud_perimeterPfiiE3dia to [16 x [16 x float]]*), i64 0, i64 %idxprom110 + %73 = load i32, i32* %i, align 4 + %idxprom112 = sext i32 %73 to i64 + %arrayidx113 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx111, i64 0, i64 %idxprom112 + %74 = load float, float* %arrayidx113, align 4 + %mul114 = fmul contract float %71, %74 + %75 = load i32, i32* %idx, align 4 + %idxprom115 = sext i32 %75 to i64 + %arrayidx116 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ13lud_perimeterPfiiE8peri_col to [16 x [16 x float]]*), i64 0, i64 %idxprom115 + %76 = load i32, i32* %i, align 4 + %idxprom117 = sext i32 %76 to i64 + %arrayidx118 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx116, i64 0, i64 %idxprom117 + %77 = load float, float* %arrayidx118, align 4 + %sub119 = fsub contract float %77, %mul114 + store float %sub119, float* %arrayidx118, align 4 + br label %for.inc120 + +for.inc120: ; preds = %for.body105 + %78 = load i32, i32* %j, align 4 + %inc121 = add nsw i32 %78, 1 + store i32 %inc121, i32* %j, align 4 + br label %for.cond103 + +for.end122: ; preds = %for.cond103 + %79 = load i32, i32* %i, align 4 + %idxprom123 = sext i32 %79 to i64 + %arrayidx124 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ13lud_perimeterPfiiE3dia to [16 x [16 x float]]*), i64 0, i64 %idxprom123 + %80 = load i32, i32* %i, align 4 + %idxprom125 = sext i32 %80 to i64 + %arrayidx126 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx124, i64 0, i64 %idxprom125 + %81 = load float, float* %arrayidx126, align 4 + %82 = load i32, i32* %idx, align 4 + %idxprom127 = sext i32 %82 to i64 + %arrayidx128 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ13lud_perimeterPfiiE8peri_col to [16 x [16 x float]]*), i64 0, i64 
%idxprom127 + %83 = load i32, i32* %i, align 4 + %idxprom129 = sext i32 %83 to i64 + %arrayidx130 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx128, i64 0, i64 %idxprom129 + %84 = load float, float* %arrayidx130, align 4 + %div = fdiv float %84, %81 + store float %div, float* %arrayidx130, align 4 + br label %for.inc131 + +for.inc131: ; preds = %for.end122 + %85 = load i32, i32* %i, align 4 + %inc132 = add nsw i32 %85, 1 + store i32 %inc132, i32* %i, align 4 + br label %for.cond100 + +for.end133: ; preds = %for.cond100 + br label %if.end134 + +if.end134: ; preds = %for.end133, %for.end96 + call void @llvm.nvvm.barrier0() + %call135 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + %cmp136 = icmp ult i32 %call135, 16 + br i1 %cmp136, label %if.then137, label %if.else160 + +if.then137: ; preds = %if.end134 + %call138 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + store i32 %call138, i32* %idx, align 4 + %86 = load i32, i32* %offset.addr, align 4 + %add139 = add nsw i32 %86, 1 + %87 = load i32, i32* %matrix_dim.addr, align 4 + %mul140 = mul nsw i32 %add139, %87 + %88 = load i32, i32* %offset.addr, align 4 + %add141 = add nsw i32 %mul140, %88 + store i32 %add141, i32* %array_offset, align 4 + store i32 1, i32* %i, align 4 + br label %for.cond142 + +for.cond142: ; preds = %for.inc157, %if.then137 + %89 = load i32, i32* %i, align 4 + %cmp143 = icmp slt i32 %89, 16 + br i1 %cmp143, label %for.body144, label %for.end159 + +for.body144: ; preds = %for.cond142 + %90 = load i32, i32* %i, align 4 + %idxprom145 = sext i32 %90 to i64 + %arrayidx146 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ13lud_perimeterPfiiE8peri_row to [16 x [16 x float]]*), i64 0, i64 %idxprom145 + %91 = load i32, i32* %idx, align 4 + %idxprom147 = sext i32 %91 to i64 + %arrayidx148 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx146, i64 0, i64 
%idxprom147 + %92 = load float, float* %arrayidx148, align 4 + %93 = load float*, float** %m.addr, align 8 + %94 = load i32, i32* %array_offset, align 4 + %call149 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2 + %add150 = add i32 %call149, 1 + %mul151 = mul i32 %add150, 16 + %add152 = add i32 %94, %mul151 + %95 = load i32, i32* %idx, align 4 + %add153 = add i32 %add152, %95 + %idxprom154 = zext i32 %add153 to i64 + %arrayidx155 = getelementptr inbounds float, float* %93, i64 %idxprom154 + store float %92, float* %arrayidx155, align 4 + %96 = load i32, i32* %matrix_dim.addr, align 4 + %97 = load i32, i32* %array_offset, align 4 + %add156 = add nsw i32 %97, %96 + store i32 %add156, i32* %array_offset, align 4 + br label %for.inc157 + +for.inc157: ; preds = %for.body144 + %98 = load i32, i32* %i, align 4 + %inc158 = add nsw i32 %98, 1 + store i32 %inc158, i32* %i, align 4 + br label %for.cond142 + +for.end159: ; preds = %for.cond142 + br label %if.end183 + +if.else160: ; preds = %if.end134 + %call161 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + %sub162 = sub i32 %call161, 16 + store i32 %sub162, i32* %idx, align 4 + %99 = load i32, i32* %offset.addr, align 4 + %call163 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2 + %add164 = add i32 %call163, 1 + %mul165 = mul i32 %add164, 16 + %add166 = add i32 %99, %mul165 + %100 = load i32, i32* %matrix_dim.addr, align 4 + %mul167 = mul i32 %add166, %100 + %101 = load i32, i32* %offset.addr, align 4 + %add168 = add i32 %mul167, %101 + store i32 %add168, i32* %array_offset, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond169 + +for.cond169: ; preds = %for.inc180, %if.else160 + %102 = load i32, i32* %i, align 4 + %cmp170 = icmp slt i32 %102, 16 + br i1 %cmp170, label %for.body171, label %for.end182 + +for.body171: ; preds = %for.cond169 + %103 = load i32, i32* %i, align 4 + %idxprom172 = sext i32 %103 to i64 + %arrayidx173 = getelementptr inbounds 
[16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ13lud_perimeterPfiiE8peri_col to [16 x [16 x float]]*), i64 0, i64 %idxprom172 + %104 = load i32, i32* %idx, align 4 + %idxprom174 = sext i32 %104 to i64 + %arrayidx175 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx173, i64 0, i64 %idxprom174 + %105 = load float, float* %arrayidx175, align 4 + %106 = load float*, float** %m.addr, align 8 + %107 = load i32, i32* %array_offset, align 4 + %108 = load i32, i32* %idx, align 4 + %add176 = add nsw i32 %107, %108 + %idxprom177 = sext i32 %add176 to i64 + %arrayidx178 = getelementptr inbounds float, float* %106, i64 %idxprom177 + store float %105, float* %arrayidx178, align 4 + %109 = load i32, i32* %matrix_dim.addr, align 4 + %110 = load i32, i32* %array_offset, align 4 + %add179 = add nsw i32 %110, %109 + store i32 %add179, i32* %array_offset, align 4 + br label %for.inc180 + +for.inc180: ; preds = %for.body171 + %111 = load i32, i32* %i, align 4 + %inc181 = add nsw i32 %111, 1 + store i32 %inc181, i32* %i, align 4 + br label %for.cond169 + +for.end182: ; preds = %for.cond169 + br label %if.end183 + +if.end183: ; preds = %for.end182, %for.end159 + ret void +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() + ret i32 %0 +} + +; Function Attrs: convergent noinline nounwind optnone +define dso_local void @_Z12lud_internalPfii(float* %m, i32 %matrix_dim, i32 %offset) #0 { +entry: + %m.addr = alloca float*, align 8 + %matrix_dim.addr = alloca i32, align 4 + %offset.addr = alloca i32, align 4 + %i = alloca i32, align 4 + %sum = alloca float, align 4 + %global_row_id = alloca i32, align 4 + %global_col_id = alloca i32, align 4 + store float* %m, float** %m.addr, align 8 + store i32 %matrix_dim, i32* %matrix_dim.addr, align 4 + store i32 %offset, 
i32* %offset.addr, align 4 + %0 = load i32, i32* %offset.addr, align 4 + %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #2 + %add = add i32 %call, 1 + %mul = mul i32 %add, 16 + %add1 = add i32 %0, %mul + store i32 %add1, i32* %global_row_id, align 4 + %1 = load i32, i32* %offset.addr, align 4 + %call2 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2 + %add3 = add i32 %call2, 1 + %mul4 = mul i32 %add3, 16 + %add5 = add i32 %1, %mul4 + store i32 %add5, i32* %global_col_id, align 4 + %2 = load float*, float** %m.addr, align 8 + %3 = load i32, i32* %offset.addr, align 4 + %call6 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #2 + %add7 = add i32 %3, %call6 + %4 = load i32, i32* %matrix_dim.addr, align 4 + %mul8 = mul i32 %add7, %4 + %5 = load i32, i32* %global_col_id, align 4 + %add9 = add i32 %mul8, %5 + %call10 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + %add11 = add i32 %add9, %call10 + %idxprom = zext i32 %add11 to i64 + %arrayidx = getelementptr inbounds float, float* %2, i64 %idxprom + %6 = load float, float* %arrayidx, align 4 + %call12 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #2 + %idxprom13 = zext i32 %call12 to i64 + %arrayidx14 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ12lud_internalPfiiE8peri_row to [16 x [16 x float]]*), i64 0, i64 %idxprom13 + %call15 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + %idxprom16 = zext i32 %call15 to i64 + %arrayidx17 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx14, i64 0, i64 %idxprom16 + store float %6, float* %arrayidx17, align 4 + %7 = load float*, float** %m.addr, align 8 + %8 = load i32, i32* %global_row_id, align 4 + %call18 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #2 + %add19 = add i32 %8, %call18 + %9 = load i32, i32* %matrix_dim.addr, align 4 + %mul20 = 
mul i32 %add19, %9 + %10 = load i32, i32* %offset.addr, align 4 + %add21 = add i32 %mul20, %10 + %call22 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + %add23 = add i32 %add21, %call22 + %idxprom24 = zext i32 %add23 to i64 + %arrayidx25 = getelementptr inbounds float, float* %7, i64 %idxprom24 + %11 = load float, float* %arrayidx25, align 4 + %call26 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #2 + %idxprom27 = zext i32 %call26 to i64 + %arrayidx28 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ12lud_internalPfiiE8peri_col to [16 x [16 x float]]*), i64 0, i64 %idxprom27 + %call29 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + %idxprom30 = zext i32 %call29 to i64 + %arrayidx31 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx28, i64 0, i64 %idxprom30 + store float %11, float* %arrayidx31, align 4 + call void @llvm.nvvm.barrier0() + store float 0.000000e+00, float* %sum, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %12 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %12, 16 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %call32 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #2 + %idxprom33 = zext i32 %call32 to i64 + %arrayidx34 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ12lud_internalPfiiE8peri_col to [16 x [16 x float]]*), i64 0, i64 %idxprom33 + %13 = load i32, i32* %i, align 4 + %idxprom35 = sext i32 %13 to i64 + %arrayidx36 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx34, i64 0, i64 %idxprom35 + %14 = load float, float* %arrayidx36, align 4 + %15 = load i32, i32* %i, align 4 + %idxprom37 = sext i32 %15 to i64 + %arrayidx38 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* 
addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ12lud_internalPfiiE8peri_row to [16 x [16 x float]]*), i64 0, i64 %idxprom37 + %call39 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + %idxprom40 = zext i32 %call39 to i64 + %arrayidx41 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx38, i64 0, i64 %idxprom40 + %16 = load float, float* %arrayidx41, align 4 + %mul42 = fmul contract float %14, %16 + %17 = load float, float* %sum, align 4 + %add43 = fadd contract float %17, %mul42 + store float %add43, float* %sum, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %18 = load i32, i32* %i, align 4 + %inc = add nsw i32 %18, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %19 = load float, float* %sum, align 4 + %20 = load float*, float** %m.addr, align 8 + %21 = load i32, i32* %global_row_id, align 4 + %call44 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #2 + %add45 = add i32 %21, %call44 + %22 = load i32, i32* %matrix_dim.addr, align 4 + %mul46 = mul i32 %add45, %22 + %23 = load i32, i32* %global_col_id, align 4 + %add47 = add i32 %mul46, %23 + %call48 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + %add49 = add i32 %add47, %call48 + %idxprom50 = zext i32 %add49 to i64 + %arrayidx51 = getelementptr inbounds float, float* %20, i64 %idxprom50 + %24 = load float, float* %arrayidx51, align 4 + %sub = fsub contract float %24, %19 + store float %sub, float* %arrayidx51, align 4 + ret void +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() + ret i32 %0 +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #1 comdat align 2 { +entry: + %0 = call i32 
@llvm.nvvm.read.ptx.sreg.tid.y() + ret i32 %0 +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #3 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #3 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #3 + +attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_50" "target-features"="+ptx64,+sm_50" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_50" "target-features"="+ptx64,+sm_50" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nounwind } +attributes #3 = { nounwind readnone } + +!llvm.module.flags = !{!0, !1, !2} +!nvvm.annotations = !{!3, !4, !5, !6, !7, !6, !8, !8, !8, !8, !9, !9, !8} +!llvm.ident = !{!10} +!nvvmir.version = !{!11} + +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0} +!3 = !{void (float*, i32, i32)* @_Z12lud_diagonalPfii, !"kernel", i32 1} +!4 = !{void (float*, i32, i32)* @_Z13lud_perimeterPfii, !"kernel", i32 1} +!5 = !{void (float*, i32, i32)* @_Z12lud_internalPfii, !"kernel", i32 1} +!6 = 
!{null, !"align", i32 8} +!7 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!8 = !{null, !"align", i32 16} +!9 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!10 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"} +!11 = !{i32 1, i32 4} diff --git a/examples/lud/lud_kernel-host-x86_64-unknown-linux-gnu.ll b/examples/lud/lud_kernel-host-x86_64-unknown-linux-gnu.ll new file mode 100644 index 0000000..65c51b5 --- /dev/null +++ b/examples/lud/lud_kernel-host-x86_64-unknown-linux-gnu.ll @@ -0,0 +1,452 @@ +; ModuleID = 'lud_kernel-host-x86_64-unknown-linux-gnu.bc' +source_filename = "cuda/lud_kernel.cu" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.dim3 = type { i32, i32, i32 } +%struct.CUstream_st = type opaque + +$_ZN4dim3C2Ejjj = comdat any + +@0 = private unnamed_addr constant [21 x i8] c"_Z12lud_diagonalPfii\00", align 1 +@1 = private unnamed_addr constant [22 x i8] c"_Z13lud_perimeterPfii\00", align 1 +@2 = private unnamed_addr constant [21 x i8] c"_Z12lud_internalPfii\00", align 1 +@3 = private constant [51057 x i8] 
c"P\EDU\BA\01\00\10\00`\C7\00\00\00\00\00\00\02\00\01\01@\00\00\00\E8\AE\00\00\00\00\00\00\00\00\00\00\00\00\00\00\07\00\01\002\00\00\00\00\00\00\00\00\00\00\00\11\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\7FELF\02\01\013\07\00\00\00\00\00\00\00\02\00\BE\00e\00\00\00\00\00\00\00\00\00\00\00@\AE\00\00\00\00\00\00\C0\A9\00\00\00\00\00\002\052\00@\008\00\03\00@\00\12\00\01\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00.text._Z12lud_internalPfii\00.nv.info._Z12lud_internalPfii\00.nv.shared._Z12lud_internalPfii\00.nv.global\00.nv.constant0._Z12lud_internalPfii\00.text._Z13lud_perimeterPfii\00.nv.info._Z13lud_perimeterPfii\00.nv.shared._Z13lud_perimeterPfii\00.nv.constant0._Z13lud_perimeterPfii\00.text._Z12lud_diagonalPfii\00.nv.info._Z12lud_diagonalPfii\00.nv.shared._Z12lud_diagonalPfii\00.nv.constant0._Z12lud_diagonalPfii\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00_Z12lud_internalPfii\00.text._Z12lud_internalPfii\00.nv.info._Z12lud_internalPfii\00.nv.shared._Z12lud_internalPfii\00.nv.global\00threadIdx\00blockIdx\00$___ZZ12lud_internalPfiiE8peri_row__905\00$___ZZ12lud_internalPfiiE8peri_col__907\00.nv.constant0._Z12lud_internalPfii\00_param\00_Z13lud_perimeterPfii\00.text._Z13lud_perimeterPfii\00.nv.info._Z13lud_perimeterPfii\00.nv.shared._Z13lud_perimeterPfii\00$_Z13lud_perimeterPfii$__cuda_sm3x_div_rn_noftz_f32\00$_Z13lud_perimeterPfii$__cuda_sm3x_div_rn_noftz_f32_slowpath\00$___ZZ13lud_perimeterPfiiE3dia__430\00$___ZZ13lud_perimeterPfiiE8peri_row__432\00$___ZZ13lud_perimeterPfiiE8peri_col__434\00.nv.constant0._Z13lud_perimeterPfii\00_Z12lud_diagonalPfii\00.text._Z12lud_diagonalPfii\00.nv.info._Z12lud_diagonalPfii\00.nv.shared._Z12lud_diagonalPfii\00$_Z12lud_diagonalPfii$__cuda_sm3x_div_rn_noftz_f32\00$_Z12lud_diagonalPfii$__cuda_sm3x_div_rn_noftz_f32_slowpath\00$___ZZ12lud_diagonalPfiiE6shadow__186\00.nv.constant0._Z12lud_diagonalPfii\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00
\00\00G\00\00\00\03\00\0B\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\80\00\00\00\03\00\0E\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\A0\00\00\00\03\00\0F\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\AB\00\00\00\01\00\0F\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\B5\00\00\00\01\00\0F\00\01\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\0E\01\00\00\03\00\08\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00N\01\00\00\03\00\0C\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\89\01\00\00\03\00\10\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\AA\01\00\00\22\00\0C\00PN\00\00\00\00\00\00`\01\00\00\00\00\00\00\DE\01\00\00\22\00\0C\00\B0O\00\00\00\00\00\00P\08\00\00\00\00\00\00\91\02\00\00\03\00\09\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\CA\02\00\00\03\00\0D\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\03\03\00\00\03\00\11\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00#\03\00\00\22\00\0D\00\E0$\00\00\00\00\00\00`\01\00\00\00\00\00\00V\03\00\00\22\00\0D\00@&\00\00\00\00\00\00@\08\00\00\00\00\00\00\B8\03\00\00\03\00\0A\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\002\00\00\00\12\10\0B\00\00\00\00\00\00\00\00\00@\15\00\00\00\00\00\008\01\00\00\12\10\0C\00\00\00\00\00\00\00\00\00\00X\00\00\00\00\00\00\B5\02\00\00\12\10\0D\00\00\00\00\00\00\00\00\00\80.\00\00\00\00\00\00\04/\08\00\13\00\00\00\11\00\00\00\04#\08\00\0F\00\00\00\00\00\00\00\04\12\08\00\0F\00\00\00\00\00\00\00\04\11\08\00\0F\00\00\00\00\00\00\00\04#\08\00\0E\00\00\00\00\00\00\00\04\12\08\00\0E\00\00\00\00\00\00\00\04\11\08\00\0E\00\00\00\00\00\00\00\04#\08\00\13\00\00\00\00\00\00\00\04\12\08\00\13\00\00\00 \00\00\00\04\11\08\00\13\00\00\00 \00\00\00\04/\08\00\12\00\00\00\11\00\00\00\04#\08\00\0A\00\00\00\00\00\00\00\04\12\08\00\0A\00\00\00\00\00\00\00\04\11\08\00\0A\00\00\00\00\00\00\00\04#\08\00\09\00\00\00\00\00\00\00\04\12\08\00\09\00\00\00\00\00\00\00\04\11\08\00\09\00\00\00\00\00\00\00\04#\08\00\12\00\00\00\00\00\00\00\04\12\08\00\12\00\00\00 
\00\00\00\04\11\08\00\12\00\00\00 \00\00\00\04/\08\00\11\00\00\00\0E\00\00\00\04#\08\00\11\00\00\00\00\00\00\00\04\12\08\00\11\00\00\00 \00\00\00\04\11\08\00\11\00\00\00 \00\00\00\010\00\00\01*\00\00\04\0A\08\00\06\00\00\00@\01\10\00\03\19\10\00\04\17\0C\00\00\00\00\00\02\00\0C\00\00\F0\11\00\04\17\0C\00\00\00\00\00\01\00\08\00\00\F0\11\00\04\17\0C\00\00\00\00\00\00\00\00\00\00\F0!\00\03\1B\FF\00\04\1D\08\00\08\03\00\00\F8\03\00\00\04\1C\04\000\15\00\00\04\1E\04\00 \00\00\00\010\00\00\01*\00\00\04\0A\08\00\0B\00\00\00@\01\10\00\03\19\10\00\04\17\0C\00\00\00\00\00\02\00\0C\00\00\F0\11\00\04\17\0C\00\00\00\00\00\01\00\08\00\00\F0\11\00\04\17\0C\00\00\00\00\00\00\00\00\00\00\F0!\00\03\1B\FF\00\04\1D\10\00\B8\0E\00\00(\1D\00\00XB\00\00(F\00\00\04\1C\04\00HN\00\00\04\1E\04\00\90\00\00\00\010\00\00\01*\00\00\04\0A\08\00\10\00\00\00@\01\10\00\03\19\10\00\04\17\0C\00\00\00\00\00\02\00\0C\00\00\F0\11\00\04\17\0C\00\00\00\00\00\01\00\08\00\00\F0\11\00\04\17\0C\00\00\00\00\00\00\00\00\00\00\F0!\00\03\1B\FF\00\04\1C\04\00\D8$\00\00\04\1E\04\00\90\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\0
0\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\
00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\EF\1F\E0\FD\03!\00\D3rd<3>;\0A\0Amov.u2\00\1B,e\00b;\0Acvta\8D\00\04%\00\13,\\\00\22ld\C8\00\02\18\00nrd2, [\CE\00\1E])\00\1F1)\00\01a0];\0Ast#\00\81[%SP+0],,\00\0A\16\00\128\16\00\222;\B6\00\01\D8\00a1, 999(\00\02g\00\00\EF\00\18[\9E\01\03M\00\AF1;\0Aret;\0A\0A}\D6\01\1A\FE\02FuncGetAttributes\E1\01\0D#\00\0E\EC\01\0F+\00\06\0F\F7\01\1B\1F1\F7\01Q\1F1\F7\01!\0E\D9\00\0F\02\02\0F\0E8\01\0F\0D\02\8DhDevice\B4\00\0E\0E\02\0E$\00\0F\0F\02\00/32,\00\0B\1F1,\00\18\1F2<\02\13\1F2<\02\1F\1D4<\02\1F2<\02\0C\1F2<\02\13\01_\00\04;\02\0F\D9\00\07\1D]4\00\1F14\00\06\0Fp\02\10\0E\9A\01\0Fq\02\12(32q\02\0B\15\00!12\16\00\09\86\02\1F3\86\02\15\1F3\86\02#2Get\CB\00\0E}\02\05\1B\00\04\DA\00\0F\1C\02\13?3[8W\04.\0F\1B\02\0D\1F3W\04\19\04\B3\01\0D\D0\00\0F\AA\01\06\0F\05\04W\F0\04OccupancyMaxActiveB5\08\FE\03sPerMultiprocessor\9F\01\0F;\00\16\0EB\06\0FC\00%\0EJ\04\0FC\00\1E\1F2\86\00/\1F3\88\02\13O4[32\89\02\1C\1D3\89\02\1F4\89\02\0C\1F4\89\02\19\133\89\02\0F\F1\00\1E\0F\BC\04\00\0FK\00$/2]w\07\00\0FL\00$\0F\1F\05\01\0F\98\00%\0F\A7\07\1D\097\05\186M\05\04,\00\2224-\00\183\CF\03\1F2\CF\03\15\1F2\CF\03L\9FWithFlags\D8\03(\05D\00\0E\E1\03\0FL\00'\0F\EA\030\0CL\00\1F2\98\008\1F3\98\008\1F4H\04\13O5[40\EC\08.\0FH\04\0D\1F5\EC\08\1C\0F\F9\00+\1F]\9C\040\0D\9A\01\0F\A5\040\0D:\02\0F\AE\041\0D\DB\02\0F\B7\041\0D|\03\0F\C0\04I\08-\00\1F3$\0A6\F6\14visible .entry _Z12lud_diagonalPfii\9D\04\00\97\00\0F\22\00\01\0E{\04\00\93\00\0F*\00\08\1F1*\00\0F\0F\8B\0C\1B\1F6\E7\07\18\95pred %p<8\A0\03\10fA\01[f<16>\C3\03-56\C4\03 67\C5\03P\09.shaH\00\03\93\00\124\93\00\1FZ\C9\00\00\C0E6shadow[102t\03\0F\00\04\08\1F6\00\04\1C\0F8\01\01\0F\EA\0C\0B\0F\94\01\01\0F\12\03\0C\0F\F1\01\09\13]\B1\00#to\FB\12\07\A0\03\02\F9\02\01\F8\0F\0A\1C\00\183\FF\02\0B+\03\1F3!\0D\1A&ld\FC\02\04\1B\00\07\16\00$4,F\00\A1;\0Amul.lo.s\19\00\225,4\00\83%r4;\0Aadd\17\00%6,\1C\00\0Ar\00\03\86\03\186o\03O7, 
0\C2\03\02\F2\047;\0Abra.uni LBB6_1;\0A\08\00\16:\9A\00%8,3\00\92;\0Asetp.gt\85\002p1, \00\C215;\0A@%p1 braI\00\1B4Y\00\132Y\00\122Y\00\02]\01455,Q\01\08\0A\01\04q\00\06\82\02\02\18\00\989, %tid.x\08\01350,4\00\00\22\00\01\C4\01\00_\00\034\00\22d5(\01\00\FE\00$hl\18\03457,\1C\00\132L\00\03\19\00$8,\98\00\01'\00\01\96\00\02q\03\111\AC\00\00#\00\02\AC\00\13s`\00\199\1E\01\06d\00460, \00\176H\03_rd61,|\03\0B\03\88\02\02\B4\03\056\00\02\A5\02)61\B0\00863,\1D\00\1D0\FF\00$64\18\01\08\9B\00$5,\1C\00\0A\FF\00866,V\00\1153\02\11f3\02\00\1D\00!],\08\01\07\AB\01'51\B6\02\06\16\00\182\C1\01\07\AB\01#3,\1E\00\00:\00\0F\B6\02\02+538\02\1338\02\173\91\02(54t\01\07`\00#5,\1E\00\1F1\ED\02\02/55\EE\02\04\D84:\0Abar.sync 0s\02\1F98\03\05\1B9J\00\135J\00\175\A7\00/109\03\07#2,!\00!14:\03\162:\03+20\\\00\136\\\00(6:\17\0C\166\0D\03\05y\01-27r\00\22le\1C\004p4,6\001r27t\00\164t\00+12t\00\137t\00\197t\00\1D8\0F\01\132|\0C\1B8@\00\138@\00\178\10\01\142+\03\192\EF\03\1F3'\01\00\13e'\01#5,8\00\00'\00\01\B5\00\175\B5\00\1B1u\00\139u\00\189\B5\00&45)\01\0B\1D\03\114\BF\04)45\9F\03/42\9F\03!\1243\03)42i\03444,h\00\196i\03\01\95\00\066\00\184R\04(46'\01\08N\00\02\B6\04\1D4\B6\04\02\FD\04\05U\00\09\B6\04\120\B6\04+48L\00\159L\00\0A\9A\00)50\9A\00\199\EC\04\1F1\EC\04\04\02\D3\03\01 \00\0BP\05'3,U\00(52\9A\00\131P\05*3]3\00\184\CD\00\0A3\00\1323\00\00W\044neg\17\00!3,\EA\00\84;\0Afma.rn\17\00&4,\1D\00\111\06\00\192\D1\04\2254\D1\04\0C\AB\06$10\05\07\170\BE\02\0A\97\01\06u\04\02\96\01\00\1E\00\1F1\1B\03\02/47\1B\03\04711:\\\01/16\\\01\04417, 
\00\0AH\06/18\A9\02!\121\F9\01)18u\01820,\1D\00\197\82\00%21\82\00\0Ax\02722,=\00'21\AB\01\03\AA\01)22\C2\07/31\8C\03\06\112*\02*31z\00$4,\1C\00\0A\A6\02(25\B0\00*24\1D\00'6,$\00\09\97\00\133\97\00\00d\016div-\02\224,\1E\00*%f$\02\122\F5\06\0D#\02\04\CF\08/12;\06\05/32\A1\05\00/33\A1\05\07$6,6\001r33\EC\04\176\EC\04\0Ca\05\141\19\07(13\EE\04/34\A3\05\05\1D3\C4\00\144B\00\174\E7\02/35\A5\05\03\1F6s\07\03337,\1E\00\1C1\BC\05#7,O\00\00&\00\01\D0\00\177\D0\00\0Dy\0A\04B\07\181C\07/40w\00\03\03\BA\05\00@\07\02J\024s64J\03\122\91\03\191-\03\1F2-\03\22\132`\02\198\96\02\00\B1\06\03h\00\0A\96\02\01\DF\02\056\00(30<\05/32\D6\05\04\00\13\02\03 \00\0A^\03\00\CB\01\06U\00'33\C7\02\03\8B\0A+34K\00\155K\00\0B\99\00\196\99\00\195V\0B\1F2\94\03\06\02\DD\01\0B\D4\06438,\1C\00\0B\B0\00'9,l\00'38\B0\00\136\B0\00\1A9\EB\05)40\E2\00\082\00\137\B7\06\190\EA\05\00{\00+f5\E8\05$9,\1A\00\00z\00)f7\C0\03\114\FD\02\1Cf\AF\09\141T\09(16n\02\1F3\E3\05\04\02E\0B\1F3\E3\05\06\0F[\03\07+7:\1A\00\05\FE\05\1A89\04\0A@\00\04\CA\08(19\9B\03\1F8\84\03\04\02 \09\1F8\F7\0A\05\1F3\AD\0A\05\182\E0\06(11\97\0E\06^\00\00u\07\02\1E\00'1;.\00\1F3\AF\0E\05\01w\07\046\00\193J\00&5, \00\0F\FF\0B\03\191\9B\02/16\D3\00\05,16_\0E\04\CB\0B\182\BA\0E/17\82\0B\07#3,!\00\03\BC\0E\173\82\0B\0D\BD\0E\05\BE\0E\182\B4\07\1F4\0F\09\05\02\C2\08\0D\F9\0D\0F\83\04!\02\FE\07\0A\CF\03&8,\1A\00\0A1\01\1F8\CC\03\06\01$\02\1A1\C9\04\00\1D\0A\02\1B\00\0A\CA\03\01.\09\04g\00(10\82\09\02\97\03\131\7F\11\04T\08\1F2\C7\0F\01/19\06\0E\03\02/\08\151\91\00\0B\A9\00\01\B1\09:r20\AA\00$4,\1C\00\0B\AB\00$5,\82\00\01'\00\09\FD\03\2215\E1\09\08\22\0D\0F\B1\0E\02/22\AB\00\04#3,\1E\00\00:\00\0F\B2\02\02\1C2\B1\0E$23,\02\08\B3\0E\1F2\B3\0E\04\02\A5\08\1F2\B3\0E\06/25\E9\02\06/4:\1A\15\0A\103g\02\9Eperimeter\1B\15\0E#\00\0F\1C\15\05\0F+\00\0A\1F1+\00\0C\0F\1E\15\1F\1F7\1E\15 ,14\1F\15,20\1F\15=135 
\15?122!\15\0B\0E\CD\00TE3dia\1F\15\0F:\00\1B\108\0F\00O_row?\00)?col\9D\15\10\1F7\9D\15\1F\0E\B8\01\0F\9E\15\0E\0E\16\02\0F\9F\15\0F\0Eu\02\0F\A0\15t\09\9D\18\04F\05\044\06\16u\EF\14\1E3\EF\14:7_1\C3\07\137[\1297_1\9E\0B\07\C2\14\09L\04\158\A9\07\0E\FB\0A\0B'\16\1F1y\07\05$42\14\0B\02\FE\0A\07\C0\08'3, \00\0F\C7\04\02\194t\1E/44\F8\12\05\0B\EB\08\137\D8\1587_2H\09\1F5w\07\07#4,!\00\0B\84\12;7_5Z\00\133Z\00(3:k\06/10l\06\02(27\C2\05\07\18\00\05L\16\1B8\D6\08$9,8\00\01'\00\0DG\0C\121\06\07*12<\15\141k\06.117\07%3,\A4\00\02+\00\08;\07\149<\07\193A\16/11G\08\05\141\DD\06\1D1K\08\101\1B\09\0F\A0\04\08\0F\FB\0F\03\04K\10+16\B8\00*8, \00\1F5\0E\01\00\030\01\0BB\08\049\10\1E1E\08\047\10\04^\00*20\9D\07\2221\9E\07\199W\0A\1F0\A0\07\01?131\A1\07\03\1210\0F\2213\AC\01\1F3\DD\02\03;132\\\02\134\\\02\08\FA\0E?133\A4\07\03\121\8A\0D\01 \00\0F\8F\0A\04\1F3\17\03\05\1E5B\12\0D\D2\03\1F7\D2\03\06\148[\12\02G\12\07\D2\03'9, \00\0F\D2\03\03\189\D2\03/50\D2\03\05\1C5\BA\04\136\BB\00.6:\E8\17\0F\D2\03\01\02u\17\151\16\05\165\D3\03\1B9[\00\137[\00\187\D3\03/98\D2\03\02/15\06\1A\03\01\FB\02C%ctaG\05#hl\E1\07\03\BF\02\01 \00\09*\1B\03\82\02\05 \00\195g\00\1F9!\04\05(0,<\00\1A9\1D\00$1,$\00.16l\0B\03,\03\1B1n\0B\03o\0B\0E+\03401,\0A\01\02)\00\088\04\1388\04*018\04/028\04\05503,\22\00\0B8\04?04,\9E\08\0E\0F=\04\03\130\8D\04+04\BC\00*6, \00\1E3=\04%07E\01\09\A8\00\04\F0\00\1E7\12\01*9,^\00\1A8=\04\2209=\04\198\ED\05\1F2=\04\02/23=\04\04\02_\14\121\E6\0B?122=\04\03\1C2\DC\03\138\C6\02\08\DC\18?125=\04\04\02\8E\14\01 
\00\0F=\04\04/26\82\03\04+9:T\08\149U\08\190\E2\18\0F\17\1D\02\12,\1A\00?-16l\08\02\08\05\03\1F6\FA\0F\03\227,\1C\00\08d\01\1F8\AC\04\05$9,2\00\198\FF\00$0,\1D\00\0F\A7\04\03*10\DD\03\01o\00\0F&\01\01\1C1a\09\05b\09\09\F2\0F\1F2t\1B\0D\152\A9\04\172\BF\09\0C\E4\01$12^\00\09\B1\1E/16\AC\04\01/31}\08\02/32[\04\03\02\F2\13\02\B3\06.32.\03\147\03\16\08,\03\03+\03\0E*\03\159\9D\17\03\D7\17\0C\84\17)198\04/207\04\04%21\CE\17\0A5\04/22l\08\1F\132\98\0B\0B1\18(4,\1D\00\0F\BC\15\00\145\15\01\08\99\00\06\D0\17\0BO\00(7,V\00\1A6\BA\17\117\22\04\09y!\1F4 \04\01/35\1F\04\03'36\C1\16\1F4\BD\02\02\1C3\BA\03$137\02\08\FD\0F/37\B9\13\04#8,\1E\00\0F\B9\13\04\1F8\F4\02\06/4:\8C\13\00*12*\07\1F4)\07\04\05\88\13\0A'\07\02\A9\04\1F5\B9\13\00$7,\8E\13\186Y\05\0F\0D\04\06\03-\0A\147\E3\11\0F\FD\11\04\0F\B9\22\03\192\12\04/21\B9\08\05\1D2\12\04\145\1E\01.5:\DC\11\0F\04\14\06.22\04\14=7_1|\01\04\96\05(16\12\04\1F4\11\04\01/23\11\04\02/24\11\04\03\03\05\12\02m\12\1D4\15\03\145\09\12\08\C3!\02\FD\13\0C\12\03\147z\14\024\14\0C\B8\13\197\0A\04\1F8\09\04\04\149\FE\13\0C<\08\1F,\9A\10\0E\0Fx\0C\04\05I\14\09 \0C\17,\1D\00\1E9\07\05\05\0F\01\0F\08\14B\1F6\08\14\02\1F7\08\14\04#8,\1E\00\00:\00\0F\B6\02\03\0D0\02\04\ED\0A.17\04!\0E\0B\04\140\EA \0F\0B\04\04\1F0\ED\02\06\1F8&\08\06\1F9\9C\1C\05/52\E2\10\05\02.%\06\12\03\166\12\03\1C2\CE\0B$20\FC\0F\09\8F\08/64\E5\10\0A(64\8A\00\1F6\17\09\06,65o\00\04\FD\04\09\E3\17/66\DF\03\07\02w\08\156\CD\00\179\CD\00\0C\AF\01\152\C9\10\182\CD\00/10\F4\22\06-10D\00\04\FE\05\09\FB\15/10\F8\22\03/11\F9\22\08\02\FE\02\03\07\0B2110p\12\07\A0\04\0D5\0A$24}\00(4:\E5\03\0F\B4!\05482, \00\0A\E8\03/83\EF\07\1F\128\98\03*83\A8\18(5,\1D\00\192}\04\0F\E6\22\05\128\C4\04\1D8n 888,U\00\09\E6\22\03\D8\1F\1B8\E6\22\138\CB\04\0D\02\03\02\15\03\00\1E\00\09>\02\02\C6\02\03\1D\00\190\1A\00#4, \00\09\D5\15/75\EC\04\05\02\18\03\037\00\195K\00&7, 
\00\0F\83\1F\03)77\C5\00\1F8\908\06\0CT\11$45d\01\08;\0E/79\EC\04\08#1,\22\00\04\EC\04\07\A8#\1D4`\00\041\07(46\EC\04/57\82\0A\05\03\228\1D7\1A\0B\1F9I\08$\05)8\0A\8B7(1,\1D\00\09\BD\06\1F0\EC\04\08\02\0B8\1A8?)\07'8\0B\F17(4,m\00\1F3\A03\00)64\EC\04\1F6\B39\02\1F8\A0!\04\02e\12$81\94\00\0C\AD\00\013\02*82\AD\00\03\F3*\1D6\AD\00(8,\BE8\0A\E9-\2268\85\04\08\B8\07\1F8\84\04\02/84\AC\00\04#5,\1E\00\00:\00\0F\C6\02\02\1D8\7F\04\04\\\10\184\\\10\1F8~\04\04\02\DB\12\1F8|\04\06/87\FD\02\06\1F8{\04\06/9:!?\0F_inter!?\0A\08\22\00\0F!?\0B\0F*\00\00\1F1*\00\08\0F!?\22\1F8\03* \1D2!?\0D\02*\1F4\E5B\00\1F3!?\10\08\C9\00\0E\C5)\0F>\00 \0F\C4)\13\1F8a?$\0Ex\01\0Fa?\0D\0E\D4\01\0Fa?\0E\0E1\02\0Fa?\84\0A(!\02:\07\18y:\07\04*!\0F^?\07\0E\13!/16\E6\06\03\0E*!\0C\85\00\1F9\94\1D\04\04(!\0B\BA$\02\93\08\0EW!\06\951\0FL\08\03\04\8D\09\0E\A5\1C\0F7\1E\13\12t&\01\0D\BD1\02\97\18\184G\00\1F6\06\1E\06\177Q\1E\0E9\1E\0E\95)\0C7\1E\07\06\1E\1F0S\09\01&21M\1E\1E2\FC>\155\B7/\0F8\1D1\0B]\00\148\09\01\08V\18\1E,4\04\0F[%\12\04\03\1D\09h*\07\86\1D\0F7\1D\00\04x)\0F@1W\0E\02#\0FA1\1B\096\02\1F2\EFB\06\0C\0D\1F\0E\B3\1D\0B\A7$\03\97\1D\145\9D\1D\0F\B7\1D\04\0E \02$17\B9\1D\0Fi#8\088#.0,\06\06\0F\11\0A\10\05\89#\0Fm;\02\04V+\0A\1D\00\04o#\0F\C7\01\01$23\0A#\0F\98\11\03/29\D7?\06\1F9z!\03\0A\0A\1E\138\ED.\1986D/307D\0C.30I/\1B88D\138I+)8_\96\1D&40\A3\04\0CH<\148\E6.\08\8A\01\1F9\8A\01#/30\AF\14\02\181/\15\0F\E5\14\00\04\D78\09\9A\14\0F\D3\16\05\04\E4\14\0D\9B\03)35\FD\14\1F4P9\00\1F5\E5\14\03\1E3\E5\14\0F\9A\04\0E\0F\E4\14*\07\9A\01\0F\E2=\07\06\9A0\08i\18\03\FF\17\1D0\B4\18\182,\15/41\1C:\00,2]\B4\17\04<\17\0E :\1F6\A2\17\02\05\EA\02\1Cf\D2\02\133w\02\08\C9\1F/42\8D<\04\05_1\0F\D7\0B\03/43/\03\04+4:\D0\16\08\EA\1A\062'\0Fq(\0D\0Ft>\00\1Fyp(\09\0E\10'\0D\85\05\03\F6&\143\FC&\0E\08>/28\08>\01\027\00\09\12\02\1F3\0E\11\02\03\B4:\147\E7&\0C\C6\03\145\103\0F\08(#\0CX\17\102\C2\07?subX\17\0F\037(\B05;\0Aret;\0A\0A}\0A\00\00\00\00\00", section ".nv_fatbin", align 8 +@__cuda_fatbin_wrapper = internal 
constant { i32, i32, i8*, i8* } { i32 1180844977, i32 1, i8* getelementptr inbounds ([51057 x i8], [51057 x i8]* @3, i64 0, i64 0), i8* null }, section ".nvFatBinSegment", align 8 +@__cuda_gpubin_handle = internal global i8** null, align 8 +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* bitcast (void (i8*)* @__cuda_module_ctor to void ()*), i8* null }] + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z12lud_diagonalPfii(float* %m, i32 %matrix_dim, i32 %offset) #0 { +entry: + %m.addr = alloca float*, align 8 + %matrix_dim.addr = alloca i32, align 4 + %offset.addr = alloca i32, align 4 + %grid_dim = alloca %struct.dim3, align 8 + %block_dim = alloca %struct.dim3, align 8 + %shmem_size = alloca i64, align 8 + %stream = alloca i8*, align 8 + %grid_dim.coerce = alloca { i64, i32 }, align 8 + %block_dim.coerce = alloca { i64, i32 }, align 8 + store float* %m, float** %m.addr, align 8 + store i32 %matrix_dim, i32* %matrix_dim.addr, align 4 + store i32 %offset, i32* %offset.addr, align 4 + %kernel_args = alloca i8*, i64 3, align 16 + %0 = bitcast float** %m.addr to i8* + %1 = getelementptr i8*, i8** %kernel_args, i32 0 + store i8* %0, i8** %1 + %2 = bitcast i32* %matrix_dim.addr to i8* + %3 = getelementptr i8*, i8** %kernel_args, i32 1 + store i8* %2, i8** %3 + %4 = bitcast i32* %offset.addr to i8* + %5 = getelementptr i8*, i8** %kernel_args, i32 2 + store i8* %4, i8** %5 + %6 = call i32 @__cudaPopCallConfiguration(%struct.dim3* %grid_dim, %struct.dim3* %block_dim, i64* %shmem_size, i8** %stream) + %7 = load i64, i64* %shmem_size, align 8 + %8 = load i8*, i8** %stream, align 8 + %9 = bitcast { i64, i32 }* %grid_dim.coerce to i8* + %10 = bitcast %struct.dim3* %grid_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %9, i8* align 8 %10, i64 12, i1 false) + %11 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 0 + %12 = load i64, i64* %11, align 8 
+ %13 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 1 + %14 = load i32, i32* %13, align 8 + %15 = bitcast { i64, i32 }* %block_dim.coerce to i8* + %16 = bitcast %struct.dim3* %block_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %15, i8* align 8 %16, i64 12, i1 false) + %17 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 0 + %18 = load i64, i64* %17, align 8 + %19 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 1 + %20 = load i32, i32* %19, align 8 + %21 = bitcast i8* %8 to %struct.CUstream_st* + %call = call i32 @cudaLaunchKernel(i8* bitcast (void (float*, i32, i32)* @_Z12lud_diagonalPfii to i8*), i64 %12, i32 %14, i64 %18, i32 %20, i8** %kernel_args, i64 %7, %struct.CUstream_st* %21) + br label %setup.end + +setup.end: ; preds = %entry + ret void +} + +declare dso_local i32 @__cudaPopCallConfiguration(%struct.dim3*, %struct.dim3*, i64*, i8**) + +declare dso_local i32 @cudaLaunchKernel(i8*, i64, i32, i64, i32, i8**, i64, %struct.CUstream_st*) + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #1 + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z13lud_perimeterPfii(float* %m, i32 %matrix_dim, i32 %offset) #0 { +entry: + %m.addr = alloca float*, align 8 + %matrix_dim.addr = alloca i32, align 4 + %offset.addr = alloca i32, align 4 + %grid_dim = alloca %struct.dim3, align 8 + %block_dim = alloca %struct.dim3, align 8 + %shmem_size = alloca i64, align 8 + %stream = alloca i8*, align 8 + %grid_dim.coerce = alloca { i64, i32 }, align 8 + %block_dim.coerce = alloca { i64, i32 }, align 8 + store float* %m, float** %m.addr, align 8 + store i32 %matrix_dim, i32* %matrix_dim.addr, align 4 + store i32 %offset, i32* %offset.addr, align 4 + %kernel_args = alloca i8*, i64 3, align 16 + %0 = bitcast float** %m.addr to 
i8* + %1 = getelementptr i8*, i8** %kernel_args, i32 0 + store i8* %0, i8** %1 + %2 = bitcast i32* %matrix_dim.addr to i8* + %3 = getelementptr i8*, i8** %kernel_args, i32 1 + store i8* %2, i8** %3 + %4 = bitcast i32* %offset.addr to i8* + %5 = getelementptr i8*, i8** %kernel_args, i32 2 + store i8* %4, i8** %5 + %6 = call i32 @__cudaPopCallConfiguration(%struct.dim3* %grid_dim, %struct.dim3* %block_dim, i64* %shmem_size, i8** %stream) + %7 = load i64, i64* %shmem_size, align 8 + %8 = load i8*, i8** %stream, align 8 + %9 = bitcast { i64, i32 }* %grid_dim.coerce to i8* + %10 = bitcast %struct.dim3* %grid_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %9, i8* align 8 %10, i64 12, i1 false) + %11 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 0 + %12 = load i64, i64* %11, align 8 + %13 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 1 + %14 = load i32, i32* %13, align 8 + %15 = bitcast { i64, i32 }* %block_dim.coerce to i8* + %16 = bitcast %struct.dim3* %block_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %15, i8* align 8 %16, i64 12, i1 false) + %17 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 0 + %18 = load i64, i64* %17, align 8 + %19 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 1 + %20 = load i32, i32* %19, align 8 + %21 = bitcast i8* %8 to %struct.CUstream_st* + %call = call i32 @cudaLaunchKernel(i8* bitcast (void (float*, i32, i32)* @_Z13lud_perimeterPfii to i8*), i64 %12, i32 %14, i64 %18, i32 %20, i8** %kernel_args, i64 %7, %struct.CUstream_st* %21) + br label %setup.end + +setup.end: ; preds = %entry + ret void +} + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z12lud_internalPfii(float* %m, i32 %matrix_dim, i32 %offset) #0 { +entry: + %m.addr = alloca float*, align 8 + %matrix_dim.addr = alloca i32, align 4 + %offset.addr = alloca i32, align 4 + %grid_dim = 
alloca %struct.dim3, align 8 + %block_dim = alloca %struct.dim3, align 8 + %shmem_size = alloca i64, align 8 + %stream = alloca i8*, align 8 + %grid_dim.coerce = alloca { i64, i32 }, align 8 + %block_dim.coerce = alloca { i64, i32 }, align 8 + store float* %m, float** %m.addr, align 8 + store i32 %matrix_dim, i32* %matrix_dim.addr, align 4 + store i32 %offset, i32* %offset.addr, align 4 + %kernel_args = alloca i8*, i64 3, align 16 + %0 = bitcast float** %m.addr to i8* + %1 = getelementptr i8*, i8** %kernel_args, i32 0 + store i8* %0, i8** %1 + %2 = bitcast i32* %matrix_dim.addr to i8* + %3 = getelementptr i8*, i8** %kernel_args, i32 1 + store i8* %2, i8** %3 + %4 = bitcast i32* %offset.addr to i8* + %5 = getelementptr i8*, i8** %kernel_args, i32 2 + store i8* %4, i8** %5 + %6 = call i32 @__cudaPopCallConfiguration(%struct.dim3* %grid_dim, %struct.dim3* %block_dim, i64* %shmem_size, i8** %stream) + %7 = load i64, i64* %shmem_size, align 8 + %8 = load i8*, i8** %stream, align 8 + %9 = bitcast { i64, i32 }* %grid_dim.coerce to i8* + %10 = bitcast %struct.dim3* %grid_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %9, i8* align 8 %10, i64 12, i1 false) + %11 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 0 + %12 = load i64, i64* %11, align 8 + %13 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 1 + %14 = load i32, i32* %13, align 8 + %15 = bitcast { i64, i32 }* %block_dim.coerce to i8* + %16 = bitcast %struct.dim3* %block_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %15, i8* align 8 %16, i64 12, i1 false) + %17 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 0 + %18 = load i64, i64* %17, align 8 + %19 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 1 + %20 = load i32, i32* %19, align 8 + %21 = bitcast i8* %8 to %struct.CUstream_st* + %call = call i32 @cudaLaunchKernel(i8* bitcast (void (float*, i32, 
i32)* @_Z12lud_internalPfii to i8*), i64 %12, i32 %14, i64 %18, i32 %20, i8** %kernel_args, i64 %7, %struct.CUstream_st* %21) + br label %setup.end + +setup.end: ; preds = %entry + ret void +} + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z8lud_cudaPfi(float* %m, i32 %matrix_dim) #0 { +entry: + %m.addr = alloca float*, align 8 + %matrix_dim.addr = alloca i32, align 4 + %i = alloca i32, align 4 + %dimBlock = alloca %struct.dim3, align 4 + %m_debug = alloca float*, align 8 + %agg.tmp = alloca %struct.dim3, align 4 + %agg.tmp2 = alloca %struct.dim3, align 4 + %agg.tmp.coerce = alloca { i64, i32 }, align 4 + %agg.tmp2.coerce = alloca { i64, i32 }, align 4 + %agg.tmp5 = alloca %struct.dim3, align 4 + %agg.tmp8 = alloca %struct.dim3, align 4 + %agg.tmp5.coerce = alloca { i64, i32 }, align 4 + %agg.tmp8.coerce = alloca { i64, i32 }, align 4 + %dimGrid = alloca %struct.dim3, align 4 + %agg.tmp20 = alloca %struct.dim3, align 4 + %agg.tmp21 = alloca %struct.dim3, align 4 + %agg.tmp20.coerce = alloca { i64, i32 }, align 4 + %agg.tmp21.coerce = alloca { i64, i32 }, align 4 + %agg.tmp27 = alloca %struct.dim3, align 4 + %agg.tmp28 = alloca %struct.dim3, align 4 + %agg.tmp27.coerce = alloca { i64, i32 }, align 4 + %agg.tmp28.coerce = alloca { i64, i32 }, align 4 + store float* %m, float** %m.addr, align 8 + store i32 %matrix_dim, i32* %matrix_dim.addr, align 4 + store i32 0, i32* %i, align 4 + call void @_ZN4dim3C2Ejjj(%struct.dim3* %dimBlock, i32 16, i32 16, i32 1) + %0 = load i32, i32* %matrix_dim.addr, align 4 + %1 = load i32, i32* %matrix_dim.addr, align 4 + %mul = mul nsw i32 %0, %1 + %conv = sext i32 %mul to i64 + %mul1 = mul i64 %conv, 4 + %call = call noalias i8* @malloc(i64 %mul1) #5 + %2 = bitcast i8* %call to float* + store float* %2, float** %m_debug, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %3 = load i32, i32* %i, align 4 + %4 = load i32, i32* %matrix_dim.addr, align 4 + %sub = sub 
nsw i32 %4, 16 + %cmp = icmp slt i32 %3, %sub + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + call void @_ZN4dim3C2Ejjj(%struct.dim3* %agg.tmp, i32 1, i32 1, i32 1) + call void @_ZN4dim3C2Ejjj(%struct.dim3* %agg.tmp2, i32 16, i32 1, i32 1) + %5 = bitcast { i64, i32 }* %agg.tmp.coerce to i8* + %6 = bitcast %struct.dim3* %agg.tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %5, i8* align 4 %6, i64 12, i1 false) + %7 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 0 + %8 = load i64, i64* %7, align 4 + %9 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 1 + %10 = load i32, i32* %9, align 4 + %11 = bitcast { i64, i32 }* %agg.tmp2.coerce to i8* + %12 = bitcast %struct.dim3* %agg.tmp2 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %11, i8* align 4 %12, i64 12, i1 false) + %13 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp2.coerce, i32 0, i32 0 + %14 = load i64, i64* %13, align 4 + %15 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp2.coerce, i32 0, i32 1 + %16 = load i32, i32* %15, align 4 + %call3 = call i32 @__cudaPushCallConfiguration(i64 %8, i32 %10, i64 %14, i32 %16, i64 0, i8* null) + %tobool = icmp ne i32 %call3, 0 + br i1 %tobool, label %kcall.end, label %kcall.configok + +kcall.configok: ; preds = %for.body + %17 = load float*, float** %m.addr, align 8 + %18 = load i32, i32* %matrix_dim.addr, align 4 + %19 = load i32, i32* %i, align 4 + call void @_Z12lud_diagonalPfii(float* %17, i32 %18, i32 %19) + br label %kcall.end + +kcall.end: ; preds = %kcall.configok, %for.body + %call4 = call i32 @cudaDeviceSynchronize() + %20 = load i32, i32* %matrix_dim.addr, align 4 + %21 = load i32, i32* %i, align 4 + %sub6 = sub nsw i32 %20, %21 + %div = sdiv i32 %sub6, 16 + %sub7 = sub nsw i32 %div, 1 + call void @_ZN4dim3C2Ejjj(%struct.dim3* %agg.tmp5, i32 %sub7, i32 1, i32 1) + call void @_ZN4dim3C2Ejjj(%struct.dim3* %agg.tmp8, i32 
32, i32 1, i32 1) + %22 = bitcast { i64, i32 }* %agg.tmp5.coerce to i8* + %23 = bitcast %struct.dim3* %agg.tmp5 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %22, i8* align 4 %23, i64 12, i1 false) + %24 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp5.coerce, i32 0, i32 0 + %25 = load i64, i64* %24, align 4 + %26 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp5.coerce, i32 0, i32 1 + %27 = load i32, i32* %26, align 4 + %28 = bitcast { i64, i32 }* %agg.tmp8.coerce to i8* + %29 = bitcast %struct.dim3* %agg.tmp8 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %28, i8* align 4 %29, i64 12, i1 false) + %30 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp8.coerce, i32 0, i32 0 + %31 = load i64, i64* %30, align 4 + %32 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp8.coerce, i32 0, i32 1 + %33 = load i32, i32* %32, align 4 + %call9 = call i32 @__cudaPushCallConfiguration(i64 %25, i32 %27, i64 %31, i32 %33, i64 0, i8* null) + %tobool10 = icmp ne i32 %call9, 0 + br i1 %tobool10, label %kcall.end12, label %kcall.configok11 + +kcall.configok11: ; preds = %kcall.end + %34 = load float*, float** %m.addr, align 8 + %35 = load i32, i32* %matrix_dim.addr, align 4 + %36 = load i32, i32* %i, align 4 + call void @_Z13lud_perimeterPfii(float* %34, i32 %35, i32 %36) + br label %kcall.end12 + +kcall.end12: ; preds = %kcall.configok11, %kcall.end + %call13 = call i32 @cudaDeviceSynchronize() + %37 = load i32, i32* %matrix_dim.addr, align 4 + %38 = load i32, i32* %i, align 4 + %sub14 = sub nsw i32 %37, %38 + %div15 = sdiv i32 %sub14, 16 + %sub16 = sub nsw i32 %div15, 1 + %39 = load i32, i32* %matrix_dim.addr, align 4 + %40 = load i32, i32* %i, align 4 + %sub17 = sub nsw i32 %39, %40 + %div18 = sdiv i32 %sub17, 16 + %sub19 = sub nsw i32 %div18, 1 + call void @_ZN4dim3C2Ejjj(%struct.dim3* %dimGrid, i32 %sub16, i32 %sub19, i32 1) + %41 = bitcast %struct.dim3* %agg.tmp20 to i8* + %42 = bitcast %struct.dim3* %dimGrid 
to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %41, i8* align 4 %42, i64 12, i1 false) + %43 = bitcast %struct.dim3* %agg.tmp21 to i8* + %44 = bitcast %struct.dim3* %dimBlock to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %43, i8* align 4 %44, i64 12, i1 false) + %45 = bitcast { i64, i32 }* %agg.tmp20.coerce to i8* + %46 = bitcast %struct.dim3* %agg.tmp20 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %45, i8* align 4 %46, i64 12, i1 false) + %47 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp20.coerce, i32 0, i32 0 + %48 = load i64, i64* %47, align 4 + %49 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp20.coerce, i32 0, i32 1 + %50 = load i32, i32* %49, align 4 + %51 = bitcast { i64, i32 }* %agg.tmp21.coerce to i8* + %52 = bitcast %struct.dim3* %agg.tmp21 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %51, i8* align 4 %52, i64 12, i1 false) + %53 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp21.coerce, i32 0, i32 0 + %54 = load i64, i64* %53, align 4 + %55 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp21.coerce, i32 0, i32 1 + %56 = load i32, i32* %55, align 4 + %call22 = call i32 @__cudaPushCallConfiguration(i64 %48, i32 %50, i64 %54, i32 %56, i64 0, i8* null) + %tobool23 = icmp ne i32 %call22, 0 + br i1 %tobool23, label %kcall.end25, label %kcall.configok24 + +kcall.configok24: ; preds = %kcall.end12 + %57 = load float*, float** %m.addr, align 8 + %58 = load i32, i32* %matrix_dim.addr, align 4 + %59 = load i32, i32* %i, align 4 + call void @_Z12lud_internalPfii(float* %57, i32 %58, i32 %59) + br label %kcall.end25 + +kcall.end25: ; preds = %kcall.configok24, %kcall.end12 + %call26 = call i32 @cudaDeviceSynchronize() + br label %for.inc + +for.inc: ; preds = %kcall.end25 + %60 = load i32, i32* %i, align 4 + %add = add nsw i32 %60, 16 + store i32 %add, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + call void @_ZN4dim3C2Ejjj(%struct.dim3* 
%agg.tmp27, i32 1, i32 1, i32 1) + call void @_ZN4dim3C2Ejjj(%struct.dim3* %agg.tmp28, i32 16, i32 1, i32 1) + %61 = bitcast { i64, i32 }* %agg.tmp27.coerce to i8* + %62 = bitcast %struct.dim3* %agg.tmp27 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %61, i8* align 4 %62, i64 12, i1 false) + %63 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp27.coerce, i32 0, i32 0 + %64 = load i64, i64* %63, align 4 + %65 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp27.coerce, i32 0, i32 1 + %66 = load i32, i32* %65, align 4 + %67 = bitcast { i64, i32 }* %agg.tmp28.coerce to i8* + %68 = bitcast %struct.dim3* %agg.tmp28 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %67, i8* align 4 %68, i64 12, i1 false) + %69 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp28.coerce, i32 0, i32 0 + %70 = load i64, i64* %69, align 4 + %71 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp28.coerce, i32 0, i32 1 + %72 = load i32, i32* %71, align 4 + %call29 = call i32 @__cudaPushCallConfiguration(i64 %64, i32 %66, i64 %70, i32 %72, i64 0, i8* null) + %tobool30 = icmp ne i32 %call29, 0 + br i1 %tobool30, label %kcall.end32, label %kcall.configok31 + +kcall.configok31: ; preds = %for.end + %73 = load float*, float** %m.addr, align 8 + %74 = load i32, i32* %matrix_dim.addr, align 4 + %75 = load i32, i32* %i, align 4 + call void @_Z12lud_diagonalPfii(float* %73, i32 %74, i32 %75) + br label %kcall.end32 + +kcall.end32: ; preds = %kcall.configok31, %for.end + %call33 = call i32 @cudaDeviceSynchronize() + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN4dim3C2Ejjj(%struct.dim3* %this, i32 %vx, i32 %vy, i32 %vz) unnamed_addr #2 comdat align 2 { +entry: + %this.addr = alloca %struct.dim3*, align 8 + %vx.addr = alloca i32, align 4 + %vy.addr = alloca i32, align 4 + %vz.addr = alloca i32, align 4 + store %struct.dim3* %this, %struct.dim3** %this.addr, align 8 + store i32 
%vx, i32* %vx.addr, align 4 + store i32 %vy, i32* %vy.addr, align 4 + store i32 %vz, i32* %vz.addr, align 4 + %this1 = load %struct.dim3*, %struct.dim3** %this.addr, align 8 + %x = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 0 + %0 = load i32, i32* %vx.addr, align 4 + store i32 %0, i32* %x, align 4 + %y = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 1 + %1 = load i32, i32* %vy.addr, align 4 + store i32 %1, i32* %y, align 4 + %z = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 2 + %2 = load i32, i32* %vz.addr, align 4 + store i32 %2, i32* %z, align 4 + ret void +} + +; Function Attrs: nounwind +declare dso_local noalias i8* @malloc(i64) #3 + +declare dso_local i32 @__cudaPushCallConfiguration(i64, i32, i64, i32, i64, i8*) #4 + +declare dso_local i32 @cudaDeviceSynchronize() #4 + +define internal void @__cuda_register_globals(i8** %0) { +entry: + %1 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void (float*, i32, i32)* @_Z12lud_diagonalPfii to i8*), i8* getelementptr inbounds ([21 x i8], [21 x i8]* @0, i64 0, i64 0), i8* getelementptr inbounds ([21 x i8], [21 x i8]* @0, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null) + %2 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void (float*, i32, i32)* @_Z13lud_perimeterPfii to i8*), i8* getelementptr inbounds ([22 x i8], [22 x i8]* @1, i64 0, i64 0), i8* getelementptr inbounds ([22 x i8], [22 x i8]* @1, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null) + %3 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void (float*, i32, i32)* @_Z12lud_internalPfii to i8*), i8* getelementptr inbounds ([21 x i8], [21 x i8]* @2, i64 0, i64 0), i8* getelementptr inbounds ([21 x i8], [21 x i8]* @2, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null) + ret void +} + +declare dso_local i32 @__cudaRegisterFunction(i8**, i8*, i8*, i8*, i32, i8*, i8*, i8*, i8*, i32*) + 
+declare dso_local i32 @__cudaRegisterVar(i8**, i8*, i8*, i8*, i32, i32, i32, i32) + +declare dso_local i8** @__cudaRegisterFatBinary(i8*) + +define internal void @__cuda_module_ctor(i8* %0) { +entry: + %1 = call i8** @__cudaRegisterFatBinary(i8* bitcast ({ i32, i32, i8*, i8* }* @__cuda_fatbin_wrapper to i8*)) + store i8** %1, i8*** @__cuda_gpubin_handle, align 8 + call void @__cuda_register_globals(i8** %1) + call void @__cudaRegisterFatBinaryEnd(i8** %1) + %2 = call i32 @atexit(void (i8*)* @__cuda_module_dtor) + ret void +} + +declare dso_local void @__cudaRegisterFatBinaryEnd(i8**) + +declare dso_local void @__cudaUnregisterFatBinary(i8**) + +define internal void @__cuda_module_dtor(i8* %0) { +entry: + %1 = load i8**, i8*** @__cuda_gpubin_handle, align 8 + call void @__cudaUnregisterFatBinary(i8** %1) + ret void +} + +declare dso_local i32 @atexit(void (i8*)*) + +attributes #0 = { noinline optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind willreturn } +attributes #2 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nounwind 
"correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #5 = { nounwind } + +!llvm.module.flags = !{!0, !1} +!llvm.ident = !{!2} + +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"} diff --git a/examples/lud/run.sh b/examples/lud/run.sh new file mode 100644 index 0000000..793ed32 --- /dev/null +++ b/examples/lud/run.sh @@ -0,0 +1,25 @@ +#!/bin/bash +set -e +llvm-as lud_kernel-cuda-nvptx64-nvidia-cuda-sm_50.ll +llvm-as lud_kernel-host-x86_64-unknown-linux-gnu.ll +llvm-as common-host-x86_64-unknown-linux-gnu.ll +llvm-as lud-host-x86_64-unknown-linux-gnu.ll +../../build/compilation/kernelTranslator lud_kernel-cuda-nvptx64-nvidia-cuda-sm_50.bc kernel.bc +../../build/compilation/hostTranslator lud_kernel-host-x86_64-unknown-linux-gnu.bc kernel_host.bc + +llc --relocation-model=pic --filetype=obj kernel.bc +llc --relocation-model=pic --filetype=obj kernel_host.bc +llc --relocation-model=pic --filetype=obj lud-host-x86_64-unknown-linux-gnu.bc -o host.o +llc --relocation-model=pic --filetype=obj common-host-x86_64-unknown-linux-gnu.bc -o common.o + +g++ 
-Wall -L../../build/runtime \ + -L../../build/runtime/threadPool \ + -o lud_cuda -fPIC -no-pie host.o kernel_host.o kernel.o common.o -lc -lx86Runtime -lthreadPool -lpthread +export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH +./lud_cuda -s 256 -v > res.log +if grep -q "PASS" res.log; then + echo "Pass" +else + echo "Error result" + exit 1 +fi diff --git a/examples/myocyte/run.sh b/examples/myocyte/run.sh new file mode 100644 index 0000000..2edff3e --- /dev/null +++ b/examples/myocyte/run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +set -e +llvm-as main-cuda-nvptx64-nvidia-cuda-sm_61.ll +llvm-as main-host-x86_64-unknown-linux-gnu.ll +../../build/compilation/kernelTranslator main-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc +../../build/compilation/hostTranslator main-host-x86_64-unknown-linux-gnu.bc host.bc + +llc --relocation-model=pic --filetype=obj kernel.bc +llc --relocation-model=pic --filetype=obj host.bc + +g++ -Wall -L../../build/runtime -L../../build/runtime/threadPool \ + -o myocyte.out -fPIC -no-pie host.o kernel.o \ + -lc -lx86Runtime -lthreadPool -lpthread -lm + +./myocyte.out 100 1 0 +if grep -q "1.3705539" output.txt; then + echo "Pass" +else + echo "Error result" + exit 1 +fi diff --git a/examples/nn/filelist_4 b/examples/nn/filelist_4 new file mode 100644 index 0000000..23ef193 --- /dev/null +++ b/examples/nn/filelist_4 @@ -0,0 +1,4 @@ +../../rodinia-data/nn/cane4_0.db +../../rodinia-data/nn/cane4_1.db +../../rodinia-data/nn/cane4_2.db +../../rodinia-data/nn/cane4_3.db diff --git a/examples/nn/nn_cuda-cuda-nvptx64-nvidia-cuda-sm_61.ll b/examples/nn/nn_cuda-cuda-nvptx64-nvidia-cuda-sm_61.ll new file mode 100644 index 0000000..761c56c --- /dev/null +++ b/examples/nn/nn_cuda-cuda-nvptx64-nvidia-cuda-sm_61.ll @@ -0,0 +1,271 @@ +; ModuleID = 'nn_cuda-cuda-nvptx64-nvidia-cuda-sm_61.bc' +source_filename = "nn_cuda.cu" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + 
+%struct.__cuda_builtin_blockDim_t = type { i8 } +%struct.__cuda_builtin_gridDim_t = type { i8 } +%struct.__cuda_builtin_blockIdx_t = type { i8 } +%struct.__cuda_builtin_threadIdx_t = type { i8 } +%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 } +%struct.latLong = type { float, float } + +$_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv = comdat any + +$_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv = comdat any + +$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv = comdat any + +$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any + +$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any + +@blockDim = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockDim_t, align 1 +@gridDim = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_gridDim_t, align 1 +@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1 +@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1 + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 { +entry: + %p.addr = alloca i8**, align 8 + %s.addr = alloca i64, align 8 + store i8** %p, i8*** %p.addr, align 8 + store i64 %s, i64* %s.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 { +entry: + %p.addr = alloca %struct.cudaFuncAttributes*, align 8 + %c.addr = alloca i8*, align 8 + store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8 + store i8* %c, i8** %c.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 { +entry: + %value.addr = alloca i32*, align 8 + %attr.addr = alloca i32, align 4 + %device.addr = alloca 
i32, align 4 + store i32* %value, i32** %value.addr, align 8 + store i32 %attr, i32* %attr.addr, align 4 + store i32 %device, i32* %device.addr, align 4 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaGetDevice(i32* %device) #0 { +entry: + %device.addr = alloca i32*, align 8 + store i32* %device, i32** %device.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 { +entry: + %numBlocks.addr = alloca i32*, align 8 + %func.addr = alloca i8*, align 8 + %blockSize.addr = alloca i32, align 4 + %dynamicSmemSize.addr = alloca i64, align 8 + store i32* %numBlocks, i32** %numBlocks.addr, align 8 + store i8* %func, i8** %func.addr, align 8 + store i32 %blockSize, i32* %blockSize.addr, align 4 + store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 { +entry: + %numBlocks.addr = alloca i32*, align 8 + %func.addr = alloca i8*, align 8 + %blockSize.addr = alloca i32, align 4 + %dynamicSmemSize.addr = alloca i64, align 8 + %flags.addr = alloca i32, align 4 + store i32* %numBlocks, i32** %numBlocks.addr, align 8 + store i8* %func, i8** %func.addr, align 8 + store i32 %blockSize, i32* %blockSize.addr, align 4 + store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8 + store i32 %flags, i32* %flags.addr, align 4 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define dso_local void @_Z6euclidP7latLongPfiff(%struct.latLong* %d_locations, float* %d_distances, i32 %numRecords, float %lat, float %lng) #0 { +entry: + %d_locations.addr = alloca %struct.latLong*, align 8 + 
%d_distances.addr = alloca float*, align 8 + %numRecords.addr = alloca i32, align 4 + %lat.addr = alloca float, align 4 + %lng.addr = alloca float, align 4 + %globalId = alloca i32, align 4 + %latLong = alloca %struct.latLong*, align 8 + %dist = alloca float*, align 8 + store %struct.latLong* %d_locations, %struct.latLong** %d_locations.addr, align 8 + store float* %d_distances, float** %d_distances.addr, align 8 + store i32 %numRecords, i32* %numRecords.addr, align 4 + store float %lat, float* %lat.addr, align 4 + store float %lng, float* %lng.addr, align 4 + %call = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #4 + %call1 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #4 + %call2 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #4 + %mul = mul i32 %call1, %call2 + %call3 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #4 + %add = add i32 %mul, %call3 + %mul4 = mul i32 %call, %add + %call5 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #4 + %add6 = add i32 %mul4, %call5 + store i32 %add6, i32* %globalId, align 4 + %0 = load %struct.latLong*, %struct.latLong** %d_locations.addr, align 8 + %1 = load i32, i32* %globalId, align 4 + %idx.ext = sext i32 %1 to i64 + %add.ptr = getelementptr inbounds %struct.latLong, %struct.latLong* %0, i64 %idx.ext + store %struct.latLong* %add.ptr, %struct.latLong** %latLong, align 8 + %2 = load i32, i32* %globalId, align 4 + %3 = load i32, i32* %numRecords.addr, align 4 + %cmp = icmp slt i32 %2, %3 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %4 = load float*, float** %d_distances.addr, align 8 + %5 = load i32, i32* %globalId, align 4 + %idx.ext7 = sext i32 %5 to i64 + %add.ptr8 = getelementptr inbounds float, float* %4, i64 %idx.ext7 + store float* %add.ptr8, float** %dist, align 8 + %6 = load float, float* %lat.addr, align 4 + %7 = load %struct.latLong*, %struct.latLong** %latLong, align 8 + %lat9 = getelementptr 
inbounds %struct.latLong, %struct.latLong* %7, i32 0, i32 0 + %8 = load float, float* %lat9, align 4 + %sub = fsub contract float %6, %8 + %9 = load float, float* %lat.addr, align 4 + %10 = load %struct.latLong*, %struct.latLong** %latLong, align 8 + %lat10 = getelementptr inbounds %struct.latLong, %struct.latLong* %10, i32 0, i32 0 + %11 = load float, float* %lat10, align 4 + %sub11 = fsub contract float %9, %11 + %mul12 = fmul contract float %sub, %sub11 + %12 = load float, float* %lng.addr, align 4 + %13 = load %struct.latLong*, %struct.latLong** %latLong, align 8 + %lng13 = getelementptr inbounds %struct.latLong, %struct.latLong* %13, i32 0, i32 1 + %14 = load float, float* %lng13, align 4 + %sub14 = fsub contract float %12, %14 + %15 = load float, float* %lng.addr, align 4 + %16 = load %struct.latLong*, %struct.latLong** %latLong, align 8 + %lng15 = getelementptr inbounds %struct.latLong, %struct.latLong* %16, i32 0, i32 1 + %17 = load float, float* %lng15, align 4 + %sub16 = fsub contract float %15, %17 + %mul17 = fmul contract float %sub14, %sub16 + %add18 = fadd contract float %mul12, %mul17 + %call19 = call float @_ZL4sqrtf(float %add18) #4 + %18 = load float*, float** %dist, align 8 + store float %call19, float* %18, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + ret i32 %0 +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() + ret i32 %0 +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #1 comdat align 2 { +entry: + %0 = call i32 
@llvm.nvvm.read.ptx.sreg.ctaid.y() + ret i32 %0 +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() + ret i32 %0 +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + ret i32 %0 +} + +; Function Attrs: alwaysinline convergent nounwind +define internal float @_ZL4sqrtf(float %__x) #1 { +entry: + %__x.addr = alloca float, align 4 + store float %__x, float* %__x.addr, align 4 + %0 = load float, float* %__x.addr, align 4 + %call = call float @_ZL5sqrtff(float %0) #4 + ret float %call +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 + +; Function Attrs: alwaysinline convergent nounwind +define internal float @_ZL5sqrtff(float %__a) #1 { +entry: + %__a.addr = alloca float, align 4 + store float %__a, float* %__a.addr, align 4 + %0 = load float, float* %__a.addr, align 4 + %call = call float @__nv_sqrtf(float %0) #4 + ret float %call +} + +; Function Attrs: alwaysinline convergent inlinehint nounwind +define internal float @__nv_sqrtf(float %x) #3 { + %1 = call float @llvm.nvvm.sqrt.f(float %x) + ret float %1 +} + +; Function Attrs: nounwind readnone +declare float @llvm.nvvm.sqrt.f(float) #2 + +attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" 
"frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind readnone } +attributes #3 = { alwaysinline convergent inlinehint nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { convergent nounwind } + +!llvm.module.flags = !{!0, !1, !2} +!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6} +!llvm.ident = !{!8} +!nvvmir.version = !{!9} + +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0} +!3 = !{void (%struct.latLong*, float*, i32, float, float)* @_Z6euclidP7latLongPfiff, !"kernel", i32 1} +!4 = !{null, !"align", i32 8} +!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!6 = !{null, !"align", i32 16} +!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git 
ef32c611aa214dea855364efd7ba451ec5ec3f74)"} +!9 = !{i32 1, i32 4} diff --git a/examples/nn/nn_cuda-host-x86_64-unknown-linux-gnu.ll b/examples/nn/nn_cuda-host-x86_64-unknown-linux-gnu.ll new file mode 100644 index 0000000..785a93f --- /dev/null +++ b/examples/nn/nn_cuda-host-x86_64-unknown-linux-gnu.ll @@ -0,0 +1,3691 @@ +; ModuleID = 'nn_cuda-host-x86_64-unknown-linux-gnu.bc' +source_filename = "nn_cuda.cu" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] } +%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 } +%struct.latLong = type { float, float } +%struct.dim3 = type { i32, i32, i32 } +%struct.CUstream_st = type opaque +%"class.std::vector" = type { %"struct.std::_Vector_base" } +%"struct.std::_Vector_base" = type { %"struct.std::_Vector_base >::_Vector_impl" } +%"struct.std::_Vector_base >::_Vector_impl" = type { %struct.record*, %struct.record*, %struct.record* } +%struct.record = type { [53 x i8], float } +%"class.std::vector.0" = type { %"struct.std::_Vector_base.1" } +%"struct.std::_Vector_base.1" = type { %"struct.std::_Vector_base >::_Vector_impl" } +%"struct.std::_Vector_base >::_Vector_impl" = type { %struct.latLong*, %struct.latLong*, %struct.latLong* } +%struct.cudaDeviceProp = type { [256 x i8], %struct.CUuuid_st, [8 x i8], i32, i64, i64, i32, i32, i64, i32, [3 x i32], [3 x i32], i32, i64, i32, i32, i64, i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, [2 x i32], [2 x i32], [3 x i32], [2 x i32], [3 x i32], [3 x i32], i32, [2 x i32], [3 x i32], [2 x i32], i32, [2 x i32], [3 x i32], [2 x i32], [3 x i32], i32, [2 x i32], i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i64, i32, i32, i32, i32, i32, i32, 
i32, i32, i32, i32, i32, i32, i64, i32, i32 } +%struct.CUuuid_st = type { [16 x i8] } +%"class.std::allocator.2" = type { i8 } +%"class.std::allocator" = type { i8 } +%"class.__gnu_cxx::__normal_iterator" = type { %struct.latLong* } +%"class.__gnu_cxx::__normal_iterator.5" = type { %struct.record* } +%"class.__gnu_cxx::new_allocator" = type { i8 } +%"class.__gnu_cxx::new_allocator.3" = type { i8 } + +$_ZNSt6vectorI6recordSaIS0_EEC2Ev = comdat any + +$_ZNSt6vectorI7latLongSaIS0_EEC2Ev = comdat any + +$_ZN4dim3C2Ejjj = comdat any + +$_ZNSt6vectorI7latLongSaIS0_EEixEm = comdat any + +$_ZNSt6vectorI6recordSaIS0_EEixEm = comdat any + +$_ZNSt6vectorI7latLongSaIS0_EED2Ev = comdat any + +$__clang_call_terminate = comdat any + +$_ZNSt6vectorI6recordSaIS0_EED2Ev = comdat any + +$_ZNSt6vectorI7latLongSaIS0_EE9push_backERKS0_ = comdat any + +$_ZNSt6vectorI6recordSaIS0_EE9push_backERKS0_ = comdat any + +$_ZNSt12_Vector_baseI6recordSaIS0_EEC2Ev = comdat any + +$_ZNSt12_Vector_baseI6recordSaIS0_EE12_Vector_implC2Ev = comdat any + +$_ZNSaI6recordEC2Ev = comdat any + +$_ZN9__gnu_cxx13new_allocatorI6recordEC2Ev = comdat any + +$_ZSt8_DestroyIP6recordS0_EvT_S2_RSaIT0_E = comdat any + +$_ZNSt12_Vector_baseI6recordSaIS0_EE19_M_get_Tp_allocatorEv = comdat any + +$_ZNSt12_Vector_baseI6recordSaIS0_EED2Ev = comdat any + +$_ZSt8_DestroyIP6recordEvT_S2_ = comdat any + +$_ZNSt12_Destroy_auxILb1EE9__destroyIP6recordEEvT_S4_ = comdat any + +$_ZNSt12_Vector_baseI6recordSaIS0_EE13_M_deallocateEPS0_m = comdat any + +$_ZNSt12_Vector_baseI6recordSaIS0_EE12_Vector_implD2Ev = comdat any + +$_ZN9__gnu_cxx14__alloc_traitsISaI6recordEE10deallocateERS2_PS1_m = comdat any + +$_ZN9__gnu_cxx13new_allocatorI6recordE10deallocateEPS1_m = comdat any + +$_ZNSaI6recordED2Ev = comdat any + +$_ZN9__gnu_cxx13new_allocatorI6recordED2Ev = comdat any + +$_ZNSt12_Vector_baseI7latLongSaIS0_EEC2Ev = comdat any + +$_ZNSt12_Vector_baseI7latLongSaIS0_EE12_Vector_implC2Ev = comdat any + +$_ZNSaI7latLongEC2Ev = comdat any + 
+$_ZN9__gnu_cxx13new_allocatorI7latLongEC2Ev = comdat any + +$_ZSt8_DestroyIP7latLongS0_EvT_S2_RSaIT0_E = comdat any + +$_ZNSt12_Vector_baseI7latLongSaIS0_EE19_M_get_Tp_allocatorEv = comdat any + +$_ZNSt12_Vector_baseI7latLongSaIS0_EED2Ev = comdat any + +$_ZSt8_DestroyIP7latLongEvT_S2_ = comdat any + +$_ZNSt12_Destroy_auxILb1EE9__destroyIP7latLongEEvT_S4_ = comdat any + +$_ZNSt12_Vector_baseI7latLongSaIS0_EE13_M_deallocateEPS0_m = comdat any + +$_ZNSt12_Vector_baseI7latLongSaIS0_EE12_Vector_implD2Ev = comdat any + +$_ZN9__gnu_cxx14__alloc_traitsISaI7latLongEE10deallocateERS2_PS1_m = comdat any + +$_ZN9__gnu_cxx13new_allocatorI7latLongE10deallocateEPS1_m = comdat any + +$_ZNSaI7latLongED2Ev = comdat any + +$_ZN9__gnu_cxx13new_allocatorI7latLongED2Ev = comdat any + +$_ZN9__gnu_cxx14__alloc_traitsISaI7latLongEE9constructIS1_EEvRS2_PS1_RKT_ = comdat any + +$_ZNSt6vectorI7latLongSaIS0_EE17_M_realloc_insertEN9__gnu_cxx17__normal_iteratorIPS0_S2_EERKS0_ = comdat any + +$_ZNSt6vectorI7latLongSaIS0_EE3endEv = comdat any + +$_ZN9__gnu_cxx13new_allocatorI7latLongE9constructEPS1_RKS1_ = comdat any + +$_ZNKSt6vectorI7latLongSaIS0_EE12_M_check_lenEmPKc = comdat any + +$_ZN9__gnu_cxxmiIP7latLongSt6vectorIS1_SaIS1_EEEENS_17__normal_iteratorIT_T0_E15difference_typeERKS9_SC_ = comdat any + +$_ZNSt6vectorI7latLongSaIS0_EE5beginEv = comdat any + +$_ZNSt12_Vector_baseI7latLongSaIS0_EE11_M_allocateEm = comdat any + +$_ZSt34__uninitialized_move_if_noexcept_aIP7latLongS1_SaIS0_EET0_T_S4_S3_RT1_ = comdat any + +$_ZNK9__gnu_cxx17__normal_iteratorIP7latLongSt6vectorIS1_SaIS1_EEE4baseEv = comdat any + +$_ZN9__gnu_cxx14__alloc_traitsISaI7latLongEE7destroyERS2_PS1_ = comdat any + +$_ZNKSt6vectorI7latLongSaIS0_EE8max_sizeEv = comdat any + +$_ZNKSt6vectorI7latLongSaIS0_EE4sizeEv = comdat any + +$_ZSt3maxImERKT_S2_S2_ = comdat any + +$_ZN9__gnu_cxx14__alloc_traitsISaI7latLongEE8max_sizeERKS2_ = comdat any + +$_ZNKSt12_Vector_baseI7latLongSaIS0_EE19_M_get_Tp_allocatorEv = comdat any + 
+$_ZNK9__gnu_cxx13new_allocatorI7latLongE8max_sizeEv = comdat any + +$_ZN9__gnu_cxx17__normal_iteratorIP7latLongSt6vectorIS1_SaIS1_EEEC2ERKS2_ = comdat any + +$_ZN9__gnu_cxx14__alloc_traitsISaI7latLongEE8allocateERS2_m = comdat any + +$_ZN9__gnu_cxx13new_allocatorI7latLongE8allocateEmPKv = comdat any + +$_ZSt22__uninitialized_copy_aIP7latLongS1_S0_ET0_T_S3_S2_RSaIT1_E = comdat any + +$_ZSt18uninitialized_copyIP7latLongS1_ET0_T_S3_S2_ = comdat any + +$_ZNSt20__uninitialized_copyILb1EE13__uninit_copyIP7latLongS3_EET0_T_S5_S4_ = comdat any + +$_ZSt4copyIP7latLongS1_ET0_T_S3_S2_ = comdat any + +$_ZSt14__copy_move_a2ILb0EP7latLongS1_ET1_T0_S3_S2_ = comdat any + +$_ZSt12__miter_baseIP7latLongET_S2_ = comdat any + +$_ZSt13__copy_move_aILb0EP7latLongS1_ET1_T0_S3_S2_ = comdat any + +$_ZSt12__niter_baseIP7latLongET_S2_ = comdat any + +$_ZNSt11__copy_moveILb0ELb1ESt26random_access_iterator_tagE8__copy_mI7latLongEEPT_PKS4_S7_S5_ = comdat any + +$_ZN9__gnu_cxx13new_allocatorI7latLongE7destroyEPS1_ = comdat any + +$_ZN9__gnu_cxx14__alloc_traitsISaI6recordEE9constructIS1_EEvRS2_PS1_RKT_ = comdat any + +$_ZNSt6vectorI6recordSaIS0_EE17_M_realloc_insertEN9__gnu_cxx17__normal_iteratorIPS0_S2_EERKS0_ = comdat any + +$_ZNSt6vectorI6recordSaIS0_EE3endEv = comdat any + +$_ZN9__gnu_cxx13new_allocatorI6recordE9constructEPS1_RKS1_ = comdat any + +$_ZNKSt6vectorI6recordSaIS0_EE12_M_check_lenEmPKc = comdat any + +$_ZN9__gnu_cxxmiIP6recordSt6vectorIS1_SaIS1_EEEENS_17__normal_iteratorIT_T0_E15difference_typeERKS9_SC_ = comdat any + +$_ZNSt6vectorI6recordSaIS0_EE5beginEv = comdat any + +$_ZNSt12_Vector_baseI6recordSaIS0_EE11_M_allocateEm = comdat any + +$_ZSt34__uninitialized_move_if_noexcept_aIP6recordS1_SaIS0_EET0_T_S4_S3_RT1_ = comdat any + +$_ZNK9__gnu_cxx17__normal_iteratorIP6recordSt6vectorIS1_SaIS1_EEE4baseEv = comdat any + +$_ZN9__gnu_cxx14__alloc_traitsISaI6recordEE7destroyERS2_PS1_ = comdat any + +$_ZNKSt6vectorI6recordSaIS0_EE8max_sizeEv = comdat any + 
+$_ZNKSt6vectorI6recordSaIS0_EE4sizeEv = comdat any + +$_ZN9__gnu_cxx14__alloc_traitsISaI6recordEE8max_sizeERKS2_ = comdat any + +$_ZNKSt12_Vector_baseI6recordSaIS0_EE19_M_get_Tp_allocatorEv = comdat any + +$_ZNK9__gnu_cxx13new_allocatorI6recordE8max_sizeEv = comdat any + +$_ZN9__gnu_cxx17__normal_iteratorIP6recordSt6vectorIS1_SaIS1_EEEC2ERKS2_ = comdat any + +$_ZN9__gnu_cxx14__alloc_traitsISaI6recordEE8allocateERS2_m = comdat any + +$_ZN9__gnu_cxx13new_allocatorI6recordE8allocateEmPKv = comdat any + +$_ZSt22__uninitialized_copy_aIP6recordS1_S0_ET0_T_S3_S2_RSaIT1_E = comdat any + +$_ZSt18uninitialized_copyIP6recordS1_ET0_T_S3_S2_ = comdat any + +$_ZNSt20__uninitialized_copyILb1EE13__uninit_copyIP6recordS3_EET0_T_S5_S4_ = comdat any + +$_ZSt4copyIP6recordS1_ET0_T_S3_S2_ = comdat any + +$_ZSt14__copy_move_a2ILb0EP6recordS1_ET1_T0_S3_S2_ = comdat any + +$_ZSt12__miter_baseIP6recordET_S2_ = comdat any + +$_ZSt13__copy_move_aILb0EP6recordS1_ET1_T0_S3_S2_ = comdat any + +$_ZSt12__niter_baseIP6recordET_S2_ = comdat any + +$_ZNSt11__copy_moveILb0ELb1ESt26random_access_iterator_tagE8__copy_mI6recordEEPT_PKS4_S7_S5_ = comdat any + +$_ZN9__gnu_cxx13new_allocatorI6recordE7destroyEPS1_ = comdat any + +@.str = private unnamed_addr constant [12 x i8] c"before all\0A\00", align 1 +@.str.1 = private unnamed_addr constant [18 x i8] c"after before all\0A\00", align 1 +@.str.2 = private unnamed_addr constant [13 x i8] c"before call\0A\00", align 1 +@.str.3 = private unnamed_addr constant [12 x i8] c"after call\0A\00", align 1 +@.str.4 = private unnamed_addr constant [13 x i8] c"before find\0A\00", align 1 +@.str.5 = private unnamed_addr constant [12 x i8] c"after find\0A\00", align 1 +@.str.6 = private unnamed_addr constant [20 x i8] c"%s --> Distance=%f\0A\00", align 1 +@.str.7 = private unnamed_addr constant [2 x i8] c"r\00", align 1 +@.str.8 = private unnamed_addr constant [4 x i8] c"%s\0A\00", align 1 +@stderr = external dso_local global %struct._IO_FILE*, align 8 +@.str.9 = 
private unnamed_addr constant [24 x i8] c"error reading filelist\0A\00", align 1 +@.str.10 = private unnamed_addr constant [20 x i8] c"error opening a db\0A\00", align 1 +@.str.11 = private unnamed_addr constant [24 x i8] c"Nearest Neighbor Usage\0A\00", align 1 +@.str.12 = private unnamed_addr constant [2 x i8] c"\0A\00", align 1 +@.str.13 = private unnamed_addr constant [90 x i8] c"nearestNeighbor [filename] -r [int] -lat [float] -lng [float] [-hqt] [-p [int] -d [int]]\0A\00", align 1 +@.str.14 = private unnamed_addr constant [10 x i8] c"example:\0A\00", align 1 +@.str.15 = private unnamed_addr constant [55 x i8] c"$ ./nearestNeighbor filelist.txt -r 5 -lat 30 -lng 90\0A\00", align 1 +@.str.16 = private unnamed_addr constant [59 x i8] c"filename the filename that lists the data input files\0A\00", align 1 +@.str.17 = private unnamed_addr constant [60 x i8] c"-r [int] the number of records to return (default: 10)\0A\00", align 1 +@.str.18 = private unnamed_addr constant [62 x i8] c"-lat [float] the latitude for nearest neighbors (default: 0)\0A\00", align 1 +@.str.19 = private unnamed_addr constant [63 x i8] c"-lng [float] the longitude for nearest neighbors (default: 0)\0A\00", align 1 +@.str.20 = private unnamed_addr constant [36 x i8] c"-h, --help Display the help file\0A\00", align 1 +@.str.21 = private unnamed_addr constant [52 x i8] c"-q Quiet mode. Suppress all text output.\0A\00", align 1 +@.str.22 = private unnamed_addr constant [40 x i8] c"-t Print timing information.\0A\00", align 1 +@.str.23 = private unnamed_addr constant [73 x i8] c"-p [int] Choose the platform (must choose both platform and device)\0A\00", align 1 +@.str.24 = private unnamed_addr constant [71 x i8] c"-d [int] Choose the device (must choose both platform and device)\0A\00", align 1 +@.str.25 = private unnamed_addr constant [60 x i8] c"Notes: 1. The filename is required as the first parameter.\0A\00", align 1 +@.str.26 = private unnamed_addr constant [61 x i8] c" 2. 
If you declare either the device or the platform,\0A\00", align 1 +@.str.27 = private unnamed_addr constant [35 x i8] c" you must declare both.\0A\0A\00", align 1 +@.str.28 = private unnamed_addr constant [26 x i8] c"vector::_M_realloc_insert\00", align 1 +@0 = private unnamed_addr constant [24 x i8] c"_Z6euclidP7latLongPfiff\00", align 1 +@1 = private constant [8313 x i8] c"P\EDU\BA\01\00\10\00h \00\00\00\00\00\00\02\00\01\01@\00\00\00\A8\19\00\00\00\00\00\00\00\00\00\00\00\00\00\00\07\00\01\00=\00\00\00\00\00\00\00\00\00\00\00\11\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\7FELF\02\01\013\07\00\00\00\00\00\00\00\02\00\BE\00e\00\00\00\00\00\00\00\00\00\00\00\00\19\00\00\00\00\00\00\C0\16\00\00\00\00\00\00=\05=\00@\008\00\03\00@\00\09\00\01\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00.text._Z6euclidP7latLongPfiff\00.nv.info._Z6euclidP7latLongPfiff\00.nv.shared._Z6euclidP7latLongPfiff\00.nv.global\00.nv.constant0._Z6euclidP7latLongPfiff\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00_Z6euclidP7latLongPfiff\00.text._Z6euclidP7latLongPfiff\00.nv.info._Z6euclidP7latLongPfiff\00.nv.shared._Z6euclidP7latLongPfiff\00.nv.global\00blockDim\00gridDim\00blockIdx\00threadIdx\00$_Z6euclidP7latLongPfiff$__cuda_sm20_sqrt_rn_f32_slowpath\00.nv.constant0._Z6euclidP7latLongPfiff\00_param\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00J\00\00\00\03\00\07\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\AC\00\00\00\03\00\08\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\B7\00\00\00\01\00\08\00\01\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\C0\00\00\00\01\00\08\00\03\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\C8\00\00\00\01\00\08\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\D1\00\00\00\01\00\08\00\02\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\DB\00\00\00\22\00\07\00\D0\0E\00\00\00\00\00\00p\02\00\00\00\00\00\00\15\01\00\00\03\00\06\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\
002\00\00\00\12\10\07\00\00\00\00\00\00\00\00\00@\11\00\00\00\00\00\00\04/\08\00\09\00\00\00\0D\00\00\00\04#\08\00\07\00\00\00\00\00\00\00\04\12\08\00\07\00\00\00\00\00\00\00\04\11\08\00\07\00\00\00\00\00\00\00\04#\08\00\09\00\00\00\00\00\00\00\04\12\08\00\09\00\00\008\00\00\00\04\11\08\00\09\00\00\008\00\00\00\010\00\00\01*\00\00\04\0A\08\00\08\00\00\00@\01\1C\00\03\19\1C\00\04\17\0C\00\00\00\00\00\04\00\18\00\00\F0\11\00\04\17\0C\00\00\00\00\00\03\00\14\00\00\F0\11\00\04\17\0C\00\00\00\00\00\02\00\10\00\00\F0\11\00\04\17\0C\00\00\00\00\00\01\00\08\00\00\F0!\00\04\17\0C\00\00\00\00\00\00\00\00\00\00\F0!\00\03\1B\FF\00\04\1D\08\00\18\04\00\008\04\00\00\04\1C\04\00\C8\0E\00\00\04\1E\04\00\90\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\EF\1F\E0\FD\03!\00\D3rd<3>;\0A\0Amov.u2\00\1B,e\00b;\0Acvta\8D\00\04%\00\13,\\\00\22ld\C8\00\02\18\00nrd2, 
[\CE\00\1E])\00\1F1)\00\01a0];\0Ast#\00\81[%SP+0],,\00\0A\16\00\128\16\00\222;\B6\00\01\D8\00a1, 999(\00\02g\00\00\EF\00\18[\9E\01\03M\00\AF1;\0Aret;\0A\0A}\D6\01\1A\FE\02FuncGetAttributes\E1\01\0D#\00\0E\EC\01\0F+\00\06\0F\F7\01\1B\1F1\F7\01Q\1F1\F7\01!\0E\D9\00\0F\02\02\0F\0E8\01\0F\0D\02\8DhDevice\B4\00\0E\0E\02\0E$\00\0F\0F\02\00/32,\00\0B\1F1,\00\18\1F2<\02\13\1F2<\02\1F\1D4<\02\1F2<\02\0C\1F2<\02\13\01_\00\04;\02\0F\D9\00\07\1D]4\00\1F14\00\06\0Fp\02\10\0E\9A\01\0Fq\02\12(32q\02\0B\15\00!12\16\00\09\86\02\1F3\86\02\15\1F3\86\02#2Get\CB\00\0E}\02\05\1B\00\04\DA\00\0F\1C\02\13?3[8W\04.\0F\1B\02\0D\1F3W\04\19\04\B3\01\0D\D0\00\0F\AA\01\06\0F\05\04W\F0\04OccupancyMaxActiveBV\08\FE\03sPerMultiprocessor\9F\01\0F;\00\16\0EB\06\0FC\00%\0EJ\04\0FC\00\1E\1F2\86\00/\1F3\88\02\13O4[32\89\02\1C\1D3\89\02\1F4\89\02\0C\1F4\89\02\19\133\89\02\0F\F1\00\1E\0F\BC\04\00\0FK\00$/2]w\07\00\0FL\00$\0F\1F\05\01\0F\98\00%\0F\A7\07\1D\097\05\186M\05\04,\00\2224-\00\183\CF\03\1F2\CF\03\15\1F2\CF\03L\9FWithFlags\D8\03(\05D\00\0E\E1\03\0FL\00'\0F\EA\030\0CL\00\1F2\98\008\1F3\98\008\1F4H\04\13O5[40\EC\08.\0FH\04\0D\1F5\EC\08\1C\0F\F9\00+\1F]\9C\040\0D\9A\01\0F\A5\040\0D:\02\0F\AE\041\0D\DB\02\0F\B7\041\0D|\03\0F\C0\04I\08-\00\1F3$\0A6\F6\17visible .entry _Z6euclidP7latLongPfiff\A0\04\00\9A\00\0F%\00\04\0E\81\04\0F-\00\0F\07b\04\00\C6\00\0F-\00\0B\07C\04\1Ff-\00\0E\1F3-\00\12\0F\05\04\1B?6[5\F1\0C\16wpred %p\D7\0A\00\87\00k%f<14>)\04\1E1r\08/17+\04\0C\1F6+\04\12\02s\00\00-\03\0F\00\01\0C\1D]5\00\1F15\00\0E\0F\EC\03\00\0F5\00\0F\0F\CD\03\01\0F\A0\00\0F\0F\AE\03\01\0Fl\00\0F#0]%\01#to\99\13\07\E8\04\02\82\03\01[\0E\0A\1C\00\144q\03\0F;\00\03\145\D3\03\0F;\00\00\116\1C\00\1F5\EF\03\02\1A6\16\00\03\EF\03*d4\D9\03'24\06\04\15f\16\00\01D\00\1Bf\16\00\02\05\04+f2\DB\08{%ntid.x\1B\04\\%ncta\18\00\00\EB\00\02\17\00\B1y;\0Amul.lo.s\1A\00#5,7\00(r41\00\1561\00cx;\0Aadd.\00$7,3\00\1B6H\00#8,\95\00(r7H\00\\9, 
%tF\00410,2\00\1B9\EF\04\03\1C\05\110\06\02\03m\01$7,`\01\01\16\00\02D\005d8,3\00T;\0AshlR\03#9,\1E\00\133s\00\03E\02#0,L\00\00$\00\0A\A3\01\144\BF\05\03w\00\02\E2\02\181a\00\06\17\00%2,\C1\01\92;\0Asetp.ge\94\003p1,8\00\00'\00\F2\0B;\0A@%p1 bra LBB6_2;\0Abra.uni\10\0021;\0A\08\00\17:\EB\00\05u\00)16\ED\00/12\EE\00\04413, \00\1A2\F0\00$4,Q\00\01'\00\0B\F2\00\03\AC\02!14f\00\02\09\04%3,\85\02\08\7F\01515,'\01\07.\00\104\16\00\00\1E\00\00\91\00Sub.rn\19\00\225,L\00(%f]\00%6,\CC\02\07\16\00\147E\00,+4G\00\228,6\002%f7k\02\05\1A\00$9,\1F\00f8;\0Afma\1A\00\01\E5\01%f5\05\00\1A9H\03\124]\03\170\81\00\05q\01*4],\00\120,\00\181,\00\04\85\01\01\FE\077qrty\00\01\F0\00)128\01%6,m\01\07\\\00\22rd\12\04;f13\12\02\132\12\02\B02:\0Aret;\0A\0A}\0A\00\00\00\00\00", section ".nv_fatbin", align 8 +@__cuda_fatbin_wrapper = internal constant { i32, i32, i8*, i8* } { i32 1180844977, i32 1, i8* getelementptr inbounds ([8313 x i8], [8313 x i8]* @1, i64 0, i64 0), i8* null }, section ".nvFatBinSegment", align 8 +@__cuda_gpubin_handle = internal global i8** null, align 8 +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* bitcast (void (i8*)* @__cuda_module_ctor to void ()*), i8* null }] + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z6euclidP7latLongPfiff(%struct.latLong* %d_locations, float* %d_distances, i32 %numRecords, float %lat, float %lng) #0 { +entry: + %d_locations.addr = alloca %struct.latLong*, align 8 + %d_distances.addr = alloca float*, align 8 + %numRecords.addr = alloca i32, align 4 + %lat.addr = alloca float, align 4 + %lng.addr = alloca float, align 4 + %grid_dim = alloca %struct.dim3, align 8 + %block_dim = alloca %struct.dim3, align 8 + %shmem_size = alloca i64, align 8 + %stream = alloca i8*, align 8 + %grid_dim.coerce = alloca { i64, i32 }, align 8 + %block_dim.coerce = alloca { i64, i32 }, align 8 + store %struct.latLong* %d_locations, %struct.latLong** %d_locations.addr, align 8 + 
store float* %d_distances, float** %d_distances.addr, align 8 + store i32 %numRecords, i32* %numRecords.addr, align 4 + store float %lat, float* %lat.addr, align 4 + store float %lng, float* %lng.addr, align 4 + %kernel_args = alloca i8*, i64 5, align 16 + %0 = bitcast %struct.latLong** %d_locations.addr to i8* + %1 = getelementptr i8*, i8** %kernel_args, i32 0 + store i8* %0, i8** %1 + %2 = bitcast float** %d_distances.addr to i8* + %3 = getelementptr i8*, i8** %kernel_args, i32 1 + store i8* %2, i8** %3 + %4 = bitcast i32* %numRecords.addr to i8* + %5 = getelementptr i8*, i8** %kernel_args, i32 2 + store i8* %4, i8** %5 + %6 = bitcast float* %lat.addr to i8* + %7 = getelementptr i8*, i8** %kernel_args, i32 3 + store i8* %6, i8** %7 + %8 = bitcast float* %lng.addr to i8* + %9 = getelementptr i8*, i8** %kernel_args, i32 4 + store i8* %8, i8** %9 + %10 = call i32 @__cudaPopCallConfiguration(%struct.dim3* %grid_dim, %struct.dim3* %block_dim, i64* %shmem_size, i8** %stream) + %11 = load i64, i64* %shmem_size, align 8 + %12 = load i8*, i8** %stream, align 8 + %13 = bitcast { i64, i32 }* %grid_dim.coerce to i8* + %14 = bitcast %struct.dim3* %grid_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %13, i8* align 8 %14, i64 12, i1 false) + %15 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 0 + %16 = load i64, i64* %15, align 8 + %17 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 1 + %18 = load i32, i32* %17, align 8 + %19 = bitcast { i64, i32 }* %block_dim.coerce to i8* + %20 = bitcast %struct.dim3* %block_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %19, i8* align 8 %20, i64 12, i1 false) + %21 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 0 + %22 = load i64, i64* %21, align 8 + %23 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 1 + %24 = load i32, i32* %23, align 8 + %25 = bitcast i8* %12 to 
%struct.CUstream_st* + %call = call i32 @cudaLaunchKernel(i8* bitcast (void (%struct.latLong*, float*, i32, float, float)* @_Z6euclidP7latLongPfiff to i8*), i64 %16, i32 %18, i64 %22, i32 %24, i8** %kernel_args, i64 %11, %struct.CUstream_st* %25) + br label %setup.end + +setup.end: ; preds = %entry + ret void +} + +declare dso_local i32 @__cudaPopCallConfiguration(%struct.dim3*, %struct.dim3*, i64*, i8**) + +declare dso_local i32 @cudaLaunchKernel(i8*, i64, i32, i64, i32, i8**, i64, %struct.CUstream_st*) + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #1 + +; Function Attrs: noinline norecurse optnone uwtable +define dso_local i32 @main(i32 %argc, i8** %argv) #2 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %retval = alloca i32, align 4 + %argc.addr = alloca i32, align 4 + %argv.addr = alloca i8**, align 8 + %i = alloca i32, align 4 + %lat = alloca float, align 4 + %lng = alloca float, align 4 + %quiet = alloca i32, align 4 + %timing = alloca i32, align 4 + %platform = alloca i32, align 4 + %device = alloca i32, align 4 + %records = alloca %"class.std::vector", align 8 + %locations = alloca %"class.std::vector.0", align 8 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + %filename = alloca [100 x i8], align 16 + %resultsCount = alloca i32, align 4 + %cleanup.dest.slot = alloca i32, align 4 + %numRecords = alloca i32, align 4 + %distances = alloca float*, align 8 + %d_locations = alloca %struct.latLong*, align 8 + %d_distances = alloca float*, align 8 + %deviceProp = alloca %struct.cudaDeviceProp, align 8 + %maxGridX = alloca i64, align 8 + %threadsPerBlock = alloca i64, align 8 + %totalDeviceMemory = alloca i64, align 8 + %freeDeviceMemory = alloca i64, align 8 + %blocks = alloca i64, align 8 + %gridY = alloca i64, align 8 + %gridX = alloca i64, align 8 + %gridDim = alloca %struct.dim3, align 4 + 
%agg.tmp = alloca %struct.dim3, align 4 + %agg.tmp46 = alloca %struct.dim3, align 4 + %agg.tmp.coerce = alloca { i64, i32 }, align 4 + %agg.tmp46.coerce = alloca { i64, i32 }, align 4 + store i32 0, i32* %retval, align 4 + store i32 %argc, i32* %argc.addr, align 4 + store i8** %argv, i8*** %argv.addr, align 8 + %call = call i32 @cudaSetDevice(i32 0) + store i32 0, i32* %i, align 4 + store i32 0, i32* %quiet, align 4 + store i32 0, i32* %timing, align 4 + store i32 0, i32* %platform, align 4 + store i32 0, i32* %device, align 4 + call void @_ZNSt6vectorI6recordSaIS0_EEC2Ev(%"class.std::vector"* %records) + invoke void @_ZNSt6vectorI7latLongSaIS0_EEC2Ev(%"class.std::vector.0"* %locations) + to label %invoke.cont unwind label %lpad + +invoke.cont: ; preds = %entry + store i32 10, i32* %resultsCount, align 4 + %0 = load i32, i32* %argc.addr, align 4 + %1 = load i8**, i8*** %argv.addr, align 8 + %arraydecay = getelementptr inbounds [100 x i8], [100 x i8]* %filename, i64 0, i64 0 + %call3 = invoke i32 @_Z16parseCommandlineiPPcS_PiPfS2_S1_S1_S1_S1_(i32 %0, i8** %1, i8* %arraydecay, i32* %resultsCount, float* %lat, float* %lng, i32* %quiet, i32* %timing, i32* %platform, i32* %device) + to label %invoke.cont2 unwind label %lpad1 + +invoke.cont2: ; preds = %invoke.cont + %tobool = icmp ne i32 %call3, 0 + br i1 %tobool, label %if.then, label %if.end + +if.then: ; preds = %invoke.cont2 + invoke void @_Z10printUsagev() + to label %invoke.cont4 unwind label %lpad1 + +invoke.cont4: ; preds = %if.then + store i32 0, i32* %retval, align 4 + store i32 1, i32* %cleanup.dest.slot, align 4 + br label %cleanup + +lpad: ; preds = %cleanup, %entry + %2 = landingpad { i8*, i32 } + cleanup + %3 = extractvalue { i8*, i32 } %2, 0 + store i8* %3, i8** %exn.slot, align 8 + %4 = extractvalue { i8*, i32 } %2, 1 + store i32 %4, i32* %ehselector.slot, align 4 + br label %ehcleanup + +lpad1: ; preds = %invoke.cont80, %if.end79, %invoke.cont74, %invoke.cont70, %for.body, %invoke.cont63, 
%invoke.cont61, %invoke.cont59, %invoke.cont55, %invoke.cont53, %kcall.end, %kcall.configok, %invoke.cont48, %invoke.cont44, %invoke.cont42, %invoke.cont38, %invoke.cont36, %invoke.cont32, %invoke.cont27, %invoke.cont16, %invoke.cont14, %invoke.cont12, %if.end11, %invoke.cont5, %if.end, %if.then, %invoke.cont + %5 = landingpad { i8*, i32 } + cleanup + %6 = extractvalue { i8*, i32 } %5, 0 + store i8* %6, i8** %exn.slot, align 8 + %7 = extractvalue { i8*, i32 } %5, 1 + store i32 %7, i32* %ehselector.slot, align 4 + invoke void @_ZNSt6vectorI7latLongSaIS0_EED2Ev(%"class.std::vector.0"* %locations) + to label %invoke.cont85 unwind label %terminate.lpad + +if.end: ; preds = %invoke.cont2 + %call6 = invoke i32 (i8*, ...) @printf(i8* getelementptr inbounds ([12 x i8], [12 x i8]* @.str, i64 0, i64 0)) + to label %invoke.cont5 unwind label %lpad1 + +invoke.cont5: ; preds = %if.end + %arraydecay7 = getelementptr inbounds [100 x i8], [100 x i8]* %filename, i64 0, i64 0 + %call9 = invoke i32 @_Z8loadDataPcRSt6vectorI6recordSaIS1_EERS0_I7latLongSaIS5_EE(i8* %arraydecay7, %"class.std::vector"* dereferenceable(24) %records, %"class.std::vector.0"* dereferenceable(24) %locations) + to label %invoke.cont8 unwind label %lpad1 + +invoke.cont8: ; preds = %invoke.cont5 + store i32 %call9, i32* %numRecords, align 4 + %8 = load i32, i32* %resultsCount, align 4 + %9 = load i32, i32* %numRecords, align 4 + %cmp = icmp sgt i32 %8, %9 + br i1 %cmp, label %if.then10, label %if.end11 + +if.then10: ; preds = %invoke.cont8 + %10 = load i32, i32* %numRecords, align 4 + store i32 %10, i32* %resultsCount, align 4 + br label %if.end11 + +if.end11: ; preds = %if.then10, %invoke.cont8 + %call13 = invoke i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str.1, i64 0, i64 0)) + to label %invoke.cont12 unwind label %lpad1 + +invoke.cont12: ; preds = %if.end11 + %call15 = invoke i32 @cudaGetDeviceProperties(%struct.cudaDeviceProp* %deviceProp, i32 0) + to label %invoke.cont14 unwind label %lpad1 + +invoke.cont14: ; preds = %invoke.cont12 + %call17 = invoke i32 @cudaDeviceSynchronize() + to label %invoke.cont16 unwind label %lpad1 + +invoke.cont16: ; preds = %invoke.cont14 + %maxGridSize = getelementptr inbounds %struct.cudaDeviceProp, %struct.cudaDeviceProp* %deviceProp, i32 0, i32 11 + %arrayidx = getelementptr inbounds [3 x i32], [3 x i32]* %maxGridSize, i64 0, i64 0 + %11 = load i32, i32* %arrayidx, align 8 + %conv = sext i32 %11 to i64 + store i64 %conv, i64* %maxGridX, align 8 + store i64 256, i64* %threadsPerBlock, align 8 + %12 = load i32, i32* %numRecords, align 4 + %conv18 = sext i32 %12 to i64 + %13 = load i64, i64* %threadsPerBlock, align 8 + %add = add i64 %conv18, %13 + %sub = sub i64 %add, 1 + %14 = load i64, i64* %threadsPerBlock, align 8 + %div = udiv i64 %sub, %14 + store i64 %div, i64* %blocks, align 8 + %15 = load i64, i64* %blocks, align 8 + %16 = load i64, i64* %maxGridX, align 8 + %add19 = add i64 %15, %16 + %sub20 = sub i64 %add19, 1 + %17 = load i64, i64* %maxGridX, align 8 + %div21 = udiv i64 %sub20, %17 + store i64 %div21, i64* %gridY, align 8 + %18 = load i64, i64* %blocks, align 8 + %19 = load i64, i64* %gridY, align 8 + %add22 = add i64 %18, %19 + %sub23 = sub i64 %add22, 1 + %20 = load i64, i64* %gridY, align 8 + %div24 = udiv i64 %sub23, %20 + store i64 %div24, i64* %gridX, align 8 + %21 = load i64, i64* %gridX, align 8 + %conv25 = trunc i64 %21 to i32 + %22 = load i64, i64* %gridY, align 8 + %conv26 = trunc i64 %22 to i32 + invoke void @_ZN4dim3C2Ejjj(%struct.dim3* %gridDim, i32 %conv25, i32 %conv26, i32 1) + to label %invoke.cont27 unwind label %lpad1 + +invoke.cont27: ; preds = %invoke.cont16 + %23 = load i32, i32* 
%numRecords, align 4 + %conv28 = sext i32 %23 to i64 + %mul = mul i64 4, %conv28 + %call29 = call noalias i8* @malloc(i64 %mul) #12 + %24 = bitcast i8* %call29 to float* + store float* %24, float** %distances, align 8 + %25 = bitcast %struct.latLong** %d_locations to i8** + %26 = load i32, i32* %numRecords, align 4 + %conv30 = sext i32 %26 to i64 + %mul31 = mul i64 8, %conv30 + %call33 = invoke i32 @cudaMalloc(i8** %25, i64 %mul31) + to label %invoke.cont32 unwind label %lpad1 + +invoke.cont32: ; preds = %invoke.cont27 + %27 = bitcast float** %d_distances to i8** + %28 = load i32, i32* %numRecords, align 4 + %conv34 = sext i32 %28 to i64 + %mul35 = mul i64 4, %conv34 + %call37 = invoke i32 @cudaMalloc(i8** %27, i64 %mul35) + to label %invoke.cont36 unwind label %lpad1 + +invoke.cont36: ; preds = %invoke.cont32 + %29 = load %struct.latLong*, %struct.latLong** %d_locations, align 8 + %30 = bitcast %struct.latLong* %29 to i8* + %call39 = invoke dereferenceable(8) %struct.latLong* @_ZNSt6vectorI7latLongSaIS0_EEixEm(%"class.std::vector.0"* %locations, i64 0) + to label %invoke.cont38 unwind label %lpad1 + +invoke.cont38: ; preds = %invoke.cont36 + %31 = bitcast %struct.latLong* %call39 to i8* + %32 = load i32, i32* %numRecords, align 4 + %conv40 = sext i32 %32 to i64 + %mul41 = mul i64 8, %conv40 + %call43 = invoke i32 @cudaMemcpy(i8* %30, i8* %31, i64 %mul41, i32 1) + to label %invoke.cont42 unwind label %lpad1 + +invoke.cont42: ; preds = %invoke.cont38 + %call45 = invoke i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str.2, i64 0, i64 0)) + to label %invoke.cont44 unwind label %lpad1 + +invoke.cont44: ; preds = %invoke.cont42 + %33 = bitcast %struct.dim3* %agg.tmp to i8* + %34 = bitcast %struct.dim3* %gridDim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %33, i8* align 4 %34, i64 12, i1 false) + %35 = load i64, i64* %threadsPerBlock, align 8 + %conv47 = trunc i64 %35 to i32 + invoke void @_ZN4dim3C2Ejjj(%struct.dim3* %agg.tmp46, i32 %conv47, i32 1, i32 1) + to label %invoke.cont48 unwind label %lpad1 + +invoke.cont48: ; preds = %invoke.cont44 + %36 = bitcast { i64, i32 }* %agg.tmp.coerce to i8* + %37 = bitcast %struct.dim3* %agg.tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %36, i8* align 4 %37, i64 12, i1 false) + %38 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 0 + %39 = load i64, i64* %38, align 4 + %40 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 1 + %41 = load i32, i32* %40, align 4 + %42 = bitcast { i64, i32 }* %agg.tmp46.coerce to i8* + %43 = bitcast %struct.dim3* %agg.tmp46 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %42, i8* align 4 %43, i64 12, i1 false) + %44 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp46.coerce, i32 0, i32 0 + %45 = load i64, i64* %44, align 4 + %46 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp46.coerce, i32 0, i32 1 + %47 = load i32, i32* %46, align 4 + %call50 = invoke i32 @__cudaPushCallConfiguration(i64 %39, i32 %41, i64 %45, i32 %47, i64 0, i8* null) + to label %invoke.cont49 unwind label %lpad1 + +invoke.cont49: ; preds = %invoke.cont48 + %tobool51 = icmp ne i32 %call50, 0 + br i1 %tobool51, label %kcall.end, label %kcall.configok + +kcall.configok: ; preds = %invoke.cont49 + %48 = load %struct.latLong*, %struct.latLong** %d_locations, align 8 + %49 = load float*, float** %d_distances, align 8 + %50 = load i32, i32* %numRecords, align 4 + 
%51 = load float, float* %lat, align 4 + %52 = load float, float* %lng, align 4 + invoke void @_Z6euclidP7latLongPfiff(%struct.latLong* %48, float* %49, i32 %50, float %51, float %52) + to label %invoke.cont52 unwind label %lpad1 + +invoke.cont52: ; preds = %kcall.configok + br label %kcall.end + +kcall.end: ; preds = %invoke.cont52, %invoke.cont49 + %call54 = invoke i32 @cudaDeviceSynchronize() + to label %invoke.cont53 unwind label %lpad1 + +invoke.cont53: ; preds = %kcall.end + %call56 = invoke i32 (i8*, ...) @printf(i8* getelementptr inbounds ([12 x i8], [12 x i8]* @.str.3, i64 0, i64 0)) + to label %invoke.cont55 unwind label %lpad1 + +invoke.cont55: ; preds = %invoke.cont53 + %53 = load float*, float** %distances, align 8 + %54 = bitcast float* %53 to i8* + %55 = load float*, float** %d_distances, align 8 + %56 = bitcast float* %55 to i8* + %57 = load i32, i32* %numRecords, align 4 + %conv57 = sext i32 %57 to i64 + %mul58 = mul i64 4, %conv57 + %call60 = invoke i32 @cudaMemcpy(i8* %54, i8* %56, i64 %mul58, i32 2) + to label %invoke.cont59 unwind label %lpad1 + +invoke.cont59: ; preds = %invoke.cont55 + %call62 = invoke i32 (i8*, ...) @printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str.4, i64 0, i64 0)) + to label %invoke.cont61 unwind label %lpad1 + +invoke.cont61: ; preds = %invoke.cont59 + %58 = load float*, float** %distances, align 8 + %59 = load i32, i32* %numRecords, align 4 + %60 = load i32, i32* %resultsCount, align 4 + invoke void @_Z10findLowestRSt6vectorI6recordSaIS0_EEPfii(%"class.std::vector"* dereferenceable(24) %records, float* %58, i32 %59, i32 %60) + to label %invoke.cont63 unwind label %lpad1 + +invoke.cont63: ; preds = %invoke.cont61 + %call65 = invoke i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([12 x i8], [12 x i8]* @.str.5, i64 0, i64 0)) + to label %invoke.cont64 unwind label %lpad1 + +invoke.cont64: ; preds = %invoke.cont63 + %61 = load i32, i32* %quiet, align 4 + %tobool66 = icmp ne i32 %61, 0 + br i1 %tobool66, label %if.end79, label %if.then67 + +if.then67: ; preds = %invoke.cont64 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %if.then67 + %62 = load i32, i32* %i, align 4 + %63 = load i32, i32* %resultsCount, align 4 + %cmp68 = icmp slt i32 %62, %63 + br i1 %cmp68, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %64 = load i32, i32* %i, align 4 + %conv69 = sext i32 %64 to i64 + %call71 = invoke dereferenceable(60) %struct.record* @_ZNSt6vectorI6recordSaIS0_EEixEm(%"class.std::vector"* %records, i64 %conv69) + to label %invoke.cont70 unwind label %lpad1 + +invoke.cont70: ; preds = %for.body + %recString = getelementptr inbounds %struct.record, %struct.record* %call71, i32 0, i32 0 + %arraydecay72 = getelementptr inbounds [53 x i8], [53 x i8]* %recString, i64 0, i64 0 + %65 = load i32, i32* %i, align 4 + %conv73 = sext i32 %65 to i64 + %call75 = invoke dereferenceable(60) %struct.record* @_ZNSt6vectorI6recordSaIS0_EEixEm(%"class.std::vector"* %records, i64 %conv73) + to label %invoke.cont74 unwind label %lpad1 + +invoke.cont74: ; preds = %invoke.cont70 + %distance = getelementptr inbounds %struct.record, %struct.record* %call75, i32 0, i32 1 + %66 = load float, float* %distance, align 4 + %conv76 = fpext float %66 to double + %call78 = invoke i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([20 x i8], [20 x i8]* @.str.6, i64 0, i64 0), i8* %arraydecay72, double %conv76) + to label %invoke.cont77 unwind label %lpad1 + +invoke.cont77: ; preds = %invoke.cont74 + br label %for.inc + +for.inc: ; preds = %invoke.cont77 + %67 = load i32, i32* %i, align 4 + %inc = add nsw i32 %67, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + br label %if.end79 + +if.end79: ; preds = %for.end, %invoke.cont64 + %68 = load float*, float** %distances, align 8 + %69 = bitcast float* %68 to i8* + call void @free(i8* %69) #12 + %70 = load %struct.latLong*, %struct.latLong** %d_locations, align 8 + %71 = bitcast %struct.latLong* %70 to i8* + %call81 = invoke i32 @cudaFree(i8* %71) + to label %invoke.cont80 unwind label %lpad1 + +invoke.cont80: ; preds = %if.end79 + %72 = load float*, float** %d_distances, align 8 + %73 = bitcast float* %72 to i8* + %call83 = invoke i32 @cudaFree(i8* %73) + to label %invoke.cont82 unwind label %lpad1 + +invoke.cont82: ; preds = %invoke.cont80 + store i32 0, i32* %cleanup.dest.slot, align 4 + br label %cleanup + +cleanup: ; preds = %invoke.cont82, %invoke.cont4 + invoke void @_ZNSt6vectorI7latLongSaIS0_EED2Ev(%"class.std::vector.0"* %locations) + to label %invoke.cont84 unwind label %lpad + +invoke.cont84: ; preds = %cleanup + call void @_ZNSt6vectorI6recordSaIS0_EED2Ev(%"class.std::vector"* %records) + %cleanup.dest = load i32, i32* %cleanup.dest.slot, align 4 + switch i32 %cleanup.dest, label %unreachable [ + i32 0, label %cleanup.cont + i32 1, label %cleanup.cont + ] + +cleanup.cont: ; preds = %invoke.cont84, %invoke.cont84 + %74 = load i32, i32* %retval, align 4 + ret i32 %74 + +invoke.cont85: ; preds = %lpad1 + br label %ehcleanup + +ehcleanup: ; preds = %invoke.cont85, %lpad + invoke void @_ZNSt6vectorI6recordSaIS0_EED2Ev(%"class.std::vector"* %records) + to label %invoke.cont87 unwind label %terminate.lpad + +invoke.cont87: ; preds = %ehcleanup + br label 
%eh.resume + +eh.resume: ; preds = %invoke.cont87 + %exn = load i8*, i8** %exn.slot, align 8 + %sel = load i32, i32* %ehselector.slot, align 4 + %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn, 0 + %lpad.val88 = insertvalue { i8*, i32 } %lpad.val, i32 %sel, 1 + resume { i8*, i32 } %lpad.val88 + +terminate.lpad: ; preds = %ehcleanup, %lpad1 + %75 = landingpad { i8*, i32 } + catch i8* null + %76 = extractvalue { i8*, i32 } %75, 0 + call void @__clang_call_terminate(i8* %76) #13 + unreachable + +unreachable: ; preds = %invoke.cont84 + unreachable +} + +declare dso_local i32 @cudaSetDevice(i32) #3 + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt6vectorI6recordSaIS0_EEC2Ev(%"class.std::vector"* %this) unnamed_addr #0 comdat align 2 { +entry: + %this.addr = alloca %"class.std::vector"*, align 8 + store %"class.std::vector"* %this, %"class.std::vector"** %this.addr, align 8 + %this1 = load %"class.std::vector"*, %"class.std::vector"** %this.addr, align 8 + %0 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + call void @_ZNSt12_Vector_baseI6recordSaIS0_EEC2Ev(%"struct.std::_Vector_base"* %0) + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt6vectorI7latLongSaIS0_EEC2Ev(%"class.std::vector.0"* %this) unnamed_addr #0 comdat align 2 { +entry: + %this.addr = alloca %"class.std::vector.0"*, align 8 + store %"class.std::vector.0"* %this, %"class.std::vector.0"** %this.addr, align 8 + %this1 = load %"class.std::vector.0"*, %"class.std::vector.0"** %this.addr, align 8 + %0 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Vector_base.1"* + call void @_ZNSt12_Vector_baseI7latLongSaIS0_EEC2Ev(%"struct.std::_Vector_base.1"* %0) + ret void +} + +declare dso_local i32 @__gxx_personality_v0(...) 
+ +; Function Attrs: noinline optnone uwtable +define dso_local i32 @_Z16parseCommandlineiPPcS_PiPfS2_S1_S1_S1_S1_(i32 %argc, i8** %argv, i8* %filename, i32* %r, float* %lat, float* %lng, i32* %q, i32* %t, i32* %p, i32* %d) #0 { +entry: + %retval = alloca i32, align 4 + %argc.addr = alloca i32, align 4 + %argv.addr = alloca i8**, align 8 + %filename.addr = alloca i8*, align 8 + %r.addr = alloca i32*, align 8 + %lat.addr = alloca float*, align 8 + %lng.addr = alloca float*, align 8 + %q.addr = alloca i32*, align 8 + %t.addr = alloca i32*, align 8 + %p.addr = alloca i32*, align 8 + %d.addr = alloca i32*, align 8 + %i = alloca i32, align 4 + %flag = alloca i8, align 1 + store i32 %argc, i32* %argc.addr, align 4 + store i8** %argv, i8*** %argv.addr, align 8 + store i8* %filename, i8** %filename.addr, align 8 + store i32* %r, i32** %r.addr, align 8 + store float* %lat, float** %lat.addr, align 8 + store float* %lng, float** %lng.addr, align 8 + store i32* %q, i32** %q.addr, align 8 + store i32* %t, i32** %t.addr, align 8 + store i32* %p, i32** %p.addr, align 8 + store i32* %d, i32** %d.addr, align 8 + %0 = load i32, i32* %argc.addr, align 4 + %cmp = icmp slt i32 %0, 2 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + store i32 1, i32* %retval, align 4 + br label %return + +if.end: ; preds = %entry + %1 = load i8*, i8** %filename.addr, align 8 + %2 = load i8**, i8*** %argv.addr, align 8 + %arrayidx = getelementptr inbounds i8*, i8** %2, i64 1 + %3 = load i8*, i8** %arrayidx, align 8 + %call = call i8* @strncpy(i8* %1, i8* %3, i64 100) + store i32 1, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %if.end + %4 = load i32, i32* %i, align 4 + %5 = load i32, i32* %argc.addr, align 4 + %cmp1 = icmp slt i32 %4, %5 + br i1 %cmp1, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %6 = load i8**, i8*** %argv.addr, align 8 + %7 = load i32, i32* %i, align 4 + %idxprom = sext i32 %7 to i64 + %arrayidx2 = getelementptr 
inbounds i8*, i8** %6, i64 %idxprom + %8 = load i8*, i8** %arrayidx2, align 8 + %arrayidx3 = getelementptr inbounds i8, i8* %8, i64 0 + %9 = load i8, i8* %arrayidx3, align 1 + %conv = sext i8 %9 to i32 + %cmp4 = icmp eq i32 %conv, 45 + br i1 %cmp4, label %if.then5, label %if.end44 + +if.then5: ; preds = %for.body + %10 = load i8**, i8*** %argv.addr, align 8 + %11 = load i32, i32* %i, align 4 + %idxprom6 = sext i32 %11 to i64 + %arrayidx7 = getelementptr inbounds i8*, i8** %10, i64 %idxprom6 + %12 = load i8*, i8** %arrayidx7, align 8 + %arrayidx8 = getelementptr inbounds i8, i8* %12, i64 1 + %13 = load i8, i8* %arrayidx8, align 1 + store i8 %13, i8* %flag, align 1 + %14 = load i8, i8* %flag, align 1 + %conv9 = sext i8 %14 to i32 + switch i32 %conv9, label %sw.epilog [ + i32 114, label %sw.bb + i32 108, label %sw.bb13 + i32 104, label %sw.bb31 + i32 113, label %sw.bb32 + i32 116, label %sw.bb33 + i32 112, label %sw.bb34 + i32 100, label %sw.bb39 + ] + +sw.bb: ; preds = %if.then5 + %15 = load i32, i32* %i, align 4 + %inc = add nsw i32 %15, 1 + store i32 %inc, i32* %i, align 4 + %16 = load i8**, i8*** %argv.addr, align 8 + %17 = load i32, i32* %i, align 4 + %idxprom10 = sext i32 %17 to i64 + %arrayidx11 = getelementptr inbounds i8*, i8** %16, i64 %idxprom10 + %18 = load i8*, i8** %arrayidx11, align 8 + %call12 = call i32 @atoi(i8* %18) #14 + %19 = load i32*, i32** %r.addr, align 8 + store i32 %call12, i32* %19, align 4 + br label %sw.epilog + +sw.bb13: ; preds = %if.then5 + %20 = load i8**, i8*** %argv.addr, align 8 + %21 = load i32, i32* %i, align 4 + %idxprom14 = sext i32 %21 to i64 + %arrayidx15 = getelementptr inbounds i8*, i8** %20, i64 %idxprom14 + %22 = load i8*, i8** %arrayidx15, align 8 + %arrayidx16 = getelementptr inbounds i8, i8* %22, i64 2 + %23 = load i8, i8* %arrayidx16, align 1 + %conv17 = sext i8 %23 to i32 + %cmp18 = icmp eq i32 %conv17, 97 + br i1 %cmp18, label %if.then19, label %if.else + +if.then19: ; preds = %sw.bb13 + %24 = load i8**, i8*** 
%argv.addr, align 8 + %25 = load i32, i32* %i, align 4 + %add = add nsw i32 %25, 1 + %idxprom20 = sext i32 %add to i64 + %arrayidx21 = getelementptr inbounds i8*, i8** %24, i64 %idxprom20 + %26 = load i8*, i8** %arrayidx21, align 8 + %call22 = call double @atof(i8* %26) #14 + %conv23 = fptrunc double %call22 to float + %27 = load float*, float** %lat.addr, align 8 + store float %conv23, float* %27, align 4 + br label %if.end29 + +if.else: ; preds = %sw.bb13 + %28 = load i8**, i8*** %argv.addr, align 8 + %29 = load i32, i32* %i, align 4 + %add24 = add nsw i32 %29, 1 + %idxprom25 = sext i32 %add24 to i64 + %arrayidx26 = getelementptr inbounds i8*, i8** %28, i64 %idxprom25 + %30 = load i8*, i8** %arrayidx26, align 8 + %call27 = call double @atof(i8* %30) #14 + %conv28 = fptrunc double %call27 to float + %31 = load float*, float** %lng.addr, align 8 + store float %conv28, float* %31, align 4 + br label %if.end29 + +if.end29: ; preds = %if.else, %if.then19 + %32 = load i32, i32* %i, align 4 + %inc30 = add nsw i32 %32, 1 + store i32 %inc30, i32* %i, align 4 + br label %sw.epilog + +sw.bb31: ; preds = %if.then5 + store i32 1, i32* %retval, align 4 + br label %return + +sw.bb32: ; preds = %if.then5 + %33 = load i32*, i32** %q.addr, align 8 + store i32 1, i32* %33, align 4 + br label %sw.epilog + +sw.bb33: ; preds = %if.then5 + %34 = load i32*, i32** %t.addr, align 8 + store i32 1, i32* %34, align 4 + br label %sw.epilog + +sw.bb34: ; preds = %if.then5 + %35 = load i32, i32* %i, align 4 + %inc35 = add nsw i32 %35, 1 + store i32 %inc35, i32* %i, align 4 + %36 = load i8**, i8*** %argv.addr, align 8 + %37 = load i32, i32* %i, align 4 + %idxprom36 = sext i32 %37 to i64 + %arrayidx37 = getelementptr inbounds i8*, i8** %36, i64 %idxprom36 + %38 = load i8*, i8** %arrayidx37, align 8 + %call38 = call i32 @atoi(i8* %38) #14 + %39 = load i32*, i32** %p.addr, align 8 + store i32 %call38, i32* %39, align 4 + br label %sw.epilog + +sw.bb39: ; preds = %if.then5 + %40 = load i32, i32* %i, 
align 4 + %inc40 = add nsw i32 %40, 1 + store i32 %inc40, i32* %i, align 4 + %41 = load i8**, i8*** %argv.addr, align 8 + %42 = load i32, i32* %i, align 4 + %idxprom41 = sext i32 %42 to i64 + %arrayidx42 = getelementptr inbounds i8*, i8** %41, i64 %idxprom41 + %43 = load i8*, i8** %arrayidx42, align 8 + %call43 = call i32 @atoi(i8* %43) #14 + %44 = load i32*, i32** %d.addr, align 8 + store i32 %call43, i32* %44, align 4 + br label %sw.epilog + +sw.epilog: ; preds = %sw.bb39, %sw.bb34, %sw.bb33, %sw.bb32, %if.end29, %sw.bb, %if.then5 + br label %if.end44 + +if.end44: ; preds = %sw.epilog, %for.body + br label %for.inc + +for.inc: ; preds = %if.end44 + %45 = load i32, i32* %i, align 4 + %inc45 = add nsw i32 %45, 1 + store i32 %inc45, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %46 = load i32*, i32** %d.addr, align 8 + %47 = load i32, i32* %46, align 4 + %cmp46 = icmp sge i32 %47, 0 + br i1 %cmp46, label %land.lhs.true, label %lor.lhs.false + +land.lhs.true: ; preds = %for.end + %48 = load i32*, i32** %p.addr, align 8 + %49 = load i32, i32* %48, align 4 + %cmp47 = icmp slt i32 %49, 0 + br i1 %cmp47, label %if.then51, label %lor.lhs.false + +lor.lhs.false: ; preds = %land.lhs.true, %for.end + %50 = load i32*, i32** %p.addr, align 8 + %51 = load i32, i32* %50, align 4 + %cmp48 = icmp sge i32 %51, 0 + br i1 %cmp48, label %land.lhs.true49, label %if.end52 + +land.lhs.true49: ; preds = %lor.lhs.false + %52 = load i32*, i32** %d.addr, align 8 + %53 = load i32, i32* %52, align 4 + %cmp50 = icmp slt i32 %53, 0 + br i1 %cmp50, label %if.then51, label %if.end52 + +if.then51: ; preds = %land.lhs.true49, %land.lhs.true + store i32 1, i32* %retval, align 4 + br label %return + +if.end52: ; preds = %land.lhs.true49, %lor.lhs.false + store i32 0, i32* %retval, align 4 + br label %return + +return: ; preds = %if.end52, %if.then51, %sw.bb31, %if.then + %54 = load i32, i32* %retval, align 4 + ret i32 %54 +} + +; Function Attrs: noinline optnone uwtable 
+define dso_local void @_Z10printUsagev() #0 { +entry: + %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.11, i64 0, i64 0)) + %call1 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.12, i64 0, i64 0)) + %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([90 x i8], [90 x i8]* @.str.13, i64 0, i64 0)) + %call3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.12, i64 0, i64 0)) + %call4 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str.14, i64 0, i64 0)) + %call5 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([55 x i8], [55 x i8]* @.str.15, i64 0, i64 0)) + %call6 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.12, i64 0, i64 0)) + %call7 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([59 x i8], [59 x i8]* @.str.16, i64 0, i64 0)) + %call8 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([60 x i8], [60 x i8]* @.str.17, i64 0, i64 0)) + %call9 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([62 x i8], [62 x i8]* @.str.18, i64 0, i64 0)) + %call10 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([63 x i8], [63 x i8]* @.str.19, i64 0, i64 0)) + %call11 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.12, i64 0, i64 0)) + %call12 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.str.20, i64 0, i64 0)) + %call13 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([52 x i8], [52 x i8]* @.str.21, i64 0, i64 0)) + %call14 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([40 x i8], [40 x i8]* @.str.22, i64 0, i64 0)) + %call15 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.12, i64 0, i64 0)) + %call16 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([73 x i8], [73 x i8]* @.str.23, i64 0, i64 0)) + %call17 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([71 x i8], [71 x i8]* @.str.24, i64 0, i64 0)) + %call18 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.12, i64 0, i64 0)) + %call19 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.12, i64 0, i64 0)) + %call20 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([60 x i8], [60 x i8]* @.str.25, i64 0, i64 0)) + %call21 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([61 x i8], [61 x i8]* @.str.26, i64 0, i64 0)) + %call22 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([35 x i8], [35 x i8]* @.str.27, i64 0, i64 0)) + ret void +} + +declare dso_local i32 @printf(i8*, ...) #3 + +; Function Attrs: noinline optnone uwtable +define dso_local i32 @_Z8loadDataPcRSt6vectorI6recordSaIS1_EERS0_I7latLongSaIS5_EE(i8* %filename, %"class.std::vector"* dereferenceable(24) %records, %"class.std::vector.0"* dereferenceable(24) %locations) #0 { +entry: + %filename.addr = alloca i8*, align 8 + %records.addr = alloca %"class.std::vector"*, align 8 + %locations.addr = alloca %"class.std::vector.0"*, align 8 + %flist = alloca %struct._IO_FILE*, align 8 + %fp = alloca %struct._IO_FILE*, align 8 + %i = alloca i32, align 4 + %dbname = alloca [64 x i8], align 16 + %recNum = alloca i32, align 4 + %record = alloca %struct.record, align 4 + %latLong = alloca %struct.latLong, align 4 + %substr = alloca [6 x i8], align 1 + store i8* %filename, i8** %filename.addr, align 8 + store %"class.std::vector"* %records, %"class.std::vector"** %records.addr, align 8 + store %"class.std::vector.0"* %locations, %"class.std::vector.0"** %locations.addr, align 8 + store i32 0, i32* %i, align 4 + store i32 0, i32* %recNum, align 4 + %0 = load i8*, i8** %filename.addr, align 8 + %call = call %struct._IO_FILE* @fopen(i8* %0, i8* getelementptr inbounds ([2 x i8], [2 x 
i8]* @.str.7, i64 0, i64 0)) + store %struct._IO_FILE* %call, %struct._IO_FILE** %flist, align 8 + br label %while.cond + +while.cond: ; preds = %while.end, %entry + %1 = load %struct._IO_FILE*, %struct._IO_FILE** %flist, align 8 + %call1 = call i32 @feof(%struct._IO_FILE* %1) #12 + %tobool = icmp ne i32 %call1, 0 + %lnot = xor i1 %tobool, true + br i1 %lnot, label %while.body, label %while.end48 + +while.body: ; preds = %while.cond + %2 = load %struct._IO_FILE*, %struct._IO_FILE** %flist, align 8 + %arraydecay = getelementptr inbounds [64 x i8], [64 x i8]* %dbname, i64 0, i64 0 + %call2 = call i32 (%struct._IO_FILE*, i8*, ...) @fscanf(%struct._IO_FILE* %2, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str.8, i64 0, i64 0), i8* %arraydecay) + %cmp = icmp ne i32 %call2, 1 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %while.body + %3 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %call3 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %3, i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.9, i64 0, i64 0)) + call void @exit(i32 0) #13 + unreachable + +if.end: ; preds = %while.body + %arraydecay4 = getelementptr inbounds [64 x i8], [64 x i8]* %dbname, i64 0, i64 0 + %call5 = call %struct._IO_FILE* @fopen(i8* %arraydecay4, i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.7, i64 0, i64 0)) + store %struct._IO_FILE* %call5, %struct._IO_FILE** %fp, align 8 + %4 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8 + %tobool6 = icmp ne %struct._IO_FILE* %4, null + br i1 %tobool6, label %if.end9, label %if.then7 + +if.then7: ; preds = %if.end + %call8 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([20 x i8], [20 x i8]* @.str.10, i64 0, i64 0)) + call void @exit(i32 1) #13 + unreachable + +if.end9: ; preds = %if.end + br label %while.cond10 + +while.cond10: ; preds = %for.end41, %if.end9 + %5 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8 + %call11 = call i32 @feof(%struct._IO_FILE* %5) #12 + %tobool12 = icmp ne i32 %call11, 0 + %lnot13 = xor i1 %tobool12, true + br i1 %lnot13, label %while.body14, label %while.end + +while.body14: ; preds = %while.cond10 + %recString = getelementptr inbounds %struct.record, %struct.record* %record, i32 0, i32 0 + %arraydecay15 = getelementptr inbounds [53 x i8], [53 x i8]* %recString, i64 0, i64 0 + %6 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8 + %call16 = call i8* @fgets(i8* %arraydecay15, i32 49, %struct._IO_FILE* %6) + %7 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8 + %call17 = call i32 @fgetc(%struct._IO_FILE* %7) + %8 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8 + %call18 = call i32 @feof(%struct._IO_FILE* %8) #12 + %tobool19 = icmp ne i32 %call18, 0 + br i1 %tobool19, label %if.then20, label %if.end21 + +if.then20: ; preds = %while.body14 + br label %while.end + +if.end21: ; preds = %while.body14 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %if.end21 + %9 = load i32, i32* %i, align 4 + %cmp22 = icmp slt i32 %9, 5 + br i1 %cmp22, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %recString23 = getelementptr inbounds %struct.record, %struct.record* %record, i32 0, i32 0 + %arraydecay24 = getelementptr inbounds [53 x i8], [53 x i8]* %recString23, i64 0, i64 0 + %10 = load i32, i32* %i, align 4 + %idx.ext = sext i32 %10 to i64 + %add.ptr = getelementptr inbounds i8, i8* %arraydecay24, i64 %idx.ext + %add.ptr25 = getelementptr inbounds i8, i8* %add.ptr, i64 28 + %11 = load i8, i8* %add.ptr25, align 1 + %12 = load i32, i32* %i, align 4 + %idxprom = sext i32 %12 to i64 + %arrayidx = 
getelementptr inbounds [6 x i8], [6 x i8]* %substr, i64 0, i64 %idxprom + store i8 %11, i8* %arrayidx, align 1 + br label %for.inc + +for.inc: ; preds = %for.body + %13 = load i32, i32* %i, align 4 + %inc = add nsw i32 %13, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %arrayidx26 = getelementptr inbounds [6 x i8], [6 x i8]* %substr, i64 0, i64 5 + store i8 0, i8* %arrayidx26, align 1 + %arraydecay27 = getelementptr inbounds [6 x i8], [6 x i8]* %substr, i64 0, i64 0 + %call28 = call double @atof(i8* %arraydecay27) #14 + %conv = fptrunc double %call28 to float + %lat = getelementptr inbounds %struct.latLong, %struct.latLong* %latLong, i32 0, i32 0 + store float %conv, float* %lat, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond29 + +for.cond29: ; preds = %for.inc39, %for.end + %14 = load i32, i32* %i, align 4 + %cmp30 = icmp slt i32 %14, 5 + br i1 %cmp30, label %for.body31, label %for.end41 + +for.body31: ; preds = %for.cond29 + %recString32 = getelementptr inbounds %struct.record, %struct.record* %record, i32 0, i32 0 + %arraydecay33 = getelementptr inbounds [53 x i8], [53 x i8]* %recString32, i64 0, i64 0 + %15 = load i32, i32* %i, align 4 + %idx.ext34 = sext i32 %15 to i64 + %add.ptr35 = getelementptr inbounds i8, i8* %arraydecay33, i64 %idx.ext34 + %add.ptr36 = getelementptr inbounds i8, i8* %add.ptr35, i64 33 + %16 = load i8, i8* %add.ptr36, align 1 + %17 = load i32, i32* %i, align 4 + %idxprom37 = sext i32 %17 to i64 + %arrayidx38 = getelementptr inbounds [6 x i8], [6 x i8]* %substr, i64 0, i64 %idxprom37 + store i8 %16, i8* %arrayidx38, align 1 + br label %for.inc39 + +for.inc39: ; preds = %for.body31 + %18 = load i32, i32* %i, align 4 + %inc40 = add nsw i32 %18, 1 + store i32 %inc40, i32* %i, align 4 + br label %for.cond29 + +for.end41: ; preds = %for.cond29 + %arrayidx42 = getelementptr inbounds [6 x i8], [6 x i8]* %substr, i64 0, i64 5 + store i8 0, i8* %arrayidx42, align 1 + %arraydecay43 = 
getelementptr inbounds [6 x i8], [6 x i8]* %substr, i64 0, i64 0 + %call44 = call double @atof(i8* %arraydecay43) #14 + %conv45 = fptrunc double %call44 to float + %lng = getelementptr inbounds %struct.latLong, %struct.latLong* %latLong, i32 0, i32 1 + store float %conv45, float* %lng, align 4 + %19 = load %"class.std::vector.0"*, %"class.std::vector.0"** %locations.addr, align 8 + call void @_ZNSt6vectorI7latLongSaIS0_EE9push_backERKS0_(%"class.std::vector.0"* %19, %struct.latLong* dereferenceable(8) %latLong) + %20 = load %"class.std::vector"*, %"class.std::vector"** %records.addr, align 8 + call void @_ZNSt6vectorI6recordSaIS0_EE9push_backERKS0_(%"class.std::vector"* %20, %struct.record* dereferenceable(60) %record) + %21 = load i32, i32* %recNum, align 4 + %inc46 = add nsw i32 %21, 1 + store i32 %inc46, i32* %recNum, align 4 + br label %while.cond10 + +while.end: ; preds = %if.then20, %while.cond10 + %22 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8 + %call47 = call i32 @fclose(%struct._IO_FILE* %22) + br label %while.cond + +while.end48: ; preds = %while.cond + %23 = load %struct._IO_FILE*, %struct._IO_FILE** %flist, align 8 + %call49 = call i32 @fclose(%struct._IO_FILE* %23) + %24 = load i32, i32* %recNum, align 4 + ret i32 %24 +} + +declare dso_local i32 @cudaGetDeviceProperties(%struct.cudaDeviceProp*, i32) #3 + +declare dso_local i32 @cudaDeviceSynchronize() #3 + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN4dim3C2Ejjj(%struct.dim3* %this, i32 %vx, i32 %vy, i32 %vz) unnamed_addr #4 comdat align 2 { +entry: + %this.addr = alloca %struct.dim3*, align 8 + %vx.addr = alloca i32, align 4 + %vy.addr = alloca i32, align 4 + %vz.addr = alloca i32, align 4 + store %struct.dim3* %this, %struct.dim3** %this.addr, align 8 + store i32 %vx, i32* %vx.addr, align 4 + store i32 %vy, i32* %vy.addr, align 4 + store i32 %vz, i32* %vz.addr, align 4 + %this1 = load %struct.dim3*, %struct.dim3** %this.addr, align 8 + %x 
= getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 0 + %0 = load i32, i32* %vx.addr, align 4 + store i32 %0, i32* %x, align 4 + %y = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 1 + %1 = load i32, i32* %vy.addr, align 4 + store i32 %1, i32* %y, align 4 + %z = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 2 + %2 = load i32, i32* %vz.addr, align 4 + store i32 %2, i32* %z, align 4 + ret void +} + +; Function Attrs: nounwind +declare dso_local noalias i8* @malloc(i64) #5 + +declare dso_local i32 @cudaMalloc(i8**, i64) #3 + +declare dso_local i32 @cudaMemcpy(i8*, i8*, i64, i32) #3 + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local dereferenceable(8) %struct.latLong* @_ZNSt6vectorI7latLongSaIS0_EEixEm(%"class.std::vector.0"* %this, i64 %__n) #4 comdat align 2 { +entry: + %this.addr = alloca %"class.std::vector.0"*, align 8 + %__n.addr = alloca i64, align 8 + store %"class.std::vector.0"* %this, %"class.std::vector.0"** %this.addr, align 8 + store i64 %__n, i64* %__n.addr, align 8 + %this1 = load %"class.std::vector.0"*, %"class.std::vector.0"** %this.addr, align 8 + %0 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Vector_base.1"* + %_M_impl = getelementptr inbounds %"struct.std::_Vector_base.1", %"struct.std::_Vector_base.1"* %0, i32 0, i32 0 + %_M_start = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl, i32 0, i32 0 + %1 = load %struct.latLong*, %struct.latLong** %_M_start, align 8 + %2 = load i64, i64* %__n.addr, align 8 + %add.ptr = getelementptr inbounds %struct.latLong, %struct.latLong* %1, i64 %2 + ret %struct.latLong* %add.ptr +} + +declare dso_local i32 @__cudaPushCallConfiguration(i64, i32, i64, i32, i64, i8*) #3 + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z10findLowestRSt6vectorI6recordSaIS0_EEPfii(%"class.std::vector"* dereferenceable(24) 
%records, float* %distances, i32 %numRecords, i32 %topN) #0 { +entry: + %records.addr = alloca %"class.std::vector"*, align 8 + %distances.addr = alloca float*, align 8 + %numRecords.addr = alloca i32, align 4 + %topN.addr = alloca i32, align 4 + %i = alloca i32, align 4 + %j = alloca i32, align 4 + %val = alloca float, align 4 + %minLoc = alloca i32, align 4 + %tempRec = alloca %struct.record*, align 8 + %tempDist = alloca float, align 4 + store %"class.std::vector"* %records, %"class.std::vector"** %records.addr, align 8 + store float* %distances, float** %distances.addr, align 8 + store i32 %numRecords, i32* %numRecords.addr, align 4 + store i32 %topN, i32* %topN.addr, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc25, %entry + %0 = load i32, i32* %i, align 4 + %1 = load i32, i32* %topN.addr, align 4 + %cmp = icmp slt i32 %0, %1 + br i1 %cmp, label %for.body, label %for.end27 + +for.body: ; preds = %for.cond + %2 = load i32, i32* %i, align 4 + store i32 %2, i32* %minLoc, align 4 + %3 = load i32, i32* %i, align 4 + store i32 %3, i32* %j, align 4 + br label %for.cond1 + +for.cond1: ; preds = %for.inc, %for.body + %4 = load i32, i32* %j, align 4 + %5 = load i32, i32* %numRecords.addr, align 4 + %cmp2 = icmp slt i32 %4, %5 + br i1 %cmp2, label %for.body3, label %for.end + +for.body3: ; preds = %for.cond1 + %6 = load float*, float** %distances.addr, align 8 + %7 = load i32, i32* %j, align 4 + %idxprom = sext i32 %7 to i64 + %arrayidx = getelementptr inbounds float, float* %6, i64 %idxprom + %8 = load float, float* %arrayidx, align 4 + store float %8, float* %val, align 4 + %9 = load float, float* %val, align 4 + %10 = load float*, float** %distances.addr, align 8 + %11 = load i32, i32* %minLoc, align 4 + %idxprom4 = sext i32 %11 to i64 + %arrayidx5 = getelementptr inbounds float, float* %10, i64 %idxprom4 + %12 = load float, float* %arrayidx5, align 4 + %cmp6 = fcmp olt float %9, %12 + br i1 %cmp6, label %if.then, label 
%if.end + +if.then: ; preds = %for.body3 + %13 = load i32, i32* %j, align 4 + store i32 %13, i32* %minLoc, align 4 + br label %if.end + +if.end: ; preds = %if.then, %for.body3 + br label %for.inc + +for.inc: ; preds = %if.end + %14 = load i32, i32* %j, align 4 + %inc = add nsw i32 %14, 1 + store i32 %inc, i32* %j, align 4 + br label %for.cond1 + +for.end: ; preds = %for.cond1 + %15 = load %"class.std::vector"*, %"class.std::vector"** %records.addr, align 8 + %16 = load i32, i32* %i, align 4 + %conv = sext i32 %16 to i64 + %call = call dereferenceable(60) %struct.record* @_ZNSt6vectorI6recordSaIS0_EEixEm(%"class.std::vector"* %15, i64 %conv) + store %struct.record* %call, %struct.record** %tempRec, align 8 + %17 = load %"class.std::vector"*, %"class.std::vector"** %records.addr, align 8 + %18 = load i32, i32* %minLoc, align 4 + %conv7 = sext i32 %18 to i64 + %call8 = call dereferenceable(60) %struct.record* @_ZNSt6vectorI6recordSaIS0_EEixEm(%"class.std::vector"* %17, i64 %conv7) + %19 = load %"class.std::vector"*, %"class.std::vector"** %records.addr, align 8 + %20 = load i32, i32* %i, align 4 + %conv9 = sext i32 %20 to i64 + %call10 = call dereferenceable(60) %struct.record* @_ZNSt6vectorI6recordSaIS0_EEixEm(%"class.std::vector"* %19, i64 %conv9) + %21 = bitcast %struct.record* %call10 to i8* + %22 = bitcast %struct.record* %call8 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %21, i8* align 4 %22, i64 60, i1 false) + %23 = load %struct.record*, %struct.record** %tempRec, align 8 + %24 = load %"class.std::vector"*, %"class.std::vector"** %records.addr, align 8 + %25 = load i32, i32* %minLoc, align 4 + %conv11 = sext i32 %25 to i64 + %call12 = call dereferenceable(60) %struct.record* @_ZNSt6vectorI6recordSaIS0_EEixEm(%"class.std::vector"* %24, i64 %conv11) + %26 = bitcast %struct.record* %call12 to i8* + %27 = bitcast %struct.record* %23 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %26, i8* align 4 %27, i64 60, i1 false) + %28 = load float*, 
float** %distances.addr, align 8 + %29 = load i32, i32* %i, align 4 + %idxprom13 = sext i32 %29 to i64 + %arrayidx14 = getelementptr inbounds float, float* %28, i64 %idxprom13 + %30 = load float, float* %arrayidx14, align 4 + store float %30, float* %tempDist, align 4 + %31 = load float*, float** %distances.addr, align 8 + %32 = load i32, i32* %minLoc, align 4 + %idxprom15 = sext i32 %32 to i64 + %arrayidx16 = getelementptr inbounds float, float* %31, i64 %idxprom15 + %33 = load float, float* %arrayidx16, align 4 + %34 = load float*, float** %distances.addr, align 8 + %35 = load i32, i32* %i, align 4 + %idxprom17 = sext i32 %35 to i64 + %arrayidx18 = getelementptr inbounds float, float* %34, i64 %idxprom17 + store float %33, float* %arrayidx18, align 4 + %36 = load float, float* %tempDist, align 4 + %37 = load float*, float** %distances.addr, align 8 + %38 = load i32, i32* %minLoc, align 4 + %idxprom19 = sext i32 %38 to i64 + %arrayidx20 = getelementptr inbounds float, float* %37, i64 %idxprom19 + store float %36, float* %arrayidx20, align 4 + %39 = load float*, float** %distances.addr, align 8 + %40 = load i32, i32* %i, align 4 + %idxprom21 = sext i32 %40 to i64 + %arrayidx22 = getelementptr inbounds float, float* %39, i64 %idxprom21 + %41 = load float, float* %arrayidx22, align 4 + %42 = load %"class.std::vector"*, %"class.std::vector"** %records.addr, align 8 + %43 = load i32, i32* %i, align 4 + %conv23 = sext i32 %43 to i64 + %call24 = call dereferenceable(60) %struct.record* @_ZNSt6vectorI6recordSaIS0_EEixEm(%"class.std::vector"* %42, i64 %conv23) + %distance = getelementptr inbounds %struct.record, %struct.record* %call24, i32 0, i32 1 + store float %41, float* %distance, align 4 + br label %for.inc25 + +for.inc25: ; preds = %for.end + %44 = load i32, i32* %i, align 4 + %inc26 = add nsw i32 %44, 1 + store i32 %inc26, i32* %i, align 4 + br label %for.cond + +for.end27: ; preds = %for.cond + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable 
+define linkonce_odr dso_local dereferenceable(60) %struct.record* @_ZNSt6vectorI6recordSaIS0_EEixEm(%"class.std::vector"* %this, i64 %__n) #4 comdat align 2 { +entry: + %this.addr = alloca %"class.std::vector"*, align 8 + %__n.addr = alloca i64, align 8 + store %"class.std::vector"* %this, %"class.std::vector"** %this.addr, align 8 + store i64 %__n, i64* %__n.addr, align 8 + %this1 = load %"class.std::vector"*, %"class.std::vector"** %this.addr, align 8 + %0 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %0, i32 0, i32 0 + %_M_start = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl, i32 0, i32 0 + %1 = load %struct.record*, %struct.record** %_M_start, align 8 + %2 = load i64, i64* %__n.addr, align 8 + %add.ptr = getelementptr inbounds %struct.record, %struct.record* %1, i64 %2 + ret %struct.record* %add.ptr +} + +; Function Attrs: nounwind +declare dso_local void @free(i8*) #5 + +declare dso_local i32 @cudaFree(i8*) #3 + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt6vectorI7latLongSaIS0_EED2Ev(%"class.std::vector.0"* %this) unnamed_addr #0 comdat align 2 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %this.addr = alloca %"class.std::vector.0"*, align 8 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + store %"class.std::vector.0"* %this, %"class.std::vector.0"** %this.addr, align 8 + %this1 = load %"class.std::vector.0"*, %"class.std::vector.0"** %this.addr, align 8 + %0 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Vector_base.1"* + %_M_impl = getelementptr inbounds %"struct.std::_Vector_base.1", %"struct.std::_Vector_base.1"* %0, i32 0, i32 0 + %_M_start = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl, i32 
0, i32 0 + %1 = load %struct.latLong*, %struct.latLong** %_M_start, align 8 + %2 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Vector_base.1"* + %_M_impl2 = getelementptr inbounds %"struct.std::_Vector_base.1", %"struct.std::_Vector_base.1"* %2, i32 0, i32 0 + %_M_finish = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl2, i32 0, i32 1 + %3 = load %struct.latLong*, %struct.latLong** %_M_finish, align 8 + %4 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Vector_base.1"* + %call = invoke dereferenceable(1) %"class.std::allocator.2"* @_ZNSt12_Vector_baseI7latLongSaIS0_EE19_M_get_Tp_allocatorEv(%"struct.std::_Vector_base.1"* %4) + to label %invoke.cont unwind label %lpad + +invoke.cont: ; preds = %entry + invoke void @_ZSt8_DestroyIP7latLongS0_EvT_S2_RSaIT0_E(%struct.latLong* %1, %struct.latLong* %3, %"class.std::allocator.2"* dereferenceable(1) %call) + to label %invoke.cont3 unwind label %lpad + +invoke.cont3: ; preds = %invoke.cont + %5 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Vector_base.1"* + call void @_ZNSt12_Vector_baseI7latLongSaIS0_EED2Ev(%"struct.std::_Vector_base.1"* %5) + ret void + +lpad: ; preds = %invoke.cont, %entry + %6 = landingpad { i8*, i32 } + cleanup + %7 = extractvalue { i8*, i32 } %6, 0 + store i8* %7, i8** %exn.slot, align 8 + %8 = extractvalue { i8*, i32 } %6, 1 + store i32 %8, i32* %ehselector.slot, align 4 + %9 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Vector_base.1"* + invoke void @_ZNSt12_Vector_baseI7latLongSaIS0_EED2Ev(%"struct.std::_Vector_base.1"* %9) + to label %invoke.cont4 unwind label %terminate.lpad + +invoke.cont4: ; preds = %lpad + br label %eh.resume + +eh.resume: ; preds = %invoke.cont4 + %exn = load i8*, i8** %exn.slot, align 8 + %sel = load i32, i32* %ehselector.slot, align 4 + %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn, 0 + %lpad.val5 = insertvalue { i8*, i32 } %lpad.val, i32 %sel, 
1 + resume { i8*, i32 } %lpad.val5 + +terminate.lpad: ; preds = %lpad + %10 = landingpad { i8*, i32 } + catch i8* null + %11 = extractvalue { i8*, i32 } %10, 0 + call void @__clang_call_terminate(i8* %11) #13 + unreachable +} + +; Function Attrs: noinline noreturn nounwind +define linkonce_odr hidden void @__clang_call_terminate(i8* %0) #6 comdat { + %2 = call i8* @__cxa_begin_catch(i8* %0) #12 + call void @_ZSt9terminatev() #13 + unreachable +} + +declare dso_local i8* @__cxa_begin_catch(i8*) + +declare dso_local void @_ZSt9terminatev() + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt6vectorI6recordSaIS0_EED2Ev(%"class.std::vector"* %this) unnamed_addr #0 comdat align 2 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %this.addr = alloca %"class.std::vector"*, align 8 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + store %"class.std::vector"* %this, %"class.std::vector"** %this.addr, align 8 + %this1 = load %"class.std::vector"*, %"class.std::vector"** %this.addr, align 8 + %0 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %0, i32 0, i32 0 + %_M_start = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl, i32 0, i32 0 + %1 = load %struct.record*, %struct.record** %_M_start, align 8 + %2 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl2 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %2, i32 0, i32 0 + %_M_finish = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl2, i32 0, i32 1 + %3 = load %struct.record*, %struct.record** %_M_finish, align 8 + %4 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %call = invoke dereferenceable(1) 
%"class.std::allocator"* @_ZNSt12_Vector_baseI6recordSaIS0_EE19_M_get_Tp_allocatorEv(%"struct.std::_Vector_base"* %4) + to label %invoke.cont unwind label %lpad + +invoke.cont: ; preds = %entry + invoke void @_ZSt8_DestroyIP6recordS0_EvT_S2_RSaIT0_E(%struct.record* %1, %struct.record* %3, %"class.std::allocator"* dereferenceable(1) %call) + to label %invoke.cont3 unwind label %lpad + +invoke.cont3: ; preds = %invoke.cont + %5 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + call void @_ZNSt12_Vector_baseI6recordSaIS0_EED2Ev(%"struct.std::_Vector_base"* %5) + ret void + +lpad: ; preds = %invoke.cont, %entry + %6 = landingpad { i8*, i32 } + cleanup + %7 = extractvalue { i8*, i32 } %6, 0 + store i8* %7, i8** %exn.slot, align 8 + %8 = extractvalue { i8*, i32 } %6, 1 + store i32 %8, i32* %ehselector.slot, align 4 + %9 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + invoke void @_ZNSt12_Vector_baseI6recordSaIS0_EED2Ev(%"struct.std::_Vector_base"* %9) + to label %invoke.cont4 unwind label %terminate.lpad + +invoke.cont4: ; preds = %lpad + br label %eh.resume + +eh.resume: ; preds = %invoke.cont4 + %exn = load i8*, i8** %exn.slot, align 8 + %sel = load i32, i32* %ehselector.slot, align 4 + %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn, 0 + %lpad.val5 = insertvalue { i8*, i32 } %lpad.val, i32 %sel, 1 + resume { i8*, i32 } %lpad.val5 + +terminate.lpad: ; preds = %lpad + %10 = landingpad { i8*, i32 } + catch i8* null + %11 = extractvalue { i8*, i32 } %10, 0 + call void @__clang_call_terminate(i8* %11) #13 + unreachable +} + +declare dso_local %struct._IO_FILE* @fopen(i8*, i8*) #3 + +; Function Attrs: nounwind +declare dso_local i32 @feof(%struct._IO_FILE*) #5 + +declare dso_local i32 @fscanf(%struct._IO_FILE*, i8*, ...) #3 + +declare dso_local i32 @fprintf(%struct._IO_FILE*, i8*, ...) 
#3 + +; Function Attrs: noreturn nounwind +declare dso_local void @exit(i32) #7 + +declare dso_local i8* @fgets(i8*, i32, %struct._IO_FILE*) #3 + +declare dso_local i32 @fgetc(%struct._IO_FILE*) #3 + +; Function Attrs: nounwind readonly +declare dso_local double @atof(i8*) #8 + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt6vectorI7latLongSaIS0_EE9push_backERKS0_(%"class.std::vector.0"* %this, %struct.latLong* dereferenceable(8) %__x) #0 comdat align 2 { +entry: + %this.addr = alloca %"class.std::vector.0"*, align 8 + %__x.addr = alloca %struct.latLong*, align 8 + %agg.tmp = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + store %"class.std::vector.0"* %this, %"class.std::vector.0"** %this.addr, align 8 + store %struct.latLong* %__x, %struct.latLong** %__x.addr, align 8 + %this1 = load %"class.std::vector.0"*, %"class.std::vector.0"** %this.addr, align 8 + %0 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Vector_base.1"* + %_M_impl = getelementptr inbounds %"struct.std::_Vector_base.1", %"struct.std::_Vector_base.1"* %0, i32 0, i32 0 + %_M_finish = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl, i32 0, i32 1 + %1 = load %struct.latLong*, %struct.latLong** %_M_finish, align 8 + %2 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Vector_base.1"* + %_M_impl2 = getelementptr inbounds %"struct.std::_Vector_base.1", %"struct.std::_Vector_base.1"* %2, i32 0, i32 0 + %_M_end_of_storage = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl2, i32 0, i32 2 + %3 = load %struct.latLong*, %struct.latLong** %_M_end_of_storage, align 8 + %cmp = icmp ne %struct.latLong* %1, %3 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %4 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Vector_base.1"* + %_M_impl3 = getelementptr inbounds 
%"struct.std::_Vector_base.1", %"struct.std::_Vector_base.1"* %4, i32 0, i32 0 + %5 = bitcast %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl3 to %"class.std::allocator.2"* + %6 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Vector_base.1"* + %_M_impl4 = getelementptr inbounds %"struct.std::_Vector_base.1", %"struct.std::_Vector_base.1"* %6, i32 0, i32 0 + %_M_finish5 = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl4, i32 0, i32 1 + %7 = load %struct.latLong*, %struct.latLong** %_M_finish5, align 8 + %8 = load %struct.latLong*, %struct.latLong** %__x.addr, align 8 + call void @_ZN9__gnu_cxx14__alloc_traitsISaI7latLongEE9constructIS1_EEvRS2_PS1_RKT_(%"class.std::allocator.2"* dereferenceable(1) %5, %struct.latLong* %7, %struct.latLong* dereferenceable(8) %8) + %9 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Vector_base.1"* + %_M_impl6 = getelementptr inbounds %"struct.std::_Vector_base.1", %"struct.std::_Vector_base.1"* %9, i32 0, i32 0 + %_M_finish7 = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl6, i32 0, i32 1 + %10 = load %struct.latLong*, %struct.latLong** %_M_finish7, align 8 + %incdec.ptr = getelementptr inbounds %struct.latLong, %struct.latLong* %10, i32 1 + store %struct.latLong* %incdec.ptr, %struct.latLong** %_M_finish7, align 8 + br label %if.end + +if.else: ; preds = %entry + %call = call %struct.latLong* @_ZNSt6vectorI7latLongSaIS0_EE3endEv(%"class.std::vector.0"* %this1) + %coerce.dive = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %agg.tmp, i32 0, i32 0 + store %struct.latLong* %call, %struct.latLong** %coerce.dive, align 8 + %11 = load %struct.latLong*, %struct.latLong** %__x.addr, align 8 + %coerce.dive8 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %agg.tmp, 
i32 0, i32 0 + %12 = load %struct.latLong*, %struct.latLong** %coerce.dive8, align 8 + call void @_ZNSt6vectorI7latLongSaIS0_EE17_M_realloc_insertEN9__gnu_cxx17__normal_iteratorIPS0_S2_EERKS0_(%"class.std::vector.0"* %this1, %struct.latLong* %12, %struct.latLong* dereferenceable(8) %11) + br label %if.end + +if.end: ; preds = %if.else, %if.then + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt6vectorI6recordSaIS0_EE9push_backERKS0_(%"class.std::vector"* %this, %struct.record* dereferenceable(60) %__x) #0 comdat align 2 { +entry: + %this.addr = alloca %"class.std::vector"*, align 8 + %__x.addr = alloca %struct.record*, align 8 + %agg.tmp = alloca %"class.__gnu_cxx::__normal_iterator.5", align 8 + store %"class.std::vector"* %this, %"class.std::vector"** %this.addr, align 8 + store %struct.record* %__x, %struct.record** %__x.addr, align 8 + %this1 = load %"class.std::vector"*, %"class.std::vector"** %this.addr, align 8 + %0 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %0, i32 0, i32 0 + %_M_finish = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl, i32 0, i32 1 + %1 = load %struct.record*, %struct.record** %_M_finish, align 8 + %2 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl2 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %2, i32 0, i32 0 + %_M_end_of_storage = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl2, i32 0, i32 2 + %3 = load %struct.record*, %struct.record** %_M_end_of_storage, align 8 + %cmp = icmp ne %struct.record* %1, %3 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %4 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* 
+ %_M_impl3 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %4, i32 0, i32 0 + %5 = bitcast %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl3 to %"class.std::allocator"* + %6 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl4 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %6, i32 0, i32 0 + %_M_finish5 = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl4, i32 0, i32 1 + %7 = load %struct.record*, %struct.record** %_M_finish5, align 8 + %8 = load %struct.record*, %struct.record** %__x.addr, align 8 + call void @_ZN9__gnu_cxx14__alloc_traitsISaI6recordEE9constructIS1_EEvRS2_PS1_RKT_(%"class.std::allocator"* dereferenceable(1) %5, %struct.record* %7, %struct.record* dereferenceable(60) %8) + %9 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl6 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %9, i32 0, i32 0 + %_M_finish7 = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl6, i32 0, i32 1 + %10 = load %struct.record*, %struct.record** %_M_finish7, align 8 + %incdec.ptr = getelementptr inbounds %struct.record, %struct.record* %10, i32 1 + store %struct.record* %incdec.ptr, %struct.record** %_M_finish7, align 8 + br label %if.end + +if.else: ; preds = %entry + %call = call %struct.record* @_ZNSt6vectorI6recordSaIS0_EE3endEv(%"class.std::vector"* %this1) + %coerce.dive = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.5", %"class.__gnu_cxx::__normal_iterator.5"* %agg.tmp, i32 0, i32 0 + store %struct.record* %call, %struct.record** %coerce.dive, align 8 + %11 = load %struct.record*, %struct.record** %__x.addr, align 8 + %coerce.dive8 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.5", %"class.__gnu_cxx::__normal_iterator.5"* 
%agg.tmp, i32 0, i32 0 + %12 = load %struct.record*, %struct.record** %coerce.dive8, align 8 + call void @_ZNSt6vectorI6recordSaIS0_EE17_M_realloc_insertEN9__gnu_cxx17__normal_iteratorIPS0_S2_EERKS0_(%"class.std::vector"* %this1, %struct.record* %12, %struct.record* dereferenceable(60) %11) + br label %if.end + +if.end: ; preds = %if.else, %if.then + ret void +} + +declare dso_local i32 @fclose(%struct._IO_FILE*) #3 + +declare dso_local i8* @strncpy(i8*, i8*, i64) #3 + +; Function Attrs: nounwind readonly +declare dso_local i32 @atoi(i8*) #8 + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt12_Vector_baseI6recordSaIS0_EEC2Ev(%"struct.std::_Vector_base"* %this) unnamed_addr #0 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Vector_base"*, align 8 + store %"struct.std::_Vector_base"* %this, %"struct.std::_Vector_base"** %this.addr, align 8 + %this1 = load %"struct.std::_Vector_base"*, %"struct.std::_Vector_base"** %this.addr, align 8 + %_M_impl = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %this1, i32 0, i32 0 + call void @_ZNSt12_Vector_baseI6recordSaIS0_EE12_Vector_implC2Ev(%"struct.std::_Vector_base >::_Vector_impl"* %_M_impl) + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZNSt12_Vector_baseI6recordSaIS0_EE12_Vector_implC2Ev(%"struct.std::_Vector_base >::_Vector_impl"* %this) unnamed_addr #4 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Vector_base >::_Vector_impl"*, align 8 + store %"struct.std::_Vector_base >::_Vector_impl"* %this, %"struct.std::_Vector_base >::_Vector_impl"** %this.addr, align 8 + %this1 = load %"struct.std::_Vector_base >::_Vector_impl"*, %"struct.std::_Vector_base >::_Vector_impl"** %this.addr, align 8 + %0 = bitcast %"struct.std::_Vector_base >::_Vector_impl"* %this1 to %"class.std::allocator"* + call void @_ZNSaI6recordEC2Ev(%"class.std::allocator"* %0) #12 + %_M_start = 
getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %this1, i32 0, i32 0 + store %struct.record* null, %struct.record** %_M_start, align 8 + %_M_finish = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %this1, i32 0, i32 1 + store %struct.record* null, %struct.record** %_M_finish, align 8 + %_M_end_of_storage = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %this1, i32 0, i32 2 + store %struct.record* null, %struct.record** %_M_end_of_storage, align 8 + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZNSaI6recordEC2Ev(%"class.std::allocator"* %this) unnamed_addr #4 comdat align 2 { +entry: + %this.addr = alloca %"class.std::allocator"*, align 8 + store %"class.std::allocator"* %this, %"class.std::allocator"** %this.addr, align 8 + %this1 = load %"class.std::allocator"*, %"class.std::allocator"** %this.addr, align 8 + %0 = bitcast %"class.std::allocator"* %this1 to %"class.__gnu_cxx::new_allocator"* + call void @_ZN9__gnu_cxx13new_allocatorI6recordEC2Ev(%"class.__gnu_cxx::new_allocator"* %0) #12 + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx13new_allocatorI6recordEC2Ev(%"class.__gnu_cxx::new_allocator"* %this) unnamed_addr #4 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::new_allocator"*, align 8 + store %"class.__gnu_cxx::new_allocator"* %this, %"class.__gnu_cxx::new_allocator"** %this.addr, align 8 + %this1 = load %"class.__gnu_cxx::new_allocator"*, %"class.__gnu_cxx::new_allocator"** %this.addr, align 8 + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZSt8_DestroyIP6recordS0_EvT_S2_RSaIT0_E(%struct.record* %__first, %struct.record* %__last, %"class.std::allocator"* 
dereferenceable(1) %0) #0 comdat { +entry: + %__first.addr = alloca %struct.record*, align 8 + %__last.addr = alloca %struct.record*, align 8 + %.addr = alloca %"class.std::allocator"*, align 8 + store %struct.record* %__first, %struct.record** %__first.addr, align 8 + store %struct.record* %__last, %struct.record** %__last.addr, align 8 + store %"class.std::allocator"* %0, %"class.std::allocator"** %.addr, align 8 + %1 = load %struct.record*, %struct.record** %__first.addr, align 8 + %2 = load %struct.record*, %struct.record** %__last.addr, align 8 + call void @_ZSt8_DestroyIP6recordEvT_S2_(%struct.record* %1, %struct.record* %2) + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local dereferenceable(1) %"class.std::allocator"* @_ZNSt12_Vector_baseI6recordSaIS0_EE19_M_get_Tp_allocatorEv(%"struct.std::_Vector_base"* %this) #4 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Vector_base"*, align 8 + store %"struct.std::_Vector_base"* %this, %"struct.std::_Vector_base"** %this.addr, align 8 + %this1 = load %"struct.std::_Vector_base"*, %"struct.std::_Vector_base"** %this.addr, align 8 + %_M_impl = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %this1, i32 0, i32 0 + %0 = bitcast %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl to %"class.std::allocator"* + ret %"class.std::allocator"* %0 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt12_Vector_baseI6recordSaIS0_EED2Ev(%"struct.std::_Vector_base"* %this) unnamed_addr #0 comdat align 2 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %this.addr = alloca %"struct.std::_Vector_base"*, align 8 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + store %"struct.std::_Vector_base"* %this, %"struct.std::_Vector_base"** %this.addr, align 8 + %this1 = load %"struct.std::_Vector_base"*, %"struct.std::_Vector_base"** %this.addr, align 8 + %_M_impl = 
getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %this1, i32 0, i32 0 + %_M_start = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl, i32 0, i32 0 + %0 = load %struct.record*, %struct.record** %_M_start, align 8 + %_M_impl2 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %this1, i32 0, i32 0 + %_M_end_of_storage = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl2, i32 0, i32 2 + %1 = load %struct.record*, %struct.record** %_M_end_of_storage, align 8 + %_M_impl3 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %this1, i32 0, i32 0 + %_M_start4 = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl3, i32 0, i32 0 + %2 = load %struct.record*, %struct.record** %_M_start4, align 8 + %sub.ptr.lhs.cast = ptrtoint %struct.record* %1 to i64 + %sub.ptr.rhs.cast = ptrtoint %struct.record* %2 to i64 + %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast + %sub.ptr.div = sdiv exact i64 %sub.ptr.sub, 60 + invoke void @_ZNSt12_Vector_baseI6recordSaIS0_EE13_M_deallocateEPS0_m(%"struct.std::_Vector_base"* %this1, %struct.record* %0, i64 %sub.ptr.div) + to label %invoke.cont unwind label %lpad + +invoke.cont: ; preds = %entry + %_M_impl5 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %this1, i32 0, i32 0 + call void @_ZNSt12_Vector_baseI6recordSaIS0_EE12_Vector_implD2Ev(%"struct.std::_Vector_base >::_Vector_impl"* %_M_impl5) #12 + ret void + +lpad: ; preds = %entry + %3 = landingpad { i8*, i32 } + cleanup + %4 = extractvalue { i8*, i32 } %3, 0 + store i8* %4, i8** %exn.slot, align 8 + %5 = extractvalue { i8*, i32 } %3, 1 + store i32 %5, i32* %ehselector.slot, align 4 + %_M_impl6 = getelementptr inbounds 
%"struct.std::_Vector_base", %"struct.std::_Vector_base"* %this1, i32 0, i32 0 + call void @_ZNSt12_Vector_baseI6recordSaIS0_EE12_Vector_implD2Ev(%"struct.std::_Vector_base >::_Vector_impl"* %_M_impl6) #12 + br label %eh.resume + +eh.resume: ; preds = %lpad + %exn = load i8*, i8** %exn.slot, align 8 + %sel = load i32, i32* %ehselector.slot, align 4 + %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn, 0 + %lpad.val7 = insertvalue { i8*, i32 } %lpad.val, i32 %sel, 1 + resume { i8*, i32 } %lpad.val7 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZSt8_DestroyIP6recordEvT_S2_(%struct.record* %__first, %struct.record* %__last) #0 comdat { +entry: + %__first.addr = alloca %struct.record*, align 8 + %__last.addr = alloca %struct.record*, align 8 + store %struct.record* %__first, %struct.record** %__first.addr, align 8 + store %struct.record* %__last, %struct.record** %__last.addr, align 8 + %0 = load %struct.record*, %struct.record** %__first.addr, align 8 + %1 = load %struct.record*, %struct.record** %__last.addr, align 8 + call void @_ZNSt12_Destroy_auxILb1EE9__destroyIP6recordEEvT_S4_(%struct.record* %0, %struct.record* %1) + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZNSt12_Destroy_auxILb1EE9__destroyIP6recordEEvT_S4_(%struct.record* %0, %struct.record* %1) #4 comdat align 2 { +entry: + %.addr = alloca %struct.record*, align 8 + %.addr1 = alloca %struct.record*, align 8 + store %struct.record* %0, %struct.record** %.addr, align 8 + store %struct.record* %1, %struct.record** %.addr1, align 8 + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt12_Vector_baseI6recordSaIS0_EE13_M_deallocateEPS0_m(%"struct.std::_Vector_base"* %this, %struct.record* %__p, i64 %__n) #0 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Vector_base"*, align 8 + %__p.addr = alloca %struct.record*, align 8 + %__n.addr = alloca i64, 
align 8 + store %"struct.std::_Vector_base"* %this, %"struct.std::_Vector_base"** %this.addr, align 8 + store %struct.record* %__p, %struct.record** %__p.addr, align 8 + store i64 %__n, i64* %__n.addr, align 8 + %this1 = load %"struct.std::_Vector_base"*, %"struct.std::_Vector_base"** %this.addr, align 8 + %0 = load %struct.record*, %struct.record** %__p.addr, align 8 + %tobool = icmp ne %struct.record* %0, null + br i1 %tobool, label %if.then, label %if.end + +if.then: ; preds = %entry + %_M_impl = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %this1, i32 0, i32 0 + %1 = bitcast %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl to %"class.std::allocator"* + %2 = load %struct.record*, %struct.record** %__p.addr, align 8 + %3 = load i64, i64* %__n.addr, align 8 + call void @_ZN9__gnu_cxx14__alloc_traitsISaI6recordEE10deallocateERS2_PS1_m(%"class.std::allocator"* dereferenceable(1) %1, %struct.record* %2, i64 %3) + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZNSt12_Vector_baseI6recordSaIS0_EE12_Vector_implD2Ev(%"struct.std::_Vector_base >::_Vector_impl"* %this) unnamed_addr #4 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Vector_base >::_Vector_impl"*, align 8 + store %"struct.std::_Vector_base >::_Vector_impl"* %this, %"struct.std::_Vector_base >::_Vector_impl"** %this.addr, align 8 + %this1 = load %"struct.std::_Vector_base >::_Vector_impl"*, %"struct.std::_Vector_base >::_Vector_impl"** %this.addr, align 8 + %0 = bitcast %"struct.std::_Vector_base >::_Vector_impl"* %this1 to %"class.std::allocator"* + call void @_ZNSaI6recordED2Ev(%"class.std::allocator"* %0) #12 + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx14__alloc_traitsISaI6recordEE10deallocateERS2_PS1_m(%"class.std::allocator"* dereferenceable(1) %__a, %struct.record* %__p, 
i64 %__n) #0 comdat align 2 { +entry: + %__a.addr = alloca %"class.std::allocator"*, align 8 + %__p.addr = alloca %struct.record*, align 8 + %__n.addr = alloca i64, align 8 + store %"class.std::allocator"* %__a, %"class.std::allocator"** %__a.addr, align 8 + store %struct.record* %__p, %struct.record** %__p.addr, align 8 + store i64 %__n, i64* %__n.addr, align 8 + %0 = load %"class.std::allocator"*, %"class.std::allocator"** %__a.addr, align 8 + %1 = bitcast %"class.std::allocator"* %0 to %"class.__gnu_cxx::new_allocator"* + %2 = load %struct.record*, %struct.record** %__p.addr, align 8 + %3 = load i64, i64* %__n.addr, align 8 + call void @_ZN9__gnu_cxx13new_allocatorI6recordE10deallocateEPS1_m(%"class.__gnu_cxx::new_allocator"* %1, %struct.record* %2, i64 %3) + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx13new_allocatorI6recordE10deallocateEPS1_m(%"class.__gnu_cxx::new_allocator"* %this, %struct.record* %__p, i64 %0) #4 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::new_allocator"*, align 8 + %__p.addr = alloca %struct.record*, align 8 + %.addr = alloca i64, align 8 + store %"class.__gnu_cxx::new_allocator"* %this, %"class.__gnu_cxx::new_allocator"** %this.addr, align 8 + store %struct.record* %__p, %struct.record** %__p.addr, align 8 + store i64 %0, i64* %.addr, align 8 + %this1 = load %"class.__gnu_cxx::new_allocator"*, %"class.__gnu_cxx::new_allocator"** %this.addr, align 8 + %1 = load %struct.record*, %struct.record** %__p.addr, align 8 + %2 = bitcast %struct.record* %1 to i8* + call void @_ZdlPv(i8* %2) #12 + ret void +} + +; Function Attrs: nobuiltin nounwind +declare dso_local void @_ZdlPv(i8*) #9 + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZNSaI6recordED2Ev(%"class.std::allocator"* %this) unnamed_addr #4 comdat align 2 { +entry: + %this.addr = alloca %"class.std::allocator"*, align 8 + store %"class.std::allocator"* 
%this, %"class.std::allocator"** %this.addr, align 8 + %this1 = load %"class.std::allocator"*, %"class.std::allocator"** %this.addr, align 8 + %0 = bitcast %"class.std::allocator"* %this1 to %"class.__gnu_cxx::new_allocator"* + call void @_ZN9__gnu_cxx13new_allocatorI6recordED2Ev(%"class.__gnu_cxx::new_allocator"* %0) #12 + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx13new_allocatorI6recordED2Ev(%"class.__gnu_cxx::new_allocator"* %this) unnamed_addr #4 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::new_allocator"*, align 8 + store %"class.__gnu_cxx::new_allocator"* %this, %"class.__gnu_cxx::new_allocator"** %this.addr, align 8 + %this1 = load %"class.__gnu_cxx::new_allocator"*, %"class.__gnu_cxx::new_allocator"** %this.addr, align 8 + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt12_Vector_baseI7latLongSaIS0_EEC2Ev(%"struct.std::_Vector_base.1"* %this) unnamed_addr #0 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Vector_base.1"*, align 8 + store %"struct.std::_Vector_base.1"* %this, %"struct.std::_Vector_base.1"** %this.addr, align 8 + %this1 = load %"struct.std::_Vector_base.1"*, %"struct.std::_Vector_base.1"** %this.addr, align 8 + %_M_impl = getelementptr inbounds %"struct.std::_Vector_base.1", %"struct.std::_Vector_base.1"* %this1, i32 0, i32 0 + call void @_ZNSt12_Vector_baseI7latLongSaIS0_EE12_Vector_implC2Ev(%"struct.std::_Vector_base >::_Vector_impl"* %_M_impl) + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZNSt12_Vector_baseI7latLongSaIS0_EE12_Vector_implC2Ev(%"struct.std::_Vector_base >::_Vector_impl"* %this) unnamed_addr #4 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Vector_base >::_Vector_impl"*, align 8 + store %"struct.std::_Vector_base >::_Vector_impl"* %this, %"struct.std::_Vector_base >::_Vector_impl"** 
%this.addr, align 8 + %this1 = load %"struct.std::_Vector_base >::_Vector_impl"*, %"struct.std::_Vector_base >::_Vector_impl"** %this.addr, align 8 + %0 = bitcast %"struct.std::_Vector_base >::_Vector_impl"* %this1 to %"class.std::allocator.2"* + call void @_ZNSaI7latLongEC2Ev(%"class.std::allocator.2"* %0) #12 + %_M_start = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %this1, i32 0, i32 0 + store %struct.latLong* null, %struct.latLong** %_M_start, align 8 + %_M_finish = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %this1, i32 0, i32 1 + store %struct.latLong* null, %struct.latLong** %_M_finish, align 8 + %_M_end_of_storage = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %this1, i32 0, i32 2 + store %struct.latLong* null, %struct.latLong** %_M_end_of_storage, align 8 + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZNSaI7latLongEC2Ev(%"class.std::allocator.2"* %this) unnamed_addr #4 comdat align 2 { +entry: + %this.addr = alloca %"class.std::allocator.2"*, align 8 + store %"class.std::allocator.2"* %this, %"class.std::allocator.2"** %this.addr, align 8 + %this1 = load %"class.std::allocator.2"*, %"class.std::allocator.2"** %this.addr, align 8 + %0 = bitcast %"class.std::allocator.2"* %this1 to %"class.__gnu_cxx::new_allocator.3"* + call void @_ZN9__gnu_cxx13new_allocatorI7latLongEC2Ev(%"class.__gnu_cxx::new_allocator.3"* %0) #12 + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx13new_allocatorI7latLongEC2Ev(%"class.__gnu_cxx::new_allocator.3"* %this) unnamed_addr #4 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::new_allocator.3"*, align 8 + store %"class.__gnu_cxx::new_allocator.3"* %this, 
%"class.__gnu_cxx::new_allocator.3"** %this.addr, align 8 + %this1 = load %"class.__gnu_cxx::new_allocator.3"*, %"class.__gnu_cxx::new_allocator.3"** %this.addr, align 8 + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZSt8_DestroyIP7latLongS0_EvT_S2_RSaIT0_E(%struct.latLong* %__first, %struct.latLong* %__last, %"class.std::allocator.2"* dereferenceable(1) %0) #0 comdat { +entry: + %__first.addr = alloca %struct.latLong*, align 8 + %__last.addr = alloca %struct.latLong*, align 8 + %.addr = alloca %"class.std::allocator.2"*, align 8 + store %struct.latLong* %__first, %struct.latLong** %__first.addr, align 8 + store %struct.latLong* %__last, %struct.latLong** %__last.addr, align 8 + store %"class.std::allocator.2"* %0, %"class.std::allocator.2"** %.addr, align 8 + %1 = load %struct.latLong*, %struct.latLong** %__first.addr, align 8 + %2 = load %struct.latLong*, %struct.latLong** %__last.addr, align 8 + call void @_ZSt8_DestroyIP7latLongEvT_S2_(%struct.latLong* %1, %struct.latLong* %2) + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local dereferenceable(1) %"class.std::allocator.2"* @_ZNSt12_Vector_baseI7latLongSaIS0_EE19_M_get_Tp_allocatorEv(%"struct.std::_Vector_base.1"* %this) #4 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Vector_base.1"*, align 8 + store %"struct.std::_Vector_base.1"* %this, %"struct.std::_Vector_base.1"** %this.addr, align 8 + %this1 = load %"struct.std::_Vector_base.1"*, %"struct.std::_Vector_base.1"** %this.addr, align 8 + %_M_impl = getelementptr inbounds %"struct.std::_Vector_base.1", %"struct.std::_Vector_base.1"* %this1, i32 0, i32 0 + %0 = bitcast %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl to %"class.std::allocator.2"* + ret %"class.std::allocator.2"* %0 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt12_Vector_baseI7latLongSaIS0_EED2Ev(%"struct.std::_Vector_base.1"* %this) 
unnamed_addr #0 comdat align 2 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %this.addr = alloca %"struct.std::_Vector_base.1"*, align 8 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + store %"struct.std::_Vector_base.1"* %this, %"struct.std::_Vector_base.1"** %this.addr, align 8 + %this1 = load %"struct.std::_Vector_base.1"*, %"struct.std::_Vector_base.1"** %this.addr, align 8 + %_M_impl = getelementptr inbounds %"struct.std::_Vector_base.1", %"struct.std::_Vector_base.1"* %this1, i32 0, i32 0 + %_M_start = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl, i32 0, i32 0 + %0 = load %struct.latLong*, %struct.latLong** %_M_start, align 8 + %_M_impl2 = getelementptr inbounds %"struct.std::_Vector_base.1", %"struct.std::_Vector_base.1"* %this1, i32 0, i32 0 + %_M_end_of_storage = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl2, i32 0, i32 2 + %1 = load %struct.latLong*, %struct.latLong** %_M_end_of_storage, align 8 + %_M_impl3 = getelementptr inbounds %"struct.std::_Vector_base.1", %"struct.std::_Vector_base.1"* %this1, i32 0, i32 0 + %_M_start4 = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl3, i32 0, i32 0 + %2 = load %struct.latLong*, %struct.latLong** %_M_start4, align 8 + %sub.ptr.lhs.cast = ptrtoint %struct.latLong* %1 to i64 + %sub.ptr.rhs.cast = ptrtoint %struct.latLong* %2 to i64 + %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast + %sub.ptr.div = sdiv exact i64 %sub.ptr.sub, 8 + invoke void @_ZNSt12_Vector_baseI7latLongSaIS0_EE13_M_deallocateEPS0_m(%"struct.std::_Vector_base.1"* %this1, %struct.latLong* %0, i64 %sub.ptr.div) + to label %invoke.cont unwind label %lpad + +invoke.cont: ; preds = %entry + %_M_impl5 = getelementptr inbounds %"struct.std::_Vector_base.1", 
%"struct.std::_Vector_base.1"* %this1, i32 0, i32 0 + call void @_ZNSt12_Vector_baseI7latLongSaIS0_EE12_Vector_implD2Ev(%"struct.std::_Vector_base >::_Vector_impl"* %_M_impl5) #12 + ret void + +lpad: ; preds = %entry + %3 = landingpad { i8*, i32 } + cleanup + %4 = extractvalue { i8*, i32 } %3, 0 + store i8* %4, i8** %exn.slot, align 8 + %5 = extractvalue { i8*, i32 } %3, 1 + store i32 %5, i32* %ehselector.slot, align 4 + %_M_impl6 = getelementptr inbounds %"struct.std::_Vector_base.1", %"struct.std::_Vector_base.1"* %this1, i32 0, i32 0 + call void @_ZNSt12_Vector_baseI7latLongSaIS0_EE12_Vector_implD2Ev(%"struct.std::_Vector_base >::_Vector_impl"* %_M_impl6) #12 + br label %eh.resume + +eh.resume: ; preds = %lpad + %exn = load i8*, i8** %exn.slot, align 8 + %sel = load i32, i32* %ehselector.slot, align 4 + %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn, 0 + %lpad.val7 = insertvalue { i8*, i32 } %lpad.val, i32 %sel, 1 + resume { i8*, i32 } %lpad.val7 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZSt8_DestroyIP7latLongEvT_S2_(%struct.latLong* %__first, %struct.latLong* %__last) #0 comdat { +entry: + %__first.addr = alloca %struct.latLong*, align 8 + %__last.addr = alloca %struct.latLong*, align 8 + store %struct.latLong* %__first, %struct.latLong** %__first.addr, align 8 + store %struct.latLong* %__last, %struct.latLong** %__last.addr, align 8 + %0 = load %struct.latLong*, %struct.latLong** %__first.addr, align 8 + %1 = load %struct.latLong*, %struct.latLong** %__last.addr, align 8 + call void @_ZNSt12_Destroy_auxILb1EE9__destroyIP7latLongEEvT_S4_(%struct.latLong* %0, %struct.latLong* %1) + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZNSt12_Destroy_auxILb1EE9__destroyIP7latLongEEvT_S4_(%struct.latLong* %0, %struct.latLong* %1) #4 comdat align 2 { +entry: + %.addr = alloca %struct.latLong*, align 8 + %.addr1 = alloca %struct.latLong*, align 8 + store 
%struct.latLong* %0, %struct.latLong** %.addr, align 8 + store %struct.latLong* %1, %struct.latLong** %.addr1, align 8 + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt12_Vector_baseI7latLongSaIS0_EE13_M_deallocateEPS0_m(%"struct.std::_Vector_base.1"* %this, %struct.latLong* %__p, i64 %__n) #0 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Vector_base.1"*, align 8 + %__p.addr = alloca %struct.latLong*, align 8 + %__n.addr = alloca i64, align 8 + store %"struct.std::_Vector_base.1"* %this, %"struct.std::_Vector_base.1"** %this.addr, align 8 + store %struct.latLong* %__p, %struct.latLong** %__p.addr, align 8 + store i64 %__n, i64* %__n.addr, align 8 + %this1 = load %"struct.std::_Vector_base.1"*, %"struct.std::_Vector_base.1"** %this.addr, align 8 + %0 = load %struct.latLong*, %struct.latLong** %__p.addr, align 8 + %tobool = icmp ne %struct.latLong* %0, null + br i1 %tobool, label %if.then, label %if.end + +if.then: ; preds = %entry + %_M_impl = getelementptr inbounds %"struct.std::_Vector_base.1", %"struct.std::_Vector_base.1"* %this1, i32 0, i32 0 + %1 = bitcast %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl to %"class.std::allocator.2"* + %2 = load %struct.latLong*, %struct.latLong** %__p.addr, align 8 + %3 = load i64, i64* %__n.addr, align 8 + call void @_ZN9__gnu_cxx14__alloc_traitsISaI7latLongEE10deallocateERS2_PS1_m(%"class.std::allocator.2"* dereferenceable(1) %1, %struct.latLong* %2, i64 %3) + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZNSt12_Vector_baseI7latLongSaIS0_EE12_Vector_implD2Ev(%"struct.std::_Vector_base >::_Vector_impl"* %this) unnamed_addr #4 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Vector_base >::_Vector_impl"*, align 8 + store %"struct.std::_Vector_base >::_Vector_impl"* %this, %"struct.std::_Vector_base >::_Vector_impl"** 
%this.addr, align 8 + %this1 = load %"struct.std::_Vector_base >::_Vector_impl"*, %"struct.std::_Vector_base >::_Vector_impl"** %this.addr, align 8 + %0 = bitcast %"struct.std::_Vector_base >::_Vector_impl"* %this1 to %"class.std::allocator.2"* + call void @_ZNSaI7latLongED2Ev(%"class.std::allocator.2"* %0) #12 + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx14__alloc_traitsISaI7latLongEE10deallocateERS2_PS1_m(%"class.std::allocator.2"* dereferenceable(1) %__a, %struct.latLong* %__p, i64 %__n) #0 comdat align 2 { +entry: + %__a.addr = alloca %"class.std::allocator.2"*, align 8 + %__p.addr = alloca %struct.latLong*, align 8 + %__n.addr = alloca i64, align 8 + store %"class.std::allocator.2"* %__a, %"class.std::allocator.2"** %__a.addr, align 8 + store %struct.latLong* %__p, %struct.latLong** %__p.addr, align 8 + store i64 %__n, i64* %__n.addr, align 8 + %0 = load %"class.std::allocator.2"*, %"class.std::allocator.2"** %__a.addr, align 8 + %1 = bitcast %"class.std::allocator.2"* %0 to %"class.__gnu_cxx::new_allocator.3"* + %2 = load %struct.latLong*, %struct.latLong** %__p.addr, align 8 + %3 = load i64, i64* %__n.addr, align 8 + call void @_ZN9__gnu_cxx13new_allocatorI7latLongE10deallocateEPS1_m(%"class.__gnu_cxx::new_allocator.3"* %1, %struct.latLong* %2, i64 %3) + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx13new_allocatorI7latLongE10deallocateEPS1_m(%"class.__gnu_cxx::new_allocator.3"* %this, %struct.latLong* %__p, i64 %0) #4 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::new_allocator.3"*, align 8 + %__p.addr = alloca %struct.latLong*, align 8 + %.addr = alloca i64, align 8 + store %"class.__gnu_cxx::new_allocator.3"* %this, %"class.__gnu_cxx::new_allocator.3"** %this.addr, align 8 + store %struct.latLong* %__p, %struct.latLong** %__p.addr, align 8 + store i64 %0, i64* %.addr, align 8 + %this1 = load 
%"class.__gnu_cxx::new_allocator.3"*, %"class.__gnu_cxx::new_allocator.3"** %this.addr, align 8 + %1 = load %struct.latLong*, %struct.latLong** %__p.addr, align 8 + %2 = bitcast %struct.latLong* %1 to i8* + call void @_ZdlPv(i8* %2) #12 + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZNSaI7latLongED2Ev(%"class.std::allocator.2"* %this) unnamed_addr #4 comdat align 2 { +entry: + %this.addr = alloca %"class.std::allocator.2"*, align 8 + store %"class.std::allocator.2"* %this, %"class.std::allocator.2"** %this.addr, align 8 + %this1 = load %"class.std::allocator.2"*, %"class.std::allocator.2"** %this.addr, align 8 + %0 = bitcast %"class.std::allocator.2"* %this1 to %"class.__gnu_cxx::new_allocator.3"* + call void @_ZN9__gnu_cxx13new_allocatorI7latLongED2Ev(%"class.__gnu_cxx::new_allocator.3"* %0) #12 + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx13new_allocatorI7latLongED2Ev(%"class.__gnu_cxx::new_allocator.3"* %this) unnamed_addr #4 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::new_allocator.3"*, align 8 + store %"class.__gnu_cxx::new_allocator.3"* %this, %"class.__gnu_cxx::new_allocator.3"** %this.addr, align 8 + %this1 = load %"class.__gnu_cxx::new_allocator.3"*, %"class.__gnu_cxx::new_allocator.3"** %this.addr, align 8 + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx14__alloc_traitsISaI7latLongEE9constructIS1_EEvRS2_PS1_RKT_(%"class.std::allocator.2"* dereferenceable(1) %__a, %struct.latLong* %__p, %struct.latLong* dereferenceable(8) %__arg) #0 comdat align 2 { +entry: + %__a.addr = alloca %"class.std::allocator.2"*, align 8 + %__p.addr = alloca %struct.latLong*, align 8 + %__arg.addr = alloca %struct.latLong*, align 8 + store %"class.std::allocator.2"* %__a, %"class.std::allocator.2"** %__a.addr, align 8 + store %struct.latLong* %__p, %struct.latLong** 
%__p.addr, align 8 + store %struct.latLong* %__arg, %struct.latLong** %__arg.addr, align 8 + %0 = load %"class.std::allocator.2"*, %"class.std::allocator.2"** %__a.addr, align 8 + %1 = bitcast %"class.std::allocator.2"* %0 to %"class.__gnu_cxx::new_allocator.3"* + %2 = load %struct.latLong*, %struct.latLong** %__p.addr, align 8 + %3 = load %struct.latLong*, %struct.latLong** %__arg.addr, align 8 + call void @_ZN9__gnu_cxx13new_allocatorI7latLongE9constructEPS1_RKS1_(%"class.__gnu_cxx::new_allocator.3"* %1, %struct.latLong* %2, %struct.latLong* dereferenceable(8) %3) + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt6vectorI7latLongSaIS0_EE17_M_realloc_insertEN9__gnu_cxx17__normal_iteratorIPS0_S2_EERKS0_(%"class.std::vector.0"* %this, %struct.latLong* %__position.coerce, %struct.latLong* dereferenceable(8) %__x) #0 comdat align 2 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %__position = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %this.addr = alloca %"class.std::vector.0"*, align 8 + %__x.addr = alloca %struct.latLong*, align 8 + %__len = alloca i64, align 8 + %__elems_before = alloca i64, align 8 + %ref.tmp = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %__new_start = alloca %struct.latLong*, align 8 + %__new_finish = alloca %struct.latLong*, align 8 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + %coerce.dive = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %__position, i32 0, i32 0 + store %struct.latLong* %__position.coerce, %struct.latLong** %coerce.dive, align 8 + store %"class.std::vector.0"* %this, %"class.std::vector.0"** %this.addr, align 8 + store %struct.latLong* %__x, %struct.latLong** %__x.addr, align 8 + %this1 = load %"class.std::vector.0"*, %"class.std::vector.0"** %this.addr, align 8 + %call = call i64 @_ZNKSt6vectorI7latLongSaIS0_EE12_M_check_lenEmPKc(%"class.std::vector.0"* 
%this1, i64 1, i8* getelementptr inbounds ([26 x i8], [26 x i8]* @.str.28, i64 0, i64 0)) + store i64 %call, i64* %__len, align 8 + %call2 = call %struct.latLong* @_ZNSt6vectorI7latLongSaIS0_EE5beginEv(%"class.std::vector.0"* %this1) + %coerce.dive3 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %ref.tmp, i32 0, i32 0 + store %struct.latLong* %call2, %struct.latLong** %coerce.dive3, align 8 + %call4 = call i64 @_ZN9__gnu_cxxmiIP7latLongSt6vectorIS1_SaIS1_EEEENS_17__normal_iteratorIT_T0_E15difference_typeERKS9_SC_(%"class.__gnu_cxx::__normal_iterator"* dereferenceable(8) %__position, %"class.__gnu_cxx::__normal_iterator"* dereferenceable(8) %ref.tmp) + store i64 %call4, i64* %__elems_before, align 8 + %0 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Vector_base.1"* + %1 = load i64, i64* %__len, align 8 + %call5 = call %struct.latLong* @_ZNSt12_Vector_baseI7latLongSaIS0_EE11_M_allocateEm(%"struct.std::_Vector_base.1"* %0, i64 %1) + store %struct.latLong* %call5, %struct.latLong** %__new_start, align 8 + %2 = load %struct.latLong*, %struct.latLong** %__new_start, align 8 + store %struct.latLong* %2, %struct.latLong** %__new_finish, align 8 + %3 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Vector_base.1"* + %_M_impl = getelementptr inbounds %"struct.std::_Vector_base.1", %"struct.std::_Vector_base.1"* %3, i32 0, i32 0 + %4 = bitcast %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl to %"class.std::allocator.2"* + %5 = load %struct.latLong*, %struct.latLong** %__new_start, align 8 + %6 = load i64, i64* %__elems_before, align 8 + %add.ptr = getelementptr inbounds %struct.latLong, %struct.latLong* %5, i64 %6 + %7 = load %struct.latLong*, %struct.latLong** %__x.addr, align 8 + invoke void @_ZN9__gnu_cxx14__alloc_traitsISaI7latLongEE9constructIS1_EEvRS2_PS1_RKT_(%"class.std::allocator.2"* dereferenceable(1) %4, %struct.latLong* %add.ptr, %struct.latLong* dereferenceable(8) %7) + to label 
%invoke.cont unwind label %lpad + +invoke.cont: ; preds = %entry + store %struct.latLong* null, %struct.latLong** %__new_finish, align 8 + %8 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Vector_base.1"* + %_M_impl6 = getelementptr inbounds %"struct.std::_Vector_base.1", %"struct.std::_Vector_base.1"* %8, i32 0, i32 0 + %_M_start = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl6, i32 0, i32 0 + %9 = load %struct.latLong*, %struct.latLong** %_M_start, align 8 + %call8 = invoke dereferenceable(8) %struct.latLong** @_ZNK9__gnu_cxx17__normal_iteratorIP7latLongSt6vectorIS1_SaIS1_EEE4baseEv(%"class.__gnu_cxx::__normal_iterator"* %__position) + to label %invoke.cont7 unwind label %lpad + +invoke.cont7: ; preds = %invoke.cont + %10 = load %struct.latLong*, %struct.latLong** %call8, align 8 + %11 = load %struct.latLong*, %struct.latLong** %__new_start, align 8 + %12 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Vector_base.1"* + %call10 = invoke dereferenceable(1) %"class.std::allocator.2"* @_ZNSt12_Vector_baseI7latLongSaIS0_EE19_M_get_Tp_allocatorEv(%"struct.std::_Vector_base.1"* %12) + to label %invoke.cont9 unwind label %lpad + +invoke.cont9: ; preds = %invoke.cont7 + %call12 = invoke %struct.latLong* @_ZSt34__uninitialized_move_if_noexcept_aIP7latLongS1_SaIS0_EET0_T_S4_S3_RT1_(%struct.latLong* %9, %struct.latLong* %10, %struct.latLong* %11, %"class.std::allocator.2"* dereferenceable(1) %call10) + to label %invoke.cont11 unwind label %lpad + +invoke.cont11: ; preds = %invoke.cont9 + store %struct.latLong* %call12, %struct.latLong** %__new_finish, align 8 + %13 = load %struct.latLong*, %struct.latLong** %__new_finish, align 8 + %incdec.ptr = getelementptr inbounds %struct.latLong, %struct.latLong* %13, i32 1 + store %struct.latLong* %incdec.ptr, %struct.latLong** %__new_finish, align 8 + %call14 = invoke dereferenceable(8) %struct.latLong** 
@_ZNK9__gnu_cxx17__normal_iteratorIP7latLongSt6vectorIS1_SaIS1_EEE4baseEv(%"class.__gnu_cxx::__normal_iterator"* %__position) + to label %invoke.cont13 unwind label %lpad + +invoke.cont13: ; preds = %invoke.cont11 + %14 = load %struct.latLong*, %struct.latLong** %call14, align 8 + %15 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Vector_base.1"* + %_M_impl15 = getelementptr inbounds %"struct.std::_Vector_base.1", %"struct.std::_Vector_base.1"* %15, i32 0, i32 0 + %_M_finish = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl15, i32 0, i32 1 + %16 = load %struct.latLong*, %struct.latLong** %_M_finish, align 8 + %17 = load %struct.latLong*, %struct.latLong** %__new_finish, align 8 + %18 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Vector_base.1"* + %call17 = invoke dereferenceable(1) %"class.std::allocator.2"* @_ZNSt12_Vector_baseI7latLongSaIS0_EE19_M_get_Tp_allocatorEv(%"struct.std::_Vector_base.1"* %18) + to label %invoke.cont16 unwind label %lpad + +invoke.cont16: ; preds = %invoke.cont13 + %call19 = invoke %struct.latLong* @_ZSt34__uninitialized_move_if_noexcept_aIP7latLongS1_SaIS0_EET0_T_S4_S3_RT1_(%struct.latLong* %14, %struct.latLong* %16, %struct.latLong* %17, %"class.std::allocator.2"* dereferenceable(1) %call17) + to label %invoke.cont18 unwind label %lpad + +invoke.cont18: ; preds = %invoke.cont16 + store %struct.latLong* %call19, %struct.latLong** %__new_finish, align 8 + br label %try.cont + +lpad: ; preds = %invoke.cont16, %invoke.cont13, %invoke.cont11, %invoke.cont9, %invoke.cont7, %invoke.cont, %entry + %19 = landingpad { i8*, i32 } + catch i8* null + %20 = extractvalue { i8*, i32 } %19, 0 + store i8* %20, i8** %exn.slot, align 8 + %21 = extractvalue { i8*, i32 } %19, 1 + store i32 %21, i32* %ehselector.slot, align 4 + br label %catch + +catch: ; preds = %lpad + %exn = load i8*, i8** %exn.slot, align 8 + %22 = call i8* @__cxa_begin_catch(i8* %exn) #12 + 
%23 = load %struct.latLong*, %struct.latLong** %__new_finish, align 8 + %tobool = icmp ne %struct.latLong* %23, null + br i1 %tobool, label %if.else, label %if.then + +if.then: ; preds = %catch + %24 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Vector_base.1"* + %_M_impl20 = getelementptr inbounds %"struct.std::_Vector_base.1", %"struct.std::_Vector_base.1"* %24, i32 0, i32 0 + %25 = bitcast %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl20 to %"class.std::allocator.2"* + %26 = load %struct.latLong*, %struct.latLong** %__new_start, align 8 + %27 = load i64, i64* %__elems_before, align 8 + %add.ptr21 = getelementptr inbounds %struct.latLong, %struct.latLong* %26, i64 %27 + invoke void @_ZN9__gnu_cxx14__alloc_traitsISaI7latLongEE7destroyERS2_PS1_(%"class.std::allocator.2"* dereferenceable(1) %25, %struct.latLong* %add.ptr21) + to label %invoke.cont23 unwind label %lpad22 + +invoke.cont23: ; preds = %if.then + br label %if.end + +lpad22: ; preds = %invoke.cont27, %if.end, %invoke.cont24, %if.else, %if.then + %28 = landingpad { i8*, i32 } + cleanup + %29 = extractvalue { i8*, i32 } %28, 0 + store i8* %29, i8** %exn.slot, align 8 + %30 = extractvalue { i8*, i32 } %28, 1 + store i32 %30, i32* %ehselector.slot, align 4 + invoke void @__cxa_end_catch() + to label %invoke.cont28 unwind label %terminate.lpad + +if.else: ; preds = %catch + %31 = load %struct.latLong*, %struct.latLong** %__new_start, align 8 + %32 = load %struct.latLong*, %struct.latLong** %__new_finish, align 8 + %33 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Vector_base.1"* + %call25 = invoke dereferenceable(1) %"class.std::allocator.2"* @_ZNSt12_Vector_baseI7latLongSaIS0_EE19_M_get_Tp_allocatorEv(%"struct.std::_Vector_base.1"* %33) + to label %invoke.cont24 unwind label %lpad22 + +invoke.cont24: ; preds = %if.else + invoke void @_ZSt8_DestroyIP7latLongS0_EvT_S2_RSaIT0_E(%struct.latLong* %31, %struct.latLong* %32, %"class.std::allocator.2"* dereferenceable(1) %call25) + to 
label %invoke.cont26 unwind label %lpad22 + +invoke.cont26: ; preds = %invoke.cont24 + br label %if.end + +if.end: ; preds = %invoke.cont26, %invoke.cont23 + %34 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Vector_base.1"* + %35 = load %struct.latLong*, %struct.latLong** %__new_start, align 8 + %36 = load i64, i64* %__len, align 8 + invoke void @_ZNSt12_Vector_baseI7latLongSaIS0_EE13_M_deallocateEPS0_m(%"struct.std::_Vector_base.1"* %34, %struct.latLong* %35, i64 %36) + to label %invoke.cont27 unwind label %lpad22 + +invoke.cont27: ; preds = %if.end + invoke void @__cxa_rethrow() #15 + to label %unreachable unwind label %lpad22 + +invoke.cont28: ; preds = %lpad22 + br label %eh.resume + +try.cont: ; preds = %invoke.cont18 + %37 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Vector_base.1"* + %_M_impl29 = getelementptr inbounds %"struct.std::_Vector_base.1", %"struct.std::_Vector_base.1"* %37, i32 0, i32 0 + %_M_start30 = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl29, i32 0, i32 0 + %38 = load %struct.latLong*, %struct.latLong** %_M_start30, align 8 + %39 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Vector_base.1"* + %_M_impl31 = getelementptr inbounds %"struct.std::_Vector_base.1", %"struct.std::_Vector_base.1"* %39, i32 0, i32 0 + %_M_finish32 = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl31, i32 0, i32 1 + %40 = load %struct.latLong*, %struct.latLong** %_M_finish32, align 8 + %41 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Vector_base.1"* + %call33 = call dereferenceable(1) %"class.std::allocator.2"* @_ZNSt12_Vector_baseI7latLongSaIS0_EE19_M_get_Tp_allocatorEv(%"struct.std::_Vector_base.1"* %41) + call void @_ZSt8_DestroyIP7latLongS0_EvT_S2_RSaIT0_E(%struct.latLong* %38, %struct.latLong* %40, %"class.std::allocator.2"* dereferenceable(1) %call33) + %42 = 
bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Vector_base.1"* + %43 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Vector_base.1"* + %_M_impl34 = getelementptr inbounds %"struct.std::_Vector_base.1", %"struct.std::_Vector_base.1"* %43, i32 0, i32 0 + %_M_start35 = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl34, i32 0, i32 0 + %44 = load %struct.latLong*, %struct.latLong** %_M_start35, align 8 + %45 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Vector_base.1"* + %_M_impl36 = getelementptr inbounds %"struct.std::_Vector_base.1", %"struct.std::_Vector_base.1"* %45, i32 0, i32 0 + %_M_end_of_storage = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl36, i32 0, i32 2 + %46 = load %struct.latLong*, %struct.latLong** %_M_end_of_storage, align 8 + %47 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Vector_base.1"* + %_M_impl37 = getelementptr inbounds %"struct.std::_Vector_base.1", %"struct.std::_Vector_base.1"* %47, i32 0, i32 0 + %_M_start38 = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl37, i32 0, i32 0 + %48 = load %struct.latLong*, %struct.latLong** %_M_start38, align 8 + %sub.ptr.lhs.cast = ptrtoint %struct.latLong* %46 to i64 + %sub.ptr.rhs.cast = ptrtoint %struct.latLong* %48 to i64 + %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast + %sub.ptr.div = sdiv exact i64 %sub.ptr.sub, 8 + call void @_ZNSt12_Vector_baseI7latLongSaIS0_EE13_M_deallocateEPS0_m(%"struct.std::_Vector_base.1"* %42, %struct.latLong* %44, i64 %sub.ptr.div) + %49 = load %struct.latLong*, %struct.latLong** %__new_start, align 8 + %50 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Vector_base.1"* + %_M_impl39 = getelementptr inbounds %"struct.std::_Vector_base.1", %"struct.std::_Vector_base.1"* %50, i32 0, i32 
0 + %_M_start40 = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl39, i32 0, i32 0 + store %struct.latLong* %49, %struct.latLong** %_M_start40, align 8 + %51 = load %struct.latLong*, %struct.latLong** %__new_finish, align 8 + %52 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Vector_base.1"* + %_M_impl41 = getelementptr inbounds %"struct.std::_Vector_base.1", %"struct.std::_Vector_base.1"* %52, i32 0, i32 0 + %_M_finish42 = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl41, i32 0, i32 1 + store %struct.latLong* %51, %struct.latLong** %_M_finish42, align 8 + %53 = load %struct.latLong*, %struct.latLong** %__new_start, align 8 + %54 = load i64, i64* %__len, align 8 + %add.ptr43 = getelementptr inbounds %struct.latLong, %struct.latLong* %53, i64 %54 + %55 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Vector_base.1"* + %_M_impl44 = getelementptr inbounds %"struct.std::_Vector_base.1", %"struct.std::_Vector_base.1"* %55, i32 0, i32 0 + %_M_end_of_storage45 = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl44, i32 0, i32 2 + store %struct.latLong* %add.ptr43, %struct.latLong** %_M_end_of_storage45, align 8 + ret void + +eh.resume: ; preds = %invoke.cont28 + %exn46 = load i8*, i8** %exn.slot, align 8 + %sel = load i32, i32* %ehselector.slot, align 4 + %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn46, 0 + %lpad.val47 = insertvalue { i8*, i32 } %lpad.val, i32 %sel, 1 + resume { i8*, i32 } %lpad.val47 + +terminate.lpad: ; preds = %lpad22 + %56 = landingpad { i8*, i32 } + catch i8* null + %57 = extractvalue { i8*, i32 } %56, 0 + call void @__clang_call_terminate(i8* %57) #13 + unreachable + +unreachable: ; preds = %invoke.cont27 + unreachable +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local 
%struct.latLong* @_ZNSt6vectorI7latLongSaIS0_EE3endEv(%"class.std::vector.0"* %this) #0 comdat align 2 { +entry: + %retval = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %this.addr = alloca %"class.std::vector.0"*, align 8 + store %"class.std::vector.0"* %this, %"class.std::vector.0"** %this.addr, align 8 + %this1 = load %"class.std::vector.0"*, %"class.std::vector.0"** %this.addr, align 8 + %0 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Vector_base.1"* + %_M_impl = getelementptr inbounds %"struct.std::_Vector_base.1", %"struct.std::_Vector_base.1"* %0, i32 0, i32 0 + %_M_finish = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl, i32 0, i32 1 + call void @_ZN9__gnu_cxx17__normal_iteratorIP7latLongSt6vectorIS1_SaIS1_EEEC2ERKS2_(%"class.__gnu_cxx::__normal_iterator"* %retval, %struct.latLong** dereferenceable(8) %_M_finish) + %coerce.dive = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %retval, i32 0, i32 0 + %1 = load %struct.latLong*, %struct.latLong** %coerce.dive, align 8 + ret %struct.latLong* %1 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx13new_allocatorI7latLongE9constructEPS1_RKS1_(%"class.__gnu_cxx::new_allocator.3"* %this, %struct.latLong* %__p, %struct.latLong* dereferenceable(8) %__val) #4 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::new_allocator.3"*, align 8 + %__p.addr = alloca %struct.latLong*, align 8 + %__val.addr = alloca %struct.latLong*, align 8 + store %"class.__gnu_cxx::new_allocator.3"* %this, %"class.__gnu_cxx::new_allocator.3"** %this.addr, align 8 + store %struct.latLong* %__p, %struct.latLong** %__p.addr, align 8 + store %struct.latLong* %__val, %struct.latLong** %__val.addr, align 8 + %this1 = load %"class.__gnu_cxx::new_allocator.3"*, %"class.__gnu_cxx::new_allocator.3"** %this.addr, align 8 + %0 = load 
%struct.latLong*, %struct.latLong** %__p.addr, align 8 + %1 = bitcast %struct.latLong* %0 to i8* + %2 = bitcast i8* %1 to %struct.latLong* + %3 = load %struct.latLong*, %struct.latLong** %__val.addr, align 8 + %4 = bitcast %struct.latLong* %2 to i8* + %5 = bitcast %struct.latLong* %3 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %4, i8* align 4 %5, i64 8, i1 false) + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local i64 @_ZNKSt6vectorI7latLongSaIS0_EE12_M_check_lenEmPKc(%"class.std::vector.0"* %this, i64 %__n, i8* %__s) #0 comdat align 2 { +entry: + %this.addr = alloca %"class.std::vector.0"*, align 8 + %__n.addr = alloca i64, align 8 + %__s.addr = alloca i8*, align 8 + %__len = alloca i64, align 8 + %ref.tmp = alloca i64, align 8 + store %"class.std::vector.0"* %this, %"class.std::vector.0"** %this.addr, align 8 + store i64 %__n, i64* %__n.addr, align 8 + store i8* %__s, i8** %__s.addr, align 8 + %this1 = load %"class.std::vector.0"*, %"class.std::vector.0"** %this.addr, align 8 + %call = call i64 @_ZNKSt6vectorI7latLongSaIS0_EE8max_sizeEv(%"class.std::vector.0"* %this1) + %call2 = call i64 @_ZNKSt6vectorI7latLongSaIS0_EE4sizeEv(%"class.std::vector.0"* %this1) + %sub = sub i64 %call, %call2 + %0 = load i64, i64* %__n.addr, align 8 + %cmp = icmp ult i64 %sub, %0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %1 = load i8*, i8** %__s.addr, align 8 + call void @_ZSt20__throw_length_errorPKc(i8* %1) #15 + unreachable + +if.end: ; preds = %entry + %call3 = call i64 @_ZNKSt6vectorI7latLongSaIS0_EE4sizeEv(%"class.std::vector.0"* %this1) + %call4 = call i64 @_ZNKSt6vectorI7latLongSaIS0_EE4sizeEv(%"class.std::vector.0"* %this1) + store i64 %call4, i64* %ref.tmp, align 8 + %call5 = call dereferenceable(8) i64* @_ZSt3maxImERKT_S2_S2_(i64* dereferenceable(8) %ref.tmp, i64* dereferenceable(8) %__n.addr) + %2 = load i64, i64* %call5, align 8 + %add = add i64 %call3, %2 + store i64 %add, i64* %__len, 
align 8 + %3 = load i64, i64* %__len, align 8 + %call6 = call i64 @_ZNKSt6vectorI7latLongSaIS0_EE4sizeEv(%"class.std::vector.0"* %this1) + %cmp7 = icmp ult i64 %3, %call6 + br i1 %cmp7, label %cond.true, label %lor.lhs.false + +lor.lhs.false: ; preds = %if.end + %4 = load i64, i64* %__len, align 8 + %call8 = call i64 @_ZNKSt6vectorI7latLongSaIS0_EE8max_sizeEv(%"class.std::vector.0"* %this1) + %cmp9 = icmp ugt i64 %4, %call8 + br i1 %cmp9, label %cond.true, label %cond.false + +cond.true: ; preds = %lor.lhs.false, %if.end + %call10 = call i64 @_ZNKSt6vectorI7latLongSaIS0_EE8max_sizeEv(%"class.std::vector.0"* %this1) + br label %cond.end + +cond.false: ; preds = %lor.lhs.false + %5 = load i64, i64* %__len, align 8 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i64 [ %call10, %cond.true ], [ %5, %cond.false ] + ret i64 %cond +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local i64 @_ZN9__gnu_cxxmiIP7latLongSt6vectorIS1_SaIS1_EEEENS_17__normal_iteratorIT_T0_E15difference_typeERKS9_SC_(%"class.__gnu_cxx::__normal_iterator"* dereferenceable(8) %__lhs, %"class.__gnu_cxx::__normal_iterator"* dereferenceable(8) %__rhs) #0 comdat { +entry: + %__lhs.addr = alloca %"class.__gnu_cxx::__normal_iterator"*, align 8 + %__rhs.addr = alloca %"class.__gnu_cxx::__normal_iterator"*, align 8 + store %"class.__gnu_cxx::__normal_iterator"* %__lhs, %"class.__gnu_cxx::__normal_iterator"** %__lhs.addr, align 8 + store %"class.__gnu_cxx::__normal_iterator"* %__rhs, %"class.__gnu_cxx::__normal_iterator"** %__rhs.addr, align 8 + %0 = load %"class.__gnu_cxx::__normal_iterator"*, %"class.__gnu_cxx::__normal_iterator"** %__lhs.addr, align 8 + %call = call dereferenceable(8) %struct.latLong** @_ZNK9__gnu_cxx17__normal_iteratorIP7latLongSt6vectorIS1_SaIS1_EEE4baseEv(%"class.__gnu_cxx::__normal_iterator"* %0) + %1 = load %struct.latLong*, %struct.latLong** %call, align 8 + %2 = load %"class.__gnu_cxx::__normal_iterator"*, 
%"class.__gnu_cxx::__normal_iterator"** %__rhs.addr, align 8 + %call1 = call dereferenceable(8) %struct.latLong** @_ZNK9__gnu_cxx17__normal_iteratorIP7latLongSt6vectorIS1_SaIS1_EEE4baseEv(%"class.__gnu_cxx::__normal_iterator"* %2) + %3 = load %struct.latLong*, %struct.latLong** %call1, align 8 + %sub.ptr.lhs.cast = ptrtoint %struct.latLong* %1 to i64 + %sub.ptr.rhs.cast = ptrtoint %struct.latLong* %3 to i64 + %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast + %sub.ptr.div = sdiv exact i64 %sub.ptr.sub, 8 + ret i64 %sub.ptr.div +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %struct.latLong* @_ZNSt6vectorI7latLongSaIS0_EE5beginEv(%"class.std::vector.0"* %this) #0 comdat align 2 { +entry: + %retval = alloca %"class.__gnu_cxx::__normal_iterator", align 8 + %this.addr = alloca %"class.std::vector.0"*, align 8 + store %"class.std::vector.0"* %this, %"class.std::vector.0"** %this.addr, align 8 + %this1 = load %"class.std::vector.0"*, %"class.std::vector.0"** %this.addr, align 8 + %0 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Vector_base.1"* + %_M_impl = getelementptr inbounds %"struct.std::_Vector_base.1", %"struct.std::_Vector_base.1"* %0, i32 0, i32 0 + %_M_start = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl, i32 0, i32 0 + call void @_ZN9__gnu_cxx17__normal_iteratorIP7latLongSt6vectorIS1_SaIS1_EEEC2ERKS2_(%"class.__gnu_cxx::__normal_iterator"* %retval, %struct.latLong** dereferenceable(8) %_M_start) + %coerce.dive = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %retval, i32 0, i32 0 + %1 = load %struct.latLong*, %struct.latLong** %coerce.dive, align 8 + ret %struct.latLong* %1 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %struct.latLong* @_ZNSt12_Vector_baseI7latLongSaIS0_EE11_M_allocateEm(%"struct.std::_Vector_base.1"* %this, i64 %__n) #0 
comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Vector_base.1"*, align 8 + %__n.addr = alloca i64, align 8 + store %"struct.std::_Vector_base.1"* %this, %"struct.std::_Vector_base.1"** %this.addr, align 8 + store i64 %__n, i64* %__n.addr, align 8 + %this1 = load %"struct.std::_Vector_base.1"*, %"struct.std::_Vector_base.1"** %this.addr, align 8 + %0 = load i64, i64* %__n.addr, align 8 + %cmp = icmp ne i64 %0, 0 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + %_M_impl = getelementptr inbounds %"struct.std::_Vector_base.1", %"struct.std::_Vector_base.1"* %this1, i32 0, i32 0 + %1 = bitcast %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl to %"class.std::allocator.2"* + %2 = load i64, i64* %__n.addr, align 8 + %call = call %struct.latLong* @_ZN9__gnu_cxx14__alloc_traitsISaI7latLongEE8allocateERS2_m(%"class.std::allocator.2"* dereferenceable(1) %1, i64 %2) + br label %cond.end + +cond.false: ; preds = %entry + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi %struct.latLong* [ %call, %cond.true ], [ null, %cond.false ] + ret %struct.latLong* %cond +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %struct.latLong* @_ZSt34__uninitialized_move_if_noexcept_aIP7latLongS1_SaIS0_EET0_T_S4_S3_RT1_(%struct.latLong* %__first, %struct.latLong* %__last, %struct.latLong* %__result, %"class.std::allocator.2"* dereferenceable(1) %__alloc) #0 comdat { +entry: + %__first.addr = alloca %struct.latLong*, align 8 + %__last.addr = alloca %struct.latLong*, align 8 + %__result.addr = alloca %struct.latLong*, align 8 + %__alloc.addr = alloca %"class.std::allocator.2"*, align 8 + store %struct.latLong* %__first, %struct.latLong** %__first.addr, align 8 + store %struct.latLong* %__last, %struct.latLong** %__last.addr, align 8 + store %struct.latLong* %__result, %struct.latLong** %__result.addr, align 8 + store %"class.std::allocator.2"* %__alloc, %"class.std::allocator.2"** 
%__alloc.addr, align 8 + %0 = load %struct.latLong*, %struct.latLong** %__first.addr, align 8 + %1 = load %struct.latLong*, %struct.latLong** %__last.addr, align 8 + %2 = load %struct.latLong*, %struct.latLong** %__result.addr, align 8 + %3 = load %"class.std::allocator.2"*, %"class.std::allocator.2"** %__alloc.addr, align 8 + %call = call %struct.latLong* @_ZSt22__uninitialized_copy_aIP7latLongS1_S0_ET0_T_S3_S2_RSaIT1_E(%struct.latLong* %0, %struct.latLong* %1, %struct.latLong* %2, %"class.std::allocator.2"* dereferenceable(1) %3) + ret %struct.latLong* %call +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local dereferenceable(8) %struct.latLong** @_ZNK9__gnu_cxx17__normal_iteratorIP7latLongSt6vectorIS1_SaIS1_EEE4baseEv(%"class.__gnu_cxx::__normal_iterator"* %this) #4 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::__normal_iterator"*, align 8 + store %"class.__gnu_cxx::__normal_iterator"* %this, %"class.__gnu_cxx::__normal_iterator"** %this.addr, align 8 + %this1 = load %"class.__gnu_cxx::__normal_iterator"*, %"class.__gnu_cxx::__normal_iterator"** %this.addr, align 8 + %_M_current = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %this1, i32 0, i32 0 + ret %struct.latLong** %_M_current +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx14__alloc_traitsISaI7latLongEE7destroyERS2_PS1_(%"class.std::allocator.2"* dereferenceable(1) %__a, %struct.latLong* %__p) #0 comdat align 2 { +entry: + %__a.addr = alloca %"class.std::allocator.2"*, align 8 + %__p.addr = alloca %struct.latLong*, align 8 + store %"class.std::allocator.2"* %__a, %"class.std::allocator.2"** %__a.addr, align 8 + store %struct.latLong* %__p, %struct.latLong** %__p.addr, align 8 + %0 = load %"class.std::allocator.2"*, %"class.std::allocator.2"** %__a.addr, align 8 + %1 = bitcast %"class.std::allocator.2"* %0 to %"class.__gnu_cxx::new_allocator.3"* + 
%2 = load %struct.latLong*, %struct.latLong** %__p.addr, align 8 + call void @_ZN9__gnu_cxx13new_allocatorI7latLongE7destroyEPS1_(%"class.__gnu_cxx::new_allocator.3"* %1, %struct.latLong* %2) + ret void +} + +declare dso_local void @__cxa_rethrow() + +declare dso_local void @__cxa_end_catch() + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local i64 @_ZNKSt6vectorI7latLongSaIS0_EE8max_sizeEv(%"class.std::vector.0"* %this) #0 comdat align 2 { +entry: + %this.addr = alloca %"class.std::vector.0"*, align 8 + store %"class.std::vector.0"* %this, %"class.std::vector.0"** %this.addr, align 8 + %this1 = load %"class.std::vector.0"*, %"class.std::vector.0"** %this.addr, align 8 + %0 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Vector_base.1"* + %call = call dereferenceable(1) %"class.std::allocator.2"* @_ZNKSt12_Vector_baseI7latLongSaIS0_EE19_M_get_Tp_allocatorEv(%"struct.std::_Vector_base.1"* %0) + %call2 = call i64 @_ZN9__gnu_cxx14__alloc_traitsISaI7latLongEE8max_sizeERKS2_(%"class.std::allocator.2"* dereferenceable(1) %call) + ret i64 %call2 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local i64 @_ZNKSt6vectorI7latLongSaIS0_EE4sizeEv(%"class.std::vector.0"* %this) #4 comdat align 2 { +entry: + %this.addr = alloca %"class.std::vector.0"*, align 8 + store %"class.std::vector.0"* %this, %"class.std::vector.0"** %this.addr, align 8 + %this1 = load %"class.std::vector.0"*, %"class.std::vector.0"** %this.addr, align 8 + %0 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Vector_base.1"* + %_M_impl = getelementptr inbounds %"struct.std::_Vector_base.1", %"struct.std::_Vector_base.1"* %0, i32 0, i32 0 + %_M_finish = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl, i32 0, i32 1 + %1 = load %struct.latLong*, %struct.latLong** %_M_finish, align 8 + %2 = bitcast %"class.std::vector.0"* %this1 to %"struct.std::_Vector_base.1"* 
+ %_M_impl2 = getelementptr inbounds %"struct.std::_Vector_base.1", %"struct.std::_Vector_base.1"* %2, i32 0, i32 0 + %_M_start = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl2, i32 0, i32 0 + %3 = load %struct.latLong*, %struct.latLong** %_M_start, align 8 + %sub.ptr.lhs.cast = ptrtoint %struct.latLong* %1 to i64 + %sub.ptr.rhs.cast = ptrtoint %struct.latLong* %3 to i64 + %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast + %sub.ptr.div = sdiv exact i64 %sub.ptr.sub, 8 + ret i64 %sub.ptr.div +} + +; Function Attrs: noreturn +declare dso_local void @_ZSt20__throw_length_errorPKc(i8*) #10 + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local dereferenceable(8) i64* @_ZSt3maxImERKT_S2_S2_(i64* dereferenceable(8) %__a, i64* dereferenceable(8) %__b) #4 comdat { +entry: + %retval = alloca i64*, align 8 + %__a.addr = alloca i64*, align 8 + %__b.addr = alloca i64*, align 8 + store i64* %__a, i64** %__a.addr, align 8 + store i64* %__b, i64** %__b.addr, align 8 + %0 = load i64*, i64** %__a.addr, align 8 + %1 = load i64, i64* %0, align 8 + %2 = load i64*, i64** %__b.addr, align 8 + %3 = load i64, i64* %2, align 8 + %cmp = icmp ult i64 %1, %3 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %4 = load i64*, i64** %__b.addr, align 8 + store i64* %4, i64** %retval, align 8 + br label %return + +if.end: ; preds = %entry + %5 = load i64*, i64** %__a.addr, align 8 + store i64* %5, i64** %retval, align 8 + br label %return + +return: ; preds = %if.end, %if.then + %6 = load i64*, i64** %retval, align 8 + ret i64* %6 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local i64 @_ZN9__gnu_cxx14__alloc_traitsISaI7latLongEE8max_sizeERKS2_(%"class.std::allocator.2"* dereferenceable(1) %__a) #4 comdat align 2 { +entry: + %__a.addr = alloca %"class.std::allocator.2"*, align 8 + store %"class.std::allocator.2"* %__a, 
%"class.std::allocator.2"** %__a.addr, align 8 + %0 = load %"class.std::allocator.2"*, %"class.std::allocator.2"** %__a.addr, align 8 + %1 = bitcast %"class.std::allocator.2"* %0 to %"class.__gnu_cxx::new_allocator.3"* + %call = call i64 @_ZNK9__gnu_cxx13new_allocatorI7latLongE8max_sizeEv(%"class.__gnu_cxx::new_allocator.3"* %1) #12 + ret i64 %call +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local dereferenceable(1) %"class.std::allocator.2"* @_ZNKSt12_Vector_baseI7latLongSaIS0_EE19_M_get_Tp_allocatorEv(%"struct.std::_Vector_base.1"* %this) #4 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Vector_base.1"*, align 8 + store %"struct.std::_Vector_base.1"* %this, %"struct.std::_Vector_base.1"** %this.addr, align 8 + %this1 = load %"struct.std::_Vector_base.1"*, %"struct.std::_Vector_base.1"** %this.addr, align 8 + %_M_impl = getelementptr inbounds %"struct.std::_Vector_base.1", %"struct.std::_Vector_base.1"* %this1, i32 0, i32 0 + %0 = bitcast %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl to %"class.std::allocator.2"* + ret %"class.std::allocator.2"* %0 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local i64 @_ZNK9__gnu_cxx13new_allocatorI7latLongE8max_sizeEv(%"class.__gnu_cxx::new_allocator.3"* %this) #4 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::new_allocator.3"*, align 8 + store %"class.__gnu_cxx::new_allocator.3"* %this, %"class.__gnu_cxx::new_allocator.3"** %this.addr, align 8 + %this1 = load %"class.__gnu_cxx::new_allocator.3"*, %"class.__gnu_cxx::new_allocator.3"** %this.addr, align 8 + ret i64 2305843009213693951 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx17__normal_iteratorIP7latLongSt6vectorIS1_SaIS1_EEEC2ERKS2_(%"class.__gnu_cxx::__normal_iterator"* %this, %struct.latLong** dereferenceable(8) %__i) unnamed_addr #4 comdat align 2 { +entry: + %this.addr = alloca 
%"class.__gnu_cxx::__normal_iterator"*, align 8 + %__i.addr = alloca %struct.latLong**, align 8 + store %"class.__gnu_cxx::__normal_iterator"* %this, %"class.__gnu_cxx::__normal_iterator"** %this.addr, align 8 + store %struct.latLong** %__i, %struct.latLong*** %__i.addr, align 8 + %this1 = load %"class.__gnu_cxx::__normal_iterator"*, %"class.__gnu_cxx::__normal_iterator"** %this.addr, align 8 + %_M_current = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator", %"class.__gnu_cxx::__normal_iterator"* %this1, i32 0, i32 0 + %0 = load %struct.latLong**, %struct.latLong*** %__i.addr, align 8 + %1 = load %struct.latLong*, %struct.latLong** %0, align 8 + store %struct.latLong* %1, %struct.latLong** %_M_current, align 8 + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %struct.latLong* @_ZN9__gnu_cxx14__alloc_traitsISaI7latLongEE8allocateERS2_m(%"class.std::allocator.2"* dereferenceable(1) %__a, i64 %__n) #0 comdat align 2 { +entry: + %__a.addr = alloca %"class.std::allocator.2"*, align 8 + %__n.addr = alloca i64, align 8 + store %"class.std::allocator.2"* %__a, %"class.std::allocator.2"** %__a.addr, align 8 + store i64 %__n, i64* %__n.addr, align 8 + %0 = load %"class.std::allocator.2"*, %"class.std::allocator.2"** %__a.addr, align 8 + %1 = bitcast %"class.std::allocator.2"* %0 to %"class.__gnu_cxx::new_allocator.3"* + %2 = load i64, i64* %__n.addr, align 8 + %call = call %struct.latLong* @_ZN9__gnu_cxx13new_allocatorI7latLongE8allocateEmPKv(%"class.__gnu_cxx::new_allocator.3"* %1, i64 %2, i8* null) + ret %struct.latLong* %call +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %struct.latLong* @_ZN9__gnu_cxx13new_allocatorI7latLongE8allocateEmPKv(%"class.__gnu_cxx::new_allocator.3"* %this, i64 %__n, i8* %0) #0 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::new_allocator.3"*, align 8 + %__n.addr = alloca i64, align 8 + %.addr = alloca i8*, align 8 + store 
%"class.__gnu_cxx::new_allocator.3"* %this, %"class.__gnu_cxx::new_allocator.3"** %this.addr, align 8 + store i64 %__n, i64* %__n.addr, align 8 + store i8* %0, i8** %.addr, align 8 + %this1 = load %"class.__gnu_cxx::new_allocator.3"*, %"class.__gnu_cxx::new_allocator.3"** %this.addr, align 8 + %1 = load i64, i64* %__n.addr, align 8 + %call = call i64 @_ZNK9__gnu_cxx13new_allocatorI7latLongE8max_sizeEv(%"class.__gnu_cxx::new_allocator.3"* %this1) #12 + %cmp = icmp ugt i64 %1, %call + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + call void @_ZSt17__throw_bad_allocv() #15 + unreachable + +if.end: ; preds = %entry + %2 = load i64, i64* %__n.addr, align 8 + %mul = mul i64 %2, 8 + %call2 = call i8* @_Znwm(i64 %mul) + %3 = bitcast i8* %call2 to %struct.latLong* + ret %struct.latLong* %3 +} + +; Function Attrs: noreturn +declare dso_local void @_ZSt17__throw_bad_allocv() #10 + +; Function Attrs: nobuiltin +declare dso_local noalias i8* @_Znwm(i64) #11 + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %struct.latLong* @_ZSt22__uninitialized_copy_aIP7latLongS1_S0_ET0_T_S3_S2_RSaIT1_E(%struct.latLong* %__first, %struct.latLong* %__last, %struct.latLong* %__result, %"class.std::allocator.2"* dereferenceable(1) %0) #0 comdat { +entry: + %__first.addr = alloca %struct.latLong*, align 8 + %__last.addr = alloca %struct.latLong*, align 8 + %__result.addr = alloca %struct.latLong*, align 8 + %.addr = alloca %"class.std::allocator.2"*, align 8 + store %struct.latLong* %__first, %struct.latLong** %__first.addr, align 8 + store %struct.latLong* %__last, %struct.latLong** %__last.addr, align 8 + store %struct.latLong* %__result, %struct.latLong** %__result.addr, align 8 + store %"class.std::allocator.2"* %0, %"class.std::allocator.2"** %.addr, align 8 + %1 = load %struct.latLong*, %struct.latLong** %__first.addr, align 8 + %2 = load %struct.latLong*, %struct.latLong** %__last.addr, align 8 + %3 = load %struct.latLong*, 
%struct.latLong** %__result.addr, align 8 + %call = call %struct.latLong* @_ZSt18uninitialized_copyIP7latLongS1_ET0_T_S3_S2_(%struct.latLong* %1, %struct.latLong* %2, %struct.latLong* %3) + ret %struct.latLong* %call +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %struct.latLong* @_ZSt18uninitialized_copyIP7latLongS1_ET0_T_S3_S2_(%struct.latLong* %__first, %struct.latLong* %__last, %struct.latLong* %__result) #0 comdat { +entry: + %__first.addr = alloca %struct.latLong*, align 8 + %__last.addr = alloca %struct.latLong*, align 8 + %__result.addr = alloca %struct.latLong*, align 8 + %__assignable = alloca i8, align 1 + store %struct.latLong* %__first, %struct.latLong** %__first.addr, align 8 + store %struct.latLong* %__last, %struct.latLong** %__last.addr, align 8 + store %struct.latLong* %__result, %struct.latLong** %__result.addr, align 8 + store i8 1, i8* %__assignable, align 1 + %0 = load %struct.latLong*, %struct.latLong** %__first.addr, align 8 + %1 = load %struct.latLong*, %struct.latLong** %__last.addr, align 8 + %2 = load %struct.latLong*, %struct.latLong** %__result.addr, align 8 + %call = call %struct.latLong* @_ZNSt20__uninitialized_copyILb1EE13__uninit_copyIP7latLongS3_EET0_T_S5_S4_(%struct.latLong* %0, %struct.latLong* %1, %struct.latLong* %2) + ret %struct.latLong* %call +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %struct.latLong* @_ZNSt20__uninitialized_copyILb1EE13__uninit_copyIP7latLongS3_EET0_T_S5_S4_(%struct.latLong* %__first, %struct.latLong* %__last, %struct.latLong* %__result) #0 comdat align 2 { +entry: + %__first.addr = alloca %struct.latLong*, align 8 + %__last.addr = alloca %struct.latLong*, align 8 + %__result.addr = alloca %struct.latLong*, align 8 + store %struct.latLong* %__first, %struct.latLong** %__first.addr, align 8 + store %struct.latLong* %__last, %struct.latLong** %__last.addr, align 8 + store %struct.latLong* %__result, %struct.latLong** %__result.addr, align 
8 + %0 = load %struct.latLong*, %struct.latLong** %__first.addr, align 8 + %1 = load %struct.latLong*, %struct.latLong** %__last.addr, align 8 + %2 = load %struct.latLong*, %struct.latLong** %__result.addr, align 8 + %call = call %struct.latLong* @_ZSt4copyIP7latLongS1_ET0_T_S3_S2_(%struct.latLong* %0, %struct.latLong* %1, %struct.latLong* %2) + ret %struct.latLong* %call +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %struct.latLong* @_ZSt4copyIP7latLongS1_ET0_T_S3_S2_(%struct.latLong* %__first, %struct.latLong* %__last, %struct.latLong* %__result) #0 comdat { +entry: + %__first.addr = alloca %struct.latLong*, align 8 + %__last.addr = alloca %struct.latLong*, align 8 + %__result.addr = alloca %struct.latLong*, align 8 + store %struct.latLong* %__first, %struct.latLong** %__first.addr, align 8 + store %struct.latLong* %__last, %struct.latLong** %__last.addr, align 8 + store %struct.latLong* %__result, %struct.latLong** %__result.addr, align 8 + %0 = load %struct.latLong*, %struct.latLong** %__first.addr, align 8 + %call = call %struct.latLong* @_ZSt12__miter_baseIP7latLongET_S2_(%struct.latLong* %0) + %1 = load %struct.latLong*, %struct.latLong** %__last.addr, align 8 + %call1 = call %struct.latLong* @_ZSt12__miter_baseIP7latLongET_S2_(%struct.latLong* %1) + %2 = load %struct.latLong*, %struct.latLong** %__result.addr, align 8 + %call2 = call %struct.latLong* @_ZSt14__copy_move_a2ILb0EP7latLongS1_ET1_T0_S3_S2_(%struct.latLong* %call, %struct.latLong* %call1, %struct.latLong* %2) + ret %struct.latLong* %call2 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %struct.latLong* @_ZSt14__copy_move_a2ILb0EP7latLongS1_ET1_T0_S3_S2_(%struct.latLong* %__first, %struct.latLong* %__last, %struct.latLong* %__result) #0 comdat { +entry: + %__first.addr = alloca %struct.latLong*, align 8 + %__last.addr = alloca %struct.latLong*, align 8 + %__result.addr = alloca %struct.latLong*, align 8 + store %struct.latLong* 
%__first, %struct.latLong** %__first.addr, align 8 + store %struct.latLong* %__last, %struct.latLong** %__last.addr, align 8 + store %struct.latLong* %__result, %struct.latLong** %__result.addr, align 8 + %0 = load %struct.latLong*, %struct.latLong** %__first.addr, align 8 + %call = call %struct.latLong* @_ZSt12__niter_baseIP7latLongET_S2_(%struct.latLong* %0) + %1 = load %struct.latLong*, %struct.latLong** %__last.addr, align 8 + %call1 = call %struct.latLong* @_ZSt12__niter_baseIP7latLongET_S2_(%struct.latLong* %1) + %2 = load %struct.latLong*, %struct.latLong** %__result.addr, align 8 + %call2 = call %struct.latLong* @_ZSt12__niter_baseIP7latLongET_S2_(%struct.latLong* %2) + %call3 = call %struct.latLong* @_ZSt13__copy_move_aILb0EP7latLongS1_ET1_T0_S3_S2_(%struct.latLong* %call, %struct.latLong* %call1, %struct.latLong* %call2) + ret %struct.latLong* %call3 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local %struct.latLong* @_ZSt12__miter_baseIP7latLongET_S2_(%struct.latLong* %__it) #4 comdat { +entry: + %__it.addr = alloca %struct.latLong*, align 8 + store %struct.latLong* %__it, %struct.latLong** %__it.addr, align 8 + %0 = load %struct.latLong*, %struct.latLong** %__it.addr, align 8 + ret %struct.latLong* %0 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %struct.latLong* @_ZSt13__copy_move_aILb0EP7latLongS1_ET1_T0_S3_S2_(%struct.latLong* %__first, %struct.latLong* %__last, %struct.latLong* %__result) #0 comdat { +entry: + %__first.addr = alloca %struct.latLong*, align 8 + %__last.addr = alloca %struct.latLong*, align 8 + %__result.addr = alloca %struct.latLong*, align 8 + %__simple = alloca i8, align 1 + store %struct.latLong* %__first, %struct.latLong** %__first.addr, align 8 + store %struct.latLong* %__last, %struct.latLong** %__last.addr, align 8 + store %struct.latLong* %__result, %struct.latLong** %__result.addr, align 8 + store i8 1, i8* %__simple, align 1 + %0 = load 
%struct.latLong*, %struct.latLong** %__first.addr, align 8 + %1 = load %struct.latLong*, %struct.latLong** %__last.addr, align 8 + %2 = load %struct.latLong*, %struct.latLong** %__result.addr, align 8 + %call = call %struct.latLong* @_ZNSt11__copy_moveILb0ELb1ESt26random_access_iterator_tagE8__copy_mI7latLongEEPT_PKS4_S7_S5_(%struct.latLong* %0, %struct.latLong* %1, %struct.latLong* %2) + ret %struct.latLong* %call +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local %struct.latLong* @_ZSt12__niter_baseIP7latLongET_S2_(%struct.latLong* %__it) #4 comdat { +entry: + %__it.addr = alloca %struct.latLong*, align 8 + store %struct.latLong* %__it, %struct.latLong** %__it.addr, align 8 + %0 = load %struct.latLong*, %struct.latLong** %__it.addr, align 8 + ret %struct.latLong* %0 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local %struct.latLong* @_ZNSt11__copy_moveILb0ELb1ESt26random_access_iterator_tagE8__copy_mI7latLongEEPT_PKS4_S7_S5_(%struct.latLong* %__first, %struct.latLong* %__last, %struct.latLong* %__result) #4 comdat align 2 { +entry: + %__first.addr = alloca %struct.latLong*, align 8 + %__last.addr = alloca %struct.latLong*, align 8 + %__result.addr = alloca %struct.latLong*, align 8 + %_Num = alloca i64, align 8 + store %struct.latLong* %__first, %struct.latLong** %__first.addr, align 8 + store %struct.latLong* %__last, %struct.latLong** %__last.addr, align 8 + store %struct.latLong* %__result, %struct.latLong** %__result.addr, align 8 + %0 = load %struct.latLong*, %struct.latLong** %__last.addr, align 8 + %1 = load %struct.latLong*, %struct.latLong** %__first.addr, align 8 + %sub.ptr.lhs.cast = ptrtoint %struct.latLong* %0 to i64 + %sub.ptr.rhs.cast = ptrtoint %struct.latLong* %1 to i64 + %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast + %sub.ptr.div = sdiv exact i64 %sub.ptr.sub, 8 + store i64 %sub.ptr.div, i64* %_Num, align 8 + %2 = load i64, i64* %_Num, align 8 + %tobool = 
icmp ne i64 %2, 0 + br i1 %tobool, label %if.then, label %if.end + +if.then: ; preds = %entry + %3 = load %struct.latLong*, %struct.latLong** %__result.addr, align 8 + %4 = bitcast %struct.latLong* %3 to i8* + %5 = load %struct.latLong*, %struct.latLong** %__first.addr, align 8 + %6 = bitcast %struct.latLong* %5 to i8* + %7 = load i64, i64* %_Num, align 8 + %mul = mul i64 8, %7 + call void @llvm.memmove.p0i8.p0i8.i64(i8* align 4 %4, i8* align 4 %6, i64 %mul, i1 false) + br label %if.end + +if.end: ; preds = %if.then, %entry + %8 = load %struct.latLong*, %struct.latLong** %__result.addr, align 8 + %9 = load i64, i64* %_Num, align 8 + %add.ptr = getelementptr inbounds %struct.latLong, %struct.latLong* %8, i64 %9 + ret %struct.latLong* %add.ptr +} + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1 immarg) #1 + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx13new_allocatorI7latLongE7destroyEPS1_(%"class.__gnu_cxx::new_allocator.3"* %this, %struct.latLong* %__p) #4 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::new_allocator.3"*, align 8 + %__p.addr = alloca %struct.latLong*, align 8 + store %"class.__gnu_cxx::new_allocator.3"* %this, %"class.__gnu_cxx::new_allocator.3"** %this.addr, align 8 + store %struct.latLong* %__p, %struct.latLong** %__p.addr, align 8 + %this1 = load %"class.__gnu_cxx::new_allocator.3"*, %"class.__gnu_cxx::new_allocator.3"** %this.addr, align 8 + %0 = load %struct.latLong*, %struct.latLong** %__p.addr, align 8 + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx14__alloc_traitsISaI6recordEE9constructIS1_EEvRS2_PS1_RKT_(%"class.std::allocator"* dereferenceable(1) %__a, %struct.record* %__p, %struct.record* dereferenceable(60) %__arg) #0 comdat align 2 { +entry: + %__a.addr = alloca %"class.std::allocator"*, align 8 + %__p.addr = 
alloca %struct.record*, align 8 + %__arg.addr = alloca %struct.record*, align 8 + store %"class.std::allocator"* %__a, %"class.std::allocator"** %__a.addr, align 8 + store %struct.record* %__p, %struct.record** %__p.addr, align 8 + store %struct.record* %__arg, %struct.record** %__arg.addr, align 8 + %0 = load %"class.std::allocator"*, %"class.std::allocator"** %__a.addr, align 8 + %1 = bitcast %"class.std::allocator"* %0 to %"class.__gnu_cxx::new_allocator"* + %2 = load %struct.record*, %struct.record** %__p.addr, align 8 + %3 = load %struct.record*, %struct.record** %__arg.addr, align 8 + call void @_ZN9__gnu_cxx13new_allocatorI6recordE9constructEPS1_RKS1_(%"class.__gnu_cxx::new_allocator"* %1, %struct.record* %2, %struct.record* dereferenceable(60) %3) + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZNSt6vectorI6recordSaIS0_EE17_M_realloc_insertEN9__gnu_cxx17__normal_iteratorIPS0_S2_EERKS0_(%"class.std::vector"* %this, %struct.record* %__position.coerce, %struct.record* dereferenceable(60) %__x) #0 comdat align 2 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %__position = alloca %"class.__gnu_cxx::__normal_iterator.5", align 8 + %this.addr = alloca %"class.std::vector"*, align 8 + %__x.addr = alloca %struct.record*, align 8 + %__len = alloca i64, align 8 + %__elems_before = alloca i64, align 8 + %ref.tmp = alloca %"class.__gnu_cxx::__normal_iterator.5", align 8 + %__new_start = alloca %struct.record*, align 8 + %__new_finish = alloca %struct.record*, align 8 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + %coerce.dive = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.5", %"class.__gnu_cxx::__normal_iterator.5"* %__position, i32 0, i32 0 + store %struct.record* %__position.coerce, %struct.record** %coerce.dive, align 8 + store %"class.std::vector"* %this, %"class.std::vector"** %this.addr, align 8 + store %struct.record* %__x, %struct.record** %__x.addr, align 
8 + %this1 = load %"class.std::vector"*, %"class.std::vector"** %this.addr, align 8 + %call = call i64 @_ZNKSt6vectorI6recordSaIS0_EE12_M_check_lenEmPKc(%"class.std::vector"* %this1, i64 1, i8* getelementptr inbounds ([26 x i8], [26 x i8]* @.str.28, i64 0, i64 0)) + store i64 %call, i64* %__len, align 8 + %call2 = call %struct.record* @_ZNSt6vectorI6recordSaIS0_EE5beginEv(%"class.std::vector"* %this1) + %coerce.dive3 = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.5", %"class.__gnu_cxx::__normal_iterator.5"* %ref.tmp, i32 0, i32 0 + store %struct.record* %call2, %struct.record** %coerce.dive3, align 8 + %call4 = call i64 @_ZN9__gnu_cxxmiIP6recordSt6vectorIS1_SaIS1_EEEENS_17__normal_iteratorIT_T0_E15difference_typeERKS9_SC_(%"class.__gnu_cxx::__normal_iterator.5"* dereferenceable(8) %__position, %"class.__gnu_cxx::__normal_iterator.5"* dereferenceable(8) %ref.tmp) + store i64 %call4, i64* %__elems_before, align 8 + %0 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %1 = load i64, i64* %__len, align 8 + %call5 = call %struct.record* @_ZNSt12_Vector_baseI6recordSaIS0_EE11_M_allocateEm(%"struct.std::_Vector_base"* %0, i64 %1) + store %struct.record* %call5, %struct.record** %__new_start, align 8 + %2 = load %struct.record*, %struct.record** %__new_start, align 8 + store %struct.record* %2, %struct.record** %__new_finish, align 8 + %3 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %3, i32 0, i32 0 + %4 = bitcast %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl to %"class.std::allocator"* + %5 = load %struct.record*, %struct.record** %__new_start, align 8 + %6 = load i64, i64* %__elems_before, align 8 + %add.ptr = getelementptr inbounds %struct.record, %struct.record* %5, i64 %6 + %7 = load %struct.record*, %struct.record** %__x.addr, align 8 + invoke void 
@_ZN9__gnu_cxx14__alloc_traitsISaI6recordEE9constructIS1_EEvRS2_PS1_RKT_(%"class.std::allocator"* dereferenceable(1) %4, %struct.record* %add.ptr, %struct.record* dereferenceable(60) %7) + to label %invoke.cont unwind label %lpad + +invoke.cont: ; preds = %entry + store %struct.record* null, %struct.record** %__new_finish, align 8 + %8 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl6 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %8, i32 0, i32 0 + %_M_start = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl6, i32 0, i32 0 + %9 = load %struct.record*, %struct.record** %_M_start, align 8 + %call8 = invoke dereferenceable(8) %struct.record** @_ZNK9__gnu_cxx17__normal_iteratorIP6recordSt6vectorIS1_SaIS1_EEE4baseEv(%"class.__gnu_cxx::__normal_iterator.5"* %__position) + to label %invoke.cont7 unwind label %lpad + +invoke.cont7: ; preds = %invoke.cont + %10 = load %struct.record*, %struct.record** %call8, align 8 + %11 = load %struct.record*, %struct.record** %__new_start, align 8 + %12 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %call10 = invoke dereferenceable(1) %"class.std::allocator"* @_ZNSt12_Vector_baseI6recordSaIS0_EE19_M_get_Tp_allocatorEv(%"struct.std::_Vector_base"* %12) + to label %invoke.cont9 unwind label %lpad + +invoke.cont9: ; preds = %invoke.cont7 + %call12 = invoke %struct.record* @_ZSt34__uninitialized_move_if_noexcept_aIP6recordS1_SaIS0_EET0_T_S4_S3_RT1_(%struct.record* %9, %struct.record* %10, %struct.record* %11, %"class.std::allocator"* dereferenceable(1) %call10) + to label %invoke.cont11 unwind label %lpad + +invoke.cont11: ; preds = %invoke.cont9 + store %struct.record* %call12, %struct.record** %__new_finish, align 8 + %13 = load %struct.record*, %struct.record** %__new_finish, align 8 + %incdec.ptr = getelementptr inbounds %struct.record, %struct.record* %13, i32 1 + store 
%struct.record* %incdec.ptr, %struct.record** %__new_finish, align 8 + %call14 = invoke dereferenceable(8) %struct.record** @_ZNK9__gnu_cxx17__normal_iteratorIP6recordSt6vectorIS1_SaIS1_EEE4baseEv(%"class.__gnu_cxx::__normal_iterator.5"* %__position) + to label %invoke.cont13 unwind label %lpad + +invoke.cont13: ; preds = %invoke.cont11 + %14 = load %struct.record*, %struct.record** %call14, align 8 + %15 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl15 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %15, i32 0, i32 0 + %_M_finish = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl15, i32 0, i32 1 + %16 = load %struct.record*, %struct.record** %_M_finish, align 8 + %17 = load %struct.record*, %struct.record** %__new_finish, align 8 + %18 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %call17 = invoke dereferenceable(1) %"class.std::allocator"* @_ZNSt12_Vector_baseI6recordSaIS0_EE19_M_get_Tp_allocatorEv(%"struct.std::_Vector_base"* %18) + to label %invoke.cont16 unwind label %lpad + +invoke.cont16: ; preds = %invoke.cont13 + %call19 = invoke %struct.record* @_ZSt34__uninitialized_move_if_noexcept_aIP6recordS1_SaIS0_EET0_T_S4_S3_RT1_(%struct.record* %14, %struct.record* %16, %struct.record* %17, %"class.std::allocator"* dereferenceable(1) %call17) + to label %invoke.cont18 unwind label %lpad + +invoke.cont18: ; preds = %invoke.cont16 + store %struct.record* %call19, %struct.record** %__new_finish, align 8 + br label %try.cont + +lpad: ; preds = %invoke.cont16, %invoke.cont13, %invoke.cont11, %invoke.cont9, %invoke.cont7, %invoke.cont, %entry + %19 = landingpad { i8*, i32 } + catch i8* null + %20 = extractvalue { i8*, i32 } %19, 0 + store i8* %20, i8** %exn.slot, align 8 + %21 = extractvalue { i8*, i32 } %19, 1 + store i32 %21, i32* %ehselector.slot, align 4 + br label %catch + +catch: ; preds = %lpad + 
%exn = load i8*, i8** %exn.slot, align 8 + %22 = call i8* @__cxa_begin_catch(i8* %exn) #12 + %23 = load %struct.record*, %struct.record** %__new_finish, align 8 + %tobool = icmp ne %struct.record* %23, null + br i1 %tobool, label %if.else, label %if.then + +if.then: ; preds = %catch + %24 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl20 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %24, i32 0, i32 0 + %25 = bitcast %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl20 to %"class.std::allocator"* + %26 = load %struct.record*, %struct.record** %__new_start, align 8 + %27 = load i64, i64* %__elems_before, align 8 + %add.ptr21 = getelementptr inbounds %struct.record, %struct.record* %26, i64 %27 + invoke void @_ZN9__gnu_cxx14__alloc_traitsISaI6recordEE7destroyERS2_PS1_(%"class.std::allocator"* dereferenceable(1) %25, %struct.record* %add.ptr21) + to label %invoke.cont23 unwind label %lpad22 + +invoke.cont23: ; preds = %if.then + br label %if.end + +lpad22: ; preds = %invoke.cont27, %if.end, %invoke.cont24, %if.else, %if.then + %28 = landingpad { i8*, i32 } + cleanup + %29 = extractvalue { i8*, i32 } %28, 0 + store i8* %29, i8** %exn.slot, align 8 + %30 = extractvalue { i8*, i32 } %28, 1 + store i32 %30, i32* %ehselector.slot, align 4 + invoke void @__cxa_end_catch() + to label %invoke.cont28 unwind label %terminate.lpad + +if.else: ; preds = %catch + %31 = load %struct.record*, %struct.record** %__new_start, align 8 + %32 = load %struct.record*, %struct.record** %__new_finish, align 8 + %33 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %call25 = invoke dereferenceable(1) %"class.std::allocator"* @_ZNSt12_Vector_baseI6recordSaIS0_EE19_M_get_Tp_allocatorEv(%"struct.std::_Vector_base"* %33) + to label %invoke.cont24 unwind label %lpad22 + +invoke.cont24: ; preds = %if.else + invoke void @_ZSt8_DestroyIP6recordS0_EvT_S2_RSaIT0_E(%struct.record* %31, %struct.record* %32, 
%"class.std::allocator"* dereferenceable(1) %call25) + to label %invoke.cont26 unwind label %lpad22 + +invoke.cont26: ; preds = %invoke.cont24 + br label %if.end + +if.end: ; preds = %invoke.cont26, %invoke.cont23 + %34 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %35 = load %struct.record*, %struct.record** %__new_start, align 8 + %36 = load i64, i64* %__len, align 8 + invoke void @_ZNSt12_Vector_baseI6recordSaIS0_EE13_M_deallocateEPS0_m(%"struct.std::_Vector_base"* %34, %struct.record* %35, i64 %36) + to label %invoke.cont27 unwind label %lpad22 + +invoke.cont27: ; preds = %if.end + invoke void @__cxa_rethrow() #15 + to label %unreachable unwind label %lpad22 + +invoke.cont28: ; preds = %lpad22 + br label %eh.resume + +try.cont: ; preds = %invoke.cont18 + %37 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl29 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %37, i32 0, i32 0 + %_M_start30 = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl29, i32 0, i32 0 + %38 = load %struct.record*, %struct.record** %_M_start30, align 8 + %39 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl31 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %39, i32 0, i32 0 + %_M_finish32 = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl31, i32 0, i32 1 + %40 = load %struct.record*, %struct.record** %_M_finish32, align 8 + %41 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %call33 = call dereferenceable(1) %"class.std::allocator"* @_ZNSt12_Vector_baseI6recordSaIS0_EE19_M_get_Tp_allocatorEv(%"struct.std::_Vector_base"* %41) + call void @_ZSt8_DestroyIP6recordS0_EvT_S2_RSaIT0_E(%struct.record* %38, %struct.record* %40, %"class.std::allocator"* dereferenceable(1) 
%call33) + %42 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %43 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl34 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %43, i32 0, i32 0 + %_M_start35 = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl34, i32 0, i32 0 + %44 = load %struct.record*, %struct.record** %_M_start35, align 8 + %45 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl36 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %45, i32 0, i32 0 + %_M_end_of_storage = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl36, i32 0, i32 2 + %46 = load %struct.record*, %struct.record** %_M_end_of_storage, align 8 + %47 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl37 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %47, i32 0, i32 0 + %_M_start38 = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl37, i32 0, i32 0 + %48 = load %struct.record*, %struct.record** %_M_start38, align 8 + %sub.ptr.lhs.cast = ptrtoint %struct.record* %46 to i64 + %sub.ptr.rhs.cast = ptrtoint %struct.record* %48 to i64 + %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast + %sub.ptr.div = sdiv exact i64 %sub.ptr.sub, 60 + call void @_ZNSt12_Vector_baseI6recordSaIS0_EE13_M_deallocateEPS0_m(%"struct.std::_Vector_base"* %42, %struct.record* %44, i64 %sub.ptr.div) + %49 = load %struct.record*, %struct.record** %__new_start, align 8 + %50 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl39 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %50, i32 0, i32 0 + %_M_start40 = getelementptr 
inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl39, i32 0, i32 0 + store %struct.record* %49, %struct.record** %_M_start40, align 8 + %51 = load %struct.record*, %struct.record** %__new_finish, align 8 + %52 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl41 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %52, i32 0, i32 0 + %_M_finish42 = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl41, i32 0, i32 1 + store %struct.record* %51, %struct.record** %_M_finish42, align 8 + %53 = load %struct.record*, %struct.record** %__new_start, align 8 + %54 = load i64, i64* %__len, align 8 + %add.ptr43 = getelementptr inbounds %struct.record, %struct.record* %53, i64 %54 + %55 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl44 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %55, i32 0, i32 0 + %_M_end_of_storage45 = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl44, i32 0, i32 2 + store %struct.record* %add.ptr43, %struct.record** %_M_end_of_storage45, align 8 + ret void + +eh.resume: ; preds = %invoke.cont28 + %exn46 = load i8*, i8** %exn.slot, align 8 + %sel = load i32, i32* %ehselector.slot, align 4 + %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn46, 0 + %lpad.val47 = insertvalue { i8*, i32 } %lpad.val, i32 %sel, 1 + resume { i8*, i32 } %lpad.val47 + +terminate.lpad: ; preds = %lpad22 + %56 = landingpad { i8*, i32 } + catch i8* null + %57 = extractvalue { i8*, i32 } %56, 0 + call void @__clang_call_terminate(i8* %57) #13 + unreachable + +unreachable: ; preds = %invoke.cont27 + unreachable +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %struct.record* 
@_ZNSt6vectorI6recordSaIS0_EE3endEv(%"class.std::vector"* %this) #0 comdat align 2 { +entry: + %retval = alloca %"class.__gnu_cxx::__normal_iterator.5", align 8 + %this.addr = alloca %"class.std::vector"*, align 8 + store %"class.std::vector"* %this, %"class.std::vector"** %this.addr, align 8 + %this1 = load %"class.std::vector"*, %"class.std::vector"** %this.addr, align 8 + %0 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %0, i32 0, i32 0 + %_M_finish = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl, i32 0, i32 1 + call void @_ZN9__gnu_cxx17__normal_iteratorIP6recordSt6vectorIS1_SaIS1_EEEC2ERKS2_(%"class.__gnu_cxx::__normal_iterator.5"* %retval, %struct.record** dereferenceable(8) %_M_finish) + %coerce.dive = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.5", %"class.__gnu_cxx::__normal_iterator.5"* %retval, i32 0, i32 0 + %1 = load %struct.record*, %struct.record** %coerce.dive, align 8 + ret %struct.record* %1 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx13new_allocatorI6recordE9constructEPS1_RKS1_(%"class.__gnu_cxx::new_allocator"* %this, %struct.record* %__p, %struct.record* dereferenceable(60) %__val) #4 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::new_allocator"*, align 8 + %__p.addr = alloca %struct.record*, align 8 + %__val.addr = alloca %struct.record*, align 8 + store %"class.__gnu_cxx::new_allocator"* %this, %"class.__gnu_cxx::new_allocator"** %this.addr, align 8 + store %struct.record* %__p, %struct.record** %__p.addr, align 8 + store %struct.record* %__val, %struct.record** %__val.addr, align 8 + %this1 = load %"class.__gnu_cxx::new_allocator"*, %"class.__gnu_cxx::new_allocator"** %this.addr, align 8 + %0 = load %struct.record*, %struct.record** %__p.addr, align 8 + 
%1 = bitcast %struct.record* %0 to i8* + %2 = bitcast i8* %1 to %struct.record* + %3 = load %struct.record*, %struct.record** %__val.addr, align 8 + %4 = bitcast %struct.record* %2 to i8* + %5 = bitcast %struct.record* %3 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %4, i8* align 4 %5, i64 60, i1 false) + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local i64 @_ZNKSt6vectorI6recordSaIS0_EE12_M_check_lenEmPKc(%"class.std::vector"* %this, i64 %__n, i8* %__s) #0 comdat align 2 { +entry: + %this.addr = alloca %"class.std::vector"*, align 8 + %__n.addr = alloca i64, align 8 + %__s.addr = alloca i8*, align 8 + %__len = alloca i64, align 8 + %ref.tmp = alloca i64, align 8 + store %"class.std::vector"* %this, %"class.std::vector"** %this.addr, align 8 + store i64 %__n, i64* %__n.addr, align 8 + store i8* %__s, i8** %__s.addr, align 8 + %this1 = load %"class.std::vector"*, %"class.std::vector"** %this.addr, align 8 + %call = call i64 @_ZNKSt6vectorI6recordSaIS0_EE8max_sizeEv(%"class.std::vector"* %this1) + %call2 = call i64 @_ZNKSt6vectorI6recordSaIS0_EE4sizeEv(%"class.std::vector"* %this1) + %sub = sub i64 %call, %call2 + %0 = load i64, i64* %__n.addr, align 8 + %cmp = icmp ult i64 %sub, %0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %1 = load i8*, i8** %__s.addr, align 8 + call void @_ZSt20__throw_length_errorPKc(i8* %1) #15 + unreachable + +if.end: ; preds = %entry + %call3 = call i64 @_ZNKSt6vectorI6recordSaIS0_EE4sizeEv(%"class.std::vector"* %this1) + %call4 = call i64 @_ZNKSt6vectorI6recordSaIS0_EE4sizeEv(%"class.std::vector"* %this1) + store i64 %call4, i64* %ref.tmp, align 8 + %call5 = call dereferenceable(8) i64* @_ZSt3maxImERKT_S2_S2_(i64* dereferenceable(8) %ref.tmp, i64* dereferenceable(8) %__n.addr) + %2 = load i64, i64* %call5, align 8 + %add = add i64 %call3, %2 + store i64 %add, i64* %__len, align 8 + %3 = load i64, i64* %__len, align 8 + %call6 = call i64 
@_ZNKSt6vectorI6recordSaIS0_EE4sizeEv(%"class.std::vector"* %this1) + %cmp7 = icmp ult i64 %3, %call6 + br i1 %cmp7, label %cond.true, label %lor.lhs.false + +lor.lhs.false: ; preds = %if.end + %4 = load i64, i64* %__len, align 8 + %call8 = call i64 @_ZNKSt6vectorI6recordSaIS0_EE8max_sizeEv(%"class.std::vector"* %this1) + %cmp9 = icmp ugt i64 %4, %call8 + br i1 %cmp9, label %cond.true, label %cond.false + +cond.true: ; preds = %lor.lhs.false, %if.end + %call10 = call i64 @_ZNKSt6vectorI6recordSaIS0_EE8max_sizeEv(%"class.std::vector"* %this1) + br label %cond.end + +cond.false: ; preds = %lor.lhs.false + %5 = load i64, i64* %__len, align 8 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i64 [ %call10, %cond.true ], [ %5, %cond.false ] + ret i64 %cond +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local i64 @_ZN9__gnu_cxxmiIP6recordSt6vectorIS1_SaIS1_EEEENS_17__normal_iteratorIT_T0_E15difference_typeERKS9_SC_(%"class.__gnu_cxx::__normal_iterator.5"* dereferenceable(8) %__lhs, %"class.__gnu_cxx::__normal_iterator.5"* dereferenceable(8) %__rhs) #0 comdat { +entry: + %__lhs.addr = alloca %"class.__gnu_cxx::__normal_iterator.5"*, align 8 + %__rhs.addr = alloca %"class.__gnu_cxx::__normal_iterator.5"*, align 8 + store %"class.__gnu_cxx::__normal_iterator.5"* %__lhs, %"class.__gnu_cxx::__normal_iterator.5"** %__lhs.addr, align 8 + store %"class.__gnu_cxx::__normal_iterator.5"* %__rhs, %"class.__gnu_cxx::__normal_iterator.5"** %__rhs.addr, align 8 + %0 = load %"class.__gnu_cxx::__normal_iterator.5"*, %"class.__gnu_cxx::__normal_iterator.5"** %__lhs.addr, align 8 + %call = call dereferenceable(8) %struct.record** @_ZNK9__gnu_cxx17__normal_iteratorIP6recordSt6vectorIS1_SaIS1_EEE4baseEv(%"class.__gnu_cxx::__normal_iterator.5"* %0) + %1 = load %struct.record*, %struct.record** %call, align 8 + %2 = load %"class.__gnu_cxx::__normal_iterator.5"*, %"class.__gnu_cxx::__normal_iterator.5"** %__rhs.addr, align 8 + %call1 = 
call dereferenceable(8) %struct.record** @_ZNK9__gnu_cxx17__normal_iteratorIP6recordSt6vectorIS1_SaIS1_EEE4baseEv(%"class.__gnu_cxx::__normal_iterator.5"* %2) + %3 = load %struct.record*, %struct.record** %call1, align 8 + %sub.ptr.lhs.cast = ptrtoint %struct.record* %1 to i64 + %sub.ptr.rhs.cast = ptrtoint %struct.record* %3 to i64 + %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast + %sub.ptr.div = sdiv exact i64 %sub.ptr.sub, 60 + ret i64 %sub.ptr.div +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %struct.record* @_ZNSt6vectorI6recordSaIS0_EE5beginEv(%"class.std::vector"* %this) #0 comdat align 2 { +entry: + %retval = alloca %"class.__gnu_cxx::__normal_iterator.5", align 8 + %this.addr = alloca %"class.std::vector"*, align 8 + store %"class.std::vector"* %this, %"class.std::vector"** %this.addr, align 8 + %this1 = load %"class.std::vector"*, %"class.std::vector"** %this.addr, align 8 + %0 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %0, i32 0, i32 0 + %_M_start = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl, i32 0, i32 0 + call void @_ZN9__gnu_cxx17__normal_iteratorIP6recordSt6vectorIS1_SaIS1_EEEC2ERKS2_(%"class.__gnu_cxx::__normal_iterator.5"* %retval, %struct.record** dereferenceable(8) %_M_start) + %coerce.dive = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.5", %"class.__gnu_cxx::__normal_iterator.5"* %retval, i32 0, i32 0 + %1 = load %struct.record*, %struct.record** %coerce.dive, align 8 + ret %struct.record* %1 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %struct.record* @_ZNSt12_Vector_baseI6recordSaIS0_EE11_M_allocateEm(%"struct.std::_Vector_base"* %this, i64 %__n) #0 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Vector_base"*, align 8 + %__n.addr = 
alloca i64, align 8 + store %"struct.std::_Vector_base"* %this, %"struct.std::_Vector_base"** %this.addr, align 8 + store i64 %__n, i64* %__n.addr, align 8 + %this1 = load %"struct.std::_Vector_base"*, %"struct.std::_Vector_base"** %this.addr, align 8 + %0 = load i64, i64* %__n.addr, align 8 + %cmp = icmp ne i64 %0, 0 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + %_M_impl = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %this1, i32 0, i32 0 + %1 = bitcast %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl to %"class.std::allocator"* + %2 = load i64, i64* %__n.addr, align 8 + %call = call %struct.record* @_ZN9__gnu_cxx14__alloc_traitsISaI6recordEE8allocateERS2_m(%"class.std::allocator"* dereferenceable(1) %1, i64 %2) + br label %cond.end + +cond.false: ; preds = %entry + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi %struct.record* [ %call, %cond.true ], [ null, %cond.false ] + ret %struct.record* %cond +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %struct.record* @_ZSt34__uninitialized_move_if_noexcept_aIP6recordS1_SaIS0_EET0_T_S4_S3_RT1_(%struct.record* %__first, %struct.record* %__last, %struct.record* %__result, %"class.std::allocator"* dereferenceable(1) %__alloc) #0 comdat { +entry: + %__first.addr = alloca %struct.record*, align 8 + %__last.addr = alloca %struct.record*, align 8 + %__result.addr = alloca %struct.record*, align 8 + %__alloc.addr = alloca %"class.std::allocator"*, align 8 + store %struct.record* %__first, %struct.record** %__first.addr, align 8 + store %struct.record* %__last, %struct.record** %__last.addr, align 8 + store %struct.record* %__result, %struct.record** %__result.addr, align 8 + store %"class.std::allocator"* %__alloc, %"class.std::allocator"** %__alloc.addr, align 8 + %0 = load %struct.record*, %struct.record** %__first.addr, align 8 + %1 = load %struct.record*, %struct.record** 
%__last.addr, align 8 + %2 = load %struct.record*, %struct.record** %__result.addr, align 8 + %3 = load %"class.std::allocator"*, %"class.std::allocator"** %__alloc.addr, align 8 + %call = call %struct.record* @_ZSt22__uninitialized_copy_aIP6recordS1_S0_ET0_T_S3_S2_RSaIT1_E(%struct.record* %0, %struct.record* %1, %struct.record* %2, %"class.std::allocator"* dereferenceable(1) %3) + ret %struct.record* %call +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local dereferenceable(8) %struct.record** @_ZNK9__gnu_cxx17__normal_iteratorIP6recordSt6vectorIS1_SaIS1_EEE4baseEv(%"class.__gnu_cxx::__normal_iterator.5"* %this) #4 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::__normal_iterator.5"*, align 8 + store %"class.__gnu_cxx::__normal_iterator.5"* %this, %"class.__gnu_cxx::__normal_iterator.5"** %this.addr, align 8 + %this1 = load %"class.__gnu_cxx::__normal_iterator.5"*, %"class.__gnu_cxx::__normal_iterator.5"** %this.addr, align 8 + %_M_current = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.5", %"class.__gnu_cxx::__normal_iterator.5"* %this1, i32 0, i32 0 + ret %struct.record** %_M_current +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx14__alloc_traitsISaI6recordEE7destroyERS2_PS1_(%"class.std::allocator"* dereferenceable(1) %__a, %struct.record* %__p) #0 comdat align 2 { +entry: + %__a.addr = alloca %"class.std::allocator"*, align 8 + %__p.addr = alloca %struct.record*, align 8 + store %"class.std::allocator"* %__a, %"class.std::allocator"** %__a.addr, align 8 + store %struct.record* %__p, %struct.record** %__p.addr, align 8 + %0 = load %"class.std::allocator"*, %"class.std::allocator"** %__a.addr, align 8 + %1 = bitcast %"class.std::allocator"* %0 to %"class.__gnu_cxx::new_allocator"* + %2 = load %struct.record*, %struct.record** %__p.addr, align 8 + call void @_ZN9__gnu_cxx13new_allocatorI6recordE7destroyEPS1_(%"class.__gnu_cxx::new_allocator"* 
%1, %struct.record* %2) + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local i64 @_ZNKSt6vectorI6recordSaIS0_EE8max_sizeEv(%"class.std::vector"* %this) #0 comdat align 2 { +entry: + %this.addr = alloca %"class.std::vector"*, align 8 + store %"class.std::vector"* %this, %"class.std::vector"** %this.addr, align 8 + %this1 = load %"class.std::vector"*, %"class.std::vector"** %this.addr, align 8 + %0 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %call = call dereferenceable(1) %"class.std::allocator"* @_ZNKSt12_Vector_baseI6recordSaIS0_EE19_M_get_Tp_allocatorEv(%"struct.std::_Vector_base"* %0) + %call2 = call i64 @_ZN9__gnu_cxx14__alloc_traitsISaI6recordEE8max_sizeERKS2_(%"class.std::allocator"* dereferenceable(1) %call) + ret i64 %call2 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local i64 @_ZNKSt6vectorI6recordSaIS0_EE4sizeEv(%"class.std::vector"* %this) #4 comdat align 2 { +entry: + %this.addr = alloca %"class.std::vector"*, align 8 + store %"class.std::vector"* %this, %"class.std::vector"** %this.addr, align 8 + %this1 = load %"class.std::vector"*, %"class.std::vector"** %this.addr, align 8 + %0 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %0, i32 0, i32 0 + %_M_finish = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl, i32 0, i32 1 + %1 = load %struct.record*, %struct.record** %_M_finish, align 8 + %2 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl2 = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %2, i32 0, i32 0 + %_M_start = getelementptr inbounds %"struct.std::_Vector_base >::_Vector_impl", %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl2, i32 0, i32 0 + %3 = load %struct.record*, 
%struct.record** %_M_start, align 8 + %sub.ptr.lhs.cast = ptrtoint %struct.record* %1 to i64 + %sub.ptr.rhs.cast = ptrtoint %struct.record* %3 to i64 + %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast + %sub.ptr.div = sdiv exact i64 %sub.ptr.sub, 60 + ret i64 %sub.ptr.div +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local i64 @_ZN9__gnu_cxx14__alloc_traitsISaI6recordEE8max_sizeERKS2_(%"class.std::allocator"* dereferenceable(1) %__a) #4 comdat align 2 { +entry: + %__a.addr = alloca %"class.std::allocator"*, align 8 + store %"class.std::allocator"* %__a, %"class.std::allocator"** %__a.addr, align 8 + %0 = load %"class.std::allocator"*, %"class.std::allocator"** %__a.addr, align 8 + %1 = bitcast %"class.std::allocator"* %0 to %"class.__gnu_cxx::new_allocator"* + %call = call i64 @_ZNK9__gnu_cxx13new_allocatorI6recordE8max_sizeEv(%"class.__gnu_cxx::new_allocator"* %1) #12 + ret i64 %call +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local dereferenceable(1) %"class.std::allocator"* @_ZNKSt12_Vector_baseI6recordSaIS0_EE19_M_get_Tp_allocatorEv(%"struct.std::_Vector_base"* %this) #4 comdat align 2 { +entry: + %this.addr = alloca %"struct.std::_Vector_base"*, align 8 + store %"struct.std::_Vector_base"* %this, %"struct.std::_Vector_base"** %this.addr, align 8 + %this1 = load %"struct.std::_Vector_base"*, %"struct.std::_Vector_base"** %this.addr, align 8 + %_M_impl = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %this1, i32 0, i32 0 + %0 = bitcast %"struct.std::_Vector_base >::_Vector_impl"* %_M_impl to %"class.std::allocator"* + ret %"class.std::allocator"* %0 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local i64 @_ZNK9__gnu_cxx13new_allocatorI6recordE8max_sizeEv(%"class.__gnu_cxx::new_allocator"* %this) #4 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::new_allocator"*, align 8 + store 
%"class.__gnu_cxx::new_allocator"* %this, %"class.__gnu_cxx::new_allocator"** %this.addr, align 8 + %this1 = load %"class.__gnu_cxx::new_allocator"*, %"class.__gnu_cxx::new_allocator"** %this.addr, align 8 + ret i64 307445734561825860 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx17__normal_iteratorIP6recordSt6vectorIS1_SaIS1_EEEC2ERKS2_(%"class.__gnu_cxx::__normal_iterator.5"* %this, %struct.record** dereferenceable(8) %__i) unnamed_addr #4 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::__normal_iterator.5"*, align 8 + %__i.addr = alloca %struct.record**, align 8 + store %"class.__gnu_cxx::__normal_iterator.5"* %this, %"class.__gnu_cxx::__normal_iterator.5"** %this.addr, align 8 + store %struct.record** %__i, %struct.record*** %__i.addr, align 8 + %this1 = load %"class.__gnu_cxx::__normal_iterator.5"*, %"class.__gnu_cxx::__normal_iterator.5"** %this.addr, align 8 + %_M_current = getelementptr inbounds %"class.__gnu_cxx::__normal_iterator.5", %"class.__gnu_cxx::__normal_iterator.5"* %this1, i32 0, i32 0 + %0 = load %struct.record**, %struct.record*** %__i.addr, align 8 + %1 = load %struct.record*, %struct.record** %0, align 8 + store %struct.record* %1, %struct.record** %_M_current, align 8 + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %struct.record* @_ZN9__gnu_cxx14__alloc_traitsISaI6recordEE8allocateERS2_m(%"class.std::allocator"* dereferenceable(1) %__a, i64 %__n) #0 comdat align 2 { +entry: + %__a.addr = alloca %"class.std::allocator"*, align 8 + %__n.addr = alloca i64, align 8 + store %"class.std::allocator"* %__a, %"class.std::allocator"** %__a.addr, align 8 + store i64 %__n, i64* %__n.addr, align 8 + %0 = load %"class.std::allocator"*, %"class.std::allocator"** %__a.addr, align 8 + %1 = bitcast %"class.std::allocator"* %0 to %"class.__gnu_cxx::new_allocator"* + %2 = load i64, i64* %__n.addr, align 8 + %call = call %struct.record* 
@_ZN9__gnu_cxx13new_allocatorI6recordE8allocateEmPKv(%"class.__gnu_cxx::new_allocator"* %1, i64 %2, i8* null) + ret %struct.record* %call +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %struct.record* @_ZN9__gnu_cxx13new_allocatorI6recordE8allocateEmPKv(%"class.__gnu_cxx::new_allocator"* %this, i64 %__n, i8* %0) #0 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::new_allocator"*, align 8 + %__n.addr = alloca i64, align 8 + %.addr = alloca i8*, align 8 + store %"class.__gnu_cxx::new_allocator"* %this, %"class.__gnu_cxx::new_allocator"** %this.addr, align 8 + store i64 %__n, i64* %__n.addr, align 8 + store i8* %0, i8** %.addr, align 8 + %this1 = load %"class.__gnu_cxx::new_allocator"*, %"class.__gnu_cxx::new_allocator"** %this.addr, align 8 + %1 = load i64, i64* %__n.addr, align 8 + %call = call i64 @_ZNK9__gnu_cxx13new_allocatorI6recordE8max_sizeEv(%"class.__gnu_cxx::new_allocator"* %this1) #12 + %cmp = icmp ugt i64 %1, %call + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + call void @_ZSt17__throw_bad_allocv() #15 + unreachable + +if.end: ; preds = %entry + %2 = load i64, i64* %__n.addr, align 8 + %mul = mul i64 %2, 60 + %call2 = call i8* @_Znwm(i64 %mul) + %3 = bitcast i8* %call2 to %struct.record* + ret %struct.record* %3 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %struct.record* @_ZSt22__uninitialized_copy_aIP6recordS1_S0_ET0_T_S3_S2_RSaIT1_E(%struct.record* %__first, %struct.record* %__last, %struct.record* %__result, %"class.std::allocator"* dereferenceable(1) %0) #0 comdat { +entry: + %__first.addr = alloca %struct.record*, align 8 + %__last.addr = alloca %struct.record*, align 8 + %__result.addr = alloca %struct.record*, align 8 + %.addr = alloca %"class.std::allocator"*, align 8 + store %struct.record* %__first, %struct.record** %__first.addr, align 8 + store %struct.record* %__last, %struct.record** %__last.addr, align 8 + store %struct.record* 
%__result, %struct.record** %__result.addr, align 8 + store %"class.std::allocator"* %0, %"class.std::allocator"** %.addr, align 8 + %1 = load %struct.record*, %struct.record** %__first.addr, align 8 + %2 = load %struct.record*, %struct.record** %__last.addr, align 8 + %3 = load %struct.record*, %struct.record** %__result.addr, align 8 + %call = call %struct.record* @_ZSt18uninitialized_copyIP6recordS1_ET0_T_S3_S2_(%struct.record* %1, %struct.record* %2, %struct.record* %3) + ret %struct.record* %call +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %struct.record* @_ZSt18uninitialized_copyIP6recordS1_ET0_T_S3_S2_(%struct.record* %__first, %struct.record* %__last, %struct.record* %__result) #0 comdat { +entry: + %__first.addr = alloca %struct.record*, align 8 + %__last.addr = alloca %struct.record*, align 8 + %__result.addr = alloca %struct.record*, align 8 + %__assignable = alloca i8, align 1 + store %struct.record* %__first, %struct.record** %__first.addr, align 8 + store %struct.record* %__last, %struct.record** %__last.addr, align 8 + store %struct.record* %__result, %struct.record** %__result.addr, align 8 + store i8 1, i8* %__assignable, align 1 + %0 = load %struct.record*, %struct.record** %__first.addr, align 8 + %1 = load %struct.record*, %struct.record** %__last.addr, align 8 + %2 = load %struct.record*, %struct.record** %__result.addr, align 8 + %call = call %struct.record* @_ZNSt20__uninitialized_copyILb1EE13__uninit_copyIP6recordS3_EET0_T_S5_S4_(%struct.record* %0, %struct.record* %1, %struct.record* %2) + ret %struct.record* %call +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %struct.record* @_ZNSt20__uninitialized_copyILb1EE13__uninit_copyIP6recordS3_EET0_T_S5_S4_(%struct.record* %__first, %struct.record* %__last, %struct.record* %__result) #0 comdat align 2 { +entry: + %__first.addr = alloca %struct.record*, align 8 + %__last.addr = alloca %struct.record*, align 8 + %__result.addr = 
alloca %struct.record*, align 8 + store %struct.record* %__first, %struct.record** %__first.addr, align 8 + store %struct.record* %__last, %struct.record** %__last.addr, align 8 + store %struct.record* %__result, %struct.record** %__result.addr, align 8 + %0 = load %struct.record*, %struct.record** %__first.addr, align 8 + %1 = load %struct.record*, %struct.record** %__last.addr, align 8 + %2 = load %struct.record*, %struct.record** %__result.addr, align 8 + %call = call %struct.record* @_ZSt4copyIP6recordS1_ET0_T_S3_S2_(%struct.record* %0, %struct.record* %1, %struct.record* %2) + ret %struct.record* %call +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %struct.record* @_ZSt4copyIP6recordS1_ET0_T_S3_S2_(%struct.record* %__first, %struct.record* %__last, %struct.record* %__result) #0 comdat { +entry: + %__first.addr = alloca %struct.record*, align 8 + %__last.addr = alloca %struct.record*, align 8 + %__result.addr = alloca %struct.record*, align 8 + store %struct.record* %__first, %struct.record** %__first.addr, align 8 + store %struct.record* %__last, %struct.record** %__last.addr, align 8 + store %struct.record* %__result, %struct.record** %__result.addr, align 8 + %0 = load %struct.record*, %struct.record** %__first.addr, align 8 + %call = call %struct.record* @_ZSt12__miter_baseIP6recordET_S2_(%struct.record* %0) + %1 = load %struct.record*, %struct.record** %__last.addr, align 8 + %call1 = call %struct.record* @_ZSt12__miter_baseIP6recordET_S2_(%struct.record* %1) + %2 = load %struct.record*, %struct.record** %__result.addr, align 8 + %call2 = call %struct.record* @_ZSt14__copy_move_a2ILb0EP6recordS1_ET1_T0_S3_S2_(%struct.record* %call, %struct.record* %call1, %struct.record* %2) + ret %struct.record* %call2 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %struct.record* @_ZSt14__copy_move_a2ILb0EP6recordS1_ET1_T0_S3_S2_(%struct.record* %__first, %struct.record* %__last, %struct.record* %__result) 
#0 comdat { +entry: + %__first.addr = alloca %struct.record*, align 8 + %__last.addr = alloca %struct.record*, align 8 + %__result.addr = alloca %struct.record*, align 8 + store %struct.record* %__first, %struct.record** %__first.addr, align 8 + store %struct.record* %__last, %struct.record** %__last.addr, align 8 + store %struct.record* %__result, %struct.record** %__result.addr, align 8 + %0 = load %struct.record*, %struct.record** %__first.addr, align 8 + %call = call %struct.record* @_ZSt12__niter_baseIP6recordET_S2_(%struct.record* %0) + %1 = load %struct.record*, %struct.record** %__last.addr, align 8 + %call1 = call %struct.record* @_ZSt12__niter_baseIP6recordET_S2_(%struct.record* %1) + %2 = load %struct.record*, %struct.record** %__result.addr, align 8 + %call2 = call %struct.record* @_ZSt12__niter_baseIP6recordET_S2_(%struct.record* %2) + %call3 = call %struct.record* @_ZSt13__copy_move_aILb0EP6recordS1_ET1_T0_S3_S2_(%struct.record* %call, %struct.record* %call1, %struct.record* %call2) + ret %struct.record* %call3 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local %struct.record* @_ZSt12__miter_baseIP6recordET_S2_(%struct.record* %__it) #4 comdat { +entry: + %__it.addr = alloca %struct.record*, align 8 + store %struct.record* %__it, %struct.record** %__it.addr, align 8 + %0 = load %struct.record*, %struct.record** %__it.addr, align 8 + ret %struct.record* %0 +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local %struct.record* @_ZSt13__copy_move_aILb0EP6recordS1_ET1_T0_S3_S2_(%struct.record* %__first, %struct.record* %__last, %struct.record* %__result) #0 comdat { +entry: + %__first.addr = alloca %struct.record*, align 8 + %__last.addr = alloca %struct.record*, align 8 + %__result.addr = alloca %struct.record*, align 8 + %__simple = alloca i8, align 1 + store %struct.record* %__first, %struct.record** %__first.addr, align 8 + store %struct.record* %__last, %struct.record** %__last.addr, 
align 8 + store %struct.record* %__result, %struct.record** %__result.addr, align 8 + store i8 1, i8* %__simple, align 1 + %0 = load %struct.record*, %struct.record** %__first.addr, align 8 + %1 = load %struct.record*, %struct.record** %__last.addr, align 8 + %2 = load %struct.record*, %struct.record** %__result.addr, align 8 + %call = call %struct.record* @_ZNSt11__copy_moveILb0ELb1ESt26random_access_iterator_tagE8__copy_mI6recordEEPT_PKS4_S7_S5_(%struct.record* %0, %struct.record* %1, %struct.record* %2) + ret %struct.record* %call +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local %struct.record* @_ZSt12__niter_baseIP6recordET_S2_(%struct.record* %__it) #4 comdat { +entry: + %__it.addr = alloca %struct.record*, align 8 + store %struct.record* %__it, %struct.record** %__it.addr, align 8 + %0 = load %struct.record*, %struct.record** %__it.addr, align 8 + ret %struct.record* %0 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local %struct.record* @_ZNSt11__copy_moveILb0ELb1ESt26random_access_iterator_tagE8__copy_mI6recordEEPT_PKS4_S7_S5_(%struct.record* %__first, %struct.record* %__last, %struct.record* %__result) #4 comdat align 2 { +entry: + %__first.addr = alloca %struct.record*, align 8 + %__last.addr = alloca %struct.record*, align 8 + %__result.addr = alloca %struct.record*, align 8 + %_Num = alloca i64, align 8 + store %struct.record* %__first, %struct.record** %__first.addr, align 8 + store %struct.record* %__last, %struct.record** %__last.addr, align 8 + store %struct.record* %__result, %struct.record** %__result.addr, align 8 + %0 = load %struct.record*, %struct.record** %__last.addr, align 8 + %1 = load %struct.record*, %struct.record** %__first.addr, align 8 + %sub.ptr.lhs.cast = ptrtoint %struct.record* %0 to i64 + %sub.ptr.rhs.cast = ptrtoint %struct.record* %1 to i64 + %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast + %sub.ptr.div = sdiv exact i64 %sub.ptr.sub, 60 + 
store i64 %sub.ptr.div, i64* %_Num, align 8 + %2 = load i64, i64* %_Num, align 8 + %tobool = icmp ne i64 %2, 0 + br i1 %tobool, label %if.then, label %if.end + +if.then: ; preds = %entry + %3 = load %struct.record*, %struct.record** %__result.addr, align 8 + %4 = bitcast %struct.record* %3 to i8* + %5 = load %struct.record*, %struct.record** %__first.addr, align 8 + %6 = bitcast %struct.record* %5 to i8* + %7 = load i64, i64* %_Num, align 8 + %mul = mul i64 60, %7 + call void @llvm.memmove.p0i8.p0i8.i64(i8* align 4 %4, i8* align 4 %6, i64 %mul, i1 false) + br label %if.end + +if.end: ; preds = %if.then, %entry + %8 = load %struct.record*, %struct.record** %__result.addr, align 8 + %9 = load i64, i64* %_Num, align 8 + %add.ptr = getelementptr inbounds %struct.record, %struct.record* %8, i64 %9 + ret %struct.record* %add.ptr +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN9__gnu_cxx13new_allocatorI6recordE7destroyEPS1_(%"class.__gnu_cxx::new_allocator"* %this, %struct.record* %__p) #4 comdat align 2 { +entry: + %this.addr = alloca %"class.__gnu_cxx::new_allocator"*, align 8 + %__p.addr = alloca %struct.record*, align 8 + store %"class.__gnu_cxx::new_allocator"* %this, %"class.__gnu_cxx::new_allocator"** %this.addr, align 8 + store %struct.record* %__p, %struct.record** %__p.addr, align 8 + %this1 = load %"class.__gnu_cxx::new_allocator"*, %"class.__gnu_cxx::new_allocator"** %this.addr, align 8 + %0 = load %struct.record*, %struct.record** %__p.addr, align 8 + ret void +} + +define internal void @__cuda_register_globals(i8** %0) { +entry: + %1 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void (%struct.latLong*, float*, i32, float, float)* @_Z6euclidP7latLongPfiff to i8*), i8* getelementptr inbounds ([24 x i8], [24 x i8]* @0, i64 0, i64 0), i8* getelementptr inbounds ([24 x i8], [24 x i8]* @0, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null) + ret void +} + +declare dso_local i32 
@__cudaRegisterFunction(i8**, i8*, i8*, i8*, i32, i8*, i8*, i8*, i8*, i32*) + +declare dso_local i32 @__cudaRegisterVar(i8**, i8*, i8*, i8*, i32, i32, i32, i32) + +declare dso_local i8** @__cudaRegisterFatBinary(i8*) + +define internal void @__cuda_module_ctor(i8* %0) { +entry: + %1 = call i8** @__cudaRegisterFatBinary(i8* bitcast ({ i32, i32, i8*, i8* }* @__cuda_fatbin_wrapper to i8*)) + store i8** %1, i8*** @__cuda_gpubin_handle, align 8 + call void @__cuda_register_globals(i8** %1) + call void @__cudaRegisterFatBinaryEnd(i8** %1) + %2 = call i32 @atexit(void (i8*)* @__cuda_module_dtor) + ret void +} + +declare dso_local void @__cudaRegisterFatBinaryEnd(i8**) + +declare dso_local void @__cudaUnregisterFatBinary(i8**) + +define internal void @__cuda_module_dtor(i8* %0) { +entry: + %1 = load i8**, i8*** @__cuda_gpubin_handle, align 8 + call void @__cudaUnregisterFatBinary(i8** %1) + ret void +} + +declare dso_local i32 @atexit(void (i8*)*) + +attributes #0 = { noinline optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind willreturn } +attributes #2 = { noinline norecurse optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" 
"use-soft-float"="false" } +attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #5 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #6 = { noinline noreturn nounwind } +attributes #7 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #8 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" 
"frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #9 = { nobuiltin nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #10 = { noreturn "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #11 = { nobuiltin "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #12 = { nounwind } +attributes #13 = { noreturn nounwind } +attributes #14 = { nounwind readonly } +attributes #15 = { noreturn } + +!llvm.module.flags = !{!0, !1} +!llvm.ident = !{!2} + +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git 
ef32c611aa214dea855364efd7ba451ec5ec3f74)"} diff --git a/examples/nn/nn_cuda.cu b/examples/nn/nn_cuda.cu new file mode 100644 index 0000000..1749bea --- /dev/null +++ b/examples/nn/nn_cuda.cu @@ -0,0 +1,328 @@ +#include "cuda.h" +#include +#include +#include +#include + +#ifdef TIMING +#include "timing.h" + +struct timeval tv; +struct timeval tv_total_start, tv_total_end; +struct timeval tv_h2d_start, tv_h2d_end; +struct timeval tv_d2h_start, tv_d2h_end; +struct timeval tv_kernel_start, tv_kernel_end; +struct timeval tv_mem_alloc_start, tv_mem_alloc_end; +struct timeval tv_close_start, tv_close_end; +float init_time = 0, mem_alloc_time = 0, h2d_time = 0, kernel_time = 0, + d2h_time = 0, close_time = 0, total_time = 0; +#endif + +#define min(a, b) a > b ? b : a +#define ceilDiv(a, b) (a + b - 1) / b +#define print(x) printf(#x ": %lu\n", (unsigned long)x) +#define DEBUG false + +#define DEFAULT_THREADS_PER_BLOCK 256 + +#define MAX_ARGS 10 +#define REC_LENGTH 53 // size of a record in db +#define LATITUDE_POS \ + 28 // character position of the latitude value in each record +#define OPEN 10000 // initial value of nearest neighbors + +typedef struct latLong { + float lat; + float lng; +} LatLong; + +typedef struct record { + char recString[REC_LENGTH]; + float distance; +} Record; + +int loadData(char *filename, std::vector &records, + std::vector &locations); +void findLowest(std::vector &records, float *distances, int numRecords, + int topN); +void printUsage(); +int parseCommandline(int argc, char *argv[], char *filename, int *r, float *lat, + float *lng, int *q, int *t, int *p, int *d); + +/** + * Kernel + * Executed on GPU + * Calculates the Euclidean distance from each record in the database to the + * target position + */ +__global__ void euclid(LatLong *d_locations, float *d_distances, int numRecords, + float lat, float lng) { + // int globalId = gridDim.x * blockDim.x * blockIdx.y + blockDim.x * + // blockIdx.x + threadIdx.x; + int globalId = blockDim.x * 
(gridDim.x * blockIdx.y + blockIdx.x) + + threadIdx.x; // more efficient + LatLong *latLong = d_locations + globalId; + if (globalId < numRecords) { + float *dist = d_distances + globalId; + *dist = (float)sqrt((lat - latLong->lat) * (lat - latLong->lat) + + (lng - latLong->lng) * (lng - latLong->lng)); + } +} + +/** + * This program finds the k-nearest neighbors + **/ + +int main(int argc, char *argv[]) { + cudaSetDevice(0); + int i = 0; + float lat, lng; + int quiet = 0, timing = 0, platform = 0, device = 0; + + std::vector records; + std::vector locations; + char filename[100]; + int resultsCount = 10; + + // parse command line + if (parseCommandline(argc, argv, filename, &resultsCount, &lat, &lng, &quiet, + &timing, &platform, &device)) { + printUsage(); + return 0; + } + printf("before all\n"); + int numRecords = loadData(filename, records, locations); + if (resultsCount > numRecords) + resultsCount = numRecords; + printf("after before all\n"); + + // Pointers to host memory + float *distances; + // Pointers to device memory + LatLong *d_locations; + float *d_distances; + + // Scaling calculations - added by Sam Kauffman + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, 0); + cudaDeviceSynchronize(); + unsigned long maxGridX = deviceProp.maxGridSize[0]; + unsigned long threadsPerBlock = 256; + size_t totalDeviceMemory; + size_t freeDeviceMemory; + unsigned long blocks = + ceilDiv(numRecords, threadsPerBlock); // extra threads will do nothing + unsigned long gridY = ceilDiv(blocks, maxGridX); + unsigned long gridX = ceilDiv(blocks, gridY); + // There will be no more than (gridY - 1) extra blocks + dim3 gridDim(gridX, gridY); + + /** + * Allocate memory on host and device + */ + distances = (float *)malloc(sizeof(float) * numRecords); + cudaMalloc((void **)&d_locations, sizeof(LatLong) * numRecords); + cudaMalloc((void **)&d_distances, sizeof(float) * numRecords); + + /** + * Transfer data from host to device + */ + cudaMemcpy(d_locations, 
&locations[0], sizeof(LatLong) * numRecords, + cudaMemcpyHostToDevice); + + /** + * Execute kernel + */ + printf("before call\n"); + euclid<<>>(d_locations, d_distances, numRecords, + lat, lng); + cudaDeviceSynchronize(); + printf("after call\n"); + // Copy data from device memory to host memory + cudaMemcpy(distances, d_distances, sizeof(float) * numRecords, + cudaMemcpyDeviceToHost); + + // find the resultsCount least distances + printf("before find\n"); + findLowest(records, distances, numRecords, resultsCount); + printf("after find\n"); + // print out results + if (!quiet) + for (i = 0; i < resultsCount; i++) { + printf("%s --> Distance=%f\n", records[i].recString, records[i].distance); + } + free(distances); + // Free memory + cudaFree(d_locations); + cudaFree(d_distances); + +#ifdef TIMING + printf("Exec: %f\n", kernel_time); +#endif +} + +int loadData(char *filename, std::vector &records, + std::vector &locations) { + FILE *flist, *fp; + int i = 0; + char dbname[64]; + int recNum = 0; + + /**Main processing **/ + + flist = fopen(filename, "r"); + while (!feof(flist)) { + /** + * Read in all records of length REC_LENGTH + * If this is the last file in the filelist, then done + * else open next file to be read next iteration + */ + if (fscanf(flist, "%s\n", dbname) != 1) { + fprintf(stderr, "error reading filelist\n"); + exit(0); + } + fp = fopen(dbname, "r"); + if (!fp) { + printf("error opening a db\n"); + exit(1); + } + // read each record + while (!feof(fp)) { + Record record; + LatLong latLong; + fgets(record.recString, 49, fp); + fgetc(fp); // newline + if (feof(fp)) + break; + + // parse for lat and long + char substr[6]; + + for (i = 0; i < 5; i++) + substr[i] = *(record.recString + i + 28); + substr[5] = '\0'; + latLong.lat = atof(substr); + + for (i = 0; i < 5; i++) + substr[i] = *(record.recString + i + 33); + substr[5] = '\0'; + latLong.lng = atof(substr); + + locations.push_back(latLong); + records.push_back(record); + recNum++; + } + fclose(fp); 
+ } + fclose(flist); + // for(i=0;i &records, float *distances, int numRecords, + int topN) { + int i, j; + float val; + int minLoc; + Record *tempRec; + float tempDist; + + for (i = 0; i < topN; i++) { + minLoc = i; + for (j = i; j < numRecords; j++) { + val = distances[j]; + if (val < distances[minLoc]) + minLoc = j; + } + // swap locations and distances + tempRec = &records[i]; + records[i] = records[minLoc]; + records[minLoc] = *tempRec; + + tempDist = distances[i]; + distances[i] = distances[minLoc]; + distances[minLoc] = tempDist; + + // add distance to the min we just found + records[i].distance = distances[i]; + } +} + +int parseCommandline(int argc, char *argv[], char *filename, int *r, float *lat, + float *lng, int *q, int *t, int *p, int *d) { + int i; + if (argc < 2) + return 1; // error + strncpy(filename, argv[1], 100); + char flag; + + for (i = 1; i < argc; i++) { + if (argv[i][0] == '-') { // flag + flag = argv[i][1]; + switch (flag) { + case 'r': // number of results + i++; + *r = atoi(argv[i]); + break; + case 'l': // lat or lng + if (argv[i][2] == 'a') { // lat + *lat = atof(argv[i + 1]); + } else { // lng + *lng = atof(argv[i + 1]); + } + i++; + break; + case 'h': // help + return 1; + case 'q': // quiet + *q = 1; + break; + case 't': // timing + *t = 1; + break; + case 'p': // platform + i++; + *p = atoi(argv[i]); + break; + case 'd': // device + i++; + *d = atoi(argv[i]); + break; + } + } + } + if ((*d >= 0 && *p < 0) || + (*p >= 0 && + *d < 0)) // both p and d must be specified if either are specified + return 1; + return 0; +} + +void printUsage() { + printf("Nearest Neighbor Usage\n"); + printf("\n"); + printf("nearestNeighbor [filename] -r [int] -lat [float] -lng [float] [-hqt] " + "[-p [int] -d [int]]\n"); + printf("\n"); + printf("example:\n"); + printf("$ ./nearestNeighbor filelist.txt -r 5 -lat 30 -lng 90\n"); + printf("\n"); + printf("filename the filename that lists the data input files\n"); + printf("-r [int] the number of records 
to return (default: 10)\n"); + printf("-lat [float] the latitude for nearest neighbors (default: 0)\n"); + printf("-lng [float] the longitude for nearest neighbors (default: 0)\n"); + printf("\n"); + printf("-h, --help Display the help file\n"); + printf("-q Quiet mode. Suppress all text output.\n"); + printf("-t Print timing information.\n"); + printf("\n"); + printf("-p [int] Choose the platform (must choose both platform and " + "device)\n"); + printf("-d [int] Choose the device (must choose both platform and " + "device)\n"); + printf("\n"); + printf("\n"); + printf("Notes: 1. The filename is required as the first parameter.\n"); + printf(" 2. If you declare either the device or the platform,\n"); + printf(" you must declare both.\n\n"); +} diff --git a/examples/nn/run.sh b/examples/nn/run.sh new file mode 100644 index 0000000..d99021e --- /dev/null +++ b/examples/nn/run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +set -e +llvm-as nn_cuda-cuda-nvptx64-nvidia-cuda-sm_61.ll +llvm-as nn_cuda-host-x86_64-unknown-linux-gnu.ll +../../build/compilation/kernelTranslator nn_cuda-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc +../../build/compilation/hostTranslator nn_cuda-host-x86_64-unknown-linux-gnu.bc host.bc + +llc --relocation-model=pic --filetype=obj kernel.bc +llc --relocation-model=pic --filetype=obj host.bc + +g++ -Wall -L../../build/runtime -L../../build/runtime/threadPool \ + -o nn -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread + +export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH +./nn filelist_4 -r 3 -lat 30 -lng 90 >> res.log +if grep -q "1988 12 27 0 18 TONY 30.0 89.8 113 39 --> Distance=0.199997" res.log; then + echo "Pass" +else + echo "Error result" + exit 1 +fi diff --git a/examples/nw/needle-cuda-nvptx64-nvidia-cuda-sm_61.ll b/examples/nw/needle-cuda-nvptx64-nvidia-cuda-sm_61.ll new file mode 100644 index 0000000..956112e --- /dev/null +++ b/examples/nw/needle-cuda-nvptx64-nvidia-cuda-sm_61.ll @@ -0,0 
+1,923 @@ +; ModuleID = 'needle-cuda-nvptx64-nvidia-cuda-sm_61.bc' +source_filename = "needle.cu" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +%struct.__cuda_builtin_blockIdx_t = type { i8 } +%struct.__cuda_builtin_threadIdx_t = type { i8 } +%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 } + +$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any + +$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any + +@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1 +@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1 +@_ZZ20needle_cuda_shared_1PiS_iiiiE4temp = internal addrspace(3) global [17 x [17 x i32]] undef, align 4 +@_ZZ20needle_cuda_shared_1PiS_iiiiE3ref = internal addrspace(3) global [16 x [16 x i32]] undef, align 4 +@_ZZ20needle_cuda_shared_2PiS_iiiiE4temp = internal addrspace(3) global [17 x [17 x i32]] undef, align 4 +@_ZZ20needle_cuda_shared_2PiS_iiiiE3ref = internal addrspace(3) global [16 x [16 x i32]] undef, align 4 + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 { +entry: + %p.addr = alloca i8**, align 8 + %s.addr = alloca i64, align 8 + store i8** %p, i8*** %p.addr, align 8 + store i64 %s, i64* %s.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 { +entry: + %p.addr = alloca %struct.cudaFuncAttributes*, align 8 + %c.addr = alloca i8*, align 8 + store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8 + store i8* %c, i8** %c.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 { +entry: + 
%value.addr = alloca i32*, align 8 + %attr.addr = alloca i32, align 4 + %device.addr = alloca i32, align 4 + store i32* %value, i32** %value.addr, align 8 + store i32 %attr, i32* %attr.addr, align 4 + store i32 %device, i32* %device.addr, align 4 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaGetDevice(i32* %device) #0 { +entry: + %device.addr = alloca i32*, align 8 + store i32* %device, i32** %device.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 { +entry: + %numBlocks.addr = alloca i32*, align 8 + %func.addr = alloca i8*, align 8 + %blockSize.addr = alloca i32, align 4 + %dynamicSmemSize.addr = alloca i64, align 8 + store i32* %numBlocks, i32** %numBlocks.addr, align 8 + store i8* %func, i8** %func.addr, align 8 + store i32 %blockSize, i32* %blockSize.addr, align 4 + store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 { +entry: + %numBlocks.addr = alloca i32*, align 8 + %func.addr = alloca i8*, align 8 + %blockSize.addr = alloca i32, align 4 + %dynamicSmemSize.addr = alloca i64, align 8 + %flags.addr = alloca i32, align 4 + store i32* %numBlocks, i32** %numBlocks.addr, align 8 + store i8* %func, i8** %func.addr, align 8 + store i32 %blockSize, i32* %blockSize.addr, align 4 + store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8 + store i32 %flags, i32* %flags.addr, align 4 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define dso_local i32 @_Z14maximum_deviceiii(i32 %a, i32 %b, i32 %c) #0 { +entry: + %retval = alloca i32, align 4 + 
%a.addr = alloca i32, align 4 + %b.addr = alloca i32, align 4 + %c.addr = alloca i32, align 4 + %k = alloca i32, align 4 + store i32 %a, i32* %a.addr, align 4 + store i32 %b, i32* %b.addr, align 4 + store i32 %c, i32* %c.addr, align 4 + %0 = load i32, i32* %a.addr, align 4 + %1 = load i32, i32* %b.addr, align 4 + %cmp = icmp sle i32 %0, %1 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %2 = load i32, i32* %b.addr, align 4 + store i32 %2, i32* %k, align 4 + br label %if.end + +if.else: ; preds = %entry + %3 = load i32, i32* %a.addr, align 4 + store i32 %3, i32* %k, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %4 = load i32, i32* %k, align 4 + %5 = load i32, i32* %c.addr, align 4 + %cmp1 = icmp sle i32 %4, %5 + br i1 %cmp1, label %if.then2, label %if.else3 + +if.then2: ; preds = %if.end + %6 = load i32, i32* %c.addr, align 4 + store i32 %6, i32* %retval, align 4 + br label %return + +if.else3: ; preds = %if.end + %7 = load i32, i32* %k, align 4 + store i32 %7, i32* %retval, align 4 + br label %return + +return: ; preds = %if.else3, %if.then2 + %8 = load i32, i32* %retval, align 4 + ret i32 %8 +} + +; Function Attrs: convergent noinline nounwind optnone +define dso_local void @_Z20needle_cuda_shared_1PiS_iiii(i32* %referrence, i32* %matrix_cuda, i32 %cols, i32 %penalty, i32 %i, i32 %block_width) #0 { +entry: + %referrence.addr = alloca i32*, align 8 + %matrix_cuda.addr = alloca i32*, align 8 + %cols.addr = alloca i32, align 4 + %penalty.addr = alloca i32, align 4 + %i.addr = alloca i32, align 4 + %block_width.addr = alloca i32, align 4 + %bx = alloca i32, align 4 + %tx = alloca i32, align 4 + %b_index_x = alloca i32, align 4 + %b_index_y = alloca i32, align 4 + %index = alloca i32, align 4 + %index_n = alloca i32, align 4 + %index_w = alloca i32, align 4 + %index_nw = alloca i32, align 4 + %ty = alloca i32, align 4 + %m = alloca i32, align 4 + %t_index_x = alloca i32, align 4 + %t_index_y = alloca i32, align 4 + 
%m90 = alloca i32, align 4 + %t_index_x96 = alloca i32, align 4 + %t_index_y99 = alloca i32, align 4 + %ty134 = alloca i32, align 4 + store i32* %referrence, i32** %referrence.addr, align 8 + store i32* %matrix_cuda, i32** %matrix_cuda.addr, align 8 + store i32 %cols, i32* %cols.addr, align 4 + store i32 %penalty, i32* %penalty.addr, align 4 + store i32 %i, i32* %i.addr, align 4 + store i32 %block_width, i32* %block_width.addr, align 4 + %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2 + store i32 %call, i32* %bx, align 4 + %call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + store i32 %call1, i32* %tx, align 4 + %0 = load i32, i32* %bx, align 4 + store i32 %0, i32* %b_index_x, align 4 + %1 = load i32, i32* %i.addr, align 4 + %sub = sub nsw i32 %1, 1 + %2 = load i32, i32* %bx, align 4 + %sub2 = sub nsw i32 %sub, %2 + store i32 %sub2, i32* %b_index_y, align 4 + %3 = load i32, i32* %cols.addr, align 4 + %mul = mul nsw i32 %3, 16 + %4 = load i32, i32* %b_index_y, align 4 + %mul3 = mul nsw i32 %mul, %4 + %5 = load i32, i32* %b_index_x, align 4 + %mul4 = mul nsw i32 16, %5 + %add = add nsw i32 %mul3, %mul4 + %6 = load i32, i32* %tx, align 4 + %add5 = add nsw i32 %add, %6 + %7 = load i32, i32* %cols.addr, align 4 + %add6 = add nsw i32 %7, 1 + %add7 = add nsw i32 %add5, %add6 + store i32 %add7, i32* %index, align 4 + %8 = load i32, i32* %cols.addr, align 4 + %mul8 = mul nsw i32 %8, 16 + %9 = load i32, i32* %b_index_y, align 4 + %mul9 = mul nsw i32 %mul8, %9 + %10 = load i32, i32* %b_index_x, align 4 + %mul10 = mul nsw i32 16, %10 + %add11 = add nsw i32 %mul9, %mul10 + %11 = load i32, i32* %tx, align 4 + %add12 = add nsw i32 %add11, %11 + %add13 = add nsw i32 %add12, 1 + store i32 %add13, i32* %index_n, align 4 + %12 = load i32, i32* %cols.addr, align 4 + %mul14 = mul nsw i32 %12, 16 + %13 = load i32, i32* %b_index_y, align 4 + %mul15 = mul nsw i32 %mul14, %13 + %14 = load i32, i32* %b_index_x, align 4 + %mul16 = mul nsw 
i32 16, %14 + %add17 = add nsw i32 %mul15, %mul16 + %15 = load i32, i32* %cols.addr, align 4 + %add18 = add nsw i32 %add17, %15 + store i32 %add18, i32* %index_w, align 4 + %16 = load i32, i32* %cols.addr, align 4 + %mul19 = mul nsw i32 %16, 16 + %17 = load i32, i32* %b_index_y, align 4 + %mul20 = mul nsw i32 %mul19, %17 + %18 = load i32, i32* %b_index_x, align 4 + %mul21 = mul nsw i32 16, %18 + %add22 = add nsw i32 %mul20, %mul21 + store i32 %add22, i32* %index_nw, align 4 + %19 = load i32, i32* %tx, align 4 + %cmp = icmp eq i32 %19, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %20 = load i32*, i32** %matrix_cuda.addr, align 8 + %21 = load i32, i32* %index_nw, align 4 + %idxprom = sext i32 %21 to i64 + %arrayidx = getelementptr inbounds i32, i32* %20, i64 %idxprom + %22 = load i32, i32* %arrayidx, align 4 + %23 = load i32, i32* %tx, align 4 + %idxprom23 = sext i32 %23 to i64 + %arrayidx24 = getelementptr inbounds [17 x [17 x i32]], [17 x [17 x i32]]* addrspacecast ([17 x [17 x i32]] addrspace(3)* @_ZZ20needle_cuda_shared_1PiS_iiiiE4temp to [17 x [17 x i32]]*), i64 0, i64 %idxprom23 + %arrayidx25 = getelementptr inbounds [17 x i32], [17 x i32]* %arrayidx24, i64 0, i64 0 + store i32 %22, i32* %arrayidx25, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + store i32 0, i32* %ty, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %if.end + %24 = load i32, i32* %ty, align 4 + %cmp26 = icmp slt i32 %24, 16 + br i1 %cmp26, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %25 = load i32*, i32** %referrence.addr, align 8 + %26 = load i32, i32* %index, align 4 + %27 = load i32, i32* %cols.addr, align 4 + %28 = load i32, i32* %ty, align 4 + %mul27 = mul nsw i32 %27, %28 + %add28 = add nsw i32 %26, %mul27 + %idxprom29 = sext i32 %add28 to i64 + %arrayidx30 = getelementptr inbounds i32, i32* %25, i64 %idxprom29 + %29 = load i32, i32* %arrayidx30, align 4 + %30 = load i32, i32* %ty, align 4 + 
%idxprom31 = sext i32 %30 to i64 + %arrayidx32 = getelementptr inbounds [16 x [16 x i32]], [16 x [16 x i32]]* addrspacecast ([16 x [16 x i32]] addrspace(3)* @_ZZ20needle_cuda_shared_1PiS_iiiiE3ref to [16 x [16 x i32]]*), i64 0, i64 %idxprom31 + %31 = load i32, i32* %tx, align 4 + %idxprom33 = sext i32 %31 to i64 + %arrayidx34 = getelementptr inbounds [16 x i32], [16 x i32]* %arrayidx32, i64 0, i64 %idxprom33 + store i32 %29, i32* %arrayidx34, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %32 = load i32, i32* %ty, align 4 + %inc = add nsw i32 %32, 1 + store i32 %inc, i32* %ty, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + call void @llvm.nvvm.barrier0() + %33 = load i32*, i32** %matrix_cuda.addr, align 8 + %34 = load i32, i32* %index_w, align 4 + %35 = load i32, i32* %cols.addr, align 4 + %36 = load i32, i32* %tx, align 4 + %mul35 = mul nsw i32 %35, %36 + %add36 = add nsw i32 %34, %mul35 + %idxprom37 = sext i32 %add36 to i64 + %arrayidx38 = getelementptr inbounds i32, i32* %33, i64 %idxprom37 + %37 = load i32, i32* %arrayidx38, align 4 + %38 = load i32, i32* %tx, align 4 + %add39 = add nsw i32 %38, 1 + %idxprom40 = sext i32 %add39 to i64 + %arrayidx41 = getelementptr inbounds [17 x [17 x i32]], [17 x [17 x i32]]* addrspacecast ([17 x [17 x i32]] addrspace(3)* @_ZZ20needle_cuda_shared_1PiS_iiiiE4temp to [17 x [17 x i32]]*), i64 0, i64 %idxprom40 + %arrayidx42 = getelementptr inbounds [17 x i32], [17 x i32]* %arrayidx41, i64 0, i64 0 + store i32 %37, i32* %arrayidx42, align 4 + call void @llvm.nvvm.barrier0() + %39 = load i32*, i32** %matrix_cuda.addr, align 8 + %40 = load i32, i32* %index_n, align 4 + %idxprom43 = sext i32 %40 to i64 + %arrayidx44 = getelementptr inbounds i32, i32* %39, i64 %idxprom43 + %41 = load i32, i32* %arrayidx44, align 4 + %42 = load i32, i32* %tx, align 4 + %add45 = add nsw i32 %42, 1 + %idxprom46 = sext i32 %add45 to i64 + %arrayidx47 = getelementptr inbounds [17 x i32], [17 x i32]* getelementptr inbounds 
([17 x [17 x i32]], [17 x [17 x i32]]* addrspacecast ([17 x [17 x i32]] addrspace(3)* @_ZZ20needle_cuda_shared_1PiS_iiiiE4temp to [17 x [17 x i32]]*), i64 0, i64 0), i64 0, i64 %idxprom46 + store i32 %41, i32* %arrayidx47, align 4 + call void @llvm.nvvm.barrier0() + store i32 0, i32* %m, align 4 + br label %for.cond48 + +for.cond48: ; preds = %for.inc87, %for.end + %43 = load i32, i32* %m, align 4 + %cmp49 = icmp slt i32 %43, 16 + br i1 %cmp49, label %for.body50, label %for.end89 + +for.body50: ; preds = %for.cond48 + %44 = load i32, i32* %tx, align 4 + %45 = load i32, i32* %m, align 4 + %cmp51 = icmp sle i32 %44, %45 + br i1 %cmp51, label %if.then52, label %if.end86 + +if.then52: ; preds = %for.body50 + %46 = load i32, i32* %tx, align 4 + %add53 = add nsw i32 %46, 1 + store i32 %add53, i32* %t_index_x, align 4 + %47 = load i32, i32* %m, align 4 + %48 = load i32, i32* %tx, align 4 + %sub54 = sub nsw i32 %47, %48 + %add55 = add nsw i32 %sub54, 1 + store i32 %add55, i32* %t_index_y, align 4 + %49 = load i32, i32* %t_index_y, align 4 + %sub56 = sub nsw i32 %49, 1 + %idxprom57 = sext i32 %sub56 to i64 + %arrayidx58 = getelementptr inbounds [17 x [17 x i32]], [17 x [17 x i32]]* addrspacecast ([17 x [17 x i32]] addrspace(3)* @_ZZ20needle_cuda_shared_1PiS_iiiiE4temp to [17 x [17 x i32]]*), i64 0, i64 %idxprom57 + %50 = load i32, i32* %t_index_x, align 4 + %sub59 = sub nsw i32 %50, 1 + %idxprom60 = sext i32 %sub59 to i64 + %arrayidx61 = getelementptr inbounds [17 x i32], [17 x i32]* %arrayidx58, i64 0, i64 %idxprom60 + %51 = load i32, i32* %arrayidx61, align 4 + %52 = load i32, i32* %t_index_y, align 4 + %sub62 = sub nsw i32 %52, 1 + %idxprom63 = sext i32 %sub62 to i64 + %arrayidx64 = getelementptr inbounds [16 x [16 x i32]], [16 x [16 x i32]]* addrspacecast ([16 x [16 x i32]] addrspace(3)* @_ZZ20needle_cuda_shared_1PiS_iiiiE3ref to [16 x [16 x i32]]*), i64 0, i64 %idxprom63 + %53 = load i32, i32* %t_index_x, align 4 + %sub65 = sub nsw i32 %53, 1 + %idxprom66 = sext i32 
%sub65 to i64 + %arrayidx67 = getelementptr inbounds [16 x i32], [16 x i32]* %arrayidx64, i64 0, i64 %idxprom66 + %54 = load i32, i32* %arrayidx67, align 4 + %add68 = add nsw i32 %51, %54 + %55 = load i32, i32* %t_index_y, align 4 + %idxprom69 = sext i32 %55 to i64 + %arrayidx70 = getelementptr inbounds [17 x [17 x i32]], [17 x [17 x i32]]* addrspacecast ([17 x [17 x i32]] addrspace(3)* @_ZZ20needle_cuda_shared_1PiS_iiiiE4temp to [17 x [17 x i32]]*), i64 0, i64 %idxprom69 + %56 = load i32, i32* %t_index_x, align 4 + %sub71 = sub nsw i32 %56, 1 + %idxprom72 = sext i32 %sub71 to i64 + %arrayidx73 = getelementptr inbounds [17 x i32], [17 x i32]* %arrayidx70, i64 0, i64 %idxprom72 + %57 = load i32, i32* %arrayidx73, align 4 + %58 = load i32, i32* %penalty.addr, align 4 + %sub74 = sub nsw i32 %57, %58 + %59 = load i32, i32* %t_index_y, align 4 + %sub75 = sub nsw i32 %59, 1 + %idxprom76 = sext i32 %sub75 to i64 + %arrayidx77 = getelementptr inbounds [17 x [17 x i32]], [17 x [17 x i32]]* addrspacecast ([17 x [17 x i32]] addrspace(3)* @_ZZ20needle_cuda_shared_1PiS_iiiiE4temp to [17 x [17 x i32]]*), i64 0, i64 %idxprom76 + %60 = load i32, i32* %t_index_x, align 4 + %idxprom78 = sext i32 %60 to i64 + %arrayidx79 = getelementptr inbounds [17 x i32], [17 x i32]* %arrayidx77, i64 0, i64 %idxprom78 + %61 = load i32, i32* %arrayidx79, align 4 + %62 = load i32, i32* %penalty.addr, align 4 + %sub80 = sub nsw i32 %61, %62 + %call81 = call i32 @_Z14maximum_deviceiii(i32 %add68, i32 %sub74, i32 %sub80) #2 + %63 = load i32, i32* %t_index_y, align 4 + %idxprom82 = sext i32 %63 to i64 + %arrayidx83 = getelementptr inbounds [17 x [17 x i32]], [17 x [17 x i32]]* addrspacecast ([17 x [17 x i32]] addrspace(3)* @_ZZ20needle_cuda_shared_1PiS_iiiiE4temp to [17 x [17 x i32]]*), i64 0, i64 %idxprom82 + %64 = load i32, i32* %t_index_x, align 4 + %idxprom84 = sext i32 %64 to i64 + %arrayidx85 = getelementptr inbounds [17 x i32], [17 x i32]* %arrayidx83, i64 0, i64 %idxprom84 + store i32 %call81, 
i32* %arrayidx85, align 4 + br label %if.end86 + +if.end86: ; preds = %if.then52, %for.body50 + call void @llvm.nvvm.barrier0() + br label %for.inc87 + +for.inc87: ; preds = %if.end86 + %65 = load i32, i32* %m, align 4 + %inc88 = add nsw i32 %65, 1 + store i32 %inc88, i32* %m, align 4 + br label %for.cond48 + +for.end89: ; preds = %for.cond48 + store i32 14, i32* %m90, align 4 + br label %for.cond91 + +for.cond91: ; preds = %for.inc132, %for.end89 + %66 = load i32, i32* %m90, align 4 + %cmp92 = icmp sge i32 %66, 0 + br i1 %cmp92, label %for.body93, label %for.end133 + +for.body93: ; preds = %for.cond91 + %67 = load i32, i32* %tx, align 4 + %68 = load i32, i32* %m90, align 4 + %cmp94 = icmp sle i32 %67, %68 + br i1 %cmp94, label %if.then95, label %if.end131 + +if.then95: ; preds = %for.body93 + %69 = load i32, i32* %tx, align 4 + %add97 = add nsw i32 %69, 16 + %70 = load i32, i32* %m90, align 4 + %sub98 = sub nsw i32 %add97, %70 + store i32 %sub98, i32* %t_index_x96, align 4 + %71 = load i32, i32* %tx, align 4 + %sub100 = sub nsw i32 16, %71 + store i32 %sub100, i32* %t_index_y99, align 4 + %72 = load i32, i32* %t_index_y99, align 4 + %sub101 = sub nsw i32 %72, 1 + %idxprom102 = sext i32 %sub101 to i64 + %arrayidx103 = getelementptr inbounds [17 x [17 x i32]], [17 x [17 x i32]]* addrspacecast ([17 x [17 x i32]] addrspace(3)* @_ZZ20needle_cuda_shared_1PiS_iiiiE4temp to [17 x [17 x i32]]*), i64 0, i64 %idxprom102 + %73 = load i32, i32* %t_index_x96, align 4 + %sub104 = sub nsw i32 %73, 1 + %idxprom105 = sext i32 %sub104 to i64 + %arrayidx106 = getelementptr inbounds [17 x i32], [17 x i32]* %arrayidx103, i64 0, i64 %idxprom105 + %74 = load i32, i32* %arrayidx106, align 4 + %75 = load i32, i32* %t_index_y99, align 4 + %sub107 = sub nsw i32 %75, 1 + %idxprom108 = sext i32 %sub107 to i64 + %arrayidx109 = getelementptr inbounds [16 x [16 x i32]], [16 x [16 x i32]]* addrspacecast ([16 x [16 x i32]] addrspace(3)* @_ZZ20needle_cuda_shared_1PiS_iiiiE3ref to [16 x [16 x 
i32]]*), i64 0, i64 %idxprom108 + %76 = load i32, i32* %t_index_x96, align 4 + %sub110 = sub nsw i32 %76, 1 + %idxprom111 = sext i32 %sub110 to i64 + %arrayidx112 = getelementptr inbounds [16 x i32], [16 x i32]* %arrayidx109, i64 0, i64 %idxprom111 + %77 = load i32, i32* %arrayidx112, align 4 + %add113 = add nsw i32 %74, %77 + %78 = load i32, i32* %t_index_y99, align 4 + %idxprom114 = sext i32 %78 to i64 + %arrayidx115 = getelementptr inbounds [17 x [17 x i32]], [17 x [17 x i32]]* addrspacecast ([17 x [17 x i32]] addrspace(3)* @_ZZ20needle_cuda_shared_1PiS_iiiiE4temp to [17 x [17 x i32]]*), i64 0, i64 %idxprom114 + %79 = load i32, i32* %t_index_x96, align 4 + %sub116 = sub nsw i32 %79, 1 + %idxprom117 = sext i32 %sub116 to i64 + %arrayidx118 = getelementptr inbounds [17 x i32], [17 x i32]* %arrayidx115, i64 0, i64 %idxprom117 + %80 = load i32, i32* %arrayidx118, align 4 + %81 = load i32, i32* %penalty.addr, align 4 + %sub119 = sub nsw i32 %80, %81 + %82 = load i32, i32* %t_index_y99, align 4 + %sub120 = sub nsw i32 %82, 1 + %idxprom121 = sext i32 %sub120 to i64 + %arrayidx122 = getelementptr inbounds [17 x [17 x i32]], [17 x [17 x i32]]* addrspacecast ([17 x [17 x i32]] addrspace(3)* @_ZZ20needle_cuda_shared_1PiS_iiiiE4temp to [17 x [17 x i32]]*), i64 0, i64 %idxprom121 + %83 = load i32, i32* %t_index_x96, align 4 + %idxprom123 = sext i32 %83 to i64 + %arrayidx124 = getelementptr inbounds [17 x i32], [17 x i32]* %arrayidx122, i64 0, i64 %idxprom123 + %84 = load i32, i32* %arrayidx124, align 4 + %85 = load i32, i32* %penalty.addr, align 4 + %sub125 = sub nsw i32 %84, %85 + %call126 = call i32 @_Z14maximum_deviceiii(i32 %add113, i32 %sub119, i32 %sub125) #2 + %86 = load i32, i32* %t_index_y99, align 4 + %idxprom127 = sext i32 %86 to i64 + %arrayidx128 = getelementptr inbounds [17 x [17 x i32]], [17 x [17 x i32]]* addrspacecast ([17 x [17 x i32]] addrspace(3)* @_ZZ20needle_cuda_shared_1PiS_iiiiE4temp to [17 x [17 x i32]]*), i64 0, i64 %idxprom127 + %87 = load i32, 
i32* %t_index_x96, align 4 + %idxprom129 = sext i32 %87 to i64 + %arrayidx130 = getelementptr inbounds [17 x i32], [17 x i32]* %arrayidx128, i64 0, i64 %idxprom129 + store i32 %call126, i32* %arrayidx130, align 4 + br label %if.end131 + +if.end131: ; preds = %if.then95, %for.body93 + call void @llvm.nvvm.barrier0() + br label %for.inc132 + +for.inc132: ; preds = %if.end131 + %88 = load i32, i32* %m90, align 4 + %dec = add nsw i32 %88, -1 + store i32 %dec, i32* %m90, align 4 + br label %for.cond91 + +for.end133: ; preds = %for.cond91 + store i32 0, i32* %ty134, align 4 + br label %for.cond135 + +for.cond135: ; preds = %for.inc148, %for.end133 + %89 = load i32, i32* %ty134, align 4 + %cmp136 = icmp slt i32 %89, 16 + br i1 %cmp136, label %for.body137, label %for.end150 + +for.body137: ; preds = %for.cond135 + %90 = load i32, i32* %ty134, align 4 + %add138 = add nsw i32 %90, 1 + %idxprom139 = sext i32 %add138 to i64 + %arrayidx140 = getelementptr inbounds [17 x [17 x i32]], [17 x [17 x i32]]* addrspacecast ([17 x [17 x i32]] addrspace(3)* @_ZZ20needle_cuda_shared_1PiS_iiiiE4temp to [17 x [17 x i32]]*), i64 0, i64 %idxprom139 + %91 = load i32, i32* %tx, align 4 + %add141 = add nsw i32 %91, 1 + %idxprom142 = sext i32 %add141 to i64 + %arrayidx143 = getelementptr inbounds [17 x i32], [17 x i32]* %arrayidx140, i64 0, i64 %idxprom142 + %92 = load i32, i32* %arrayidx143, align 4 + %93 = load i32*, i32** %matrix_cuda.addr, align 8 + %94 = load i32, i32* %index, align 4 + %95 = load i32, i32* %ty134, align 4 + %96 = load i32, i32* %cols.addr, align 4 + %mul144 = mul nsw i32 %95, %96 + %add145 = add nsw i32 %94, %mul144 + %idxprom146 = sext i32 %add145 to i64 + %arrayidx147 = getelementptr inbounds i32, i32* %93, i64 %idxprom146 + store i32 %92, i32* %arrayidx147, align 4 + br label %for.inc148 + +for.inc148: ; preds = %for.body137 + %97 = load i32, i32* %ty134, align 4 + %inc149 = add nsw i32 %97, 1 + store i32 %inc149, i32* %ty134, align 4 + br label %for.cond135 + 
+for.end150: ; preds = %for.cond135 + ret void +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() + ret i32 %0 +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + ret i32 %0 +} + +; Function Attrs: convergent nounwind +declare void @llvm.nvvm.barrier0() #2 + +; Function Attrs: convergent noinline nounwind optnone +define dso_local void @_Z20needle_cuda_shared_2PiS_iiii(i32* %referrence, i32* %matrix_cuda, i32 %cols, i32 %penalty, i32 %i, i32 %block_width) #0 { +entry: + %referrence.addr = alloca i32*, align 8 + %matrix_cuda.addr = alloca i32*, align 8 + %cols.addr = alloca i32, align 4 + %penalty.addr = alloca i32, align 4 + %i.addr = alloca i32, align 4 + %block_width.addr = alloca i32, align 4 + %bx = alloca i32, align 4 + %tx = alloca i32, align 4 + %b_index_x = alloca i32, align 4 + %b_index_y = alloca i32, align 4 + %index = alloca i32, align 4 + %index_n = alloca i32, align 4 + %index_w = alloca i32, align 4 + %index_nw = alloca i32, align 4 + %ty = alloca i32, align 4 + %m = alloca i32, align 4 + %t_index_x = alloca i32, align 4 + %t_index_y = alloca i32, align 4 + %m92 = alloca i32, align 4 + %t_index_x98 = alloca i32, align 4 + %t_index_y101 = alloca i32, align 4 + %ty136 = alloca i32, align 4 + store i32* %referrence, i32** %referrence.addr, align 8 + store i32* %matrix_cuda, i32** %matrix_cuda.addr, align 8 + store i32 %cols, i32* %cols.addr, align 4 + store i32 %penalty, i32* %penalty.addr, align 4 + store i32 %i, i32* %i.addr, align 4 + store i32 %block_width, i32* %block_width.addr, align 4 + %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2 + store i32 %call, i32* %bx, align 4 + 
%call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + store i32 %call1, i32* %tx, align 4 + %0 = load i32, i32* %bx, align 4 + %1 = load i32, i32* %block_width.addr, align 4 + %add = add nsw i32 %0, %1 + %2 = load i32, i32* %i.addr, align 4 + %sub = sub nsw i32 %add, %2 + store i32 %sub, i32* %b_index_x, align 4 + %3 = load i32, i32* %block_width.addr, align 4 + %4 = load i32, i32* %bx, align 4 + %sub2 = sub nsw i32 %3, %4 + %sub3 = sub nsw i32 %sub2, 1 + store i32 %sub3, i32* %b_index_y, align 4 + %5 = load i32, i32* %cols.addr, align 4 + %mul = mul nsw i32 %5, 16 + %6 = load i32, i32* %b_index_y, align 4 + %mul4 = mul nsw i32 %mul, %6 + %7 = load i32, i32* %b_index_x, align 4 + %mul5 = mul nsw i32 16, %7 + %add6 = add nsw i32 %mul4, %mul5 + %8 = load i32, i32* %tx, align 4 + %add7 = add nsw i32 %add6, %8 + %9 = load i32, i32* %cols.addr, align 4 + %add8 = add nsw i32 %9, 1 + %add9 = add nsw i32 %add7, %add8 + store i32 %add9, i32* %index, align 4 + %10 = load i32, i32* %cols.addr, align 4 + %mul10 = mul nsw i32 %10, 16 + %11 = load i32, i32* %b_index_y, align 4 + %mul11 = mul nsw i32 %mul10, %11 + %12 = load i32, i32* %b_index_x, align 4 + %mul12 = mul nsw i32 16, %12 + %add13 = add nsw i32 %mul11, %mul12 + %13 = load i32, i32* %tx, align 4 + %add14 = add nsw i32 %add13, %13 + %add15 = add nsw i32 %add14, 1 + store i32 %add15, i32* %index_n, align 4 + %14 = load i32, i32* %cols.addr, align 4 + %mul16 = mul nsw i32 %14, 16 + %15 = load i32, i32* %b_index_y, align 4 + %mul17 = mul nsw i32 %mul16, %15 + %16 = load i32, i32* %b_index_x, align 4 + %mul18 = mul nsw i32 16, %16 + %add19 = add nsw i32 %mul17, %mul18 + %17 = load i32, i32* %cols.addr, align 4 + %add20 = add nsw i32 %add19, %17 + store i32 %add20, i32* %index_w, align 4 + %18 = load i32, i32* %cols.addr, align 4 + %mul21 = mul nsw i32 %18, 16 + %19 = load i32, i32* %b_index_y, align 4 + %mul22 = mul nsw i32 %mul21, %19 + %20 = load i32, i32* %b_index_x, align 4 + %mul23 = mul nsw 
i32 16, %20 + %add24 = add nsw i32 %mul22, %mul23 + store i32 %add24, i32* %index_nw, align 4 + store i32 0, i32* %ty, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %21 = load i32, i32* %ty, align 4 + %cmp = icmp slt i32 %21, 16 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %22 = load i32*, i32** %referrence.addr, align 8 + %23 = load i32, i32* %index, align 4 + %24 = load i32, i32* %cols.addr, align 4 + %25 = load i32, i32* %ty, align 4 + %mul25 = mul nsw i32 %24, %25 + %add26 = add nsw i32 %23, %mul25 + %idxprom = sext i32 %add26 to i64 + %arrayidx = getelementptr inbounds i32, i32* %22, i64 %idxprom + %26 = load i32, i32* %arrayidx, align 4 + %27 = load i32, i32* %ty, align 4 + %idxprom27 = sext i32 %27 to i64 + %arrayidx28 = getelementptr inbounds [16 x [16 x i32]], [16 x [16 x i32]]* addrspacecast ([16 x [16 x i32]] addrspace(3)* @_ZZ20needle_cuda_shared_2PiS_iiiiE3ref to [16 x [16 x i32]]*), i64 0, i64 %idxprom27 + %28 = load i32, i32* %tx, align 4 + %idxprom29 = sext i32 %28 to i64 + %arrayidx30 = getelementptr inbounds [16 x i32], [16 x i32]* %arrayidx28, i64 0, i64 %idxprom29 + store i32 %26, i32* %arrayidx30, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %29 = load i32, i32* %ty, align 4 + %inc = add nsw i32 %29, 1 + store i32 %inc, i32* %ty, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + call void @llvm.nvvm.barrier0() + %30 = load i32, i32* %tx, align 4 + %cmp31 = icmp eq i32 %30, 0 + br i1 %cmp31, label %if.then, label %if.end + +if.then: ; preds = %for.end + %31 = load i32*, i32** %matrix_cuda.addr, align 8 + %32 = load i32, i32* %index_nw, align 4 + %idxprom32 = sext i32 %32 to i64 + %arrayidx33 = getelementptr inbounds i32, i32* %31, i64 %idxprom32 + %33 = load i32, i32* %arrayidx33, align 4 + %34 = load i32, i32* %tx, align 4 + %idxprom34 = sext i32 %34 to i64 + %arrayidx35 = getelementptr inbounds [17 x [17 x i32]], [17 x [17 x i32]]* addrspacecast ([17 x 
[17 x i32]] addrspace(3)* @_ZZ20needle_cuda_shared_2PiS_iiiiE4temp to [17 x [17 x i32]]*), i64 0, i64 %idxprom34 + %arrayidx36 = getelementptr inbounds [17 x i32], [17 x i32]* %arrayidx35, i64 0, i64 0 + store i32 %33, i32* %arrayidx36, align 4 + br label %if.end + +if.end: ; preds = %if.then, %for.end + %35 = load i32*, i32** %matrix_cuda.addr, align 8 + %36 = load i32, i32* %index_w, align 4 + %37 = load i32, i32* %cols.addr, align 4 + %38 = load i32, i32* %tx, align 4 + %mul37 = mul nsw i32 %37, %38 + %add38 = add nsw i32 %36, %mul37 + %idxprom39 = sext i32 %add38 to i64 + %arrayidx40 = getelementptr inbounds i32, i32* %35, i64 %idxprom39 + %39 = load i32, i32* %arrayidx40, align 4 + %40 = load i32, i32* %tx, align 4 + %add41 = add nsw i32 %40, 1 + %idxprom42 = sext i32 %add41 to i64 + %arrayidx43 = getelementptr inbounds [17 x [17 x i32]], [17 x [17 x i32]]* addrspacecast ([17 x [17 x i32]] addrspace(3)* @_ZZ20needle_cuda_shared_2PiS_iiiiE4temp to [17 x [17 x i32]]*), i64 0, i64 %idxprom42 + %arrayidx44 = getelementptr inbounds [17 x i32], [17 x i32]* %arrayidx43, i64 0, i64 0 + store i32 %39, i32* %arrayidx44, align 4 + call void @llvm.nvvm.barrier0() + %41 = load i32*, i32** %matrix_cuda.addr, align 8 + %42 = load i32, i32* %index_n, align 4 + %idxprom45 = sext i32 %42 to i64 + %arrayidx46 = getelementptr inbounds i32, i32* %41, i64 %idxprom45 + %43 = load i32, i32* %arrayidx46, align 4 + %44 = load i32, i32* %tx, align 4 + %add47 = add nsw i32 %44, 1 + %idxprom48 = sext i32 %add47 to i64 + %arrayidx49 = getelementptr inbounds [17 x i32], [17 x i32]* getelementptr inbounds ([17 x [17 x i32]], [17 x [17 x i32]]* addrspacecast ([17 x [17 x i32]] addrspace(3)* @_ZZ20needle_cuda_shared_2PiS_iiiiE4temp to [17 x [17 x i32]]*), i64 0, i64 0), i64 0, i64 %idxprom48 + store i32 %43, i32* %arrayidx49, align 4 + call void @llvm.nvvm.barrier0() + store i32 0, i32* %m, align 4 + br label %for.cond50 + +for.cond50: ; preds = %for.inc89, %if.end + %45 = load i32, i32* %m, 
align 4 + %cmp51 = icmp slt i32 %45, 16 + br i1 %cmp51, label %for.body52, label %for.end91 + +for.body52: ; preds = %for.cond50 + %46 = load i32, i32* %tx, align 4 + %47 = load i32, i32* %m, align 4 + %cmp53 = icmp sle i32 %46, %47 + br i1 %cmp53, label %if.then54, label %if.end88 + +if.then54: ; preds = %for.body52 + %48 = load i32, i32* %tx, align 4 + %add55 = add nsw i32 %48, 1 + store i32 %add55, i32* %t_index_x, align 4 + %49 = load i32, i32* %m, align 4 + %50 = load i32, i32* %tx, align 4 + %sub56 = sub nsw i32 %49, %50 + %add57 = add nsw i32 %sub56, 1 + store i32 %add57, i32* %t_index_y, align 4 + %51 = load i32, i32* %t_index_y, align 4 + %sub58 = sub nsw i32 %51, 1 + %idxprom59 = sext i32 %sub58 to i64 + %arrayidx60 = getelementptr inbounds [17 x [17 x i32]], [17 x [17 x i32]]* addrspacecast ([17 x [17 x i32]] addrspace(3)* @_ZZ20needle_cuda_shared_2PiS_iiiiE4temp to [17 x [17 x i32]]*), i64 0, i64 %idxprom59 + %52 = load i32, i32* %t_index_x, align 4 + %sub61 = sub nsw i32 %52, 1 + %idxprom62 = sext i32 %sub61 to i64 + %arrayidx63 = getelementptr inbounds [17 x i32], [17 x i32]* %arrayidx60, i64 0, i64 %idxprom62 + %53 = load i32, i32* %arrayidx63, align 4 + %54 = load i32, i32* %t_index_y, align 4 + %sub64 = sub nsw i32 %54, 1 + %idxprom65 = sext i32 %sub64 to i64 + %arrayidx66 = getelementptr inbounds [16 x [16 x i32]], [16 x [16 x i32]]* addrspacecast ([16 x [16 x i32]] addrspace(3)* @_ZZ20needle_cuda_shared_2PiS_iiiiE3ref to [16 x [16 x i32]]*), i64 0, i64 %idxprom65 + %55 = load i32, i32* %t_index_x, align 4 + %sub67 = sub nsw i32 %55, 1 + %idxprom68 = sext i32 %sub67 to i64 + %arrayidx69 = getelementptr inbounds [16 x i32], [16 x i32]* %arrayidx66, i64 0, i64 %idxprom68 + %56 = load i32, i32* %arrayidx69, align 4 + %add70 = add nsw i32 %53, %56 + %57 = load i32, i32* %t_index_y, align 4 + %idxprom71 = sext i32 %57 to i64 + %arrayidx72 = getelementptr inbounds [17 x [17 x i32]], [17 x [17 x i32]]* addrspacecast ([17 x [17 x i32]] addrspace(3)* 
@_ZZ20needle_cuda_shared_2PiS_iiiiE4temp to [17 x [17 x i32]]*), i64 0, i64 %idxprom71 + %58 = load i32, i32* %t_index_x, align 4 + %sub73 = sub nsw i32 %58, 1 + %idxprom74 = sext i32 %sub73 to i64 + %arrayidx75 = getelementptr inbounds [17 x i32], [17 x i32]* %arrayidx72, i64 0, i64 %idxprom74 + %59 = load i32, i32* %arrayidx75, align 4 + %60 = load i32, i32* %penalty.addr, align 4 + %sub76 = sub nsw i32 %59, %60 + %61 = load i32, i32* %t_index_y, align 4 + %sub77 = sub nsw i32 %61, 1 + %idxprom78 = sext i32 %sub77 to i64 + %arrayidx79 = getelementptr inbounds [17 x [17 x i32]], [17 x [17 x i32]]* addrspacecast ([17 x [17 x i32]] addrspace(3)* @_ZZ20needle_cuda_shared_2PiS_iiiiE4temp to [17 x [17 x i32]]*), i64 0, i64 %idxprom78 + %62 = load i32, i32* %t_index_x, align 4 + %idxprom80 = sext i32 %62 to i64 + %arrayidx81 = getelementptr inbounds [17 x i32], [17 x i32]* %arrayidx79, i64 0, i64 %idxprom80 + %63 = load i32, i32* %arrayidx81, align 4 + %64 = load i32, i32* %penalty.addr, align 4 + %sub82 = sub nsw i32 %63, %64 + %call83 = call i32 @_Z14maximum_deviceiii(i32 %add70, i32 %sub76, i32 %sub82) #2 + %65 = load i32, i32* %t_index_y, align 4 + %idxprom84 = sext i32 %65 to i64 + %arrayidx85 = getelementptr inbounds [17 x [17 x i32]], [17 x [17 x i32]]* addrspacecast ([17 x [17 x i32]] addrspace(3)* @_ZZ20needle_cuda_shared_2PiS_iiiiE4temp to [17 x [17 x i32]]*), i64 0, i64 %idxprom84 + %66 = load i32, i32* %t_index_x, align 4 + %idxprom86 = sext i32 %66 to i64 + %arrayidx87 = getelementptr inbounds [17 x i32], [17 x i32]* %arrayidx85, i64 0, i64 %idxprom86 + store i32 %call83, i32* %arrayidx87, align 4 + br label %if.end88 + +if.end88: ; preds = %if.then54, %for.body52 + call void @llvm.nvvm.barrier0() + br label %for.inc89 + +for.inc89: ; preds = %if.end88 + %67 = load i32, i32* %m, align 4 + %inc90 = add nsw i32 %67, 1 + store i32 %inc90, i32* %m, align 4 + br label %for.cond50 + +for.end91: ; preds = %for.cond50 + store i32 14, i32* %m92, align 4 + br label 
%for.cond93 + +for.cond93: ; preds = %for.inc134, %for.end91 + %68 = load i32, i32* %m92, align 4 + %cmp94 = icmp sge i32 %68, 0 + br i1 %cmp94, label %for.body95, label %for.end135 + +for.body95: ; preds = %for.cond93 + %69 = load i32, i32* %tx, align 4 + %70 = load i32, i32* %m92, align 4 + %cmp96 = icmp sle i32 %69, %70 + br i1 %cmp96, label %if.then97, label %if.end133 + +if.then97: ; preds = %for.body95 + %71 = load i32, i32* %tx, align 4 + %add99 = add nsw i32 %71, 16 + %72 = load i32, i32* %m92, align 4 + %sub100 = sub nsw i32 %add99, %72 + store i32 %sub100, i32* %t_index_x98, align 4 + %73 = load i32, i32* %tx, align 4 + %sub102 = sub nsw i32 16, %73 + store i32 %sub102, i32* %t_index_y101, align 4 + %74 = load i32, i32* %t_index_y101, align 4 + %sub103 = sub nsw i32 %74, 1 + %idxprom104 = sext i32 %sub103 to i64 + %arrayidx105 = getelementptr inbounds [17 x [17 x i32]], [17 x [17 x i32]]* addrspacecast ([17 x [17 x i32]] addrspace(3)* @_ZZ20needle_cuda_shared_2PiS_iiiiE4temp to [17 x [17 x i32]]*), i64 0, i64 %idxprom104 + %75 = load i32, i32* %t_index_x98, align 4 + %sub106 = sub nsw i32 %75, 1 + %idxprom107 = sext i32 %sub106 to i64 + %arrayidx108 = getelementptr inbounds [17 x i32], [17 x i32]* %arrayidx105, i64 0, i64 %idxprom107 + %76 = load i32, i32* %arrayidx108, align 4 + %77 = load i32, i32* %t_index_y101, align 4 + %sub109 = sub nsw i32 %77, 1 + %idxprom110 = sext i32 %sub109 to i64 + %arrayidx111 = getelementptr inbounds [16 x [16 x i32]], [16 x [16 x i32]]* addrspacecast ([16 x [16 x i32]] addrspace(3)* @_ZZ20needle_cuda_shared_2PiS_iiiiE3ref to [16 x [16 x i32]]*), i64 0, i64 %idxprom110 + %78 = load i32, i32* %t_index_x98, align 4 + %sub112 = sub nsw i32 %78, 1 + %idxprom113 = sext i32 %sub112 to i64 + %arrayidx114 = getelementptr inbounds [16 x i32], [16 x i32]* %arrayidx111, i64 0, i64 %idxprom113 + %79 = load i32, i32* %arrayidx114, align 4 + %add115 = add nsw i32 %76, %79 + %80 = load i32, i32* %t_index_y101, align 4 + %idxprom116 = sext 
i32 %80 to i64 + %arrayidx117 = getelementptr inbounds [17 x [17 x i32]], [17 x [17 x i32]]* addrspacecast ([17 x [17 x i32]] addrspace(3)* @_ZZ20needle_cuda_shared_2PiS_iiiiE4temp to [17 x [17 x i32]]*), i64 0, i64 %idxprom116 + %81 = load i32, i32* %t_index_x98, align 4 + %sub118 = sub nsw i32 %81, 1 + %idxprom119 = sext i32 %sub118 to i64 + %arrayidx120 = getelementptr inbounds [17 x i32], [17 x i32]* %arrayidx117, i64 0, i64 %idxprom119 + %82 = load i32, i32* %arrayidx120, align 4 + %83 = load i32, i32* %penalty.addr, align 4 + %sub121 = sub nsw i32 %82, %83 + %84 = load i32, i32* %t_index_y101, align 4 + %sub122 = sub nsw i32 %84, 1 + %idxprom123 = sext i32 %sub122 to i64 + %arrayidx124 = getelementptr inbounds [17 x [17 x i32]], [17 x [17 x i32]]* addrspacecast ([17 x [17 x i32]] addrspace(3)* @_ZZ20needle_cuda_shared_2PiS_iiiiE4temp to [17 x [17 x i32]]*), i64 0, i64 %idxprom123 + %85 = load i32, i32* %t_index_x98, align 4 + %idxprom125 = sext i32 %85 to i64 + %arrayidx126 = getelementptr inbounds [17 x i32], [17 x i32]* %arrayidx124, i64 0, i64 %idxprom125 + %86 = load i32, i32* %arrayidx126, align 4 + %87 = load i32, i32* %penalty.addr, align 4 + %sub127 = sub nsw i32 %86, %87 + %call128 = call i32 @_Z14maximum_deviceiii(i32 %add115, i32 %sub121, i32 %sub127) #2 + %88 = load i32, i32* %t_index_y101, align 4 + %idxprom129 = sext i32 %88 to i64 + %arrayidx130 = getelementptr inbounds [17 x [17 x i32]], [17 x [17 x i32]]* addrspacecast ([17 x [17 x i32]] addrspace(3)* @_ZZ20needle_cuda_shared_2PiS_iiiiE4temp to [17 x [17 x i32]]*), i64 0, i64 %idxprom129 + %89 = load i32, i32* %t_index_x98, align 4 + %idxprom131 = sext i32 %89 to i64 + %arrayidx132 = getelementptr inbounds [17 x i32], [17 x i32]* %arrayidx130, i64 0, i64 %idxprom131 + store i32 %call128, i32* %arrayidx132, align 4 + br label %if.end133 + +if.end133: ; preds = %if.then97, %for.body95 + call void @llvm.nvvm.barrier0() + br label %for.inc134 + +for.inc134: ; preds = %if.end133 + %90 = load i32, 
i32* %m92, align 4 + %dec = add nsw i32 %90, -1 + store i32 %dec, i32* %m92, align 4 + br label %for.cond93 + +for.end135: ; preds = %for.cond93 + store i32 0, i32* %ty136, align 4 + br label %for.cond137 + +for.cond137: ; preds = %for.inc150, %for.end135 + %91 = load i32, i32* %ty136, align 4 + %cmp138 = icmp slt i32 %91, 16 + br i1 %cmp138, label %for.body139, label %for.end152 + +for.body139: ; preds = %for.cond137 + %92 = load i32, i32* %ty136, align 4 + %add140 = add nsw i32 %92, 1 + %idxprom141 = sext i32 %add140 to i64 + %arrayidx142 = getelementptr inbounds [17 x [17 x i32]], [17 x [17 x i32]]* addrspacecast ([17 x [17 x i32]] addrspace(3)* @_ZZ20needle_cuda_shared_2PiS_iiiiE4temp to [17 x [17 x i32]]*), i64 0, i64 %idxprom141 + %93 = load i32, i32* %tx, align 4 + %add143 = add nsw i32 %93, 1 + %idxprom144 = sext i32 %add143 to i64 + %arrayidx145 = getelementptr inbounds [17 x i32], [17 x i32]* %arrayidx142, i64 0, i64 %idxprom144 + %94 = load i32, i32* %arrayidx145, align 4 + %95 = load i32*, i32** %matrix_cuda.addr, align 8 + %96 = load i32, i32* %index, align 4 + %97 = load i32, i32* %ty136, align 4 + %98 = load i32, i32* %cols.addr, align 4 + %mul146 = mul nsw i32 %97, %98 + %add147 = add nsw i32 %96, %mul146 + %idxprom148 = sext i32 %add147 to i64 + %arrayidx149 = getelementptr inbounds i32, i32* %95, i64 %idxprom148 + store i32 %94, i32* %arrayidx149, align 4 + br label %for.inc150 + +for.inc150: ; preds = %for.body139 + %99 = load i32, i32* %ty136, align 4 + %inc151 = add nsw i32 %99, 1 + store i32 %inc151, i32* %ty136, align 4 + br label %for.cond137 + +for.end152: ; preds = %for.cond137 + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #3 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3 + +attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" 
"less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nounwind } +attributes #3 = { nounwind readnone } + +!llvm.module.flags = !{!0, !1, !2} +!nvvm.annotations = !{!3, !4, !5, !6, !5, !7, !7, !7, !7, !8, !8, !7} +!llvm.ident = !{!9} +!nvvmir.version = !{!10} + +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0} +!3 = !{void (i32*, i32*, i32, i32, i32, i32)* @_Z20needle_cuda_shared_1PiS_iiii, !"kernel", i32 1} +!4 = !{void (i32*, i32*, i32, i32, i32, i32)* @_Z20needle_cuda_shared_2PiS_iiii, !"kernel", i32 1} +!5 = !{null, !"align", i32 8} +!6 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!7 = !{null, !"align", i32 16} +!8 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!9 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"} +!10 = !{i32 1, i32 4} diff --git a/examples/nw/needle-host-x86_64-unknown-linux-gnu.ll b/examples/nw/needle-host-x86_64-unknown-linux-gnu.ll new file mode 100644 index 0000000..92bde07 --- /dev/null +++ b/examples/nw/needle-host-x86_64-unknown-linux-gnu.ll @@ -0,0 
+1,1218 @@ +; ModuleID = 'needle-host-x86_64-unknown-linux-gnu.bc' +source_filename = "needle.cu" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] } +%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 } +%struct.dim3 = type { i32, i32, i32 } +%struct.CUstream_st = type opaque +%struct.timeval = type { i64, i64 } +%struct.timezone = type { i32, i32 } + +$_ZN4dim3C2Ejjj = comdat any + +@blosum62 = dso_local global [24 x [24 x i32]] [[24 x i32] [i32 4, i32 -1, i32 -2, i32 -2, i32 0, i32 -1, i32 -1, i32 0, i32 -2, i32 -1, i32 -1, i32 -1, i32 -1, i32 -2, i32 -1, i32 1, i32 0, i32 -3, i32 -2, i32 0, i32 -2, i32 -1, i32 0, i32 -4], [24 x i32] [i32 -1, i32 5, i32 0, i32 -2, i32 -3, i32 1, i32 0, i32 -2, i32 0, i32 -3, i32 -2, i32 2, i32 -1, i32 -3, i32 -2, i32 -1, i32 -1, i32 -3, i32 -2, i32 -3, i32 -1, i32 0, i32 -1, i32 -4], [24 x i32] [i32 -2, i32 0, i32 6, i32 1, i32 -3, i32 0, i32 0, i32 0, i32 1, i32 -3, i32 -3, i32 0, i32 -2, i32 -3, i32 -2, i32 1, i32 0, i32 -4, i32 -2, i32 -3, i32 3, i32 0, i32 -1, i32 -4], [24 x i32] [i32 -2, i32 -2, i32 1, i32 6, i32 -3, i32 0, i32 2, i32 -1, i32 -1, i32 -3, i32 -4, i32 -1, i32 -3, i32 -3, i32 -1, i32 0, i32 -1, i32 -4, i32 -3, i32 -3, i32 4, i32 1, i32 -1, i32 -4], [24 x i32] [i32 0, i32 -3, i32 -3, i32 -3, i32 9, i32 -3, i32 -4, i32 -3, i32 -3, i32 -1, i32 -1, i32 -3, i32 -1, i32 -2, i32 -3, i32 -1, i32 -1, i32 -2, i32 -2, i32 -1, i32 -3, i32 -3, i32 -2, i32 -4], [24 x i32] [i32 -1, i32 1, i32 0, i32 0, i32 -3, i32 5, i32 2, i32 -2, i32 0, i32 -3, i32 -2, i32 1, i32 0, i32 -3, i32 -1, i32 0, i32 -1, i32 -2, i32 -1, i32 -2, i32 0, i32 3, i32 -1, i32 -4], [24 x i32] [i32 -1, i32 0, i32 0, i32 2, i32 -4, i32 2, 
i32 5, i32 -2, i32 0, i32 -3, i32 -3, i32 1, i32 -2, i32 -3, i32 -1, i32 0, i32 -1, i32 -3, i32 -2, i32 -2, i32 1, i32 4, i32 -1, i32 -4], [24 x i32] [i32 0, i32 -2, i32 0, i32 -1, i32 -3, i32 -2, i32 -2, i32 6, i32 -2, i32 -4, i32 -4, i32 -2, i32 -3, i32 -3, i32 -2, i32 0, i32 -2, i32 -2, i32 -3, i32 -3, i32 -1, i32 -2, i32 -1, i32 -4], [24 x i32] [i32 -2, i32 0, i32 1, i32 -1, i32 -3, i32 0, i32 0, i32 -2, i32 8, i32 -3, i32 -3, i32 -1, i32 -2, i32 -1, i32 -2, i32 -1, i32 -2, i32 -2, i32 2, i32 -3, i32 0, i32 0, i32 -1, i32 -4], [24 x i32] [i32 -1, i32 -3, i32 -3, i32 -3, i32 -1, i32 -3, i32 -3, i32 -4, i32 -3, i32 4, i32 2, i32 -3, i32 1, i32 0, i32 -3, i32 -2, i32 -1, i32 -3, i32 -1, i32 3, i32 -3, i32 -3, i32 -1, i32 -4], [24 x i32] [i32 -1, i32 -2, i32 -3, i32 -4, i32 -1, i32 -2, i32 -3, i32 -4, i32 -3, i32 2, i32 4, i32 -2, i32 2, i32 0, i32 -3, i32 -2, i32 -1, i32 -2, i32 -1, i32 1, i32 -4, i32 -3, i32 -1, i32 -4], [24 x i32] [i32 -1, i32 2, i32 0, i32 -1, i32 -3, i32 1, i32 1, i32 -2, i32 -1, i32 -3, i32 -2, i32 5, i32 -1, i32 -3, i32 -1, i32 0, i32 -1, i32 -3, i32 -2, i32 -2, i32 0, i32 1, i32 -1, i32 -4], [24 x i32] [i32 -1, i32 -1, i32 -2, i32 -3, i32 -1, i32 0, i32 -2, i32 -3, i32 -2, i32 1, i32 2, i32 -1, i32 5, i32 0, i32 -2, i32 -1, i32 -1, i32 -1, i32 -1, i32 1, i32 -3, i32 -1, i32 -1, i32 -4], [24 x i32] [i32 -2, i32 -3, i32 -3, i32 -3, i32 -2, i32 -3, i32 -3, i32 -3, i32 -1, i32 0, i32 0, i32 -3, i32 0, i32 6, i32 -4, i32 -2, i32 -2, i32 1, i32 3, i32 -1, i32 -3, i32 -3, i32 -1, i32 -4], [24 x i32] [i32 -1, i32 -2, i32 -2, i32 -1, i32 -3, i32 -1, i32 -1, i32 -2, i32 -2, i32 -3, i32 -3, i32 -1, i32 -2, i32 -4, i32 7, i32 -1, i32 -1, i32 -4, i32 -3, i32 -2, i32 -2, i32 -1, i32 -2, i32 -4], [24 x i32] [i32 1, i32 -1, i32 1, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 -1, i32 -2, i32 -2, i32 0, i32 -1, i32 -2, i32 -1, i32 4, i32 1, i32 -3, i32 -2, i32 -2, i32 0, i32 0, i32 0, i32 -4], [24 x i32] [i32 0, i32 -1, i32 0, i32 -1, i32 -1, i32 -1, i32 -1, i32 
-2, i32 -2, i32 -1, i32 -1, i32 -1, i32 -1, i32 -2, i32 -1, i32 1, i32 5, i32 -2, i32 -2, i32 0, i32 -1, i32 -1, i32 0, i32 -4], [24 x i32] [i32 -3, i32 -3, i32 -4, i32 -4, i32 -2, i32 -2, i32 -3, i32 -2, i32 -2, i32 -3, i32 -2, i32 -3, i32 -1, i32 1, i32 -4, i32 -3, i32 -2, i32 11, i32 2, i32 -3, i32 -4, i32 -3, i32 -2, i32 -4], [24 x i32] [i32 -2, i32 -2, i32 -2, i32 -3, i32 -2, i32 -1, i32 -2, i32 -3, i32 2, i32 -1, i32 -1, i32 -2, i32 -1, i32 3, i32 -3, i32 -2, i32 -2, i32 2, i32 7, i32 -1, i32 -3, i32 -2, i32 -1, i32 -4], [24 x i32] [i32 0, i32 -3, i32 -3, i32 -3, i32 -1, i32 -2, i32 -2, i32 -3, i32 -3, i32 3, i32 1, i32 -2, i32 1, i32 -1, i32 -2, i32 -2, i32 0, i32 -3, i32 -1, i32 4, i32 -3, i32 -2, i32 -1, i32 -4], [24 x i32] [i32 -2, i32 -1, i32 3, i32 4, i32 -3, i32 0, i32 1, i32 -1, i32 0, i32 -3, i32 -4, i32 0, i32 -3, i32 -3, i32 -2, i32 0, i32 -1, i32 -4, i32 -3, i32 -3, i32 4, i32 1, i32 -1, i32 -4], [24 x i32] [i32 -1, i32 0, i32 0, i32 1, i32 -3, i32 3, i32 4, i32 -2, i32 0, i32 -3, i32 -3, i32 1, i32 -1, i32 -3, i32 -1, i32 0, i32 -1, i32 -3, i32 -2, i32 -2, i32 1, i32 4, i32 -1, i32 -4], [24 x i32] [i32 0, i32 -1, i32 -1, i32 -1, i32 -2, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -2, i32 0, i32 0, i32 -2, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -4], [24 x i32] [i32 -4, i32 -4, i32 -4, i32 -4, i32 -4, i32 -4, i32 -4, i32 -4, i32 -4, i32 -4, i32 -4, i32 -4, i32 -4, i32 -4, i32 -4, i32 -4, i32 -4, i32 -4, i32 -4, i32 -4, i32 -4, i32 -4, i32 -4, i32 1]], align 16 +@.str = private unnamed_addr constant [25 x i8] c"WG size of kernel = %d \0A\00", align 1 +@stderr = external dso_local global %struct._IO_FILE*, align 8 +@.str.1 = private unnamed_addr constant [42 x i8] c"Usage: %s \0A\00", align 1 +@.str.2 = private unnamed_addr constant [36 x i8] c"\09 - x and y dimensions\0A\00", align 1 +@.str.3 = private unnamed_addr constant [40 x i8] c"\09 - penalty(positive integer)\0A\00", align 1 +@.str.4 = private 
unnamed_addr constant [47 x i8] c"The dimension values must be a multiple of 16\0A\00", align 1 +@.str.5 = private unnamed_addr constant [31 x i8] c"error: can not allocate memory\00", align 1 +@.str.6 = private unnamed_addr constant [24 x i8] c"Start Needleman-Wunsch\0A\00", align 1 +@.str.7 = private unnamed_addr constant [28 x i8] c"Processing top-left matrix\0A\00", align 1 +@.str.8 = private unnamed_addr constant [32 x i8] c"Processing bottom-right matrix\0A\00", align 1 +@.str.9 = private unnamed_addr constant [11 x i8] c"result.txt\00", align 1 +@.str.10 = private unnamed_addr constant [2 x i8] c"w\00", align 1 +@.str.11 = private unnamed_addr constant [28 x i8] c"print traceback value GPU:\0A\00", align 1 +@.str.12 = private unnamed_addr constant [4 x i8] c"%d \00", align 1 +@0 = private unnamed_addr constant [33 x i8] c"_Z20needle_cuda_shared_1PiS_iiii\00", align 1 +@1 = private unnamed_addr constant [33 x i8] c"_Z20needle_cuda_shared_2PiS_iiii\00", align 1 +@2 = private constant [48849 x i8] 
c"P\EDU\BA\01\00\10\00\C0\BE\00\00\00\00\00\00\02\00\01\01@\00\00\00\88\A8\00\00\00\00\00\00\00\00\00\00\00\00\00\00\07\00\01\00=\00\00\00\00\00\00\00\00\00\00\00\11\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\7FELF\02\01\013\07\00\00\00\00\00\00\00\02\00\BE\00e\00\00\00\00\00\00\00\00\00\00\00\E0\A7\00\00\00\00\00\00`\A4\00\00\00\00\00\00=\05=\00@\008\00\03\00@\00\0E\00\01\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00.text._Z20needle_cuda_shared_2PiS_iiii\00.nv.info._Z20needle_cuda_shared_2PiS_iiii\00.nv.shared._Z20needle_cuda_shared_2PiS_iiii\00.nv.global\00.nv.constant0._Z20needle_cuda_shared_2PiS_iiii\00.text._Z20needle_cuda_shared_1PiS_iiii\00.nv.info._Z20needle_cuda_shared_1PiS_iiii\00.nv.shared._Z20needle_cuda_shared_1PiS_iiii\00.nv.constant0._Z20needle_cuda_shared_1PiS_iiii\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00_Z20needle_cuda_shared_2PiS_iiii\00.text._Z20needle_cuda_shared_2PiS_iiii\00.nv.info._Z20needle_cuda_shared_2PiS_iiii\00.nv.shared._Z20needle_cuda_shared_2PiS_iiii\00.nv.global\00blockIdx\00threadIdx\00$_Z20needle_cuda_shared_2PiS_iiii$_Z14maximum_deviceiii\00$___ZZ20needle_cuda_shared_2PiS_iiiiE4temp__635\00$___ZZ20needle_cuda_shared_2PiS_iiiiE3ref__637\00.nv.constant0._Z20needle_cuda_shared_2PiS_iiii\00_param\00_Z20needle_cuda_shared_1PiS_iiii\00.text._Z20needle_cuda_shared_1PiS_iiii\00.nv.info._Z20needle_cuda_shared_1PiS_iiii\00.nv.shared._Z20needle_cuda_shared_1PiS_iiii\00$_Z20needle_cuda_shared_1PiS_iiii$_Z14maximum_deviceiii\00$___ZZ20needle_cuda_shared_1PiS_iiiiE4temp__240\00$___ZZ20needle_cuda_shared_1PiS_iiiiE3ref__242\00.nv.constant0._Z20needle_cuda_shared_1PiS_iiii\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00S\00\00\00\03\00\09\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\A4\00\00\00\03\00\0B\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\D0\00\00\00\03\00\0C\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\DB\00\00\00
\01\00\0C\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\E4\00\00\00\01\00\0C\00\01\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\85\01\00\00\03\00\07\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\DC\01\00\00\03\00\0A\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00-\02\00\00\03\00\0D\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\F0\02\00\00\03\00\08\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\002\00\00\00\12\10\09\00\00\00\00\00\00\00\00\00@M\00\00\00\00\00\00\EE\00\00\00\12\02\09\00\B0E\00\00\00\00\00\00\90\07\00\00\00\00\00\00\BB\01\00\00\12\10\0A\00\00\00\00\00\00\00\00\00@L\00\00\00\00\00\00Y\02\00\00\12\02\0A\00\D8D\00\00\00\00\00\00h\07\00\00\00\00\00\00\04/\08\00\0C\00\00\00\13\00\00\00\04#\08\00\0D\00\00\00\00\00\00\00\04\12\08\00\0D\00\00\00\00\00\00\00\04\11\08\00\0D\00\00\00\00\00\00\00\04#\08\00\0C\00\00\00\00\00\00\00\04\12\08\00\0C\00\00\00x\00\00\00\04\11\08\00\0C\00\00\00x\00\00\00\04/\08\00\0A\00\00\00\13\00\00\00\04#\08\00\0B\00\00\00\00\00\00\00\04\12\08\00\0B\00\00\00\00\00\00\00\04\11\08\00\0B\00\00\00\00\00\00\00\04#\08\00\0A\00\00\00\00\00\00\00\04\12\08\00\0A\00\00\00x\00\00\00\04\11\08\00\0A\00\00\00x\00\00\00\010\00\00\01*\00\00\04\0A\08\00\06\00\00\00@\01 \00\03\19 \00\04\17\0C\00\00\00\00\00\05\00\1C\00\00\F0\11\00\04\17\0C\00\00\00\00\00\04\00\18\00\00\F0\11\00\04\17\0C\00\00\00\00\00\03\00\14\00\00\F0\11\00\04\17\0C\00\00\00\00\00\02\00\10\00\00\F0\11\00\04\17\0C\00\00\00\00\00\01\00\08\00\00\F0!\00\04\17\0C\00\00\00\00\00\00\00\00\00\00\F0!\00\03\1B\FF\00\04\1D\04\00\88\04\00\00\04\1C\04\00\A8E\00\00\04\1E\04\00`\00\00\00\010\00\00\01*\00\00\04\0A\08\00\09\00\00\00@\01 \00\03\19 
\00\04\17\0C\00\00\00\00\00\05\00\1C\00\00\F0\11\00\04\17\0C\00\00\00\00\00\04\00\18\00\00\F0\11\00\04\17\0C\00\00\00\00\00\03\00\14\00\00\F0\11\00\04\17\0C\00\00\00\00\00\02\00\10\00\00\F0\11\00\04\17\0C\00\00\00\00\00\01\00\08\00\00\F0!\00\04\17\0C\00\00\00\00\00\00\00\00\00\00\F0!\00\03\1B\FF\00\04\1D\04\00\88\04\00\00\04\1C\04\00\D0D\00\00\04\1E\04\00`\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\
00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\EF\1F\E0\FD\03!\00\D3rd<3>;\0A\0Amov.u2\00\1B,e\00b;\0Acvta\8D\00\04%\00\13,\\\00\22ld\C8\00\02\18\00nrd2, [\CE\00\1E])\00\1F1)\00\01a0];\0Ast#\00\81[%SP+0],,\00\0A\16\00\128\16\00\222;\B6\00\01\D8\00a1, 999(\00\02g\00\00\EF\00\18[\9E\01\03M\00\AF1;\0Aret;\0A\0A}\D6\01\1A\FE\02FuncGetAttributes\E1\01\0D#\00\0E\EC\01\0F+\00\06\0F\F7\01\1B\1F1\F7\01Q\1F1\F7\01!\0E\D9\00\0F\02\02\0F\0E8\01\0F\0D\02\8DhDevice\B4\00\0E\0E\02\0E$\00\0F\0F\02\00/32,\00\0B\1F1,\00\18\1F2<\02\13\1F2<\02\1F\1D4<\02\1F2<\02\0C\1F2<\02\13\01_\00\04;\02\0F\D9\00\07\1D]4\00\1F14\00\06\0Fp\02\10\0E\9A\01\0Fq\02\12(32q\02\0B\15\00!12\16\00\09\86\02\1F3\86\02\15\1F3\86\02#2Get\CB\00\0E}\02\05\1B\00\04\DA\00\0F\1C\02\13?3[8W\04.\0F\1B\02\0D\1F3W\04\19\04\B3\01\0D\D0\00\0F\AA\01\06\0F\05\04W\F0\04OccupancyMaxActiveBZ\08\FE\03sPerMultiprocessor\9F\01\0F;\00\16\0EB\06\0FC\00%\0EJ\04\0FC\00\1E\1F2\86\00/\1F3\88\02\13O4[32\89\02\1C\1D3\89\02\1F4\89\02\0C\1F4\89\02\19\133\89\02\0F\F1\00\1E\0F\BC\04\00\0FK\00$/2]w\07\00\0FL\00$\0F\1F\05\01\0F\98\00%\0F\A7\07\1D\097\05\186M\05\04,\00\2224-\00\183\CF\03\1F2\CF\03\15\1F2\CF\03L\9FWithFlags\D8\03(\05D\00\0E\E1\03\0FL\00'\0F\EA\030\0CL\00\1F2\98\008\1F3\98\008\1F4H\04\13O5[40\EC\08.\0FH\04\0D\1F5\EC\08\1C\0F\F9\00+\1F]\9C\040\0D\9A\01\0F\A5\040\0D:\02\0F\AE\041\0D\DB\02\0F\B7\041\0D|\03\0F\C0\04I\08-\00\1F3$\0A6\7Fvisible\D9\04\0F\D1_Z14maximum_dA\09Iiii(\88\03\0F#\00\02\0E\AB\0C\0F+\00\0D\1F1+\00\17\0F\A9\0C\01\1E4\BD\03?6[2\BD\03\16xpred 
%p\06\08\02\CF\03\1F1\E6\0E\0D\1F6\BE\03\18\00i\03\0F\D9\00\0A\1E]\F1\03\0F3\00\0C\1F1{\03\00\0F3\00\0C*0]W\02\02m\02\0F\A5\0C\02\1F2\A5\0C\02\113s\00\02m\00$4,E\00\07\15\00$5,E\00\B0;\0Asetp.gt.s\1A\002p1,4\00\F2\0E%r5;\0A@%p1 bra LBB6_2;\0Abra.uni\10\0021;\0A\08\00\16:Y\00\187Y\00\0EG\03\1B7C\00\133C\00\172C\00\176\B1\00\0FC\00\00\1F6C\00\04\173C\00%8,3\00\07\F5\00%9,%\01\0C\F6\00\222,6\001%r9\F6\00\162\F6\00\1B5p\00\134p\00\174p\00)11[\00\07\B5\00\03\80\0C\1B1E\00\136E\00\185E\00\180\B6\00\0FE\00\00\1F0E\00\04\186E\00$2,4\00\0F\18\0D\11\1F2t\04\02\F0\02entry _Z20needle_~\05\E9_shared_1PiS_ie\04\00T\03\0F.\00\0D\0Ep\04\0F6\00\18\07{\04\00\E7\00\0F6\00\14\1F26\00\22\1F36\00\22\1F46\00\22\1F5\E5\08\13O7[96(\05\1D\1E8(\05\1D6\F9\08 10'\002\0A\09.\A0\00\0B\AB\05\1FZ\C5\00\0C\CFE4temp[1156]F\00(\803ref[102\F0\03\0F\C7\05\08\1F7\C7\05\18\00\F4\04\0F\83\01\15\1E]\05\06\0F>\00\17\1F4\10\06\00\0F>\00\17\0F\96\09\01\0F>\00\17\0F\80\09\01\0F}\00\18\0Fj\09\01\0F~\00\18#0]\90\01#tob\19\07\AD\0A\02G\09\01 \14\0A\1C\00\1446\09\0F;\00\03\145\98\09\0F;\00\00\116\1C\00\1F5\CA\09\02\1F6\CA\09\02\1F4\CA\09\09\04\16\00/20H\07\01\03\E0\09\1B3\16\00\02Y\00\184\DF\09\00\B3\00\7Fctaid.x\0C\0A\02\185-\00\00\C4\00\1Ft+\00\00\05\DA\06\0DH\07\1C3Q\06$40I\07\0D\EE\06/24\EE\06\01\01B\00Unot.bf\06\01\E6\06Uadd.s\BE\06\04\19\00\1B8n\00\154\C0\06\0Ec\06\0A^\07513,4\00\84;\0Amul.loa\00#4,8\00\00'\00T;\0Ashl\8E\00#5, \00\184K\00\05\03\08)0].\00#7,\1E\00\194\C0\00&8,K\00\08\19\01\151\04\01(6]1\00620,7\00\09\0B\01&21\AA\00)20\1A\00#2, \00\0BA\02\134\89\09\172y\00/23#\01\02/24#\01\06325,8\00\00'\00\08\F5\00326, \00\08#\01/27#\01\03328,\1E\00\09#\01629,K\00\178H\00/30#\01\03531,7\00\193\09\01\113\09\01\1E3\09\01\135|\0A\08\09\01\1F3\09\01\03\1F3\09\01\07335,8\00\00'\00\08\DB\00\113\BE\00\1B3\09\01\1F3\09\01\04\113\09\01\1C3\09\01639,K\00\188\1A\00640, \00\0B\F9\03\125\A1\03(40X\0B\1F1\DB\00\02/42\DB\00\06343,8\00\00'\00\08\AD\00344, 
\00\08\DB\00/45\DB\00\03346,\1E\00\09\DB\00647,K\00\1A6\C1\00\1266\04\184\1E\03(48\FB\01\01\10\0B)ne\06\0CJ8, 0\05\0C\1B7\05\0C\137\05\0C\147\05\0C\02\AC\05\08\06\0C\14l\A0\005d8,\8D\00\03\CF\00\02\87\08#9,\1E\00\132\CF\00\03\8D\06#0,L\00\00$\00\08\B9\00\01\D7\03\00!\00\09_\00(11\D0\00\04z\01\03L\00$2,#\00'68g\08 rd\B7\04\0F\EA\08\14\03\97\06\02\1D\00\05@\00\02\D3\06*13\BC\00(5,\1D\00\08^\06\00\1D\00\02\8D\01\1B9K\01\132K\01(2:`\06\110\85\01\09\CD\01\02\95\05+50@\00\133@\00\08\0A\0D551,5\00\0F\F5\0C\02\00!\00*15\F5\0C;7_6[\00\134[\00\184\E6\01'96l\0C\07\1F\06%8,\1A\05\09\18\00\1F98\06\04\180\A3\00\0A9\06\02\87\04\121\88\04\1A1\89\04\121\8A\04\121\F3\04\22137\08\00\82\01\04\12\02\1290\06*32o\02$8,\1D\00\0Aq\02499,\D7\00\01'\00\09\A5\00\123t\02=99]g\005100\9F\00\08Z\02?01,\FF\0A\14\0FZ\02\03\130\B5\02)01\C0\00E103,w\00\1B63\03*4,;\00\193\22\03)05\F3\03\08T\00%6,\22\00\0C\87\03*7,\\\00\1A6\CE\02\2207$\08+335\02\1355\02\09\E5\0E(34\EE\01\08\CE\01$5, \00\1F1\F0\02\02?135\F1\02\04\B06:\0Abar.sync,\03\06\A2\02*16\89\04\03\14\03\04\A3\05\195\89\02\1F5\AC\06\03/54W\04\01\02\1B\00#5,8\00\00'\00\08\CE\00356,i\00\00&\00\0E2\02\01S\01*56\84\01$8,\1C\00\0B\82\01$9,\CD\00\01'\00\08\9D\00\137\0B\05)9]\7F\00#8,\BA\00\0E\15\03\02\FD\08\1A5\06\05/21\06\05+\02\1E\09\04\22d2\B1\05\182!\07\135h\06\192>\02\1F62\09\04361,\1E\00\0Ft\01\00\01\C2\01\1A6\C8\03\00\F8\04\03\1C\00\0A\93\00(31\1A\01)30\1A\01\2231\1A\01\1A9\1A\01\07d\0C\1E21\06\02\A3\0A\1C6\BC\07\137@\03\187~\13\04\06\03\1E6\1C\14#3,!\00\031\06\1631\06\1C1\\\00\138\\\00\198\E2\12\09L\03\07\18\00\1F5u\00\07$7,:\00\01)\00\01y\00\177y\00\0C\06\07\139y\00\199y\00\1F6\B5\01\03\03\DD\04\01 \00\0Bv\04#72v\13\09R\0C(08\AA\00\07\C2\00\09k\0C5sub\\\0D$0,8\00\01'\00\0Ay\0D$1,$\00\0C~\00\03\AD\0E\0Az\0D\05{\0D\1B7\C7\00\00\C6\08\03 \00\1E-\7F\02\127\84\00)13\F4\03\1F7\F4\03,\127\A0\02\1D7\F4\03\127\F4\03\1E7\F4\03874,:\00\193\DC\00%4,y\01\0A\DC\00$5, 
\00\0F\DC\00\01\03\1E\00\195\\\03\127,\05\1D7\CE\06877,\89\00\196\82\00\126\F0\03:77])\01\1F8\C8\07*\127\BF\05)78\A2\00\138\DE\0A\0D\C4\07781,6\00*80\1D\00(2,$\00\0A\BF\00\03\0C\06\1C8@\01$8,\DE\00\01&\00\0DC\01\148=\02\0C\C9\01484, \00\0B\C9\01(85\C9\01*84\1D\00(6,$\00\0A\A5\00\03T\05\1A8\01\04\00 \06\04\8F\11\09?\03\03\91\0F\02\03\07>120\BD\00\137\1E\02\0A^\01\02\D6\05\1D8L\07(89\00\02\198\E4\09#22\9C\00\1B9\84\00$3,\1F\00\03\84\00F{ \0A\09]\15\00\22\03\03\16\13Ireg;&\1B\01\0B\00\1C0\8B\17\02\16\00\05\1D\18\1F84\00\00\1F14\00\02\151\B9\17\1F14\00\00\1F24\00\02\1524\00\1A34\00\03\0C\18a;\0AcallA\05\14(W\1C?, \0A\A1\1A\02R, \0A(\0A\BE\00\22, \09\00\141\09\0072\0A)R\14\03\B2\11\01\F3\10\06u\18f;\0A} \0A\09\B0\07(90\D1\04\0A\B4\0D\129\E5\02\1E9G\04(92~\02(91\02\08(93G\04\08\F7\0B$4, \00\0B\F7\0B'5,U\00)94o\07#95<\01\1C4\F8\0E\140\F9\0E\1A0~\0A\0A&\00\05\1F\0F\191\A5\19\186F\06\07\AC\03\02\A7\08\01 \00\1F1\C4\07\02?127\C5\07\04\1917\0E\106\FC\09\0B\01\15\128\08\02\1D6\CB\00\04H\0B\181:\0E\156b\09\140\94\07\14l\94\07#4,!\00\02\1F\10\174\8F\07\1C8]\00\04;\0E\1811\1B/79\09\08\02-80t\00\05\08\08#6,8\00\00'\00\01w\00\176w\00\0C\B1\0E$15w\00\08~\0C/81w\00\03\192w\00\05~\04\02\19\05\02m\05(82\A9\01\02\1A\05\00 \00+16e\01\02\BA\0A(84`\00\09w\0D\06\A5\01#86@\00\06r\00#7,\18\00\006\00\0BZ\00\02\02\02\198\FB\15\04\8F\08\1A8I\02\02\E7\04\00\1E\00\0F3\07\00\02#\12)89\E5\06/45\0E\08+\02\95\12\11\02\22\11\1F8>\11\01\02\E8\15\1A6\B0\06\1F3\B8\17,\133\8B\1E\1C3\80\02\02\88\14\01u\00\0B\80\02\026\1A\05:\00\185\AF\1D\0F\15\12\04\00d\0F\02\1E\00\0F\D7\00\00\01\DA\04\1B7\15\12\02\AF\02\1D3\AF\02\02\99\1A\05\84\00\08\B9\0C\137\AF\06839]#\13/40\FD\14\01/73\9E\17\02(74\FD\14\09\E6\14$75\9B\01\00'\00\09\DE\00#6,R\00\00&\00\0E\90\07\02\FD\00\0A\E6\14\00 
\1B\03\1C\00\0A\9C\05\02\1E\1B#d4\B0\07\0A3\19\2243\F6\1E\0De\02\04`\0B\192\85&\1F7e\02\03\00\C9\0F\02\1E\00\1F1\22\03\02/78\22\03\05?22:)%\1E\1F2)%\1A\1F2)%\22\1F2)%\22\1F2)%\22\1F2)%\22\1F2)%\22\1F2)%#\1F8)%2/40)%3\1F2)%2\1F2)%\1D\1F8)%3\1F2)%*\1F2)%*\1F2)%*\1F2)%+\1F2)%+\1F2)%\FFi\0E\13%\0A\17\0F\01D\07\137\E8$\0Ey++24\BC\17\14,4\00.10\86%\0F\18%\06.28\18%\0F\89%\00\144\0F%\07\09\0A'5,/%\0F\8B%\03\1E5)%\0DS\1F\1F7\8B%\07\148C%\02,%\07]%\149\15%\09h$\1F0h$\04\02 $\1C0h$%2,K\00.21\12%\0E\8B%&4,]%\09\0C\01&25\AA\00\1A2e\00\05\12%\0F\8B%\04\1E6)%\0D\8B%\1F8\8B%\07\149C%\02,%\07T$\140\15%\09\82$\1F1\82$\04\05)%\0A\82$\02\DB$\1E0,%\1F4\8B%\04\0B\12%\07\1A\00\05\12%\0F\8B%\04\1E6)%\0D\8B%\0F\09\01\07$39C%\02,%\06\09\01\06,%\1E4\12%\0E\B0$\142\F8$\0A\B0$%3,K\00)42\1A\00\05\0F%\00\7F\00\0F\8B%\03\0E)%\0E\8B%\1F6\8B%\07\147C%\03,%\06\AD\00#8, \00\1D4s$\0E\DB\00\02^\12+49\DB\00\02\\\12$48\A7#\0F\8B%\00\195\CA +#include +#include +#include +#include +#include + +// includes, kernels +#include "needle_kernel.cu" + +#ifdef TIMING +#include "timing.h" + +struct timeval tv; +struct timeval tv_total_start, tv_total_end; +struct timeval tv_h2d_start, tv_h2d_end; +struct timeval tv_d2h_start, tv_d2h_end; +struct timeval tv_kernel_start, tv_kernel_end; +struct timeval tv_mem_alloc_start, tv_mem_alloc_end; +struct timeval tv_close_start, tv_close_end; +float init_time = 0, mem_alloc_time = 0, h2d_time = 0, kernel_time = 0, + d2h_time = 0, close_time = 0, total_time = 0; +#endif + +//////////////////////////////////////////////////////////////////////////////// +// declaration, forward +void runTest(int argc, char **argv); + +int maximum(int a, int b, int c) { + + int k; + if (a <= b) + k = b; + else + k = a; + + if (k <= c) + return (c); + else + return (k); +} + +int blosum62[24][24] = {{4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, + -1, -2, -1, 1, 0, -3, -2, 0, -2, -1, 0, -4}, + {-1, 5, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, + -1, -3, -2, -1, -1, -3, -2, -3, -1, 0, -1, -4}, + {-2, 0, 
6, 1, -3, 0, 0, 0, 1, -3, -3, 0, + -2, -3, -2, 1, 0, -4, -2, -3, 3, 0, -1, -4}, + {-2, -2, 1, 6, -3, 0, 2, -1, -1, -3, -4, -1, + -3, -3, -1, 0, -1, -4, -3, -3, 4, 1, -1, -4}, + {0, -3, -3, -3, 9, -3, -4, -3, -3, -1, -1, -3, + -1, -2, -3, -1, -1, -2, -2, -1, -3, -3, -2, -4}, + {-1, 1, 0, 0, -3, 5, 2, -2, 0, -3, -2, 1, + 0, -3, -1, 0, -1, -2, -1, -2, 0, 3, -1, -4}, + {-1, 0, 0, 2, -4, 2, 5, -2, 0, -3, -3, 1, + -2, -3, -1, 0, -1, -3, -2, -2, 1, 4, -1, -4}, + {0, -2, 0, -1, -3, -2, -2, 6, -2, -4, -4, -2, + -3, -3, -2, 0, -2, -2, -3, -3, -1, -2, -1, -4}, + {-2, 0, 1, -1, -3, 0, 0, -2, 8, -3, -3, -1, + -2, -1, -2, -1, -2, -2, 2, -3, 0, 0, -1, -4}, + {-1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, + 1, 0, -3, -2, -1, -3, -1, 3, -3, -3, -1, -4}, + {-1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, + 2, 0, -3, -2, -1, -2, -1, 1, -4, -3, -1, -4}, + {-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 5, + -1, -3, -1, 0, -1, -3, -2, -2, 0, 1, -1, -4}, + {-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, + 5, 0, -2, -1, -1, -1, -1, 1, -3, -1, -1, -4}, + {-2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, + 0, 6, -4, -2, -2, 1, 3, -1, -3, -3, -1, -4}, + {-1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, + -2, -4, 7, -1, -1, -4, -3, -2, -2, -1, -2, -4}, + {1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, + -1, -2, -1, 4, 1, -3, -2, -2, 0, 0, 0, -4}, + {0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, + -1, -2, -1, 1, 5, -2, -2, 0, -1, -1, 0, -4}, + {-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, + -1, 1, -4, -3, -2, 11, 2, -3, -4, -3, -2, -4}, + {-2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, + -1, 3, -3, -2, -2, 2, 7, -1, -3, -2, -1, -4}, + {0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, + 1, -1, -2, -2, 0, -3, -1, 4, -3, -2, -1, -4}, + {-2, -1, 3, 4, -3, 0, 1, -1, 0, -3, -4, 0, + -3, -3, -2, 0, -1, -4, -3, -3, 4, 1, -1, -4}, + {-1, 0, 0, 1, -3, 3, 4, -2, 0, -3, -3, 1, + -1, -3, -1, 0, -1, -3, -2, -2, 1, 4, -1, -4}, + {0, -1, -1, -1, -2, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -2, 0, 0, -2, -1, -1, -1, -1, -1, -4}, + 
{-4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, + -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, 1}}; + +double gettime() { + struct timeval t; + gettimeofday(&t, NULL); + return t.tv_sec + t.tv_usec * 1e-6; +} + +//////////////////////////////////////////////////////////////////////////////// +// Program main +//////////////////////////////////////////////////////////////////////////////// +int main(int argc, char **argv) { + cudaSetDevice(0); + printf("WG size of kernel = %d \n", BLOCK_SIZE); + + runTest(argc, argv); + + return EXIT_SUCCESS; +} + +void usage(int argc, char **argv) { + fprintf(stderr, "Usage: %s \n", argv[0]); + fprintf(stderr, "\t - x and y dimensions\n"); + fprintf(stderr, "\t - penalty(positive integer)\n"); + exit(1); +} + +void runTest(int argc, char **argv) { + int max_rows, max_cols, penalty; + int *input_itemsets, *output_itemsets, *referrence; + int *matrix_cuda, *referrence_cuda; + int size; + + // the lengths of the two sequences should be able to divided by 16. + // And at current stage max_rows needs to equal max_cols + if (argc == 3) { + max_rows = atoi(argv[1]); + max_cols = atoi(argv[1]); + penalty = atoi(argv[2]); + } else { + usage(argc, argv); + } + + if (atoi(argv[1]) % 16 != 0) { + fprintf(stderr, "The dimension values must be a multiple of 16\n"); + exit(1); + } + + max_rows = max_rows + 1; + max_cols = max_cols + 1; + referrence = (int *)malloc(max_rows * max_cols * sizeof(int)); + input_itemsets = (int *)malloc(max_rows * max_cols * sizeof(int)); + output_itemsets = (int *)malloc(max_rows * max_cols * sizeof(int)); + + if (!input_itemsets) + fprintf(stderr, "error: can not allocate memory"); + + srand(7); + + for (int i = 0; i < max_cols; i++) { + for (int j = 0; j < max_rows; j++) { + input_itemsets[i * max_cols + j] = 0; + } + } + + printf("Start Needleman-Wunsch\n"); + + for (int i = 1; i < max_rows; i++) { // please define your own sequence. 
+ input_itemsets[i * max_cols] = rand() % 10 + 1; + } + for (int j = 1; j < max_cols; j++) { // please define your own sequence. + input_itemsets[j] = rand() % 10 + 1; + } + + for (int i = 1; i < max_cols; i++) { + for (int j = 1; j < max_rows; j++) { + referrence[i * max_cols + j] = + blosum62[input_itemsets[i * max_cols]][input_itemsets[j]]; + } + } + + for (int i = 1; i < max_rows; i++) + input_itemsets[i * max_cols] = -i * penalty; + for (int j = 1; j < max_cols; j++) + input_itemsets[j] = -j * penalty; + + size = max_cols * max_rows; + cudaMalloc((void **)&referrence_cuda, sizeof(int) * size); + cudaMalloc((void **)&matrix_cuda, sizeof(int) * size); + + cudaMemcpy(referrence_cuda, referrence, sizeof(int) * size, + cudaMemcpyHostToDevice); + cudaMemcpy(matrix_cuda, input_itemsets, sizeof(int) * size, + cudaMemcpyHostToDevice); + cudaDeviceSynchronize(); + + dim3 dimGrid; + dim3 dimBlock(BLOCK_SIZE, 1); + int block_width = (max_cols - 1) / BLOCK_SIZE; + +#ifdef TIMING + gettimeofday(&tv_kernel_start, NULL); +#endif + + printf("Processing top-left matrix\n"); + // process top-left matrix + for (int i = 1; i <= block_width; i++) { + dimGrid.x = i; + dimGrid.y = 1; + needle_cuda_shared_1<<>>( + referrence_cuda, matrix_cuda, max_cols, penalty, i, block_width); + cudaDeviceSynchronize(); + } + cudaMemcpy(output_itemsets, matrix_cuda, sizeof(int) * size, + cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + + printf("Processing bottom-right matrix\n"); + // process bottom-right matrix + for (int i = block_width - 1; i >= 1; i--) { + dimGrid.x = i; + dimGrid.y = 1; + needle_cuda_shared_2<<>>( + referrence_cuda, matrix_cuda, max_cols, penalty, i, block_width); + cudaDeviceSynchronize(); + } + +#ifdef TIMING + gettimeofday(&tv_kernel_end, NULL); + tvsub(&tv_kernel_end, &tv_kernel_start, &tv); + kernel_time += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0; +#endif + + cudaMemcpy(output_itemsets, matrix_cuda, sizeof(int) * size, + cudaMemcpyDeviceToHost); + + //#define 
TRACEBACK + + FILE *fpo = fopen("result.txt", "w"); + fprintf(fpo, "print traceback value GPU:\n"); + + for (int i = max_rows - 2, j = max_rows - 2; i >= 0, j >= 0;) { + int nw, n, w, traceback; + if (i == max_rows - 2 && j == max_rows - 2) + fprintf(fpo, "%d ", + output_itemsets[i * max_cols + j]); // print the first element + if (i == 0 && j == 0) + break; + if (i > 0 && j > 0) { + nw = output_itemsets[(i - 1) * max_cols + j - 1]; + w = output_itemsets[i * max_cols + j - 1]; + n = output_itemsets[(i - 1) * max_cols + j]; + } else if (i == 0) { + nw = n = LIMIT; + w = output_itemsets[i * max_cols + j - 1]; + } else if (j == 0) { + nw = w = LIMIT; + n = output_itemsets[(i - 1) * max_cols + j]; + } else { + } + + // traceback = maximum(nw, w, n); + int new_nw, new_w, new_n; + new_nw = nw + referrence[i * max_cols + j]; + new_w = w - penalty; + new_n = n - penalty; + + traceback = maximum(new_nw, new_w, new_n); + if (traceback == new_nw) + traceback = nw; + if (traceback == new_w) + traceback = w; + if (traceback == new_n) + traceback = n; + + fprintf(fpo, "%d ", traceback); + + if (traceback == nw) { + i--; + j--; + continue; + } + + else if (traceback == w) { + j--; + continue; + } + + else if (traceback == n) { + i--; + continue; + } + + else + ; + } + + fclose(fpo); + + cudaFree(referrence_cuda); + cudaFree(matrix_cuda); + + free(referrence); + free(input_itemsets); + free(output_itemsets); + +#ifdef TIMING + printf("Exec: %f\n", kernel_time); +#endif +} diff --git a/examples/nw/needle.h b/examples/nw/needle.h new file mode 100644 index 0000000..a0907b3 --- /dev/null +++ b/examples/nw/needle.h @@ -0,0 +1,10 @@ +#ifdef RD_WG_SIZE_0_0 +#define BLOCK_SIZE RD_WG_SIZE_0_0 +#elif defined(RD_WG_SIZE_0) +#define BLOCK_SIZE RD_WG_SIZE_0 +#elif defined(RD_WG_SIZE) +#define BLOCK_SIZE RD_WG_SIZE +#else +#define BLOCK_SIZE 16 +#endif +//#define TRACE diff --git a/examples/nw/needle_kernel.cu b/examples/nw/needle_kernel.cu new file mode 100644 index 0000000..d180012 --- 
/dev/null +++ b/examples/nw/needle_kernel.cu @@ -0,0 +1,165 @@ +#include "needle.h" +#include + +#define SDATA(index) CUT_BANK_CHECKER(sdata, index) + +__device__ int maximum_device(int a, int b, int c) { + + int k; + if (a <= b) + k = b; + else + k = a; + + if (k <= c) + return (c); + else + return (k); +} +__global__ void needle_cuda_shared_1(int *referrence, int *matrix_cuda, + int cols, int penalty, int i, + int block_width) { + int bx = blockIdx.x; + int tx = threadIdx.x; + + int b_index_x = bx; + int b_index_y = i - 1 - bx; + + int index = + cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + tx + (cols + 1); + int index_n = + cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + tx + (1); + int index_w = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + (cols); + int index_nw = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x; + + __shared__ int temp[BLOCK_SIZE + 1][BLOCK_SIZE + 1]; + __shared__ int ref[BLOCK_SIZE][BLOCK_SIZE]; + + if (tx == 0) + temp[tx][0] = matrix_cuda[index_nw]; + + for (int ty = 0; ty < BLOCK_SIZE; ty++) + ref[ty][tx] = referrence[index + cols * ty]; + + __syncthreads(); + + temp[tx + 1][0] = matrix_cuda[index_w + cols * tx]; + + __syncthreads(); + + temp[0][tx + 1] = matrix_cuda[index_n]; + + __syncthreads(); + + for (int m = 0; m < BLOCK_SIZE; m++) { + + if (tx <= m) { + + int t_index_x = tx + 1; + int t_index_y = m - tx + 1; + + temp[t_index_y][t_index_x] = + maximum_device(temp[t_index_y - 1][t_index_x - 1] + + ref[t_index_y - 1][t_index_x - 1], + temp[t_index_y][t_index_x - 1] - penalty, + temp[t_index_y - 1][t_index_x] - penalty); + } + + __syncthreads(); + } + + for (int m = BLOCK_SIZE - 2; m >= 0; m--) { + + if (tx <= m) { + + int t_index_x = tx + BLOCK_SIZE - m; + int t_index_y = BLOCK_SIZE - tx; + + temp[t_index_y][t_index_x] = + maximum_device(temp[t_index_y - 1][t_index_x - 1] + + ref[t_index_y - 1][t_index_x - 1], + temp[t_index_y][t_index_x - 1] - penalty, + temp[t_index_y - 1][t_index_x] - penalty); + 
} + + __syncthreads(); + } + + for (int ty = 0; ty < BLOCK_SIZE; ty++) + matrix_cuda[index + ty * cols] = temp[ty + 1][tx + 1]; +} + +__global__ void needle_cuda_shared_2(int *referrence, int *matrix_cuda, + + int cols, int penalty, int i, + int block_width) { + + int bx = blockIdx.x; + int tx = threadIdx.x; + + int b_index_x = bx + block_width - i; + int b_index_y = block_width - bx - 1; + + int index = + cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + tx + (cols + 1); + int index_n = + cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + tx + (1); + int index_w = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + (cols); + int index_nw = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x; + + __shared__ int temp[BLOCK_SIZE + 1][BLOCK_SIZE + 1]; + __shared__ int ref[BLOCK_SIZE][BLOCK_SIZE]; + + for (int ty = 0; ty < BLOCK_SIZE; ty++) + ref[ty][tx] = referrence[index + cols * ty]; + + __syncthreads(); + + if (tx == 0) + temp[tx][0] = matrix_cuda[index_nw]; + + temp[tx + 1][0] = matrix_cuda[index_w + cols * tx]; + + __syncthreads(); + + temp[0][tx + 1] = matrix_cuda[index_n]; + + __syncthreads(); + + for (int m = 0; m < BLOCK_SIZE; m++) { + + if (tx <= m) { + + int t_index_x = tx + 1; + int t_index_y = m - tx + 1; + + temp[t_index_y][t_index_x] = + maximum_device(temp[t_index_y - 1][t_index_x - 1] + + ref[t_index_y - 1][t_index_x - 1], + temp[t_index_y][t_index_x - 1] - penalty, + temp[t_index_y - 1][t_index_x] - penalty); + } + + __syncthreads(); + } + + for (int m = BLOCK_SIZE - 2; m >= 0; m--) { + + if (tx <= m) { + + int t_index_x = tx + BLOCK_SIZE - m; + int t_index_y = BLOCK_SIZE - tx; + + temp[t_index_y][t_index_x] = + maximum_device(temp[t_index_y - 1][t_index_x - 1] + + ref[t_index_y - 1][t_index_x - 1], + temp[t_index_y][t_index_x - 1] - penalty, + temp[t_index_y - 1][t_index_x] - penalty); + } + + __syncthreads(); + } + + for (int ty = 0; ty < BLOCK_SIZE; ty++) + matrix_cuda[index + ty * cols] = temp[ty + 1][tx + 1]; +} diff --git 
a/examples/nw/run.sh b/examples/nw/run.sh new file mode 100644 index 0000000..0dd3f29 --- /dev/null +++ b/examples/nw/run.sh @@ -0,0 +1,20 @@ +#!/bin/bash +set -e +llvm-as needle-cuda-nvptx64-nvidia-cuda-sm_61.ll +llvm-as needle-host-x86_64-unknown-linux-gnu.ll +../../build/compilation/kernelTranslator needle-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc +../../build/compilation/hostTranslator needle-host-x86_64-unknown-linux-gnu.bc host.bc +llc --relocation-model=pic --filetype=obj kernel.bc +llc --relocation-model=pic --filetype=obj host.bc + +g++ -Wall -L../../build/runtime \ + -L../../build/runtime/threadPool \ + -o needle -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread +export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH +./needle 16 10 +if grep -q -e "-11 -7 -5 -6 -7 -7 -4 -2 -2 2 -7 -9 -9 -7 -3 0" result.txt; then + echo "Pass" +else + echo "Error result" + exit 1 +fi diff --git a/examples/particlefilter/ex_particle_CUDA_naive_seq-cuda-nvptx64-nvidia-cuda-sm_61.ll b/examples/particlefilter/ex_particle_CUDA_naive_seq-cuda-nvptx64-nvidia-cuda-sm_61.ll new file mode 100644 index 0000000..35ef954 --- /dev/null +++ b/examples/particlefilter/ex_particle_CUDA_naive_seq-cuda-nvptx64-nvidia-cuda-sm_61.ll @@ -0,0 +1,482 @@ +; ModuleID = 'ex_particle_CUDA_naive_seq-cuda-nvptx64-nvidia-cuda-sm_61.bc' +source_filename = "ex_particle_CUDA_naive_seq.cu" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +%struct.__cuda_builtin_blockIdx_t = type { i8 } +%struct.__cuda_builtin_blockDim_t = type { i8 } +%struct.__cuda_builtin_threadIdx_t = type { i8 } +%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 } + +$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any + +$_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv = comdat any + +$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any + +@blockIdx = 
extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1 +@blockDim = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockDim_t, align 1 +@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1 + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 { +entry: + %p.addr = alloca i8**, align 8 + %s.addr = alloca i64, align 8 + store i8** %p, i8*** %p.addr, align 8 + store i64 %s, i64* %s.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 { +entry: + %p.addr = alloca %struct.cudaFuncAttributes*, align 8 + %c.addr = alloca i8*, align 8 + store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8 + store i8* %c, i8** %c.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 { +entry: + %value.addr = alloca i32*, align 8 + %attr.addr = alloca i32, align 4 + %device.addr = alloca i32, align 4 + store i32* %value, i32** %value.addr, align 8 + store i32 %attr, i32* %attr.addr, align 4 + store i32 %device, i32* %device.addr, align 4 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaGetDevice(i32* %device) #0 { +entry: + %device.addr = alloca i32*, align 8 + store i32* %device, i32** %device.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 { +entry: + %numBlocks.addr = alloca i32*, align 8 + %func.addr = alloca i8*, align 8 + %blockSize.addr = alloca i32, align 4 + %dynamicSmemSize.addr = alloca i64, align 8 + store 
i32* %numBlocks, i32** %numBlocks.addr, align 8 + store i8* %func, i8** %func.addr, align 8 + store i32 %blockSize, i32* %blockSize.addr, align 4 + store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 { +entry: + %numBlocks.addr = alloca i32*, align 8 + %func.addr = alloca i8*, align 8 + %blockSize.addr = alloca i32, align 4 + %dynamicSmemSize.addr = alloca i64, align 8 + %flags.addr = alloca i32, align 4 + store i32* %numBlocks, i32** %numBlocks.addr, align 8 + store i8* %func, i8** %func.addr, align 8 + store i32 %blockSize, i32* %blockSize.addr, align 4 + store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8 + store i32 %flags, i32* %flags.addr, align 4 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define dso_local i32 @_Z12findIndexSeqPdid(double* %CDF, i32 %lengthCDF, double %value) #0 { +entry: + %retval = alloca i32, align 4 + %CDF.addr = alloca double*, align 8 + %lengthCDF.addr = alloca i32, align 4 + %value.addr = alloca double, align 8 + %index = alloca i32, align 4 + %x = alloca i32, align 4 + store double* %CDF, double** %CDF.addr, align 8 + store i32 %lengthCDF, i32* %lengthCDF.addr, align 4 + store double %value, double* %value.addr, align 8 + store i32 -1, i32* %index, align 4 + store i32 0, i32* %x, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %x, align 4 + %1 = load i32, i32* %lengthCDF.addr, align 4 + %cmp = icmp slt i32 %0, %1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %2 = load double*, double** %CDF.addr, align 8 + %3 = load i32, i32* %x, align 4 + %idxprom = sext i32 %3 to i64 + %arrayidx = getelementptr inbounds double, double* %2, i64 %idxprom + %4 = load double, double* 
%arrayidx, align 8 + %5 = load double, double* %value.addr, align 8 + %cmp1 = fcmp oge double %4, %5 + br i1 %cmp1, label %if.then, label %if.end + +if.then: ; preds = %for.body + %6 = load i32, i32* %x, align 4 + store i32 %6, i32* %index, align 4 + br label %for.end + +if.end: ; preds = %for.body + br label %for.inc + +for.inc: ; preds = %if.end + %7 = load i32, i32* %x, align 4 + %inc = add nsw i32 %7, 1 + store i32 %inc, i32* %x, align 4 + br label %for.cond + +for.end: ; preds = %if.then, %for.cond + %8 = load i32, i32* %index, align 4 + %cmp2 = icmp eq i32 %8, -1 + br i1 %cmp2, label %if.then3, label %if.end4 + +if.then3: ; preds = %for.end + %9 = load i32, i32* %lengthCDF.addr, align 4 + %sub = sub nsw i32 %9, 1 + store i32 %sub, i32* %retval, align 4 + br label %return + +if.end4: ; preds = %for.end + %10 = load i32, i32* %index, align 4 + store i32 %10, i32* %retval, align 4 + br label %return + +return: ; preds = %if.end4, %if.then3 + %11 = load i32, i32* %retval, align 4 + ret i32 %11 +} + +; Function Attrs: convergent noinline nounwind optnone +define dso_local i32 @_Z12findIndexBinPdiid(double* %CDF, i32 %beginIndex, i32 %endIndex, double %value) #0 { +entry: + %retval = alloca i32, align 4 + %CDF.addr = alloca double*, align 8 + %beginIndex.addr = alloca i32, align 4 + %endIndex.addr = alloca i32, align 4 + %value.addr = alloca double, align 8 + %middleIndex = alloca i32, align 4 + store double* %CDF, double** %CDF.addr, align 8 + store i32 %beginIndex, i32* %beginIndex.addr, align 4 + store i32 %endIndex, i32* %endIndex.addr, align 4 + store double %value, double* %value.addr, align 8 + %0 = load i32, i32* %endIndex.addr, align 4 + %1 = load i32, i32* %beginIndex.addr, align 4 + %cmp = icmp slt i32 %0, %1 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + store i32 -1, i32* %retval, align 4 + br label %return + +if.end: ; preds = %entry + br label %while.cond + +while.cond: ; preds = %if.end34, %if.end + %2 = load i32, i32* 
%endIndex.addr, align 4 + %3 = load i32, i32* %beginIndex.addr, align 4 + %cmp1 = icmp sgt i32 %2, %3 + br i1 %cmp1, label %while.body, label %while.end35 + +while.body: ; preds = %while.cond + %4 = load i32, i32* %beginIndex.addr, align 4 + %5 = load i32, i32* %endIndex.addr, align 4 + %6 = load i32, i32* %beginIndex.addr, align 4 + %sub = sub nsw i32 %5, %6 + %div = sdiv i32 %sub, 2 + %add = add nsw i32 %4, %div + store i32 %add, i32* %middleIndex, align 4 + %7 = load double*, double** %CDF.addr, align 8 + %8 = load i32, i32* %middleIndex, align 4 + %idxprom = sext i32 %8 to i64 + %arrayidx = getelementptr inbounds double, double* %7, i64 %idxprom + %9 = load double, double* %arrayidx, align 8 + %10 = load double, double* %value.addr, align 8 + %cmp2 = fcmp oge double %9, %10 + br i1 %cmp2, label %if.then3, label %if.end26 + +if.then3: ; preds = %while.body + %11 = load i32, i32* %middleIndex, align 4 + %cmp4 = icmp eq i32 %11, 0 + br i1 %cmp4, label %if.then5, label %if.else + +if.then5: ; preds = %if.then3 + %12 = load i32, i32* %middleIndex, align 4 + store i32 %12, i32* %retval, align 4 + br label %return + +if.else: ; preds = %if.then3 + %13 = load double*, double** %CDF.addr, align 8 + %14 = load i32, i32* %middleIndex, align 4 + %sub6 = sub nsw i32 %14, 1 + %idxprom7 = sext i32 %sub6 to i64 + %arrayidx8 = getelementptr inbounds double, double* %13, i64 %idxprom7 + %15 = load double, double* %arrayidx8, align 8 + %16 = load double, double* %value.addr, align 8 + %cmp9 = fcmp olt double %15, %16 + br i1 %cmp9, label %if.then10, label %if.else11 + +if.then10: ; preds = %if.else + %17 = load i32, i32* %middleIndex, align 4 + store i32 %17, i32* %retval, align 4 + br label %return + +if.else11: ; preds = %if.else + %18 = load double*, double** %CDF.addr, align 8 + %19 = load i32, i32* %middleIndex, align 4 + %sub12 = sub nsw i32 %19, 1 + %idxprom13 = sext i32 %sub12 to i64 + %arrayidx14 = getelementptr inbounds double, double* %18, i64 %idxprom13 + %20 = load 
double, double* %arrayidx14, align 8 + %21 = load double, double* %value.addr, align 8 + %cmp15 = fcmp oeq double %20, %21 + br i1 %cmp15, label %if.then16, label %if.end23 + +if.then16: ; preds = %if.else11 + br label %while.cond17 + +while.cond17: ; preds = %while.body22, %if.then16 + %22 = load double*, double** %CDF.addr, align 8 + %23 = load i32, i32* %middleIndex, align 4 + %idxprom18 = sext i32 %23 to i64 + %arrayidx19 = getelementptr inbounds double, double* %22, i64 %idxprom18 + %24 = load double, double* %arrayidx19, align 8 + %25 = load double, double* %value.addr, align 8 + %cmp20 = fcmp oeq double %24, %25 + br i1 %cmp20, label %land.rhs, label %land.end + +land.rhs: ; preds = %while.cond17 + %26 = load i32, i32* %middleIndex, align 4 + %cmp21 = icmp sge i32 %26, 0 + br label %land.end + +land.end: ; preds = %land.rhs, %while.cond17 + %27 = phi i1 [ false, %while.cond17 ], [ %cmp21, %land.rhs ] + br i1 %27, label %while.body22, label %while.end + +while.body22: ; preds = %land.end + %28 = load i32, i32* %middleIndex, align 4 + %dec = add nsw i32 %28, -1 + store i32 %dec, i32* %middleIndex, align 4 + br label %while.cond17 + +while.end: ; preds = %land.end + %29 = load i32, i32* %middleIndex, align 4 + %inc = add nsw i32 %29, 1 + store i32 %inc, i32* %middleIndex, align 4 + %30 = load i32, i32* %middleIndex, align 4 + store i32 %30, i32* %retval, align 4 + br label %return + +if.end23: ; preds = %if.else11 + br label %if.end24 + +if.end24: ; preds = %if.end23 + br label %if.end25 + +if.end25: ; preds = %if.end24 + br label %if.end26 + +if.end26: ; preds = %if.end25, %while.body + %31 = load double*, double** %CDF.addr, align 8 + %32 = load i32, i32* %middleIndex, align 4 + %idxprom27 = sext i32 %32 to i64 + %arrayidx28 = getelementptr inbounds double, double* %31, i64 %idxprom27 + %33 = load double, double* %arrayidx28, align 8 + %34 = load double, double* %value.addr, align 8 + %cmp29 = fcmp ogt double %33, %34 + br i1 %cmp29, label %if.then30, label 
%if.else32 + +if.then30: ; preds = %if.end26 + %35 = load i32, i32* %middleIndex, align 4 + %sub31 = sub nsw i32 %35, 1 + store i32 %sub31, i32* %endIndex.addr, align 4 + br label %if.end34 + +if.else32: ; preds = %if.end26 + %36 = load i32, i32* %middleIndex, align 4 + %add33 = add nsw i32 %36, 1 + store i32 %add33, i32* %beginIndex.addr, align 4 + br label %if.end34 + +if.end34: ; preds = %if.else32, %if.then30 + br label %while.cond + +while.end35: ; preds = %while.cond + store i32 -1, i32* %retval, align 4 + br label %return + +return: ; preds = %while.end35, %while.end, %if.then10, %if.then5, %if.then + %37 = load i32, i32* %retval, align 4 + ret i32 %37 +} + +; Function Attrs: convergent noinline nounwind optnone +define dso_local void @_Z6kernelPdS_S_S_S_S_i(double* %arrayX, double* %arrayY, double* %CDF, double* %u, double* %xj, double* %yj, i32 %Nparticles) #0 { +entry: + %arrayX.addr = alloca double*, align 8 + %arrayY.addr = alloca double*, align 8 + %CDF.addr = alloca double*, align 8 + %u.addr = alloca double*, align 8 + %xj.addr = alloca double*, align 8 + %yj.addr = alloca double*, align 8 + %Nparticles.addr = alloca i32, align 4 + %block_id = alloca i32, align 4 + %i = alloca i32, align 4 + %index = alloca i32, align 4 + %x = alloca i32, align 4 + store double* %arrayX, double** %arrayX.addr, align 8 + store double* %arrayY, double** %arrayY.addr, align 8 + store double* %CDF, double** %CDF.addr, align 8 + store double* %u, double** %u.addr, align 8 + store double* %xj, double** %xj.addr, align 8 + store double* %yj, double** %yj.addr, align 8 + store i32 %Nparticles, i32* %Nparticles.addr, align 4 + %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3 + store i32 %call, i32* %block_id, align 4 + %call1 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3 + %0 = load i32, i32* %block_id, align 4 + %mul = mul i32 %call1, %0 + %call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3 + %add = 
add i32 %mul, %call2 + store i32 %add, i32* %i, align 4 + %1 = load i32, i32* %i, align 4 + %2 = load i32, i32* %Nparticles.addr, align 4 + %cmp = icmp slt i32 %1, %2 + br i1 %cmp, label %if.then, label %if.end19 + +if.then: ; preds = %entry + store i32 -1, i32* %index, align 4 + store i32 0, i32* %x, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %if.then + %3 = load i32, i32* %x, align 4 + %4 = load i32, i32* %Nparticles.addr, align 4 + %cmp3 = icmp slt i32 %3, %4 + br i1 %cmp3, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %5 = load double*, double** %CDF.addr, align 8 + %6 = load i32, i32* %x, align 4 + %idxprom = sext i32 %6 to i64 + %arrayidx = getelementptr inbounds double, double* %5, i64 %idxprom + %7 = load double, double* %arrayidx, align 8 + %8 = load double*, double** %u.addr, align 8 + %9 = load i32, i32* %i, align 4 + %idxprom4 = sext i32 %9 to i64 + %arrayidx5 = getelementptr inbounds double, double* %8, i64 %idxprom4 + %10 = load double, double* %arrayidx5, align 8 + %cmp6 = fcmp oge double %7, %10 + br i1 %cmp6, label %if.then7, label %if.end + +if.then7: ; preds = %for.body + %11 = load i32, i32* %x, align 4 + store i32 %11, i32* %index, align 4 + br label %for.end + +if.end: ; preds = %for.body + br label %for.inc + +for.inc: ; preds = %if.end + %12 = load i32, i32* %x, align 4 + %inc = add nsw i32 %12, 1 + store i32 %inc, i32* %x, align 4 + br label %for.cond + +for.end: ; preds = %if.then7, %for.cond + %13 = load i32, i32* %index, align 4 + %cmp8 = icmp eq i32 %13, -1 + br i1 %cmp8, label %if.then9, label %if.end10 + +if.then9: ; preds = %for.end + %14 = load i32, i32* %Nparticles.addr, align 4 + %sub = sub nsw i32 %14, 1 + store i32 %sub, i32* %index, align 4 + br label %if.end10 + +if.end10: ; preds = %if.then9, %for.end + %15 = load double*, double** %arrayX.addr, align 8 + %16 = load i32, i32* %index, align 4 + %idxprom11 = sext i32 %16 to i64 + %arrayidx12 = getelementptr inbounds double, double* %15, 
i64 %idxprom11 + %17 = load double, double* %arrayidx12, align 8 + %18 = load double*, double** %xj.addr, align 8 + %19 = load i32, i32* %i, align 4 + %idxprom13 = sext i32 %19 to i64 + %arrayidx14 = getelementptr inbounds double, double* %18, i64 %idxprom13 + store double %17, double* %arrayidx14, align 8 + %20 = load double*, double** %arrayY.addr, align 8 + %21 = load i32, i32* %index, align 4 + %idxprom15 = sext i32 %21 to i64 + %arrayidx16 = getelementptr inbounds double, double* %20, i64 %idxprom15 + %22 = load double, double* %arrayidx16, align 8 + %23 = load double*, double** %yj.addr, align 8 + %24 = load i32, i32* %i, align 4 + %idxprom17 = sext i32 %24 to i64 + %arrayidx18 = getelementptr inbounds double, double* %23, i64 %idxprom17 + store double %22, double* %arrayidx18, align 8 + br label %if.end19 + +if.end19: ; preds = %if.end10, %entry + ret void +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() + ret i32 %0 +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + ret i32 %0 +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + ret i32 %0 +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 + +attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" 
"frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind readnone } +attributes #3 = { convergent nounwind } + +!llvm.module.flags = !{!0, !1, !2} +!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6} +!llvm.ident = !{!8} +!nvvmir.version = !{!9} + +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0} +!3 = !{void (double*, double*, double*, double*, double*, double*, i32)* @_Z6kernelPdS_S_S_S_S_i, !"kernel", i32 1} +!4 = !{null, !"align", i32 8} +!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!6 = !{null, !"align", i32 16} +!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"} +!9 = !{i32 1, i32 4} diff --git a/examples/particlefilter/ex_particle_CUDA_naive_seq-host-x86_64-unknown-linux-gnu.ll b/examples/particlefilter/ex_particle_CUDA_naive_seq-host-x86_64-unknown-linux-gnu.ll new file mode 100644 index 0000000..a5835c5 --- /dev/null +++ 
b/examples/particlefilter/ex_particle_CUDA_naive_seq-host-x86_64-unknown-linux-gnu.ll @@ -0,0 +1,2920 @@ +; ModuleID = 'ex_particle_CUDA_naive_seq-host-x86_64-unknown-linux-gnu.bc' +source_filename = "ex_particle_CUDA_naive_seq.cu" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.timeval = type { i64, i64 } +%struct.timezone = type { i32, i32 } +%struct.dim3 = type { i32, i32, i32 } +%struct.CUstream_st = type opaque + +$_ZSt3powdi = comdat any + +$_ZSt4fabsIiEN9__gnu_cxx11__enable_ifIXsr12__is_integerIT_EE7__valueEdE6__typeES2_ = comdat any + +$_ZN4dim3C2Ejjj = comdat any + +@M = dso_local global i64 2147483647, align 8 +@A = dso_local global i32 1103515245, align 4 +@C = dso_local global i32 12345, align 4 +@.str = private unnamed_addr constant [17 x i8] c"\0ACUDA error: %s\0A\00", align 1 +@.str.1 = private unnamed_addr constant [32 x i8] c"TIME TO GET NEIGHBORS TOOK: %f\0A\00", align 1 +@.str.2 = private unnamed_addr constant [29 x i8] c"TIME TO GET WEIGHTSTOOK: %f\0A\00", align 1 +@.str.3 = private unnamed_addr constant [28 x i8] c"TIME TO SET ERROR TOOK: %f\0A\00", align 1 +@.str.4 = private unnamed_addr constant [34 x i8] c"TIME TO GET LIKELIHOODS TOOK: %f\0A\00", align 1 +@.str.5 = private unnamed_addr constant [26 x i8] c"TIME TO GET EXP TOOK: %f\0A\00", align 1 +@.str.6 = private unnamed_addr constant [30 x i8] c"TIME TO SUM WEIGHTS TOOK: %f\0A\00", align 1 +@.str.7 = private unnamed_addr constant [36 x i8] c"TIME TO NORMALIZE WEIGHTS TOOK: %f\0A\00", align 1 +@.str.8 = private unnamed_addr constant [30 x i8] c"TIME TO MOVE OBJECT TOOK: %f\0A\00", align 1 +@.str.9 = private unnamed_addr constant [9 x i8] c"XE: %lf\0A\00", align 1 +@.str.10 = private unnamed_addr constant [9 x i8] c"YE: %lf\0A\00", align 1 +@.str.11 = private unnamed_addr constant [5 x i8] c"%lf\0A\00", align 1 +@.str.12 = private unnamed_addr constant [31 x i8] c"TIME TO CALC CUM SUM TOOK: 
%f\0A\00", align 1 +@.str.13 = private unnamed_addr constant [25 x i8] c"TIME TO CALC U TOOK: %f\0A\00", align 1 +@.str.14 = private unnamed_addr constant [26 x i8] c"SENDING TO GPU TOOK: %lf\0A\00", align 1 +@.str.15 = private unnamed_addr constant [21 x i8] c"CUDA EXEC TOOK: %lf\0A\00", align 1 +@.str.16 = private unnamed_addr constant [33 x i8] c"SENDING BACK FROM GPU TOOK: %lf\0A\00", align 1 +@.str.17 = private unnamed_addr constant [41 x i8] c"TIME TO CALC NEW ARRAY X AND Y TOOK: %f\0A\00", align 1 +@.str.18 = private unnamed_addr constant [32 x i8] c"TIME TO RESET WEIGHTS TOOK: %f\0A\00", align 1 +@.str.19 = private unnamed_addr constant [8 x i8] c"output\0A\00", align 1 +@.str.20 = private unnamed_addr constant [5 x i8] c"%lf \00", align 1 +@.str.21 = private unnamed_addr constant [56 x i8] c"naive.out -x -y -z -np \00", align 1 +@.str.22 = private unnamed_addr constant [4 x i8] c"%s\0A\00", align 1 +@.str.23 = private unnamed_addr constant [3 x i8] c"-x\00", align 1 +@.str.24 = private unnamed_addr constant [3 x i8] c"-y\00", align 1 +@.str.25 = private unnamed_addr constant [3 x i8] c"-z\00", align 1 +@.str.26 = private unnamed_addr constant [4 x i8] c"-np\00", align 1 +@.str.27 = private unnamed_addr constant [3 x i8] c"%d\00", align 1 +@.str.28 = private unnamed_addr constant [31 x i8] c"ERROR: dimX input is incorrect\00", align 1 +@.str.29 = private unnamed_addr constant [18 x i8] c"dimX must be > 0\0A\00", align 1 +@.str.30 = private unnamed_addr constant [31 x i8] c"ERROR: dimY input is incorrect\00", align 1 +@.str.31 = private unnamed_addr constant [18 x i8] c"dimY must be > 0\0A\00", align 1 +@.str.32 = private unnamed_addr constant [43 x i8] c"ERROR: Number of frames input is incorrect\00", align 1 +@.str.33 = private unnamed_addr constant [30 x i8] c"number of frames must be > 0\0A\00", align 1 +@.str.34 = private unnamed_addr constant [46 x i8] c"ERROR: Number of particles input is incorrect\00", align 1 +@.str.35 = private unnamed_addr 
constant [33 x i8] c"Number of particles must be > 0\0A\00", align 1 +@.str.36 = private unnamed_addr constant [24 x i8] c"VIDEO SEQUENCE TOOK %f\0A\00", align 1 +@.str.37 = private unnamed_addr constant [25 x i8] c"PARTICLE FILTER TOOK %f\0A\00", align 1 +@.str.38 = private unnamed_addr constant [24 x i8] c"ENTIRE PROGRAM TOOK %f\0A\00", align 1 +@0 = private unnamed_addr constant [23 x i8] c"_Z6kernelPdS_S_S_S_S_i\00", align 1 +@1 = private constant [11281 x i8] c"P\EDU\BA\01\00\10\00\00,\00\00\00\00\00\00\02\00\01\01@\00\00\00\08\1F\00\00\00\00\00\00\00\00\00\00\00\00\00\00\07\00\01\00=\00\00\00\00\00\00\00\00\00\00\00\11\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\7FELF\02\01\013\07\00\00\00\00\00\00\00\02\00\BE\00e\00\00\00\00\00\00\00\00\00\00\00`\1E\00\00\00\00\00\00 \1C\00\00\00\00\00\00=\05=\00@\008\00\03\00@\00\09\00\01\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00.text._Z6kernelPdS_S_S_S_S_i\00.nv.info._Z6kernelPdS_S_S_S_S_i\00.nv.shared._Z6kernelPdS_S_S_S_S_i\00.nv.global\00.nv.constant0._Z6kernelPdS_S_S_S_S_i\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00_Z6kernelPdS_S_S_S_S_i\00.text._Z6kernelPdS_S_S_S_S_i\00.nv.info._Z6kernelPdS_S_S_S_S_i\00.nv.shared._Z6kernelPdS_S_S_S_S_i\00.nv.global\00blockIdx\00blockDim\00threadIdx\00.nv.constant0._Z6kernelPdS_S_S_S_S_i\00_param\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00I\00\00\00\03\00\07\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\A8\00\00\00\03\00\08\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\B3\00\00\00\01\00\08\00\01\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\BC\00\00\00\01\00\08\00\02\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\C5\00\00\00\01\00\08\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\CF\00\00\00\03\00\06\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\002\00\00\00\12\10\07\00\00\00\00\00\00\00\00\00\00\17\00\00\00\00\00\00\04/\08\00\07\00\00\00\13\00\00\00\04#\08\00\07\0
0\00\00\00\00\00\00\04\12\08\00\07\00\00\00H\00\00\00\04\11\08\00\07\00\00\00H\00\00\00\010\00\00\01*\00\00\04\0A\08\00\06\00\00\00@\014\00\03\194\00\04\17\0C\00\00\00\00\00\06\000\00\00\F0\11\00\04\17\0C\00\00\00\00\00\05\00(\00\00\F0!\00\04\17\0C\00\00\00\00\00\04\00 \00\00\F0!\00\04\17\0C\00\00\00\00\00\03\00\18\00\00\F0!\00\04\17\0C\00\00\00\00\00\02\00\10\00\00\F0!\00\04\17\0C\00\00\00\00\00\01\00\08\00\00\F0!\00\04\17\0C\00\00\00\00\00\00\00\00\00\00\F0!\00\03\1B\FF\00\04\1D\04\00\A8\06\00\00\04\1C\04\00\D0\16\00\00\04\1E\04\00`\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\EF\1F\E0\FD\03!\00\D3rd<3>;\0A\0Amov.u2\00\1B,e\00b;\0Acvta\8D\00\04%\00\13,\\\00\22ld\C8\00\02\18\00nrd2, [\CE\00\1E])\00\1F1)\00\01a0];\0Ast#\00\81[%SP+0],,\00\0A\16\00\128\16\00\222;\B6\00\01\D8\00a1, 
999(\00\02g\00\00\EF\00\18[\9E\01\03M\00\AF1;\0Aret;\0A\0A}\D6\01\1A\FE\02FuncGetAttributes\E1\01\0D#\00\0E\EC\01\0F+\00\06\0F\F7\01\1B\1F1\F7\01Q\1F1\F7\01!\0E\D9\00\0F\02\02\0F\0E8\01\0F\0D\02\8DhDevice\B4\00\0E\0E\02\0E$\00\0F\0F\02\00/32,\00\0B\1F1,\00\18\1F2<\02\13\1F2<\02\1F\1D4<\02\1F2<\02\0C\1F2<\02\13\01_\00\04;\02\0F\D9\00\07\1D]4\00\1F14\00\06\0Fp\02\10\0E\9A\01\0Fq\02\12(32q\02\0B\15\00!12\16\00\09\86\02\1F3\86\02\15\1F3\86\02#2Get\CB\00\0E}\02\05\1B\00\04\DA\00\0F\1C\02\13?3[8W\04.\0F\1B\02\0D\1F3W\04\19\04\B3\01\0D\D0\00\0F\AA\01\06\0F\05\04W\F0\04OccupancyMaxActiveBV\08\FE\03sPerMultiprocessor\9F\01\0F;\00\16\0EB\06\0FC\00%\0EJ\04\0FC\00\1E\1F2\86\00/\1F3\88\02\13O4[32\89\02\1C\1D3\89\02\1F4\89\02\0C\1F4\89\02\19\133\89\02\0F\F1\00\1E\0F\BC\04\00\0FK\00$/2]w\07\00\0FL\00$\0F\1F\05\01\0F\98\00%\0F\A7\07\1D\097\05\186M\05\04,\00\2224-\00\183\CF\03\1F2\CF\03\15\1F2\CF\03L\9FWithFlags\D8\03(\05D\00\0E\E1\03\0FL\00'\0F\EA\030\0CL\00\1F2\98\008\1F3\98\008\1F4H\04\13O5[40\EC\08.\0FH\04\0D\1F5\EC\08\1C\0F\F9\00+\1F]\9C\040\0D\9A\01\0F\A5\040\0D:\02\0F\AE\041\0D\DB\02\0F\B7\041\0D|\03\0F\C0\04I\08-\00\1F3$\0A6\7Fvisible\D9\04\0F\FA\05_Z12findIndexSeqPdid\B7\04\0F\22\00\01\0F\A9\0C\03\0F*\00\08/1,T\00\0E\0F\A5\0C\1B\1F6\B9\03\18xpred %p\BA\03\02\CB\03\161\12\00\10f4\00/fd\DE\03\00\1F6\DE\03\0C\1F6\DE\03\12\03`\00\00\8C\02\0F\FD\00\01\0F\C9\0C\07\0F2\00\0C\0F\F1\02\00\0Fe\00\0D/0]\B9\02\02\1F1\B9\02\08\1Af\B9\02;fd1y\07/-1\CA\02\12\1B0&\00\146\BA\02\F2\01bra.uni LBB6_1;\0A\08\00\10:\DB\00\02=\00%4,3\00\16;\16\00%5,\AD\00\B0;\0Asetp.ge.s\1B\002p1,6\00\D2%r5;\0A@%p1 
bra`\00\1B6p\00\132p\00\122p\00\06\8B\04\02\1E\01\01p\00\01U\00\02J\05\05\87\004shl&\02#4,\1E\00\823;\0Aadd.s\17\00#5,K\00\00#\00\01H\00\03\08\02\01^\00Hrd5]\15\00%3,e\01\03\CE\002ltu\1D\003p2,7\00\00(\00\01\D1\00\162\D1\00\1B4\D1\00\133\D1\00\173A\01\198\BA\00\0E\AB\01\1B8D\00\136D\00*4:\18\00\135\18\00\175\\\00\186\\\00$ad.\01\227,\1C\00\1F1\F6\01\02\1F7\F6\01\04\176Y\00%9,\A4\00\03\12\01\14n\E0\01\223, \00!-1\0E\01\163\0E\01\0C\CA\00\137Y\00\177Y\00(11:\02\06\B3\00312,\1E\00\0C\D2\02\03\0A\0E\1B2]\00\139]\00\188]\00\190\B7\00\0EE\00\1F0E\00\04\189E\00\04\0E\02/0]\A2\0E\11\0F\FE\050oBinPdi\FF\05\0A\04#\00\0F\00\06\10\0B+\00\1F1+\00\17/2,\81\00\0F\0F.\0E\1B\1F7-\06 ,13.\06\1E3.\06\1F1\DE\10\00\0F\DF\10\0D\1F70\06*\0C\01\01\1E]B\0A\0F3\00\03\0Fd\06\1A\0C\BD\01\0Fe\06\11\0C\1C\02\0Ff\06\1F\05\16\00\152\A8\0D\0F|\06\04&ldT\06\042\00\1E;-\06\0F\17\06\00\223,6\00:%r48\04*7_\DB\03\137\91\04\107\87\06\07\C5\06\1F2\1B\04\05\1C3@\00#24A\00*2:\19\00\133\18\00\08\9F\05\1F5\C9\00\02\1D6\C9\00\14l\C9\00\02\DB\06\01b\05\01\C9\00\174\C9\00\1B3\8A\00\04\89\00\1F4\10\06\00\01[\00\0Dq\05\01\87\00Usub.s\D2\04\01\1D\00\01(\067shrG\05\02\1E\00/31G\05\00\120M\05\022\00\15s\D7\04\02 
\00\0A1\00\01\CF\00\148\CB\04\0F\15\0B\00?14;\9E\07\17\1F2\9E\07\\\1D5\9E\07\165\8F\01+19\8F\01\135\8F\01\08C\07/15\EB\06\07#6,!\00\110[\00\166[\00\1B7Z\00\136Z\00\08D\07/31\8E\06\0B/31\B8\02\05\187\10\09\1A6r\01\03\B6\00\186\\\00\07G\07#7,\1E\00\12-%\1A\00s\01\04\A6\01\02\1C\00\197\A2\01#8,\1A\00\0A\A2\01#9,{\00\00#\00\08\8D\01\124\A2\01\199\A2\01\1D5\A2\01$ge\A2\01#7,7\00\00(\00\01G\01\167G\01\0C\A1\01\138\01\01\08\D5\07\1F3\D5\07\0C/30G\01\05\189G\01/10H\01\02\1F8H\01\04#9,\1E\00\0FH\01\00\03}\03\199I\01\02z\03\01\1C\00\0AK\01\02y\03\12d\94\038d129\01\126N\01)13O\01\1D7O\01\15nO\01#8,8\00\00(\00\01O\01\178\F1\02\1C6J\05\140K\05\1B0\0B\05\151e\05\191&\01\1F8\E0\03\02/19\E1\03\04\132\D6\00\1C9\F6\00421,P\00\01'\00\08\E0\00\02\8B\01\00$\00\09\F7\00/11\F8\00\08\01t\01\02;\00\01+\00\02\E0\0C\034\08\00\16\00\1A0\12\00!2,\18\00\02\82\0C\171#\01\0C\A3\05$12\09\01\172u\02-24\16\04(gt\E1\0C\2224\1B\02\0Et\00\0C`\00\04u\06813:.\00\02-\00W2;\0A@!S\0CL7_15A\00\04\CF\02\181F\06\1F2\B9\02\04329,\1E\00\0F\1A\0E\05/9;\0A\02\05\08\17\05/25`\00\04#6,\1E\00\0F_\00\04\176^\03/27\BB\03\0B/27\BB\03\05,16\B0\02\147\1A\00\1C7\1A\00\045\04,18\1A\00\149\1A\00\0A\0A\04\1F4\E4\02\03\1F5\E4\02\04\00N\05\03 \00\0B\DA\03$7,P\00\01'\00\08\CD\02\138\DA\03\197\E3\02\1E9\CB\06\05\E2\02#9,8\00\00(\00\01\B8\02\179Z\08\0D,\06\04\DA\03(20\B6\01\1F2\B6\01\04\01\84\04\1E2C\09\05\F2\09\0C\17\03\142\17\03\182\0D\10/20`\00\04\02\14\03\1D0\16\02\03g\0A/21_\00\06\0F\A4\09\05(23\FE\09\1F7\FD\09\05\0FB\02\06\182.\03\1F3\D4\0D\1A\1F3\D4\0D\03\F4\04entry 
_Z6kernelPdS_\02\00\16i\BB\0D\00e\02\0F$\00\03\0E\BC\0D\0F,\00\0E\1F1,\00\18\1F2,\00\18\1F3,\00\18\1F4,\00\18\175,\00/32,\00\0B\1F6C\0E\13O8[72C\0E\1D\1C5B\0E-20B\0E\1E5N\18\0Fw#\0D\1F8O\18\18\02\A6\0D\0F\FF\00\09\0F\DB\0D\00\1F65\00\0D\1F55\00\00\1F55\00\0D\0F\99\18\01\1F45\00\0D\1F3\CE\18\01\0F5\00\0D\0FZ\18\02\0F5\00\06\0F\E4\0E\0E\0FE\03\09\13]\8C\01#to$(\04;\00\02\94\05\04\1C\16\0A\1C\00\03\ED\0A\0F;\00\05\02\F5\0A\1F5;\00\02\03\BF\09\1F9<\00\05$11\CE\0C\0F=\00\01\04\1C\0A\0F>\00\06\143\96\18\0F>\00\01\03`\06\1F3>\00\06\03P\0D\0F>\00\03\04\BE\06\0F>\00\06\03\C6\06\0F>\00\03\023\01/17\89\19\03\1F8k\10\03\1A6\17\00\03\05\06?d14\8D\19\03*12\18\00\03K\08:d10\18\00\134w\00\1A8\09\06\154\F7#\09\1A\17\8A%ctaid.x-\00\1F5 \17\032%nt,\00\0D\CB\10\105\D5\02cmul.lo\ED\06\185\CA\10\06F\00\00h\01\03E\00\0B\8A\15\04-\10\09\88\00\05\8C\15\0D\19\10\1F5\19\10\02/48j\17\03\01\A1\01\1A9j\17+8_\EA\14\138T\11\1A8T\11.10W\07\166+\15\07\B1#\1D1C\18464]e\10\09i\00\132i\00\08N\0B\00\00\03\045\00\1E;j\15\0F\DC\00\01&2,\CE\10\0Bw\17\1B8~\0F\1384\0898_3\E7\09\1B9i\11\03\E8\09(20\8D\00\08\CC\0C$1, \00\0B\CC\0C\152\E5\0C\01'\00\09\B5\0C\03\CB\0C)2]&\11\09\B4\10\07|\00\184\E4\01\08|\00\03\C3\03\1D4|\00$6,Q\00\01'\00\0F.\11\00.26\18\11#3,\9D\00\00'\00\0Ap\13;8_5?\01\134?\01/4:a\10\00\02&\01\0F!\02\00\1B6F\00\137F\00*5:\18\00\136\18\00\08\1B\11(14^\00\07w\0F#5,\1E\00\1F1n\02\03\1F5n\02\04\09c\18%7,\AA\00\0C\D2\11#4,!\00*-1\BD\13\1B8\8C\10\138\8C\10\1A8a\18\188\B2\02\0E/\10\0FO\03\07\0C^\00\139^\00\09\9A\0C\050\0D\1A06\02\198\D1\00\076\02\03\92\02\1C86\02\004\11\03P\00\01'\00\0DO\13\00#\00\09\B2\02\0A\AD\12\07\DA\13\1F2\B2\02\04\00S\0B\03 \00\0B|\00$4,Q\00\01'\00\07L\16\00\1D\00\14]\AC\13\07|\00\1F5U\14\03\1F6\F7\00\04437, \00\0B{\00$8,P\00\01'\00\0F\B9\12\00*38\F7\00\05_\05\0As\01/40\F7\00\04\134\E8\07\0D%\04442,Q\00\01'\00\09\F7\00\2242\F7\00\1C4\96\05\140\97\05\B00:\0Aret;\0A\0A}\0A\00\00\00\00", section ".nv_fatbin", align 8 +@__cuda_fatbin_wrapper = internal constant { i32, i32, i8*, i8* } 
{ i32 1180844977, i32 1, i8* getelementptr inbounds ([11281 x i8], [11281 x i8]* @1, i64 0, i64 0), i8* null }, section ".nvFatBinSegment", align 8 +@__cuda_gpubin_handle = internal global i8** null, align 8 +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* bitcast (void (i8*)* @__cuda_module_ctor to void ()*), i8* null }] + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local i64 @_Z8get_timev() #0 { +entry: + %tv = alloca %struct.timeval, align 8 + %call = call i32 @gettimeofday(%struct.timeval* %tv, %struct.timezone* null) #10 + %tv_sec = getelementptr inbounds %struct.timeval, %struct.timeval* %tv, i32 0, i32 0 + %0 = load i64, i64* %tv_sec, align 8 + %mul = mul nsw i64 %0, 1000000 + %tv_usec = getelementptr inbounds %struct.timeval, %struct.timeval* %tv, i32 0, i32 1 + %1 = load i64, i64* %tv_usec, align 8 + %add = add nsw i64 %mul, %1 + ret i64 %add +} + +; Function Attrs: nounwind +declare dso_local i32 @gettimeofday(%struct.timeval*, %struct.timezone*) #1 + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local float @_Z12elapsed_timexx(i64 %start_time, i64 %end_time) #0 { +entry: + %start_time.addr = alloca i64, align 8 + %end_time.addr = alloca i64, align 8 + store i64 %start_time, i64* %start_time.addr, align 8 + store i64 %end_time, i64* %end_time.addr, align 8 + %0 = load i64, i64* %end_time.addr, align 8 + %1 = load i64, i64* %start_time.addr, align 8 + %sub = sub nsw i64 %0, %1 + %conv = sitofp i64 %sub to float + %div = fdiv float %conv, 1.000000e+06 + ret float %div +} + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z11check_error9cudaError(i32 %e) #2 { +entry: + %e.addr = alloca i32, align 4 + store i32 %e, i32* %e.addr, align 4 + %0 = load i32, i32* %e.addr, align 4 + %cmp = icmp ne i32 %0, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %1 = load i32, i32* %e.addr, align 4 + %call = call i8* 
@cudaGetErrorString(i32 %1) + %call1 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([17 x i8], [17 x i8]* @.str, i64 0, i64 0), i8* %call) + call void @exit(i32 1) #11 + unreachable + +if.end: ; preds = %entry + ret void +} + +declare dso_local i32 @printf(i8*, ...) #3 + +declare dso_local i8* @cudaGetErrorString(i32) #3 + +; Function Attrs: noreturn nounwind +declare dso_local void @exit(i32) #4 + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z6kernelPdS_S_S_S_S_i(double* %arrayX, double* %arrayY, double* %CDF, double* %u, double* %xj, double* %yj, i32 %Nparticles) #2 { +entry: + %arrayX.addr = alloca double*, align 8 + %arrayY.addr = alloca double*, align 8 + %CDF.addr = alloca double*, align 8 + %u.addr = alloca double*, align 8 + %xj.addr = alloca double*, align 8 + %yj.addr = alloca double*, align 8 + %Nparticles.addr = alloca i32, align 4 + %grid_dim = alloca %struct.dim3, align 8 + %block_dim = alloca %struct.dim3, align 8 + %shmem_size = alloca i64, align 8 + %stream = alloca i8*, align 8 + %grid_dim.coerce = alloca { i64, i32 }, align 8 + %block_dim.coerce = alloca { i64, i32 }, align 8 + store double* %arrayX, double** %arrayX.addr, align 8 + store double* %arrayY, double** %arrayY.addr, align 8 + store double* %CDF, double** %CDF.addr, align 8 + store double* %u, double** %u.addr, align 8 + store double* %xj, double** %xj.addr, align 8 + store double* %yj, double** %yj.addr, align 8 + store i32 %Nparticles, i32* %Nparticles.addr, align 4 + %kernel_args = alloca i8*, i64 7, align 16 + %0 = bitcast double** %arrayX.addr to i8* + %1 = getelementptr i8*, i8** %kernel_args, i32 0 + store i8* %0, i8** %1 + %2 = bitcast double** %arrayY.addr to i8* + %3 = getelementptr i8*, i8** %kernel_args, i32 1 + store i8* %2, i8** %3 + %4 = bitcast double** %CDF.addr to i8* + %5 = getelementptr i8*, i8** %kernel_args, i32 2 + store i8* %4, i8** %5 + %6 = bitcast double** %u.addr to i8* + %7 = getelementptr i8*, i8** %kernel_args, i32 3 + 
store i8* %6, i8** %7 + %8 = bitcast double** %xj.addr to i8* + %9 = getelementptr i8*, i8** %kernel_args, i32 4 + store i8* %8, i8** %9 + %10 = bitcast double** %yj.addr to i8* + %11 = getelementptr i8*, i8** %kernel_args, i32 5 + store i8* %10, i8** %11 + %12 = bitcast i32* %Nparticles.addr to i8* + %13 = getelementptr i8*, i8** %kernel_args, i32 6 + store i8* %12, i8** %13 + %14 = call i32 @__cudaPopCallConfiguration(%struct.dim3* %grid_dim, %struct.dim3* %block_dim, i64* %shmem_size, i8** %stream) + %15 = load i64, i64* %shmem_size, align 8 + %16 = load i8*, i8** %stream, align 8 + %17 = bitcast { i64, i32 }* %grid_dim.coerce to i8* + %18 = bitcast %struct.dim3* %grid_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %17, i8* align 8 %18, i64 12, i1 false) + %19 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 0 + %20 = load i64, i64* %19, align 8 + %21 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 1 + %22 = load i32, i32* %21, align 8 + %23 = bitcast { i64, i32 }* %block_dim.coerce to i8* + %24 = bitcast %struct.dim3* %block_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %23, i8* align 8 %24, i64 12, i1 false) + %25 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 0 + %26 = load i64, i64* %25, align 8 + %27 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 1 + %28 = load i32, i32* %27, align 8 + %29 = bitcast i8* %16 to %struct.CUstream_st* + %call = call i32 @cudaLaunchKernel(i8* bitcast (void (double*, double*, double*, double*, double*, double*, i32)* @_Z6kernelPdS_S_S_S_S_i to i8*), i64 %20, i32 %22, i64 %26, i32 %28, i8** %kernel_args, i64 %15, %struct.CUstream_st* %29) + br label %setup.end + +setup.end: ; preds = %entry + ret void +} + +declare dso_local i32 @__cudaPopCallConfiguration(%struct.dim3*, %struct.dim3*, i64*, i8**) + +declare dso_local i32 @cudaLaunchKernel(i8*, i64, i32, i64, i32, 
i8**, i64, %struct.CUstream_st*) + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #5 + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local double @_Z11roundDoubled(double %value) #0 { +entry: + %retval = alloca double, align 8 + %value.addr = alloca double, align 8 + %newValue = alloca i32, align 4 + store double %value, double* %value.addr, align 8 + %0 = load double, double* %value.addr, align 8 + %conv = fptosi double %0 to i32 + store i32 %conv, i32* %newValue, align 4 + %1 = load double, double* %value.addr, align 8 + %2 = load i32, i32* %newValue, align 4 + %conv1 = sitofp i32 %2 to double + %sub = fsub contract double %1, %conv1 + %cmp = fcmp olt double %sub, 5.000000e-01 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %3 = load i32, i32* %newValue, align 4 + %conv2 = sitofp i32 %3 to double + store double %conv2, double* %retval, align 8 + br label %return + +if.else: ; preds = %entry + %4 = load i32, i32* %newValue, align 4 + %inc = add nsw i32 %4, 1 + store i32 %inc, i32* %newValue, align 4 + %conv3 = sitofp i32 %4 to double + store double %conv3, double* %retval, align 8 + br label %return + +return: ; preds = %if.else, %if.then + %5 = load double, double* %retval, align 8 + ret double %5 +} + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local void @_Z5setIfiiPiS_S_S_(i32 %testValue, i32 %newValue, i32* %array3D, i32* %dimX, i32* %dimY, i32* %dimZ) #0 { +entry: + %testValue.addr = alloca i32, align 4 + %newValue.addr = alloca i32, align 4 + %array3D.addr = alloca i32*, align 8 + %dimX.addr = alloca i32*, align 8 + %dimY.addr = alloca i32*, align 8 + %dimZ.addr = alloca i32*, align 8 + %x = alloca i32, align 4 + %y = alloca i32, align 4 + %z = alloca i32, align 4 + store i32 %testValue, i32* %testValue.addr, align 4 + store i32 %newValue, i32* %newValue.addr, align 4 
+ store i32* %array3D, i32** %array3D.addr, align 8 + store i32* %dimX, i32** %dimX.addr, align 8 + store i32* %dimY, i32** %dimY.addr, align 8 + store i32* %dimZ, i32** %dimZ.addr, align 8 + store i32 0, i32* %x, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc21, %entry + %0 = load i32, i32* %x, align 4 + %1 = load i32*, i32** %dimX.addr, align 8 + %2 = load i32, i32* %1, align 4 + %cmp = icmp slt i32 %0, %2 + br i1 %cmp, label %for.body, label %for.end23 + +for.body: ; preds = %for.cond + store i32 0, i32* %y, align 4 + br label %for.cond1 + +for.cond1: ; preds = %for.inc18, %for.body + %3 = load i32, i32* %y, align 4 + %4 = load i32*, i32** %dimY.addr, align 8 + %5 = load i32, i32* %4, align 4 + %cmp2 = icmp slt i32 %3, %5 + br i1 %cmp2, label %for.body3, label %for.end20 + +for.body3: ; preds = %for.cond1 + store i32 0, i32* %z, align 4 + br label %for.cond4 + +for.cond4: ; preds = %for.inc, %for.body3 + %6 = load i32, i32* %z, align 4 + %7 = load i32*, i32** %dimZ.addr, align 8 + %8 = load i32, i32* %7, align 4 + %cmp5 = icmp slt i32 %6, %8 + br i1 %cmp5, label %for.body6, label %for.end + +for.body6: ; preds = %for.cond4 + %9 = load i32*, i32** %array3D.addr, align 8 + %10 = load i32, i32* %x, align 4 + %11 = load i32*, i32** %dimY.addr, align 8 + %12 = load i32, i32* %11, align 4 + %mul = mul nsw i32 %10, %12 + %13 = load i32*, i32** %dimZ.addr, align 8 + %14 = load i32, i32* %13, align 4 + %mul7 = mul nsw i32 %mul, %14 + %15 = load i32, i32* %y, align 4 + %16 = load i32*, i32** %dimZ.addr, align 8 + %17 = load i32, i32* %16, align 4 + %mul8 = mul nsw i32 %15, %17 + %add = add nsw i32 %mul7, %mul8 + %18 = load i32, i32* %z, align 4 + %add9 = add nsw i32 %add, %18 + %idxprom = sext i32 %add9 to i64 + %arrayidx = getelementptr inbounds i32, i32* %9, i64 %idxprom + %19 = load i32, i32* %arrayidx, align 4 + %20 = load i32, i32* %testValue.addr, align 4 + %cmp10 = icmp eq i32 %19, %20 + br i1 %cmp10, label %if.then, label %if.end + +if.then: ; preds 
= %for.body6 + %21 = load i32, i32* %newValue.addr, align 4 + %22 = load i32*, i32** %array3D.addr, align 8 + %23 = load i32, i32* %x, align 4 + %24 = load i32*, i32** %dimY.addr, align 8 + %25 = load i32, i32* %24, align 4 + %mul11 = mul nsw i32 %23, %25 + %26 = load i32*, i32** %dimZ.addr, align 8 + %27 = load i32, i32* %26, align 4 + %mul12 = mul nsw i32 %mul11, %27 + %28 = load i32, i32* %y, align 4 + %29 = load i32*, i32** %dimZ.addr, align 8 + %30 = load i32, i32* %29, align 4 + %mul13 = mul nsw i32 %28, %30 + %add14 = add nsw i32 %mul12, %mul13 + %31 = load i32, i32* %z, align 4 + %add15 = add nsw i32 %add14, %31 + %idxprom16 = sext i32 %add15 to i64 + %arrayidx17 = getelementptr inbounds i32, i32* %22, i64 %idxprom16 + store i32 %21, i32* %arrayidx17, align 4 + br label %if.end + +if.end: ; preds = %if.then, %for.body6 + br label %for.inc + +for.inc: ; preds = %if.end + %32 = load i32, i32* %z, align 4 + %inc = add nsw i32 %32, 1 + store i32 %inc, i32* %z, align 4 + br label %for.cond4 + +for.end: ; preds = %for.cond4 + br label %for.inc18 + +for.inc18: ; preds = %for.end + %33 = load i32, i32* %y, align 4 + %inc19 = add nsw i32 %33, 1 + store i32 %inc19, i32* %y, align 4 + br label %for.cond1 + +for.end20: ; preds = %for.cond1 + br label %for.inc21 + +for.inc21: ; preds = %for.end20 + %34 = load i32, i32* %x, align 4 + %inc22 = add nsw i32 %34, 1 + store i32 %inc22, i32* %x, align 4 + br label %for.cond + +for.end23: ; preds = %for.cond + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local double @_Z5randuPii(i32* %seed, i32 %index) #0 { +entry: + %seed.addr = alloca i32*, align 8 + %index.addr = alloca i32, align 4 + %num = alloca i32, align 4 + store i32* %seed, i32** %seed.addr, align 8 + store i32 %index, i32* %index.addr, align 4 + %0 = load i32, i32* @A, align 4 + %1 = load i32*, i32** %seed.addr, align 8 + %2 = load i32, i32* %index.addr, align 4 + %idxprom = sext i32 %2 to i64 + %arrayidx = getelementptr inbounds 
i32, i32* %1, i64 %idxprom + %3 = load i32, i32* %arrayidx, align 4 + %mul = mul nsw i32 %0, %3 + %4 = load i32, i32* @C, align 4 + %add = add nsw i32 %mul, %4 + store i32 %add, i32* %num, align 4 + %5 = load i32, i32* %num, align 4 + %conv = sext i32 %5 to i64 + %6 = load i64, i64* @M, align 8 + %rem = srem i64 %conv, %6 + %conv1 = trunc i64 %rem to i32 + %7 = load i32*, i32** %seed.addr, align 8 + %8 = load i32, i32* %index.addr, align 4 + %idxprom2 = sext i32 %8 to i64 + %arrayidx3 = getelementptr inbounds i32, i32* %7, i64 %idxprom2 + store i32 %conv1, i32* %arrayidx3, align 4 + %9 = load i32*, i32** %seed.addr, align 8 + %10 = load i32, i32* %index.addr, align 4 + %idxprom4 = sext i32 %10 to i64 + %arrayidx5 = getelementptr inbounds i32, i32* %9, i64 %idxprom4 + %11 = load i32, i32* %arrayidx5, align 4 + %conv6 = sitofp i32 %11 to double + %12 = load i64, i64* @M, align 8 + %conv7 = sitofp i64 %12 to double + %div = fdiv double %conv6, %conv7 + %13 = call double @llvm.fabs.f64(double %div) + ret double %13 +} + +; Function Attrs: nounwind readnone speculatable willreturn +declare double @llvm.fabs.f64(double) #6 + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local double @_Z5randnPii(i32* %seed, i32 %index) #0 { +entry: + %seed.addr = alloca i32*, align 8 + %index.addr = alloca i32, align 4 + %u = alloca double, align 8 + %v = alloca double, align 8 + %cosine = alloca double, align 8 + %rt = alloca double, align 8 + store i32* %seed, i32** %seed.addr, align 8 + store i32 %index, i32* %index.addr, align 4 + %0 = load i32*, i32** %seed.addr, align 8 + %1 = load i32, i32* %index.addr, align 4 + %call = call double @_Z5randuPii(i32* %0, i32 %1) + store double %call, double* %u, align 8 + %2 = load i32*, i32** %seed.addr, align 8 + %3 = load i32, i32* %index.addr, align 4 + %call1 = call double @_Z5randuPii(i32* %2, i32 %3) + store double %call1, double* %v, align 8 + %4 = load double, double* %v, align 8 + %mul = fmul contract double 
0x401921FB54442D18, %4 + %call2 = call double @cos(double %mul) #10 + store double %call2, double* %cosine, align 8 + %5 = load double, double* %u, align 8 + %call3 = call double @log(double %5) #10 + %mul4 = fmul contract double -2.000000e+00, %call3 + store double %mul4, double* %rt, align 8 + %6 = load double, double* %rt, align 8 + %call5 = call double @sqrt(double %6) #10 + %7 = load double, double* %cosine, align 8 + %mul6 = fmul contract double %call5, %7 + ret double %mul6 +} + +; Function Attrs: nounwind +declare dso_local double @cos(double) #1 + +; Function Attrs: nounwind +declare dso_local double @log(double) #1 + +; Function Attrs: nounwind +declare dso_local double @sqrt(double) #1 + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local void @_Z8addNoisePiS_S_S_S_(i32* %array3D, i32* %dimX, i32* %dimY, i32* %dimZ, i32* %seed) #0 { +entry: + %array3D.addr = alloca i32*, align 8 + %dimX.addr = alloca i32*, align 8 + %dimY.addr = alloca i32*, align 8 + %dimZ.addr = alloca i32*, align 8 + %seed.addr = alloca i32*, align 8 + %x = alloca i32, align 4 + %y = alloca i32, align 4 + %z = alloca i32, align 4 + store i32* %array3D, i32** %array3D.addr, align 8 + store i32* %dimX, i32** %dimX.addr, align 8 + store i32* %dimY, i32** %dimY.addr, align 8 + store i32* %dimZ, i32** %dimZ.addr, align 8 + store i32* %seed, i32** %seed.addr, align 8 + store i32 0, i32* %x, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc22, %entry + %0 = load i32, i32* %x, align 4 + %1 = load i32*, i32** %dimX.addr, align 8 + %2 = load i32, i32* %1, align 4 + %cmp = icmp slt i32 %0, %2 + br i1 %cmp, label %for.body, label %for.end24 + +for.body: ; preds = %for.cond + store i32 0, i32* %y, align 4 + br label %for.cond1 + +for.cond1: ; preds = %for.inc19, %for.body + %3 = load i32, i32* %y, align 4 + %4 = load i32*, i32** %dimY.addr, align 8 + %5 = load i32, i32* %4, align 4 + %cmp2 = icmp slt i32 %3, %5 + br i1 %cmp2, label %for.body3, label %for.end21 + 
+for.body3: ; preds = %for.cond1 + store i32 0, i32* %z, align 4 + br label %for.cond4 + +for.cond4: ; preds = %for.inc, %for.body3 + %6 = load i32, i32* %z, align 4 + %7 = load i32*, i32** %dimZ.addr, align 8 + %8 = load i32, i32* %7, align 4 + %cmp5 = icmp slt i32 %6, %8 + br i1 %cmp5, label %for.body6, label %for.end + +for.body6: ; preds = %for.cond4 + %9 = load i32*, i32** %array3D.addr, align 8 + %10 = load i32, i32* %x, align 4 + %11 = load i32*, i32** %dimY.addr, align 8 + %12 = load i32, i32* %11, align 4 + %mul = mul nsw i32 %10, %12 + %13 = load i32*, i32** %dimZ.addr, align 8 + %14 = load i32, i32* %13, align 4 + %mul7 = mul nsw i32 %mul, %14 + %15 = load i32, i32* %y, align 4 + %16 = load i32*, i32** %dimZ.addr, align 8 + %17 = load i32, i32* %16, align 4 + %mul8 = mul nsw i32 %15, %17 + %add = add nsw i32 %mul7, %mul8 + %18 = load i32, i32* %z, align 4 + %add9 = add nsw i32 %add, %18 + %idxprom = sext i32 %add9 to i64 + %arrayidx = getelementptr inbounds i32, i32* %9, i64 %idxprom + %19 = load i32, i32* %arrayidx, align 4 + %20 = load i32*, i32** %seed.addr, align 8 + %call = call double @_Z5randnPii(i32* %20, i32 0) + %mul10 = fmul contract double 5.000000e+00, %call + %conv = fptosi double %mul10 to i32 + %add11 = add nsw i32 %19, %conv + %21 = load i32*, i32** %array3D.addr, align 8 + %22 = load i32, i32* %x, align 4 + %23 = load i32*, i32** %dimY.addr, align 8 + %24 = load i32, i32* %23, align 4 + %mul12 = mul nsw i32 %22, %24 + %25 = load i32*, i32** %dimZ.addr, align 8 + %26 = load i32, i32* %25, align 4 + %mul13 = mul nsw i32 %mul12, %26 + %27 = load i32, i32* %y, align 4 + %28 = load i32*, i32** %dimZ.addr, align 8 + %29 = load i32, i32* %28, align 4 + %mul14 = mul nsw i32 %27, %29 + %add15 = add nsw i32 %mul13, %mul14 + %30 = load i32, i32* %z, align 4 + %add16 = add nsw i32 %add15, %30 + %idxprom17 = sext i32 %add16 to i64 + %arrayidx18 = getelementptr inbounds i32, i32* %21, i64 %idxprom17 + store i32 %add11, i32* %arrayidx18, align 4 + br 
label %for.inc + +for.inc: ; preds = %for.body6 + %31 = load i32, i32* %z, align 4 + %inc = add nsw i32 %31, 1 + store i32 %inc, i32* %z, align 4 + br label %for.cond4 + +for.end: ; preds = %for.cond4 + br label %for.inc19 + +for.inc19: ; preds = %for.end + %32 = load i32, i32* %y, align 4 + %inc20 = add nsw i32 %32, 1 + store i32 %inc20, i32* %y, align 4 + br label %for.cond1 + +for.end21: ; preds = %for.cond1 + br label %for.inc22 + +for.inc22: ; preds = %for.end21 + %33 = load i32, i32* %x, align 4 + %inc23 = add nsw i32 %33, 1 + store i32 %inc23, i32* %x, align 4 + br label %for.cond + +for.end24: ; preds = %for.cond + ret void +} + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z9strelDiskPii(i32* %disk, i32 %radius) #2 { +entry: + %disk.addr = alloca i32*, align 8 + %radius.addr = alloca i32, align 4 + %diameter = alloca i32, align 4 + %x = alloca i32, align 4 + %y = alloca i32, align 4 + %distance = alloca double, align 8 + store i32* %disk, i32** %disk.addr, align 8 + store i32 %radius, i32* %radius.addr, align 4 + %0 = load i32, i32* %radius.addr, align 4 + %mul = mul nsw i32 %0, 2 + %sub = sub nsw i32 %mul, 1 + store i32 %sub, i32* %diameter, align 4 + store i32 0, i32* %x, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc15, %entry + %1 = load i32, i32* %x, align 4 + %2 = load i32, i32* %diameter, align 4 + %cmp = icmp slt i32 %1, %2 + br i1 %cmp, label %for.body, label %for.end17 + +for.body: ; preds = %for.cond + store i32 0, i32* %y, align 4 + br label %for.cond1 + +for.cond1: ; preds = %for.inc, %for.body + %3 = load i32, i32* %y, align 4 + %4 = load i32, i32* %diameter, align 4 + %cmp2 = icmp slt i32 %3, %4 + br i1 %cmp2, label %for.body3, label %for.end + +for.body3: ; preds = %for.cond1 + %5 = load i32, i32* %x, align 4 + %6 = load i32, i32* %radius.addr, align 4 + %sub4 = sub nsw i32 %5, %6 + %add = add nsw i32 %sub4, 1 + %conv = sitofp i32 %add to double + %call = call double @_ZSt3powdi(double %conv, i32 2) + %7 
= load i32, i32* %y, align 4 + %8 = load i32, i32* %radius.addr, align 4 + %sub5 = sub nsw i32 %7, %8 + %add6 = add nsw i32 %sub5, 1 + %conv7 = sitofp i32 %add6 to double + %call8 = call double @_ZSt3powdi(double %conv7, i32 2) + %add9 = fadd contract double %call, %call8 + %call10 = call double @sqrt(double %add9) #10 + store double %call10, double* %distance, align 8 + %9 = load double, double* %distance, align 8 + %10 = load i32, i32* %radius.addr, align 4 + %conv11 = sitofp i32 %10 to double + %cmp12 = fcmp olt double %9, %conv11 + br i1 %cmp12, label %if.then, label %if.end + +if.then: ; preds = %for.body3 + %11 = load i32*, i32** %disk.addr, align 8 + %12 = load i32, i32* %x, align 4 + %13 = load i32, i32* %diameter, align 4 + %mul13 = mul nsw i32 %12, %13 + %14 = load i32, i32* %y, align 4 + %add14 = add nsw i32 %mul13, %14 + %idxprom = sext i32 %add14 to i64 + %arrayidx = getelementptr inbounds i32, i32* %11, i64 %idxprom + store i32 1, i32* %arrayidx, align 4 + br label %if.end + +if.end: ; preds = %if.then, %for.body3 + br label %for.inc + +for.inc: ; preds = %if.end + %15 = load i32, i32* %y, align 4 + %inc = add nsw i32 %15, 1 + store i32 %inc, i32* %y, align 4 + br label %for.cond1 + +for.end: ; preds = %for.cond1 + br label %for.inc15 + +for.inc15: ; preds = %for.end + %16 = load i32, i32* %x, align 4 + %inc16 = add nsw i32 %16, 1 + store i32 %inc16, i32* %x, align 4 + br label %for.cond + +for.end17: ; preds = %for.cond + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local double @_ZSt3powdi(double %__x, i32 %__i) #0 comdat { +entry: + %__x.addr = alloca double, align 8 + %__i.addr = alloca i32, align 4 + store double %__x, double* %__x.addr, align 8 + store i32 %__i, i32* %__i.addr, align 4 + %0 = load double, double* %__x.addr, align 8 + %1 = load i32, i32* %__i.addr, align 4 + %2 = call double @llvm.powi.f64(double %0, i32 %1) + ret double %2 +} + +; Function Attrs: noinline optnone uwtable +define 
dso_local void @_Z13dilate_matrixPiiiiiiii(i32* %matrix, i32 %posX, i32 %posY, i32 %posZ, i32 %dimX, i32 %dimY, i32 %dimZ, i32 %error) #2 { +entry: + %matrix.addr = alloca i32*, align 8 + %posX.addr = alloca i32, align 4 + %posY.addr = alloca i32, align 4 + %posZ.addr = alloca i32, align 4 + %dimX.addr = alloca i32, align 4 + %dimY.addr = alloca i32, align 4 + %dimZ.addr = alloca i32, align 4 + %error.addr = alloca i32, align 4 + %startX = alloca i32, align 4 + %startY = alloca i32, align 4 + %endX = alloca i32, align 4 + %endY = alloca i32, align 4 + %x = alloca i32, align 4 + %y = alloca i32, align 4 + %distance = alloca double, align 8 + store i32* %matrix, i32** %matrix.addr, align 8 + store i32 %posX, i32* %posX.addr, align 4 + store i32 %posY, i32* %posY.addr, align 4 + store i32 %posZ, i32* %posZ.addr, align 4 + store i32 %dimX, i32* %dimX.addr, align 4 + store i32 %dimY, i32* %dimY.addr, align 4 + store i32 %dimZ, i32* %dimZ.addr, align 4 + store i32 %error, i32* %error.addr, align 4 + %0 = load i32, i32* %posX.addr, align 4 + %1 = load i32, i32* %error.addr, align 4 + %sub = sub nsw i32 %0, %1 + store i32 %sub, i32* %startX, align 4 + br label %while.cond + +while.cond: ; preds = %while.body, %entry + %2 = load i32, i32* %startX, align 4 + %cmp = icmp slt i32 %2, 0 + br i1 %cmp, label %while.body, label %while.end + +while.body: ; preds = %while.cond + %3 = load i32, i32* %startX, align 4 + %inc = add nsw i32 %3, 1 + store i32 %inc, i32* %startX, align 4 + br label %while.cond + +while.end: ; preds = %while.cond + %4 = load i32, i32* %posY.addr, align 4 + %5 = load i32, i32* %error.addr, align 4 + %sub1 = sub nsw i32 %4, %5 + store i32 %sub1, i32* %startY, align 4 + br label %while.cond2 + +while.cond2: ; preds = %while.body4, %while.end + %6 = load i32, i32* %startY, align 4 + %cmp3 = icmp slt i32 %6, 0 + br i1 %cmp3, label %while.body4, label %while.end6 + +while.body4: ; preds = %while.cond2 + %7 = load i32, i32* %startY, align 4 + %inc5 = add nsw i32 
%7, 1 + store i32 %inc5, i32* %startY, align 4 + br label %while.cond2 + +while.end6: ; preds = %while.cond2 + %8 = load i32, i32* %posX.addr, align 4 + %9 = load i32, i32* %error.addr, align 4 + %add = add nsw i32 %8, %9 + store i32 %add, i32* %endX, align 4 + br label %while.cond7 + +while.cond7: ; preds = %while.body9, %while.end6 + %10 = load i32, i32* %endX, align 4 + %11 = load i32, i32* %dimX.addr, align 4 + %cmp8 = icmp sgt i32 %10, %11 + br i1 %cmp8, label %while.body9, label %while.end10 + +while.body9: ; preds = %while.cond7 + %12 = load i32, i32* %endX, align 4 + %dec = add nsw i32 %12, -1 + store i32 %dec, i32* %endX, align 4 + br label %while.cond7 + +while.end10: ; preds = %while.cond7 + %13 = load i32, i32* %posY.addr, align 4 + %14 = load i32, i32* %error.addr, align 4 + %add11 = add nsw i32 %13, %14 + store i32 %add11, i32* %endY, align 4 + br label %while.cond12 + +while.cond12: ; preds = %while.body14, %while.end10 + %15 = load i32, i32* %endY, align 4 + %16 = load i32, i32* %dimY.addr, align 4 + %cmp13 = icmp sgt i32 %15, %16 + br i1 %cmp13, label %while.body14, label %while.end16 + +while.body14: ; preds = %while.cond12 + %17 = load i32, i32* %endY, align 4 + %dec15 = add nsw i32 %17, -1 + store i32 %dec15, i32* %endY, align 4 + br label %while.cond12 + +while.end16: ; preds = %while.cond12 + %18 = load i32, i32* %startX, align 4 + store i32 %18, i32* %x, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc34, %while.end16 + %19 = load i32, i32* %x, align 4 + %20 = load i32, i32* %endX, align 4 + %cmp17 = icmp slt i32 %19, %20 + br i1 %cmp17, label %for.body, label %for.end36 + +for.body: ; preds = %for.cond + %21 = load i32, i32* %startY, align 4 + store i32 %21, i32* %y, align 4 + br label %for.cond18 + +for.cond18: ; preds = %for.inc, %for.body + %22 = load i32, i32* %y, align 4 + %23 = load i32, i32* %endY, align 4 + %cmp19 = icmp slt i32 %22, %23 + br i1 %cmp19, label %for.body20, label %for.end + +for.body20: ; preds = 
%for.cond18 + %24 = load i32, i32* %x, align 4 + %25 = load i32, i32* %posX.addr, align 4 + %sub21 = sub nsw i32 %24, %25 + %conv = sitofp i32 %sub21 to double + %call = call double @_ZSt3powdi(double %conv, i32 2) + %26 = load i32, i32* %y, align 4 + %27 = load i32, i32* %posY.addr, align 4 + %sub22 = sub nsw i32 %26, %27 + %conv23 = sitofp i32 %sub22 to double + %call24 = call double @_ZSt3powdi(double %conv23, i32 2) + %add25 = fadd contract double %call, %call24 + %call26 = call double @sqrt(double %add25) #10 + store double %call26, double* %distance, align 8 + %28 = load double, double* %distance, align 8 + %29 = load i32, i32* %error.addr, align 4 + %conv27 = sitofp i32 %29 to double + %cmp28 = fcmp olt double %28, %conv27 + br i1 %cmp28, label %if.then, label %if.end + +if.then: ; preds = %for.body20 + %30 = load i32*, i32** %matrix.addr, align 8 + %31 = load i32, i32* %x, align 4 + %32 = load i32, i32* %dimY.addr, align 4 + %mul = mul nsw i32 %31, %32 + %33 = load i32, i32* %dimZ.addr, align 4 + %mul29 = mul nsw i32 %mul, %33 + %34 = load i32, i32* %y, align 4 + %35 = load i32, i32* %dimZ.addr, align 4 + %mul30 = mul nsw i32 %34, %35 + %add31 = add nsw i32 %mul29, %mul30 + %36 = load i32, i32* %posZ.addr, align 4 + %add32 = add nsw i32 %add31, %36 + %idxprom = sext i32 %add32 to i64 + %arrayidx = getelementptr inbounds i32, i32* %30, i64 %idxprom + store i32 1, i32* %arrayidx, align 4 + br label %if.end + +if.end: ; preds = %if.then, %for.body20 + br label %for.inc + +for.inc: ; preds = %if.end + %37 = load i32, i32* %y, align 4 + %inc33 = add nsw i32 %37, 1 + store i32 %inc33, i32* %y, align 4 + br label %for.cond18 + +for.end: ; preds = %for.cond18 + br label %for.inc34 + +for.inc34: ; preds = %for.end + %38 = load i32, i32* %x, align 4 + %inc35 = add nsw i32 %38, 1 + store i32 %inc35, i32* %x, align 4 + br label %for.cond + +for.end36: ; preds = %for.cond + ret void +} + +; Function Attrs: noinline optnone uwtable +define dso_local void 
@_Z13imdilate_diskPiiiiiS_(i32* %matrix, i32 %dimX, i32 %dimY, i32 %dimZ, i32 %error, i32* %newMatrix) #2 { +entry: + %matrix.addr = alloca i32*, align 8 + %dimX.addr = alloca i32, align 4 + %dimY.addr = alloca i32, align 4 + %dimZ.addr = alloca i32, align 4 + %error.addr = alloca i32, align 4 + %newMatrix.addr = alloca i32*, align 8 + %x = alloca i32, align 4 + %y = alloca i32, align 4 + %z = alloca i32, align 4 + store i32* %matrix, i32** %matrix.addr, align 8 + store i32 %dimX, i32* %dimX.addr, align 4 + store i32 %dimY, i32* %dimY.addr, align 4 + store i32 %dimZ, i32* %dimZ.addr, align 4 + store i32 %error, i32* %error.addr, align 4 + store i32* %newMatrix, i32** %newMatrix.addr, align 8 + store i32 0, i32* %z, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc14, %entry + %0 = load i32, i32* %z, align 4 + %1 = load i32, i32* %dimZ.addr, align 4 + %cmp = icmp slt i32 %0, %1 + br i1 %cmp, label %for.body, label %for.end16 + +for.body: ; preds = %for.cond + store i32 0, i32* %x, align 4 + br label %for.cond1 + +for.cond1: ; preds = %for.inc11, %for.body + %2 = load i32, i32* %x, align 4 + %3 = load i32, i32* %dimX.addr, align 4 + %cmp2 = icmp slt i32 %2, %3 + br i1 %cmp2, label %for.body3, label %for.end13 + +for.body3: ; preds = %for.cond1 + store i32 0, i32* %y, align 4 + br label %for.cond4 + +for.cond4: ; preds = %for.inc, %for.body3 + %4 = load i32, i32* %y, align 4 + %5 = load i32, i32* %dimY.addr, align 4 + %cmp5 = icmp slt i32 %4, %5 + br i1 %cmp5, label %for.body6, label %for.end + +for.body6: ; preds = %for.cond4 + %6 = load i32*, i32** %matrix.addr, align 8 + %7 = load i32, i32* %x, align 4 + %8 = load i32, i32* %dimY.addr, align 4 + %mul = mul nsw i32 %7, %8 + %9 = load i32, i32* %dimZ.addr, align 4 + %mul7 = mul nsw i32 %mul, %9 + %10 = load i32, i32* %y, align 4 + %11 = load i32, i32* %dimZ.addr, align 4 + %mul8 = mul nsw i32 %10, %11 + %add = add nsw i32 %mul7, %mul8 + %12 = load i32, i32* %z, align 4 + %add9 = add nsw i32 %add, %12 + 
%idxprom = sext i32 %add9 to i64 + %arrayidx = getelementptr inbounds i32, i32* %6, i64 %idxprom + %13 = load i32, i32* %arrayidx, align 4 + %cmp10 = icmp eq i32 %13, 1 + br i1 %cmp10, label %if.then, label %if.end + +if.then: ; preds = %for.body6 + %14 = load i32*, i32** %newMatrix.addr, align 8 + %15 = load i32, i32* %x, align 4 + %16 = load i32, i32* %y, align 4 + %17 = load i32, i32* %z, align 4 + %18 = load i32, i32* %dimX.addr, align 4 + %19 = load i32, i32* %dimY.addr, align 4 + %20 = load i32, i32* %dimZ.addr, align 4 + %21 = load i32, i32* %error.addr, align 4 + call void @_Z13dilate_matrixPiiiiiiii(i32* %14, i32 %15, i32 %16, i32 %17, i32 %18, i32 %19, i32 %20, i32 %21) + br label %if.end + +if.end: ; preds = %if.then, %for.body6 + br label %for.inc + +for.inc: ; preds = %if.end + %22 = load i32, i32* %y, align 4 + %inc = add nsw i32 %22, 1 + store i32 %inc, i32* %y, align 4 + br label %for.cond4 + +for.end: ; preds = %for.cond4 + br label %for.inc11 + +for.inc11: ; preds = %for.end + %23 = load i32, i32* %x, align 4 + %inc12 = add nsw i32 %23, 1 + store i32 %inc12, i32* %x, align 4 + br label %for.cond1 + +for.end13: ; preds = %for.cond1 + br label %for.inc14 + +for.inc14: ; preds = %for.end13 + %24 = load i32, i32* %z, align 4 + %inc15 = add nsw i32 %24, 1 + store i32 %inc15, i32* %z, align 4 + br label %for.cond + +for.end16: ; preds = %for.cond + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local void @_Z12getneighborsPiiPdi(i32* %se, i32 %numOnes, double* %neighbors, i32 %radius) #0 { +entry: + %se.addr = alloca i32*, align 8 + %numOnes.addr = alloca i32, align 4 + %neighbors.addr = alloca double*, align 8 + %radius.addr = alloca i32, align 4 + %x = alloca i32, align 4 + %y = alloca i32, align 4 + %neighY = alloca i32, align 4 + %center = alloca i32, align 4 + %diameter = alloca i32, align 4 + store i32* %se, i32** %se.addr, align 8 + store i32 %numOnes, i32* %numOnes.addr, align 4 + store double* %neighbors, 
double** %neighbors.addr, align 8 + store i32 %radius, i32* %radius.addr, align 4 + store i32 0, i32* %neighY, align 4 + %0 = load i32, i32* %radius.addr, align 4 + %sub = sub nsw i32 %0, 1 + store i32 %sub, i32* %center, align 4 + %1 = load i32, i32* %radius.addr, align 4 + %mul = mul nsw i32 %1, 2 + %sub1 = sub nsw i32 %mul, 1 + store i32 %sub1, i32* %diameter, align 4 + store i32 0, i32* %x, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc17, %entry + %2 = load i32, i32* %x, align 4 + %3 = load i32, i32* %diameter, align 4 + %cmp = icmp slt i32 %2, %3 + br i1 %cmp, label %for.body, label %for.end19 + +for.body: ; preds = %for.cond + store i32 0, i32* %y, align 4 + br label %for.cond2 + +for.cond2: ; preds = %for.inc, %for.body + %4 = load i32, i32* %y, align 4 + %5 = load i32, i32* %diameter, align 4 + %cmp3 = icmp slt i32 %4, %5 + br i1 %cmp3, label %for.body4, label %for.end + +for.body4: ; preds = %for.cond2 + %6 = load i32*, i32** %se.addr, align 8 + %7 = load i32, i32* %x, align 4 + %8 = load i32, i32* %diameter, align 4 + %mul5 = mul nsw i32 %7, %8 + %9 = load i32, i32* %y, align 4 + %add = add nsw i32 %mul5, %9 + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds i32, i32* %6, i64 %idxprom + %10 = load i32, i32* %arrayidx, align 4 + %tobool = icmp ne i32 %10, 0 + br i1 %tobool, label %if.then, label %if.end + +if.then: ; preds = %for.body4 + %11 = load i32, i32* %y, align 4 + %12 = load i32, i32* %center, align 4 + %sub6 = sub nsw i32 %11, %12 + %conv = sitofp i32 %sub6 to double + %13 = load double*, double** %neighbors.addr, align 8 + %14 = load i32, i32* %neighY, align 4 + %mul7 = mul nsw i32 %14, 2 + %idxprom8 = sext i32 %mul7 to i64 + %arrayidx9 = getelementptr inbounds double, double* %13, i64 %idxprom8 + store double %conv, double* %arrayidx9, align 8 + %15 = load i32, i32* %x, align 4 + %16 = load i32, i32* %center, align 4 + %sub10 = sub nsw i32 %15, %16 + %conv11 = sitofp i32 %sub10 to double + %17 = load double*, 
double** %neighbors.addr, align 8 + %18 = load i32, i32* %neighY, align 4 + %mul12 = mul nsw i32 %18, 2 + %add13 = add nsw i32 %mul12, 1 + %idxprom14 = sext i32 %add13 to i64 + %arrayidx15 = getelementptr inbounds double, double* %17, i64 %idxprom14 + store double %conv11, double* %arrayidx15, align 8 + %19 = load i32, i32* %neighY, align 4 + %inc = add nsw i32 %19, 1 + store i32 %inc, i32* %neighY, align 4 + br label %if.end + +if.end: ; preds = %if.then, %for.body4 + br label %for.inc + +for.inc: ; preds = %if.end + %20 = load i32, i32* %y, align 4 + %inc16 = add nsw i32 %20, 1 + store i32 %inc16, i32* %y, align 4 + br label %for.cond2 + +for.end: ; preds = %for.cond2 + br label %for.inc17 + +for.inc17: ; preds = %for.end + %21 = load i32, i32* %x, align 4 + %inc18 = add nsw i32 %21, 1 + store i32 %inc18, i32* %x, align 4 + br label %for.cond + +for.end19: ; preds = %for.cond + ret void +} + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z13videoSequencePiiiiS_(i32* %I, i32 %IszX, i32 %IszY, i32 %Nfr, i32* %seed) #2 { +entry: + %I.addr = alloca i32*, align 8 + %IszX.addr = alloca i32, align 4 + %IszY.addr = alloca i32, align 4 + %Nfr.addr = alloca i32, align 4 + %seed.addr = alloca i32*, align 8 + %k = alloca i32, align 4 + %max_size = alloca i32, align 4 + %x0 = alloca i32, align 4 + %y0 = alloca i32, align 4 + %xk = alloca i32, align 4 + %yk = alloca i32, align 4 + %pos = alloca i32, align 4 + %newMatrix = alloca i32*, align 8 + %x = alloca i32, align 4 + %y = alloca i32, align 4 + store i32* %I, i32** %I.addr, align 8 + store i32 %IszX, i32* %IszX.addr, align 4 + store i32 %IszY, i32* %IszY.addr, align 4 + store i32 %Nfr, i32* %Nfr.addr, align 4 + store i32* %seed, i32** %seed.addr, align 8 + %0 = load i32, i32* %IszX.addr, align 4 + %1 = load i32, i32* %IszY.addr, align 4 + %mul = mul nsw i32 %0, %1 + %2 = load i32, i32* %Nfr.addr, align 4 + %mul1 = mul nsw i32 %mul, %2 + store i32 %mul1, i32* %max_size, align 4 + %3 = load i32, i32* 
%IszY.addr, align 4 + %conv = sitofp i32 %3 to double + %div = fdiv double %conv, 2.000000e+00 + %call = call double @_Z11roundDoubled(double %div) + %conv2 = fptosi double %call to i32 + store i32 %conv2, i32* %x0, align 4 + %4 = load i32, i32* %IszX.addr, align 4 + %conv3 = sitofp i32 %4 to double + %div4 = fdiv double %conv3, 2.000000e+00 + %call5 = call double @_Z11roundDoubled(double %div4) + %conv6 = fptosi double %call5 to i32 + store i32 %conv6, i32* %y0, align 4 + %5 = load i32*, i32** %I.addr, align 8 + %6 = load i32, i32* %x0, align 4 + %7 = load i32, i32* %IszY.addr, align 4 + %mul7 = mul nsw i32 %6, %7 + %8 = load i32, i32* %Nfr.addr, align 4 + %mul8 = mul nsw i32 %mul7, %8 + %9 = load i32, i32* %y0, align 4 + %10 = load i32, i32* %Nfr.addr, align 4 + %mul9 = mul nsw i32 %9, %10 + %add = add nsw i32 %mul8, %mul9 + %add10 = add nsw i32 %add, 0 + %idxprom = sext i32 %add10 to i64 + %arrayidx = getelementptr inbounds i32, i32* %5, i64 %idxprom + store i32 1, i32* %arrayidx, align 4 + store i32 1, i32* %k, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %11 = load i32, i32* %k, align 4 + %12 = load i32, i32* %Nfr.addr, align 4 + %cmp = icmp slt i32 %11, %12 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %13 = load i32, i32* %x0, align 4 + %14 = load i32, i32* %k, align 4 + %sub = sub nsw i32 %14, 1 + %add11 = add nsw i32 %13, %sub + %call12 = call i32 @abs(i32 %add11) #12 + store i32 %call12, i32* %xk, align 4 + %15 = load i32, i32* %y0, align 4 + %16 = load i32, i32* %k, align 4 + %sub13 = sub nsw i32 %16, 1 + %mul14 = mul nsw i32 2, %sub13 + %sub15 = sub nsw i32 %15, %mul14 + %call16 = call i32 @abs(i32 %sub15) #12 + store i32 %call16, i32* %yk, align 4 + %17 = load i32, i32* %yk, align 4 + %18 = load i32, i32* %IszY.addr, align 4 + %mul17 = mul nsw i32 %17, %18 + %19 = load i32, i32* %Nfr.addr, align 4 + %mul18 = mul nsw i32 %mul17, %19 + %20 = load i32, i32* %xk, align 4 + %21 = load i32, i32* 
%Nfr.addr, align 4 + %mul19 = mul nsw i32 %20, %21 + %add20 = add nsw i32 %mul18, %mul19 + %22 = load i32, i32* %k, align 4 + %add21 = add nsw i32 %add20, %22 + store i32 %add21, i32* %pos, align 4 + %23 = load i32, i32* %pos, align 4 + %24 = load i32, i32* %max_size, align 4 + %cmp22 = icmp sge i32 %23, %24 + br i1 %cmp22, label %if.then, label %if.end + +if.then: ; preds = %for.body + store i32 0, i32* %pos, align 4 + br label %if.end + +if.end: ; preds = %if.then, %for.body + %25 = load i32*, i32** %I.addr, align 8 + %26 = load i32, i32* %pos, align 4 + %idxprom23 = sext i32 %26 to i64 + %arrayidx24 = getelementptr inbounds i32, i32* %25, i64 %idxprom23 + store i32 1, i32* %arrayidx24, align 4 + br label %for.inc + +for.inc: ; preds = %if.end + %27 = load i32, i32* %k, align 4 + %inc = add nsw i32 %27, 1 + store i32 %inc, i32* %k, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %28 = load i32, i32* %IszX.addr, align 4 + %conv25 = sext i32 %28 to i64 + %mul26 = mul i64 4, %conv25 + %29 = load i32, i32* %IszY.addr, align 4 + %conv27 = sext i32 %29 to i64 + %mul28 = mul i64 %mul26, %conv27 + %30 = load i32, i32* %Nfr.addr, align 4 + %conv29 = sext i32 %30 to i64 + %mul30 = mul i64 %mul28, %conv29 + %call31 = call noalias i8* @malloc(i64 %mul30) #10 + %31 = bitcast i8* %call31 to i32* + store i32* %31, i32** %newMatrix, align 8 + %32 = load i32*, i32** %I.addr, align 8 + %33 = load i32, i32* %IszX.addr, align 4 + %34 = load i32, i32* %IszY.addr, align 4 + %35 = load i32, i32* %Nfr.addr, align 4 + %36 = load i32*, i32** %newMatrix, align 8 + call void @_Z13imdilate_diskPiiiiiS_(i32* %32, i32 %33, i32 %34, i32 %35, i32 5, i32* %36) + store i32 0, i32* %x, align 4 + br label %for.cond32 + +for.cond32: ; preds = %for.inc61, %for.end + %37 = load i32, i32* %x, align 4 + %38 = load i32, i32* %IszX.addr, align 4 + %cmp33 = icmp slt i32 %37, %38 + br i1 %cmp33, label %for.body34, label %for.end63 + +for.body34: ; preds = %for.cond32 + store i32 0, i32* %y, 
align 4 + br label %for.cond35 + +for.cond35: ; preds = %for.inc58, %for.body34 + %39 = load i32, i32* %y, align 4 + %40 = load i32, i32* %IszY.addr, align 4 + %cmp36 = icmp slt i32 %39, %40 + br i1 %cmp36, label %for.body37, label %for.end60 + +for.body37: ; preds = %for.cond35 + store i32 0, i32* %k, align 4 + br label %for.cond38 + +for.cond38: ; preds = %for.inc55, %for.body37 + %41 = load i32, i32* %k, align 4 + %42 = load i32, i32* %Nfr.addr, align 4 + %cmp39 = icmp slt i32 %41, %42 + br i1 %cmp39, label %for.body40, label %for.end57 + +for.body40: ; preds = %for.cond38 + %43 = load i32*, i32** %newMatrix, align 8 + %44 = load i32, i32* %x, align 4 + %45 = load i32, i32* %IszY.addr, align 4 + %mul41 = mul nsw i32 %44, %45 + %46 = load i32, i32* %Nfr.addr, align 4 + %mul42 = mul nsw i32 %mul41, %46 + %47 = load i32, i32* %y, align 4 + %48 = load i32, i32* %Nfr.addr, align 4 + %mul43 = mul nsw i32 %47, %48 + %add44 = add nsw i32 %mul42, %mul43 + %49 = load i32, i32* %k, align 4 + %add45 = add nsw i32 %add44, %49 + %idxprom46 = sext i32 %add45 to i64 + %arrayidx47 = getelementptr inbounds i32, i32* %43, i64 %idxprom46 + %50 = load i32, i32* %arrayidx47, align 4 + %51 = load i32*, i32** %I.addr, align 8 + %52 = load i32, i32* %x, align 4 + %53 = load i32, i32* %IszY.addr, align 4 + %mul48 = mul nsw i32 %52, %53 + %54 = load i32, i32* %Nfr.addr, align 4 + %mul49 = mul nsw i32 %mul48, %54 + %55 = load i32, i32* %y, align 4 + %56 = load i32, i32* %Nfr.addr, align 4 + %mul50 = mul nsw i32 %55, %56 + %add51 = add nsw i32 %mul49, %mul50 + %57 = load i32, i32* %k, align 4 + %add52 = add nsw i32 %add51, %57 + %idxprom53 = sext i32 %add52 to i64 + %arrayidx54 = getelementptr inbounds i32, i32* %51, i64 %idxprom53 + store i32 %50, i32* %arrayidx54, align 4 + br label %for.inc55 + +for.inc55: ; preds = %for.body40 + %58 = load i32, i32* %k, align 4 + %inc56 = add nsw i32 %58, 1 + store i32 %inc56, i32* %k, align 4 + br label %for.cond38 + +for.end57: ; preds = %for.cond38 + 
br label %for.inc58 + +for.inc58: ; preds = %for.end57 + %59 = load i32, i32* %y, align 4 + %inc59 = add nsw i32 %59, 1 + store i32 %inc59, i32* %y, align 4 + br label %for.cond35 + +for.end60: ; preds = %for.cond35 + br label %for.inc61 + +for.inc61: ; preds = %for.end60 + %60 = load i32, i32* %x, align 4 + %inc62 = add nsw i32 %60, 1 + store i32 %inc62, i32* %x, align 4 + br label %for.cond32 + +for.end63: ; preds = %for.cond32 + %61 = load i32*, i32** %newMatrix, align 8 + %62 = bitcast i32* %61 to i8* + call void @free(i8* %62) #10 + %63 = load i32*, i32** %I.addr, align 8 + call void @_Z5setIfiiPiS_S_S_(i32 0, i32 100, i32* %63, i32* %IszX.addr, i32* %IszY.addr, i32* %Nfr.addr) + %64 = load i32*, i32** %I.addr, align 8 + call void @_Z5setIfiiPiS_S_S_(i32 1, i32 228, i32* %64, i32* %IszX.addr, i32* %IszY.addr, i32* %Nfr.addr) + %65 = load i32*, i32** %I.addr, align 8 + %66 = load i32*, i32** %seed.addr, align 8 + call void @_Z8addNoisePiS_S_S_S_(i32* %65, i32* %IszX.addr, i32* %IszY.addr, i32* %Nfr.addr, i32* %66) + ret void +} + +; Function Attrs: nounwind readnone +declare dso_local i32 @abs(i32) #7 + +; Function Attrs: nounwind +declare dso_local noalias i8* @malloc(i64) #1 + +; Function Attrs: nounwind +declare dso_local void @free(i8*) #1 + +; Function Attrs: noinline optnone uwtable +define dso_local double @_Z17calcLikelihoodSumPiS_i(i32* %I, i32* %ind, i32 %numOnes) #2 { +entry: + %I.addr = alloca i32*, align 8 + %ind.addr = alloca i32*, align 8 + %numOnes.addr = alloca i32, align 4 + %likelihoodSum = alloca double, align 8 + %y = alloca i32, align 4 + store i32* %I, i32** %I.addr, align 8 + store i32* %ind, i32** %ind.addr, align 8 + store i32 %numOnes, i32* %numOnes.addr, align 4 + store double 0.000000e+00, double* %likelihoodSum, align 8 + store i32 0, i32* %y, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %y, align 4 + %1 = load i32, i32* %numOnes.addr, align 4 + %cmp = icmp slt i32 %0, %1 + br i1 %cmp, 
label %for.body, label %for.end + +for.body: ; preds = %for.cond + %2 = load i32*, i32** %I.addr, align 8 + %3 = load i32*, i32** %ind.addr, align 8 + %4 = load i32, i32* %y, align 4 + %idxprom = sext i32 %4 to i64 + %arrayidx = getelementptr inbounds i32, i32* %3, i64 %idxprom + %5 = load i32, i32* %arrayidx, align 4 + %idxprom1 = sext i32 %5 to i64 + %arrayidx2 = getelementptr inbounds i32, i32* %2, i64 %idxprom1 + %6 = load i32, i32* %arrayidx2, align 4 + %sub = sub nsw i32 %6, 100 + %conv = sitofp i32 %sub to double + %call = call double @_ZSt3powdi(double %conv, i32 2) + %7 = load i32*, i32** %I.addr, align 8 + %8 = load i32*, i32** %ind.addr, align 8 + %9 = load i32, i32* %y, align 4 + %idxprom3 = sext i32 %9 to i64 + %arrayidx4 = getelementptr inbounds i32, i32* %8, i64 %idxprom3 + %10 = load i32, i32* %arrayidx4, align 4 + %idxprom5 = sext i32 %10 to i64 + %arrayidx6 = getelementptr inbounds i32, i32* %7, i64 %idxprom5 + %11 = load i32, i32* %arrayidx6, align 4 + %sub7 = sub nsw i32 %11, 228 + %conv8 = sitofp i32 %sub7 to double + %call9 = call double @_ZSt3powdi(double %conv8, i32 2) + %sub10 = fsub contract double %call, %call9 + %div = fdiv double %sub10, 5.000000e+01 + %12 = load double, double* %likelihoodSum, align 8 + %add = fadd contract double %12, %div + store double %add, double* %likelihoodSum, align 8 + br label %for.inc + +for.inc: ; preds = %for.body + %13 = load i32, i32* %y, align 4 + %inc = add nsw i32 %13, 1 + store i32 %inc, i32* %y, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %14 = load double, double* %likelihoodSum, align 8 + ret double %14 +} + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local i32 @_Z9findIndexPdid(double* %CDF, i32 %lengthCDF, double %value) #0 { +entry: + %retval = alloca i32, align 4 + %CDF.addr = alloca double*, align 8 + %lengthCDF.addr = alloca i32, align 4 + %value.addr = alloca double, align 8 + %index = alloca i32, align 4 + %x = alloca i32, align 4 + store double* 
%CDF, double** %CDF.addr, align 8 + store i32 %lengthCDF, i32* %lengthCDF.addr, align 4 + store double %value, double* %value.addr, align 8 + store i32 -1, i32* %index, align 4 + store i32 0, i32* %x, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %x, align 4 + %1 = load i32, i32* %lengthCDF.addr, align 4 + %cmp = icmp slt i32 %0, %1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %2 = load double*, double** %CDF.addr, align 8 + %3 = load i32, i32* %x, align 4 + %idxprom = sext i32 %3 to i64 + %arrayidx = getelementptr inbounds double, double* %2, i64 %idxprom + %4 = load double, double* %arrayidx, align 8 + %5 = load double, double* %value.addr, align 8 + %cmp1 = fcmp oge double %4, %5 + br i1 %cmp1, label %if.then, label %if.end + +if.then: ; preds = %for.body + %6 = load i32, i32* %x, align 4 + store i32 %6, i32* %index, align 4 + br label %for.end + +if.end: ; preds = %for.body + br label %for.inc + +for.inc: ; preds = %if.end + %7 = load i32, i32* %x, align 4 + %inc = add nsw i32 %7, 1 + store i32 %inc, i32* %x, align 4 + br label %for.cond + +for.end: ; preds = %if.then, %for.cond + %8 = load i32, i32* %index, align 4 + %cmp2 = icmp eq i32 %8, -1 + br i1 %cmp2, label %if.then3, label %if.end4 + +if.then3: ; preds = %for.end + %9 = load i32, i32* %lengthCDF.addr, align 4 + %sub = sub nsw i32 %9, 1 + store i32 %sub, i32* %retval, align 4 + br label %return + +if.end4: ; preds = %for.end + %10 = load i32, i32* %index, align 4 + store i32 %10, i32* %retval, align 4 + br label %return + +return: ; preds = %if.end4, %if.then3 + %11 = load i32, i32* %retval, align 4 + ret i32 %11 +} + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z14particleFilterPiiiiS_i(i32* %I, i32 %IszX, i32 %IszY, i32 %Nfr, i32* %seed, i32 %Nparticles) #2 { +entry: + %I.addr = alloca i32*, align 8 + %IszX.addr = alloca i32, align 4 + %IszY.addr = alloca i32, align 4 + %Nfr.addr = alloca i32, 
align 4 + %seed.addr = alloca i32*, align 8 + %Nparticles.addr = alloca i32, align 4 + %max_size = alloca i32, align 4 + %start = alloca i64, align 8 + %xe = alloca double, align 8 + %ye = alloca double, align 8 + %radius = alloca i32, align 4 + %diameter = alloca i32, align 4 + %disk = alloca i32*, align 8 + %countOnes = alloca i32, align 4 + %x = alloca i32, align 4 + %y = alloca i32, align 4 + %objxy = alloca double*, align 8 + %get_neighbors = alloca i64, align 8 + %weights = alloca double*, align 8 + %get_weights = alloca i64, align 8 + %likelihood = alloca double*, align 8 + %arrayX = alloca double*, align 8 + %arrayY = alloca double*, align 8 + %xj = alloca double*, align 8 + %yj = alloca double*, align 8 + %CDF = alloca double*, align 8 + %arrayX_GPU = alloca double*, align 8 + %arrayY_GPU = alloca double*, align 8 + %xj_GPU = alloca double*, align 8 + %yj_GPU = alloca double*, align 8 + %CDF_GPU = alloca double*, align 8 + %ind = alloca i32*, align 8 + %u = alloca double*, align 8 + %u_GPU = alloca double*, align 8 + %k = alloca i32, align 4 + %indX = alloca i32, align 4 + %indY = alloca i32, align 4 + %set_arrays = alloca i64, align 8 + %error = alloca i64, align 8 + %likelihood_time = alloca i64, align 8 + %exponential = alloca i64, align 8 + %sumWeights = alloca double, align 8 + %sum_time = alloca i64, align 8 + %normalize = alloca i64, align 8 + %move_time = alloca i64, align 8 + %distance = alloca double, align 8 + %cum_sum = alloca i64, align 8 + %u1 = alloca double, align 8 + %u_time = alloca i64, align 8 + %start_copy = alloca i64, align 8 + %end_copy = alloca i64, align 8 + %num_blocks = alloca i32, align 4 + %agg.tmp = alloca %struct.dim3, align 4 + %agg.tmp335 = alloca %struct.dim3, align 4 + %agg.tmp.coerce = alloca { i64, i32 }, align 4 + %agg.tmp335.coerce = alloca { i64, i32 }, align 4 + %start_copy_back = alloca i64, align 8 + %end_copy_back = alloca i64, align 8 + %xyj_time = alloca i64, align 8 + %reset = alloca i64, align 8 + %i = 
alloca i32, align 4 + store i32* %I, i32** %I.addr, align 8 + store i32 %IszX, i32* %IszX.addr, align 4 + store i32 %IszY, i32* %IszY.addr, align 4 + store i32 %Nfr, i32* %Nfr.addr, align 4 + store i32* %seed, i32** %seed.addr, align 8 + store i32 %Nparticles, i32* %Nparticles.addr, align 4 + %0 = load i32, i32* %IszX.addr, align 4 + %1 = load i32, i32* %IszY.addr, align 4 + %mul = mul nsw i32 %0, %1 + %2 = load i32, i32* %Nfr.addr, align 4 + %mul1 = mul nsw i32 %mul, %2 + store i32 %mul1, i32* %max_size, align 4 + %call = call i64 @_Z8get_timev() + store i64 %call, i64* %start, align 8 + %3 = load i32, i32* %IszY.addr, align 4 + %conv = sitofp i32 %3 to double + %div = fdiv double %conv, 2.000000e+00 + %call2 = call double @_Z11roundDoubled(double %div) + store double %call2, double* %xe, align 8 + %4 = load i32, i32* %IszX.addr, align 4 + %conv3 = sitofp i32 %4 to double + %div4 = fdiv double %conv3, 2.000000e+00 + %call5 = call double @_Z11roundDoubled(double %div4) + store double %call5, double* %ye, align 8 + store i32 5, i32* %radius, align 4 + %5 = load i32, i32* %radius, align 4 + %mul6 = mul nsw i32 %5, 2 + %sub = sub nsw i32 %mul6, 1 + store i32 %sub, i32* %diameter, align 4 + %6 = load i32, i32* %diameter, align 4 + %7 = load i32, i32* %diameter, align 4 + %mul7 = mul nsw i32 %6, %7 + %conv8 = sext i32 %mul7 to i64 + %mul9 = mul i64 %conv8, 4 + %call10 = call noalias i8* @malloc(i64 %mul9) #10 + %8 = bitcast i8* %call10 to i32* + store i32* %8, i32** %disk, align 8 + %9 = load i32*, i32** %disk, align 8 + %10 = load i32, i32* %radius, align 4 + call void @_Z9strelDiskPii(i32* %9, i32 %10) + store i32 0, i32* %countOnes, align 4 + store i32 0, i32* %x, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc17, %entry + %11 = load i32, i32* %x, align 4 + %12 = load i32, i32* %diameter, align 4 + %cmp = icmp slt i32 %11, %12 + br i1 %cmp, label %for.body, label %for.end19 + +for.body: ; preds = %for.cond + store i32 0, i32* %y, align 4 + br label 
%for.cond11 + +for.cond11: ; preds = %for.inc, %for.body + %13 = load i32, i32* %y, align 4 + %14 = load i32, i32* %diameter, align 4 + %cmp12 = icmp slt i32 %13, %14 + br i1 %cmp12, label %for.body13, label %for.end + +for.body13: ; preds = %for.cond11 + %15 = load i32*, i32** %disk, align 8 + %16 = load i32, i32* %x, align 4 + %17 = load i32, i32* %diameter, align 4 + %mul14 = mul nsw i32 %16, %17 + %18 = load i32, i32* %y, align 4 + %add = add nsw i32 %mul14, %18 + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds i32, i32* %15, i64 %idxprom + %19 = load i32, i32* %arrayidx, align 4 + %cmp15 = icmp eq i32 %19, 1 + br i1 %cmp15, label %if.then, label %if.end + +if.then: ; preds = %for.body13 + %20 = load i32, i32* %countOnes, align 4 + %inc = add nsw i32 %20, 1 + store i32 %inc, i32* %countOnes, align 4 + br label %if.end + +if.end: ; preds = %if.then, %for.body13 + br label %for.inc + +for.inc: ; preds = %if.end + %21 = load i32, i32* %y, align 4 + %inc16 = add nsw i32 %21, 1 + store i32 %inc16, i32* %y, align 4 + br label %for.cond11 + +for.end: ; preds = %for.cond11 + br label %for.inc17 + +for.inc17: ; preds = %for.end + %22 = load i32, i32* %x, align 4 + %inc18 = add nsw i32 %22, 1 + store i32 %inc18, i32* %x, align 4 + br label %for.cond + +for.end19: ; preds = %for.cond + %23 = load i32, i32* %countOnes, align 4 + %mul20 = mul nsw i32 %23, 2 + %conv21 = sext i32 %mul20 to i64 + %mul22 = mul i64 %conv21, 8 + %call23 = call noalias i8* @malloc(i64 %mul22) #10 + %24 = bitcast i8* %call23 to double* + store double* %24, double** %objxy, align 8 + %25 = load i32*, i32** %disk, align 8 + %26 = load i32, i32* %countOnes, align 4 + %27 = load double*, double** %objxy, align 8 + %28 = load i32, i32* %radius, align 4 + call void @_Z12getneighborsPiiPdi(i32* %25, i32 %26, double* %27, i32 %28) + %call24 = call i64 @_Z8get_timev() + store i64 %call24, i64* %get_neighbors, align 8 + %29 = load i64, i64* %start, align 8 + %30 = load i64, i64* 
%get_neighbors, align 8 + %call25 = call float @_Z12elapsed_timexx(i64 %29, i64 %30) + %conv26 = fpext float %call25 to double + %call27 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([32 x i8], [32 x i8]* @.str.1, i64 0, i64 0), double %conv26) + %31 = load i32, i32* %Nparticles.addr, align 4 + %conv28 = sext i32 %31 to i64 + %mul29 = mul i64 8, %conv28 + %call30 = call noalias i8* @malloc(i64 %mul29) #10 + %32 = bitcast i8* %call30 to double* + store double* %32, double** %weights, align 8 + store i32 0, i32* %x, align 4 + br label %for.cond31 + +for.cond31: ; preds = %for.inc38, %for.end19 + %33 = load i32, i32* %x, align 4 + %34 = load i32, i32* %Nparticles.addr, align 4 + %cmp32 = icmp slt i32 %33, %34 + br i1 %cmp32, label %for.body33, label %for.end40 + +for.body33: ; preds = %for.cond31 + %35 = load i32, i32* %Nparticles.addr, align 4 + %conv34 = sitofp i32 %35 to double + %div35 = fdiv double 1.000000e+00, %conv34 + %36 = load double*, double** %weights, align 8 + %37 = load i32, i32* %x, align 4 + %idxprom36 = sext i32 %37 to i64 + %arrayidx37 = getelementptr inbounds double, double* %36, i64 %idxprom36 + store double %div35, double* %arrayidx37, align 8 + br label %for.inc38 + +for.inc38: ; preds = %for.body33 + %38 = load i32, i32* %x, align 4 + %inc39 = add nsw i32 %38, 1 + store i32 %inc39, i32* %x, align 4 + br label %for.cond31 + +for.end40: ; preds = %for.cond31 + %call41 = call i64 @_Z8get_timev() + store i64 %call41, i64* %get_weights, align 8 + %39 = load i64, i64* %get_neighbors, align 8 + %40 = load i64, i64* %get_weights, align 8 + %call42 = call float @_Z12elapsed_timexx(i64 %39, i64 %40) + %conv43 = fpext float %call42 to double + %call44 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([29 x i8], [29 x i8]* @.str.2, i64 0, i64 0), double %conv43) + %41 = load i32, i32* %Nparticles.addr, align 4 + %conv45 = sext i32 %41 to i64 + %mul46 = mul i64 8, %conv45 + %call47 = call noalias i8* @malloc(i64 %mul46) #10 + %42 = bitcast i8* %call47 to double* + store double* %42, double** %likelihood, align 8 + %43 = load i32, i32* %Nparticles.addr, align 4 + %conv48 = sext i32 %43 to i64 + %mul49 = mul i64 8, %conv48 + %call50 = call noalias i8* @malloc(i64 %mul49) #10 + %44 = bitcast i8* %call50 to double* + store double* %44, double** %arrayX, align 8 + %45 = load i32, i32* %Nparticles.addr, align 4 + %conv51 = sext i32 %45 to i64 + %mul52 = mul i64 8, %conv51 + %call53 = call noalias i8* @malloc(i64 %mul52) #10 + %46 = bitcast i8* %call53 to double* + store double* %46, double** %arrayY, align 8 + %47 = load i32, i32* %Nparticles.addr, align 4 + %conv54 = sext i32 %47 to i64 + %mul55 = mul i64 8, %conv54 + %call56 = call noalias i8* @malloc(i64 %mul55) #10 + %48 = bitcast i8* %call56 to double* + store double* %48, double** %xj, align 8 + %49 = load i32, i32* %Nparticles.addr, align 4 + %conv57 = sext i32 %49 to i64 + %mul58 = mul i64 8, %conv57 + %call59 = call noalias i8* @malloc(i64 %mul58) #10 + %50 = bitcast i8* %call59 to double* + store double* %50, double** %yj, align 8 + %51 = load i32, i32* %Nparticles.addr, align 4 + %conv60 = sext i32 %51 to i64 + %mul61 = mul i64 8, %conv60 + %call62 = call noalias i8* @malloc(i64 %mul61) #10 + %52 = bitcast i8* %call62 to double* + store double* %52, double** %CDF, align 8 + %53 = load i32, i32* %countOnes, align 4 + %conv63 = sext i32 %53 to i64 + %mul64 = mul i64 4, %conv63 + %call65 = call noalias i8* @malloc(i64 %mul64) #10 + %54 = bitcast i8* %call65 to i32* + store i32* %54, i32** %ind, align 8 + %55 = load i32, i32* %Nparticles.addr, align 4 + %conv66 = sext i32 %55 to i64 + %mul67 = mul i64 8, %conv66 + %call68 = call noalias i8* @malloc(i64 %mul67) #10 + %56 = 
bitcast i8* %call68 to double* + store double* %56, double** %u, align 8 + %57 = bitcast double** %arrayX_GPU to i8** + %58 = load i32, i32* %Nparticles.addr, align 4 + %conv69 = sext i32 %58 to i64 + %mul70 = mul i64 8, %conv69 + %call71 = call i32 @cudaMalloc(i8** %57, i64 %mul70) + call void @_Z11check_error9cudaError(i32 %call71) + %59 = bitcast double** %arrayY_GPU to i8** + %60 = load i32, i32* %Nparticles.addr, align 4 + %conv72 = sext i32 %60 to i64 + %mul73 = mul i64 8, %conv72 + %call74 = call i32 @cudaMalloc(i8** %59, i64 %mul73) + call void @_Z11check_error9cudaError(i32 %call74) + %61 = bitcast double** %xj_GPU to i8** + %62 = load i32, i32* %Nparticles.addr, align 4 + %conv75 = sext i32 %62 to i64 + %mul76 = mul i64 8, %conv75 + %call77 = call i32 @cudaMalloc(i8** %61, i64 %mul76) + call void @_Z11check_error9cudaError(i32 %call77) + %63 = bitcast double** %yj_GPU to i8** + %64 = load i32, i32* %Nparticles.addr, align 4 + %conv78 = sext i32 %64 to i64 + %mul79 = mul i64 8, %conv78 + %call80 = call i32 @cudaMalloc(i8** %63, i64 %mul79) + call void @_Z11check_error9cudaError(i32 %call80) + %65 = bitcast double** %CDF_GPU to i8** + %66 = load i32, i32* %Nparticles.addr, align 4 + %conv81 = sext i32 %66 to i64 + %mul82 = mul i64 8, %conv81 + %call83 = call i32 @cudaMalloc(i8** %65, i64 %mul82) + call void @_Z11check_error9cudaError(i32 %call83) + %67 = bitcast double** %u_GPU to i8** + %68 = load i32, i32* %Nparticles.addr, align 4 + %conv84 = sext i32 %68 to i64 + %mul85 = mul i64 8, %conv84 + %call86 = call i32 @cudaMalloc(i8** %67, i64 %mul85) + call void @_Z11check_error9cudaError(i32 %call86) + store i32 0, i32* %x, align 4 + br label %for.cond87 + +for.cond87: ; preds = %for.inc94, %for.end40 + %69 = load i32, i32* %x, align 4 + %70 = load i32, i32* %Nparticles.addr, align 4 + %cmp88 = icmp slt i32 %69, %70 + br i1 %cmp88, label %for.body89, label %for.end96 + +for.body89: ; preds = %for.cond87 + %71 = load double, double* %xe, align 8 + %72 = load 
double*, double** %arrayX, align 8 + %73 = load i32, i32* %x, align 4 + %idxprom90 = sext i32 %73 to i64 + %arrayidx91 = getelementptr inbounds double, double* %72, i64 %idxprom90 + store double %71, double* %arrayidx91, align 8 + %74 = load double, double* %ye, align 8 + %75 = load double*, double** %arrayY, align 8 + %76 = load i32, i32* %x, align 4 + %idxprom92 = sext i32 %76 to i64 + %arrayidx93 = getelementptr inbounds double, double* %75, i64 %idxprom92 + store double %74, double* %arrayidx93, align 8 + br label %for.inc94 + +for.inc94: ; preds = %for.body89 + %77 = load i32, i32* %x, align 4 + %inc95 = add nsw i32 %77, 1 + store i32 %inc95, i32* %x, align 4 + br label %for.cond87 + +for.end96: ; preds = %for.cond87 + store i32 1, i32* %k, align 4 + br label %for.cond97 + +for.cond97: ; preds = %for.inc381, %for.end96 + %78 = load i32, i32* %k, align 4 + %79 = load i32, i32* %Nfr.addr, align 4 + %cmp98 = icmp slt i32 %78, %79 + br i1 %cmp98, label %for.body99, label %for.end383 + +for.body99: ; preds = %for.cond97 + %call100 = call i64 @_Z8get_timev() + store i64 %call100, i64* %set_arrays, align 8 + store i32 0, i32* %x, align 4 + br label %for.cond101 + +for.cond101: ; preds = %for.inc120, %for.body99 + %80 = load i32, i32* %x, align 4 + %81 = load i32, i32* %Nparticles.addr, align 4 + %cmp102 = icmp slt i32 %80, %81 + br i1 %cmp102, label %for.body103, label %for.end122 + +for.body103: ; preds = %for.cond101 + %82 = load double*, double** %arrayX, align 8 + %83 = load i32, i32* %x, align 4 + %idxprom104 = sext i32 %83 to i64 + %arrayidx105 = getelementptr inbounds double, double* %82, i64 %idxprom104 + %84 = load double, double* %arrayidx105, align 8 + %add106 = fadd contract double %84, 1.000000e+00 + %85 = load i32*, i32** %seed.addr, align 8 + %86 = load i32, i32* %x, align 4 + %call107 = call double @_Z5randnPii(i32* %85, i32 %86) + %mul108 = fmul contract double 5.000000e+00, %call107 + %add109 = fadd contract double %add106, %mul108 + %87 = load 
double*, double** %arrayX, align 8 + %88 = load i32, i32* %x, align 4 + %idxprom110 = sext i32 %88 to i64 + %arrayidx111 = getelementptr inbounds double, double* %87, i64 %idxprom110 + store double %add109, double* %arrayidx111, align 8 + %89 = load double*, double** %arrayY, align 8 + %90 = load i32, i32* %x, align 4 + %idxprom112 = sext i32 %90 to i64 + %arrayidx113 = getelementptr inbounds double, double* %89, i64 %idxprom112 + %91 = load double, double* %arrayidx113, align 8 + %sub114 = fsub contract double %91, 2.000000e+00 + %92 = load i32*, i32** %seed.addr, align 8 + %93 = load i32, i32* %x, align 4 + %call115 = call double @_Z5randnPii(i32* %92, i32 %93) + %mul116 = fmul contract double 2.000000e+00, %call115 + %add117 = fadd contract double %sub114, %mul116 + %94 = load double*, double** %arrayY, align 8 + %95 = load i32, i32* %x, align 4 + %idxprom118 = sext i32 %95 to i64 + %arrayidx119 = getelementptr inbounds double, double* %94, i64 %idxprom118 + store double %add117, double* %arrayidx119, align 8 + br label %for.inc120 + +for.inc120: ; preds = %for.body103 + %96 = load i32, i32* %x, align 4 + %inc121 = add nsw i32 %96, 1 + store i32 %inc121, i32* %x, align 4 + br label %for.cond101 + +for.end122: ; preds = %for.cond101 + %call123 = call i64 @_Z8get_timev() + store i64 %call123, i64* %error, align 8 + %97 = load i64, i64* %set_arrays, align 8 + %98 = load i64, i64* %error, align 8 + %call124 = call float @_Z12elapsed_timexx(i64 %97, i64 %98) + %conv125 = fpext float %call124 to double + %call126 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([28 x i8], [28 x i8]* @.str.3, i64 0, i64 0), double %conv125) + store i32 0, i32* %x, align 4 + br label %for.cond127 + +for.cond127: ; preds = %for.inc178, %for.end122 + %99 = load i32, i32* %x, align 4 + %100 = load i32, i32* %Nparticles.addr, align 4 + %cmp128 = icmp slt i32 %99, %100 + br i1 %cmp128, label %for.body129, label %for.end180 + +for.body129: ; preds = %for.cond127 + store i32 0, i32* %y, align 4 + br label %for.cond130 + +for.cond130: ; preds = %for.inc166, %for.body129 + %101 = load i32, i32* %y, align 4 + %102 = load i32, i32* %countOnes, align 4 + %cmp131 = icmp slt i32 %101, %102 + br i1 %cmp131, label %for.body132, label %for.end168 + +for.body132: ; preds = %for.cond130 + %103 = load double*, double** %arrayX, align 8 + %104 = load i32, i32* %x, align 4 + %idxprom133 = sext i32 %104 to i64 + %arrayidx134 = getelementptr inbounds double, double* %103, i64 %idxprom133 + %105 = load double, double* %arrayidx134, align 8 + %call135 = call double @_Z11roundDoubled(double %105) + %106 = load double*, double** %objxy, align 8 + %107 = load i32, i32* %y, align 4 + %mul136 = mul nsw i32 %107, 2 + %add137 = add nsw i32 %mul136, 1 + %idxprom138 = sext i32 %add137 to i64 + %arrayidx139 = getelementptr inbounds double, double* %106, i64 %idxprom138 + %108 = load double, double* %arrayidx139, align 8 + %add140 = fadd contract double %call135, %108 + %conv141 = fptosi double %add140 to i32 + store i32 %conv141, i32* %indX, align 4 + %109 = load double*, double** %arrayY, align 8 + %110 = load i32, i32* %x, align 4 + %idxprom142 = sext i32 %110 to i64 + %arrayidx143 = getelementptr inbounds double, double* %109, i64 %idxprom142 + %111 = load double, double* %arrayidx143, align 8 + %call144 = call double @_Z11roundDoubled(double %111) + %112 = load double*, double** %objxy, align 8 + %113 = load i32, i32* %y, align 4 + %mul145 = mul nsw i32 %113, 2 + %idxprom146 = sext i32 %mul145 to i64 + %arrayidx147 = getelementptr 
inbounds double, double* %112, i64 %idxprom146 + %114 = load double, double* %arrayidx147, align 8 + %add148 = fadd contract double %call144, %114 + %conv149 = fptosi double %add148 to i32 + store i32 %conv149, i32* %indY, align 4 + %115 = load i32, i32* %indX, align 4 + %116 = load i32, i32* %IszY.addr, align 4 + %mul150 = mul nsw i32 %115, %116 + %117 = load i32, i32* %Nfr.addr, align 4 + %mul151 = mul nsw i32 %mul150, %117 + %118 = load i32, i32* %indY, align 4 + %119 = load i32, i32* %Nfr.addr, align 4 + %mul152 = mul nsw i32 %118, %119 + %add153 = add nsw i32 %mul151, %mul152 + %120 = load i32, i32* %k, align 4 + %add154 = add nsw i32 %add153, %120 + %call155 = call double @_ZSt4fabsIiEN9__gnu_cxx11__enable_ifIXsr12__is_integerIT_EE7__valueEdE6__typeES2_(i32 %add154) + %conv156 = fptosi double %call155 to i32 + %121 = load i32*, i32** %ind, align 8 + %122 = load i32, i32* %y, align 4 + %idxprom157 = sext i32 %122 to i64 + %arrayidx158 = getelementptr inbounds i32, i32* %121, i64 %idxprom157 + store i32 %conv156, i32* %arrayidx158, align 4 + %123 = load i32*, i32** %ind, align 8 + %124 = load i32, i32* %y, align 4 + %idxprom159 = sext i32 %124 to i64 + %arrayidx160 = getelementptr inbounds i32, i32* %123, i64 %idxprom159 + %125 = load i32, i32* %arrayidx160, align 4 + %126 = load i32, i32* %max_size, align 4 + %cmp161 = icmp sge i32 %125, %126 + br i1 %cmp161, label %if.then162, label %if.end165 + +if.then162: ; preds = %for.body132 + %127 = load i32*, i32** %ind, align 8 + %128 = load i32, i32* %y, align 4 + %idxprom163 = sext i32 %128 to i64 + %arrayidx164 = getelementptr inbounds i32, i32* %127, i64 %idxprom163 + store i32 0, i32* %arrayidx164, align 4 + br label %if.end165 + +if.end165: ; preds = %if.then162, %for.body132 + br label %for.inc166 + +for.inc166: ; preds = %if.end165 + %129 = load i32, i32* %y, align 4 + %inc167 = add nsw i32 %129, 1 + store i32 %inc167, i32* %y, align 4 + br label %for.cond130 + +for.end168: ; preds = %for.cond130 + %130 = 
load i32*, i32** %I.addr, align 8 + %131 = load i32*, i32** %ind, align 8 + %132 = load i32, i32* %countOnes, align 4 + %call169 = call double @_Z17calcLikelihoodSumPiS_i(i32* %130, i32* %131, i32 %132) + %133 = load double*, double** %likelihood, align 8 + %134 = load i32, i32* %x, align 4 + %idxprom170 = sext i32 %134 to i64 + %arrayidx171 = getelementptr inbounds double, double* %133, i64 %idxprom170 + store double %call169, double* %arrayidx171, align 8 + %135 = load double*, double** %likelihood, align 8 + %136 = load i32, i32* %x, align 4 + %idxprom172 = sext i32 %136 to i64 + %arrayidx173 = getelementptr inbounds double, double* %135, i64 %idxprom172 + %137 = load double, double* %arrayidx173, align 8 + %138 = load i32, i32* %countOnes, align 4 + %conv174 = sitofp i32 %138 to double + %div175 = fdiv double %137, %conv174 + %139 = load double*, double** %likelihood, align 8 + %140 = load i32, i32* %x, align 4 + %idxprom176 = sext i32 %140 to i64 + %arrayidx177 = getelementptr inbounds double, double* %139, i64 %idxprom176 + store double %div175, double* %arrayidx177, align 8 + br label %for.inc178 + +for.inc178: ; preds = %for.end168 + %141 = load i32, i32* %x, align 4 + %inc179 = add nsw i32 %141, 1 + store i32 %inc179, i32* %x, align 4 + br label %for.cond127 + +for.end180: ; preds = %for.cond127 + %call181 = call i64 @_Z8get_timev() + store i64 %call181, i64* %likelihood_time, align 8 + %142 = load i64, i64* %error, align 8 + %143 = load i64, i64* %likelihood_time, align 8 + %call182 = call float @_Z12elapsed_timexx(i64 %142, i64 %143) + %conv183 = fpext float %call182 to double + %call184 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([34 x i8], [34 x i8]* @.str.4, i64 0, i64 0), double %conv183) + store i32 0, i32* %x, align 4 + br label %for.cond185 + +for.cond185: ; preds = %for.inc196, %for.end180 + %144 = load i32, i32* %x, align 4 + %145 = load i32, i32* %Nparticles.addr, align 4 + %cmp186 = icmp slt i32 %144, %145 + br i1 %cmp186, label %for.body187, label %for.end198 + +for.body187: ; preds = %for.cond185 + %146 = load double*, double** %weights, align 8 + %147 = load i32, i32* %x, align 4 + %idxprom188 = sext i32 %147 to i64 + %arrayidx189 = getelementptr inbounds double, double* %146, i64 %idxprom188 + %148 = load double, double* %arrayidx189, align 8 + %149 = load double*, double** %likelihood, align 8 + %150 = load i32, i32* %x, align 4 + %idxprom190 = sext i32 %150 to i64 + %arrayidx191 = getelementptr inbounds double, double* %149, i64 %idxprom190 + %151 = load double, double* %arrayidx191, align 8 + %call192 = call double @exp(double %151) #10 + %mul193 = fmul contract double %148, %call192 + %152 = load double*, double** %weights, align 8 + %153 = load i32, i32* %x, align 4 + %idxprom194 = sext i32 %153 to i64 + %arrayidx195 = getelementptr inbounds double, double* %152, i64 %idxprom194 + store double %mul193, double* %arrayidx195, align 8 + br label %for.inc196 + +for.inc196: ; preds = %for.body187 + %154 = load i32, i32* %x, align 4 + %inc197 = add nsw i32 %154, 1 + store i32 %inc197, i32* %x, align 4 + br label %for.cond185 + +for.end198: ; preds = %for.cond185 + %call199 = call i64 @_Z8get_timev() + store i64 %call199, i64* %exponential, align 8 + %155 = load i64, i64* %likelihood_time, align 8 + %156 = load i64, i64* %exponential, align 8 + %call200 = call float @_Z12elapsed_timexx(i64 %155, i64 %156) + %conv201 = fpext float %call200 to double + %call202 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([26 x i8], [26 x i8]* @.str.5, i64 0, i64 0), double %conv201) + store double 0.000000e+00, double* %sumWeights, align 8 + store i32 0, i32* %x, align 4 + br label %for.cond203 + +for.cond203: ; preds = %for.inc209, %for.end198 + %157 = load i32, i32* %x, align 4 + %158 = load i32, i32* %Nparticles.addr, align 4 + %cmp204 = icmp slt i32 %157, %158 + br i1 %cmp204, label %for.body205, label %for.end211 + +for.body205: ; preds = %for.cond203 + %159 = load double*, double** %weights, align 8 + %160 = load i32, i32* %x, align 4 + %idxprom206 = sext i32 %160 to i64 + %arrayidx207 = getelementptr inbounds double, double* %159, i64 %idxprom206 + %161 = load double, double* %arrayidx207, align 8 + %162 = load double, double* %sumWeights, align 8 + %add208 = fadd contract double %162, %161 + store double %add208, double* %sumWeights, align 8 + br label %for.inc209 + +for.inc209: ; preds = %for.body205 + %163 = load i32, i32* %x, align 4 + %inc210 = add nsw i32 %163, 1 + store i32 %inc210, i32* %x, align 4 + br label %for.cond203 + +for.end211: ; preds = %for.cond203 + %call212 = call i64 @_Z8get_timev() + store i64 %call212, i64* %sum_time, align 8 + %164 = load i64, i64* %exponential, align 8 + %165 = load i64, i64* %sum_time, align 8 + %call213 = call float @_Z12elapsed_timexx(i64 %164, i64 %165) + %conv214 = fpext float %call213 to double + %call215 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([30 x i8], [30 x i8]* @.str.6, i64 0, i64 0), double %conv214) + store i32 0, i32* %x, align 4 + br label %for.cond216 + +for.cond216: ; preds = %for.inc224, %for.end211 + %166 = load i32, i32* %x, align 4 + %167 = load i32, i32* %Nparticles.addr, align 4 + %cmp217 = icmp slt i32 %166, %167 + br i1 %cmp217, label %for.body218, label %for.end226 + +for.body218: ; preds = %for.cond216 + %168 = load double*, double** %weights, align 8 + %169 = load i32, i32* %x, align 4 + %idxprom219 = sext i32 %169 to i64 + %arrayidx220 = getelementptr inbounds double, double* %168, i64 %idxprom219 + %170 = load double, double* %arrayidx220, align 8 + %171 = load double, double* %sumWeights, align 8 + %div221 = fdiv double %170, %171 + %172 = load double*, double** %weights, align 8 + %173 = load i32, i32* %x, align 4 + %idxprom222 = sext i32 %173 to i64 + %arrayidx223 = getelementptr inbounds double, double* %172, i64 %idxprom222 + store double %div221, double* %arrayidx223, align 8 + br label %for.inc224 + +for.inc224: ; preds = %for.body218 + %174 = load i32, i32* %x, align 4 + %inc225 = add nsw i32 %174, 1 + store i32 %inc225, i32* %x, align 4 + br label %for.cond216 + +for.end226: ; preds = %for.cond216 + %call227 = call i64 @_Z8get_timev() + store i64 %call227, i64* %normalize, align 8 + %175 = load i64, i64* %sum_time, align 8 + %176 = load i64, i64* %normalize, align 8 + %call228 = call float @_Z12elapsed_timexx(i64 %175, i64 %176) + %conv229 = fpext float %call228 to double + %call230 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.str.7, i64 0, i64 0), double %conv229) + store double 0.000000e+00, double* %xe, align 8 + store double 0.000000e+00, double* %ye, align 8 + store i32 0, i32* %x, align 4 + br label %for.cond231 + +for.cond231: ; preds = %for.inc246, %for.end226 + %177 = load i32, i32* %x, align 4 + %178 = load i32, i32* %Nparticles.addr, align 4 + %cmp232 = icmp slt i32 %177, %178 + br i1 %cmp232, label %for.body233, label %for.end248 + +for.body233: ; preds = %for.cond231 + %179 = load double*, double** %arrayX, align 8 + %180 = load i32, i32* %x, align 4 + %idxprom234 = sext i32 %180 to i64 + %arrayidx235 = getelementptr inbounds double, double* %179, i64 %idxprom234 + %181 = load double, double* %arrayidx235, align 8 + %182 = load double*, double** %weights, align 8 + %183 = load i32, i32* %x, align 4 + %idxprom236 = sext i32 %183 to i64 + %arrayidx237 = getelementptr inbounds double, double* %182, i64 %idxprom236 + %184 = load double, double* %arrayidx237, align 8 + %mul238 = fmul contract double %181, %184 + %185 = load double, double* %xe, align 8 + %add239 = fadd contract double %185, %mul238 + store double %add239, double* %xe, align 8 + %186 = load double*, double** %arrayY, align 8 + %187 = load i32, i32* %x, align 4 + %idxprom240 = sext i32 %187 to i64 + %arrayidx241 = getelementptr inbounds double, double* %186, i64 %idxprom240 + %188 = load double, double* %arrayidx241, align 8 + %189 = load double*, double** %weights, align 8 + %190 = load i32, i32* %x, align 4 + %idxprom242 = sext i32 %190 to i64 + %arrayidx243 = getelementptr inbounds double, double* %189, i64 %idxprom242 + %191 = load double, double* %arrayidx243, align 8 + %mul244 = fmul contract double %188, %191 + %192 = load double, double* %ye, align 8 + %add245 = fadd contract double %192, %mul244 + store double %add245, double* %ye, align 8 + br label %for.inc246 + +for.inc246: ; preds = %for.body233 + %193 = load i32, i32* %x, align 4 + %inc247 = 
add nsw i32 %193, 1 + store i32 %inc247, i32* %x, align 4 + br label %for.cond231 + +for.end248: ; preds = %for.cond231 + %call249 = call i64 @_Z8get_timev() + store i64 %call249, i64* %move_time, align 8 + %194 = load i64, i64* %normalize, align 8 + %195 = load i64, i64* %move_time, align 8 + %call250 = call float @_Z12elapsed_timexx(i64 %194, i64 %195) + %conv251 = fpext float %call250 to double + %call252 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([30 x i8], [30 x i8]* @.str.8, i64 0, i64 0), double %conv251) + %196 = load double, double* %xe, align 8 + %call253 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str.9, i64 0, i64 0), double %196) + %197 = load double, double* %ye, align 8 + %call254 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str.10, i64 0, i64 0), double %197) + %198 = load double, double* %xe, align 8 + %199 = load i32, i32* %IszY.addr, align 4 + %conv255 = sitofp i32 %199 to double + %div256 = fdiv double %conv255, 2.000000e+00 + %call257 = call double @_Z11roundDoubled(double %div256) + %conv258 = fptosi double %call257 to i32 + %conv259 = sitofp i32 %conv258 to double + %sub260 = fsub contract double %198, %conv259 + %call261 = call double @_ZSt3powdi(double %sub260, i32 2) + %200 = load double, double* %ye, align 8 + %201 = load i32, i32* %IszX.addr, align 4 + %conv262 = sitofp i32 %201 to double + %div263 = fdiv double %conv262, 2.000000e+00 + %call264 = call double @_Z11roundDoubled(double %div263) + %conv265 = fptosi double %call264 to i32 + %conv266 = sitofp i32 %conv265 to double + %sub267 = fsub contract double %200, %conv266 + %call268 = call double @_ZSt3powdi(double %sub267, i32 2) + %add269 = fadd contract double %call261, %call268 + %call270 = call double @sqrt(double %add269) #10 + store double %call270, double* %distance, align 8 + %202 = load double, double* %distance, align 8 + %call271 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str.11, i64 0, i64 0), double %202) + %203 = load double*, double** %weights, align 8 + %arrayidx272 = getelementptr inbounds double, double* %203, i64 0 + %204 = load double, double* %arrayidx272, align 8 + %205 = load double*, double** %CDF, align 8 + %arrayidx273 = getelementptr inbounds double, double* %205, i64 0 + store double %204, double* %arrayidx273, align 8 + store i32 1, i32* %x, align 4 + br label %for.cond274 + +for.cond274: ; preds = %for.inc285, %for.end248 + %206 = load i32, i32* %x, align 4 + %207 = load i32, i32* %Nparticles.addr, align 4 + %cmp275 = icmp slt i32 %206, %207 + br i1 %cmp275, label %for.body276, label %for.end287 + +for.body276: ; preds = %for.cond274 + %208 = load double*, double** %weights, align 8 + %209 = load i32, i32* %x, align 4 + %idxprom277 = sext i32 %209 to i64 + %arrayidx278 = getelementptr inbounds double, double* %208, i64 %idxprom277 + %210 = load double, double* %arrayidx278, align 8 + %211 = load double*, double** %CDF, align 8 + %212 = load i32, i32* %x, align 4 + %sub279 = sub nsw i32 %212, 1 + %idxprom280 = sext i32 %sub279 to i64 + %arrayidx281 = getelementptr inbounds double, double* %211, i64 %idxprom280 + %213 = load double, double* %arrayidx281, align 8 + %add282 = fadd contract double %210, %213 + %214 = load double*, double** %CDF, align 8 + %215 = load i32, i32* %x, align 4 + %idxprom283 = sext i32 %215 to i64 + %arrayidx284 = getelementptr inbounds double, double* %214, i64 %idxprom283 + store double %add282, double* %arrayidx284, align 8 + br label %for.inc285 + +for.inc285: ; preds = %for.body276 + %216 = load i32, i32* %x, align 4 + %inc286 = add nsw i32 %216, 1 + store i32 %inc286, i32* %x, align 4 + br label %for.cond274 + +for.end287: ; preds = %for.cond274 + %call288 = call i64 @_Z8get_timev() + store i64 %call288, i64* %cum_sum, align 8 + %217 = load i64, i64* %move_time, align 8 + %218 = load i64, i64* %cum_sum, align 8 + %call289 = call 
float @_Z12elapsed_timexx(i64 %217, i64 %218) + %conv290 = fpext float %call289 to double + %call291 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([31 x i8], [31 x i8]* @.str.12, i64 0, i64 0), double %conv290) + %219 = load i32, i32* %Nparticles.addr, align 4 + %conv292 = sitofp i32 %219 to double + %div293 = fdiv double 1.000000e+00, %conv292 + %220 = load i32*, i32** %seed.addr, align 8 + %call294 = call double @_Z5randuPii(i32* %220, i32 0) + %mul295 = fmul contract double %div293, %call294 + store double %mul295, double* %u1, align 8 + store i32 0, i32* %x, align 4 + br label %for.cond296 + +for.cond296: ; preds = %for.inc305, %for.end287 + %221 = load i32, i32* %x, align 4 + %222 = load i32, i32* %Nparticles.addr, align 4 + %cmp297 = icmp slt i32 %221, %222 + br i1 %cmp297, label %for.body298, label %for.end307 + +for.body298: ; preds = %for.cond296 + %223 = load double, double* %u1, align 8 + %224 = load i32, i32* %x, align 4 + %conv299 = sitofp i32 %224 to double + %225 = load i32, i32* %Nparticles.addr, align 4 + %conv300 = sitofp i32 %225 to double + %div301 = fdiv double %conv299, %conv300 + %add302 = fadd contract double %223, %div301 + %226 = load double*, double** %u, align 8 + %227 = load i32, i32* %x, align 4 + %idxprom303 = sext i32 %227 to i64 + %arrayidx304 = getelementptr inbounds double, double* %226, i64 %idxprom303 + store double %add302, double* %arrayidx304, align 8 + br label %for.inc305 + +for.inc305: ; preds = %for.body298 + %228 = load i32, i32* %x, align 4 + %inc306 = add nsw i32 %228, 1 + store i32 %inc306, i32* %x, align 4 + br label %for.cond296 + +for.end307: ; preds = %for.cond296 + %call308 = call i64 @_Z8get_timev() + store i64 %call308, i64* %u_time, align 8 + %229 = load i64, i64* %cum_sum, align 8 + %230 = load i64, i64* %u_time, align 8 + %call309 = call float @_Z12elapsed_timexx(i64 %229, i64 %230) + %conv310 = fpext float %call309 to double + %call311 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.13, i64 0, i64 0), double %conv310) + %call312 = call i64 @_Z8get_timev() + store i64 %call312, i64* %start_copy, align 8 + %231 = load double*, double** %arrayX_GPU, align 8 + %232 = bitcast double* %231 to i8* + %233 = load double*, double** %arrayX, align 8 + %234 = bitcast double* %233 to i8* + %235 = load i32, i32* %Nparticles.addr, align 4 + %conv313 = sext i32 %235 to i64 + %mul314 = mul i64 8, %conv313 + %call315 = call i32 @cudaMemcpy(i8* %232, i8* %234, i64 %mul314, i32 1) + %236 = load double*, double** %arrayY_GPU, align 8 + %237 = bitcast double* %236 to i8* + %238 = load double*, double** %arrayY, align 8 + %239 = bitcast double* %238 to i8* + %240 = load i32, i32* %Nparticles.addr, align 4 + %conv316 = sext i32 %240 to i64 + %mul317 = mul i64 8, %conv316 + %call318 = call i32 @cudaMemcpy(i8* %237, i8* %239, i64 %mul317, i32 1) + %241 = load double*, double** %xj_GPU, align 8 + %242 = bitcast double* %241 to i8* + %243 = load double*, double** %xj, align 8 + %244 = bitcast double* %243 to i8* + %245 = load i32, i32* %Nparticles.addr, align 4 + %conv319 = sext i32 %245 to i64 + %mul320 = mul i64 8, %conv319 + %call321 = call i32 @cudaMemcpy(i8* %242, i8* %244, i64 %mul320, i32 1) + %246 = load double*, double** %yj_GPU, align 8 + %247 = bitcast double* %246 to i8* + %248 = load double*, double** %yj, align 8 + %249 = bitcast double* %248 to i8* + %250 = load i32, i32* %Nparticles.addr, align 4 + %conv322 = sext i32 %250 to i64 + %mul323 = mul i64 8, %conv322 + %call324 = call i32 @cudaMemcpy(i8* %247, i8* %249, i64 %mul323, i32 1) + %251 = load double*, double** %CDF_GPU, align 8 + %252 = bitcast double* %251 to i8* + %253 = load double*, double** %CDF, align 8 + %254 = bitcast double* %253 to i8* + %255 = load i32, i32* %Nparticles.addr, align 4 + %conv325 = sext i32 %255 to i64 + %mul326 = mul i64 8, %conv325 + %call327 = call i32 @cudaMemcpy(i8* %252, i8* %254, i64 %mul326, i32 1) + 
%256 = load double*, double** %u_GPU, align 8 + %257 = bitcast double* %256 to i8* + %258 = load double*, double** %u, align 8 + %259 = bitcast double* %258 to i8* + %260 = load i32, i32* %Nparticles.addr, align 4 + %conv328 = sext i32 %260 to i64 + %mul329 = mul i64 8, %conv328 + %call330 = call i32 @cudaMemcpy(i8* %257, i8* %259, i64 %mul329, i32 1) + %call331 = call i64 @_Z8get_timev() + store i64 %call331, i64* %end_copy, align 8 + %261 = load i32, i32* %Nparticles.addr, align 4 + %conv332 = sitofp i32 %261 to double + %div333 = fdiv double %conv332, 1.280000e+02 + %262 = call double @llvm.ceil.f64(double %div333) + %conv334 = fptosi double %262 to i32 + store i32 %conv334, i32* %num_blocks, align 4 + %263 = load i32, i32* %num_blocks, align 4 + call void @_ZN4dim3C2Ejjj(%struct.dim3* %agg.tmp, i32 %263, i32 1, i32 1) + call void @_ZN4dim3C2Ejjj(%struct.dim3* %agg.tmp335, i32 128, i32 1, i32 1) + %264 = bitcast { i64, i32 }* %agg.tmp.coerce to i8* + %265 = bitcast %struct.dim3* %agg.tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %264, i8* align 4 %265, i64 12, i1 false) + %266 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 0 + %267 = load i64, i64* %266, align 4 + %268 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 1 + %269 = load i32, i32* %268, align 4 + %270 = bitcast { i64, i32 }* %agg.tmp335.coerce to i8* + %271 = bitcast %struct.dim3* %agg.tmp335 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %270, i8* align 4 %271, i64 12, i1 false) + %272 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp335.coerce, i32 0, i32 0 + %273 = load i64, i64* %272, align 4 + %274 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp335.coerce, i32 0, i32 1 + %275 = load i32, i32* %274, align 4 + %call336 = call i32 @__cudaPushCallConfiguration(i64 %267, i32 %269, i64 %273, i32 %275, i64 0, i8* null) + %tobool = icmp ne i32 %call336, 0 + br i1 %tobool, label 
%kcall.end, label %kcall.configok + +kcall.configok: ; preds = %for.end307 + %276 = load double*, double** %arrayX_GPU, align 8 + %277 = load double*, double** %arrayY_GPU, align 8 + %278 = load double*, double** %CDF_GPU, align 8 + %279 = load double*, double** %u_GPU, align 8 + %280 = load double*, double** %xj_GPU, align 8 + %281 = load double*, double** %yj_GPU, align 8 + %282 = load i32, i32* %Nparticles.addr, align 4 + call void @_Z6kernelPdS_S_S_S_S_i(double* %276, double* %277, double* %278, double* %279, double* %280, double* %281, i32 %282) + br label %kcall.end + +kcall.end: ; preds = %kcall.configok, %for.end307 + %call337 = call i32 @cudaThreadSynchronize() + %call338 = call i64 @_Z8get_timev() + store i64 %call338, i64* %start_copy_back, align 8 + %283 = load double*, double** %yj, align 8 + %284 = bitcast double* %283 to i8* + %285 = load double*, double** %yj_GPU, align 8 + %286 = bitcast double* %285 to i8* + %287 = load i32, i32* %Nparticles.addr, align 4 + %conv339 = sext i32 %287 to i64 + %mul340 = mul i64 8, %conv339 + %call341 = call i32 @cudaMemcpy(i8* %284, i8* %286, i64 %mul340, i32 2) + %288 = load double*, double** %xj, align 8 + %289 = bitcast double* %288 to i8* + %290 = load double*, double** %xj_GPU, align 8 + %291 = bitcast double* %290 to i8* + %292 = load i32, i32* %Nparticles.addr, align 4 + %conv342 = sext i32 %292 to i64 + %mul343 = mul i64 8, %conv342 + %call344 = call i32 @cudaMemcpy(i8* %289, i8* %291, i64 %mul343, i32 2) + %call345 = call i64 @_Z8get_timev() + store i64 %call345, i64* %end_copy_back, align 8 + %293 = load i64, i64* %start_copy, align 8 + %294 = load i64, i64* %end_copy, align 8 + %call346 = call float @_Z12elapsed_timexx(i64 %293, i64 %294) + %conv347 = fpext float %call346 to double + %call348 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([26 x i8], [26 x i8]* @.str.14, i64 0, i64 0), double %conv347) + %295 = load i64, i64* %end_copy, align 8 + %296 = load i64, i64* %start_copy_back, align 8 + %call349 = call float @_Z12elapsed_timexx(i64 %295, i64 %296) + %conv350 = fpext float %call349 to double + %call351 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([21 x i8], [21 x i8]* @.str.15, i64 0, i64 0), double %conv350) + %297 = load i64, i64* %start_copy_back, align 8 + %298 = load i64, i64* %end_copy_back, align 8 + %call352 = call float @_Z12elapsed_timexx(i64 %297, i64 %298) + %conv353 = fpext float %call352 to double + %call354 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([33 x i8], [33 x i8]* @.str.16, i64 0, i64 0), double %conv353) + %call355 = call i64 @_Z8get_timev() + store i64 %call355, i64* %xyj_time, align 8 + %299 = load i64, i64* %u_time, align 8 + %300 = load i64, i64* %xyj_time, align 8 + %call356 = call float @_Z12elapsed_timexx(i64 %299, i64 %300) + %conv357 = fpext float %call356 to double + %call358 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([41 x i8], [41 x i8]* @.str.17, i64 0, i64 0), double %conv357) + store i32 0, i32* %x, align 4 + br label %for.cond359 + +for.cond359: ; preds = %for.inc374, %kcall.end + %301 = load i32, i32* %x, align 4 + %302 = load i32, i32* %Nparticles.addr, align 4 + %cmp360 = icmp slt i32 %301, %302 + br i1 %cmp360, label %for.body361, label %for.end376 + +for.body361: ; preds = %for.cond359 + %303 = load double*, double** %xj, align 8 + %304 = load i32, i32* %x, align 4 + %idxprom362 = sext i32 %304 to i64 + %arrayidx363 = getelementptr inbounds double, double* %303, i64 %idxprom362 + %305 = load double, double* %arrayidx363, align 8 + %306 = load double*, double** %arrayX, align 8 + %307 = load i32, i32* %x, align 4 + %idxprom364 = sext i32 %307 to i64 + %arrayidx365 = getelementptr inbounds double, double* %306, i64 %idxprom364 + store double %305, double* %arrayidx365, align 8 + %308 = load double*, double** %yj, align 8 + %309 = load i32, i32* %x, align 4 + %idxprom366 = sext i32 %309 to i64 + %arrayidx367 = getelementptr inbounds double, double* %308, i64 %idxprom366 + %310 = load double, double* %arrayidx367, align 8 + %311 = load double*, double** %arrayY, align 8 + %312 = load i32, i32* %x, align 4 + %idxprom368 = sext i32 %312 to i64 + %arrayidx369 = getelementptr inbounds double, double* %311, i64 %idxprom368 + store double %310, double* %arrayidx369, align 8 + %313 = load i32, i32* %Nparticles.addr, align 4 + %conv370 = sitofp i32 %313 to double + %div371 = fdiv double 1.000000e+00, %conv370 + %314 = load double*, double** %weights, align 8 + %315 = load i32, i32* %x, align 4 + %idxprom372 = sext i32 %315 to i64 + %arrayidx373 = getelementptr inbounds double, double* %314, i64 %idxprom372 + store double %div371, double* %arrayidx373, align 8 + br label %for.inc374 + +for.inc374: ; preds = %for.body361 + %316 = load i32, i32* %x, align 4 + %inc375 = add nsw i32 %316, 1 + store i32 %inc375, i32* %x, align 4 + br label 
%for.cond359 + +for.end376: ; preds = %for.cond359 + %call377 = call i64 @_Z8get_timev() + store i64 %call377, i64* %reset, align 8 + %317 = load i64, i64* %xyj_time, align 8 + %318 = load i64, i64* %reset, align 8 + %call378 = call float @_Z12elapsed_timexx(i64 %317, i64 %318) + %conv379 = fpext float %call378 to double + %call380 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([32 x i8], [32 x i8]* @.str.18, i64 0, i64 0), double %conv379) + br label %for.inc381 + +for.inc381: ; preds = %for.end376 + %319 = load i32, i32* %k, align 4 + %inc382 = add nsw i32 %319, 1 + store i32 %inc382, i32* %k, align 4 + br label %for.cond97 + +for.end383: ; preds = %for.cond97 + %call384 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str.19, i64 0, i64 0)) + store i32 0, i32* %i, align 4 + br label %for.cond385 + +for.cond385: ; preds = %for.inc391, %for.end383 + %320 = load i32, i32* %i, align 4 + %cmp386 = icmp slt i32 %320, 10 + br i1 %cmp386, label %for.body387, label %for.end393 + +for.body387: ; preds = %for.cond385 + %321 = load double*, double** %arrayX, align 8 + %322 = load i32, i32* %i, align 4 + %idxprom388 = sext i32 %322 to i64 + %arrayidx389 = getelementptr inbounds double, double* %321, i64 %idxprom388 + %323 = load double, double* %arrayidx389, align 8 + %call390 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str.20, i64 0, i64 0), double %323) + br label %for.inc391 + +for.inc391: ; preds = %for.body387 + %324 = load i32, i32* %i, align 4 + %inc392 = add nsw i32 %324, 1 + store i32 %inc392, i32* %i, align 4 + br label %for.cond385 + +for.end393: ; preds = %for.cond385 + %325 = load double*, double** %u_GPU, align 8 + %326 = bitcast double* %325 to i8* + %call394 = call i32 @cudaFree(i8* %326) + %327 = load double*, double** %CDF_GPU, align 8 + %328 = bitcast double* %327 to i8* + %call395 = call i32 @cudaFree(i8* %328) + %329 = load double*, double** %yj_GPU, align 8 + %330 = bitcast double* %329 to i8* + %call396 = call i32 @cudaFree(i8* %330) + %331 = load double*, double** %xj_GPU, align 8 + %332 = bitcast double* %331 to i8* + %call397 = call i32 @cudaFree(i8* %332) + %333 = load double*, double** %arrayY_GPU, align 8 + %334 = bitcast double* %333 to i8* + %call398 = call i32 @cudaFree(i8* %334) + %335 = load double*, double** %arrayX_GPU, align 8 + %336 = bitcast double* %335 to i8* + %call399 = call i32 @cudaFree(i8* %336) + %337 = load i32*, i32** %disk, align 8 + %338 = bitcast i32* %337 to i8* + call void @free(i8* %338) #10 + %339 = load double*, double** %objxy, align 8 + %340 = bitcast double* %339 to i8* + call void @free(i8* %340) #10 + %341 = load double*, double** %weights, align 8 + %342 = bitcast double* %341 to i8* + call void @free(i8* %342) #10 + %343 = load double*, double** %likelihood, align 8 + %344 = bitcast double* %343 to i8* + call void @free(i8* %344) #10 + %345 = load double*, double** %arrayX, align 8 + %346 = bitcast double* %345 to i8* + call void @free(i8* %346) #10 + %347 = load double*, double** %arrayY, align 8 + %348 = bitcast double* %347 to i8* + call void @free(i8* %348) #10 + %349 = load double*, double** %xj, align 8 + %350 = bitcast double* %349 to i8* + call void @free(i8* %350) #10 + %351 = load double*, double** %yj, align 8 + %352 = bitcast double* %351 to i8* + 
call void @free(i8* %352) #10 + %353 = load double*, double** %CDF, align 8 + %354 = bitcast double* %353 to i8* + call void @free(i8* %354) #10 + %355 = load double*, double** %u, align 8 + %356 = bitcast double* %355 to i8* + call void @free(i8* %356) #10 + %357 = load i32*, i32** %ind, align 8 + %358 = bitcast i32* %357 to i8* + call void @free(i8* %358) #10 + ret void +} + +declare dso_local i32 @cudaMalloc(i8**, i64) #3 + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local double @_ZSt4fabsIiEN9__gnu_cxx11__enable_ifIXsr12__is_integerIT_EE7__valueEdE6__typeES2_(i32 %__x) #0 comdat { +entry: + %__x.addr = alloca i32, align 4 + store i32 %__x, i32* %__x.addr, align 4 + %0 = load i32, i32* %__x.addr, align 4 + %conv = sitofp i32 %0 to double + %1 = call double @llvm.fabs.f64(double %conv) + ret double %1 +} + +; Function Attrs: nounwind +declare dso_local double @exp(double) #1 + +declare dso_local i32 @cudaMemcpy(i8*, i8*, i64, i32) #3 + +; Function Attrs: nounwind readnone speculatable willreturn +declare double @llvm.ceil.f64(double) #6 + +declare dso_local i32 @__cudaPushCallConfiguration(i64, i32, i64, i32, i64, i8*) #3 + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN4dim3C2Ejjj(%struct.dim3* %this, i32 %vx, i32 %vy, i32 %vz) unnamed_addr #0 comdat align 2 { +entry: + %this.addr = alloca %struct.dim3*, align 8 + %vx.addr = alloca i32, align 4 + %vy.addr = alloca i32, align 4 + %vz.addr = alloca i32, align 4 + store %struct.dim3* %this, %struct.dim3** %this.addr, align 8 + store i32 %vx, i32* %vx.addr, align 4 + store i32 %vy, i32* %vy.addr, align 4 + store i32 %vz, i32* %vz.addr, align 4 + %this1 = load %struct.dim3*, %struct.dim3** %this.addr, align 8 + %x = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 0 + %0 = load i32, i32* %vx.addr, align 4 + store i32 %0, i32* %x, align 4 + %y = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 1 + 
%1 = load i32, i32* %vy.addr, align 4 + store i32 %1, i32* %y, align 4 + %z = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 2 + %2 = load i32, i32* %vz.addr, align 4 + store i32 %2, i32* %z, align 4 + ret void +} + +declare dso_local i32 @cudaThreadSynchronize() #3 + +declare dso_local i32 @cudaFree(i8*) #3 + +; Function Attrs: noinline norecurse optnone uwtable +define dso_local i32 @main(i32 %argc, i8** %argv) #8 { +entry: + %retval = alloca i32, align 4 + %argc.addr = alloca i32, align 4 + %argv.addr = alloca i8**, align 8 + %usage = alloca i8*, align 8 + %IszX = alloca i32, align 4 + %IszY = alloca i32, align 4 + %Nfr = alloca i32, align 4 + %Nparticles = alloca i32, align 4 + %seed = alloca i32*, align 8 + %i = alloca i32, align 4 + %I = alloca i32*, align 8 + %start = alloca i64, align 8 + %endVideoSequence = alloca i64, align 8 + %endParticleFilter = alloca i64, align 8 + store i32 0, i32* %retval, align 4 + store i32 %argc, i32* %argc.addr, align 4 + store i8** %argv, i8*** %argv.addr, align 8 + %call = call i32 @cudaSetDevice(i32 0) + store i8* getelementptr inbounds ([56 x i8], [56 x i8]* @.str.21, i64 0, i64 0), i8** %usage, align 8 + %0 = load i32, i32* %argc.addr, align 4 + %cmp = icmp ne i32 %0, 9 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %1 = load i8*, i8** %usage, align 8 + %call1 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str.22, i64 0, i64 0), i8* %1) + store i32 0, i32* %retval, align 4 + br label %return + +if.end: ; preds = %entry + %2 = load i8**, i8*** %argv.addr, align 8 + %arrayidx = getelementptr inbounds i8*, i8** %2, i64 1 + %3 = load i8*, i8** %arrayidx, align 8 + %call2 = call i32 @strcmp(i8* %3, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.23, i64 0, i64 0)) #13 + %tobool = icmp ne i32 %call2, 0 + br i1 %tobool, label %if.then14, label %lor.lhs.false + +lor.lhs.false: ; preds = %if.end + %4 = load i8**, i8*** %argv.addr, align 8 + %arrayidx3 = getelementptr inbounds i8*, i8** %4, i64 3 + %5 = load i8*, i8** %arrayidx3, align 8 + %call4 = call i32 @strcmp(i8* %5, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.24, i64 0, i64 0)) #13 + %tobool5 = icmp ne i32 %call4, 0 + br i1 %tobool5, label %if.then14, label %lor.lhs.false6 + +lor.lhs.false6: ; preds = %lor.lhs.false + %6 = load i8**, i8*** %argv.addr, align 8 + %arrayidx7 = getelementptr inbounds i8*, i8** %6, i64 5 + %7 = load i8*, i8** %arrayidx7, align 8 + %call8 = call i32 @strcmp(i8* %7, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.25, i64 0, i64 0)) #13 + %tobool9 = icmp ne i32 %call8, 0 + br i1 %tobool9, label %if.then14, label %lor.lhs.false10 + +lor.lhs.false10: ; preds = %lor.lhs.false6 + %8 = load i8**, i8*** %argv.addr, align 8 + %arrayidx11 = getelementptr inbounds i8*, i8** %8, i64 7 + %9 = load i8*, i8** %arrayidx11, align 8 + %call12 = call i32 @strcmp(i8* %9, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str.26, i64 0, i64 0)) #13 + %tobool13 = icmp ne i32 %call12, 0 + br i1 %tobool13, label %if.then14, label %if.end16 + +if.then14: ; preds = %lor.lhs.false10, %lor.lhs.false6, %lor.lhs.false, %if.end + %10 = load i8*, i8** %usage, align 8 + %call15 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str.22, i64 0, i64 0), i8* %10) + store i32 0, i32* %retval, align 4 + br label %return + +if.end16: ; preds = %lor.lhs.false10 + %11 = load i8**, i8*** %argv.addr, align 8 + %arrayidx17 = getelementptr inbounds i8*, i8** %11, i64 2 + %12 = load i8*, i8** %arrayidx17, align 8 + %call18 = call i32 (i8*, i8*, ...) @sscanf(i8* %12, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.27, i64 0, i64 0), i32* %IszX) #10 + %cmp19 = icmp eq i32 %call18, -1 + br i1 %cmp19, label %if.then20, label %if.end22 + +if.then20: ; preds = %if.end16 + %call21 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([31 x i8], [31 x i8]* @.str.28, i64 0, i64 0)) + store i32 0, i32* %retval, align 4 + br label %return + +if.end22: ; preds = %if.end16 + %13 = load i32, i32* %IszX, align 4 + %cmp23 = icmp sle i32 %13, 0 + br i1 %cmp23, label %if.then24, label %if.end26 + +if.then24: ; preds = %if.end22 + %call25 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str.29, i64 0, i64 0)) + store i32 0, i32* %retval, align 4 + br label %return + +if.end26: ; preds = %if.end22 + %14 = load i8**, i8*** %argv.addr, align 8 + %arrayidx27 = getelementptr inbounds i8*, i8** %14, i64 4 + %15 = load i8*, i8** %arrayidx27, align 8 + %call28 = call i32 (i8*, i8*, ...) @sscanf(i8* %15, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.27, i64 0, i64 0), i32* %IszY) #10 + %cmp29 = icmp eq i32 %call28, -1 + br i1 %cmp29, label %if.then30, label %if.end32 + +if.then30: ; preds = %if.end26 + %call31 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([31 x i8], [31 x i8]* @.str.30, i64 0, i64 0)) + store i32 0, i32* %retval, align 4 + br label %return + +if.end32: ; preds = %if.end26 + %16 = load i32, i32* %IszY, align 4 + %cmp33 = icmp sle i32 %16, 0 + br i1 %cmp33, label %if.then34, label %if.end36 + +if.then34: ; preds = %if.end32 + %call35 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str.31, i64 0, i64 0)) + store i32 0, i32* %retval, align 4 + br label %return + +if.end36: ; preds = %if.end32 + %17 = load i8**, i8*** %argv.addr, align 8 + %arrayidx37 = getelementptr inbounds i8*, i8** %17, i64 6 + %18 = load i8*, i8** %arrayidx37, align 8 + %call38 = call i32 (i8*, i8*, ...) @sscanf(i8* %18, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.27, i64 0, i64 0), i32* %Nfr) #10 + %cmp39 = icmp eq i32 %call38, -1 + br i1 %cmp39, label %if.then40, label %if.end42 + +if.then40: ; preds = %if.end36 + %call41 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([43 x i8], [43 x i8]* @.str.32, i64 0, i64 0)) + store i32 0, i32* %retval, align 4 + br label %return + +if.end42: ; preds = %if.end36 + %19 = load i32, i32* %Nfr, align 4 + %cmp43 = icmp sle i32 %19, 0 + br i1 %cmp43, label %if.then44, label %if.end46 + +if.then44: ; preds = %if.end42 + %call45 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([30 x i8], [30 x i8]* @.str.33, i64 0, i64 0)) + store i32 0, i32* %retval, align 4 + br label %return + +if.end46: ; preds = %if.end42 + %20 = load i8**, i8*** %argv.addr, align 8 + %arrayidx47 = getelementptr inbounds i8*, i8** %20, i64 8 + %21 = load i8*, i8** %arrayidx47, align 8 + %call48 = call i32 (i8*, i8*, ...) @sscanf(i8* %21, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.27, i64 0, i64 0), i32* %Nparticles) #10 + %cmp49 = icmp eq i32 %call48, -1 + br i1 %cmp49, label %if.then50, label %if.end52 + +if.then50: ; preds = %if.end46 + %call51 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([46 x i8], [46 x i8]* @.str.34, i64 0, i64 0)) + store i32 0, i32* %retval, align 4 + br label %return + +if.end52: ; preds = %if.end46 + %22 = load i32, i32* %Nparticles, align 4 + %cmp53 = icmp sle i32 %22, 0 + br i1 %cmp53, label %if.then54, label %if.end56 + +if.then54: ; preds = %if.end52 + %call55 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([33 x i8], [33 x i8]* @.str.35, i64 0, i64 0)) + store i32 0, i32* %retval, align 4 + br label %return + +if.end56: ; preds = %if.end52 + %23 = load i32, i32* %Nparticles, align 4 + %conv = sext i32 %23 to i64 + %mul = mul i64 4, %conv + %call57 = call noalias i8* @malloc(i64 %mul) #10 + %24 = bitcast i8* %call57 to i32* + store i32* %24, i32** %seed, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %if.end56 + %25 = load i32, i32* %i, align 4 + %26 = load i32, i32* %Nparticles, align 4 + %cmp58 = icmp slt i32 %25, %26 + br i1 %cmp58, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %27 = load i32, i32* %i, align 4 + %28 = load i32*, i32** %seed, align 8 + %29 = load i32, i32* %i, align 4 + %idxprom = sext i32 %29 to i64 + %arrayidx59 = getelementptr inbounds i32, i32* %28, i64 %idxprom + store i32 %27, i32* %arrayidx59, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %30 = load i32, i32* %i, align 4 + %inc = add nsw i32 %30, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %31 = load i32, i32* %IszX, align 4 + %conv60 = sext i32 %31 to i64 + %mul61 = mul i64 4, %conv60 + %32 = load i32, i32* %IszY, align 4 + %conv62 = sext i32 %32 to i64 + %mul63 = mul i64 %mul61, %conv62 + %33 = load i32, i32* %Nfr, align 4 + %conv64 = sext i32 %33 to i64 + %mul65 = mul i64 %mul63, %conv64 + %call66 = call noalias i8* @malloc(i64 %mul65) #10 + %34 = bitcast i8* %call66 to i32* + store i32* %34, i32** %I, align 8 + %call67 = call i64 @_Z8get_timev() + store i64 %call67, i64* %start, align 8 + %35 = load i32*, i32** %I, align 8 + %36 = load i32, i32* %IszX, align 4 + %37 = load i32, i32* %IszY, align 4 + %38 = load i32, i32* %Nfr, align 4 + %39 = load i32*, i32** %seed, align 8 + call void @_Z13videoSequencePiiiiS_(i32* %35, i32 %36, i32 %37, i32 %38, i32* %39) + %call68 = call i64 @_Z8get_timev() + store i64 %call68, i64* 
%endVideoSequence, align 8 + %40 = load i64, i64* %start, align 8 + %41 = load i64, i64* %endVideoSequence, align 8 + %call69 = call float @_Z12elapsed_timexx(i64 %40, i64 %41) + %conv70 = fpext float %call69 to double + %call71 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.36, i64 0, i64 0), double %conv70) + %42 = load i32*, i32** %I, align 8 + %43 = load i32, i32* %IszX, align 4 + %44 = load i32, i32* %IszY, align 4 + %45 = load i32, i32* %Nfr, align 4 + %46 = load i32*, i32** %seed, align 8 + %47 = load i32, i32* %Nparticles, align 4 + call void @_Z14particleFilterPiiiiS_i(i32* %42, i32 %43, i32 %44, i32 %45, i32* %46, i32 %47) + %call72 = call i64 @_Z8get_timev() + store i64 %call72, i64* %endParticleFilter, align 8 + %48 = load i64, i64* %endVideoSequence, align 8 + %49 = load i64, i64* %endParticleFilter, align 8 + %call73 = call float @_Z12elapsed_timexx(i64 %48, i64 %49) + %conv74 = fpext float %call73 to double + %call75 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.37, i64 0, i64 0), double %conv74) + %50 = load i64, i64* %start, align 8 + %51 = load i64, i64* %endParticleFilter, align 8 + %call76 = call float @_Z12elapsed_timexx(i64 %50, i64 %51) + %conv77 = fpext float %call76 to double + %call78 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.38, i64 0, i64 0), double %conv77) + %52 = load i32*, i32** %seed, align 8 + %53 = bitcast i32* %52 to i8* + call void @free(i8* %53) #10 + %54 = load i32*, i32** %I, align 8 + %55 = bitcast i32* %54 to i8* + call void @free(i8* %55) #10 + store i32 0, i32* %retval, align 4 + br label %return + +return: ; preds = %for.end, %if.then54, %if.then50, %if.then44, %if.then40, %if.then34, %if.then30, %if.then24, %if.then20, %if.then14, %if.then + %56 = load i32, i32* %retval, align 4 + ret i32 %56 +} + +declare dso_local i32 @cudaSetDevice(i32) #3 + +; Function Attrs: nounwind readonly +declare dso_local i32 @strcmp(i8*, i8*) #9 + +; Function Attrs: nounwind +declare dso_local i32 @sscanf(i8*, i8*, ...) #1 + +; Function Attrs: nounwind readnone speculatable willreturn +declare double @llvm.powi.f64(double, i32) #6 + +define internal void @__cuda_register_globals(i8** %0) { +entry: + %1 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void (double*, double*, double*, double*, double*, double*, i32)* @_Z6kernelPdS_S_S_S_S_i to i8*), i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i64 0, i64 0), i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null) + ret void +} + +declare dso_local i32 @__cudaRegisterFunction(i8**, i8*, i8*, i8*, i32, i8*, i8*, i8*, i8*, i32*) + +declare dso_local i32 @__cudaRegisterVar(i8**, i8*, i8*, i8*, i32, i32, i32, i32) + +declare dso_local i8** @__cudaRegisterFatBinary(i8*) + +define internal void @__cuda_module_ctor(i8* %0) { +entry: + %1 = call i8** @__cudaRegisterFatBinary(i8* bitcast ({ i32, i32, i8*, i8* }* @__cuda_fatbin_wrapper to i8*)) + store i8** %1, i8*** @__cuda_gpubin_handle, align 8 + call void @__cuda_register_globals(i8** %1) + call void @__cudaRegisterFatBinaryEnd(i8** %1) + %2 = call i32 @atexit(void (i8*)* @__cuda_module_dtor) + ret void +} + +declare dso_local void 
@__cudaRegisterFatBinaryEnd(i8**) + +declare dso_local void @__cudaUnregisterFatBinary(i8**) + +define internal void @__cuda_module_dtor(i8* %0) { +entry: + %1 = load i8**, i8*** @__cuda_gpubin_handle, align 8 + call void @__cudaUnregisterFatBinary(i8** %1) + ret void +} + +declare dso_local i32 @atexit(void (i8*)*) + +attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { noinline optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" 
"no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #5 = { argmemonly nounwind willreturn } +attributes #6 = { nounwind readnone speculatable willreturn } +attributes #7 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #8 = { noinline norecurse optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #9 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" 
"target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #10 = { nounwind } +attributes #11 = { noreturn nounwind } +attributes #12 = { nounwind readnone } +attributes #13 = { nounwind readonly } + +!llvm.module.flags = !{!0, !1} +!llvm.ident = !{!2} + +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"} diff --git a/examples/particlefilter/run.sh b/examples/particlefilter/run.sh new file mode 100644 index 0000000..bf29c6b --- /dev/null +++ b/examples/particlefilter/run.sh @@ -0,0 +1,20 @@ +#!/bin/bash +set -e +llvm-as ex_particle_CUDA_naive_seq-cuda-nvptx64-nvidia-cuda-sm_61.ll +llvm-as ex_particle_CUDA_naive_seq-host-x86_64-unknown-linux-gnu.ll +../../build/compilation/kernelTranslator ex_particle_CUDA_naive_seq-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc +../../build/compilation/hostTranslator ex_particle_CUDA_naive_seq-host-x86_64-unknown-linux-gnu.bc host.bc +llc --relocation-model=pic --filetype=obj kernel.bc +llc --relocation-model=pic --filetype=obj host.bc + +g++ -Wall -L../../build/runtime -L../../build/runtime/threadPool \ + -o particlefilter_naive -fPIC -no-pie \ + host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread +export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH +./particlefilter_naive -x 128 -y 128 -z 10 -np 1000 > res.log +if grep -q -e "48.550541 48.550541 48.550541 48.550541" res.log; then + echo "Pass" +else + echo "Error result" + exit 1 +fi diff --git a/examples/pathfinder/pathfinder-cuda-nvptx64-nvidia-cuda-sm_61.ll b/examples/pathfinder/pathfinder-cuda-nvptx64-nvidia-cuda-sm_61.ll new file mode 100644 index 0000000..ba65ae1 --- /dev/null +++ b/examples/pathfinder/pathfinder-cuda-nvptx64-nvidia-cuda-sm_61.ll @@ -0,0 +1,462 @@ +; ModuleID = 
'pathfinder-cuda-nvptx64-nvidia-cuda-sm_61.bc' +source_filename = "pathfinder.cu" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +%struct.__cuda_builtin_blockIdx_t = type { i8 } +%struct.__cuda_builtin_threadIdx_t = type { i8 } +%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 } + +$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any + +$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any + +@_ZZ14dynproc_kerneliPiS_S_iiiiE4prev = internal addrspace(3) global [256 x i32] undef, align 4 +@_ZZ14dynproc_kerneliPiS_S_iiiiE6result = internal addrspace(3) global [256 x i32] undef, align 4 +@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1 +@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1 + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 { +entry: + %p.addr = alloca i8**, align 8 + %s.addr = alloca i64, align 8 + store i8** %p, i8*** %p.addr, align 8 + store i64 %s, i64* %s.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 { +entry: + %p.addr = alloca %struct.cudaFuncAttributes*, align 8 + %c.addr = alloca i8*, align 8 + store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8 + store i8* %c, i8** %c.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 { +entry: + %value.addr = alloca i32*, align 8 + %attr.addr = alloca i32, align 4 + %device.addr = alloca i32, align 4 + store i32* %value, i32** %value.addr, align 8 + store i32 %attr, i32* %attr.addr, align 4 + store i32 %device, i32* %device.addr, align 4 + 
ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaGetDevice(i32* %device) #0 { +entry: + %device.addr = alloca i32*, align 8 + store i32* %device, i32** %device.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 { +entry: + %numBlocks.addr = alloca i32*, align 8 + %func.addr = alloca i8*, align 8 + %blockSize.addr = alloca i32, align 4 + %dynamicSmemSize.addr = alloca i64, align 8 + store i32* %numBlocks, i32** %numBlocks.addr, align 8 + store i8* %func, i8** %func.addr, align 8 + store i32 %blockSize, i32* %blockSize.addr, align 4 + store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 { +entry: + %numBlocks.addr = alloca i32*, align 8 + %func.addr = alloca i8*, align 8 + %blockSize.addr = alloca i32, align 4 + %dynamicSmemSize.addr = alloca i64, align 8 + %flags.addr = alloca i32, align 4 + store i32* %numBlocks, i32** %numBlocks.addr, align 8 + store i8* %func, i8** %func.addr, align 8 + store i32 %blockSize, i32* %blockSize.addr, align 4 + store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8 + store i32 %flags, i32* %flags.addr, align 4 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define dso_local void @_Z14dynproc_kerneliPiS_S_iiii(i32 %iteration, i32* %gpuWall, i32* %gpuSrc, i32* %gpuResults, i32 %cols, i32 %rows, i32 %startStep, i32 %border) #0 { +entry: + %iteration.addr = alloca i32, align 4 + %gpuWall.addr = alloca i32*, align 8 + %gpuSrc.addr = alloca i32*, align 8 + %gpuResults.addr = alloca i32*, align 8 + %cols.addr = alloca i32, 
align 4 + %rows.addr = alloca i32, align 4 + %startStep.addr = alloca i32, align 4 + %border.addr = alloca i32, align 4 + %bx = alloca i32, align 4 + %tx = alloca i32, align 4 + %small_block_cols = alloca i32, align 4 + %blkX = alloca i32, align 4 + %blkXmax = alloca i32, align 4 + %xidx = alloca i32, align 4 + %validXmin = alloca i32, align 4 + %validXmax = alloca i32, align 4 + %W = alloca i32, align 4 + %E = alloca i32, align 4 + %isValid = alloca i8, align 1 + %computed = alloca i8, align 1 + %i = alloca i32, align 4 + %left = alloca i32, align 4 + %up = alloca i32, align 4 + %right = alloca i32, align 4 + %shortest = alloca i32, align 4 + %index = alloca i32, align 4 + store i32 %iteration, i32* %iteration.addr, align 4 + store i32* %gpuWall, i32** %gpuWall.addr, align 8 + store i32* %gpuSrc, i32** %gpuSrc.addr, align 8 + store i32* %gpuResults, i32** %gpuResults.addr, align 8 + store i32 %cols, i32* %cols.addr, align 4 + store i32 %rows, i32* %rows.addr, align 4 + store i32 %startStep, i32* %startStep.addr, align 4 + store i32 %border, i32* %border.addr, align 4 + %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2 + store i32 %call, i32* %bx, align 4 + %call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + store i32 %call1, i32* %tx, align 4 + %0 = load i32, i32* %iteration.addr, align 4 + %mul = mul nsw i32 %0, 1 + %mul2 = mul nsw i32 %mul, 2 + %sub = sub nsw i32 256, %mul2 + store i32 %sub, i32* %small_block_cols, align 4 + %1 = load i32, i32* %small_block_cols, align 4 + %2 = load i32, i32* %bx, align 4 + %mul3 = mul nsw i32 %1, %2 + %3 = load i32, i32* %border.addr, align 4 + %sub4 = sub nsw i32 %mul3, %3 + store i32 %sub4, i32* %blkX, align 4 + %4 = load i32, i32* %blkX, align 4 + %add = add nsw i32 %4, 256 + %sub5 = sub nsw i32 %add, 1 + store i32 %sub5, i32* %blkXmax, align 4 + %5 = load i32, i32* %blkX, align 4 + %6 = load i32, i32* %tx, align 4 + %add6 = add nsw i32 %5, %6 + store i32 %add6, i32* %xidx, 
align 4 + %7 = load i32, i32* %blkX, align 4 + %cmp = icmp slt i32 %7, 0 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + %8 = load i32, i32* %blkX, align 4 + %sub7 = sub nsw i32 0, %8 + br label %cond.end + +cond.false: ; preds = %entry + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ %sub7, %cond.true ], [ 0, %cond.false ] + store i32 %cond, i32* %validXmin, align 4 + %9 = load i32, i32* %blkXmax, align 4 + %10 = load i32, i32* %cols.addr, align 4 + %sub8 = sub nsw i32 %10, 1 + %cmp9 = icmp sgt i32 %9, %sub8 + br i1 %cmp9, label %cond.true10, label %cond.false14 + +cond.true10: ; preds = %cond.end + %11 = load i32, i32* %blkXmax, align 4 + %12 = load i32, i32* %cols.addr, align 4 + %sub11 = sub nsw i32 %11, %12 + %add12 = add nsw i32 %sub11, 1 + %sub13 = sub nsw i32 255, %add12 + br label %cond.end15 + +cond.false14: ; preds = %cond.end + br label %cond.end15 + +cond.end15: ; preds = %cond.false14, %cond.true10 + %cond16 = phi i32 [ %sub13, %cond.true10 ], [ 255, %cond.false14 ] + store i32 %cond16, i32* %validXmax, align 4 + %13 = load i32, i32* %tx, align 4 + %sub17 = sub nsw i32 %13, 1 + store i32 %sub17, i32* %W, align 4 + %14 = load i32, i32* %tx, align 4 + %add18 = add nsw i32 %14, 1 + store i32 %add18, i32* %E, align 4 + %15 = load i32, i32* %W, align 4 + %16 = load i32, i32* %validXmin, align 4 + %cmp19 = icmp slt i32 %15, %16 + br i1 %cmp19, label %cond.true20, label %cond.false21 + +cond.true20: ; preds = %cond.end15 + %17 = load i32, i32* %validXmin, align 4 + br label %cond.end22 + +cond.false21: ; preds = %cond.end15 + %18 = load i32, i32* %W, align 4 + br label %cond.end22 + +cond.end22: ; preds = %cond.false21, %cond.true20 + %cond23 = phi i32 [ %17, %cond.true20 ], [ %18, %cond.false21 ] + store i32 %cond23, i32* %W, align 4 + %19 = load i32, i32* %E, align 4 + %20 = load i32, i32* %validXmax, align 4 + %cmp24 = icmp sgt i32 %19, %20 + br i1 %cmp24, label %cond.true25, label 
%cond.false26 + +cond.true25: ; preds = %cond.end22 + %21 = load i32, i32* %validXmax, align 4 + br label %cond.end27 + +cond.false26: ; preds = %cond.end22 + %22 = load i32, i32* %E, align 4 + br label %cond.end27 + +cond.end27: ; preds = %cond.false26, %cond.true25 + %cond28 = phi i32 [ %21, %cond.true25 ], [ %22, %cond.false26 ] + store i32 %cond28, i32* %E, align 4 + %23 = load i32, i32* %tx, align 4 + %24 = load i32, i32* %validXmin, align 4 + %cmp29 = icmp sge i32 %23, %24 + br i1 %cmp29, label %land.rhs, label %land.end + +land.rhs: ; preds = %cond.end27 + %25 = load i32, i32* %tx, align 4 + %26 = load i32, i32* %validXmax, align 4 + %cmp30 = icmp sle i32 %25, %26 + br label %land.end + +land.end: ; preds = %land.rhs, %cond.end27 + %27 = phi i1 [ false, %cond.end27 ], [ %cmp30, %land.rhs ] + %frombool = zext i1 %27 to i8 + store i8 %frombool, i8* %isValid, align 1 + %28 = load i32, i32* %xidx, align 4 + %cmp31 = icmp sge i32 %28, 0 + br i1 %cmp31, label %land.lhs.true, label %if.end + +land.lhs.true: ; preds = %land.end + %29 = load i32, i32* %xidx, align 4 + %30 = load i32, i32* %cols.addr, align 4 + %sub32 = sub nsw i32 %30, 1 + %cmp33 = icmp sle i32 %29, %sub32 + br i1 %cmp33, label %if.then, label %if.end + +if.then: ; preds = %land.lhs.true + %31 = load i32*, i32** %gpuSrc.addr, align 8 + %32 = load i32, i32* %xidx, align 4 + %idxprom = sext i32 %32 to i64 + %arrayidx = getelementptr inbounds i32, i32* %31, i64 %idxprom + %33 = load i32, i32* %arrayidx, align 4 + %34 = load i32, i32* %tx, align 4 + %idxprom34 = sext i32 %34 to i64 + %arrayidx35 = getelementptr inbounds [256 x i32], [256 x i32]* addrspacecast ([256 x i32] addrspace(3)* @_ZZ14dynproc_kerneliPiS_S_iiiiE4prev to [256 x i32]*), i64 0, i64 %idxprom34 + store i32 %33, i32* %arrayidx35, align 4 + br label %if.end + +if.end: ; preds = %if.then, %land.lhs.true, %land.end + call void @llvm.nvvm.barrier0() + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %if.end 
+ %35 = load i32, i32* %i, align 4 + %36 = load i32, i32* %iteration.addr, align 4 + %cmp36 = icmp slt i32 %35, %36 + br i1 %cmp36, label %for.body, label %for.end + +for.body: ; preds = %for.cond + store i8 0, i8* %computed, align 1 + %37 = load i32, i32* %tx, align 4 + %38 = load i32, i32* %i, align 4 + %add37 = add nsw i32 %38, 1 + %cmp38 = icmp sge i32 %37, %add37 + br i1 %cmp38, label %land.lhs.true39, label %if.end69 + +land.lhs.true39: ; preds = %for.body + %39 = load i32, i32* %tx, align 4 + %40 = load i32, i32* %i, align 4 + %sub40 = sub nsw i32 256, %40 + %sub41 = sub nsw i32 %sub40, 2 + %cmp42 = icmp sle i32 %39, %sub41 + br i1 %cmp42, label %land.lhs.true43, label %if.end69 + +land.lhs.true43: ; preds = %land.lhs.true39 + %41 = load i8, i8* %isValid, align 1 + %tobool = trunc i8 %41 to i1 + br i1 %tobool, label %if.then44, label %if.end69 + +if.then44: ; preds = %land.lhs.true43 + store i8 1, i8* %computed, align 1 + %42 = load i32, i32* %W, align 4 + %idxprom45 = sext i32 %42 to i64 + %arrayidx46 = getelementptr inbounds [256 x i32], [256 x i32]* addrspacecast ([256 x i32] addrspace(3)* @_ZZ14dynproc_kerneliPiS_S_iiiiE4prev to [256 x i32]*), i64 0, i64 %idxprom45 + %43 = load i32, i32* %arrayidx46, align 4 + store i32 %43, i32* %left, align 4 + %44 = load i32, i32* %tx, align 4 + %idxprom47 = sext i32 %44 to i64 + %arrayidx48 = getelementptr inbounds [256 x i32], [256 x i32]* addrspacecast ([256 x i32] addrspace(3)* @_ZZ14dynproc_kerneliPiS_S_iiiiE4prev to [256 x i32]*), i64 0, i64 %idxprom47 + %45 = load i32, i32* %arrayidx48, align 4 + store i32 %45, i32* %up, align 4 + %46 = load i32, i32* %E, align 4 + %idxprom49 = sext i32 %46 to i64 + %arrayidx50 = getelementptr inbounds [256 x i32], [256 x i32]* addrspacecast ([256 x i32] addrspace(3)* @_ZZ14dynproc_kerneliPiS_S_iiiiE4prev to [256 x i32]*), i64 0, i64 %idxprom49 + %47 = load i32, i32* %arrayidx50, align 4 + store i32 %47, i32* %right, align 4 + %48 = load i32, i32* %left, align 4 + %49 = load 
i32, i32* %up, align 4 + %cmp51 = icmp sle i32 %48, %49 + br i1 %cmp51, label %cond.true52, label %cond.false53 + +cond.true52: ; preds = %if.then44 + %50 = load i32, i32* %left, align 4 + br label %cond.end54 + +cond.false53: ; preds = %if.then44 + %51 = load i32, i32* %up, align 4 + br label %cond.end54 + +cond.end54: ; preds = %cond.false53, %cond.true52 + %cond55 = phi i32 [ %50, %cond.true52 ], [ %51, %cond.false53 ] + store i32 %cond55, i32* %shortest, align 4 + %52 = load i32, i32* %shortest, align 4 + %53 = load i32, i32* %right, align 4 + %cmp56 = icmp sle i32 %52, %53 + br i1 %cmp56, label %cond.true57, label %cond.false58 + +cond.true57: ; preds = %cond.end54 + %54 = load i32, i32* %shortest, align 4 + br label %cond.end59 + +cond.false58: ; preds = %cond.end54 + %55 = load i32, i32* %right, align 4 + br label %cond.end59 + +cond.end59: ; preds = %cond.false58, %cond.true57 + %cond60 = phi i32 [ %54, %cond.true57 ], [ %55, %cond.false58 ] + store i32 %cond60, i32* %shortest, align 4 + %56 = load i32, i32* %cols.addr, align 4 + %57 = load i32, i32* %startStep.addr, align 4 + %58 = load i32, i32* %i, align 4 + %add61 = add nsw i32 %57, %58 + %mul62 = mul nsw i32 %56, %add61 + %59 = load i32, i32* %xidx, align 4 + %add63 = add nsw i32 %mul62, %59 + store i32 %add63, i32* %index, align 4 + %60 = load i32, i32* %shortest, align 4 + %61 = load i32*, i32** %gpuWall.addr, align 8 + %62 = load i32, i32* %index, align 4 + %idxprom64 = sext i32 %62 to i64 + %arrayidx65 = getelementptr inbounds i32, i32* %61, i64 %idxprom64 + %63 = load i32, i32* %arrayidx65, align 4 + %add66 = add nsw i32 %60, %63 + %64 = load i32, i32* %tx, align 4 + %idxprom67 = sext i32 %64 to i64 + %arrayidx68 = getelementptr inbounds [256 x i32], [256 x i32]* addrspacecast ([256 x i32] addrspace(3)* @_ZZ14dynproc_kerneliPiS_S_iiiiE6result to [256 x i32]*), i64 0, i64 %idxprom67 + store i32 %add66, i32* %arrayidx68, align 4 + br label %if.end69 + +if.end69: ; preds = %cond.end59, 
%land.lhs.true43, %land.lhs.true39, %for.body + call void @llvm.nvvm.barrier0() + %65 = load i32, i32* %i, align 4 + %66 = load i32, i32* %iteration.addr, align 4 + %sub70 = sub nsw i32 %66, 1 + %cmp71 = icmp eq i32 %65, %sub70 + br i1 %cmp71, label %if.then72, label %if.end73 + +if.then72: ; preds = %if.end69 + br label %for.end + +if.end73: ; preds = %if.end69 + %67 = load i8, i8* %computed, align 1 + %tobool74 = trunc i8 %67 to i1 + br i1 %tobool74, label %if.then75, label %if.end80 + +if.then75: ; preds = %if.end73 + %68 = load i32, i32* %tx, align 4 + %idxprom76 = sext i32 %68 to i64 + %arrayidx77 = getelementptr inbounds [256 x i32], [256 x i32]* addrspacecast ([256 x i32] addrspace(3)* @_ZZ14dynproc_kerneliPiS_S_iiiiE6result to [256 x i32]*), i64 0, i64 %idxprom76 + %69 = load i32, i32* %arrayidx77, align 4 + %70 = load i32, i32* %tx, align 4 + %idxprom78 = sext i32 %70 to i64 + %arrayidx79 = getelementptr inbounds [256 x i32], [256 x i32]* addrspacecast ([256 x i32] addrspace(3)* @_ZZ14dynproc_kerneliPiS_S_iiiiE4prev to [256 x i32]*), i64 0, i64 %idxprom78 + store i32 %69, i32* %arrayidx79, align 4 + br label %if.end80 + +if.end80: ; preds = %if.then75, %if.end73 + call void @llvm.nvvm.barrier0() + br label %for.inc + +for.inc: ; preds = %if.end80 + %71 = load i32, i32* %i, align 4 + %inc = add nsw i32 %71, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %if.then72, %for.cond + %72 = load i8, i8* %computed, align 1 + %tobool81 = trunc i8 %72 to i1 + br i1 %tobool81, label %if.then82, label %if.end87 + +if.then82: ; preds = %for.end + %73 = load i32, i32* %tx, align 4 + %idxprom83 = sext i32 %73 to i64 + %arrayidx84 = getelementptr inbounds [256 x i32], [256 x i32]* addrspacecast ([256 x i32] addrspace(3)* @_ZZ14dynproc_kerneliPiS_S_iiiiE6result to [256 x i32]*), i64 0, i64 %idxprom83 + %74 = load i32, i32* %arrayidx84, align 4 + %75 = load i32*, i32** %gpuResults.addr, align 8 + %76 = load i32, i32* %xidx, align 4 + 
%idxprom85 = sext i32 %76 to i64 + %arrayidx86 = getelementptr inbounds i32, i32* %75, i64 %idxprom85 + store i32 %74, i32* %arrayidx86, align 4 + br label %if.end87 + +if.end87: ; preds = %if.then82, %for.end + ret void +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() + ret i32 %0 +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + ret i32 %0 +} + +; Function Attrs: convergent nounwind +declare void @llvm.nvvm.barrier0() #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #3 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3 + +attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nounwind } +attributes #3 = 
{ nounwind readnone } + +!llvm.module.flags = !{!0, !1, !2} +!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6} +!llvm.ident = !{!8} +!nvvmir.version = !{!9} + +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0} +!3 = !{void (i32, i32*, i32*, i32*, i32, i32, i32, i32)* @_Z14dynproc_kerneliPiS_S_iiii, !"kernel", i32 1} +!4 = !{null, !"align", i32 8} +!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!6 = !{null, !"align", i32 16} +!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"} +!9 = !{i32 1, i32 4} diff --git a/examples/pathfinder/pathfinder-host-x86_64-unknown-linux-gnu.ll b/examples/pathfinder/pathfinder-host-x86_64-unknown-linux-gnu.ll new file mode 100644 index 0000000..720c1b8 --- /dev/null +++ b/examples/pathfinder/pathfinder-host-x86_64-unknown-linux-gnu.ll @@ -0,0 +1,745 @@ +; ModuleID = 'pathfinder-host-x86_64-unknown-linux-gnu.bc' +source_filename = "pathfinder.cu" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] } +%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 } +%struct.dim3 = type { i32, i32, i32 } +%struct.CUstream_st = type opaque + +$_ZN4dim3C2Ejjj = comdat any + +@rows = dso_local global i32 0, align 4 +@cols = dso_local global i32 0, align 4 +@data = dso_local global i32* null, align 8 +@wall = dso_local global i32** null, align 8 +@result = dso_local global i32* null, align 8 +@pyramid_height = dso_local global i32 0, align 4 +@.str = private unnamed_addr 
constant [47 x i8] c"Usage: dynproc row_len col_len pyramid_height\0A\00", align 1 +@stderr = external dso_local global %struct._IO_FILE*, align 8 +@.str.1 = private unnamed_addr constant [11 x i8] c"error: %s\0A\00", align 1 +@.str.2 = private unnamed_addr constant [92 x i8] c"pyramidHeight: %d\0AgridSize: [%d]\0Aborder:[%d]\0AblockSize: %d\0AblockGrid:[%d]\0AtargetBlock:[%d]\0A\00", align 1 +@.str.3 = private unnamed_addr constant [4 x i8] c"%d \00", align 1 +@.str.4 = private unnamed_addr constant [2 x i8] c"\0A\00", align 1 +@0 = private unnamed_addr constant [30 x i8] c"_Z14dynproc_kerneliPiS_S_iiii\00", align 1 +@1 = private constant [20737 x i8] c"P\EDU\BA\01\00\10\00\F0P\00\00\00\00\00\00\02\00\01\01@\00\00\00\E8B\00\00\00\00\00\00\00\00\00\00\00\00\00\00\07\00\01\00=\00\00\00\00\00\00\00\00\00\00\00\11\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\7FELF\02\01\013\07\00\00\00\00\00\00\00\02\00\BE\00e\00\00\00\00\00\00\00\00\00\00\00@B\00\00\00\00\00\00\C0?\00\00\00\00\00\00=\05=\00@\008\00\03\00@\00\0A\00\01\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00.text._Z14dynproc_kerneliPiS_S_iiii\00.nv.info._Z14dynproc_kerneliPiS_S_iiii\00.nv.shared._Z14dynproc_kerneliPiS_S_iiii\00.nv.global\00.nv.constant0._Z14dynproc_kerneliPiS_S_iiii\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00_Z14dynproc_kerneliPiS_S_iiii\00.text._Z14dynproc_kerneliPiS_S_iiii\00.nv.info._Z14dynproc_kerneliPiS_S_iiii\00.nv.shared._Z14dynproc_kerneliPiS_S_iiii\00.nv.global\00blockIdx\00threadIdx\00$___ZZ14dynproc_kerneliPiS_S_iiiiE4prev__187\00$___ZZ14dynproc_kerneliPiS_S_iiiiE6result__189\00.nv.constant0._Z14dynproc_kerneliPiS_S_iiii\00_param\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00P\00\00\00\03\00\07\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\9B\00\00\00\03\00\08\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\C4\00\00\00\03\00\09\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\CF\0
0\00\00\01\00\09\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\D8\00\00\00\01\00\09\00\01\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00>\01\00\00\03\00\06\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\002\00\00\00\12\10\07\00\00\00\00\00\00\00\00\00\00:\00\00\00\00\00\00\04/\08\00\07\00\00\00\11\00\00\00\04#\08\00\07\00\00\00\00\00\00\00\04\12\08\00\07\00\00\00x\00\00\00\04\11\08\00\07\00\00\00x\00\00\00\010\00\00\01*\00\00\04\0A\08\00\06\00\00\00@\010\00\03\190\00\04\17\0C\00\00\00\00\00\07\00,\00\00\F0\11\00\04\17\0C\00\00\00\00\00\06\00(\00\00\F0\11\00\04\17\0C\00\00\00\00\00\05\00$\00\00\F0\11\00\04\17\0C\00\00\00\00\00\04\00 \00\00\F0\11\00\04\17\0C\00\00\00\00\00\03\00\18\00\00\F0!\00\04\17\0C\00\00\00\00\00\02\00\10\00\00\F0!\00\04\17\0C\00\00\00\00\00\01\00\08\00\00\F0!\00\04\17\0C\00\00\00\00\00\00\00\00\00\00\F0\11\00\03\1B\FF\00\04\1D\04\00\E8\05\00\00\04\1C\04\00\D89\00\00\04\1E\04\00\80\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\0
0\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\EF\1F\E0\FD\03!\00\D3rd<3>;\0A\0Amov.u2\00\1B,e\00b;\0Acvta\8D\00\04%\00\13,\\\00\22ld\C8\00\02\18\00nrd2, [\CE\00\1E])\00\1F1)\00\01a0];\0Ast#\00\81[%SP+0],,\00\0A\16\00\128\16\00\222;\B6\00\01\D8\00a1, 999(\00\02g\00\00\EF\00\18[\9E\01\03M\00\AF1;\0Aret;\0A\0A}\D6\01\1A\FE\02FuncGetAttributes\E1\01\0D#\00\0E\EC\01\0F+\00\06\0F\F7\01\1B\1F1\F7\01Q\1F1\F7\01!\0E\D9\00\0F\02\02\0F\0E8\01\0F\0D\02\8DhDevice\B4\00\0E\0E\02\0E$\00\0F\0F\02\00/32,\00\0B\1F1,\00\18\1F2<\02\13\1F2<\02\1F\1D4<\02\1F2<\02\0C\1F2<\02\13\01_\00\04;\02\0F\D9\00\07\1D]4\00\1F14\00\06\0Fp\02\10\0E\9A\01\0Fq\02\12(32q\02\0B\15\00!12\16\00\09\86\02\1F3\86\02\15\1F3\86\02#2Get\CB\00\0E}\02\05\1B\00\04\DA\00\0F\1C\02\13?3[8W\04.\0F\1B\02\0D\1F3W\04\19\04\B3\01\0D\D0\00\0F\AA\01\06\0F\05\04W\F0\04OccupancyMaxActiveBV\08\FE\03sPerMultiprocessor\9F\01\0F;\00\16\0EB\06\0FC\00%\0EJ\04\0FC\00\1E\1F2\86\00/\1F3\88\02\13O4[32\89\02\1C\1D3\89\02\1F4\89\02\0C\1F4\89\02\19\133\89\02\0F\F1\00\1E\0F\BC\04\00\0FK\00$/2]w\07\00\0FL\00$\0F\1F\05\01\0F\98\00%\0F\A7\07\1D\097\05\186M\05\04,\00\2224-\00\183\CF\03\1F2\CF\03\15\1F2\CF\03L\9FWithFlags\D8\03(\05D\00\0E\E1\03\0FL\00'\0F\EA\030\0CL\00\1F2\98\008\1F3\98\008\1F4H\04\13O5[40\EC\08.\0FH\04\0D\1F5\EC\08\1C\0F\F9\00+\1F]\9C\040\0D\9A\01\0F\A5\040\0D:\02\0F\AE\041\0D\DB\02\0F\B7\041\0D|\03\0F\C0\04I\08-\00\1F3$\0A6\F0\07visible .entry _Z14dyn\E2\00\F6\03_kerneliPiS_S_iiii\A6\04\00r\00\0F+\00\0A\0E\8D\04\00\D3\00\0F3\00\11\1F13\00\1F\1F23\00\1F/3,\CC\00\1E\1F43\00\1F\1F53\00\1F\1F63\00\1F\1F7\C2\04\13O6[12\C3\04\16\A6pred %p<21\C5\04\AB16 %rs<10>\E9\04=105\EB\04 
56\EC\04P\09.shaK\00\03\97\00\124\97\00\1FZ\D6\00\09\CFE4prev[1024]C\00%t6resultE\00\0Fs\05\08\1F6s\05\19\00!\04\0F\92\01\12\0E\06\05/20<\00\14\1F6B\05\00\1F9<\00\14\1F5<\00\00\1F8<\00\14\0F\0F\06\02\0F<\00\14\1F3\A2\05\01\0F<\00\14\0F\DE\05\01\0Fh\01\15\1F1\F0\00\00\1F7<\00\14#0]\FA\01#to\B6\15\04~\00\144r\05\01\1F\00\0A\1C\00\115\1C\00\1F4;\00\05\146\DA\05\0F;\00\00\117\1C\00\1F6;\00\05\148+\06\0F;\00\00\119\1C\00\1A8\04\06\03r\0E\1F7]\06\02\1A9\16\00\03]\06/d7^\06\03\1F5^\06\02\1B1q\00\133\A2\06\1A9\17\00\134,\0B\1B0\17\00\02\\\00\192I\0F\CB22, %ctaid.x/\00\02\B9\00\192\D3\06n23, %t-\00\135\FF\06\113\FF\01\03,\00$4,\18\01S;\0Ashl\9D\04325,\1D\00\0A\89\00\D26, 256;\0Asub.s\13\00#7,\19\00\006\00\0Bq\00\02\FB\00(27q\00%8,\1D\00\08\17\00%9,\D2\00\83;\0Amul.lob\00330,8\00\00'\00\074\00531,5\01\08\93\00532,7\00\1B3'\12\136\CD\07\182H\00%3,\1D\00T;\0AaddH\00#4,\1E\00+25\DA\00\126\A7\01(34G\00\185G\00\06\17\00%6,\7F\01\09^\00#7,5\00\00$\00\0B_\00\02\D7\01\1839\01(38_\00rsetp.gtL\003p3,!\00\F2\0D-1;\0A@%p3 bra LBB6_2;\0Abra.uni\10\0021;\0A\08\00\16:[\00(40[\00Tneg.s\84\0A\00\1D\00\08\02\02#99o\09\09U\00\133U\00(2:Z\09?9, 
<\00\00/39=\00\04*3:m\0E\001\00\0A4\03\157\C0\09\06\BB\00%1,\92\01\08\17\00%2,\95\03\08u\01343,\1E\00#-1E\01#leE\01#4,P\00\00'\00\01G\01\164G\01\1B5\B5\00\134\B5\00\184G\01\1F5\8C\00\03\186\8C\00\06\A7\02347,\1E\00\00;\00\09\1B\02\13,\1F\009254Q\13\120\D4\02\0B\8C\00\136\8C\00\185A\01#44\B7\02\0D?\00/44@\00\04\196@\00\12,2\00\0B\82\01\02\D2\03\08\F7\02/48\E0\02\03349,\1E\00\1C-\C0\15\02\84\03\184\CC\03/50F\00\03351,\1E\00\0CE\00\02\82\03(51E\00\04\F6\01\198\82\03553,@\02\04:\03\04\F5\01#5,8\00\00'\00\01\F5\01\165\F5\01\1B8)\01\137)\01\177\F5\01\185\\\00\08l\01\02\B9\00\0BB\00\139B\00\178B\00\186\B5\00\0DB\00\1F6B\00\04\189\AD\01\227,1\00\0Fh\01\02\08E\04\145\EF\05\198\A3\02\155\C6\00,6]\17\03#6,8\00\00'\00\01\22\01\166\22\01\1C1\0A\04$10`\04\170\E3\00\188_\00\08\E3\00\01y\01\0Cg\01$12D\00\08\A5\04\189\BA\00\0DD\00\1F9D\00\06\09\94\04210,4\00\0FM\02\02(10\14\01\09\B8\05\06\17\00\1D7M\02\14l\87\05#8,8\00\00'\00\02\A5\00\03\12\0C\157\0B\05\03\11\00\00\D9\0A1%p7P\01\178P\01\0C\9C\03\141\1E\05(13u\02\1F8\9C\00\03\1F9\C7\01\07#1,8\00\00'\00\0F\8B\00\00\0D\C9\01\04\E2\04914:.\00\11,3\00\00_\00Blp.u\DE\0Ca1, 1, H\00\03Z\01\138Y\01\02\C9\06\18s\A6\03\156n\06\1D8B\01#9,!\00\110\1A\01\179\1A\01\1C7\A1\00\145\A1\00\185M\03\181]\00\06\17\00\0F\10\06\04\116k\05\186\10\06\05U\07\02,\02\126\86\03\02\1C\04/10\91\00\09\04H\05\2216\91\00\03\96\0B%0,N\0A\01\92\00\02z\00)d1\AA\00\01{\09\031\00$2, \00\132\AC\00\03\19\00$3,Q\00\01'\00\08\E0\00\01\D8\03\00#\00\0Ad\00\184(\02\08d\00$5, \00\172\E9\0D_rd16,i\0E\12\03q\0B\02e\0E\05=\00\02\AD\0B*16\B7\00(8,\1D\00\08J\09\00\1D\00\01!\02\1D6\F8\02\04l\05\CA17:\0Abar.syncO\08K65, 
\9F\06\129!\08\1C6w\05$18N\00\09y\05\04\E2\03\199F\03(67\13\0B\09.\06\00\94\01\028\00\00'\00\02\13\02\161\13\02\1D3x\00\04\AE\05\141\AF\05\020\03\152\BA\00\04(\03\129(\03\09\D4\01\0F\E5\03\03(69\B6\00\06\CB\02370,\1E\00\1C1V\03\02R\02\116\88\01\127Z\03'12#\0A\0C=\05\142\C5\05\182\C5\05/71\90\00\02(72\90\00\07\94\06\133\C0\08\06\F2\08374,\19\00\007\00\0Dp\03#3,f\00\00*\00\02\A6\00\1F3\A6\00\09\04\CB\0A\132'\06\108B\01%3,z\04T;\0Aand\81\11#4,\1E\00\04\1E\01\22eq\1B\00\10p\05\03\02!\00\00=\0B\10!\11\00\0Fv\00\09\04Y\06)22\D3\01\155\A2\08\0B\D3\01\195\91\03\1D9\ED\07\03\F8\0F\1F0x\03(\2221A\0F\190\E4\03\016\0E\02s\00\0AH\04\01 \0E\066\00\08t\02\2275H\04;23]Z\03\02\F9\09\197\CE\00\1F2_\04\05\02?\0E\01 \00\0B{\00\196{\00\185{\00\136{\00\1B6{\00#10\C8\08\196|\00\187\10\08\08|\00$8, \00\0B|\00\199|\00\188|\00\137|\00\1D9|\00\02\1A\08\187E\09\147\99\03\199\C3\0E679,\B1\00\0D\DD\02#5,:\00\00)\00\02\DD\02\07j\0A\1C2\00\05\152S\0D\08\F8\07\054\06\1C9I\09\01\D3\01\0D\8E\09$25F\00\08\A8\0C)12\A9\00\0EG\00\1F2G\00\06\09c\0C\02\B1\03<1034\01\03\A1!\08h\10\158\06\07\190\99\07681,j\01\0D5\01#6,;\00\00)\00\025\01\07}\0A,27\A8\00\04\82\07(26\EF\00\194{\00\08\EF\00\02\E5\0C\0D|\01\04.\06\182\E9\0B)15\AA\00\0EG\00\1F5G\00\06\1986\01\226,5\00\0F6\01\04\186\1E\01\09\B7\08\06\17\00\04\E5\04\194\E8\0C/84\1A\06\03385,5\00\00$\00\0B^\11386,i\00\00)\00\08N\00\187\A0\08\07N\00&8,7\00\1B7\C5\00\03\86#\188]\03-89\FB\01\03(\15\04\14\02\198\B5\08\05\D7\11\00N\00\09\DB\03\02\D9\11-d3\1A\09\00\A4\11\03Q\00\01'\00\07\DD\11\2290\DB\03)33\C6\00391,\9B\00\00#\00\09\7F\00\09\D5\04\07\A3\05\00\AB\11\0FA\17\13\0F\A5\05\02\2236\9D\00\1A5\D3\00$7,u\00\0B\D3\00(8,6\00\1976\09\123,\02\1D9\F0\03\04p\08*296\09\06\0F\01\1F2\F4\08\02'93\F4\08\07&\01#4,\1D\00\05\1F\0B\15n\0C\09#7,P\00\00(\00\02\9D\03\177\0C\09\0D$\12\04U\08;30:\1A\00\04_\04\183\C9\07\05\17\0F\1A9\C9\07#7,\1E\00\0E\C9\07#8,!\00\04\C9\07\07Z\0D\1C36\11$32v\00(2:\FF\01\1F9\D4\06\04\00%\13\03 
\00\0A3\0B/41\18\02*\124\D5\02)41\FF\01\02\A9\12\05\1D\00\09\C3\01\03\A4\07:43]\88\00\1F4C\08(\134\A3\17\1A4\86\00(6,\1D\00\190\85\02\124\13\08\1C9\F8\04\153\D1\13\1A3\85\02\0A&\00\04`\0E\183\9B\06/96\C3\04\03\129m\02\1D6\07\12\03\F2\0B/97\F2\0B\05'35T\02\1F8T\02\04#9,\1E\00\0ET\02\02\1B\00\169T\02\07\94\0E\1C3v\00$36v\00\186T\02/47T\02\05$8, \00\0BT\02\1F9T\02*\2250X\00\199\CE\01\02L\13\05\1D\00\08\A4\05\2298T\02)51\A2\05\05Q\13)24\A3\05/53\BC\0E\04\02(\12\01 \00\0A\CF\04\00b\12\03Q\00\01'\00\09J\02\2255\C5\01\0C\09\12$37O\01\B07:\0Aret;\0A\0A}\0A\00\00", section ".nv_fatbin", align 8 +@__cuda_fatbin_wrapper = internal constant { i32, i32, i8*, i8* } { i32 1180844977, i32 1, i8* getelementptr inbounds ([20737 x i8], [20737 x i8]* @1, i64 0, i64 0), i8* null }, section ".nvFatBinSegment", align 8 +@__cuda_gpubin_handle = internal global i8** null, align 8 +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* bitcast (void (i8*)* @__cuda_module_ctor to void ()*), i8* null }] + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z4initiPPc(i32 %argc, i8** %argv) #0 { +entry: + %argc.addr = alloca i32, align 4 + %argv.addr = alloca i8**, align 8 + %n = alloca i32, align 4 + %seed = alloca i32, align 4 + %i = alloca i32, align 4 + %j = alloca i32, align 4 + store i32 %argc, i32* %argc.addr, align 4 + store i8** %argv, i8*** %argv.addr, align 8 + %0 = load i32, i32* %argc.addr, align 4 + %cmp = icmp eq i32 %0, 4 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %1 = load i8**, i8*** %argv.addr, align 8 + %arrayidx = getelementptr inbounds i8*, i8** %1, i64 1 + %2 = load i8*, i8** %arrayidx, align 8 + %call = call i32 @atoi(i8* %2) #11 + store i32 %call, i32* @cols, align 4 + %3 = load i8**, i8*** %argv.addr, align 8 + %arrayidx1 = getelementptr inbounds i8*, i8** %3, i64 2 + %4 = load i8*, i8** %arrayidx1, align 8 + %call2 = call i32 @atoi(i8* %4) #11 + store i32 
%call2, i32* @rows, align 4 + %5 = load i8**, i8*** %argv.addr, align 8 + %arrayidx3 = getelementptr inbounds i8*, i8** %5, i64 3 + %6 = load i8*, i8** %arrayidx3, align 8 + %call4 = call i32 @atoi(i8* %6) #11 + store i32 %call4, i32* @pyramid_height, align 4 + br label %if.end + +if.else: ; preds = %entry + %call5 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([47 x i8], [47 x i8]* @.str, i64 0, i64 0)) + call void @exit(i32 0) #12 + unreachable + +if.end: ; preds = %if.then + %7 = load i32, i32* @rows, align 4 + %8 = load i32, i32* @cols, align 4 + %mul = mul nsw i32 %7, %8 + %9 = sext i32 %mul to i64 + %10 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %9, i64 4) + %11 = extractvalue { i64, i1 } %10, 1 + %12 = extractvalue { i64, i1 } %10, 0 + %13 = select i1 %11, i64 -1, i64 %12 + %call6 = call i8* @_Znam(i64 %13) #13 + %14 = bitcast i8* %call6 to i32* + store i32* %14, i32** @data, align 8 + %15 = load i32, i32* @rows, align 4 + %16 = sext i32 %15 to i64 + %17 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %16, i64 8) + %18 = extractvalue { i64, i1 } %17, 1 + %19 = extractvalue { i64, i1 } %17, 0 + %20 = select i1 %18, i64 -1, i64 %19 + %call7 = call i8* @_Znam(i64 %20) #13 + %21 = bitcast i8* %call7 to i32** + store i32** %21, i32*** @wall, align 8 + store i32 0, i32* %n, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %if.end + %22 = load i32, i32* %n, align 4 + %23 = load i32, i32* @rows, align 4 + %cmp8 = icmp slt i32 %22, %23 + br i1 %cmp8, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %24 = load i32*, i32** @data, align 8 + %25 = load i32, i32* @cols, align 4 + %26 = load i32, i32* %n, align 4 + %mul9 = mul nsw i32 %25, %26 + %idx.ext = sext i32 %mul9 to i64 + %add.ptr = getelementptr inbounds i32, i32* %24, i64 %idx.ext + %27 = load i32**, i32*** @wall, align 8 + %28 = load i32, i32* %n, align 4 + %idxprom = sext i32 %28 to i64 + %arrayidx10 = getelementptr inbounds i32*, i32** %27, i64 %idxprom + 
store i32* %add.ptr, i32** %arrayidx10, align 8 + br label %for.inc + +for.inc: ; preds = %for.body + %29 = load i32, i32* %n, align 4 + %inc = add nsw i32 %29, 1 + store i32 %inc, i32* %n, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %30 = load i32, i32* @cols, align 4 + %31 = sext i32 %30 to i64 + %32 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %31, i64 4) + %33 = extractvalue { i64, i1 } %32, 1 + %34 = extractvalue { i64, i1 } %32, 0 + %35 = select i1 %33, i64 -1, i64 %34 + %call11 = call i8* @_Znam(i64 %35) #13 + %36 = bitcast i8* %call11 to i32* + store i32* %36, i32** @result, align 8 + store i32 9, i32* %seed, align 4 + %37 = load i32, i32* %seed, align 4 + call void @srand(i32 %37) #14 + store i32 0, i32* %i, align 4 + br label %for.cond12 + +for.cond12: ; preds = %for.inc26, %for.end + %38 = load i32, i32* %i, align 4 + %39 = load i32, i32* @rows, align 4 + %cmp13 = icmp slt i32 %38, %39 + br i1 %cmp13, label %for.body14, label %for.end28 + +for.body14: ; preds = %for.cond12 + store i32 0, i32* %j, align 4 + br label %for.cond15 + +for.cond15: ; preds = %for.inc23, %for.body14 + %40 = load i32, i32* %j, align 4 + %41 = load i32, i32* @cols, align 4 + %cmp16 = icmp slt i32 %40, %41 + br i1 %cmp16, label %for.body17, label %for.end25 + +for.body17: ; preds = %for.cond15 + %call18 = call i32 @rand() #14 + %rem = srem i32 %call18, 10 + %42 = load i32**, i32*** @wall, align 8 + %43 = load i32, i32* %i, align 4 + %idxprom19 = sext i32 %43 to i64 + %arrayidx20 = getelementptr inbounds i32*, i32** %42, i64 %idxprom19 + %44 = load i32*, i32** %arrayidx20, align 8 + %45 = load i32, i32* %j, align 4 + %idxprom21 = sext i32 %45 to i64 + %arrayidx22 = getelementptr inbounds i32, i32* %44, i64 %idxprom21 + store i32 %rem, i32* %arrayidx22, align 4 + br label %for.inc23 + +for.inc23: ; preds = %for.body17 + %46 = load i32, i32* %j, align 4 + %inc24 = add nsw i32 %46, 1 + store i32 %inc24, i32* %j, align 4 + br label %for.cond15 + +for.end25: ; 
preds = %for.cond15 + br label %for.inc26 + +for.inc26: ; preds = %for.end25 + %47 = load i32, i32* %i, align 4 + %inc27 = add nsw i32 %47, 1 + store i32 %inc27, i32* %i, align 4 + br label %for.cond12 + +for.end28: ; preds = %for.cond12 + ret void +} + +; Function Attrs: nounwind readonly +declare dso_local i32 @atoi(i8*) #1 + +declare dso_local i32 @printf(i8*, ...) #2 + +; Function Attrs: noreturn nounwind +declare dso_local void @exit(i32) #3 + +; Function Attrs: nounwind readnone speculatable willreturn +declare { i64, i1 } @llvm.umul.with.overflow.i64(i64, i64) #4 + +; Function Attrs: nobuiltin +declare dso_local noalias i8* @_Znam(i64) #5 + +; Function Attrs: nounwind +declare dso_local void @srand(i32) #6 + +; Function Attrs: nounwind +declare dso_local i32 @rand() #6 + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z5fatalPc(i8* %s) #0 { +entry: + %s.addr = alloca i8*, align 8 + store i8* %s, i8** %s.addr, align 8 + %0 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %1 = load i8*, i8** %s.addr, align 8 + %call = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %0, i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str.1, i64 0, i64 0), i8* %1) + ret void +} + +declare dso_local i32 @fprintf(%struct._IO_FILE*, i8*, ...) 
#2 + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z14dynproc_kerneliPiS_S_iiii(i32 %iteration, i32* %gpuWall, i32* %gpuSrc, i32* %gpuResults, i32 %cols, i32 %rows, i32 %startStep, i32 %border) #0 { +entry: + %iteration.addr = alloca i32, align 4 + %gpuWall.addr = alloca i32*, align 8 + %gpuSrc.addr = alloca i32*, align 8 + %gpuResults.addr = alloca i32*, align 8 + %cols.addr = alloca i32, align 4 + %rows.addr = alloca i32, align 4 + %startStep.addr = alloca i32, align 4 + %border.addr = alloca i32, align 4 + %grid_dim = alloca %struct.dim3, align 8 + %block_dim = alloca %struct.dim3, align 8 + %shmem_size = alloca i64, align 8 + %stream = alloca i8*, align 8 + %grid_dim.coerce = alloca { i64, i32 }, align 8 + %block_dim.coerce = alloca { i64, i32 }, align 8 + store i32 %iteration, i32* %iteration.addr, align 4 + store i32* %gpuWall, i32** %gpuWall.addr, align 8 + store i32* %gpuSrc, i32** %gpuSrc.addr, align 8 + store i32* %gpuResults, i32** %gpuResults.addr, align 8 + store i32 %cols, i32* %cols.addr, align 4 + store i32 %rows, i32* %rows.addr, align 4 + store i32 %startStep, i32* %startStep.addr, align 4 + store i32 %border, i32* %border.addr, align 4 + %kernel_args = alloca i8*, i64 8, align 16 + %0 = bitcast i32* %iteration.addr to i8* + %1 = getelementptr i8*, i8** %kernel_args, i32 0 + store i8* %0, i8** %1 + %2 = bitcast i32** %gpuWall.addr to i8* + %3 = getelementptr i8*, i8** %kernel_args, i32 1 + store i8* %2, i8** %3 + %4 = bitcast i32** %gpuSrc.addr to i8* + %5 = getelementptr i8*, i8** %kernel_args, i32 2 + store i8* %4, i8** %5 + %6 = bitcast i32** %gpuResults.addr to i8* + %7 = getelementptr i8*, i8** %kernel_args, i32 3 + store i8* %6, i8** %7 + %8 = bitcast i32* %cols.addr to i8* + %9 = getelementptr i8*, i8** %kernel_args, i32 4 + store i8* %8, i8** %9 + %10 = bitcast i32* %rows.addr to i8* + %11 = getelementptr i8*, i8** %kernel_args, i32 5 + store i8* %10, i8** %11 + %12 = bitcast i32* %startStep.addr to i8* + %13 = 
getelementptr i8*, i8** %kernel_args, i32 6 + store i8* %12, i8** %13 + %14 = bitcast i32* %border.addr to i8* + %15 = getelementptr i8*, i8** %kernel_args, i32 7 + store i8* %14, i8** %15 + %16 = call i32 @__cudaPopCallConfiguration(%struct.dim3* %grid_dim, %struct.dim3* %block_dim, i64* %shmem_size, i8** %stream) + %17 = load i64, i64* %shmem_size, align 8 + %18 = load i8*, i8** %stream, align 8 + %19 = bitcast { i64, i32 }* %grid_dim.coerce to i8* + %20 = bitcast %struct.dim3* %grid_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %19, i8* align 8 %20, i64 12, i1 false) + %21 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 0 + %22 = load i64, i64* %21, align 8 + %23 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 1 + %24 = load i32, i32* %23, align 8 + %25 = bitcast { i64, i32 }* %block_dim.coerce to i8* + %26 = bitcast %struct.dim3* %block_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %25, i8* align 8 %26, i64 12, i1 false) + %27 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 0 + %28 = load i64, i64* %27, align 8 + %29 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 1 + %30 = load i32, i32* %29, align 8 + %31 = bitcast i8* %18 to %struct.CUstream_st* + %call = call i32 @cudaLaunchKernel(i8* bitcast (void (i32, i32*, i32*, i32*, i32, i32, i32, i32)* @_Z14dynproc_kerneliPiS_S_iiii to i8*), i64 %22, i32 %24, i64 %28, i32 %30, i8** %kernel_args, i64 %17, %struct.CUstream_st* %31) + br label %setup.end + +setup.end: ; preds = %entry + ret void +} + +declare dso_local i32 @__cudaPopCallConfiguration(%struct.dim3*, %struct.dim3*, i64*, i8**) + +declare dso_local i32 @cudaLaunchKernel(i8*, i64, i32, i64, i32, i8**, i64, %struct.CUstream_st*) + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 
immarg) #7 + +; Function Attrs: noinline optnone uwtable +define dso_local i32 @_Z9calc_pathPiPS_iiiii(i32* %gpuWall, i32** %gpuResult, i32 %rows, i32 %cols, i32 %pyramid_height, i32 %blockCols, i32 %borderCols) #0 { +entry: + %gpuWall.addr = alloca i32*, align 8 + %gpuResult.addr = alloca i32**, align 8 + %rows.addr = alloca i32, align 4 + %cols.addr = alloca i32, align 4 + %pyramid_height.addr = alloca i32, align 4 + %blockCols.addr = alloca i32, align 4 + %borderCols.addr = alloca i32, align 4 + %dimBlock = alloca %struct.dim3, align 4 + %dimGrid = alloca %struct.dim3, align 4 + %src = alloca i32, align 4 + %dst = alloca i32, align 4 + %t = alloca i32, align 4 + %temp = alloca i32, align 4 + %agg.tmp = alloca %struct.dim3, align 4 + %agg.tmp1 = alloca %struct.dim3, align 4 + %agg.tmp.coerce = alloca { i64, i32 }, align 4 + %agg.tmp1.coerce = alloca { i64, i32 }, align 4 + store i32* %gpuWall, i32** %gpuWall.addr, align 8 + store i32** %gpuResult, i32*** %gpuResult.addr, align 8 + store i32 %rows, i32* %rows.addr, align 4 + store i32 %cols, i32* %cols.addr, align 4 + store i32 %pyramid_height, i32* %pyramid_height.addr, align 4 + store i32 %blockCols, i32* %blockCols.addr, align 4 + store i32 %borderCols, i32* %borderCols.addr, align 4 + call void @_ZN4dim3C2Ejjj(%struct.dim3* %dimBlock, i32 256, i32 1, i32 1) + %0 = load i32, i32* %blockCols.addr, align 4 + call void @_ZN4dim3C2Ejjj(%struct.dim3* %dimGrid, i32 %0, i32 1, i32 1) + store i32 1, i32* %src, align 4 + store i32 0, i32* %dst, align 4 + store i32 0, i32* %t, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %1 = load i32, i32* %t, align 4 + %2 = load i32, i32* %rows.addr, align 4 + %sub = sub nsw i32 %2, 1 + %cmp = icmp slt i32 %1, %sub + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %3 = load i32, i32* %src, align 4 + store i32 %3, i32* %temp, align 4 + %4 = load i32, i32* %dst, align 4 + store i32 %4, i32* %src, align 4 + %5 = load i32, i32* 
%temp, align 4 + store i32 %5, i32* %dst, align 4 + %6 = bitcast %struct.dim3* %agg.tmp to i8* + %7 = bitcast %struct.dim3* %dimGrid to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %6, i8* align 4 %7, i64 12, i1 false) + %8 = bitcast %struct.dim3* %agg.tmp1 to i8* + %9 = bitcast %struct.dim3* %dimBlock to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %8, i8* align 4 %9, i64 12, i1 false) + %10 = bitcast { i64, i32 }* %agg.tmp.coerce to i8* + %11 = bitcast %struct.dim3* %agg.tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %10, i8* align 4 %11, i64 12, i1 false) + %12 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 0 + %13 = load i64, i64* %12, align 4 + %14 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 1 + %15 = load i32, i32* %14, align 4 + %16 = bitcast { i64, i32 }* %agg.tmp1.coerce to i8* + %17 = bitcast %struct.dim3* %agg.tmp1 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %16, i8* align 4 %17, i64 12, i1 false) + %18 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp1.coerce, i32 0, i32 0 + %19 = load i64, i64* %18, align 4 + %20 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp1.coerce, i32 0, i32 1 + %21 = load i32, i32* %20, align 4 + %call = call i32 @__cudaPushCallConfiguration(i64 %13, i32 %15, i64 %19, i32 %21, i64 0, i8* null) + %tobool = icmp ne i32 %call, 0 + br i1 %tobool, label %kcall.end, label %kcall.configok + +kcall.configok: ; preds = %for.body + %22 = load i32, i32* %pyramid_height.addr, align 4 + %23 = load i32, i32* %rows.addr, align 4 + %24 = load i32, i32* %t, align 4 + %sub2 = sub nsw i32 %23, %24 + %sub3 = sub nsw i32 %sub2, 1 + %cmp4 = icmp sle i32 %22, %sub3 + br i1 %cmp4, label %cond.true, label %cond.false + +cond.true: ; preds = %kcall.configok + %25 = load i32, i32* %pyramid_height.addr, align 4 + br label %cond.end + +cond.false: ; preds = %kcall.configok + %26 = load i32, i32* %rows.addr, align 4 
+ %27 = load i32, i32* %t, align 4 + %sub5 = sub nsw i32 %26, %27 + %sub6 = sub nsw i32 %sub5, 1 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ %25, %cond.true ], [ %sub6, %cond.false ] + %28 = load i32*, i32** %gpuWall.addr, align 8 + %29 = load i32**, i32*** %gpuResult.addr, align 8 + %30 = load i32, i32* %src, align 4 + %idxprom = sext i32 %30 to i64 + %arrayidx = getelementptr inbounds i32*, i32** %29, i64 %idxprom + %31 = load i32*, i32** %arrayidx, align 8 + %32 = load i32**, i32*** %gpuResult.addr, align 8 + %33 = load i32, i32* %dst, align 4 + %idxprom7 = sext i32 %33 to i64 + %arrayidx8 = getelementptr inbounds i32*, i32** %32, i64 %idxprom7 + %34 = load i32*, i32** %arrayidx8, align 8 + %35 = load i32, i32* %cols.addr, align 4 + %36 = load i32, i32* %rows.addr, align 4 + %37 = load i32, i32* %t, align 4 + %38 = load i32, i32* %borderCols.addr, align 4 + call void @_Z14dynproc_kerneliPiS_S_iiii(i32 %cond, i32* %28, i32* %31, i32* %34, i32 %35, i32 %36, i32 %37, i32 %38) + br label %kcall.end + +kcall.end: ; preds = %cond.end, %for.body + %call9 = call i32 @cudaDeviceSynchronize() + br label %for.inc + +for.inc: ; preds = %kcall.end + %39 = load i32, i32* %pyramid_height.addr, align 4 + %40 = load i32, i32* %t, align 4 + %add = add nsw i32 %40, %39 + store i32 %add, i32* %t, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %41 = load i32, i32* %dst, align 4 + ret i32 %41 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN4dim3C2Ejjj(%struct.dim3* %this, i32 %vx, i32 %vy, i32 %vz) unnamed_addr #8 comdat align 2 { +entry: + %this.addr = alloca %struct.dim3*, align 8 + %vx.addr = alloca i32, align 4 + %vy.addr = alloca i32, align 4 + %vz.addr = alloca i32, align 4 + store %struct.dim3* %this, %struct.dim3** %this.addr, align 8 + store i32 %vx, i32* %vx.addr, align 4 + store i32 %vy, i32* %vy.addr, align 4 + store i32 %vz, i32* %vz.addr, align 4 + %this1 = load 
%struct.dim3*, %struct.dim3** %this.addr, align 8 + %x = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 0 + %0 = load i32, i32* %vx.addr, align 4 + store i32 %0, i32* %x, align 4 + %y = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 1 + %1 = load i32, i32* %vy.addr, align 4 + store i32 %1, i32* %y, align 4 + %z = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 2 + %2 = load i32, i32* %vz.addr, align 4 + store i32 %2, i32* %z, align 4 + ret void +} + +declare dso_local i32 @__cudaPushCallConfiguration(i64, i32, i64, i32, i64, i8*) #2 + +declare dso_local i32 @cudaDeviceSynchronize() #2 + +; Function Attrs: noinline norecurse optnone uwtable +define dso_local i32 @main(i32 %argc, i8** %argv) #9 { +entry: + %retval = alloca i32, align 4 + %argc.addr = alloca i32, align 4 + %argv.addr = alloca i8**, align 8 + store i32 0, i32* %retval, align 4 + store i32 %argc, i32* %argc.addr, align 4 + store i8** %argv, i8*** %argv.addr, align 8 + %call = call i32 @cudaSetDevice(i32 0) + %0 = load i32, i32* %argc.addr, align 4 + %1 = load i8**, i8*** %argv.addr, align 8 + call void @_Z3runiPPc(i32 %0, i8** %1) + ret i32 0 +} + +declare dso_local i32 @cudaSetDevice(i32) #2 + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z3runiPPc(i32 %argc, i8** %argv) #0 { +entry: + %argc.addr = alloca i32, align 4 + %argv.addr = alloca i8**, align 8 + %borderCols = alloca i32, align 4 + %smallBlockCol = alloca i32, align 4 + %blockCols = alloca i32, align 4 + %gpuWall = alloca i32*, align 8 + %gpuResult = alloca [2 x i32*], align 16 + %size = alloca i32, align 4 + %final_ret = alloca i32, align 4 + %i = alloca i32, align 4 + %i32 = alloca i32, align 4 + store i32 %argc, i32* %argc.addr, align 4 + store i8** %argv, i8*** %argv.addr, align 8 + %0 = load i32, i32* %argc.addr, align 4 + %1 = load i8**, i8*** %argv.addr, align 8 + call void @_Z4initiPPc(i32 %0, i8** %1) + %2 = load i32, i32* @pyramid_height, align 
4 + %mul = mul nsw i32 %2, 1 + store i32 %mul, i32* %borderCols, align 4 + %3 = load i32, i32* @pyramid_height, align 4 + %mul1 = mul nsw i32 %3, 1 + %mul2 = mul nsw i32 %mul1, 2 + %sub = sub nsw i32 256, %mul2 + store i32 %sub, i32* %smallBlockCol, align 4 + %4 = load i32, i32* @cols, align 4 + %5 = load i32, i32* %smallBlockCol, align 4 + %div = sdiv i32 %4, %5 + %6 = load i32, i32* @cols, align 4 + %7 = load i32, i32* %smallBlockCol, align 4 + %rem = srem i32 %6, %7 + %cmp = icmp eq i32 %rem, 0 + %8 = zext i1 %cmp to i64 + %cond = select i1 %cmp, i32 0, i32 1 + %add = add nsw i32 %div, %cond + store i32 %add, i32* %blockCols, align 4 + %9 = load i32, i32* @pyramid_height, align 4 + %10 = load i32, i32* @cols, align 4 + %11 = load i32, i32* %borderCols, align 4 + %12 = load i32, i32* %blockCols, align 4 + %13 = load i32, i32* %smallBlockCol, align 4 + %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([92 x i8], [92 x i8]* @.str.2, i64 0, i64 0), i32 %9, i32 %10, i32 %11, i32 256, i32 %12, i32 %13) + %14 = load i32, i32* @rows, align 4 + %15 = load i32, i32* @cols, align 4 + %mul3 = mul nsw i32 %14, %15 + store i32 %mul3, i32* %size, align 4 + %arrayidx = getelementptr inbounds [2 x i32*], [2 x i32*]* %gpuResult, i64 0, i64 0 + %16 = bitcast i32** %arrayidx to i8** + %17 = load i32, i32* @cols, align 4 + %conv = sext i32 %17 to i64 + %mul4 = mul i64 4, %conv + %call5 = call i32 @cudaMalloc(i8** %16, i64 %mul4) + %arrayidx6 = getelementptr inbounds [2 x i32*], [2 x i32*]* %gpuResult, i64 0, i64 1 + %18 = bitcast i32** %arrayidx6 to i8** + %19 = load i32, i32* @cols, align 4 + %conv7 = sext i32 %19 to i64 + %mul8 = mul i64 4, %conv7 + %call9 = call i32 @cudaMalloc(i8** %18, i64 %mul8) + %arrayidx10 = getelementptr inbounds [2 x i32*], [2 x i32*]* %gpuResult, i64 0, i64 0 + %20 = load i32*, i32** %arrayidx10, align 16 + %21 = bitcast i32* %20 to i8* + %22 = load i32*, i32** @data, align 8 + %23 = bitcast i32* %22 to i8* + %24 = load i32, i32* @cols, 
align 4 + %conv11 = sext i32 %24 to i64 + %mul12 = mul i64 4, %conv11 + %call13 = call i32 @cudaMemcpy(i8* %21, i8* %23, i64 %mul12, i32 1) + %25 = bitcast i32** %gpuWall to i8** + %26 = load i32, i32* %size, align 4 + %27 = load i32, i32* @cols, align 4 + %sub14 = sub nsw i32 %26, %27 + %conv15 = sext i32 %sub14 to i64 + %mul16 = mul i64 4, %conv15 + %call17 = call i32 @cudaMalloc(i8** %25, i64 %mul16) + %28 = load i32*, i32** %gpuWall, align 8 + %29 = bitcast i32* %28 to i8* + %30 = load i32*, i32** @data, align 8 + %31 = load i32, i32* @cols, align 4 + %idx.ext = sext i32 %31 to i64 + %add.ptr = getelementptr inbounds i32, i32* %30, i64 %idx.ext + %32 = bitcast i32* %add.ptr to i8* + %33 = load i32, i32* %size, align 4 + %34 = load i32, i32* @cols, align 4 + %sub18 = sub nsw i32 %33, %34 + %conv19 = sext i32 %sub18 to i64 + %mul20 = mul i64 4, %conv19 + %call21 = call i32 @cudaMemcpy(i8* %29, i8* %32, i64 %mul20, i32 1) + %35 = load i32*, i32** %gpuWall, align 8 + %arraydecay = getelementptr inbounds [2 x i32*], [2 x i32*]* %gpuResult, i64 0, i64 0 + %36 = load i32, i32* @rows, align 4 + %37 = load i32, i32* @cols, align 4 + %38 = load i32, i32* @pyramid_height, align 4 + %39 = load i32, i32* %blockCols, align 4 + %40 = load i32, i32* %borderCols, align 4 + %call22 = call i32 @_Z9calc_pathPiPS_iiiii(i32* %35, i32** %arraydecay, i32 %36, i32 %37, i32 %38, i32 %39, i32 %40) + store i32 %call22, i32* %final_ret, align 4 + %41 = load i32*, i32** @result, align 8 + %42 = bitcast i32* %41 to i8* + %43 = load i32, i32* %final_ret, align 4 + %idxprom = sext i32 %43 to i64 + %arrayidx23 = getelementptr inbounds [2 x i32*], [2 x i32*]* %gpuResult, i64 0, i64 %idxprom + %44 = load i32*, i32** %arrayidx23, align 8 + %45 = bitcast i32* %44 to i8* + %46 = load i32, i32* @cols, align 4 + %conv24 = sext i32 %46 to i64 + %mul25 = mul i64 4, %conv24 + %call26 = call i32 @cudaMemcpy(i8* %42, i8* %45, i64 %mul25, i32 2) + store i32 0, i32* %i, align 4 + br label %for.cond + 
+for.cond: ; preds = %for.inc, %entry + %47 = load i32, i32* %i, align 4 + %48 = load i32, i32* @cols, align 4 + %cmp27 = icmp slt i32 %47, %48 + br i1 %cmp27, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %49 = load i32*, i32** @data, align 8 + %50 = load i32, i32* %i, align 4 + %idxprom28 = sext i32 %50 to i64 + %arrayidx29 = getelementptr inbounds i32, i32* %49, i64 %idxprom28 + %51 = load i32, i32* %arrayidx29, align 4 + %call30 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str.3, i64 0, i64 0), i32 %51) + br label %for.inc + +for.inc: ; preds = %for.body + %52 = load i32, i32* %i, align 4 + %inc = add nsw i32 %52, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %call31 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.4, i64 0, i64 0)) + store i32 0, i32* %i32, align 4 + br label %for.cond33 + +for.cond33: ; preds = %for.inc39, %for.end + %53 = load i32, i32* %i32, align 4 + %54 = load i32, i32* @cols, align 4 + %cmp34 = icmp slt i32 %53, %54 + br i1 %cmp34, label %for.body35, label %for.end41 + +for.body35: ; preds = %for.cond33 + %55 = load i32*, i32** @result, align 8 + %56 = load i32, i32* %i32, align 4 + %idxprom36 = sext i32 %56 to i64 + %arrayidx37 = getelementptr inbounds i32, i32* %55, i64 %idxprom36 + %57 = load i32, i32* %arrayidx37, align 4 + %call38 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str.3, i64 0, i64 0), i32 %57) + br label %for.inc39 + +for.inc39: ; preds = %for.body35 + %58 = load i32, i32* %i32, align 4 + %inc40 = add nsw i32 %58, 1 + store i32 %inc40, i32* %i32, align 4 + br label %for.cond33 + +for.end41: ; preds = %for.cond33 + %call42 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.4, i64 0, i64 0)) + %59 = load i32*, i32** %gpuWall, align 8 + %60 = bitcast i32* %59 to i8* + %call43 = call i32 @cudaFree(i8* %60) + %arrayidx44 = getelementptr inbounds [2 x i32*], [2 x i32*]* %gpuResult, i64 0, i64 0 + %61 = load i32*, i32** %arrayidx44, align 16 + %62 = bitcast i32* %61 to i8* + %call45 = call i32 @cudaFree(i8* %62) + %arrayidx46 = getelementptr inbounds [2 x i32*], [2 x i32*]* %gpuResult, i64 0, i64 1 + %63 = load i32*, i32** %arrayidx46, align 8 + %64 = bitcast i32* %63 to i8* + %call47 = call i32 @cudaFree(i8* %64) + %65 = load i32*, i32** @data, align 8 + %isnull = icmp eq i32* %65, null + br i1 %isnull, label %delete.end, label %delete.notnull + +delete.notnull: ; preds = %for.end41 + %66 = bitcast i32* %65 to i8* + call void @_ZdaPv(i8* %66) #15 + br label %delete.end + +delete.end: ; preds = %delete.notnull, %for.end41 + %67 = load i32**, i32*** @wall, align 8 + %isnull48 = icmp eq i32** %67, null + br i1 %isnull48, label %delete.end50, label %delete.notnull49 + +delete.notnull49: ; preds = %delete.end + %68 = bitcast i32** %67 to i8* + call void @_ZdaPv(i8* %68) #15 + br label %delete.end50 + +delete.end50: ; preds = %delete.notnull49, %delete.end + %69 = load i32*, i32** @result, align 8 + %isnull51 = icmp eq i32* %69, null + br i1 %isnull51, label %delete.end53, label %delete.notnull52 + +delete.notnull52: ; preds = %delete.end50 + %70 = bitcast i32* %69 to i8* + call void @_ZdaPv(i8* %70) #15 + br label %delete.end53 + +delete.end53: ; preds = %delete.notnull52, %delete.end50 + ret void +} + +declare dso_local i32 @cudaMalloc(i8**, i64) #2 + +declare dso_local i32 @cudaMemcpy(i8*, i8*, i64, i32) #2 + +declare dso_local i32 @cudaFree(i8*) #2 + +; Function Attrs: nobuiltin nounwind +declare dso_local void @_ZdaPv(i8*) #10 + +define internal void @__cuda_register_globals(i8** %0) { +entry: + %1 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void (i32, i32*, i32*, 
i32*, i32, i32, i32, i32)* @_Z14dynproc_kerneliPiS_S_iiii to i8*), i8* getelementptr inbounds ([30 x i8], [30 x i8]* @0, i64 0, i64 0), i8* getelementptr inbounds ([30 x i8], [30 x i8]* @0, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null) + ret void +} + +declare dso_local i32 @__cudaRegisterFunction(i8**, i8*, i8*, i8*, i32, i8*, i8*, i8*, i8*, i32*) + +declare dso_local i32 @__cudaRegisterVar(i8**, i8*, i8*, i8*, i32, i32, i32, i32) + +declare dso_local i8** @__cudaRegisterFatBinary(i8*) + +define internal void @__cuda_module_ctor(i8* %0) { +entry: + %1 = call i8** @__cudaRegisterFatBinary(i8* bitcast ({ i32, i32, i8*, i8* }* @__cuda_fatbin_wrapper to i8*)) + store i8** %1, i8*** @__cuda_gpubin_handle, align 8 + call void @__cuda_register_globals(i8** %1) + call void @__cudaRegisterFatBinaryEnd(i8** %1) + %2 = call i32 @atexit(void (i8*)* @__cuda_module_dtor) + ret void +} + +declare dso_local void @__cudaRegisterFatBinaryEnd(i8**) + +declare dso_local void @__cudaUnregisterFatBinary(i8**) + +define internal void @__cuda_module_dtor(i8* %0) { +entry: + %1 = load i8**, i8*** @__cuda_gpubin_handle, align 8 + call void @__cudaUnregisterFatBinary(i8** %1) + ret void +} + +declare dso_local i32 @atexit(void (i8*)*) + +attributes #0 = { noinline optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" 
"no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { nounwind readnone speculatable willreturn } +attributes #5 = { nobuiltin "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #6 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" 
} +attributes #7 = { argmemonly nounwind willreturn } +attributes #8 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #9 = { noinline norecurse optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #10 = { nobuiltin nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #11 = { nounwind readonly } +attributes #12 = { noreturn nounwind } +attributes #13 = { builtin } +attributes #14 = { nounwind } +attributes #15 = { builtin nounwind } + +!llvm.module.flags = !{!0, !1} +!llvm.ident = !{!2} + +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"} diff --git 
a/examples/pathfinder/pathfinder.cu b/examples/pathfinder/pathfinder.cu new file mode 100644 index 0000000..d57c677 --- /dev/null +++ b/examples/pathfinder/pathfinder.cu @@ -0,0 +1,238 @@ +#include +#include +#include +#include + +#ifdef TIMING +#include "timing.h" + +struct timeval tv; +struct timeval tv_total_start, tv_total_end; +struct timeval tv_h2d_start, tv_h2d_end; +struct timeval tv_d2h_start, tv_d2h_end; +struct timeval tv_kernel_start, tv_kernel_end; +struct timeval tv_mem_alloc_start, tv_mem_alloc_end; +struct timeval tv_close_start, tv_close_end; +float init_time = 0, mem_alloc_time = 0, h2d_time = 0, kernel_time = 0, + d2h_time = 0, close_time = 0, total_time = 0; +#endif + +#define BLOCK_SIZE 256 +#define STR_SIZE 256 +#define DEVICE 0 +#define HALO \ + 1 // halo width along one direction when advancing to the next iteration + +//#define BENCH_PRINT + +void run(int argc, char **argv); + +int rows, cols; +int *data; +int **wall; +int *result; +#define M_SEED 9 +int pyramid_height; + +void init(int argc, char **argv) { + if (argc == 4) { + cols = atoi(argv[1]); + rows = atoi(argv[2]); + pyramid_height = atoi(argv[3]); + } else { + printf("Usage: dynproc row_len col_len pyramid_height\n"); + exit(0); + } + data = new int[rows * cols]; + wall = new int *[rows]; + for (int n = 0; n < rows; n++) + wall[n] = data + cols * n; + result = new int[cols]; + + int seed = M_SEED; + srand(seed); + + for (int i = 0; i < rows; i++) { + for (int j = 0; j < cols; j++) { + wall[i][j] = rand() % 10; + } + } +#ifdef BENCH_PRINT + for (int i = 0; i < rows; i++) { + for (int j = 0; j < cols; j++) { + printf("%d ", wall[i][j]); + } + printf("\n"); + } +#endif +} + +void fatal(char *s) { fprintf(stderr, "error: %s\n", s); } + +#define IN_RANGE(x, min, max) ((x) >= (min) && (x) <= (max)) +#define CLAMP_RANGE(x, min, max) x = (x < (min)) ? min : ((x > (max)) ? max : x) +#define MIN(a, b) ((a) <= (b) ? 
(a) : (b)) + +__global__ void dynproc_kernel(int iteration, int *gpuWall, int *gpuSrc, + int *gpuResults, int cols, int rows, + int startStep, int border) { + + __shared__ int prev[BLOCK_SIZE]; + __shared__ int result[BLOCK_SIZE]; + + int bx = blockIdx.x; + int tx = threadIdx.x; + + // each block finally computes result for a small block + // after N iterations. + // it is the non-overlapping small blocks that cover + // all the input data + + // calculate the small block size + int small_block_cols = BLOCK_SIZE - iteration * HALO * 2; + + // calculate the boundary for the block according to + // the boundary of its small block + int blkX = small_block_cols * bx - border; + int blkXmax = blkX + BLOCK_SIZE - 1; + + // calculate the global thread coordination + int xidx = blkX + tx; + + // effective range within this block that falls within + // the valid range of the input data + // used to rule out computation outside the boundary. + int validXmin = (blkX < 0) ? -blkX : 0; + int validXmax = (blkXmax > cols - 1) ? BLOCK_SIZE - 1 - (blkXmax - cols + 1) + : BLOCK_SIZE - 1; + + int W = tx - 1; + int E = tx + 1; + + W = (W < validXmin) ? validXmin : W; + E = (E > validXmax) ? validXmax : E; + + bool isValid = IN_RANGE(tx, validXmin, validXmax); + + if (IN_RANGE(xidx, 0, cols - 1)) { + prev[tx] = gpuSrc[xidx]; + } + __syncthreads(); // [Ronny] Added sync to avoid race on prev Aug. 14 2012 + bool computed; + for (int i = 0; i < iteration; i++) { + computed = false; + if (IN_RANGE(tx, i + 1, BLOCK_SIZE - i - 2) && isValid) { + computed = true; + int left = prev[W]; + int up = prev[tx]; + int right = prev[E]; + int shortest = MIN(left, up); + shortest = MIN(shortest, right); + int index = cols * (startStep + i) + xidx; + result[tx] = shortest + gpuWall[index]; + } + __syncthreads(); + if (i == iteration - 1) + break; + if (computed) // Assign the computation range + prev[tx] = result[tx]; + __syncthreads(); // [Ronny] Added sync to avoid race on prev Aug. 
14 2012 + } + + // update the global memory + // after the last iteration, only threads coordinated within the + // small block perform the calculation and switch on ``computed'' + if (computed) { + gpuResults[xidx] = result[tx]; + } +} + +/* + compute N time steps +*/ +int calc_path(int *gpuWall, int *gpuResult[2], int rows, int cols, + int pyramid_height, int blockCols, int borderCols) { + dim3 dimBlock(BLOCK_SIZE); + dim3 dimGrid(blockCols); + + int src = 1, dst = 0; + for (int t = 0; t < rows - 1; t += pyramid_height) { + int temp = src; + src = dst; + dst = temp; + dynproc_kernel<<>>( + MIN(pyramid_height, rows - t - 1), gpuWall, gpuResult[src], + gpuResult[dst], cols, rows, t, borderCols); + + // for the measurement fairness + cudaDeviceSynchronize(); + } + return dst; +} + +int main(int argc, char **argv) { + cudaSetDevice(0); + + run(argc, argv); + + return EXIT_SUCCESS; +} + +void run(int argc, char **argv) { + init(argc, argv); + + /* --------------- pyramid parameters --------------- */ + int borderCols = (pyramid_height)*HALO; + int smallBlockCol = BLOCK_SIZE - (pyramid_height)*HALO * 2; + int blockCols = cols / smallBlockCol + ((cols % smallBlockCol == 0) ? 
0 : 1); + + printf("pyramidHeight: %d\ngridSize: [%d]\nborder:[%d]\nblockSize: " + "%d\nblockGrid:[%d]\ntargetBlock:[%d]\n", + pyramid_height, cols, borderCols, BLOCK_SIZE, blockCols, + smallBlockCol); + + int *gpuWall, *gpuResult[2]; + int size = rows * cols; + + cudaMalloc((void **)&gpuResult[0], sizeof(int) * cols); + cudaMalloc((void **)&gpuResult[1], sizeof(int) * cols); + cudaMemcpy(gpuResult[0], data, sizeof(int) * cols, cudaMemcpyHostToDevice); + cudaMalloc((void **)&gpuWall, sizeof(int) * (size - cols)); + cudaMemcpy(gpuWall, data + cols, sizeof(int) * (size - cols), + cudaMemcpyHostToDevice); + +#ifdef TIMING + gettimeofday(&tv_kernel_start, NULL); +#endif + + int final_ret = calc_path(gpuWall, gpuResult, rows, cols, pyramid_height, + blockCols, borderCols); + +#ifdef TIMING + gettimeofday(&tv_kernel_end, NULL); + tvsub(&tv_kernel_end, &tv_kernel_start, &tv); + kernel_time += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0; +#endif + + cudaMemcpy(result, gpuResult[final_ret], sizeof(int) * cols, + cudaMemcpyDeviceToHost); + + for (int i = 0; i < cols; i++) + printf("%d ", data[i]); + printf("\n"); + for (int i = 0; i < cols; i++) + printf("%d ", result[i]); + printf("\n"); + + cudaFree(gpuWall); + cudaFree(gpuResult[0]); + cudaFree(gpuResult[1]); + + delete[] data; + delete[] wall; + delete[] result; + +#ifdef TIMING + printf("Exec: %f\n", kernel_time); +#endif +} diff --git a/examples/pathfinder/run.sh b/examples/pathfinder/run.sh new file mode 100644 index 0000000..7bfd85e --- /dev/null +++ b/examples/pathfinder/run.sh @@ -0,0 +1,20 @@ +#!/bin/bash +set -e +llvm-as pathfinder-cuda-nvptx64-nvidia-cuda-sm_61.ll +llvm-as pathfinder-host-x86_64-unknown-linux-gnu.ll +../../build/compilation/kernelTranslator pathfinder-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc +../../build/compilation/hostTranslator pathfinder-host-x86_64-unknown-linux-gnu.bc host.bc +llc --relocation-model=pic --filetype=obj kernel.bc +llc --relocation-model=pic --filetype=obj host.bc + 
+g++ -Wall -L../../build/runtime \ + -L../../build/runtime/threadPool -o pathfinder \ + -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread +export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH +./pathfinder 100000 100 20 > res.log +if grep -q "5 4 5 7 0 3 0 8 2" res.log; then + echo "Pass" +else + echo "Error result" + exit 1 +fi diff --git a/examples/srad_v2/run.sh b/examples/srad_v2/run.sh new file mode 100644 index 0000000..fe49cfa --- /dev/null +++ b/examples/srad_v2/run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +set -e +llvm-as srad-cuda-nvptx64-nvidia-cuda-sm_61.ll +llvm-as srad-host-x86_64-unknown-linux-gnu.ll +../../build/compilation/kernelTranslator srad-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc +../../build/compilation/hostTranslator srad-host-x86_64-unknown-linux-gnu.bc host.bc + +llc --relocation-model=pic --filetype=obj kernel.bc +llc --relocation-model=pic --filetype=obj host.bc + +g++ -Wall -L../../build/runtime \ + -L../../build/runtime/threadPool \ + -o srad -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread +export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH +./srad 2048 2048 0 127 0 127 0.5 2 > res.log +if grep -q "1.98368 2.16545 1.72989" res.log; then + echo "Pass" +else + echo "Error result" + exit 1 +fi diff --git a/examples/srad_v2/srad-cuda-nvptx64-nvidia-cuda-sm_61.ll b/examples/srad_v2/srad-cuda-nvptx64-nvidia-cuda-sm_61.ll new file mode 100644 index 0000000..f1b895c --- /dev/null +++ b/examples/srad_v2/srad-cuda-nvptx64-nvidia-cuda-sm_61.ll @@ -0,0 +1,1551 @@ +; ModuleID = 'srad-cuda-nvptx64-nvidia-cuda-sm_61.bc' +source_filename = "srad.cu" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +%struct.__cuda_builtin_blockIdx_t = type { i8 } +%struct.__cuda_builtin_threadIdx_t = type { i8 } +%struct.__cuda_builtin_gridDim_t = type { i8 } +%struct.cudaFuncAttributes = type { i64, 
i64, i64, i32, i32, i32, i32, i32, i32, i32 } + +$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any + +$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv = comdat any + +$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any + +$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv = comdat any + +$_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv = comdat any + +$_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv = comdat any + +@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1 +@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1 +@_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp = internal addrspace(3) global [16 x [16 x float]] undef, align 4 +@_ZZ11srad_cuda_1PfS_S_S_S_S_iifE11temp_result = internal addrspace(3) global [16 x [16 x float]] undef, align 4 +@_ZZ11srad_cuda_1PfS_S_S_S_S_iifE5north = internal addrspace(3) global [16 x [16 x float]] undef, align 4 +@_ZZ11srad_cuda_1PfS_S_S_S_S_iifE5south = internal addrspace(3) global [16 x [16 x float]] undef, align 4 +@_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4east = internal addrspace(3) global [16 x [16 x float]] undef, align 4 +@_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4west = internal addrspace(3) global [16 x [16 x float]] undef, align 4 +@gridDim = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_gridDim_t, align 1 +@_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE7south_c = internal addrspace(3) global [16 x [16 x float]] undef, align 4 +@_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE6east_c = internal addrspace(3) global [16 x [16 x float]] undef, align 4 +@_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE11c_cuda_temp = internal addrspace(3) global [16 x [16 x float]] undef, align 4 +@_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE13c_cuda_result = internal addrspace(3) global [16 x [16 x float]] undef, align 4 +@_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE4temp = internal addrspace(3) global [16 x [16 x float]] undef, align 4 + +; Function Attrs: convergent 
noinline nounwind optnone +define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 { +entry: + %p.addr = alloca i8**, align 8 + %s.addr = alloca i64, align 8 + store i8** %p, i8*** %p.addr, align 8 + store i64 %s, i64* %s.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 { +entry: + %p.addr = alloca %struct.cudaFuncAttributes*, align 8 + %c.addr = alloca i8*, align 8 + store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8 + store i8* %c, i8** %c.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 { +entry: + %value.addr = alloca i32*, align 8 + %attr.addr = alloca i32, align 4 + %device.addr = alloca i32, align 4 + store i32* %value, i32** %value.addr, align 8 + store i32 %attr, i32* %attr.addr, align 4 + store i32 %device, i32* %device.addr, align 4 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaGetDevice(i32* %device) #0 { +entry: + %device.addr = alloca i32*, align 8 + store i32* %device, i32** %device.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 { +entry: + %numBlocks.addr = alloca i32*, align 8 + %func.addr = alloca i8*, align 8 + %blockSize.addr = alloca i32, align 4 + %dynamicSmemSize.addr = alloca i64, align 8 + store i32* %numBlocks, i32** %numBlocks.addr, align 8 + store i8* %func, i8** %func.addr, align 8 + store i32 %blockSize, i32* %blockSize.addr, align 4 + store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 
@cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 { +entry: + %numBlocks.addr = alloca i32*, align 8 + %func.addr = alloca i8*, align 8 + %blockSize.addr = alloca i32, align 4 + %dynamicSmemSize.addr = alloca i64, align 8 + %flags.addr = alloca i32, align 4 + store i32* %numBlocks, i32** %numBlocks.addr, align 8 + store i8* %func, i8** %func.addr, align 8 + store i32 %blockSize, i32* %blockSize.addr, align 4 + store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8 + store i32 %flags, i32* %flags.addr, align 4 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define dso_local void @_Z11srad_cuda_1PfS_S_S_S_S_iif(float* %E_C, float* %W_C, float* %N_C, float* %S_C, float* %J_cuda, float* %C_cuda, i32 %cols, i32 %rows, float %q0sqr) #0 { +entry: + %E_C.addr = alloca float*, align 8 + %W_C.addr = alloca float*, align 8 + %N_C.addr = alloca float*, align 8 + %S_C.addr = alloca float*, align 8 + %J_cuda.addr = alloca float*, align 8 + %C_cuda.addr = alloca float*, align 8 + %cols.addr = alloca i32, align 4 + %rows.addr = alloca i32, align 4 + %q0sqr.addr = alloca float, align 4 + %bx = alloca i32, align 4 + %by = alloca i32, align 4 + %tx = alloca i32, align 4 + %ty = alloca i32, align 4 + %index = alloca i32, align 4 + %index_n = alloca i32, align 4 + %index_s = alloca i32, align 4 + %index_w = alloca i32, align 4 + %index_e = alloca i32, align 4 + %n = alloca float, align 4 + %w = alloca float, align 4 + %e = alloca float, align 4 + %s = alloca float, align 4 + %jc = alloca float, align 4 + %g2 = alloca float, align 4 + %l = alloca float, align 4 + %num = alloca float, align 4 + %den = alloca float, align 4 + %qsqr = alloca float, align 4 + %c = alloca float, align 4 + store float* %E_C, float** %E_C.addr, align 8 + store float* %W_C, float** %W_C.addr, align 8 + store float* %N_C, float** %N_C.addr, align 8 + store float* %S_C, float** %S_C.addr, 
align 8 + store float* %J_cuda, float** %J_cuda.addr, align 8 + store float* %C_cuda, float** %C_cuda.addr, align 8 + store i32 %cols, i32* %cols.addr, align 4 + store i32 %rows, i32* %rows.addr, align 4 + store float %q0sqr, float* %q0sqr.addr, align 4 + %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2 + store i32 %call, i32* %bx, align 4 + %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #2 + store i32 %call1, i32* %by, align 4 + %call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + store i32 %call2, i32* %tx, align 4 + %call3 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #2 + store i32 %call3, i32* %ty, align 4 + %0 = load i32, i32* %cols.addr, align 4 + %mul = mul nsw i32 %0, 16 + %1 = load i32, i32* %by, align 4 + %mul4 = mul nsw i32 %mul, %1 + %2 = load i32, i32* %bx, align 4 + %mul5 = mul nsw i32 16, %2 + %add = add nsw i32 %mul4, %mul5 + %3 = load i32, i32* %cols.addr, align 4 + %4 = load i32, i32* %ty, align 4 + %mul6 = mul nsw i32 %3, %4 + %add7 = add nsw i32 %add, %mul6 + %5 = load i32, i32* %tx, align 4 + %add8 = add nsw i32 %add7, %5 + store i32 %add8, i32* %index, align 4 + %6 = load i32, i32* %cols.addr, align 4 + %mul9 = mul nsw i32 %6, 16 + %7 = load i32, i32* %by, align 4 + %mul10 = mul nsw i32 %mul9, %7 + %8 = load i32, i32* %bx, align 4 + %mul11 = mul nsw i32 16, %8 + %add12 = add nsw i32 %mul10, %mul11 + %9 = load i32, i32* %tx, align 4 + %add13 = add nsw i32 %add12, %9 + %10 = load i32, i32* %cols.addr, align 4 + %sub = sub nsw i32 %add13, %10 + store i32 %sub, i32* %index_n, align 4 + %11 = load i32, i32* %cols.addr, align 4 + %mul14 = mul nsw i32 %11, 16 + %12 = load i32, i32* %by, align 4 + %mul15 = mul nsw i32 %mul14, %12 + %13 = load i32, i32* %bx, align 4 + %mul16 = mul nsw i32 16, %13 + %add17 = add nsw i32 %mul15, %mul16 + %14 = load i32, i32* %cols.addr, align 4 + %mul18 = mul nsw i32 %14, 16 + %add19 = add nsw i32 %add17, %mul18 + %15 = 
load i32, i32* %tx, align 4 + %add20 = add nsw i32 %add19, %15 + store i32 %add20, i32* %index_s, align 4 + %16 = load i32, i32* %cols.addr, align 4 + %mul21 = mul nsw i32 %16, 16 + %17 = load i32, i32* %by, align 4 + %mul22 = mul nsw i32 %mul21, %17 + %18 = load i32, i32* %bx, align 4 + %mul23 = mul nsw i32 16, %18 + %add24 = add nsw i32 %mul22, %mul23 + %19 = load i32, i32* %cols.addr, align 4 + %20 = load i32, i32* %ty, align 4 + %mul25 = mul nsw i32 %19, %20 + %add26 = add nsw i32 %add24, %mul25 + %sub27 = sub nsw i32 %add26, 1 + store i32 %sub27, i32* %index_w, align 4 + %21 = load i32, i32* %cols.addr, align 4 + %mul28 = mul nsw i32 %21, 16 + %22 = load i32, i32* %by, align 4 + %mul29 = mul nsw i32 %mul28, %22 + %23 = load i32, i32* %bx, align 4 + %mul30 = mul nsw i32 16, %23 + %add31 = add nsw i32 %mul29, %mul30 + %24 = load i32, i32* %cols.addr, align 4 + %25 = load i32, i32* %ty, align 4 + %mul32 = mul nsw i32 %24, %25 + %add33 = add nsw i32 %add31, %mul32 + %add34 = add nsw i32 %add33, 16 + store i32 %add34, i32* %index_e, align 4 + %26 = load float*, float** %J_cuda.addr, align 8 + %27 = load i32, i32* %index_n, align 4 + %idxprom = sext i32 %27 to i64 + %arrayidx = getelementptr inbounds float, float* %26, i64 %idxprom + %28 = load float, float* %arrayidx, align 4 + %29 = load i32, i32* %ty, align 4 + %idxprom35 = sext i32 %29 to i64 + %arrayidx36 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE5north to [16 x [16 x float]]*), i64 0, i64 %idxprom35 + %30 = load i32, i32* %tx, align 4 + %idxprom37 = sext i32 %30 to i64 + %arrayidx38 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx36, i64 0, i64 %idxprom37 + store float %28, float* %arrayidx38, align 4 + %31 = load float*, float** %J_cuda.addr, align 8 + %32 = load i32, i32* %index_s, align 4 + %idxprom39 = sext i32 %32 to i64 + %arrayidx40 = getelementptr inbounds float, float* %31, i64 
%idxprom39 + %33 = load float, float* %arrayidx40, align 4 + %34 = load i32, i32* %ty, align 4 + %idxprom41 = sext i32 %34 to i64 + %arrayidx42 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE5south to [16 x [16 x float]]*), i64 0, i64 %idxprom41 + %35 = load i32, i32* %tx, align 4 + %idxprom43 = sext i32 %35 to i64 + %arrayidx44 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx42, i64 0, i64 %idxprom43 + store float %33, float* %arrayidx44, align 4 + %36 = load i32, i32* %by, align 4 + %cmp = icmp eq i32 %36, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %37 = load float*, float** %J_cuda.addr, align 8 + %38 = load i32, i32* %bx, align 4 + %mul45 = mul nsw i32 16, %38 + %39 = load i32, i32* %tx, align 4 + %add46 = add nsw i32 %mul45, %39 + %idxprom47 = sext i32 %add46 to i64 + %arrayidx48 = getelementptr inbounds float, float* %37, i64 %idxprom47 + %40 = load float, float* %arrayidx48, align 4 + %41 = load i32, i32* %ty, align 4 + %idxprom49 = sext i32 %41 to i64 + %arrayidx50 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE5north to [16 x [16 x float]]*), i64 0, i64 %idxprom49 + %42 = load i32, i32* %tx, align 4 + %idxprom51 = sext i32 %42 to i64 + %arrayidx52 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx50, i64 0, i64 %idxprom51 + store float %40, float* %arrayidx52, align 4 + br label %if.end72 + +if.else: ; preds = %entry + %43 = load i32, i32* %by, align 4 + %call53 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #2 + %sub54 = sub i32 %call53, 1 + %cmp55 = icmp eq i32 %43, %sub54 + br i1 %cmp55, label %if.then56, label %if.end + +if.then56: ; preds = %if.else + %44 = load float*, float** %J_cuda.addr, align 8 + %45 = load i32, i32* %cols.addr, align 4 + %mul57 = mul nsw i32 %45, 16 + 
%call58 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #2 + %sub59 = sub i32 %call58, 1 + %mul60 = mul i32 %mul57, %sub59 + %46 = load i32, i32* %bx, align 4 + %mul61 = mul nsw i32 16, %46 + %add62 = add i32 %mul60, %mul61 + %47 = load i32, i32* %cols.addr, align 4 + %mul63 = mul nsw i32 %47, 15 + %add64 = add i32 %add62, %mul63 + %48 = load i32, i32* %tx, align 4 + %add65 = add i32 %add64, %48 + %idxprom66 = zext i32 %add65 to i64 + %arrayidx67 = getelementptr inbounds float, float* %44, i64 %idxprom66 + %49 = load float, float* %arrayidx67, align 4 + %50 = load i32, i32* %ty, align 4 + %idxprom68 = sext i32 %50 to i64 + %arrayidx69 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE5south to [16 x [16 x float]]*), i64 0, i64 %idxprom68 + %51 = load i32, i32* %tx, align 4 + %idxprom70 = sext i32 %51 to i64 + %arrayidx71 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx69, i64 0, i64 %idxprom70 + store float %49, float* %arrayidx71, align 4 + br label %if.end + +if.end: ; preds = %if.then56, %if.else + br label %if.end72 + +if.end72: ; preds = %if.end, %if.then + call void @llvm.nvvm.barrier0() + %52 = load float*, float** %J_cuda.addr, align 8 + %53 = load i32, i32* %index_w, align 4 + %idxprom73 = sext i32 %53 to i64 + %arrayidx74 = getelementptr inbounds float, float* %52, i64 %idxprom73 + %54 = load float, float* %arrayidx74, align 4 + %55 = load i32, i32* %ty, align 4 + %idxprom75 = sext i32 %55 to i64 + %arrayidx76 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4west to [16 x [16 x float]]*), i64 0, i64 %idxprom75 + %56 = load i32, i32* %tx, align 4 + %idxprom77 = sext i32 %56 to i64 + %arrayidx78 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx76, i64 0, i64 %idxprom77 + store float %54, float* %arrayidx78, align 4 + 
%57 = load float*, float** %J_cuda.addr, align 8 + %58 = load i32, i32* %index_e, align 4 + %idxprom79 = sext i32 %58 to i64 + %arrayidx80 = getelementptr inbounds float, float* %57, i64 %idxprom79 + %59 = load float, float* %arrayidx80, align 4 + %60 = load i32, i32* %ty, align 4 + %idxprom81 = sext i32 %60 to i64 + %arrayidx82 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4east to [16 x [16 x float]]*), i64 0, i64 %idxprom81 + %61 = load i32, i32* %tx, align 4 + %idxprom83 = sext i32 %61 to i64 + %arrayidx84 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx82, i64 0, i64 %idxprom83 + store float %59, float* %arrayidx84, align 4 + %62 = load i32, i32* %bx, align 4 + %cmp85 = icmp eq i32 %62, 0 + br i1 %cmp85, label %if.then86, label %if.else97 + +if.then86: ; preds = %if.end72 + %63 = load float*, float** %J_cuda.addr, align 8 + %64 = load i32, i32* %cols.addr, align 4 + %mul87 = mul nsw i32 %64, 16 + %65 = load i32, i32* %by, align 4 + %mul88 = mul nsw i32 %mul87, %65 + %66 = load i32, i32* %cols.addr, align 4 + %67 = load i32, i32* %ty, align 4 + %mul89 = mul nsw i32 %66, %67 + %add90 = add nsw i32 %mul88, %mul89 + %idxprom91 = sext i32 %add90 to i64 + %arrayidx92 = getelementptr inbounds float, float* %63, i64 %idxprom91 + %68 = load float, float* %arrayidx92, align 4 + %69 = load i32, i32* %ty, align 4 + %idxprom93 = sext i32 %69 to i64 + %arrayidx94 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4west to [16 x [16 x float]]*), i64 0, i64 %idxprom93 + %70 = load i32, i32* %tx, align 4 + %idxprom95 = sext i32 %70 to i64 + %arrayidx96 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx94, i64 0, i64 %idxprom95 + store float %68, float* %arrayidx96, align 4 + br label %if.end119 + +if.else97: ; preds = %if.end72 + %71 = load i32, i32* 
%bx, align 4 + %call98 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #2 + %sub99 = sub i32 %call98, 1 + %cmp100 = icmp eq i32 %71, %sub99 + br i1 %cmp100, label %if.then101, label %if.end118 + +if.then101: ; preds = %if.else97 + %72 = load float*, float** %J_cuda.addr, align 8 + %73 = load i32, i32* %cols.addr, align 4 + %mul102 = mul nsw i32 %73, 16 + %74 = load i32, i32* %by, align 4 + %mul103 = mul nsw i32 %mul102, %74 + %call104 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #2 + %sub105 = sub i32 %call104, 1 + %mul106 = mul i32 16, %sub105 + %add107 = add i32 %mul103, %mul106 + %75 = load i32, i32* %cols.addr, align 4 + %76 = load i32, i32* %ty, align 4 + %mul108 = mul nsw i32 %75, %76 + %add109 = add i32 %add107, %mul108 + %add110 = add i32 %add109, 16 + %sub111 = sub i32 %add110, 1 + %idxprom112 = zext i32 %sub111 to i64 + %arrayidx113 = getelementptr inbounds float, float* %72, i64 %idxprom112 + %77 = load float, float* %arrayidx113, align 4 + %78 = load i32, i32* %ty, align 4 + %idxprom114 = sext i32 %78 to i64 + %arrayidx115 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4east to [16 x [16 x float]]*), i64 0, i64 %idxprom114 + %79 = load i32, i32* %tx, align 4 + %idxprom116 = sext i32 %79 to i64 + %arrayidx117 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx115, i64 0, i64 %idxprom116 + store float %77, float* %arrayidx117, align 4 + br label %if.end118 + +if.end118: ; preds = %if.then101, %if.else97 + br label %if.end119 + +if.end119: ; preds = %if.end118, %if.then86 + call void @llvm.nvvm.barrier0() + %80 = load float*, float** %J_cuda.addr, align 8 + %81 = load i32, i32* %index, align 4 + %idxprom120 = sext i32 %81 to i64 + %arrayidx121 = getelementptr inbounds float, float* %80, i64 %idxprom120 + %82 = load float, float* %arrayidx121, align 4 + %83 = load i32, i32* %ty, align 4 + %idxprom122 = sext 
i32 %83 to i64 + %arrayidx123 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom122 + %84 = load i32, i32* %tx, align 4 + %idxprom124 = sext i32 %84 to i64 + %arrayidx125 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx123, i64 0, i64 %idxprom124 + store float %82, float* %arrayidx125, align 4 + call void @llvm.nvvm.barrier0() + %85 = load i32, i32* %ty, align 4 + %idxprom126 = sext i32 %85 to i64 + %arrayidx127 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom126 + %86 = load i32, i32* %tx, align 4 + %idxprom128 = sext i32 %86 to i64 + %arrayidx129 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx127, i64 0, i64 %idxprom128 + %87 = load float, float* %arrayidx129, align 4 + store float %87, float* %jc, align 4 + %88 = load i32, i32* %ty, align 4 + %cmp130 = icmp eq i32 %88, 0 + br i1 %cmp130, label %land.lhs.true, label %if.else155 + +land.lhs.true: ; preds = %if.end119 + %89 = load i32, i32* %tx, align 4 + %cmp131 = icmp eq i32 %89, 0 + br i1 %cmp131, label %if.then132, label %if.else155 + +if.then132: ; preds = %land.lhs.true + %90 = load i32, i32* %ty, align 4 + %idxprom133 = sext i32 %90 to i64 + %arrayidx134 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE5north to [16 x [16 x float]]*), i64 0, i64 %idxprom133 + %91 = load i32, i32* %tx, align 4 + %idxprom135 = sext i32 %91 to i64 + %arrayidx136 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx134, i64 0, i64 %idxprom135 + %92 = load float, float* %arrayidx136, align 4 + %93 = load float, float* %jc, align 4 + %sub137 = fsub contract float %92, %93 + store float %sub137, 
float* %n, align 4 + %94 = load i32, i32* %ty, align 4 + %add138 = add nsw i32 %94, 1 + %idxprom139 = sext i32 %add138 to i64 + %arrayidx140 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom139 + %95 = load i32, i32* %tx, align 4 + %idxprom141 = sext i32 %95 to i64 + %arrayidx142 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx140, i64 0, i64 %idxprom141 + %96 = load float, float* %arrayidx142, align 4 + %97 = load float, float* %jc, align 4 + %sub143 = fsub contract float %96, %97 + store float %sub143, float* %s, align 4 + %98 = load i32, i32* %ty, align 4 + %idxprom144 = sext i32 %98 to i64 + %arrayidx145 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4west to [16 x [16 x float]]*), i64 0, i64 %idxprom144 + %99 = load i32, i32* %tx, align 4 + %idxprom146 = sext i32 %99 to i64 + %arrayidx147 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx145, i64 0, i64 %idxprom146 + %100 = load float, float* %arrayidx147, align 4 + %101 = load float, float* %jc, align 4 + %sub148 = fsub contract float %100, %101 + store float %sub148, float* %w, align 4 + %102 = load i32, i32* %ty, align 4 + %idxprom149 = sext i32 %102 to i64 + %arrayidx150 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom149 + %103 = load i32, i32* %tx, align 4 + %add151 = add nsw i32 %103, 1 + %idxprom152 = sext i32 %add151 to i64 + %arrayidx153 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx150, i64 0, i64 %idxprom152 + %104 = load float, float* %arrayidx153, align 4 + %105 = load float, float* %jc, align 4 + %sub154 = fsub contract float %104, %105 + store float 
%sub154, float* %e, align 4 + br label %if.end372 + +if.else155: ; preds = %land.lhs.true, %if.end119 + %106 = load i32, i32* %ty, align 4 + %cmp156 = icmp eq i32 %106, 0 + br i1 %cmp156, label %land.lhs.true157, label %if.else182 + +land.lhs.true157: ; preds = %if.else155 + %107 = load i32, i32* %tx, align 4 + %cmp158 = icmp eq i32 %107, 15 + br i1 %cmp158, label %if.then159, label %if.else182 + +if.then159: ; preds = %land.lhs.true157 + %108 = load i32, i32* %ty, align 4 + %idxprom160 = sext i32 %108 to i64 + %arrayidx161 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE5north to [16 x [16 x float]]*), i64 0, i64 %idxprom160 + %109 = load i32, i32* %tx, align 4 + %idxprom162 = sext i32 %109 to i64 + %arrayidx163 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx161, i64 0, i64 %idxprom162 + %110 = load float, float* %arrayidx163, align 4 + %111 = load float, float* %jc, align 4 + %sub164 = fsub contract float %110, %111 + store float %sub164, float* %n, align 4 + %112 = load i32, i32* %ty, align 4 + %add165 = add nsw i32 %112, 1 + %idxprom166 = sext i32 %add165 to i64 + %arrayidx167 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom166 + %113 = load i32, i32* %tx, align 4 + %idxprom168 = sext i32 %113 to i64 + %arrayidx169 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx167, i64 0, i64 %idxprom168 + %114 = load float, float* %arrayidx169, align 4 + %115 = load float, float* %jc, align 4 + %sub170 = fsub contract float %114, %115 + store float %sub170, float* %s, align 4 + %116 = load i32, i32* %ty, align 4 + %idxprom171 = sext i32 %116 to i64 + %arrayidx172 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* 
@_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom171 + %117 = load i32, i32* %tx, align 4 + %sub173 = sub nsw i32 %117, 1 + %idxprom174 = sext i32 %sub173 to i64 + %arrayidx175 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx172, i64 0, i64 %idxprom174 + %118 = load float, float* %arrayidx175, align 4 + %119 = load float, float* %jc, align 4 + %sub176 = fsub contract float %118, %119 + store float %sub176, float* %w, align 4 + %120 = load i32, i32* %ty, align 4 + %idxprom177 = sext i32 %120 to i64 + %arrayidx178 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4east to [16 x [16 x float]]*), i64 0, i64 %idxprom177 + %121 = load i32, i32* %tx, align 4 + %idxprom179 = sext i32 %121 to i64 + %arrayidx180 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx178, i64 0, i64 %idxprom179 + %122 = load float, float* %arrayidx180, align 4 + %123 = load float, float* %jc, align 4 + %sub181 = fsub contract float %122, %123 + store float %sub181, float* %e, align 4 + br label %if.end371 + +if.else182: ; preds = %land.lhs.true157, %if.else155 + %124 = load i32, i32* %ty, align 4 + %cmp183 = icmp eq i32 %124, 15 + br i1 %cmp183, label %land.lhs.true184, label %if.else209 + +land.lhs.true184: ; preds = %if.else182 + %125 = load i32, i32* %tx, align 4 + %cmp185 = icmp eq i32 %125, 15 + br i1 %cmp185, label %if.then186, label %if.else209 + +if.then186: ; preds = %land.lhs.true184 + %126 = load i32, i32* %ty, align 4 + %sub187 = sub nsw i32 %126, 1 + %idxprom188 = sext i32 %sub187 to i64 + %arrayidx189 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom188 + %127 = load i32, i32* %tx, align 4 + %idxprom190 = sext i32 %127 to i64 + %arrayidx191 = getelementptr inbounds [16 x float], 
[16 x float]* %arrayidx189, i64 0, i64 %idxprom190 + %128 = load float, float* %arrayidx191, align 4 + %129 = load float, float* %jc, align 4 + %sub192 = fsub contract float %128, %129 + store float %sub192, float* %n, align 4 + %130 = load i32, i32* %ty, align 4 + %idxprom193 = sext i32 %130 to i64 + %arrayidx194 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE5south to [16 x [16 x float]]*), i64 0, i64 %idxprom193 + %131 = load i32, i32* %tx, align 4 + %idxprom195 = sext i32 %131 to i64 + %arrayidx196 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx194, i64 0, i64 %idxprom195 + %132 = load float, float* %arrayidx196, align 4 + %133 = load float, float* %jc, align 4 + %sub197 = fsub contract float %132, %133 + store float %sub197, float* %s, align 4 + %134 = load i32, i32* %ty, align 4 + %idxprom198 = sext i32 %134 to i64 + %arrayidx199 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom198 + %135 = load i32, i32* %tx, align 4 + %sub200 = sub nsw i32 %135, 1 + %idxprom201 = sext i32 %sub200 to i64 + %arrayidx202 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx199, i64 0, i64 %idxprom201 + %136 = load float, float* %arrayidx202, align 4 + %137 = load float, float* %jc, align 4 + %sub203 = fsub contract float %136, %137 + store float %sub203, float* %w, align 4 + %138 = load i32, i32* %ty, align 4 + %idxprom204 = sext i32 %138 to i64 + %arrayidx205 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4east to [16 x [16 x float]]*), i64 0, i64 %idxprom204 + %139 = load i32, i32* %tx, align 4 + %idxprom206 = sext i32 %139 to i64 + %arrayidx207 = getelementptr inbounds [16 x float], [16 x float]* 
%arrayidx205, i64 0, i64 %idxprom206 + %140 = load float, float* %arrayidx207, align 4 + %141 = load float, float* %jc, align 4 + %sub208 = fsub contract float %140, %141 + store float %sub208, float* %e, align 4 + br label %if.end370 + +if.else209: ; preds = %land.lhs.true184, %if.else182 + %142 = load i32, i32* %ty, align 4 + %cmp210 = icmp eq i32 %142, 15 + br i1 %cmp210, label %land.lhs.true211, label %if.else236 + +land.lhs.true211: ; preds = %if.else209 + %143 = load i32, i32* %tx, align 4 + %cmp212 = icmp eq i32 %143, 0 + br i1 %cmp212, label %if.then213, label %if.else236 + +if.then213: ; preds = %land.lhs.true211 + %144 = load i32, i32* %ty, align 4 + %sub214 = sub nsw i32 %144, 1 + %idxprom215 = sext i32 %sub214 to i64 + %arrayidx216 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom215 + %145 = load i32, i32* %tx, align 4 + %idxprom217 = sext i32 %145 to i64 + %arrayidx218 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx216, i64 0, i64 %idxprom217 + %146 = load float, float* %arrayidx218, align 4 + %147 = load float, float* %jc, align 4 + %sub219 = fsub contract float %146, %147 + store float %sub219, float* %n, align 4 + %148 = load i32, i32* %ty, align 4 + %idxprom220 = sext i32 %148 to i64 + %arrayidx221 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE5south to [16 x [16 x float]]*), i64 0, i64 %idxprom220 + %149 = load i32, i32* %tx, align 4 + %idxprom222 = sext i32 %149 to i64 + %arrayidx223 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx221, i64 0, i64 %idxprom222 + %150 = load float, float* %arrayidx223, align 4 + %151 = load float, float* %jc, align 4 + %sub224 = fsub contract float %150, %151 + store float %sub224, float* %s, align 4 + %152 = load i32, i32* %ty, align 4 
+ %idxprom225 = sext i32 %152 to i64 + %arrayidx226 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4west to [16 x [16 x float]]*), i64 0, i64 %idxprom225 + %153 = load i32, i32* %tx, align 4 + %idxprom227 = sext i32 %153 to i64 + %arrayidx228 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx226, i64 0, i64 %idxprom227 + %154 = load float, float* %arrayidx228, align 4 + %155 = load float, float* %jc, align 4 + %sub229 = fsub contract float %154, %155 + store float %sub229, float* %w, align 4 + %156 = load i32, i32* %ty, align 4 + %idxprom230 = sext i32 %156 to i64 + %arrayidx231 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom230 + %157 = load i32, i32* %tx, align 4 + %add232 = add nsw i32 %157, 1 + %idxprom233 = sext i32 %add232 to i64 + %arrayidx234 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx231, i64 0, i64 %idxprom233 + %158 = load float, float* %arrayidx234, align 4 + %159 = load float, float* %jc, align 4 + %sub235 = fsub contract float %158, %159 + store float %sub235, float* %e, align 4 + br label %if.end369 + +if.else236: ; preds = %land.lhs.true211, %if.else209 + %160 = load i32, i32* %ty, align 4 + %cmp237 = icmp eq i32 %160, 0 + br i1 %cmp237, label %if.then238, label %if.else262 + +if.then238: ; preds = %if.else236 + %161 = load i32, i32* %ty, align 4 + %idxprom239 = sext i32 %161 to i64 + %arrayidx240 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE5north to [16 x [16 x float]]*), i64 0, i64 %idxprom239 + %162 = load i32, i32* %tx, align 4 + %idxprom241 = sext i32 %162 to i64 + %arrayidx242 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx240, i64 0, i64 
%idxprom241 + %163 = load float, float* %arrayidx242, align 4 + %164 = load float, float* %jc, align 4 + %sub243 = fsub contract float %163, %164 + store float %sub243, float* %n, align 4 + %165 = load i32, i32* %ty, align 4 + %add244 = add nsw i32 %165, 1 + %idxprom245 = sext i32 %add244 to i64 + %arrayidx246 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom245 + %166 = load i32, i32* %tx, align 4 + %idxprom247 = sext i32 %166 to i64 + %arrayidx248 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx246, i64 0, i64 %idxprom247 + %167 = load float, float* %arrayidx248, align 4 + %168 = load float, float* %jc, align 4 + %sub249 = fsub contract float %167, %168 + store float %sub249, float* %s, align 4 + %169 = load i32, i32* %ty, align 4 + %idxprom250 = sext i32 %169 to i64 + %arrayidx251 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom250 + %170 = load i32, i32* %tx, align 4 + %sub252 = sub nsw i32 %170, 1 + %idxprom253 = sext i32 %sub252 to i64 + %arrayidx254 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx251, i64 0, i64 %idxprom253 + %171 = load float, float* %arrayidx254, align 4 + %172 = load float, float* %jc, align 4 + %sub255 = fsub contract float %171, %172 + store float %sub255, float* %w, align 4 + %173 = load i32, i32* %ty, align 4 + %idxprom256 = sext i32 %173 to i64 + %arrayidx257 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom256 + %174 = load i32, i32* %tx, align 4 + %add258 = add nsw i32 %174, 1 + %idxprom259 = sext i32 %add258 to i64 + %arrayidx260 = getelementptr inbounds [16 
x float], [16 x float]* %arrayidx257, i64 0, i64 %idxprom259 + %175 = load float, float* %arrayidx260, align 4 + %176 = load float, float* %jc, align 4 + %sub261 = fsub contract float %175, %176 + store float %sub261, float* %e, align 4 + br label %if.end368 + +if.else262: ; preds = %if.else236 + %177 = load i32, i32* %tx, align 4 + %cmp263 = icmp eq i32 %177, 15 + br i1 %cmp263, label %if.then264, label %if.else288 + +if.then264: ; preds = %if.else262 + %178 = load i32, i32* %ty, align 4 + %sub265 = sub nsw i32 %178, 1 + %idxprom266 = sext i32 %sub265 to i64 + %arrayidx267 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom266 + %179 = load i32, i32* %tx, align 4 + %idxprom268 = sext i32 %179 to i64 + %arrayidx269 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx267, i64 0, i64 %idxprom268 + %180 = load float, float* %arrayidx269, align 4 + %181 = load float, float* %jc, align 4 + %sub270 = fsub contract float %180, %181 + store float %sub270, float* %n, align 4 + %182 = load i32, i32* %ty, align 4 + %add271 = add nsw i32 %182, 1 + %idxprom272 = sext i32 %add271 to i64 + %arrayidx273 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom272 + %183 = load i32, i32* %tx, align 4 + %idxprom274 = sext i32 %183 to i64 + %arrayidx275 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx273, i64 0, i64 %idxprom274 + %184 = load float, float* %arrayidx275, align 4 + %185 = load float, float* %jc, align 4 + %sub276 = fsub contract float %184, %185 + store float %sub276, float* %s, align 4 + %186 = load i32, i32* %ty, align 4 + %idxprom277 = sext i32 %186 to i64 + %arrayidx278 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x 
[16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom277 + %187 = load i32, i32* %tx, align 4 + %sub279 = sub nsw i32 %187, 1 + %idxprom280 = sext i32 %sub279 to i64 + %arrayidx281 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx278, i64 0, i64 %idxprom280 + %188 = load float, float* %arrayidx281, align 4 + %189 = load float, float* %jc, align 4 + %sub282 = fsub contract float %188, %189 + store float %sub282, float* %w, align 4 + %190 = load i32, i32* %ty, align 4 + %idxprom283 = sext i32 %190 to i64 + %arrayidx284 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4east to [16 x [16 x float]]*), i64 0, i64 %idxprom283 + %191 = load i32, i32* %tx, align 4 + %idxprom285 = sext i32 %191 to i64 + %arrayidx286 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx284, i64 0, i64 %idxprom285 + %192 = load float, float* %arrayidx286, align 4 + %193 = load float, float* %jc, align 4 + %sub287 = fsub contract float %192, %193 + store float %sub287, float* %e, align 4 + br label %if.end367 + +if.else288: ; preds = %if.else262 + %194 = load i32, i32* %ty, align 4 + %cmp289 = icmp eq i32 %194, 15 + br i1 %cmp289, label %if.then290, label %if.else314 + +if.then290: ; preds = %if.else288 + %195 = load i32, i32* %ty, align 4 + %sub291 = sub nsw i32 %195, 1 + %idxprom292 = sext i32 %sub291 to i64 + %arrayidx293 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom292 + %196 = load i32, i32* %tx, align 4 + %idxprom294 = sext i32 %196 to i64 + %arrayidx295 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx293, i64 0, i64 %idxprom294 + %197 = load float, float* %arrayidx295, align 4 + %198 = load float, float* %jc, align 4 + %sub296 = fsub contract 
float %197, %198 + store float %sub296, float* %n, align 4 + %199 = load i32, i32* %ty, align 4 + %idxprom297 = sext i32 %199 to i64 + %arrayidx298 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE5south to [16 x [16 x float]]*), i64 0, i64 %idxprom297 + %200 = load i32, i32* %tx, align 4 + %idxprom299 = sext i32 %200 to i64 + %arrayidx300 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx298, i64 0, i64 %idxprom299 + %201 = load float, float* %arrayidx300, align 4 + %202 = load float, float* %jc, align 4 + %sub301 = fsub contract float %201, %202 + store float %sub301, float* %s, align 4 + %203 = load i32, i32* %ty, align 4 + %idxprom302 = sext i32 %203 to i64 + %arrayidx303 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom302 + %204 = load i32, i32* %tx, align 4 + %sub304 = sub nsw i32 %204, 1 + %idxprom305 = sext i32 %sub304 to i64 + %arrayidx306 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx303, i64 0, i64 %idxprom305 + %205 = load float, float* %arrayidx306, align 4 + %206 = load float, float* %jc, align 4 + %sub307 = fsub contract float %205, %206 + store float %sub307, float* %w, align 4 + %207 = load i32, i32* %ty, align 4 + %idxprom308 = sext i32 %207 to i64 + %arrayidx309 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom308 + %208 = load i32, i32* %tx, align 4 + %add310 = add nsw i32 %208, 1 + %idxprom311 = sext i32 %add310 to i64 + %arrayidx312 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx309, i64 0, i64 %idxprom311 + %209 = load float, float* %arrayidx312, align 4 + %210 = load float, float* %jc, align 4 + %sub313 
= fsub contract float %209, %210 + store float %sub313, float* %e, align 4 + br label %if.end366 + +if.else314: ; preds = %if.else288 + %211 = load i32, i32* %tx, align 4 + %cmp315 = icmp eq i32 %211, 0 + br i1 %cmp315, label %if.then316, label %if.else340 + +if.then316: ; preds = %if.else314 + %212 = load i32, i32* %ty, align 4 + %sub317 = sub nsw i32 %212, 1 + %idxprom318 = sext i32 %sub317 to i64 + %arrayidx319 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom318 + %213 = load i32, i32* %tx, align 4 + %idxprom320 = sext i32 %213 to i64 + %arrayidx321 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx319, i64 0, i64 %idxprom320 + %214 = load float, float* %arrayidx321, align 4 + %215 = load float, float* %jc, align 4 + %sub322 = fsub contract float %214, %215 + store float %sub322, float* %n, align 4 + %216 = load i32, i32* %ty, align 4 + %add323 = add nsw i32 %216, 1 + %idxprom324 = sext i32 %add323 to i64 + %arrayidx325 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom324 + %217 = load i32, i32* %tx, align 4 + %idxprom326 = sext i32 %217 to i64 + %arrayidx327 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx325, i64 0, i64 %idxprom326 + %218 = load float, float* %arrayidx327, align 4 + %219 = load float, float* %jc, align 4 + %sub328 = fsub contract float %218, %219 + store float %sub328, float* %s, align 4 + %220 = load i32, i32* %ty, align 4 + %idxprom329 = sext i32 %220 to i64 + %arrayidx330 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4west to [16 x [16 x float]]*), i64 0, i64 %idxprom329 + %221 = load i32, i32* %tx, align 4 + 
%idxprom331 = sext i32 %221 to i64 + %arrayidx332 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx330, i64 0, i64 %idxprom331 + %222 = load float, float* %arrayidx332, align 4 + %223 = load float, float* %jc, align 4 + %sub333 = fsub contract float %222, %223 + store float %sub333, float* %w, align 4 + %224 = load i32, i32* %ty, align 4 + %idxprom334 = sext i32 %224 to i64 + %arrayidx335 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom334 + %225 = load i32, i32* %tx, align 4 + %add336 = add nsw i32 %225, 1 + %idxprom337 = sext i32 %add336 to i64 + %arrayidx338 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx335, i64 0, i64 %idxprom337 + %226 = load float, float* %arrayidx338, align 4 + %227 = load float, float* %jc, align 4 + %sub339 = fsub contract float %226, %227 + store float %sub339, float* %e, align 4 + br label %if.end365 + +if.else340: ; preds = %if.else314 + %228 = load i32, i32* %ty, align 4 + %sub341 = sub nsw i32 %228, 1 + %idxprom342 = sext i32 %sub341 to i64 + %arrayidx343 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom342 + %229 = load i32, i32* %tx, align 4 + %idxprom344 = sext i32 %229 to i64 + %arrayidx345 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx343, i64 0, i64 %idxprom344 + %230 = load float, float* %arrayidx345, align 4 + %231 = load float, float* %jc, align 4 + %sub346 = fsub contract float %230, %231 + store float %sub346, float* %n, align 4 + %232 = load i32, i32* %ty, align 4 + %add347 = add nsw i32 %232, 1 + %idxprom348 = sext i32 %add347 to i64 + %arrayidx349 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* 
@_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom348 + %233 = load i32, i32* %tx, align 4 + %idxprom350 = sext i32 %233 to i64 + %arrayidx351 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx349, i64 0, i64 %idxprom350 + %234 = load float, float* %arrayidx351, align 4 + %235 = load float, float* %jc, align 4 + %sub352 = fsub contract float %234, %235 + store float %sub352, float* %s, align 4 + %236 = load i32, i32* %ty, align 4 + %idxprom353 = sext i32 %236 to i64 + %arrayidx354 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom353 + %237 = load i32, i32* %tx, align 4 + %sub355 = sub nsw i32 %237, 1 + %idxprom356 = sext i32 %sub355 to i64 + %arrayidx357 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx354, i64 0, i64 %idxprom356 + %238 = load float, float* %arrayidx357, align 4 + %239 = load float, float* %jc, align 4 + %sub358 = fsub contract float %238, %239 + store float %sub358, float* %w, align 4 + %240 = load i32, i32* %ty, align 4 + %idxprom359 = sext i32 %240 to i64 + %arrayidx360 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom359 + %241 = load i32, i32* %tx, align 4 + %add361 = add nsw i32 %241, 1 + %idxprom362 = sext i32 %add361 to i64 + %arrayidx363 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx360, i64 0, i64 %idxprom362 + %242 = load float, float* %arrayidx363, align 4 + %243 = load float, float* %jc, align 4 + %sub364 = fsub contract float %242, %243 + store float %sub364, float* %e, align 4 + br label %if.end365 + +if.end365: ; preds = %if.else340, %if.then316 + br label %if.end366 + +if.end366: ; preds = %if.end365, %if.then290 + br label %if.end367 + +if.end367: ; preds = 
%if.end366, %if.then264 + br label %if.end368 + +if.end368: ; preds = %if.end367, %if.then238 + br label %if.end369 + +if.end369: ; preds = %if.end368, %if.then213 + br label %if.end370 + +if.end370: ; preds = %if.end369, %if.then186 + br label %if.end371 + +if.end371: ; preds = %if.end370, %if.then159 + br label %if.end372 + +if.end372: ; preds = %if.end371, %if.then132 + %244 = load float, float* %n, align 4 + %245 = load float, float* %n, align 4 + %mul373 = fmul contract float %244, %245 + %246 = load float, float* %s, align 4 + %247 = load float, float* %s, align 4 + %mul374 = fmul contract float %246, %247 + %add375 = fadd contract float %mul373, %mul374 + %248 = load float, float* %w, align 4 + %249 = load float, float* %w, align 4 + %mul376 = fmul contract float %248, %249 + %add377 = fadd contract float %add375, %mul376 + %250 = load float, float* %e, align 4 + %251 = load float, float* %e, align 4 + %mul378 = fmul contract float %250, %251 + %add379 = fadd contract float %add377, %mul378 + %252 = load float, float* %jc, align 4 + %253 = load float, float* %jc, align 4 + %mul380 = fmul contract float %252, %253 + %div = fdiv float %add379, %mul380 + store float %div, float* %g2, align 4 + %254 = load float, float* %n, align 4 + %255 = load float, float* %s, align 4 + %add381 = fadd contract float %254, %255 + %256 = load float, float* %w, align 4 + %add382 = fadd contract float %add381, %256 + %257 = load float, float* %e, align 4 + %add383 = fadd contract float %add382, %257 + %258 = load float, float* %jc, align 4 + %div384 = fdiv float %add383, %258 + store float %div384, float* %l, align 4 + %259 = load float, float* %g2, align 4 + %conv = fpext float %259 to double + %mul385 = fmul contract double 5.000000e-01, %conv + %260 = load float, float* %l, align 4 + %261 = load float, float* %l, align 4 + %mul386 = fmul contract float %260, %261 + %conv387 = fpext float %mul386 to double + %mul388 = fmul contract double 6.250000e-02, %conv387 + %sub389 = fsub 
contract double %mul385, %mul388 + %conv390 = fptrunc double %sub389 to float + store float %conv390, float* %num, align 4 + %262 = load float, float* %l, align 4 + %conv391 = fpext float %262 to double + %mul392 = fmul contract double 2.500000e-01, %conv391 + %add393 = fadd contract double 1.000000e+00, %mul392 + %conv394 = fptrunc double %add393 to float + store float %conv394, float* %den, align 4 + %263 = load float, float* %num, align 4 + %264 = load float, float* %den, align 4 + %265 = load float, float* %den, align 4 + %mul395 = fmul contract float %264, %265 + %div396 = fdiv float %263, %mul395 + store float %div396, float* %qsqr, align 4 + %266 = load float, float* %qsqr, align 4 + %267 = load float, float* %q0sqr.addr, align 4 + %sub397 = fsub contract float %266, %267 + %268 = load float, float* %q0sqr.addr, align 4 + %269 = load float, float* %q0sqr.addr, align 4 + %add398 = fadd contract float 1.000000e+00, %269 + %mul399 = fmul contract float %268, %add398 + %div400 = fdiv float %sub397, %mul399 + store float %div400, float* %den, align 4 + %270 = load float, float* %den, align 4 + %conv401 = fpext float %270 to double + %add402 = fadd contract double 1.000000e+00, %conv401 + %div403 = fdiv double 1.000000e+00, %add402 + %conv404 = fptrunc double %div403 to float + store float %conv404, float* %c, align 4 + %271 = load float, float* %c, align 4 + %cmp405 = fcmp olt float %271, 0.000000e+00 + br i1 %cmp405, label %if.then406, label %if.else411 + +if.then406: ; preds = %if.end372 + %272 = load i32, i32* %ty, align 4 + %idxprom407 = sext i32 %272 to i64 + %arrayidx408 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE11temp_result to [16 x [16 x float]]*), i64 0, i64 %idxprom407 + %273 = load i32, i32* %tx, align 4 + %idxprom409 = sext i32 %273 to i64 + %arrayidx410 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx408, i64 0, i64 %idxprom409 + 
store float 0.000000e+00, float* %arrayidx410, align 4 + br label %if.end424 + +if.else411: ; preds = %if.end372 + %274 = load float, float* %c, align 4 + %cmp412 = fcmp ogt float %274, 1.000000e+00 + br i1 %cmp412, label %if.then413, label %if.else418 + +if.then413: ; preds = %if.else411 + %275 = load i32, i32* %ty, align 4 + %idxprom414 = sext i32 %275 to i64 + %arrayidx415 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE11temp_result to [16 x [16 x float]]*), i64 0, i64 %idxprom414 + %276 = load i32, i32* %tx, align 4 + %idxprom416 = sext i32 %276 to i64 + %arrayidx417 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx415, i64 0, i64 %idxprom416 + store float 1.000000e+00, float* %arrayidx417, align 4 + br label %if.end423 + +if.else418: ; preds = %if.else411 + %277 = load float, float* %c, align 4 + %278 = load i32, i32* %ty, align 4 + %idxprom419 = sext i32 %278 to i64 + %arrayidx420 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE11temp_result to [16 x [16 x float]]*), i64 0, i64 %idxprom419 + %279 = load i32, i32* %tx, align 4 + %idxprom421 = sext i32 %279 to i64 + %arrayidx422 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx420, i64 0, i64 %idxprom421 + store float %277, float* %arrayidx422, align 4 + br label %if.end423 + +if.end423: ; preds = %if.else418, %if.then413 + br label %if.end424 + +if.end424: ; preds = %if.end423, %if.then406 + call void @llvm.nvvm.barrier0() + %280 = load i32, i32* %ty, align 4 + %idxprom425 = sext i32 %280 to i64 + %arrayidx426 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE11temp_result to [16 x [16 x float]]*), i64 0, i64 %idxprom425 + %281 = load i32, i32* %tx, align 4 + %idxprom427 = sext i32 %281 
to i64 + %arrayidx428 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx426, i64 0, i64 %idxprom427 + %282 = load float, float* %arrayidx428, align 4 + %283 = load float*, float** %C_cuda.addr, align 8 + %284 = load i32, i32* %index, align 4 + %idxprom429 = sext i32 %284 to i64 + %arrayidx430 = getelementptr inbounds float, float* %283, i64 %idxprom429 + store float %282, float* %arrayidx430, align 4 + %285 = load float, float* %e, align 4 + %286 = load float*, float** %E_C.addr, align 8 + %287 = load i32, i32* %index, align 4 + %idxprom431 = sext i32 %287 to i64 + %arrayidx432 = getelementptr inbounds float, float* %286, i64 %idxprom431 + store float %285, float* %arrayidx432, align 4 + %288 = load float, float* %w, align 4 + %289 = load float*, float** %W_C.addr, align 8 + %290 = load i32, i32* %index, align 4 + %idxprom433 = sext i32 %290 to i64 + %arrayidx434 = getelementptr inbounds float, float* %289, i64 %idxprom433 + store float %288, float* %arrayidx434, align 4 + %291 = load float, float* %s, align 4 + %292 = load float*, float** %S_C.addr, align 8 + %293 = load i32, i32* %index, align 4 + %idxprom435 = sext i32 %293 to i64 + %arrayidx436 = getelementptr inbounds float, float* %292, i64 %idxprom435 + store float %291, float* %arrayidx436, align 4 + %294 = load float, float* %n, align 4 + %295 = load float*, float** %N_C.addr, align 8 + %296 = load i32, i32* %index, align 4 + %idxprom437 = sext i32 %296 to i64 + %arrayidx438 = getelementptr inbounds float, float* %295, i64 %idxprom437 + store float %294, float* %arrayidx438, align 4 + ret void +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() + ret i32 %0 +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #1 comdat align 2 { 
+entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() + ret i32 %0 +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + ret i32 %0 +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y() + ret i32 %0 +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() + ret i32 %0 +} + +; Function Attrs: convergent nounwind +declare void @llvm.nvvm.barrier0() #2 + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() + ret i32 %0 +} + +; Function Attrs: convergent noinline nounwind optnone +define dso_local void @_Z11srad_cuda_2PfS_S_S_S_S_iiff(float* %E_C, float* %W_C, float* %N_C, float* %S_C, float* %J_cuda, float* %C_cuda, i32 %cols, i32 %rows, float %lambda, float %q0sqr) #0 { +entry: + %E_C.addr = alloca float*, align 8 + %W_C.addr = alloca float*, align 8 + %N_C.addr = alloca float*, align 8 + %S_C.addr = alloca float*, align 8 + %J_cuda.addr = alloca float*, align 8 + %C_cuda.addr = alloca float*, align 8 + %cols.addr = alloca i32, align 4 + %rows.addr = alloca i32, align 4 + %lambda.addr = alloca float, align 4 + %q0sqr.addr = alloca float, align 4 + %bx = alloca i32, align 4 + %by = alloca i32, align 4 + %tx = alloca i32, align 4 + %ty = alloca i32, align 4 + %index = alloca i32, align 4 + %index_s = alloca i32, align 4 + %index_e = alloca i32, align 4 + %cc = alloca float, align 4 + %cn = alloca 
float, align 4 + %cs = alloca float, align 4 + %ce = alloca float, align 4 + %cw = alloca float, align 4 + %d_sum = alloca float, align 4 + store float* %E_C, float** %E_C.addr, align 8 + store float* %W_C, float** %W_C.addr, align 8 + store float* %N_C, float** %N_C.addr, align 8 + store float* %S_C, float** %S_C.addr, align 8 + store float* %J_cuda, float** %J_cuda.addr, align 8 + store float* %C_cuda, float** %C_cuda.addr, align 8 + store i32 %cols, i32* %cols.addr, align 4 + store i32 %rows, i32* %rows.addr, align 4 + store float %lambda, float* %lambda.addr, align 4 + store float %q0sqr, float* %q0sqr.addr, align 4 + %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2 + store i32 %call, i32* %bx, align 4 + %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #2 + store i32 %call1, i32* %by, align 4 + %call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 + store i32 %call2, i32* %tx, align 4 + %call3 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #2 + store i32 %call3, i32* %ty, align 4 + %0 = load i32, i32* %cols.addr, align 4 + %mul = mul nsw i32 %0, 16 + %1 = load i32, i32* %by, align 4 + %mul4 = mul nsw i32 %mul, %1 + %2 = load i32, i32* %bx, align 4 + %mul5 = mul nsw i32 16, %2 + %add = add nsw i32 %mul4, %mul5 + %3 = load i32, i32* %cols.addr, align 4 + %4 = load i32, i32* %ty, align 4 + %mul6 = mul nsw i32 %3, %4 + %add7 = add nsw i32 %add, %mul6 + %5 = load i32, i32* %tx, align 4 + %add8 = add nsw i32 %add7, %5 + store i32 %add8, i32* %index, align 4 + %6 = load i32, i32* %cols.addr, align 4 + %mul9 = mul nsw i32 %6, 16 + %7 = load i32, i32* %by, align 4 + %mul10 = mul nsw i32 %mul9, %7 + %8 = load i32, i32* %bx, align 4 + %mul11 = mul nsw i32 16, %8 + %add12 = add nsw i32 %mul10, %mul11 + %9 = load i32, i32* %cols.addr, align 4 + %mul13 = mul nsw i32 %9, 16 + %add14 = add nsw i32 %add12, %mul13 + %10 = load i32, i32* %tx, align 4 + %add15 = add nsw i32 %add14, %10 + 
store i32 %add15, i32* %index_s, align 4 + %11 = load i32, i32* %cols.addr, align 4 + %mul16 = mul nsw i32 %11, 16 + %12 = load i32, i32* %by, align 4 + %mul17 = mul nsw i32 %mul16, %12 + %13 = load i32, i32* %bx, align 4 + %mul18 = mul nsw i32 16, %13 + %add19 = add nsw i32 %mul17, %mul18 + %14 = load i32, i32* %cols.addr, align 4 + %15 = load i32, i32* %ty, align 4 + %mul20 = mul nsw i32 %14, %15 + %add21 = add nsw i32 %add19, %mul20 + %add22 = add nsw i32 %add21, 16 + store i32 %add22, i32* %index_e, align 4 + %16 = load float*, float** %J_cuda.addr, align 8 + %17 = load i32, i32* %index, align 4 + %idxprom = sext i32 %17 to i64 + %arrayidx = getelementptr inbounds float, float* %16, i64 %idxprom + %18 = load float, float* %arrayidx, align 4 + %19 = load i32, i32* %ty, align 4 + %idxprom23 = sext i32 %19 to i64 + %arrayidx24 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom23 + %20 = load i32, i32* %tx, align 4 + %idxprom25 = sext i32 %20 to i64 + %arrayidx26 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx24, i64 0, i64 %idxprom25 + store float %18, float* %arrayidx26, align 4 + call void @llvm.nvvm.barrier0() + %21 = load float*, float** %C_cuda.addr, align 8 + %22 = load i32, i32* %index_s, align 4 + %idxprom27 = sext i32 %22 to i64 + %arrayidx28 = getelementptr inbounds float, float* %21, i64 %idxprom27 + %23 = load float, float* %arrayidx28, align 4 + %24 = load i32, i32* %ty, align 4 + %idxprom29 = sext i32 %24 to i64 + %arrayidx30 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE7south_c to [16 x [16 x float]]*), i64 0, i64 %idxprom29 + %25 = load i32, i32* %tx, align 4 + %idxprom31 = sext i32 %25 to i64 + %arrayidx32 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx30, i64 0, i64 
%idxprom31 + store float %23, float* %arrayidx32, align 4 + %26 = load i32, i32* %by, align 4 + %call33 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #2 + %sub = sub i32 %call33, 1 + %cmp = icmp eq i32 %26, %sub + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %27 = load float*, float** %C_cuda.addr, align 8 + %28 = load i32, i32* %cols.addr, align 4 + %mul34 = mul nsw i32 %28, 16 + %call35 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #2 + %sub36 = sub i32 %call35, 1 + %mul37 = mul i32 %mul34, %sub36 + %29 = load i32, i32* %bx, align 4 + %mul38 = mul nsw i32 16, %29 + %add39 = add i32 %mul37, %mul38 + %30 = load i32, i32* %cols.addr, align 4 + %mul40 = mul nsw i32 %30, 15 + %add41 = add i32 %add39, %mul40 + %31 = load i32, i32* %tx, align 4 + %add42 = add i32 %add41, %31 + %idxprom43 = zext i32 %add42 to i64 + %arrayidx44 = getelementptr inbounds float, float* %27, i64 %idxprom43 + %32 = load float, float* %arrayidx44, align 4 + %33 = load i32, i32* %ty, align 4 + %idxprom45 = sext i32 %33 to i64 + %arrayidx46 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE7south_c to [16 x [16 x float]]*), i64 0, i64 %idxprom45 + %34 = load i32, i32* %tx, align 4 + %idxprom47 = sext i32 %34 to i64 + %arrayidx48 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx46, i64 0, i64 %idxprom47 + store float %32, float* %arrayidx48, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + call void @llvm.nvvm.barrier0() + %35 = load float*, float** %C_cuda.addr, align 8 + %36 = load i32, i32* %index_e, align 4 + %idxprom49 = sext i32 %36 to i64 + %arrayidx50 = getelementptr inbounds float, float* %35, i64 %idxprom49 + %37 = load float, float* %arrayidx50, align 4 + %38 = load i32, i32* %ty, align 4 + %idxprom51 = sext i32 %38 to i64 + %arrayidx52 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x 
float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE6east_c to [16 x [16 x float]]*), i64 0, i64 %idxprom51 + %39 = load i32, i32* %tx, align 4 + %idxprom53 = sext i32 %39 to i64 + %arrayidx54 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx52, i64 0, i64 %idxprom53 + store float %37, float* %arrayidx54, align 4 + %40 = load i32, i32* %bx, align 4 + %call55 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #2 + %sub56 = sub i32 %call55, 1 + %cmp57 = icmp eq i32 %40, %sub56 + br i1 %cmp57, label %if.then58, label %if.end75 + +if.then58: ; preds = %if.end + %41 = load float*, float** %C_cuda.addr, align 8 + %42 = load i32, i32* %cols.addr, align 4 + %mul59 = mul nsw i32 %42, 16 + %43 = load i32, i32* %by, align 4 + %mul60 = mul nsw i32 %mul59, %43 + %call61 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #2 + %sub62 = sub i32 %call61, 1 + %mul63 = mul i32 16, %sub62 + %add64 = add i32 %mul60, %mul63 + %44 = load i32, i32* %cols.addr, align 4 + %45 = load i32, i32* %ty, align 4 + %mul65 = mul nsw i32 %44, %45 + %add66 = add i32 %add64, %mul65 + %add67 = add i32 %add66, 16 + %sub68 = sub i32 %add67, 1 + %idxprom69 = zext i32 %sub68 to i64 + %arrayidx70 = getelementptr inbounds float, float* %41, i64 %idxprom69 + %46 = load float, float* %arrayidx70, align 4 + %47 = load i32, i32* %ty, align 4 + %idxprom71 = sext i32 %47 to i64 + %arrayidx72 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE6east_c to [16 x [16 x float]]*), i64 0, i64 %idxprom71 + %48 = load i32, i32* %tx, align 4 + %idxprom73 = sext i32 %48 to i64 + %arrayidx74 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx72, i64 0, i64 %idxprom73 + store float %46, float* %arrayidx74, align 4 + br label %if.end75 + +if.end75: ; preds = %if.then58, %if.end + call void @llvm.nvvm.barrier0() + %49 = load float*, float** 
%C_cuda.addr, align 8 + %50 = load i32, i32* %index, align 4 + %idxprom76 = sext i32 %50 to i64 + %arrayidx77 = getelementptr inbounds float, float* %49, i64 %idxprom76 + %51 = load float, float* %arrayidx77, align 4 + %52 = load i32, i32* %ty, align 4 + %idxprom78 = sext i32 %52 to i64 + %arrayidx79 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE11c_cuda_temp to [16 x [16 x float]]*), i64 0, i64 %idxprom78 + %53 = load i32, i32* %tx, align 4 + %idxprom80 = sext i32 %53 to i64 + %arrayidx81 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx79, i64 0, i64 %idxprom80 + store float %51, float* %arrayidx81, align 4 + call void @llvm.nvvm.barrier0() + %54 = load i32, i32* %ty, align 4 + %idxprom82 = sext i32 %54 to i64 + %arrayidx83 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE11c_cuda_temp to [16 x [16 x float]]*), i64 0, i64 %idxprom82 + %55 = load i32, i32* %tx, align 4 + %idxprom84 = sext i32 %55 to i64 + %arrayidx85 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx83, i64 0, i64 %idxprom84 + %56 = load float, float* %arrayidx85, align 4 + store float %56, float* %cc, align 4 + %57 = load i32, i32* %ty, align 4 + %cmp86 = icmp eq i32 %57, 15 + br i1 %cmp86, label %land.lhs.true, label %if.else + +land.lhs.true: ; preds = %if.end75 + %58 = load i32, i32* %tx, align 4 + %cmp87 = icmp eq i32 %58, 15 + br i1 %cmp87, label %if.then88, label %if.else + +if.then88: ; preds = %land.lhs.true + %59 = load float, float* %cc, align 4 + store float %59, float* %cn, align 4 + %60 = load i32, i32* %ty, align 4 + %idxprom89 = sext i32 %60 to i64 + %arrayidx90 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE7south_c to [16 x [16 x float]]*), i64 0, i64 
%idxprom89 + %61 = load i32, i32* %tx, align 4 + %idxprom91 = sext i32 %61 to i64 + %arrayidx92 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx90, i64 0, i64 %idxprom91 + %62 = load float, float* %arrayidx92, align 4 + store float %62, float* %cs, align 4 + %63 = load float, float* %cc, align 4 + store float %63, float* %cw, align 4 + %64 = load i32, i32* %ty, align 4 + %idxprom93 = sext i32 %64 to i64 + %arrayidx94 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE6east_c to [16 x [16 x float]]*), i64 0, i64 %idxprom93 + %65 = load i32, i32* %tx, align 4 + %idxprom95 = sext i32 %65 to i64 + %arrayidx96 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx94, i64 0, i64 %idxprom95 + %66 = load float, float* %arrayidx96, align 4 + store float %66, float* %ce, align 4 + br label %if.end133 + +if.else: ; preds = %land.lhs.true, %if.end75 + %67 = load i32, i32* %tx, align 4 + %cmp97 = icmp eq i32 %67, 15 + br i1 %cmp97, label %if.then98, label %if.else108 + +if.then98: ; preds = %if.else + %68 = load float, float* %cc, align 4 + store float %68, float* %cn, align 4 + %69 = load i32, i32* %ty, align 4 + %add99 = add nsw i32 %69, 1 + %idxprom100 = sext i32 %add99 to i64 + %arrayidx101 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE11c_cuda_temp to [16 x [16 x float]]*), i64 0, i64 %idxprom100 + %70 = load i32, i32* %tx, align 4 + %idxprom102 = sext i32 %70 to i64 + %arrayidx103 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx101, i64 0, i64 %idxprom102 + %71 = load float, float* %arrayidx103, align 4 + store float %71, float* %cs, align 4 + %72 = load float, float* %cc, align 4 + store float %72, float* %cw, align 4 + %73 = load i32, i32* %ty, align 4 + %idxprom104 = sext i32 %73 to i64 + %arrayidx105 = getelementptr inbounds [16 x 
[16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE6east_c to [16 x [16 x float]]*), i64 0, i64 %idxprom104 + %74 = load i32, i32* %tx, align 4 + %idxprom106 = sext i32 %74 to i64 + %arrayidx107 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx105, i64 0, i64 %idxprom106 + %75 = load float, float* %arrayidx107, align 4 + store float %75, float* %ce, align 4 + br label %if.end132 + +if.else108: ; preds = %if.else + %76 = load i32, i32* %ty, align 4 + %cmp109 = icmp eq i32 %76, 15 + br i1 %cmp109, label %if.then110, label %if.else120 + +if.then110: ; preds = %if.else108 + %77 = load float, float* %cc, align 4 + store float %77, float* %cn, align 4 + %78 = load i32, i32* %ty, align 4 + %idxprom111 = sext i32 %78 to i64 + %arrayidx112 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE7south_c to [16 x [16 x float]]*), i64 0, i64 %idxprom111 + %79 = load i32, i32* %tx, align 4 + %idxprom113 = sext i32 %79 to i64 + %arrayidx114 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx112, i64 0, i64 %idxprom113 + %80 = load float, float* %arrayidx114, align 4 + store float %80, float* %cs, align 4 + %81 = load float, float* %cc, align 4 + store float %81, float* %cw, align 4 + %82 = load i32, i32* %ty, align 4 + %idxprom115 = sext i32 %82 to i64 + %arrayidx116 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE11c_cuda_temp to [16 x [16 x float]]*), i64 0, i64 %idxprom115 + %83 = load i32, i32* %tx, align 4 + %add117 = add nsw i32 %83, 1 + %idxprom118 = sext i32 %add117 to i64 + %arrayidx119 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx116, i64 0, i64 %idxprom118 + %84 = load float, float* %arrayidx119, align 4 + store float %84, float* %ce, align 4 + br label %if.end131 + 
+if.else120: ; preds = %if.else108 + %85 = load float, float* %cc, align 4 + store float %85, float* %cn, align 4 + %86 = load i32, i32* %ty, align 4 + %add121 = add nsw i32 %86, 1 + %idxprom122 = sext i32 %add121 to i64 + %arrayidx123 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE11c_cuda_temp to [16 x [16 x float]]*), i64 0, i64 %idxprom122 + %87 = load i32, i32* %tx, align 4 + %idxprom124 = sext i32 %87 to i64 + %arrayidx125 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx123, i64 0, i64 %idxprom124 + %88 = load float, float* %arrayidx125, align 4 + store float %88, float* %cs, align 4 + %89 = load float, float* %cc, align 4 + store float %89, float* %cw, align 4 + %90 = load i32, i32* %ty, align 4 + %idxprom126 = sext i32 %90 to i64 + %arrayidx127 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE11c_cuda_temp to [16 x [16 x float]]*), i64 0, i64 %idxprom126 + %91 = load i32, i32* %tx, align 4 + %add128 = add nsw i32 %91, 1 + %idxprom129 = sext i32 %add128 to i64 + %arrayidx130 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx127, i64 0, i64 %idxprom129 + %92 = load float, float* %arrayidx130, align 4 + store float %92, float* %ce, align 4 + br label %if.end131 + +if.end131: ; preds = %if.else120, %if.then110 + br label %if.end132 + +if.end132: ; preds = %if.end131, %if.then98 + br label %if.end133 + +if.end133: ; preds = %if.end132, %if.then88 + %93 = load float, float* %cn, align 4 + %94 = load float*, float** %N_C.addr, align 8 + %95 = load i32, i32* %index, align 4 + %idxprom134 = sext i32 %95 to i64 + %arrayidx135 = getelementptr inbounds float, float* %94, i64 %idxprom134 + %96 = load float, float* %arrayidx135, align 4 + %mul136 = fmul contract float %93, %96 + %97 = load float, float* %cs, align 4 + %98 = load float*, float** 
%S_C.addr, align 8 + %99 = load i32, i32* %index, align 4 + %idxprom137 = sext i32 %99 to i64 + %arrayidx138 = getelementptr inbounds float, float* %98, i64 %idxprom137 + %100 = load float, float* %arrayidx138, align 4 + %mul139 = fmul contract float %97, %100 + %add140 = fadd contract float %mul136, %mul139 + %101 = load float, float* %cw, align 4 + %102 = load float*, float** %W_C.addr, align 8 + %103 = load i32, i32* %index, align 4 + %idxprom141 = sext i32 %103 to i64 + %arrayidx142 = getelementptr inbounds float, float* %102, i64 %idxprom141 + %104 = load float, float* %arrayidx142, align 4 + %mul143 = fmul contract float %101, %104 + %add144 = fadd contract float %add140, %mul143 + %105 = load float, float* %ce, align 4 + %106 = load float*, float** %E_C.addr, align 8 + %107 = load i32, i32* %index, align 4 + %idxprom145 = sext i32 %107 to i64 + %arrayidx146 = getelementptr inbounds float, float* %106, i64 %idxprom145 + %108 = load float, float* %arrayidx146, align 4 + %mul147 = fmul contract float %105, %108 + %add148 = fadd contract float %add144, %mul147 + store float %add148, float* %d_sum, align 4 + %109 = load i32, i32* %ty, align 4 + %idxprom149 = sext i32 %109 to i64 + %arrayidx150 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom149 + %110 = load i32, i32* %tx, align 4 + %idxprom151 = sext i32 %110 to i64 + %arrayidx152 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx150, i64 0, i64 %idxprom151 + %111 = load float, float* %arrayidx152, align 4 + %conv = fpext float %111 to double + %112 = load float, float* %lambda.addr, align 4 + %conv153 = fpext float %112 to double + %mul154 = fmul contract double 2.500000e-01, %conv153 + %113 = load float, float* %d_sum, align 4 + %conv155 = fpext float %113 to double + %mul156 = fmul contract double %mul154, %conv155 + %add157 = fadd contract double 
%conv, %mul156 + %conv158 = fptrunc double %add157 to float + %114 = load i32, i32* %ty, align 4 + %idxprom159 = sext i32 %114 to i64 + %arrayidx160 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE13c_cuda_result to [16 x [16 x float]]*), i64 0, i64 %idxprom159 + %115 = load i32, i32* %tx, align 4 + %idxprom161 = sext i32 %115 to i64 + %arrayidx162 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx160, i64 0, i64 %idxprom161 + store float %conv158, float* %arrayidx162, align 4 + call void @llvm.nvvm.barrier0() + %116 = load i32, i32* %ty, align 4 + %idxprom163 = sext i32 %116 to i64 + %arrayidx164 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE13c_cuda_result to [16 x [16 x float]]*), i64 0, i64 %idxprom163 + %117 = load i32, i32* %tx, align 4 + %idxprom165 = sext i32 %117 to i64 + %arrayidx166 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx164, i64 0, i64 %idxprom165 + %118 = load float, float* %arrayidx166, align 4 + %119 = load float*, float** %J_cuda.addr, align 8 + %120 = load i32, i32* %index, align 4 + %idxprom167 = sext i32 %120 to i64 + %arrayidx168 = getelementptr inbounds float, float* %119, i64 %idxprom167 + store float %118, float* %arrayidx168, align 4 + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #3 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #3 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #3 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() #3 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() #3 + +attributes #0 = { 
convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nounwind } +attributes #3 = { nounwind readnone } + +!llvm.module.flags = !{!0, !1, !2} +!nvvm.annotations = !{!3, !4, !5, !6, !5, !7, !7, !7, !7, !8, !8, !7} +!llvm.ident = !{!9} +!nvvmir.version = !{!10} + +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0} +!3 = !{void (float*, float*, float*, float*, float*, float*, i32, i32, float)* @_Z11srad_cuda_1PfS_S_S_S_S_iif, !"kernel", i32 1} +!4 = !{void (float*, float*, float*, float*, float*, float*, i32, i32, float, float)* @_Z11srad_cuda_2PfS_S_S_S_S_iiff, !"kernel", i32 1} +!5 = !{null, !"align", i32 8} +!6 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!7 = !{null, !"align", i32 16} +!8 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!9 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"} +!10 = !{i32 1, i32 4} diff --git 
a/examples/srad_v2/srad-host-x86_64-unknown-linux-gnu.ll b/examples/srad_v2/srad-host-x86_64-unknown-linux-gnu.ll new file mode 100644 index 0000000..7d90ac4 --- /dev/null +++ b/examples/srad_v2/srad-host-x86_64-unknown-linux-gnu.ll @@ -0,0 +1,962 @@ +; ModuleID = 'srad-host-x86_64-unknown-linux-gnu.bc' +source_filename = "srad.cu" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] } +%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 } +%struct.dim3 = type { i32, i32, i32 } +%struct.CUstream_st = type opaque + +$_ZSt3expf = comdat any + +$_ZN4dim3C2Ejjj = comdat any + +@stderr = external dso_local global %struct._IO_FILE*, align 8 +@.str = private unnamed_addr constant [67 x i8] c"Usage: %s \0A\00", align 1 +@.str.1 = private unnamed_addr constant [28 x i8] c"\09 - number of rows\0A\00", align 1 +@.str.2 = private unnamed_addr constant [29 x i8] c"\09 - number of cols\0A\00", align 1 +@.str.3 = private unnamed_addr constant [35 x i8] c"\09 \09 - y1 value of the speckle\0A\00", align 1 +@.str.4 = private unnamed_addr constant [38 x i8] c"\09 - y2 value of the speckle\0A\00", align 1 +@.str.5 = private unnamed_addr constant [39 x i8] c"\09 - x1 value of the speckle\0A\00", align 1 +@.str.6 = private unnamed_addr constant [39 x i8] c"\09 - x2 value of the speckle\0A\00", align 1 +@.str.7 = private unnamed_addr constant [27 x i8] c"\09 - lambda (0,1)\0A\00", align 1 +@.str.8 = private unnamed_addr constant [41 x i8] c"\09 - number of iterations\0A\00", align 1 +@.str.9 = private unnamed_addr constant [29 x i8] c"WG size of kernel = %d X %d\0A\00", align 1 +@.str.10 = private unnamed_addr constant [39 x i8] c"rows and cols must be multiples of 16\0A\00", 
align 1 +@.str.11 = private unnamed_addr constant [30 x i8] c"Randomizing the input matrix\0A\00", align 1 +@.str.12 = private unnamed_addr constant [26 x i8] c"Start the SRAD main loop\0A\00", align 1 +@.str.13 = private unnamed_addr constant [18 x i8] c"Printing Output:\0A\00", align 1 +@.str.14 = private unnamed_addr constant [6 x i8] c"%.5f \00", align 1 +@.str.15 = private unnamed_addr constant [2 x i8] c"\0A\00", align 1 +@.str.16 = private unnamed_addr constant [18 x i8] c"Computation Done\0A\00", align 1 +@0 = private unnamed_addr constant [31 x i8] c"_Z11srad_cuda_1PfS_S_S_S_S_iif\00", align 1 +@1 = private unnamed_addr constant [32 x i8] c"_Z11srad_cuda_2PfS_S_S_S_S_iiff\00", align 1 +@2 = private constant [94817 x i8] c"P\EDU\BA\01\00\10\00Pr\01\00\00\00\00\00\02\00\01\01@\00\00\00HG\01\00\00\00\00\00\00\00\00\00\00\00\00\00\07\00\01\00=\00\00\00\00\00\00\00\00\00\00\00\11\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\7FELF\02\01\013\07\00\00\00\00\00\00\00\02\00\BE\00e\00\00\00\00\00\00\00\00\00\00\00\A0F\01\00\00\00\00\00\E0B\01\00\00\00\00\00=\05=\00@\008\00\03\00@\00\0F\00\01\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00.text._Z11srad_cuda_2PfS_S_S_S_S_iiff\00.nv.info._Z11srad_cuda_2PfS_S_S_S_S_iiff\00.nv.shared._Z11srad_cuda_2PfS_S_S_S_S_iiff\00.nv.global\00.nv.constant0._Z11srad_cuda_2PfS_S_S_S_S_iiff\00.text._Z11srad_cuda_1PfS_S_S_S_S_iif\00.nv.info._Z11srad_cuda_1PfS_S_S_S_S_iif\00.nv.shared._Z11srad_cuda_1PfS_S_S_S_S_iif\00.nv.constant2._Z11srad_cuda_1PfS_S_S_S_S_iif\00.nv.constant0._Z11srad_cuda_1PfS_S_S_S_S_iif\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00_Z11srad_cuda_2PfS_S_S_S_S_iiff\00.text._Z11srad_cuda_2PfS_S_S_S_S_iiff\00.nv.info._Z11srad_cuda_2PfS_S_S_S_S_iiff\00.nv.shared._Z11srad_cuda_2PfS_S_S_S_S_iiff\00.nv.global\00blockIdx\00threadIdx\00gridDim\00$___ZZ11srad_cuda_2PfS_S_S_S_S_iiffE7south_c__1225\00$___ZZ11srad_cuda_2PfS_S_S_S_S_iiffE6east_c__1227\00$___ZZ11srad_cuda_2PfS
_S_S_S_S_iiffE11c_cuda_temp__1229\00$___ZZ11srad_cuda_2PfS_S_S_S_S_iiffE13c_cuda_result__1231\00$___ZZ11srad_cuda_2PfS_S_S_S_S_iiffE4temp__1233\00.nv.constant0._Z11srad_cuda_2PfS_S_S_S_S_iiff\00_param\00_Z11srad_cuda_1PfS_S_S_S_S_iif\00.text._Z11srad_cuda_1PfS_S_S_S_S_iif\00.nv.info._Z11srad_cuda_1PfS_S_S_S_S_iif\00.nv.shared._Z11srad_cuda_1PfS_S_S_S_S_iif\00.nv.constant2._Z11srad_cuda_1PfS_S_S_S_S_iif\00__ocg_const\00$_Z11srad_cuda_1PfS_S_S_S_S_iif$__cuda_sm20_dblrcp_rn_slowpath_v3\00$_Z11srad_cuda_1PfS_S_S_S_S_iif$__cuda_sm3x_div_rn_noftz_f32\00$_Z11srad_cuda_1PfS_S_S_S_S_iif$__cuda_sm3x_div_rn_noftz_f32_slowpath\00$___ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp__199\00$___ZZ11srad_cuda_1PfS_S_S_S_S_iifE11temp_result__201\00$___ZZ11srad_cuda_1PfS_S_S_S_S_iifE5north__203\00$___ZZ11srad_cuda_1PfS_S_S_S_S_iifE5south__205\00$___ZZ11srad_cuda_1PfS_S_S_S_S_iifE4east__207\00$___ZZ11srad_cuda_1PfS_S_S_S_S_iifE4west__209\00.nv.constant0._Z11srad_cuda_1PfS_S_S_S_S_iif\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00R\00\00\00\03\00\0A\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\A1\00\00\00\03\00\0C\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\CC\00\00\00\03\00\0D\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\D7\00\00\00\01\00\0D\00\01\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\E0\00\00\00\01\00\0D\00\02\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\EA\00\00\00\01\00\0D\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\F9\01\00\00\03\00\07\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00M\02\00\00\03\00\0B\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\9A\02\00\00\03\00\0E\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\C4\02\00\00\03\00\08\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\FD\02\00\00\22\00\0B\00\90\CE\00\00\00\00\00\00H\05\00\00\00\00\00\00?\03\00\00\22\00\0B\00\D8\D3\00\00\00\00\00\00`\01\00\00\00\00\00\00|\03\00\00\22\00\0B\008\D5\00\00\00\00\00\00H\08\00\00\00\00\00\00\E0\04\00\00\03\00\09\0
0\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\002\00\00\00\12\10\0A\00\00\00\00\00\00\00\00\00@W\00\00\00\00\00\00.\02\00\00\12\10\0B\00\00\00\00\00\00\00\00\00\80\DD\00\00\00\00\00\00\04/\08\00\10\00\00\00\15\00\00\00\04#\08\00\0D\00\00\00\00\00\00\00\04\12\08\00\0D\00\00\00\00\00\00\00\04\11\08\00\0D\00\00\00\00\00\00\00\04#\08\00\0C\00\00\00\00\00\00\00\04\12\08\00\0C\00\00\00\00\00\00\00\04\11\08\00\0C\00\00\00\00\00\00\00\04#\08\00\0B\00\00\00\00\00\00\00\04\12\08\00\0B\00\00\00\00\00\00\00\04\11\08\00\0B\00\00\00\00\00\00\00\04#\08\00\10\00\00\00\00\00\00\00\04\12\08\00\10\00\00\00\90\00\00\00\04\11\08\00\10\00\00\00\90\00\00\00\04/\08\00\0F\00\00\00\16\00\00\00\04#\08\00\0F\00\00\00\00\00\00\00\04\12\08\00\0F\00\00\00x\00\00\00\04\11\08\00\0F\00\00\00x\00\00\00\010\00\00\01*\00\00\04\0A\08\00\07\00\00\00@\01@\00\03\19@\00\04\17\0C\00\00\00\00\00\09\00<\00\00\F0\11\00\04\17\0C\00\00\00\00\00\08\008\00\00\F0\11\00\04\17\0C\00\00\00\00\00\07\004\00\00\F0\11\00\04\17\0C\00\00\00\00\00\06\000\00\00\F0\11\00\04\17\0C\00\00\00\00\00\05\00(\00\00\F0!\00\04\17\0C\00\00\00\00\00\04\00 \00\00\F0!\00\04\17\0C\00\00\00\00\00\03\00\18\00\00\F0!\00\04\17\0C\00\00\00\00\00\02\00\10\00\00\F0!\00\04\17\0C\00\00\00\00\00\01\00\08\00\00\F0!\00\04\17\0C\00\00\00\00\00\00\00\00\00\00\F0!\00\03\1B\FF\00\04\1D\08\00\10\08\00\00\88\08\00\00\04\1C\04\00\10W\00\00\04\1E\04\00@\00\00\00\010\00\00\01*\00\00\04\0A\08\00\0E\00\00\00@\01<\00\03\19<\00\04\17\0C\00\00\00\00\00\08\008\00\00\F0\11\00\04\17\0C\00\00\00\00\00\07\004\00\00\F0\11\00\04\17\0C\00\00\00\00\00\06\000\00\00\F0\11\00\04\17\0C\00\00\00\00\00\05\00(\00\00\F0!\00\04\17\0C\00\00\00\00\00\04\00 
\00\00\F0!\00\04\17\0C\00\00\00\00\00\03\00\18\00\00\F0!\00\04\17\0C\00\00\00\00\00\02\00\10\00\00\F0!\00\04\17\0C\00\00\00\00\00\01\00\08\00\00\F0!\00\04\17\0C\00\00\00\00\00\00\00\00\00\00\F0!\00\03\1B\FF\00\04\1D\08\00\90\07\00\00\08\08\00\00\04\1C\04\00\88\CE\00\00\04\1E\04\00\80\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\02\04@\00\01\00\00\01\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\0
0\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\EF\1F\E0\FD\03!\00\D3rd<3>;\0A\0Amov.u2\00\1B,e\00b;\0Acvta\8D\00\04%\00\13,\\\00\22ld\C8\00\02\18\00nrd2, [\CE\00\1E])\00\1F1)\00\01a0];\0Ast#\00\81[%SP+0],,\00\0A\16\00\128\16\00\222;\B6\00\01\D8\00a1, 999(\00\02g\00\00\EF\00\18[\9E\01\03M\00\AF1;\0Aret;\0A\0A}\D6\01\1A\FE\02FuncGetAttributes\E1\01\0D#\00\0E\EC\01\0F+\00\06\0F\F7\01\1B\1F1\F7\01Q\1F1\F7\01!\0E\D9\00\0F\02\02\0F\0E8\01\0F\0D\02\8DhDevice\B4\00\0E\0E\02\0E$\00\0F\0F\02\00/32,\00\0B\1F1,\00\18\1F2<\02\13\1F2<\02\1F\1D4<\02\1F2<\02\0C\1F2<\02\13\01_\00\04;\02\0F\D9\00\07\1D]4\00\1F14\00\06\0Fp\02\10\0E\9A\01\0Fq\02\12(32q\02\0B\15\00!12\16\00\09\86\02\1F3\86\02\15\1F3\86\02#2Get\CB\00\0E}\02\05\1B\00\04\DA\00\0F\1C\02\13?3[8W\04.\0F\1B\02\0D\1F3W\04\19\04\B3\01\0D\D0\00\0F\AA\01\06\0F\05\04W\F0\04OccupancyMaxActiveB\82\08\FE\03sPerMultiprocessor\9F\01\0F;\00\16\0EB\06\0FC\00%\0EJ\04\0FC\00\1E\1F2\86\00/\1F3\88\02\13O4[32\89\02\1C\1D3\89\02\1F4\89\02\0C\1F4\89\02\19\133\89\02\0F\F1\00\1E\0F\BC\04\00\0FK\00$/2]w\07\00\0FL\00$\0F\1F\05\01\0F\98\00%\0F\A7\07\1D\097\05\186M\05\04,\00\2224-\00\183\CF\03\1F2\CF\03\15\1F2\CF\03L\9FWithFlags\D8\03(\05D\00\0E\E1\03\0FL\00'\0F\EA\030\0CL\00\1F2\98\008\1F3\98\008\1F4H\04\13O5[40\EC\08.\0FH\04\0D\1F5\EC\08\1C\0F\F9\00+\1F]\9C\040\0D\9A\01\0F\A5\040\0D:\02\0F\AE\041\0D\DB\02\0F\B7\041\0D|\03\0F\C0\04I\08-\00\1F3$\0A6\F0\0
9visible .entry _Z11srad_\08\01d_1PfS_\02\006iif\A7\04\00\A1\00\0F,\00\0B\0E\8F\04\0F4\00\16\1F14\00 \1F24\00 \1F34\00 \1F44\00 \1754\00/324\00\13\1F64\00 \1674\00\1Ff4\00\15\1F8\FF\04\13_6[144\00\05\15\A5pred %p<19\02\05\00\90\00{%f<166>&\056165&\00\00I\00^fd<10;\05\114e\10P\09.sha_\00\03\AB\00\124\AB\00\1FZ\EB\00\0A\CFE4temp[1024]D\00& 11E\00\7F_resultL\00,o5northE\00-?souE\00.O4eas\CE\00-64weD\00\0F\DE\06\08\1F6\DE\06\12\02*\02\00\8B\05\0F\C0\02\13\1E]\1A\07\0F<\00\15\1E7\AD\06\0Fx\00\16\1F6@\06\00\1F6=\00\15\1F5=\00\00\1F5=\00\15\0F\B8\07\01\1F4=\00\15\1F3\F5\07\01\0F=\00\15\0F\89\07\01\0Fm\01\16\0Fq\07\01\0Fn\01\16#0]<\02#tof\17\04C\00\117\1F\07\04[\02\0A\1C\00\118\1C\00\1F7;\00\05\119\1F\00\1F5;\00\02!10\1D\00\1F9<\00\05!11 \00\1F4=\00\03\122\F7\07\1F1>\00\06\143\D5\07\0F>\00\01\124>\00\1F3>\00\06\145@\08\0F>\00\01\126>\00\1F5>\00\06\147\94\08\0F>\00\01\023\01/17\C8\08\03\1F8\C9\08\02*16\17\00\03\CA\08?d14\CC\08\03*12\18\00\03\CD\08:d10\18\00\134w\00\1A8\FC\08\1F46\13\02\155\12\09Est.f\16\00\01\8B\00+f1(\09\8A%ctaid.xC\00\156\1F\09\06+\0E\154-\00\1By-\00\02\CD\00\184-\00\00\98\01\1FtX\00\00\02\B1\00\185+\00\136+\00\0BV\00\127\C6\00\116+\03\02*\00%7,\F7\00\07\16\00%8,\87\00\A1;\0Amul.lo.s\1A\00\229,5\00 %r8\01#hlh\07\02\D1\02G9, 4F\00\00\C6\02\04\FB\00\09-\00#2,\1E\00c4;\0Aadd[\00\02\B7\02\02J\00(12H\00%4,\C0\00\0B\8F\00%15\90\00\1A1M\00&6,S\00\185M\00%7,8\01\091\00&8,7\00\1B78\01\03=\0B\188H\00\1F9:\01\02/20;\01\06321,8\00\00'\00\08\11\01322, \00\08?\01/23?\01\03324,\1E\00\09?\01625,K\00\09H\00\1F6\F2\00\03627,7\00\00e\03$ub\1A\00%8, \00+19U\16\03\BC\10\08\0C\01\192\0C\01\06\AA\00\113\CC\01\1B2\17\02/31#\01\06532,7\00(314\00\0F\0C\01\04\113\0C\01\1C3\0C\01635,N\00\0A\1A\00&6, \00\180b\00\0F\18\02\04638,7\00\0B\18\02\128\A6\03\183\0C\01\1F3\18\02\03\1F4\18\02\07341,8\00\00'\00\08\DE\00\114\C1\00\1B4\18\02\1F4\0C\01\04\114\0C\01\1C4\0C\01645,K\00\09H\00\1F6W\03\06'47\93\00\196N\00&8,T\00\197\1A\00#9, 
\00\1C-\88\18\02\A2\04'49}\00/50'\01\02\1F5\1C\02\07352,8\00\00'\00\08\F9\00353, \00\08'\01/54'\01\03355,\1E\00\09'\01656,K\00\081\04/57'\01\06'58\93\00\09\0D\01659,T\00\188\1A\00360, \00*16'\01\139\9E\05\03\96\02\033\07%9,\C7\06\01\18\00\02F\00\15dh\04\158\DE\00\032\09$1, \00\132x\00\03\19\00$2,Q\00\01'\00\01N\00\02\0F\0B\00f\09\01\22\00\0Ac\00\193\F9\00%ov\95\09/4,\84\0C\14\03\06\08\02\F7\0B\05?\00\03a\08\1A4\B8\00$6,u\00\1A6\B8\00(7,6\00\196\A3\00\188\B5\03\08\06\01\03\E6\00\1C8\06\01\02\\\04\05U\00\179\D1\07\00\1D\00\01\D0\07\182\81\01/31\81\01\03\00]\04\04\05\04\09{\00\00\8D\04\03 \00\0B{\00$4,Q\00\01'\00\07\81\01\223,\80\00\1A4c\00\1F5\81\01\04\00\8E\04\0F\C0\0D\13\0F\81\01\02\2237\82\00\1A6\B8\00$8,u\00\0A\81\01\02\07\04\056\00\188\A3\00/40\81\01\04\02\D5\04-d4\87\02\02\89\04\04U\00)41\81\01\114\03\03'f3\7F\03\196\12\04rsetp.ne\7F\003p1,!\00\F2\0C0;\0A@%p1 bra LBB6_2;\0Abra.uni\10\0021;\0A\08\00\17:\DB\01+55\DB\01\03\13\09\1F79\04\03378,\1E\00\08g\04/798\06\03680,4\00\02\E9\0B\00\03\01\049\01\02\1B\04)805\01\00k\04\03\1C\00\0A5\01\02i\04\22d5\04\03'57;\02\125;\02)58;\02(59;\02\07\98\01\02\7F\04\01 \00\196T\02/61\D5\03*\126\D7\01)61\B8\00863,\1D\00\1D0\07\01$64 \01\08\A4\00$5,\1C\00\0A\07\01866,V\00\195<\02\126\8D\0B\1B5\FA\01\135\FA\01'2:T\02\192f\06\067\0B\00Y\00\15n\91\0B\07\B5\05#4,\1F\00,-1\85\02#2,R\00\00'\00\01\88\02\162\88\02\1B4\8E\00\133\8E\00\183\88\02/43\88\02\02(65#\07\07\A6\00\1F6\A6\00\05#7,\1F\00+-1\AA\06368,R\00\00'\00\08\D6\02369, \00\09\D6\02\1F0\04\03\04#1,\1E\00\09=\07572,K\00\1C7|\00$73|\00)155\00%4,;\00\187\F3\03/75S\03\03676,7\00\02\82\0E\00C\01\036\00\12d\00\09\1A7\D6\04\02\AA\08-d4L\02\00\FE\08\03|\01\01'\00\07S\03\124S\03)46S\03/47S\03\04\02\13\09\01 \00\0AS\03/49\A7\05*\02X\08:d49\0B\04(1,\1D\00\1E8Z\04\142 \01\08Z\04$3,\1C\00\0BZ\04(4,V\00\193S\03\115\E0\0A\1Cf\C5\02\134\C5\02/4:k\03\03\C95:\0Abar.sync 
\CD\08/67L\07\03\00~\02\04*\0A\0A\0A\04\02&\06\1D6\C7\07\00\82\02\03Q\00\01'\00\07\BE\01\126\BE\01)70c\00/71L\07\04\00\80\02\0F\83\14\12\0F\A4\01\02\02\DC\029d72\B7\00\02\A3\02\01t\00\0AK\07\00\DA\02\076\00\194\A2\00\1F6K\07\04\00r\06\03 \00\0B\05\01(8,U\00\197\BC\01\117u\0B(f6\80\01\1F7M\0A\04\00\AC\06\04\83\0A\09{\00\128\17\02\1D8\C6\07482,Q\00\01'\00\07\80\01\127\80\01\1B8c\00\0FM\0A\05?84,G\16\13\0F\80\01\02\128\FC\00\1A8L\0A\128\DC\03\1D8L\0A887,6\00\09L\0A\1F8L\0A\05\138\E5\00\0D\85\02890,U\00\0AL\0A\129L\0A\187V\11\0A\10\11\09\CB\08\01\12\02\158\CB\08\163C\06\1B7~\03\136f\03\186C\06/10D\06\03/96g\0D\02/97\E1\11\07#8,8\00\00'\00\08\FC\05\119\08\01\1B9*\06?100\9E\11\07&01M\009100p\11502,U\00=101\C8\042104\1B\00\0A\\\03E105,\1F\00\0A\D3\05E106,\0D\01\02+\00\07\98\02\129\98\02O106]l\00\00\157\A3\00\07\9C\02?108\1D\04)C109,G\00\0A\C3\00\03\FF\15\01v\00\0A\A1\02\131\FA\15\05;\00(10\A4\02?112\A5\02\04\131\F5\15-11\E3\05\141\F3\15\04\\\00\1A1\E6\05\2211\E7\05\1C94\0B\130j\02\177;\09(82\C4\02\06\95\08\178;\09\19x\08\0B\01}\00\1F8;\09\02#4,R\00\00'\00\01\F8\02\164\F8\02\0C\8F\00\138\8E\00\188\F8\02/91\F7\02\02/85\D7\14\03\1F6\F7\02\06\02\C5\03\02\19\04(86\F7\02\00\F8\03\02 \00\09\8E\15)89\F1\00\07'\03#0,\1F\00\09#\09\00\AA\00\05\1D\00\081\12/92@\03\06&93\95\00)92N\00&4,T\00\193\1A\00#5, 
\00--1#\09\129t\03\0A\1C\08\02f\00-d99\02\02e\00\12df\00(d9\B1\0E\128M\03.94L\03\159\AF\00\07J\03/96\E6\05)\2297\82\00\0A\DB\09\02\AA\04-d9\B1\0E\02T\04\056\00\09\B1\0E\06\AF\04\0DB\03\02\AD\04\02\22\00\0CY\04'2,Y\00\01\AE\04\08A\03\120\B7\0E\1B8\B2\02\139\B2\02/9:Y\03\04/10*\09\05?115\AB\07\03%11\B6\13\1C6\02\04%7,\22\00\0C\02\04%8,V\00\02+\00\07\CC\01#10\1A\05+18k\00\1F9\B4\07\04\101^\12\0F\16\1F\12\0F\18\05\03\03\A4\12:120\C2\00\04\A3\12-19\18\05\00\18\12\09;\00\192\AB\00/24\D6\01\05\038\12.12/\06\03s\12\06\\\00\0AQ\0E#12\DF\19,0;\A6\01\05w\00\0F\14\0C\05\121C\17\02\22\00\0C\CB\00\1B9\CB\00\0A\A1\02\1F3\A1\02\06\03:\12.13\A1\02\02\EB\16\06\\\00(31\E1\01\131\E1\01\01L\02\08\C1\1A\121\0E.9f11\D2\07\09\AC\09\0A\A8\08$5,\22\00\02\A9\08\165\B1\05,13A\06\05v\11(1:_\00\0A\AC\01\09_\00$6,\22\00\02_\00\1F6_\00\09\142_\00(2:6\13/61\95\01\04\133\88\10\01\22\00\1B6R\13\1F3\FF\10*\123\22\10\00H\00\0A<\13\03\CC\10\06 \00\09\B8\02/36\F5\0B\05\133\DB\0C-36\CE\03\123\CD\0F\07\\\00\0A\CE\03\03\9C\05\113S\00\07\18\00&9,\F5\01\02\EE\19%rn\22\02$0,<\00\01+\00\0A'\02\139\09\03(10'\02\0Ad\12\08{\1B$0, \00\0E\D7\09\123\D4\0D:160\81\01\1F7S\04*\123\99\10\00G\00\0AG\01\02_\10\02v\00\0B\D7\14\03\86\0D\06;\00\0A\9B\01/74\9B\01\05\03q\0D-37S\04\123\A3\10\07\\\00\195\88\03\03\89\03\113\BC\05\07\18\00\1F2\9B\01\09$3,<\00\01+\00\0B\C2\03#08\9C\01\1A3\BE\00\1F7<\16\05\1F7\D4\0A*$37\0E\16\1B8>\16%0,y\00\0Bi\01\03F\0E\05;\00)80\AB\00/82i\01\05\01U\0A\03\22\00\0BL\17\02W\0A\06\\\00)83Q\01\03\C3\11:384i\01\1F5i\01\09$6,<\00\01+\00\0Ci\01\130i\01\1A6\BE\00\1F5n\04\05\03\86\0E.38R\17*87{\02(86Y\03/61\A3\1E\04\02\C0\04\01 \00\0FY\03\00\138\C2\1E\1B2\87\00\04v\01\0E\0C\19\03\C1\0E\05\8F\00)89-\01\03\CA\0F:390E\01\1F8E\01\09$9,<\00\01+\00\0CE\01\134E\01\0C\DE\0B$38\CD\05\193,\06\1F5\8B\06\07$7,\22\00\02,\06\177,\06\1C6,\06\04\B4\12)14_\00\1F6\8B\06\07$8,\22\00!15`\00\1F8`\00\09\04\FC\12)15\8C\06\1F3\8C\06\06\03\B2\07.33\8C\06\1F3\8C\06,\04i\1A\1B3\8C\06\02\F6\1E\07 
\00\0A\F1\04\1F3\8C\06\06\04\FB\19\1E3\8C\06\046\1A\05\\\00\08\8C\06#96B\02\193\8B\06/97A\02\07\01\D5\0B\019\00\00(\00\0F\87\06\02)98\86\06\1F5\86\06\04\02b\19\01 \00\0F-\03\00\02\80\1A+15{\01\1F4\86\06+\03\F6\1A+34\86\06\04\F5\1A\1E3\86\06\03\81\16\06;\00\0A\95\01\1F4\86\06\06\03\DA\16.34\86\06\03\DC\16\06\\\00\09\08\11\149m\1C\0A\85\06/00\D7\03\08\01\1C\0D\22f9\DD\03/00\84\06\04*01\BC\00\0F\1E\0B\05\1333\17.34-\06+49\10\01\0A\EE\01\0F\BE#\05\02\22\1B\01 \00\1F-\EF\01\00\135\93\08\0B\B5\06\03\F7\16-35R\0B\123G\17\06\90\00)51-\01\033\1F:352E\01\1F3E\01\09$4,<\00\01+\00\0Fa\06\04*04\05\1E\0F:\15\05/35;\15*\133N\1C;354\15\01%6,y\00\0B\9D\01\03\A1\1C\06;\00\0A\0C\07/58\AD\02\05\02\10!\02\22\00\0C\CE\0A*0,\\\00\09\85\06$05\CE\0A\0A\85\06/06i\01\09$7,<\00\01+\00\0F\85\06\04\1C0[\15$37\C6\05\196&\06\1F7\85\06\07\03\BF\14\157&\06\179&\06\0DR\13\05`\00\197`\00\1F8\86\06\07\03\0F'\00#\00\03a\00/10b\00\09\05G\07\188b\00/51%\05\05$2, \00\0F7\03\01\02\13\11*15\CD\11?302&\05*\03\9F\15\1B3\DA\15\123\F1\15\02v\00\0By\02\03\F5\15\06;\00\0A$\03\1F0\BB\06\06\02\D9\15\02\22\00\0C\0A\22*8,\\\00\08\BB\06\148B\0A\190\BB\06\1F8A\0A\08\01\F0\09\019\00\00(\00\0F\BB\06\02\0Ar\18?309\DB\03\05/10\E2!+\03\EB\15+31\89\06\03<,.30\89\06\046,\05;\00\0A\89\06\1F1\89\06\06\03U\12.31\89\06\03/,\06\\\00\08\89\06\148`\0A)16c\01\0F_\0A\08\01\CC\0A\019\00\00(\00\0F\86\06\03*89\B8\00\0F\85\06\06\03\15\13.31\85\06*19o\02\0A\FA)/53\85\06\05$4, 
\00\0FN\03\01\132\85\06\0Bp\05\03\08\12.32\85\06\03\11\13\05\90\00\09\B6%#90@\01\192\84\06/91^\09\08#2,9\00\00(\00\0F\80\06\03\1A9\F8\01\0F\06&\06/32\7F\06+\03\99\13,32\0F\01\034\02\1E2\7F\06\03\B6%\06;\00\0AO\03\0F\0E&\06\133\17\13\1F3\10&\00\03\11&\06\\\00\08\7F\06$93\C1\0A\09~\06/94c\01\08#5,9\00\00(\00\0Fz\06\03,95y\06\04\D5\1B)19\19\06\1F9y\06\07\02\A7\03\01#\00\04\19\06\08\00%\0C\01%\142t\16(20b\00/10{\06\08\03\A7-\150`%\1F2a\00\09\04\ED\13\192\ED\13/47z\06\04\02\03\0A/14\B1\09\03\122\1A\0F:148z(\1F7z\06*\132\97\11\1B2\B0\1F\132\B1\1F\1E2\B2\1F\132\ED\11\06;\00\09z\06\1F2\B6\1F\06\132\B7\1F\1E2\B8\1F\132\B9\1F\06\\\00\08z\06\147\8E)\197z\06\1F7Z\0A\08\01\BF\00\019\00\00(\00\0Fz\06\02\0C\B7\00\0Fz\06\05/28z\06+\132\91\11\1B2\DB&\1322 -27z\06\132\E7\11\06;\00\1A2\E4)\0Fz\06\05\132\C8\1F-28z\06\132}\11\06\\\00\08z\06#75c\01\198z\06/76c\01\08#7,9\00\00(\00\0Fz\06\03*77\B8\00\1F7d\01\06\0F\83\13*$28\DD*\1B8\F4*%0,y\00\0Bc\01\03\C4\1B\05;\00\199\83\13?292H+\06\03@\01\0E\1E\1C\132\1F\1C\05\\\00\09!\1C#78c\01)94c\01\0F\85\16\08\01B\14\019\00\00(\00\0F\9D\06\03\1B8\B8\00\0F}\13\05\122t .d2\1E\1C:297\D2\03\199}\13/49\DD\07\05\02\BE\07\1F4\D6\16\02#29\80\0E\0B\F6\02\032\1C/29\89,\00)0,\8F\00\189y\06$81\7F\0A\09y\06/82?\01\08#3,9\00\00(\00\0Fy\06\03,83y\06\05\1F)\192\17\06\1F1y\06\08\02T3\161w+\07\AD\22,24\17\06\04\F2(823:\BA\01/43\BA\01\05\02\C7'\02\22\00\1B6x.\1F5y.+\03\18\11:245\12\02\02\EF0\07 
\00\0A-\05\1F4f.\06\03\1D\10-24\DF\01\132\A1'\06\\\00\09\A4&\03\BB\09)25\DF\01\1F6\BB\09\08\01\FC\15\019\00\00(\00\0F\E4\05\02\1A655\0F\F4\0D\05\02\8D\12/14F\16\02#25y\07\0A\F3\0D/25y\07+\03\80(+25y\07\03\7F(.25y\07\03d\10\06;\00\0A\95\01\1F5y\07\06\03I\10.25y\07\03H-\06\\\00\08y\07$63\95\01\09y\07/64\95\01\08#5,9\00\00(\00\0F\16\06\03*65\B8\00\0F\9A-\05\132\09\11.25\22\07*61\0C\01)60\EA\01\0F\90\0C\05\02K\03/14\90\0C\03\132O\18\1C4\891\04e\00\0E\F3\05\03\C6\1C\05\90\00(63)\01\03\EE\159264@\01\0F\EE\15\08\01\CA\1C\019\00\00(\00\0F\F3\05\03\196\1F\1F?265@\01\06%6,\22\00\0C@\01\1B7@\01\0A\F3\05\1F5@\01\05$6, \00\0F?\01\01\03\C2\0A\1B6\87\00\03\14\04\1E6\14\04\03\B6*\05\8F\00\0A\B8*\149\F8\09\09\14\04/70\95\08\08#1,9\00\00(\00\0F\F3\05\03,71\F3\05\04\0B\19\192\0B\19/12\0A\0C\08$4,#\00\04l\0C\07\A9%,26\F4\05\05U\06\185b\00\1F3\AA\17\05\02]7\01 \00\0F\A7\02\01\02`\10+13\0D\06/16\92\04*\03.#+21\BF\01\03\16\10.21F\02\05;5\04;\00\0A\9A\02/20\92\04\05\04\945\0F\955\01)2,\\\00\09\E2\0F\144E\09\0A\E2\0F\1F4E\09\08\01w\06\019\00\00(\00\0F'\06\02\195=\04/37\96\01\05$8, \00\0F\FD\02\00\024#;138\EA\00\03!\04\0F\C75\00*25>\01*24>\01\0F\D0\05\06\04\AC5\1E2\D0\05\03f#\05\\\00\192\1A3\141>\01\09\D0\05/52>\01\08#3,9\00\00(\00\0Fj\1F\03*53\B8\00\0F\D0\05\06\04k6\1E2\D0\05*31\0C\01\1A3\93\01\0F\83\0A\05\00\9E\1A\03 \00\0F)\03\01\133\D0\05\0B\84\0A\03\916.23\D0\05\03\0A\1C\05\90\00\09\956#54@\01)34@\01\0F\03\15\08\01;\18\019\00\00(\00\0F\D0\05\03\0A4\18\1F2\E56\06/234(*$23\\\01\0B\E1\03\03\B2\1C.23\E1\03\03\956\06;\00\0A\E1\03\0F\ED6\06\132\AD\1A.24\E1\03\03\FB\1B\05\\\00(41L\01\03\03\15)24\E1\03/58c\01\08#9,9\00\00(\00\0F_\1F\03\1D5^\1F\05\86\0B\09\D9\18/13\E7\0B\08$5,#\00\04\F4\05\07\EC%,28\F4\05\04\DB\18\192\DB\18/29^\04\05\03\E7?\0F\CB\02\03\121\92 
+13\90-/88x(*\03\FE\0E\1B19\0F\1319\0F\1E19\0F\131\DD\0E\05;\00\0A9\0F\1F19\0F\06\1319\0F\1E19\0F\1319\0F\05\\\00\099\0F\133F\09\1A19\0F\1F3F\09\08\01\D2\02\019\00\00(\00\0F4\1F\02\0A\CA\02/19u\03\05/19[:+\131\CE\0E\1B1\AC+\131\AD+\1E1\90\0F\141V+\05;\00\0Av\03\0F\B1+\06\132\B1+.20v\03\03\B1+\04\\\008201L\01\04j\09\1A0W\07\0Fj\09\08\01\E1\03\019\00\00(\00\0F\FF\1E\03*41\B8\00\0F\E2\0E\06\03\E8\1A.20%\07)05o\029204\19\06\0F\19$\05\03\FA\05\1F3\C7\1B\03#20a\09\0B}\0D\03\1C\1B.20Y\07\03\1C\1B\04\90\00\192\1C\1B$42@\01\09Y\07/43@\01\08#4,9\00\00(\00\0F\19\06\03\0Bv\0F\1F0Y\07\06\03?1.20Y\07*11@\01\0A\B2(/33@\01\05$4, \00\0F\81\0A\01\03^\01\0B(\0D\03\D5\1A.21X\07\03r1\04\8F\008213(\01\04~\14\191\E2\03/46?\01\08#7,9\00\00(\00\0F\CF\1E\03\1D4\CE\1E\04 +\192\0C\1E/14\E9\0B\08$6,#\00\03\DC\11\07\81+,30\F4\05\04\E1.\192\B6\18/23\F4\05\04\02\1D\0A/12\8F\0E\03\010*\01&\00\0A\92\04\1F6l.+\03\AB\0E+16l.\03\0C,\1E1n\0F\131\01\0F\06;\00\0Al.\1F6l.\06\03\91+.16l.\03\9D\0E\06\\\00\08\90\16#24\F4\05\196\90\16\1F2j\09\08\01&\1B\019\00\00(\00\0Fm\1E\02\192\E5\0E/25\96\01\05\02\F4\01\1F2\E5\0E\02\121\AB\0E;126\EA\00%8,\1F\00\0C>\01\1B9>\01\0A{\06\1F7\DF.\06\03\0D).17\DF.\03\C6+\05\\\00\197\BEB\03E\099172>\01\0FE\09\08\01\10\1C\019\00\00(\00\0F\CF\05\03*29\B8\00\0F\08\1D\05?174%6*\03\05\1A;174\0F\01%6,y\00\0Bc\01\03[\1A\06;\00\1A6\AB\00\1F8c\01\06\04@\01\1D8c\01\03\C0+\06\\\00\08\9C\10\143#2)80\95\08\0F\B0\14\08\01\93\06\019\00\00(\00\0F\F2\05\03\0A\81(/18-)\05\131=\1A-18'\14+18\AD\03)82\F6\02\0F0&\05\02:\0E/12\DE\0E\02#18\AA\04\0B(\0A\03p\1A.18\E0\03\03p\1A\05\8F\00\09p\1A$33?\01\09\E0\03/34?\01\08#5,9\00\00(\00\0FG\1E\03\1D3G\1E\04\84\1D\1A3\E5\1D\0F\FB\03\05\04\EF\05\0Fy\11\03#13\EA\17\0Ay\11\01-\07\0F\91\05(\03**\0CO\07\03\06\12\02v\00\0B\F0\02\03\90\0D\06;\00\0A\F0\02\1F3\F0\02\06\03u\0D.13\F0\02\02L\0F\07\\\00\09w&\03\E2\08*14v&\0F\DE'\09#4,9\00\00(\00\0F@1\03\0A\1DM\0Fy\11\05\03\8AL\1F1\9B\02\03\02k\0E+11\9B\02\03g\0E.14\22\03*43>\01\0Ab*/14\CF\06\06\03b*.14\EF\02\03*\19\05\\\00\09
b*$15>\01\0Ab*\0F\B3'\09#7,9\00\00(\00\0F_*\04\09_\1D\1F1^*\06\03[ \02\22\00\0C\0C\01\1B9\0C\01\09\D9#/19.\04\05$0, \00\0F\BA\08\02\03\CD\0E\0B\CF6\03^*.15\D1\06\03^*\05\90\00\0A^*\03\F7\13\1B1]*\0F\F7\13\08\01\A1\00\22f1\BF3\0F<\1D\04\1A2\BB\0E/53@\01\05\03\95\19.15\8A\03*55@\01\195\D3\02/21@\01\05$2, \00\0F?\01\01\03\82\08\0B\E9=\03\C8\19\1E1\C8\19\03\EE+\06\8F\00\09\C8\19$21?\01\09\9F\0C/22?\01\08#3,9\00\00(\00\0F<\1D\03/23n\05\06/1:z\0B\04?32:\89\11\04?33:\97\17\04?34:\A4\1D\04?35:7$\04?36:\CA*\04?37:i1\04838:\F1\04\05\03B\0A\B4\03\05\D3\01\00\86\03\02\9F<\06\B5\03\01\D7\01\02$\00\01\07\00X;\0Afma \00$3,\\\00\07\07\00\192`\00&4,\A6\02\0D@\00$5,$\00\07\07\00\09\EA3\057\16\00\A7\01\0D@\00$7,$\00\07\07\00\09\AB\04\0B&\0A\0B\E0\00$9,$\00\01\07\00W;\0Adiv \00\02\DE\08\05g\00\0Cx\03\131\F7\05)30D\06\1F1j\01\03)32j\01\00\EF\02\07j\00$3,<\00\01+\00\099\00\0AC\01\0B9\00(5,@\00\1949\00\0A<\01\0B9\00(7,@\00\1969\00\0A5\01\0B\15\01(9,@\00\0C^\0B\132\E6.\0Ap\07\07|\11\04\9FA3f64\1E\00\12d\F6\00)402\00&1,R\00\0C\B9\01\02i\10#149\00\1D1R\00\03 \00\1629\00\116\1C\\#3,\1E\00h0dBFB0\01\00\06]\02\02+\00#4,\9B\00Y0d3FE+\00\01\1A\00\02aW\03\95\00\035\00\020\07\01'\00\18]\BCF\154\01\18\04>\11\05\85\00\1F4\BCF\04\134\C1!.42Q\03\03\102#42Q\03\0B\A6E\144\A6E\1A6\9D\00\192\DC\0A\08\9E\00\04@\06\0C\9D\00\1F8\9D\00\06\04\FF\00\0D>2\134\E4!#42v\01\0BQX\134RX\04C\18\04\9D\00\193\B2\0B\08\0AT\04\B9\03\0A\8BG?432\9D\00\05\03\DE!.43B\06\03\DE!#43\E2\01+33\9D\00\03v\0A\09\C6'\06\DF\18\02\88\0C\08\9D\00\04\F7\02+24\9E\00\1F6\9E\00\06%7,\22\00\0C\9E\00%8,V\00\02+\00\0B\9E\00\04\BC\03\09\B4\0C+65>\0D\05\9D\00\05\CD\06\0B\B7S/40\9D\00\05\04\A4X\0F\A5X\00\03O\13\144\A7X\0C\A8X\03\A9X\01\C7\19\0F\EEj\14\1B2\08\04\0E\EFj\0F-\00\09\0E\F0j\0F5\00\17\1F15\00!\1F25\00!\1F35\00!\1F45\00!\0E\F5j\0F5\00\10\1F65\00!\0E\F7j\0F5\00\10\1F85\00!\1F9-k\13\227[\10\0E\0F-k\1A\1C7,k,43+k-81*k\1D6)k?191)k\16\0D\E8\00!E7:\22._c\13j\0FH\00\1B\106\F7%\0FG\00/211c\1B\00\0E\C1k\0FM\00 
\133M\00\0E\C4k\0FO\00!\09Ul\0F\F7j\08\1F7\F7j\18\0ENi\0F\8B\02\06\1E]4k\0F=\00\0E\0F5k\1A\0Eo\03\0F6k\18\0E\E1\03\0F7k\19\0ET\04\0F8k\19\0E\C7\04\0F9k\19\0E:\05\0F:k\19\0E\AD\05\0F;k\19\0E \06\0F\1F4,i\0C\1E6,i\0F\98\0C\16\0F/iV/72/i8\1F4hn\02\0A5f/415f\04\06dn/-1`i\02\02)%*42ci\1B7ci\137ci?7_1\DBf\02\04\12\02\0B\C3n\0B\DBf/44\A6\00\05\02qn\1F4\DBf\00$46\F7n\00'\00\09\11o#7, \00\09?o\1F8\F8\04\03\06\C0n\0A\18n&0,K\00\1C9\01n\141|\00\0A\DBf\0C\ACn\06Mn\0F\1C-\04\02s#\037\00\1F3\DBf\04/54\DBf.\1F5\DBf\0C\1F6\DBf\16\0FV\03\18\0F\DEf?/53\DEf7\1B5\C8\02\132\C8\02\1F2\9C]\05/55\E6\04\03.56\C6f\0F\D7k&\1F6\D7k\0C\0B\E6\04\0E%/\0F6\11\07\0F\C0k\03\02#\01\0B&/(62.l\0F\D9k\00\036\00\09\CB\19.64|l\0F\D8k/\08b\0B/55\C0k\03/56\85b\04$57\8Fq\0F\C0k\04\03\15r\1A7\C0k\1B7\C0k\137\C0k\1A7\C0k/67\E5\04\02/58\E3r\03\1F9\C3\09\06\03\1Cr\02Vr\199\A6k\02\EBn\1B0\85b/62\85b\04$63\96l\1F4\C8l\03\00S\00\0Ell\0F\C5\09\01&66\95\00/65pl\00\02T\00\09\DF\09\03ol\1F7\85b\02\03\1D\00\1F8\EAi.\177\EAi\0B\85b$71\AF\00\0E\EBi\0F%\03\12\0F\EEiV/72\EEi8\1B7\AF\02\134\AF\02\1F4\CC\04\05/79\CC\04\03\1E8?\0B\0F\12j'\1F8\12j\0C\1E6\12j\0F\BB\15\1B\0F\1BjV/72\1Bj8\1F8Wb\05/91\8F\07\04\03\BD\1F\1D9j\02(93\BD\00\0AZ\1F\1F9\8F\05\05\035\1F\1D9\8F\05\03/\0F\C6\02\05\01\BE'\0F#\09+\07\81/\09\A2E\09~/\0C\E1.*7,;\00\0A+1/485/\06%9,\22\00\0Cn-*0,\\\00\08\85\05$23n-/0]x.\04\1D3\EB6\0E`\01\07x5\1A4\A7.\1F1a\01\05\1F5\CF\05-\02\DB'\01J\00\0F\01/\04\1F1\01/\06\05;\00\0B\AE\00\0E\D7\05\0F\CE.-\185\CE.\0F\99.\02\0C\C2\0A\141\F7\05\08\8B?/71\D5e\0C%714\03\1654\03\1B9\\\00\138[\00\09=.\1F84\03\0C/181w\01/761w\01\0F_M\02\02c-+r8\0F@/28*\062\06\92g\0B\F05/30\C5g\06+31qg\0A]1/32\08\02\05\03\F3\10\1E1\F4\10\03\86<\07\E8g\1D3\FE1\01&\00\0Fh\03\04\191\97\03\1F0h\03\0E\0B\B2\00\0FA\11\05\01'.\0Fh\03+\05d5\1B6\0C\01\168N2\0C`\01\1B9\9F5\0AF:\0FI\11\06\03 
5.d1K\11\05$5\04\\\00/4162\01/42h\03\05\1C1h\03\04\0E\0C\08\FF\05\06/12\9F\06.\06sp\0B\89l\07pp\0C\DFp*5,;\00\0B\AF\00\0E?\05\0F\DDl-\185\DDl\0F7\03\02\1E5\A46\0F7\03\08/16$m\04\1E6$m\0F\9D\04\18\0F-mQ\0B\DF|\0B\A2\18\06\DF|\0F\80\05\01\02W\09\1F8\\m*'ld\EE7\04bm\0F\9C_\04\0C\AC\09$12\0F\03,1:\BBn\0F\0F\03\09\08\9C\19/73z\06\03\03Rz\0F\FDA\02\02}*\1A7\94@\1F9x\063\04\A8p\0BV\01\03\10-\1E9u\06*01\87p\0A\C2\05\1F0t\06\06\05\10u\0Et\06\03\8CD\06\FDp\1F39n\01\1F0t\06\06\1D16;\0F=\03\0A\0A=\0B/05\CCp\05\05\CAu\0E\1A\06*07\05\01\1F6:|\01/72:|\01\0F\E3@\03/088|\00\07\BCu\0Eq\04(0,\8A\00\01/m\0B$<\01&\00\0F\DC\02\05\1F3\DC\02\06/2:b\06\04\00\16i\0F\E4\09\03(14\BE\0C\1F6\1E)\03/15\1E)\04/16\E9\0F\05\079C\0E\CDB\07 +#include +#include +#include + +// includes, project +#include + +// includes, kernels +#include "srad_kernel.cu" + +void random_matrix(float *I, int rows, int cols); +void runTest(int argc, char **argv); +void usage(int argc, char **argv) { + fprintf(stderr, + "Usage: %s \n", + argv[0]); + fprintf(stderr, "\t - number of rows\n"); + fprintf(stderr, "\t - number of cols\n"); + fprintf(stderr, "\t - y1 value of the speckle\n"); + fprintf(stderr, "\t - y2 value of the speckle\n"); + fprintf(stderr, "\t - x1 value of the speckle\n"); + fprintf(stderr, "\t - x2 value of the speckle\n"); + fprintf(stderr, "\t - lambda (0,1)\n"); + fprintf(stderr, "\t - number of iterations\n"); + + exit(1); +} +//////////////////////////////////////////////////////////////////////////////// +// Program main +//////////////////////////////////////////////////////////////////////////////// +int main(int argc, char **argv) { + cudaSetDevice(0); + printf("WG size of kernel = %d X %d\n", BLOCK_SIZE, BLOCK_SIZE); + runTest(argc, argv); + + return EXIT_SUCCESS; +} + +void runTest(int argc, char **argv) { + int rows, cols, size_I, size_R, niter = 10, iter; + float *I, *J, lambda, q0sqr, sum, sum2, tmp, meanROI, varROI; + +#ifdef CPU + float Jc, G2, L, num, den, qsqr; + int *iN, *iS, *jE, *jW, 
k; + float *dN, *dS, *dW, *dE; + float cN, cS, cW, cE, D; +#endif + +#ifdef GPU + + float *J_cuda; + float *C_cuda; + float *E_C, *W_C, *N_C, *S_C; + +#endif + + unsigned int r1, r2, c1, c2; + float *c; + + if (argc == 9) { + rows = atoi(argv[1]); // number of rows in the domain + cols = atoi(argv[2]); // number of cols in the domain + if ((rows % 16 != 0) || (cols % 16 != 0)) { + fprintf(stderr, "rows and cols must be multiples of 16\n"); + exit(1); + } + r1 = atoi(argv[3]); // y1 position of the speckle + r2 = atoi(argv[4]); // y2 position of the speckle + c1 = atoi(argv[5]); // x1 position of the speckle + c2 = atoi(argv[6]); // x2 position of the speckle + lambda = atof(argv[7]); // Lambda value + niter = atoi(argv[8]); // number of iterations + + } else { + usage(argc, argv); + } + + size_I = cols * rows; + size_R = (r2 - r1 + 1) * (c2 - c1 + 1); + + I = (float *)malloc(size_I * sizeof(float)); + J = (float *)malloc(size_I * sizeof(float)); + c = (float *)malloc(sizeof(float) * size_I); + +#ifdef CPU + + iN = (int *)malloc(sizeof(unsigned int *) * rows); + iS = (int *)malloc(sizeof(unsigned int *) * rows); + jW = (int *)malloc(sizeof(unsigned int *) * cols); + jE = (int *)malloc(sizeof(unsigned int *) * cols); + + dN = (float *)malloc(sizeof(float) * size_I); + dS = (float *)malloc(sizeof(float) * size_I); + dW = (float *)malloc(sizeof(float) * size_I); + dE = (float *)malloc(sizeof(float) * size_I); + + for (int i = 0; i < rows; i++) { + iN[i] = i - 1; + iS[i] = i + 1; + } + for (int j = 0; j < cols; j++) { + jW[j] = j - 1; + jE[j] = j + 1; + } + iN[0] = 0; + iS[rows - 1] = rows - 1; + jW[0] = 0; + jE[cols - 1] = cols - 1; + +#endif + +#ifdef GPU + + // Allocate device memory + cudaMalloc((void **)&J_cuda, sizeof(float) * size_I); + cudaMalloc((void **)&C_cuda, sizeof(float) * size_I); + cudaMalloc((void **)&E_C, sizeof(float) * size_I); + cudaMalloc((void **)&W_C, sizeof(float) * size_I); + cudaMalloc((void **)&S_C, sizeof(float) * size_I); + 
cudaMalloc((void **)&N_C, sizeof(float) * size_I); + +#endif + + printf("Randomizing the input matrix\n"); + // Generate a random matrix + random_matrix(I, rows, cols); + + for (int k = 0; k < size_I; k++) { + J[k] = (float)exp(I[k]); + } + printf("Start the SRAD main loop\n"); + for (iter = 0; iter < niter; iter++) { + sum = 0; + sum2 = 0; + for (int i = r1; i <= r2; i++) { + for (int j = c1; j <= c2; j++) { + tmp = J[i * cols + j]; + sum += tmp; + sum2 += tmp * tmp; + } + } + meanROI = sum / size_R; + varROI = (sum2 / size_R) - meanROI * meanROI; + q0sqr = varROI / (meanROI * meanROI); + +#ifdef CPU + + for (int i = 0; i < rows; i++) { + for (int j = 0; j < cols; j++) { + + k = i * cols + j; + Jc = J[k]; + + // directional derivates + dN[k] = J[iN[i] * cols + j] - Jc; + dS[k] = J[iS[i] * cols + j] - Jc; + dW[k] = J[i * cols + jW[j]] - Jc; + dE[k] = J[i * cols + jE[j]] - Jc; + + G2 = (dN[k] * dN[k] + dS[k] * dS[k] + dW[k] * dW[k] + dE[k] * dE[k]) / + (Jc * Jc); + + L = (dN[k] + dS[k] + dW[k] + dE[k]) / Jc; + + num = (0.5 * G2) - ((1.0 / 16.0) * (L * L)); + den = 1 + (.25 * L); + qsqr = num / (den * den); + + // diffusion coefficent (equ 33) + den = (qsqr - q0sqr) / (q0sqr * (1 + q0sqr)); + c[k] = 1.0 / (1.0 + den); + + // saturate diffusion coefficent + if (c[k] < 0) { + c[k] = 0; + } else if (c[k] > 1) { + c[k] = 1; + } + } + } + for (int i = 0; i < rows; i++) { + for (int j = 0; j < cols; j++) { + + // current index + k = i * cols + j; + + // diffusion coefficent + cN = c[k]; + cS = c[iS[i] * cols + j]; + cW = c[k]; + cE = c[i * cols + jE[j]]; + + // divergence (equ 58) + D = cN * dN[k] + cS * dS[k] + cW * dW[k] + cE * dE[k]; + + // image update (equ 61) + J[k] = J[k] + 0.25 * lambda * D; + } + } + +#endif // CPU + +#ifdef GPU + + // Currently the input size must be divided by 16 - the block size + int block_x = cols / BLOCK_SIZE; + int block_y = rows / BLOCK_SIZE; + + dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE); + dim3 dimGrid(block_x, block_y); + + // Copy data from 
main memory to device memory + cudaMemcpy(J_cuda, J, sizeof(float) * size_I, cudaMemcpyHostToDevice); + + // Run kernels + srad_cuda_1<<>>(E_C, W_C, N_C, S_C, J_cuda, C_cuda, cols, + rows, q0sqr); + cudaThreadSynchronize(); + srad_cuda_2<<>>(E_C, W_C, N_C, S_C, J_cuda, C_cuda, cols, + rows, lambda, q0sqr); + cudaThreadSynchronize(); + + // Copy data from device memory to main memory + cudaMemcpy(J, J_cuda, sizeof(float) * size_I, cudaMemcpyDeviceToHost); + +#endif + } + + cudaThreadSynchronize(); + + //#ifdef OUTPUT + // Printing output + printf("Printing Output:\n"); + for (int i = 0; i < 20; i++) { + for (int j = 0; j < 20; j++) { + printf("%.5f ", J[i * cols + j]); + } + printf("\n"); + } + //#endif + + printf("Computation Done\n"); + + free(I); + free(J); +#ifdef CPU + free(iN); + free(iS); + free(jW); + free(jE); + free(dN); + free(dS); + free(dW); + free(dE); +#endif +#ifdef GPU + cudaFree(C_cuda); + cudaFree(J_cuda); + cudaFree(E_C); + cudaFree(W_C); + cudaFree(N_C); + cudaFree(S_C); +#endif + free(c); +} + +void random_matrix(float *I, int rows, int cols) { + + srand(7); + + for (int i = 0; i < rows; i++) { + for (int j = 0; j < cols; j++) { + I[i * cols + j] = rand() / (float)RAND_MAX; + } + } +} diff --git a/examples/srad_v2/srad.h b/examples/srad_v2/srad.h new file mode 100644 index 0000000..499c144 --- /dev/null +++ b/examples/srad_v2/srad.h @@ -0,0 +1,15 @@ +#define STR_SIZE 256 + +#ifdef RD_WG_SIZE_0_0 +#define BLOCK_SIZE RD_WG_SIZE_0_0 +#elif defined(RD_WG_SIZE_0) +#define BLOCK_SIZE RD_WG_SIZE_0 +#elif defined(RD_WG_SIZE) +#define BLOCK_SIZE RD_WG_SIZE +#else +#define BLOCK_SIZE 16 +#endif + +#define GPU +#define TIMER +//#define OUTPUT diff --git a/examples/srad_v2/srad_kernel.cu b/examples/srad_v2/srad_kernel.cu new file mode 100644 index 0000000..d124fd7 --- /dev/null +++ b/examples/srad_v2/srad_kernel.cu @@ -0,0 +1,257 @@ +#include "srad.h" +#include + +__global__ void +srad_cuda_1( + float *E_C, + float *W_C, + float *N_C, + float *S_C, + float 
* J_cuda, + float * C_cuda, + int cols, + int rows, + float q0sqr +) +{ + + //block id + int bx = blockIdx.x; + int by = blockIdx.y; + + //thread id + int tx = threadIdx.x; + int ty = threadIdx.y; + + //indices + int index = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * ty + tx; + int index_n = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + tx - cols; + int index_s = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * BLOCK_SIZE + tx; + int index_w = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * ty - 1; + int index_e = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * ty + BLOCK_SIZE; + + float n, w, e, s, jc, g2, l, num, den, qsqr, c; + + //shared memory allocation + __shared__ float temp[BLOCK_SIZE][BLOCK_SIZE]; + __shared__ float temp_result[BLOCK_SIZE][BLOCK_SIZE]; + + __shared__ float north[BLOCK_SIZE][BLOCK_SIZE]; + __shared__ float south[BLOCK_SIZE][BLOCK_SIZE]; + __shared__ float east[BLOCK_SIZE][BLOCK_SIZE]; + __shared__ float west[BLOCK_SIZE][BLOCK_SIZE]; + + //load data to shared memory + north[ty][tx] = J_cuda[index_n]; + south[ty][tx] = J_cuda[index_s]; + if ( by == 0 ){ + north[ty][tx] = J_cuda[BLOCK_SIZE * bx + tx]; + } + else if ( by == gridDim.y - 1 ){ + south[ty][tx] = J_cuda[cols * BLOCK_SIZE * (gridDim.y - 1) + BLOCK_SIZE * bx + cols * ( BLOCK_SIZE - 1 ) + tx]; + } + __syncthreads(); + + west[ty][tx] = J_cuda[index_w]; + east[ty][tx] = J_cuda[index_e]; + + if ( bx == 0 ){ + west[ty][tx] = J_cuda[cols * BLOCK_SIZE * by + cols * ty]; + } + else if ( bx == gridDim.x - 1 ){ + east[ty][tx] = J_cuda[cols * BLOCK_SIZE * by + BLOCK_SIZE * ( gridDim.x - 1) + cols * ty + BLOCK_SIZE-1]; + } + + __syncthreads(); + + + + temp[ty][tx] = J_cuda[index]; + + __syncthreads(); + + jc = temp[ty][tx]; + + if ( ty == 0 && tx == 0 ){ //nw + n = north[ty][tx] - jc; + s = temp[ty+1][tx] - jc; + w = west[ty][tx] - jc; + e = temp[ty][tx+1] - jc; + } + else if ( ty == 0 && tx == BLOCK_SIZE-1 ){ //ne + n = north[ty][tx] - jc; + s = temp[ty+1][tx] - jc; + w = 
temp[ty][tx-1] - jc; + e = east[ty][tx] - jc; + } + else if ( ty == BLOCK_SIZE -1 && tx == BLOCK_SIZE - 1){ //se + n = temp[ty-1][tx] - jc; + s = south[ty][tx] - jc; + w = temp[ty][tx-1] - jc; + e = east[ty][tx] - jc; + } + else if ( ty == BLOCK_SIZE -1 && tx == 0 ){//sw + n = temp[ty-1][tx] - jc; + s = south[ty][tx] - jc; + w = west[ty][tx] - jc; + e = temp[ty][tx+1] - jc; + } + + else if ( ty == 0 ){ //n + n = north[ty][tx] - jc; + s = temp[ty+1][tx] - jc; + w = temp[ty][tx-1] - jc; + e = temp[ty][tx+1] - jc; + } + else if ( tx == BLOCK_SIZE -1 ){ //e + n = temp[ty-1][tx] - jc; + s = temp[ty+1][tx] - jc; + w = temp[ty][tx-1] - jc; + e = east[ty][tx] - jc; + } + else if ( ty == BLOCK_SIZE -1){ //s + n = temp[ty-1][tx] - jc; + s = south[ty][tx] - jc; + w = temp[ty][tx-1] - jc; + e = temp[ty][tx+1] - jc; + } + else if ( tx == 0 ){ //w + n = temp[ty-1][tx] - jc; + s = temp[ty+1][tx] - jc; + w = west[ty][tx] - jc; + e = temp[ty][tx+1] - jc; + } + else{ //the data elements which are not on the borders + n = temp[ty-1][tx] - jc; + s = temp[ty+1][tx] - jc; + w = temp[ty][tx-1] - jc; + e = temp[ty][tx+1] - jc; + } + + + g2 = ( n * n + s * s + w * w + e * e ) / (jc * jc); + + l = ( n + s + w + e ) / jc; + + num = (0.5*g2) - ((1.0/16.0)*(l*l)) ; + den = 1 + (.25*l); + qsqr = num/(den*den); + + // diffusion coefficent (equ 33) + den = (qsqr-q0sqr) / (q0sqr * (1+q0sqr)) ; + c = 1.0 / (1.0+den) ; + + // saturate diffusion coefficent + if (c < 0){temp_result[ty][tx] = 0;} + else if (c > 1) {temp_result[ty][tx] = 1;} + else {temp_result[ty][tx] = c;} + + __syncthreads(); + + C_cuda[index] = temp_result[ty][tx]; + E_C[index] = e; + W_C[index] = w; + S_C[index] = s; + N_C[index] = n; + +} + +__global__ void +srad_cuda_2( + float *E_C, + float *W_C, + float *N_C, + float *S_C, + float * J_cuda, + float * C_cuda, + int cols, + int rows, + float lambda, + float q0sqr +) +{ + //block id + int bx = blockIdx.x; + int by = blockIdx.y; + + //thread id + int tx = threadIdx.x; + int ty = 
threadIdx.y; + + //indices + int index = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * ty + tx; + int index_s = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * BLOCK_SIZE + tx; + int index_e = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * ty + BLOCK_SIZE; + float cc, cn, cs, ce, cw, d_sum; + + //shared memory allocation + __shared__ float south_c[BLOCK_SIZE][BLOCK_SIZE]; + __shared__ float east_c[BLOCK_SIZE][BLOCK_SIZE]; + + __shared__ float c_cuda_temp[BLOCK_SIZE][BLOCK_SIZE]; + __shared__ float c_cuda_result[BLOCK_SIZE][BLOCK_SIZE]; + __shared__ float temp[BLOCK_SIZE][BLOCK_SIZE]; + + //load data to shared memory + temp[ty][tx] = J_cuda[index]; + + __syncthreads(); + + south_c[ty][tx] = C_cuda[index_s]; + + if ( by == gridDim.y - 1 ){ + south_c[ty][tx] = C_cuda[cols * BLOCK_SIZE * (gridDim.y - 1) + BLOCK_SIZE * bx + cols * ( BLOCK_SIZE - 1 ) + tx]; + } + __syncthreads(); + + + east_c[ty][tx] = C_cuda[index_e]; + + if ( bx == gridDim.x - 1 ){ + east_c[ty][tx] = C_cuda[cols * BLOCK_SIZE * by + BLOCK_SIZE * ( gridDim.x - 1) + cols * ty + BLOCK_SIZE-1]; + } + + __syncthreads(); + + c_cuda_temp[ty][tx] = C_cuda[index]; + + __syncthreads(); + + cc = c_cuda_temp[ty][tx]; + + if ( ty == BLOCK_SIZE -1 && tx == BLOCK_SIZE - 1){ //se + cn = cc; + cs = south_c[ty][tx]; + cw = cc; + ce = east_c[ty][tx]; + } + else if ( tx == BLOCK_SIZE -1 ){ //e + cn = cc; + cs = c_cuda_temp[ty+1][tx]; + cw = cc; + ce = east_c[ty][tx]; + } + else if ( ty == BLOCK_SIZE -1){ //s + cn = cc; + cs = south_c[ty][tx]; + cw = cc; + ce = c_cuda_temp[ty][tx+1]; + } + else{ //the data elements which are not on the borders + cn = cc; + cs = c_cuda_temp[ty+1][tx]; + cw = cc; + ce = c_cuda_temp[ty][tx+1]; + } + + // divergence (equ 58) + d_sum = cn * N_C[index] + cs * S_C[index] + cw * W_C[index] + ce * E_C[index]; + + // image update (equ 61) + c_cuda_result[ty][tx] = temp[ty][tx] + 0.25 * lambda * d_sum; + + __syncthreads(); + + J_cuda[index] = c_cuda_result[ty][tx]; + +} diff --git 
a/examples/streamcluster/run.sh b/examples/streamcluster/run.sh new file mode 100644 index 0000000..0e4db2e --- /dev/null +++ b/examples/streamcluster/run.sh @@ -0,0 +1,18 @@ +#!/bin/bash +set -e +llvm-as streamcluster_cuda_cpu-cuda-nvptx64-nvidia-cuda-sm_61.ll +llvm-as streamcluster_cuda_cpu-host-x86_64-unknown-linux-gnu.ll +../../build/compilation/kernelTranslator streamcluster_cuda_cpu-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc +../../build/compilation/hostTranslator streamcluster_cuda_cpu-host-x86_64-unknown-linux-gnu.bc host.bc +llc --relocation-model=pic --filetype=obj kernel.bc +llc --relocation-model=pic --filetype=obj host.bc + +g++ -Wall -L../../build/runtime -L../../build/runtime/threadPool -o sc_gpu -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread +export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH +./sc_gpu 10 20 256 32 32 1000 none output.txt 1 +if grep -q -e "0.966199 0.918044 0.348125" output.txt; then + echo "Pass" +else + echo "Error result" + exit 1 +fi diff --git a/examples/streamcluster/streamcluster_cuda.cu b/examples/streamcluster/streamcluster_cuda.cu new file mode 100644 index 0000000..42465da --- /dev/null +++ b/examples/streamcluster/streamcluster_cuda.cu @@ -0,0 +1,363 @@ +/*********************************************** + streamcluster_cuda.cu + : parallelized code of streamcluster + + - original code from PARSEC Benchmark Suite + - parallelization with CUDA API has been applied by + + Shawn Sang-Ha Lee - sl4ge@virginia.edu + University of Virginia + Department of Electrical and Computer Engineering + Department of Computer Science + +***********************************************/ +#include "streamcluster_header.h" + +using namespace std; + +// AUTO-ERROR CHECK FOR ALL CUDA FUNCTIONS +#define CUDA_SAFE_CALL(call) \ + do { \ + cudaError err = call; \ + if (cudaSuccess != err) { \ + fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", __FILE__, \ + __LINE__, 
cudaGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#define THREADS_PER_BLOCK 512 +#define MAXBLOCKS 65536 + +// host memory +float *work_mem_h; +float *coord_h; + +// device memory +float *work_mem_d; +float *coord_d; +int *center_table_d; +bool *switch_membership_d; +Point *p; + +static int iter = 0; // counter for total# of iteration + +//======================================= +// Euclidean Distance +//======================================= +__device__ float d_dist(int p1, int p2, int num, int dim, float *coord_d) { + float retval = 0.0; + for (int i = 0; i < dim; i++) { + float tmp = coord_d[(i * num) + p1] - coord_d[(i * num) + p2]; + retval += tmp * tmp; + } + return retval; +} + +//======================================= +// Kernel - Compute Cost +//======================================= +__global__ void kernel_compute_cost(int num, int dim, long x, Point *p, int K, + int stride, float *coord_d, + float *work_mem_d, int *center_table_d, + bool *switch_membership_d) { + // block ID and global thread ID + const int bid = blockIdx.x + gridDim.x * blockIdx.y; + const int tid = blockDim.x * bid + threadIdx.x; + + if (tid < num) { + float *lower = &work_mem_d[tid * stride]; + + // cost between this point and point[x]: euclidean distance multiplied by + // weight + float x_cost = d_dist(tid, x, num, dim, coord_d) * p[tid].weight; + + // if computed cost is less then original (it saves), mark it as to reassign + if (x_cost < p[tid].cost) { + switch_membership_d[tid] = 1; + lower[K] += x_cost - p[tid].cost; + } + // if computed cost is larger, save the difference + else { + lower[center_table_d[p[tid].assign]] += p[tid].cost - x_cost; + } + } +} + +//======================================= +// Allocate Device Memory +//======================================= +void allocDevMem(int num, int dim) { + CUDA_SAFE_CALL(cudaMalloc((void **)¢er_table_d, num * sizeof(int))); + CUDA_SAFE_CALL(cudaMalloc((void **)&switch_membership_d, num * 
sizeof(bool))); + CUDA_SAFE_CALL(cudaMalloc((void **)&p, num * sizeof(Point))); + CUDA_SAFE_CALL(cudaMalloc((void **)&coord_d, num * dim * sizeof(float))); +} + +//======================================= +// Allocate Host Memory +//======================================= +void allocHostMem(int num, int dim) { + coord_h = (float *)malloc(num * dim * sizeof(float)); +} + +//======================================= +// Free Device Memory +//======================================= +void freeDevMem() { + CUDA_SAFE_CALL(cudaFree(center_table_d)); + CUDA_SAFE_CALL(cudaFree(switch_membership_d)); + CUDA_SAFE_CALL(cudaFree(p)); + CUDA_SAFE_CALL(cudaFree(coord_d)); +} + +//======================================= +// Free Host Memory +//======================================= +void freeHostMem() { free(coord_h); } + +//======================================= +// pgain Entry - CUDA SETUP + CUDA CALL +//======================================= +float pgain(long x, Points *points, float z, long int *numcenters, int kmax, + bool *is_center, int *center_table, bool *switch_membership, + bool isCoordChanged, double *serial_t, double *cpu_to_gpu_t, + double *gpu_to_cpu_t, double *alloc_t, double *kernel_t, + double *free_t) { +#ifdef CUDATIME + float tmp_t; + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start, 0); +#endif + + cudaError_t error; + + int stride = *numcenters + 1; // size of each work_mem segment + int K = *numcenters; // number of centers + int num = points->num; // number of points + int dim = points->dim; // number of dimension + int nThread = num; // number of threads == number of data points + + //========================================= + // ALLOCATE HOST MEMORY + DATA PREPARATION + //========================================= + work_mem_h = (float *)malloc(stride * (nThread + 1) * sizeof(float)); + // Only on the first iteration + if (iter == 0) { + allocHostMem(num, dim); + } + + // build center-index table + 
int count = 0; + for (int i = 0; i < num; i++) { + if (is_center[i]) { + center_table[i] = count++; + } + } + + // Extract 'coord' + // Only if first iteration OR coord has changed + if (isCoordChanged || iter == 0) { + for (int i = 0; i < dim; i++) { + for (int j = 0; j < num; j++) { + coord_h[(num * i) + j] = points->p[j].coord[i]; + } + } + } + +#ifdef CUDATIME + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&tmp_t, start, stop); + *serial_t += (double)tmp_t; + + cudaEventRecord(start, 0); +#endif + + //======================================= + // ALLOCATE GPU MEMORY + //======================================= + CUDA_SAFE_CALL( + cudaMalloc((void **)&work_mem_d, stride * (nThread + 1) * sizeof(float))); + // Only on the first iteration + if (iter == 0) { + allocDevMem(num, dim); + } + +#ifdef CUDATIME + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&tmp_t, start, stop); + *alloc_t += (double)tmp_t; + + cudaEventRecord(start, 0); +#endif + + //======================================= + // CPU-TO-GPU MEMORY COPY + //======================================= + // Only if first iteration OR coord has changed + if (isCoordChanged || iter == 0) { + CUDA_SAFE_CALL(cudaMemcpy(coord_d, coord_h, num * dim * sizeof(float), + cudaMemcpyHostToDevice)); + } + CUDA_SAFE_CALL(cudaMemcpy(center_table_d, center_table, num * sizeof(int), + cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL( + cudaMemcpy(p, points->p, num * sizeof(Point), cudaMemcpyHostToDevice)); + + CUDA_SAFE_CALL( + cudaMemset((void *)switch_membership_d, 0, num * sizeof(bool))); + CUDA_SAFE_CALL(cudaMemset((void *)work_mem_d, 0, + stride * (nThread + 1) * sizeof(float))); + +#ifdef CUDATIME + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&tmp_t, start, stop); + *cpu_to_gpu_t += (double)tmp_t; + + cudaEventRecord(start, 0); +#endif + + //======================================= + // KERNEL: CALCULATE COST + 
//======================================= + // Determine the number of thread blocks in the x- and y-dimension + int num_blocks = + (int)((float)(num + THREADS_PER_BLOCK - 1) / (float)THREADS_PER_BLOCK); + int num_blocks_y = + (int)((float)(num_blocks + MAXBLOCKS - 1) / (float)MAXBLOCKS); + int num_blocks_x = + (int)((float)(num_blocks + num_blocks_y - 1) / (float)num_blocks_y); + dim3 grid_size(num_blocks_x, num_blocks_y, 1); + + kernel_compute_cost<<>>( + num, // in: # of data + dim, // in: dimension of point coordinates + x, // in: point to open a center at + p, // in: data point array + K, // in: number of centers + stride, // in: size of each work_mem segment + coord_d, // in: array of point coordinates + work_mem_d, // out: cost and lower field array + center_table_d, // in: center index table + switch_membership_d // out: changes in membership + ); + cudaThreadSynchronize(); + + // error check + error = cudaGetLastError(); + if (error != cudaSuccess) { + printf("kernel error: %s\n", cudaGetErrorString(error)); + exit(EXIT_FAILURE); + } + +#ifdef CUDATIME + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&tmp_t, start, stop); + *kernel_t += (double)tmp_t; + + cudaEventRecord(start, 0); +#endif + + //======================================= + // GPU-TO-CPU MEMORY COPY + //======================================= + CUDA_SAFE_CALL(cudaMemcpy(work_mem_h, work_mem_d, + stride * (nThread + 1) * sizeof(float), + cudaMemcpyDeviceToHost)); + CUDA_SAFE_CALL(cudaMemcpy(switch_membership, switch_membership_d, + num * sizeof(bool), cudaMemcpyDeviceToHost)); + +#ifdef CUDATIME + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&tmp_t, start, stop); + *gpu_to_cpu_t += (double)tmp_t; + + cudaEventRecord(start, 0); +#endif + + //======================================= + // CPU (SERIAL) WORK + //======================================= + int number_of_centers_to_close = 0; + float gl_cost_of_opening_x = z; + float 
*gl_lower = &work_mem_h[stride * nThread]; + // compute the number of centers to close if we are to open i + for (int i = 0; i < num; i++) { + if (is_center[i]) { + float low = z; + for (int j = 0; j < num; j++) { + low += work_mem_h[j * stride + center_table[i]]; + } + + gl_lower[center_table[i]] = low; + + if (low > 0) { + ++number_of_centers_to_close; + work_mem_h[i * stride + K] -= low; + } + } + gl_cost_of_opening_x += work_mem_h[i * stride + K]; + } + + // if opening a center at x saves cost (i.e. cost is negative) do so; + // otherwise, do nothing + if (gl_cost_of_opening_x < 0) { + for (int i = 0; i < num; i++) { + bool close_center = gl_lower[center_table[points->p[i].assign]] > 0; + if (switch_membership[i] || close_center) { + points->p[i].cost = + dist(points->p[i], points->p[x], dim) * points->p[i].weight; + points->p[i].assign = x; + } + } + + for (int i = 0; i < num; i++) { + if (is_center[i] && gl_lower[center_table[i]] > 0) { + is_center[i] = false; + } + } + + if (x >= 0 && x < num) { + is_center[x] = true; + } + *numcenters = *numcenters + 1 - number_of_centers_to_close; + } else { + gl_cost_of_opening_x = 0; + } + + //======================================= + // DEALLOCATE HOST MEMORY + //======================================= + free(work_mem_h); + +#ifdef CUDATIME + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&tmp_t, start, stop); + *serial_t += (double)tmp_t; + + cudaEventRecord(start, 0); +#endif + + //======================================= + // DEALLOCATE GPU MEMORY + //======================================= + CUDA_SAFE_CALL(cudaFree(work_mem_d)); + +#ifdef CUDATIME + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&tmp_t, start, stop); + *free_t += (double)tmp_t; +#endif + iter++; + return -gl_cost_of_opening_x; +} diff --git a/examples/streamcluster/streamcluster_cuda_cpu-cuda-nvptx64-nvidia-cuda-sm_61.ll 
b/examples/streamcluster/streamcluster_cuda_cpu-cuda-nvptx64-nvidia-cuda-sm_61.ll new file mode 100644 index 0000000..6c0306b --- /dev/null +++ b/examples/streamcluster/streamcluster_cuda_cpu-cuda-nvptx64-nvidia-cuda-sm_61.ll @@ -0,0 +1,366 @@ +; ModuleID = 'streamcluster_cuda_cpu-cuda-nvptx64-nvidia-cuda-sm_61.bc' +source_filename = "streamcluster_cuda_cpu.cu" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +%struct.__cuda_builtin_blockIdx_t = type { i8 } +%struct.__cuda_builtin_gridDim_t = type { i8 } +%struct.__cuda_builtin_blockDim_t = type { i8 } +%struct.__cuda_builtin_threadIdx_t = type { i8 } +%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 } +%struct.Point = type { float, float*, i64, float } + +$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any + +$_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv = comdat any + +$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv = comdat any + +$_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv = comdat any + +$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any + +@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1 +@gridDim = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_gridDim_t, align 1 +@blockDim = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockDim_t, align 1 +@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1 + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 { +entry: + %p.addr = alloca i8**, align 8 + %s.addr = alloca i64, align 8 + store i8** %p, i8*** %p.addr, align 8 + store i64 %s, i64* %s.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 { +entry: + 
%p.addr = alloca %struct.cudaFuncAttributes*, align 8 + %c.addr = alloca i8*, align 8 + store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8 + store i8* %c, i8** %c.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 { +entry: + %value.addr = alloca i32*, align 8 + %attr.addr = alloca i32, align 4 + %device.addr = alloca i32, align 4 + store i32* %value, i32** %value.addr, align 8 + store i32 %attr, i32* %attr.addr, align 4 + store i32 %device, i32* %device.addr, align 4 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaGetDevice(i32* %device) #0 { +entry: + %device.addr = alloca i32*, align 8 + store i32* %device, i32** %device.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 { +entry: + %numBlocks.addr = alloca i32*, align 8 + %func.addr = alloca i8*, align 8 + %blockSize.addr = alloca i32, align 4 + %dynamicSmemSize.addr = alloca i64, align 8 + store i32* %numBlocks, i32** %numBlocks.addr, align 8 + store i8* %func, i8** %func.addr, align 8 + store i32 %blockSize, i32* %blockSize.addr, align 4 + store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 { +entry: + %numBlocks.addr = alloca i32*, align 8 + %func.addr = alloca i8*, align 8 + %blockSize.addr = alloca i32, align 4 + %dynamicSmemSize.addr = alloca i64, align 8 + %flags.addr = alloca i32, align 4 + store i32* %numBlocks, i32** %numBlocks.addr, align 8 + store i8* %func, 
i8** %func.addr, align 8 + store i32 %blockSize, i32* %blockSize.addr, align 4 + store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8 + store i32 %flags, i32* %flags.addr, align 4 + ret i32 999 +} + +; Function Attrs: convergent noinline nounwind optnone +define dso_local float @_Z6d_distiiiiPf(i32 %p1, i32 %p2, i32 %num, i32 %dim, float* %coord_d) #0 { +entry: + %p1.addr = alloca i32, align 4 + %p2.addr = alloca i32, align 4 + %num.addr = alloca i32, align 4 + %dim.addr = alloca i32, align 4 + %coord_d.addr = alloca float*, align 8 + %retval1 = alloca float, align 4 + %i = alloca i32, align 4 + %tmp = alloca float, align 4 + store i32 %p1, i32* %p1.addr, align 4 + store i32 %p2, i32* %p2.addr, align 4 + store i32 %num, i32* %num.addr, align 4 + store i32 %dim, i32* %dim.addr, align 4 + store float* %coord_d, float** %coord_d.addr, align 8 + store float 0.000000e+00, float* %retval1, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %1 = load i32, i32* %dim.addr, align 4 + %cmp = icmp slt i32 %0, %1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %2 = load float*, float** %coord_d.addr, align 8 + %3 = load i32, i32* %i, align 4 + %4 = load i32, i32* %num.addr, align 4 + %mul = mul nsw i32 %3, %4 + %5 = load i32, i32* %p1.addr, align 4 + %add = add nsw i32 %mul, %5 + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds float, float* %2, i64 %idxprom + %6 = load float, float* %arrayidx, align 4 + %7 = load float*, float** %coord_d.addr, align 8 + %8 = load i32, i32* %i, align 4 + %9 = load i32, i32* %num.addr, align 4 + %mul2 = mul nsw i32 %8, %9 + %10 = load i32, i32* %p2.addr, align 4 + %add3 = add nsw i32 %mul2, %10 + %idxprom4 = sext i32 %add3 to i64 + %arrayidx5 = getelementptr inbounds float, float* %7, i64 %idxprom4 + %11 = load float, float* %arrayidx5, align 4 + %sub = fsub contract float %6, %11 + store float 
%sub, float* %tmp, align 4 + %12 = load float, float* %tmp, align 4 + %13 = load float, float* %tmp, align 4 + %mul6 = fmul contract float %12, %13 + %14 = load float, float* %retval1, align 4 + %add7 = fadd contract float %14, %mul6 + store float %add7, float* %retval1, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %15 = load i32, i32* %i, align 4 + %inc = add nsw i32 %15, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %16 = load float, float* %retval1, align 4 + ret float %16 +} + +; Function Attrs: convergent noinline nounwind optnone +define dso_local void @_Z19kernel_compute_costiilP5PointiiPfS1_PiPb(i32 %num, i32 %dim, i64 %x, %struct.Point* %p, i32 %K, i32 %stride, float* %coord_d, float* %work_mem_d, i32* %center_table_d, i8* %switch_membership_d) #0 { +entry: + %num.addr = alloca i32, align 4 + %dim.addr = alloca i32, align 4 + %x.addr = alloca i64, align 8 + %p.addr = alloca %struct.Point*, align 8 + %K.addr = alloca i32, align 4 + %stride.addr = alloca i32, align 4 + %coord_d.addr = alloca float*, align 8 + %work_mem_d.addr = alloca float*, align 8 + %center_table_d.addr = alloca i32*, align 8 + %switch_membership_d.addr = alloca i8*, align 8 + %bid = alloca i32, align 4 + %tid = alloca i32, align 4 + %lower = alloca float*, align 8 + %x_cost = alloca float, align 4 + store i32 %num, i32* %num.addr, align 4 + store i32 %dim, i32* %dim.addr, align 4 + store i64 %x, i64* %x.addr, align 8 + store %struct.Point* %p, %struct.Point** %p.addr, align 8 + store i32 %K, i32* %K.addr, align 4 + store i32 %stride, i32* %stride.addr, align 4 + store float* %coord_d, float** %coord_d.addr, align 8 + store float* %work_mem_d, float** %work_mem_d.addr, align 8 + store i32* %center_table_d, i32** %center_table_d.addr, align 8 + store i8* %switch_membership_d, i8** %switch_membership_d.addr, align 8 + %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3 + %call1 = call i32 
@_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #3 + %call2 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #3 + %mul = mul i32 %call1, %call2 + %add = add i32 %call, %mul + store i32 %add, i32* %bid, align 4 + %call3 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3 + %0 = load i32, i32* %bid, align 4 + %mul4 = mul i32 %call3, %0 + %call5 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3 + %add6 = add i32 %mul4, %call5 + store i32 %add6, i32* %tid, align 4 + %1 = load i32, i32* %tid, align 4 + %2 = load i32, i32* %num.addr, align 4 + %cmp = icmp slt i32 %1, %2 + br i1 %cmp, label %if.then, label %if.end34 + +if.then: ; preds = %entry + %3 = load float*, float** %work_mem_d.addr, align 8 + %4 = load i32, i32* %tid, align 4 + %5 = load i32, i32* %stride.addr, align 4 + %mul7 = mul nsw i32 %4, %5 + %idxprom = sext i32 %mul7 to i64 + %arrayidx = getelementptr inbounds float, float* %3, i64 %idxprom + store float* %arrayidx, float** %lower, align 8 + %6 = load i32, i32* %tid, align 4 + %7 = load i64, i64* %x.addr, align 8 + %conv = trunc i64 %7 to i32 + %8 = load i32, i32* %num.addr, align 4 + %9 = load i32, i32* %dim.addr, align 4 + %10 = load float*, float** %coord_d.addr, align 8 + %call8 = call float @_Z6d_distiiiiPf(i32 %6, i32 %conv, i32 %8, i32 %9, float* %10) #3 + %11 = load %struct.Point*, %struct.Point** %p.addr, align 8 + %12 = load i32, i32* %tid, align 4 + %idxprom9 = sext i32 %12 to i64 + %arrayidx10 = getelementptr inbounds %struct.Point, %struct.Point* %11, i64 %idxprom9 + %weight = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx10, i32 0, i32 0 + %13 = load float, float* %weight, align 8 + %mul11 = fmul contract float %call8, %13 + store float %mul11, float* %x_cost, align 4 + %14 = load float, float* %x_cost, align 4 + %15 = load %struct.Point*, %struct.Point** %p.addr, align 8 + %16 = load i32, i32* %tid, align 4 + %idxprom12 = sext i32 %16 to i64 + %arrayidx13 = 
getelementptr inbounds %struct.Point, %struct.Point* %15, i64 %idxprom12 + %cost = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx13, i32 0, i32 3 + %17 = load float, float* %cost, align 8 + %cmp14 = fcmp olt float %14, %17 + br i1 %cmp14, label %if.then15, label %if.else + +if.then15: ; preds = %if.then + %18 = load i8*, i8** %switch_membership_d.addr, align 8 + %19 = load i32, i32* %tid, align 4 + %idxprom16 = sext i32 %19 to i64 + %arrayidx17 = getelementptr inbounds i8, i8* %18, i64 %idxprom16 + store i8 1, i8* %arrayidx17, align 1 + %20 = load float, float* %x_cost, align 4 + %21 = load %struct.Point*, %struct.Point** %p.addr, align 8 + %22 = load i32, i32* %tid, align 4 + %idxprom18 = sext i32 %22 to i64 + %arrayidx19 = getelementptr inbounds %struct.Point, %struct.Point* %21, i64 %idxprom18 + %cost20 = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx19, i32 0, i32 3 + %23 = load float, float* %cost20, align 8 + %sub = fsub contract float %20, %23 + %24 = load float*, float** %lower, align 8 + %25 = load i32, i32* %K.addr, align 4 + %idxprom21 = sext i32 %25 to i64 + %arrayidx22 = getelementptr inbounds float, float* %24, i64 %idxprom21 + %26 = load float, float* %arrayidx22, align 4 + %add23 = fadd contract float %26, %sub + store float %add23, float* %arrayidx22, align 4 + br label %if.end + +if.else: ; preds = %if.then + %27 = load %struct.Point*, %struct.Point** %p.addr, align 8 + %28 = load i32, i32* %tid, align 4 + %idxprom24 = sext i32 %28 to i64 + %arrayidx25 = getelementptr inbounds %struct.Point, %struct.Point* %27, i64 %idxprom24 + %cost26 = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx25, i32 0, i32 3 + %29 = load float, float* %cost26, align 8 + %30 = load float, float* %x_cost, align 4 + %sub27 = fsub contract float %29, %30 + %31 = load float*, float** %lower, align 8 + %32 = load i32*, i32** %center_table_d.addr, align 8 + %33 = load %struct.Point*, %struct.Point** %p.addr, align 8 + %34 = load 
i32, i32* %tid, align 4 + %idxprom28 = sext i32 %34 to i64 + %arrayidx29 = getelementptr inbounds %struct.Point, %struct.Point* %33, i64 %idxprom28 + %assign = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx29, i32 0, i32 2 + %35 = load i64, i64* %assign, align 8 + %arrayidx30 = getelementptr inbounds i32, i32* %32, i64 %35 + %36 = load i32, i32* %arrayidx30, align 4 + %idxprom31 = sext i32 %36 to i64 + %arrayidx32 = getelementptr inbounds float, float* %31, i64 %idxprom31 + %37 = load float, float* %arrayidx32, align 4 + %add33 = fadd contract float %37, %sub27 + store float %add33, float* %arrayidx32, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then15 + br label %if.end34 + +if.end34: ; preds = %if.end, %entry + ret void +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() + ret i32 %0 +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() + ret i32 %0 +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() + ret i32 %0 +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + ret i32 %0 +} + +; Function Attrs: alwaysinline convergent nounwind +define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + ret i32 %0 +} + +; Function Attrs: nounwind 
readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 + +attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind readnone } +attributes #3 = { convergent nounwind } + +!llvm.module.flags = !{!0, !1, !2} +!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6} +!llvm.ident = !{!8} +!nvvmir.version = !{!9} + +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0} +!3 = !{void (i32, i32, i64, %struct.Point*, i32, i32, float*, float*, i32*, i8*)* @_Z19kernel_compute_costiilP5PointiiPfS1_PiPb, !"kernel", i32 1} +!4 = !{null, !"align", i32 8} +!5 = !{null, !"align", i32 8, !"align", i32 
65544, !"align", i32 131080} +!6 = !{null, !"align", i32 16} +!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"} +!9 = !{i32 1, i32 4} diff --git a/examples/streamcluster/streamcluster_cuda_cpu-host-x86_64-unknown-linux-gnu.ll b/examples/streamcluster/streamcluster_cuda_cpu-host-x86_64-unknown-linux-gnu.ll new file mode 100644 index 0000000..e7bc5f9 --- /dev/null +++ b/examples/streamcluster/streamcluster_cuda_cpu-host-x86_64-unknown-linux-gnu.ll @@ -0,0 +1,5115 @@ +; ModuleID = 'streamcluster_cuda_cpu-host-x86_64-unknown-linux-gnu.bc' +source_filename = "streamcluster_cuda_cpu.cu" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%"class.std::ios_base::Init" = type { i8 } +%struct.Point = type { float, float*, i64, float } +%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] } +%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 } +%struct.dim3 = type { i32, i32, i32 } +%struct.CUstream_st = type opaque +%struct.Points = type { i64, i32, %struct.Point* } +%struct.timeval = type { i64, i64 } +%struct.timezone = type { i32, i32 } +%union.pthread_barrier_t = type { i64, [24 x i8] } +%struct.pkmedian_arg_t = type { %struct.Points*, i64, i64, i64*, i32, %union.pthread_barrier_t* } +%class.PStream = type { i32 (...)** } +%class.SimStream = type { %class.PStream, i64 } +%class.FileStream = type { %class.PStream, %struct._IO_FILE* } + +$_ZN4dim3C2Ejjj = comdat any + +$_ZSt3logf = comdat any + +$_ZN9SimStreamC2El = comdat any + +$_ZN10FileStreamC2EPc = comdat any + +$_ZN7PStreamC2Ev = comdat any + +$_ZN9SimStream4readEPfii = comdat any + +$_ZN9SimStream6ferrorEv = comdat 
any + +$_ZN9SimStream4feofEv = comdat any + +$_ZN9SimStreamD2Ev = comdat any + +$_ZN9SimStreamD0Ev = comdat any + +$_ZN7PStreamD2Ev = comdat any + +$_ZN7PStreamD0Ev = comdat any + +$__clang_call_terminate = comdat any + +$_ZN10FileStream4readEPfii = comdat any + +$_ZN10FileStream6ferrorEv = comdat any + +$_ZN10FileStream4feofEv = comdat any + +$_ZN10FileStreamD2Ev = comdat any + +$_ZN10FileStreamD0Ev = comdat any + +$_ZTV9SimStream = comdat any + +$_ZTS9SimStream = comdat any + +$_ZTS7PStream = comdat any + +$_ZTI7PStream = comdat any + +$_ZTI9SimStream = comdat any + +$_ZTV7PStream = comdat any + +$_ZTV10FileStream = comdat any + +$_ZTS10FileStream = comdat any + +$_ZTI10FileStream = comdat any + +@_ZStL8__ioinit = internal global %"class.std::ios_base::Init" zeroinitializer, align 1 +@__dso_handle = external hidden global i8 +@work_mem_h = dso_local global float* null, align 8 +@coord_h = dso_local global float* null, align 8 +@work_mem_d = dso_local global float* null, align 8 +@coord_d = dso_local global float* null, align 8 +@center_table_d = dso_local global i32* null, align 8 +@switch_membership_d = dso_local global i8* null, align 8 +@p = dso_local global %struct.Point* null, align 8 +@stderr = external dso_local global %struct._IO_FILE*, align 8 +@.str = private unnamed_addr constant [42 x i8] c"Cuda error in file '%s' in line %i : %s.\0A\00", align 1 +@.str.1 = private unnamed_addr constant [24 x i8] c"./streamcluster_cuda.cu\00", align 1 +@_ZL4iter = internal global i32 0, align 4 +@.str.2 = private unnamed_addr constant [18 x i8] c"kernel error: %s\0A\00", align 1 +@isCoordChanged = dso_local global i8 0, align 1 +@serial_t = dso_local global double 0.000000e+00, align 8 +@cpu_to_gpu_t = dso_local global double 0.000000e+00, align 8 +@gpu_to_cpu_t = dso_local global double 0.000000e+00, align 8 +@alloc_t = dso_local global double 0.000000e+00, align 8 +@kernel_t = dso_local global double 0.000000e+00, align 8 +@free_t = dso_local global double 
0.000000e+00, align 8 +@time_local_search = dso_local global double 0.000000e+00, align 8 +@time_speedy = dso_local global double 0.000000e+00, align 8 +@time_select_feasible = dso_local global double 0.000000e+00, align 8 +@time_gain = dso_local global double 0.000000e+00, align 8 +@time_shuffle = dso_local global double 0.000000e+00, align 8 +@time_gain_dist = dso_local global double 0.000000e+00, align 8 +@time_gain_init = dso_local global double 0.000000e+00, align 8 +@.str.3 = private unnamed_addr constant [2 x i8] c"w\00", align 1 +@.str.4 = private unnamed_addr constant [4 x i8] c"%d \00", align 1 +@_ZL5nproc = internal global i32 0, align 4 +@_ZZ7pspeedyP6PointsfPliP17pthread_barrier_tE9totalcost = internal global float 0.000000e+00, align 4 +@_ZZ7pspeedyP6PointsfPliP17pthread_barrier_tE4open = internal global i8 0, align 1 +@_ZZ7pspeedyP6PointsfPliP17pthread_barrier_tE5costs = internal global float* null, align 8 +@_ZZ7pspeedyP6PointsfPliP17pthread_barrier_tE1i = internal global i32 0, align 4 +@_ZL9is_center = internal global i8* null, align 8 +@_ZL12center_table = internal global i32* null, align 8 +@_ZL17switch_membership = internal global i8* null, align 8 +@_ZZ8pkmedianP6PointsllPliP17pthread_barrier_tE1k = internal global i64 0, align 8 +@_ZZ8pkmedianP6PointsllPliP17pthread_barrier_tE8feasible = internal global i32* null, align 8 +@_ZZ8pkmedianP6PointsllPliP17pthread_barrier_tE11numfeasible = internal global i32 0, align 4 +@_ZZ8pkmedianP6PointsllPliP17pthread_barrier_tE4hizs = internal global float* null, align 8 +@.str.5 = private unnamed_addr constant [18 x i8] c"error opening %s\0A\00", align 1 +@.str.6 = private unnamed_addr constant [4 x i8] c"%u\0A\00", align 1 +@.str.7 = private unnamed_addr constant [5 x i8] c"%lf\0A\00", align 1 +@.str.8 = private unnamed_addr constant [5 x i8] c"%lf \00", align 1 +@.str.9 = private unnamed_addr constant [3 x i8] c"\0A\0A\00", align 1 +@.str.10 = private unnamed_addr constant [32 x i8] c"not enough memory 
for a chunk!\0A\00", align 1 +@.str.11 = private unnamed_addr constant [16 x i8] c"read %d points\0A\00", align 1 +@.str.12 = private unnamed_addr constant [21 x i8] c"error reading data!\0A\00", align 1 +@.str.13 = private unnamed_addr constant [21 x i8] c"finish local search\0A\00", align 1 +@.str.14 = private unnamed_addr constant [33 x i8] c"oops! no more space for centers\0A\00", align 1 +@.str.15 = private unnamed_addr constant [24 x i8] c"PARSEC Benchmark Suite\0A\00", align 1 +@.str.16 = private unnamed_addr constant [64 x i8] c"usage: %s k1 k2 d n chunksize clustersize infile outfile nproc\0A\00", align 1 +@.str.17 = private unnamed_addr constant [47 x i8] c" k1: Min. number of centers allowed\0A\00", align 1 +@.str.18 = private unnamed_addr constant [47 x i8] c" k2: Max. number of centers allowed\0A\00", align 1 +@.str.19 = private unnamed_addr constant [45 x i8] c" d: Dimension of each data point\0A\00", align 1 +@.str.20 = private unnamed_addr constant [38 x i8] c" n: Number of data points\0A\00", align 1 +@.str.21 = private unnamed_addr constant [57 x i8] c" chunksize: Number of data points to handle per step\0A\00", align 1 +@.str.22 = private unnamed_addr constant [55 x i8] c" clustersize: Maximum number of intermediate centers\0A\00", align 1 +@.str.23 = private unnamed_addr constant [37 x i8] c" infile: Input file (if n<=0)\0A\00", align 1 +@.str.24 = private unnamed_addr constant [28 x i8] c" outfile: Output file\0A\00", align 1 +@.str.25 = private unnamed_addr constant [41 x i8] c" nproc: Number of threads to use\0A\00", align 1 +@.str.26 = private unnamed_addr constant [2 x i8] c"\0A\00", align 1 +@.str.27 = private unnamed_addr constant [77 x i8] c"if n > 0, points will be randomly generated instead of reading from infile.\0A\00", align 1 +@.str.28 = private unnamed_addr constant [13 x i8] c"time = %lfs\0A\00", align 1 +@.str.29 = private unnamed_addr constant [19 x i8] c"time pgain = %lfs\0A\00", align 1 +@.str.30 = private unnamed_addr 
constant [24 x i8] c"time pgain_dist = %lfs\0A\00", align 1 +@.str.31 = private unnamed_addr constant [24 x i8] c"time pgain_init = %lfs\0A\00", align 1 +@.str.32 = private unnamed_addr constant [21 x i8] c"time pselect = %lfs\0A\00", align 1 +@.str.33 = private unnamed_addr constant [21 x i8] c"time pspeedy = %lfs\0A\00", align 1 +@.str.34 = private unnamed_addr constant [22 x i8] c"time pshuffle = %lfs\0A\00", align 1 +@.str.35 = private unnamed_addr constant [25 x i8] c"time localSearch = %lfs\0A\00", align 1 +@.str.36 = private unnamed_addr constant [34 x i8] c"====CUDA Timing info (pgain)====\0A\00", align 1 +@.str.37 = private unnamed_addr constant [20 x i8] c"time serial = %lfs\0A\00", align 1 +@.str.38 = private unnamed_addr constant [36 x i8] c"time CPU to GPU memory copy = %lfs\0A\00", align 1 +@.str.39 = private unnamed_addr constant [41 x i8] c"time GPU to CPU memory copy back = %lfs\0A\00", align 1 +@.str.40 = private unnamed_addr constant [24 x i8] c"time GPU malloc = %lfs\0A\00", align 1 +@.str.41 = private unnamed_addr constant [22 x i8] c"time GPU free = %lfs\0A\00", align 1 +@.str.42 = private unnamed_addr constant [20 x i8] c"time kernel = %lfs\0A\00", align 1 +@_ZTV9SimStream = linkonce_odr dso_local unnamed_addr constant { [7 x i8*] } { [7 x i8*] [i8* null, i8* bitcast ({ i8*, i8*, i8* }* @_ZTI9SimStream to i8*), i8* bitcast (i64 (%class.SimStream*, float*, i32, i32)* @_ZN9SimStream4readEPfii to i8*), i8* bitcast (i32 (%class.SimStream*)* @_ZN9SimStream6ferrorEv to i8*), i8* bitcast (i32 (%class.SimStream*)* @_ZN9SimStream4feofEv to i8*), i8* bitcast (void (%class.SimStream*)* @_ZN9SimStreamD2Ev to i8*), i8* bitcast (void (%class.SimStream*)* @_ZN9SimStreamD0Ev to i8*)] }, comdat, align 8 +@_ZTVN10__cxxabiv120__si_class_type_infoE = external dso_local global i8* +@_ZTS9SimStream = linkonce_odr dso_local constant [11 x i8] c"9SimStream\00", comdat, align 1 +@_ZTVN10__cxxabiv117__class_type_infoE = external dso_local global i8* +@_ZTS7PStream = 
linkonce_odr dso_local constant [9 x i8] c"7PStream\00", comdat, align 1 +@_ZTI7PStream = linkonce_odr dso_local constant { i8*, i8* } { i8* bitcast (i8** getelementptr inbounds (i8*, i8** @_ZTVN10__cxxabiv117__class_type_infoE, i64 2) to i8*), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @_ZTS7PStream, i32 0, i32 0) }, comdat, align 8 +@_ZTI9SimStream = linkonce_odr dso_local constant { i8*, i8*, i8* } { i8* bitcast (i8** getelementptr inbounds (i8*, i8** @_ZTVN10__cxxabiv120__si_class_type_infoE, i64 2) to i8*), i8* getelementptr inbounds ([11 x i8], [11 x i8]* @_ZTS9SimStream, i32 0, i32 0), i8* bitcast ({ i8*, i8* }* @_ZTI7PStream to i8*) }, comdat, align 8 +@_ZTV7PStream = linkonce_odr dso_local unnamed_addr constant { [7 x i8*] } { [7 x i8*] [i8* null, i8* bitcast ({ i8*, i8* }* @_ZTI7PStream to i8*), i8* bitcast (void ()* @__cxa_pure_virtual to i8*), i8* bitcast (void ()* @__cxa_pure_virtual to i8*), i8* bitcast (void ()* @__cxa_pure_virtual to i8*), i8* bitcast (void (%class.PStream*)* @_ZN7PStreamD2Ev to i8*), i8* bitcast (void (%class.PStream*)* @_ZN7PStreamD0Ev to i8*)] }, comdat, align 8 +@_ZTV10FileStream = linkonce_odr dso_local unnamed_addr constant { [7 x i8*] } { [7 x i8*] [i8* null, i8* bitcast ({ i8*, i8*, i8* }* @_ZTI10FileStream to i8*), i8* bitcast (i64 (%class.FileStream*, float*, i32, i32)* @_ZN10FileStream4readEPfii to i8*), i8* bitcast (i32 (%class.FileStream*)* @_ZN10FileStream6ferrorEv to i8*), i8* bitcast (i32 (%class.FileStream*)* @_ZN10FileStream4feofEv to i8*), i8* bitcast (void (%class.FileStream*)* @_ZN10FileStreamD2Ev to i8*), i8* bitcast (void (%class.FileStream*)* @_ZN10FileStreamD0Ev to i8*)] }, comdat, align 8 +@.str.43 = private unnamed_addr constant [3 x i8] c"rb\00", align 1 +@.str.44 = private unnamed_addr constant [24 x i8] c"error opening file %s\0A.\00", align 1 +@_ZTS10FileStream = linkonce_odr dso_local constant [13 x i8] c"10FileStream\00", comdat, align 1 +@_ZTI10FileStream = linkonce_odr dso_local constant { 
i8*, i8*, i8* } { i8* bitcast (i8** getelementptr inbounds (i8*, i8** @_ZTVN10__cxxabiv120__si_class_type_infoE, i64 2) to i8*), i8* getelementptr inbounds ([13 x i8], [13 x i8]* @_ZTS10FileStream, i32 0, i32 0), i8* bitcast ({ i8*, i8* }* @_ZTI7PStream to i8*) }, comdat, align 8 +@.str.45 = private unnamed_addr constant [21 x i8] c"closing file stream\0A\00", align 1 +@0 = private unnamed_addr constant [45 x i8] c"_Z19kernel_compute_costiilP5PointiiPfS1_PiPb\00", align 1 +@1 = private constant [15713 x i8] c"P\EDU\BA\01\00\10\00P=\00\00\00\00\00\00\02\00\01\01@\00\00\00(2\00\00\00\00\00\00\00\00\00\00\00\00\00\00\07\00\01\00=\00\00\00\00\00\00\00\00\00\00\00\11\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\7FELF\02\01\013\07\00\00\00\00\00\00\00\02\00\BE\00e\00\00\00\00\00\00\00\00\00\00\00\801\00\00\00\00\00\00@/\00\00\00\00\00\00=\05=\00@\008\00\03\00@\00\09\00\01\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00.text._Z19kernel_compute_costiilP5PointiiPfS1_PiPb\00.nv.info._Z19kernel_compute_costiilP5PointiiPfS1_PiPb\00.nv.shared._Z19kernel_compute_costiilP5PointiiPfS1_PiPb\00.nv.global\00.nv.constant0._Z19kernel_compute_costiilP5PointiiPfS1_PiPb\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00_Z19kernel_compute_costiilP5PointiiPfS1_PiPb\00.text._Z19kernel_compute_costiilP5PointiiPfS1_PiPb\00.nv.info._Z19kernel_compute_costiilP5PointiiPfS1_PiPb\00.nv.shared._Z19kernel_compute_costiilP5PointiiPfS1_PiPb\00.nv.global\00blockIdx\00gridDim\00blockDim\00threadIdx\00$_Z19kernel_compute_costiilP5PointiiPfS1_PiPb$_Z6d_distiiiiPf\00.nv.constant0._Z19kernel_compute_costiilP5PointiiPfS1_PiPb\00_param\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00_\00\00\00\03\00\07\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\03\00\08\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\0B\01\00\00\01\00\08\00\01\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\14\01\00\00\01\00\08\00\03\00\00\00\00
\00\00\00\01\00\00\00\00\00\00\00\1C\01\00\00\01\00\08\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00%\01\00\00\01\00\08\00\02\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00m\01\00\00\03\00\06\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\002\00\00\00\12\10\07\00\00\00\00\00\00\00\00\00\80(\00\00\00\00\00\00/\01\00\00\12\02\07\008\1D\00\00\00\00\00\00H\0B\00\00\00\00\00\00\04/\08\00\08\00\00\00\18\00\00\00\04#\08\00\09\00\00\00\00\00\00\00\04\12\08\00\09\00\00\00\00\00\00\00\04\11\08\00\09\00\00\00\00\00\00\00\04#\08\00\08\00\00\00\00\00\00\00\04\12\08\00\08\00\00\00\80\00\00\00\04\11\08\00\08\00\00\00\80\00\00\00\010\00\00\01*\00\00\04\0A\08\00\07\00\00\00@\01@\00\03\19@\00\04\17\0C\00\00\00\00\00\09\008\00\00\F0!\00\04\17\0C\00\00\00\00\00\08\000\00\00\F0!\00\04\17\0C\00\00\00\00\00\07\00(\00\00\F0!\00\04\17\0C\00\00\00\00\00\06\00 \00\00\F0!\00\04\17\0C\00\00\00\00\00\05\00\1C\00\00\F0\11\00\04\17\0C\00\00\00\00\00\04\00\18\00\00\F0\11\00\04\17\0C\00\00\00\00\00\03\00\10\00\00\F0!\00\04\17\0C\00\00\00\00\00\02\00\08\00\00\F0!\00\04\17\0C\00\00\00\00\00\01\00\04\00\00\F0\11\00\04\17\0C\00\00\00\00\00\00\00\00\00\00\F0\11\00\03\1B\FF\00\04\1D\08\00\E8\07\00\00\08\08\00\00\04\1C\04\000\1D\00\00\04\1E\04\00P\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\0
0\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\EF\1F\E0\FD\03!\00\D3rd<3>;\0A\0Amov.u2\00\1B,e\00b;\0Acvta\8D\00\04%\00\13,\\\00\22ld\C8\00\02\18\00nrd2, [\CE\00\1E])\00\1F1)\00\01a0];\0Ast#\00\81[%SP+0],,\00\0A\16\00\128\16\00\222;\B6\00\01\D8\00a1, 999(\00\02g\00\00\EF\00\18[\9E\01\03M\00\AF1;\0Aret;\0A\0A}\D6\01\1A\FE\02FuncGetAttributes\E1\01\0D#\00\0E\EC\01\0F+\00\06\0F\F7\01\1B\1F1\F7\01Q\1F1\F7\01!\0E\D9\00\0F\02\02\0F\0E8\01\0F\0D\02\8DhDevice\B4\00\0E\0E\02\0E$\00\0F\0F\02\00/32,\00\0B\1F1,\00\18\1F2<\02\13\1F2<\02\1F\1D4<\02\1F2<\02\0C\1F2<\02\13\01_\00\04;\02\0F\D9\00\07\1D]4\00\1F14\00\06\0Fp\02\10\0E\9A\01\0Fq\02\12(32q\02\0B\15\00!12\16\00\09\86\02\1F3\86\02\15\1F3\86\02#2Get\CB\00\0E}\02\05\1B\00\04\DA\00\0F\1C\02\13?3[8W\04.\0F\1B\02\0D\1F3W\04\19\04\B3\01\0D\D0\00\0F\AA\01\06\0F\05\04W\F0\04OccupancyMaxActiveBV\08\FE\03sPerMultiprocessor\9F\01\0F;\00\16\0EB\06\0FC\00%\0EJ\04\0FC\00\1E\1F2\86\00/\1F3\88\02\13O4[32\89\02\1C\1D3\89\02\1F4\89\02\0C\1F4\89\02\19\133\89\02\0F\F1\00\1E\0F\BC\04\00\0FK\00$/2]w\07\00\0FL\00$\0F\1F\05\01\0F\98\00%\0F\A7\07\1D\097\05\186M\05\04,\00\2224-\00\183\CF\03\1F2\CF\03\15\1F2\CF\03L\9FWithFlags\D8\03(\05D\00\0E\E1\03\0FL\00'\0F\EA\030\0CL\00\1F2\98\008\1F3\98\008\1F4H\04\13O5[40\EC\08.\0FH\04\0D\1F5\EC\08\1C\0F\F9\00+\1F]\9C\040\0D\9A\01\0F\A5\040\0D:\02\0F\AE\041\0D\DB\02\0F\B7\041\0D|\03\0F\C0\04I\08-\00\1F3$\0A6\7Fvisible\D9\04\0F\F9\01_Z6d_distiiiiPf(\82\03\0B\1D\00\0E\9F\0C\0F%\00\07\1F1%\00\11\1F2%\00\11\1A3b\04\0E%\00\0F\EF\03\18\1F6\EF\03\18wpred 
%p\C1\0A\10f\91\01Kf<8>\12\04-17\13\04\1F9\13\04\0C\1F6\E4\0A\1D\0F\F7\00\04\0E\98\03\1E4-\00\0F\C5\03\07\1E3-\00\0FS\0D\07\1F2-\00\06\1E1\1F\04\0F\B4\00\07*0]\F5\02\1F0.\0D\01\144\09\03\0EX\0D\1F3X\0D\02\1A4a\03/16\F0\0B\00J5, 0=\00\03\88\03\1B5\16\00\148\16\00\F2\01bra.uni LBB6_1;\0A\08\00\10:\ED\00\02S\00%6,3\00\16;\16\00%7,\9C\00\B0;\0Asetp.ge.s\1B\002p1,6\00\D2%r7;\0A@%p1 bra`\00\1B4p\00\132p\00\122p\00\06\22\05\03\E1\00\07q\00\1F8\87\00\02$9,8\01\82;\0Amul.lo\85\00Br10,5\007%r90\00411,\93\01T;\0Aadd.\00%2,4\00\121\F4\11C.s64\1E\00!d3\18\00\00\AB\01$hl\00\03#4,\1A\00\132I\00\02\17\00#5,\C3\00\00#\00\01y\00\02S\03\01\D5\008rd5\BD\00413,\0B\02\09\8D\00\174\8D\00\1D3\8D\00\01Z\01)14\8D\00#7,\1A\00\0A\8D\00\178\8D\00\177\8D\00\123\8D\00\005\01csub.rn\18\00\224,\AA\00\22%f\82\02\1Af\B6\05\18f\D1\00%5,\1B\00\07\16\00\05-\02\00\D3\006fma\\\00\227,5\00\03\05\00\1A6a\00\02\A6\02+f7 \02\133 \02\173\90\02(15\0A\02\077\01#6,\1E\00\1F1\EC\02\02/16\ED\02\04&4:\C0\00\181\C0\00\05\90\06\1Ff\90\06\03/f1\90\06\02\F0\06entry _Z19kernel_compa\11 co(\04\80lP5Point0\04vS1_PiPb\93\06\00\B5\00\0F:\00\19\0E\B0\06\0FB\00$\07\CD\06\00\87\03\0FB\00 \1F2B\00.\1F3\C6\00.\1F4B\00.\1F5\C6\00.\1F6B\00.\1F7B\00.\1F8B\00.\1F9n\08\13O7[88n\08\1D\07\A6\10n16 
%rs\80\08,17\81\08\1E2\94\0C\1F5\BD\17\0D\1F7\82\08\19\00\BA\04\0F(\01!\0F\8E\0B\00\1F5K\00#\1F8K\00\00\1F4K\00#\1F7\22\0D\01\0FK\00#\1E6\80\09\0F\95\00$\1E5\9D\09\0F\94\00$\1F4X\0D\01\0FK\00#/3]\8D\0A\00\0FK\00#\0FP\0A\00\0F\95\00$\0Fm\0A\00\0F\94\00$#0]\04\03#to\E2\1D).u3\08\03\1F\00\0A\1C\00\1482\08\0F;\00\03\119\1F\00\1F5;\00\02\02\B7\08/d9<\00\05$11\1B\09\0F=\00\01\02\AB\08/11>\00\06\143T\0E\0F>\00\01\02\93\09/13>\00\06\03\9D\09\0F>\00\03\226,$\00\0F\BB\0B\1D\0B\1D\0F\1F1\A6\0B\04\1F6\97\0B\02\1F3\AB\08\02\0B\EA\0B\034\0F+d1\18\00\144\A6\0F\0B\8A\00\144\8B\00\1A0\18\00\135\8C\00\1B82\0C\88%ctaid.x\17\00\00\0E\01\1Fn\18\00\00\157/\00\1Byr\0B\188\F7\0B\06\97\09#9,e\00*r8\EC\00\126\02\01\199\AF\18\00>\02\22ntu\00\0E\B5\0B-64\E7\0B\0B\B9\0B\07K\00\00+\02\03J\00\0D\\\0B\142\\\0B\09\91\00\03(\0A/4;m\0A\00\196w\0C'16G\0C\0F\FE\0C\00$15r\0A\08\00\0D*7_p\0D\137\83\0A87_1\00\0D517,\E0\01\08t\00\1F7\8B\00\03\09\19\0D\0A\04\0D#9,8\00\00'\00\0DI\0C!18\19\00\199J\0C\024\00\01\1C\00\0AL\0C420,\9D\00\01'\00\0A[\02\137\A3\02'20\9B\00/20\B2\00\02\142\CF\01\0A\16\00\172S\01\06\16\00\08#\0D\07#\0E\181~\0CF{ \0A\09-\08Ctemp\FE\04Ireg;\7F\11\01\0B\00\1C0j\12\02\16\00\04:\17\1F03\00\00\1F13\00\02\1413\00\1F13\00\00\1F23\00\02\1423\00\1F23\00\00\1F33\00\02\1433\00)3;&\12\01\0B\00\1843\00#643\00\144\C0\13\0C\9A\00\03\BF\0Ca;\0Acall\87\02\14('\13;, \0A\CD\10R, \0A(\0A\1C\01\22, \09\00\141\09\00\142\09\00\143\09\0074\0A)\B5\06\06N\0D\062\0Dg;\0A} \0A\09\C5\01\0C\E9\0F\03\98\02(237\02\07\9C\02424, \00\1A5\9C\02$5,Q\00\01'\00\0E\EB\0E#25\1D\03\0A\EC\0E\1F1\EC\0E\00/80\EC\0E\06*80\86\02\1F6\C1\00\04\1F7\C1\00\05\03\91\0F\1D7\C1\00$9,Q\00\01'\00\0CR\0F\00\22\00\03\94\0E\02m\04\12u\1E\00(p2Q\0FF@%p2l\04\1B3l\04\132l\04\09l\11\144\84\01\1A5\C3\00\194\84\01\07\AA\00444,8\00\01'\00\03s\05\02\C9\0B\151|\0F\118\B5\00\124\E3\05's1\D0\00/12M\01\03/45M\01\03/46M\01\04447, \00\0AM\01448,Q\00\01'\00\08}\00\03\0F\02#48N\01\08\FE\10\01\1C\06\01\A0\00\00)\00\09\9C\00%9,\E7\04\08\9C\00)50/\10\06\9C\00451, 
\00\0B\1F\12$2,Q\00\01'\00\08\9C\00\135#\12\132r\01\06\99\00#6, \00\00\A5\00\07\AE\02\01-\00\01\16\00\1B6\ED\01\134\ED\01\183\ED\01/30c\01\03/31c\01\04432, \00\0Ac\01433,Q\00\01'\00\07\C7\00\127\C6\00\2233\15\01\05\1A\12\188\F7\01\09x\01$9,\1F\12(8;\AE\00\1B4u\01\04\18\00%5,\E7\08\09\18\00\156z\00+16\C8\00\02+\02\1D3!\13438,S\00\01'\00\09\16\01\139P\00\0Cx\02\03\F5\00\1C9M\00441,\B8\00\01'\00\08\DC\01\130x\02\1D1\DC\01#1, \009%f9\DB\01\2241\DB\01\1F1\DB\01\04*4:\18\00\135\18\00\B05:\0Aret;\0A\0A}\0A\00\00", section ".nv_fatbin", align 8 +@__cuda_fatbin_wrapper = internal constant { i32, i32, i8*, i8* } { i32 1180844977, i32 1, i8* getelementptr inbounds ([15713 x i8], [15713 x i8]* @1, i64 0, i64 0), i8* null }, section ".nvFatBinSegment", align 8 +@__cuda_gpubin_handle = internal global i8** null, align 8 +@llvm.global_ctors = appending global [2 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @_GLOBAL__sub_I_streamcluster_cuda_cpu.cu, i8* null }, { i32, void ()*, i8* } { i32 65535, void ()* bitcast (void (i8*)* @__cuda_module_ctor to void ()*), i8* null }] + +; Function Attrs: noinline uwtable +define internal void @__cxx_global_var_init() #0 section ".text.startup" { +entry: + call void @_ZNSt8ios_base4InitC1Ev(%"class.std::ios_base::Init"* @_ZStL8__ioinit) + %0 = call i32 @__cxa_atexit(void (i8*)* bitcast (void (%"class.std::ios_base::Init"*)* @_ZNSt8ios_base4InitD1Ev to void (i8*)*), i8* getelementptr inbounds (%"class.std::ios_base::Init", %"class.std::ios_base::Init"* @_ZStL8__ioinit, i32 0, i32 0), i8* @__dso_handle) #2 + ret void +} + +declare dso_local void @_ZNSt8ios_base4InitC1Ev(%"class.std::ios_base::Init"*) unnamed_addr #1 + +declare dso_local void @_ZNSt8ios_base4InitD1Ev(%"class.std::ios_base::Init"*) unnamed_addr #1 + +; Function Attrs: nounwind +declare dso_local i32 @__cxa_atexit(void (i8*)*, i8*, i8*) #2 + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z19kernel_compute_costiilP5PointiiPfS1_PiPb(i32 
%num, i32 %dim, i64 %x, %struct.Point* %p, i32 %K, i32 %stride, float* %coord_d, float* %work_mem_d, i32* %center_table_d, i8* %switch_membership_d) #3 { +entry: + %num.addr = alloca i32, align 4 + %dim.addr = alloca i32, align 4 + %x.addr = alloca i64, align 8 + %p.addr = alloca %struct.Point*, align 8 + %K.addr = alloca i32, align 4 + %stride.addr = alloca i32, align 4 + %coord_d.addr = alloca float*, align 8 + %work_mem_d.addr = alloca float*, align 8 + %center_table_d.addr = alloca i32*, align 8 + %switch_membership_d.addr = alloca i8*, align 8 + %grid_dim = alloca %struct.dim3, align 8 + %block_dim = alloca %struct.dim3, align 8 + %shmem_size = alloca i64, align 8 + %stream = alloca i8*, align 8 + %grid_dim.coerce = alloca { i64, i32 }, align 8 + %block_dim.coerce = alloca { i64, i32 }, align 8 + store i32 %num, i32* %num.addr, align 4 + store i32 %dim, i32* %dim.addr, align 4 + store i64 %x, i64* %x.addr, align 8 + store %struct.Point* %p, %struct.Point** %p.addr, align 8 + store i32 %K, i32* %K.addr, align 4 + store i32 %stride, i32* %stride.addr, align 4 + store float* %coord_d, float** %coord_d.addr, align 8 + store float* %work_mem_d, float** %work_mem_d.addr, align 8 + store i32* %center_table_d, i32** %center_table_d.addr, align 8 + store i8* %switch_membership_d, i8** %switch_membership_d.addr, align 8 + %kernel_args = alloca i8*, i64 10, align 16 + %0 = bitcast i32* %num.addr to i8* + %1 = getelementptr i8*, i8** %kernel_args, i32 0 + store i8* %0, i8** %1 + %2 = bitcast i32* %dim.addr to i8* + %3 = getelementptr i8*, i8** %kernel_args, i32 1 + store i8* %2, i8** %3 + %4 = bitcast i64* %x.addr to i8* + %5 = getelementptr i8*, i8** %kernel_args, i32 2 + store i8* %4, i8** %5 + %6 = bitcast %struct.Point** %p.addr to i8* + %7 = getelementptr i8*, i8** %kernel_args, i32 3 + store i8* %6, i8** %7 + %8 = bitcast i32* %K.addr to i8* + %9 = getelementptr i8*, i8** %kernel_args, i32 4 + store i8* %8, i8** %9 + %10 = bitcast i32* %stride.addr to i8* + %11 = 
getelementptr i8*, i8** %kernel_args, i32 5 + store i8* %10, i8** %11 + %12 = bitcast float** %coord_d.addr to i8* + %13 = getelementptr i8*, i8** %kernel_args, i32 6 + store i8* %12, i8** %13 + %14 = bitcast float** %work_mem_d.addr to i8* + %15 = getelementptr i8*, i8** %kernel_args, i32 7 + store i8* %14, i8** %15 + %16 = bitcast i32** %center_table_d.addr to i8* + %17 = getelementptr i8*, i8** %kernel_args, i32 8 + store i8* %16, i8** %17 + %18 = bitcast i8** %switch_membership_d.addr to i8* + %19 = getelementptr i8*, i8** %kernel_args, i32 9 + store i8* %18, i8** %19 + %20 = call i32 @__cudaPopCallConfiguration(%struct.dim3* %grid_dim, %struct.dim3* %block_dim, i64* %shmem_size, i8** %stream) + %21 = load i64, i64* %shmem_size, align 8 + %22 = load i8*, i8** %stream, align 8 + %23 = bitcast { i64, i32 }* %grid_dim.coerce to i8* + %24 = bitcast %struct.dim3* %grid_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %23, i8* align 8 %24, i64 12, i1 false) + %25 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 0 + %26 = load i64, i64* %25, align 8 + %27 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 1 + %28 = load i32, i32* %27, align 8 + %29 = bitcast { i64, i32 }* %block_dim.coerce to i8* + %30 = bitcast %struct.dim3* %block_dim to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %29, i8* align 8 %30, i64 12, i1 false) + %31 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 0 + %32 = load i64, i64* %31, align 8 + %33 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 1 + %34 = load i32, i32* %33, align 8 + %35 = bitcast i8* %22 to %struct.CUstream_st* + %call = call i32 @cudaLaunchKernel(i8* bitcast (void (i32, i32, i64, %struct.Point*, i32, i32, float*, float*, i32*, i8*)* @_Z19kernel_compute_costiilP5PointiiPfS1_PiPb to i8*), i64 %26, i32 %28, i64 %32, i32 %34, i8** %kernel_args, i64 %21, %struct.CUstream_st* 
%35) + br label %setup.end + +setup.end: ; preds = %entry + ret void +} + +declare dso_local i32 @__cudaPopCallConfiguration(%struct.dim3*, %struct.dim3*, i64*, i8**) + +declare dso_local i32 @cudaLaunchKernel(i8*, i64, i32, i64, i32, i8**, i64, %struct.CUstream_st*) + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #4 + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z11allocDevMemii(i32 %num, i32 %dim) #3 { +entry: + %num.addr = alloca i32, align 4 + %dim.addr = alloca i32, align 4 + %err = alloca i32, align 4 + %err4 = alloca i32, align 4 + %err15 = alloca i32, align 4 + %err26 = alloca i32, align 4 + store i32 %num, i32* %num.addr, align 4 + store i32 %dim, i32* %dim.addr, align 4 + br label %do.body + +do.body: ; preds = %entry + %0 = load i32, i32* %num.addr, align 4 + %conv = sext i32 %0 to i64 + %mul = mul i64 %conv, 4 + %call = call i32 @cudaMalloc(i8** bitcast (i32** @center_table_d to i8**), i64 %mul) + store i32 %call, i32* %err, align 4 + %1 = load i32, i32* %err, align 4 + %cmp = icmp ne i32 0, %1 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %do.body + %2 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %3 = load i32, i32* %err, align 4 + %call1 = call i8* @cudaGetErrorString(i32 %3) + %call2 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %2, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str, i64 0, i64 0), i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.1, i64 0, i64 0), i32 91, i8* %call1) + call void @exit(i32 1) #15 + unreachable + +if.end: ; preds = %do.body + br label %do.end + +do.end: ; preds = %if.end + br label %do.body3 + +do.body3: ; preds = %do.end + %4 = load i32, i32* %num.addr, align 4 + %conv5 = sext i32 %4 to i64 + %mul6 = mul i64 %conv5, 1 + %call7 = call i32 @cudaMalloc(i8** @switch_membership_d, i64 %mul6) + store i32 %call7, i32* %err4, align 4 + %5 = load i32, i32* %err4, align 4 + %cmp8 = icmp ne i32 0, %5 + br i1 %cmp8, label %if.then9, label %if.end12 + +if.then9: ; preds = %do.body3 + %6 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %7 = load i32, i32* %err4, align 4 + %call10 = call i8* @cudaGetErrorString(i32 %7) + %call11 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %6, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str, i64 0, i64 0), i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.1, i64 0, i64 0), i32 92, i8* %call10) + call void @exit(i32 1) #15 + unreachable + +if.end12: ; preds = %do.body3 + br label %do.end13 + +do.end13: ; preds = %if.end12 + br label %do.body14 + +do.body14: ; preds = %do.end13 + %8 = load i32, i32* %num.addr, align 4 + %conv16 = sext i32 %8 to i64 + %mul17 = mul i64 %conv16, 32 + %call18 = call i32 @cudaMalloc(i8** bitcast (%struct.Point** @p to i8**), i64 %mul17) + store i32 %call18, i32* %err15, align 4 + %9 = load i32, i32* %err15, align 4 + %cmp19 = icmp ne i32 0, %9 + br i1 %cmp19, label %if.then20, label %if.end23 + +if.then20: ; preds = %do.body14 + %10 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %11 = load i32, i32* %err15, align 4 + %call21 = call i8* @cudaGetErrorString(i32 %11) + %call22 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %10, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str, i64 0, i64 0), i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.1, i64 0, i64 0), i32 93, i8* %call21) + call void @exit(i32 1) #15 + unreachable + +if.end23: ; preds = %do.body14 + br label %do.end24 + +do.end24: ; preds = %if.end23 + br label %do.body25 + +do.body25: ; preds = %do.end24 + %12 = load i32, i32* %num.addr, align 4 + %13 = load i32, i32* %dim.addr, align 4 + %mul27 = mul nsw i32 %12, %13 + %conv28 = sext i32 %mul27 to i64 + %mul29 = mul i64 %conv28, 4 + %call30 = call i32 @cudaMalloc(i8** bitcast (float** @coord_d to i8**), i64 %mul29) + store i32 %call30, i32* %err26, align 4 + %14 = load i32, i32* %err26, align 4 + %cmp31 = icmp ne i32 0, %14 + br i1 %cmp31, label %if.then32, label %if.end35 + +if.then32: ; preds = %do.body25 + %15 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %16 = load i32, i32* %err26, align 4 + %call33 = call i8* @cudaGetErrorString(i32 %16) + %call34 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %15, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str, i64 0, i64 0), i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.1, i64 0, i64 0), i32 94, i8* %call33) + call void @exit(i32 1) #15 + unreachable + +if.end35: ; preds = %do.body25 + br label %do.end36 + +do.end36: ; preds = %if.end35 + ret void +} + +declare dso_local i32 @cudaMalloc(i8**, i64) #1 + +declare dso_local i32 @fprintf(%struct._IO_FILE*, i8*, ...) 
#1 + +declare dso_local i8* @cudaGetErrorString(i32) #1 + +; Function Attrs: noreturn nounwind +declare dso_local void @exit(i32) #5 + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local void @_Z12allocHostMemii(i32 %num, i32 %dim) #6 { +entry: + %num.addr = alloca i32, align 4 + %dim.addr = alloca i32, align 4 + store i32 %num, i32* %num.addr, align 4 + store i32 %dim, i32* %dim.addr, align 4 + %0 = load i32, i32* %num.addr, align 4 + %1 = load i32, i32* %dim.addr, align 4 + %mul = mul nsw i32 %0, %1 + %conv = sext i32 %mul to i64 + %mul1 = mul i64 %conv, 4 + %call = call noalias i8* @malloc(i64 %mul1) #2 + %2 = bitcast i8* %call to float* + store float* %2, float** @coord_h, align 8 + ret void +} + +; Function Attrs: nounwind +declare dso_local noalias i8* @malloc(i64) #7 + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z10freeDevMemv() #3 { +entry: + %err = alloca i32, align 4 + %err4 = alloca i32, align 4 + %err13 = alloca i32, align 4 + %err22 = alloca i32, align 4 + br label %do.body + +do.body: ; preds = %entry + %0 = load i32*, i32** @center_table_d, align 8 + %1 = bitcast i32* %0 to i8* + %call = call i32 @cudaFree(i8* %1) + store i32 %call, i32* %err, align 4 + %2 = load i32, i32* %err, align 4 + %cmp = icmp ne i32 0, %2 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %do.body + %3 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %4 = load i32, i32* %err, align 4 + %call1 = call i8* @cudaGetErrorString(i32 %4) + %call2 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %3, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str, i64 0, i64 0), i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.1, i64 0, i64 0), i32 108, i8* %call1) + call void @exit(i32 1) #15 + unreachable + +if.end: ; preds = %do.body + br label %do.end + +do.end: ; preds = %if.end + br label %do.body3 + +do.body3: ; preds = %do.end + %5 = load i8*, i8** @switch_membership_d, align 8 + %call5 = call i32 @cudaFree(i8* %5) + store i32 %call5, i32* %err4, align 4 + %6 = load i32, i32* %err4, align 4 + %cmp6 = icmp ne i32 0, %6 + br i1 %cmp6, label %if.then7, label %if.end10 + +if.then7: ; preds = %do.body3 + %7 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %8 = load i32, i32* %err4, align 4 + %call8 = call i8* @cudaGetErrorString(i32 %8) + %call9 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %7, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str, i64 0, i64 0), i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.1, i64 0, i64 0), i32 109, i8* %call8) + call void @exit(i32 1) #15 + unreachable + +if.end10: ; preds = %do.body3 + br label %do.end11 + +do.end11: ; preds = %if.end10 + br label %do.body12 + +do.body12: ; preds = %do.end11 + %9 = load %struct.Point*, %struct.Point** @p, align 8 + %10 = bitcast %struct.Point* %9 to i8* + %call14 = call i32 @cudaFree(i8* %10) + store i32 %call14, i32* %err13, align 4 + %11 = load i32, i32* %err13, align 4 + %cmp15 = icmp ne i32 0, %11 + br i1 %cmp15, label %if.then16, label %if.end19 + +if.then16: ; preds = %do.body12 + %12 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %13 = load i32, i32* %err13, align 4 + %call17 = call i8* @cudaGetErrorString(i32 %13) + %call18 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %12, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str, i64 0, i64 0), i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.1, i64 0, i64 0), i32 110, i8* %call17) + call void @exit(i32 1) #15 + unreachable + +if.end19: ; preds = %do.body12 + br label %do.end20 + +do.end20: ; preds = %if.end19 + br label %do.body21 + +do.body21: ; preds = %do.end20 + %14 = load float*, float** @coord_d, align 8 + %15 = bitcast float* %14 to i8* + %call23 = call i32 @cudaFree(i8* %15) + store i32 %call23, i32* %err22, align 4 + %16 = load i32, i32* %err22, align 4 + %cmp24 = icmp ne i32 0, %16 + br i1 %cmp24, label %if.then25, label %if.end28 + +if.then25: ; preds = %do.body21 + %17 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %18 = load i32, i32* %err22, align 4 + %call26 = call i8* @cudaGetErrorString(i32 %18) + %call27 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %17, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str, i64 0, i64 0), i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.1, i64 0, i64 0), i32 111, i8* %call26) + call void @exit(i32 1) #15 + unreachable + +if.end28: ; preds = %do.body21 + br label %do.end29 + +do.end29: ; preds = %if.end28 + ret void +} + +declare dso_local i32 @cudaFree(i8*) #1 + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local void @_Z11freeHostMemv() #6 { +entry: + %0 = load float*, float** @coord_h, align 8 + %1 = bitcast float* %0 to i8* + call void @free(i8* %1) #2 + ret void +} + +; Function Attrs: nounwind +declare dso_local void @free(i8*) #7 + +; Function Attrs: noinline optnone uwtable +define dso_local float @_Z5pgainlP6PointsfPliPbPiS2_bPdS4_S4_S4_S4_S4_(i64 %x, %struct.Points* %points, float %z, i64* %numcenters, i32 %kmax, i8* %is_center, i32* %center_table, i8* %switch_membership, i1 zeroext %isCoordChanged, double* %serial_t, double* %cpu_to_gpu_t, double* %gpu_to_cpu_t, double* %alloc_t, double* %kernel_t, double* %free_t) #3 
{ +entry: + %x.addr = alloca i64, align 8 + %points.addr = alloca %struct.Points*, align 8 + %z.addr = alloca float, align 4 + %numcenters.addr = alloca i64*, align 8 + %kmax.addr = alloca i32, align 4 + %is_center.addr = alloca i8*, align 8 + %center_table.addr = alloca i32*, align 8 + %switch_membership.addr = alloca i8*, align 8 + %isCoordChanged.addr = alloca i8, align 1 + %serial_t.addr = alloca double*, align 8 + %cpu_to_gpu_t.addr = alloca double*, align 8 + %gpu_to_cpu_t.addr = alloca double*, align 8 + %alloc_t.addr = alloca double*, align 8 + %kernel_t.addr = alloca double*, align 8 + %free_t.addr = alloca double*, align 8 + %error = alloca i32, align 4 + %stride = alloca i32, align 4 + %K = alloca i32, align 4 + %num = alloca i32, align 4 + %dim = alloca i32, align 4 + %nThread = alloca i32, align 4 + %count = alloca i32, align 4 + %i = alloca i32, align 4 + %i17 = alloca i32, align 4 + %j = alloca i32, align 4 + %err = alloca i32, align 4 + %err57 = alloca i32, align 4 + %err70 = alloca i32, align 4 + %err81 = alloca i32, align 4 + %err93 = alloca i32, align 4 + %err104 = alloca i32, align 4 + %num_blocks = alloca i32, align 4 + %num_blocks_y = alloca i32, align 4 + %num_blocks_x = alloca i32, align 4 + %grid_size = alloca %struct.dim3, align 4 + %agg.tmp = alloca %struct.dim3, align 4 + %agg.tmp130 = alloca %struct.dim3, align 4 + %agg.tmp.coerce = alloca { i64, i32 }, align 4 + %agg.tmp130.coerce = alloca { i64, i32 }, align 4 + %err141 = alloca i32, align 4 + %err154 = alloca i32, align 4 + %number_of_centers_to_close = alloca i32, align 4 + %gl_cost_of_opening_x = alloca float, align 4 + %gl_lower = alloca float*, align 8 + %i167 = alloca i32, align 4 + %low = alloca float, align 4 + %j175 = alloca i32, align 4 + %i213 = alloca i32, align 4 + %close_center = alloca i8, align 1 + %agg.tmp231 = alloca %struct.Point, align 8 + %agg.tmp235 = alloca %struct.Point, align 8 + %i254 = alloca i32, align 4 + %err285 = alloca i32, align 4 + store i64 %x, i64* 
%x.addr, align 8 + store %struct.Points* %points, %struct.Points** %points.addr, align 8 + store float %z, float* %z.addr, align 4 + store i64* %numcenters, i64** %numcenters.addr, align 8 + store i32 %kmax, i32* %kmax.addr, align 4 + store i8* %is_center, i8** %is_center.addr, align 8 + store i32* %center_table, i32** %center_table.addr, align 8 + store i8* %switch_membership, i8** %switch_membership.addr, align 8 + %frombool = zext i1 %isCoordChanged to i8 + store i8 %frombool, i8* %isCoordChanged.addr, align 1 + store double* %serial_t, double** %serial_t.addr, align 8 + store double* %cpu_to_gpu_t, double** %cpu_to_gpu_t.addr, align 8 + store double* %gpu_to_cpu_t, double** %gpu_to_cpu_t.addr, align 8 + store double* %alloc_t, double** %alloc_t.addr, align 8 + store double* %kernel_t, double** %kernel_t.addr, align 8 + store double* %free_t, double** %free_t.addr, align 8 + %0 = load i64*, i64** %numcenters.addr, align 8 + %1 = load i64, i64* %0, align 8 + %add = add nsw i64 %1, 1 + %conv = trunc i64 %add to i32 + store i32 %conv, i32* %stride, align 4 + %2 = load i64*, i64** %numcenters.addr, align 8 + %3 = load i64, i64* %2, align 8 + %conv1 = trunc i64 %3 to i32 + store i32 %conv1, i32* %K, align 4 + %4 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %num2 = getelementptr inbounds %struct.Points, %struct.Points* %4, i32 0, i32 0 + %5 = load i64, i64* %num2, align 8 + %conv3 = trunc i64 %5 to i32 + store i32 %conv3, i32* %num, align 4 + %6 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %dim4 = getelementptr inbounds %struct.Points, %struct.Points* %6, i32 0, i32 1 + %7 = load i32, i32* %dim4, align 8 + store i32 %7, i32* %dim, align 4 + %8 = load i32, i32* %num, align 4 + store i32 %8, i32* %nThread, align 4 + %9 = load i32, i32* %stride, align 4 + %10 = load i32, i32* %nThread, align 4 + %add5 = add nsw i32 %10, 1 + %mul = mul nsw i32 %9, %add5 + %conv6 = sext i32 %mul to i64 + %mul7 = mul i64 %conv6, 4 + %call = call 
noalias i8* @malloc(i64 %mul7) #2 + %11 = bitcast i8* %call to float* + store float* %11, float** @work_mem_h, align 8 + %12 = load i32, i32* @_ZL4iter, align 4 + %cmp = icmp eq i32 %12, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %13 = load i32, i32* %num, align 4 + %14 = load i32, i32* %dim, align 4 + call void @_Z12allocHostMemii(i32 %13, i32 %14) + br label %if.end + +if.end: ; preds = %if.then, %entry + store i32 0, i32* %count, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %if.end + %15 = load i32, i32* %i, align 4 + %16 = load i32, i32* %num, align 4 + %cmp8 = icmp slt i32 %15, %16 + br i1 %cmp8, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %17 = load i8*, i8** %is_center.addr, align 8 + %18 = load i32, i32* %i, align 4 + %idxprom = sext i32 %18 to i64 + %arrayidx = getelementptr inbounds i8, i8* %17, i64 %idxprom + %19 = load i8, i8* %arrayidx, align 1 + %tobool = trunc i8 %19 to i1 + br i1 %tobool, label %if.then9, label %if.end12 + +if.then9: ; preds = %for.body + %20 = load i32, i32* %count, align 4 + %inc = add nsw i32 %20, 1 + store i32 %inc, i32* %count, align 4 + %21 = load i32*, i32** %center_table.addr, align 8 + %22 = load i32, i32* %i, align 4 + %idxprom10 = sext i32 %22 to i64 + %arrayidx11 = getelementptr inbounds i32, i32* %21, i64 %idxprom10 + store i32 %20, i32* %arrayidx11, align 4 + br label %if.end12 + +if.end12: ; preds = %if.then9, %for.body + br label %for.inc + +for.inc: ; preds = %if.end12 + %23 = load i32, i32* %i, align 4 + %inc13 = add nsw i32 %23, 1 + store i32 %inc13, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %24 = load i8, i8* %isCoordChanged.addr, align 1 + %tobool14 = trunc i8 %24 to i1 + br i1 %tobool14, label %if.then16, label %lor.lhs.false + +lor.lhs.false: ; preds = %for.end + %25 = load i32, i32* @_ZL4iter, align 4 + %cmp15 = icmp eq i32 %25, 0 + br i1 %cmp15, label %if.then16, label 
%if.end38 + +if.then16: ; preds = %lor.lhs.false, %for.end + store i32 0, i32* %i17, align 4 + br label %for.cond18 + +for.cond18: ; preds = %for.inc35, %if.then16 + %26 = load i32, i32* %i17, align 4 + %27 = load i32, i32* %dim, align 4 + %cmp19 = icmp slt i32 %26, %27 + br i1 %cmp19, label %for.body20, label %for.end37 + +for.body20: ; preds = %for.cond18 + store i32 0, i32* %j, align 4 + br label %for.cond21 + +for.cond21: ; preds = %for.inc32, %for.body20 + %28 = load i32, i32* %j, align 4 + %29 = load i32, i32* %num, align 4 + %cmp22 = icmp slt i32 %28, %29 + br i1 %cmp22, label %for.body23, label %for.end34 + +for.body23: ; preds = %for.cond21 + %30 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p = getelementptr inbounds %struct.Points, %struct.Points* %30, i32 0, i32 2 + %31 = load %struct.Point*, %struct.Point** %p, align 8 + %32 = load i32, i32* %j, align 4 + %idxprom24 = sext i32 %32 to i64 + %arrayidx25 = getelementptr inbounds %struct.Point, %struct.Point* %31, i64 %idxprom24 + %coord = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx25, i32 0, i32 1 + %33 = load float*, float** %coord, align 8 + %34 = load i32, i32* %i17, align 4 + %idxprom26 = sext i32 %34 to i64 + %arrayidx27 = getelementptr inbounds float, float* %33, i64 %idxprom26 + %35 = load float, float* %arrayidx27, align 4 + %36 = load float*, float** @coord_h, align 8 + %37 = load i32, i32* %num, align 4 + %38 = load i32, i32* %i17, align 4 + %mul28 = mul nsw i32 %37, %38 + %39 = load i32, i32* %j, align 4 + %add29 = add nsw i32 %mul28, %39 + %idxprom30 = sext i32 %add29 to i64 + %arrayidx31 = getelementptr inbounds float, float* %36, i64 %idxprom30 + store float %35, float* %arrayidx31, align 4 + br label %for.inc32 + +for.inc32: ; preds = %for.body23 + %40 = load i32, i32* %j, align 4 + %inc33 = add nsw i32 %40, 1 + store i32 %inc33, i32* %j, align 4 + br label %for.cond21 + +for.end34: ; preds = %for.cond21 + br label %for.inc35 + +for.inc35: ; preds = 
%for.end34 + %41 = load i32, i32* %i17, align 4 + %inc36 = add nsw i32 %41, 1 + store i32 %inc36, i32* %i17, align 4 + br label %for.cond18 + +for.end37: ; preds = %for.cond18 + br label %if.end38 + +if.end38: ; preds = %for.end37, %lor.lhs.false + br label %do.body + +do.body: ; preds = %if.end38 + %42 = load i32, i32* %stride, align 4 + %43 = load i32, i32* %nThread, align 4 + %add39 = add nsw i32 %43, 1 + %mul40 = mul nsw i32 %42, %add39 + %conv41 = sext i32 %mul40 to i64 + %mul42 = mul i64 %conv41, 4 + %call43 = call i32 @cudaMalloc(i8** bitcast (float** @work_mem_d to i8**), i64 %mul42) + store i32 %call43, i32* %err, align 4 + %44 = load i32, i32* %err, align 4 + %cmp44 = icmp ne i32 0, %44 + br i1 %cmp44, label %if.then45, label %if.end48 + +if.then45: ; preds = %do.body + %45 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %46 = load i32, i32* %err, align 4 + %call46 = call i8* @cudaGetErrorString(i32 %46) + %call47 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %45, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str, i64 0, i64 0), i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.1, i64 0, i64 0), i32 184, i8* %call46) + call void @exit(i32 1) #15 + unreachable + +if.end48: ; preds = %do.body + br label %do.end + +do.end: ; preds = %if.end48 + %47 = load i32, i32* @_ZL4iter, align 4 + %cmp49 = icmp eq i32 %47, 0 + br i1 %cmp49, label %if.then50, label %if.end51 + +if.then50: ; preds = %do.end + %48 = load i32, i32* %num, align 4 + %49 = load i32, i32* %dim, align 4 + call void @_Z11allocDevMemii(i32 %48, i32 %49) + br label %if.end51 + +if.end51: ; preds = %if.then50, %do.end + %50 = load i8, i8* %isCoordChanged.addr, align 1 + %tobool52 = trunc i8 %50 to i1 + br i1 %tobool52, label %if.then55, label %lor.lhs.false53 + +lor.lhs.false53: ; preds = %if.end51 + %51 = load i32, i32* @_ZL4iter, align 4 + %cmp54 = icmp eq i32 %51, 0 + br i1 %cmp54, label %if.then55, label %if.end68 + +if.then55: ; preds = 
%lor.lhs.false53, %if.end51 + br label %do.body56 + +do.body56: ; preds = %if.then55 + %52 = load float*, float** @coord_d, align 8 + %53 = bitcast float* %52 to i8* + %54 = load float*, float** @coord_h, align 8 + %55 = bitcast float* %54 to i8* + %56 = load i32, i32* %num, align 4 + %57 = load i32, i32* %dim, align 4 + %mul58 = mul nsw i32 %56, %57 + %conv59 = sext i32 %mul58 to i64 + %mul60 = mul i64 %conv59, 4 + %call61 = call i32 @cudaMemcpy(i8* %53, i8* %55, i64 %mul60, i32 1) + store i32 %call61, i32* %err57, align 4 + %58 = load i32, i32* %err57, align 4 + %cmp62 = icmp ne i32 0, %58 + br i1 %cmp62, label %if.then63, label %if.end66 + +if.then63: ; preds = %do.body56 + %59 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %60 = load i32, i32* %err57, align 4 + %call64 = call i8* @cudaGetErrorString(i32 %60) + %call65 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %59, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str, i64 0, i64 0), i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.1, i64 0, i64 0), i32 205, i8* %call64) + call void @exit(i32 1) #15 + unreachable + +if.end66: ; preds = %do.body56 + br label %do.end67 + +do.end67: ; preds = %if.end66 + br label %if.end68 + +if.end68: ; preds = %do.end67, %lor.lhs.false53 + br label %do.body69 + +do.body69: ; preds = %if.end68 + %61 = load i32*, i32** @center_table_d, align 8 + %62 = bitcast i32* %61 to i8* + %63 = load i32*, i32** %center_table.addr, align 8 + %64 = bitcast i32* %63 to i8* + %65 = load i32, i32* %num, align 4 + %conv71 = sext i32 %65 to i64 + %mul72 = mul i64 %conv71, 4 + %call73 = call i32 @cudaMemcpy(i8* %62, i8* %64, i64 %mul72, i32 1) + store i32 %call73, i32* %err70, align 4 + %66 = load i32, i32* %err70, align 4 + %cmp74 = icmp ne i32 0, %66 + br i1 %cmp74, label %if.then75, label %if.end78 + +if.then75: ; preds = %do.body69 + %67 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %68 = load i32, i32* %err70, align 4 + %call76 
= call i8* @cudaGetErrorString(i32 %68) + %call77 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %67, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str, i64 0, i64 0), i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.1, i64 0, i64 0), i32 208, i8* %call76) + call void @exit(i32 1) #15 + unreachable + +if.end78: ; preds = %do.body69 + br label %do.end79 + +do.end79: ; preds = %if.end78 + br label %do.body80 + +do.body80: ; preds = %do.end79 + %69 = load %struct.Point*, %struct.Point** @p, align 8 + %70 = bitcast %struct.Point* %69 to i8* + %71 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p82 = getelementptr inbounds %struct.Points, %struct.Points* %71, i32 0, i32 2 + %72 = load %struct.Point*, %struct.Point** %p82, align 8 + %73 = bitcast %struct.Point* %72 to i8* + %74 = load i32, i32* %num, align 4 + %conv83 = sext i32 %74 to i64 + %mul84 = mul i64 %conv83, 32 + %call85 = call i32 @cudaMemcpy(i8* %70, i8* %73, i64 %mul84, i32 1) + store i32 %call85, i32* %err81, align 4 + %75 = load i32, i32* %err81, align 4 + %cmp86 = icmp ne i32 0, %75 + br i1 %cmp86, label %if.then87, label %if.end90 + +if.then87: ; preds = %do.body80 + %76 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %77 = load i32, i32* %err81, align 4 + %call88 = call i8* @cudaGetErrorString(i32 %77) + %call89 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %76, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str, i64 0, i64 0), i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.1, i64 0, i64 0), i32 210, i8* %call88) + call void @exit(i32 1) #15 + unreachable + +if.end90: ; preds = %do.body80 + br label %do.end91 + +do.end91: ; preds = %if.end90 + br label %do.body92 + +do.body92: ; preds = %do.end91 + %78 = load i8*, i8** @switch_membership_d, align 8 + %79 = load i32, i32* %num, align 4 + %conv94 = sext i32 %79 to i64 + %mul95 = mul i64 %conv94, 1 + %call96 = call i32 @cudaMemset(i8* %78, i32 0, i64 %mul95) + store i32 %call96, i32* %err93, align 4 + %80 = load i32, i32* %err93, align 4 + %cmp97 = icmp ne i32 0, %80 + br i1 %cmp97, label %if.then98, label %if.end101 + +if.then98: ; preds = %do.body92 + %81 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %82 = load i32, i32* %err93, align 4 + %call99 = call i8* @cudaGetErrorString(i32 %82) + %call100 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %81, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str, i64 0, i64 0), i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.1, i64 0, i64 0), i32 213, i8* %call99) + call void @exit(i32 1) #15 + unreachable + +if.end101: ; preds = %do.body92 + br label %do.end102 + +do.end102: ; preds = %if.end101 + br label %do.body103 + +do.body103: ; preds = %do.end102 + %83 = load float*, float** @work_mem_d, align 8 + %84 = bitcast float* %83 to i8* + %85 = load i32, i32* %stride, align 4 + %86 = load i32, i32* %nThread, align 4 + %add105 = add nsw i32 %86, 1 + %mul106 = mul nsw i32 %85, %add105 + %conv107 = sext i32 %mul106 to i64 + %mul108 = mul i64 %conv107, 4 + %call109 = call i32 @cudaMemset(i8* %84, i32 0, i64 %mul108) + store i32 %call109, i32* %err104, align 4 + %87 = load i32, i32* %err104, align 4 + %cmp110 = icmp ne i32 0, %87 + br i1 %cmp110, label %if.then111, label %if.end114 + +if.then111: ; preds = %do.body103 + %88 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %89 = load i32, i32* %err104, align 4 + %call112 = call i8* @cudaGetErrorString(i32 %89) + %call113 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %88, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str, i64 0, i64 0), i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.1, i64 0, i64 0), i32 215, i8* %call112) + call void @exit(i32 1) #15 + unreachable + +if.end114: ; preds = %do.body103 + br label %do.end115 + +do.end115: ; preds = %if.end114 + %90 = load i32, i32* %num, align 4 + %add116 = add nsw i32 %90, 512 + %sub = sub nsw i32 %add116, 1 + %conv117 = sitofp i32 %sub to float + %div = fdiv float %conv117, 5.120000e+02 + %conv118 = fptosi float %div to i32 + store i32 %conv118, i32* %num_blocks, align 4 + %91 = load i32, i32* %num_blocks, align 4 + %add119 = add nsw i32 %91, 65536 + %sub120 = sub nsw i32 %add119, 1 + %conv121 = sitofp i32 %sub120 to float + %div122 = fdiv float %conv121, 6.553600e+04 + %conv123 = fptosi float %div122 to i32 + store i32 %conv123, i32* %num_blocks_y, align 4 + %92 = load i32, i32* %num_blocks, align 4 + %93 = load i32, i32* %num_blocks_y, align 4 + %add124 = add nsw i32 %92, %93 + %sub125 = sub nsw i32 %add124, 1 + %conv126 = sitofp i32 %sub125 to float + %94 = load i32, i32* %num_blocks_y, align 4 + %conv127 = sitofp i32 %94 to float + %div128 = fdiv float %conv126, %conv127 + %conv129 = fptosi float %div128 to i32 + store i32 %conv129, i32* %num_blocks_x, align 4 + %95 = load i32, i32* %num_blocks_x, align 4 + %96 = load i32, i32* %num_blocks_y, align 4 + call void @_ZN4dim3C2Ejjj(%struct.dim3* %grid_size, i32 %95, i32 %96, i32 1) + %97 = bitcast %struct.dim3* %agg.tmp to i8* + %98 = bitcast %struct.dim3* %grid_size to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %97, i8* align 4 %98, i64 12, i1 false) + call void @_ZN4dim3C2Ejjj(%struct.dim3* %agg.tmp130, i32 512, i32 1, i32 1) + %99 = bitcast { i64, i32 }* %agg.tmp.coerce to i8* + %100 = bitcast %struct.dim3* %agg.tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %99, i8* align 4 %100, i64 12, i1 false) + %101 = getelementptr inbounds { i64, i32 }, { i64, i32 }* 
%agg.tmp.coerce, i32 0, i32 0 + %102 = load i64, i64* %101, align 4 + %103 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 1 + %104 = load i32, i32* %103, align 4 + %105 = bitcast { i64, i32 }* %agg.tmp130.coerce to i8* + %106 = bitcast %struct.dim3* %agg.tmp130 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %105, i8* align 4 %106, i64 12, i1 false) + %107 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp130.coerce, i32 0, i32 0 + %108 = load i64, i64* %107, align 4 + %109 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp130.coerce, i32 0, i32 1 + %110 = load i32, i32* %109, align 4 + %call131 = call i32 @__cudaPushCallConfiguration(i64 %102, i32 %104, i64 %108, i32 %110, i64 0, i8* null) + %tobool132 = icmp ne i32 %call131, 0 + br i1 %tobool132, label %kcall.end, label %kcall.configok + +kcall.configok: ; preds = %do.end115 + %111 = load i32, i32* %num, align 4 + %112 = load i32, i32* %dim, align 4 + %113 = load i64, i64* %x.addr, align 8 + %114 = load %struct.Point*, %struct.Point** @p, align 8 + %115 = load i32, i32* %K, align 4 + %116 = load i32, i32* %stride, align 4 + %117 = load float*, float** @coord_d, align 8 + %118 = load float*, float** @work_mem_d, align 8 + %119 = load i32*, i32** @center_table_d, align 8 + %120 = load i8*, i8** @switch_membership_d, align 8 + call void @_Z19kernel_compute_costiilP5PointiiPfS1_PiPb(i32 %111, i32 %112, i64 %113, %struct.Point* %114, i32 %115, i32 %116, float* %117, float* %118, i32* %119, i8* %120) + br label %kcall.end + +kcall.end: ; preds = %kcall.configok, %do.end115 + %call133 = call i32 @cudaThreadSynchronize() + %call134 = call i32 @cudaGetLastError() + store i32 %call134, i32* %error, align 4 + %121 = load i32, i32* %error, align 4 + %cmp135 = icmp ne i32 %121, 0 + br i1 %cmp135, label %if.then136, label %if.end139 + +if.then136: ; preds = %kcall.end + %122 = load i32, i32* %error, align 4 + %call137 = call i8* @cudaGetErrorString(i32 %122) + 
%call138 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str.2, i64 0, i64 0), i8* %call137) + call void @exit(i32 1) #15 + unreachable + +if.end139: ; preds = %kcall.end + br label %do.body140 + +do.body140: ; preds = %if.end139 + %123 = load float*, float** @work_mem_h, align 8 + %124 = bitcast float* %123 to i8* + %125 = load float*, float** @work_mem_d, align 8 + %126 = bitcast float* %125 to i8* + %127 = load i32, i32* %stride, align 4 + %128 = load i32, i32* %nThread, align 4 + %add142 = add nsw i32 %128, 1 + %mul143 = mul nsw i32 %127, %add142 + %conv144 = sext i32 %mul143 to i64 + %mul145 = mul i64 %conv144, 4 + %call146 = call i32 @cudaMemcpy(i8* %124, i8* %126, i64 %mul145, i32 2) + store i32 %call146, i32* %err141, align 4 + %129 = load i32, i32* %err141, align 4 + %cmp147 = icmp ne i32 0, %129 + br i1 %cmp147, label %if.then148, label %if.end151 + +if.then148: ; preds = %do.body140 + %130 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %131 = load i32, i32* %err141, align 4 + %call149 = call i8* @cudaGetErrorString(i32 %131) + %call150 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %130, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str, i64 0, i64 0), i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.1, i64 0, i64 0), i32 273, i8* %call149) + call void @exit(i32 1) #15 + unreachable + +if.end151: ; preds = %do.body140 + br label %do.end152 + +do.end152: ; preds = %if.end151 + br label %do.body153 + +do.body153: ; preds = %do.end152 + %132 = load i8*, i8** %switch_membership.addr, align 8 + %133 = load i8*, i8** @switch_membership_d, align 8 + %134 = load i32, i32* %num, align 4 + %conv155 = sext i32 %134 to i64 + %mul156 = mul i64 %conv155, 1 + %call157 = call i32 @cudaMemcpy(i8* %132, i8* %133, i64 %mul156, i32 2) + store i32 %call157, i32* %err154, align 4 + %135 = load i32, i32* %err154, align 4 + %cmp158 = icmp ne i32 0, %135 + br i1 %cmp158, label %if.then159, label %if.end162 + +if.then159: ; preds = %do.body153 + %136 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %137 = load i32, i32* %err154, align 4 + %call160 = call i8* @cudaGetErrorString(i32 %137) + %call161 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %136, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str, i64 0, i64 0), i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.1, i64 0, i64 0), i32 275, i8* %call160) + call void @exit(i32 1) #15 + unreachable + +if.end162: ; preds = %do.body153 + br label %do.end163 + +do.end163: ; preds = %if.end162 + store i32 0, i32* %number_of_centers_to_close, align 4 + %138 = load float, float* %z.addr, align 4 + store float %138, float* %gl_cost_of_opening_x, align 4 + %139 = load float*, float** @work_mem_h, align 8 + %140 = load i32, i32* %stride, align 4 + %141 = load i32, i32* %nThread, align 4 + %mul164 = mul nsw i32 %140, %141 + %idxprom165 = sext i32 %mul164 to i64 + %arrayidx166 = getelementptr inbounds float, float* %139, i64 %idxprom165 + store float* %arrayidx166, float** %gl_lower, align 8 + store i32 0, i32* %i167, align 4 + br label %for.cond168 + +for.cond168: ; preds = %for.inc208, %do.end163 + %142 = load i32, i32* %i167, align 4 + %143 = load i32, i32* %num, align 4 + %cmp169 = icmp slt i32 %142, %143 + br i1 %cmp169, label %for.body170, label %for.end210 + +for.body170: ; preds = %for.cond168 + %144 = load i8*, i8** %is_center.addr, align 8 + %145 = load i32, i32* %i167, align 4 + %idxprom171 = sext i32 %145 to i64 + %arrayidx172 = getelementptr inbounds i8, i8* %144, i64 %idxprom171 + %146 = load i8, i8* %arrayidx172, align 1 + %tobool173 = trunc i8 %146 to i1 + br i1 %tobool173, label %if.then174, label %if.end202 + +if.then174: ; preds = %for.body170 + %147 = load float, float* %z.addr, align 4 + store float %147, float* %low, align 4 + store i32 0, i32* %j175, align 4 + br label %for.cond176 + +for.cond176: ; preds = %for.inc186, %if.then174 + %148 = load i32, i32* %j175, align 4 + %149 = load i32, i32* %num, align 4 + %cmp177 = icmp slt i32 %148, %149 + br i1 %cmp177, label %for.body178, label %for.end188 + +for.body178: ; preds = %for.cond176 + %150 = load float*, float** @work_mem_h, align 8 + %151 = load i32, 
i32* %j175, align 4 + %152 = load i32, i32* %stride, align 4 + %mul179 = mul nsw i32 %151, %152 + %153 = load i32*, i32** %center_table.addr, align 8 + %154 = load i32, i32* %i167, align 4 + %idxprom180 = sext i32 %154 to i64 + %arrayidx181 = getelementptr inbounds i32, i32* %153, i64 %idxprom180 + %155 = load i32, i32* %arrayidx181, align 4 + %add182 = add nsw i32 %mul179, %155 + %idxprom183 = sext i32 %add182 to i64 + %arrayidx184 = getelementptr inbounds float, float* %150, i64 %idxprom183 + %156 = load float, float* %arrayidx184, align 4 + %157 = load float, float* %low, align 4 + %add185 = fadd contract float %157, %156 + store float %add185, float* %low, align 4 + br label %for.inc186 + +for.inc186: ; preds = %for.body178 + %158 = load i32, i32* %j175, align 4 + %inc187 = add nsw i32 %158, 1 + store i32 %inc187, i32* %j175, align 4 + br label %for.cond176 + +for.end188: ; preds = %for.cond176 + %159 = load float, float* %low, align 4 + %160 = load float*, float** %gl_lower, align 8 + %161 = load i32*, i32** %center_table.addr, align 8 + %162 = load i32, i32* %i167, align 4 + %idxprom189 = sext i32 %162 to i64 + %arrayidx190 = getelementptr inbounds i32, i32* %161, i64 %idxprom189 + %163 = load i32, i32* %arrayidx190, align 4 + %idxprom191 = sext i32 %163 to i64 + %arrayidx192 = getelementptr inbounds float, float* %160, i64 %idxprom191 + store float %159, float* %arrayidx192, align 4 + %164 = load float, float* %low, align 4 + %cmp193 = fcmp ogt float %164, 0.000000e+00 + br i1 %cmp193, label %if.then194, label %if.end201 + +if.then194: ; preds = %for.end188 + %165 = load i32, i32* %number_of_centers_to_close, align 4 + %inc195 = add nsw i32 %165, 1 + store i32 %inc195, i32* %number_of_centers_to_close, align 4 + %166 = load float, float* %low, align 4 + %167 = load float*, float** @work_mem_h, align 8 + %168 = load i32, i32* %i167, align 4 + %169 = load i32, i32* %stride, align 4 + %mul196 = mul nsw i32 %168, %169 + %170 = load i32, i32* %K, align 4 + 
%add197 = add nsw i32 %mul196, %170 + %idxprom198 = sext i32 %add197 to i64 + %arrayidx199 = getelementptr inbounds float, float* %167, i64 %idxprom198 + %171 = load float, float* %arrayidx199, align 4 + %sub200 = fsub contract float %171, %166 + store float %sub200, float* %arrayidx199, align 4 + br label %if.end201 + +if.end201: ; preds = %if.then194, %for.end188 + br label %if.end202 + +if.end202: ; preds = %if.end201, %for.body170 + %172 = load float*, float** @work_mem_h, align 8 + %173 = load i32, i32* %i167, align 4 + %174 = load i32, i32* %stride, align 4 + %mul203 = mul nsw i32 %173, %174 + %175 = load i32, i32* %K, align 4 + %add204 = add nsw i32 %mul203, %175 + %idxprom205 = sext i32 %add204 to i64 + %arrayidx206 = getelementptr inbounds float, float* %172, i64 %idxprom205 + %176 = load float, float* %arrayidx206, align 4 + %177 = load float, float* %gl_cost_of_opening_x, align 4 + %add207 = fadd contract float %177, %176 + store float %add207, float* %gl_cost_of_opening_x, align 4 + br label %for.inc208 + +for.inc208: ; preds = %if.end202 + %178 = load i32, i32* %i167, align 4 + %inc209 = add nsw i32 %178, 1 + store i32 %inc209, i32* %i167, align 4 + br label %for.cond168 + +for.end210: ; preds = %for.cond168 + %179 = load float, float* %gl_cost_of_opening_x, align 4 + %cmp211 = fcmp olt float %179, 0.000000e+00 + br i1 %cmp211, label %if.then212, label %if.else + +if.then212: ; preds = %for.end210 + store i32 0, i32* %i213, align 4 + br label %for.cond214 + +for.cond214: ; preds = %for.inc251, %if.then212 + %180 = load i32, i32* %i213, align 4 + %181 = load i32, i32* %num, align 4 + %cmp215 = icmp slt i32 %180, %181 + br i1 %cmp215, label %for.body216, label %for.end253 + +for.body216: ; preds = %for.cond214 + %182 = load float*, float** %gl_lower, align 8 + %183 = load i32*, i32** %center_table.addr, align 8 + %184 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p217 = getelementptr inbounds %struct.Points, %struct.Points* %184, i32 
0, i32 2 + %185 = load %struct.Point*, %struct.Point** %p217, align 8 + %186 = load i32, i32* %i213, align 4 + %idxprom218 = sext i32 %186 to i64 + %arrayidx219 = getelementptr inbounds %struct.Point, %struct.Point* %185, i64 %idxprom218 + %assign = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx219, i32 0, i32 2 + %187 = load i64, i64* %assign, align 8 + %arrayidx220 = getelementptr inbounds i32, i32* %183, i64 %187 + %188 = load i32, i32* %arrayidx220, align 4 + %idxprom221 = sext i32 %188 to i64 + %arrayidx222 = getelementptr inbounds float, float* %182, i64 %idxprom221 + %189 = load float, float* %arrayidx222, align 4 + %cmp223 = fcmp ogt float %189, 0.000000e+00 + %frombool224 = zext i1 %cmp223 to i8 + store i8 %frombool224, i8* %close_center, align 1 + %190 = load i8*, i8** %switch_membership.addr, align 8 + %191 = load i32, i32* %i213, align 4 + %idxprom225 = sext i32 %191 to i64 + %arrayidx226 = getelementptr inbounds i8, i8* %190, i64 %idxprom225 + %192 = load i8, i8* %arrayidx226, align 1 + %tobool227 = trunc i8 %192 to i1 + br i1 %tobool227, label %if.then230, label %lor.lhs.false228 + +lor.lhs.false228: ; preds = %for.body216 + %193 = load i8, i8* %close_center, align 1 + %tobool229 = trunc i8 %193 to i1 + br i1 %tobool229, label %if.then230, label %if.end250 + +if.then230: ; preds = %lor.lhs.false228, %for.body216 + %194 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p232 = getelementptr inbounds %struct.Points, %struct.Points* %194, i32 0, i32 2 + %195 = load %struct.Point*, %struct.Point** %p232, align 8 + %196 = load i32, i32* %i213, align 4 + %idxprom233 = sext i32 %196 to i64 + %arrayidx234 = getelementptr inbounds %struct.Point, %struct.Point* %195, i64 %idxprom233 + %197 = bitcast %struct.Point* %agg.tmp231 to i8* + %198 = bitcast %struct.Point* %arrayidx234 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %197, i8* align 8 %198, i64 32, i1 false) + %199 = load %struct.Points*, %struct.Points** 
%points.addr, align 8 + %p236 = getelementptr inbounds %struct.Points, %struct.Points* %199, i32 0, i32 2 + %200 = load %struct.Point*, %struct.Point** %p236, align 8 + %201 = load i64, i64* %x.addr, align 8 + %arrayidx237 = getelementptr inbounds %struct.Point, %struct.Point* %200, i64 %201 + %202 = bitcast %struct.Point* %agg.tmp235 to i8* + %203 = bitcast %struct.Point* %arrayidx237 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %202, i8* align 8 %203, i64 32, i1 false) + %204 = load i32, i32* %dim, align 4 + %call238 = call float @_Z4dist5PointS_i(%struct.Point* byval(%struct.Point) align 8 %agg.tmp231, %struct.Point* byval(%struct.Point) align 8 %agg.tmp235, i32 %204) + %205 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p239 = getelementptr inbounds %struct.Points, %struct.Points* %205, i32 0, i32 2 + %206 = load %struct.Point*, %struct.Point** %p239, align 8 + %207 = load i32, i32* %i213, align 4 + %idxprom240 = sext i32 %207 to i64 + %arrayidx241 = getelementptr inbounds %struct.Point, %struct.Point* %206, i64 %idxprom240 + %weight = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx241, i32 0, i32 0 + %208 = load float, float* %weight, align 8 + %mul242 = fmul contract float %call238, %208 + %209 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p243 = getelementptr inbounds %struct.Points, %struct.Points* %209, i32 0, i32 2 + %210 = load %struct.Point*, %struct.Point** %p243, align 8 + %211 = load i32, i32* %i213, align 4 + %idxprom244 = sext i32 %211 to i64 + %arrayidx245 = getelementptr inbounds %struct.Point, %struct.Point* %210, i64 %idxprom244 + %cost = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx245, i32 0, i32 3 + store float %mul242, float* %cost, align 8 + %212 = load i64, i64* %x.addr, align 8 + %213 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p246 = getelementptr inbounds %struct.Points, %struct.Points* %213, i32 0, i32 2 + %214 = load 
%struct.Point*, %struct.Point** %p246, align 8 + %215 = load i32, i32* %i213, align 4 + %idxprom247 = sext i32 %215 to i64 + %arrayidx248 = getelementptr inbounds %struct.Point, %struct.Point* %214, i64 %idxprom247 + %assign249 = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx248, i32 0, i32 2 + store i64 %212, i64* %assign249, align 8 + br label %if.end250 + +if.end250: ; preds = %if.then230, %lor.lhs.false228 + br label %for.inc251 + +for.inc251: ; preds = %if.end250 + %216 = load i32, i32* %i213, align 4 + %inc252 = add nsw i32 %216, 1 + store i32 %inc252, i32* %i213, align 4 + br label %for.cond214 + +for.end253: ; preds = %for.cond214 + store i32 0, i32* %i254, align 4 + br label %for.cond255 + +for.cond255: ; preds = %for.inc270, %for.end253 + %217 = load i32, i32* %i254, align 4 + %218 = load i32, i32* %num, align 4 + %cmp256 = icmp slt i32 %217, %218 + br i1 %cmp256, label %for.body257, label %for.end272 + +for.body257: ; preds = %for.cond255 + %219 = load i8*, i8** %is_center.addr, align 8 + %220 = load i32, i32* %i254, align 4 + %idxprom258 = sext i32 %220 to i64 + %arrayidx259 = getelementptr inbounds i8, i8* %219, i64 %idxprom258 + %221 = load i8, i8* %arrayidx259, align 1 + %tobool260 = trunc i8 %221 to i1 + br i1 %tobool260, label %land.lhs.true, label %if.end269 + +land.lhs.true: ; preds = %for.body257 + %222 = load float*, float** %gl_lower, align 8 + %223 = load i32*, i32** %center_table.addr, align 8 + %224 = load i32, i32* %i254, align 4 + %idxprom261 = sext i32 %224 to i64 + %arrayidx262 = getelementptr inbounds i32, i32* %223, i64 %idxprom261 + %225 = load i32, i32* %arrayidx262, align 4 + %idxprom263 = sext i32 %225 to i64 + %arrayidx264 = getelementptr inbounds float, float* %222, i64 %idxprom263 + %226 = load float, float* %arrayidx264, align 4 + %cmp265 = fcmp ogt float %226, 0.000000e+00 + br i1 %cmp265, label %if.then266, label %if.end269 + +if.then266: ; preds = %land.lhs.true + %227 = load i8*, i8** %is_center.addr, align 
8 + %228 = load i32, i32* %i254, align 4 + %idxprom267 = sext i32 %228 to i64 + %arrayidx268 = getelementptr inbounds i8, i8* %227, i64 %idxprom267 + store i8 0, i8* %arrayidx268, align 1 + br label %if.end269 + +if.end269: ; preds = %if.then266, %land.lhs.true, %for.body257 + br label %for.inc270 + +for.inc270: ; preds = %if.end269 + %229 = load i32, i32* %i254, align 4 + %inc271 = add nsw i32 %229, 1 + store i32 %inc271, i32* %i254, align 4 + br label %for.cond255 + +for.end272: ; preds = %for.cond255 + %230 = load i64, i64* %x.addr, align 8 + %cmp273 = icmp sge i64 %230, 0 + br i1 %cmp273, label %land.lhs.true274, label %if.end279 + +land.lhs.true274: ; preds = %for.end272 + %231 = load i64, i64* %x.addr, align 8 + %232 = load i32, i32* %num, align 4 + %conv275 = sext i32 %232 to i64 + %cmp276 = icmp slt i64 %231, %conv275 + br i1 %cmp276, label %if.then277, label %if.end279 + +if.then277: ; preds = %land.lhs.true274 + %233 = load i8*, i8** %is_center.addr, align 8 + %234 = load i64, i64* %x.addr, align 8 + %arrayidx278 = getelementptr inbounds i8, i8* %233, i64 %234 + store i8 1, i8* %arrayidx278, align 1 + br label %if.end279 + +if.end279: ; preds = %if.then277, %land.lhs.true274, %for.end272 + %235 = load i64*, i64** %numcenters.addr, align 8 + %236 = load i64, i64* %235, align 8 + %add280 = add nsw i64 %236, 1 + %237 = load i32, i32* %number_of_centers_to_close, align 4 + %conv281 = sext i32 %237 to i64 + %sub282 = sub nsw i64 %add280, %conv281 + %238 = load i64*, i64** %numcenters.addr, align 8 + store i64 %sub282, i64* %238, align 8 + br label %if.end283 + +if.else: ; preds = %for.end210 + store float 0.000000e+00, float* %gl_cost_of_opening_x, align 4 + br label %if.end283 + +if.end283: ; preds = %if.else, %if.end279 + %239 = load float*, float** @work_mem_h, align 8 + %240 = bitcast float* %239 to i8* + call void @free(i8* %240) #2 + br label %do.body284 + +do.body284: ; preds = %if.end283 + %241 = load float*, float** @work_mem_d, align 8 + %242 = 
bitcast float* %241 to i8* + %call286 = call i32 @cudaFree(i8* %242) + store i32 %call286, i32* %err285, align 4 + %243 = load i32, i32* %err285, align 4 + %cmp287 = icmp ne i32 0, %243 + br i1 %cmp287, label %if.then288, label %if.end291 + +if.then288: ; preds = %do.body284 + %244 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %245 = load i32, i32* %err285, align 4 + %call289 = call i8* @cudaGetErrorString(i32 %245) + %call290 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %244, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str, i64 0, i64 0), i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.1, i64 0, i64 0), i32 353, i8* %call289) + call void @exit(i32 1) #15 + unreachable + +if.end291: ; preds = %do.body284 + br label %do.end292 + +do.end292: ; preds = %if.end291 + %246 = load i32, i32* @_ZL4iter, align 4 + %inc293 = add nsw i32 %246, 1 + store i32 %inc293, i32* @_ZL4iter, align 4 + %247 = load float, float* %gl_cost_of_opening_x, align 4 + %fneg = fneg float %247 + ret float %fneg +} + +declare dso_local i32 @cudaMemcpy(i8*, i8*, i64, i32) #1 + +declare dso_local i32 @cudaMemset(i8*, i32, i64) #1 + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN4dim3C2Ejjj(%struct.dim3* %this, i32 %vx, i32 %vy, i32 %vz) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %struct.dim3*, align 8 + %vx.addr = alloca i32, align 4 + %vy.addr = alloca i32, align 4 + %vz.addr = alloca i32, align 4 + store %struct.dim3* %this, %struct.dim3** %this.addr, align 8 + store i32 %vx, i32* %vx.addr, align 4 + store i32 %vy, i32* %vy.addr, align 4 + store i32 %vz, i32* %vz.addr, align 4 + %this1 = load %struct.dim3*, %struct.dim3** %this.addr, align 8 + %x = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 0 + %0 = load i32, i32* %vx.addr, align 4 + store i32 %0, i32* %x, align 4 + %y = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 1 + %1 = 
load i32, i32* %vy.addr, align 4 + store i32 %1, i32* %y, align 4 + %z = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 2 + %2 = load i32, i32* %vz.addr, align 4 + store i32 %2, i32* %z, align 4 + ret void +} + +declare dso_local i32 @__cudaPushCallConfiguration(i64, i32, i64, i32, i64, i8*) #1 + +declare dso_local i32 @cudaThreadSynchronize() #1 + +declare dso_local i32 @cudaGetLastError() #1 + +declare dso_local i32 @printf(i8*, ...) #1 + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local float @_Z4dist5PointS_i(%struct.Point* byval(%struct.Point) align 8 %p1, %struct.Point* byval(%struct.Point) align 8 %p2, i32 %dim) #6 { +entry: + %dim.addr = alloca i32, align 4 + %i = alloca i32, align 4 + %result = alloca float, align 4 + store i32 %dim, i32* %dim.addr, align 4 + store float 0.000000e+00, float* %result, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %1 = load i32, i32* %dim.addr, align 4 + %cmp = icmp slt i32 %0, %1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %coord = getelementptr inbounds %struct.Point, %struct.Point* %p1, i32 0, i32 1 + %2 = load float*, float** %coord, align 8 + %3 = load i32, i32* %i, align 4 + %idxprom = sext i32 %3 to i64 + %arrayidx = getelementptr inbounds float, float* %2, i64 %idxprom + %4 = load float, float* %arrayidx, align 4 + %coord1 = getelementptr inbounds %struct.Point, %struct.Point* %p2, i32 0, i32 1 + %5 = load float*, float** %coord1, align 8 + %6 = load i32, i32* %i, align 4 + %idxprom2 = sext i32 %6 to i64 + %arrayidx3 = getelementptr inbounds float, float* %5, i64 %idxprom2 + %7 = load float, float* %arrayidx3, align 4 + %sub = fsub contract float %4, %7 + %coord4 = getelementptr inbounds %struct.Point, %struct.Point* %p1, i32 0, i32 1 + %8 = load float*, float** %coord4, align 8 + %9 = load i32, i32* %i, align 4 + %idxprom5 = sext i32 %9 to i64 + 
%arrayidx6 = getelementptr inbounds float, float* %8, i64 %idxprom5 + %10 = load float, float* %arrayidx6, align 4 + %coord7 = getelementptr inbounds %struct.Point, %struct.Point* %p2, i32 0, i32 1 + %11 = load float*, float** %coord7, align 8 + %12 = load i32, i32* %i, align 4 + %idxprom8 = sext i32 %12 to i64 + %arrayidx9 = getelementptr inbounds float, float* %11, i64 %idxprom8 + %13 = load float, float* %arrayidx9, align 4 + %sub10 = fsub contract float %10, %13 + %mul = fmul contract float %sub, %sub10 + %14 = load float, float* %result, align 4 + %add = fadd contract float %14, %mul + store float %add, float* %result, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %15 = load i32, i32* %i, align 4 + %inc = add nsw i32 %15, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %16 = load float, float* %result, align 4 + ret float %16 +} + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z9inttofileiPc(i32 %data, i8* %filename) #3 { +entry: + %data.addr = alloca i32, align 4 + %filename.addr = alloca i8*, align 8 + %fp = alloca %struct._IO_FILE*, align 8 + store i32 %data, i32* %data.addr, align 4 + store i8* %filename, i8** %filename.addr, align 8 + %0 = load i8*, i8** %filename.addr, align 8 + %call = call %struct._IO_FILE* @fopen(i8* %0, i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.3, i64 0, i64 0)) + store %struct._IO_FILE* %call, %struct._IO_FILE** %fp, align 8 + %1 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8 + %2 = load i32, i32* %data.addr, align 4 + %call1 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str.4, i64 0, i64 0), i32 %2) + %3 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8 + %call2 = call i32 @fclose(%struct._IO_FILE* %3) + ret void +} + +declare dso_local %struct._IO_FILE* @fopen(i8*, i8*) #1 + +declare dso_local i32 @fclose(%struct._IO_FILE*) #1 + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local double @_Z7gettimev() #6 { +entry: + %t = alloca %struct.timeval, align 8 + %call = call i32 @gettimeofday(%struct.timeval* %t, %struct.timezone* null) #2 + %tv_sec = getelementptr inbounds %struct.timeval, %struct.timeval* %t, i32 0, i32 0 + %0 = load i64, i64* %tv_sec, align 8 + %conv = sitofp i64 %0 to double + %tv_usec = getelementptr inbounds %struct.timeval, %struct.timeval* %t, i32 0, i32 1 + %1 = load i64, i64* %tv_usec, align 8 + %conv1 = sitofp i64 %1 to double + %mul = fmul contract double %conv1, 0x3EB0C6F7A0B5ED8D + %add = fadd contract double %conv, %mul + ret double %add +} + +; Function Attrs: nounwind +declare dso_local i32 @gettimeofday(%struct.timeval*, %struct.timezone*) #7 + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local i32 @_Z11isIdenticalPfS_i(float* %i, float* %j, i32 %D) #6 { +entry: + %retval = alloca i32, align 4 + %i.addr = alloca float*, align 8 + %j.addr = alloca float*, align 8 + %D.addr = alloca i32, align 4 + %a = alloca i32, align 4 + %equal = alloca i32, align 4 + store float* %i, float** %i.addr, align 8 + store float* %j, float** %j.addr, align 8 + store i32 %D, i32* %D.addr, align 4 + store i32 0, i32* %a, align 4 + store i32 1, i32* %equal, align 4 + br label %while.cond + +while.cond: ; preds = %if.end, %entry + %0 = load i32, i32* %equal, align 4 + %tobool = icmp ne i32 %0, 0 + br i1 %tobool, label %land.rhs, label %land.end + +land.rhs: ; preds = %while.cond + %1 = load i32, i32* %a, align 4 + %2 = load i32, i32* %D.addr, align 4 + %cmp = icmp slt i32 %1, %2 + br label %land.end + 
+land.end: ; preds = %land.rhs, %while.cond + %3 = phi i1 [ false, %while.cond ], [ %cmp, %land.rhs ] + br i1 %3, label %while.body, label %while.end + +while.body: ; preds = %land.end + %4 = load float*, float** %i.addr, align 8 + %5 = load i32, i32* %a, align 4 + %idxprom = sext i32 %5 to i64 + %arrayidx = getelementptr inbounds float, float* %4, i64 %idxprom + %6 = load float, float* %arrayidx, align 4 + %7 = load float*, float** %j.addr, align 8 + %8 = load i32, i32* %a, align 4 + %idxprom1 = sext i32 %8 to i64 + %arrayidx2 = getelementptr inbounds float, float* %7, i64 %idxprom1 + %9 = load float, float* %arrayidx2, align 4 + %cmp3 = fcmp une float %6, %9 + br i1 %cmp3, label %if.then, label %if.else + +if.then: ; preds = %while.body + store i32 0, i32* %equal, align 4 + br label %if.end + +if.else: ; preds = %while.body + %10 = load i32, i32* %a, align 4 + %inc = add nsw i32 %10, 1 + store i32 %inc, i32* %a, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + br label %while.cond + +while.end: ; preds = %land.end + %11 = load i32, i32* %equal, align 4 + %tobool4 = icmp ne i32 %11, 0 + br i1 %tobool4, label %if.then5, label %if.else6 + +if.then5: ; preds = %while.end + store i32 1, i32* %retval, align 4 + br label %return + +if.else6: ; preds = %while.end + store i32 0, i32* %retval, align 4 + br label %return + +return: ; preds = %if.else6, %if.then5 + %12 = load i32, i32* %retval, align 4 + ret i32 %12 +} + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local void @_Z7shuffleP6Points(%struct.Points* %points) #6 { +entry: + %points.addr = alloca %struct.Points*, align 8 + %t1 = alloca double, align 8 + %i = alloca i64, align 8 + %j = alloca i64, align 8 + %temp = alloca %struct.Point, align 8 + %t2 = alloca double, align 8 + store %struct.Points* %points, %struct.Points** %points.addr, align 8 + %call = call double @_Z7gettimev() + store double %call, double* %t1, align 8 + store i64 0, i64* %i, align 8 + br label %for.cond 
+ +for.cond: ; preds = %for.inc, %entry + %0 = load i64, i64* %i, align 8 + %1 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %num = getelementptr inbounds %struct.Points, %struct.Points* %1, i32 0, i32 0 + %2 = load i64, i64* %num, align 8 + %sub = sub nsw i64 %2, 1 + %cmp = icmp slt i64 %0, %sub + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %call1 = call i64 @lrand48() #2 + %3 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %num2 = getelementptr inbounds %struct.Points, %struct.Points* %3, i32 0, i32 0 + %4 = load i64, i64* %num2, align 8 + %5 = load i64, i64* %i, align 8 + %sub3 = sub nsw i64 %4, %5 + %rem = srem i64 %call1, %sub3 + %6 = load i64, i64* %i, align 8 + %add = add nsw i64 %rem, %6 + store i64 %add, i64* %j, align 8 + %7 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p = getelementptr inbounds %struct.Points, %struct.Points* %7, i32 0, i32 2 + %8 = load %struct.Point*, %struct.Point** %p, align 8 + %9 = load i64, i64* %i, align 8 + %arrayidx = getelementptr inbounds %struct.Point, %struct.Point* %8, i64 %9 + %10 = bitcast %struct.Point* %temp to i8* + %11 = bitcast %struct.Point* %arrayidx to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %10, i8* align 8 %11, i64 32, i1 false) + %12 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p4 = getelementptr inbounds %struct.Points, %struct.Points* %12, i32 0, i32 2 + %13 = load %struct.Point*, %struct.Point** %p4, align 8 + %14 = load i64, i64* %j, align 8 + %arrayidx5 = getelementptr inbounds %struct.Point, %struct.Point* %13, i64 %14 + %15 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p6 = getelementptr inbounds %struct.Points, %struct.Points* %15, i32 0, i32 2 + %16 = load %struct.Point*, %struct.Point** %p6, align 8 + %17 = load i64, i64* %i, align 8 + %arrayidx7 = getelementptr inbounds %struct.Point, %struct.Point* %16, i64 %17 + %18 = bitcast %struct.Point* %arrayidx7 
to i8* + %19 = bitcast %struct.Point* %arrayidx5 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %18, i8* align 8 %19, i64 32, i1 false) + %20 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p8 = getelementptr inbounds %struct.Points, %struct.Points* %20, i32 0, i32 2 + %21 = load %struct.Point*, %struct.Point** %p8, align 8 + %22 = load i64, i64* %j, align 8 + %arrayidx9 = getelementptr inbounds %struct.Point, %struct.Point* %21, i64 %22 + %23 = bitcast %struct.Point* %arrayidx9 to i8* + %24 = bitcast %struct.Point* %temp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %23, i8* align 8 %24, i64 32, i1 false) + br label %for.inc + +for.inc: ; preds = %for.body + %25 = load i64, i64* %i, align 8 + %inc = add nsw i64 %25, 1 + store i64 %inc, i64* %i, align 8 + br label %for.cond + +for.end: ; preds = %for.cond + %call10 = call double @_Z7gettimev() + store double %call10, double* %t2, align 8 + %26 = load double, double* %t2, align 8 + %27 = load double, double* %t1, align 8 + %sub11 = fsub contract double %26, %27 + %28 = load double, double* @time_shuffle, align 8 + %add12 = fadd contract double %28, %sub11 + store double %add12, double* @time_shuffle, align 8 + ret void +} + +; Function Attrs: nounwind +declare dso_local i64 @lrand48() #7 + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local void @_Z10intshufflePii(i32* %intarray, i32 %length) #6 { +entry: + %intarray.addr = alloca i32*, align 8 + %length.addr = alloca i32, align 4 + %t1 = alloca double, align 8 + %i = alloca i64, align 8 + %j = alloca i64, align 8 + %temp = alloca i32, align 4 + %t2 = alloca double, align 8 + store i32* %intarray, i32** %intarray.addr, align 8 + store i32 %length, i32* %length.addr, align 4 + %call = call double @_Z7gettimev() + store double %call, double* %t1, align 8 + store i64 0, i64* %i, align 8 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i64, i64* %i, align 8 + %1 = load i32, i32* 
%length.addr, align 4 + %conv = sext i32 %1 to i64 + %cmp = icmp slt i64 %0, %conv + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %call1 = call i64 @lrand48() #2 + %2 = load i32, i32* %length.addr, align 4 + %conv2 = sext i32 %2 to i64 + %3 = load i64, i64* %i, align 8 + %sub = sub nsw i64 %conv2, %3 + %rem = srem i64 %call1, %sub + %4 = load i64, i64* %i, align 8 + %add = add nsw i64 %rem, %4 + store i64 %add, i64* %j, align 8 + %5 = load i32*, i32** %intarray.addr, align 8 + %6 = load i64, i64* %i, align 8 + %arrayidx = getelementptr inbounds i32, i32* %5, i64 %6 + %7 = load i32, i32* %arrayidx, align 4 + store i32 %7, i32* %temp, align 4 + %8 = load i32*, i32** %intarray.addr, align 8 + %9 = load i64, i64* %j, align 8 + %arrayidx3 = getelementptr inbounds i32, i32* %8, i64 %9 + %10 = load i32, i32* %arrayidx3, align 4 + %11 = load i32*, i32** %intarray.addr, align 8 + %12 = load i64, i64* %i, align 8 + %arrayidx4 = getelementptr inbounds i32, i32* %11, i64 %12 + store i32 %10, i32* %arrayidx4, align 4 + %13 = load i32, i32* %temp, align 4 + %14 = load i32*, i32** %intarray.addr, align 8 + %15 = load i64, i64* %j, align 8 + %arrayidx5 = getelementptr inbounds i32, i32* %14, i64 %15 + store i32 %13, i32* %arrayidx5, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %16 = load i64, i64* %i, align 8 + %inc = add nsw i64 %16, 1 + store i64 %inc, i64* %i, align 8 + br label %for.cond + +for.end: ; preds = %for.cond + %call6 = call double @_Z7gettimev() + store double %call6, double* %t2, align 8 + %17 = load double, double* %t2, align 8 + %18 = load double, double* %t1, align 8 + %sub7 = fsub contract double %17, %18 + %19 = load double, double* @time_shuffle, align 8 + %add8 = fadd contract double %19, %sub7 + store double %add8, double* @time_shuffle, align 8 + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local float @_Z7pspeedyP6PointsfPliP17pthread_barrier_t(%struct.Points* %points, 
float %z, i64* %kcenter, i32 %pid, %union.pthread_barrier_t* %barrier) #6 { +entry: + %points.addr = alloca %struct.Points*, align 8 + %z.addr = alloca float, align 4 + %kcenter.addr = alloca i64*, align 8 + %pid.addr = alloca i32, align 4 + %barrier.addr = alloca %union.pthread_barrier_t*, align 8 + %t1 = alloca double, align 8 + %bsize = alloca i64, align 8 + %k1 = alloca i64, align 8 + %k2 = alloca i64, align 8 + %k = alloca i32, align 4 + %distance = alloca float, align 4 + %agg.tmp = alloca %struct.Point, align 8 + %agg.tmp6 = alloca %struct.Point, align 8 + %k33 = alloca i32, align 4 + %distance39 = alloca float, align 4 + %agg.tmp40 = alloca %struct.Point, align 8 + %agg.tmp44 = alloca %struct.Point, align 8 + %to_open = alloca i8, align 1 + %k95 = alloca i32, align 4 + %distance101 = alloca float, align 4 + %agg.tmp102 = alloca %struct.Point, align 8 + %agg.tmp106 = alloca %struct.Point, align 8 + %mytotal = alloca float, align 4 + %k146 = alloca i32, align 4 + %i = alloca i32, align 4 + %t2 = alloca double, align 8 + store %struct.Points* %points, %struct.Points** %points.addr, align 8 + store float %z, float* %z.addr, align 4 + store i64* %kcenter, i64** %kcenter.addr, align 8 + store i32 %pid, i32* %pid.addr, align 4 + store %union.pthread_barrier_t* %barrier, %union.pthread_barrier_t** %barrier.addr, align 8 + %call = call double @_Z7gettimev() + store double %call, double* %t1, align 8 + %0 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %num = getelementptr inbounds %struct.Points, %struct.Points* %0, i32 0, i32 0 + %1 = load i64, i64* %num, align 8 + %2 = load i32, i32* @_ZL5nproc, align 4 + %conv = sext i32 %2 to i64 + %div = sdiv i64 %1, %conv + store i64 %div, i64* %bsize, align 8 + %3 = load i64, i64* %bsize, align 8 + %4 = load i32, i32* %pid.addr, align 4 + %conv1 = sext i32 %4 to i64 + %mul = mul nsw i64 %3, %conv1 + store i64 %mul, i64* %k1, align 8 + %5 = load i64, i64* %k1, align 8 + %6 = load i64, i64* %bsize, align 8 + 
%add = add nsw i64 %5, %6 + store i64 %add, i64* %k2, align 8 + %7 = load i32, i32* %pid.addr, align 4 + %8 = load i32, i32* @_ZL5nproc, align 4 + %sub = sub nsw i32 %8, 1 + %cmp = icmp eq i32 %7, %sub + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %9 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %num2 = getelementptr inbounds %struct.Points, %struct.Points* %9, i32 0, i32 0 + %10 = load i64, i64* %num2, align 8 + store i64 %10, i64* %k2, align 8 + br label %if.end + +if.end: ; preds = %if.then, %entry + %11 = load i64, i64* %k1, align 8 + %conv3 = trunc i64 %11 to i32 + store i32 %conv3, i32* %k, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %if.end + %12 = load i32, i32* %k, align 4 + %conv4 = sext i32 %12 to i64 + %13 = load i64, i64* %k2, align 8 + %cmp5 = icmp slt i64 %conv4, %13 + br i1 %cmp5, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %14 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p = getelementptr inbounds %struct.Points, %struct.Points* %14, i32 0, i32 2 + %15 = load %struct.Point*, %struct.Point** %p, align 8 + %16 = load i32, i32* %k, align 4 + %idxprom = sext i32 %16 to i64 + %arrayidx = getelementptr inbounds %struct.Point, %struct.Point* %15, i64 %idxprom + %17 = bitcast %struct.Point* %agg.tmp to i8* + %18 = bitcast %struct.Point* %arrayidx to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %17, i8* align 8 %18, i64 32, i1 false) + %19 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p7 = getelementptr inbounds %struct.Points, %struct.Points* %19, i32 0, i32 2 + %20 = load %struct.Point*, %struct.Point** %p7, align 8 + %arrayidx8 = getelementptr inbounds %struct.Point, %struct.Point* %20, i64 0 + %21 = bitcast %struct.Point* %agg.tmp6 to i8* + %22 = bitcast %struct.Point* %arrayidx8 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %21, i8* align 8 %22, i64 32, i1 false) + %23 = load %struct.Points*, 
%struct.Points** %points.addr, align 8 + %dim = getelementptr inbounds %struct.Points, %struct.Points* %23, i32 0, i32 1 + %24 = load i32, i32* %dim, align 8 + %call9 = call float @_Z4dist5PointS_i(%struct.Point* byval(%struct.Point) align 8 %agg.tmp, %struct.Point* byval(%struct.Point) align 8 %agg.tmp6, i32 %24) + store float %call9, float* %distance, align 4 + %25 = load float, float* %distance, align 4 + %26 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p10 = getelementptr inbounds %struct.Points, %struct.Points* %26, i32 0, i32 2 + %27 = load %struct.Point*, %struct.Point** %p10, align 8 + %28 = load i32, i32* %k, align 4 + %idxprom11 = sext i32 %28 to i64 + %arrayidx12 = getelementptr inbounds %struct.Point, %struct.Point* %27, i64 %idxprom11 + %weight = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx12, i32 0, i32 0 + %29 = load float, float* %weight, align 8 + %mul13 = fmul contract float %25, %29 + %30 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p14 = getelementptr inbounds %struct.Points, %struct.Points* %30, i32 0, i32 2 + %31 = load %struct.Point*, %struct.Point** %p14, align 8 + %32 = load i32, i32* %k, align 4 + %idxprom15 = sext i32 %32 to i64 + %arrayidx16 = getelementptr inbounds %struct.Point, %struct.Point* %31, i64 %idxprom15 + %cost = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx16, i32 0, i32 3 + store float %mul13, float* %cost, align 8 + %33 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p17 = getelementptr inbounds %struct.Points, %struct.Points* %33, i32 0, i32 2 + %34 = load %struct.Point*, %struct.Point** %p17, align 8 + %35 = load i32, i32* %k, align 4 + %idxprom18 = sext i32 %35 to i64 + %arrayidx19 = getelementptr inbounds %struct.Point, %struct.Point* %34, i64 %idxprom18 + %assign = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx19, i32 0, i32 2 + store i64 0, i64* %assign, align 8 + br label %for.inc + +for.inc: ; preds = 
%for.body + %36 = load i32, i32* %k, align 4 + %inc = add nsw i32 %36, 1 + store i32 %inc, i32* %k, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %37 = load i32, i32* %pid.addr, align 4 + %cmp20 = icmp eq i32 %37, 0 + br i1 %cmp20, label %if.then21, label %if.end25 + +if.then21: ; preds = %for.end + %38 = load i64*, i64** %kcenter.addr, align 8 + store i64 1, i64* %38, align 8 + %39 = load i32, i32* @_ZL5nproc, align 4 + %conv22 = sext i32 %39 to i64 + %mul23 = mul i64 4, %conv22 + %call24 = call noalias i8* @malloc(i64 %mul23) #2 + %40 = bitcast i8* %call24 to float* + store float* %40, float** @_ZZ7pspeedyP6PointsfPliP17pthread_barrier_tE5costs, align 8 + br label %if.end25 + +if.end25: ; preds = %if.then21, %for.end + %41 = load i32, i32* %pid.addr, align 4 + %cmp26 = icmp ne i32 %41, 0 + br i1 %cmp26, label %if.then27, label %if.else + +if.then27: ; preds = %if.end25 + br label %while.body + +while.body: ; preds = %if.then27, %for.end78 + %42 = load i32, i32* @_ZZ7pspeedyP6PointsfPliP17pthread_barrier_tE1i, align 4 + %conv28 = sext i32 %42 to i64 + %43 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %num29 = getelementptr inbounds %struct.Points, %struct.Points* %43, i32 0, i32 0 + %44 = load i64, i64* %num29, align 8 + %cmp30 = icmp sge i64 %conv28, %44 + br i1 %cmp30, label %if.then31, label %if.end32 + +if.then31: ; preds = %while.body + br label %while.end + +if.end32: ; preds = %while.body + %45 = load i64, i64* %k1, align 8 + %conv34 = trunc i64 %45 to i32 + store i32 %conv34, i32* %k33, align 4 + br label %for.cond35 + +for.cond35: ; preds = %for.inc76, %if.end32 + %46 = load i32, i32* %k33, align 4 + %conv36 = sext i32 %46 to i64 + %47 = load i64, i64* %k2, align 8 + %cmp37 = icmp slt i64 %conv36, %47 + br i1 %cmp37, label %for.body38, label %for.end78 + +for.body38: ; preds = %for.cond35 + %48 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p41 = getelementptr inbounds %struct.Points, %struct.Points* 
%48, i32 0, i32 2 + %49 = load %struct.Point*, %struct.Point** %p41, align 8 + %50 = load i32, i32* @_ZZ7pspeedyP6PointsfPliP17pthread_barrier_tE1i, align 4 + %idxprom42 = sext i32 %50 to i64 + %arrayidx43 = getelementptr inbounds %struct.Point, %struct.Point* %49, i64 %idxprom42 + %51 = bitcast %struct.Point* %agg.tmp40 to i8* + %52 = bitcast %struct.Point* %arrayidx43 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %51, i8* align 8 %52, i64 32, i1 false) + %53 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p45 = getelementptr inbounds %struct.Points, %struct.Points* %53, i32 0, i32 2 + %54 = load %struct.Point*, %struct.Point** %p45, align 8 + %55 = load i32, i32* %k33, align 4 + %idxprom46 = sext i32 %55 to i64 + %arrayidx47 = getelementptr inbounds %struct.Point, %struct.Point* %54, i64 %idxprom46 + %56 = bitcast %struct.Point* %agg.tmp44 to i8* + %57 = bitcast %struct.Point* %arrayidx47 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %56, i8* align 8 %57, i64 32, i1 false) + %58 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %dim48 = getelementptr inbounds %struct.Points, %struct.Points* %58, i32 0, i32 1 + %59 = load i32, i32* %dim48, align 8 + %call49 = call float @_Z4dist5PointS_i(%struct.Point* byval(%struct.Point) align 8 %agg.tmp40, %struct.Point* byval(%struct.Point) align 8 %agg.tmp44, i32 %59) + store float %call49, float* %distance39, align 4 + %60 = load float, float* %distance39, align 4 + %61 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p50 = getelementptr inbounds %struct.Points, %struct.Points* %61, i32 0, i32 2 + %62 = load %struct.Point*, %struct.Point** %p50, align 8 + %63 = load i32, i32* %k33, align 4 + %idxprom51 = sext i32 %63 to i64 + %arrayidx52 = getelementptr inbounds %struct.Point, %struct.Point* %62, i64 %idxprom51 + %weight53 = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx52, i32 0, i32 0 + %64 = load float, float* %weight53, align 8 + 
%mul54 = fmul contract float %60, %64 + %65 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p55 = getelementptr inbounds %struct.Points, %struct.Points* %65, i32 0, i32 2 + %66 = load %struct.Point*, %struct.Point** %p55, align 8 + %67 = load i32, i32* %k33, align 4 + %idxprom56 = sext i32 %67 to i64 + %arrayidx57 = getelementptr inbounds %struct.Point, %struct.Point* %66, i64 %idxprom56 + %cost58 = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx57, i32 0, i32 3 + %68 = load float, float* %cost58, align 8 + %cmp59 = fcmp olt float %mul54, %68 + br i1 %cmp59, label %if.then60, label %if.end75 + +if.then60: ; preds = %for.body38 + %69 = load float, float* %distance39, align 4 + %70 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p61 = getelementptr inbounds %struct.Points, %struct.Points* %70, i32 0, i32 2 + %71 = load %struct.Point*, %struct.Point** %p61, align 8 + %72 = load i32, i32* %k33, align 4 + %idxprom62 = sext i32 %72 to i64 + %arrayidx63 = getelementptr inbounds %struct.Point, %struct.Point* %71, i64 %idxprom62 + %weight64 = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx63, i32 0, i32 0 + %73 = load float, float* %weight64, align 8 + %mul65 = fmul contract float %69, %73 + %74 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p66 = getelementptr inbounds %struct.Points, %struct.Points* %74, i32 0, i32 2 + %75 = load %struct.Point*, %struct.Point** %p66, align 8 + %76 = load i32, i32* %k33, align 4 + %idxprom67 = sext i32 %76 to i64 + %arrayidx68 = getelementptr inbounds %struct.Point, %struct.Point* %75, i64 %idxprom67 + %cost69 = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx68, i32 0, i32 3 + store float %mul65, float* %cost69, align 8 + %77 = load i32, i32* @_ZZ7pspeedyP6PointsfPliP17pthread_barrier_tE1i, align 4 + %conv70 = sext i32 %77 to i64 + %78 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p71 = getelementptr inbounds 
%struct.Points, %struct.Points* %78, i32 0, i32 2 + %79 = load %struct.Point*, %struct.Point** %p71, align 8 + %80 = load i32, i32* %k33, align 4 + %idxprom72 = sext i32 %80 to i64 + %arrayidx73 = getelementptr inbounds %struct.Point, %struct.Point* %79, i64 %idxprom72 + %assign74 = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx73, i32 0, i32 2 + store i64 %conv70, i64* %assign74, align 8 + br label %if.end75 + +if.end75: ; preds = %if.then60, %for.body38 + br label %for.inc76 + +for.inc76: ; preds = %if.end75 + %81 = load i32, i32* %k33, align 4 + %inc77 = add nsw i32 %81, 1 + store i32 %inc77, i32* %k33, align 4 + br label %for.cond35 + +for.end78: ; preds = %for.cond35 + br label %while.body + +while.end: ; preds = %if.then31 + br label %if.end145 + +if.else: ; preds = %if.end25 + store i32 1, i32* @_ZZ7pspeedyP6PointsfPliP17pthread_barrier_tE1i, align 4 + br label %for.cond79 + +for.cond79: ; preds = %for.inc142, %if.else + %82 = load i32, i32* @_ZZ7pspeedyP6PointsfPliP17pthread_barrier_tE1i, align 4 + %conv80 = sext i32 %82 to i64 + %83 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %num81 = getelementptr inbounds %struct.Points, %struct.Points* %83, i32 0, i32 0 + %84 = load i64, i64* %num81, align 8 + %cmp82 = icmp slt i64 %conv80, %84 + br i1 %cmp82, label %for.body83, label %for.end144 + +for.body83: ; preds = %for.cond79 + %call84 = call i64 @lrand48() #2 + %conv85 = sitofp i64 %call84 to float + %div86 = fdiv float %conv85, 0x41E0000000000000 + %85 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p87 = getelementptr inbounds %struct.Points, %struct.Points* %85, i32 0, i32 2 + %86 = load %struct.Point*, %struct.Point** %p87, align 8 + %87 = load i32, i32* @_ZZ7pspeedyP6PointsfPliP17pthread_barrier_tE1i, align 4 + %idxprom88 = sext i32 %87 to i64 + %arrayidx89 = getelementptr inbounds %struct.Point, %struct.Point* %86, i64 %idxprom88 + %cost90 = getelementptr inbounds %struct.Point, %struct.Point* 
%arrayidx89, i32 0, i32 3 + %88 = load float, float* %cost90, align 8 + %89 = load float, float* %z.addr, align 4 + %div91 = fdiv float %88, %89 + %cmp92 = fcmp olt float %div86, %div91 + %frombool = zext i1 %cmp92 to i8 + store i8 %frombool, i8* %to_open, align 1 + %90 = load i8, i8* %to_open, align 1 + %tobool = trunc i8 %90 to i1 + br i1 %tobool, label %if.then93, label %if.end141 + +if.then93: ; preds = %for.body83 + %91 = load i64*, i64** %kcenter.addr, align 8 + %92 = load i64, i64* %91, align 8 + %inc94 = add nsw i64 %92, 1 + store i64 %inc94, i64* %91, align 8 + store i8 1, i8* @_ZZ7pspeedyP6PointsfPliP17pthread_barrier_tE4open, align 1 + %93 = load i64, i64* %k1, align 8 + %conv96 = trunc i64 %93 to i32 + store i32 %conv96, i32* %k95, align 4 + br label %for.cond97 + +for.cond97: ; preds = %for.inc138, %if.then93 + %94 = load i32, i32* %k95, align 4 + %conv98 = sext i32 %94 to i64 + %95 = load i64, i64* %k2, align 8 + %cmp99 = icmp slt i64 %conv98, %95 + br i1 %cmp99, label %for.body100, label %for.end140 + +for.body100: ; preds = %for.cond97 + %96 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p103 = getelementptr inbounds %struct.Points, %struct.Points* %96, i32 0, i32 2 + %97 = load %struct.Point*, %struct.Point** %p103, align 8 + %98 = load i32, i32* @_ZZ7pspeedyP6PointsfPliP17pthread_barrier_tE1i, align 4 + %idxprom104 = sext i32 %98 to i64 + %arrayidx105 = getelementptr inbounds %struct.Point, %struct.Point* %97, i64 %idxprom104 + %99 = bitcast %struct.Point* %agg.tmp102 to i8* + %100 = bitcast %struct.Point* %arrayidx105 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %99, i8* align 8 %100, i64 32, i1 false) + %101 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p107 = getelementptr inbounds %struct.Points, %struct.Points* %101, i32 0, i32 2 + %102 = load %struct.Point*, %struct.Point** %p107, align 8 + %103 = load i32, i32* %k95, align 4 + %idxprom108 = sext i32 %103 to i64 + %arrayidx109 = 
getelementptr inbounds %struct.Point, %struct.Point* %102, i64 %idxprom108 + %104 = bitcast %struct.Point* %agg.tmp106 to i8* + %105 = bitcast %struct.Point* %arrayidx109 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %104, i8* align 8 %105, i64 32, i1 false) + %106 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %dim110 = getelementptr inbounds %struct.Points, %struct.Points* %106, i32 0, i32 1 + %107 = load i32, i32* %dim110, align 8 + %call111 = call float @_Z4dist5PointS_i(%struct.Point* byval(%struct.Point) align 8 %agg.tmp102, %struct.Point* byval(%struct.Point) align 8 %agg.tmp106, i32 %107) + store float %call111, float* %distance101, align 4 + %108 = load float, float* %distance101, align 4 + %109 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p112 = getelementptr inbounds %struct.Points, %struct.Points* %109, i32 0, i32 2 + %110 = load %struct.Point*, %struct.Point** %p112, align 8 + %111 = load i32, i32* %k95, align 4 + %idxprom113 = sext i32 %111 to i64 + %arrayidx114 = getelementptr inbounds %struct.Point, %struct.Point* %110, i64 %idxprom113 + %weight115 = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx114, i32 0, i32 0 + %112 = load float, float* %weight115, align 8 + %mul116 = fmul contract float %108, %112 + %113 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p117 = getelementptr inbounds %struct.Points, %struct.Points* %113, i32 0, i32 2 + %114 = load %struct.Point*, %struct.Point** %p117, align 8 + %115 = load i32, i32* %k95, align 4 + %idxprom118 = sext i32 %115 to i64 + %arrayidx119 = getelementptr inbounds %struct.Point, %struct.Point* %114, i64 %idxprom118 + %cost120 = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx119, i32 0, i32 3 + %116 = load float, float* %cost120, align 8 + %cmp121 = fcmp olt float %mul116, %116 + br i1 %cmp121, label %if.then122, label %if.end137 + +if.then122: ; preds = %for.body100 + %117 = load float, float* %distance101, 
align 4 + %118 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p123 = getelementptr inbounds %struct.Points, %struct.Points* %118, i32 0, i32 2 + %119 = load %struct.Point*, %struct.Point** %p123, align 8 + %120 = load i32, i32* %k95, align 4 + %idxprom124 = sext i32 %120 to i64 + %arrayidx125 = getelementptr inbounds %struct.Point, %struct.Point* %119, i64 %idxprom124 + %weight126 = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx125, i32 0, i32 0 + %121 = load float, float* %weight126, align 8 + %mul127 = fmul contract float %117, %121 + %122 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p128 = getelementptr inbounds %struct.Points, %struct.Points* %122, i32 0, i32 2 + %123 = load %struct.Point*, %struct.Point** %p128, align 8 + %124 = load i32, i32* %k95, align 4 + %idxprom129 = sext i32 %124 to i64 + %arrayidx130 = getelementptr inbounds %struct.Point, %struct.Point* %123, i64 %idxprom129 + %cost131 = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx130, i32 0, i32 3 + store float %mul127, float* %cost131, align 8 + %125 = load i32, i32* @_ZZ7pspeedyP6PointsfPliP17pthread_barrier_tE1i, align 4 + %conv132 = sext i32 %125 to i64 + %126 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p133 = getelementptr inbounds %struct.Points, %struct.Points* %126, i32 0, i32 2 + %127 = load %struct.Point*, %struct.Point** %p133, align 8 + %128 = load i32, i32* %k95, align 4 + %idxprom134 = sext i32 %128 to i64 + %arrayidx135 = getelementptr inbounds %struct.Point, %struct.Point* %127, i64 %idxprom134 + %assign136 = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx135, i32 0, i32 2 + store i64 %conv132, i64* %assign136, align 8 + br label %if.end137 + +if.end137: ; preds = %if.then122, %for.body100 + br label %for.inc138 + +for.inc138: ; preds = %if.end137 + %129 = load i32, i32* %k95, align 4 + %inc139 = add nsw i32 %129, 1 + store i32 %inc139, i32* %k95, align 4 + br label 
%for.cond97 + +for.end140: ; preds = %for.cond97 + store i8 0, i8* @_ZZ7pspeedyP6PointsfPliP17pthread_barrier_tE4open, align 1 + br label %if.end141 + +if.end141: ; preds = %for.end140, %for.body83 + br label %for.inc142 + +for.inc142: ; preds = %if.end141 + %130 = load i32, i32* @_ZZ7pspeedyP6PointsfPliP17pthread_barrier_tE1i, align 4 + %inc143 = add nsw i32 %130, 1 + store i32 %inc143, i32* @_ZZ7pspeedyP6PointsfPliP17pthread_barrier_tE1i, align 4 + br label %for.cond79 + +for.end144: ; preds = %for.cond79 + store i8 1, i8* @_ZZ7pspeedyP6PointsfPliP17pthread_barrier_tE4open, align 1 + br label %if.end145 + +if.end145: ; preds = %for.end144, %while.end + store i8 0, i8* @_ZZ7pspeedyP6PointsfPliP17pthread_barrier_tE4open, align 1 + store float 0.000000e+00, float* %mytotal, align 4 + %131 = load i64, i64* %k1, align 8 + %conv147 = trunc i64 %131 to i32 + store i32 %conv147, i32* %k146, align 4 + br label %for.cond148 + +for.cond148: ; preds = %for.inc157, %if.end145 + %132 = load i32, i32* %k146, align 4 + %conv149 = sext i32 %132 to i64 + %133 = load i64, i64* %k2, align 8 + %cmp150 = icmp slt i64 %conv149, %133 + br i1 %cmp150, label %for.body151, label %for.end159 + +for.body151: ; preds = %for.cond148 + %134 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p152 = getelementptr inbounds %struct.Points, %struct.Points* %134, i32 0, i32 2 + %135 = load %struct.Point*, %struct.Point** %p152, align 8 + %136 = load i32, i32* %k146, align 4 + %idxprom153 = sext i32 %136 to i64 + %arrayidx154 = getelementptr inbounds %struct.Point, %struct.Point* %135, i64 %idxprom153 + %cost155 = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx154, i32 0, i32 3 + %137 = load float, float* %cost155, align 8 + %138 = load float, float* %mytotal, align 4 + %add156 = fadd contract float %138, %137 + store float %add156, float* %mytotal, align 4 + br label %for.inc157 + +for.inc157: ; preds = %for.body151 + %139 = load i32, i32* %k146, align 4 + %inc158 = add 
nsw i32 %139, 1 + store i32 %inc158, i32* %k146, align 4 + br label %for.cond148 + +for.end159: ; preds = %for.cond148 + %140 = load float, float* %mytotal, align 4 + %141 = load float*, float** @_ZZ7pspeedyP6PointsfPliP17pthread_barrier_tE5costs, align 8 + %142 = load i32, i32* %pid.addr, align 4 + %idxprom160 = sext i32 %142 to i64 + %arrayidx161 = getelementptr inbounds float, float* %141, i64 %idxprom160 + store float %140, float* %arrayidx161, align 4 + %143 = load i32, i32* %pid.addr, align 4 + %cmp162 = icmp eq i32 %143, 0 + br i1 %cmp162, label %if.then163, label %if.end175 + +if.then163: ; preds = %for.end159 + %144 = load float, float* %z.addr, align 4 + %145 = load i64*, i64** %kcenter.addr, align 8 + %146 = load i64, i64* %145, align 8 + %conv164 = sitofp i64 %146 to float + %mul165 = fmul contract float %144, %conv164 + store float %mul165, float* @_ZZ7pspeedyP6PointsfPliP17pthread_barrier_tE9totalcost, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond166 + +for.cond166: ; preds = %for.inc172, %if.then163 + %147 = load i32, i32* %i, align 4 + %148 = load i32, i32* @_ZL5nproc, align 4 + %cmp167 = icmp slt i32 %147, %148 + br i1 %cmp167, label %for.body168, label %for.end174 + +for.body168: ; preds = %for.cond166 + %149 = load float*, float** @_ZZ7pspeedyP6PointsfPliP17pthread_barrier_tE5costs, align 8 + %150 = load i32, i32* %i, align 4 + %idxprom169 = sext i32 %150 to i64 + %arrayidx170 = getelementptr inbounds float, float* %149, i64 %idxprom169 + %151 = load float, float* %arrayidx170, align 4 + %152 = load float, float* @_ZZ7pspeedyP6PointsfPliP17pthread_barrier_tE9totalcost, align 4 + %add171 = fadd contract float %152, %151 + store float %add171, float* @_ZZ7pspeedyP6PointsfPliP17pthread_barrier_tE9totalcost, align 4 + br label %for.inc172 + +for.inc172: ; preds = %for.body168 + %153 = load i32, i32* %i, align 4 + %inc173 = add nsw i32 %153, 1 + store i32 %inc173, i32* %i, align 4 + br label %for.cond166 + +for.end174: ; preds = 
%for.cond166 + %154 = load float*, float** @_ZZ7pspeedyP6PointsfPliP17pthread_barrier_tE5costs, align 8 + %155 = bitcast float* %154 to i8* + call void @free(i8* %155) #2 + br label %if.end175 + +if.end175: ; preds = %for.end174, %for.end159 + %call176 = call double @_Z7gettimev() + store double %call176, double* %t2, align 8 + %156 = load i32, i32* %pid.addr, align 4 + %cmp177 = icmp eq i32 %156, 0 + br i1 %cmp177, label %if.then178, label %if.end181 + +if.then178: ; preds = %if.end175 + %157 = load double, double* %t2, align 8 + %158 = load double, double* %t1, align 8 + %sub179 = fsub contract double %157, %158 + %159 = load double, double* @time_speedy, align 8 + %add180 = fadd contract double %159, %sub179 + store double %add180, double* @time_speedy, align 8 + br label %if.end181 + +if.end181: ; preds = %if.then178, %if.end175 + %160 = load float, float* @_ZZ7pspeedyP6PointsfPliP17pthread_barrier_tE9totalcost, align 4 + ret float %160 +} + +; Function Attrs: noinline optnone uwtable +define dso_local float @_Z3pFLP6PointsPiifPliflfiP17pthread_barrier_t(%struct.Points* %points, i32* %feasible, i32 %numfeasible, float %z, i64* %k, i32 %kmax, float %cost, i64 %iter, float %e, i32 %pid, %union.pthread_barrier_t* %barrier) #3 { +entry: + %points.addr = alloca %struct.Points*, align 8 + %feasible.addr = alloca i32*, align 8 + %numfeasible.addr = alloca i32, align 4 + %z.addr = alloca float, align 4 + %k.addr = alloca i64*, align 8 + %kmax.addr = alloca i32, align 4 + %cost.addr = alloca float, align 4 + %iter.addr = alloca i64, align 8 + %e.addr = alloca float, align 4 + %pid.addr = alloca i32, align 4 + %barrier.addr = alloca %union.pthread_barrier_t*, align 8 + %i = alloca i64, align 8 + %x = alloca i64, align 8 + %change = alloca float, align 4 + %numberOfPoints = alloca i64, align 8 + store %struct.Points* %points, %struct.Points** %points.addr, align 8 + store i32* %feasible, i32** %feasible.addr, align 8 + store i32 %numfeasible, i32* %numfeasible.addr, align 
4 + store float %z, float* %z.addr, align 4 + store i64* %k, i64** %k.addr, align 8 + store i32 %kmax, i32* %kmax.addr, align 4 + store float %cost, float* %cost.addr, align 4 + store i64 %iter, i64* %iter.addr, align 8 + store float %e, float* %e.addr, align 4 + store i32 %pid, i32* %pid.addr, align 4 + store %union.pthread_barrier_t* %barrier, %union.pthread_barrier_t** %barrier.addr, align 8 + %0 = load float, float* %cost.addr, align 4 + store float %0, float* %change, align 4 + br label %while.cond + +while.cond: ; preds = %for.end, %entry + %1 = load float, float* %change, align 4 + %2 = load float, float* %cost.addr, align 4 + %div = fdiv float %1, %2 + %conv = fpext float %div to double + %3 = load float, float* %e.addr, align 4 + %conv1 = fpext float %3 to double + %mul = fmul contract double 1.000000e+00, %conv1 + %cmp = fcmp ogt double %conv, %mul + br i1 %cmp, label %while.body, label %while.end + +while.body: ; preds = %while.cond + store float 0.000000e+00, float* %change, align 4 + %4 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %num = getelementptr inbounds %struct.Points, %struct.Points* %4, i32 0, i32 0 + %5 = load i64, i64* %num, align 8 + store i64 %5, i64* %numberOfPoints, align 8 + %6 = load i32, i32* %pid.addr, align 4 + %cmp2 = icmp eq i32 %6, 0 + br i1 %cmp2, label %if.then, label %if.end + +if.then: ; preds = %while.body + %7 = load i32*, i32** %feasible.addr, align 8 + %8 = load i32, i32* %numfeasible.addr, align 4 + call void @_Z10intshufflePii(i32* %7, i32 %8) + br label %if.end + +if.end: ; preds = %if.then, %while.body + store i64 0, i64* %i, align 8 + br label %for.cond + +for.cond: ; preds = %for.inc, %if.end + %9 = load i64, i64* %i, align 8 + %10 = load i64, i64* %iter.addr, align 8 + %cmp3 = icmp slt i64 %9, %10 + br i1 %cmp3, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %11 = load i64, i64* %i, align 8 + %12 = load i32, i32* %numfeasible.addr, align 4 + %conv4 = sext i32 %12 to i64 + 
%rem = srem i64 %11, %conv4 + store i64 %rem, i64* %x, align 8 + %13 = load i32*, i32** %feasible.addr, align 8 + %14 = load i64, i64* %x, align 8 + %arrayidx = getelementptr inbounds i32, i32* %13, i64 %14 + %15 = load i32, i32* %arrayidx, align 4 + %conv5 = sext i32 %15 to i64 + %16 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %17 = load float, float* %z.addr, align 4 + %18 = load i64*, i64** %k.addr, align 8 + %19 = load i32, i32* %kmax.addr, align 4 + %20 = load i8*, i8** @_ZL9is_center, align 8 + %21 = load i32*, i32** @_ZL12center_table, align 8 + %22 = load i8*, i8** @_ZL17switch_membership, align 8 + %23 = load i8, i8* @isCoordChanged, align 1 + %tobool = trunc i8 %23 to i1 + %call = call float @_Z5pgainlP6PointsfPliPbPiS2_bPdS4_S4_S4_S4_S4_(i64 %conv5, %struct.Points* %16, float %17, i64* %18, i32 %19, i8* %20, i32* %21, i8* %22, i1 zeroext %tobool, double* @serial_t, double* @cpu_to_gpu_t, double* @gpu_to_cpu_t, double* @alloc_t, double* @kernel_t, double* @free_t) + %24 = load float, float* %change, align 4 + %add = fadd contract float %24, %call + store float %add, float* %change, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %25 = load i64, i64* %i, align 8 + %inc = add nsw i64 %25, 1 + store i64 %inc, i64* %i, align 8 + br label %for.cond + +for.end: ; preds = %for.cond + %26 = load float, float* %change, align 4 + %27 = load float, float* %cost.addr, align 4 + %sub = fsub contract float %27, %26 + store float %sub, float* %cost.addr, align 4 + br label %while.cond + +while.end: ; preds = %while.cond + %28 = load float, float* %cost.addr, align 4 + ret float %28 +} + +; Function Attrs: noinline optnone uwtable +define dso_local i32 @_Z19selectfeasible_fastP6PointsPPiiiP17pthread_barrier_t(%struct.Points* %points, i32** %feasible, i32 %kmin, i32 %pid, %union.pthread_barrier_t* %barrier) #3 { +entry: + %retval = alloca i32, align 4 + %points.addr = alloca %struct.Points*, align 8 + %feasible.addr = alloca i32**, align 
8 + %kmin.addr = alloca i32, align 4 + %pid.addr = alloca i32, align 4 + %barrier.addr = alloca %union.pthread_barrier_t*, align 8 + %t1 = alloca double, align 8 + %numfeasible = alloca i32, align 4 + %accumweight = alloca float*, align 8 + %totalweight = alloca float, align 4 + %k1 = alloca i64, align 8 + %k2 = alloca i64, align 8 + %w = alloca float, align 4 + %l = alloca i32, align 4 + %r = alloca i32, align 4 + %k = alloca i32, align 4 + %i = alloca i32, align 4 + %i29 = alloca i32, align 4 + %i49 = alloca i32, align 4 + %t2 = alloca double, align 8 + store %struct.Points* %points, %struct.Points** %points.addr, align 8 + store i32** %feasible, i32*** %feasible.addr, align 8 + store i32 %kmin, i32* %kmin.addr, align 4 + store i32 %pid, i32* %pid.addr, align 4 + store %union.pthread_barrier_t* %barrier, %union.pthread_barrier_t** %barrier.addr, align 8 + %call = call double @_Z7gettimev() + store double %call, double* %t1, align 8 + %0 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %num = getelementptr inbounds %struct.Points, %struct.Points* %0, i32 0, i32 0 + %1 = load i64, i64* %num, align 8 + %conv = trunc i64 %1 to i32 + store i32 %conv, i32* %numfeasible, align 4 + %2 = load i32, i32* %numfeasible, align 4 + %conv1 = sitofp i32 %2 to float + %3 = load i32, i32* %kmin.addr, align 4 + %mul = mul nsw i32 3, %3 + %conv2 = sitofp i32 %mul to float + %4 = load i32, i32* %kmin.addr, align 4 + %conv3 = sitofp i32 %4 to float + %call4 = call float @_ZSt3logf(float %conv3) + %mul5 = fmul contract float %conv2, %call4 + %cmp = fcmp ogt float %conv1, %mul5 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %5 = load i32, i32* %kmin.addr, align 4 + %mul6 = mul nsw i32 3, %5 + %conv7 = sitofp i32 %mul6 to float + %6 = load i32, i32* %kmin.addr, align 4 + %conv8 = sitofp i32 %6 to float + %call9 = call float @_ZSt3logf(float %conv8) + %mul10 = fmul contract float %conv7, %call9 + %conv11 = fptosi float %mul10 to i32 + store i32 
%conv11, i32* %numfeasible, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + %7 = load i32, i32* %numfeasible, align 4 + %conv12 = sext i32 %7 to i64 + %mul13 = mul i64 %conv12, 4 + %call14 = call noalias i8* @malloc(i64 %mul13) #2 + %8 = bitcast i8* %call14 to i32* + %9 = load i32**, i32*** %feasible.addr, align 8 + store i32* %8, i32** %9, align 8 + store i64 0, i64* %k1, align 8 + %10 = load i32, i32* %numfeasible, align 4 + %conv15 = sext i32 %10 to i64 + store i64 %conv15, i64* %k2, align 8 + %11 = load i32, i32* %numfeasible, align 4 + %conv16 = sext i32 %11 to i64 + %12 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %num17 = getelementptr inbounds %struct.Points, %struct.Points* %12, i32 0, i32 0 + %13 = load i64, i64* %num17, align 8 + %cmp18 = icmp eq i64 %conv16, %13 + br i1 %cmp18, label %if.then19, label %if.end23 + +if.then19: ; preds = %if.end + %14 = load i64, i64* %k1, align 8 + %conv20 = trunc i64 %14 to i32 + store i32 %conv20, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %if.then19 + %15 = load i32, i32* %i, align 4 + %conv21 = sext i32 %15 to i64 + %16 = load i64, i64* %k2, align 8 + %cmp22 = icmp slt i64 %conv21, %16 + br i1 %cmp22, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %17 = load i32, i32* %i, align 4 + %18 = load i32**, i32*** %feasible.addr, align 8 + %19 = load i32*, i32** %18, align 8 + %20 = load i32, i32* %i, align 4 + %idxprom = sext i32 %20 to i64 + %arrayidx = getelementptr inbounds i32, i32* %19, i64 %idxprom + store i32 %17, i32* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %21 = load i32, i32* %i, align 4 + %inc = add nsw i32 %21, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %22 = load i32, i32* %numfeasible, align 4 + store i32 %22, i32* %retval, align 4 + br label %return + +if.end23: ; preds = %if.end + %23 = load %struct.Points*, %struct.Points** %points.addr, 
align 8 + %num24 = getelementptr inbounds %struct.Points, %struct.Points* %23, i32 0, i32 0 + %24 = load i64, i64* %num24, align 8 + %mul25 = mul i64 4, %24 + %call26 = call noalias i8* @malloc(i64 %mul25) #2 + %25 = bitcast i8* %call26 to float* + store float* %25, float** %accumweight, align 8 + %26 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p = getelementptr inbounds %struct.Points, %struct.Points* %26, i32 0, i32 2 + %27 = load %struct.Point*, %struct.Point** %p, align 8 + %arrayidx27 = getelementptr inbounds %struct.Point, %struct.Point* %27, i64 0 + %weight = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx27, i32 0, i32 0 + %28 = load float, float* %weight, align 8 + %29 = load float*, float** %accumweight, align 8 + %arrayidx28 = getelementptr inbounds float, float* %29, i64 0 + store float %28, float* %arrayidx28, align 4 + store float 0.000000e+00, float* %totalweight, align 4 + store i32 1, i32* %i29, align 4 + br label %for.cond30 + +for.cond30: ; preds = %for.inc43, %if.end23 + %30 = load i32, i32* %i29, align 4 + %conv31 = sext i32 %30 to i64 + %31 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %num32 = getelementptr inbounds %struct.Points, %struct.Points* %31, i32 0, i32 0 + %32 = load i64, i64* %num32, align 8 + %cmp33 = icmp slt i64 %conv31, %32 + br i1 %cmp33, label %for.body34, label %for.end45 + +for.body34: ; preds = %for.cond30 + %33 = load float*, float** %accumweight, align 8 + %34 = load i32, i32* %i29, align 4 + %sub = sub nsw i32 %34, 1 + %idxprom35 = sext i32 %sub to i64 + %arrayidx36 = getelementptr inbounds float, float* %33, i64 %idxprom35 + %35 = load float, float* %arrayidx36, align 4 + %36 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p37 = getelementptr inbounds %struct.Points, %struct.Points* %36, i32 0, i32 2 + %37 = load %struct.Point*, %struct.Point** %p37, align 8 + %38 = load i32, i32* %i29, align 4 + %idxprom38 = sext i32 %38 to i64 + %arrayidx39 = 
getelementptr inbounds %struct.Point, %struct.Point* %37, i64 %idxprom38 + %weight40 = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx39, i32 0, i32 0 + %39 = load float, float* %weight40, align 8 + %add = fadd contract float %35, %39 + %40 = load float*, float** %accumweight, align 8 + %41 = load i32, i32* %i29, align 4 + %idxprom41 = sext i32 %41 to i64 + %arrayidx42 = getelementptr inbounds float, float* %40, i64 %idxprom41 + store float %add, float* %arrayidx42, align 4 + br label %for.inc43 + +for.inc43: ; preds = %for.body34 + %42 = load i32, i32* %i29, align 4 + %inc44 = add nsw i32 %42, 1 + store i32 %inc44, i32* %i29, align 4 + br label %for.cond30 + +for.end45: ; preds = %for.cond30 + %43 = load float*, float** %accumweight, align 8 + %44 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %num46 = getelementptr inbounds %struct.Points, %struct.Points* %44, i32 0, i32 0 + %45 = load i64, i64* %num46, align 8 + %sub47 = sub nsw i64 %45, 1 + %arrayidx48 = getelementptr inbounds float, float* %43, i64 %sub47 + %46 = load float, float* %arrayidx48, align 4 + store float %46, float* %totalweight, align 4 + %47 = load i64, i64* %k1, align 8 + %conv50 = trunc i64 %47 to i32 + store i32 %conv50, i32* %i49, align 4 + br label %for.cond51 + +for.cond51: ; preds = %for.inc78, %for.end45 + %48 = load i32, i32* %i49, align 4 + %conv52 = sext i32 %48 to i64 + %49 = load i64, i64* %k2, align 8 + %cmp53 = icmp slt i64 %conv52, %49 + br i1 %cmp53, label %for.body54, label %for.end80 + +for.body54: ; preds = %for.cond51 + %call55 = call i64 @lrand48() #2 + %conv56 = sitofp i64 %call55 to float + %div = fdiv float %conv56, 0x41E0000000000000 + %50 = load float, float* %totalweight, align 4 + %mul57 = fmul contract float %div, %50 + store float %mul57, float* %w, align 4 + store i32 0, i32* %l, align 4 + %51 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %num58 = getelementptr inbounds %struct.Points, %struct.Points* %51, i32 0, 
i32 0 + %52 = load i64, i64* %num58, align 8 + %sub59 = sub nsw i64 %52, 1 + %conv60 = trunc i64 %sub59 to i32 + store i32 %conv60, i32* %r, align 4 + %53 = load float*, float** %accumweight, align 8 + %arrayidx61 = getelementptr inbounds float, float* %53, i64 0 + %54 = load float, float* %arrayidx61, align 4 + %55 = load float, float* %w, align 4 + %cmp62 = fcmp ogt float %54, %55 + br i1 %cmp62, label %if.then63, label %if.end66 + +if.then63: ; preds = %for.body54 + %56 = load i32**, i32*** %feasible.addr, align 8 + %57 = load i32*, i32** %56, align 8 + %58 = load i32, i32* %i49, align 4 + %idxprom64 = sext i32 %58 to i64 + %arrayidx65 = getelementptr inbounds i32, i32* %57, i64 %idxprom64 + store i32 0, i32* %arrayidx65, align 4 + br label %for.inc78 + +if.end66: ; preds = %for.body54 + br label %while.cond + +while.cond: ; preds = %if.end75, %if.end66 + %59 = load i32, i32* %l, align 4 + %add67 = add nsw i32 %59, 1 + %60 = load i32, i32* %r, align 4 + %cmp68 = icmp slt i32 %add67, %60 + br i1 %cmp68, label %while.body, label %while.end + +while.body: ; preds = %while.cond + %61 = load i32, i32* %l, align 4 + %62 = load i32, i32* %r, align 4 + %add69 = add nsw i32 %61, %62 + %div70 = sdiv i32 %add69, 2 + store i32 %div70, i32* %k, align 4 + %63 = load float*, float** %accumweight, align 8 + %64 = load i32, i32* %k, align 4 + %idxprom71 = sext i32 %64 to i64 + %arrayidx72 = getelementptr inbounds float, float* %63, i64 %idxprom71 + %65 = load float, float* %arrayidx72, align 4 + %66 = load float, float* %w, align 4 + %cmp73 = fcmp ogt float %65, %66 + br i1 %cmp73, label %if.then74, label %if.else + +if.then74: ; preds = %while.body + %67 = load i32, i32* %k, align 4 + store i32 %67, i32* %r, align 4 + br label %if.end75 + +if.else: ; preds = %while.body + %68 = load i32, i32* %k, align 4 + store i32 %68, i32* %l, align 4 + br label %if.end75 + +if.end75: ; preds = %if.else, %if.then74 + br label %while.cond + +while.end: ; preds = %while.cond + %69 = load i32, 
i32* %r, align 4 + %70 = load i32**, i32*** %feasible.addr, align 8 + %71 = load i32*, i32** %70, align 8 + %72 = load i32, i32* %i49, align 4 + %idxprom76 = sext i32 %72 to i64 + %arrayidx77 = getelementptr inbounds i32, i32* %71, i64 %idxprom76 + store i32 %69, i32* %arrayidx77, align 4 + br label %for.inc78 + +for.inc78: ; preds = %while.end, %if.then63 + %73 = load i32, i32* %i49, align 4 + %inc79 = add nsw i32 %73, 1 + store i32 %inc79, i32* %i49, align 4 + br label %for.cond51 + +for.end80: ; preds = %for.cond51 + %74 = load float*, float** %accumweight, align 8 + %75 = bitcast float* %74 to i8* + call void @free(i8* %75) #2 + %call81 = call double @_Z7gettimev() + store double %call81, double* %t2, align 8 + %76 = load double, double* %t2, align 8 + %77 = load double, double* %t1, align 8 + %sub82 = fsub contract double %76, %77 + %78 = load double, double* @time_select_feasible, align 8 + %add83 = fadd contract double %78, %sub82 + store double %add83, double* @time_select_feasible, align 8 + %79 = load i32, i32* %numfeasible, align 4 + store i32 %79, i32* %retval, align 4 + br label %return + +return: ; preds = %for.end80, %for.end + %80 = load i32, i32* %retval, align 4 + ret i32 %80 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local float @_ZSt3logf(float %__x) #6 comdat { +entry: + %__x.addr = alloca float, align 4 + store float %__x, float* %__x.addr, align 4 + %0 = load float, float* %__x.addr, align 4 + %call = call float @logf(float %0) #2 + ret float %call +} + +; Function Attrs: noinline optnone uwtable +define dso_local float @_Z8pkmedianP6PointsllPliP17pthread_barrier_t(%struct.Points* %points, i64 %kmin, i64 %kmax, i64* %kfinal, i32 %pid, %union.pthread_barrier_t* %barrier) #3 { +entry: + %retval = alloca float, align 4 + %points.addr = alloca %struct.Points*, align 8 + %kmin.addr = alloca i64, align 8 + %kmax.addr = alloca i64, align 8 + %kfinal.addr = alloca i64*, align 8 + %pid.addr = alloca i32, align 4 
+ %barrier.addr = alloca %union.pthread_barrier_t*, align 8 + %i = alloca i32, align 4 + %cost = alloca float, align 4 + %lastcost = alloca float, align 4 + %hiz = alloca float, align 4 + %loz = alloca float, align 4 + %z = alloca float, align 4 + %numberOfPoints = alloca i64, align 8 + %ptDimension = alloca i64, align 8 + %bsize = alloca i64, align 8 + %k1 = alloca i64, align 8 + %k2 = alloca i64, align 8 + %myhiz = alloca float, align 4 + %kk = alloca i64, align 8 + %agg.tmp = alloca %struct.Point, align 8 + %agg.tmp10 = alloca %struct.Point, align 8 + %i20 = alloca i32, align 4 + %kk37 = alloca i64, align 8 + %i81 = alloca i32, align 4 + store %struct.Points* %points, %struct.Points** %points.addr, align 8 + store i64 %kmin, i64* %kmin.addr, align 8 + store i64 %kmax, i64* %kmax.addr, align 8 + store i64* %kfinal, i64** %kfinal.addr, align 8 + store i32 %pid, i32* %pid.addr, align 4 + store %union.pthread_barrier_t* %barrier, %union.pthread_barrier_t** %barrier.addr, align 8 + %0 = load i32, i32* %pid.addr, align 4 + %cmp = icmp eq i32 %0, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %1 = load i32, i32* @_ZL5nproc, align 4 + %conv = sext i32 %1 to i64 + %call = call noalias i8* @calloc(i64 %conv, i64 4) #2 + %2 = bitcast i8* %call to float* + store float* %2, float** @_ZZ8pkmedianP6PointsllPliP17pthread_barrier_tE4hizs, align 8 + br label %if.end + +if.end: ; preds = %if.then, %entry + store float 0.000000e+00, float* %loz, align 4 + store float 0.000000e+00, float* %hiz, align 4 + %3 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %num = getelementptr inbounds %struct.Points, %struct.Points* %3, i32 0, i32 0 + %4 = load i64, i64* %num, align 8 + store i64 %4, i64* %numberOfPoints, align 8 + %5 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %dim = getelementptr inbounds %struct.Points, %struct.Points* %5, i32 0, i32 1 + %6 = load i32, i32* %dim, align 8 + %conv1 = sext i32 %6 to i64 + store i64 
%conv1, i64* %ptDimension, align 8 + %7 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %num2 = getelementptr inbounds %struct.Points, %struct.Points* %7, i32 0, i32 0 + %8 = load i64, i64* %num2, align 8 + %9 = load i32, i32* @_ZL5nproc, align 4 + %conv3 = sext i32 %9 to i64 + %div = sdiv i64 %8, %conv3 + store i64 %div, i64* %bsize, align 8 + %10 = load i64, i64* %bsize, align 8 + %11 = load i32, i32* %pid.addr, align 4 + %conv4 = sext i32 %11 to i64 + %mul = mul nsw i64 %10, %conv4 + store i64 %mul, i64* %k1, align 8 + %12 = load i64, i64* %k1, align 8 + %13 = load i64, i64* %bsize, align 8 + %add = add nsw i64 %12, %13 + store i64 %add, i64* %k2, align 8 + %14 = load i32, i32* %pid.addr, align 4 + %15 = load i32, i32* @_ZL5nproc, align 4 + %sub = sub nsw i32 %15, 1 + %cmp5 = icmp eq i32 %14, %sub + br i1 %cmp5, label %if.then6, label %if.end8 + +if.then6: ; preds = %if.end + %16 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %num7 = getelementptr inbounds %struct.Points, %struct.Points* %16, i32 0, i32 0 + %17 = load i64, i64* %num7, align 8 + store i64 %17, i64* %k2, align 8 + br label %if.end8 + +if.end8: ; preds = %if.then6, %if.end + store float 0.000000e+00, float* %myhiz, align 4 + %18 = load i64, i64* %k1, align 8 + store i64 %18, i64* %kk, align 8 + br label %for.cond + +for.cond: ; preds = %for.inc, %if.end8 + %19 = load i64, i64* %kk, align 8 + %20 = load i64, i64* %k2, align 8 + %cmp9 = icmp slt i64 %19, %20 + br i1 %cmp9, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %21 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p = getelementptr inbounds %struct.Points, %struct.Points* %21, i32 0, i32 2 + %22 = load %struct.Point*, %struct.Point** %p, align 8 + %23 = load i64, i64* %kk, align 8 + %arrayidx = getelementptr inbounds %struct.Point, %struct.Point* %22, i64 %23 + %24 = bitcast %struct.Point* %agg.tmp to i8* + %25 = bitcast %struct.Point* %arrayidx to i8* + call void 
@llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %24, i8* align 8 %25, i64 32, i1 false) + %26 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p11 = getelementptr inbounds %struct.Points, %struct.Points* %26, i32 0, i32 2 + %27 = load %struct.Point*, %struct.Point** %p11, align 8 + %arrayidx12 = getelementptr inbounds %struct.Point, %struct.Point* %27, i64 0 + %28 = bitcast %struct.Point* %agg.tmp10 to i8* + %29 = bitcast %struct.Point* %arrayidx12 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %28, i8* align 8 %29, i64 32, i1 false) + %30 = load i64, i64* %ptDimension, align 8 + %conv13 = trunc i64 %30 to i32 + %call14 = call float @_Z4dist5PointS_i(%struct.Point* byval(%struct.Point) align 8 %agg.tmp, %struct.Point* byval(%struct.Point) align 8 %agg.tmp10, i32 %conv13) + %31 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p15 = getelementptr inbounds %struct.Points, %struct.Points* %31, i32 0, i32 2 + %32 = load %struct.Point*, %struct.Point** %p15, align 8 + %33 = load i64, i64* %kk, align 8 + %arrayidx16 = getelementptr inbounds %struct.Point, %struct.Point* %32, i64 %33 + %weight = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx16, i32 0, i32 0 + %34 = load float, float* %weight, align 8 + %mul17 = fmul contract float %call14, %34 + %35 = load float, float* %myhiz, align 4 + %add18 = fadd contract float %35, %mul17 + store float %add18, float* %myhiz, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %36 = load i64, i64* %kk, align 8 + %inc = add nsw i64 %36, 1 + store i64 %inc, i64* %kk, align 8 + br label %for.cond + +for.end: ; preds = %for.cond + %37 = load float, float* %myhiz, align 4 + %38 = load float*, float** @_ZZ8pkmedianP6PointsllPliP17pthread_barrier_tE4hizs, align 8 + %39 = load i32, i32* %pid.addr, align 4 + %idxprom = sext i32 %39 to i64 + %arrayidx19 = getelementptr inbounds float, float* %38, i64 %idxprom + store float %37, float* %arrayidx19, align 4 + store i32 0, i32* %i20, 
align 4 + br label %for.cond21 + +for.cond21: ; preds = %for.inc27, %for.end + %40 = load i32, i32* %i20, align 4 + %41 = load i32, i32* @_ZL5nproc, align 4 + %cmp22 = icmp slt i32 %40, %41 + br i1 %cmp22, label %for.body23, label %for.end29 + +for.body23: ; preds = %for.cond21 + %42 = load float*, float** @_ZZ8pkmedianP6PointsllPliP17pthread_barrier_tE4hizs, align 8 + %43 = load i32, i32* %i20, align 4 + %idxprom24 = sext i32 %43 to i64 + %arrayidx25 = getelementptr inbounds float, float* %42, i64 %idxprom24 + %44 = load float, float* %arrayidx25, align 4 + %45 = load float, float* %hiz, align 4 + %add26 = fadd contract float %45, %44 + store float %add26, float* %hiz, align 4 + br label %for.inc27 + +for.inc27: ; preds = %for.body23 + %46 = load i32, i32* %i20, align 4 + %inc28 = add nsw i32 %46, 1 + store i32 %inc28, i32* %i20, align 4 + br label %for.cond21 + +for.end29: ; preds = %for.cond21 + store float 0.000000e+00, float* %loz, align 4 + %47 = load float, float* %hiz, align 4 + %48 = load float, float* %loz, align 4 + %add30 = fadd contract float %47, %48 + %conv31 = fpext float %add30 to double + %div32 = fdiv double %conv31, 2.000000e+00 + %conv33 = fptrunc double %div32 to float + store float %conv33, float* %z, align 4 + %49 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %num34 = getelementptr inbounds %struct.Points, %struct.Points* %49, i32 0, i32 0 + %50 = load i64, i64* %num34, align 8 + %51 = load i64, i64* %kmax.addr, align 8 + %cmp35 = icmp sle i64 %50, %51 + br i1 %cmp35, label %if.then36, label %if.end52 + +if.then36: ; preds = %for.end29 + %52 = load i64, i64* %k1, align 8 + store i64 %52, i64* %kk37, align 8 + br label %for.cond38 + +for.cond38: ; preds = %for.inc46, %if.then36 + %53 = load i64, i64* %kk37, align 8 + %54 = load i64, i64* %k2, align 8 + %cmp39 = icmp slt i64 %53, %54 + br i1 %cmp39, label %for.body40, label %for.end48 + +for.body40: ; preds = %for.cond38 + %55 = load i64, i64* %kk37, align 8 + %56 = load 
%struct.Points*, %struct.Points** %points.addr, align 8 + %p41 = getelementptr inbounds %struct.Points, %struct.Points* %56, i32 0, i32 2 + %57 = load %struct.Point*, %struct.Point** %p41, align 8 + %58 = load i64, i64* %kk37, align 8 + %arrayidx42 = getelementptr inbounds %struct.Point, %struct.Point* %57, i64 %58 + %assign = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx42, i32 0, i32 2 + store i64 %55, i64* %assign, align 8 + %59 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p43 = getelementptr inbounds %struct.Points, %struct.Points* %59, i32 0, i32 2 + %60 = load %struct.Point*, %struct.Point** %p43, align 8 + %61 = load i64, i64* %kk37, align 8 + %arrayidx44 = getelementptr inbounds %struct.Point, %struct.Point* %60, i64 %61 + %cost45 = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx44, i32 0, i32 3 + store float 0.000000e+00, float* %cost45, align 8 + br label %for.inc46 + +for.inc46: ; preds = %for.body40 + %62 = load i64, i64* %kk37, align 8 + %inc47 = add nsw i64 %62, 1 + store i64 %inc47, i64* %kk37, align 8 + br label %for.cond38 + +for.end48: ; preds = %for.cond38 + store float 0.000000e+00, float* %cost, align 4 + %63 = load i32, i32* %pid.addr, align 4 + %cmp49 = icmp eq i32 %63, 0 + br i1 %cmp49, label %if.then50, label %if.end51 + +if.then50: ; preds = %for.end48 + %64 = load float*, float** @_ZZ8pkmedianP6PointsllPliP17pthread_barrier_tE4hizs, align 8 + %65 = bitcast float* %64 to i8* + call void @free(i8* %65) #2 + %66 = load i64, i64* @_ZZ8pkmedianP6PointsllPliP17pthread_barrier_tE1k, align 8 + %67 = load i64*, i64** %kfinal.addr, align 8 + store i64 %66, i64* %67, align 8 + br label %if.end51 + +if.end51: ; preds = %if.then50, %for.end48 + %68 = load float, float* %cost, align 4 + store float %68, float* %retval, align 4 + br label %return + +if.end52: ; preds = %for.end29 + %69 = load i32, i32* %pid.addr, align 4 + %cmp53 = icmp eq i32 %69, 0 + br i1 %cmp53, label %if.then54, label %if.end55 + 
+if.then54: ; preds = %if.end52 + %70 = load %struct.Points*, %struct.Points** %points.addr, align 8 + call void @_Z7shuffleP6Points(%struct.Points* %70) + br label %if.end55 + +if.end55: ; preds = %if.then54, %if.end52 + %71 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %72 = load float, float* %z, align 4 + %73 = load i32, i32* %pid.addr, align 4 + %74 = load %union.pthread_barrier_t*, %union.pthread_barrier_t** %barrier.addr, align 8 + %call56 = call float @_Z7pspeedyP6PointsfPliP17pthread_barrier_t(%struct.Points* %71, float %72, i64* @_ZZ8pkmedianP6PointsllPliP17pthread_barrier_tE1k, i32 %73, %union.pthread_barrier_t* %74) + store float %call56, float* %cost, align 4 + store i32 0, i32* %i, align 4 + br label %while.cond + +while.cond: ; preds = %while.body, %if.end55 + %75 = load i64, i64* @_ZZ8pkmedianP6PointsllPliP17pthread_barrier_tE1k, align 8 + %76 = load i64, i64* %kmin.addr, align 8 + %cmp57 = icmp slt i64 %75, %76 + br i1 %cmp57, label %land.rhs, label %land.end + +land.rhs: ; preds = %while.cond + %77 = load i32, i32* %i, align 4 + %cmp58 = icmp slt i32 %77, 1 + br label %land.end + +land.end: ; preds = %land.rhs, %while.cond + %78 = phi i1 [ false, %while.cond ], [ %cmp58, %land.rhs ] + br i1 %78, label %while.body, label %while.end + +while.body: ; preds = %land.end + %79 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %80 = load float, float* %z, align 4 + %81 = load i32, i32* %pid.addr, align 4 + %82 = load %union.pthread_barrier_t*, %union.pthread_barrier_t** %barrier.addr, align 8 + %call59 = call float @_Z7pspeedyP6PointsfPliP17pthread_barrier_t(%struct.Points* %79, float %80, i64* @_ZZ8pkmedianP6PointsllPliP17pthread_barrier_tE1k, i32 %81, %union.pthread_barrier_t* %82) + store float %call59, float* %cost, align 4 + %83 = load i32, i32* %i, align 4 + %inc60 = add nsw i32 %83, 1 + store i32 %inc60, i32* %i, align 4 + br label %while.cond + +while.end: ; preds = %land.end + br label %while.cond61 + 
+while.cond61: ; preds = %if.end73, %while.end + %84 = load i64, i64* @_ZZ8pkmedianP6PointsllPliP17pthread_barrier_tE1k, align 8 + %85 = load i64, i64* %kmin.addr, align 8 + %cmp62 = icmp slt i64 %84, %85 + br i1 %cmp62, label %while.body63, label %while.end76 + +while.body63: ; preds = %while.cond61 + %86 = load i32, i32* %i, align 4 + %cmp64 = icmp sge i32 %86, 1 + br i1 %cmp64, label %if.then65, label %if.end70 + +if.then65: ; preds = %while.body63 + %87 = load float, float* %z, align 4 + store float %87, float* %hiz, align 4 + %88 = load float, float* %hiz, align 4 + %89 = load float, float* %loz, align 4 + %add66 = fadd contract float %88, %89 + %conv67 = fpext float %add66 to double + %div68 = fdiv double %conv67, 2.000000e+00 + %conv69 = fptrunc double %div68 to float + store float %conv69, float* %z, align 4 + store i32 0, i32* %i, align 4 + br label %if.end70 + +if.end70: ; preds = %if.then65, %while.body63 + %90 = load i32, i32* %pid.addr, align 4 + %cmp71 = icmp eq i32 %90, 0 + br i1 %cmp71, label %if.then72, label %if.end73 + +if.then72: ; preds = %if.end70 + %91 = load %struct.Points*, %struct.Points** %points.addr, align 8 + call void @_Z7shuffleP6Points(%struct.Points* %91) + br label %if.end73 + +if.end73: ; preds = %if.then72, %if.end70 + %92 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %93 = load float, float* %z, align 4 + %94 = load i32, i32* %pid.addr, align 4 + %95 = load %union.pthread_barrier_t*, %union.pthread_barrier_t** %barrier.addr, align 8 + %call74 = call float @_Z7pspeedyP6PointsfPliP17pthread_barrier_t(%struct.Points* %92, float %93, i64* @_ZZ8pkmedianP6PointsllPliP17pthread_barrier_tE1k, i32 %94, %union.pthread_barrier_t* %95) + store float %call74, float* %cost, align 4 + %96 = load i32, i32* %i, align 4 + %inc75 = add nsw i32 %96, 1 + store i32 %inc75, i32* %i, align 4 + br label %while.cond61 + +while.end76: ; preds = %while.cond61 + %97 = load i32, i32* %pid.addr, align 4 + %cmp77 = icmp eq i32 %97, 0 + br 
i1 %cmp77, label %if.then78, label %if.end95 + +if.then78: ; preds = %while.end76 + %98 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %99 = load i64, i64* %kmin.addr, align 8 + %conv79 = trunc i64 %99 to i32 + %100 = load i32, i32* %pid.addr, align 4 + %101 = load %union.pthread_barrier_t*, %union.pthread_barrier_t** %barrier.addr, align 8 + %call80 = call i32 @_Z19selectfeasible_fastP6PointsPPiiiP17pthread_barrier_t(%struct.Points* %98, i32** @_ZZ8pkmedianP6PointsllPliP17pthread_barrier_tE8feasible, i32 %conv79, i32 %100, %union.pthread_barrier_t* %101) + store i32 %call80, i32* @_ZZ8pkmedianP6PointsllPliP17pthread_barrier_tE11numfeasible, align 4 + store i32 0, i32* %i81, align 4 + br label %for.cond82 + +for.cond82: ; preds = %for.inc92, %if.then78 + %102 = load i32, i32* %i81, align 4 + %conv83 = sext i32 %102 to i64 + %103 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %num84 = getelementptr inbounds %struct.Points, %struct.Points* %103, i32 0, i32 0 + %104 = load i64, i64* %num84, align 8 + %cmp85 = icmp slt i64 %conv83, %104 + br i1 %cmp85, label %for.body86, label %for.end94 + +for.body86: ; preds = %for.cond82 + %105 = load i8*, i8** @_ZL9is_center, align 8 + %106 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p87 = getelementptr inbounds %struct.Points, %struct.Points* %106, i32 0, i32 2 + %107 = load %struct.Point*, %struct.Point** %p87, align 8 + %108 = load i32, i32* %i81, align 4 + %idxprom88 = sext i32 %108 to i64 + %arrayidx89 = getelementptr inbounds %struct.Point, %struct.Point* %107, i64 %idxprom88 + %assign90 = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx89, i32 0, i32 2 + %109 = load i64, i64* %assign90, align 8 + %arrayidx91 = getelementptr inbounds i8, i8* %105, i64 %109 + store i8 1, i8* %arrayidx91, align 1 + br label %for.inc92 + +for.inc92: ; preds = %for.body86 + %110 = load i32, i32* %i81, align 4 + %inc93 = add nsw i32 %110, 1 + store i32 %inc93, i32* %i81, align 
4 + br label %for.cond82 + +for.end94: ; preds = %for.cond82 + br label %if.end95 + +if.end95: ; preds = %for.end94, %while.end76 + br label %while.body97 + +while.body97: ; preds = %if.end95, %if.end160 + %111 = load float, float* %cost, align 4 + store float %111, float* %lastcost, align 4 + %112 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %113 = load i32*, i32** @_ZZ8pkmedianP6PointsllPliP17pthread_barrier_tE8feasible, align 8 + %114 = load i32, i32* @_ZZ8pkmedianP6PointsllPliP17pthread_barrier_tE11numfeasible, align 4 + %115 = load float, float* %z, align 4 + %116 = load i64, i64* %kmax.addr, align 8 + %conv98 = trunc i64 %116 to i32 + %117 = load float, float* %cost, align 4 + %118 = load i64, i64* %kmax.addr, align 8 + %mul99 = mul nsw i64 3, %118 + %conv100 = sitofp i64 %mul99 to float + %119 = load i64, i64* %kmax.addr, align 8 + %conv101 = sitofp i64 %119 to float + %call102 = call float @_ZSt3logf(float %conv101) + %mul103 = fmul contract float %conv100, %call102 + %conv104 = fptosi float %mul103 to i64 + %120 = load i32, i32* %pid.addr, align 4 + %121 = load %union.pthread_barrier_t*, %union.pthread_barrier_t** %barrier.addr, align 8 + %call105 = call float @_Z3pFLP6PointsPiifPliflfiP17pthread_barrier_t(%struct.Points* %112, i32* %113, i32 %114, float %115, i64* @_ZZ8pkmedianP6PointsllPliP17pthread_barrier_tE1k, i32 %conv98, float %117, i64 %conv104, float 0x3FB99999A0000000, i32 %120, %union.pthread_barrier_t* %121) + store float %call105, float* %cost, align 4 + %122 = load i64, i64* @_ZZ8pkmedianP6PointsllPliP17pthread_barrier_tE1k, align 8 + %conv106 = sitofp i64 %122 to double + %123 = load i64, i64* %kmax.addr, align 8 + %conv107 = sitofp i64 %123 to double + %mul108 = fmul contract double 1.100000e+00, %conv107 + %cmp109 = fcmp ole double %conv106, %mul108 + br i1 %cmp109, label %land.lhs.true, label %lor.lhs.false + +land.lhs.true: ; preds = %while.body97 + %124 = load i64, i64* 
@_ZZ8pkmedianP6PointsllPliP17pthread_barrier_tE1k, align 8 + %conv110 = sitofp i64 %124 to double + %125 = load i64, i64* %kmin.addr, align 8 + %conv111 = sitofp i64 %125 to double + %mul112 = fmul contract double 9.000000e-01, %conv111 + %cmp113 = fcmp oge double %conv110, %mul112 + br i1 %cmp113, label %if.then119, label %lor.lhs.false + +lor.lhs.false: ; preds = %land.lhs.true, %while.body97 + %126 = load i64, i64* @_ZZ8pkmedianP6PointsllPliP17pthread_barrier_tE1k, align 8 + %127 = load i64, i64* %kmax.addr, align 8 + %add114 = add nsw i64 %127, 2 + %cmp115 = icmp sle i64 %126, %add114 + br i1 %cmp115, label %land.lhs.true116, label %if.end128 + +land.lhs.true116: ; preds = %lor.lhs.false + %128 = load i64, i64* @_ZZ8pkmedianP6PointsllPliP17pthread_barrier_tE1k, align 8 + %129 = load i64, i64* %kmin.addr, align 8 + %sub117 = sub nsw i64 %129, 2 + %cmp118 = icmp sge i64 %128, %sub117 + br i1 %cmp118, label %if.then119, label %if.end128 + +if.then119: ; preds = %land.lhs.true116, %land.lhs.true + %130 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %131 = load i32*, i32** @_ZZ8pkmedianP6PointsllPliP17pthread_barrier_tE8feasible, align 8 + %132 = load i32, i32* @_ZZ8pkmedianP6PointsllPliP17pthread_barrier_tE11numfeasible, align 4 + %133 = load float, float* %z, align 4 + %134 = load i64, i64* %kmax.addr, align 8 + %conv120 = trunc i64 %134 to i32 + %135 = load float, float* %cost, align 4 + %136 = load i64, i64* %kmax.addr, align 8 + %mul121 = mul nsw i64 3, %136 + %conv122 = sitofp i64 %mul121 to float + %137 = load i64, i64* %kmax.addr, align 8 + %conv123 = sitofp i64 %137 to float + %call124 = call float @_ZSt3logf(float %conv123) + %mul125 = fmul contract float %conv122, %call124 + %conv126 = fptosi float %mul125 to i64 + %138 = load i32, i32* %pid.addr, align 4 + %139 = load %union.pthread_barrier_t*, %union.pthread_barrier_t** %barrier.addr, align 8 + %call127 = call float @_Z3pFLP6PointsPiifPliflfiP17pthread_barrier_t(%struct.Points* %130, 
i32* %131, i32 %132, float %133, i64* @_ZZ8pkmedianP6PointsllPliP17pthread_barrier_tE1k, i32 %conv120, float %135, i64 %conv126, float 0x3F50624DE0000000, i32 %138, %union.pthread_barrier_t* %139) + store float %call127, float* %cost, align 4 + br label %if.end128 + +if.end128: ; preds = %if.then119, %land.lhs.true116, %lor.lhs.false + %140 = load i64, i64* @_ZZ8pkmedianP6PointsllPliP17pthread_barrier_tE1k, align 8 + %141 = load i64, i64* %kmax.addr, align 8 + %cmp129 = icmp sgt i64 %140, %141 + br i1 %cmp129, label %if.then130, label %if.end139 + +if.then130: ; preds = %if.end128 + %142 = load float, float* %z, align 4 + store float %142, float* %loz, align 4 + %143 = load float, float* %hiz, align 4 + %144 = load float, float* %loz, align 4 + %add131 = fadd contract float %143, %144 + %conv132 = fpext float %add131 to double + %div133 = fdiv double %conv132, 2.000000e+00 + %conv134 = fptrunc double %div133 to float + store float %conv134, float* %z, align 4 + %145 = load float, float* %z, align 4 + %146 = load float, float* %loz, align 4 + %sub135 = fsub contract float %145, %146 + %147 = load i64, i64* @_ZZ8pkmedianP6PointsllPliP17pthread_barrier_tE1k, align 8 + %conv136 = sitofp i64 %147 to float + %mul137 = fmul contract float %sub135, %conv136 + %148 = load float, float* %cost, align 4 + %add138 = fadd contract float %148, %mul137 + store float %add138, float* %cost, align 4 + br label %if.end139 + +if.end139: ; preds = %if.then130, %if.end128 + %149 = load i64, i64* @_ZZ8pkmedianP6PointsllPliP17pthread_barrier_tE1k, align 8 + %150 = load i64, i64* %kmin.addr, align 8 + %cmp140 = icmp slt i64 %149, %150 + br i1 %cmp140, label %if.then141, label %if.end150 + +if.then141: ; preds = %if.end139 + %151 = load float, float* %z, align 4 + store float %151, float* %hiz, align 4 + %152 = load float, float* %hiz, align 4 + %153 = load float, float* %loz, align 4 + %add142 = fadd contract float %152, %153 + %conv143 = fpext float %add142 to double + %div144 = fdiv 
double %conv143, 2.000000e+00 + %conv145 = fptrunc double %div144 to float + store float %conv145, float* %z, align 4 + %154 = load float, float* %z, align 4 + %155 = load float, float* %hiz, align 4 + %sub146 = fsub contract float %154, %155 + %156 = load i64, i64* @_ZZ8pkmedianP6PointsllPliP17pthread_barrier_tE1k, align 8 + %conv147 = sitofp i64 %156 to float + %mul148 = fmul contract float %sub146, %conv147 + %157 = load float, float* %cost, align 4 + %add149 = fadd contract float %157, %mul148 + store float %add149, float* %cost, align 4 + br label %if.end150 + +if.end150: ; preds = %if.then141, %if.end139 + %158 = load i64, i64* @_ZZ8pkmedianP6PointsllPliP17pthread_barrier_tE1k, align 8 + %159 = load i64, i64* %kmax.addr, align 8 + %cmp151 = icmp sle i64 %158, %159 + br i1 %cmp151, label %land.lhs.true152, label %lor.lhs.false154 + +land.lhs.true152: ; preds = %if.end150 + %160 = load i64, i64* @_ZZ8pkmedianP6PointsllPliP17pthread_barrier_tE1k, align 8 + %161 = load i64, i64* %kmin.addr, align 8 + %cmp153 = icmp sge i64 %160, %161 + br i1 %cmp153, label %if.then159, label %lor.lhs.false154 + +lor.lhs.false154: ; preds = %land.lhs.true152, %if.end150 + %162 = load float, float* %loz, align 4 + %conv155 = fpext float %162 to double + %163 = load float, float* %hiz, align 4 + %conv156 = fpext float %163 to double + %mul157 = fmul contract double 0x3FEFF7CED916872B, %conv156 + %cmp158 = fcmp oge double %conv155, %mul157 + br i1 %cmp158, label %if.then159, label %if.end160 + +if.then159: ; preds = %lor.lhs.false154, %land.lhs.true152 + br label %while.end161 + +if.end160: ; preds = %lor.lhs.false154 + br label %while.body97 + +while.end161: ; preds = %if.then159 + %164 = load i32, i32* %pid.addr, align 4 + %cmp162 = icmp eq i32 %164, 0 + br i1 %cmp162, label %if.then163, label %if.end164 + +if.then163: ; preds = %while.end161 + %165 = load i32*, i32** @_ZZ8pkmedianP6PointsllPliP17pthread_barrier_tE8feasible, align 8 + %166 = bitcast i32* %165 to i8* + call void 
@free(i8* %166) #2 + %167 = load float*, float** @_ZZ8pkmedianP6PointsllPliP17pthread_barrier_tE4hizs, align 8 + %168 = bitcast float* %167 to i8* + call void @free(i8* %168) #2 + %169 = load i64, i64* @_ZZ8pkmedianP6PointsllPliP17pthread_barrier_tE1k, align 8 + %170 = load i64*, i64** %kfinal.addr, align 8 + store i64 %169, i64* %170, align 8 + br label %if.end164 + +if.end164: ; preds = %if.then163, %while.end161 + %171 = load float, float* %cost, align 4 + store float %171, float* %retval, align 4 + br label %return + +return: ; preds = %if.end164, %if.end51 + %172 = load float, float* %retval, align 4 + ret float %172 +} + +; Function Attrs: nounwind +declare dso_local noalias i8* @calloc(i64, i64) #7 + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local i32 @_Z11contcentersP6Points(%struct.Points* %points) #6 { +entry: + %points.addr = alloca %struct.Points*, align 8 + %i = alloca i64, align 8 + %ii = alloca i64, align 8 + %relweight = alloca float, align 4 + store %struct.Points* %points, %struct.Points** %points.addr, align 8 + store i64 0, i64* %i, align 8 + br label %for.cond + +for.cond: ; preds = %for.inc48, %entry + %0 = load i64, i64* %i, align 8 + %1 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %num = getelementptr inbounds %struct.Points, %struct.Points* %1, i32 0, i32 0 + %2 = load i64, i64* %num, align 8 + %cmp = icmp slt i64 %0, %2 + br i1 %cmp, label %for.body, label %for.end50 + +for.body: ; preds = %for.cond + %3 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p = getelementptr inbounds %struct.Points, %struct.Points* %3, i32 0, i32 2 + %4 = load %struct.Point*, %struct.Point** %p, align 8 + %5 = load i64, i64* %i, align 8 + %arrayidx = getelementptr inbounds %struct.Point, %struct.Point* %4, i64 %5 + %assign = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx, i32 0, i32 2 + %6 = load i64, i64* %assign, align 8 + %7 = load i64, i64* %i, align 8 + %cmp1 = icmp ne i64 %6, %7 + 
br i1 %cmp1, label %if.then, label %if.end + +if.then: ; preds = %for.body + %8 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p2 = getelementptr inbounds %struct.Points, %struct.Points* %8, i32 0, i32 2 + %9 = load %struct.Point*, %struct.Point** %p2, align 8 + %10 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p3 = getelementptr inbounds %struct.Points, %struct.Points* %10, i32 0, i32 2 + %11 = load %struct.Point*, %struct.Point** %p3, align 8 + %12 = load i64, i64* %i, align 8 + %arrayidx4 = getelementptr inbounds %struct.Point, %struct.Point* %11, i64 %12 + %assign5 = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx4, i32 0, i32 2 + %13 = load i64, i64* %assign5, align 8 + %arrayidx6 = getelementptr inbounds %struct.Point, %struct.Point* %9, i64 %13 + %weight = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx6, i32 0, i32 0 + %14 = load float, float* %weight, align 8 + %15 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p7 = getelementptr inbounds %struct.Points, %struct.Points* %15, i32 0, i32 2 + %16 = load %struct.Point*, %struct.Point** %p7, align 8 + %17 = load i64, i64* %i, align 8 + %arrayidx8 = getelementptr inbounds %struct.Point, %struct.Point* %16, i64 %17 + %weight9 = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx8, i32 0, i32 0 + %18 = load float, float* %weight9, align 8 + %add = fadd contract float %14, %18 + store float %add, float* %relweight, align 4 + %19 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p10 = getelementptr inbounds %struct.Points, %struct.Points* %19, i32 0, i32 2 + %20 = load %struct.Point*, %struct.Point** %p10, align 8 + %21 = load i64, i64* %i, align 8 + %arrayidx11 = getelementptr inbounds %struct.Point, %struct.Point* %20, i64 %21 + %weight12 = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx11, i32 0, i32 0 + %22 = load float, float* %weight12, align 8 + %23 = load float, float* %relweight, 
align 4 + %div = fdiv float %22, %23 + store float %div, float* %relweight, align 4 + store i64 0, i64* %ii, align 8 + br label %for.cond13 + +for.cond13: ; preds = %for.inc, %if.then + %24 = load i64, i64* %ii, align 8 + %25 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %dim = getelementptr inbounds %struct.Points, %struct.Points* %25, i32 0, i32 1 + %26 = load i32, i32* %dim, align 8 + %conv = sext i32 %26 to i64 + %cmp14 = icmp slt i64 %24, %conv + br i1 %cmp14, label %for.body15, label %for.end + +for.body15: ; preds = %for.cond13 + %27 = load float, float* %relweight, align 4 + %conv16 = fpext float %27 to double + %sub = fsub contract double 1.000000e+00, %conv16 + %28 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p17 = getelementptr inbounds %struct.Points, %struct.Points* %28, i32 0, i32 2 + %29 = load %struct.Point*, %struct.Point** %p17, align 8 + %30 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p18 = getelementptr inbounds %struct.Points, %struct.Points* %30, i32 0, i32 2 + %31 = load %struct.Point*, %struct.Point** %p18, align 8 + %32 = load i64, i64* %i, align 8 + %arrayidx19 = getelementptr inbounds %struct.Point, %struct.Point* %31, i64 %32 + %assign20 = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx19, i32 0, i32 2 + %33 = load i64, i64* %assign20, align 8 + %arrayidx21 = getelementptr inbounds %struct.Point, %struct.Point* %29, i64 %33 + %coord = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx21, i32 0, i32 1 + %34 = load float*, float** %coord, align 8 + %35 = load i64, i64* %ii, align 8 + %arrayidx22 = getelementptr inbounds float, float* %34, i64 %35 + %36 = load float, float* %arrayidx22, align 4 + %conv23 = fpext float %36 to double + %mul = fmul contract double %conv23, %sub + %conv24 = fptrunc double %mul to float + store float %conv24, float* %arrayidx22, align 4 + %37 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p25 = 
getelementptr inbounds %struct.Points, %struct.Points* %37, i32 0, i32 2 + %38 = load %struct.Point*, %struct.Point** %p25, align 8 + %39 = load i64, i64* %i, align 8 + %arrayidx26 = getelementptr inbounds %struct.Point, %struct.Point* %38, i64 %39 + %coord27 = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx26, i32 0, i32 1 + %40 = load float*, float** %coord27, align 8 + %41 = load i64, i64* %ii, align 8 + %arrayidx28 = getelementptr inbounds float, float* %40, i64 %41 + %42 = load float, float* %arrayidx28, align 4 + %43 = load float, float* %relweight, align 4 + %mul29 = fmul contract float %42, %43 + %44 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p30 = getelementptr inbounds %struct.Points, %struct.Points* %44, i32 0, i32 2 + %45 = load %struct.Point*, %struct.Point** %p30, align 8 + %46 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p31 = getelementptr inbounds %struct.Points, %struct.Points* %46, i32 0, i32 2 + %47 = load %struct.Point*, %struct.Point** %p31, align 8 + %48 = load i64, i64* %i, align 8 + %arrayidx32 = getelementptr inbounds %struct.Point, %struct.Point* %47, i64 %48 + %assign33 = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx32, i32 0, i32 2 + %49 = load i64, i64* %assign33, align 8 + %arrayidx34 = getelementptr inbounds %struct.Point, %struct.Point* %45, i64 %49 + %coord35 = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx34, i32 0, i32 1 + %50 = load float*, float** %coord35, align 8 + %51 = load i64, i64* %ii, align 8 + %arrayidx36 = getelementptr inbounds float, float* %50, i64 %51 + %52 = load float, float* %arrayidx36, align 4 + %add37 = fadd contract float %52, %mul29 + store float %add37, float* %arrayidx36, align 4 + br label %for.inc + +for.inc: ; preds = %for.body15 + %53 = load i64, i64* %ii, align 8 + %inc = add nsw i64 %53, 1 + store i64 %inc, i64* %ii, align 8 + br label %for.cond13 + +for.end: ; preds = %for.cond13 + %54 = load 
%struct.Points*, %struct.Points** %points.addr, align 8 + %p38 = getelementptr inbounds %struct.Points, %struct.Points* %54, i32 0, i32 2 + %55 = load %struct.Point*, %struct.Point** %p38, align 8 + %56 = load i64, i64* %i, align 8 + %arrayidx39 = getelementptr inbounds %struct.Point, %struct.Point* %55, i64 %56 + %weight40 = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx39, i32 0, i32 0 + %57 = load float, float* %weight40, align 8 + %58 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p41 = getelementptr inbounds %struct.Points, %struct.Points* %58, i32 0, i32 2 + %59 = load %struct.Point*, %struct.Point** %p41, align 8 + %60 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p42 = getelementptr inbounds %struct.Points, %struct.Points* %60, i32 0, i32 2 + %61 = load %struct.Point*, %struct.Point** %p42, align 8 + %62 = load i64, i64* %i, align 8 + %arrayidx43 = getelementptr inbounds %struct.Point, %struct.Point* %61, i64 %62 + %assign44 = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx43, i32 0, i32 2 + %63 = load i64, i64* %assign44, align 8 + %arrayidx45 = getelementptr inbounds %struct.Point, %struct.Point* %59, i64 %63 + %weight46 = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx45, i32 0, i32 0 + %64 = load float, float* %weight46, align 8 + %add47 = fadd contract float %64, %57 + store float %add47, float* %weight46, align 8 + br label %if.end + +if.end: ; preds = %for.end, %for.body + br label %for.inc48 + +for.inc48: ; preds = %if.end + %65 = load i64, i64* %i, align 8 + %inc49 = add nsw i64 %65, 1 + store i64 %inc49, i64* %i, align 8 + br label %for.cond + +for.end50: ; preds = %for.cond + ret i32 0 +} + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local void @_Z11copycentersP6PointsS0_Pll(%struct.Points* %points, %struct.Points* %centers, i64* %centerIDs, i64 %offset) #6 { +entry: + %points.addr = alloca %struct.Points*, align 8 + %centers.addr = alloca 
%struct.Points*, align 8 + %centerIDs.addr = alloca i64*, align 8 + %offset.addr = alloca i64, align 8 + %i = alloca i64, align 8 + %k = alloca i64, align 8 + %is_a_median = alloca i8*, align 8 + store %struct.Points* %points, %struct.Points** %points.addr, align 8 + store %struct.Points* %centers, %struct.Points** %centers.addr, align 8 + store i64* %centerIDs, i64** %centerIDs.addr, align 8 + store i64 %offset, i64* %offset.addr, align 8 + %0 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %num = getelementptr inbounds %struct.Points, %struct.Points* %0, i32 0, i32 0 + %1 = load i64, i64* %num, align 8 + %call = call noalias i8* @calloc(i64 %1, i64 1) #2 + store i8* %call, i8** %is_a_median, align 8 + store i64 0, i64* %i, align 8 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %2 = load i64, i64* %i, align 8 + %3 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %num1 = getelementptr inbounds %struct.Points, %struct.Points* %3, i32 0, i32 0 + %4 = load i64, i64* %num1, align 8 + %cmp = icmp slt i64 %2, %4 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %5 = load i8*, i8** %is_a_median, align 8 + %6 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p = getelementptr inbounds %struct.Points, %struct.Points* %6, i32 0, i32 2 + %7 = load %struct.Point*, %struct.Point** %p, align 8 + %8 = load i64, i64* %i, align 8 + %arrayidx = getelementptr inbounds %struct.Point, %struct.Point* %7, i64 %8 + %assign = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx, i32 0, i32 2 + %9 = load i64, i64* %assign, align 8 + %arrayidx2 = getelementptr inbounds i8, i8* %5, i64 %9 + store i8 1, i8* %arrayidx2, align 1 + br label %for.inc + +for.inc: ; preds = %for.body + %10 = load i64, i64* %i, align 8 + %inc = add nsw i64 %10, 1 + store i64 %inc, i64* %i, align 8 + br label %for.cond + +for.end: ; preds = %for.cond + %11 = load %struct.Points*, %struct.Points** %centers.addr, 
align 8 + %num3 = getelementptr inbounds %struct.Points, %struct.Points* %11, i32 0, i32 0 + %12 = load i64, i64* %num3, align 8 + store i64 %12, i64* %k, align 8 + store i64 0, i64* %i, align 8 + br label %for.cond4 + +for.cond4: ; preds = %for.inc21, %for.end + %13 = load i64, i64* %i, align 8 + %14 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %num5 = getelementptr inbounds %struct.Points, %struct.Points* %14, i32 0, i32 0 + %15 = load i64, i64* %num5, align 8 + %cmp6 = icmp slt i64 %13, %15 + br i1 %cmp6, label %for.body7, label %for.end23 + +for.body7: ; preds = %for.cond4 + %16 = load i8*, i8** %is_a_median, align 8 + %17 = load i64, i64* %i, align 8 + %arrayidx8 = getelementptr inbounds i8, i8* %16, i64 %17 + %18 = load i8, i8* %arrayidx8, align 1 + %tobool = trunc i8 %18 to i1 + br i1 %tobool, label %if.then, label %if.end + +if.then: ; preds = %for.body7 + %19 = load %struct.Points*, %struct.Points** %centers.addr, align 8 + %p9 = getelementptr inbounds %struct.Points, %struct.Points* %19, i32 0, i32 2 + %20 = load %struct.Point*, %struct.Point** %p9, align 8 + %21 = load i64, i64* %k, align 8 + %arrayidx10 = getelementptr inbounds %struct.Point, %struct.Point* %20, i64 %21 + %coord = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx10, i32 0, i32 1 + %22 = load float*, float** %coord, align 8 + %23 = bitcast float* %22 to i8* + %24 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p11 = getelementptr inbounds %struct.Points, %struct.Points* %24, i32 0, i32 2 + %25 = load %struct.Point*, %struct.Point** %p11, align 8 + %26 = load i64, i64* %i, align 8 + %arrayidx12 = getelementptr inbounds %struct.Point, %struct.Point* %25, i64 %26 + %coord13 = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx12, i32 0, i32 1 + %27 = load float*, float** %coord13, align 8 + %28 = bitcast float* %27 to i8* + %29 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %dim = getelementptr inbounds 
%struct.Points, %struct.Points* %29, i32 0, i32 1 + %30 = load i32, i32* %dim, align 8 + %conv = sext i32 %30 to i64 + %mul = mul i64 %conv, 4 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %23, i8* align 4 %28, i64 %mul, i1 false) + %31 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %p14 = getelementptr inbounds %struct.Points, %struct.Points* %31, i32 0, i32 2 + %32 = load %struct.Point*, %struct.Point** %p14, align 8 + %33 = load i64, i64* %i, align 8 + %arrayidx15 = getelementptr inbounds %struct.Point, %struct.Point* %32, i64 %33 + %weight = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx15, i32 0, i32 0 + %34 = load float, float* %weight, align 8 + %35 = load %struct.Points*, %struct.Points** %centers.addr, align 8 + %p16 = getelementptr inbounds %struct.Points, %struct.Points* %35, i32 0, i32 2 + %36 = load %struct.Point*, %struct.Point** %p16, align 8 + %37 = load i64, i64* %k, align 8 + %arrayidx17 = getelementptr inbounds %struct.Point, %struct.Point* %36, i64 %37 + %weight18 = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx17, i32 0, i32 0 + store float %34, float* %weight18, align 8 + %38 = load i64, i64* %i, align 8 + %39 = load i64, i64* %offset.addr, align 8 + %add = add nsw i64 %38, %39 + %40 = load i64*, i64** %centerIDs.addr, align 8 + %41 = load i64, i64* %k, align 8 + %arrayidx19 = getelementptr inbounds i64, i64* %40, i64 %41 + store i64 %add, i64* %arrayidx19, align 8 + %42 = load i64, i64* %k, align 8 + %inc20 = add nsw i64 %42, 1 + store i64 %inc20, i64* %k, align 8 + br label %if.end + +if.end: ; preds = %if.then, %for.body7 + br label %for.inc21 + +for.inc21: ; preds = %if.end + %43 = load i64, i64* %i, align 8 + %inc22 = add nsw i64 %43, 1 + store i64 %inc22, i64* %i, align 8 + br label %for.cond4 + +for.end23: ; preds = %for.cond4 + %44 = load i64, i64* %k, align 8 + %45 = load %struct.Points*, %struct.Points** %centers.addr, align 8 + %num24 = getelementptr inbounds %struct.Points, 
%struct.Points* %45, i32 0, i32 0 + store i64 %44, i64* %num24, align 8 + %46 = load i8*, i8** %is_a_median, align 8 + call void @free(i8* %46) #2 + ret void +} + +; Function Attrs: noinline optnone uwtable +define dso_local i8* @_Z14localSearchSubPv(i8* %arg_) #3 { +entry: + %arg_.addr = alloca i8*, align 8 + %arg = alloca %struct.pkmedian_arg_t*, align 8 + store i8* %arg_, i8** %arg_.addr, align 8 + %0 = load i8*, i8** %arg_.addr, align 8 + %1 = bitcast i8* %0 to %struct.pkmedian_arg_t* + store %struct.pkmedian_arg_t* %1, %struct.pkmedian_arg_t** %arg, align 8 + %2 = load %struct.pkmedian_arg_t*, %struct.pkmedian_arg_t** %arg, align 8 + %points = getelementptr inbounds %struct.pkmedian_arg_t, %struct.pkmedian_arg_t* %2, i32 0, i32 0 + %3 = load %struct.Points*, %struct.Points** %points, align 8 + %4 = load %struct.pkmedian_arg_t*, %struct.pkmedian_arg_t** %arg, align 8 + %kmin = getelementptr inbounds %struct.pkmedian_arg_t, %struct.pkmedian_arg_t* %4, i32 0, i32 1 + %5 = load i64, i64* %kmin, align 8 + %6 = load %struct.pkmedian_arg_t*, %struct.pkmedian_arg_t** %arg, align 8 + %kmax = getelementptr inbounds %struct.pkmedian_arg_t, %struct.pkmedian_arg_t* %6, i32 0, i32 2 + %7 = load i64, i64* %kmax, align 8 + %8 = load %struct.pkmedian_arg_t*, %struct.pkmedian_arg_t** %arg, align 8 + %kfinal = getelementptr inbounds %struct.pkmedian_arg_t, %struct.pkmedian_arg_t* %8, i32 0, i32 3 + %9 = load i64*, i64** %kfinal, align 8 + %10 = load %struct.pkmedian_arg_t*, %struct.pkmedian_arg_t** %arg, align 8 + %pid = getelementptr inbounds %struct.pkmedian_arg_t, %struct.pkmedian_arg_t* %10, i32 0, i32 4 + %11 = load i32, i32* %pid, align 8 + %12 = load %struct.pkmedian_arg_t*, %struct.pkmedian_arg_t** %arg, align 8 + %barrier = getelementptr inbounds %struct.pkmedian_arg_t, %struct.pkmedian_arg_t* %12, i32 0, i32 5 + %13 = load %union.pthread_barrier_t*, %union.pthread_barrier_t** %barrier, align 8 + %call = call float 
@_Z8pkmedianP6PointsllPliP17pthread_barrier_t(%struct.Points* %3, i64 %5, i64 %7, i64* %9, i32 %11, %union.pthread_barrier_t* %13) + ret i8* null +} + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z11localSearchP6PointsllPl(%struct.Points* %points, i64 %kmin, i64 %kmax, i64* %kfinal) #3 { +entry: + %points.addr = alloca %struct.Points*, align 8 + %kmin.addr = alloca i64, align 8 + %kmax.addr = alloca i64, align 8 + %kfinal.addr = alloca i64*, align 8 + %t1 = alloca double, align 8 + %barrier = alloca %union.pthread_barrier_t, align 8 + %threads = alloca i64*, align 8 + %arg = alloca %struct.pkmedian_arg_t*, align 8 + %i = alloca i32, align 4 + %i20 = alloca i32, align 4 + %t2 = alloca double, align 8 + store %struct.Points* %points, %struct.Points** %points.addr, align 8 + store i64 %kmin, i64* %kmin.addr, align 8 + store i64 %kmax, i64* %kmax.addr, align 8 + store i64* %kfinal, i64** %kfinal.addr, align 8 + %call = call double @_Z7gettimev() + store double %call, double* %t1, align 8 + %0 = load i32, i32* @_ZL5nproc, align 4 + %1 = sext i32 %0 to i64 + %2 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %1, i64 8) + %3 = extractvalue { i64, i1 } %2, 1 + %4 = extractvalue { i64, i1 } %2, 0 + %5 = select i1 %3, i64 -1, i64 %4 + %call1 = call i8* @_Znam(i64 %5) #16 + %6 = bitcast i8* %call1 to i64* + store i64* %6, i64** %threads, align 8 + %7 = load i32, i32* @_ZL5nproc, align 4 + %8 = sext i32 %7 to i64 + %9 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %8, i64 48) + %10 = extractvalue { i64, i1 } %9, 1 + %11 = extractvalue { i64, i1 } %9, 0 + %12 = select i1 %10, i64 -1, i64 %11 + %call2 = call i8* @_Znam(i64 %12) #16 + %13 = bitcast i8* %call2 to %struct.pkmedian_arg_t* + store %struct.pkmedian_arg_t* %13, %struct.pkmedian_arg_t** %arg, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %14 = load i32, i32* %i, align 4 + %15 = load i32, i32* @_ZL5nproc, align 4 + %cmp = icmp slt 
i32 %14, %15 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %16 = load %struct.Points*, %struct.Points** %points.addr, align 8 + %17 = load %struct.pkmedian_arg_t*, %struct.pkmedian_arg_t** %arg, align 8 + %18 = load i32, i32* %i, align 4 + %idxprom = sext i32 %18 to i64 + %arrayidx = getelementptr inbounds %struct.pkmedian_arg_t, %struct.pkmedian_arg_t* %17, i64 %idxprom + %points3 = getelementptr inbounds %struct.pkmedian_arg_t, %struct.pkmedian_arg_t* %arrayidx, i32 0, i32 0 + store %struct.Points* %16, %struct.Points** %points3, align 8 + %19 = load i64, i64* %kmin.addr, align 8 + %20 = load %struct.pkmedian_arg_t*, %struct.pkmedian_arg_t** %arg, align 8 + %21 = load i32, i32* %i, align 4 + %idxprom4 = sext i32 %21 to i64 + %arrayidx5 = getelementptr inbounds %struct.pkmedian_arg_t, %struct.pkmedian_arg_t* %20, i64 %idxprom4 + %kmin6 = getelementptr inbounds %struct.pkmedian_arg_t, %struct.pkmedian_arg_t* %arrayidx5, i32 0, i32 1 + store i64 %19, i64* %kmin6, align 8 + %22 = load i64, i64* %kmax.addr, align 8 + %23 = load %struct.pkmedian_arg_t*, %struct.pkmedian_arg_t** %arg, align 8 + %24 = load i32, i32* %i, align 4 + %idxprom7 = sext i32 %24 to i64 + %arrayidx8 = getelementptr inbounds %struct.pkmedian_arg_t, %struct.pkmedian_arg_t* %23, i64 %idxprom7 + %kmax9 = getelementptr inbounds %struct.pkmedian_arg_t, %struct.pkmedian_arg_t* %arrayidx8, i32 0, i32 2 + store i64 %22, i64* %kmax9, align 8 + %25 = load i32, i32* %i, align 4 + %26 = load %struct.pkmedian_arg_t*, %struct.pkmedian_arg_t** %arg, align 8 + %27 = load i32, i32* %i, align 4 + %idxprom10 = sext i32 %27 to i64 + %arrayidx11 = getelementptr inbounds %struct.pkmedian_arg_t, %struct.pkmedian_arg_t* %26, i64 %idxprom10 + %pid = getelementptr inbounds %struct.pkmedian_arg_t, %struct.pkmedian_arg_t* %arrayidx11, i32 0, i32 4 + store i32 %25, i32* %pid, align 8 + %28 = load i64*, i64** %kfinal.addr, align 8 + %29 = load %struct.pkmedian_arg_t*, %struct.pkmedian_arg_t** 
%arg, align 8 + %30 = load i32, i32* %i, align 4 + %idxprom12 = sext i32 %30 to i64 + %arrayidx13 = getelementptr inbounds %struct.pkmedian_arg_t, %struct.pkmedian_arg_t* %29, i64 %idxprom12 + %kfinal14 = getelementptr inbounds %struct.pkmedian_arg_t, %struct.pkmedian_arg_t* %arrayidx13, i32 0, i32 3 + store i64* %28, i64** %kfinal14, align 8 + %31 = load %struct.pkmedian_arg_t*, %struct.pkmedian_arg_t** %arg, align 8 + %32 = load i32, i32* %i, align 4 + %idxprom15 = sext i32 %32 to i64 + %arrayidx16 = getelementptr inbounds %struct.pkmedian_arg_t, %struct.pkmedian_arg_t* %31, i64 %idxprom15 + %barrier17 = getelementptr inbounds %struct.pkmedian_arg_t, %struct.pkmedian_arg_t* %arrayidx16, i32 0, i32 5 + store %union.pthread_barrier_t* %barrier, %union.pthread_barrier_t** %barrier17, align 8 + %33 = load %struct.pkmedian_arg_t*, %struct.pkmedian_arg_t** %arg, align 8 + %arrayidx18 = getelementptr inbounds %struct.pkmedian_arg_t, %struct.pkmedian_arg_t* %33, i64 0 + %34 = bitcast %struct.pkmedian_arg_t* %arrayidx18 to i8* + %call19 = call i8* @_Z14localSearchSubPv(i8* %34) + br label %for.inc + +for.inc: ; preds = %for.body + %35 = load i32, i32* %i, align 4 + %inc = add nsw i32 %35, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + store i32 0, i32* %i20, align 4 + br label %for.cond21 + +for.cond21: ; preds = %for.inc24, %for.end + %36 = load i32, i32* %i20, align 4 + %37 = load i32, i32* @_ZL5nproc, align 4 + %cmp22 = icmp slt i32 %36, %37 + br i1 %cmp22, label %for.body23, label %for.end26 + +for.body23: ; preds = %for.cond21 + br label %for.inc24 + +for.inc24: ; preds = %for.body23 + %38 = load i32, i32* %i20, align 4 + %inc25 = add nsw i32 %38, 1 + store i32 %inc25, i32* %i20, align 4 + br label %for.cond21 + +for.end26: ; preds = %for.cond21 + %39 = load i64*, i64** %threads, align 8 + %isnull = icmp eq i64* %39, null + br i1 %isnull, label %delete.end, label %delete.notnull + +delete.notnull: ; preds = %for.end26 + 
%40 = bitcast i64* %39 to i8* + call void @_ZdaPv(i8* %40) #17 + br label %delete.end + +delete.end: ; preds = %delete.notnull, %for.end26 + %41 = load %struct.pkmedian_arg_t*, %struct.pkmedian_arg_t** %arg, align 8 + %isnull27 = icmp eq %struct.pkmedian_arg_t* %41, null + br i1 %isnull27, label %delete.end29, label %delete.notnull28 + +delete.notnull28: ; preds = %delete.end + %42 = bitcast %struct.pkmedian_arg_t* %41 to i8* + call void @_ZdaPv(i8* %42) #17 + br label %delete.end29 + +delete.end29: ; preds = %delete.notnull28, %delete.end + %call30 = call double @_Z7gettimev() + store double %call30, double* %t2, align 8 + %43 = load double, double* %t2, align 8 + %44 = load double, double* %t1, align 8 + %sub = fsub contract double %43, %44 + %45 = load double, double* @time_local_search, align 8 + %add = fadd contract double %45, %sub + store double %add, double* @time_local_search, align 8 + ret void +} + +; Function Attrs: nounwind readnone speculatable willreturn +declare { i64, i1 } @llvm.umul.with.overflow.i64(i64, i64) #8 + +; Function Attrs: nobuiltin +declare dso_local noalias i8* @_Znam(i64) #9 + +; Function Attrs: nobuiltin nounwind +declare dso_local void @_ZdaPv(i8*) #10 + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z12outcenterIDsP6PointsPlPc(%struct.Points* %centers, i64* %centerIDs, i8* %outfile) #3 { +entry: + %centers.addr = alloca %struct.Points*, align 8 + %centerIDs.addr = alloca i64*, align 8 + %outfile.addr = alloca i8*, align 8 + %fp = alloca %struct._IO_FILE*, align 8 + %is_a_median = alloca i32*, align 8 + %i = alloca i32, align 4 + %i6 = alloca i32, align 4 + %k = alloca i32, align 4 + store %struct.Points* %centers, %struct.Points** %centers.addr, align 8 + store i64* %centerIDs, i64** %centerIDs.addr, align 8 + store i8* %outfile, i8** %outfile.addr, align 8 + %0 = load i8*, i8** %outfile.addr, align 8 + %call = call %struct._IO_FILE* @fopen(i8* %0, i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.3, i64 
0, i64 0)) + store %struct._IO_FILE* %call, %struct._IO_FILE** %fp, align 8 + %1 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8 + %cmp = icmp eq %struct._IO_FILE* %1, null + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %2 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %3 = load i8*, i8** %outfile.addr, align 8 + %call1 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %2, i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str.5, i64 0, i64 0), i8* %3) + call void @exit(i32 1) #15 + unreachable + +if.end: ; preds = %entry + %4 = load %struct.Points*, %struct.Points** %centers.addr, align 8 + %num = getelementptr inbounds %struct.Points, %struct.Points* %4, i32 0, i32 0 + %5 = load i64, i64* %num, align 8 + %call2 = call noalias i8* @calloc(i64 4, i64 %5) #2 + %6 = bitcast i8* %call2 to i32* + store i32* %6, i32** %is_a_median, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %if.end + %7 = load i32, i32* %i, align 4 + %conv = sext i32 %7 to i64 + %8 = load %struct.Points*, %struct.Points** %centers.addr, align 8 + %num3 = getelementptr inbounds %struct.Points, %struct.Points* %8, i32 0, i32 0 + %9 = load i64, i64* %num3, align 8 + %cmp4 = icmp slt i64 %conv, %9 + br i1 %cmp4, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %10 = load i32*, i32** %is_a_median, align 8 + %11 = load %struct.Points*, %struct.Points** %centers.addr, align 8 + %p = getelementptr inbounds %struct.Points, %struct.Points* %11, i32 0, i32 2 + %12 = load %struct.Point*, %struct.Point** %p, align 8 + %13 = load i32, i32* %i, align 4 + %idxprom = sext i32 %13 to i64 + %arrayidx = getelementptr inbounds %struct.Point, %struct.Point* %12, i64 %idxprom + %assign = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx, i32 0, i32 2 + %14 = load i64, i64* %assign, align 8 + %arrayidx5 = getelementptr inbounds i32, i32* %10, i64 %14 + store i32 1, i32* 
%arrayidx5, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %15 = load i32, i32* %i, align 4 + %inc = add nsw i32 %15, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + store i32 0, i32* %i6, align 4 + br label %for.cond7 + +for.cond7: ; preds = %for.inc38, %for.end + %16 = load i32, i32* %i6, align 4 + %conv8 = sext i32 %16 to i64 + %17 = load %struct.Points*, %struct.Points** %centers.addr, align 8 + %num9 = getelementptr inbounds %struct.Points, %struct.Points* %17, i32 0, i32 0 + %18 = load i64, i64* %num9, align 8 + %cmp10 = icmp slt i64 %conv8, %18 + br i1 %cmp10, label %for.body11, label %for.end40 + +for.body11: ; preds = %for.cond7 + %19 = load i32*, i32** %is_a_median, align 8 + %20 = load i32, i32* %i6, align 4 + %idxprom12 = sext i32 %20 to i64 + %arrayidx13 = getelementptr inbounds i32, i32* %19, i64 %idxprom12 + %21 = load i32, i32* %arrayidx13, align 4 + %tobool = icmp ne i32 %21, 0 + br i1 %tobool, label %if.then14, label %if.end37 + +if.then14: ; preds = %for.body11 + %22 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8 + %23 = load i64*, i64** %centerIDs.addr, align 8 + %24 = load i32, i32* %i6, align 4 + %idxprom15 = sext i32 %24 to i64 + %arrayidx16 = getelementptr inbounds i64, i64* %23, i64 %idxprom15 + %25 = load i64, i64* %arrayidx16, align 8 + %call17 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %22, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str.6, i64 0, i64 0), i64 %25) + %26 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8 + %27 = load %struct.Points*, %struct.Points** %centers.addr, align 8 + %p18 = getelementptr inbounds %struct.Points, %struct.Points* %27, i32 0, i32 2 + %28 = load %struct.Point*, %struct.Point** %p18, align 8 + %29 = load i32, i32* %i6, align 4 + %idxprom19 = sext i32 %29 to i64 + %arrayidx20 = getelementptr inbounds %struct.Point, %struct.Point* %28, i64 %idxprom19 + %weight = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx20, i32 0, i32 0 + %30 = load float, float* %weight, align 8 + %conv21 = fpext float %30 to double + %call22 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %26, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str.7, i64 0, i64 0), double %conv21) + store i32 0, i32* %k, align 4 + br label %for.cond23 + +for.cond23: ; preds = %for.inc33, %if.then14 + %31 = load i32, i32* %k, align 4 + %32 = load %struct.Points*, %struct.Points** %centers.addr, align 8 + %dim = getelementptr inbounds %struct.Points, %struct.Points* %32, i32 0, i32 1 + %33 = load i32, i32* %dim, align 8 + %cmp24 = icmp slt i32 %31, %33 + br i1 %cmp24, label %for.body25, label %for.end35 + +for.body25: ; preds = %for.cond23 + %34 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8 + %35 = load %struct.Points*, %struct.Points** %centers.addr, align 8 + %p26 = getelementptr inbounds %struct.Points, %struct.Points* %35, i32 0, i32 2 + %36 = load %struct.Point*, %struct.Point** %p26, align 8 + %37 = load i32, i32* %i6, align 4 + %idxprom27 = sext i32 %37 to i64 + %arrayidx28 = getelementptr inbounds %struct.Point, %struct.Point* %36, i64 %idxprom27 + %coord = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx28, i32 0, i32 1 + %38 = load float*, float** %coord, align 8 + %39 = load i32, i32* %k, align 4 + %idxprom29 = sext i32 %39 to i64 + %arrayidx30 = 
getelementptr inbounds float, float* %38, i64 %idxprom29 + %40 = load float, float* %arrayidx30, align 4 + %conv31 = fpext float %40 to double + %call32 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %34, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str.8, i64 0, i64 0), double %conv31) + br label %for.inc33 + +for.inc33: ; preds = %for.body25 + %41 = load i32, i32* %k, align 4 + %inc34 = add nsw i32 %41, 1 + store i32 %inc34, i32* %k, align 4 + br label %for.cond23 + +for.end35: ; preds = %for.cond23 + %42 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8 + %call36 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %42, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.9, i64 0, i64 0)) + br label %if.end37 + +if.end37: ; preds = %for.end35, %for.body11 + br label %for.inc38 + +for.inc38: ; preds = %if.end37 + %43 = load i32, i32* %i6, align 4 + %inc39 = add nsw i32 %43, 1 + store i32 %inc39, i32* %i6, align 4 + br label %for.cond7 + +for.end40: ; preds = %for.cond7 + %44 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8 + %call41 = call i32 @fclose(%struct._IO_FILE* %44) + ret void +} + +; Function Attrs: noinline optnone uwtable +define dso_local void @_Z13streamClusterP7PStreamllillPc(%class.PStream* %stream, i64 %kmin, i64 %kmax, i32 %dim, i64 %chunksize, i64 %centersize, i8* %outfile) #3 { +entry: + %stream.addr = alloca %class.PStream*, align 8 + %kmin.addr = alloca i64, align 8 + %kmax.addr = alloca i64, align 8 + %dim.addr = alloca i32, align 4 + %chunksize.addr = alloca i64, align 8 + %centersize.addr = alloca i64, align 8 + %outfile.addr = alloca i8*, align 8 + %block = alloca float*, align 8 + %centerBlock = alloca float*, align 8 + %centerIDs = alloca i64*, align 8 + %points = alloca %struct.Points, align 8 + %i = alloca i32, align 4 + %centers = alloca %struct.Points, align 8 + %i25 = alloca i32, align 4 + %IDoffset = alloca i64, align 8 + %kfinal = alloca i64, align 8 + %numRead = alloca 
i64, align 8 + %i60 = alloca i32, align 4 + store %class.PStream* %stream, %class.PStream** %stream.addr, align 8 + store i64 %kmin, i64* %kmin.addr, align 8 + store i64 %kmax, i64* %kmax.addr, align 8 + store i32 %dim, i32* %dim.addr, align 4 + store i64 %chunksize, i64* %chunksize.addr, align 8 + store i64 %centersize, i64* %centersize.addr, align 8 + store i8* %outfile, i8** %outfile.addr, align 8 + %0 = load i64, i64* %chunksize.addr, align 8 + %1 = load i32, i32* %dim.addr, align 4 + %conv = sext i32 %1 to i64 + %mul = mul nsw i64 %0, %conv + %mul1 = mul i64 %mul, 4 + %call = call noalias i8* @malloc(i64 %mul1) #2 + %2 = bitcast i8* %call to float* + store float* %2, float** %block, align 8 + %3 = load i64, i64* %centersize.addr, align 8 + %4 = load i32, i32* %dim.addr, align 4 + %conv2 = sext i32 %4 to i64 + %mul3 = mul nsw i64 %3, %conv2 + %mul4 = mul i64 %mul3, 4 + %call5 = call noalias i8* @malloc(i64 %mul4) #2 + %5 = bitcast i8* %call5 to float* + store float* %5, float** %centerBlock, align 8 + %6 = load i64, i64* %centersize.addr, align 8 + %7 = load i32, i32* %dim.addr, align 4 + %conv6 = sext i32 %7 to i64 + %mul7 = mul nsw i64 %6, %conv6 + %mul8 = mul i64 %mul7, 8 + %call9 = call noalias i8* @malloc(i64 %mul8) #2 + %8 = bitcast i8* %call9 to i64* + store i64* %8, i64** %centerIDs, align 8 + %9 = load float*, float** %block, align 8 + %cmp = icmp eq float* %9, null + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %10 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %call10 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %10, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @.str.10, i64 0, i64 0)) + call void @exit(i32 1) #15 + unreachable + +if.end: ; preds = %entry + %11 = load i32, i32* %dim.addr, align 4 + %dim11 = getelementptr inbounds %struct.Points, %struct.Points* %points, i32 0, i32 1 + store i32 %11, i32* %dim11, align 8 + %12 = load i64, i64* %chunksize.addr, align 8 + %num = getelementptr inbounds %struct.Points, %struct.Points* %points, i32 0, i32 0 + store i64 %12, i64* %num, align 8 + %13 = load i64, i64* %chunksize.addr, align 8 + %mul12 = mul i64 %13, 32 + %call13 = call noalias i8* @malloc(i64 %mul12) #2 + %14 = bitcast i8* %call13 to %struct.Point* + %p = getelementptr inbounds %struct.Points, %struct.Points* %points, i32 0, i32 2 + store %struct.Point* %14, %struct.Point** %p, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %if.end + %15 = load i32, i32* %i, align 4 + %conv14 = sext i32 %15 to i64 + %16 = load i64, i64* %chunksize.addr, align 8 + %cmp15 = icmp slt i64 %conv14, %16 + br i1 %cmp15, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %17 = load float*, float** %block, align 8 + %18 = load i32, i32* %i, align 4 + %19 = load i32, i32* %dim.addr, align 4 + %mul16 = mul nsw i32 %18, %19 + %idxprom = sext i32 %mul16 to i64 + %arrayidx = getelementptr inbounds float, float* %17, i64 %idxprom + %p17 = getelementptr inbounds %struct.Points, %struct.Points* %points, i32 0, i32 2 + %20 = load %struct.Point*, %struct.Point** %p17, align 8 + %21 = load i32, i32* %i, align 4 + %idxprom18 = sext i32 %21 to i64 + %arrayidx19 = getelementptr inbounds %struct.Point, %struct.Point* %20, i64 %idxprom18 + %coord = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx19, i32 0, i32 1 + store float* %arrayidx, float** %coord, align 8 + br label %for.inc + +for.inc: ; preds = %for.body + %22 = load i32, i32* %i, align 4 + %inc = add nsw i32 %22, 1 + store i32 %inc, i32* 
%i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %23 = load i32, i32* %dim.addr, align 4 + %dim20 = getelementptr inbounds %struct.Points, %struct.Points* %centers, i32 0, i32 1 + store i32 %23, i32* %dim20, align 8 + %24 = load i64, i64* %centersize.addr, align 8 + %mul21 = mul i64 %24, 32 + %call22 = call noalias i8* @malloc(i64 %mul21) #2 + %25 = bitcast i8* %call22 to %struct.Point* + %p23 = getelementptr inbounds %struct.Points, %struct.Points* %centers, i32 0, i32 2 + store %struct.Point* %25, %struct.Point** %p23, align 8 + %num24 = getelementptr inbounds %struct.Points, %struct.Points* %centers, i32 0, i32 0 + store i64 0, i64* %num24, align 8 + store i32 0, i32* %i25, align 4 + br label %for.cond26 + +for.cond26: ; preds = %for.inc40, %for.end + %26 = load i32, i32* %i25, align 4 + %conv27 = sext i32 %26 to i64 + %27 = load i64, i64* %centersize.addr, align 8 + %cmp28 = icmp slt i64 %conv27, %27 + br i1 %cmp28, label %for.body29, label %for.end42 + +for.body29: ; preds = %for.cond26 + %28 = load float*, float** %centerBlock, align 8 + %29 = load i32, i32* %i25, align 4 + %30 = load i32, i32* %dim.addr, align 4 + %mul30 = mul nsw i32 %29, %30 + %idxprom31 = sext i32 %mul30 to i64 + %arrayidx32 = getelementptr inbounds float, float* %28, i64 %idxprom31 + %p33 = getelementptr inbounds %struct.Points, %struct.Points* %centers, i32 0, i32 2 + %31 = load %struct.Point*, %struct.Point** %p33, align 8 + %32 = load i32, i32* %i25, align 4 + %idxprom34 = sext i32 %32 to i64 + %arrayidx35 = getelementptr inbounds %struct.Point, %struct.Point* %31, i64 %idxprom34 + %coord36 = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx35, i32 0, i32 1 + store float* %arrayidx32, float** %coord36, align 8 + %p37 = getelementptr inbounds %struct.Points, %struct.Points* %centers, i32 0, i32 2 + %33 = load %struct.Point*, %struct.Point** %p37, align 8 + %34 = load i32, i32* %i25, align 4 + %idxprom38 = sext i32 %34 to i64 + %arrayidx39 = getelementptr 
inbounds %struct.Point, %struct.Point* %33, i64 %idxprom38 + %weight = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx39, i32 0, i32 0 + store float 1.000000e+00, float* %weight, align 8 + br label %for.inc40 + +for.inc40: ; preds = %for.body29 + %35 = load i32, i32* %i25, align 4 + %inc41 = add nsw i32 %35, 1 + store i32 %inc41, i32* %i25, align 4 + br label %for.cond26 + +for.end42: ; preds = %for.cond26 + store i64 0, i64* %IDoffset, align 8 + br label %while.body + +while.body: ; preds = %for.end42, %if.end94 + %36 = load %class.PStream*, %class.PStream** %stream.addr, align 8 + %37 = load float*, float** %block, align 8 + %38 = load i32, i32* %dim.addr, align 4 + %39 = load i64, i64* %chunksize.addr, align 8 + %conv43 = trunc i64 %39 to i32 + %40 = bitcast %class.PStream* %36 to i64 (%class.PStream*, float*, i32, i32)*** + %vtable = load i64 (%class.PStream*, float*, i32, i32)**, i64 (%class.PStream*, float*, i32, i32)*** %40, align 8 + %vfn = getelementptr inbounds i64 (%class.PStream*, float*, i32, i32)*, i64 (%class.PStream*, float*, i32, i32)** %vtable, i64 0 + %41 = load i64 (%class.PStream*, float*, i32, i32)*, i64 (%class.PStream*, float*, i32, i32)** %vfn, align 8 + %call44 = call i64 %41(%class.PStream* %36, float* %37, i32 %38, i32 %conv43) + store i64 %call44, i64* %numRead, align 8 + %42 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %43 = load i64, i64* %numRead, align 8 + %call45 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %42, i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.11, i64 0, i64 0), i64 %43) + %44 = load %class.PStream*, %class.PStream** %stream.addr, align 8 + %45 = bitcast %class.PStream* %44 to i32 (%class.PStream*)*** + %vtable46 = load i32 (%class.PStream*)**, i32 (%class.PStream*)*** %45, align 8 + %vfn47 = getelementptr inbounds i32 (%class.PStream*)*, i32 (%class.PStream*)** %vtable46, i64 1 + %46 = load i32 (%class.PStream*)*, i32 (%class.PStream*)** %vfn47, align 8 + %call48 = call i32 %46(%class.PStream* %44) + %tobool = icmp ne i32 %call48, 0 + br i1 %tobool, label %if.then56, label %lor.lhs.false + +lor.lhs.false: ; preds = %while.body + %47 = load i64, i64* %numRead, align 8 + %48 = load i64, i64* %chunksize.addr, align 8 + %conv49 = trunc i64 %48 to i32 + %conv50 = zext i32 %conv49 to i64 + %cmp51 = icmp ult i64 %47, %conv50 + br i1 %cmp51, label %land.lhs.true, label %if.end58 + +land.lhs.true: ; preds = %lor.lhs.false + %49 = load %class.PStream*, %class.PStream** %stream.addr, align 8 + %50 = bitcast %class.PStream* %49 to i32 (%class.PStream*)*** + %vtable52 = load i32 (%class.PStream*)**, i32 (%class.PStream*)*** %50, align 8 + %vfn53 = getelementptr inbounds i32 (%class.PStream*)*, i32 (%class.PStream*)** %vtable52, i64 2 + %51 = load i32 (%class.PStream*)*, i32 (%class.PStream*)** %vfn53, align 8 + %call54 = call i32 %51(%class.PStream* %49) + %tobool55 = icmp ne i32 %call54, 0 + br i1 %tobool55, label %if.end58, label %if.then56 + +if.then56: ; preds = %land.lhs.true, %while.body + %52 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %call57 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %52, i8* getelementptr inbounds ([21 x i8], [21 x i8]* @.str.12, i64 0, i64 0)) + call void @exit(i32 1) #15 + unreachable + +if.end58: ; preds = %land.lhs.true, %lor.lhs.false + %53 = load i64, i64* %numRead, align 8 + %num59 = getelementptr inbounds %struct.Points, %struct.Points* %points, i32 0, i32 0 + store i64 %53, i64* %num59, align 8 + store i32 0, i32* %i60, align 4 + br label %for.cond61 + +for.cond61: ; preds = %for.inc70, %if.end58 + %54 = load i32, i32* %i60, align 4 + %conv62 = sext i32 %54 to i64 + %num63 = getelementptr inbounds %struct.Points, %struct.Points* %points, i32 0, i32 0 + %55 = load i64, i64* %num63, align 8 + %cmp64 = icmp slt i64 %conv62, %55 + br i1 %cmp64, label %for.body65, label %for.end72 + +for.body65: ; preds = %for.cond61 + %p66 = getelementptr inbounds %struct.Points, %struct.Points* %points, i32 0, i32 2 + %56 = load %struct.Point*, %struct.Point** %p66, align 8 + %57 = load i32, i32* %i60, align 4 + %idxprom67 = sext i32 %57 to i64 + %arrayidx68 = getelementptr inbounds %struct.Point, %struct.Point* %56, i64 %idxprom67 + %weight69 = getelementptr inbounds %struct.Point, %struct.Point* %arrayidx68, i32 0, i32 0 + store float 1.000000e+00, float* %weight69, align 8 + br label %for.inc70 + +for.inc70: ; preds = %for.body65 + %58 = load i32, i32* %i60, align 4 + %inc71 = add nsw i32 %58, 1 + store i32 %inc71, i32* %i60, align 4 + br label %for.cond61 + +for.end72: ; preds = %for.cond61 + %num73 = getelementptr inbounds %struct.Points, %struct.Points* %points, i32 0, i32 0 + %59 = load i64, i64* %num73, align 8 + %mul74 = mul i64 %59, 1 + %call75 = call noalias i8* @malloc(i64 %mul74) #2 + store i8* %call75, i8** @_ZL17switch_membership, align 8 + %num76 = getelementptr inbounds %struct.Points, %struct.Points* %points, i32 0, i32 0 + %60 = load i64, i64* %num76, align 8 + %call77 = call noalias i8* @calloc(i64 %60, i64 1) #2 + store i8* %call77, i8** @_ZL9is_center, align 8 + %num78 = getelementptr 
inbounds %struct.Points, %struct.Points* %points, i32 0, i32 0 + %61 = load i64, i64* %num78, align 8 + %mul79 = mul i64 %61, 4 + %call80 = call noalias i8* @malloc(i64 %mul79) #2 + %62 = bitcast i8* %call80 to i32* + store i32* %62, i32** @_ZL12center_table, align 8 + %63 = load i64, i64* %kmin.addr, align 8 + %64 = load i64, i64* %kmax.addr, align 8 + call void @_Z11localSearchP6PointsllPl(%struct.Points* %points, i64 %63, i64 %64, i64* %kfinal) + %65 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %call81 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %65, i8* getelementptr inbounds ([21 x i8], [21 x i8]* @.str.13, i64 0, i64 0)) + %call82 = call i32 @_Z11contcentersP6Points(%struct.Points* %points) + store i8 1, i8* @isCoordChanged, align 1 + %66 = load i64, i64* %kfinal, align 8 + %num83 = getelementptr inbounds %struct.Points, %struct.Points* %centers, i32 0, i32 0 + %67 = load i64, i64* %num83, align 8 + %add = add nsw i64 %66, %67 + %68 = load i64, i64* %centersize.addr, align 8 + %cmp84 = icmp sgt i64 %add, %68 + br i1 %cmp84, label %if.then85, label %if.end87 + +if.then85: ; preds = %for.end72 + %69 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %call86 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %69, i8* getelementptr inbounds ([33 x i8], [33 x i8]* @.str.14, i64 0, i64 0)) + call void @exit(i32 1) #15 + unreachable + +if.end87: ; preds = %for.end72 + %70 = load i64*, i64** %centerIDs, align 8 + %71 = load i64, i64* %IDoffset, align 8 + call void @_Z11copycentersP6PointsS0_Pll(%struct.Points* %points, %struct.Points* %centers, i64* %70, i64 %71) + %72 = load i64, i64* %numRead, align 8 + %73 = load i64, i64* %IDoffset, align 8 + %add88 = add i64 %73, %72 + store i64 %add88, i64* %IDoffset, align 8 + %74 = load i8*, i8** @_ZL9is_center, align 8 + call void @free(i8* %74) #2 + %75 = load i8*, i8** @_ZL17switch_membership, align 8 + call void @free(i8* %75) #2 + %76 = load i32*, i32** @_ZL12center_table, align 8 + %77 = bitcast i32* %76 to i8* + call void @free(i8* %77) #2 + %78 = load %class.PStream*, %class.PStream** %stream.addr, align 8 + %79 = bitcast %class.PStream* %78 to i32 (%class.PStream*)*** + %vtable89 = load i32 (%class.PStream*)**, i32 (%class.PStream*)*** %79, align 8 + %vfn90 = getelementptr inbounds i32 (%class.PStream*)*, i32 (%class.PStream*)** %vtable89, i64 2 + %80 = load i32 (%class.PStream*)*, i32 (%class.PStream*)** %vfn90, align 8 + %call91 = call i32 %80(%class.PStream* %78) + %tobool92 = icmp ne i32 %call91, 0 + br i1 %tobool92, label %if.then93, label %if.end94 + +if.then93: ; preds = %if.end87 + br label %while.end + +if.end94: ; preds = %if.end87 + br label %while.body + +while.end: ; preds = %if.then93 + %num95 = getelementptr inbounds %struct.Points, %struct.Points* %centers, i32 0, i32 0 + %81 = load i64, i64* %num95, align 8 + %mul96 = mul i64 %81, 1 + %call97 = call noalias i8* @malloc(i64 %mul96) #2 + store i8* %call97, i8** @_ZL17switch_membership, align 8 + %num98 = getelementptr inbounds %struct.Points, %struct.Points* %centers, i32 0, i32 0 + %82 = load i64, i64* %num98, align 8 + %call99 = call noalias i8* @calloc(i64 %82, i64 1) #2 + store i8* %call99, i8** @_ZL9is_center, align 8 + %num100 
= getelementptr inbounds %struct.Points, %struct.Points* %centers, i32 0, i32 0 + %83 = load i64, i64* %num100, align 8 + %mul101 = mul i64 %83, 4 + %call102 = call noalias i8* @malloc(i64 %mul101) #2 + %84 = bitcast i8* %call102 to i32* + store i32* %84, i32** @_ZL12center_table, align 8 + %85 = load i64, i64* %kmin.addr, align 8 + %86 = load i64, i64* %kmax.addr, align 8 + call void @_Z11localSearchP6PointsllPl(%struct.Points* %centers, i64 %85, i64 %86, i64* %kfinal) + %call103 = call i32 @_Z11contcentersP6Points(%struct.Points* %centers) + %87 = load i64*, i64** %centerIDs, align 8 + %88 = load i8*, i8** %outfile.addr, align 8 + call void @_Z12outcenterIDsP6PointsPlPc(%struct.Points* %centers, i64* %87, i8* %88) + ret void +} + +; Function Attrs: noinline norecurse optnone uwtable +define dso_local i32 @main(i32 %argc, i8** %argv) #11 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %retval = alloca i32, align 4 + %argc.addr = alloca i32, align 4 + %argv.addr = alloca i8**, align 8 + %outfilename = alloca i8*, align 8 + %infilename = alloca i8*, align 8 + %kmin = alloca i64, align 8 + %kmax = alloca i64, align 8 + %n = alloca i64, align 8 + %chunksize = alloca i64, align 8 + %clustersize = alloca i64, align 8 + %dim = alloca i32, align 4 + %stream = alloca %class.PStream*, align 8 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + %t1 = alloca double, align 8 + %t2 = alloca double, align 8 + store i32 0, i32* %retval, align 4 + store i32 %argc, i32* %argc.addr, align 4 + store i8** %argv, i8*** %argv.addr, align 8 + %call = call i32 @cudaSetDevice(i32 0) + %call1 = call i8* @_Znam(i64 1024) #16 + store i8* %call1, i8** %outfilename, align 8 + %call2 = call i8* @_Znam(i64 1024) #16 + store i8* %call2, i8** %infilename, align 8 + %call3 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.15, i64 0, i64 0)) + %call4 = call i32 @fflush(%struct._IO_FILE* null) + %0 = load i32, i32* %argc.addr, align 4 + %cmp = icmp slt i32 %0, 10 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %1 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %2 = load i8**, i8*** %argv.addr, align 8 + %arrayidx = getelementptr inbounds i8*, i8** %2, i64 0 + %3 = load i8*, i8** %arrayidx, align 8 + %call5 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @.str.16, i64 0, i64 0), i8* %3) + %4 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %call6 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %4, i8* getelementptr inbounds ([47 x i8], [47 x i8]* @.str.17, i64 0, i64 0)) + %5 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %call7 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %5, i8* getelementptr inbounds ([47 x i8], [47 x i8]* @.str.18, i64 0, i64 0)) + %6 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %call8 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %6, i8* getelementptr inbounds ([45 x i8], [45 x i8]* @.str.19, i64 0, i64 0)) + %7 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %call9 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %7, i8* getelementptr inbounds ([38 x i8], [38 x i8]* @.str.20, i64 0, i64 0)) + %8 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %call10 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %8, i8* getelementptr inbounds ([57 x i8], [57 x i8]* @.str.21, i64 0, i64 0)) + %9 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %call11 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %9, i8* getelementptr inbounds ([55 x i8], [55 x i8]* @.str.22, i64 0, i64 0)) + %10 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %call12 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %10, i8* getelementptr inbounds ([37 x i8], [37 x i8]* @.str.23, i64 0, i64 0)) + %11 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %call13 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %11, i8* getelementptr inbounds ([28 x i8], [28 x i8]* @.str.24, i64 0, i64 0)) + %12 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %call14 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %12, i8* getelementptr inbounds ([41 x i8], [41 x i8]* @.str.25, i64 0, i64 0)) + %13 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %call15 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %13, i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.26, i64 0, i64 0)) + %14 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %call16 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %14, i8* getelementptr inbounds ([77 x i8], [77 x i8]* @.str.27, i64 0, i64 0)) + call void @exit(i32 1) #15 + unreachable + +if.end: ; preds = %entry + %15 = load i8**, i8*** %argv.addr, align 8 + %arrayidx17 = getelementptr inbounds i8*, i8** %15, i64 1 + %16 = load i8*, i8** %arrayidx17, align 8 + %call18 = call i32 @atoi(i8* %16) #18 + %conv = sext i32 %call18 to i64 + store i64 %conv, i64* %kmin, align 8 + %17 = load i8**, i8*** %argv.addr, align 8 + %arrayidx19 = getelementptr inbounds i8*, i8** %17, i64 2 + %18 = load i8*, i8** %arrayidx19, align 8 + %call20 = call i32 @atoi(i8* %18) #18 + %conv21 = sext i32 %call20 to i64 + store i64 %conv21, i64* %kmax, align 8 + %19 = load i8**, i8*** %argv.addr, align 8 + %arrayidx22 = getelementptr inbounds i8*, i8** %19, i64 3 + %20 = load i8*, i8** %arrayidx22, align 8 + %call23 = call i32 @atoi(i8* %20) #18 + store i32 %call23, i32* %dim, align 4 + %21 = load i8**, i8*** %argv.addr, align 8 + %arrayidx24 = getelementptr inbounds i8*, i8** %21, i64 4 + %22 = load i8*, i8** %arrayidx24, align 8 + %call25 = call i32 @atoi(i8* %22) #18 + %conv26 = sext i32 %call25 to i64 + store i64 %conv26, i64* %n, align 8 + %23 = load i8**, i8*** %argv.addr, align 8 + %arrayidx27 = getelementptr inbounds i8*, i8** %23, i64 5 + %24 = load i8*, i8** %arrayidx27, align 8 + %call28 = call i32 @atoi(i8* %24) #18 + %conv29 = sext i32 %call28 to i64 + store i64 %conv29, i64* %chunksize, align 8 + %25 = load i8**, i8*** %argv.addr, align 8 + %arrayidx30 = getelementptr inbounds i8*, i8** %25, i64 6 + %26 = load i8*, i8** %arrayidx30, align 8 + %call31 = call i32 @atoi(i8* %26) #18 + %conv32 = sext i32 %call31 to i64 + store i64 %conv32, i64* %clustersize, align 8 + %27 = load i8*, i8** %infilename, align 8 + %28 = load i8**, i8*** %argv.addr, align 8 + %arrayidx33 = getelementptr inbounds i8*, i8** %28, i64 7 + %29 = load i8*, i8** %arrayidx33, align 8 + %call34 = call i8* @strcpy(i8* %27, i8* %29) + %30 = load i8*, 
i8** %outfilename, align 8 + %31 = load i8**, i8*** %argv.addr, align 8 + %arrayidx35 = getelementptr inbounds i8*, i8** %31, i64 8 + %32 = load i8*, i8** %arrayidx35, align 8 + %call36 = call i8* @strcpy(i8* %30, i8* %32) + %33 = load i8**, i8*** %argv.addr, align 8 + %arrayidx37 = getelementptr inbounds i8*, i8** %33, i64 9 + %34 = load i8*, i8** %arrayidx37, align 8 + %call38 = call i32 @atoi(i8* %34) #18 + store i32 %call38, i32* @_ZL5nproc, align 4 + call void @srand48(i64 1) #2 + %35 = load i64, i64* %n, align 8 + %cmp39 = icmp sgt i64 %35, 0 + br i1 %cmp39, label %if.then40, label %if.else + +if.then40: ; preds = %if.end + %call41 = call i8* @_Znwm(i64 16) #16 + %36 = bitcast i8* %call41 to %class.SimStream* + %37 = load i64, i64* %n, align 8 + invoke void @_ZN9SimStreamC2El(%class.SimStream* %36, i64 %37) + to label %invoke.cont unwind label %lpad + +invoke.cont: ; preds = %if.then40 + %38 = bitcast %class.SimStream* %36 to %class.PStream* + store %class.PStream* %38, %class.PStream** %stream, align 8 + br label %if.end45 + +lpad: ; preds = %if.then40 + %39 = landingpad { i8*, i32 } + cleanup + %40 = extractvalue { i8*, i32 } %39, 0 + store i8* %40, i8** %exn.slot, align 8 + %41 = extractvalue { i8*, i32 } %39, 1 + store i32 %41, i32* %ehselector.slot, align 4 + call void @_ZdlPv(i8* %call41) #17 + br label %eh.resume + +if.else: ; preds = %if.end + %call42 = call i8* @_Znwm(i64 16) #16 + %42 = bitcast i8* %call42 to %class.FileStream* + %43 = load i8*, i8** %infilename, align 8 + invoke void @_ZN10FileStreamC2EPc(%class.FileStream* %42, i8* %43) + to label %invoke.cont44 unwind label %lpad43 + +invoke.cont44: ; preds = %if.else + %44 = bitcast %class.FileStream* %42 to %class.PStream* + store %class.PStream* %44, %class.PStream** %stream, align 8 + br label %if.end45 + +lpad43: ; preds = %if.else + %45 = landingpad { i8*, i32 } + cleanup + %46 = extractvalue { i8*, i32 } %45, 0 + store i8* %46, i8** %exn.slot, align 8 + %47 = extractvalue { i8*, i32 } %45, 
1 + store i32 %47, i32* %ehselector.slot, align 4 + call void @_ZdlPv(i8* %call42) #17 + br label %eh.resume + +if.end45: ; preds = %invoke.cont44, %invoke.cont + %call46 = call double @_Z7gettimev() + store double %call46, double* %t1, align 8 + store double 0.000000e+00, double* @serial_t, align 8 + store double 0.000000e+00, double* @cpu_to_gpu_t, align 8 + store double 0.000000e+00, double* @gpu_to_cpu_t, align 8 + store double 0.000000e+00, double* @alloc_t, align 8 + store double 0.000000e+00, double* @free_t, align 8 + store double 0.000000e+00, double* @kernel_t, align 8 + store i8 0, i8* @isCoordChanged, align 1 + %48 = load %class.PStream*, %class.PStream** %stream, align 8 + %49 = load i64, i64* %kmin, align 8 + %50 = load i64, i64* %kmax, align 8 + %51 = load i32, i32* %dim, align 4 + %52 = load i64, i64* %chunksize, align 8 + %53 = load i64, i64* %clustersize, align 8 + %54 = load i8*, i8** %outfilename, align 8 + call void @_Z13streamClusterP7PStreamllillPc(%class.PStream* %48, i64 %49, i64 %50, i32 %51, i64 %52, i64 %53, i8* %54) + call void @_Z10freeDevMemv() + call void @_Z11freeHostMemv() + %call47 = call double @_Z7gettimev() + store double %call47, double* %t2, align 8 + %55 = load double, double* %t2, align 8 + %56 = load double, double* %t1, align 8 + %sub = fsub contract double %55, %56 + %call48 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str.28, i64 0, i64 0), double %sub) + %57 = load %class.PStream*, %class.PStream** %stream, align 8 + %isnull = icmp eq %class.PStream* %57, null + br i1 %isnull, label %delete.end, label %delete.notnull + +delete.notnull: ; preds = %if.end45 + %58 = bitcast %class.PStream* %57 to void (%class.PStream*)*** + %vtable = load void (%class.PStream*)**, void (%class.PStream*)*** %58, align 8 + %vfn = getelementptr inbounds void (%class.PStream*)*, void (%class.PStream*)** %vtable, i64 4 + %59 = load void (%class.PStream*)*, void (%class.PStream*)** %vfn, align 8 + call void %59(%class.PStream* %57) + br label %delete.end + +delete.end: ; preds = %delete.notnull, %if.end45 + %60 = load double, double* @time_gain, align 8 + %call49 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([19 x i8], [19 x i8]* @.str.29, i64 0, i64 0), double %60) + %61 = load double, double* @time_gain_dist, align 8 + %call50 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.30, i64 0, i64 0), double %61) + %62 = load double, double* @time_gain_init, align 8 + %call51 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.31, i64 0, i64 0), double %62) + %63 = load double, double* @time_select_feasible, align 8 + %call52 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([21 x i8], [21 x i8]* @.str.32, i64 0, i64 0), double %63) + %64 = load double, double* @time_speedy, align 8 + %call53 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([21 x i8], [21 x i8]* @.str.33, i64 0, i64 0), double %64) + %65 = load double, double* @time_shuffle, align 8 + %call54 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.34, i64 0, i64 0), double %65) + %66 = load double, double* @time_local_search, align 8 + %call55 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.35, i64 0, i64 0), double %66) + %call56 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.9, i64 0, i64 0)) + %call57 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([34 x i8], [34 x i8]* @.str.36, i64 0, i64 0)) + %67 = load double, double* @serial_t, align 8 + %div = fdiv double %67, 1.000000e+03 + %call58 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([20 x i8], [20 x i8]* @.str.37, i64 0, i64 0), double %div) + %68 = load double, double* @cpu_to_gpu_t, align 8 + %div59 = fdiv double %68, 1.000000e+03 + %call60 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.str.38, i64 0, i64 0), double %div59) + %69 = load double, double* @gpu_to_cpu_t, align 8 + %div61 = fdiv double %69, 1.000000e+03 + %call62 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([41 x i8], [41 x i8]* @.str.39, i64 0, i64 0), double %div61) + %70 = load double, double* @alloc_t, align 8 + %div63 = fdiv double %70, 1.000000e+03 + %call64 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.40, i64 0, i64 0), double %div63) + %71 = load double, double* @free_t, align 8 + %div65 = fdiv double %71, 1.000000e+03 + %call66 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.41, i64 0, i64 0), double %div65) + %72 = load double, double* @kernel_t, align 8 + %div67 = fdiv double %72, 1.000000e+03 + %call68 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([20 x i8], [20 x i8]* @.str.42, i64 0, i64 0), double %div67) + ret i32 0 + +eh.resume: ; preds = %lpad43, %lpad + %exn = load i8*, i8** %exn.slot, align 8 + %sel = load i32, i32* %ehselector.slot, align 4 + %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn, 0 + %lpad.val69 = insertvalue { i8*, i32 } %lpad.val, i32 %sel, 1 + resume { i8*, i32 } %lpad.val69 +} + +declare dso_local i32 @cudaSetDevice(i32) #1 + +declare dso_local i32 @fflush(%struct._IO_FILE*) #1 + +; Function Attrs: nounwind readonly +declare dso_local i32 @atoi(i8*) #12 + +declare dso_local i8* @strcpy(i8*, i8*) #1 + +; Function Attrs: nounwind +declare dso_local void @srand48(i64) #7 + +; Function Attrs: nobuiltin +declare dso_local noalias i8* @_Znwm(i64) #9 + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN9SimStreamC2El(%class.SimStream* %this, i64 %n_) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %class.SimStream*, align 8 + %n_.addr = alloca i64, align 8 + store %class.SimStream* %this, %class.SimStream** %this.addr, align 8 + store i64 %n_, i64* %n_.addr, align 8 + %this1 = load %class.SimStream*, %class.SimStream** %this.addr, align 8 + %0 = bitcast %class.SimStream* %this1 to %class.PStream* + call void @_ZN7PStreamC2Ev(%class.PStream* %0) #2 + %1 = bitcast %class.SimStream* %this1 to i32 (...)*** + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [7 x i8*] }, { [7 x i8*] }* @_ZTV9SimStream, i32 0, inrange i32 0, i32 2) to i32 (...)**), i32 (...)*** %1, align 8 + %2 = load i64, i64* %n_.addr, align 8 + %n = getelementptr inbounds %class.SimStream, %class.SimStream* %this1, i32 0, i32 1 + store i64 %2, i64* %n, align 8 + ret void +} + +declare dso_local i32 @__gxx_personality_v0(...) 
+ +; Function Attrs: nobuiltin nounwind +declare dso_local void @_ZdlPv(i8*) #10 + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZN10FileStreamC2EPc(%class.FileStream* %this, i8* %filename) unnamed_addr #3 comdat align 2 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %this.addr = alloca %class.FileStream*, align 8 + %filename.addr = alloca i8*, align 8 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + store %class.FileStream* %this, %class.FileStream** %this.addr, align 8 + store i8* %filename, i8** %filename.addr, align 8 + %this1 = load %class.FileStream*, %class.FileStream** %this.addr, align 8 + %0 = bitcast %class.FileStream* %this1 to %class.PStream* + call void @_ZN7PStreamC2Ev(%class.PStream* %0) #2 + %1 = bitcast %class.FileStream* %this1 to i32 (...)*** + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [7 x i8*] }, { [7 x i8*] }* @_ZTV10FileStream, i32 0, inrange i32 0, i32 2) to i32 (...)**), i32 (...)*** %1, align 8 + %2 = load i8*, i8** %filename.addr, align 8 + %call = invoke %struct._IO_FILE* @fopen(i8* %2, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.43, i64 0, i64 0)) + to label %invoke.cont unwind label %lpad + +invoke.cont: ; preds = %entry + %fp = getelementptr inbounds %class.FileStream, %class.FileStream* %this1, i32 0, i32 1 + store %struct._IO_FILE* %call, %struct._IO_FILE** %fp, align 8 + %fp2 = getelementptr inbounds %class.FileStream, %class.FileStream* %this1, i32 0, i32 1 + %3 = load %struct._IO_FILE*, %struct._IO_FILE** %fp2, align 8 + %cmp = icmp eq %struct._IO_FILE* %3, null + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %invoke.cont + %4 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %5 = load i8*, i8** %filename.addr, align 8 + %call4 = invoke i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %4, i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.44, i64 0, i64 0), i8* %5) + to label %invoke.cont3 unwind label %lpad + +invoke.cont3: ; preds = %if.then + call void @exit(i32 1) #15 + unreachable + +lpad: ; preds = %if.then, %entry + %6 = landingpad { i8*, i32 } + cleanup + %7 = extractvalue { i8*, i32 } %6, 0 + store i8* %7, i8** %exn.slot, align 8 + %8 = extractvalue { i8*, i32 } %6, 1 + store i32 %8, i32* %ehselector.slot, align 4 + %9 = bitcast %class.FileStream* %this1 to %class.PStream* + invoke void @_ZN7PStreamD2Ev(%class.PStream* %9) + to label %invoke.cont5 unwind label %terminate.lpad + +if.end: ; preds = %invoke.cont + ret void + +invoke.cont5: ; preds = %lpad + br label %eh.resume + +eh.resume: ; preds = %invoke.cont5 + %exn = load i8*, i8** %exn.slot, align 8 + %sel = load i32, i32* %ehselector.slot, align 4 + %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn, 0 + %lpad.val6 = insertvalue { i8*, i32 } %lpad.val, i32 %sel, 1 + resume { i8*, i32 } %lpad.val6 + +terminate.lpad: ; preds = %lpad + %10 = landingpad { i8*, i32 } + catch i8* null + %11 = extractvalue { i8*, i32 } %10, 0 + call void @__clang_call_terminate(i8* %11) #15 + unreachable +} + +; Function Attrs: nounwind +declare dso_local float @logf(float) #7 + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN7PStreamC2Ev(%class.PStream* %this) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %class.PStream*, align 8 + store %class.PStream* %this, %class.PStream** %this.addr, align 8 + %this1 = load %class.PStream*, %class.PStream** %this.addr, align 8 + %0 = bitcast %class.PStream* %this1 to i32 (...)*** + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [7 x i8*] }, { [7 x i8*] }* @_ZTV7PStream, i32 0, inrange i32 0, i32 2) to i32 (...)**), i32 (...)*** %0, align 8 + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local i64 
@_ZN9SimStream4readEPfii(%class.SimStream* %this, float* %dest, i32 %dim, i32 %num) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %class.SimStream*, align 8 + %dest.addr = alloca float*, align 8 + %dim.addr = alloca i32, align 4 + %num.addr = alloca i32, align 4 + %count = alloca i64, align 8 + %i = alloca i32, align 4 + %k = alloca i32, align 4 + store %class.SimStream* %this, %class.SimStream** %this.addr, align 8 + store float* %dest, float** %dest.addr, align 8 + store i32 %dim, i32* %dim.addr, align 4 + store i32 %num, i32* %num.addr, align 4 + %this1 = load %class.SimStream*, %class.SimStream** %this.addr, align 8 + store i64 0, i64* %count, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc8, %entry + %0 = load i32, i32* %i, align 4 + %1 = load i32, i32* %num.addr, align 4 + %cmp = icmp slt i32 %0, %1 + br i1 %cmp, label %land.rhs, label %land.end + +land.rhs: ; preds = %for.cond + %n = getelementptr inbounds %class.SimStream, %class.SimStream* %this1, i32 0, i32 1 + %2 = load i64, i64* %n, align 8 + %cmp2 = icmp sgt i64 %2, 0 + br label %land.end + +land.end: ; preds = %land.rhs, %for.cond + %3 = phi i1 [ false, %for.cond ], [ %cmp2, %land.rhs ] + br i1 %3, label %for.body, label %for.end10 + +for.body: ; preds = %land.end + store i32 0, i32* %k, align 4 + br label %for.cond3 + +for.cond3: ; preds = %for.inc, %for.body + %4 = load i32, i32* %k, align 4 + %5 = load i32, i32* %dim.addr, align 4 + %cmp4 = icmp slt i32 %4, %5 + br i1 %cmp4, label %for.body5, label %for.end + +for.body5: ; preds = %for.cond3 + %call = call i64 @lrand48() #2 + %conv = sitofp i64 %call to float + %div = fdiv float %conv, 0x41E0000000000000 + %6 = load float*, float** %dest.addr, align 8 + %7 = load i32, i32* %i, align 4 + %8 = load i32, i32* %dim.addr, align 4 + %mul = mul nsw i32 %7, %8 + %9 = load i32, i32* %k, align 4 + %add = add nsw i32 %mul, %9 + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds 
float, float* %6, i64 %idxprom + store float %div, float* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body5 + %10 = load i32, i32* %k, align 4 + %inc = add nsw i32 %10, 1 + store i32 %inc, i32* %k, align 4 + br label %for.cond3 + +for.end: ; preds = %for.cond3 + %n6 = getelementptr inbounds %class.SimStream, %class.SimStream* %this1, i32 0, i32 1 + %11 = load i64, i64* %n6, align 8 + %dec = add nsw i64 %11, -1 + store i64 %dec, i64* %n6, align 8 + %12 = load i64, i64* %count, align 8 + %inc7 = add i64 %12, 1 + store i64 %inc7, i64* %count, align 8 + br label %for.inc8 + +for.inc8: ; preds = %for.end + %13 = load i32, i32* %i, align 4 + %inc9 = add nsw i32 %13, 1 + store i32 %inc9, i32* %i, align 4 + br label %for.cond + +for.end10: ; preds = %land.end + %14 = load i64, i64* %count, align 8 + ret i64 %14 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local i32 @_ZN9SimStream6ferrorEv(%class.SimStream* %this) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %class.SimStream*, align 8 + store %class.SimStream* %this, %class.SimStream** %this.addr, align 8 + %this1 = load %class.SimStream*, %class.SimStream** %this.addr, align 8 + ret i32 0 +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local i32 @_ZN9SimStream4feofEv(%class.SimStream* %this) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %class.SimStream*, align 8 + store %class.SimStream* %this, %class.SimStream** %this.addr, align 8 + %this1 = load %class.SimStream*, %class.SimStream** %this.addr, align 8 + %n = getelementptr inbounds %class.SimStream, %class.SimStream* %this1, i32 0, i32 1 + %0 = load i64, i64* %n, align 8 + %cmp = icmp sle i64 %0, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN9SimStreamD2Ev(%class.SimStream* %this) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca 
%class.SimStream*, align 8 + store %class.SimStream* %this, %class.SimStream** %this.addr, align 8 + %this1 = load %class.SimStream*, %class.SimStream** %this.addr, align 8 + %0 = bitcast %class.SimStream* %this1 to %class.PStream* + call void @_ZN7PStreamD2Ev(%class.PStream* %0) + ret void +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZN9SimStreamD0Ev(%class.SimStream* %this) unnamed_addr #3 comdat align 2 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %this.addr = alloca %class.SimStream*, align 8 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + store %class.SimStream* %this, %class.SimStream** %this.addr, align 8 + %this1 = load %class.SimStream*, %class.SimStream** %this.addr, align 8 + invoke void @_ZN9SimStreamD2Ev(%class.SimStream* %this1) + to label %invoke.cont unwind label %lpad + +invoke.cont: ; preds = %entry + %0 = bitcast %class.SimStream* %this1 to i8* + call void @_ZdlPv(i8* %0) #17 + ret void + +lpad: ; preds = %entry + %1 = landingpad { i8*, i32 } + cleanup + %2 = extractvalue { i8*, i32 } %1, 0 + store i8* %2, i8** %exn.slot, align 8 + %3 = extractvalue { i8*, i32 } %1, 1 + store i32 %3, i32* %ehselector.slot, align 4 + %4 = bitcast %class.SimStream* %this1 to i8* + call void @_ZdlPv(i8* %4) #17 + br label %eh.resume + +eh.resume: ; preds = %lpad + %exn = load i8*, i8** %exn.slot, align 8 + %sel = load i32, i32* %ehselector.slot, align 4 + %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn, 0 + %lpad.val2 = insertvalue { i8*, i32 } %lpad.val, i32 %sel, 1 + resume { i8*, i32 } %lpad.val2 +} + +declare dso_local void @__cxa_pure_virtual() unnamed_addr + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN7PStreamD2Ev(%class.PStream* %this) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %class.PStream*, align 8 + store %class.PStream* %this, %class.PStream** %this.addr, align 8 + %this1 = load %class.PStream*, 
%class.PStream** %this.addr, align 8 + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local void @_ZN7PStreamD0Ev(%class.PStream* %this) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %class.PStream*, align 8 + store %class.PStream* %this, %class.PStream** %this.addr, align 8 + %this1 = load %class.PStream*, %class.PStream** %this.addr, align 8 + call void @llvm.trap() #15 + unreachable +} + +; Function Attrs: cold noreturn nounwind +declare void @llvm.trap() #13 + +; Function Attrs: noinline noreturn nounwind +define linkonce_odr hidden void @__clang_call_terminate(i8* %0) #14 comdat { + %2 = call i8* @__cxa_begin_catch(i8* %0) #2 + call void @_ZSt9terminatev() #15 + unreachable +} + +declare dso_local i8* @__cxa_begin_catch(i8*) + +declare dso_local void @_ZSt9terminatev() + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local i64 @_ZN10FileStream4readEPfii(%class.FileStream* %this, float* %dest, i32 %dim, i32 %num) unnamed_addr #3 comdat align 2 { +entry: + %this.addr = alloca %class.FileStream*, align 8 + %dest.addr = alloca float*, align 8 + %dim.addr = alloca i32, align 4 + %num.addr = alloca i32, align 4 + store %class.FileStream* %this, %class.FileStream** %this.addr, align 8 + store float* %dest, float** %dest.addr, align 8 + store i32 %dim, i32* %dim.addr, align 4 + store i32 %num, i32* %num.addr, align 4 + %this1 = load %class.FileStream*, %class.FileStream** %this.addr, align 8 + %0 = load float*, float** %dest.addr, align 8 + %1 = bitcast float* %0 to i8* + %2 = load i32, i32* %dim.addr, align 4 + %conv = sext i32 %2 to i64 + %mul = mul i64 4, %conv + %3 = load i32, i32* %num.addr, align 4 + %conv2 = sext i32 %3 to i64 + %fp = getelementptr inbounds %class.FileStream, %class.FileStream* %this1, i32 0, i32 1 + %4 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8 + %call = call i64 @fread(i8* %1, i64 %mul, i64 %conv2, %struct._IO_FILE* %4) + ret i64 %call +} + +; 
Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local i32 @_ZN10FileStream6ferrorEv(%class.FileStream* %this) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %class.FileStream*, align 8 + store %class.FileStream* %this, %class.FileStream** %this.addr, align 8 + %this1 = load %class.FileStream*, %class.FileStream** %this.addr, align 8 + %fp = getelementptr inbounds %class.FileStream, %class.FileStream* %this1, i32 0, i32 1 + %0 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8 + %call = call i32 @ferror(%struct._IO_FILE* %0) #2 + ret i32 %call +} + +; Function Attrs: noinline nounwind optnone uwtable +define linkonce_odr dso_local i32 @_ZN10FileStream4feofEv(%class.FileStream* %this) unnamed_addr #6 comdat align 2 { +entry: + %this.addr = alloca %class.FileStream*, align 8 + store %class.FileStream* %this, %class.FileStream** %this.addr, align 8 + %this1 = load %class.FileStream*, %class.FileStream** %this.addr, align 8 + %fp = getelementptr inbounds %class.FileStream, %class.FileStream* %this1, i32 0, i32 1 + %0 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8 + %call = call i32 @feof(%struct._IO_FILE* %0) #2 + ret i32 %call +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZN10FileStreamD2Ev(%class.FileStream* %this) unnamed_addr #3 comdat align 2 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %this.addr = alloca %class.FileStream*, align 8 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + store %class.FileStream* %this, %class.FileStream** %this.addr, align 8 + %this1 = load %class.FileStream*, %class.FileStream** %this.addr, align 8 + %0 = bitcast %class.FileStream* %this1 to i32 (...)*** + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [7 x i8*] }, { [7 x i8*] }* @_ZTV10FileStream, i32 0, inrange i32 0, i32 2) to i32 (...)**), i32 (...)*** %0, align 8 + %call = invoke i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([21 x i8], [21 x i8]* @.str.45, i64 0, i64 0)) + to label %invoke.cont unwind label %lpad + +invoke.cont: ; preds = %entry + %fp = getelementptr inbounds %class.FileStream, %class.FileStream* %this1, i32 0, i32 1 + %1 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8 + %call3 = invoke i32 @fclose(%struct._IO_FILE* %1) + to label %invoke.cont2 unwind label %lpad + +invoke.cont2: ; preds = %invoke.cont + %2 = bitcast %class.FileStream* %this1 to %class.PStream* + call void @_ZN7PStreamD2Ev(%class.PStream* %2) + ret void + +lpad: ; preds = %invoke.cont, %entry + %3 = landingpad { i8*, i32 } + cleanup + %4 = extractvalue { i8*, i32 } %3, 0 + store i8* %4, i8** %exn.slot, align 8 + %5 = extractvalue { i8*, i32 } %3, 1 + store i32 %5, i32* %ehselector.slot, align 4 + %6 = bitcast %class.FileStream* %this1 to %class.PStream* + invoke void @_ZN7PStreamD2Ev(%class.PStream* %6) + to label %invoke.cont4 unwind label %terminate.lpad + +invoke.cont4: ; preds = %lpad + br label %eh.resume + +eh.resume: ; preds = %invoke.cont4 + %exn = load i8*, i8** %exn.slot, align 8 + %sel = load i32, i32* %ehselector.slot, align 4 + %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn, 0 + %lpad.val5 = insertvalue { i8*, i32 } %lpad.val, i32 %sel, 1 + resume { i8*, i32 } %lpad.val5 + +terminate.lpad: ; preds = %lpad + %7 = landingpad { i8*, i32 } + catch i8* null + %8 = extractvalue { i8*, i32 } %7, 0 + call void @__clang_call_terminate(i8* %8) #15 + unreachable +} + +; Function Attrs: noinline optnone uwtable +define linkonce_odr dso_local void @_ZN10FileStreamD0Ev(%class.FileStream* %this) unnamed_addr #3 comdat align 2 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %this.addr = alloca %class.FileStream*, align 8 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + store %class.FileStream* %this, %class.FileStream** %this.addr, align 8 + %this1 = load %class.FileStream*, %class.FileStream** %this.addr, align 8 
+ invoke void @_ZN10FileStreamD2Ev(%class.FileStream* %this1) + to label %invoke.cont unwind label %lpad + +invoke.cont: ; preds = %entry + %0 = bitcast %class.FileStream* %this1 to i8* + call void @_ZdlPv(i8* %0) #17 + ret void + +lpad: ; preds = %entry + %1 = landingpad { i8*, i32 } + cleanup + %2 = extractvalue { i8*, i32 } %1, 0 + store i8* %2, i8** %exn.slot, align 8 + %3 = extractvalue { i8*, i32 } %1, 1 + store i32 %3, i32* %ehselector.slot, align 4 + %4 = bitcast %class.FileStream* %this1 to i8* + call void @_ZdlPv(i8* %4) #17 + br label %eh.resume + +eh.resume: ; preds = %lpad + %exn = load i8*, i8** %exn.slot, align 8 + %sel = load i32, i32* %ehselector.slot, align 4 + %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn, 0 + %lpad.val2 = insertvalue { i8*, i32 } %lpad.val, i32 %sel, 1 + resume { i8*, i32 } %lpad.val2 +} + +declare dso_local i64 @fread(i8*, i64, i64, %struct._IO_FILE*) #1 + +; Function Attrs: nounwind +declare dso_local i32 @ferror(%struct._IO_FILE*) #7 + +; Function Attrs: nounwind +declare dso_local i32 @feof(%struct._IO_FILE*) #7 + +; Function Attrs: noinline uwtable +define internal void @_GLOBAL__sub_I_streamcluster_cuda_cpu.cu() #0 section ".text.startup" { +entry: + call void @__cxx_global_var_init() + ret void +} + +define internal void @__cuda_register_globals(i8** %0) { +entry: + %1 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void (i32, i32, i64, %struct.Point*, i32, i32, float*, float*, i32*, i8*)* @_Z19kernel_compute_costiilP5PointiiPfS1_PiPb to i8*), i8* getelementptr inbounds ([45 x i8], [45 x i8]* @0, i64 0, i64 0), i8* getelementptr inbounds ([45 x i8], [45 x i8]* @0, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null) + ret void +} + +declare dso_local i32 @__cudaRegisterFunction(i8**, i8*, i8*, i8*, i32, i8*, i8*, i8*, i8*, i32*) + +declare dso_local i32 @__cudaRegisterVar(i8**, i8*, i8*, i8*, i32, i32, i32, i32) + +declare dso_local i8** @__cudaRegisterFatBinary(i8*) + +define internal 
void @__cuda_module_ctor(i8* %0) { +entry: + %1 = call i8** @__cudaRegisterFatBinary(i8* bitcast ({ i32, i32, i8*, i8* }* @__cuda_fatbin_wrapper to i8*)) + store i8** %1, i8*** @__cuda_gpubin_handle, align 8 + call void @__cuda_register_globals(i8** %1) + call void @__cudaRegisterFatBinaryEnd(i8** %1) + %2 = call i32 @atexit(void (i8*)* @__cuda_module_dtor) + ret void +} + +declare dso_local void @__cudaRegisterFatBinaryEnd(i8**) + +declare dso_local void @__cudaUnregisterFatBinary(i8**) + +define internal void @__cuda_module_dtor(i8* %0) { +entry: + %1 = load i8**, i8*** @__cuda_gpubin_handle, align 8 + call void @__cudaUnregisterFatBinary(i8** %1) + ret void +} + +declare dso_local i32 @atexit(void (i8*)*) + +attributes #0 = { noinline uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind } +attributes #3 = { noinline optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" 
"stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { argmemonly nounwind willreturn } +attributes #5 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #6 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #7 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #8 = { nounwind readnone speculatable willreturn } +attributes #9 = { nobuiltin "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" 
"target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #10 = { nobuiltin nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #11 = { noinline norecurse optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #12 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #13 = { cold noreturn nounwind } +attributes #14 = { noinline noreturn nounwind } +attributes #15 = { noreturn nounwind } +attributes #16 = { builtin } +attributes #17 = { builtin nounwind } +attributes #18 = { nounwind readonly } + +!llvm.module.flags = !{!0, !1} +!llvm.ident = !{!2} + +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git 
ef32c611aa214dea855364efd7ba451ec5ec3f74)"} diff --git a/examples/streamcluster/streamcluster_cuda_cpu.cu b/examples/streamcluster/streamcluster_cuda_cpu.cu new file mode 100644 index 0000000..55486f0 --- /dev/null +++ b/examples/streamcluster/streamcluster_cuda_cpu.cu @@ -0,0 +1,963 @@ +/*********************************************** + streamcluster.cpp + : original source code of streamcluster with minor + modification regarding function calls + + - original code from PARSEC Benchmark Suite + - parallelization with CUDA API has been applied by + + Sang-Ha (a.k.a Shawn) Lee - sl4ge@virginia.edu + University of Virginia + Department of Electrical and Computer Engineering + Department of Computer Science + +***********************************************/ + +#include "streamcluster_cuda.cu" +#include "streamcluster_header.h" + +using namespace std; + +#define MAXNAMESIZE 1024 // max filename length +#define SEED 1 +#define SP 1 // number of repetitions of speedy must be >=1 +#define ITER 3 // iterate ITER* k log k times; ITER >= 1 +//#define PRINTINFO // Enables printing output +#define PROFILE // Enables timing info +//#define ENABLE_THREADS // Enables parallel execution +//#define INSERT_WASTE // Enables waste computation in +// dist function +#define CACHE_LINE 512 // cache line in byte + +// GLOBAL +static bool *switch_membership; // whether to switch membership in pgain +static bool *is_center; // whether a point is a center +static int *center_table; // index table of centers +static int nproc; //# of threads +bool isCoordChanged; + +// GPU Timing Info +double serial_t; +double cpu_to_gpu_t; +double gpu_to_cpu_t; +double alloc_t; +double kernel_t; +double free_t; + +// instrumentation code +#ifdef PROFILE +double time_local_search; +double time_speedy; +double time_select_feasible; +double time_gain; +double time_shuffle; +double time_gain_dist; +double time_gain_init; +#endif + +void inttofile(int data, char *filename) { + FILE *fp = fopen(filename, "w"); + 
fprintf(fp, "%d ", data); + fclose(fp); +} + +double gettime() { + struct timeval t; + gettimeofday(&t, NULL); + return t.tv_sec + t.tv_usec * 1e-6; +} + +int isIdentical(float *i, float *j, int D) { + // tells whether two points of D dimensions are identical + + int a = 0; + int equal = 1; + + while (equal && a < D) { + if (i[a] != j[a]) + equal = 0; + else + a++; + } + if (equal) + return 1; + else + return 0; +} + +/* comparator for floating point numbers */ +static int floatcomp(const void *i, const void *j) { + float a, b; + a = *(float *)(i); + b = *(float *)(j); + if (a > b) + return (1); + if (a < b) + return (-1); + return (0); +} + +/* shuffle points into random order */ +void shuffle(Points *points) { +#ifdef PROFILE + double t1 = gettime(); +#endif + long i, j; + Point temp; + for (i = 0; i < points->num - 1; i++) { + j = (lrand48() % (points->num - i)) + i; + temp = points->p[i]; + points->p[i] = points->p[j]; + points->p[j] = temp; + } +#ifdef PROFILE + double t2 = gettime(); + time_shuffle += t2 - t1; +#endif +} + +/* shuffle an array of integers */ +void intshuffle(int *intarray, int length) { +#ifdef PROFILE + double t1 = gettime(); +#endif + long i, j; + int temp; + for (i = 0; i < length; i++) { + j = (lrand48() % (length - i)) + i; + temp = intarray[i]; + intarray[i] = intarray[j]; + intarray[j] = temp; + } +#ifdef PROFILE + double t2 = gettime(); + time_shuffle += t2 - t1; +#endif +} + +#ifdef INSERT_WASTE +float waste(float s) { + for (int i = 0; i < 4; i++) { + s += pow(s, 0.78); + } + return s; +} +#endif + +/* compute Euclidean distance squared between two points */ +float dist(Point p1, Point p2, int dim) { + int i; + float result = 0.0; + for (i = 0; i < dim; i++) + result += (p1.coord[i] - p2.coord[i]) * (p1.coord[i] - p2.coord[i]); +#ifdef INSERT_WASTE + float s = waste(result); + result += s; + result -= s; +#endif + return (result); +} + +/* run speedy on the points, return total cost of solution */ +float pspeedy(Points *points, 
float z, long *kcenter, int pid, + pthread_barrier_t *barrier) { +#ifdef PROFILE + double t1 = gettime(); +#endif + +#ifdef ENABLE_THREADS + pthread_barrier_wait(barrier); +#endif + // my block + long bsize = points->num / nproc; + long k1 = bsize * pid; + long k2 = k1 + bsize; + if (pid == nproc - 1) + k2 = points->num; + + static float totalcost; + + static bool open = false; + static float *costs; // cost for each thread. + static int i; + +#ifdef ENABLE_THREADS + static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; + static pthread_cond_t cond = PTHREAD_COND_INITIALIZER; +#endif + +#ifdef PRINTINFO + if (pid == 0) { + fprintf(stderr, "Speedy: facility cost %lf\n", z); + } +#endif + + /* create center at first point, send it to itself */ + for (int k = k1; k < k2; k++) { + float distance = dist(points->p[k], points->p[0], points->dim); + points->p[k].cost = distance * points->p[k].weight; + points->p[k].assign = 0; + } + + if (pid == 0) { + *kcenter = 1; + costs = (float *)malloc(sizeof(float) * nproc); + } + + if (pid != + 0) { // we are not the master threads. we wait until a center is opened. + while (1) { +#ifdef ENABLE_THREADS + pthread_mutex_lock(&mutex); + while (!open) + pthread_cond_wait(&cond, &mutex); + pthread_mutex_unlock(&mutex); +#endif + if (i >= points->num) + break; + for (int k = k1; k < k2; k++) { + float distance = dist(points->p[i], points->p[k], points->dim); + if (distance * points->p[k].weight < points->p[k].cost) { + points->p[k].cost = distance * points->p[k].weight; + points->p[k].assign = i; + } + } +#ifdef ENABLE_THREADS + pthread_barrier_wait(barrier); + pthread_barrier_wait(barrier); +#endif + } + } else { // I am the master thread. I decide whether to open a center and + // notify others if so. 
+ for (i = 1; i < points->num; i++) { + bool to_open = + ((float)lrand48() / (float)INT_MAX) < (points->p[i].cost / z); + if (to_open) { + (*kcenter)++; +#ifdef ENABLE_THREADS + pthread_mutex_lock(&mutex); +#endif + open = true; +#ifdef ENABLE_THREADS + pthread_mutex_unlock(&mutex); + pthread_cond_broadcast(&cond); +#endif + for (int k = k1; k < k2; k++) { + float distance = dist(points->p[i], points->p[k], points->dim); + if (distance * points->p[k].weight < points->p[k].cost) { + points->p[k].cost = distance * points->p[k].weight; + points->p[k].assign = i; + } + } +#ifdef ENABLE_THREADS + pthread_barrier_wait(barrier); +#endif + open = false; +#ifdef ENABLE_THREADS + pthread_barrier_wait(barrier); +#endif + } + } +#ifdef ENABLE_THREADS + pthread_mutex_lock(&mutex); +#endif + open = true; +#ifdef ENABLE_THREADS + pthread_mutex_unlock(&mutex); + pthread_cond_broadcast(&cond); +#endif + } +#ifdef ENABLE_THREADS + pthread_barrier_wait(barrier); +#endif + open = false; + float mytotal = 0; + for (int k = k1; k < k2; k++) { + mytotal += points->p[k].cost; + } + costs[pid] = mytotal; +#ifdef ENABLE_THREADS + pthread_barrier_wait(barrier); +#endif + // aggregate costs from each thread + if (pid == 0) { + totalcost = z * (*kcenter); + for (int i = 0; i < nproc; i++) { + totalcost += costs[i]; + } + free(costs); + } +#ifdef ENABLE_THREADS + pthread_barrier_wait(barrier); +#endif + +#ifdef PRINTINFO + if (pid == 0) { + fprintf(stderr, "Speedy opened %d facilities for total cost %lf\n", + *kcenter, totalcost); + fprintf(stderr, "Distance Cost %lf\n", totalcost - z * (*kcenter)); + } +#endif + +#ifdef PROFILE + double t2 = gettime(); + if (pid == 0) { + time_speedy += t2 - t1; + } +#endif + return (totalcost); +} + +/* facility location on the points using local search */ +/* z is the facility cost, returns the total cost and # of centers */ +/* assumes we are seeded with a reasonable solution */ +/* cost should represent this solution's cost */ +/* halt if there is < e 
improvement after iter calls to gain */ +/* feasible is an array of numfeasible points which may be centers */ + +float pFL(Points *points, int *feasible, int numfeasible, float z, long *k, + int kmax, float cost, long iter, float e, int pid, + pthread_barrier_t *barrier) { +#ifdef ENABLE_THREADS + pthread_barrier_wait(barrier); +#endif + long i; + long x; + float change; + long numberOfPoints; + + change = cost; + /* continue until we run iter iterations without improvement */ + /* stop instead if improvement is less than e */ + while (change / cost > 1.0 * e) { + change = 0.0; + numberOfPoints = points->num; + /* randomize order in which centers are considered */ + + if (pid == 0) { + intshuffle(feasible, numfeasible); + } +#ifdef ENABLE_THREADS + pthread_barrier_wait(barrier); +#endif + + for (i = 0; i < iter; i++) { + x = i % numfeasible; + change += + pgain(feasible[x], points, z, k, kmax, is_center, center_table, + switch_membership, isCoordChanged, &serial_t, &cpu_to_gpu_t, + &gpu_to_cpu_t, &alloc_t, &kernel_t, &free_t); + } + + cost -= change; +#ifdef PRINTINFO + if (pid == 0) { + fprintf(stderr, "%d centers, cost %lf, total distance %lf\n", *k, cost, + cost - z * (*k)); + } +#endif +#ifdef ENABLE_THREADS + pthread_barrier_wait(barrier); +#endif + } + return (cost); +} + +int selectfeasible_fast(Points *points, int **feasible, int kmin, int pid, + pthread_barrier_t *barrier) { +#ifdef PROFILE + double t1 = gettime(); +#endif + + int numfeasible = points->num; + if (numfeasible > (ITER * kmin * log((float)kmin))) + numfeasible = (int)(ITER * kmin * log((float)kmin)); + *feasible = (int *)malloc(numfeasible * sizeof(int)); + + float *accumweight; + float totalweight; + + /* + Calcuate my block. + For now this routine does not seem to be the bottleneck, so it is not + parallelized. When necessary, this can be parallelized by setting k1 and k2 + to proper values and calling this routine from all threads ( it is called + only by thread 0 for now ). 
Note that when parallelized, the randomization + might not be the same and it might not be difficult to measure the parallel + speed-up for the whole program. + */ + // long bsize = numfeasible; + long k1 = 0; + long k2 = numfeasible; + + float w; + int l, r, k; + + /* not many points, all will be feasible */ + if (numfeasible == points->num) { + for (int i = k1; i < k2; i++) + (*feasible)[i] = i; + return numfeasible; + } + + accumweight = (float *)malloc(sizeof(float) * points->num); + accumweight[0] = points->p[0].weight; + totalweight = 0; + for (int i = 1; i < points->num; i++) { + accumweight[i] = accumweight[i - 1] + points->p[i].weight; + } + totalweight = accumweight[points->num - 1]; + + for (int i = k1; i < k2; i++) { + w = (lrand48() / (float)INT_MAX) * totalweight; + // binary search + l = 0; + r = points->num - 1; + if (accumweight[0] > w) { + (*feasible)[i] = 0; + continue; + } + while (l + 1 < r) { + k = (l + r) / 2; + if (accumweight[k] > w) { + r = k; + } else { + l = k; + } + } + (*feasible)[i] = r; + } + + free(accumweight); + +#ifdef PROFILE + double t2 = gettime(); + time_select_feasible += t2 - t1; +#endif + return numfeasible; +} + +/* compute approximate kmedian on the points */ +float pkmedian(Points *points, long kmin, long kmax, long *kfinal, int pid, + pthread_barrier_t *barrier) { + int i; + float cost; + float lastcost; + float hiz, loz, z; + + static long k; + static int *feasible; + static int numfeasible; + static float *hizs; + + if (pid == 0) + hizs = (float *)calloc(nproc, sizeof(float)); + hiz = loz = 0.0; + long numberOfPoints = points->num; + long ptDimension = points->dim; + + // my block + long bsize = points->num / nproc; + long k1 = bsize * pid; + long k2 = k1 + bsize; + if (pid == nproc - 1) + k2 = points->num; + +#ifdef PRINTINFO + if (pid == 0) { + printf("Starting Kmedian procedure\n"); + printf("%i points in %i dimensions\n", numberOfPoints, ptDimension); + } +#endif + +#ifdef ENABLE_THREADS + 
pthread_barrier_wait(barrier); +#endif + + float myhiz = 0; + for (long kk = k1; kk < k2; kk++) { + myhiz += + dist(points->p[kk], points->p[0], ptDimension) * points->p[kk].weight; + } + hizs[pid] = myhiz; + +#ifdef ENABLE_THREADS + pthread_barrier_wait(barrier); +#endif + + for (int i = 0; i < nproc; i++) { + hiz += hizs[i]; + } + + loz = 0.0; + z = (hiz + loz) / 2.0; + /* NEW: Check whether more centers than points! */ + if (points->num <= kmax) { + /* just return all points as facilities */ + for (long kk = k1; kk < k2; kk++) { + points->p[kk].assign = kk; + points->p[kk].cost = 0; + } + cost = 0; + if (pid == 0) { + free(hizs); + *kfinal = k; + } + return cost; + } + + if (pid == 0) + shuffle(points); + cost = pspeedy(points, z, &k, pid, barrier); + +#ifdef PRINTINFO + if (pid == 0) + printf("thread %d: Finished first call to speedy, cost=%lf, k=%i\n", pid, + cost, k); +#endif + i = 0; + /* give speedy SP chances to get at least kmin/2 facilities */ + while ((k < kmin) && (i < SP)) { + cost = pspeedy(points, z, &k, pid, barrier); + i++; + } + +#ifdef PRINTINFO + if (pid == 0) + printf("thread %d: second call to speedy, cost=%lf, k=%d\n", pid, cost, k); +#endif + /* if still not enough facilities, assume z is too high */ + while (k < kmin) { +#ifdef PRINTINFO + if (pid == 0) { + printf("%lf %lf\n", loz, hiz); + printf("Speedy indicates we should try lower z\n"); + } +#endif + if (i >= SP) { + hiz = z; + z = (hiz + loz) / 2.0; + i = 0; + } + if (pid == 0) + shuffle(points); + cost = pspeedy(points, z, &k, pid, barrier); + i++; + } + + /* now we begin the binary search for real */ + /* must designate some points as feasible centers */ + /* this creates more consistancy between FL runs */ + /* helps to guarantee correct # of centers at the end */ + + if (pid == 0) { + numfeasible = selectfeasible_fast(points, &feasible, kmin, pid, barrier); + for (int i = 0; i < points->num; i++) { + is_center[points->p[i].assign] = true; + } + } + +#ifdef ENABLE_THREADS + 
pthread_barrier_wait(barrier); +#endif + + while (1) { + +#ifdef PRINTINFO + if (pid == 0) { + printf("loz = %lf, hiz = %lf\n", loz, hiz); + printf("Running Local Search...\n"); + } +#endif + /* first get a rough estimate on the FL solution */ + // pthread_barrier_wait(barrier); + lastcost = cost; + cost = pFL(points, feasible, numfeasible, z, &k, kmax, cost, + (long)(ITER * kmax * log((float)kmax)), 0.1, pid, barrier); + + /* if number of centers seems good, try a more accurate FL */ + if (((k <= (1.1) * kmax) && (k >= (0.9) * kmin)) || + ((k <= kmax + 2) && (k >= kmin - 2))) { + +#ifdef PRINTINFO + if (pid == 0) { + printf("Trying a more accurate local search...\n"); + } +#endif + /* may need to run a little longer here before halting without + improvement */ + + cost = pFL(points, feasible, numfeasible, z, &k, kmax, cost, + (long)(ITER * kmax * log((float)kmax)), 0.001, pid, barrier); + } + + if (k > kmax) { + /* facilities too cheap */ + /* increase facility cost and up the cost accordingly */ + loz = z; + z = (hiz + loz) / 2.0; + cost += (z - loz) * k; + } + if (k < kmin) { + /* facilities too expensive */ + /* decrease facility cost and reduce the cost accordingly */ + hiz = z; + z = (hiz + loz) / 2.0; + cost += (z - hiz) * k; + } + + /* if k is good, return the result */ + /* if we're stuck, just give up and return what we have */ + if (((k <= kmax) && (k >= kmin)) || ((loz >= (0.999) * hiz))) { + break; + } +#ifdef ENABLE_THREADS + pthread_barrier_wait(barrier); +#endif + } + + // clean up... 
+ if (pid == 0) { + free(feasible); + free(hizs); + *kfinal = k; + } + + return cost; +} + +/* compute the means for the k clusters */ +int contcenters(Points *points) { + long i, ii; + float relweight; + + for (i = 0; i < points->num; i++) { + /* compute relative weight of this point to the cluster */ + if (points->p[i].assign != i) { + relweight = points->p[points->p[i].assign].weight + points->p[i].weight; + relweight = points->p[i].weight / relweight; + for (ii = 0; ii < points->dim; ii++) { + points->p[points->p[i].assign].coord[ii] *= 1.0 - relweight; + points->p[points->p[i].assign].coord[ii] += + points->p[i].coord[ii] * relweight; + } + points->p[points->p[i].assign].weight += points->p[i].weight; + } + } + + return 0; +} + +/* copy centers from points to centers */ +void copycenters(Points *points, Points *centers, long *centerIDs, + long offset) { + long i; + long k; + + bool *is_a_median = (bool *)calloc(points->num, sizeof(bool)); + + /* mark the centers */ + for (i = 0; i < points->num; i++) { + is_a_median[points->p[i].assign] = 1; + } + + k = centers->num; + + /* count how many */ + for (i = 0; i < points->num; i++) { + if (is_a_median[i]) { + memcpy(centers->p[k].coord, points->p[i].coord, + points->dim * sizeof(float)); + centers->p[k].weight = points->p[i].weight; + centerIDs[k] = i + offset; + k++; + } + } + + centers->num = k; + + free(is_a_median); +} + +void *localSearchSub(void *arg_) { + pkmedian_arg_t *arg = (pkmedian_arg_t *)arg_; + pkmedian(arg->points, arg->kmin, arg->kmax, arg->kfinal, arg->pid, + arg->barrier); + + return NULL; +} + +void localSearch(Points *points, long kmin, long kmax, long *kfinal) { +#ifdef PROFILE + double t1 = gettime(); +#endif + + pthread_barrier_t barrier; +#ifdef ENABLE_THREADS + pthread_barrier_init(&barrier, NULL, nproc); +#endif + pthread_t *threads = new pthread_t[nproc]; + pkmedian_arg_t *arg = new pkmedian_arg_t[nproc]; + + for (int i = 0; i < nproc; i++) { + arg[i].points = points; + arg[i].kmin = 
kmin; + arg[i].kmax = kmax; + arg[i].pid = i; + arg[i].kfinal = kfinal; + + arg[i].barrier = &barrier; +#ifdef ENABLE_THREADS + pthread_create(threads + i, NULL, localSearchSub, (void *)&arg[i]); +#else + localSearchSub(&arg[0]); +#endif + } + + for (int i = 0; i < nproc; i++) { +#ifdef ENABLE_THREADS + pthread_join(threads[i], NULL); +#endif + } + + delete[] threads; + delete[] arg; +#ifdef ENABLE_THREADS + pthread_barrier_destroy(&barrier); +#endif + +#ifdef PROFILE + double t2 = gettime(); + time_local_search += t2 - t1; +#endif +} + +void outcenterIDs(Points *centers, long *centerIDs, char *outfile) { + FILE *fp = fopen(outfile, "w"); + if (fp == NULL) { + fprintf(stderr, "error opening %s\n", outfile); + exit(1); + } + int *is_a_median = (int *)calloc(sizeof(int), centers->num); + for (int i = 0; i < centers->num; i++) { + is_a_median[centers->p[i].assign] = 1; + } + + for (int i = 0; i < centers->num; i++) { + if (is_a_median[i]) { + fprintf(fp, "%u\n", centerIDs[i]); + fprintf(fp, "%lf\n", centers->p[i].weight); + for (int k = 0; k < centers->dim; k++) { + fprintf(fp, "%lf ", centers->p[i].coord[k]); + } + fprintf(fp, "\n\n"); + } + } + fclose(fp); +} + +void streamCluster(PStream *stream, long kmin, long kmax, int dim, + long chunksize, long centersize, char *outfile) { + float *block = (float *)malloc(chunksize * dim * sizeof(float)); + float *centerBlock = (float *)malloc(centersize * dim * sizeof(float)); + long *centerIDs = (long *)malloc(centersize * dim * sizeof(long)); + + if (block == NULL) { + fprintf(stderr, "not enough memory for a chunk!\n"); + exit(1); + } + + Points points; + points.dim = dim; + points.num = chunksize; + points.p = (Point *)malloc(chunksize * sizeof(Point)); + for (int i = 0; i < chunksize; i++) { + points.p[i].coord = &block[i * dim]; + } + + Points centers; + centers.dim = dim; + centers.p = (Point *)malloc(centersize * sizeof(Point)); + centers.num = 0; + + for (int i = 0; i < centersize; i++) { + centers.p[i].coord = 
&centerBlock[i * dim]; + centers.p[i].weight = 1.0; + } + + long IDoffset = 0; + long kfinal; + while (1) { + + size_t numRead = stream->read(block, dim, chunksize); + fprintf(stderr, "read %d points\n", numRead); + + if (stream->ferror() || + numRead < (unsigned int)chunksize && !stream->feof()) { + fprintf(stderr, "error reading data!\n"); + exit(1); + } + + points.num = numRead; + for (int i = 0; i < points.num; i++) { + points.p[i].weight = 1.0; + } + + switch_membership = (bool *)malloc(points.num * sizeof(bool)); + is_center = (bool *)calloc(points.num, sizeof(bool)); + center_table = (int *)malloc(points.num * sizeof(int)); + + localSearch(&points, kmin, kmax, &kfinal); + + fprintf(stderr, "finish local search\n"); + + contcenters(&points); + isCoordChanged = true; + + if (kfinal + centers.num > centersize) { + // here we don't handle the situation where # of centers gets too large. + fprintf(stderr, "oops! no more space for centers\n"); + exit(1); + } + +#ifdef PRINTINFO + printf("finish cont center\n"); +#endif + + copycenters(&points, &centers, centerIDs, IDoffset); + IDoffset += numRead; + +#ifdef PRINTINFO + printf("finish copy centers\n"); +#endif + + free(is_center); + free(switch_membership); + free(center_table); + + if (stream->feof()) { + break; + } + } + + // finally cluster all temp centers + switch_membership = (bool *)malloc(centers.num * sizeof(bool)); + is_center = (bool *)calloc(centers.num, sizeof(bool)); + center_table = (int *)malloc(centers.num * sizeof(int)); + + localSearch(&centers, kmin, kmax, &kfinal); + contcenters(&centers); + outcenterIDs(&centers, centerIDs, outfile); +} + +int main(int argc, char **argv) { + cudaSetDevice(0); + char *outfilename = new char[MAXNAMESIZE]; + char *infilename = new char[MAXNAMESIZE]; + long kmin, kmax, n, chunksize, clustersize; + int dim; +#ifdef PARSEC_VERSION +#define __PARSEC_STRING(x) #x +#define __PARSEC_XSTRING(x) __PARSEC_STRING(x) + printf( + "PARSEC Benchmark Suite Version "__PARSEC_XSTRING(PARSEC_VERSION) 
"\n"); + fflush(NULL); +#else + printf("PARSEC Benchmark Suite\n"); + fflush(NULL); +#endif // PARSEC_VERSION +#ifdef ENABLE_PARSEC_HOOKS + __parsec_bench_begin(__parsec_streamcluster); +#endif + + if (argc < 10) { + fprintf(stderr, + "usage: %s k1 k2 d n chunksize clustersize infile outfile nproc\n", + argv[0]); + fprintf(stderr, " k1: Min. number of centers allowed\n"); + fprintf(stderr, " k2: Max. number of centers allowed\n"); + fprintf(stderr, " d: Dimension of each data point\n"); + fprintf(stderr, " n: Number of data points\n"); + fprintf(stderr, + " chunksize: Number of data points to handle per step\n"); + fprintf(stderr, " clustersize: Maximum number of intermediate centers\n"); + fprintf(stderr, " infile: Input file (if n<=0)\n"); + fprintf(stderr, " outfile: Output file\n"); + fprintf(stderr, " nproc: Number of threads to use\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "if n > 0, points will be randomly generated instead of " + "reading from infile.\n"); + exit(1); + } + kmin = atoi(argv[1]); + kmax = atoi(argv[2]); + dim = atoi(argv[3]); + n = atoi(argv[4]); + chunksize = atoi(argv[5]); + clustersize = atoi(argv[6]); + strcpy(infilename, argv[7]); + strcpy(outfilename, argv[8]); + nproc = atoi(argv[9]); + + srand48(SEED); + PStream *stream; + if (n > 0) { + stream = new SimStream(n); + } else { + stream = new FileStream(infilename); + } + + double t1 = gettime(); + +#ifdef ENABLE_PARSEC_HOOKS + __parsec_roi_begin(); +#endif + + serial_t = 0.0; + cpu_to_gpu_t = 0.0; + gpu_to_cpu_t = 0.0; + alloc_t = 0.0; + free_t = 0.0; + kernel_t = 0.0; + + isCoordChanged = false; + + streamCluster(stream, kmin, kmax, dim, chunksize, clustersize, outfilename); + + freeDevMem(); + freeHostMem(); + +#ifdef ENABLE_PARSEC_HOOKS + __parsec_roi_end(); +#endif + + double t2 = gettime(); + + printf("time = %lfs\n", t2 - t1); + + delete stream; + +#ifdef PROFILE + printf("time pgain = %lfs\n", time_gain); + printf("time pgain_dist = %lfs\n", time_gain_dist); + printf("time 
pgain_init = %lfs\n", time_gain_init); + printf("time pselect = %lfs\n", time_select_feasible); + printf("time pspeedy = %lfs\n", time_speedy); + printf("time pshuffle = %lfs\n", time_shuffle); + printf("time localSearch = %lfs\n", time_local_search); + printf("\n\n"); + printf("====CUDA Timing info (pgain)====\n"); + printf("time serial = %lfs\n", serial_t / 1000); + printf("time CPU to GPU memory copy = %lfs\n", cpu_to_gpu_t / 1000); + printf("time GPU to CPU memory copy back = %lfs\n", gpu_to_cpu_t / 1000); + printf("time GPU malloc = %lfs\n", alloc_t / 1000); + printf("time GPU free = %lfs\n", free_t / 1000); + printf("time kernel = %lfs\n", kernel_t / 1000); +#endif + +#ifdef ENABLE_PARSEC_HOOKS + __parsec_bench_end(); +#endif + + return 0; +} diff --git a/examples/streamcluster/streamcluster_header.h b/examples/streamcluster/streamcluster_header.h new file mode 100644 index 0000000..cc9a240 --- /dev/null +++ b/examples/streamcluster/streamcluster_header.h @@ -0,0 +1,143 @@ +/************************************************ + streamcluster_cuda_header.cu + : header file to streamcluster + + - original code from PARSEC Benchmark Suite + - parallelization with CUDA API has been applied by + + Sang-Ha (a.k.a Shawn) Lee - sl4ge@virginia.edu + University of Virginia + Department of Electrical and Computer Engineering + Department of Computer Science + +***********************************************/ + +#ifndef STREAMCLUSTER_CUDA_HEADER_CU +#define STREAMCLUSTER_CUDA_HEADER_CU + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifdef ENABLE_PARSEC_HOOKS +#include +#endif + +using namespace std; + +/* this structure represents a point */ +/* these will be passed around to avoid copying coordinates */ +typedef struct { + float weight; + float *coord; + long assign; /* number of point where this one is assigned */ + float cost; /* cost of that assignment, weight*distance */ +} Point; + +/* this is the 
array of points */ +typedef struct { + long num; /* number of points; may not be N if this is a sample */ + int dim; /* dimensionality */ + Point *p; /* the array itself */ +} Points; + +struct pkmedian_arg_t { + Points *points; + long kmin; + long kmax; + long *kfinal; + int pid; + pthread_barrier_t *barrier; +}; + +class PStream { +public: + virtual size_t read(float *dest, int dim, int num) = 0; + virtual int ferror() = 0; + virtual int feof() = 0; + virtual ~PStream() {} +}; + +// synthetic stream +class SimStream : public PStream { +public: + SimStream(long n_) { n = n_; } + size_t read(float *dest, int dim, int num) { + size_t count = 0; + for (int i = 0; i < num && n > 0; i++) { + for (int k = 0; k < dim; k++) { + dest[i * dim + k] = lrand48() / (float)INT_MAX; + } + n--; + count++; + } + return count; + } + int ferror() { return 0; } + int feof() { return n <= 0; } + ~SimStream() {} + +private: + long n; +}; + +class FileStream : public PStream { +public: + FileStream(char *filename) { + fp = fopen(filename, "rb"); + if (fp == NULL) { + fprintf(stderr, "error opening file %s\n.", filename); + exit(1); + } + } + size_t read(float *dest, int dim, int num) { + return std::fread(dest, sizeof(float) * dim, num, fp); + } + int ferror() { return std::ferror(fp); } + int feof() { return std::feof(fp); } + ~FileStream() { + printf("closing file stream\n"); + fclose(fp); + } + +private: + FILE *fp; +}; + +/* function prototypes */ +double gettime(); +int isIdentical(float *, float *, int); +// static int floatcomp(const void*, const void*); +void shuffle(Points *); +void intshuffle(int *, int); +float waste(float); +float dist(Point, Point, int); +float pspeedy(Points *, float, long, int, pthread_barrier_t *); +float pgain_old(long, Points *, float, long int *, int, pthread_barrier_t *); +float pFL(Points *, int *, int, float, long *, float, long, float, int, + pthread_barrier_t *); +int selectfeasible_fast(Points *, int **, int, int, pthread_barrier_t *); +float 
pkmedian(Points *, long, long, long *, int, pthread_barrier_t *); +int contcenters(Points *); +void copycenters(Points *, Points *, long *, long); +void *localSearchSub(void *); +void localSearch(Points *, long, long, long *); +void outcenterIDs(Points *, long *, char *); +void streamCluster(PStream *, long, long, int, long, long, char *); +float pgain(long, Points *, float, long int *, int, bool *, int *, bool *, bool, + double *, double *, double *, double *, double *, double *); +void allocDevMem(int, int, int); +void allocHostMem(int, int, int); +void freeDevMem(); +void freeHostMem(); + +#endif diff --git a/runtime/include/cudaKernelImpl.h b/runtime/include/cudaKernelImpl.h new file mode 100644 index 0000000..4aa94de --- /dev/null +++ b/runtime/include/cudaKernelImpl.h @@ -0,0 +1,25 @@ +#ifndef __RUNTIME_IMPL__ +#define __KERNEL_IMPL__ +#include "cudaStatus.h" +#include "structures.h" +#include +extern "C" { +double __nv_exp(double); +double __nv_sqrt(double); +float __nv_sqrtf(float); +float __nv_powif(float, int); +float __nv_logf(float); +float __nv_expf(float); +float __nv_log10f(float); +float __nv_fast_log2f(float); +double __nv_powi(double, int); +float __nv_powf(float, float); +float __nv_fast_powf(float, float); +float __nv_fmodf(float, float); +int __nv_isnanf(float); +int __nv_isinff(float); +float __nv_fabsf(float); +int __nvvm_mul24_i(int, int); +double _ZL3expd(double); +} +#endif diff --git a/runtime/include/cudaRuntimeImpl.h b/runtime/include/cudaRuntimeImpl.h index 0f5b8ae..1823206 100644 --- a/runtime/include/cudaRuntimeImpl.h +++ b/runtime/include/cudaRuntimeImpl.h @@ -2,8 +2,13 @@ #define __RUNTIME_IMPL__ #include "cudaStatus.h" #include "structures.h" +#include +extern "C" { +cudaError_t cudaGetDevice(int *devPtr); +const char *cudaGetErrorname(cudaError_t); cudaError_t cudaDeviceReset(void); cudaError_t cudaDeviceSynchronize(void); +cudaError_t cudaThreadSynchronize(void); cudaError_t cudaFree(void *devPtr); cudaError_t 
cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, @@ -11,9 +16,13 @@ cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, cudaError_t cudaMalloc(void **devPtr, size_t size); cudaError_t cudaMemcpy(void *dst, const void *src, size_t count, cudaMemcpyKind kind); +cudaError_t cudaMemcpyToSymbol_host(void *dst, const void *src, size_t count, + size_t offset, cudaMemcpyKind kind); +cudaError_t cudaMemset(void *devPtr, int value, size_t count); cudaError_t cudaSetDevice(int device); cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src); cudaError_t cudaStreamCreate(cudaStream_t *pStream); cudaError_t cudaStreamDestroy(cudaStream_t stream); cudaError_t cudaStreamSynchronize(cudaStream_t stream); +} #endif diff --git a/runtime/include/cudaStatus.h b/runtime/include/cudaStatus.h index 8b60edc..79b5b19 100644 --- a/runtime/include/cudaStatus.h +++ b/runtime/include/cudaStatus.h @@ -2,17 +2,17 @@ #define __RUNTIME_STATUS__ #include -enum cudaError_t { - CudaSuccess = 0, - CudaErrorInvalidValue = 1, - CudaErrorInvalidMemoryAllocation = 2, -}; +// enum cudaError_t { +// CudaSuccess = 0, +// CudaErrorInvalidValue = 1, +// CudaErrorInvalidMemoryAllocation = 2, +// }; -enum cudaMemcpyKind { - cudaMemcpyHostToHost = 0, - cudaMemcpyHostToDevice = 1, - cudaMemcpyDeviceToHost = 2, - cudaMemcpyDeviceToDevice = 3, - cudaMemcpyDefault = 4, -}; +// enum cudaMemcpyKind { +// cudaMemcpyHostToHost = 0, +// cudaMemcpyHostToDevice = 1, +// cudaMemcpyDeviceToHost = 2, +// cudaMemcpyDeviceToDevice = 3, +// cudaMemcpyDefault = 4, +// }; #endif diff --git a/runtime/lib/cudaKernelImpl.cpp b/runtime/lib/cudaKernelImpl.cpp new file mode 100644 index 0000000..56803c4 --- /dev/null +++ b/runtime/lib/cudaKernelImpl.cpp @@ -0,0 +1,19 @@ +#include "cudaKernelImpl.h" +#include +double __nv_exp(double base) { return exp(base); } +double __nv_sqrt(double v) { return sqrt(v); } +float __nv_sqrtf(float v) { return sqrt(v); } 
+float __nv_powif(float base, int exp) { return pow(base, exp); } +float __nv_logf(float v) { return logf(v); } +float __nv_expf(float v) { return expf(v); } +float __nv_log10f(float v) { return log10f(v); } +float __nv_fast_log2f(float v) { return log2f(v); } +double __nv_powi(double base, int exp) { return pow(base, exp); } +float __nv_powf(float base, float exp) { return pow(base, exp); } +float __nv_fast_powf(float base, float exp) { return pow(base, exp); } +float __nv_fmodf(float x, float y) { return fmod(x, y); } +int __nv_isnanf(float v) { return isnan(v); } +int __nv_isinff(float v) { return isinf(v); } +float __nv_fabsf(float v) { return abs(v); } +int __nvvm_mul24_i(int a, int b) { return a * b; } +double _ZL3expd(double base) { return exp(base); } diff --git a/runtime/lib/cudaRuntimeImpl.cpp b/runtime/lib/cudaRuntimeImpl.cpp index 83054f3..d15dae1 100644 --- a/runtime/lib/cudaRuntimeImpl.cpp +++ b/runtime/lib/cudaRuntimeImpl.cpp @@ -1,19 +1,37 @@ #include "cudaRuntimeImpl.h" #include "api.h" +#include "cuda_runtime.h" +#include "def.h" +#include "macros.h" +#include "structures.h" +#include +#include #include #include +#include +cudaError_t cudaGetDevice(int *devPtr) { *devPtr = 0; } +const char *cudaGetErrorName(cudaError_t error) { return "SUCCESS\n"; } cudaError_t cudaDeviceReset(void) { scheduler_uninit(); } cudaError_t cudaDeviceSynchronize(void) { cuSynchronizeBarrier(); } +cudaError_t cudaThreadSynchronize(void) { cuSynchronizeBarrier(); } cudaError_t cudaFree(void *devPtr) { free(devPtr); } +cudaError_t cudaFreeHost(void *devPtr) { free(devPtr); } + cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) { // if scheduler is null init device + // printf( + // "cudaLaunchKernel : Grid: x:%d y:%d z:%d Block: %d, %d, %d ShMem:%lu\n + // ", gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z, + // sharedMem); cu_kernel *ker = - create_kernel(func, gridDim, 
blockDim, &args, sharedMem, stream); + create_kernel(func, gridDim, blockDim, args, sharedMem, stream); int lstatus = cuLaunchKernel(&ker); + + // std::cout << "ret cudaLKernel" << std::endl; } cudaError_t cudaMalloc(void **devPtr, size_t size) { *devPtr = malloc(size); @@ -21,6 +39,10 @@ cudaError_t cudaMalloc(void **devPtr, size_t size) { return cudaErrorMemoryAllocation; return cudaSuccess; } +cudaError_t cudaMemset(void *devPtr, int value, size_t count) { + memset(devPtr, value, count); + return cudaSuccess; +} cudaError_t cudaMemcpy(void *dst, const void *src, size_t count, cudaMemcpyKind kind) { if (kind == cudaMemcpyHostToHost) { @@ -43,9 +65,18 @@ cudaError_t cudaMemcpy(void *dst, const void *src, size_t count, return cudaSuccess; } +cudaError_t cudaMemcpyToSymbol_host(void *dst, const void *src, size_t count, + size_t offset, cudaMemcpyKind kind) { + assert(offset == 0 && "DO not support offset !=0\n"); + memcpy(dst, src + offset, count); + return cudaSuccess; +} + cudaError_t cudaSetDevice(int device) { // error checking + // std::cout << "cudaSetDevice Called" << std::endl; init_device(); + // std::cout << "cudaSetDevice Ret" << std::endl; } cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src) { @@ -62,6 +93,14 @@ cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src) { return cudaSuccess; // 0 } +static int stream_counter = 1; +/* + cudaStream_t is a Opaque Structure + + Overwrites cudaStream_t into custom cstreamData structure + (does hardware uses the cudaStream_t stream) + +*/ cudaError_t cudaStreamCreate(cudaStream_t *pStream) { cstreamData *s = (cstreamData *)calloc(1, sizeof(cstreamData)); if (s == NULL) @@ -98,3 +137,109 @@ cudaError_t cudaStreamSynchronize(cudaStream_t stream) { e->ev.numKernelsToWait = e->kernelQueue->waiting_count; MUTEX_UNLOCK(e->stream_lock); } + +cudaError_t cudaGetDeviceCount(int *count) { + // dummy value + *count = 1; +} + +cudaError_t cudaGetDeviceProperties(cudaDeviceProp 
*deviceProp, int device) { + + // dummy values + if (device == 0) { + strcpy(deviceProp->name, "pthread"); + deviceProp->totalGlobalMem = 0; + deviceProp->sharedMemPerBlock = 0; + deviceProp->regsPerBlock = 0; + deviceProp->warpSize = 0; + deviceProp->memPitch = 0; + deviceProp->maxThreadsPerBlock = 0; + deviceProp->maxThreadsDim[0] = 1; + deviceProp->maxThreadsDim[1] = 1; + deviceProp->maxThreadsDim[2] = 1; + + deviceProp->maxGridSize[0] = 1; + deviceProp->maxGridSize[1] = 1; + deviceProp->maxGridSize[2] = 1; + + deviceProp->totalConstMem = 0; + deviceProp->major = 0; + deviceProp->minor = 0; + deviceProp->clockRate = 0; + deviceProp->textureAlignment = 0; + deviceProp->deviceOverlap = false; + deviceProp->multiProcessorCount = 0; + } + return cudaSuccess; +} + +static cudaError_t lastError = cudaSuccess; +const char *cudaGetErrorString(cudaError_t error) { + if (error == cudaSuccess) { + return "Cuda Get Error Success"; + } +} + +cudaError_t cudaGetLastError(void) { return lastError; } + +static callParams callParamTemp; + +/* + Internal Cuda Library Functions +*/ +extern "C" { + +extern cudaError_t CUDARTAPI __cudaPopCallConfiguration(dim3 *gridDim, + dim3 *blockDim, + size_t *sharedMem, + void **stream) { + // printf("__cudaPopCallConfiguration: Grid: x:%d y:%d z:%d Block: %d, %d, %d + // ShMem: %lu\n", + // gridDim->x, gridDim->y, gridDim->z, blockDim->x, blockDim->y, blockDim->z, + // *sharedMem); + + *gridDim = callParamTemp.gridDim; + *blockDim = callParamTemp.blockDim; + *sharedMem = callParamTemp.shareMem; + *stream = callParamTemp.stream; + + // printf("__cudaPopCallConfiguration After : Grid: x:%d y:%d z:%d Block: %d, + // %d, %d ShMem: %lu\n", gridDim->x, gridDim->y, gridDim->z, blockDim->x, + // blockDim->y, blockDim->z, *sharedMem); + + // exit(1); + + return cudaSuccess; +} + +extern __host__ __device__ unsigned CUDARTAPI __cudaPushCallConfiguration( + dim3 gridDim, dim3 blockDim, size_t sharedMem = 0, void *stream = 0) { + + // 
printf("__cudaPushCallConfiguration Grid: x:%d y:%d z:%d Block: %d, %d, %d + // " + // "ShMem: %lu\n ", + // gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z, + // sharedMem); + + // memory checks allocations + callParamTemp.gridDim = gridDim; + + // std::cout << "assign gridDim" << std::endl; + + callParamTemp.blockDim = blockDim; + // std::cout << "assign blockDim" << std::endl; + callParamTemp.shareMem = sharedMem; + // std::cout << "assign shareMem" << std::endl; + (callParamTemp.stream) = stream; + + // printf("__cudaPushCallConfiguration After Grid: x:%d y:%d z:%d Block: %d, + // %d, %d ShMem: %lu\n", + // gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z, + // sharedMem); + + // return 0 continues the Pop + return cudaSuccess; + + // return ne 0 skips the Pop +} +} diff --git a/runtime/threadPool/include/api.h b/runtime/threadPool/include/api.h index 0c0aad6..2c2ac92 100644 --- a/runtime/threadPool/include/api.h +++ b/runtime/threadPool/include/api.h @@ -3,7 +3,8 @@ #include "structures.h" -cu_kernel *create_kernel(void (*wrap)(cu_argument *)); +cu_kernel *create_kernel(const void *func, dim3 gridDim, dim3 blockDim, + void **args, size_t sharedMem, cudaStream_t stream); int getWorkItem(struct kernel_queue **qu, cu_kernel *ker, struct argument *kernel_arg, int **blockId); int create_KernelQueue(kernel_queue **q); @@ -22,4 +23,7 @@ int set_kernel_arguments(cu_kernel **k, unsigned int arg_num, void **arg_value); int setKernelDimensions(cu_kernel *k, struct argument **arg, void **totalBlockSize, void *blockId); +int init_device(); +int cuLaunchKernel(cu_kernel **k); + #endif diff --git a/runtime/threadPool/include/structures.h b/runtime/threadPool/include/structures.h index d78a034..b274d63 100644 --- a/runtime/threadPool/include/structures.h +++ b/runtime/threadPool/include/structures.h @@ -1,8 +1,9 @@ #ifndef C_STRUCTURES_H #define C_STRUCTURES_H +#include "cuda_runtime.h" #include "pthread.h" -#define cudaStream_t 
cstreamData + typedef struct device { int max_compute_units; int device_id; @@ -27,6 +28,7 @@ typedef struct scheduler_pool { size_t idle_threads; pthread_cond_t wake_pool; + pthread_cond_t wake_host; int threadpool_shutdown_requested; @@ -85,17 +87,6 @@ typedef struct input_arg { // so that we can parse the arguments p } cu_input; -struct dim3 { - size_t x; - size_t y; - size_t z; - dim3(int d1) { - x = d1; - y = z = 1; - } - dim3() { x = y = z = 1; } -}; - enum StreamType { DEFAULT, LOW, @@ -146,7 +137,7 @@ typedef struct kernel { size_t shared_mem; - cstreamData *stream; + cudaStream_t stream; struct event *barrier; @@ -161,7 +152,8 @@ typedef struct kernel { // current blockId int blockId; - void *shared_mem_loc; + // execute multiple blocks per fetch + int gpu_block_to_execute_per_cpu_thread; } cu_kernel; @@ -188,4 +180,11 @@ typedef struct kernel_image_arg { unsigned int index; } k_arg; +typedef struct callParams { + dim3 gridDim; + dim3 blockDim; + size_t shareMem; + void *stream; +} callParams; + #endif // HEADER_FILE diff --git a/runtime/threadPool/lib/api.cpp b/runtime/threadPool/lib/api.cpp index 491d0d7..cd1c335 100644 --- a/runtime/threadPool/lib/api.cpp +++ b/runtime/threadPool/lib/api.cpp @@ -1,22 +1,31 @@ -#include -#include -#include - #include "api.h" #include "def.h" #include "macros.h" #include "structures.h" +#include +#include +#include +#include + +/* + + +*/ /* Initialize the device */ +int device_max_compute_units = 1; int init_device() { - cu_device *device = (cu_device *)calloc(1, sizeof(cu_device)); if (device == NULL) return C_ERROR_MEMALLOC; device->max_compute_units = std::thread::hardware_concurrency(); + std::cout << device->max_compute_units + << " concurrent threads are supported.\n"; + // device->max_compute_units = 64; + device_max_compute_units = device->max_compute_units; // initialize scheduler int ret = scheduler_init(*device); @@ -33,35 +42,34 @@ int init_device() { */ static int kernelIds = 0; cu_kernel 
*create_kernel(const void *func, dim3 gridDim, dim3 blockDim, - void ***args, size_t sharedMem, cstreamData *stream) { + void **args, size_t sharedMem, cudaStream_t stream) { cu_kernel *ker = (cu_kernel *)calloc(1, sizeof(cu_kernel)); // set the function pointer ker->start_routine = (void *(*)(void *))func; - // ker->start_routine(args); - ker->args = *args; + ker->args = args; + + // exit(1); ker->gridDim = gridDim; ker->blockDim = blockDim; ker->shared_mem = sharedMem; - // malloc shared memory dynamic (heap , needs to be on the stack) - // each thread create their own shared memory // after the task submission - ker->shared_mem_loc = calloc(1, sharedMem); - + // std::cout << "stream is null" << std::endl; ker->stream = stream; + // std::cout << "stream is null" << std::endl; ker->blockId = 0; - ker->totalBlocks = gridDim.x; + ker->totalBlocks = gridDim.x * gridDim.y * gridDim.z; - ker->N = blockDim.x; + ker->N = blockDim.x * blockDim.y * blockDim.z; ker->kernelId = kernelIds; kernelIds += 1; - ker->blockSize = blockDim.x; + ker->blockSize = blockDim.x * blockDim.y * blockDim.z; return ker; } @@ -107,6 +115,12 @@ int dequeKernelLL(struct kernel_queue **qu) { int enqueueKernel(struct kernel_queue **qu, cu_kernel **ker) { struct kernel_queue *q = *qu; cu_kernel *p = *ker; + // calculate gpu_block_to_execute_per_cpu_thread + p->gpu_block_to_execute_per_cpu_thread = + (p->totalBlocks + device_max_compute_units - 1) / + device_max_compute_units; + printf("total: %d execute per cpu: %d\n", p->totalBlocks, + p->gpu_block_to_execute_per_cpu_thread); if (q->head == NULL) { q->head = p; @@ -120,6 +134,12 @@ int enqueueKernel(struct kernel_queue **qu, cu_kernel **ker) { q->kernel_count += 1; q->waiting_count += 1; + // float** t1 = (float**)*(q->head->args + 0); + // printf("enqueueKernelTest Args 1: %p \n ", (void *) &t1); + // printf("enqueueKernel Test Args 1: %p \n ", (void *) *(q->head->args + 0)); + // float* t2 = *(t1); + // printf("enqueueKernel G Test Args: %p, 
val: %f\n ",(void *) &t2, *t2); + // user kernel command return C_SUCCESS; @@ -128,8 +148,23 @@ int enqueueKernel(struct kernel_queue **qu, cu_kernel **ker) { // scheduler static cu_pool *scheduler; +__thread int block_size = 0; +__thread int block_size_x = 0; +__thread int block_size_y = 0; +__thread int block_size_z = 0; +__thread int grid_size_x = 0; +__thread int grid_size_y = 0; +__thread int grid_size_z = 0; __thread int block_index = 0; +__thread int block_index_x = 0; +__thread int block_index_y = 0; +__thread int block_index_z = 0; __thread int thread_memory_size = 0; +__thread int *dynamic_shared_memory = NULL; +__thread int warp_shfl[32] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; /* Enqueue Kernel (k) to the scheduler kernelQueue @@ -139,6 +174,11 @@ int schedulerEnqueueKernel(cu_kernel **k) { MUTEX_LOCK(scheduler->work_queue_lock); enqueueKernel(&scheduler->kernelQueue, &ker); + // float** t1 = (float**)*(ker->args + 0); + // printf("scheduler enqueue Test Args 1: %p \n ", (void *) &t1); + // printf("scheduler enqueue Test Args 1: %p \n ", (void *) *(ker->args + 0)); + // float* t2 = *(t1); + // printf("scheduler enqueue G Test Args: %p, val: %f\n ",(void *) &t2, *t2); pthread_cond_broadcast(&(scheduler->wake_pool)); MUTEX_UNLOCK(scheduler->work_queue_lock); @@ -148,7 +188,9 @@ int schedulerEnqueueKernel(cu_kernel **k) { Kernel Launch with numBlocks and numThreadsPerBlock */ int cuLaunchKernel(cu_kernel **k) { - + if (!scheduler) { + init_device(); + } // Calculate Block Size N/numBlocks cu_kernel *ker = *k; @@ -157,11 +199,14 @@ int cuLaunchKernel(cu_kernel **k) { MUTEX_LOCK(scheduler->work_queue_lock); scheduler->num_kernel_queued += 1; MUTEX_UNLOCK(scheduler->work_queue_lock); - // stream == 0 add to the kernelQueue if (ker->stream == 0) { - - schedulerEnqueueKernel(&ker); + // float** t1 = (float**)*(ker->args + 0); + // printf("cuLaunchKernel Test Args 1: %p \n ", (void *) &t1); + // 
printf("cuLaunchKernel Test Args 1: %p \n ", (void *) *(ker->args + 0)); + // float* t2 = *(t1); + // printf("cuLaunchkernel G Test Args: %p, val: %f\n ",(void *) &t2, *t2); + schedulerEnqueueKernel(k); } else { // add to it's stream queue // stream queue can be waiting or running with or without tasks @@ -173,6 +218,7 @@ int cuLaunchKernel(cu_kernel **k) { cstreamData *e = ((cstreamData *)(ker->stream)); // synchronized is called after no job in the queue so stream is stuck on // synchronize + // printf("this way sync\n"); if (e->ev.status == C_SYNCHRONIZE) { if ((e->kernelQueue->finish_count) == (e->kernelQueue->kernel_count)) { e->ev.status = C_RUN; @@ -183,10 +229,11 @@ int cuLaunchKernel(cu_kernel **k) { // change the status to wait e->ev.status == C_WAIT; MUTEX_UNLOCK(((cstreamData *)(ker->stream))->stream_lock); - + // printf("this way enqueue\n"); schedulerEnqueueKernel(&ker); } else { // the status of stream queue is wait so just enqueue to the stream + // printf("this way enqwlijs\n"); enqueueKernel(&((cstreamData *)(ker->stream))->kernelQueue, &ker); MUTEX_UNLOCK(((cstreamData *)(ker->stream))->stream_lock); } @@ -220,20 +267,25 @@ int getWorkItem(struct kernel_queue **qu, cu_kernel **kern, int blockId) { Thread Gets Work */ int get_work(c_thread *th) { - cu_kernel ker; + // std::cout << "Before Get Work Mutex Queue" << std::endl; MUTEX_LOCK(scheduler->work_queue_lock); + // std::cout << "After Get Work Mutex Queue" << std::endl; RETRY: int is_exit = 0; int is_command_not_null = 0; + int block_to_execute = 256; int blockId; int localBlockSize; int status; int completion_status = 0; + int dynamic_shared_mem_size = 0; + dim3 gridDim; + dim3 blockDim; is_exit = scheduler->threadpool_shutdown_requested; @@ -245,15 +297,27 @@ RETRY: // if kernel waiting to be complete is not zero if (scheduler->kernelQueue->waiting_count > 0) { + // std::cout << "Waiting Count is greater than 0" << std::endl; + blockId = scheduler->kernelQueue->head->blockId; + + gridDim = 
scheduler->kernelQueue->head->gridDim; + blockDim = scheduler->kernelQueue->head->blockDim; + dynamic_shared_mem_size = scheduler->kernelQueue->head->shared_mem; + + // std::cout << "Block ID: " << blockId << std::endl; localBlockSize = scheduler->kernelQueue->head->blockSize; // set status as success fully queue status = C_SUCCESS; ker = *(scheduler->kernelQueue->head); + + block_to_execute = + scheduler->kernelQueue->head->gpu_block_to_execute_per_cpu_thread; // if the blockId + 1 is equal to the goal block size , // then its the last block - - if (blockId + 1 == scheduler->kernelQueue->head->totalBlocks) { + if (blockId + block_to_execute >= + scheduler->kernelQueue->head->totalBlocks) { + block_to_execute = scheduler->kernelQueue->head->totalBlocks - blockId; // deque the head dequeKernelLL(&scheduler->kernelQueue); @@ -262,7 +326,7 @@ RETRY: } else { // increment the blockId scheduler->kernelQueue->head->blockId = - scheduler->kernelQueue->head->blockId + 1; + scheduler->kernelQueue->head->blockId + block_to_execute; } // status = getWorkItem(&scheduler->kernelQueue, &ker, blockId); } else { @@ -272,10 +336,25 @@ RETRY: } if (status != C_QUEUE_EMPTY) { - - block_index = blockId; - thread_memory_size = ker.shared_mem; - ker.start_routine(ker.args); + // set TLS + for (int s = 0; s < block_to_execute; s++) { + block_index = blockId + s; + block_size = localBlockSize; + block_size_x = blockDim.x; + block_size_y = blockDim.y; + block_size_z = blockDim.z; + grid_size_x = gridDim.x; + grid_size_y = gridDim.y; + grid_size_z = gridDim.z; + dynamic_shared_memory = (int *)malloc(dynamic_shared_mem_size); + int tmp = block_index; + block_index_x = tmp / (grid_size_y * grid_size_z); + tmp = tmp % (grid_size_y * grid_size_z); + block_index_y = tmp / (grid_size_z); + tmp = tmp % (grid_size_z); + block_index_z = tmp; + ker.start_routine(ker.args); + } is_command_not_null = 1; if (ker.status == C_COMPLETE) { @@ -293,8 +372,6 @@ RETRY: ((cstreamData 
*)(ker.stream))->ev.numKernelsToWait -= 1; } - MUTEX_LOCK(((cstreamData *)(ker.stream))->stream_lock); - if (((cstreamData *)(ker.stream))->ev.status == C_SYNCHRONIZE) { // synchronize stream if (((cstreamData *)(ker.stream))->ev.numKernelsToWait > 0) { @@ -311,7 +388,6 @@ RETRY: if (((cstreamData *)(ker.stream))->kernelQueue->waiting_count > 0) { ((cstreamData *)(ker.stream))->ev.status = C_WAIT; - MUTEX_UNLOCK(((cstreamData *)(ker.stream))->stream_lock); cu_kernel *kern = ((cstreamData *)(ker.stream))->kernelQueue->head; schedulerEnqueueKernel(&kern); @@ -321,11 +397,10 @@ RETRY: // switch the stream to run to allow for the next execution ((cstreamData *)(ker.stream))->ev.status = C_RUN; - - MUTEX_UNLOCK(((cstreamData *)(ker.stream))->stream_lock); } } } + MUTEX_UNLOCK(((cstreamData *)(ker.stream))->stream_lock); } MUTEX_LOCK(scheduler->work_queue_lock); scheduler->num_kernel_finished += 1; @@ -338,6 +413,9 @@ RETRY: if ((is_exit == 0 && is_command_not_null == 0)) { // all threads in condition wait scheduler->idle_threads += 1; + if (scheduler->idle_threads == scheduler->num_worker_threads) { + pthread_cond_broadcast(&(scheduler->wake_host)); + } pthread_cond_wait(&(scheduler->wake_pool), &(scheduler->work_queue_lock)); scheduler->idle_threads -= 1; goto RETRY; @@ -351,7 +429,6 @@ void *driver_thread(void *p) { struct c_thread *td = (struct c_thread *)p; int is_exit = 0; td->exit = false; - while (1) { // get work is_exit = get_work(td); @@ -369,9 +446,9 @@ void *driver_thread(void *p) { Initialize the scheduler */ int scheduler_init(cu_device device) { - scheduler = (cu_pool *)calloc(1, sizeof(cu_pool)); scheduler->num_worker_threads = device.max_compute_units; + scheduler->num_kernel_queued = 0; scheduler->thread_pool = (struct c_thread *)calloc( scheduler->num_worker_threads, sizeof(c_thread)); @@ -381,8 +458,8 @@ int scheduler_init(cu_device device) { INIT_LOCK(scheduler->work_queue_lock); pthread_cond_init(&scheduler->wake_pool, NULL); + 
pthread_cond_init(&scheduler->wake_host, NULL); scheduler->idle_threads = 0; - for (int i = 0; i < scheduler->num_worker_threads; i++) { scheduler->thread_pool[i].index = i; pthread_create(&scheduler->thread_pool[i].thread, NULL, driver_thread, @@ -412,6 +489,7 @@ void scheduler_uninit() { pthread_mutex_destroy(&scheduler->work_queue_lock); pthread_cond_destroy(&scheduler->wake_pool); + pthread_cond_destroy(&scheduler->wake_host); scheduler->threadpool_shutdown_requested = 0; } @@ -442,15 +520,13 @@ AGAIN: Counting Barrier basically */ void cuSynchronizeBarrier() { -AGAIN: - + // std::cout << "cuSynchronizeBarrier" << std::endl; MUTEX_LOCK(scheduler->work_queue_lock); if (scheduler->num_kernel_finished != scheduler->num_kernel_queued || scheduler->idle_threads != scheduler->num_worker_threads) { - MUTEX_UNLOCK(scheduler->work_queue_lock); - goto AGAIN; - } else { - MUTEX_UNLOCK(scheduler->work_queue_lock); + // scheduler->idle_threads, scheduler->num_worker_threads); + pthread_cond_wait(&(scheduler->wake_host), &(scheduler->work_queue_lock)); } + MUTEX_UNLOCK(scheduler->work_queue_lock); }