diff --git a/compilation/KernelTranslation/src/x86/insert_warp_loop.cpp b/compilation/KernelTranslation/src/x86/insert_warp_loop.cpp index ec2bcab..f86adf6 100644 --- a/compilation/KernelTranslation/src/x86/insert_warp_loop.cpp +++ b/compilation/KernelTranslation/src/x86/insert_warp_loop.cpp @@ -8,9 +8,12 @@ #include #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/Triple.h" +#include "llvm/Analysis/DivergenceAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/PostDominators.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/CFG.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" @@ -23,9 +26,12 @@ #include "llvm/IR/Module.h" #include "llvm/IR/ValueSymbolTable.h" #include "llvm/InitializePasses.h" +#include "llvm/MC/TargetRegistry.h" #include "llvm/PassInfo.h" #include "llvm/PassRegistry.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/IPO/PassManagerBuilder.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" @@ -85,7 +91,6 @@ bool ShouldNotBeContextSaved(llvm::Instruction *instr) { if (load_addr == M->getGlobalVariable("warp_vote")) return true; } - // TODO: we should further analyze whether the local variable // is same among all threads within a wrap return false; @@ -314,7 +319,8 @@ void handle_alloc(llvm::Function *F) { } } -void handle_local_variable_intra_warp(std::vector PRs) { +void handle_local_variable_intra_warp(std::vector PRs, + DivergenceInfo &DI) { bool intra_warp_loop = 1; // we should handle allocation generated by PHI { @@ -324,6 +330,24 @@ void handle_local_variable_intra_warp(std::vector PRs) { for (auto ii = bb->begin(); ii != bb->end(); ii++) { if (isa(&(*ii))) { auto alloc = dyn_cast(&(*ii)); + // if this alloc's write are all non-divergence, then no need to + // replicate + bool allStoreNonDivergence = true; + for (Instruction::use_iterator ui = alloc->use_begin(), + ue = alloc->use_end(); + ui != ue; ++ui) { + llvm::Instruction *user = dyn_cast(ui->getUser()); + if (isa(user)) { + auto storeVar = user->getOperand(0); + if (DI.isDivergent(*storeVar)) { + allStoreNonDivergence = false; + break; + } + } + } + if (allStoreNonDivergence) { + continue; + } // Do not duplicate var used outside PRs bool used_in_non_PR = false; for (Instruction::use_iterator ui = alloc->use_begin(), @@ -595,8 +619,6 @@ class InsertWarpLoopPass : public llvm::FunctionPass { public: static char ID; bool intra_warp_loop; - DominatorTree *DT; - PostDominatorTree *PDT; InsertWarpLoopPass(bool intra_warp = 0) : FunctionPass(ID), intra_warp_loop(intra_warp) {} @@ -604,6 +626,8 @@ public: virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const { AU.addRequired(); AU.addRequired(); + AU.addRequired(); + AU.addRequired(); } void getParallelRegionBefore(llvm::BasicBlock *B, bool intra_warp_loop, @@ -789,8 +813,22 @@ public: tempInstructionIds.clear(); tempInstructionIndex = 0; - DT = &getAnalysis().getDomTree(); - PDT = &getAnalysis().getPostDomTree(); + // get DivergenceInfo + auto DT = &getAnalysis().getDomTree(); + auto PDT = &getAnalysis().getPostDomTree(); + auto &LI = getAnalysis().getLoopInfo(); + llvm::Triple triple("nvptx64-nvidia-cuda"); + std::string Error; + const Target *TheTarget = TargetRegistry::lookupTarget("", triple, Error); + llvm::TargetOptions Options; + llvm::TargetMachine *target_machine = TheTarget->createTargetMachine( + triple.getTriple(), "sm_35", "+ptx50", Options, llvm::Reloc::Static, + llvm::CodeModel::Small, llvm::CodeGenOpt::Aggressive); + + llvm::FunctionAnalysisManager DummyFAM; + llvm::TargetTransformInfo TTI = + target_machine->getTargetIRAnalysis().run(F, DummyFAM); + DivergenceInfo DI(F, *DT, *PDT, LI, TTI, /*KnownReducible*/ true); // find parallel region we need to wrap auto parallel_regions = getParallelRegions(&F, intra_warp_loop); @@ -800,7 +838,7 @@ public: #endif if (intra_warp_loop) { - handle_local_variable_intra_warp(parallel_regions); + handle_local_variable_intra_warp(parallel_regions, DI); } add_warp_loop(parallel_regions, intra_warp_loop); remove_barrier(&F, intra_warp_loop);