diff --git a/compilation/KernelTranslation/src/x86/insert_warp_loop.cpp b/compilation/KernelTranslation/src/x86/insert_warp_loop.cpp
index ec2bcab..f86adf6 100644
--- a/compilation/KernelTranslation/src/x86/insert_warp_loop.cpp
+++ b/compilation/KernelTranslation/src/x86/insert_warp_loop.cpp
@@ -8,9 +8,12 @@
 #include <set>
 
 #include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Function.h"
@@ -23,9 +26,12 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/ValueSymbolTable.h"
 #include "llvm/InitializePasses.h"
+#include "llvm/MC/TargetRegistry.h"
 #include "llvm/PassInfo.h"
 #include "llvm/PassRegistry.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
 #include "llvm/Transforms/IPO/PassManagerBuilder.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Cloning.h"
@@ -85,7 +91,6 @@ bool ShouldNotBeContextSaved(llvm::Instruction *instr) {
     if (load_addr == M->getGlobalVariable("warp_vote"))
       return true;
   }
-
   // TODO: we should further analyze whether the local variable
   // is same among all threads within a wrap
   return false;
@@ -314,7 +319,8 @@ void handle_alloc(llvm::Function *F) {
   }
 }
 
-void handle_local_variable_intra_warp(std::vector<ParallelRegion> PRs) {
+void handle_local_variable_intra_warp(std::vector<ParallelRegion> PRs,
+                                      DivergenceInfo &DI) {
   bool intra_warp_loop = 1;
   // we should handle allocation generated by PHI
   {
@@ -324,6 +330,24 @@ void handle_local_variable_intra_warp(std::vector<ParallelRegion> PRs) {
       for (auto ii = bb->begin(); ii != bb->end(); ii++) {
         if (isa<AllocaInst>(&(*ii))) {
           auto alloc = dyn_cast<AllocaInst>(&(*ii));
+          // If every value stored to this alloca is non-divergent, there is
+          // no need to replicate it.
+          bool allStoreNonDivergence = true;
+          for (Instruction::use_iterator ui = alloc->use_begin(),
+                                         ue = alloc->use_end();
+               ui != ue; ++ui) {
+            llvm::Instruction *user = dyn_cast<Instruction>(ui->getUser());
+            if (isa<StoreInst>(user)) {
+              auto storeVar = user->getOperand(0);
+              if (DI.isDivergent(*storeVar)) {
+                allStoreNonDivergence = false;
+                break;
+              }
+            }
+          }
+          if (allStoreNonDivergence) {
+            continue;
+          }
           // Do not duplicate var used outside PRs
           bool used_in_non_PR = false;
           for (Instruction::use_iterator ui = alloc->use_begin(),
@@ -595,8 +619,6 @@ class InsertWarpLoopPass : public llvm::FunctionPass {
 public:
   static char ID;
   bool intra_warp_loop;
-  DominatorTree *DT;
-  PostDominatorTree *PDT;
 
   InsertWarpLoopPass(bool intra_warp = 0)
       : FunctionPass(ID), intra_warp_loop(intra_warp) {}
@@ -604,6 +626,8 @@ public:
   virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const {
     AU.addRequired<DominatorTreeWrapperPass>();
     AU.addRequired<PostDominatorTreeWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>(); // LoopInfo is handed to the DivergenceInfo constructor
+    AU.addRequired<TargetTransformInfoWrapperPass>(); // NOTE(review): the TTI used for DivergenceInfo is built from an NVPTX TargetMachine, not fetched from this pass -- confirm this requirement is needed
   }
 
   void getParallelRegionBefore(llvm::BasicBlock *B, bool intra_warp_loop,
@@ -789,8 +813,22 @@ public:
     tempInstructionIds.clear();
     tempInstructionIndex = 0;
 
-    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-    PDT = &getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+    // Build DivergenceInfo for F (needs DT, PDT, LoopInfo and an NVPTX TTI)
+    auto DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+    auto PDT = &getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+    auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+    llvm::Triple triple("nvptx64-nvidia-cuda");
+    std::string Error;
+    const Target *TheTarget = TargetRegistry::lookupTarget("", triple, Error);
+    llvm::TargetOptions Options;
+    llvm::TargetMachine *target_machine = TheTarget->createTargetMachine(
+        triple.getTriple(), "sm_35", "+ptx50", Options, llvm::Reloc::Static,
+        llvm::CodeModel::Small, llvm::CodeGenOpt::Aggressive);
+
+    llvm::FunctionAnalysisManager DummyFAM;
+    llvm::TargetTransformInfo TTI =
+        target_machine->getTargetIRAnalysis().run(F, DummyFAM);
+    DivergenceInfo DI(F, *DT, *PDT, LI, TTI, /*KnownReducible*/ true);
 
     // find parallel region we need to wrap
     auto parallel_regions = getParallelRegions(&F, intra_warp_loop);
@@ -800,7 +838,7 @@ public:
 #endif
 
     if (intra_warp_loop) {
-      handle_local_variable_intra_warp(parallel_regions);
+      handle_local_variable_intra_warp(parallel_regions, DI);
     }
     add_warp_loop(parallel_regions, intra_warp_loop);
     remove_barrier(&F, intra_warp_loop);