diff --git a/src/Conversion/ONNXToKrnl/CMakeLists.txt b/src/Conversion/ONNXToKrnl/CMakeLists.txt
index bab2dd3..6b6f799 100644
--- a/src/Conversion/ONNXToKrnl/CMakeLists.txt
+++ b/src/Conversion/ONNXToKrnl/CMakeLists.txt
@@ -25,6 +25,7 @@ add_library(OMONNXToKrnl
         Tensor/Split.cpp
         Tensor/Gather.cpp
         Tensor/Size.cpp
+        Tensor/Tile.cpp
         ConvertONNXToKrnl.cpp)
 target_link_libraries(OMONNXToKrnl
         onnx)
diff --git a/src/Conversion/ONNXToKrnl/ConvertONNXToKrnl.cpp b/src/Conversion/ONNXToKrnl/ConvertONNXToKrnl.cpp
index a559c40..de4f9c5 100644
--- a/src/Conversion/ONNXToKrnl/ConvertONNXToKrnl.cpp
+++ b/src/Conversion/ONNXToKrnl/ConvertONNXToKrnl.cpp
@@ -104,6 +104,7 @@ void FrontendToKrnlLoweringPass::runOnOperation() {
   populateLoweringONNXSqueezeOpPattern(patterns, &getContext());
   populateLoweringONNXSplitOpPattern(patterns, &getContext());
   populateLoweringONNXSizeOpPattern(patterns, &getContext());
+  populateLoweringONNXTileOpPattern(patterns, &getContext());
   // Neural network
   populateLoweringONNXConvOpPattern(patterns, &getContext());
   populateLoweringONNXNormalizationOpPattern(patterns, &getContext());
diff --git a/src/Conversion/ONNXToKrnl/ONNXToKrnlCommon.hpp b/src/Conversion/ONNXToKrnl/ONNXToKrnlCommon.hpp
index aa3d848..6045abb 100644
--- a/src/Conversion/ONNXToKrnl/ONNXToKrnlCommon.hpp
+++ b/src/Conversion/ONNXToKrnl/ONNXToKrnlCommon.hpp
@@ -256,6 +256,9 @@ void populateLoweringONNXSplitOpPattern(
 void populateLoweringONNXSizeOpPattern(
     OwningRewritePatternList &patterns, MLIRContext *ctx);
 
+void populateLoweringONNXTileOpPattern(
+    OwningRewritePatternList &patterns, MLIRContext *ctx);
+
 bool checkOpResultIsUsedByGetRef(AllocOp *allocOp);
 
 int64_t getMemRefSizeInBytes(Value val);
diff --git a/src/Conversion/ONNXToKrnl/Tensor/Tile.cpp b/src/Conversion/ONNXToKrnl/Tensor/Tile.cpp
new file mode 100644
index 0000000..b2883f2
--- /dev/null
+++ b/src/Conversion/ONNXToKrnl/Tensor/Tile.cpp
@@ -0,0 +1,228 @@
+//===----------------Tile.cpp - Lowering Tile Op--------------------------===//
+//
+// Copyright 2020 The IBM Research Authors.
+//
+// =============================================================================
+//
+// This file lowers the ONNX Tile Operator to Krnl dialect.
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/Conversion/ONNXToKrnl/ONNXToKrnlCommon.hpp"
+
+using namespace mlir;
+
+//===----------------------------------------------------------------------===//
+// Helper function to insert alloc and dealloc ops for memref of dynamic shape.
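+// Each output dimension is computed as dim(input, i) * repeats[i]; for
+// example, a 4x8 input tiled with repeats = [3, 2] allocates a 12x16 buffer.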
+//
+
+Value insertAllocAndDeallocForTile(MemRefType memRefType, Location loc,
+    ConversionPatternRewriter &rewriter, bool insertDealloc, Value inputOperand,
+    Value repeatsOperand) {
+  AllocOp alloc;
+  auto inputShape = inputOperand.getType().cast<MemRefType>().getShape();
+  auto inputRank = inputShape.size();
+
+  SmallVector<Value, 4> allocOperands;
+  for (int i = 0; i < inputRank; ++i) {
+    // Load repeats[i] and multiply it with the i-th input dimension.
+    auto indexVal = emitConstantOp(rewriter, loc, rewriter.getIndexType(), i);
+    SmallVector<Value, 1> repeatsMemRefVal = {indexVal};
+    auto repeatsLoadVal =
+        rewriter.create<AffineLoadOp>(loc, repeatsOperand, repeatsMemRefVal);
+    auto repeatsElementVal = rewriter.create<IndexCastOp>(
+        loc, repeatsLoadVal, rewriter.getIndexType());
+    auto dimVal = rewriter.create<DimOp>(loc, inputOperand, i);
+    Value allocDimVal = rewriter.create<MulIOp>(loc, dimVal, repeatsElementVal);
+    allocOperands.emplace_back(allocDimVal);
+  }
+  alloc = rewriter.create<AllocOp>(loc, memRefType, allocOperands);
+  if (insertDealloc) {
+    auto *parentBlock = alloc.getOperation()->getBlock();
+    auto dealloc = rewriter.create<DeallocOp>(loc, alloc);
+    dealloc.getOperation()->moveBefore(&parentBlock->back());
+  }
+  return alloc;
+}
+
+struct ONNXTileOpLowering : public ConversionPattern {
+  ONNXTileOpLowering(MLIRContext *ctx)
+      : ConversionPattern(mlir::ONNXTileOp::getOperationName(), 1, ctx) {}
+
+  LogicalResult matchAndRewrite(Operation *op, ArrayRef<Value> operands,
+      ConversionPatternRewriter &rewriter) const final {
+    ONNXTileOpAdaptor operandAdaptor(operands);
+    ONNXTileOp tileOp = llvm::cast<ONNXTileOp>(op);
+    auto loc = op->getLoc();
+
+    // Get input operands, shapes, and rank.
+    Value input = operandAdaptor.input();
+    auto inputMemRefType = input.getType().cast<MemRefType>();
+    auto inputShape = inputMemRefType.getShape();
+    int64_t inputRank = inputShape.size();
+    Value repeats = operandAdaptor.repeats();
+
+    // Get output info.
+    auto resultOperand = tileOp.output();
+    auto outputMemRefType = convertToMemRefType(*op->result_type_begin());
+    auto outputMemRefShape = outputMemRefType.getShape();
+    int64_t outputRank = outputMemRefShape.size();
+
+    bool insertDealloc = checkInsertDealloc(op);
+    Value alloc;
+    if (hasAllConstantDimensions(outputMemRefType))
+      alloc =
+          insertAllocAndDealloc(outputMemRefType, loc, rewriter, insertDealloc);
+    else
+      alloc = insertAllocAndDeallocForTile(
+          outputMemRefType, loc, rewriter, insertDealloc, input, repeats);
+
+    // Define loops and iteration trip counts (equivalent to the size of the
+    // output).
+    std::vector<Value> originalLoops;
+    defineLoops(rewriter, loc, originalLoops, outputRank);
+    KrnlIterateOperandPack pack(rewriter, originalLoops);
+    for (int ii = 0; ii < outputRank; ++ii)
+      addDimensionToPack(rewriter, loc, pack, alloc, ii);
+
+    // Create the loops.
+    auto iterateOp = rewriter.create<KrnlIterateOp>(loc, pack);
+    Block &iterationBlock = iterateOp.bodyRegion().front();
+
+    // Now perform the insertions into the body of the just generated loops.
+    // Insert instructions inside the KrnlIterateOp body.
+    rewriter.setInsertionPointToStart(&iterationBlock);
+
+    // Handle the operations.
+
+    // This implementation iterates over the output tensor, so the store uses a
+    // simple affine subscript expression. An alternative implementation is to
+    // iterate over the input tensor and the repeats; the loaded input element
+    // could then be reused explicitly, but the subscript of the store would
+    // not be contiguous, and may not even be affine.
+    // The alternative implementation can be found at the end of this file.
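+    // For every output dimension i, the corresponding input index is
+    // output_index mod dim(input, i), i.e. the affine map
+    // (d0)[s0] -> (d0 mod s0) built below. For example, with
+    // dim(input, i) = 4, output indices 0..9 read input indices
+    // 0, 1, 2, 3, 0, 1, 2, 3, 0, 1.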
+    SmallVector<Value, 4> inputMemRefVal;
+    for (int i = 0; i < outputRank; ++i) {
+      auto indexAE = rewriter.getAffineDimExpr(0);
+      auto offsetAE = rewriter.getAffineSymbolExpr(0);
+      auto dimMap = AffineMap::get(1, 1, indexAE % offsetAE);
+
+      auto inputDimSizeVal = rewriter.create<DimOp>(loc, input, i);
+      auto loopVarVal = iterationBlock.getArguments()[i];
+      auto exprVal = rewriter.create<AffineApplyOp>(
+          loc, dimMap, ArrayRef<Value>{loopVarVal, inputDimSizeVal});
+      inputMemRefVal.emplace_back(exprVal);
+    }
+
+    // Load the value from the input.
+    // An affine load is used when the input has a constant shape.
+    Value inputVal;
+    if (hasAllConstantDimensions(inputMemRefType))
+      inputVal = rewriter.create<AffineLoadOp>(loc, input, inputMemRefVal);
+    else
+      inputVal = rewriter.create<LoadOp>(loc, input, inputMemRefVal);
+    SmallVector<Value, 4> outputMemRefVal(iterationBlock.getArguments().begin(),
+        iterationBlock.getArguments().end());
+
+    // Then store the value in the output.
+    rewriter.create<AffineStoreOp>(loc, inputVal, alloc, outputMemRefVal);
+
+    rewriter.replaceOp(op, alloc);
+
+    return success();
+  }
+};
+
+// This is the alternative way of lowering.
+// It is kept here for the record in case this implementation is needed later.
+struct ONNXTileOpLoweringAlternative : public ConversionPattern {
+  ONNXTileOpLoweringAlternative(MLIRContext *ctx)
+      : ConversionPattern(mlir::ONNXTileOp::getOperationName(), 1, ctx) {}
+
+  LogicalResult matchAndRewrite(Operation *op, ArrayRef<Value> operands,
+      ConversionPatternRewriter &rewriter) const final {
+    ONNXTileOpAdaptor operandAdaptor(operands);
+    ONNXTileOp tileOp = llvm::cast<ONNXTileOp>(op);
+    auto loc = op->getLoc();
+
+    // Get input operands, shapes, and rank.
+    Value input = operandAdaptor.input();
+    auto inputShape = input.getType().cast<MemRefType>().getShape();
+    int64_t inputRank = inputShape.size();
+    Value repeats = operandAdaptor.repeats();
+
+    // Get output info.
+    auto resultOperand = tileOp.output();
+    auto outputMemRefType = convertToMemRefType(*op->result_type_begin());
+    auto outputMemRefShape = outputMemRefType.getShape();
+    int64_t outputRank = outputMemRefShape.size();
+
+    bool insertDealloc = checkInsertDealloc(op);
+    Value alloc;
+    if (hasAllConstantDimensions(outputMemRefType))
+      alloc =
+          insertAllocAndDealloc(outputMemRefType, loc, rewriter, insertDealloc);
+    else
+      alloc = insertAllocAndDeallocForTile(
+          outputMemRefType, loc, rewriter, insertDealloc, input, repeats);
+
+    // Define loops and iteration trip counts (equivalent to the size of the
+    // output). Each output dimension gets a pair of loops: one over the input
+    // dimension and one over the corresponding repeats value.
+    std::vector<Value> originalLoops;
+    defineLoops(rewriter, loc, originalLoops, outputRank * 2);
+    KrnlIterateOperandPack pack(rewriter, originalLoops);
+    for (int ii = 0; ii < outputRank; ++ii) {
+      addDimensionToPack(rewriter, loc, pack, input, ii);
+      pack.pushConstantBound(0);
+      auto indexVal =
+          emitConstantOp(rewriter, loc, rewriter.getIndexType(), ii);
+      SmallVector<Value, 1> repeatsMemRefVal = {indexVal};
+      auto repeatsLoadVal =
+          rewriter.create<AffineLoadOp>(loc, repeats, repeatsMemRefVal);
+      auto repeatsElementVal = rewriter.create<IndexCastOp>(
+          loc, repeatsLoadVal, rewriter.getIndexType());
+      pack.pushOperandBound(repeatsElementVal);
+    }
+
+    // Create the loops.
+    auto iterateOp = rewriter.create<KrnlIterateOp>(loc, pack);
+    Block &iterationBlock = iterateOp.bodyRegion().front();
+
+    // Now perform the insertions into the body of the just generated loops.
+    // Insert instructions inside the KrnlIterateOp body.
+    rewriter.setInsertionPointToStart(&iterationBlock);
+
+    // Handle the operations.
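+
+    // For each dimension i, the output subscript is rebuilt from the loop
+    // pair as dim(input, i) * repeat_index + input_index, i.e. the affine map
+    // (d0, d1)[s0] -> (s0 * d1 + d0) built below. For example, with
+    // dim(input, i) = 4, repeat_index = 2, and input_index = 1, the element
+    // is stored at output index 9.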
+
+    SmallVector<Value, 4> inputMemRefVal;
+    for (int j = 0; j < inputRank; ++j) {
+      inputMemRefVal.emplace_back(iterationBlock.getArguments()[j * 2]);
+    }
+
+    SmallVector<Value, 4> outputMemRefVal;
+    for (int i = 0; i < inputRank; ++i) {
+      auto inputIndexAE = rewriter.getAffineDimExpr(0);
+      auto repeatsIndexAE = rewriter.getAffineDimExpr(1);
+      auto inputDimAE = rewriter.getAffineSymbolExpr(0);
+
+      auto dimMap =
+          AffineMap::get(2, 1, inputDimAE * repeatsIndexAE + inputIndexAE);
+
+      auto inputDimSizeVal = rewriter.create<DimOp>(loc, input, i);
+
+      auto dimExprVal = rewriter.create<AffineApplyOp>(loc, dimMap,
+          ArrayRef<Value>{iterationBlock.getArguments()[2 * i],
+              iterationBlock.getArguments()[2 * i + 1], inputDimSizeVal});
+      outputMemRefVal.emplace_back(dimExprVal);
+    }
+
+    auto inputVal = rewriter.create<LoadOp>(loc, input, inputMemRefVal);
+    rewriter.create<StoreOp>(loc, inputVal, alloc, outputMemRefVal);
+
+    rewriter.replaceOp(op, alloc);
+
+    return success();
+  }
+};
+
+void populateLoweringONNXTileOpPattern(
+    OwningRewritePatternList &patterns, MLIRContext *ctx) {
+  patterns.insert<ONNXTileOpLowering>(ctx);
+}
diff --git a/test/backend/test.py b/test/backend/test.py
index 817e788..51a9224 100644
--- a/test/backend/test.py
+++ b/test/backend/test.py
@@ -419,7 +419,11 @@ test_to_enable = [
     "test_split_variable_parts_1d_cpu",
     "test_split_variable_parts_2d_cpu",
     "test_split_variable_parts_default_axis_cpu",
-
+
+    # Tile
+    "test_tile_cpu",
+    "test_tile_precomputed_cpu",
+
     # ConstantOfShape
     "test_constantofshape_float_ones_cpu",
diff --git a/test/mlir/onnx/onnx_lowering.mlir b/test/mlir/onnx/onnx_lowering.mlir
index de3511d..6b437a5 100644
--- a/test/mlir/onnx/onnx_lowering.mlir
+++ b/test/mlir/onnx/onnx_lowering.mlir
@@ -2297,3 +2297,78 @@ func @test_constant_of_shape_static_dims() -> tensor<*xf32> {
   // CHECK: }
   // CHECK: return [[RES]] : memref<3x4x5xf32>
 }
+
+// -----
+
+// Test Tile with 2D input and constant repeats
+func @test_tile1(%arg0 : tensor<4x8xf32>) -> tensor<*xf32> {
+  %0 = "onnx.Constant"() { value = dense<[3, 2]> : tensor<2xi64>} : () -> tensor<2xi64>
+  %1 = "onnx.Tile"(%arg0, %0) : (tensor<4x8xf32>, tensor<2xi64>) -> tensor<*xf32>
+  return %1 : tensor<*xf32>
+  // CHECK: [[INDEX_MAP:#.+]] = affine_map<(d0)[s0] -> (d0 mod s0)>
+  // CHECK-LABEL: test_tile1
+  // CHECK: [[R0:%.+]] = alloc() : memref<12x16xf32>
+  // CHECK: [[R1:%.+]] = "krnl.global"() {name = "constant_0", shape = [2], value = dense<[3, 2]> : tensor<2xi64>} : () -> memref<2xi64>
+  // CHECK: [[R2:%.+]]:2 = krnl.define_loops 2
+  // CHECK: krnl.iterate([[R2]]#0, [[R2]]#1) with ([[R2]]#0 -> [[ARG1:%.+]] = 0 to 12, [[R2]]#1 -> [[ARG2:%.+]] = 0 to 16) {
+  // CHECK: [[C0:%.+]] = constant 0 : index
+  // CHECK: [[R3:%.+]] = dim %arg0, [[C0]] : memref<4x8xf32>
+  // CHECK: [[R4:%.+]] = affine.apply [[INDEX_MAP]]([[ARG1]]){{\[}}[[R3]]{{\]}}
+  // CHECK: [[C1:%.+]] = constant 1 : index
+  // CHECK: [[R5:%.+]] = dim %arg0, [[C1]] : memref<4x8xf32>
+  // CHECK: [[R6:%.+]] = affine.apply [[INDEX_MAP]]([[ARG2]]){{\[}}[[R5]]{{\]}}
+  // CHECK: [[R7:%.+]] = affine.load %arg0{{\[}}[[R4]], [[R6]]{{\]}} : memref<4x8xf32>
+  // CHECK: affine.store [[R7]], %0{{\[}}[[ARG1]], [[ARG2]]{{\]}} : memref<12x16xf32>
+}
+
+// -----
+
+// Test Tile with 1D input and unknown repeats
+func @test_tile2(%arg0 : tensor<8xf32>, %arg1 : tensor<1xi64>) -> tensor<*xf32> {
+  %1 = "onnx.Tile"(%arg0, %arg1) : (tensor<8xf32>, tensor<1xi64>) -> tensor<*xf32>
+  return %1 : tensor<*xf32>
+  // CHECK: [[INDEX_MAP:#.+]] = affine_map<(d0)[s0] -> (d0 mod s0)>
+  // CHECK-LABEL: test_tile2
+  // CHECK: [[C0:%.+]] = constant 0 : index
+  // CHECK: [[R0:%.+]] = affine.load %arg1{{\[}}[[C0]]{{\]}} : memref<1xi64>
+  // CHECK: [[R1:%.+]] = index_cast [[R0]] : i64 to index
+  // CHECK: [[C0_0:%.+]] = constant 0 : index
+  // CHECK: [[R2:%.+]] = dim %arg0, [[C0_0]] : memref<8xf32>
+  // CHECK: [[R3:%.+]] = muli [[R2]], [[R1]] : index
+  // CHECK: [[R4:%.+]] = alloc([[R3]]) : memref<?xf32>
+  // CHECK: [[R5:%.+]] = krnl.define_loops 1
+  // CHECK: [[C0_1:%.+]] = constant 0 : index
+  // CHECK: [[R6:%.+]] = dim [[R4]], [[C0_1]] : memref<?xf32>
+  // CHECK: krnl.iterate([[R5]]) with ([[R5]] -> [[ARG2:%.+]] = 0 to [[R6]]) {
+  // CHECK: [[C0_2:%.+]] = constant 0 : index
+  // CHECK: [[R7:%.+]] = dim %arg0, [[C0_2]] : memref<8xf32>
+  // CHECK: [[R8:%.+]] = affine.apply [[INDEX_MAP]]([[ARG2]]){{\[}}[[R7]]{{\]}}
+  // CHECK: [[R9:%.+]] = affine.load %arg0{{\[}}[[R8]]{{\]}} : memref<8xf32>
+  // CHECK: affine.store [[R9]], [[R4]]{{\[}}[[ARG2]]{{\]}} : memref<?xf32>
+}
+
+// -----
+
+// Test Tile with 1D input of unknown dimension
+func @test_tile3(%arg0 : tensor<?xf32>, %arg1 : tensor<1xi64>) -> tensor<*xf32> {
+  %1 = "onnx.Tile"(%arg0, %arg1) : (tensor<?xf32>, tensor<1xi64>) -> tensor<*xf32>
+  return %1 : tensor<*xf32>
+  // CHECK: [[INDEX_MAP:#.+]] = affine_map<(d0)[s0] -> (d0 mod s0)>
+  // CHECK-LABEL: test_tile3
+  // CHECK: [[C0:%.+]] = constant 0 : index
+  // CHECK: [[R0:%.+]] = affine.load %arg1{{\[}}[[C0]]{{\]}} : memref<1xi64>
+  // CHECK: [[R1:%.+]] = index_cast [[R0]] : i64 to index
+  // CHECK: [[C0_0:%.+]] = constant 0 : index
+  // CHECK: [[R2:%.+]] = dim %arg0, [[C0_0]] : memref<?xf32>
+  // CHECK: [[R3:%.+]] = muli [[R2]], [[R1]] : index
+  // CHECK: [[R4:%.+]] = alloc([[R3]]) : memref<?xf32>
+  // CHECK: [[R5:%.+]] = krnl.define_loops 1
+  // CHECK: [[C0_1:%.+]] = constant 0 : index
+  // CHECK: [[R6:%.+]] = dim %4, [[C0_1]] : memref<?xf32>
+  // CHECK: krnl.iterate([[R5]]) with ([[R5]] -> [[ARG2:%.+]] = 0 to [[R6]]) {
+  // CHECK: [[C0_2:%.+]] = constant 0 : index
+  // CHECK: [[R7:%.+]] = dim %arg0, [[C0_2]] : memref<?xf32>
+  // CHECK: [[R8:%.+]] = affine.apply [[INDEX_MAP]]([[ARG2]]){{\[}}[[R7]]{{\]}}
+  // CHECK: [[R9:%.+]] = load %arg0{{\[}}[[R8]]{{\]}} : memref<?xf32>
+  // CHECK: affine.store [[R9]], [[R4]]{{\[}}[[ARG2]]{{\]}} : memref<?xf32>
+}