From 93f20429ea65f2314c34e5574ef9f4a3da4ee529 Mon Sep 17 00:00:00 2001
From: chxin66 <57057788+chxin66@users.noreply.github.com>
Date: Wed, 13 Apr 2022 10:12:37 +0800
Subject: [PATCH] Fixed layout inference bug for stride_slice (#329)

Signed-off-by: Chen Xin
---
 include/tim/vx/ops/stridedslice.h             |   2 +-
 src/tim/transform/ops/op_layout_inference.cc  |   8 +
 src/tim/transform/ops/op_layout_inference.h   |  18 +-
 .../ops/stridedslice_layout_inference.h       |  31 ++-
 .../stridedslice_layout_inference_test.cc     | 213 ++++++++++++++++++
 src/tim/vx/ops/stridedslice_test.cc           |   1 +
 6 files changed, 261 insertions(+), 12 deletions(-)
 create mode 100644 src/tim/transform/stridedslice_layout_inference_test.cc

diff --git a/include/tim/vx/ops/stridedslice.h b/include/tim/vx/ops/stridedslice.h
index c0d75f1..7ba6e5c 100644
--- a/include/tim/vx/ops/stridedslice.h
+++ b/include/tim/vx/ops/stridedslice.h
@@ -32,7 +32,7 @@ namespace ops {
 /**
  * ## StridedSlice
  *
- * Extracts a strided slice of a tensor.
+ * Extracts a strided slice of a tensor, with the same semantics as TensorFlow's strided_slice.
  *
  * Roughly speaking, this op extracts a slice of size (end - begin) / stride from
  * the given input tensor. Starting at the location specified by begin the slice
diff --git a/src/tim/transform/ops/op_layout_inference.cc b/src/tim/transform/ops/op_layout_inference.cc
index 65608d8..1828159 100644
--- a/src/tim/transform/ops/op_layout_inference.cc
+++ b/src/tim/transform/ops/op_layout_inference.cc
@@ -393,5 +393,13 @@ std::vector<int32_t> OpLayoutInfer::MapMultipleAxis(
   return r;
 }
 
+int32_t OpLayoutInfer::MapMask(const std::vector<uint32_t>& perm,
+                               int32_t mask) {
+  int32_t m = 0;
+  for (uint32_t i = 0; i < perm.size(); ++i)
+    if (mask & (1 << perm[i])) m |= (0x01 << i);
+  return m;
+}
+
 }  // namespace transform
 }  // namespace tim
diff --git a/src/tim/transform/ops/op_layout_inference.h b/src/tim/transform/ops/op_layout_inference.h
index ea7e5e5..cdaf178 100644
--- a/src/tim/transform/ops/op_layout_inference.h
+++ b/src/tim/transform/ops/op_layout_inference.h
@@ -54,17 +54,18 @@ class OpLayoutInfer {
       const std::shared_ptr<vx::Operation> op,
       std::shared_ptr<layout_inference_impl::LayoutInferContext>& context)
       : op_(op), context_(context) {}
 
-  virtual void OnInputs(std::vector<std::shared_ptr<vx::Tensor>>& next_tensors) = 0;
+  virtual void OnInputs(
+      std::vector<std::shared_ptr<vx::Tensor>>& next_tensors) = 0;
 
   virtual void OnOutputs(
       std::vector<std::shared_ptr<vx::Tensor>>& next_tensors);
 
   virtual ~OpLayoutInfer() = default;
 
  protected:
-  std::shared_ptr<vx::Tensor> InsertPermute(std::shared_ptr<vx::Tensor> input,
-                                            std::shared_ptr<IPermuteVector> perm,
-                                            bool is_graph_output = false,
-                                            std::shared_ptr<vx::Tensor> src_out = nullptr);
+  std::shared_ptr<vx::Tensor> InsertPermute(
+      std::shared_ptr<vx::Tensor> input, std::shared_ptr<IPermuteVector> perm,
+      bool is_graph_output = false,
+      std::shared_ptr<vx::Tensor> src_out = nullptr);
 
   std::vector<std::shared_ptr<vx::Tensor>> CreateOutputsTensor(
       std::shared_ptr<IPermuteVector> required_pv);
@@ -95,11 +96,12 @@ class OpLayoutInfer {
   std::shared_ptr<vx::Tensor> PermuteConstTensor(
       const std::shared_ptr<vx::Tensor>& input,
       const std::shared_ptr<IPermuteVector>& pv);
-  
+
   std::vector<uint32_t> MapMultipleAxis(const std::vector<uint32_t>& perm,
-                                         const std::vector<uint32_t>& axises);
+                                        const std::vector<uint32_t>& axises);
   std::vector<int32_t> MapMultipleAxis(const std::vector<uint32_t>& perm,
-                                        const std::vector<int32_t>& axises);
+                                       const std::vector<int32_t>& axises);
+  int32_t MapMask(const std::vector<uint32_t>& perm, int32_t mask);
 
  protected:
   const std::shared_ptr<vx::Operation> op_;
diff --git a/src/tim/transform/ops/stridedslice_layout_inference.h b/src/tim/transform/ops/stridedslice_layout_inference.h
index 058d2db..f8c8ca5 100644
--- a/src/tim/transform/ops/stridedslice_layout_inference.h
+++ b/src/tim/transform/ops/stridedslice_layout_inference.h
@@ -70,15 +70,40 @@ class StridedSliceLayoutInfer : public OpLayoutInfer {
     end_dims =
         MapMultipleAxis(input_pv->AsStdVec(), end_dims);
     stride_dims = MapMultipleAxis(input_pv->AsStdVec(), stride_dims);
+    shrink_axis_mask = MapMask(input_pv->AsStdVec(), shrink_axis_mask);
+    begin_mask = MapMask(input_pv->AsStdVec(), begin_mask);
+    end_mask = MapMask(input_pv->AsStdVec(), end_mask);
 
     auto strided_slice = context_->infer_graph_->CreateOperation<vx::ops::StridedSlice>(
         begin_dims, end_dims, stride_dims, begin_mask, end_mask,
         shrink_axis_mask);
-    auto infer_out = CreateOutputsTensor(input_pv);
+    // Drop the axes removed by shrink_axis_mask, then renormalize the remaining pv values
+    std::set<uint32_t> remaind_set;
+    std::vector<uint32_t> remaind_axis;
+    for (uint32_t i = 0; i < input_pv->AsStdVec().size(); ++i)
+      if ((shrink_axis_mask & (1 << i)) == 0) {
+        remaind_axis.push_back(
+            input_pv->AsStdVec()
+                [i]);  // permute values of the axes that survive the shrink
+        remaind_set.insert(input_pv->AsStdVec()[i]);
+      }
+    // Shift each surviving value down by the number of removed values below it
+    auto out_pv = MakeShared(remaind_axis.size());
+    for (uint32_t i = 0; i < remaind_axis.size(); ++i) {
+      uint32_t cnt = 0;
+      for (uint32_t j = 0; j < input_pv->AsStdVec().size(); j++) {
+        if (input_pv->AsStdVec()[j] < remaind_axis[i] &&
+            remaind_set.end() == remaind_set.find(input_pv->AsStdVec()[j])) {
+          cnt++;  // removed values smaller than the current one
+        }
+      }
+      out_pv->At(i) = remaind_axis[i] - cnt;
+    }
+
+    auto infer_out = CreateOutputsTensor(out_pv);
     (*strided_slice).BindInput(context_->GetMapedTensor(src_input));
     (*strided_slice).BindOutput(infer_out[0]);
-
-    context_->SetPermuteVector(op_->impl()->OutputsTensor()[0], input_pv);
+    context_->SetPermuteVector(op_->impl()->OutputsTensor()[0], out_pv);
     next_tensors.push_back(op_->impl()->OutputsTensor()[0]);
   }
 };
diff --git a/src/tim/transform/stridedslice_layout_inference_test.cc b/src/tim/transform/stridedslice_layout_inference_test.cc
new file mode 100644
index 0000000..0f678a4
--- /dev/null
+++ b/src/tim/transform/stridedslice_layout_inference_test.cc
@@ -0,0 +1,213 @@
+#include "tim/vx/context.h"
+#include "tim/vx/graph.h"
+#include "tim/vx/ops.h"
+#include "tim/transform/layout_inference.h"
+
+#include
+
+#include "gtest/gtest.h"
+
+TEST(StridedSlice, endmask_2_shrinkmask_2) {
+  auto ctx = tim::vx::Context::Create();
+  auto graph = ctx->CreateGraph();
+
+  tim::vx::ShapeType input_shape({2, 4, 6, 1});
+  tim::vx::ShapeType kernel_shape({3, 2, 2, 2});
+  tim::vx::ShapeType conv2dout_shape({3, 3, 5, 1});
+  tim::vx::ShapeType output_shape({2, 3, 1});
+
+  tim::vx::TensorSpec input_spec(tim::vx::DataType::FLOAT32, input_shape,
+                                 tim::vx::TensorAttribute::INPUT);
+  tim::vx::TensorSpec kernel_spec(tim::vx::DataType::FLOAT32, kernel_shape,
+                                  tim::vx::TensorAttribute::CONSTANT);
+  tim::vx::TensorSpec conv2dout_spec(tim::vx::DataType::FLOAT32,
+                                     conv2dout_shape,
+                                     tim::vx::TensorAttribute::TRANSIENT);
+  tim::vx::TensorSpec output_spec(tim::vx::DataType::FLOAT32, output_shape,
+                                  tim::vx::TensorAttribute::OUTPUT);
+
+  auto input_tensor = graph->CreateTensor(input_spec);
+  auto conv2dout_tensor = graph->CreateTensor(conv2dout_spec);
+  auto output_tensor = graph->CreateTensor(output_spec);
+
+  std::vector<float> in_data = {
+      1, 4, 2, 5, 3, 6, 3, 1, 1, 4, 2, 5, 3, 6, 3, 1, 1, 4, 2, 5, 3, 6, 3, 1,
+      1, 4, 2, 5, 3, 6, 3, 1, 1, 4, 2, 5, 3, 6, 3, 1, 1, 4, 2, 5, 3, 6, 3, 1,
+  };
+  std::vector<float> kernel_data = {
+      1, 0, 3, 4, 4, 2, 1, 2, 3, 1, 3, 1, 1, 3, 1, 0, 2, 0, 3, 1, 4, 0, 0, 2,
+  };
+  std::vector<float> golden = {
+      55, 30, 71, 40, 40, 38,
+  };
+  auto kernel_tensor =
+      graph->CreateTensor(kernel_spec, kernel_data.data());
+
+  // The following parameters have been reversed
+  std::vector<int32_t> begin = {0, 0, 0, 0};
+  std::vector<int32_t> end = {2, 3, 4, 1};
+  std::vector<int32_t> strides = {1, 1, 1, 1};
+  uint32_t MASK_BEGIN = 0, MASK_END = 0b0100, MASK_SHRINK = 0b0100;
+
+  std::array<uint32_t, 2> stride({1, 1});
+  std::array<uint32_t, 2> dilation({1, 1});
+
+  auto op1 = graph->CreateOperation<tim::vx::ops::Conv2d>(
+      tim::vx::PadType::VALID, stride, dilation, 0, tim::vx::DataLayout::CWHN);
+  (*op1)
+      .BindInputs({input_tensor, kernel_tensor})
+      .BindOutputs({conv2dout_tensor});
+  auto op2 = graph->CreateOperation<tim::vx::ops::StridedSlice>(
+      begin, end, strides, MASK_BEGIN, MASK_END, MASK_SHRINK);
+  (*op2).BindInputs({conv2dout_tensor}).BindOutputs({output_tensor});
+
+  auto transform = tim::transform::LayoutInference(graph, ctx);
+  auto infer_graph = transform.first;
+  auto graph_io_map = transform.second;
+  auto infer_input = graph_io_map[graph->InputsTensor()[0]];
+  auto infer_output = graph_io_map[graph->OutputsTensor()[0]];
+  infer_input->CopyDataToTensor(in_data.data(), in_data.size() * sizeof(float));
+
+  EXPECT_TRUE(infer_graph->Compile());
+  EXPECT_TRUE(infer_graph->Run());
+
+  std::vector<float> output(golden.size());
+  EXPECT_TRUE(infer_output->CopyDataFromTensor(output.data()));
+  EXPECT_EQ(golden, output);
+}
+
+TEST(StridedSlice, endmask_6_shrinkmask_5) {
+  auto ctx = tim::vx::Context::Create();
+  auto graph = ctx->CreateGraph();
+
+  tim::vx::ShapeType input_shape({2, 4, 6, 1});
+  tim::vx::ShapeType kernel_shape({3, 2, 2, 2});
+  tim::vx::ShapeType conv2dout_shape({3, 3, 5, 1});
+  tim::vx::ShapeType output_shape({2, 5});
+
+  tim::vx::TensorSpec input_spec(tim::vx::DataType::FLOAT32, input_shape,
+                                 tim::vx::TensorAttribute::INPUT);
+  tim::vx::TensorSpec kernel_spec(tim::vx::DataType::FLOAT32, kernel_shape,
+                                  tim::vx::TensorAttribute::CONSTANT);
+  tim::vx::TensorSpec conv2dout_spec(tim::vx::DataType::FLOAT32,
+                                     conv2dout_shape,
+                                     tim::vx::TensorAttribute::TRANSIENT);
+  tim::vx::TensorSpec output_spec(tim::vx::DataType::FLOAT32, output_shape,
+                                  tim::vx::TensorAttribute::OUTPUT);
+
+  auto input_tensor = graph->CreateTensor(input_spec);
+  auto conv2dout_tensor = graph->CreateTensor(conv2dout_spec);
+  auto output_tensor = graph->CreateTensor(output_spec);
+
+  std::vector<float> in_data = {
+      1, 4, 2, 5, 3, 6, 3, 1, 1, 4, 2, 5, 3, 6, 3, 1, 1, 4, 2, 5, 3, 6, 3, 1,
+      1, 4, 2, 5, 3, 6, 3, 1, 1, 4, 2, 5, 3, 6, 3, 1, 1, 4, 2, 5, 3, 6, 3, 1,
+  };
+  std::vector<float> kernel_data = {
+      1, 0, 3, 4, 4, 2, 1, 2, 3, 1, 3, 1, 1, 3, 1, 0, 2, 0, 3, 1, 4, 0, 0, 2,
+  };
+  std::vector<float> golden = {
+      55, 30, 55, 30, 55, 30, 55, 30, 55, 30
+  };
+  auto kernel_tensor = graph->CreateTensor(kernel_spec, kernel_data.data());
+
+  // The following parameters have been reversed
+  std::vector<int32_t> begin = {0, 0, 0, 0};
+  std::vector<int32_t> end = {2, 3, 4, 1};
+  std::vector<int32_t> strides = {1, 1, 1, 1};
+  uint32_t MASK_BEGIN = 0, MASK_END = 0b0110, MASK_SHRINK = 0b1010;
+
+  std::array<uint32_t, 2> stride({1, 1});
+  std::array<uint32_t, 2> dilation({1, 1});
+
+  auto op1 = graph->CreateOperation<tim::vx::ops::Conv2d>(
+      tim::vx::PadType::VALID, stride, dilation, 0, tim::vx::DataLayout::CWHN);
+  (*op1)
+      .BindInputs({input_tensor, kernel_tensor})
+      .BindOutputs({conv2dout_tensor});
+  auto op2 = graph->CreateOperation<tim::vx::ops::StridedSlice>(
+      begin, end, strides, MASK_BEGIN, MASK_END, MASK_SHRINK);
+  (*op2).BindInputs({conv2dout_tensor}).BindOutputs({output_tensor});
+
+  auto transform = tim::transform::LayoutInference(graph, ctx);
+  auto infer_graph = transform.first;
+  auto graph_io_map = transform.second;
+  auto infer_input = graph_io_map[graph->InputsTensor()[0]];
+  auto infer_output = graph_io_map[graph->OutputsTensor()[0]];
+  infer_input->CopyDataToTensor(in_data.data(), in_data.size() * sizeof(float));
+
+  EXPECT_TRUE(infer_graph->Compile());
+  EXPECT_TRUE(infer_graph->Run());
+
+  std::vector<float> output(golden.size());
+  EXPECT_TRUE(infer_output->CopyDataFromTensor(output.data()));
+  EXPECT_EQ(golden, output);
+}
+
+TEST(StridedSlice, endmask_1_shrinkmask_1) {
+  auto ctx = tim::vx::Context::Create();
+  auto graph = ctx->CreateGraph();
+
+  tim::vx::ShapeType input_shape({2, 4, 6, 1});   // tf layout
+  tim::vx::ShapeType kernel_shape({2, 2, 2, 3});  // tf layout
+  tim::vx::ShapeType conv2dout_shape({3, 3, 5, 1});
+  tim::vx::ShapeType output_shape({2, 3, 4});
+
+  tim::vx::TensorSpec input_spec(tim::vx::DataType::FLOAT32, input_shape,
+                                 tim::vx::TensorAttribute::INPUT);
+  tim::vx::TensorSpec kernel_spec(tim::vx::DataType::FLOAT32, kernel_shape,
+                                  tim::vx::TensorAttribute::CONSTANT);
+  tim::vx::TensorSpec conv2dout_spec(tim::vx::DataType::FLOAT32,
+                                     conv2dout_shape,
+                                     tim::vx::TensorAttribute::TRANSIENT);
+  tim::vx::TensorSpec output_spec(tim::vx::DataType::FLOAT32, output_shape,
+                                  tim::vx::TensorAttribute::OUTPUT);
+
+  auto input_tensor = graph->CreateTensor(input_spec);
+  auto conv2dout_tensor = graph->CreateTensor(conv2dout_spec);
+  auto output_tensor = graph->CreateTensor(output_spec);
+
+  std::vector<float> in_data = {
+      1, 4, 2, 5, 3, 6, 3, 1, 1, 4, 2, 5, 3, 6, 3, 1, 1, 4, 2, 5, 3, 6, 3, 1,
+      1, 4, 2, 5, 3, 6, 3, 1, 1, 4, 2, 5, 3, 6, 3, 1, 1, 4, 2, 5, 3, 6, 3, 1,
+  };
+  std::vector<float> kernel_data = {
+      1, 0, 3, 4, 4, 2, 1, 2, 3, 1, 3, 1, 1, 3, 1, 0, 2, 0, 3, 1, 4, 0, 0, 2,
+  };
+  std::vector<float> golden = {
+      51, 33, 68, 46, 45, 49, 51, 33, 68, 46, 45, 49,
+      51, 33, 68, 46, 45, 49, 51, 33, 68, 46, 45, 49,
+  };
+  auto kernel_tensor = graph->CreateTensor(kernel_spec, kernel_data.data());
+
+  // The following parameters have been reversed
+  std::vector<int32_t> begin = {0, 0, 0, 0};
+  std::vector<int32_t> end = {2, 3, 4, 1};
+  std::vector<int32_t> strides = {1, 1, 1, 1};
+  uint32_t MASK_BEGIN = 0, MASK_END = 0b1000, MASK_SHRINK = 0b1000;
+
+  std::array<uint32_t, 2> stride({1, 1});
+  std::array<uint32_t, 2> dilation({1, 1});
+
+  auto op1 = graph->CreateOperation<tim::vx::ops::Conv2d>(
+      tim::vx::PadType::VALID, stride, dilation, 0, tim::vx::DataLayout::CWHN);
+  (*op1)
+      .BindInputs({input_tensor, kernel_tensor})
+      .BindOutputs({conv2dout_tensor});
+  auto op2 = graph->CreateOperation<tim::vx::ops::StridedSlice>(
+      begin, end, strides, MASK_BEGIN, MASK_END, MASK_SHRINK);
+  (*op2).BindInputs({conv2dout_tensor}).BindOutputs({output_tensor});
+
+  auto transform = tim::transform::LayoutInference(graph, ctx);
+  auto infer_graph = transform.first;
+  auto graph_io_map = transform.second;
+  auto infer_input = graph_io_map[graph->InputsTensor()[0]];
+  auto infer_output = graph_io_map[graph->OutputsTensor()[0]];
+  infer_input->CopyDataToTensor(in_data.data(), in_data.size() * sizeof(float));
+
+  EXPECT_TRUE(infer_graph->Compile());
+  EXPECT_TRUE(infer_graph->Run());
+
+  std::vector<float> output(golden.size());
+  EXPECT_TRUE(infer_output->CopyDataFromTensor(output.data()));
+  EXPECT_EQ(golden, output);
+}
diff --git a/src/tim/vx/ops/stridedslice_test.cc b/src/tim/vx/ops/stridedslice_test.cc
index ab7f232..933c02e 100644
--- a/src/tim/vx/ops/stridedslice_test.cc
+++ b/src/tim/vx/ops/stridedslice_test.cc
@@ -24,6 +24,7 @@
 #include "tim/vx/context.h"
 #include "tim/vx/graph.h"
 #include "tim/vx/ops/stridedslice.h"
+#include "tim/transform/layout_inference.h"
 
 #include
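
For reviewers: the change has two parts, remapping the begin/end/shrink bit masks through the input permute vector (MapMask) and rebuilding the output permute vector after shrink_axis_mask drops axes. The standalone sketch below restates that logic outside the library so it can be compiled and stepped through in isolation. MapMask here is a local re-implementation of the method added in the patch; ComputeOutputPerm and the sample permute vector in main() are illustrative assumptions, not TIM-VX API.

#include <cstdint>
#include <iostream>
#include <set>
#include <vector>

// Mirrors OpLayoutInfer::MapMask: bit perm[i] of the source mask becomes bit i
// of the remapped mask.
int32_t MapMask(const std::vector<uint32_t>& perm, int32_t mask) {
  int32_t m = 0;
  for (uint32_t i = 0; i < perm.size(); ++i)
    if (mask & (1 << perm[i])) m |= (1 << i);
  return m;
}

// Mirrors the out_pv computation: drop the permute values of the shrunk axes,
// then shift each surviving value down by the number of removed values below it.
std::vector<uint32_t> ComputeOutputPerm(const std::vector<uint32_t>& input_pv,
                                        int32_t shrink_axis_mask) {
  std::set<uint32_t> removed;
  std::vector<uint32_t> remaining;
  for (uint32_t i = 0; i < input_pv.size(); ++i) {
    if (shrink_axis_mask & (1 << i))
      removed.insert(input_pv[i]);
    else
      remaining.push_back(input_pv[i]);
  }
  std::vector<uint32_t> out_pv(remaining.size());
  for (uint32_t i = 0; i < remaining.size(); ++i) {
    uint32_t cnt = 0;
    for (uint32_t v : removed)
      if (v < remaining[i]) ++cnt;  // removed values smaller than the current one
    out_pv[i] = remaining[i] - cnt;
  }
  return out_pv;
}

int main() {
  // Illustrative permute vector and mask only; not taken from the tests above.
  std::vector<uint32_t> input_pv = {1, 2, 0, 3};
  int32_t shrink_axis_mask = 0b0100;

  std::cout << "remapped shrink mask: " << MapMask(input_pv, shrink_axis_mask) << "\n";
  std::cout << "output permute vector:";
  for (uint32_t v : ComputeOutputPerm(input_pv, shrink_axis_mask))
    std::cout << ' ' << v;
  std::cout << "\n";
  return 0;
}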