add op: maxpoolwithargmax2 and maxpoolgrad

2022-07-27 14:35:54 +08:00 · 2022-07-27 14:35:54 +08:00 · 9ebddb5452
parent 84d76e5251
commit 9ebddb5452
7 changed files with 945 additions and 0 deletions
--- a/include/tim/vx/ops.h
+++ b/include/tim/vx/ops.h
@ -53,6 +53,8 @@
 #include "tim/vx/ops/logsoftmax.h"
 #include "tim/vx/ops/matmul.h"
 #include "tim/vx/ops/maxpoolwithargmax.h"
+#include "tim/vx/ops/maxpoolwithargmax2.h"
+#include "tim/vx/ops/maxpoolgrad.h"
 #include "tim/vx/ops/maxunpool2d.h"
 #include "tim/vx/ops/moments.h"
 #include "tim/vx/ops/nbg.h"
--- a/include/tim/vx/ops/maxpoolgrad.h
+++ b/include/tim/vx/ops/maxpoolgrad.h
@ -0,0 +1,70 @@
+/****************************************************************************
+*
+*    Copyright (c) 2021 Vivante Corporation
+*
+*    Permission is hereby granted, free of charge, to any person obtaining a
+*    copy of this software and associated documentation files (the "Software"),
+*    to deal in the Software without restriction, including without limitation
+*    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+*    and/or sell copies of the Software, and to permit persons to whom the
+*    Software is furnished to do so, subject to the following conditions:
+*
+*    The above copyright notice and this permission notice shall be included in
+*    all copies or substantial portions of the Software.
+*
+*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+*    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+*    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+*    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+*    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+*    DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+#ifndef TIM_VX_OPS_MAXPOOLGRAD_H_
+#define TIM_VX_OPS_MAXPOOLGRAD_H_
+
+#include "tim/vx/operation.h"
+namespace tim {
+namespace vx {
+namespace ops {
+
+/**
+ * ## MaxpoolGrad
+ *
+ * Acquire the gradient of 2-D Max pooling operation's input tensor. \
+ * Like the tensorflow_XLA op SelectAndScatter, see https://tensorflow.google.cn/xla/operation_semantics?hl=en#selectandscatter.
+ *
+ * - padding : AUTO, VALID or SAME.
+ * - ksize : filter size.
+ * - stride : stride along each spatial axis.
+ * - round_type : CEILING or FLOOR.
+ * 
+ *  * Inputs:
+ * 
+ * - 0 : input tensor of 2-D Max pooling.
+ * - 1 : gradient of 2-D Max pooling output tensor.
+ */
+
+// Public handle for the max-pooling gradient operation. The pooling
+// parameters are kept as members so that Clone() can recreate the operation
+// in another graph.
+class MaxpoolGrad: public Operation {
+ public:
+  // graph      : graph the operation is created in.
+  // padding    : padding mode of the pooling being differentiated.
+  // ksize      : pooling filter size (two elements).
+  // stride     : stride along each spatial axis (two elements).
+  // round_type : rounding mode; defaults to FLOOR.
+  // layout     : data layout; defaults to WHCN.
+  MaxpoolGrad(Graph* graph, PadType padding,
+              const std::array<uint32_t, 2>& ksize,
+              const std::array<uint32_t, 2>& stride,
+              RoundType round_type = RoundType::FLOOR,
+              DataLayout layout = DataLayout::WHCN);
+  // Creates an equivalent MaxpoolGrad operation inside `graph`.
+  std::shared_ptr<Operation> Clone(
+      std::shared_ptr<Graph>& graph) const override;
+
+ protected:
+  // Copies of the constructor arguments.
+  const PadType padding_;
+  const std::array<uint32_t, 2> ksize_;
+  const std::array<uint32_t, 2> stride_;
+  const RoundType round_type_;
+};
+
+}  // namespace ops
+}  // namespace vx
+}  // namespace tim
+
+#endif /*TIM_VX_OPS_MAXPOOLGRAD_H_*/
--- a/include/tim/vx/ops/maxpoolwithargmax2.h
+++ b/include/tim/vx/ops/maxpoolwithargmax2.h
@ -0,0 +1,68 @@
+/****************************************************************************
+*
+*    Copyright (c) 2021 Vivante Corporation
+*
+*    Permission is hereby granted, free of charge, to any person obtaining a
+*    copy of this software and associated documentation files (the "Software"),
+*    to deal in the Software without restriction, including without limitation
+*    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+*    and/or sell copies of the Software, and to permit persons to whom the
+*    Software is furnished to do so, subject to the following conditions:
+*
+*    The above copyright notice and this permission notice shall be included in
+*    all copies or substantial portions of the Software.
+*
+*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+*    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+*    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+*    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+*    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+*    DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+#ifndef TIM_VX_OPS_MAXPOOLWITHARGMAX2_H_
+#define TIM_VX_OPS_MAXPOOLWITHARGMAX2_H_
+
+#include <array>
+
+#include "tim/vx/direct_map_op.h"
+#include "tim/vx/types.h"
+
+namespace tim {
+namespace vx {
+namespace ops {
+
+/**
+ * ## MaxpoolWithArgmax2
+ *
+ * Performs a 2-D max pooling operation and returns the indices of the maxima (which start at the beginning of the input tensor).
+ *
+ * - padding : AUTO, VALID or SAME.
+ * - ksize : filter size.
+ * - stride : stride along each spatial axis.
+ * - round_type : CEILING or FLOOR.
+ */
+
+// Max pooling operation that is mapped directly onto a single backend node.
+// The pooling parameters are kept as members so that Clone() can recreate the
+// operation in another graph.
+class MaxpoolWithArgmax2 : public DirectMapOp {
+ public:
+  // graph      : graph the operation is created in.
+  // padding    : padding mode.
+  // ksize      : pooling filter size (two elements).
+  // stride     : stride along each spatial axis (two elements).
+  // round_type : rounding mode; defaults to FLOOR.
+  // layout     : data layout; defaults to WHCN.
+  MaxpoolWithArgmax2(Graph* graph, PadType padding,
+         const std::array<uint32_t, 2>& ksize,
+         const std::array<uint32_t, 2>& stride,
+         RoundType round_type = RoundType::FLOOR,
+         DataLayout layout = DataLayout::WHCN);
+
+  // Creates an equivalent MaxpoolWithArgmax2 operation inside `graph`.
+  std::shared_ptr<Operation> Clone(std::shared_ptr<Graph>& graph) const override;
+
+ protected:
+  // Copies of the constructor arguments.
+  const PadType padding_;
+  const std::array<uint32_t, 2> ksize_;
+  const std::array<uint32_t, 2> stride_;
+  const RoundType round_type_;
+};
+
+}  // namespace ops
+}  // namespace vx
+}  // namespace tim
+
+#endif /* TIM_VX_OPS_MAXPOOLWITHARGMAX2_H_ */
--- a/src/tim/vx/ops/maxpoolgrad.cc
+++ b/src/tim/vx/ops/maxpoolgrad.cc
@ -0,0 +1,162 @@
+/****************************************************************************
+*
+*    Copyright (c) 2021 Vivante Corporation
+*
+*    Permission is hereby granted, free of charge, to any person obtaining a
+*    copy of this software and associated documentation files (the "Software"),
+*    to deal in the Software without restriction, including without limitation
+*    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+*    and/or sell copies of the Software, and to permit persons to whom the
+*    Software is furnished to do so, subject to the following conditions:
+*
+*    The above copyright notice and this permission notice shall be included in
+*    all copies or substantial portions of the Software.
+*
+*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+*    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+*    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+*    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+*    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+*    DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+#include "tim/vx/ops.h"
+#include "vsi_nn_pub.h"
+#include "op_impl.h"
+
+#include <array>
+namespace tim {
+namespace vx {
+namespace ops {
+
+/**
+ * Implementation of MaxpoolGrad as a composition of existing operations
+ * instead of a single backend node (node() returns nullptr):
+ *
+ *   MaxpoolWithArgmax2(input)            -> pooled values + argmax indices
+ *   Reshape(indices), Reshape(updates)   -> 1-D tensors
+ *   ScatterND(indices, updates)          -> 1-D tensor with as many elements
+ *                                           as the pooling input
+ *   Reshape                              -> gradient shaped like the input
+ *
+ * The sub-graph is created when the last input tensor is bound, because the
+ * intermediate shapes are derived from both input tensors. The output tensor
+ * may be bound either before or after the inputs.
+ *
+ * Intermediate tensors are created as INT32 (indices) and FLOAT32 (values).
+ */
+class MaxpoolGradImpl : public OpImpl {
+ public:
+  enum {
+    TENSOR_BEFORE_POOL = 0,  // input 0: tensor that was fed to the pooling
+    UPDATES_TENSOR,          // input 1: gradient of the pooling output
+    INPUT_CNT,
+    OUT_CNT = 1,
+  };
+
+  /**
+   * Stores the pooling parameters and creates the inner MaxpoolWithArgmax2
+   * operation. input_cnt / output_cnt are forwarded unchanged to OpImpl.
+   */
+  MaxpoolGradImpl(Graph* graph, PadType padding,
+                  const std::array<uint32_t, 2>& ksize,
+                  const std::array<uint32_t, 2>& stride,
+                  int input_cnt, int output_cnt,
+                  RoundType round_type,
+                  DataLayout layout)
+    : OpImpl(graph, -1, input_cnt, output_cnt, layout),
+      padding_(padding),
+      ksize_(ksize),
+      stride_(stride),
+      round_type_(round_type) {
+    maxpoolwithargmax2_ = graph->CreateOperation<tim::vx::ops::MaxpoolWithArgmax2>(
+        padding_, ksize_, stride_, round_type_, layout_);
+  }
+  ~MaxpoolGradImpl() = default;
+
+  /**
+   * Records the next input tensor. Once the last expected input
+   * (UPDATES_TENSOR) has been bound the internal sub-graph is built.
+   */
+  MaxpoolGradImpl& BindInput(const std::shared_ptr<Tensor>& tensor) override {
+    // at() makes a surplus BindInput call a checked failure instead of an
+    // out-of-bounds write into the fixed-size array.
+    in_tensors_.at(static_cast<std::size_t>(input_tensor_index)) = tensor;
+    if (this->input_tensor_index == INPUT_CNT - 1) {
+      BuildSubGraph();
+    }
+    this->input_tensor_index++;
+    return *this;
+  }
+
+  /**
+   * Records the output tensor and attaches it to the final reshape. When the
+   * inputs have not been bound yet the final reshape does not exist; in that
+   * case the tensor is only stored and BuildSubGraph() attaches it later.
+   */
+  MaxpoolGradImpl& BindOutput(const std::shared_ptr<Tensor>& tensor) override {
+    // Checked access for the same reason as in BindInput.
+    out_tensors_.at(static_cast<std::size_t>(output_tensor_index)) = tensor;
+    if (reshape_like_input_) {
+      (*reshape_like_input_).BindOutput(tensor);
+    }
+    this->output_tensor_index++;
+    return *this;
+  }
+
+  // There is no single backend node behind this composed operation.
+  vsi_nn_node_t* node() override { return nullptr; }
+
+  std::vector<std::shared_ptr<Tensor>> InputsTensor() override {
+    return inputs_tensor_;
+  }
+  std::vector<std::shared_ptr<Tensor>> OutputsTensor() override {
+    return outputs_tensor_;
+  }
+
+ private:
+  // Number of elements described by `shape` (product of all dimensions).
+  static uint32_t CalFlattenedShape(const tim::vx::ShapeType& shape) {
+    uint32_t count = 1;
+    for (const auto dim : shape) {
+      count *= dim;
+    }
+    return count;
+  }
+
+  // Creates the intermediate tensors and wires the inner operations together.
+  // Requires both entries of in_tensors_ to be set.
+  void BuildSubGraph() {
+    tim::vx::ShapeType in_shape = in_tensors_[TENSOR_BEFORE_POOL]->GetShape();
+    tim::vx::ShapeType updates_shape = in_tensors_[UPDATES_TENSOR]->GetShape();
+    tim::vx::ShapeType idx_flattened_shape({CalFlattenedShape(updates_shape)});
+    tim::vx::ShapeType out_flattened_shape({CalFlattenedShape(in_shape)});
+
+    tim::vx::TensorSpec pool_out_spec_indices(tim::vx::DataType::INT32,
+                            updates_shape, tim::vx::TensorAttribute::TRANSIENT);
+    tim::vx::TensorSpec pool_out_spec_values(tim::vx::DataType::FLOAT32,
+                            updates_shape, tim::vx::TensorAttribute::OUTPUT);
+    tim::vx::TensorSpec idx_flattened_spec(tim::vx::DataType::INT32,
+                            idx_flattened_shape, tim::vx::TensorAttribute::TRANSIENT);
+    tim::vx::TensorSpec upd_flattened_spec(tim::vx::DataType::FLOAT32,
+                            idx_flattened_shape, tim::vx::TensorAttribute::TRANSIENT);
+    tim::vx::TensorSpec out_flattened_spec(tim::vx::DataType::FLOAT32,
+                            out_flattened_shape, tim::vx::TensorAttribute::TRANSIENT);
+
+    auto pool_out_indices_tensor = graph_->CreateTensor(pool_out_spec_indices);
+    auto pool_out_values_tensor = graph_->CreateTensor(pool_out_spec_values);
+    auto idx_flattened_tensor = graph_->CreateTensor(idx_flattened_spec);
+    auto upd_flattened_tensor = graph_->CreateTensor(upd_flattened_spec);
+    auto out_flattened_tensor = graph_->CreateTensor(out_flattened_spec);
+
+    // Re-run the pooling to obtain the argmax indices; the pooled values are
+    // produced as well but are not consumed by the rest of the sub-graph.
+    (*maxpoolwithargmax2_).BindInput(in_tensors_[TENSOR_BEFORE_POOL])
+      .BindOutputs({pool_out_values_tensor, pool_out_indices_tensor});
+
+    flatten_idx_ = graph_->CreateOperation<tim::vx::ops::Reshape>(idx_flattened_shape);
+    (*flatten_idx_).BindInput(pool_out_indices_tensor).BindOutput(idx_flattened_tensor);
+
+    flatten_upd_ = graph_->CreateOperation<tim::vx::ops::Reshape>(idx_flattened_shape);
+    (*flatten_upd_).BindInput(in_tensors_[UPDATES_TENSOR]).BindOutput(upd_flattened_tensor);
+
+    scatternd_ = graph_->CreateOperation<tim::vx::ops::ScatterND>(out_flattened_shape);
+    (*scatternd_).BindInputs({idx_flattened_tensor, upd_flattened_tensor}).BindOutput(out_flattened_tensor);
+
+    reshape_like_input_ = graph_->CreateOperation<tim::vx::ops::Reshape>(in_shape);
+    (*reshape_like_input_).BindInput(out_flattened_tensor);
+
+    // An output bound before the inputs could not be attached at that time.
+    if (out_tensors_[0]) {
+      (*reshape_like_input_).BindOutput(out_tensors_[0]);
+    }
+  }
+
+  const PadType padding_;
+  const std::array<uint32_t, 2> ksize_;
+  const std::array<uint32_t, 2> stride_;
+  const RoundType round_type_;
+
+  // Inner operations; all but maxpoolwithargmax2_ stay null until
+  // BuildSubGraph() has run.
+  std::shared_ptr<tim::vx::Operation> maxpoolwithargmax2_;
+  std::shared_ptr<tim::vx::Operation> flatten_idx_;
+  std::shared_ptr<tim::vx::Operation> flatten_upd_;
+  std::shared_ptr<tim::vx::Operation> scatternd_;
+  std::shared_ptr<tim::vx::Operation> reshape_like_input_;
+  std::array<std::shared_ptr<tim::vx::Tensor>, INPUT_CNT> in_tensors_;
+  std::array<std::shared_ptr<tim::vx::Tensor>, OUT_CNT> out_tensors_;
+};
+
+// Keeps a copy of the pooling parameters (used by Clone) and hands the actual
+// work over to a freshly created MaxpoolGradImpl.
+MaxpoolGrad::MaxpoolGrad(Graph* graph, PadType padding,
+                         const std::array<uint32_t, 2>& ksize,
+                         const std::array<uint32_t, 2>& stride,
+                         RoundType round_type, DataLayout layout)
+    : padding_(padding),
+      ksize_(ksize),
+      stride_(stride),
+      round_type_(round_type) {
+  // The implementation is constructed with input/output counts of zero.
+  constexpr int kImplInputCnt = 0;
+  constexpr int kImplOutputCnt = 0;
+  impl_ = std::make_unique<MaxpoolGradImpl>(graph, padding, ksize, stride,
+                                            kImplInputCnt, kImplOutputCnt,
+                                            round_type, layout);
+}
+
+// Recreates this operation in `graph` using the stored pooling parameters and
+// the layout held by the current implementation object.
+std::shared_ptr<Operation> MaxpoolGrad::Clone(
+    std::shared_ptr<Graph>& graph) const {
+  auto layout = this->impl_->layout_;
+  return graph->CreateOperation<MaxpoolGrad>(padding_, ksize_, stride_,
+                                             round_type_, layout);
+}
+
+}  // namespace ops
+}  // namespace vx
+}  // namespace tim
--- a/src/tim/vx/ops/maxpoolgrad_test.cc
+++ b/src/tim/vx/ops/maxpoolgrad_test.cc
@ -0,0 +1,209 @@
+/****************************************************************************
+*
+*    Copyright (c) 2021 Vivante Corporation
+*
+*    Permission is hereby granted, free of charge, to any person obtaining a
+*    copy of this software and associated documentation files (the "Software"),
+*    to deal in the Software without restriction, including without limitation
+*    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+*    and/or sell copies of the Software, and to permit persons to whom the
+*    Software is furnished to do so, subject to the following conditions:
+*
+*    The above copyright notice and this permission notice shall be included in
+*    all copies or substantial portions of the Software.
+*
+*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+*    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+*    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+*    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+*    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+*    DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+#include "tim/vx/context.h"
+#include "tim/vx/graph.h"
+#include "tim/vx/ops/maxpoolgrad.h"
+#include "tim/vx/ops/scatternd.h"
+#include "tim/vx/ops/reshape.h"
+
+#include "gtest/gtest.h"
+
+// MaxpoolGrad with ksize {3, 2} and stride {3, 2}: the pooling windows do not
+// overlap, so every update value lands on a distinct input position.
+TEST(Fuse_MaxpoolGrad, without_overlay) {
+    auto ctx = tim::vx::Context::Create();
+    auto graph = ctx->CreateGraph();
+
+    tim::vx::ShapeType in_shape({6, 4, 1, 1});
+    tim::vx::ShapeType updates_shape({2, 2, 1, 1});
+    tim::vx::TensorSpec input_spec(tim::vx::DataType::FLOAT32,
+                            in_shape, tim::vx::TensorAttribute::INPUT);
+    tim::vx::TensorSpec updates_spec(tim::vx::DataType::FLOAT32,
+                            updates_shape, tim::vx::TensorAttribute::INPUT);
+    tim::vx::TensorSpec output_spec(tim::vx::DataType::FLOAT32,
+                            in_shape, tim::vx::TensorAttribute::OUTPUT);
+
+    auto input_tensor = graph->CreateTensor(input_spec);
+    auto updates_tensor = graph->CreateTensor(updates_spec);
+    auto output_tensor = graph->CreateTensor(output_spec);
+
+    // Pooling input, written as 4 rows of 6 values.
+    std::vector<float> in_data = {
+        7, 2, 5, 3, 10, 2,
+        3, 8, 9, 3, 4, 2,
+        1, 5, 7, 5, 6, 1,
+        0, 6, 2, 7, 2, 8};
+    // Gradient of the pooling output, one value per pooling window.
+    std::vector<float> updates_data = {
+        2, 6,
+        3, 1
+        };
+    // Each update value sits at the position of its window's maximum;
+    // every other position is zero.
+    std::vector<float> golden = {
+        0, 0, 0, 0, 6, 0,
+        0, 0, 2, 0, 0, 0,
+        0, 0, 3, 0, 0, 0,
+        0, 0, 0, 0, 0, 1};
+
+    EXPECT_TRUE(input_tensor->CopyDataToTensor(in_data.data(), in_data.size() * sizeof(float)));
+    EXPECT_TRUE(updates_tensor->CopyDataToTensor(updates_data.data(), updates_data.size() * sizeof(float)));
+
+    std::array<uint32_t, 2> ksize = {3, 2};
+    std::array<uint32_t, 2> stride = {3, 2};
+    auto op = graph->CreateOperation<tim::vx::ops::MaxpoolGrad>(
+        tim::vx::PadType::VALID, ksize, stride);
+    // Input order: pooling input first, then the updates.
+    (*op).BindInputs({input_tensor, updates_tensor}).BindOutputs({output_tensor});
+
+    EXPECT_TRUE(graph->Compile());
+    EXPECT_TRUE(graph->Run());
+
+    std::vector<float> output_values(golden.size());
+    EXPECT_TRUE(output_tensor->CopyDataFromTensor(output_values.data()));
+    EXPECT_EQ(golden, output_values);
+}
+
+// MaxpoolGrad with ksize {3, 2} and stride {2, 2}: horizontally adjacent
+// windows overlap, and the two top windows share the same maximum (the 9), so
+// their update values (2 and 6) are expected to accumulate to 8.
+TEST(Fuse_MaxpoolGrad, with_overlay) {
+    auto ctx = tim::vx::Context::Create();
+    auto graph = ctx->CreateGraph();
+
+    tim::vx::ShapeType in_shape({5, 4, 1, 1});
+    tim::vx::ShapeType updates_shape({2, 2, 1, 1});
+    tim::vx::TensorSpec input_spec(tim::vx::DataType::FLOAT32,
+                            in_shape, tim::vx::TensorAttribute::INPUT);
+    tim::vx::TensorSpec updates_spec(tim::vx::DataType::FLOAT32,
+                            updates_shape, tim::vx::TensorAttribute::INPUT);
+    tim::vx::TensorSpec output_spec(tim::vx::DataType::FLOAT32,
+                            in_shape, tim::vx::TensorAttribute::OUTPUT);
+
+    auto input_tensor = graph->CreateTensor(input_spec);
+    auto updates_tensor = graph->CreateTensor(updates_spec);
+    // The result tensor must carry the OUTPUT attribute (output_spec), as in
+    // the without_overlay test; it was previously created from input_spec.
+    auto output_tensor = graph->CreateTensor(output_spec);
+
+    // Pooling input, written as 4 rows of 5 values.
+    std::vector<float> in_data = {
+        7, 2, 5, 3, 8,
+        3, 8, 9, 3, 4,
+        1, 5, 7, 5, 6,
+        0, 6, 2, 10, 2};
+    // Gradient of the pooling output, one value per pooling window.
+    std::vector<float> updates_data = {
+        2, 6,
+        3, 1
+        };
+    std::vector<float> golden = {
+        0, 0, 0, 0, 0,
+        0, 0, 8, 0, 0,
+        0, 0, 3, 0, 0,
+        0, 0, 0, 1, 0};
+
+    EXPECT_TRUE(input_tensor->CopyDataToTensor(in_data.data(), in_data.size() * sizeof(float)));
+    EXPECT_TRUE(updates_tensor->CopyDataToTensor(updates_data.data(), updates_data.size() * sizeof(float)));
+
+    std::array<uint32_t, 2> ksize = {3, 2};
+    std::array<uint32_t, 2> stride = {2, 2};
+    auto op = graph->CreateOperation<tim::vx::ops::MaxpoolGrad>(
+        tim::vx::PadType::VALID, ksize, stride);
+    // Input order: pooling input first, then the updates.
+    (*op).BindInputs({input_tensor, updates_tensor}).BindOutputs({output_tensor});
+
+    EXPECT_TRUE(graph->Compile());
+    EXPECT_TRUE(graph->Run());
+
+    std::vector<float> output_values(golden.size());
+    EXPECT_TRUE(output_tensor->CopyDataFromTensor(output_values.data()));
+    EXPECT_EQ(golden, output_values);
+}
+
+// Same overlapping-window case as with_overlay, with the 5x4 plane repeated
+// four times (shape {5, 4, 2, 2}); the golden data repeats the same per-plane
+// result for every plane.
+TEST(Fuse_MaxpoolGrad, with_overlay_multi_channel_multi_batch) {
+    auto ctx = tim::vx::Context::Create();
+    auto graph = ctx->CreateGraph();
+
+    tim::vx::ShapeType in_shape({5, 4, 2, 2});
+    tim::vx::ShapeType updates_shape({2, 2, 2, 2});
+    tim::vx::TensorSpec input_spec(tim::vx::DataType::FLOAT32,
+                            in_shape, tim::vx::TensorAttribute::INPUT);
+    tim::vx::TensorSpec updates_spec(tim::vx::DataType::FLOAT32,
+                            updates_shape, tim::vx::TensorAttribute::INPUT);
+    tim::vx::TensorSpec output_spec(tim::vx::DataType::FLOAT32,
+                            in_shape, tim::vx::TensorAttribute::OUTPUT);
+
+    auto input_tensor = graph->CreateTensor(input_spec);
+    auto updates_tensor = graph->CreateTensor(updates_spec);
+    // The result tensor must carry the OUTPUT attribute (output_spec), as in
+    // the without_overlay test; it was previously created from input_spec.
+    auto output_tensor = graph->CreateTensor(output_spec);
+
+    // Four identical planes, each written as 4 rows of 5 values.
+    std::vector<float> in_data = {
+        7, 2, 5, 3, 8,
+        3, 8, 9, 3, 4,
+        1, 5, 7, 5, 6,
+        0, 6, 2, 10, 2,
+        7, 2, 5, 3, 8,
+        3, 8, 9, 3, 4,
+        1, 5, 7, 5, 6,
+        0, 6, 2, 10, 2,
+        7, 2, 5, 3, 8,
+        3, 8, 9, 3, 4,
+        1, 5, 7, 5, 6,
+        0, 6, 2, 10, 2,
+        7, 2, 5, 3, 8,
+        3, 8, 9, 3, 4,
+        1, 5, 7, 5, 6,
+        0, 6, 2, 10, 2};
+    // One 2x2 block of update values per plane.
+    std::vector<float> updates_data = {
+        2, 6,
+        3, 1,
+        2, 6,
+        3, 1,
+        2, 6,
+        3, 1,
+        2, 6,
+        3, 1,
+        };
+    std::vector<float> golden = {
+        0, 0, 0, 0, 0,
+        0, 0, 8, 0, 0,
+        0, 0, 3, 0, 0,
+        0, 0, 0, 1, 0,
+        0, 0, 0, 0, 0,
+        0, 0, 8, 0, 0,
+        0, 0, 3, 0, 0,
+        0, 0, 0, 1, 0,
+        0, 0, 0, 0, 0,
+        0, 0, 8, 0, 0,
+        0, 0, 3, 0, 0,
+        0, 0, 0, 1, 0,
+        0, 0, 0, 0, 0,
+        0, 0, 8, 0, 0,
+        0, 0, 3, 0, 0,
+        0, 0, 0, 1, 0};
+
+    EXPECT_TRUE(input_tensor->CopyDataToTensor(in_data.data(), in_data.size() * sizeof(float)));
+    EXPECT_TRUE(updates_tensor->CopyDataToTensor(updates_data.data(), updates_data.size() * sizeof(float)));
+
+    std::array<uint32_t, 2> ksize = {3, 2};
+    std::array<uint32_t, 2> stride = {2, 2};
+    auto op = graph->CreateOperation<tim::vx::ops::MaxpoolGrad>(
+        tim::vx::PadType::VALID, ksize, stride);
+    // Input order: pooling input first, then the updates.
+    (*op).BindInputs({input_tensor, updates_tensor}).BindOutputs({output_tensor});
+
+    EXPECT_TRUE(graph->Compile());
+    EXPECT_TRUE(graph->Run());
+
+    std::vector<float> output_values(golden.size());
+    EXPECT_TRUE(output_tensor->CopyDataFromTensor(output_values.data()));
+    EXPECT_EQ(golden, output_values);
+}
--- a/src/tim/vx/ops/maxpoolwithargmax2.cc
+++ b/src/tim/vx/ops/maxpoolwithargmax2.cc
@ -0,0 +1,64 @@
+/****************************************************************************
+*
+*    Copyright (c) 2021 Vivante Corporation
+*
+*    Permission is hereby granted, free of charge, to any person obtaining a
+*    copy of this software and associated documentation files (the "Software"),
+*    to deal in the Software without restriction, including without limitation
+*    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+*    and/or sell copies of the Software, and to permit persons to whom the
+*    Software is furnished to do so, subject to the following conditions:
+*
+*    The above copyright notice and this permission notice shall be included in
+*    all copies or substantial portions of the Software.
+*
+*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+*    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+*    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+*    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+*    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+*    DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+#include "tim/vx/ops/maxpoolwithargmax2.h"
+
+#include "direct_map_op_impl.h"
+#include "type_utils.h"
+#include "vsi_nn_pub.h"
+
+namespace tim {
+namespace vx {
+namespace ops {
+
+// Creates a VSI_NN_OP_MAXPOOLWITHARGMAX node with one input and two outputs
+// and copies the pooling parameters into the node's pool parameter block.
+MaxpoolWithArgmax2::MaxpoolWithArgmax2(Graph* graph, PadType padding,
+                                   const std::array<uint32_t, 2>& ksize,
+                                   const std::array<uint32_t, 2>& stride,
+                                   RoundType round_type,
+                                   DataLayout layout)
+    : DirectMapOp(graph, VSI_NN_OP_MAXPOOLWITHARGMAX, 1, 2, layout),
+      padding_(padding),
+      ksize_(ksize),
+      stride_(stride),
+      round_type_(round_type) {
+  // All pooling settings live in the same parameter struct of the node.
+  auto& pool_param = this->impl()->node()->nn_param.pool;
+
+  pool_param.type = TranslatePoolType(PoolType::MAX);
+  pool_param.round_type = TranslateRoundType(round_type_);
+  pool_param.pad_type = TranslatePadType(padding_);
+  for (std::size_t axis = 0; axis < ksize_.size(); ++axis) {
+    pool_param.ksize[axis] = ksize_[axis];
+    pool_param.stride[axis] = stride_[axis];
+  }
+
+  this->SetRoundingPolicy(OverflowPolicy::SATURATE, RoundingPolicy::RTNE,
+                          round_type_);
+}
+
+// Recreates this operation in `graph` using the stored pooling parameters and
+// the layout held by the current implementation object.
+std::shared_ptr<Operation> MaxpoolWithArgmax2::Clone(
+    std::shared_ptr<Graph>& graph) const {
+  auto layout = this->impl_->layout_;
+  return graph->CreateOperation<MaxpoolWithArgmax2>(padding_, ksize_, stride_,
+                                                    round_type_, layout);
+}
+
+}  // namespace ops
+}  // namespace vx
+}  // namespace tim
--- a/src/tim/vx/ops/maxpoolwithargmax2_test.cc
+++ b/src/tim/vx/ops/maxpoolwithargmax2_test.cc
@ -0,0 +1,370 @@
+/****************************************************************************
+*
+*    Copyright (c) 2021 Vivante Corporation
+*
+*    Permission is hereby granted, free of charge, to any person obtaining a
+*    copy of this software and associated documentation files (the "Software"),
+*    to deal in the Software without restriction, including without limitation
+*    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+*    and/or sell copies of the Software, and to permit persons to whom the
+*    Software is furnished to do so, subject to the following conditions:
+*
+*    The above copyright notice and this permission notice shall be included in
+*    all copies or substantial portions of the Software.
+*
+*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+*    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+*    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+*    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+*    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+*    DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+#include "tim/vx/context.h"
+#include "tim/vx/graph.h"
+#include "tim/vx/ops/maxpoolwithargmax2.h"
+#include "tim/vx/ops/scatternd.h"
+#include "tim/vx/ops/reshape.h"
+
+#include "gtest/gtest.h"
+
+// MaxpoolWithArgmax2 with ksize {3, 2} and stride {3, 2} (non-overlapping
+// windows): checks both the pooled maxima and their flat indices into the
+// input data.
+TEST(MaxpoolWithArgmax2, without_overlay) {
+    auto ctx = tim::vx::Context::Create();
+    auto graph = ctx->CreateGraph();
+
+    tim::vx::ShapeType in_shape({6, 4, 1, 1});
+    tim::vx::ShapeType out_shape({2, 2, 1, 1});
+    tim::vx::TensorSpec input_spec(tim::vx::DataType::FLOAT32,
+                            in_shape, tim::vx::TensorAttribute::INPUT);
+    tim::vx::TensorSpec output_spec_indices(tim::vx::DataType::INT32,
+                            out_shape, tim::vx::TensorAttribute::OUTPUT);
+    tim::vx::TensorSpec output_spec_values(tim::vx::DataType::FLOAT32,
+                            out_shape, tim::vx::TensorAttribute::OUTPUT);
+
+    auto input_tensor = graph->CreateTensor(input_spec);
+    auto output_tensor_indices = graph->CreateTensor(output_spec_indices);
+    auto output_tensor_values = graph->CreateTensor(output_spec_values);
+
+    // Pooling input, written as 4 rows of 6 values.
+    std::vector<float> in_data = {
+        7, 2, 5, 3, 10, 2,
+        3, 8, 9, 3, 4, 2,
+        1, 5, 7, 5, 6, 1,
+        0, 6, 2, 7, 2, 8};
+    std::vector<float> values_golden = {
+        9, 10,
+        7, 8 };
+    // Position of each maximum counted from the start of in_data.
+    std::vector<int32_t> indices_golden = {
+        8, 4,
+        14, 23 };
+
+    // Byte size derived from the element type instead of a literal 4.
+    EXPECT_TRUE(input_tensor->CopyDataToTensor(in_data.data(), in_data.size() * sizeof(float)));
+    std::array<uint32_t, 2> ksize = {3, 2};
+    std::array<uint32_t, 2> stride = {3, 2};
+    auto op = graph->CreateOperation<tim::vx::ops::MaxpoolWithArgmax2>(
+        tim::vx::PadType::VALID, ksize, stride);
+    // Output order: pooled values first, then the indices.
+    (*op).BindInputs({input_tensor}).BindOutputs({output_tensor_values, output_tensor_indices});
+
+    EXPECT_TRUE(graph->Compile());
+    EXPECT_TRUE(graph->Run());
+    // Result buffers are sized from the golden data they are compared with.
+    std::vector<float> output_values(values_golden.size());
+    std::vector<int32_t> output_indices(indices_golden.size());
+
+    EXPECT_TRUE(output_tensor_values->CopyDataFromTensor(output_values.data()));
+    EXPECT_TRUE(output_tensor_indices->CopyDataFromTensor(output_indices.data()));
+    EXPECT_EQ(values_golden, output_values);
+    EXPECT_EQ(indices_golden, output_indices);
+}
+
+// MaxpoolWithArgmax2 with ksize {3, 2} and stride {2, 2}: horizontally
+// adjacent windows overlap, and both top windows report the same maximum
+// (value 9 at index 7).
+TEST(MaxpoolWithArgmax2, with_overlay) {
+    auto ctx = tim::vx::Context::Create();
+    auto graph = ctx->CreateGraph();
+
+    tim::vx::ShapeType in_shape({5, 4, 1, 1});
+    tim::vx::ShapeType out_shape({2, 2, 1, 1});
+    tim::vx::TensorSpec input_spec(tim::vx::DataType::FLOAT32,
+                            in_shape, tim::vx::TensorAttribute::INPUT);
+    tim::vx::TensorSpec output_spec_indices(tim::vx::DataType::INT32,
+                            out_shape, tim::vx::TensorAttribute::OUTPUT);
+    tim::vx::TensorSpec output_spec_values(tim::vx::DataType::FLOAT32,
+                            out_shape, tim::vx::TensorAttribute::OUTPUT);
+
+    auto input_tensor = graph->CreateTensor(input_spec);
+    auto output_tensor_indices = graph->CreateTensor(output_spec_indices);
+    auto output_tensor_values = graph->CreateTensor(output_spec_values);
+
+    // Pooling input, written as 4 rows of 5 values.
+    std::vector<float> in_data = {
+        7, 2, 5, 3, 8,
+        3, 8, 9, 3, 4,
+        1, 5, 7, 5, 6,
+        0, 6, 2, 10, 2};
+    std::vector<float> values_golden = {
+        9, 9,
+        7, 10 };
+    // Position of each maximum counted from the start of in_data.
+    std::vector<int32_t> indices_golden = {
+        7, 7,
+        12, 18 };
+
+    // Byte size derived from the element type instead of a literal 4.
+    EXPECT_TRUE(input_tensor->CopyDataToTensor(in_data.data(), in_data.size() * sizeof(float)));
+    std::array<uint32_t, 2> ksize = {3, 2};
+    std::array<uint32_t, 2> stride = {2, 2};
+    auto op = graph->CreateOperation<tim::vx::ops::MaxpoolWithArgmax2>(
+        tim::vx::PadType::VALID, ksize, stride);
+    // Output order: pooled values first, then the indices.
+    (*op).BindInputs({input_tensor}).BindOutputs({output_tensor_values, output_tensor_indices});
+
+    EXPECT_TRUE(graph->Compile());
+    EXPECT_TRUE(graph->Run());
+    // Result buffers are sized from the golden data they are compared with.
+    std::vector<float> output_values(values_golden.size());
+    std::vector<int32_t> output_indices(indices_golden.size());
+
+    EXPECT_TRUE(output_tensor_values->CopyDataFromTensor(output_values.data()));
+    EXPECT_TRUE(output_tensor_indices->CopyDataFromTensor(output_indices.data()));
+    EXPECT_EQ(values_golden, output_values);
+    EXPECT_EQ(indices_golden, output_indices);
+}
+
+// Builds the max-pooling gradient by hand from its building blocks:
+// MaxpoolWithArgmax2 -> Reshape (flatten indices) -> ScatterND -> Reshape
+// (back to the input shape), using non-overlapping windows
+// (ksize {3, 2}, stride {3, 2}).
+TEST(MaxpoolGrad, without_overlay) {
+    auto ctx = tim::vx::Context::Create();
+    auto graph = ctx->CreateGraph();
+
+    tim::vx::ShapeType in_shape({6, 4, 1, 1});
+    tim::vx::ShapeType out_shape({2, 2, 1, 1});
+    tim::vx::TensorSpec input_spec(tim::vx::DataType::FLOAT32,
+                            in_shape, tim::vx::TensorAttribute::INPUT);
+    tim::vx::TensorSpec output_spec_indices(tim::vx::DataType::INT32,
+                            out_shape, tim::vx::TensorAttribute::TRANSIENT);
+    tim::vx::TensorSpec output_spec_values(tim::vx::DataType::FLOAT32,
+                            out_shape, tim::vx::TensorAttribute::OUTPUT);
+    // The final gradient has the input's shape but is a graph output, so it
+    // gets its own spec instead of reusing input_spec (INPUT attribute).
+    tim::vx::TensorSpec output_spec(tim::vx::DataType::FLOAT32,
+                            in_shape, tim::vx::TensorAttribute::OUTPUT);
+
+    auto input_tensor = graph->CreateTensor(input_spec);
+    auto output_tensor_indices = graph->CreateTensor(output_spec_indices);
+    auto output_tensor_values = graph->CreateTensor(output_spec_values);
+    auto output_tensor = graph->CreateTensor(output_spec);
+
+    // Pooling input, written as 4 rows of 6 values.
+    std::vector<float> in_data = {
+        7, 2, 5, 3, 10, 2,
+        3, 8, 9, 3, 4, 2,
+        1, 5, 7, 5, 6, 1,
+        0, 6, 2, 7, 2, 8};
+    // Gradient of the pooling output, one value per pooling window.
+    std::vector<float> updates_data = {
+        2, 6,
+        3, 1
+        };
+    // Each update value sits at the position of its window's maximum.
+    std::vector<float> golden = {
+        0, 0, 0, 0, 6, 0,
+        0, 0, 2, 0, 0, 0,
+        0, 0, 3, 0, 0, 0,
+        0, 0, 0, 0, 0, 1};
+
+    EXPECT_TRUE(input_tensor->CopyDataToTensor(in_data.data(), in_data.size() * sizeof(float)));
+    std::array<uint32_t, 2> ksize = {3, 2};
+    std::array<uint32_t, 2> stride = {3, 2};
+    auto op = graph->CreateOperation<tim::vx::ops::MaxpoolWithArgmax2>(
+        tim::vx::PadType::VALID, ksize, stride);
+    // Output order: pooled values first, then the indices.
+    (*op).BindInputs({input_tensor}).BindOutputs({output_tensor_values, output_tensor_indices});
+
+    // Flatten the 2x2x1x1 index tensor to 4 elements.
+    std::vector<uint32_t> shape = {4};
+    tim::vx::TensorSpec input_spec_indices(tim::vx::DataType::INT32,
+                            shape, tim::vx::TensorAttribute::TRANSIENT);
+    auto input_tensor_indices = graph->CreateTensor(input_spec_indices);
+
+    auto op1 = graph->CreateOperation<tim::vx::ops::Reshape>(shape);
+    (*op1).BindInputs({output_tensor_indices}).BindOutputs({input_tensor_indices});
+
+    // Scatter the updates into a flat tensor with one slot per input element
+    // (6 * 4 = 24).
+    std::vector<uint32_t> out2_shape = {24};
+    tim::vx::TensorSpec updates_spec(tim::vx::DataType::FLOAT32,
+                            shape, tim::vx::TensorAttribute::INPUT);
+    tim::vx::TensorSpec output2_spec(tim::vx::DataType::FLOAT32,
+                            out2_shape, tim::vx::TensorAttribute::TRANSIENT);
+    auto updates_tensor = graph->CreateTensor(updates_spec);
+    auto output2_tensor = graph->CreateTensor(output2_spec);
+    EXPECT_TRUE(updates_tensor->CopyDataToTensor(
+        updates_data.data(), updates_data.size() * sizeof(float)));
+
+    auto op2 = graph->CreateOperation<tim::vx::ops::ScatterND>(out2_shape);
+    (*op2).BindInputs({input_tensor_indices, updates_tensor}).BindOutputs({output2_tensor});
+
+    // Restore the original input shape.
+    auto op3 = graph->CreateOperation<tim::vx::ops::Reshape>(in_shape);
+    (*op3).BindInputs({output2_tensor}).BindOutputs({output_tensor});
+
+    EXPECT_TRUE(graph->Compile());
+    EXPECT_TRUE(graph->Run());
+    std::vector<float> output_values(golden.size());
+
+    EXPECT_TRUE(output_tensor->CopyDataFromTensor(output_values.data()));
+    EXPECT_EQ(golden, output_values);
+}
+
+// Max-pool gradient with overlapping pooling windows on a single 5x4 plane.
+//
+// The gradient is assembled by hand from existing operations:
+//   MaxpoolWithArgmax2 -> Reshape -> ScatterND -> Reshape
+// The argmax indices are flattened and used to scatter `updates_data` into a
+// flat buffer holding one slot per input element; that buffer is then reshaped
+// back to the input shape.
+//
+// With ksize {3, 2} and stride {2, 2} neighbouring windows overlap. Reading
+// in_data as 4 rows of 5 (as laid out below), both top windows contain the
+// value 9, and the golden data expects 8 (= 2 + 6) at that position.
+TEST(MaxpoolGrad, with_overlay) {
+    auto ctx = tim::vx::Context::Create();
+    auto graph = ctx->CreateGraph();
+
+    tim::vx::ShapeType in_shape({5, 4, 1, 1});
+    tim::vx::ShapeType out_shape({2, 2, 1, 1});
+    tim::vx::TensorSpec input_spec(tim::vx::DataType::FLOAT32,
+                            in_shape, tim::vx::TensorAttribute::INPUT);
+    tim::vx::TensorSpec output_spec_indices(tim::vx::DataType::INT32,
+                            out_shape, tim::vx::TensorAttribute::TRANSIENT);
+    tim::vx::TensorSpec output_spec_values(tim::vx::DataType::FLOAT32,
+                            out_shape, tim::vx::TensorAttribute::OUTPUT);
+    // The gradient shares the input's shape and data type, but it is written
+    // by the graph and read back below, so it gets its own OUTPUT spec rather
+    // than reusing input_spec (which is declared INPUT).
+    tim::vx::TensorSpec output_spec(tim::vx::DataType::FLOAT32,
+                            in_shape, tim::vx::TensorAttribute::OUTPUT);
+
+    auto input_tensor = graph->CreateTensor(input_spec);
+    auto output_tensor_indices = graph->CreateTensor(output_spec_indices);
+    auto output_tensor_values = graph->CreateTensor(output_spec_values);
+    auto output_tensor = graph->CreateTensor(output_spec);
+
+    std::vector<float> in_data = {
+        7, 2, 5, 3, 8,
+        3, 8, 9, 3, 4,
+        1, 5, 7, 5, 6,
+        0, 6, 2, 10, 2};
+    // One incoming gradient value per pooled output element.
+    std::vector<float> updates_data = {
+        2, 6,
+        3, 1
+        };
+    std::vector<float> golden = {
+        0, 0, 0, 0, 0,
+        0, 0, 8, 0, 0,
+        0, 0, 3, 0, 0,
+        0, 0, 0, 1, 0};
+
+    // The second argument is a byte count, hence sizeof(float).
+    EXPECT_TRUE(input_tensor->CopyDataToTensor(
+        in_data.data(), in_data.size() * sizeof(float)));
+
+    // Stage 1: forward pooling, producing the pooled values and their argmax.
+    std::array<uint32_t, 2> ksize = {3, 2};
+    std::array<uint32_t, 2> stride = {2, 2};
+    auto op = graph->CreateOperation<tim::vx::ops::MaxpoolWithArgmax2>(
+        tim::vx::PadType::VALID, ksize, stride);
+    (*op).BindInputs({input_tensor}).BindOutputs({output_tensor_values, output_tensor_indices});
+
+    // Stage 2: flatten the 2x2x1x1 index tensor into 4 scatter indices.
+    std::vector<uint32_t> shape = {4};
+    tim::vx::TensorSpec input_spec_indices(tim::vx::DataType::INT32,
+                            shape, tim::vx::TensorAttribute::TRANSIENT);
+    auto input_tensor_indices = graph->CreateTensor(input_spec_indices);
+
+    auto op1 = graph->CreateOperation<tim::vx::ops::Reshape>(shape);
+    (*op1).BindInputs({output_tensor_indices}).BindOutputs({input_tensor_indices});
+
+    // Stage 3: scatter the updates into a flat buffer of 5 * 4 = 20 elements.
+    std::vector<uint32_t> out2_shape = {20};
+    tim::vx::TensorSpec updates_spec(tim::vx::DataType::FLOAT32,
+                            shape, tim::vx::TensorAttribute::INPUT);
+    tim::vx::TensorSpec output2_spec(tim::vx::DataType::FLOAT32,
+                            out2_shape, tim::vx::TensorAttribute::TRANSIENT);
+    auto updates_tensor = graph->CreateTensor(updates_spec);
+    auto output2_tensor = graph->CreateTensor(output2_spec);
+    EXPECT_TRUE(updates_tensor->CopyDataToTensor(
+        updates_data.data(), updates_data.size() * sizeof(float)));
+
+    auto op2 = graph->CreateOperation<tim::vx::ops::ScatterND>(out2_shape);
+    (*op2).BindInputs({input_tensor_indices, updates_tensor}).BindOutputs({output2_tensor});
+
+    // Stage 4: restore the input shape so the result lines up with golden.
+    auto op3 = graph->CreateOperation<tim::vx::ops::Reshape>(in_shape);
+    (*op3).BindInputs({output2_tensor}).BindOutputs({output_tensor});
+
+    EXPECT_TRUE(graph->Compile());
+    EXPECT_TRUE(graph->Run());
+
+    // Size the readback buffer from the expectation instead of a literal 20.
+    std::vector<float> output_values(golden.size());
+
+    EXPECT_TRUE(output_tensor->CopyDataFromTensor(output_values.data()));
+    EXPECT_EQ(golden, output_values);
+}
+
+// Max-pool gradient with overlapping pooling windows on a {5, 4, 2, 2} input:
+// the same 5x4 plane repeated four times.
+//
+// The gradient is assembled by hand from existing operations:
+//   MaxpoolWithArgmax2 -> Reshape -> ScatterND -> Reshape
+// The 16 argmax indices are flattened and used to scatter `updates_data` into
+// a flat 80-element buffer, which is then reshaped back to the input shape.
+//
+// The golden data repeats the single-plane result in every plane.
+// NOTE(review): that only holds if the argmax indices are offsets into the
+// whole flattened input rather than per-plane offsets - confirm against the
+// MaxpoolWithArgmax2 implementation.
+TEST(MaxpoolGrad, with_overlay_multi_channel_multi_batch) {
+    auto ctx = tim::vx::Context::Create();
+    auto graph = ctx->CreateGraph();
+
+    tim::vx::ShapeType in_shape({5, 4, 2, 2});
+    tim::vx::ShapeType out_shape({2, 2, 2, 2});
+    tim::vx::TensorSpec input_spec(tim::vx::DataType::FLOAT32,
+                            in_shape, tim::vx::TensorAttribute::INPUT);
+    tim::vx::TensorSpec output_spec_indices(tim::vx::DataType::INT32,
+                            out_shape, tim::vx::TensorAttribute::TRANSIENT);
+    tim::vx::TensorSpec output_spec_values(tim::vx::DataType::FLOAT32,
+                            out_shape, tim::vx::TensorAttribute::OUTPUT);
+    // The gradient shares the input's shape and data type, but it is written
+    // by the graph and read back below, so it gets its own OUTPUT spec rather
+    // than reusing input_spec (which is declared INPUT).
+    tim::vx::TensorSpec output_spec(tim::vx::DataType::FLOAT32,
+                            in_shape, tim::vx::TensorAttribute::OUTPUT);
+
+    auto input_tensor = graph->CreateTensor(input_spec);
+    auto output_tensor_indices = graph->CreateTensor(output_spec_indices);
+    auto output_tensor_values = graph->CreateTensor(output_spec_values);
+    auto output_tensor = graph->CreateTensor(output_spec);
+
+    std::vector<float> in_data = {
+        7, 2, 5, 3, 8,
+        3, 8, 9, 3, 4,
+        1, 5, 7, 5, 6,
+        0, 6, 2, 10, 2,
+        7, 2, 5, 3, 8,
+        3, 8, 9, 3, 4,
+        1, 5, 7, 5, 6,
+        0, 6, 2, 10, 2,
+        7, 2, 5, 3, 8,
+        3, 8, 9, 3, 4,
+        1, 5, 7, 5, 6,
+        0, 6, 2, 10, 2,
+        7, 2, 5, 3, 8,
+        3, 8, 9, 3, 4,
+        1, 5, 7, 5, 6,
+        0, 6, 2, 10, 2};
+    // One incoming gradient value per pooled output element (2 * 2 * 2 * 2).
+    std::vector<float> updates_data = {
+        2, 6,
+        3, 1,
+        2, 6,
+        3, 1,
+        2, 6,
+        3, 1,
+        2, 6,
+        3, 1,
+        };
+    std::vector<float> golden = {
+        0, 0, 0, 0, 0,
+        0, 0, 8, 0, 0,
+        0, 0, 3, 0, 0,
+        0, 0, 0, 1, 0,
+        0, 0, 0, 0, 0,
+        0, 0, 8, 0, 0,
+        0, 0, 3, 0, 0,
+        0, 0, 0, 1, 0,
+        0, 0, 0, 0, 0,
+        0, 0, 8, 0, 0,
+        0, 0, 3, 0, 0,
+        0, 0, 0, 1, 0,
+        0, 0, 0, 0, 0,
+        0, 0, 8, 0, 0,
+        0, 0, 3, 0, 0,
+        0, 0, 0, 1, 0};
+
+    // The second argument is a byte count, hence sizeof(float).
+    EXPECT_TRUE(input_tensor->CopyDataToTensor(
+        in_data.data(), in_data.size() * sizeof(float)));
+
+    // Stage 1: forward pooling, producing the pooled values and their argmax.
+    std::array<uint32_t, 2> ksize = {3, 2};
+    std::array<uint32_t, 2> stride = {2, 2};
+    auto op = graph->CreateOperation<tim::vx::ops::MaxpoolWithArgmax2>(
+        tim::vx::PadType::VALID, ksize, stride);
+    (*op).BindInputs({input_tensor}).BindOutputs({output_tensor_values, output_tensor_indices});
+
+    // Stage 2: flatten the 2x2x2x2 index tensor into 16 scatter indices.
+    std::vector<uint32_t> shape = {16};
+    tim::vx::TensorSpec input_spec_indices(tim::vx::DataType::INT32,
+                            shape, tim::vx::TensorAttribute::TRANSIENT);
+    auto input_tensor_indices = graph->CreateTensor(input_spec_indices);
+
+    auto op1 = graph->CreateOperation<tim::vx::ops::Reshape>(shape);
+    (*op1).BindInputs({output_tensor_indices}).BindOutputs({input_tensor_indices});
+
+    // Stage 3: scatter the updates into a flat buffer of 5 * 4 * 2 * 2 = 80
+    // elements.
+    std::vector<uint32_t> out2_shape = {80};
+    tim::vx::TensorSpec updates_spec(tim::vx::DataType::FLOAT32,
+                            shape, tim::vx::TensorAttribute::INPUT);
+    tim::vx::TensorSpec output2_spec(tim::vx::DataType::FLOAT32,
+                            out2_shape, tim::vx::TensorAttribute::TRANSIENT);
+    auto updates_tensor = graph->CreateTensor(updates_spec);
+    auto output2_tensor = graph->CreateTensor(output2_spec);
+    EXPECT_TRUE(updates_tensor->CopyDataToTensor(
+        updates_data.data(), updates_data.size() * sizeof(float)));
+
+    auto op2 = graph->CreateOperation<tim::vx::ops::ScatterND>(out2_shape);
+    (*op2).BindInputs({input_tensor_indices, updates_tensor}).BindOutputs({output2_tensor});
+
+    // Stage 4: restore the input shape so the result lines up with golden.
+    auto op3 = graph->CreateOperation<tim::vx::ops::Reshape>(in_shape);
+    (*op3).BindInputs({output2_tensor}).BindOutputs({output_tensor});
+
+    EXPECT_TRUE(graph->Compile());
+    EXPECT_TRUE(graph->Run());
+
+    // Size the readback buffer from the expectation instead of a literal 80.
+    std::vector<float> output_values(golden.size());
+
+    EXPECT_TRUE(output_tensor->CopyDataFromTensor(output_values.data()));
+    EXPECT_EQ(golden, output_values);
+}