Added op fusion for mean_stddev_normalization (#629)

Added op fusion for mean_stddev_normalization ops such as layernorm and instance norm. Type: New Feature. Signed-off-by: Feiyue Chen <Feiyue.Chen@verisilicon.com>
2023-08-09 22:10:45 +08:00 · 2023-08-09 22:10:45 +08:00 · 35e50d7692
parent bff26a32c4
commit 35e50d7692
8 changed files with 573 additions and 131 deletions
--- a/include/tim/transform/mean_stddev_normalize_fusion.h
+++ b/include/tim/transform/mean_stddev_normalize_fusion.h
@ -0,0 +1,22 @@
 #ifndef TIM_MEAN_STD_DEV_NORMALIZE_FUSION_H
 #define TIM_MEAN_STD_DEV_NORMALIZE_FUSION_H
 #include <map>
 #include <vector>
 #include <memory>
 namespace tim {
 namespace vx {
 // Forward declarations only; the full definitions live in the tim/vx headers.
 class Context;
 class Graph;
 class Tensor;
 class Operation;
 }  // namespace vx
 namespace transform {
 // Graph transform that searches src_graph for the decomposed mean/stddev
 // normalization sub-graph (Mean -> Sub -> Pow -> Mean -> Add -> Rsqrt -> Mul
 // ...) and replaces each match with a single LayerNormalization or
 // InstanceNormalization operation. The graph is modified in place.
 void MeanStdDevNormalization(std::shared_ptr<vx::Graph>& src_graph);
 }  // namespace transform
 }  // namespace tim
 #endif
--- a/include/tim/vx/graph.h
+++ b/include/tim/vx/graph.h
@ -33,6 +33,7 @@
 #endif
 #include <memory>
 #include <vector>
 #include <map>
 namespace tim {
 namespace vx {
 #ifdef ENABLE_TENSOR_CACHE
@ -102,6 +103,12 @@ class Graph {
  virtual void PrintGraph() const = 0;
  const std::vector<std::shared_ptr<Tensor>> GetConstantInputs() const;
  virtual std::vector<std::shared_ptr<Operation>>& OpVector() = 0;
  virtual std::map<std::shared_ptr<Tensor>,
                   std::vector<std::shared_ptr<Operation>>>&
  TensorConsumer() = 0;
  virtual std::map<std::shared_ptr<Tensor>, std::shared_ptr<Operation>>&
  TensorProducer() = 0;
 protected:
  std::vector<std::shared_ptr<tim::vx::Operation>> op_vector_;
--- a/include/tim/vx/tensor.h
+++ b/include/tim/vx/tensor.h
@ -48,9 +48,7 @@ class Quantization {
        channel_dim_(channel_dim),
        scales_(std::move(scales)),
        zero_points_(std::move(zero_points)) {}
-  Quantization(QuantType type, int8_t fl)
+  Quantization(QuantType type, int8_t fl) : type_(type), fl_(fl) {}
      : type_(type),
        fl_(fl){}
  QuantType& Type() { return type_; }
  const QuantType& Type() const { return type_; }
  Quantization& SetType(QuantType type) {
@ -148,7 +146,8 @@ class Tensor {
  virtual const Quantization& GetQuantization() = 0;
  virtual TensorSpec& GetSpec() = 0;
  virtual uint32_t GetId() = 0;
-  virtual bool CopyDataToTensor(const void* data, uint32_t size_in_bytes = 0) = 0;
+  virtual bool CopyDataToTensor(const void* data,
                                uint32_t size_in_bytes = 0) = 0;
  virtual bool CopyDataFromTensor(void* data) = 0;
  virtual bool FlushCacheForHandle() = 0;
  virtual bool InvalidateCacheForHandle() = 0;
@ -158,10 +157,13 @@ class Tensor {
  virtual bool IsConstTensor() = 0;
  virtual bool SaveTensorToTextByFp32(std::string filename) = 0;
  virtual void* ConvertTensorToData(uint8_t* tensorData) = 0;
  virtual float* ConvertTensorToFloat32Data() = 0;
 };
 namespace utils {
-  bool Float32ToDtype(std::shared_ptr<tim::vx::Tensor> tensor, std::vector<float> fval, uint8_t* tensorData);
+bool Float32ToDtype(std::shared_ptr<tim::vx::Tensor> tensor,
-  bool DtypeToFloat32(std::shared_ptr<tim::vx::Tensor> tensor, uint8_t* tensorData, float* data);
+                    std::vector<float> fval, uint8_t* tensorData);
 bool DtypeToFloat32(std::shared_ptr<tim::vx::Tensor> tensor,
                    uint8_t* tensorData, float* data);
 }  //namespace utils
 }  // namespace vx
 }  // namespace tim
--- a/src/tim/transform/mean_stddev_normalize_fusion.cc
+++ b/src/tim/transform/mean_stddev_normalize_fusion.cc
@ -0,0 +1,368 @@
 #include <algorithm>
 #include <stdarg.h>
 #include "tim/transform/mean_stddev_normalize_fusion.h"
 #include "tim/vx/context.h"
 #include "tim/vx/graph.h"
 #include "tim/vx/operation.h"
 #include "tim/vx/ops/layernormalization.h"
 #include "tim/vx/ops/instancenormalization.h"
 #include "builtin_op_impl.h"
 namespace tim {
 namespace transform {
 // Positions of the matched ops inside the temporary op vector that
 // MeanStdDevNormalization() builds for one candidate pattern. The numbering
 // follows the order in which the matcher appends ops, not the dataflow order,
 // so the values must stay in sync with the matching sequence.
 enum {
  NORMALIZATION_INDEX_MEAN_0 = 0,  // seed Reduce op of the pattern
  NORMALIZATION_INDEX_SUB_0 = 1,   // Subtract consuming Mean0's output
  NORMALIZATION_INDEX_MUL_2 = 2,   // Multiply consuming Mean0's output
  NORMALIZATION_INDEX_POW = 3,     // Pow consuming Sub0's output
  NORMALIZATION_INDEX_MEAN_1 = 4,  // Reduce consuming Pow's output
  NORMALIZATION_INDEX_ADD_0 = 5,   // Add consuming Mean1's output (adds eps)
  NORMALIZATION_INDEX_RSQRT = 6,   // Rsqrt consuming Add0's output
  NORMALIZATION_INDEX_MUL_0 = 7,   // Multiply consuming Rsqrt's output
  NORMALIZATION_INDEX_MUL_1 = 8,   // Mul0 consumer that is not Mul2
  NORMALIZATION_INDEX_ADD_1 = 9,   // Add consuming Mul1's output (final op)
  NORMALIZATION_INDEX_SUB_1 = 10   // Subtract consuming Mul2's output
 };
 // Returns true when at least one op in `consumers` has the operation kind
 // `op_id`; returns false otherwise (including for an empty list).
 bool OpkindInConsumers(std::vector<std::shared_ptr<vx::Operation>> consumers,
                        int32_t op_id) {
  for (const auto& consumer : consumers) {
    if (consumer->impl()->kind_ == op_id) {
      return true;
    }
  }
  return false;
 }
 // Tells whether `compared` is one of the ops consuming the first output
 // tensor of `current`. Used when a consumer has already been collected in the
 // match list: the pattern only matches if that consumer is the very same op
 // instance as the one stored in the list.
 bool OpInConsumer(const std::shared_ptr<vx::Graph>& graph,
                   const std::shared_ptr<vx::Operation>& current,
                   const std::shared_ptr<vx::Operation>& compared) {
  auto consumers = graph->GetConsumersOp(current->impl()->OutputsTensor()[0]);
  return std::find(consumers.begin(), consumers.end(), compared) !=
         consumers.end();
 }
 // Extends the match list `temp` with the consumers of temp[curr_index].
 //
 // The op at `curr_index` must have exactly one output tensor, and that tensor
 // must be consumed by exactly op_kind.size() ops (one or two) whose kinds
 // cover every entry of `op_kind`. On success the consumers are appended to
 // `temp` in the same order as `op_kind` and true is returned; otherwise
 // `temp` is left untouched and false is returned.
 //
 // The special situation where a consumer is already in the list is NOT
 // handled here.
 bool UpdateTempVector(std::vector<std::shared_ptr<vx::Operation>>& temp,
                       int32_t curr_index,
                       const std::shared_ptr<vx::Graph>& graph,
                       std::vector<int32_t> op_kind) {
  // Reject malformed requests up front instead of indexing out of range.
  if (curr_index < 0 ||
      static_cast<std::size_t>(curr_index) >= temp.size() ||
      op_kind.empty() || op_kind.size() > 2)
    return false;
  auto outputs = temp[curr_index]->impl()->OutputsTensor();
  // Validate the output count before touching outputs[0].
  if (outputs.size() != 1) return false;
  auto ops = graph->GetConsumersOp(outputs[0]);
  if (ops.size() != op_kind.size()) return false;
  for (int32_t op_k : op_kind) {
    if (!OpkindInConsumers(ops, op_k)) return false;
  }
  if (op_kind.size() == 2) {
    const std::size_t first_index =
        ops[0]->impl()->kind_ == op_kind[0] ? 0 : 1;
    // Push back ops in the same order as requested by op_kind.
    temp.push_back(ops[first_index]);
    temp.push_back(ops[1 - first_index]);
  } else {
    temp.push_back(ops[0]);
  }
  return true;
 }
 // Removes the ops of one matched normalization pattern from the graph's op
 // vector and drops the matching entries of the tensor consumer / producer
 // maps. `norm_ops` is indexed with the NORMALIZATION_INDEX_* constants.
 //
 // - Mean0 / Sub0 / Mul1 read the pattern's source tensor, which stays in the
 //   graph: only these ops are removed from its consumer list.
 // - Add1 produces the final tensor, so its map entries are left alone.
 // - Every other op is internal: the consumer entries of its inputs and the
 //   producer entries of its outputs are dropped, and any op producing one of
 //   its non-constant inputs is removed from the op vector as well.
 void RemoveTensorsAndOps(
     std::shared_ptr<vx::Graph>& graph,
     std::vector<std::shared_ptr<vx::Operation>>& norm_ops) {
  auto& op_vector = graph->OpVector();
  auto& consumer_map = graph->TensorConsumer();
  auto& producer_map = graph->TensorProducer();
  // An op may already have been removed as the producer of another op's
  // input (e.g. Mul0 while handling Mul2), so the lookup result must be
  // checked: erasing end() is undefined behaviour.
  auto erase_op = [&op_vector](const std::shared_ptr<vx::Operation>& op) {
    auto pos = std::find(op_vector.begin(), op_vector.end(), op);
    if (pos != op_vector.end()) op_vector.erase(pos);
  };
  for (uint32_t i = 0; i < norm_ops.size(); i++) {
    erase_op(norm_ops[i]);  //Remove current op from op_vector_
    auto input_tensors = norm_ops[i]->impl()->InputsTensor();
    auto output_tensors = norm_ops[i]->impl()->OutputsTensor();
    switch (i) {
      case NORMALIZATION_INDEX_MEAN_0:
      case NORMALIZATION_INDEX_SUB_0:
      case NORMALIZATION_INDEX_MUL_1:
        for (const auto& tensor : input_tensors) {
          if (tensor->GetSpec().attr_ == vx::TensorAttribute::CONSTANT) {
            consumer_map.erase(tensor);
            continue;
          }
          // find() instead of operator[] so no empty entry is created.
          auto entry = consumer_map.find(tensor);
          if (entry == consumer_map.end()) continue;
          auto& consumers = entry->second;
          auto pos =
              std::find(consumers.begin(), consumers.end(), norm_ops[i]);
          if (pos != consumers.end()) consumers.erase(pos);
          if (consumers.empty()) consumer_map.erase(entry);
        }
        if (!output_tensors.empty()) producer_map.erase(output_tensors[0]);
        break;
      case NORMALIZATION_INDEX_ADD_1:
        break;
      default:
        for (const auto& tensor : input_tensors) {
          if (tensor->GetSpec().attr_ != vx::TensorAttribute::CONSTANT) {
            // find() instead of operator[] so that no null producer entry is
            // inserted for tensors that are not (or no longer) in the map.
            auto producer = producer_map.find(tensor);
            if (producer != producer_map.end() &&
                producer->second != nullptr) {
              erase_op(graph->GetProducerOp(tensor));
              producer_map.erase(tensor);
            }
          }
          consumer_map.erase(tensor);
        }
        for (const auto& out_tensor : output_tensors)
          producer_map.erase(out_tensor);
        break;
    }
  }
 }
 // Validates the fan-out of Mul0 and appends Mul1 to `norm_ops`.
 // Mul0 must have a single output consumed by exactly two ops of kind 1
 // (presumably VSI_NN_OP_MULTIPLY - TODO confirm), one of which is the Mul2
 // op already stored in the list. The other consumer is pushed back as Mul1.
 // Returns false, leaving `norm_ops` unchanged, when the shape does not match.
 bool CheckMediumMul(const std::shared_ptr<vx::Graph>& graph,
                     std::vector<std::shared_ptr<vx::Operation>>& norm_ops) {
  // Copies (not references): norm_ops is grown at the end of this function.
  auto mul0 = norm_ops[NORMALIZATION_INDEX_MUL_0];
  auto mul2 = norm_ops[NORMALIZATION_INDEX_MUL_2];
  auto mul0_outputs = mul0->impl()->OutputsTensor();
  auto followers = graph->GetConsumersOp(mul0_outputs[0]);
  if (mul0_outputs.size() > 1) return false;
  if (followers.size() != 2) return false;
  for (const auto& follower : followers) {
    if (follower->impl()->kind_ != 1) return false;
  }
  if (!OpInConsumer(graph, mul0, mul2)) return false;
  // Whichever follower is not Mul2 has to be Mul1.
  const int32_t mul1_index = followers[0] == mul2 ? 1 : 0;
  norm_ops.push_back(followers[mul1_index]);
  return true;
 }
 // Returns true when op1 and op2 have at least one input tensor in common.
 bool HaveASameInput(const std::shared_ptr<vx::Operation>& op1,
                     const std::shared_ptr<vx::Operation>& op2) {
  auto lhs_inputs = op1->impl()->InputsTensor();
  auto rhs_inputs = op2->impl()->InputsTensor();
  auto shared = std::find_first_of(lhs_inputs.begin(), lhs_inputs.end(),
                                   rhs_inputs.begin(), rhs_inputs.end());
  return shared != lhs_inputs.end();
 }
 // Replaces one matched pattern (see NORMALIZATION_INDEX_*) with a single
 // LayerNormalization op that reads Mean0's input and writes Add1's output.
 //
 // eps is taken from Add0, gamma from Mul0's second input and beta from Sub1's
 // first input; all three are converted to float32 because the fused op is
 // created with FLOAT32 parameter tensors.
 //
 // If any parameter cannot be resolved or converted, or the reduce axis is out
 // of range, the function returns without modifying the graph so the original
 // (unfused) ops stay in place.
 void LayernormConnection(std::shared_ptr<vx::Graph>& graph,
                          std::vector<std::shared_ptr<vx::Operation>> norm_ops) {
  auto src_tensor =
      norm_ops[NORMALIZATION_INDEX_MEAN_0]->impl()->InputsTensor()[0];
  auto final_tensor =
      norm_ops[NORMALIZATION_INDEX_ADD_1]->impl()->OutputsTensor()[0];
  const int32_t rank = static_cast<int32_t>(src_tensor->GetShape().size());
  int32_t axis = *norm_ops[NORMALIZATION_INDEX_MEAN_0]
                      ->impl()
                      ->node()
                      ->nn_param.reduce.axis;
  axis = rank - axis - 1;  // reverse axis
  // `axis` indexes the shape vectors below; bail out rather than write out
  // of bounds (e.g. when the reduce axis was negative).
  if (axis < 0 || axis >= rank) return;
  // Get eps, gamma, beta.
  // Add0's eps operand is whichever input is not produced by Mean1.
  const auto& add0 = norm_ops[NORMALIZATION_INDEX_ADD_0];
  int32_t eps_index = graph->GetProducerOp(add0->impl()->InputsTensor()[0]) ==
                              norm_ops[NORMALIZATION_INDEX_MEAN_1]
                          ? 1
                          : 0;
  auto org_eps = add0->impl()->InputsTensor()[eps_index];
  if (!org_eps->IsConstTensor()) {
    // eps comes through an intermediate op; read that op's first input.
    auto eps_producer = graph->GetProducerOp(org_eps);
    if (eps_producer == nullptr) return;
    org_eps = eps_producer->impl()->InputsTensor()[0];
  }
  auto org_gamma =
      norm_ops[NORMALIZATION_INDEX_MUL_0]->impl()->InputsTensor()[1];
  auto org_beta =
      norm_ops[NORMALIZATION_INDEX_SUB_1]->impl()->InputsTensor()[0];
  // Do datatype convert due to LayerNormalization op requirements.
  float* float_eps = org_eps->ConvertTensorToFloat32Data();
  float* float_gamma = org_gamma->ConvertTensorToFloat32Data();
  float* float_beta = org_beta->ConvertTensorToFloat32Data();
  auto release = [](float* buffer) {
    if (buffer != nullptr) vsi_nn_Free(buffer);
  };
  // The conversion can return nullptr; check before mutating the graph.
  if (float_eps == nullptr || float_gamma == nullptr ||
      float_beta == nullptr) {
    release(float_gamma);
    release(float_beta);
    release(float_eps);
    return;
  }
  const float eps = *float_eps;
  RemoveTensorsAndOps(graph, norm_ops);
  // Parameter tensors are all-ones in shape except along the normalized axis.
  // NOTE(review): assumes gamma/beta hold at least shape[axis] elements -
  // confirm against the models this fusion targets.
  std::vector<uint32_t> shape(src_tensor->GetShape().size(), 1);
  shape[axis] = src_tensor->GetShape()[axis];
  vx::TensorSpec param_spec(vx::DataType::FLOAT32, shape,
                            vx::TensorAttribute::CONSTANT);
  auto beta = graph->CreateTensor(param_spec);
  auto gamma = graph->CreateTensor(param_spec);
  beta->CopyDataToTensor(float_beta);
  gamma->CopyDataToTensor(float_gamma);
  release(float_gamma);
  release(float_beta);
  release(float_eps);
  auto layernorm =
      graph->CreateOperation<vx::ops::LayerNormalization>(axis, eps);
  graph->TensorConsumer()[src_tensor].push_back(layernorm);
  layernorm->BindInputs({src_tensor, beta, gamma});
  layernorm->BindOutputs({final_tensor});
 }
 // Replaces one matched pattern (see NORMALIZATION_INDEX_*) with a single
 // InstanceNormalization op (layout CWHN) that reads Mean0's input and writes
 // Add1's output.
 //
 // eps is taken from Add0, gamma from Mul0's second input and beta from Sub1's
 // first input; all three are converted to float32 because the fused op is
 // created with FLOAT32 parameter tensors.
 //
 // If any parameter cannot be resolved or converted, the function returns
 // without modifying the graph so the original (unfused) ops stay in place.
 void InstancenormConnection(
     std::shared_ptr<vx::Graph>& graph,
     std::vector<std::shared_ptr<vx::Operation>> norm_ops) {
  auto src_tensor =
      norm_ops[NORMALIZATION_INDEX_MEAN_0]->impl()->InputsTensor()[0];
  auto final_tensor =
      norm_ops[NORMALIZATION_INDEX_ADD_1]->impl()->OutputsTensor()[0];
  // Get eps, gamma, beta from graph.
  // Add0's eps operand is whichever input is not produced by Mean1.
  const auto& add0 = norm_ops[NORMALIZATION_INDEX_ADD_0];
  int32_t eps_index = graph->GetProducerOp(add0->impl()->InputsTensor()[0]) ==
                              norm_ops[NORMALIZATION_INDEX_MEAN_1]
                          ? 1
                          : 0;
  auto org_eps = add0->impl()->InputsTensor()[eps_index];
  if (!org_eps->IsConstTensor()) {
    // eps comes through an intermediate op; read that op's first input.
    auto eps_producer = graph->GetProducerOp(org_eps);
    if (eps_producer == nullptr) return;
    org_eps = eps_producer->impl()->InputsTensor()[0];
  }
  auto org_gamma =
      norm_ops[NORMALIZATION_INDEX_MUL_0]->impl()->InputsTensor()[1];
  auto org_beta =
      norm_ops[NORMALIZATION_INDEX_SUB_1]->impl()->InputsTensor()[0];
  // Do datatype convert due to InstanceNormalization op requirements.
  float* float_eps = org_eps->ConvertTensorToFloat32Data();
  float* float_gamma = org_gamma->ConvertTensorToFloat32Data();
  float* float_beta = org_beta->ConvertTensorToFloat32Data();
  auto release = [](float* buffer) {
    if (buffer != nullptr) vsi_nn_Free(buffer);
  };
  // The conversion can return nullptr; check before mutating the graph.
  if (float_eps == nullptr || float_gamma == nullptr ||
      float_beta == nullptr) {
    release(float_gamma);
    release(float_beta);
    release(float_eps);
    return;
  }
  const float eps = *float_eps;
  RemoveTensorsAndOps(graph, norm_ops);
  // Parameter tensors are all-ones in shape except along dimension 0.
  // NOTE(review): assumes gamma/beta hold at least shape[0] elements -
  // confirm against the models this fusion targets.
  std::vector<uint32_t> shape(src_tensor->GetShape().size(), 1);
  shape[0] = src_tensor->GetShape()[0];
  vx::TensorSpec param_spec(vx::DataType::FLOAT32, shape,
                            vx::TensorAttribute::CONSTANT);
  auto beta = graph->CreateTensor(param_spec);
  auto gamma = graph->CreateTensor(param_spec);
  beta->CopyDataToTensor(float_beta);
  gamma->CopyDataToTensor(float_gamma);
  release(float_gamma);
  release(float_beta);
  release(float_eps);
  auto instancenorm = graph->CreateOperation<vx::ops::InstanceNormalization>(
      eps, vx::DataLayout::CWHN);
  graph->TensorConsumer()[src_tensor].push_back(instancenorm);
  instancenorm->BindInputs({src_tensor, beta, gamma});
  instancenorm->BindOutputs({final_tensor});
 }
 /* Checking Mean StdDev Normalization structure:
         input
        /  |  \
       /   |   Mean0
      |    |   / |
      |   Sub0   |
      |    |     |
      |   Pow    |
      |    |     |
      |   Mean1  |
      |    |     |
      |   Add0   |
      |    |     |
      |   Rsqrt  |
      |    |     |
      |   Mul0   |
      |  /    \  |
     Mul1      Mul2
      |         |
      |       Sub1
       \      /
         Add1
          |
        output
 */
 // Walks every Reduce op of the graph, tries to match the structure drawn
 // above starting from it, and fuses each full match into one op:
 // LayerNormalization when Mean0 reduces a single axis, InstanceNormalization
 // otherwise. Candidates that do not match are left untouched.
 void MeanStdDevNormalization(std::shared_ptr<vx::Graph>& graph) {
  // Iterate over a snapshot: fusing a match edits graph->OpVector().
  std::vector<std::shared_ptr<vx::Operation>> op_vector = graph->OpVector();
  for (const auto& op : op_vector) {
    if (op->impl()->kind_ != VSI_NN_OP_REDUCE) continue;
    // Matched ops, indexed with the NORMALIZATION_INDEX_* constants.
    std::vector<std::shared_ptr<vx::Operation>> temp;
    temp.push_back(op);  //Mean0
    if (!UpdateTempVector(temp, NORMALIZATION_INDEX_MEAN_0, graph,
                          {VSI_NN_OP_SUBTRACT, VSI_NN_OP_MULTIPLY}))
      continue;  //Sub0, Mul2
    if (!UpdateTempVector(temp, NORMALIZATION_INDEX_SUB_0, graph,
                          {VSI_NN_OP_POW}))
      continue;  //Pow
    if (!UpdateTempVector(temp, NORMALIZATION_INDEX_POW, graph,
                          {VSI_NN_OP_REDUCE}))
      continue;  //Mean1
    if (!UpdateTempVector(temp, NORMALIZATION_INDEX_MEAN_1, graph,
                          {VSI_NN_OP_ADD}))
      continue;  //Add0
    if (!UpdateTempVector(temp, NORMALIZATION_INDEX_ADD_0, graph,
                          {VSI_NN_OP_RSQRT}))
      continue;  //Rsqrt
    if (!UpdateTempVector(temp, NORMALIZATION_INDEX_RSQRT, graph,
                          {VSI_NN_OP_MULTIPLY}))
      continue;  //Mul0
    if (!CheckMediumMul(graph, temp)) continue;  //Mul1
    // The pattern requires Mean0, Sub0 and Mul1 to all read the same input
    // tensor, so every pair must share an input; one shared pair is not
    // enough to prove the sub-graph is a normalization.
    if (!HaveASameInput(temp[NORMALIZATION_INDEX_MEAN_0],
                        temp[NORMALIZATION_INDEX_SUB_0]) ||
        !HaveASameInput(temp[NORMALIZATION_INDEX_MEAN_0],
                        temp[NORMALIZATION_INDEX_MUL_1]) ||
        !HaveASameInput(temp[NORMALIZATION_INDEX_SUB_0],
                        temp[NORMALIZATION_INDEX_MUL_1]))
      continue;
    if (!UpdateTempVector(temp, NORMALIZATION_INDEX_MUL_1, graph,
                          {VSI_NN_OP_ADD}))
      continue;  //Add1
    if (!UpdateTempVector(temp, NORMALIZATION_INDEX_MUL_2, graph,
                          {VSI_NN_OP_SUBTRACT}))  //Sub1
      continue;
    // Sub1 must have one output consumed by exactly one op, and that op has
    // to be the Add1 already collected from Mul1. Requiring exactly one
    // consumer also keeps the [0] access below in range.
    auto sub_outputs = temp[NORMALIZATION_INDEX_SUB_1]->impl()->OutputsTensor();
    if (sub_outputs.size() != 1) continue;
    auto sub_consumers = graph->GetConsumersOp(sub_outputs[0]);
    if (sub_consumers.size() != 1 ||
        sub_consumers[0]->impl()->kind_ != VSI_NN_OP_ADD)
      continue;
    if (!OpInConsumer(graph, temp[NORMALIZATION_INDEX_SUB_1],
                      temp[NORMALIZATION_INDEX_ADD_1]))
      continue;
    int axis_num = temp[NORMALIZATION_INDEX_MEAN_0]
                       ->impl()
                       ->node()
                       ->nn_param.reduce.axis_num;
    if (axis_num == 1) {
      LayernormConnection(graph, temp);
    } else {
      InstancenormConnection(graph, temp);
    }
  }
 }
 }  // namespace transform
 }  // namespace tim
--- a/src/tim/vx/graph.cc
+++ b/src/tim/vx/graph.cc
@ -99,11 +99,13 @@ GraphImpl::GraphImpl(ContextImpl* context, const CompileOption& options)
 // Destructor: hands the address of graph_ to vsi_nn_ReleaseGraph.
 GraphImpl::~GraphImpl() { vsi_nn_ReleaseGraph(&graph_); }
 #ifdef ENABLE_TENSOR_CACHE
-std::map<std::string, std::shared_ptr<tim::vx::Tensor>>& GraphImpl::GetTensorCacheMap() {
+std::map<std::string, std::shared_ptr<tim::vx::Tensor>>&
 GraphImpl::GetTensorCacheMap() {
  return cached_tensor_;
 }
-const std::string GraphImpl::CalculateCacheKey(const TensorSpec& spec, const void* data) {
+const std::string GraphImpl::CalculateCacheKey(const TensorSpec& spec,
                                               const void* data) {
  std::string md5_key;
  uint32_t data_size = 1;
  for (auto it = spec.shape_.begin(); it != spec.shape_.end(); ++it) {
@ -135,12 +137,15 @@ const std::string GraphImpl::CalculateCacheKey(const TensorSpec& spec, const voi
  return md5_key;
 }
-std::shared_ptr<Tensor> GraphImpl::GetTensorFromCache(const TensorSpec& spec, const void* data) {
+std::shared_ptr<Tensor> GraphImpl::GetTensorFromCache(const TensorSpec& spec,
                                                      const void* data) {
  std::shared_ptr<tim::vx::Tensor> tensor;
  std::string md5_key = CalculateCacheKey(spec, data);
  if (GetTensorCacheMap().find(md5_key) != GetTensorCacheMap().end() &&
-      GetTensorCacheMap()[md5_key]->GetQuantization().Scales() == spec.quantization_.Scales() &&
+      GetTensorCacheMap()[md5_key]->GetQuantization().Scales() ==
-      GetTensorCacheMap()[md5_key]->GetQuantization().ZeroPoints() == spec.quantization_.ZeroPoints()) {
+          spec.quantization_.Scales() &&
      GetTensorCacheMap()[md5_key]->GetQuantization().ZeroPoints() ==
          spec.quantization_.ZeroPoints()) {
    tensor = GetTensorCacheMap()[md5_key];
  } else {
    tensor = std::make_shared<TensorImpl>(this, spec, data);
@ -190,6 +195,20 @@ const std::vector<std::shared_ptr<Tensor>> GraphImpl::OutputsTensor() const {
  return outputs_tensor_;
 }
 // Returns a mutable reference to op_vector_, so callers can edit the graph's
 // op list in place.
 std::vector<std::shared_ptr<Operation>>& GraphImpl::OpVector() {
  return op_vector_;
 }
 // Returns a mutable reference to tensor_consumers_ (tensor -> list of ops),
 // so callers can edit the map in place.
 std::map<std::shared_ptr<Tensor>, std::vector<std::shared_ptr<Operation>>>&
 GraphImpl::TensorConsumer() {
  return tensor_consumers_;
 }
 // Returns a mutable reference to tensor_producer_ (tensor -> single op),
 // so callers can edit the map in place.
 std::map<std::shared_ptr<Tensor>, std::shared_ptr<Operation>>&
 GraphImpl::TensorProducer() {
  return tensor_producer_;
 }
 void GraphImpl::UpdateTensorConsumersMap(const std::shared_ptr<Tensor>& tensor,
                                         const Operation* op) {
  for (const auto& added_op : op_vector_) {
@ -320,15 +339,16 @@ bool GraphImpl::Setup() {
  bool is_fast_mode = options_.isRelaxMode();
  if (is_fast_mode) {
-    VSILOGW("Important notice: float model executed in bfloat16 "
+    VSILOGW(
        "Important notice: float model executed in bfloat16 "
        "mode which will have better performance but lower precesion");
  }
  vsi_nn_SetGraphFastMode(graph_, is_fast_mode);
 #if defined(ENABLE_PLATFORM)
  auto id = options_.getDeviceId();
-  vxSetGraphAttribute(graph_->g, VX_GRAPH_DEVICE_INDEX_VIV,
+  vxSetGraphAttribute(graph_->g, VX_GRAPH_DEVICE_INDEX_VIV, (void*)(&id),
-                      (void*)(&id), sizeof(id));
+                      sizeof(id));
 #endif
  std::call_once(setio_once_, [&status, this]() {
@ -348,10 +368,14 @@ bool GraphImpl::Compile() {
  bool status = true;
  if (not_consumed_input_cnt_ > 0) {
    // Tensor can bind to different operations
-    VSILOGW("Graph has free input, INPUT tensor may be created but not consumed.");
+    VSILOGW(
        "Graph has free input, INPUT tensor may be created but not "
        "consumed.");
  }
  if (not_consumed_output_cnt_ != 0) {
-    VSILOGW("Graph has free output, OUTPUT tensor may be created but not consumed.");
+    VSILOGW(
        "Graph has free output, OUTPUT tensor may be created but not "
        "consumed.");
  }
  status = Setup();
  std::call_once(verify_graph_once_, [&status, this]() {
--- a/src/tim/vx/graph_private.h
+++ b/src/tim/vx/graph_private.h
@ -42,10 +42,12 @@ namespace vx {
 class GraphImpl : public Graph {
 public:
-  GraphImpl(ContextImpl* context, const CompileOption& options = CompileOption::DefaultOptions);
+  GraphImpl(ContextImpl* context,
            const CompileOption& options = CompileOption::DefaultOptions);
  ~GraphImpl();
 #ifdef ENABLE_TENSOR_CACHE
-  std::shared_ptr<Tensor> GetTensorFromCache(const TensorSpec& spec, const void* data);
+  std::shared_ptr<Tensor> GetTensorFromCache(const TensorSpec& spec,
                                             const void* data);
  const std::string CalculateCacheKey(const TensorSpec& spec, const void* data);
  std::map<std::string, std::shared_ptr<tim::vx::Tensor>>& GetTensorCacheMap();
 #endif
@ -62,7 +64,11 @@ class GraphImpl : public Graph {
  const std::vector<std::shared_ptr<Tensor>> InputsTensor() const override;
  const std::vector<std::shared_ptr<Tensor>> OutputsTensor() const override;
-
+  std::vector<std::shared_ptr<Operation>>& OpVector() override;
  std::map<std::shared_ptr<Tensor>, std::vector<std::shared_ptr<Operation>>>&
  TensorConsumer() override;
  std::map<std::shared_ptr<Tensor>, std::shared_ptr<Operation>>&
  TensorProducer() override;
  void UpdateTensorConsumersMap(const std::shared_ptr<Tensor>& tensor,
                                const Operation* op) override;
  void RenewTensorConsumersMap(const std::shared_ptr<Tensor>& org_tensor,
@ -106,12 +112,15 @@ class GraphImpl : public Graph {
  int32_t not_consumed_input_cnt_;
  std::vector<std::shared_ptr<Tensor>> outputs_tensor_;
  int32_t not_consumed_output_cnt_;
-  std::map<std::shared_ptr<Tensor>, std::vector<std::shared_ptr<Operation>>> tensor_consumers_;
+  std::map<std::shared_ptr<Tensor>, std::vector<std::shared_ptr<Operation>>>
-  std::map<std::shared_ptr<Tensor>, std::shared_ptr<Operation>> tensor_producer_;
+      tensor_consumers_;
  std::map<std::shared_ptr<Tensor>, std::shared_ptr<Operation>>
      tensor_producer_;
 #ifdef ENABLE_TENSOR_CACHE
  std::map<std::string, std::shared_ptr<tim::vx::Tensor>> cached_tensor_;
 #endif
  CompileOption options_;
 private:
  /// Setup graph
  bool Setup();
--- a/src/tim/vx/tensor.cc
+++ b/src/tim/vx/tensor.cc
@ -94,7 +94,8 @@ TensorImpl::TensorImpl(Graph* graph, const TensorSpec& spec, const void* data)
  }
 }
-TensorImpl::TensorImpl(Graph* graph, const TensorSpec& spec, const DmaBufferDesc& dmafd)
+TensorImpl::TensorImpl(Graph* graph, const TensorSpec& spec,
                       const DmaBufferDesc& dmafd)
    : graph_(reinterpret_cast<GraphImpl*>(graph)),
      id_(VSI_NN_TENSOR_ID_NA),
      spec_(spec),
@ -120,7 +121,8 @@ TensorImpl::~TensorImpl() {}
 bool TensorImpl::SaveTensorToTextByFp32(std::string filename) {
  vsi_nn_tensor_t* tensor = vsi_nn_GetTensor(graph_->graph(), id_);
-  vsi_nn_SaveTensorToTextByFp32(graph_->graph(), tensor, filename.c_str(), NULL);
+  vsi_nn_SaveTensorToTextByFp32(graph_->graph(), tensor, filename.c_str(),
                                NULL);
  return true;
 }
@ -154,8 +156,7 @@ bool TensorImpl::CopyDataToTensor(const void* data, uint32_t size_in_bytes) {
        } else {
          VSILOGE("GetTensorHandle fail");
        }
-      }
+      } else {
      else {
        /*
        argument `data` of vsi_nn_CopyDataToTensor is non-const
        convert it from const data to non-const, will be fixed in ovxlib
@ -163,8 +164,8 @@ bool TensorImpl::CopyDataToTensor(const void* data, uint32_t size_in_bytes) {
        const uint8_t* end = static_cast<const uint8_t*>(data) + tensor_bytes;
        std::vector<uint8_t> data_copy(static_cast<const uint8_t*>(data), end);
-        retn = (VSI_SUCCESS ==
+        retn = (VSI_SUCCESS == vsi_nn_CopyDataToTensor(graph_->graph(), tensor,
-             vsi_nn_CopyDataToTensor(graph_->graph(), tensor, data_copy.data()));
+                                                       data_copy.data()));
      }
    }
  }
@ -207,6 +208,11 @@ bool TensorImpl::CopyDataFromTensor(void* data) {
  return retn;
 }
 // Looks up this tensor's vsi_nn tensor by id_ and returns the result of
 // vsi_nn_ConvertTensorToFloat32Data for it.
 // NOTE(review): the returned buffer appears to be owned by the caller and
 // released with vsi_nn_Free - confirm against the ovxlib documentation.
 float* TensorImpl::ConvertTensorToFloat32Data() {
  auto* raw_tensor = vsi_nn_GetTensor(graph_->graph(), id_);
  return vsi_nn_ConvertTensorToFloat32Data(graph_->graph(), raw_tensor);
 }
 bool TensorImpl::FlushCacheForHandle() {
  if (!(spec_.attr_ & TensorAttribute::INPUT)) {
    return false;
@ -331,7 +337,8 @@ bool TensorImpl::Init(void *external_cache) {
        graph_->graph(),
        VSI_NN_TENSOR_ID_AUTO,  // DMABUF's fd is created by TensorFromHandle as input or output,
        &attr,
-        fd_ != -1 ? (uint8_t*)fd_ : (uint8_t*)external_cache); // and cannot be set to const
+        fd_ != -1 ? (uint8_t*)fd_
                  : (uint8_t*)external_cache);  // and cannot be set to const
 #else
    if (-1 == fd_) {
      id_ = vsi_nn_AddTensorFromHandle(graph_->graph(), VSI_NN_TENSOR_ID_AUTO,
@ -447,25 +454,26 @@ int64_t TensorSpec::GetByteSize() const {
 bool Quantization::operator==(const Quantization& other_quant) const {
  if (type_ != tim::vx::QuantType::DYNAMIC_FIXED_POINT) {
-       if(type_ ==  other_quant.type_ &&
+    if (type_ == other_quant.type_ && scales_ == other_quant.scales_ &&
          scales_ == other_quant.scales_ &&
        zero_points_ == other_quant.zero_points_ &&
        channel_dim_ == other_quant.channel_dim_)
      return true;
-    }
+  } else if (fl_ == other_quant.fl_)
-    else if(fl_ == other_quant.fl_) return true;
+    return true;
  return false;
 }
 namespace utils {
-bool Float32ToDtype(std::shared_ptr<tim::vx::Tensor> tensor, std::vector<float> fval, uint8_t* tensorData){
+bool Float32ToDtype(std::shared_ptr<tim::vx::Tensor> tensor,
                    std::vector<float> fval, uint8_t* tensorData) {
  bool retn = true;
  vsi_nn_tensor_attr_t attr;
  uint32_t sz = tensor->GetSpec().GetElementNum();
  uint32_t stride = tensor->GetSpec().GetElementByteSize();
  PackTensorDtype(tensor->GetSpec(), &attr.dtype);
  for (uint32_t i = 0; i < sz; i++) {
-  retn = (VSI_SUCCESS == vsi_nn_Float32ToDtype(fval[i], &tensorData[i * stride], &attr.dtype));
+    retn = (VSI_SUCCESS == vsi_nn_Float32ToDtype(
                               fval[i], &tensorData[i * stride], &attr.dtype));
    if (!retn) {
      VSILOGE("Convert data fail");
      return retn;
@ -474,7 +482,8 @@ for (uint32_t i = 0; i < sz; i++){
  return retn;
 }
-bool DtypeToFloat32(std::shared_ptr<tim::vx::Tensor> tensor, uint8_t* tensorData, float* data){
+bool DtypeToFloat32(std::shared_ptr<tim::vx::Tensor> tensor,
                    uint8_t* tensorData, float* data) {
  bool retn = true;
  vsi_nn_tensor_attr_t attr;
--- a/src/tim/vx/tensor_private.h
+++ b/src/tim/vx/tensor_private.h
@ -58,7 +58,7 @@ class TensorImpl : public Tensor {
  }
  bool SaveTensorToTextByFp32(std::string filename) override;
  void* ConvertTensorToData(uint8_t* tensorData) override;
-
+  float* ConvertTensorToFloat32Data() override;
  GraphImpl* graph_;
  vsi_nn_tensor_id_t id_;
  TensorSpec spec_;
@ -103,6 +103,7 @@ class TensorPlaceholder : public Tensor {
    (void)tensorData;
    return nullptr;
  }
  float* ConvertTensorToFloat32Data() override { return nullptr; }
  vsi_nn_tensor_id_t id_;
  TensorSpec spec_;